import random

import torch

from preprocessing_utils import Preprocessing
# NOTE: Image_Importer, device, and dtype are expected to be defined at
# module level; they are not part of this fragment.


def get_num_and_data(args):
    if not args.num:
        print('use test data')
        pre = Preprocessing('digits')
        pre.load_data(filename='test.csv', name='test')
        X_test = torch.tensor(pre.get('test').drop(columns=['0']).values,
                              device=device, dtype=dtype)
        y_test = torch.tensor(pre.get('test')['0'].values,
                              device=device, dtype=dtype)
        # randint is inclusive on both ends, so cap the index at len - 1
        index = random.randint(0, len(y_test) - 1)
        print('index {}'.format(index))
        num = int(y_test[index].item())
        print('real_number {}'.format(num))
        data_to_predict = X_test[index:index + 1, :]
        return num, data_to_predict
    else:
        print('use written images')
        num = int(args.num)
        im_imp = Image_Importer('digits')
        im_imp.load_image_as_grey(num)
        print('real_number {}'.format(num))
        data_to_predict = im_imp.get_image_as_256px_array(num)
        return num, data_to_predict

import argparse

import matplotlib.pyplot as plt
import torch

from models import LogReg
from preprocessing_utils import Preprocessing
from train_utils import TrainClassifier  # assumed module path for TrainClassifier

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Digits')
    parser.add_argument('--s_model', default=True, help='save trained model')
    args = parser.parse_args()

    n_classes = 10
    n_epochs = 200

    pre = Preprocessing('digits')
    pre.load_data(filename='train.csv', name='train')
    X_df = pre.get(name='train').drop(columns=['0'])
    y_df = pre.get(name='train')['0']

    dtype = torch.float
    device = torch.device("cpu")

    model_name = 'logreg_digits'
    # 16x16 digit images flattened to 256 features
    model = LogReg(model_name, 256, n_classes)
    learning_rate = 0.0001
    batch_size = 32
    train_classifier = TrainClassifier(model, X_df, y_df)
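    # Hedged completion of the truncated script, mirroring the IMDB training
    # script below; run_train's return signature is taken from that script.
    trained_model, optimizer, criterion, loss_hist, loss_validate_hist = \
        train_classifier.run_train(n_epochs=n_epochs)
    pre.save_results(loss_hist, loss_validate_hist, model_name)
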
import argparse

from sklearn.model_selection import train_test_split
import torch.nn as nn

from models import LogReg
from preprocessing_utils import Preprocessing
from train_utils import TrainClassifier  # assumed module path for TrainClassifier

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='IMDBData')
    parser.add_argument('--n_feat', default=1000, help='number of features')
    parser.add_argument('--s_model', default=False, help='save trained model')
    args = parser.parse_args()

    pre = Preprocessing('IMDB')
    n_classes = 2
    n_features = int(args.n_feat)
    n_epochs = 100

    pre.load_data(filename=f'training_data_{n_features}.csv',
                  name='training_data')
    X_df = pre.get(name='training_data').drop(columns=['target'])
    y_df = pre.get(name='training_data')['target']

    model = LogReg('log_reg', n_features, n_classes)
    train_classifier = TrainClassifier(model, X_df, y_df)
    trained_model, optimizer, criterion, loss_hist, loss_validate_hist = \
        train_classifier.run_train(n_epochs=n_epochs)
    # name the results after the actual feature count instead of a
    # hard-coded constant
    pre.save_results(loss_hist, loss_validate_hist, f'log_reg_{n_features}')
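    # The --s_model flag is parsed but never used; a hedged sketch of saving,
    # assuming Preprocessing exposes save_model as the counterpart of the
    # load_model call used in the evaluation script:
    if args.s_model:
        pre.save_model(trained_model, f'logreg_{n_features}.pt')
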
import pickle

import torch
import torch.nn as nn

from preprocessing_utils import Preprocessing

if __name__ == '__main__':
    pre = Preprocessing('digits')
    pre.load_data(filename='test.csv', name='test')

    dtype = torch.float
    device = torch.device("cpu")
    X_test = torch.tensor(pre.get('test').drop(columns=['0']).values,
                          device=device, dtype=dtype)
    y_test = torch.tensor(pre.get('test')['0'].values,
                          device=device, dtype=dtype)
    #print(y_test)

    # load the pickled softmax regression model
    filename = "../data/digits/softmax_torch_digits_model.sav"
    model = pickle.load(open(filename, 'rb'))

    y_pred = model(X_test)
    softmax = nn.Softmax(dim=1)
    y_pred = softmax(y_pred).argmax(1)
    #print(y_pred)
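    # Hedged follow-up mirroring the accuracy reporting in the other
    # evaluation scripts:
    accuracy = (y_pred == y_test).float().mean()
    print(f'test accuracy {accuracy.item()}')
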
import torch

from preprocessing_utils import Preprocessing, ModelImporter

if __name__ == '__main__':
    pre = Preprocessing('digits')
    pre.load_data(filename='test.csv', name='test')
    X_df = pre.get(name='test').drop(columns=['0'])
    y_df = pre.get(name='test')['0']

    dtype = torch.float
    device = torch.device("cpu")

    model_name = 'cnn_digits'
    m_importer = ModelImporter('digits')
    model = m_importer.load_nn_model(model_name, 0, 10, 100)

    X_test = model.reshape_data(
        torch.tensor(X_df.values, device=device, dtype=dtype))
    y_test = torch.tensor(y_df.values, device=device, dtype=torch.long)

    y_pred = model(X_test).argmax(1)
    accuracy_soft = (y_pred == y_test).float().mean()
    print(f'test accuracy {accuracy_soft.item()}')

import argparse

import numpy as np
import torch
from sklearn.model_selection import train_test_split

from preprocessing_utils import Preprocessing


def get_mini_batching(X_train, y_train, batch_size):
    # sample a mini-batch of rows without replacement
    idx = np.random.choice(range(len(y_train)), batch_size, replace=False)
    X_train_mini = X_train[idx, :]
    y_train_mini = y_train.view(len(y_train), 1)[idx, :]
    return X_train_mini, y_train_mini.view(len(y_train_mini))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='digits')
    parser.add_argument('--s_model', default=False, help='save trained model')
    args = parser.parse_args()

    pre = Preprocessing('digits')
    pre.load_data(filename='train.csv', name='train')
    X_train_df, X_val_df, y_train_df, y_val_df = train_test_split(
        pre.get('train').drop(columns=['0']),
        pre.get('train')['0'],
        test_size=0.01)

    # transform to torch structures
    dtype = torch.float
    device = torch.device("cpu")
    X_train = torch.tensor(X_train_df.values, device=device, dtype=dtype)
    y_train = torch.tensor(y_train_df.values, device=device, dtype=dtype)

    # Softmax regression model
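    # Hedged sketch of the missing model and training loop: softmax
    # regression is a single linear layer trained with cross-entropy.
    # The 256-feature input matches the 16x16 digits data used elsewhere;
    # learning rate, batch size, and epoch count are assumptions.
    model = torch.nn.Linear(256, 10)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    batch_size = 32
    for epoch in range(200):
        X_mini, y_mini = get_mini_batching(X_train, y_train, batch_size)
        optimizer.zero_grad()
        loss = criterion(model(X_mini), y_mini.long())
        loss.backward()
        optimizer.step()
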
import argparse

import torch

from preprocessing_utils import Preprocessing

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='IMDBData')
    parser.add_argument('--n_feat', default=1000, help='number of features')
    args = parser.parse_args()
    n_feat = int(args.n_feat)

    pre = Preprocessing('IMDB')
    pre.load_data(filename=f'test_data_{n_feat}.csv', name='test_data')
    X_test_df = pre.get(name='test_data').drop(columns=['target'])
    y_test_df = pre.get(name='test_data')['target']

    dtype = torch.float
    device = torch.device("cpu")
    X_test = torch.tensor(X_test_df.values, device=device, dtype=dtype)
    y_test = torch.tensor(y_test_df.values, device=device, dtype=torch.long)

    n_features = X_test.size()[1]
    name = f'logreg_{n_features}.pt'
    model = pre.load_model(name)

    y_pred = model(X_test).argmax(1)
    accuracy_soft = (y_pred == y_test).float().mean()
    #results = (y_test == y_pred)
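    # The accuracy is computed but never reported; a print mirroring the
    # digits evaluation script:
    print(f'test accuracy {accuracy_soft.item()}')
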
dtype = torch.float
device = torch.device("cpu")

model_name = 'cnn_digits_2'
m_importer = ModelImporter('digits')
model = m_importer.load_nn_model(model_name, 0, 10, 100)

image = model.reshape_data(torch.tensor(img_array, device=device, dtype=dtype))
y_test = torch.tensor(int(args.num), device=device, dtype=torch.long)
y_pred = model(image).argmax(1)
accuracy_soft = (y_pred == y_test).float().mean()
print(f'number predicted {y_pred.item()}')

pre = Preprocessing('digits')

# visualize the feature maps detected by the first convolutional layer
detected_patterns = model.get_detected_patterns1()
plt.figure(1, figsize=(20, 10))
for p in range(model.n_patterns1):
    pattern = detected_patterns[0][p].reshape(detected_patterns.shape[2],
                                              detected_patterns.shape[3])
    pattern_np = pattern.detach().numpy().reshape(8, 8)
    plt.subplot(2, 3, 1 + p)
    plt.imshow(pattern_np, cmap='hot', interpolation='none')
pre.save_plt_as_image(plt, f'written_number {args.num}_patterns')

# visualize the feature maps detected by the second convolutional layer
detected_patterns = model.get_detected_patterns2()
plt.figure(2, figsize=(20, 20))  # new figure so the first grid is not overdrawn
for p in range(model.n_patterns2):
    pattern = detected_patterns[0][p].reshape(detected_patterns.shape[2],
                                              detected_patterns.shape[3])
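    # Hedged completion of the truncated loop, mirroring the first-layer
    # visualization; the 8x8 reshape and 2x3 grid are assumptions carried
    # over from the layer-1 block.
    pattern_np = pattern.detach().numpy().reshape(8, 8)
    plt.subplot(2, 3, 1 + p)
    plt.imshow(pattern_np, cmap='hot', interpolation='none')
pre.save_plt_as_image(plt, f'written_number {args.num}_patterns2')
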
"max_df": 1.0, "min_df": 0, "stop_words": [], # keep defaults "analyzer": "word", "tokenizer": None, "lowcase": False } lda_args = { "n_topics": 2, "max_iter": 1, "n_jobs": -1, # Note: thold is more prior than n_topic_words # But works slower "n_topic_words": 4, "word_thold": None } preprocessing_pipeline = Preprocessing() doc2topic = Docs2Topics(feature_extraction_args, lda_args) preprocessed_text = preprocessing_pipeline.preprocess_batch([texts]) topics, words_per_topic = doc2topic.fit_get_topics(preprocessed_text) import pprint pprint.pprint(topics) print(words_per_topic)
import argparse

import matplotlib.pyplot as plt
import torch

from models import AnnAutoencoder
from preprocessing_utils import Preprocessing

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Digits')
    parser.add_argument('--s_model', default=True, help='save trained model')
    args = parser.parse_args()

    n_epochs = 50

    pre = Preprocessing('digits')
    pre.load_data(filename='train.csv', name='train')
    X_train_df = pre.get(name='train').drop(columns=['0'])
    y_train_df = pre.get(name='train')['0']

    dtype = torch.float
    device = torch.device("cpu")

    # hidden layer width and bottleneck size of the autoencoder
    H1 = 128
    n_features = len(X_train_df.columns)
    n_features_encoded = 1
    print(f'features {n_features}')
    print(f'H1 {H1}')
    print(f'n_features_encoded {n_features_encoded}')
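    # Hedged sketch of the step this setup is building toward; the
    # AnnAutoencoder constructor signature is an assumption based on the
    # hyperparameters above, not the project's confirmed API.
    model = AnnAutoencoder('ann_autoencoder', n_features, H1, n_features_encoded)
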
import argparse
import time

import numpy as np
import pandas as pd

from preprocessing_utils import Preprocessing
from text_utils import TextTransformer  # assumed module path for TextTransformer

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='IMDBData')
    parser.add_argument('--n_words', default=100, help='num words')
    parser.add_argument('--s_model', default=False, help='save transform model')
    args = parser.parse_args()
    # argparse yields a string when the flag is passed on the command line
    n_words = int(args.n_words)

    pre = Preprocessing('imdb')
    TRAIN_PATH_POS = 'train/pos/'
    TRAIN_PATH_NEG = 'train/neg/'
    pre.load_all_texts_from_directory(path=TRAIN_PATH_POS, name='raw_pos')
    pre.load_all_texts_from_directory(path=TRAIN_PATH_NEG, name='raw_neg')
    texts_list = pd.concat([pre.data['raw_pos'], pre.data['raw_neg']])

    print('fit & transform training data')
    text_transformer = TextTransformer(num_words=n_words)
    train_features = text_transformer.fit_and_transform(texts_list)
    # label positive reviews 1 and negative reviews 0
    train_features['target'] = np.append(np.ones(len(pre.data['raw_pos'])),
                                         np.zeros(len(pre.data['raw_neg'])))
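    # Hedged sketch of persisting the features for the training scripts,
    # reusing the set/save pattern from the digits preprocessing script;
    # the exact name-to-filename mapping is an assumption.
    pre.set(name=f'training_data_{n_words}', value=train_features)
    pre.save(name=f'training_data_{n_words}')
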
def get_mini_batching(X_train, y_train, batch_size):
    # same mini-batch helper as in the digits training script
    #idx = np.random.randint(0, len(y_train), batch_size)
    idx = np.random.choice(range(len(y_train)), batch_size, replace=False)
    X_train_mini = X_train[idx, :]
    y_train_mini = y_train.view(len(y_train), 1)[idx, :]
    return X_train_mini, y_train_mini.view(len(y_train_mini))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='IMDBData')
    parser.add_argument('--n_feat', default=1000, help='number of features')
    parser.add_argument('--s_model', default=False, help='save trained model')
    args = parser.parse_args()

    pre = Preprocessing('IMDB')
    n_features = int(args.n_feat)
    pre.load_data(filename=f'training_data_{n_features}.csv',
                  name='training_data')
    X_df = pre.get(name='training_data').drop(columns=['target'])
    y_df = pre.get(name='training_data')['target']
    print(X_df.head())

    # going to use torch
    dtype = torch.float
    device = torch.device("cpu")
    X = torch.tensor(X_df.values, device=device, dtype=dtype)
    y = torch.tensor(y_df.values, device=device, dtype=dtype)

from preprocessing_utils import Preprocessing
from sklearn.model_selection import train_test_split

if __name__ == '__main__':
    # load and clean the training split
    pre_train = Preprocessing('digits')
    kwarg = {'header': None, 'sep': ' '}
    pre_train.load_data(filename='zip.train', name='raw', **kwarg)
    pre_train.cleanup(name='raw', drop_duplicates=True,
                      dropna={'axis': 1, 'thresh': 2})
    print(pre_train.get('clean').head())

    #classes = ['0_0.0', '0_1.0', '0_2.0', '0_3.0', '0_4.0', '0_5.0', '0_6.0', '0_7.0', '0_8.0', '0_9.0']
    X = pre_train.get('clean').drop(columns=[0])
    y = pre_train.get('clean')[0]
    pre_train.set(name='train', value=pre_train.get('clean'))
    pre_train.save(name='train')

    # load and clean the test split; the truncated cleanup call is completed
    # to mirror the training split above
    pre_test = Preprocessing('digits')
    kwarg = {'header': None, 'sep': ' '}
    pre_test.load_data(filename='zip.test', name='raw', **kwarg)
    pre_test.cleanup(name='raw', drop_duplicates=True,
                     dropna={'axis': 1, 'thresh': 2})
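    # Hedged completion mirroring the train split: persist the cleaned test
    # set under the name the evaluation scripts load.
    pre_test.set(name='test', value=pre_test.get('clean'))
    pre_test.save(name='test')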