def data_loader(status, shuffle=False, validation=False, num_workers=2):
    """Build DataLoader(s) for the dataset selected by DATA_TYPE.

    Args:
        status: split identifier forwarded to the Preprocessing loaders
            (e.g. 'train' or 'test').
        shuffle: whether the DataLoader(s) reshuffle each epoch.
        validation: if True, split the data by SPLIT_RATE and return
            (train_loader, dev_loader); otherwise return a single loader.
        num_workers: worker processes per DataLoader.

    Returns:
        A single DataLoader, or a (train_loader, dev_loader) pair when
        validation is True.

    Raises:
        ValueError: if DATA_TYPE is not a supported mode.
    """
    prepare_data = Preprocessing()
    if DATA_TYPE == 'original':
        data_X, data_y = prepare_data.original(status)
    elif DATA_TYPE == 'features':
        data_X, data_y = prepare_data.features(status)
    elif DATA_TYPE == 'trans':
        data_X, data_y = prepare_data.trans(status)
    else:
        # Previously fell through and crashed with NameError on data_X
        # below; fail fast with a clear message instead.
        raise ValueError(f"Unsupported DATA_TYPE: {DATA_TYPE!r}")
    data = DealDataset(data_X, data_y)
    size = data.len
    if validation:
        n_train = int(size * SPLIT_RATE)
        train, dev = random_split(data, [n_train, size - n_train])
        # random_split returns Subset views; rewrap them as concrete
        # DealDataset objects so downstream code sees a uniform type.
        train = DealDataset(train[:][0], train[:][1])
        dev = DealDataset(dev[:][0], dev[:][1])
        train_loader = DataLoader(dataset=train, batch_size=BATCH_SIZE,
                                  shuffle=shuffle, num_workers=num_workers)
        dev_loader = DataLoader(dataset=dev, batch_size=BATCH_SIZE,
                                shuffle=shuffle, num_workers=num_workers)
        return train_loader, dev_loader
    loader = DataLoader(dataset=data, batch_size=BATCH_SIZE,
                        shuffle=shuffle, num_workers=num_workers)
    return loader
def main():
    """Entry point: preprocess data, optionally train embeddings, train RNN."""
    # Load and display the current run configuration.
    config = get_configuration()
    print_configuration()

    # Preprocess the corpus into inputs, sequence lengths and labels.
    preprocess = Preprocessing(config=config)
    (train_input, train_length_input, train_labels,
     test_input, test_length_input) = preprocess.prepare_data()

    # Unless we are only doing inference, fit and train the embeddings
    # with the model selected in the configuration.
    if config.mode != "infer":
        emb_cls = GloVeModel if config.emb_model == "glove" else Word2VecModel
        model_emb = emb_cls(config=config, dict_vocab=preprocess.dict_vocab)
        model_emb.fit_to_corpus()
        model_emb.train()

    # Train the recurrent model on the preprocessed data.
    RNN_Model(config, preprocess.dict_vocab_reverse, train_input,
              train_length_input, train_labels, test_input,
              test_length_input)
def __init__(self, name, data='fer2013', file='train.csv'):
    """Store the dataset name, resolve the data directory and load images.

    Args:
        name: dataset identifier; lowercased copy is kept on the instance,
            the raw value is used in the data directory path.
        data: Preprocessing dataset name to load from.
        file: CSV file holding the image data.
    """
    self.image = {}
    self.name = name.lower()
    # Resolve the data directory relative to this module's location.
    root_dir = os.path.dirname(__file__)
    directory_template = '{root_dir}/../../data/{name}/'
    self.directory = directory_template.format(root_dir=root_dir, name=name)
    # Load the image arrays through the shared preprocessing helper.
    loader = Preprocessing(name=data)
    self.data = loader.load_data(name='img_arrays', filename=file)
def main():
    """Entry point: preprocess data and launch the seq2seq model."""
    # Load and display the current run configuration.
    config = get_configuration()
    print_configuration()

    # Build encoder/decoder inputs for the train and test splits.
    preprocess = Preprocessing(config=config)
    (train_input_encoder, train_input_decoder,
     test_input_encoder, test_input_decoder) = preprocess.prepare_data()

    # Initialize model class - train or infer: select mode.
    Seq2seq(config, train_input_encoder, train_input_decoder,
            test_input_encoder, test_input_decoder,
            preprocess.dict_vocab_reverse, mode=None)
def prepare_data(self):
    """Build char-level training data from self.file.

    Populates the char<->index dictionaries, the sliding-window training
    sequences with their target chars, and the vocabulary size.
    """
    preprocessor = Preprocessing()
    # Load the file; it is split into characters by the reader.
    text = preprocessor.read_dataset(self.file)
    # Two lookup tables: char -> index and index -> char.
    self.char_to_idx, self.idx_to_char = preprocessor.create_dictionary(text)
    # Training sentences of length self.window plus the target char
    # that follows each window.
    self.sequences, self.targets = preprocessor.build_sequences_target(
        text, self.char_to_idx, window=self.window)
    # Vocabulary size equals the number of distinct characters seen.
    self.vocab_size = len(self.char_to_idx)
from utils import ModelImporter, Preprocessing
import torch

# Fix: the original guard `if not __name__ == '__main_':` misspells
# '__main__' and inverts the test, so the body ran even on import.
if __name__ == '__main__':
    # Load the public test split and separate pixels from labels.
    pre = Preprocessing('fer2013_DatasetA')
    pre.load_data(filename='test_public_norm.csv', name='test')
    X_df = pre.get(name='test').drop(columns=['emotion'])
    y_df = pre.get(name='test')['emotion']

    # Hyperparameters used to reconstruct the saved model's file name.
    n_classes = 7
    n_totalepochs = 200
    learning_rate = 0.00005
    batch_size = 32
    epoch_n = 40  # for the temp folder
    dtype = torch.float
    device = torch.device("cpu")
    model_name = f'cnn_double_layer_reduced_{learning_rate}_{batch_size}_{n_totalepochs}_{n_classes}'
    model_name_bestvalloss = f'{model_name}_epoch150'

    # Restore the trained network and switch to evaluation mode.
    m_importer = ModelImporter('fer2013_DatasetA')
    model = m_importer.load_nn_model(model_name_bestvalloss)
    model.eval()

    # Predict emotion classes for the whole test set.
    X_test = model.reshape_data(torch.tensor(X_df.values, device=device, dtype=dtype))
    y_test = torch.tensor(y_df.values, device=device, dtype=torch.long)
    y_pred = model(X_test).argmax(1)
    print(y_pred)
import os
import time
import argparse

# Fix: the original guard `if not __name__ == '__main_':` misspells
# '__main__' and inverts the test, so the body ran even on import.
if __name__ == '__main__':
    print('train learning transfer VGG16 DatasetA')
    parser = argparse.ArgumentParser(description='fer2013')
    parser.add_argument('--s_model', default=True, help='save trained model')
    parser.add_argument('--s_patterns', default=False, help='save patterns images')
    args = parser.parse_args()

    current_working_dir = os.getcwd()
    print('current_working_dir: ', current_working_dir)

    # Load training and validation data (gzip-compressed CSVs).
    pre = Preprocessing('fer2013', root_dir=current_working_dir)
    pre.load_data('train_reduced_norm.csv.gz', name='train')
    pre.load_data('test_public_norm.csv.gz', name='val')

    # Carve a 20% subset out of the public-test split and store it back
    # as the validation set.
    X = pre.get('val').drop(columns=['emotion'])
    y = pre.get('val')['emotion']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
    val = pd.DataFrame(X_test)
    val['emotion'] = y_test
    pre.set(name='val', value=val)
    print(pre.get(name='val').head())

    # Pixel-only frames (labels dropped) for the two splits.
    train_pixels = pre.get(name='train').drop(columns=['emotion'])
    val_pixels = pre.get(name='val').drop(columns=['emotion'])
from models import CnnSimple
import time
import matplotlib.pyplot as plt

# Fix: the original guard `if not __name__ == '__main_':` misspells
# '__main__' and inverts the test, so the body ran even on import.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='fer2013')
    parser.add_argument('--s_model', default=True, help='save trained model')
    parser.add_argument('--s_patterns', default=False, help='save patterns images')
    args = parser.parse_args()

    # Load centered/normalized training and validation splits.
    script_root_dir = os.path.dirname(__file__)
    pre = Preprocessing('fer2013', root_dir=script_root_dir)
    pre.load_data(filename='train_reduced_norm_centered.csv', name='train')
    pre.load_data(filename='test_public_norm_centered.csv', name='validate')
    X_train_df = pre.get(name='train').drop(columns=['emotion'])
    y_train_df = pre.get(name='train')['emotion']
    X_val_df = pre.get(name='validate').drop(columns=['emotion'])
    y_val_df = pre.get(name='validate')['emotion']

    # Training hyperparameters; they are baked into the model name.
    n_classes = 7
    n_epochs = 100
    learning_rate = 0.001
    batch_size = 64
    model_name = f'cnn_simple_reduced_bs_{learning_rate}_{batch_size}_{n_epochs}_{n_classes}'
    model = CnnSimple(model_name, d_out=n_classes)
import argparse
from models import CnnSimple
import time
import matplotlib.pyplot as plt

# Fix: the original guard `if not __name__ == '__main_':` misspells
# '__main__' and inverts the test, so the body ran even on import.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='fer2013_DatasetA')
    parser.add_argument('--s_model', default=True, help='save trained model')
    parser.add_argument('--s_patterns', default=False, help='save patterns images')
    args = parser.parse_args()

    # Load DatasetB and split pixels from emotion labels.
    pre = Preprocessing('fer2013')
    pre.load_data(filename='DatasetB.csv', name='train')
    X_df = pre.get(name='train').drop(columns=['emotion'])
    y_df = pre.get(name='train')['emotion']
    dtype = torch.float
    device = torch.device("cpu")

    # Training hyperparameters; they are baked into the model name.
    n_classes = 7
    n_epochs = 100
    learning_rate = 0.0001
    batch_size = 32
    model_name = f'cnn_simple_B_{learning_rate}_{batch_size}_{n_epochs}_{n_classes}'
    model = CnnSimple(model_name, d_out=n_classes)
import pickle
import pdb
from utils import generate_batches
from utils import Preprocessing
from configuration import get_configuration
from configuration import print_configuration
from models.basic_model import BasicModel
from models.seq2seq_model import Seq2SeqModel
from models.seq2seq_model import Seq2SeqModelAttention

# Load configuration and create model
config = get_configuration()
model = Seq2SeqModelAttention(config)

# Prepare vocabulary and triples data
preprocess = Preprocessing(config=config)
preprocess.create_vocabulary("Training_Shuffled_Dataset.txt")
preprocess.prepare_data()

# Preprocess Cornell data
preprocess = Preprocessing(train_path_file="cornell_dataset.txt",
                           test_path_file="Validation_Shuffled_Dataset.txt",
                           train_path_file_target="input_train_cornell",
                           test_path_file_target="input_test_triples",
                           triples=False,
                           config=config)
preprocess.prepare_data()

# Preprocess Twitter data
# NOTE(review): this constructor call is truncated at the end of this
# chunk; its remaining keyword arguments continue beyond this view.
preprocess = Preprocessing(train_path_file="twitter_dataset.txt",
                           test_path_file="Validation_Shuffled_Dataset.txt",
                           train_path_file_target="input_train_twitter",
                           test_path_file_target="input_test_triples",
# Training/runtime configuration for the second-phase experiment.
config['wd'] = 0.0001
config['epoch'] = args.epoch
# config['lr_decay'] = np.arange(2, 50)
config['experiment_name'] = args.net
config['save_prediction_path'] = 'second_phase_data'

# Load the dataset variant selected on the command line.
data_path = 'data%s' % args.data
train_data, train_label, test_data, test_label = get_data(data_path)
# del test_data, test_label

# Run everything inside a per-experiment working directory.
if not os.path.exists(config['experiment_name']):
    os.mkdir(config['experiment_name'])
os.chdir(config['experiment_name'])

# Normalize the test data and wrap it in a non-shuffling loader.
pre = Preprocessing(config['view'], normalize_mode=config['norm_axis'])
c = pre.transform_data(test_data, normalize=True)
c = torch.from_numpy(c)
testset = torch.utils.data.TensorDataset(c)
test_loader = torch.utils.data.DataLoader(testset, batch_size=config['batch_size'], shuffle=False, num_workers=1)

# Select the network architecture by name.
# NOTE(review): more branches may follow beyond this chunk.
if args.net == 'ours':
    net = DTS()
elif args.net == 'unet':
    net = UNet(n_channels=2, n_classes=2)
elif args.net == 'uresnet':
    net = UResNet(num_classes=2, input_channels=2, inplanes=16)
# Quick look at the raw training frame and its missing values.
print(train.head())
col = train.columns
print(train.isna().sum())

# class Preprocessing:
#     def __init__(self, data):
#         self.data = data
#     def preprocess(self):
#         data = self.data
#         train = data.dropna(thresh=2)
#         train = train.drop(['Survived','PassengerId','Cabin', 'Embarked', 'Age', 'Name'], axis=1)
#         train = pd.get_dummies(train)
#         return train
from utils import Preprocessing

# Clean the raw frame and split off the target column.
pr = Preprocessing(train)
X = pr.preprocess()
print(train.columns)
y = train['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a default random forest on the training split.
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

# Fix: the original opened 'model.pkl' without ever closing it.
# A context manager guarantees the handle is closed (and the pickle
# flushed) even if dump() raises.
with open('model.pkl', 'wb') as file:
    pickle.dump(rf, file)
joblib.dump(rf, 'job.pkl')
# file = open('model.pkl', 'rb')
import pandas as pd
import numpy as np
from utils.Preprocessing import *


def load_data(file_path):
    """Read a whitespace-delimited data file into a DataFrame.

    Args:
        file_path: path to a text file whose columns are separated by
            runs of whitespace.

    Returns:
        pandas.DataFrame with the parsed contents.
    """
    # Fix: r"\s+" — the original "\s+" contains the invalid escape
    # sequence "\s" (DeprecationWarning, future SyntaxError).
    data = pd.read_csv(file_path, sep=r"\s+")
    return data


# X = load_data("../data/train/X_train.txt")
# y = load_data("../data/train/y_train.txt")
# y = np.asarray(y.values)
# actionA = X.iloc[np.argwhere(y==5)[:,0]]
# print(len(actionA))

# Report transformed feature shapes for both splits.
a = Preprocessing()
X, Y = a.trans('train')
print(X.shape)
X, Y = a.trans('test')
print(X.shape)
# Training/runtime configuration for this experiment.
# NOTE(review): 'lr_decay' looks like the epochs at which the learning
# rate decays (every epoch from 2 on) — confirm against the trainer.
config['wd'] = 0.0001
config['epoch'] = args.epoch
config['lr_decay'] = np.arange(2, config['epoch'])
config['experiment_name'] = args.net

# Load the dataset variant selected on the command line.
data_path = 'data%s' % args.data
train_data, train_label, test_data, test_label = get_data(data_path)
# del test_data, test_label

torch.manual_seed(config['seed'])

# Run everything inside a per-experiment working directory.
if not os.path.exists(config['experiment_name']):
    os.mkdir(config['experiment_name'])
os.chdir(config['experiment_name'])

# Normalize train pairs and test data with the same preprocessor.
pre = Preprocessing(config['view'], normalize_mode=config['norm_axis'])
a, b = pre.transform_pair(train_data, train_label, normalize=True)
c = pre.transform_data(test_data, normalize=True)

# Keep only samples whose label array has max 1 over axes (1, 2)
# for the primary training set (presumably non-empty masks — confirm).
index = b.max(axis=(1, 2)) == 1
trainset = torch.utils.data.TensorDataset(torch.from_numpy(a[index]), torch.from_numpy(b[index]))
a, b, c = torch.from_numpy(a), torch.from_numpy(b), torch.from_numpy(c)
trainset2 = torch.utils.data.TensorDataset(a, b)  # unfiltered training set
testset = torch.utils.data.TensorDataset(c)
valset = torch.utils.data.TensorDataset(a)
train_loader = torch.utils.data.DataLoader(trainset, batch_size=config['batch_size'], shuffle=True, num_workers=1)
if not __name__ == '__main_': parser = argparse.ArgumentParser(description='fer2013') parser.add_argument('--s_model', default=True, help='save trained model') parser.add_argument('--s_patterns', default=False, help='save patterns images') args = parser.parse_args() n_classes = 7 n_epochs = 300 learning_rate = 0.0001 batch_size = 32 pre = Preprocessing('fer2013') pre.load_data(filename='DatasetC.csv', name='train') X_df = pre.get(name='train').drop(columns=['emotion']) y_df = pre.get(name='train')['emotion'] dtype = torch.float device = torch.device("cpu") model_name = f'cnn_double_layer_C_bs_{learning_rate}_{batch_size}_{n_epochs}_{n_classes}' model = CnnDoubleLayer(model_name, d_out=n_classes) model.train() train_classifier = TrainClassifier2(model, X_df, y_df) t = time.time() trained_model, optimizer, criterion, \
from sklearn import preprocessing
from sklearn.svm import SVC
from utils.Preprocessing import *
from utils.Plot import *
from utils.constants import *

# Extract statistical features for both splits.
prepare_data = Preprocessing()
train_X, train_y = prepare_data.statistics("train")
test_X, test_y = prepare_data.statistics("test")
# train_X = preprocessing.scale(train_X)
# test_X = preprocessing.scale(test_X)

# Fit an SVM classifier and plot the confusion heatmap.
# np.ravel flattens the label column into the 1-D array SVC expects.
clf = SVC(gamma='auto')
clf.fit(train_X, np.ravel(train_y))
pred_y = clf.predict(test_X)
heatmap(test_y, pred_y, "SVM" + str(NUM_FEATURES_USED))
import torch
import argparse
from models import AnnAutoencoder
import matplotlib.pyplot as plt

# Fix: the original guard `if not __name__ == '__main_':` misspells
# '__main__' and inverts the test, so the body ran even on import.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='fer2013_DatasetA')
    parser.add_argument('--s_model', default=True, help='save trained model')
    args = parser.parse_args()
    n_epochs = 100

    # Load DatasetA and split off the emotion labels.
    pre = Preprocessing('fer2013_DatasetA')
    pre.load_data(filename='DatasetA.csv', name='train')
    X_train_df = pre.get(name='train').drop(columns=['emotion'])
    y_train_df = pre.get(name='train')['emotion']
    dtype = torch.float
    device = torch.device("cpu")

    # Autoencoder layer sizes: input -> H1 -> bottleneck.
    H1 = 1764
    n_features = len(X_train_df.columns)
    n_features_encoded = 1296
    print(f'features {n_features}')
    print(f'H1 {H1}')
    print(f'n_features_encoded {n_features_encoded}')
if not __name__ == '__main_': parser = argparse.ArgumentParser(description='fer2013') parser.add_argument('--s_model', default=True, help='save trained model') parser.add_argument('--s_patterns', default=False, help='save patterns images') args = parser.parse_args() n_classes = 7 n_epochs = 300 learning_rate = 0.0001 batch_size = 32 pre = Preprocessing('fer2013') pre.load_data(filename='DatasetD.csv', name='train') X_df = pre.get(name='train').drop(columns=['emotion']) y_df = pre.get(name='train')['emotion'] dtype = torch.float device = torch.device("cpu") model_name = f'cnn_double_layer_D_bs_{learning_rate}_{batch_size}_{n_epochs}_{n_classes}' model = CnnDoubleLayer(model_name, d_out=n_classes) model.train() train_classifier = TrainClassifier2(model, X_df, y_df) t = time.time() trained_model, optimizer, criterion, loss_hist, loss_val_hist, f1_val_hist = train_classifier.run_train(
import matplotlib.pyplot as plt
import os

# Fix: the original guard `if not __name__ == '__main_':` misspells
# '__main__' and inverts the test, so the body ran even on import.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='fer2013')
    parser.add_argument('--s_model', default=True, help='save trained model')
    parser.add_argument('--s_patterns', default=False, help='save patterns images')
    args = parser.parse_args()

    # Load the reduced, normalized training split.
    root_dir = os.path.dirname(__file__)
    pre = Preprocessing('fer2013', root_dir)
    pre.load_data(filename='train_reduced_norm.csv', name='train')
    X_df = pre.get(name='train').drop(columns=['emotion'])
    y_df = pre.get(name='train')['emotion']
    dtype = torch.float
    device = torch.device("cpu")

    # Training hyperparameters; they are baked into the model name.
    n_classes = 7
    n_epochs = 100
    learning_rate = 0.0001
    batch_size = 32
    model_name = f'cnn_simple_reduced_{learning_rate}_{batch_size}_{n_epochs}_{n_classes}'
    model = CnnSimple(model_name, d_out=n_classes)
parser = argparse.ArgumentParser(description='fer2013')
parser.add_argument('--s_model', default=True, help='save trained model')
parser.add_argument('--s_patterns', default=False, help='save patterns images')
args = parser.parse_args()

# Training hyperparameters; they are baked into the model name.
n_classes = 7
n_epochs = 200
learning_rate = 0.001
batch_size = 64

current_working_dir = os.getcwd()
print('current_working_dir: ', current_working_dir)

# Load DatasetD for training and the public test split for validation.
pre = Preprocessing('fer2013', root_dir=current_working_dir)
pre.load_data(filename='DatasetD.csv', name='train')
pre.load_data(filename='test_public_norm.csv', name='validate')
X_train_df = pre.get(name='train').drop(columns=['emotion'])
y_train_df = pre.get(name='train')['emotion']
X_val_df = pre.get(name='validate').drop(columns=['emotion'])
y_val_df = pre.get(name='validate')['emotion']
dtype = torch.float

model_name = f'cnn_triple_layer_D_bs_{learning_rate}_{batch_size}_{n_epochs}_{n_classes}'
model = CnnTripleLayer(model_name, d_out=n_classes)
model.train()
# NOTE(review): this call is truncated at the end of this chunk;
# its remaining arguments continue beyond this view.
train_classifier = TrainClassifier2(model,
from models.seq2seq_model import Seq2SeqModel
from models.seq2seq_model import Seq2SeqModelAttention
from configuration import get_configuration
from utils import Preprocessing

# Initialize the model
config = get_configuration()
preprocess = Preprocessing(config=config)
model = Seq2SeqModelAttention(config)
# Checkpoint holding the trained weights to restore at inference time.
checkpoint_file = 'runs/baseline-cornell-twitter-attn-dropout/model-18000'

# Launch chat interface
print( "*** Hi there. Ask me a question. I will try my best to reply to you with something intelligible.\ If you think that is not happening, enter \"q\" and quit ***")

# Read-eval loop: tokenize the query, run inference, print the reply.
# Entering "q" exits.
query = input(">")
while query != "q":
    # Tokenize the query
    # NOTE(review): initialize_vocabulary() is re-run on every turn and
    # looks loop-invariant — consider hoisting once confirmed idempotent.
    preprocess.initialize_vocabulary()
    token_ids = preprocess.sentence_to_token_ids(query)
    # Reverse the token ids and feed into the RNN
    reverse_token_ids = [list(reversed(token_ids))]
    output_tokens = model.infer(checkpoint_file, reverse_token_ids, verbose=False)
    # Convert token ids back to words and print to output
    output = preprocess.token_ids_to_sentence(output_tokens)
    print(output[0])
    query = input(">")