def nn_inference_pipeline(model_name: Text,
                          model_path: Text,
                          data_params: Dict,
                          dataloader_params: Dict,
                          save_dir=None):
    """Evaluate a saved network on the 'valid' split and log the report.

    Args:
        model_name: Name handed to ``init_model``, ``dataset.postprocessing``
            and ``save_test``.
        model_path: Location passed to the ``load`` of the object returned
            by ``init_model``.
        data_params: Options forwarded to ``preprocessing_pipeline``.
        dataloader_params: Options forwarded to ``generate_dataloader``.
        save_dir: When truthy, the report is also persisted via ``save_test``.

    Returns:
        None. The report is logged and optionally saved.
    """
    model = init_model(model_name).load(model_path)

    # Other call sites unpack a fourth value from preprocessing_pipeline;
    # anything beyond the first three is not needed here, so it is ignored
    # instead of raising on a length mismatch.
    x_test, y_test, dataset, *_ = preprocessing_pipeline(data_params)
    dataloader = generate_dataloader(x_test, y_test, dataloader_params)

    # Run inference once per batch. Element 0 of each result is used as the
    # prediction and element 1 as the reference label; taking both from the
    # same call keeps them aligned even if the loader reorders batches
    # between iterations, and halves the inference cost.
    outputs = [model.predict(batch) for batch in dataloader['valid']]
    pred = [out[0] for out in outputs]
    y_true = [out[1] for out in outputs]

    post_pred = dataset.postprocessing(pred, model_name)
    report_df = report(y_true, post_pred)

    logger.info(f' > Model: {model.name}')
    logger.info(f' > Dataset: {dataset.__class__.__name__}')
    logger.info(f' > Test result: \n {report_df}')

    if save_dir:
        save_test(model_name, data_params, report_df, save_dir)
def inference_without_trained_model(model_name: Text,
                                    data_params: Dict,
                                    model_params: Dict,
                                    save_dir=None):
    """Build a model without fitting it, predict on the data, and report.

    Args:
        model_name: Name handed to ``init_model`` and to the model factory.
        data_params: Options forwarded to ``preprocessing_pipeline``.
        model_params: Options forwarded to the model factory.
        save_dir: When truthy, parameters and report are persisted via
            ``save_model_data`` (with ``None`` in the model slot).

    Returns:
        The constructed model instance.
    """
    features, targets, dataset, _ = preprocessing_pipeline(data_params)

    build_model = init_model(model_name)
    model = build_model(model_name, model_params)

    # Post-processing is delegated to the model itself in this pipeline.
    raw_pred = model.predict(features)
    final_pred = model.postprocessing(raw_pred)
    report_df = report(targets, final_pred)

    logger.info(f' > Model: {model.name}')
    logger.info(f' > Dataset: {dataset.__class__.__name__}')
    logger.info(f' > Test result: \n {report_df}')

    if not save_dir:
        return model

    save_model_data(None,
                    data_params,
                    model_params,
                    report_df,
                    save_dir=save_dir)
    return model
def model_training(model_name: Text,
                   data_params: Dict,
                   model_params: Dict,
                   seed: int,
                   save_dir=None):
    """Fit a model on a shuffled train split and report on the held-out part.

    Args:
        model_name: Name handed to ``init_model``, the model factory and
            ``dataset.postprocessing``.
        data_params: Options forwarded to ``preprocessing_pipeline``; its
            ``'test_size'`` entry sets the held-out fraction.
        model_params: Options forwarded to the model factory.
        seed: ``random_state`` used for the train/test split.
        save_dir: When truthy, model, parameters and report are persisted
            via ``save_model_data``.

    Returns:
        The fitted model instance.
    """
    # The fourth return value is not used by this pipeline.
    features, targets, dataset, _ = preprocessing_pipeline(data_params)

    split = train_test_split(features,
                             targets,
                             test_size=data_params['test_size'],
                             shuffle=True,
                             random_state=seed)
    x_train, x_test, y_train, y_test = split

    build_model = init_model(model_name)
    model = build_model(model_name, model_params)
    model.fit(x_train, y_train)

    raw_pred = model.predict(x_test)
    final_pred = dataset.postprocessing(raw_pred, model_name)
    report_df = report(y_test, final_pred)

    logger.info(f' > Model: {model.name}')
    logger.info(f' > Test result: \n {report_df}')

    if not save_dir:
        return model

    save_model_data(model,
                    data_params,
                    model_params,
                    report_df,
                    save_dir=save_dir)
    return model
def model_training_nn(model_name: Text,
                      data_params: Dict,
                      dataloader_params: Dict,
                      model_params: Dict,
                      save_dir=None):
    """Train a network model and report on the dataloader's 'valid' split.

    Args:
        model_name: Name handed to ``init_model``, the model factory and
            ``dataset.postprocessing``.
        data_params: Options forwarded to ``preprocessing_pipeline``.
        dataloader_params: Options forwarded to ``generate_dataloader``.
        model_params: Options forwarded to the model factory. Updated in
            place: ``'dataloader'`` and ``['network']['n_words']`` are set
            before the model is built.
        save_dir: Currently unused; persisting the trained model is
            disabled in this function.

    Returns:
        The trained model instance.
    """
    x, y, dataset, vectorizer = preprocessing_pipeline(data_params)
    dataloader = generate_dataloader(x, y, dataloader_params)

    # The model reads its data and vocabulary size from model_params.
    model_params['dataloader'] = dataloader
    model_params['network']['n_words'] = (
        vectorizer.get_n_words() if vectorizer is not None else None)

    model_func = init_model(model_name)
    model = model_func(model_name, model_params)
    model.fit()

    # Collect predictions and targets in one pass so they stay aligned even
    # if the loader yields batches in a different order on each iteration.
    # Each batch is flattened on its own and then concatenated: calling
    # np.ravel on the list of batches fails (or yields an object array)
    # when the final batch is shorter than the others.
    batch_preds = []
    batch_targets = []
    for batch in dataloader['valid']:
        batch_preds.append(np.ravel(model.predict(batch)))
        batch_targets.append(np.ravel(batch[1].numpy()))

    # np.concatenate rejects an empty list; fall back to an empty array.
    y_pred = np.concatenate(batch_preds) if batch_preds else np.ravel([])
    y_true = np.concatenate(batch_targets) if batch_targets else np.ravel([])

    y_pred_post = dataset.postprocessing(y_pred, model_name)
    report_df = report(y_true, y_pred_post)

    logger.info(f' > Model: {model.name}')
    logger.info(f' > Test result: \n {report_df}')

    # NOTE: saving via save_model_data(model, data_params, model_params,
    # report_df, save_dir=save_dir) is intentionally left disabled here.
    return model
def test_network():
    """Train a Conv1D_Network on the cleaned movie data for 10 epochs.

    Builds the data, dataloader, network and NetworkModel by hand and runs
    training; nothing is asserted and nothing is returned.
    """
    seed = 2021
    emb_dim = 100
    epochs = 10
    learning_rate = 0.001

    # Disabled options: 'target_scaling': (0, 1);
    # 'vector_params': {'ngram_range': (1, 3), 'max_features': emb_dim};
    # 'imb_params': {'random_state': seed, 'k_neighbors': 3}.
    dataset_params = {
        'data_path': 'resources/preprocessed_data/cleaned_data_v1.csv',
        'dataset_type': MOVIE_DATASET,
        'preprocessed': True,
        'vectorization': TOKENIZER,
        'imbalance': None,
        'train': True,
    }
    dataloader_params = dict(split_size=0.7,
                             shuffle=True,
                             batch_size=32,
                             random_seed=seed)
    network_params = dict(emb_dim=emb_dim,
                          dataset_type=TOKENIZER,
                          kernel_size=[3, 5, 7],
                          out_channels=30,
                          batch_size=32,
                          stride=1,
                          padding=[0, 1, 2],
                          pooling_kernel=2,
                          dropout=0.4)

    x, y, dataset, vectorizer = preprocessing_pipeline(dataset_params)
    # The vocabulary size is only known once the vectorizer exists.
    network_params['n_words'] = vectorizer.get_n_words()
    dataloader = generate_dataloader(x, y, dataloader_params)

    network = Conv1D_Network(network_params)
    criterion = nn.BCELoss()
    # The optimizer class is passed uninstantiated; _init_optimizer is then
    # called with the learning rate.
    model = NetworkModel(network, dataloader, criterion, torch.optim.Adam)
    model._init_optimizer(learning_rate)
    model.train(epochs)
def test_bert_model(self):
    """Train Pretrained_Bert_Model on the SST dataset on CPU.

    Sets the root logger to INFO, builds data, dataloader and network, and
    runs training for 10 epochs with a patience of 5; nothing is asserted.
    """
    logging.getLogger().setLevel(logging.INFO)

    seed = 2021
    epochs = 10
    learning_rate = 0.001
    save_dir = 'resources/models/'
    patience = 5

    dataset_params = dict(data_path=None,
                          dataset_type=SST_DATASET,
                          preprocessed=False,
                          vectorization=None,
                          imbalance=None,
                          train=True)
    dataloader_params = dict(split_size=0.7,
                             shuffle=True,
                             batch_size=64,
                             random_seed=seed)
    device = torch.device('cpu:0')
    network_params = dict(dropout=0.2, device=device)

    x, y, dataset, vectorizer = preprocessing_pipeline(dataset_params)
    dataloader = generate_dataloader(x, y, dataloader_params, Bert_NN_Dataset)

    network = Pretrained_Bert_Network(network_params).to(device)
    criterion = nn.BCELoss()
    # AdamW is passed as a class; _init_optimizer is then called with the
    # learning rate.
    model = Pretrained_Bert_Model(network, dataloader, criterion, AdamW,
                                  save_dir, device)
    model._init_optimizer(learning_rate)
    model.train(epochs, patience=patience)
def test_tokenizing(self):
    """Check that tokenized features and targets have the same length."""
    seed = 2021
    # Disabled option: 'target_scaling': (0, 1).
    dataset_params = {
        'data_path': 'resources/preprocessed_data/cleaned_data_v1.csv',
        'dataset_type': MOVIE_DATASET,
        'preprocessed': True,
        'vectorization': TOKENIZER,
        'imbalance': None,
        'imb_params': {'random_state': seed, 'k_neighbors': 3},
        'train': True,
    }

    # Only features and targets are checked. The remaining return values
    # (other call sites unpack either three or four items) are ignored so
    # the test does not fail on the tuple length alone.
    x, y, *_ = preprocessing_pipeline(dataset_params)

    self.assertEqual(len(x), len(y))
def inference_pipeline(model_name: Text,
                       model_path: Text,
                       data_params: Dict,
                       save_dir=None):
    """Evaluate a saved model on preprocessed data and log the report.

    Args:
        model_name: Name handed to ``init_model``, ``dataset.postprocessing``
            and ``save_test``.
        model_path: Location passed to the ``load`` of the object returned
            by ``init_model``.
        data_params: Options forwarded to ``preprocessing_pipeline``.
        save_dir: When truthy, the report is also persisted via ``save_test``.

    Returns:
        None. The report is logged and optionally saved.
    """
    model = init_model(model_name).load(model_path)

    # Other call sites unpack a fourth value from preprocessing_pipeline;
    # anything beyond the first three is not needed here, so it is ignored
    # instead of raising on a length mismatch.
    x_test, y_test, dataset, *_ = preprocessing_pipeline(data_params)

    pred = model.predict(x_test)
    post_pred = dataset.postprocessing(pred, model_name)
    report_df = report(y_test, post_pred)

    logger.info(f' > Model: {model.name}')
    logger.info(f' > Dataset: {dataset.__class__.__name__}')
    logger.info(f' > Test result: \n {report_df}')

    if save_dir:
        save_test(model_name, data_params, report_df, save_dir)
def test_train_model(self):
    """Train ConvModel with a Conv1D_Network on the cleaned movie data.

    Sets the root logger to INFO, builds data, dataloader and network on
    CPU, and runs training for 10 epochs with a patience of 5; nothing is
    asserted.
    """
    logging.getLogger().setLevel(logging.INFO)

    seed = 2021
    # Only referenced by the disabled 'vector_params' option below.
    emb_dim = 100
    epochs = 10
    learning_rate = 0.001
    save_dir = 'resources/models/'
    patience = 5

    # Disabled options: 'target_scaling': (0, 1);
    # 'vector_params': {'ngram_range': (1, 3), 'max_features': emb_dim};
    # 'imb_params': {'random_state': seed, 'k_neighbors': 3}.
    dataset_params = {
        'data_path': 'resources/preprocessed_data/cleaned_data_v1.csv',
        'dataset_type': MOVIE_DATASET,
        'preprocessed': True,
        'vectorization': TOKENIZER,
        'imbalance': None,
        'train': True,
    }
    dataloader_params = dict(split_size=0.7,
                             shuffle=True,
                             batch_size=64,
                             random_seed=seed,
                             dataset_class=NN_Dataset)
    device = torch.device('cpu:0')
    network_params = dict(emb_dim=MAX_WORD_SENTENCE,
                          dataset_type=TOKENIZER,
                          kernel_size=[5, 7, 9],
                          out_channels=10,
                          stride=1,
                          padding=[0, 1, 2],
                          pooling_kernel=2,
                          dropout=0.2,
                          device=device)

    x, y, dataset, vectorizer = preprocessing_pipeline(dataset_params)
    # The vocabulary size is only known once the vectorizer exists.
    network_params['n_words'] = vectorizer.get_n_words()
    dataloader = generate_dataloader(x, y, dataloader_params)

    network = Conv1D_Network(network_params).to(device)
    criterion = nn.CrossEntropyLoss()
    # The optimizer class is passed uninstantiated; _init_optimizer is then
    # called with the learning rate.
    model = ConvModel(network, dataloader, criterion, torch.optim.Adam,
                      save_dir)
    model._init_optimizer(learning_rate)
    model.train(epochs, patience=patience)