def get_supervised_result(model, train_iterator, val_iterator, test_iterator,
                          EPOCHS=5, cls_thresh=None,
                          n_classes=cfg['data']['num_classes']):
    """ Train a model in fully supervised mode and evaluate on test data.

    Args:
        model: classifier to be trained by `trainer`.
        train_iterator: labelled training batches.
        val_iterator: labelled validation batches.
        test_iterator: labelled test batches used for the final evaluation.
        EPOCHS: number of training epochs.
        cls_thresh: per-class decision thresholds; defaults to 0.5 each.
        n_classes: number of output classes.

    Returns:
        (result, model_best): performance metrics on the test set and the
        best model selected during training.
    """
    model_best, val_preds_trues_best, val_preds_trues_all, losses = trainer(
        model, train_iterator, val_iterator, N_EPOCHS=EPOCHS)
    # logger.debug(losses)

    ## Evaluate the selected best model on the held-out test split:
    test_loss, test_preds_trues = predict_with_label(model_best, test_iterator)

    ## Default to a 0.5 threshold for every class when none supplied:
    if cls_thresh is None:
        cls_thresh = [0.5 for _ in range(n_classes)]

    ## Turn raw prediction scores into hard class labels:
    predicted_labels = logit2label(
        DataFrame(test_preds_trues['preds'].numpy()), cls_thresh,
        drop_irrelevant=False)

    result = calculate_performance_pl(test_preds_trues['trues'],
                                      predicted_labels)

    logger.info("Supervised result: {}".format(dumps(result, indent=4)))
    return result, model_best
def test_graph_classifier(model: GAT_Graph_Classifier, loss_func,
                          data_loader: torch.utils.data.dataloader.DataLoader):
    """ Evaluate a GAT graph classifier on a labelled data loader.

    Args:
        model: trained GAT_Graph_Classifier; switched to eval mode here.
        loss_func: criterion taking (prediction, label),
            e.g. BCEWithLogitsLoss.
        data_loader: yields (graph_batch, label) pairs.

    Returns:
        (losses, test_output): mean loss over all batches, and a dict with
        'preds' (thresholded class labels), 'trues' and 'result' (metrics
        from calculate_performance).
    """
    model.eval()
    preds = []
    trues = []
    losses = []
    ## FIX: the original shadowed the builtin `iter` with an enumerate index
    ## that was never used; iterate the loader directly instead.
    for graph_batch, label in data_loader:
        ## Store emb in a separate file as self_loop removes emb info:
        emb = graph_batch.ndata['emb']
        # graph_batch = dgl.add_self_loop(graph_batch)
        prediction = model(graph_batch, emb)
        loss = loss_func(prediction, label)
        preds.append(prediction.detach())
        trues.append(label.detach())
        losses.append(loss.detach())
    losses = torch.mean(torch.stack(losses))
    preds = torch.cat(preds)

    ## Converting raw scores to probabilities using Sigmoid:
    preds = torch.sigmoid(preds)

    ## Converting probabilities to class labels (preds already detached):
    preds = logit2label(preds, cls_thresh=0.5)
    trues = torch.cat(trues)
    result_dict = calculate_performance(trues, preds)
    test_output = {'preds': preds, 'trues': trues, 'result': result_dict}
    # logger.info(dumps(result_dict, indent=4))

    return losses, test_output
def eval_graph_classifier(model: GAT_GCN_Classifier, G, X, loss_func,
                          data_loader: utils.data.dataloader.DataLoader,
                          n_classes=cfg['data']['num_classes'],
                          save_gcn_embs=False):
    """ Evaluate a GAT+GCN classifier on a labelled data loader.

    Args:
        model: trained GAT_GCN_Classifier; switched to eval mode here.
        G: global token graph (moved to `device` when CUDA is enabled).
        X: global token features (moved to `device` when CUDA is enabled).
        loss_func: criterion taking (prediction, label).
        data_loader: yields (graph_batch, local_ids, label, global_ids,
            node_counts) tuples.
        n_classes: 1 selects binary sklearn metrics, otherwise multi-label.
        save_gcn_embs: when True, saves X to 'X_glove.pt' and forwards the
            flag to the model.

    Returns:
        (losses, test_output): mean loss over batches and a dict with
        'preds', 'trues' and 'result'.
    """
    model.eval()
    preds = []
    trues = []
    losses = []
    ## Hoisted loop-invariant work: the CUDA flag, the G/X device moves and
    ## the X save were all re-executed for every batch in the original.
    use_cuda = cfg['model']['use_cuda'][plat][user] and cuda.is_available()
    if use_cuda:
        G = G.to(device)
        X = X.to(device)
    if save_gcn_embs:
        save(X, 'X_glove.pt')
    ## FIX: do not shadow the builtin `iter`; the index was unused.
    for graph_batch, local_ids, label, global_ids, node_counts in data_loader:
        ## Store emb in a separate file as self_loop removes emb info:
        emb = graph_batch.ndata['emb']
        # graph_batch = dgl.add_self_loop(graph_batch)
        if use_cuda:
            graph_batch = graph_batch.to(device)
            emb = emb.to(device)
            # local_ids = local_ids.to(device)
            # node_counts = node_counts.to(device)
            # global_ids = global_ids.to(device)
        start_time = timeit.default_timer()
        prediction = model(graph_batch, emb, local_ids, node_counts,
                           global_ids, G, X, save_gcn_embs)
        test_time = timeit.default_timer() - start_time
        test_count = label.shape[0]
        logger.info(f"Test time per example: [{test_time / test_count} sec]")
        ## Single-logit outputs need an explicit class dimension:
        if prediction.dim() == 1:
            prediction = prediction.unsqueeze(1)
        if use_cuda:
            prediction = prediction.to(device)
        loss = loss_func(prediction, label)
        preds.append(prediction.detach())
        trues.append(label.detach())
        losses.append(loss.detach())
    losses = mean(stack(losses))
    preds = cat(preds)

    ## Converting raw scores to probabilities using Sigmoid:
    preds = sigmoid(preds)

    ## Converting probabilities to class labels:
    preds = logit2label(preds, cls_thresh=0.5)
    trues = cat(trues)
    if n_classes == 1:
        result_dict = calculate_performance_bin_sk(trues, preds)
    else:
        result_dict = calculate_performance(trues, preds)
    test_output = {'preds': preds, 'trues': trues, 'result': result_dict}
    # logger.info(dumps(result_dict, indent=4))

    return losses, test_output
def train_graph_classifier(
        model: GAT_Graph_Classifier,
        data_loader: torch.utils.data.dataloader.DataLoader,
        loss_func: torch.nn.modules.loss.BCEWithLogitsLoss,
        optimizer, epochs: int = 5,
        eval_data_loader: torch.utils.data.dataloader.DataLoader = None):
    """ Train a GAT graph classifier, optionally evaluating every epoch.

    Args:
        model: GAT_Graph_Classifier to optimize.
        data_loader: yields (graph_batch, label) training pairs.
        loss_func: criterion, e.g. BCEWithLogitsLoss.
        optimizer: torch optimizer bound to `model`'s parameters.
        epochs: number of passes over `data_loader`.
        eval_data_loader: optional labelled loader; evaluated each epoch
            when provided.

    Returns:
        (train_epoch_losses, train_epoch_dict): per-epoch mean losses and
        an OrderedDict of per-epoch preds/trues/metrics.
    """
    train_epoch_losses = []
    train_epoch_dict = OrderedDict()
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        preds = []
        trues = []
        ## FIX: avoid shadowing builtin `iter` and relying on the loop
        ## variable after the loop; count batches explicitly.
        batch_count = 0
        for graph_batch, label in data_loader:
            ## Store emb in a separate file as self_loop removes emb info:
            emb = graph_batch.ndata['emb']
            # graph_batch = dgl.add_self_loop(graph_batch)
            prediction = model(graph_batch, emb)
            loss = loss_func(prediction, label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.detach().item()
            preds.append(prediction.detach())
            trues.append(label.detach())
            batch_count += 1
        epoch_loss /= batch_count
        ## FIX: eval_data_loader defaults to None; the original called
        ## test_graph_classifier unconditionally and crashed in that case.
        if eval_data_loader is not None:
            losses, test_output = test_graph_classifier(
                model, loss_func=loss_func, data_loader=eval_data_loader)
            logger.info(
                f"Epoch {epoch}, Train loss {epoch_loss}, Eval loss {losses},"
                f" Macro F1 {test_output['result']['f1']['macro'].item()}")
            # logger.info(dumps(test_output['result'], indent=4))
        else:
            logger.info(f"Epoch {epoch}, Train loss {epoch_loss}")
        train_epoch_losses.append(epoch_loss)
        preds = torch.cat(preds)

        ## Converting raw scores to probabilities using Sigmoid:
        preds = torch.sigmoid(preds)

        ## Converting probabilities to class labels:
        preds = logit2label(preds.detach(), cls_thresh=0.5)
        trues = torch.cat(trues)
        result_dict = calculate_performance(trues, preds)
        # logger.info(dumps(result_dict, indent=4))
        train_epoch_dict[epoch] = {
            'preds': preds,
            'trues': trues,
            'result': result_dict
        }
        # logger.info(f'Epoch {epoch} result: \n{result_dict}')

    return train_epoch_losses, train_epoch_dict
def train_graph_classifier(
        model, G, X,
        data_loader: utils.data.dataloader.DataLoader,
        loss_func: nn.modules.loss.BCEWithLogitsLoss,
        optimizer, epochs: int = 5,
        eval_data_loader: utils.data.dataloader.DataLoader = None,
        test_data_loader: utils.data.dataloader.DataLoader = None,
        n_classes=cfg['data']['num_classes']):
    """ Train a GAT+GCN graph classifier, evaluating on val and test each epoch.

    Args:
        model: classifier whose forward takes (graph_batch, emb, local_ids,
            node_counts, global_ids, G, X).
        G: global token graph; moved to `device` when CUDA is enabled.
        X: global token features; moved to `device` when CUDA is enabled.
        data_loader: yields (graph_batch, local_ids, label, global_ids,
            node_counts) training tuples.
        loss_func: criterion, e.g. BCEWithLogitsLoss.
        optimizer: torch optimizer bound to `model`'s parameters.
        epochs: number of training epochs.
        eval_data_loader: validation loader evaluated every epoch.
            NOTE(review): defaults to None but is passed unconditionally to
            eval_graph_classifier — callers must supply it; confirm intent.
        test_data_loader: test loader evaluated every epoch (same caveat).
        n_classes: 1 selects binary sklearn metrics, otherwise multi-label.

    Returns:
        (train_epoch_losses, train_epoch_dict): per-epoch mean losses and
        an OrderedDict of per-epoch preds/trues/metrics.
    """
    logger.info("Started training...")
    train_epoch_losses = []
    train_epoch_dict = OrderedDict()
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        preds = []
        trues = []
        # NOTE: `iter` shadows the builtin and is read after the loop to
        # compute the per-epoch mean loss; an empty loader would raise here.
        for iter, (graph_batch, local_ids, label, global_ids,
                   node_counts) in enumerate(data_loader):
            ## Store emb in a separate file as self_loop removes emb info:
            emb = graph_batch.ndata['emb']
            # graph_batch = dgl.add_self_loop(graph_batch)
            if cfg['model']['use_cuda'][plat][user] and cuda.is_available():
                graph_batch = graph_batch.to(device)
                emb = emb.to(device)
                # local_ids = local_ids.to(device)
                # node_counts = node_counts.to(device)
                # global_ids = global_ids.to(device)
                # G/X rebinding repeats every batch; a no-op after the first
                # iteration once they already live on `device`.
                G = G.to(device)
                X = X.to(device)
            start_time = timeit.default_timer()
            prediction = model(graph_batch, emb, local_ids, node_counts,
                               global_ids, G, X)
            # if epoch == 30:
            #     from evaluations import get_freq_disjoint_token_vecs, plot
            #     glove_vecs = get_freq_disjoint_token_vecs(S_vocab, T_vocab, X)
            #     plot(glove_vecs)
            if cfg['model']['use_cuda'][plat][user] and cuda.is_available():
                prediction = prediction.to(device)
            # Single-logit outputs need an explicit class dimension for the loss:
            if prediction.dim() == 1:
                prediction = prediction.unsqueeze(1)
            loss = loss_func(prediction, label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_time = timeit.default_timer() - start_time
            train_count = label.shape[0]
            logger.info(
                f"Training time per example: [{train_time / train_count} sec]")
            logger.info(f"Iteration {iter}, loss: {loss.detach().item()}")
            epoch_loss += loss.detach().item()
            preds.append(prediction.detach())
            trues.append(label.detach())
        # Mean training loss over the (iter + 1) batches seen this epoch:
        epoch_loss /= (iter + 1)
        val_losses, val_output = eval_graph_classifier(
            model, G, X, loss_func=loss_func, data_loader=eval_data_loader)
        logger.info(f'val_output: \n{dumps(val_output["result"], indent=4)}')
        test_losses, test_output = eval_graph_classifier(
            model, G, X, loss_func=loss_func, data_loader=test_data_loader)
        logger.info(f'test_output: \n{dumps(test_output["result"], indent=4)}')
        logger.info(
            f"Epoch {epoch}, Train loss {epoch_loss}, val loss "
            f"{val_losses}, test loss {test_losses}, Val Macro F1 "
            f"{val_output['result']['f1']['macro'].item()} Test Macro F1"
            f" {test_output['result']['f1']['macro'].item()}")
        # logger.info(f"Epoch {epoch}, Train loss {epoch_loss}, val loss "
        #             f"{val_losses}, Val Macro F1 {val_output['result']['f1']['macro'].item()}")
        train_epoch_losses.append(epoch_loss)
        preds = cat(preds)

        ## Converting raw scores to probabilities using Sigmoid:
        preds = sigmoid(preds)

        ## Converting probabilities to class labels:
        preds = logit2label(preds.detach(), cls_thresh=0.5)
        trues = cat(trues)
        # Binary vs multi-label metric selection:
        if n_classes == 1:
            result_dict = calculate_performance_bin_sk(trues, preds)
        else:
            result_dict = calculate_performance(trues, preds)
        # logger.info(dumps(result_dict, indent=4))
        train_epoch_dict[epoch] = {
            'preds': preds,
            'trues': trues,
            'result': result_dict
        }
        # logger.info(f'Epoch {epoch} result: \n{result_dict}')

    return train_epoch_losses, train_epoch_dict
def classify(train_df=None, test_df=None, stoi=None, vectors=None,
             n_classes=cfg['data']['num_classes'],
             dim=cfg['embeddings']['emb_dim'],
             data_dir=dataset_dir,
             train_filename=cfg['data']['train'],
             test_filename=cfg['data']['test'],
             cls_thresh=None,
             epoch=cfg['training']['num_epoch'],
             num_layers=cfg['lstm_params']['num_layers'],
             num_hidden_nodes=cfg['lstm_params']['hid_size'],
             dropout=cfg['model']['dropout'],
             default_thresh=0.5,
             lr=cfg['model']['optimizer']['lr'],
             train_batch_size=cfg['training']['train_batch_size'],
             test_batch_size=cfg['training']['eval_batch_size'],
             ):
    """ Train a BiLSTM classifier on source data and evaluate on target data.

    :param train_df: labelled source DataFrame (must not be None — it is
        written to CSV below; the fallback read is commented out).
    :param test_df: labelled target DataFrame; read from disk when None.
    :param stoi: token-to-index map for pretrained (GCN) vectors; when None,
        GLOVE features are used instead.
    :param vectors: pretrained vectors aligned with `stoi`.
    :param n_classes: number of output classes.
    :param dim: embedding dimension.
    :param data_dir: directory for CSVs.
    :param train_filename: base name of the train file.
    :param test_filename: base name of the test file.
    :param cls_thresh: per-class thresholds; defaults to `default_thresh`.
    :param epoch: number of training epochs.
    :param num_layers: BiLSTM layer count.
    :param num_hidden_nodes: BiLSTM hidden size.
    :param dropout: dropout rate.
    :param default_thresh: threshold used when `cls_thresh` is None.
    :param lr: learning rate.
    :param train_batch_size: training batch size.
    :param test_batch_size: evaluation batch size.
    :return: performance metrics on the validation predictions.
    """
    ## Prepare labelled source data:
    # logger.info('Prepare labelled source data')
    # if train_df is None:
    #     train_df = read_labelled_json(data_dir, train_filename)
    #     train_df = labels_mapper(train_df)

    train_dataname = train_filename + "_4class.csv"
    train_df.to_csv(join(data_dir, train_dataname))

    if stoi is None:
        logger.critical('GLOVE features')
        train_dataset, (train_vocab, train_label) = get_dataset_fields(
            csv_dir=data_dir, csv_file=train_dataname, min_freq=1,
            labelled_data=True)
    else:
        logger.critical('GCN features')
        train_dataset, (train_vocab, train_label) = get_dataset_fields(
            csv_dir=data_dir, csv_file=train_dataname, min_freq=1,
            labelled_data=True, embedding_file=None, embedding_dir=None)
        train_vocab.vocab.set_vectors(stoi=stoi, vectors=vectors, dim=dim)

    ## Plot representations:
    # plot_features_tsne(train_vocab.vocab.vectors,
    #                    list(train_vocab.vocab.stoi.keys()))

    ## Prepare labelled target data:
    logger.info('Prepare labelled target data')
    if test_df is None:
        test_df = read_labelled_json(data_dir, test_filename)
    test_dataname = test_filename + "_4class.csv"
    test_df.to_csv(join(data_dir, test_dataname))
    test_dataset, (test_vocab, test_label) = get_dataset_fields(
        csv_dir=data_dir, csv_file=test_dataname,
        # init_vocab=True,
        labelled_data=True)

    # check whether cuda is available
    # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    logger.info('Get iterator')
    train_iter, val_iter = dataset2bucket_iter(
        (train_dataset, test_dataset),
        batch_sizes=(train_batch_size, test_batch_size))

    size_of_vocab = len(train_vocab.vocab)
    num_output_nodes = n_classes

    # instantiate the model
    logger.info('instantiate the model')
    model = BiLSTM_Classifier(size_of_vocab, num_hidden_nodes,
                              num_output_nodes, dim, num_layers,
                              dropout=dropout)

    # architecture
    logger.info(model)

    # No. of trianable parameters
    logger.info('No. of trianable parameters')
    count_parameters(model)

    # Initialize the pretrained embedding
    logger.info('Initialize the pretrained embedding')
    pretrained_embeddings = train_vocab.vocab.vectors
    model.embedding.weight.data.copy_(pretrained_embeddings)

    logger.debug(pretrained_embeddings.shape)

    # label_cols = [str(cls) for cls in range(n_classes)]

    logger.info('Training model')
    model_best, val_preds_trues_best, val_preds_trues_all, losses = trainer(
        model, train_iter, val_iter, N_EPOCHS=epoch, lr=lr)

    plot_training_loss(losses['train'], losses['val'],
                       plot_name='loss' + str(epoch) + str(lr))

    if cls_thresh is None:
        cls_thresh = [default_thresh] * n_classes

    predicted_labels = logit2label(
        DataFrame(val_preds_trues_best['preds'].cpu().numpy()), cls_thresh,
        drop_irrelevant=False)

    logger.info('Calculate performance')
    ## FIX: the thresholded `predicted_labels` were computed but the raw
    ## logits were passed to the metric; use the labels, matching
    ## get_supervised_result's call to calculate_performance_pl.
    result = calculate_performance_pl(val_preds_trues_best['trues'],
                                      predicted_labels)

    logger.info("Result: {}".format(result))

    # result_df = flatten_results(result)
    # result_df.round(decimals=4).to_csv(
    #     join(data_dir, test_filename + '_results.csv'))

    return result
def predict_with_label(model, iterator, criterion=None, metric=True):
    """ Predicts and calculates performance. Labels mandatory.

    Args:
        model: trained model whose forward takes (text, text_lengths).
        iterator: torchtext-style iterator yielding labelled batches with
            `.text` and `.ids` attributes.
        criterion: loss function; defaults to BCEWithLogitsLoss.
        metric: when True, compute metrics over ALL batches and store them
            in preds_trues['results'].

    Returns:
        (mean_loss, preds_trues): average loss per batch and a dict of
        concatenated 'preds', 'trues', 'ids', a tensor of cumulative
        'losses' and (when metric) 'results'.
    """
    from torch import tensor

    # initialize every epoch
    epoch_loss = 0

    if criterion is None:
        criterion = nn.BCEWithLogitsLoss()

    preds_trues = {
        'preds': [],
        'trues': [],
        'ids': [],
        'losses': [],
        'results': []
    }

    # deactivating dropout layers
    model.eval()

    # deactivates autograd
    with no_grad():
        for i, batch in enumerate(iterator):
            # retrieve text and no. of words
            text, text_lengths = batch.text

            # convert to 1d tensor
            predictions = model(text, text_lengths).squeeze()

            # compute loss and accuracy
            batch_labels = torchtext_batch2multilabel(batch)
            preds_trues['preds'].append(predictions)
            preds_trues['trues'].append(batch_labels)
            preds_trues['ids'].append(batch.ids)
            loss = criterion(predictions, batch_labels)

            # keep track of loss and accuracy
            epoch_loss += loss.item()
            # NOTE: stores the running cumulative loss per batch, as before.
            preds_trues['losses'].append(epoch_loss)
            # epoch_acc += acc.item()
            # epoch_acc += acc["accuracy"]["unnormalize"]

    preds_trues['preds'] = cat(preds_trues['preds'])
    preds_trues['trues'] = cat(preds_trues['trues'])
    preds_trues['ids'] = cat(preds_trues['ids'])
    ## FIX: losses are Python floats; cat() on them raised TypeError.
    preds_trues['losses'] = tensor(preds_trues['losses'])

    if metric:
        ## FIX: metrics previously used only the LAST batch's predictions
        ## against ALL concatenated labels (shape mismatch on multi-batch
        ## iterators); use the concatenated predictions, and keep the result.
        ## Converting raw scores to probabilities using Sigmoid:
        preds = sigmoid(preds_trues['preds'])

        ## Converting probabilities to class labels:
        preds = logit2label(preds.detach(), cls_thresh=0.5)
        result_dict = calculate_performance(preds_trues['trues'], preds)
        preds_trues['results'].append(result_dict)

    return epoch_loss / len(iterator), preds_trues