def load_json(filename: str, filepath: str = dataset_dir, ext: str = ".json",
              show_path: bool = True) -> OrderedDict:
    """ Reads a json file as a Python OrderedDict.

    :param filename: Name of the json file (without extension).
    :param filepath: Directory containing the file.
    :param ext: File extension appended to filename.
    :param show_path: If True, log the full file path.
    :return: File contents as an OrderedDict.
    """
    file_loc = join(filepath, filename + ext)
    if show_path:
        logger.debug("Reading JSON file: [{}]".format(file_loc))
    if exists(file_loc):
        try:
            with open(file_loc, encoding="utf-8") as file:
                json_dict = OrderedDict(load(file))
            return json_dict
        except Exception as e:
            logger.debug("Could not open file as JSON: [{}]. \n"
                         " Reason: [{}]".format(file_loc, e))
            logger.warning("Reading JSON as STR: [{}]".format(file_loc))
            with open(file_loc, encoding="utf-8") as file:
                json_dict = loads(file.read())
            return json_dict
    else:
        raise FileNotFoundError("File not found at: [{}]".format(file_loc))
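# Illustrative usage of load_json; a sketch only, the file name below is a
# placeholder and not part of the original code:
#
#   vocab = load_json("C_vocab", filepath=dataset_dir)   # reads C_vocab.json
#   quiet = load_json("C_vocab", ext=".json", show_path=False)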
def create_tabular_dataset(csv_file: str, data_dir: str, fields=None,
                           skip_header: bool = True) -> data.dataset.TabularDataset:
    """ Reads a csv file and returns it in TorchText TabularDataset format.

    Args:
        csv_file: Name of the csv file to read.
        data_dir: Directory containing the csv file.
        fields: TorchText fields; built via prepare_fields() if None.
        skip_header: If True, skip the csv header row.

    Returns:
        A torchtext data.TabularDataset.
    """
    if fields is None:
        _, fields, unlabelled_fields = prepare_fields()

    dataset = data.TabularDataset(path=join(data_dir, csv_file), format='csv',
                                  fields=fields, skip_header=skip_header)

    logger.debug(vars(dataset.examples[0]))

    return dataset
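# Illustrative call (a sketch; the csv name below is a placeholder modelled on
# the "_4class.csv" files written by classify(), not a documented input):
#
#   train_ds = create_tabular_dataset("train_4class.csv", dataset_dir)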
def propagate_labels(features, labels):
    """ Propagates labels to unlabelled samples via sklearn LabelSpreading.

    Args:
        features: Feature matrix, one row per sample.
        labels: Seed labels for the samples.

    Returns:
        Predicted class probabilities for all samples.
    """
    label_prop_model = LabelSpreading(kernel=construct_graph, n_jobs=-1)
    label_prop_model.fit(features, labels)
    logger.debug(label_prop_model.classes_)
    # preds = label_prop_model.predict(features)
    preds = label_prop_model.predict_proba(features)

    return preds
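# Hedged usage sketch for propagate_labels (not from the original source).
# scikit-learn's LabelSpreading treats samples whose label is -1 as
# unlabelled, and a callable kernel such as construct_graph is expected to
# return an affinity matrix over the feature rows:
#
#   import numpy as np
#   feats = np.random.rand(6, 100)                   # 6 samples, 100-d features
#   seed_labels = np.array([0, 1, -1, -1, 2, -1])    # -1 = unlabelled
#   probs = propagate_labels(feats, seed_labels)     # shape: (6, n_classes)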
def create_dataset(examples, fields=None):
    """ Creates a TorchText Dataset from examples (list) and fields (dict).

    Args:
        examples: List of torchtext Examples.
        fields: TorchText fields; built via prepare_fields() if None.

    Returns:
        A torchtext data.Dataset.
    """
    if fields is None:
        _, fields, unlabelled_fields = prepare_fields()

    dataset = data.Dataset(examples=examples, fields=fields)

    logger.debug(vars(dataset.examples[0]))

    return dataset
def logit2label(predictions_df: pd.core.frame.DataFrame,
                cls_thresh: [list, float], drop_irrelevant=False,
                return_df=False):
    """ Converts logits to multi-hot labels based on a per-class threshold.

    :param predictions_df: pd.DataFrame, np.ndarray or torch.Tensor of logits.
    :param cls_thresh: Single float or list of floats as threshold per class.
    :param drop_irrelevant: Remove samples for which no class crossed its
     threshold, i.e. [0., 0., 0., 0.].
    :param return_df: If True, return a pd.DataFrame instead of np.ndarray.
    """
    if isinstance(predictions_df, pd.core.frame.DataFrame):
        logger.debug((predictions_df.values.min(), predictions_df.values.max()))
        df_np = predictions_df.to_numpy()
    elif isinstance(predictions_df, (np.ndarray, torch.Tensor)):
        df_np = predictions_df
    else:
        raise NotImplementedError(
            f'Only supports pd.DataFrame or np.ndarray or '
            f'torch.Tensor but received [{type(predictions_df)}]')

    ## Create threshold list for all classes if only one threshold float is provided:
    if isinstance(cls_thresh, float):
        cls_thresh = [cls_thresh for i in range(df_np.shape[1])]

    for col in range(df_np.shape[1]):
        df_np[:, col][df_np[:, col] > cls_thresh[col]] = 1.
        df_np[:, col][df_np[:, col] <= cls_thresh[col]] = 0.

    if return_df:
        predictions_df = pd.DataFrame(df_np, index=predictions_df.index)

        if drop_irrelevant:
            ## Delete all rows where sum == 0:
            irrelevant_rows = []
            for i, row in predictions_df.iterrows():
                if sum(row) < 1:
                    irrelevant_rows.append(i)
            predictions_df = predictions_df.drop(irrelevant_rows)

        return predictions_df
    else:
        return df_np
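# Minimal sketch of logit2label on made-up probabilities (values are
# illustrative only, not from the original code):
#
#   probs = pd.DataFrame([[0.9, 0.2, 0.6, 0.1],
#                         [0.1, 0.3, 0.2, 0.4]])
#   logit2label(probs, cls_thresh=0.5)
#   # -> array([[1., 0., 1., 0.],
#   #           [0., 0., 0., 0.]])
#   # With return_df=True and drop_irrelevant=True the all-zero second row
#   # would be dropped from the returned DataFrame.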
def classify(train_df=None,
             test_df=None,
             stoi=None,
             vectors=None,
             n_classes=cfg['data']['num_classes'],
             dim=cfg['embeddings']['emb_dim'],
             data_dir=dataset_dir,
             train_filename=cfg['data']['train'],
             test_filename=cfg['data']['test'],
             cls_thresh=None,
             epoch=cfg['training']['num_epoch'],
             num_layers=cfg['lstm_params']['num_layers'],
             num_hidden_nodes=cfg['lstm_params']['hid_size'],
             dropout=cfg['model']['dropout'],
             default_thresh=0.5,
             lr=cfg['model']['optimizer']['lr'],
             train_batch_size=cfg['training']['train_batch_size'],
             test_batch_size=cfg['training']['eval_batch_size'],
             ):
    """ Trains and evaluates a BiLSTM classifier on labelled source and target data.

    :param train_df: Labelled source (train) DataFrame.
    :param test_df: Labelled target (test) DataFrame; read from disk if None.
    :param stoi: Token to index map; if None, GloVe features are used.
    :param vectors: Pretrained token vectors aligned with stoi.
    :param n_classes: Number of output classes.
    :param dim: Embedding dimension.
    :param data_dir: Directory containing the csv files.
    :param train_filename: Train file name (without extension).
    :param test_filename: Test file name (without extension).
    :param cls_thresh: Per-class decision thresholds; defaults to default_thresh.
    :param epoch: Number of training epochs.
    :param num_layers: Number of LSTM layers.
    :param num_hidden_nodes: LSTM hidden size.
    :param dropout: Dropout probability.
    :param default_thresh: Threshold used when cls_thresh is None.
    :param lr: Learning rate.
    :param train_batch_size: Training batch size.
    :param test_batch_size: Evaluation batch size.
    :return: Performance metrics.
    """
    ## Prepare labelled source data:
    # logger.info('Prepare labelled source data')
    # if train_df is None:
    #     train_df = read_labelled_json(data_dir, train_filename)
    #     train_df = labels_mapper(train_df)

    train_dataname = train_filename + "_4class.csv"
    train_df.to_csv(join(data_dir, train_dataname))

    if stoi is None:
        logger.critical('GLOVE features')
        train_dataset, (train_vocab, train_label) = get_dataset_fields(
            csv_dir=data_dir, csv_file=train_dataname, min_freq=1,
            labelled_data=True)
    else:
        logger.critical('GCN features')
        train_dataset, (train_vocab, train_label) = get_dataset_fields(
            csv_dir=data_dir, csv_file=train_dataname, min_freq=1,
            labelled_data=True, embedding_file=None, embedding_dir=None)
        train_vocab.vocab.set_vectors(stoi=stoi, vectors=vectors, dim=dim)

    ## Plot representations:
    # plot_features_tsne(train_vocab.vocab.vectors,
    #                    list(train_vocab.vocab.stoi.keys()))

    ## Prepare labelled target data:
    logger.info('Prepare labelled target data')
    if test_df is None:
        test_df = read_labelled_json(data_dir, test_filename)
    test_dataname = test_filename + "_4class.csv"
    test_df.to_csv(join(data_dir, test_dataname))
    test_dataset, (test_vocab, test_label) = get_dataset_fields(
        csv_dir=data_dir, csv_file=test_dataname,
        # init_vocab=True,
        labelled_data=True)

    # check whether cuda is available
    # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    logger.info('Get iterator')
    train_iter, val_iter = dataset2bucket_iter(
        (train_dataset, test_dataset),
        batch_sizes=(train_batch_size, test_batch_size))

    size_of_vocab = len(train_vocab.vocab)
    num_output_nodes = n_classes

    # instantiate the model
    logger.info('Instantiate the model')
    model = BiLSTM_Classifier(size_of_vocab, num_hidden_nodes,
                              num_output_nodes, dim, num_layers,
                              dropout=dropout)

    # architecture
    logger.info(model)

    # No. of trainable parameters
    logger.info('No. of trainable parameters')
    count_parameters(model)

    # Initialize the pretrained embedding
    logger.info('Initialize the pretrained embedding')
    pretrained_embeddings = train_vocab.vocab.vectors
    model.embedding.weight.data.copy_(pretrained_embeddings)

    logger.debug(pretrained_embeddings.shape)

    # label_cols = [str(cls) for cls in range(n_classes)]

    logger.info('Training model')
    model_best, val_preds_trues_best, val_preds_trues_all, losses = trainer(
        model, train_iter, val_iter, N_EPOCHS=epoch, lr=lr)

    plot_training_loss(losses['train'], losses['val'],
                       plot_name='loss' + str(epoch) + str(lr))

    if cls_thresh is None:
        cls_thresh = [default_thresh] * n_classes

    predicted_labels = logit2label(
        DataFrame(val_preds_trues_best['preds'].cpu().numpy()), cls_thresh,
        drop_irrelevant=False)

    logger.info('Calculate performance')
    result = calculate_performance_pl(val_preds_trues_best['trues'],
                                      val_preds_trues_best['preds'])

    logger.info("Result: {}".format(result))

    # result_df = flatten_results(result)
    # result_df.round(decimals=4).to_csv(
    #     join(data_dir, test_filename + '_results.csv'))

    return result
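# Hedged usage sketch for classify(); file names come from the cfg defaults
# and read_labelled_json mirrors the commented-out preparation step inside
# classify(), so this is an assumed, not documented, entry point:
#
#   train_df = read_labelled_json(dataset_dir, cfg['data']['train'])
#   test_df = read_labelled_json(dataset_dir, cfg['data']['test'])
#   result = classify(train_df=train_df, test_df=test_df)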
def main(model_type='GNN',
         data_dir: str = dataset_dir,
         lr=cfg["model"]["optimizer"]["lr"],
         mittens_iter: int = 300,
         gcn_hops: int = 5,
         glove_embs=None,
         labelled_source_name: str = cfg['data']['train'],
         labelled_val_name: str = cfg['data']['val'],
         unlabelled_source_name: str = cfg["data"]["source"]['unlabelled'],
         labelled_target_name: str = cfg['data']['test'],
         unlabelled_target_name: str = cfg["data"]["target"]['unlabelled'],
         train_batch_size=cfg['training']['train_batch_size'],
         test_batch_size=cfg['training']['eval_batch_size'],
         use_lpa=False):
    logger.critical(f'Current Learning Rate: [{lr}]')
    labelled_source_path = join(data_dir, labelled_source_name)
    S_dataname = unlabelled_source_name + "_data.csv"
    T_dataname = unlabelled_target_name + "_data.csv"

    if exists(labelled_source_path + 'S_vocab.json')\
            and exists(labelled_source_path + 'T_vocab.json')\
            and exists(labelled_source_path + 'labelled_token2vec_map.json'):
        # ## Read labelled source data
        # s_lab_df = read_labelled_json(data_dir, labelled_source_name)
        #
        # ## Match label space between two datasets:
        # if str(labelled_source_name).startswith('fire16'):
        #     s_lab_df = labels_mapper(s_lab_df)

        C_vocab = read_json(labelled_source_path + 'C_vocab')
        S_vocab = read_json(labelled_source_path + 'S_vocab')
        T_vocab = read_json(labelled_source_path + 'T_vocab')
        labelled_token2vec_map = read_json(
            labelled_source_path + 'labelled_token2vec_map')

        if not exists(labelled_source_path + 'high_oov_freqs.json'):
            S_dataset, (S_fields, LABEL) = get_dataset_fields(
                csv_dir=data_dir, csv_file=S_dataname)
            T_dataset, (T_fields, LABEL) = get_dataset_fields(
                csv_dir=data_dir, csv_file=T_dataname)
    else:
        C_vocab, C_dataset, S_vocab, S_dataset, S_fields, T_vocab,\
            T_dataset, T_fields, labelled_token2vec_map, s_lab_df =\
            create_vocab(s_lab_df=None, data_dir=data_dir,
                         labelled_source_name=labelled_source_name,
                         unlabelled_source_name=unlabelled_source_name,
                         unlabelled_target_name=unlabelled_target_name)

        ## Save vocabs:
        save_json(C_vocab, labelled_source_path + 'C_vocab')
        save_json(S_vocab, labelled_source_path + 'S_vocab')
        save_json(T_vocab, labelled_source_path + 'T_vocab')
        save_json(labelled_token2vec_map,
                  labelled_source_path + 'labelled_token2vec_map')

    if glove_embs is None:
        glove_embs = glove2dict()

    if exists(labelled_source_path + 'high_oov_freqs.json')\
            and exists(labelled_source_path + 'corpus.json')\
            and exists(labelled_source_path + 'corpus_toks.json'):
        high_oov_freqs = read_json(labelled_source_path + 'high_oov_freqs')
        # low_glove_freqs = read_json(labelled_source_name + 'low_glove_freqs')
        corpus = read_json(labelled_source_path + 'corpus',
                           convert_ordereddict=False)
        corpus_toks = read_json(labelled_source_path + 'corpus_toks',
                                convert_ordereddict=False)
    else:
        ## Get all OOVs which do not have a GloVe embedding:
        high_oov_freqs, low_glove_freqs, corpus, corpus_toks =\
            preprocess_and_find_oov(
                (S_dataset, T_dataset), C_vocab, glove_embs=glove_embs,
                labelled_vocab_set=set(labelled_token2vec_map.keys()))

        ## Save token sets: high_oov_freqs, low_glove_freqs, corpus, corpus_toks
        save_json(high_oov_freqs, labelled_source_path + 'high_oov_freqs')
        # save_json(low_glove_freqs, labelled_source_name + 'low_glove_freqs', overwrite=True)
        save_json(corpus, labelled_source_path + 'corpus')
        save_json(corpus_toks, labelled_source_path + 'corpus_toks')
        save_json(C_vocab, labelled_source_path + 'C_vocab', overwrite=True)

    ## Read labelled datasets and prepare:
    logger.info('Read labelled datasets and prepare')
    train_dataset, val_dataset, test_dataset, train_vocab, val_vocab, test_vocab\
        = prepare_splitted_datasets()

    logger.info('Creating instance graphs')
    train_instance_graphs = Instance_Dataset_DGL(
        train_dataset, train_vocab, labelled_source_name,
        class_names=cfg['data']['class_names'])
    logger.debug(train_instance_graphs.num_labels)
    # logger.debug(train_instance_graphs.graphs, train_instance_graphs.labels)

    train_dataloader = DataLoader(
        train_instance_graphs, batch_size=train_batch_size, shuffle=True,
        collate_fn=train_instance_graphs.batch_graphs)

    logger.info(
        f"Number of training instance graphs: {len(train_instance_graphs)}")

    val_instance_graphs = Instance_Dataset_DGL(
        val_dataset, train_vocab, labelled_val_name,
        class_names=cfg['data']['class_names'])

    val_dataloader = DataLoader(val_instance_graphs,
                                batch_size=test_batch_size, shuffle=True,
                                collate_fn=val_instance_graphs.batch_graphs)

    logger.info(
        f"Number of validation instance graphs: {len(val_instance_graphs)}")

    test_instance_graphs = Instance_Dataset_DGL(
        test_dataset, train_vocab, labelled_target_name,
        class_names=cfg['data']['class_names'])

    test_dataloader = DataLoader(test_instance_graphs,
                                 batch_size=test_batch_size, shuffle=True,
                                 collate_fn=test_instance_graphs.batch_graphs)

    logger.info(
        f"Number of testing instance graphs: {len(test_instance_graphs)}")

    # model_type = 'GAT'
    logger.info(f'Classifying graphs using {model_type} model.')
    if model_type == 'GAT':
        logger.info('Using GAT model')
        train_epochs_output_dict, test_output = GAT_multilabel_classification(
            train_dataloader, val_dataloader, test_dataloader,
            in_dim=cfg['embeddings']['emb_dim'],
            hid_dim=cfg['gnn_params']['hid_dim'],
            num_heads=cfg['gnn_params']['num_heads'],
            epochs=cfg['training']['num_epoch'], lr=lr)
    else:
        ## Create token graph:
        logger.info('Using GNN model and creating token graph:')
        g_ob = Token_Dataset_nx(corpus_toks, C_vocab, S_vocab, T_vocab,
                                dataset_name=labelled_source_name)
        g_ob.add_edge_weights()
        G = g_ob.G
        num_tokens = g_ob.num_tokens
        node_list = list(G.nodes)
        logger.info(
            f"Number of nodes {len(node_list)} and edges {len(G.edges)} in token graph")

        ## Create new embeddings for OOV tokens:
        oov_emb_filename = labelled_source_name + '_OOV_vectors_dict'
        if exists(join(data_dir, oov_emb_filename + '.pkl')):
            logger.info('Read OOV embeddings:')
            oov_embs = load_pickle(filepath=data_dir,
                                   filename=oov_emb_filename)
        else:
            logger.info('Create OOV embeddings using Mittens:')
            high_oov_tokens_list = list(high_oov_freqs.keys())
            c_corpus = corpus[0] + corpus[1]
            oov_mat_coo = calculate_cooccurrence_mat(high_oov_tokens_list,
                                                     c_corpus)
            oov_embs = train_mittens(oov_mat_coo, high_oov_tokens_list,
                                     glove_embs, max_iter=mittens_iter)
            save_pickle(oov_embs, filepath=data_dir,
                        filename=oov_emb_filename, overwrite=True)

        ## Get adjacency matrix and node embeddings in same order:
        logger.info('Accessing token adjacency matrix')
        ## Note: Saving a sparse tensor usually gets corrupted.
        # adj_filename = join(data_dir, labelled_source_name + "_adj.pt")
        # if exists(adj_filename):
        #     adj = load(adj_filename)
        #     # adj = sp_coo2torch_coo(adj)
        # else:
        #     adj = adjacency_matrix(G, nodelist=node_list, weight='weight')
        #     adj = sp_coo2torch_coo(adj)
        #     save(adj, adj_filename)
        adj = adjacency_matrix(G, nodelist=node_list, weight='weight')
        adj = sp_coo2torch_coo(adj)

        logger.info('Accessing token graph node embeddings:')
        emb_filename = join(data_dir, labelled_source_name + "_emb.pt")
        if exists(emb_filename):
            X = load(emb_filename)
        else:
            logger.info('Get node embeddings from token graph:')
            X = g_ob.get_node_embeddings(oov_embs, glove_embs,
                                         C_vocab['idx2str_map'])
            # X = sp_coo2torch_coo(X)
            save(X, emb_filename)

        # logger.info('Applying GCN Forward old')
        # X_hat = GCN_forward_old(adj, X, forward=gcn_hops)
        # logger.info('Applying GCN Forward')
        # X_hat = GCN_forward(adj, X, forward=gcn_hops)

        ## Apply Label Propagation to get label vectors for unlabelled nodes:
        if use_lpa:
            logger.info('Getting propagated label vectors:')
            label_proba_filename = join(data_dir,
                                        labelled_source_name + "_lpa_vecs.pt")
            if exists(label_proba_filename):
                lpa_vecs = torch.load(label_proba_filename)
            else:
                all_node_labels, labelled_masks = fetch_all_nodes(
                    node_list, labelled_token2vec_map, C_vocab['idx2str_map'],
                    # default_fill=[0.])
                    default_fill=[0., 0., 0., 0.])
                lpa_vecs = label_propagation(adj, all_node_labels,
                                             labelled_masks)
                torch.save(lpa_vecs, label_proba_filename)

            logger.info('Recalculate edge weights using LPA vectors:')
            g_ob.normalize_edge_weights(lpa_vecs)

            adj = adjacency_matrix(g_ob.G, nodelist=node_list, weight='weight')
            adj = sp_coo2torch_coo(adj)

        ## Normalize Adjacency matrix:
        logger.info('Normalize token graph:')
        adj = g_ob.normalize_adj(adj)

        # ## Create label to propagated vector map:
        # logger.info('Create label to propagated vector map')
        # node_txt2label_vec = {}
        # for node_id in node_list:
        #     node_txt2label_vec[C_vocab['idx2str_map'][node_id]] =\
        #         lpa_vecs[node_id].tolist()
        # DataFrame.from_dict(node_txt2label_vec, orient='index').to_csv(
        #     labelled_source_name + 'node_txt2label_vec.csv')

        logger.info('Using GNN model')
        train_epochs_output_dict, test_output = GAT_GCN_trainer(
            adj, X, train_dataloader, val_dataloader, test_dataloader,
            num_tokens=num_tokens, in_feats=cfg['embeddings']['emb_dim'],
            hid_feats=cfg['gnn_params']['hid_dim'],
            num_heads=cfg['gnn_params']['num_heads'],
            epochs=cfg['training']['num_epoch'], lr=lr)

        # ## Propagating label vectors using GCN forward instead of LPA:
        # X_labels_hat = GCN_forward(adj, all_node_labels, forward=gcn_hops)
        # torch.save(X_labels_hat, 'X_labels_hat_05.pt')

    return C_vocab['str2idx_map']  # , X_hat