def build_corpus(df, txts: list = None, corpus: list = None):
    """ Generates corpus (list of str) and vocab with occurrence counts (Counter: token -> count).

    :param df: DataFrame with a 'text' column.
    :param txts: Iterable of tokenized texts (list of token lists).
    :param corpus: Existing corpus (list of tokens) to extend; a new list is created if None.
    :return: corpus, vocab_freq
    """
    if corpus is None:
        corpus = []

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(df.text)
    logger.info(X.shape)
    logger.info(vectorizer.get_feature_names())

    for txt in txts:
        corpus.extend(txt)

    vocab_freq = Counter(corpus)

    return corpus, vocab_freq
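# Illustrative usage sketch (not part of the original module): a toy DataFrame
# and matching pre-tokenised texts, made up to show the expected inputs of
# build_corpus. Assumes an sklearn version where CountVectorizer still exposes
# get_feature_names(), as the function above does.
def _demo_build_corpus():
    import pandas as pd
    toy_df = pd.DataFrame({'text': ['flood in chennai', 'need water in chennai']})
    toy_txts = [['flood', 'in', 'chennai'], ['need', 'water', 'in', 'chennai']]
    corpus, vocab_freq = build_corpus(toy_df, toy_txts)
    # vocab_freq is a Counter, e.g. vocab_freq['chennai'] == 2 for the toy data.
    logger.info(vocab_freq.most_common(3))
    return corpus, vocab_freq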
def get_supervised_result(model, train_iterator, val_iterator, test_iterator,
                          EPOCHS=5, cls_thresh=None,
                          n_classes=cfg['data']['num_classes']):
    """ Train and predict in fully supervised mode.

    Returns:
        result: dict of performance metrics on the test set.
        model_best: the best model found during training.
    """
    model_best, val_preds_trues_best, val_preds_trues_all, losses = trainer(
        model, train_iterator, val_iterator, N_EPOCHS=EPOCHS)
    # logger.debug(losses)

    # Evaluate the model:
    test_loss, test_preds_trues = predict_with_label(model_best, test_iterator)

    if cls_thresh is None:
        cls_thresh = [0.5] * n_classes

    predicted_labels = logit2label(DataFrame(
        test_preds_trues['preds'].numpy()), cls_thresh, drop_irrelevant=False)

    result = calculate_performance_pl(test_preds_trues['trues'],
                                      predicted_labels)

    logger.info("Supervised result: {}".format(dumps(result, indent=4)))
    return result, model_best
def read_labelled_json(data_dir=dataset_dir,
                       filename=cfg['data']['test'],
                       data_keys=['text', 'classes'],
                       data_set='train',
                       # rename_cols={"parsed_tweet": "text"},
                       ):
    """ Reads json data and converts it to a DataFrame.

    Default reads labelled target data.

    Args:
        data_dir: Directory containing the json file.
        filename: Name of the json file (without extension).
        data_keys: Keys to read from the json records.
        data_set: 'train' fits the label binarizer; anything else only transforms.
        rename_cols: (currently commented out) mapping of columns to rename.

    Returns:
        DataFrame with a 'text' column and one binary column per class.
    """
    data_df = json_keys2df(data_keys, json_filename=filename,
                           dataset_dir=data_dir)
    logger.info(data_df.head())
    # data_df = data_df.rename(columns=rename_cols)

    if data_set == 'train':
        y_hot = mlb.fit_transform(data_df.classes.to_list())
    else:
        y_hot = mlb.transform(data_df.classes.to_list())

    for i in range(y_hot.shape[1]):
        data_df[i] = y_hot[:, i:i + 1]
    data_df = data_df.drop(columns=['classes'])

    return data_df
def train_node_classifier(g: DGLGraph, features: torch.Tensor,
                          labels: torch.Tensor, labelled_mask: torch.Tensor,
                          model: GAT_Node_Classifier, loss_func, optimizer,
                          epochs: int = 5) -> None:
    """ Trains a node classifier on the labelled nodes of a graph.

    :param g: DGL graph.
    :param features: Node feature matrix.
    :param labels: Node labels.
    :param labelled_mask: Boolean mask selecting the labelled nodes.
    :param model: Node classification model.
    :param loss_func: Loss function (e.g. F.nll_loss).
    :param optimizer: Optimizer over model parameters.
    :param epochs: Number of training epochs.
    """
    model.train()
    dur = []
    for epoch in range(epochs):
        t0 = time.time()
        logits = model(g, features)
        logp = F.log_softmax(logits, 1)
        loss = loss_func(logp[labelled_mask], labels[labelled_mask])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        dur.append(time.time() - t0)

        logger.info("Epoch {:05d} | Loss {:.4f} | Time(s) {:.4f}".format(
            epoch, loss.item(), np.mean(dur)))
def graph_multiclass_classification(in_feats: int = 1, hid_feats: int = 4,
                                    num_heads: int = 2) -> None:
    from dgl.data import MiniGCDataset

    # Create training and test sets.
    trainset = MiniGCDataset(320, 10, 20)
    testset = MiniGCDataset(80, 10, 20)

    # Use PyTorch's DataLoader and the collate function defined before.
    data_loader = DataLoader(trainset, batch_size=8, shuffle=True,
                             collate_fn=batch_graphs)

    # Create model
    model = GAT_Graph_Classifier(in_feats, hid_feats, num_heads=num_heads,
                                 out_dim=trainset.num_classes)
    logger.info(model)

    loss_func = torch.nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    epoch_losses, epoch_predictions_dict = train_graph_classifier(
        model, data_loader, loss_func=loss_func, optimizer=optimizer, epochs=5)
def graph_multilabel_classification(
        gdh, in_feats: int = 100, hid_feats: int = 50, num_heads: int = 2,
        epochs=cfg['training']['num_epoch']):
    model = GAT_Graph_Classifier(in_feats, hid_feats, num_heads=num_heads,
                                 out_dim=gdh.num_classes)
    logger.info(model)

    loss_func = torch.nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(),
                           lr=cfg["model"]["optimizer"]["lr"])

    epoch_losses, train_epochs_output_dict = train_graph_classifier(
        model, gdh.train_dataloader(), loss_func=loss_func,
        optimizer=optimizer, epochs=epochs,
        eval_data_loader=gdh.test_dataloader())

    losses, test_output = test_graph_classifier(
        model, loss_func=loss_func, data_loader=gdh.test_dataloader())
    logger.info(dumps(test_output['result'], indent=4))

    return train_epochs_output_dict, test_output
def load_graph(self, graph_path):
    if graph_path is None:
        graph_path = self.graph_path
    # Load processed graph from graph_path:
    logger.info(f'Loading graph from [{graph_path}]')
    self.G = read_gpickle(graph_path)
    return self.G
def eval_graph_classifier(model: GAT_GCN_Classifier, G, X, loss_func,
                          data_loader: utils.data.dataloader.DataLoader,
                          n_classes=cfg['data']['num_classes'],
                          save_gcn_embs=False):
    model.eval()
    preds = []
    trues = []
    losses = []
    for batch_idx, (graph_batch, local_ids, label, global_ids,
                    node_counts) in enumerate(data_loader):
        ## Store emb in a separate file as self_loop removes emb info:
        emb = graph_batch.ndata['emb']
        # graph_batch = dgl.add_self_loop(graph_batch)
        if cfg['model']['use_cuda'][plat][user] and cuda.is_available():
            graph_batch = graph_batch.to(device)
            emb = emb.to(device)
            # local_ids = local_ids.to(device)
            # node_counts = node_counts.to(device)
            # global_ids = global_ids.to(device)
            G = G.to(device)
            X = X.to(device)
        if save_gcn_embs:
            save(X, 'X_glove.pt')
        start_time = timeit.default_timer()
        prediction = model(graph_batch, emb, local_ids, node_counts,
                           global_ids, G, X, save_gcn_embs)
        test_time = timeit.default_timer() - start_time
        test_count = label.shape[0]
        logger.info(f"Test time per example: [{test_time / test_count} sec]")
        if prediction.dim() == 1:
            prediction = prediction.unsqueeze(1)
        if cfg['model']['use_cuda'][plat][user] and cuda.is_available():
            prediction = prediction.to(device)
        loss = loss_func(prediction, label)
        preds.append(prediction.detach())
        trues.append(label.detach())
        losses.append(loss.detach())

    losses = mean(stack(losses))
    preds = cat(preds)

    ## Converting raw scores to probabilities using Sigmoid:
    preds = sigmoid(preds)

    ## Converting probabilities to class labels:
    preds = logit2label(preds.detach(), cls_thresh=0.5)
    trues = cat(trues)
    if n_classes == 1:
        result_dict = calculate_performance_bin_sk(trues, preds)
    else:
        result_dict = calculate_performance(trues, preds)
    test_output = {'preds': preds, 'trues': trues, 'result': result_dict}
    # logger.info(dumps(result_dict, indent=4))

    return losses, test_output
def process(self):
    ## Load or create graphs, labels, local and global ids:
    logger.info("Load or create graphs, labels, local and global ids.")
    if exists(self.graph_path):
        self.graphs, self.instance_graph_local_node_ids, self.labels,\
            self.instance_graph_global_node_ids = self.load_instance_dgl(
                self.graph_path)
    else:
        self.graphs, self.instance_graph_local_node_ids, self.labels,\
            self.instance_graph_global_node_ids = self.create_instance_dgls(
                dataset=self.dataset, vocab=self.vocab,
                class_names=self.class_names)
        self.save_instance_dgl()
def freeze_weights(self, except_layers=('classifier',)):
    """ Freezes all model parameters except those whose name contains one of
    the given layer names.

    :param except_layers: Layer-name substrings to keep trainable.
    """
    for name, param in self.model.named_parameters():
        if not any(layer in name for layer in except_layers):
            ## Freeze everything except the specified layers:
            param.requires_grad = False
            logger.info(f'Froze layer: {name}')
def train_graph_classifier(
        model: GAT_Graph_Classifier,
        data_loader: torch.utils.data.dataloader.DataLoader,
        loss_func: torch.nn.modules.loss.BCEWithLogitsLoss,
        optimizer, epochs: int = 5,
        eval_data_loader: torch.utils.data.dataloader.DataLoader = None):
    train_epoch_losses = []
    train_epoch_dict = OrderedDict()
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        preds = []
        trues = []
        for batch_idx, (graph_batch, label) in enumerate(data_loader):
            ## Store emb in a separate file as self_loop removes emb info:
            emb = graph_batch.ndata['emb']
            # graph_batch = dgl.add_self_loop(graph_batch)
            prediction = model(graph_batch, emb)
            loss = loss_func(prediction, label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.detach().item()
            preds.append(prediction.detach())
            trues.append(label.detach())
        epoch_loss /= (batch_idx + 1)

        if eval_data_loader is not None:
            losses, test_output = test_graph_classifier(
                model, loss_func=loss_func, data_loader=eval_data_loader)
            logger.info(
                f"Epoch {epoch}, Train loss {epoch_loss}, Eval loss {losses},"
                f" Macro F1 {test_output['result']['f1']['macro'].item()}")
            # logger.info(dumps(test_output['result'], indent=4))
        else:
            logger.info(f"Epoch {epoch}, Train loss {epoch_loss}")

        train_epoch_losses.append(epoch_loss)
        preds = torch.cat(preds)

        ## Converting raw scores to probabilities using Sigmoid:
        preds = torch.sigmoid(preds)

        ## Converting probabilities to class labels:
        preds = logit2label(preds.detach(), cls_thresh=0.5)
        trues = torch.cat(trues)
        result_dict = calculate_performance(trues, preds)
        # logger.info(dumps(result_dict, indent=4))
        train_epoch_dict[epoch] = {
            'preds':  preds,
            'trues':  trues,
            'result': result_dict
        }
        # logger.info(f'Epoch {epoch} result: \n{result_dict}')

    return train_epoch_losses, train_epoch_dict
def load_dgl(graph_path, info_path=None):
    """ Loads saved dgl graphs, labels and other info.

    :param graph_path:
    :param info_path:
    :return:
    """
    # Load processed data from graph_path:
    logger.info(f'Loading graph data from: {graph_path}')
    graphs, label_dict = load_graphs(graph_path)
    labels = label_dict['labels']
    if info_path is not None:
        info = load_info(info_path)['info']
        return graphs, labels, info

    return graphs, labels
def main(): """ Main module to start code :param args: Type: tuple Required Read Only :return: """ import networkx as nx G_k = nx.path_graph(5) for i in range(5): G_k.nodes[i]['x'] = np.random.rand(7, ) output = GCN_forward(G_k_data) logger.info(output)
def calculate_performance_sk(true: (np.ndarray, torch.Tensor),
                             pred: (np.ndarray, torch.Tensor),
                             print_result=False) -> dict:
    """ Calculates multi-label performance metrics using scikit-learn.

    :param true: Multi-hot ground-truth labels.
    :param pred: Multi-hot predicted labels.
    :param print_result: If True, logs the scores as formatted json.
    """
    scores = {"accuracy": {}}
    scores["accuracy"]["unnormalize"] = int(accuracy_score(true, pred,
                                                           normalize=False))
    scores["accuracy"]["normalize"] = accuracy_score(true, pred,
                                                     normalize=True)

    scores["precision"] = {}
    scores["precision"]["classes"] = precision_score(true, pred,
                                                     average=None).tolist()
    scores["precision"]["weighted"] = precision_score(true, pred,
                                                      average='weighted')
    scores["precision"]["micro"] = precision_score(true, pred, average='micro')
    scores["precision"]["macro"] = precision_score(true, pred, average='macro')
    scores["precision"]["samples"] = precision_score(true, pred,
                                                     average='samples')

    scores["recall"] = {}
    scores["recall"]["classes"] = recall_score(true, pred,
                                               average=None).tolist()
    scores["recall"]["weighted"] = recall_score(true, pred, average='weighted')
    scores["recall"]["micro"] = recall_score(true, pred, average='micro')
    scores["recall"]["macro"] = recall_score(true, pred, average='macro')
    scores["recall"]["samples"] = recall_score(true, pred, average='samples')

    scores["f1"] = {}
    scores["f1"]["classes"] = f1_score(true, pred, average=None).tolist()
    scores["f1"]["weighted"] = f1_score(true, pred, average='weighted')
    scores["f1"]["micro"] = f1_score(true, pred, average='micro')
    scores["f1"]["macro"] = f1_score(true, pred, average='macro')
    scores["f1"]["samples"] = f1_score(true, pred, average='samples')

    if print_result:
        logger.info(dumps(scores, indent=4))

    return scores
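# Illustrative sketch (not part of the original module): toy multi-hot arrays,
# made up to show the expected input format of calculate_performance_sk.
def _demo_calculate_performance_sk():
    true = np.array([[0, 1, 0], [1, 0, 1], [1, 1, 0]])
    pred = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])
    return calculate_performance_sk(true, pred, print_result=True)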
def lpa_accuracy(preds_set: Tensor, labels: Tensor) -> dict:
    """ Per-class accuracy over the labelled (positive) entries only. """
    preds_set = preds_set.numpy()
    labels = labels.numpy()
    result = {}
    for cls in range(preds_set.shape[1]):
        logger.info(f'Calculating accuracy for class: [{cls}]')
        ## Mask selects rows where this class is labelled positive:
        test1 = np.ma.masked_where(labels[:, cls] > 0, labels[:, cls])
        correct = (
                labels[:, cls][test1.mask] == preds_set[:, cls][test1.mask]).sum()
        total = labels[test1.mask].shape[0]
        result[cls] = (correct, total, correct / total)
        logger.info(f'Accuracy class: [{correct / total, correct, total}]')
    return result
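# Illustrative sketch (not part of the original module): tiny made-up tensors,
# chosen so that every class has at least one positive label (lpa_accuracy
# divides by the number of positives per class).
def _demo_lpa_accuracy():
    import torch
    preds = torch.tensor([[1., 0.], [1., 1.], [0., 1.]])
    trues = torch.tensor([[1., 0.], [0., 1.], [0., 1.]])
    # Class 0 has one positive (row 0, predicted correctly); class 1 has two
    # positives (rows 1 and 2, both predicted correctly).
    return lpa_accuracy(preds, trues)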
def save_dgl(graphs, labels, graph_path, info=None, info_path=None):
    """ Saves dgl graphs, labels and other info.

    :param graphs: List of dgl graphs.
    :param labels: Label tensor, saved alongside the graphs.
    :param graph_path: Path of the output graph file.
    :param info: Optional extra information to save.
    :param info_path: Path of the output info file.
    """
    # Save graphs and labels:
    logger.info(f'Saving graph data: {graph_path}')
    save_graphs(graph_path, graphs, {'labels': labels})

    # Save other information in python dict:
    if info_path is not None:
        save_info(info_path, {'info': info})
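# Illustrative round-trip sketch (not part of the original module): builds two
# toy DGL graphs, saves them with save_dgl and reads them back with load_dgl.
# The file name 'toy_graphs.bin' is made up; assumes a DGL version providing
# dgl.graph().
def _demo_save_load_dgl():
    import dgl
    import torch
    g1 = dgl.graph(([0, 1], [1, 2]), num_nodes=3)
    g2 = dgl.graph(([0, 0], [1, 2]), num_nodes=3)
    labels = torch.tensor([[0., 1.], [1., 0.]])
    save_dgl([g1, g2], labels, 'toy_graphs.bin')
    graphs, loaded_labels = load_dgl('toy_graphs.bin')
    logger.info(f'Loaded [{len(graphs)}] graphs with labels of shape '
                f'[{loaded_labels.shape}]')
    return graphs, loaded_labels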
def propagate_multilabels(features: Tensor, labels: np.ndarray) -> np.ndarray:
    """ Propagates labels for each class separately and combines the
    under-sampled runs by majority voting.

    :param features: Features used for propagation.
    :param labels: Per-class list of under-sampled label sets.
    :return: Propagated labels, one column per class.
    """
    all_preds = []
    for i, labels_cls in enumerate(labels):
        logger.info(f'Propagating labels for class [{i}].')
        preds = []
        for under_set in labels_cls:
            pred = propagate_labels(features, np.stack(under_set))
            preds.append(pred)
        voted_preds = majority_voting(preds)
        all_preds.append(voted_preds)

    return np.stack(all_preds).T
def labels_mapper(df, class_maps: dict = None):
    """ Maps FIRE16 dataset labels to SMERP17 labels.

    -1 denotes the column to be deleted. Other columns are merged using
    logical OR.

    FIRE16 -> SMERP17
       0   ->   0
       1   ->   1
       2   ->   0
       3   ->   1
       4   ->  -1
       5   ->   3
       6   ->   2

    :param df: DataFrame with FIRE16 label columns.
    :param class_maps: FIRE16 class -> SMERP17 class mapping.
    """
    if class_maps is None:
        class_maps = {
            2: 0,
            3: 1,
            # 4: -1,
            5: 3,
            6: 2
        }

    logger.info(f'Mapping classes: [{class_maps}]')
    new_cols = sorted(list(class_maps.values()))
    df2 = pd.DataFrame(columns=new_cols)
    # df2 = df[df.columns.difference(new_cols)]
    # df2['text'] = df['text']
    df2.insert(loc=0, column='text', value=df['text'])
    # df2 = df[not new_cols]
    # for col in df.columns:
    for cls, mapped_cls in class_maps.items():
        df2[mapped_cls] = np.logical_or(df[cls], df[mapped_cls]) * 1
        # if mapped_cls == -1:  ## Delete column
        #     del df[cls]
        # else:  ## Merge columns using OR:
        #     df[mapped_cls] = np.logical_or(df[cls], df[mapped_cls]) * 1
        #     del df[cls]

    # df2.index = df.index
    return df2
def label_propagation(adj: sparse, Y: list, labelled_masks: tuple,
                      lpa_epoch: int = 1) -> Tensor:
    logger.info("Applying Label Propagation")
    lpa = Adj_Propagator()
    Y = tensor(Y)
    for i in range(lpa_epoch):
        Y_hat = lpa(adj, Y)
        mse = lpa_mse(Y_hat[labelled_masks[0]], Y[labelled_masks[0]])
        # lpa_accuracy(Y_hat[labelled_masks[1]], Y[labelled_masks[1]])
        logger.info(f'Label Propagation epoch {i} MSE: {mse}')

    labelled_mask = [
        i or j for i, j in zip(labelled_masks[0], labelled_masks[1])
    ]
    Y_hat[labelled_mask] = Y[labelled_mask]

    return Y_hat
def read_json(file_path: str = join(dataset_dir, 'acronym'),
              convert_ordereddict=True) -> OrderedDict:
    """ Reads json file as OrderedDict.

    :param convert_ordereddict:
    :param file_path:
    :return:
    """
    file_path = Path(file_path + ".json")
    # file_path = Path(file_path)
    logger.info(f"Reading json file [{file_path}].")
    if file_path.exists():
        with open(file_path, "r", encoding="utf-8") as file:
            data = load(file)
        if convert_ordereddict:
            data = OrderedDict(data)
        return data
    else:
        raise FileNotFoundError("File [{}] not found.".format(file_path))
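# Illustrative round-trip sketch (not part of the original module): writes a
# toy json file and reads it back with read_json. 'toy_acronyms' is a made-up
# file name under dataset_dir.
def _demo_read_json():
    from json import dump
    toy_path = join(dataset_dir, 'toy_acronyms')
    with open(toy_path + '.json', 'w', encoding='utf-8') as f:
        dump({'nlp': 'natural language processing'}, f)
    data = read_json(toy_path)
    logger.info(data)
    return data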
def majority_voting(preds_set):
    logger.info("Taking majority voting.")
    if isinstance(preds_set, list):
        majority_count = (len(preds_set) // 2) + 1
    elif isinstance(preds_set, np.ndarray):
        majority_count = (preds_set.shape[0] // 2) + 1
    else:
        raise NotImplementedError(f"datatype {type(preds_set)} not supported.")
    pred_major = []
    for pred in preds_set:
        pred_discreet = np.argmax(pred, axis=1)
        pred_major.append(pred_discreet)

    pred_major = np.sum(pred_major, axis=0)
    pred_major[(pred_major < majority_count)] = 0.
    pred_major[(pred_major >= majority_count)] = 1.

    return pred_major
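# Illustrative sketch (not part of the original module): three toy binary
# predictors (probabilities over 2 classes), made up to show the expected
# input of majority_voting.
def _demo_majority_voting():
    preds_set = [
        np.array([[0.9, 0.1], [0.2, 0.8]]),   # predictor 1 votes: 0, 1
        np.array([[0.6, 0.4], [0.3, 0.7]]),   # predictor 2 votes: 0, 1
        np.array([[0.4, 0.6], [0.9, 0.1]]),   # predictor 3 votes: 1, 0
    ]
    # Column-wise vote counts are [1, 2]; with majority_count = 2 the voted
    # labels come out as [0, 1].
    return majority_voting(preds_set)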
def load_instance_dgl(self, graph_path, infopath=None):
    """ Loads instance graphs.

    :param graph_path:
    :param infopath:
    :return:
    """
    if graph_path is None:
        graph_path = self.graph_path
    logger.info(
        f'Loading graphs from {join(self.data_dir, self.dataset_name)}')
    self.instance_graph_global_node_ids = load_pickle(
        self.dataset_name + '_instance_graph_global_node_ids',
        filepath=self.data_dir)
    self.instance_graph_local_node_ids = load_pickle(
        self.dataset_name + 'instance_graph_local_node_ids',
        filepath=self.data_dir)
    self.graphs, self.labels = load_dgl(graph_path, infopath)
    return self.graphs, self.instance_graph_local_node_ids,\
        self.labels, self.instance_graph_global_node_ids
def main(): """ Main module to start code :param args: Type: tuple Required Read Only :return: """ true = torch.tensor([[0, 1], [1, 0]]) pred = torch.tensor([[0, 1], [0, 0]]) pl_dict = calculate_performance_pl(true, pred) logger.info(pl_dict) pl_sk_dict = calculate_performance_sk(true, pred) logger.info(pl_sk_dict) sk_dict = calculate_performance_sk(true.numpy(), pred.numpy()) logger.info(sk_dict) true = np.array([[0, 1, 0, 0], [0, 1, 1, 0], [1, 0, 1, 0]]) pred = np.array([[0, 1, 0, 1], [0, 1, 0, 0], [0, 0, 1, 0]]) true = torch.from_numpy(true) pred = torch.from_numpy(pred) precision_k_hot(true, pred)
def read_csv(data_dir=dataset_dir, data_file='fire16_labeled_train',
             index_col=0, header=0):
    """ Reads csv file as DF.

    :param header:
    :param index_col:
    :param data_dir:
    :param data_file:
    :return:
    """
    data_dir = Path(data_dir)
    if not data_dir.exists():
        raise FileNotFoundError("Directory [{}] not found.".format(data_dir))
    data_file = data_dir / (data_file + '.csv')
    logger.info(f"Reading csv file from [{data_file}]")
    if not data_file.exists():
        raise FileNotFoundError("File [{}] not found.".format(data_file))
    df = pd.read_csv(data_file, index_col=index_col, header=header,
                     encoding='utf-8', engine='python')
    df = df.sample(frac=1.)
    logger.info("Dataset size: [{}]".format(df.shape))
    logger.info("Few dataset samples: \n[{}]".format(df.head()))
    return df
def plot_occurance(losses: list, title="Losses", ylabel="Loss", xlabel="Epoch",
                   clear=True, log_scale=False, plot_name=None, plot_dir="",
                   show_plot=False):
    """ Plots the given values (e.g. validation loss) against epochs.

    :param losses: Values to plot, one per epoch.
    :param title: Plot title.
    :param ylabel: Y-axis label.
    :param xlabel: X-axis label.
    :param clear: Clear the axes after saving.
    :param log_scale: Use log scale for the y-axis.
    :param plot_name: Output file name; derived from title and labels if None.
    :param plot_dir: Output directory.
    :param show_plot: Show the plot interactively.
    """
    ## Turn interactive plotting off:
    plt.ioff()
    fig = plt.figure()
    plt.plot(losses)
    plt.xlabel(xlabel)
    if log_scale:
        plt.yscale('log')
    plt.ylabel(ylabel)
    plt.title(title)
    if plot_name is None:
        plot_name = title + "_" + ylabel + "_" + xlabel + ".jpg"
    plt.savefig(join(plot_dir, plot_name))
    logger.info(f"Saved plot with title [{title}] and ylabel [{ylabel}] and "
                f"xlabel [{xlabel}] at [{join(plot_dir, plot_name)}].")
    if clear:
        plt.cla()
    if show_plot:
        plt.show()
    plt.close(fig)  # Closing the figure so it won't get displayed in console.
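# Illustrative sketch (not part of the original module): saves a made-up loss
# curve as 'Toy training loss_Loss_Epoch.jpg' in the current directory.
def _demo_plot_occurance():
    plot_occurance([0.92, 0.71, 0.55, 0.46, 0.41], title="Toy training loss")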
def node_binary_classification(hid_feats: int = 4, out_feats: int = 7,
                               num_heads: int = 2) -> None:
    """ Trains a GAT node classifier on the Cora citation graph.

    :param hid_feats:
    :param out_feats:
    :param num_heads:
    :return:
    """
    from dgl.data import citation_graph as citegrh

    def load_cora_data():
        data = citegrh.load_cora()
        features = torch.FloatTensor(data.features)
        labels = torch.LongTensor(data.labels)
        mask = torch.BoolTensor(data.train_mask)
        g = DGLGraph(data.graph)
        return g, features, labels, mask

    g, features, labels, mask = load_cora_data()
    net = GAT_Node_Classifier(in_dim=features.size(1), hidden_dim=hid_feats,
                              out_dim=out_feats, num_heads=num_heads)
    logger.info(net)
    loss_func = F.nll_loss
    optimizer = optim.Adam(net.parameters(), lr=1e-3)
    train_node_classifier(g, features, labels, labelled_mask=mask, model=net,
                          loss_func=loss_func, optimizer=optimizer, epochs=5)
def discretize_labelled(labelled_dict: dict, thresh1=0.1, k=2, label_neg=0.,
                        label_pos=1.):
    """ Discretize probabilities of labelled tokens.

    :param labelled_dict: token -> np.array(probability vector)
    :param thresh1: Probabilities above this threshold become positive labels.
    :param k: Number of top classes to keep when several exceed the threshold.
    :param label_neg: Value to assign when no class should be assigned [0., -1.]
    :param label_pos: Value to assign to the selected classes.
    """
    logger.info(f'Discretizing label vector with threshold [{thresh1}].')
    labelled_vecs = np.array(list(labelled_dict.values()))

    ## Value greater than threshold:
    labelled_vecs[(labelled_vecs > thresh1)] = 1.
    # labelled_vecs[(thresh2 < labelled_vecs) & (labelled_vecs <= thresh1)] = 1.

    discretized_dict = {}
    ## Top k values of each row:
    for token, vec in zip(labelled_dict.keys(), labelled_vecs):
        row_sum = vec.sum()
        if 0.5 < row_sum <= 1.:  ## for non -1 rows only
            row_idx = np.argpartition(-vec, k)
            vec[row_idx[:k]] = label_pos
            vec[row_idx[k:]] = label_neg
        elif 0. <= row_sum <= 0.5:
            row_idx = np.argmax(vec)
            vec[row_idx] = label_pos
            vec[(vec != 1.0)] = label_neg
        elif row_sum > 1.:
            vec[(vec < 1.0)] = label_neg
        discretized_dict[token] = vec

    return discretized_dict
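# Illustrative sketch (not part of the original module): a made-up token ->
# probability-vector dict showing how discretize_labelled binarises the
# vectors with the default threshold of 0.1 and k=2.
def _demo_discretize_labelled():
    toy_labelled = {
        'flood': np.array([0.7, 0.2, 0.05, 0.05]),    # two classes above 0.1
        'water': np.array([0.08, 0.05, 0.04, 0.02]),  # all below 0.1
    }
    discretized = discretize_labelled(toy_labelled)
    # 'flood' -> [1., 1., 0., 0.] (both above-threshold classes kept);
    # 'water' -> [1., 0., 0., 0.] (argmax only).
    logger.info(discretized)
    return discretized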
def undersample_major_class(X: np.ndarray, Y: np.ndarray, k=3):
    """ Undersamples the majority class k times.

    :param X:
    :param Y:
    :param k:
    :return:
    """
    logger.info(f'Undersampling the majority class [{k}] times.')
    under_sampler = RandomUnderSampler()
    k_undersampled_list = []
    for i in range(k):
        X_resampled, Y_resampled = under_sampler.fit_resample(X, Y)
        X_resampled, Y_resampled = unison_shuffled_copies(
            X_resampled, Y_resampled)
        undersampled_dict = {}
        for x, y in zip(X_resampled, Y_resampled):
            x = str(x[0])
            undersampled_dict[x] = y
        k_undersampled_list.append(undersampled_dict)

    return k_undersampled_list
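# Illustrative sketch (not part of the original module): made-up tokens with
# an imbalanced binary labelling, showing the expected (n, 1)-shaped X.
# Relies on unison_shuffled_copies from this module.
def _demo_undersample_major_class():
    X = np.array(['flood', 'rescue', 'water', 'need', 'help', 'damage']).reshape(-1, 1)
    Y = np.array([1, 0, 0, 0, 0, 1])
    # Each returned dict maps token -> label and is balanced (2 vs 2 here).
    return undersample_major_class(X, Y, k=2)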
def split_data(self, df, train_size=0.8):
    train_df = df.sample(frac=train_size, random_state=self.seed)
    test_df = df.drop(train_df.index).reset_index(drop=True)
    train_df = train_df.reset_index(drop=True)

    logger.info("FULL Dataset: {}".format(df.shape))
    logger.info("TRAIN Dataset: {}".format(train_df.shape))
    logger.info("TEST Dataset: {}".format(test_df.shape))

    return train_df, test_df
def train(self):
    self.model.to(device)
    self.model.train()
    epoch_loss = 0
    stores = {'preds': [], 'trues': [], 'ids': []}
    for batch_idx, data in enumerate(self.train_loader, 0):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        ## Set pool_output = True for classification:
        outputs = self.model(ids, mask, token_type_ids=token_type_ids,
                             pool_output=True)

        loss = self.loss_fn(outputs[0], targets)
        stores['preds'].append(outputs[0])
        stores['trues'].append(targets)
        if batch_idx % 20 == 0:
            logger.info(f'Batch: {batch_idx}, Loss: {loss.item()}')

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        epoch_loss += loss.item()

    stores['preds'] = torch.cat(stores['preds'])
    stores['trues'] = torch.cat(stores['trues'])
    # stores['ids'] = torch.cat(stores['ids'])

    return epoch_loss / len(self.train_loader), stores  # , epoch_acc / len(iterator)