def build_corpus(df, txts: list = None, corpus: list = None):
    """Builds the corpus (flat list of tokens) and the vocabulary with
    occurrence counts (Counter mapping token -> frequency).

    :param df: DataFrame with a `text` column.
    :param txts: Iterable of tokenized texts (each a list of tokens).
    :param corpus: Existing corpus to extend; a new list is created if None.
    :return: (corpus, vocab_freq)
    """
    if corpus is None:
        corpus = []

    ## The vectorizer output is only logged; the corpus itself is built from
    ## the pre-tokenized `txts`:
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(df.text)
    logger.info(X.shape)
    logger.info(vectorizer.get_feature_names())

    if txts is not None:
        for txt in txts:
            for token in txt:
                corpus.append(token)

    vocab_freq = Counter(corpus)

    return corpus, vocab_freq
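A minimal usage sketch for build_corpus (hypothetical toy data; assumes pandas is available as pd and the imports used above are in scope):

def demo_build_corpus():
    import pandas as pd
    df = pd.DataFrame({'text': ['flood in chennai', 'need water and food']})
    txts = [['flood', 'in', 'chennai'], ['need', 'water', 'and', 'food']]
    corpus, vocab_freq = build_corpus(df, txts)
    # corpus is the flat token list; vocab_freq is a Counter,
    # e.g. vocab_freq['flood'] == 1
    return corpus, vocab_freq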
def get_supervised_result(model,
                          train_iterator,
                          val_iterator,
                          test_iterator,
                          EPOCHS=5,
                          cls_thresh=None,
                          n_classes=cfg['data']['num_classes']):
    """ Train and Predict on full supervised mode.

    Returns:

    """

    model_best, val_preds_trues_best, val_preds_trues_all, losses = trainer(
        model, train_iterator, val_iterator, N_EPOCHS=EPOCHS)

    # logger.debug(losses)

    # evaluate the model
    test_loss, test_preds_trues = predict_with_label(model_best, test_iterator)

    if cls_thresh is None:
        cls_thresh = [0.5] * n_classes

    predicted_labels = logit2label(DataFrame(
        test_preds_trues['preds'].numpy()),
                                   cls_thresh,
                                   drop_irrelevant=False)

    result = calculate_performance_pl(test_preds_trues['trues'],
                                      predicted_labels)

    logger.info("Supervised result: {}".format(dumps(result, indent=4)))
    return result, model_best
def read_labelled_json(data_dir=dataset_dir, filename=cfg['data']['test'],
                       data_keys=['text', 'classes'], data_set='train',
                       # rename_cols={"parsed_tweet": "text"},
                       ):
    """ Reads json data and converts to DataFrame.

    Default reads labelled target data.

    Args:
        data_dir:
        filename:
        data_keys:
        data_set:
        rename_cols:

    Returns:

    """
    data_df = json_keys2df(data_keys, json_filename=filename,
                           dataset_dir=data_dir)
    logger.info(data_df.head())
    # data_df = data_df.rename(columns=rename_cols)

    if data_set == 'train':
        y_hot = mlb.fit_transform(data_df.classes.to_list())
    else:
        y_hot = mlb.transform(data_df.classes.to_list())

    for i in range(y_hot.shape[1]):
        data_df[i] = y_hot[:, i:i + 1]

    data_df = data_df.drop(columns=['classes'])

    return data_df
def train_node_classifier(g: DGLGraph,
                          features: torch.Tensor,
                          labels: torch.Tensor,
                          labelled_mask: torch.Tensor,
                          model: GAT_Node_Classifier,
                          loss_func,
                          optimizer,
                          epochs: int = 5) -> None:
    """

    :param g:
    :param features:
    :param labels:
    :param labelled_mask:
    :param model:
    :param loss_func:
    :param optimizer:
    :param epochs:
    """
    model.train()
    dur = []
    for epoch in range(epochs):
        t0 = time.time()
        logits = model(g, features)
        logp = F.log_softmax(logits, 1)
        loss = loss_func(logp[labelled_mask], labels[labelled_mask])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        dur.append(time.time() - t0)

        logger.info("Epoch {:05d} | Loss {:.4f} | Time(s) {:.4f}".format(
            epoch, loss.item(), np.mean(dur)))
def graph_multiclass_classification(in_feats: int = 1,
                                    hid_feats: int = 4,
                                    num_heads: int = 2) -> None:
    from dgl.data import MiniGCDataset

    # Create training and test sets.
    trainset = MiniGCDataset(320, 10, 20)
    testset = MiniGCDataset(80, 10, 20)

    # Use PyTorch's DataLoader and the collate function defined before.
    data_loader = DataLoader(trainset,
                             batch_size=8,
                             shuffle=True,
                             collate_fn=batch_graphs)

    # Create model
    model = GAT_Graph_Classifier(in_feats,
                                 hid_feats,
                                 num_heads=num_heads,
                                 out_dim=trainset.num_classes)
    logger.info(model)

    loss_func = torch.nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    epoch_losses, epoch_predictions_dict = train_graph_classifier(
        model, data_loader, loss_func=loss_func, optimizer=optimizer, epochs=5)
def graph_multilabel_classification(gdh,
                                    in_feats: int = 100,
                                    hid_feats: int = 50,
                                    num_heads: int = 2,
                                    epochs=cfg['training']['num_epoch']):
    model = GAT_Graph_Classifier(in_feats,
                                 hid_feats,
                                 num_heads=num_heads,
                                 out_dim=gdh.num_classes)
    logger.info(model)

    loss_func = torch.nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(),
                           lr=cfg["model"]["optimizer"]["lr"])

    epoch_losses, train_epochs_output_dict = train_graph_classifier(
        model,
        gdh.train_dataloader(),
        loss_func=loss_func,
        optimizer=optimizer,
        epochs=epochs,
        eval_data_loader=gdh.test_dataloader())

    losses, test_output = test_graph_classifier(
        model, loss_func=loss_func, data_loader=gdh.test_dataloader())
    logger.info(dumps(test_output['result'], indent=4))

    return train_epochs_output_dict, test_output
    def load_graph(self, graph_path):
        if graph_path is None:
            graph_path = self.graph_path

        # load processed data from directory graph_path
        logger.info(f'Loading graph from [{graph_path}]')
        self.G = read_gpickle(graph_path)
        return self.G
def eval_graph_classifier(model: GAT_GCN_Classifier,
                          G,
                          X,
                          loss_func,
                          data_loader: utils.data.dataloader.DataLoader,
                          n_classes=cfg['data']['num_classes'],
                          save_gcn_embs=False):
    model.eval()
    preds = []
    trues = []
    losses = []
    for iter, (graph_batch, local_ids, label, global_ids,
               node_counts) in enumerate(data_loader):
        ## Store emb in a separate file as self_loop removes emb info:
        emb = graph_batch.ndata['emb']
        # graph_batch = dgl.add_self_loop(graph_batch)
        if cfg['model']['use_cuda'][plat][user] and cuda.is_available():
            graph_batch = graph_batch.to(device)
            emb = emb.to(device)
            # local_ids = local_ids.to(device)
            # node_counts = node_counts.to(device)
            # global_ids = global_ids.to(device)
            G = G.to(device)
            X = X.to(device)
        if save_gcn_embs:
            save(X, 'X_glove.pt')
        start_time = timeit.default_timer()
        prediction = model(graph_batch, emb, local_ids, node_counts,
                           global_ids, G, X, save_gcn_embs)
        test_time = timeit.default_timer() - start_time
        test_count = label.shape[0]
        logger.info(f"Test time per example: [{test_time / test_count} sec]")
        if prediction.dim() == 1:
            prediction = prediction.unsqueeze(1)
        if cfg['model']['use_cuda'][plat][user] and cuda.is_available():
            prediction = prediction.to(device)
        loss = loss_func(prediction, label)
        preds.append(prediction.detach())
        trues.append(label.detach())
        losses.append(loss.detach())
    losses = mean(stack(losses))
    preds = cat(preds)

    ## Converting raw scores to probabilities using Sigmoid:
    preds = sigmoid(preds)

    ## Converting probabilities to class labels:
    preds = logit2label(preds.detach(), cls_thresh=0.5)
    trues = cat(trues)
    if n_classes == 1:
        result_dict = calculate_performance_bin_sk(trues, preds)
    else:
        result_dict = calculate_performance(trues, preds)
    test_output = {'preds': preds, 'trues': trues, 'result': result_dict}
    # logger.info(dumps(result_dict, indent=4))

    return losses, test_output
    def process(self):
        ## Load or create graphs, labels, local and global ids:
        logger.info("Load or create graphs, labels, local and global ids.")
        if exists(self.graph_path):
            self.graphs, self.instance_graph_local_node_ids, self.labels, \
                self.instance_graph_global_node_ids = self.load_instance_dgl(
                    self.graph_path)
        else:
            self.graphs, self.instance_graph_local_node_ids, self.labels, \
                self.instance_graph_global_node_ids = self.create_instance_dgls(
                    dataset=self.dataset, vocab=self.vocab,
                    class_names=self.class_names)
            self.save_instance_dgl()
    def freeze_weights(self, except_layers=('classifier',)):
        """ Freezes all model parameters except the named layers.

        :param except_layers: Parameter names to keep trainable.
        """
        for name, param in self.model.named_parameters():
            if name not in except_layers:  ## Freeze everything except the listed layers
                param.requires_grad = False
                logger.info(f'Froze layer: {name}')
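For reference, a standalone sketch of the same freezing logic on a plain torch module (hypothetical helper, not part of the wrapper class; a substring check is the usual alternative when parameter names are dotted like 'classifier.weight'):

def freeze_all_but(model, except_layers=('classifier',)):
    ## Mirrors freeze_weights: parameters whose exact name is not listed get frozen:
    for name, param in model.named_parameters():
        if name not in except_layers:
            param.requires_grad = False
    ## Return the number of parameters that remain trainable:
    return sum(p.numel() for p in model.parameters() if p.requires_grad)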
def train_graph_classifier(
        model: GAT_Graph_Classifier,
        data_loader: torch.utils.data.dataloader.DataLoader,
        loss_func: torch.nn.modules.loss.BCEWithLogitsLoss,
        optimizer,
        epochs: int = 5,
        eval_data_loader: torch.utils.data.dataloader.DataLoader = None):
    train_epoch_losses = []
    train_epoch_dict = OrderedDict()
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        preds = []
        trues = []
        for iter, (graph_batch, label) in enumerate(data_loader):
            ## Store emb in a separate file as self_loop removes emb info:
            emb = graph_batch.ndata['emb']
            # graph_batch = dgl.add_self_loop(graph_batch)
            prediction = model(graph_batch, emb)
            loss = loss_func(prediction, label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.detach().item()
            preds.append(prediction.detach())
            trues.append(label.detach())
        epoch_loss /= (iter + 1)
        losses, test_output = test_graph_classifier(
            model, loss_func=loss_func, data_loader=eval_data_loader)
        logger.info(
            f"Epoch {epoch}, Train loss {epoch_loss}, Eval loss {losses},"
            f" Macro F1 {test_output['result']['f1']['macro'].item()}")
        # logger.info(dumps(test_output['result'], indent=4))
        train_epoch_losses.append(epoch_loss)
        preds = torch.cat(preds)

        ## Converting raw scores to probabilities using Sigmoid:
        preds = torch.sigmoid(preds)

        ## Converting probabilities to class labels:
        preds = logit2label(preds.detach(), cls_thresh=0.5)
        trues = torch.cat(trues)
        result_dict = calculate_performance(trues, preds)
        # logger.info(dumps(result_dict, indent=4))
        train_epoch_dict[epoch] = {
            'preds': preds,
            'trues': trues,
            'result': result_dict
        }
        # logger.info(f'Epoch {epoch} result: \n{result_dict}')

    return train_epoch_losses, train_epoch_dict
def load_dgl(graph_path, info_path=None):
    """ Loads saved dgl graphs, labels and other info.

    :param graph_path:
    :param info_path:
    :return:
    """
    # load processed data from directory graph_path
    logger.info(f'Loading graph data from: {graph_path}')
    graphs, label_dict = load_graphs(graph_path)
    labels = label_dict['labels']
    if info_path is not None:
        info = load_info(info_path)['info']
        return graphs, labels, info
    return graphs, labels
def main():
    """
    Main module to start code
    :param args:
        Type: tuple
        Required
        Read Only
    :return:
    """
    import networkx as nx

    G_k = nx.path_graph(5)
    for i in range(5):
        G_k.nodes[i]['x'] = np.random.rand(7, )
    output = GCN_forward(G_k)
    logger.info(output)
def calculate_performance_sk(true: (np.ndarray, torch.tensor),
                             pred: (np.ndarray, torch.tensor),
                             print_result=False) -> dict:
    """

    :param pred: Multi-hot
    :param true: Multi-hot
    :param print_result:

    """
    scores = {"accuracy": {}}
    scores["accuracy"]["unnormalize"] = accuracy_score(true, pred)
    scores["accuracy"]["normalize"] = accuracy_score(true,
                                                     pred,
                                                     normalize=True)

    scores["precision"] = {}
    scores["precision"]["classes"] = precision_score(true, pred,
                                                     average=None).tolist()
    scores["precision"]["weighted"] = precision_score(true,
                                                      pred,
                                                      average='weighted')
    scores["precision"]["micro"] = precision_score(true, pred, average='micro')
    scores["precision"]["macro"] = precision_score(true, pred, average='macro')
    scores["precision"]["samples"] = precision_score(true,
                                                     pred,
                                                     average='samples')

    scores["recall"] = {}
    scores["recall"]["classes"] = recall_score(true, pred,
                                               average=None).tolist()
    scores["recall"]["weighted"] = recall_score(true, pred, average='weighted')
    scores["recall"]["micro"] = recall_score(true, pred, average='micro')
    scores["recall"]["macro"] = recall_score(true, pred, average='macro')
    scores["recall"]["samples"] = recall_score(true, pred, average='samples')

    scores["f1"] = {}
    scores["f1"]["classes"] = f1_score(true, pred, average=None).tolist()
    scores["f1"]["weighted"] = f1_score(true, pred, average='weighted')
    scores["f1"]["micro"] = f1_score(true, pred, average='micro')
    scores["f1"]["macro"] = f1_score(true, pred, average='macro')
    scores["f1"]["samples"] = f1_score(true, pred, average='samples')

    if print_result:
        logger.info(dumps(scores, indent=4))

    return scores
def lpa_accuracy(preds_set: torch.Tensor, labels: torch.Tensor) -> dict:
    preds_set = preds_set.numpy()
    labels = labels.numpy()
    # pred_argmax = []
    result = {}
    for cls in range(preds_set.shape[1]):
        # pred_argmax.append(np.argmax(pred, axis=1))
        logger.info(f'Calculating accuracy for class: [{cls}]')
        test1 = np.ma.masked_where(labels[:, cls] > 0, labels[:, cls])
        correct = (
            labels[:, cls][test1.mask] == preds_set[:, cls][test1.mask]).sum()
        total = labels[test1.mask].shape[0]

        result[cls] = (correct, total, correct / total)
        logger.info(f'Accuracy class: [{correct / total, correct, total}]')

    return result
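A toy check of lpa_accuracy (hypothetical tensors; per class, only the rows whose true label is positive are scored):

def demo_lpa_accuracy():
    import torch
    labels = torch.tensor([[1, 0], [1, 1], [0, 1]])
    preds = torch.tensor([[1, 0], [0, 1], [0, 1]])
    # Class 0: rows 0 and 1 are positive, only row 0 is predicted correctly -> (1, 2, 0.5)
    # Class 1: rows 1 and 2 are positive, both predicted correctly -> (2, 2, 1.0)
    return lpa_accuracy(preds, labels)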
def save_dgl(graphs, labels, graph_path, info=None, info_path=None):
    """ Saves dgl graphs, labels and other info.

    :param instance_graph_global_node_ids:
    :param info:
    :param graphs:
    :param labels:
    :param graph_path:
    :param num_classes:
    :param info_path:
    """
    # save graphs and labels
    logger.info(f'Saving graph data: {graph_path}')
    save_graphs(graph_path, graphs, {'labels': labels})
    # save other information in python dict
    if info_path is not None:
        save_info(info_path, {'info': info})
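A round-trip sketch using save_dgl and load_dgl together (hypothetical paths and a toy graph; assumes dgl and torch are importable):

def demo_dgl_roundtrip(graph_path='toy_graphs.bin', info_path='toy_info.pkl'):
    import dgl
    import torch
    g = dgl.graph(([0, 1, 2], [1, 2, 0]))  ## toy 3-node cycle graph
    labels = torch.tensor([[1., 0.], [0., 1.]])  ## one label row per graph
    save_dgl([g, g], labels, graph_path, info={'num_classes': 2},
             info_path=info_path)
    graphs, labels, info = load_dgl(graph_path, info_path=info_path)
    return graphs, labels, info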
def propagate_multilabels(features: Tensor, labels: np.ndarray) -> np.ndarray:
    """

    :param features:
    :param labels:
    :return:
    """
    all_preds = []
    for i, labels_cls in enumerate(labels):
        logger.info(f'Propagating labels for class [{i}].')
        preds = []
        for under_set in labels_cls:
            pred = propagate_labels(features, np.stack(under_set))
            preds.append(pred)
        voted_preds = majority_voting(preds)
        all_preds.append(voted_preds)

    return np.stack(all_preds).T
def labels_mapper(df, class_maps: dict = None):
    """ Maps FIRE16 dataset labels to SMERP17 labels.

    -1 denotes the column to be deleted.
    Other columns are merged using logical or.

    :param class_maps:
        FIRE16 -> SMERP17
            0       0
            1       1
            2       0
            3       1
            4       -1
            5       3
            6       2
    :param df:
    """
    if class_maps is None:
        class_maps = {
            2: 0,
            3: 1,
            # 4: -1,
            5: 3,
            6: 2
        }
    logger.info(f'Mapping classes: [{class_maps}]')
    new_cols = sorted(list(class_maps.values()))
    df2 = pd.DataFrame(columns=new_cols)
    # df2 = df[df.columns.difference(new_cols)]
    # df2['text'] = df['text']
    df2.insert(loc=0, column='text', value=df['text'])
    # df2 = df[not new_cols]
    # for col in df.columns:
    for cls, mapped_cls in class_maps.items():
        df2[mapped_cls] = np.logical_or(df[cls], df[mapped_cls]) * 1
        # if mapped_cls == -1:  ## Delete column
        #     del df[cls]
        # else:  ## Merge columns using OR:
        #     df[mapped_cls] = np.logical_or(df[cls], df[mapped_cls]) * 1
        #     del df[cls]

    # df2.index = df.index
    return df2
def label_propagation(adj: sparse,
                      Y: list,
                      labelled_masks: tuple,
                      lpa_epoch: int = 1) -> Tensor:
    logger.info("Applying Label Propagation")

    lpa = Adj_Propagator()
    Y = tensor(Y)
    for i in range(lpa_epoch):
        Y_hat = lpa(adj, Y)
        mse = lpa_mse(Y_hat[labelled_masks[0]], Y[labelled_masks[0]])
        # lpa_accuracy(Y_hat[labelled_masks[1]], Y[labelled_masks[1]])
        logger.info(f'Label Propagation epoch {i} MSE: {mse}')
        labelled_mask = [
            i or j for i, j in zip(labelled_masks[0], labelled_masks[1])
        ]
        Y_hat[labelled_mask] = Y[labelled_mask]

    return Y_hat
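For intuition, a minimal sketch of a single propagation step, assuming Adj_Propagator amounts to a row-normalised adjacency multiply (an assumption; the actual module is defined elsewhere):

def propagate_step(adj, Y):
    ## Row-normalise the (dense) adjacency and spread the current label estimates:
    deg = adj.sum(dim=1, keepdim=True).clamp(min=1.)
    return (adj / deg) @ Y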
def read_json(file_path: str = join(dataset_dir, 'acronym'),
              convert_ordereddict=True) -> OrderedDict:
    """ Reads json file as OrderedDict.

    :param convert_ordereddict:
    :param file_path:
    :return:
    """
    file_path = Path(file_path + ".json")
    # file_path = Path(file_path)
    logger.info(f"Reading json file [{file_path}].")

    if file_path.exists():
        with open(file_path, "r", encoding="utf-8") as file:
            data = load(file)
            if convert_ordereddict:
                data = OrderedDict(data)
        return data
    else:
        raise FileNotFoundError("File [{}] not found.".format(file_path))
def majority_voting(preds_set):
    logger.info("Taking majority voting.")
    if isinstance(preds_set, list):
        majority_count = (len(preds_set) // 2) + 1
    elif isinstance(preds_set, np.ndarray):
        majority_count = (preds_set.shape[0] // 2) + 1
    else:
        raise NotImplementedError(f"datatype {type(preds_set)} not supported.")

    pred_major = []
    for pred in preds_set:
        pred_discreet = np.argmax(pred, axis=1)
        pred_major.append(pred_discreet)

    pred_major = np.sum(pred_major, axis=0)

    pred_major[(pred_major < majority_count)] = 0.
    pred_major[(pred_major >= majority_count)] = 1.

    return pred_major
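A toy run of majority_voting with three voters over two classes (hypothetical probabilities):

def demo_majority_voting():
    p1 = np.array([[0.9, 0.1], [0.2, 0.8], [0.6, 0.4]])
    p2 = np.array([[0.7, 0.3], [0.4, 0.6], [0.3, 0.7]])
    p3 = np.array([[0.2, 0.8], [0.1, 0.9], [0.4, 0.6]])
    # Per-voter argmax: [0, 1, 0], [0, 1, 1], [1, 1, 1]; class-1 votes: [1, 3, 2].
    # With majority_count = 2 the combined prediction is [0, 1, 1].
    return majority_voting([p1, p2, p3])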
    def load_instance_dgl(self, graph_path, infopath=None):
        """ Loads instance graphs.

        :param graph_path:
        :param infopath:
        :return:
        """
        if graph_path is None:
            graph_path = self.graph_path

        logger.info(
            f'Loading graphs from {join(self.data_dir, self.dataset_name)}')
        self.instance_graph_global_node_ids = load_pickle(
            self.dataset_name + '_instance_graph_global_node_ids',
            filepath=self.data_dir)
        self.instance_graph_local_node_ids = load_pickle(
            self.dataset_name + '_instance_graph_local_node_ids',
            filepath=self.data_dir)
        self.graphs, self.labels = load_dgl(graph_path, infopath)
        return self.graphs, self.instance_graph_local_node_ids, \
               self.labels, self.instance_graph_global_node_ids
def main():
    """ Main module to start code

    :param args:
        Type: tuple
        Required
        Read Only
    :return:
    """
    true = torch.tensor([[0, 1], [1, 0]])
    pred = torch.tensor([[0, 1], [0, 0]])
    pl_dict = calculate_performance_pl(true, pred)
    logger.info(pl_dict)
    pl_sk_dict = calculate_performance_sk(true, pred)
    logger.info(pl_sk_dict)
    sk_dict = calculate_performance_sk(true.numpy(), pred.numpy())
    logger.info(sk_dict)

    true = np.array([[0, 1, 0, 0], [0, 1, 1, 0], [1, 0, 1, 0]])
    pred = np.array([[0, 1, 0, 1], [0, 1, 0, 0], [0, 0, 1, 0]])

    true = torch.from_numpy(true)
    pred = torch.from_numpy(pred)

    precision_k_hot(true, pred)
def read_csv(data_dir=dataset_dir,
             data_file='fire16_labeled_train',
             index_col=0,
             header=0):
    """ Reads csv file as DF.

    :param header:
    :param index_col:
    :param data_dir:
    :param data_file:
    :return:
    """
    data_dir = Path(data_dir)
    if not data_dir.exists():
        raise FileNotFoundError("Directory [{}] not found.".format(data_dir))

    data_file = data_dir / (data_file + '.csv')
    logger.info(f"Reading csv file from [{data_file}]")
    if not data_file.exists():
        raise FileNotFoundError("File [{}] not found.".format(data_file))

    df = pd.read_csv(data_file,
                     index_col=index_col,
                     header=header,
                     encoding='utf-8',
                     engine='python')
    df = df.sample(frac=1.)

    logger.info("Dataset size: [{}]".format(df.shape))
    logger.info("Few dataset samples: \n[{}]".format(df.head()))

    return df
def plot_occurance(losses: list,
                   title="Losses",
                   ylabel="Loss",
                   xlabel="Epoch",
                   clear=True,
                   log_scale=False,
                   plot_name=None,
                   plot_dir="",
                   show_plot=False):
    """ Plots the validation loss against epochs.

    :param plot_name:
    :param plot_dir:
    :param xlabel:
    :param ylabel:
    :param title:
    :param losses:
    :param clear:
    :param log_scale:
    """
    ## Turn interactive plotting off
    plt.ioff()

    fig = plt.figure()
    plt.plot(losses)
    plt.xlabel(xlabel)
    if log_scale:
        plt.yscale('log')
    plt.ylabel(ylabel)
    plt.title(title)
    if plot_name is None:
        plot_name = title + "_" + ylabel + "_" + xlabel + ".jpg"
    plt.savefig(join(plot_dir, plot_name))
    logger.info(f"Saved plot with title [{title}] and ylabel [{ylabel}] and "
                f"xlabel [{xlabel}] at [{join(plot_dir, plot_name)}].")
    if clear:
        plt.cla()
    if show_plot: plt.show()
    plt.close(fig)  # Closing the figure so it won't get displayed in console.
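A minimal call sketch for plot_occurance (hypothetical loss values and output directory):

def demo_plot_occurance(plot_dir='.'):
    epoch_losses = [0.92, 0.55, 0.31, 0.24, 0.20]
    plot_occurance(epoch_losses, title='Train_loss', ylabel='Loss',
                   xlabel='Epoch', log_scale=True, plot_dir=plot_dir)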
def node_binary_classification(hid_feats: int = 4,
                               out_feats: int = 7,
                               num_heads: int = 2) -> None:
    """

    :param hid_feats:
    :param out_feats:
    :param num_heads:
    :return:
    """
    from dgl.data import citation_graph as citegrh

    def load_cora_data():
        data = citegrh.load_cora()
        features = torch.FloatTensor(data.features)
        labels = torch.LongTensor(data.labels)
        mask = torch.BoolTensor(data.train_mask)
        g = DGLGraph(data.graph)
        return g, features, labels, mask

    g, features, labels, mask = load_cora_data()

    net = GAT_Node_Classifier(in_dim=features.size(1),
                              hidden_dim=hid_feats,
                              out_dim=out_feats,
                              num_heads=num_heads)
    logger.info(net)

    loss_func = F.nll_loss

    optimizer = optim.Adam(net.parameters(), lr=1e-3)
    train_node_classifier(g,
                          features,
                          labels,
                          labelled_mask=mask,
                          model=net,
                          loss_func=loss_func,
                          optimizer=optimizer,
                          epochs=5)
def discretize_labelled(labelled_dict: dict,
                        thresh1=0.1,
                        k=2,
                        label_neg=0.,
                        label_pos=1.):
    """ Discretize probabilities of labelled tokens.

    :param k:
    :param thresh1:
    :param label_pos:
    :param labelled_dict: token:np.array(vector)
    :param label_neg: Value to assign when no class should be assigned [0., -1.]
    """
    logger.info(f'Discretizing label vector with threshold [{thresh1}].')
    labelled_vecs = np.array(list(labelled_dict.values()))

    ## value greater than threshold:
    labelled_vecs[(labelled_vecs > thresh1)] = 1.
    # labelled_vecs[(thresh2 < labelled_vecs) & (labelled_vecs <= thresh1)] = 1.

    discretized_dict = {}
    ## Top k values of each row:
    for token, vec in zip(labelled_dict.keys(), labelled_vecs):
        row_sum = vec.sum()
        if 0.5 < row_sum <= 1.:  ## for non -1 rows only
            row_idx = np.argpartition(-vec, k)
            vec[row_idx[:k]] = label_pos
            vec[row_idx[k:]] = label_neg
        elif 0. <= row_sum <= 0.5:
            row_idx = np.argmax(vec)
            vec[row_idx] = label_pos
            vec[(vec != 1.0)] = label_neg
        elif row_sum > 1.:
            vec[(vec < 1.0)] = label_neg

        discretized_dict[token] = vec

    return discretized_dict
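A toy run of the discretisation rules above (hypothetical token vectors, default thresh1=0.1 and k=2):

def demo_discretize_labelled():
    labelled = {
        'flood': np.array([0.05, 0.9, 0.04, 0.01]),
        'need': np.array([0.4, 0.3, 0.2, 0.1]),
        'the': np.array([0.04, 0.03, 0.02, 0.01]),
    }
    out = discretize_labelled(labelled, thresh1=0.1, k=2)
    # 'flood' -> [0, 1, 0, 0]  (row sum > 1: only values already at 1 survive)
    # 'need'  -> [1, 1, 1, 0]  (every value above thresh1 becomes a positive label)
    # 'the'   -> [1, 0, 0, 0]  (nothing above thresh1: the argmax gets the label)
    return out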
def undersample_major_class(X: np.ndarray, Y: np.ndarray, k=3):
    """ Undersamples the majority class k times.

    :param X:
    :param Y:
    :param k:
    :return:
    """
    logger.info(f'Undersampling the majority class [{k}] times.')
    under_sampler = RandomUnderSampler()
    k_undersampled_list = []
    for i in range(k):
        X_resampled, Y_resampled = under_sampler.fit_resample(X, Y)
        X_resampled, Y_resampled = unison_shuffled_copies(
            X_resampled, Y_resampled)
        undersampled_dict = {}
        for x, y in zip(X_resampled, Y_resampled):
            x = str(x[0])
            undersampled_dict[x] = y

        k_undersampled_list.append(undersampled_dict)

    return k_undersampled_list
    def split_data(self, df, train_size=0.8):
        train_df = df.sample(frac=train_size, random_state=self.seed)
        test_df = df.drop(train_df.index).reset_index(drop=True)
        train_df = train_df.reset_index(drop=True)

        logger.info("FULL Dataset: {}".format(df.shape))
        logger.info("TRAIN Dataset: {}".format(train_df.shape))
        logger.info("TEST Dataset: {}".format(test_df.shape))
        return train_df, test_df
    def train(self):
        self.model.to(device)
        self.model.train()
        epoch_loss = 0
        stores = {'preds': [], 'trues': [], 'ids': []}
        for _, data in enumerate(self.train_loader, 0):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device,
                                                       dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)

            ## Set pool_output = True for classification:
            outputs = self.model(ids,
                                 mask,
                                 token_type_ids=token_type_ids,
                                 pool_output=True)

            loss = self.loss_fn(outputs[0], targets)

            stores['preds'].append(outputs[0])
            stores['trues'].append(targets)

            if _ % 20 == 0:
                logger.info(f'Batch: {_}, Loss:  {loss.item()}')

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            epoch_loss += loss.item()

        stores['preds'] = torch.cat(stores['preds'])
        stores['trues'] = torch.cat(stores['trues'])
        # stores['ids'] = torch.cat(stores['ids'])
        return epoch_loss / len(
            self.train_loader), stores  # , epoch_acc / len(iterator)