Example #1
def load_datasets(args):
    """Loads dataset and graph if exists, else create and process them from raw data
    Returns --->
    f: torch tensor input of GCN (Identity matrix)
    X: input of GCN (Identity matrix)
    A_hat: transformed adjacency matrix A
    selected: indexes of selected labelled nodes for training
    test_idxs: indexes of not-selected nodes for inference/testing
    labels_selected: labels of selected labelled nodes for training
    labels_not_selected: labels of not-selected labelled nodes for inference/testing
    """
    logger.info("Loading data...")
    df_data_path = "./data/df_data.pkl"
    graph_path = "./data/text_graph.pkl"
    if not os.path.isfile(df_data_path) or not os.path.isfile(graph_path):
        logger.info("Building datasets and graph from raw data... Note this will take quite a while...")
        generate_text_graph()
    df_data = load_pickle("df_data.pkl")
    G = load_pickle("text_graph.pkl")
    
    logger.info("Building adjacency and degree matrices...")
    A = nx.to_numpy_array(G, weight="weight")  # to_numpy_matrix was removed in NetworkX 3.0
    A = A + np.eye(G.number_of_nodes())  # add self-loops
    degrees = []
    for node, deg in G.degree(weight=None):  # G.degree yields (node, degree) pairs
        if deg == 0:
            degrees.append(0)
        else:
            degrees.append(deg ** (-0.5))
    degrees = np.diag(degrees)
    X = np.eye(G.number_of_nodes()) # Features are just identity matrix
    A_hat = degrees@A@degrees
    f = X  # (n x n) identity feature matrix fed into the net
    
    logger.info("Splitting labels for training and inferring...")
    ### stratified test samples
    test_idxs = []
    for b_id in df_data["b"].unique():
        dum = df_data[df_data["b"] == b_id]
        if len(dum) >= 4:
            test_idxs.extend(list(np.random.choice(dum.index, size=round(args.test_ratio*len(dum)), replace=False)))
    save_as_pickle("test_idxs.pkl", test_idxs)
    # select only certain labelled nodes for semi-supervised GCN
    selected = [i for i in range(len(df_data)) if i not in test_idxs]
    save_as_pickle("selected.pkl", selected)
    
    f_selected = torch.from_numpy(f[selected]).float()
    labels_selected = [l for idx, l in enumerate(df_data["b"]) if idx in selected]
    f_not_selected = torch.from_numpy(f[test_idxs]).float()
    labels_not_selected = [l for idx, l in enumerate(df_data["b"]) if idx not in selected]
    f = torch.from_numpy(f).float()
    save_as_pickle("labels_selected.pkl", labels_selected)
    save_as_pickle("labels_not_selected.pkl", labels_not_selected)
    logger.info("Split into %d train and %d test lebels." % (len(labels_selected), len(labels_not_selected)))
    return f, X, A_hat, selected, labels_selected, labels_not_selected, test_idxs
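The core preprocessing step in load_datasets is the symmetric normalization A_hat = D^(-1/2) (A + I) D^(-1/2) from Kipf & Welling-style GCNs. Below is a minimal, self-contained sketch of that step on a hypothetical toy graph (the node names are illustrative only; note that this textbook variant takes degrees from A + I, whereas load_datasets above uses the unweighted graph degree):

import networkx as nx
import numpy as np

# Hypothetical toy graph, purely to illustrate the normalization above.
G = nx.Graph()
G.add_edges_from([("doc1", "word_a"), ("doc2", "word_a"), ("doc2", "word_b")])

A = nx.to_numpy_array(G, weight="weight")  # weighted adjacency (weights default to 1)
A = A + np.eye(G.number_of_nodes())        # add self-loops, as in load_datasets
deg = A.sum(axis=1)                        # degrees of A + I (always >= 1 here)
D_inv_sqrt = np.diag(deg ** -0.5)
A_hat = D_inv_sqrt @ A @ D_inv_sqrt        # symmetric normalization

print(np.allclose(A_hat, A_hat.T))         # stays symmetric: True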
Example #2
def load_results(model_no=0):
    """ Loads saved results if exists """
    losses_path = "./data/test_losses_per_epoch_%d.pkl" % model_no
    accuracy_path = "./data/test_accuracy_per_epoch_%d.pkl" % model_no
    if os.path.isfile(losses_path) and os.path.isfile(accuracy_path):
        losses_per_epoch = load_pickle("test_losses_per_epoch_%d.pkl" % model_no)
        accuracy_per_epoch = load_pickle("test_accuracy_per_epoch_%d.pkl" % model_no)
        logger.info("Loaded results buffer")
    else:
        losses_per_epoch, accuracy_per_epoch = [], []
    return losses_per_epoch, accuracy_per_epoch
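A quick way to inspect what load_results returns (a minimal usage sketch; the matplotlib plotting and the output filename are assumptions, not part of the original module):

import matplotlib.pyplot as plt

losses_per_epoch, accuracy_per_epoch = load_results(model_no=0)
if losses_per_epoch:  # empty lists mean no saved results were found
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
    ax1.plot(losses_per_epoch)
    ax1.set_xlabel("Epoch")
    ax1.set_ylabel("Test loss")
    ax2.plot(accuracy_per_epoch)
    ax2.set_xlabel("Epoch")
    ax2.set_ylabel("Test accuracy")
    fig.savefig("./data/loss_accuracy_curves.png")  # hypothetical output path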
Example #3
def evaluate_model_results(args=None):
    logger.info("Loading dataset and model for evaluation...")
    base_path = "./data/"
    if args is None:
        args = load_pickle("args.pkl")
    ### Loads Bible data
    df = pd.read_csv(os.path.join(base_path, "t_bbe.csv"))
    df.drop(["id", "v"], axis=1, inplace=True)
    df = df[["t", "c", "b"]]
    ### one chapter per document, labelled by book
    df_data = pd.DataFrame(columns=["c", "b"])
    for book in df["b"].unique():
        dum = pd.DataFrame(columns=["c", "b"])
        dum["c"] = df[df["b"] == book].groupby("c").apply(
            lambda x: (" ".join(x["t"])).lower())
        dum["b"] = book
        df_data = pd.concat([df_data, dum], ignore_index=True)
    del df
    book_dict = pd.read_csv(os.path.join(base_path, "key_english.csv"))
    book_dict = {
        number: book.lower()
        for book, number in zip(book_dict["field.1"], book_dict["field"])
    }  # maps book number -> lowercased book name
    ### Loads graph data
    G = load_pickle("text_graph.pkl")
    A = nx.to_numpy_array(G, weight="weight")  # to_numpy_matrix was removed in NetworkX 3.0
    A = A + np.eye(G.number_of_nodes())  # add self-loops
    degrees = []
    for node, deg in G.degree(weight=None):  # G.degree yields (node, degree) pairs
        if deg == 0:
            degrees.append(0)
        else:
            degrees.append(deg ** (-0.5))
    degrees = np.diag(degrees)
    X = np.eye(G.number_of_nodes())  # Features are just identity matrix
    A_hat = degrees @ A @ degrees
    f = X  # (n x n) identity feature matrix fed into the net
    f = torch.from_numpy(f).float()

    logger.info("Loading labels...")
    ### Loads labels
    test_idxs = load_pickle("test_idxs.pkl")
    selected = load_pickle("selected.pkl")
    labels_selected = load_pickle("labels_selected.pkl")
    labels_not_selected = load_pickle("labels_not_selected.pkl")

    ### Loads best model ###
    checkpoint = torch.load(
        os.path.join(base_path, "test_model_best_%d.pth.tar" % 0))
    net = gcn(X.shape[1], A_hat, args)
    net.load_state_dict(checkpoint['state_dict'])

    ### labels distribution
    fig = plt.figure(figsize=(15, 17))
    ax = fig.add_subplot(111)
    ax.hist([(e - 1) for e in labels_not_selected] + [(e - 1) for e in labels_selected],
            bins=66)
    ax.set_title("Class label distribution for data set", fontsize=20)
    ax.set_xlabel("Class label", fontsize=17)
    ax.set_ylabel("Counts", fontsize=17)
    ax.tick_params(axis="both", labelsize=15)
    plt.savefig(os.path.join("./data/", "data_idxs_dist.png"))

    fig = plt.figure(figsize=(15, 17))
    ax = fig.add_subplot(111)
    ax.hist([(e - 1) for e in labels_not_selected], bins=66)
    ax.set_title("Class label distribution for test set", fontsize=20)
    ax.set_xlabel("Class label", fontsize=17)
    ax.set_ylabel("Counts", fontsize=17)
    ax.tick_params(axis="both", labelsize=15)
    plt.savefig(os.path.join("./data/", "test_true_idxs_dist.png"))
    ### Inference
    net.eval()
    with torch.no_grad():
        pred_labels = net(f)
    c_m = confusion_matrix([(e - 1) for e in labels_not_selected],
                           list(pred_labels[test_idxs].max(1)[1].numpy()))
    fig = plt.figure(figsize=(25, 25))
    ax = fig.add_subplot(111)
    sb.heatmap(c_m, annot=False)
    ax.set_title("Confusion Matrix", fontsize=20)
    ax.set_xlabel("Actual class", fontsize=17)
    ax.set_ylabel("Predicted", fontsize=17)
    plt.savefig(os.path.join("./data/", "confusion_matrix.png"))
    ### Prints misclassified labels
    misclassified(df_data, pred_labels[test_idxs], labels_not_selected,
                  test_idxs, book_dict)
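For a single summary number alongside the confusion matrix, test accuracy can be read off the same predictions (a minimal sketch; the helper name evaluate_accuracy is hypothetical, and it assumes the 1-indexed labels used above):

import torch

def evaluate_accuracy(pred_labels, labels_not_selected, test_idxs):
    """Fraction of held-out nodes whose argmax prediction matches the true label."""
    preds = pred_labels[test_idxs].max(1)[1]                    # predicted class per test node
    truth = torch.tensor([e - 1 for e in labels_not_selected])  # df_data labels are 1-indexed
    return (preds == truth).float().mean().item()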