def load_datasets(args):
    """Load the processed dataset and graph, building them from raw data if absent.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``test_ratio`` (fraction of each book's chapters held out).

    Returns
    -------
    f : torch.FloatTensor
        Input of the GCN (identity matrix over all graph nodes).
    X : np.ndarray
        The same identity matrix as a numpy array.
    A_hat : np.ndarray
        Normalized adjacency D^{-1/2} (A + I) D^{-1/2}.
    selected : list[int]
        Indexes of labelled nodes used for training.
    labels_selected : list
        Labels of the selected (training) nodes.
    labels_not_selected : list
        Labels of the held-out nodes used for inference/testing.
    test_idxs : list[int]
        Indexes of the held-out nodes.
    """
    logger.info("Loading data...")
    df_data_path = "./data/df_data.pkl"
    graph_path = "./data/text_graph.pkl"
    if not os.path.isfile(df_data_path) or not os.path.isfile(graph_path):
        logger.info("Building datasets and graph from raw data... Note this will take quite a while...")
        generate_text_graph()
    df_data = load_pickle("df_data.pkl")
    G = load_pickle("text_graph.pkl")

    logger.info("Building adjacency and degree matrices...")
    A = nx.to_numpy_matrix(G, weight="weight")
    A = A + np.eye(G.number_of_nodes())
    # G.degree() yields (node, degree) pairs.  The original code compared the
    # whole pair to 0 (`if d == 0`), which is never true, so an isolated node
    # would hit 0**(-0.5) and raise ZeroDivisionError.  Unpack the pair and
    # guard on the degree value itself.
    degrees = []
    for _node, deg in G.degree(weight=None):
        degrees.append(0 if deg == 0 else deg ** (-0.5))
    degrees = np.diag(degrees)
    X = np.eye(G.number_of_nodes())  # features are just the identity matrix
    A_hat = degrees @ A @ degrees
    f = X  # (n X n) identity input of the net

    logger.info("Splitting labels for training and inferring...")
    ### stratified test samples: sample per book so every class can appear in test
    test_idxs = []
    for b_id in df_data["b"].unique():
        dum = df_data[df_data["b"] == b_id]
        if len(dum) >= 4:  # too few chapters -> keep all of them for training
            test_idxs.extend(list(np.random.choice(dum.index,
                                                   size=round(args.test_ratio * len(dum)),
                                                   replace=False)))
    save_as_pickle("test_idxs.pkl", test_idxs)

    # Select only the remaining labelled nodes for semi-supervised GCN training.
    # Use sets for membership tests: the original scanned the lists for every
    # index, an accidental O(n^2).
    test_set = set(test_idxs)
    selected = [i for i in range(len(df_data)) if i not in test_set]
    save_as_pickle("selected.pkl", selected)
    selected_set = set(selected)

    f_selected = f[selected]
    f_selected = torch.from_numpy(f_selected).float()
    labels_selected = [l for idx, l in enumerate(df_data["b"]) if idx in selected_set]
    f_not_selected = f[test_idxs]
    f_not_selected = torch.from_numpy(f_not_selected).float()
    labels_not_selected = [l for idx, l in enumerate(df_data["b"]) if idx not in selected_set]
    f = torch.from_numpy(f).float()
    save_as_pickle("labels_selected.pkl", labels_selected)
    save_as_pickle("labels_not_selected.pkl", labels_not_selected)
    logger.info("Split into %d train and %d test labels." % (len(labels_selected), len(labels_not_selected)))
    return f, X, A_hat, selected, labels_selected, labels_not_selected, test_idxs
def load_results(model_no=0):
    """Return the saved per-epoch (losses, accuracies) for ``model_no``.

    If either pickle is missing, return a pair of empty lists instead.
    """
    losses_path = "./data/test_losses_per_epoch_%d.pkl" % model_no
    accuracy_path = "./data/test_accuracy_per_epoch_%d.pkl" % model_no
    # Guard clause: nothing saved yet -> start with fresh buffers.
    if not (os.path.isfile(losses_path) and os.path.isfile(accuracy_path)):
        return [], []
    losses_per_epoch = load_pickle("test_losses_per_epoch_%d.pkl" % model_no)
    accuracy_per_epoch = load_pickle("test_accuracy_per_epoch_%d.pkl" % model_no)
    logger.info("Loaded results buffer")
    return losses_per_epoch, accuracy_per_epoch
def evaluate_model_results(args=None):
    """Evaluate the best saved GCN checkpoint on the held-out nodes.

    Rebuilds the one-chapter-per-document dataframe from the raw bible CSV,
    reconstructs the normalized adjacency input, reloads the saved train/test
    splits and the best checkpoint, then saves label-distribution histograms
    and a confusion-matrix heatmap to ./data/ and prints misclassified
    documents.

    Parameters
    ----------
    args : argparse.Namespace or None
        Training arguments; if None, the args pickled during training are used.
    """
    logger.info("Loading dataset and model for evaluation...")
    base_path = "./data/"
    if args is None:  # `is None`, not `== None` — identity test for the sentinel
        args = load_pickle("args.pkl")

    ### Loads bible data
    df = pd.read_csv(os.path.join(base_path, "t_bbe.csv"))
    df.drop(["id", "v"], axis=1, inplace=True)
    df = df[["t", "c", "b"]]
    ### one chapter per document, labelled by book
    df_data = pd.DataFrame(columns=["c", "b"])
    for book in df["b"].unique():
        dum = pd.DataFrame(columns=["c", "b"])
        dum["c"] = df[df["b"] == book].groupby("c").apply(
            lambda x: (" ".join(x["t"])).lower())
        dum["b"] = book
        df_data = pd.concat([df_data, dum], ignore_index=True)
    del df
    # Build a book-number -> lower-cased-book-name mapping (name->number, inverted).
    book_dict = pd.read_csv(os.path.join(base_path, "key_english.csv"))
    book_dict = {book.lower(): number
                 for book, number in zip(book_dict["field.1"], book_dict["field"])}
    book_dict = {v: k for k, v in book_dict.items()}

    ### Loads graph data
    G = load_pickle("text_graph.pkl")
    A = nx.to_numpy_matrix(G, weight="weight")
    A = A + np.eye(G.number_of_nodes())
    # G.degree() yields (node, degree) pairs; the original compared the pair
    # itself to 0, which is never true, so an isolated node would hit
    # 0**(-0.5) and raise ZeroDivisionError.  Unpack and guard on the degree.
    degrees = []
    for _node, deg in G.degree(weight=None):
        degrees.append(0 if deg == 0 else deg ** (-0.5))
    degrees = np.diag(degrees)
    X = np.eye(G.number_of_nodes())  # features are just the identity matrix
    A_hat = degrees @ A @ degrees
    f = X  # (n X n) identity input of the net
    f = torch.from_numpy(f).float()

    logger.info("Loading labels...")
    ### Loads labels
    test_idxs = load_pickle("test_idxs.pkl")
    selected = load_pickle("selected.pkl")
    labels_selected = load_pickle("labels_selected.pkl")
    labels_not_selected = load_pickle("labels_not_selected.pkl")

    ### Loads best model ###
    checkpoint = torch.load(
        os.path.join(base_path, "test_model_best_%d.pth.tar" % 0))
    net = gcn(X.shape[1], A_hat, args)
    net.load_state_dict(checkpoint['state_dict'])

    ### labels distribution (labels are 1-based book numbers -> shift to 0-based bins)
    fig = plt.figure(figsize=(15, 17))
    ax = fig.add_subplot(111)
    ax.hist([(e - 1) for e in labels_not_selected]
            + [(e - 1) for e in labels_selected], bins=66)
    ax.set_title("Class label distribution for data set", fontsize=20)
    ax.set_xlabel("Class label", fontsize=17)
    ax.set_ylabel("Counts", fontsize=17)
    [x.set_fontsize(15) for x in ax.get_xticklabels()]
    [x.set_fontsize(15) for x in ax.get_yticklabels()]
    plt.savefig(os.path.join("./data/", "data_idxs_dist.png"))

    fig = plt.figure(figsize=(15, 17))
    ax = fig.add_subplot(111)
    ax.hist([(e - 1) for e in labels_not_selected], bins=66)
    ax.set_title("Class label distribution for test set", fontsize=20)
    ax.set_xlabel("Class label", fontsize=17)
    ax.set_ylabel("Counts", fontsize=17)
    [x.set_fontsize(15) for x in ax.get_xticklabels()]
    [x.set_fontsize(15) for x in ax.get_yticklabels()]
    plt.savefig(os.path.join("./data/", "test_true_idxs_dist.png"))

    ### Inference
    net.eval()
    with torch.no_grad():
        pred_labels = net(f)
    # confusion_matrix(y_true, y_pred): rows = actual class, columns = predicted,
    # so the heatmap's y-axis is "Actual" and x-axis is "Predicted".  The
    # original had the two axis labels swapped.
    c_m = confusion_matrix([(e - 1) for e in labels_not_selected],
                           list(pred_labels[test_idxs].max(1)[1].numpy()))
    fig = plt.figure(figsize=(25, 25))
    ax = fig.add_subplot(111)
    sb.heatmap(c_m, annot=False)
    ax.set_title("Confusion Matrix", fontsize=20)
    ax.set_xlabel("Predicted", fontsize=17)
    ax.set_ylabel("Actual class", fontsize=17)
    plt.savefig(os.path.join("./data/", "confusion_matrix.png"))

    #### Prints misclassified labels
    misclassified(df_data, pred_labels[test_idxs], labels_not_selected,
                  test_idxs, book_dict)