Example #1
    def train(self):
        train_losses = []
        val_losses = []
        model_path = os.path.join(self.model_dir, self.model_file)

        print("Training model...\n")
        timer = Timer()
        timer.tic()

        x = self.data.x.to(self.device)
        train_pos_edge_index = self.data.train_pos_edge_index.to(self.device)

        for epoch in range(self.epochs):
            print("Epoch: {}".format(epoch + 1))
            self.model.train()
            self.optimizer.zero_grad()
            z = self.model.encode(x, train_pos_edge_index)
            loss = self.model.recon_loss(z, train_pos_edge_index)
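            # The variational model (ARGVA) adds a KL term scaled by 1/num_nodes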
            if self.model_name == "ARGVA":
                loss = loss + (1 / self.data.num_nodes) * self.model.kl_loss()
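            # Add the weighted adversarial discriminator and regularization losses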
            loss += self.dis_loss_para * self.model.discriminator_loss(z) + \
                self.reg_loss_para * self.model.reg_loss(z)
            loss.backward()
            self.optimizer.step()

            # Evaluate on validation data
            self.model.eval()
            with torch.no_grad():
                train_losses.append(loss.cpu().detach().numpy())

                # Compute validation statistics
                val_pos_edge_index = self.data.val_pos_edge_index.to(
                    self.device)
                val_neg_edge_index = self.data.val_neg_edge_index.to(
                    self.device)
                z = self.model.encode(x, train_pos_edge_index)
                # Evaluate reconstruction on the held-out validation edges
                val_loss = self.model.recon_loss(
                    z, val_pos_edge_index, val_neg_edge_index)
                if self.model_name == "ARGVA":
                    val_loss += (1 / self.data.num_nodes) * \
                        self.model.kl_loss()
                val_loss += (self.dis_loss_para *
                             self.model.discriminator_loss(z) +
                             self.reg_loss_para * self.model.reg_loss(z))
                val_losses.append(val_loss.cpu().detach().numpy())
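                # Checkpoint whenever the validation loss reaches a new minimum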
                if val_losses[-1] == min(val_losses):
                    print("\tSaving model...")
                    torch.save(self.model.state_dict(), model_path)
                    print("\tSaved.")
                print("\ttrain_loss=", "{:.5f}".format(loss), "val_loss=",
                      "{:.5f}".format(val_loss))

        print("Finished training.\n")
        training_time = timer.toc()
        self._plot_losses(train_losses, val_losses)
        self._print_stats(train_losses, val_losses, training_time)
    def __init__(self, embedding_type, gpu=0):
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

        self.embedding_type = embedding_type
        self.embeddings_parser = EmbeddingsParser(gpu)
        self.timer = Timer()
        self.path_persistent = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "..", "..", "..",
            "data", "interim", "han", self.embedding_type)
        if not os.path.exists(self.path_persistent):
            os.makedirs(self.path_persistent)
Example #3
    def __init__(self, embedding_type, graph_type, threshold=2, gpu=0):
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

        self.embedding_type = embedding_type
        self.graph_type = graph_type
        self.threshold = threshold
        self.embeddings_parser = EmbeddingsParser(gpu)
        self.timer = Timer()
        self.path_persistent = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "..", "..", "..",
            "data", "interim", "graphsage", self.embedding_type,
            self.graph_type)
        if not os.path.isdir(self.path_persistent):
            os.mkdir(self.path_persistent)
Example #4
    def __init__(self):
        self.parser = FileParser()
        self.persistent = {}
        self.timer = Timer()
        self.processes = {
            "chapters_books": {
                "process_data": "_process_data_chapters_books",
                "persistent_file": os.path.join(self.path,
                                                "chapters_books.pkl")
            },
            "chapters_all_scigraph_citations": {
                "process_data":
                "_process_data_chapters_all_scigraph_citations",
                "persistent_file":
                os.path.join(self.path, "chapters_all_scigraph_citations.pkl")
            },
            "chapters_confproc_scigraph_citations": {
                "process_data":
                "_process_data_chapters_confproc_scigraph_citations",
                "persistent_file":
                os.path.join(self.path,
                             "chapters_confproc_scigraph_citations.pkl")
            },
            "books_conferences": {
                "process_data": "_process_data_books_conferences",
                "persistent_file": os.path.join(self.path,
                                                "books_conferences.pkl")
            },
            "author_id_chapters": {
                "process_data":
                "_process_data_author_id_chapters",
                "persistent_file":
                os.path.join(self.path, "author_id_chapters.pkl")
            },
            "author_name_chapters": {
                "process_data":
                "_process_data_author_name_chapters",
                "persistent_file":
                os.path.join(self.path, "author_name_chapters.pkl")
            },
            "confproc_scigraph_citations_chapters": {
                "process_data":
                "_process_data_confproc_scigraph_citations_chapters",
                "persistent_file":
                os.path.join(self.path,
                             "confproc_scigraph_citations_chapters.pkl")
            }
        }
    def __init__(self,
                 embedding_type,
                 dataset,
                 graph_type="directed",
                 threshold=2,
                 gpu=0):
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

        self.embedding_type = embedding_type
        self.dataset = dataset
        self.graph_type = graph_type
        self.threshold = threshold
        self.embeddings_parser = EmbeddingsParser(gpu)
        self.timer = Timer()
        self.path_persistent = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "..", "..", "..",
            "data", "interim", "gat", self.embedding_type, self.dataset)
        if not os.path.exists(self.path_persistent):
            os.makedirs(self.path_persistent)
    def train(self, data):
        if not self._load_model_classifier():
            print("Classifier not trained yet. Training now...")
            timer = Timer()
            timer.tic()

            print("Loading the training embeddings...")
            if not self._load_train_embeddings():
                print("The pretrained embeddings are missing.")
            else:
                print("Loaded.")
            training_ids = list(data.chapter)
            training_embeddings = self.pretrained_embeddings[[
                self.pretrained_embeddings_id_map[id] for id in training_ids
            ]]

            self.label_encoder = LabelEncoder()
            self.labels = self.label_encoder.fit_transform(
                data.conferenceseries)
            self.classifier.fit(training_embeddings, self.labels)
            self._save_model_classifier()

            print("Training finished.")
            timer.toc()
Example #7
class DatasetsParser:
    path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                        "..", "data", "interim", "parsed_data")

    def __init__(self):
        self.parser = FileParser()
        self.persistent = {}
        self.timer = Timer()
        self.processes = {
            "chapters_books": {
                "process_data": "_process_data_chapters_books",
                "persistent_file": os.path.join(self.path,
                                                "chapters_books.pkl")
            },
            "chapters_all_scigraph_citations": {
                "process_data":
                "_process_data_chapters_all_scigraph_citations",
                "persistent_file":
                os.path.join(self.path, "chapters_all_scigraph_citations.pkl")
            },
            "chapters_confproc_scigraph_citations": {
                "process_data":
                "_process_data_chapters_confproc_scigraph_citations",
                "persistent_file":
                os.path.join(self.path,
                             "chapters_confproc_scigraph_citations.pkl")
            },
            "books_conferences": {
                "process_data": "_process_data_books_conferences",
                "persistent_file": os.path.join(self.path,
                                                "books_conferences.pkl")
            },
            "author_id_chapters": {
                "process_data":
                "_process_data_author_id_chapters",
                "persistent_file":
                os.path.join(self.path, "author_id_chapters.pkl")
            },
            "author_name_chapters": {
                "process_data":
                "_process_data_author_name_chapters",
                "persistent_file":
                os.path.join(self.path, "author_name_chapters.pkl")
            },
            "confproc_scigraph_citations_chapters": {
                "process_data":
                "_process_data_confproc_scigraph_citations_chapters",
                "persistent_file":
                os.path.join(self.path,
                             "confproc_scigraph_citations_chapters.pkl")
            }
        }

    def get_data(self, process):
        # Check if the data is already present
        if (process in self.persistent):
            return self.persistent[process]

        print("Process '{}' not in memory yet.".format(process))
        # Load from persistent file if data already processed
        if os.path.isfile(self.processes[process]["persistent_file"]):
            with open(self.processes[process]["persistent_file"], "rb") as f:
                self.persistent[process] = pickle.load(f)
                return self.persistent[process]

        print("Process '{}' not persistent yet. Processing.".format(process))

        # Process the data
        self.persistent[process] = self._parse_file(
            self.processes[process]["process_data"])

        with open(self.processes[process]["persistent_file"], "wb") as f:
            pickle.dump(self.persistent[process], f)

        return self.persistent[process]

    def _parse_file(self, process_data):
        print("Start processing file.\n")
        self.timer.tic()
        process_data_function = self.__getattribute__(process_data)
        results = process_data_function()
        self.timer.toc()
        return results

    # processes implementation
    def _process_data_chapters_books(self):
        # Load datasets
        df_chapters_books_isbns = pd.DataFrame(
            list(self.parser.get_data("chapters_books_isbns").items()),
            columns=["chapter", "books_isbns"])
        df_isbn_book_ids = pd.DataFrame(list(
            self.parser.get_data("isbn_books").items()),
                                        columns=["isbn", "book"])

        # Process datasets
        df_chapters_books_isbns[["isbn1", "isbn2"]] = pd.DataFrame(
            df_chapters_books_isbns["books_isbns"].tolist(),
            index=df_chapters_books_isbns.index)
        df_chapters_books_isbns.drop(columns=["books_isbns"],
                                     axis=1,
                                     inplace=True)
        df_chapters_isbn1 = pd.merge(
            df_chapters_books_isbns[["chapter", "isbn1"]],
            df_isbn_book_ids,
            how="inner",
            left_on=["isbn1"],
            right_on=["isbn"])
        df_chapters_isbn1.drop(columns=["isbn1", "isbn"], inplace=True)
        df_chapters_isbn2 = pd.merge(
            df_chapters_books_isbns[["chapter", "isbn2"]],
            df_isbn_book_ids,
            how="inner",
            left_on=["isbn2"],
            right_on=["isbn"])
        df_chapters_isbn2.drop(columns=["isbn2", "isbn"], inplace=True)
        df_chapters_books = pd.concat([df_chapters_isbn1, df_chapters_isbn2],
                                      ignore_index=True)
        df_chapters_books.drop_duplicates(inplace=True)
        return df_chapters_books

    def _process_data_chapters_all_scigraph_citations(self):
        df_chapters_citations = pd.DataFrame(
            list(self.parser.get_data("chapters_all_citations").items()),
            columns=["chapter", "chapter_citations"])
        chapters_count = len(df_chapters_citations)
        with tqdm(desc="Processing citations",
                  total=chapters_count,
                  unit="chapter") as pbar:
            filtered_citations = []
            for idx in range(chapters_count):
                citations = df_chapters_citations.iloc[idx][
                    "chapter_citations"]
                citations = [
                    c for c in citations
                    if c is not None and c.startswith("sg")
                ]
                filtered_citations.append(citations if citations else np.nan)
                pbar.update(1)
        # Write the filtered lists back in one step instead of assigning into
        # row copies returned by chained .iloc indexing
        df_chapters_citations["chapter_citations"] = filtered_citations
        return df_chapters_citations[
            df_chapters_citations["chapter_citations"].notnull()]

    def _process_data_chapters_confproc_scigraph_citations(self):
        df_scigraph_citations = self.get_data(
            "chapters_all_scigraph_citations")
        df_chapters = pd.DataFrame(self.parser.get_data("chapters"),
                                   columns=["chapter"])
        chapters = set(list(df_chapters["chapter"]))
        chapters_count = len(df_scigraph_citations)
        with tqdm(desc="Processing citations",
                  total=chapters_count,
                  unit="chapter") as pbar:
            filtered_citations = []
            for idx in range(chapters_count):
                scigraph_citations = df_scigraph_citations.iloc[idx][
                    "chapter_citations"]
                citations = [c for c in scigraph_citations if c in chapters]
                filtered_citations.append(citations if citations else np.nan)
                pbar.update(1)
        # Write the filtered lists back in one step instead of assigning into
        # row copies returned by chained .iloc indexing
        df_scigraph_citations["chapter_citations"] = filtered_citations
        return df_scigraph_citations[
            df_scigraph_citations["chapter_citations"].notnull()]

    def _process_data_books_conferences(self):
        df_old_books_new_books = pd.DataFrame(list(
            self.parser.get_data("old_books_new_books").items()),
                                              columns=["old_book", "new_book"])
        df_old_books_conferences = pd.DataFrame(
            list(self.parser.get_data("old_books_conferences").items()),
            columns=["old_book", "conference"])
        df = pd.merge(df_old_books_new_books,
                      df_old_books_conferences,
                      how="left",
                      on="old_book")
        df.drop(columns=["old_book"], inplace=True)
        df.rename(columns={"new_book": "book"}, inplace=True)
        return df[df["conference"].notnull()]

    def _process_data_author_id_chapters(self):
        df_chapters_authors = pd.DataFrame(list(
            self.parser.get_data("chapters_authors").items()),
                                           columns=["chapter", "authors"])
        contributions = []
        for idx in range(len(df_chapters_authors)):
            authors = [
                author for author in df_chapters_authors.iloc[idx]["authors"]
            ]
            chapter = df_chapters_authors.iloc[idx]["chapter"]
            contributions.extend([(author, chapter) for author in authors])
        author_id_chapters = pd.DataFrame.from_records(
            contributions, columns=["author", "chapter"])
        return author_id_chapters

    def _process_data_author_name_chapters(self):
        df_chapters_authors_name = pd.DataFrame(
            list(self.parser.get_data("chapters_authors_name").items()),
            columns=["chapter", "authors_name"])
        contributions = []
        for idx in range(len(df_chapters_authors_name)):
            authors_name = [
                author_name for author_name in
                df_chapters_authors_name.iloc[idx]["authors_name"]
            ]
            chapter = df_chapters_authors_name.iloc[idx]["chapter"]
            contributions.extend([(author_name, chapter)
                                  for author_name in authors_name])
        author_name_chapters = pd.DataFrame.from_records(
            contributions, columns=["author_name", "chapter"])
        return author_name_chapters

    def _process_data_confproc_scigraph_citations_chapters(self):
        df_chapters_confproc_scigraph_citations = self.get_data(
            "chapters_confproc_scigraph_citations")
        citations = []
        for idx in range(len(df_chapters_confproc_scigraph_citations)):
            citation_list = [
                citation
                for citation in df_chapters_confproc_scigraph_citations.
                iloc[idx]["chapter_citations"]
            ]
            chapter = df_chapters_confproc_scigraph_citations.iloc[idx][
                "chapter"]
            citations.extend([(citation, chapter)
                              for citation in citation_list])
        confproc_scigraph_citations_chapter = pd.DataFrame.from_records(
            citations, columns=["citation", "chapter"])
        return confproc_scigraph_citations_chapter
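
A minimal usage sketch for the DatasetsParser class above, assuming the class and its FileParser dependency are importable (the module name below is an assumption) and the raw SciGraph files expected by FileParser are in place:

# Hypothetical usage; the module name and data availability are assumptions.
from datasets_parser import DatasetsParser

parser = DatasetsParser()
# The first call processes the raw data and caches the result both in memory
# and in data/interim/parsed_data/chapters_books.pkl; later calls reuse it.
df_chapters_books = parser.get_data("chapters_books")
print(df_chapters_books.head())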
Example #8
    def inference(self, test_data, gpu_mem_fraction=None):
        print("Inference.")
        timer = Timer()
        timer.tic()

        G = test_data[0]
        features = test_data[1]
        id_map = test_data[2]
        class_map = test_data[4]
        if isinstance(list(class_map.values())[0], list):
            num_classes = len(list(class_map.values())[0])
        else:
            num_classes = len(set(class_map.values()))

        if features is not None:
            # pad with dummy zero vector
            features = np.vstack([features, np.zeros((features.shape[1], ))])

        placeholders = self._construct_placeholders(num_classes)
        minibatch = NodeMinibatchIterator(G,
                                          id_map,
                                          placeholders,
                                          class_map,
                                          num_classes,
                                          batch_size=self.batch_size,
                                          max_degree=self.max_degree)

        adj_info_ph = tf.compat.v1.placeholder(tf.int32,
                                               shape=minibatch.adj.shape)
        adj_info = tf.Variable(adj_info_ph, trainable=False, name="adj_info")

        model = self._create_model(num_classes, placeholders, features,
                                   adj_info, minibatch)

        config = tf.compat.v1.ConfigProto(
            log_device_placement=self.log_device_placement)
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True

        # Initialize session
        sess = tf.compat.v1.Session(config=config)
        merged = tf.compat.v1.summary.merge_all()
        #        summary_writer = tf.summary.FileWriter(self._log_dir(), sess.graph)

        # Initialize model saver
        saver = tf.compat.v1.train.Saver(max_to_keep=self.epochs)

        # Init variables
        sess.run(tf.compat.v1.global_variables_initializer(),
                 feed_dict={adj_info_ph: minibatch.adj})

        # Restore model
        print("Restoring trained model.")
        ckpt = tf.compat.v1.train.get_checkpoint_state(self._log_dir())
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            print("Model restored.")
        else:
            print("This model checkpoint does not exist. The model might " +
                  "not be trained yet or the checkpoint is invalid.")

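        # Switch the adjacency variable to the full test adjacency for inference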
        val_adj_info = tf.compat.v1.assign(adj_info, minibatch.test_adj)
        sess.run(val_adj_info.op)

        print("Computing predictions...")
        t_test = time.time()
        finished = False
        val_losses = []
        val_preds = []
        nodes = []
        iter_num = 0
        while not finished:
            feed_dict_val, _, finished, nodes_subset = minibatch.incremental_node_val_feed_dict(
                self.batch_size, iter_num, test=True)
            node_outs_val = sess.run([model.preds, model.loss],
                                     feed_dict=feed_dict_val)
            val_preds.append(node_outs_val[0])
            val_losses.append(node_outs_val[1])
            nodes.extend(nodes_subset)
            iter_num += 1
        val_preds = np.vstack(val_preds)
        print("Computed.")

        # Return only the embeddings of the test nodes
        test_preds_ids = {}
        for i, node in enumerate(nodes):
            test_preds_ids[node] = i
        test_nodes = [n for n in G.nodes() if G.node[n]['test']]
        test_preds = val_preds[[test_preds_ids[id] for id in test_nodes]]
        timer.toc()
        sess.close()
        return test_nodes, test_preds
Example #9
    def train(self, train_data, test_data=None):
        print("Training model...")
        timer = Timer()
        timer.tic()

        G = train_data[0]
        features = train_data[1]
        id_map = train_data[2]
        class_map = train_data[4]
        if isinstance(list(class_map.values())[0], list):
            num_classes = len(list(class_map.values())[0])
        else:
            num_classes = len(set(class_map.values()))

        if features is not None:
            # pad with dummy zero vector
            features = np.vstack([features, np.zeros((features.shape[1], ))])

        placeholders = self._construct_placeholders(num_classes)
        minibatch = NodeMinibatchIterator(G,
                                          id_map,
                                          placeholders,
                                          class_map,
                                          num_classes,
                                          batch_size=self.batch_size,
                                          max_degree=self.max_degree)

        adj_info_ph = tf.compat.v1.placeholder(tf.int32,
                                               shape=minibatch.adj.shape)
        adj_info = tf.Variable(adj_info_ph, trainable=False, name="adj_info")

        model = self._create_model(num_classes, placeholders, features,
                                   adj_info, minibatch)

        config = tf.compat.v1.ConfigProto(
            log_device_placement=self.log_device_placement)
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True

        # Initialize session
        sess = tf.compat.v1.Session(config=config)
        merged = tf.compat.v1.summary.merge_all()
        #        summary_writer = tf.summary.FileWriter(self._log_dir(), sess.graph)

        # Initialize model saver
        saver = tf.compat.v1.train.Saver(max_to_keep=self.epochs)

        # Init variables
        sess.run(tf.compat.v1.global_variables_initializer(),
                 feed_dict={adj_info_ph: minibatch.adj})

        # Train model
        total_steps = 0
        avg_time = 0.0
        epoch_val_costs = []

        train_losses = []
        validation_losses = []

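        # Assign ops that swap adj_info between the training and test adjacency lists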
        train_adj_info = tf.compat.v1.assign(adj_info, minibatch.adj)
        val_adj_info = tf.compat.v1.assign(adj_info, minibatch.test_adj)

        for epoch in range(self.epochs):
            minibatch.shuffle()

            iter = 0
            print('Epoch: %04d' % (epoch))
            epoch_val_costs.append(0)
            train_loss_epoch = []
            validation_loss_epoch = []
            while not minibatch.end():
                # Construct feed dictionary
                feed_dict, labels = minibatch.next_minibatch_feed_dict()
                feed_dict.update({placeholders['dropout']: self.dropout})

                t = time.time()
                # Training step
                outs = sess.run(
                    [merged, model.opt_op, model.loss, model.preds],
                    feed_dict=feed_dict)
                train_cost = outs[2]
                train_loss_epoch.append(train_cost)

                if iter % self.validate_iter == 0:
                    # Validation
                    sess.run(val_adj_info.op)
                    if self.validate_batch_size == -1:
                        val_cost, val_f1_mic, val_f1_mac, duration = self._incremental_evaluate(
                            sess, model, minibatch, self.batch_size)
                    else:
                        val_cost, val_f1_mic, val_f1_mac, duration = self._evaluate(
                            sess, model, minibatch, self.validate_batch_size)
                    sess.run(train_adj_info.op)
                    epoch_val_costs[-1] += val_cost
                    validation_loss_epoch.append(val_cost)

                # if total_steps % self.print_every == 0:
                #     summary_writer.add_summary(outs[0], total_steps)

                # Print results
                avg_time = (avg_time * total_steps + time.time() -
                            t) / (total_steps + 1)

                if total_steps % self.print_every == 0:
                    train_f1_mic, train_f1_mac = self._calc_f1(
                        labels, outs[-1])
                    print("Iter:", '%04d' % iter, "train_loss=",
                          "{:.5f}".format(train_cost), "train_f1_mic=",
                          "{:.5f}".format(train_f1_mic), "train_f1_mac=",
                          "{:.5f}".format(train_f1_mac), "val_loss=",
                          "{:.5f}".format(val_cost), "val_f1_mic=",
                          "{:.5f}".format(val_f1_mic), "val_f1_mac=",
                          "{:.5f}".format(val_f1_mac), "time=",
                          "{:.5f}".format(avg_time))

                iter += 1
                total_steps += 1

                if total_steps > self.max_total_steps:
                    break

            # Keep track of train and validation losses per epoch
            train_losses.append(sum(train_loss_epoch) / len(train_loss_epoch))
            validation_losses.append(
                sum(validation_loss_epoch) / len(validation_loss_epoch))

            # If the epoch has the lowest validation loss so far
            if validation_losses[-1] == min(validation_losses):
                print(
                    "Minimum validation loss so far ({}) at epoch {}.".format(
                        validation_losses[-1], epoch))
                # Save model at each epoch
                print("Saving model at epoch {}.".format(epoch))
                saver.save(sess, os.path.join(self._log_dir(), "model.ckpt"))

            if total_steps > self.max_total_steps:
                break

        print("Optimization Finished!")

        training_time = timer.toc()
        self._plot_losses(train_losses, validation_losses)
        self._print_stats(train_losses, validation_losses, training_time)

        sess.run(val_adj_info.op)
        val_cost, val_f1_mic, val_f1_mac, duration = self._incremental_evaluate(
            sess, model, minibatch, self.batch_size)
        print("Full validation stats:", "loss=", "{:.5f}".format(val_cost),
              "f1_micro=", "{:.5f}".format(val_f1_mic), "f1_macro=",
              "{:.5f}".format(val_f1_mac), "time=", "{:.5f}".format(duration))
        with open(self._log_dir() + "val_stats.txt", "w") as fp:
            fp.write("loss={:.5f} f1_micro={:.5f} f1_macro={:.5f} time={:.5f}".
                     format(val_cost, val_f1_mic, val_f1_mac, duration))
Example #10
    def __init__(self):
        self.timer = Timer()
        self.persistent = {}
        self.processes = {
                # Old datasets
                "old_books": {
                        "filename": os.path.join(self.path_raw,
                                                 old_books_file),
                        "process_line": "_process_line_old_books",
                        "persistent_file": os.path.join(self.path_persistent,
                                                        "old_books.pkl"),
                        "persistent_variable": [],
                        "dataset_format": "ntriples"
                        },
                "old_books_new_books": {
                        "filename": os.path.join(self.path_raw,
                                                 old_books_file),
                        "process_line": "_process_line_old_books_new_books",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "old_books_new_books.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "old_books_conferences": {
                        "filename": os.path.join(self.path_raw,
                                                 old_books_file),
                        "process_line": "_process_line_old_books_conferences",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "old_books_conferences.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferences": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences",
                        "persistent_file": os.path.join(self.path_persistent,
                                                        "conferences.pkl"),
                        "persistent_variable": [],
                        "dataset_format": "ntriples"
                        },
                "conferences_name": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences_name",
                        "persistent_file": os.path.join(
                                self.path_persistent, "conferences_name.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferences_acronym": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences_acronym",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "conferences_acronym.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferences_city": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences_city",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "conferences_city.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferences_country": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences_country",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "conferences_country.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferences_year": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences_year",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "conferences_year.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferences_datestart": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences_datestart",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "conferences_datestart.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferences_dateend": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences_dateend",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "conferences_dateend.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferences_conferenceseries": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences_conferenceseries",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "conferences_conferenceseries.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferenceseries": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferenceseries",
                        "persistent_file": os.path.join(
                                self.path_persistent, "conferenceseries.pkl"),
                        "persistent_variable": [],
                        "dataset_format": "ntriples"
                        },
                "conferenceseries_name": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferenceseries_name",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "conferenceseries_name.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },

                # New datasets
                "books": {
                        "filename": os.path.join(self.path_raw, books_file),
                        "process_line": "_process_line_books",
                        "persistent_file": os.path.join(self.path_persistent,
                                                        "books.pkl"),
                        "persistent_variable": [],
                        "dataset_format": "json"
                        },
                "isbn_books": {
                        "filename": os.path.join(self.path_raw, books_file),
                        "process_line": "_process_line_isbn_books",
                        "persistent_file": os.path.join(self.path_persistent,
                                                        "isbn_books.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "authors_name": {
                        "filename": os.path.join(self.path_raw, authors_file),
                        "process_line": "_process_line_authors_name",
                        "persistent_file": os.path.join(self.path_persistent,
                                                        "authors_name.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters",
                        "persistent_file": os.path.join(self.path_persistent,
                                                        "chapters.pkl"),
                        "persistent_variable": [],
                        "dataset_format": "json"
                        },
                "chapters_title": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_title",
                        "persistent_file": os.path.join(self.path_persistent,
                                                        "chapters_title.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters_year": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_year",
                        "persistent_file": os.path.join(self.path_persistent,
                                                        "chapters_year.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters_language": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_language",
                        "persistent_file": os.path.join(
                                self.path_persistent, "chapters_language.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters_abstract": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_abstract",
                        "persistent_file": os.path.join(
                                self.path_persistent, "chapters_abstract.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters_authors": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_authors",
                        "persistent_file": os.path.join(
                                self.path_persistent, "chapters_authors.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters_authors_name": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_authors_name",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "chapters_authors_name.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters_all_citations": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_all_citations",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "chapters_all_citations.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters_keywords": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_keywords",
                        "persistent_file": os.path.join(
                                self.path_persistent, "chapters_keywords.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters_books_isbns": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_books_isbns",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "chapters_books_isbns.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                }
Example #11
    def train(self, train_data):
        print("Training model...")
        timer = Timer()
        timer.tic()

        G = train_data[0]
        features = train_data[1]
        id_map = train_data[2]

        if features is not None:
            # pad with dummy zero vector
            features = np.vstack([features, np.zeros((features.shape[1], ))])

        context_pairs = train_data[3] if self.random_context else None
        placeholders = self._construct_placeholders()
        minibatch = EdgeMinibatchIterator(G,
                                          id_map,
                                          placeholders,
                                          batch_size=self.batch_size,
                                          max_degree=self.max_degree,
                                          num_neg_samples=self.neg_sample_size,
                                          context_pairs=context_pairs)

        adj_info_ph = tf.compat.v1.placeholder(tf.int32,
                                               shape=minibatch.adj.shape)
        adj_info = tf.Variable(adj_info_ph, trainable=False, name="adj_info")

        model = self._create_model(placeholders, features, adj_info, minibatch)

        config = tf.compat.v1.ConfigProto(
            log_device_placement=self.log_device_placement)
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True

        # Initialize session
        sess = tf.compat.v1.Session(config=config)
        merged = tf.compat.v1.summary.merge_all()
        #        summary_writer = tf.compat.v1.summary.FileWriter(self._log_dir(),
        #                                                         sess.graph)

        # Initialize model saver
        saver = tf.compat.v1.train.Saver(max_to_keep=self.epochs)

        # Init variables
        sess.run(tf.compat.v1.global_variables_initializer(),
                 feed_dict={adj_info_ph: minibatch.adj})

        # Train model
        train_shadow_mrr = None
        shadow_mrr = None

        total_steps = 0
        avg_time = 0.0
        epoch_val_costs = []

        train_losses = []
        validation_losses = []

        train_adj_info = tf.compat.v1.assign(adj_info, minibatch.adj)
        val_adj_info = tf.compat.v1.assign(adj_info, minibatch.test_adj)

        for epoch in range(self.epochs):
            minibatch.shuffle()

            iter = 0
            print('Epoch: %04d' % (epoch))
            epoch_val_costs.append(0)
            train_loss_epoch = []
            validation_loss_epoch = []
            while not minibatch.end():
                # Construct feed dictionary
                feed_dict = minibatch.next_minibatch_feed_dict()
                feed_dict.update({placeholders['dropout']: self.dropout})

                t = time.time()
                # Training step
                outs = sess.run([
                    merged, model.opt_op, model.loss, model.ranks,
                    model.aff_all, model.mrr, model.outputs1
                ],
                                feed_dict=feed_dict)

                train_cost = outs[2]
                train_mrr = outs[5]
                train_loss_epoch.append(train_cost)
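                # Exponential moving average (decay 0.99) of the training MRR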
                if train_shadow_mrr is None:
                    train_shadow_mrr = train_mrr
                else:
                    train_shadow_mrr -= (1 - 0.99) * (train_shadow_mrr -
                                                      train_mrr)

                if iter % self.validate_iter == 0:
                    # Validation
                    sess.run(val_adj_info.op)
                    val_cost, ranks, val_mrr, duration = self._evaluate(
                        sess, model, minibatch, size=self.validate_batch_size)
                    sess.run(train_adj_info.op)
                    epoch_val_costs[-1] += val_cost
                    validation_loss_epoch.append(val_cost)
                if shadow_mrr is None:
                    shadow_mrr = val_mrr
                else:
                    shadow_mrr -= (1 - 0.99) * (shadow_mrr - val_mrr)

                # if total_steps % self.print_every == 0:
                #     summary_writer.add_summary(outs[0], total_steps)

                # Print results
                avg_time = (avg_time * total_steps + time.time() -
                            t) / (total_steps + 1)

                if total_steps % self.print_every == 0:
                    print(
                        "Iter: %04d" % iter,
                        "train_loss={:.5f}".format(train_cost),
                        "train_mrr={:.5f}".format(train_mrr),
                        # exponential moving average
                        "train_mrr_ema={:.5f}".format(train_shadow_mrr),
                        "val_loss={:.5f}".format(val_cost),
                        "val_mrr={:.5f}".format(val_mrr),
                        # exponential moving average
                        "val_mrr_ema={:.5f}".format(shadow_mrr),
                        "time={:.5f}".format(avg_time))

                iter += 1
                total_steps += 1

                if total_steps > self.max_total_steps:
                    break

            # Keep track of train and validation losses per epoch
            train_losses.append(sum(train_loss_epoch) / len(train_loss_epoch))
            validation_losses.append(
                sum(validation_loss_epoch) / len(validation_loss_epoch))

            # Save embeddings if the epoch has the lowest validation loss
            # so far
            if self.save_embeddings and validation_losses[-1] == min(
                    validation_losses):
                print(
                    "Minimum validation loss so far ({}) at epoch {}.".format(
                        validation_losses[-1], epoch))
                sess.run(val_adj_info.op)
                self._save_embeddings(sess, model,
                                      minibatch, self.validate_batch_size,
                                      self._log_dir())

            # Save model at each epoch
            print("Saving model at epoch {}.".format(epoch))
            saver.save(sess,
                       os.path.join(self._log_dir(),
                                    "model_epoch_" + str(epoch) + ".ckpt"),
                       global_step=total_steps)

            if total_steps > self.max_total_steps:
                break

        print("Optimization finished!\n")

        training_time = timer.toc()
        self._plot_losses(train_losses, validation_losses)
        self._print_stats(train_losses, validation_losses, training_time)
Example #12
    def train(self):
        # Make the datasets iterable
        batch_size = 10000

        train_data_loader = torch.utils.data.DataLoader(
            dataset=self.training_data, batch_size=batch_size)
        validation_data_loader = torch.utils.data.DataLoader(
            dataset=self.validation_data, batch_size=batch_size)
        train_labels_loader = torch.utils.data.DataLoader(
            dataset=self.training_labels, batch_size=batch_size)
        validation_labels_loader = torch.utils.data.DataLoader(
            dataset=self.validation_labels, batch_size=batch_size)

        # Train the model
        timer = Timer()
        timer.tic()

        mean_train_losses = []
        mean_validation_losses = []

        for epoch in range(self.epochs):
            print("Epoch: {}".format(epoch + 1))
            train_losses = []
            validation_losses = []
            self.model.train()

            for i, (train_data, train_labels) in enumerate(
                    zip(train_data_loader, train_labels_loader)):
                self.model.train()
                self.optimizer.zero_grad()
                outputs = self.model(train_data)
                loss = self.cross_entropy_loss(outputs.squeeze(), train_labels)
                loss.backward()
                self.optimizer.step()
                train_losses.append(loss.item())

                # Compute validation loss
                self.model.eval()
                with torch.no_grad():
                    for _, (val_data, val_labels) in enumerate(
                            zip(validation_data_loader,
                                validation_labels_loader)):
                        val_pred = self.model(val_data)
                        val_loss = self.cross_entropy_loss(
                            val_pred.squeeze(), val_labels)
                        validation_losses.append(val_loss.item())

            print("\tTrain loss: {}, validation loss: {}".format(
                np.mean(train_losses), np.mean(validation_losses)))
            mean_train_losses.append(np.mean(train_losses))
            mean_validation_losses.append(np.mean(validation_losses))
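            # Checkpoint whenever the mean validation loss reaches a new minimum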
            if mean_validation_losses[-1] == min(mean_validation_losses):
                print("\tSaving model...")
                torch.save(self.model.state_dict(), self.model_path)
                print("\tSaved.")

        print("Finished training.")
        training_time = timer.toc()
        self._plot_losses(mean_train_losses, mean_validation_losses)
        self._print_stats(mean_train_losses, mean_validation_losses,
                          training_time)
    def main():
        parser = argparse.ArgumentParser(
            description='Arguments for GraphSAGE concatenated ' +
            'classifier model evaluation.')
        parser.add_argument(
            "classifier_name",
            choices=["KNN", "MLP", "MultinomialLogisticRegression"],
            help="The name of the classifier.")
        parser.add_argument('embedding_type',
                            choices=[
                                "AVG_L", "AVG_2L", "AVG_SUM_L4", "AVG_SUM_ALL",
                                "MAX_2L", "CONC_AVG_MAX_2L",
                                "CONC_AVG_MAX_SUM_L4", "SUM_L", "SUM_2L"
                            ],
                            help="Type of embedding.")
        parser.add_argument('model_checkpoint_citations',
                            help='Name of the GraphSAGE model checkpoint ' +
                            'for the citations graph.')
        parser.add_argument('model_checkpoint_authors',
                            help='Name of the GraphSAGE model checkpoint ' +
                            'for the authors graph.')
        parser.add_argument('train_prefix_citations',
                            help='Name of the object file that stores the ' +
                            'citations training data.')
        parser.add_argument('train_prefix_authors',
                            help='Name of the object file that stores the ' +
                            'authors training data.')
        parser.add_argument('model_name',
                            choices=[
                                "graphsage_mean", "gcn", "graphsage_seq",
                                "graphsage_maxpool", "graphsage_meanpool"
                            ],
                            help="Model names.")
        parser.add_argument('--model_size',
                            choices=["small", "big"],
                            default="small",
                            help="Can be big or small; model specific def'ns")
        parser.add_argument('--learning_rate',
                            type=float,
                            default=0.00001,
                            help='Initial learning rate.')
        parser.add_argument('--epochs',
                            type=int,
                            default=10,
                            help='Number of epochs to train.')
        parser.add_argument('--dropout',
                            type=float,
                            default=0.0,
                            help='Dropout rate (1 - keep probability).')
        parser.add_argument('--weight_decay',
                            type=float,
                            default=0.0,
                            help='Weight for l2 loss on embedding matrix.')
        parser.add_argument('--max_degree',
                            type=int,
                            default=100,
                            help='Maximum node degree.')
        parser.add_argument('--samples_1',
                            type=int,
                            default=25,
                            help='Number of samples in layer 1.')
        parser.add_argument('--samples_2',
                            type=int,
                            default=10,
                            help='Number of samples in layer 2.')
        parser.add_argument('--dim_1',
                            type=int,
                            default=128,
                            help='Size of output dim ' +
                            '(final is 2x this, if using concat)')
        parser.add_argument('--dim_2',
                            type=int,
                            default=128,
                            help='Size of output dim ' +
                            '(final is 2x this, if using concat)')
        parser.add_argument('--random_context',
                            action="store_false",
                            default=True,
                            help='Whether to use random context or direct ' +
                            'edges.')
        parser.add_argument('--neg_sample_size',
                            type=int,
                            default=20,
                            help='Number of negative samples.')
        parser.add_argument('--batch_size',
                            type=int,
                            default=512,
                            help='Minibatch size.')
        parser.add_argument('--identity_dim',
                            type=int,
                            default=0,
                            help='Set to positive value to use identity ' +
                            'embedding features of that dimension.')
        parser.add_argument('--save_embeddings',
                            action="store_true",
                            default=False,
                            help='Whether to save embeddings for all nodes ' +
                            'after training')
        parser.add_argument('--base_log_dir',
                            default='../../../data/processed/graphsage/',
                            help='Base directory for logging and saving ' +
                            'embeddings')
        parser.add_argument('--validate_iter',
                            type=int,
                            default=5000,
                            help='How often to run a validation minibatch.')
        parser.add_argument('--validate_batch_size',
                            type=int,
                            default=256,
                            help='How many nodes per validation sample.')
        parser.add_argument('--gpu',
                            type=int,
                            default=0,
                            help='Which gpu to use.')
        parser.add_argument('--print_every',
                            type=int,
                            default=50,
                            help='How often to print training info.')
        parser.add_argument('--max_total_steps',
                            type=int,
                            default=10**10,
                            help='Maximum total number of iterations.')
        parser.add_argument('--log_device_placement',
                            action="store_true",
                            default=False,
                            help='Whether to log device placement.')
        parser.add_argument('--recs',
                            type=int,
                            default=10,
                            help='Number of recommendations.')
        args = parser.parse_args()

        print("Starting evaluation...")
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)
        print("Using GPU {}.".format(str(args.gpu)))

        from GraphSAGEClassifierConcatEvaluation import GraphSAGEClassifierConcatEvaluation
        evaluation_model = GraphSAGEClassifierConcatEvaluation(
            args.classifier_name, args.embedding_type, args.model_name,
            args.model_size, args.learning_rate, args.gpu, args.recs)

        # Initialize GraphSAGE models
        graphsage_model_citations = UnsupervisedModel(
            args.train_prefix_citations, args.model_name, args.model_size,
            args.learning_rate, args.epochs, args.dropout, args.weight_decay,
            args.max_degree, args.samples_1, args.samples_2, args.dim_1,
            args.dim_2, args.random_context, args.neg_sample_size,
            args.batch_size, args.identity_dim, args.save_embeddings,
            args.base_log_dir, args.validate_iter, args.validate_batch_size,
            args.gpu, args.print_every, args.max_total_steps,
            args.log_device_placement)
        graphsage_model_authors = UnsupervisedModel(
            args.train_prefix_authors, args.model_name, args.model_size,
            args.learning_rate, args.epochs, args.dropout, args.weight_decay,
            args.max_degree, args.samples_1, args.samples_2, args.dim_1,
            args.dim_2, args.random_context, args.neg_sample_size,
            args.batch_size, args.identity_dim, args.save_embeddings,
            args.base_log_dir, args.validate_iter, args.validate_batch_size,
            args.gpu, args.print_every, args.max_total_steps,
            args.log_device_placement)

        # Train model if needed:
        if not evaluation_model._has_persistent_model():
            print("Classifier not trained yet. Training now...")
            timer = Timer()
            timer.tic()
            evaluation_model.train(graphsage_model_citations,
                                   graphsage_model_authors)
            print("Training finished.")
            timer.toc()
        else:
            evaluation_model._load_model_classifier()

        # Load test data
        print("Loading test data...")
        query_test, query_test_authors, truth = evaluation_model.load_data()
        print("Loaded.")

        # Infer embeddings
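        # Each GraphSAGE model is queried in a separate child process
        # (presumably so that GPU memory is released when the process exits;
        # this is an assumption, the code does not state the reason). Note
        # that the queue is drained with get() before join(): a process that
        # still holds undelivered items in an mp.Queue may not terminate, so
        # joining first could deadlock.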
        print("Inferring embeddings for the citations graph.")
        queue_citations = mp.Queue()
        process_citations = mp.Process(
            target=evaluation_model.infer_embeddings,
            args=(query_test, None, "citations", graphsage_model_citations,
                  args.model_checkpoint_citations, queue_citations))
        process_citations.start()
        embeddings_citations = queue_citations.get()
        process_citations.join()
        process_citations.terminate()

        print("Inferring embeddings for the authors graph.")
        queue_authors = mp.Queue()
        process_authors = mp.Process(target=evaluation_model.infer_embeddings,
                                     args=(query_test, query_test_authors,
                                           "authors", graphsage_model_authors,
                                           args.model_checkpoint_authors,
                                           queue_authors))
        process_authors.start()
        embeddings_authors = queue_authors.get()
        process_authors.join()
        process_authors.terminate()

        # Concatenate embeddings
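        # Feature-wise concatenation: each test paper is represented by its
        # citation-graph embedding followed by its author-graph embedding,
        # matching the combined dimensionality the classifier was trained on
        # (assuming both inference runs return rows in the same query order).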
        test_embeddings = np.concatenate(
            (embeddings_citations, embeddings_authors), axis=1)

        print("Computing predictions...")
        recommendation = evaluation_model.compute_predictions(test_embeddings)
        print("Predictions computed.")

        # Evaluate
        print("Evaluating...")
        evaluation = EvaluationContainer()
        evaluation.evaluate(recommendation, truth)
        print("Finished.")
Beispiel #14
0
class Processor():
    def __init__(self, embedding_type, graph_type, threshold=2, gpu=0):
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

        self.embedding_type = embedding_type
        self.graph_type = graph_type
        self.threshold = threshold
        self.embeddings_parser = EmbeddingsParser(gpu)
        self.timer = Timer()
        self.path_persistent = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "..", "..", "..",
            "data", "interim", "graphsage", self.embedding_type,
            self.graph_type)
        if not os.path.exists(self.path_persistent):
            os.makedirs(self.path_persistent)

    def training_data(self, num_walks=50):
        self.prefix = "train_val"
        self.timer.tic()
        print("Creating training files.")

        # Load training and validation data
        d_train = DataLoader()
        df_train = d_train.training_data_with_abstracts_citations().data

        d_val = DataLoader()
        df_validation = d_val.validation_data_with_abstracts_citations().data

        # Create and save graph
        self.G = nx.Graph()

        # Add nodes and edges
        print("Adding training nodes.")
        self._add_nodes(df_train, test=False, val=False)

        print("Adding training edges.")
        if self.graph_type == "citations" or self.graph_type == "authors":
            if self.graph_type == "authors":
                df_train = d_train.author_names().data
            self._add_edges(df_train)
        elif self.graph_type == "citations_authors_het_edges":
            # Adding heterogeneous edges
            # Add citation edges
            self._add_weighted_edges_citations(df_train)
            # Add author edges
            df_train = d_train.author_names().data
            self._add_weighted_edges_authors(df_train)
        else:
            raise KeyError("Graph type unknown.")

        print("Adding validation nodes.")
        self._add_nodes(df_validation, test=False, val=True)

        print("Adding validation edges.")
        if self.graph_type == "citations" or self.graph_type == "authors":
            if self.graph_type == "authors":
                df_validation = d_val.author_names().data
            self._add_edges(df_validation)
        elif self.graph_type == "citations_authors_het_edges":
            # Add citation edges
            self._add_weighted_edges_citations(df_validation)
            # Add author edges
            df_validation = d_val.author_names().data
            self._add_weighted_edges_authors(df_validation)
        else:
            raise KeyError("Graph type unknown.")

        if self.graph_type == "citations_authors_het_edges":
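            # Weighting scheme implied by the helpers above: citation edges
            # are added with weight 100, so they always survive a small
            # threshold, while author edges accumulate weight 1 per shared
            # author. With the default threshold of 2, an author-only edge is
            # kept only if the two papers share at least two authors.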
            # Remove edges with weight lower than threshold
            remove_edges = [(u, v) for u, v, e in self.G.edges(data=True)
                            if e["weight"] < self.threshold]
            self.G.remove_edges_from(remove_edges)
            # Clear edge attributes
            for n1, n2, d in self.G.edges(data=True):
                d.clear()
            print("Edges in graph: {}.\n".format(self.G.number_of_edges()))

        print("Removing nodes without features.")
        for node in list(self.G.nodes()):
            if "feature" not in self.G.nodes[node].keys():
                self.G.remove_node(node)
        print("Nodes in graph: {}, edges in graph: {}.\n".format(
            self.G.number_of_nodes(), self.G.number_of_edges()))

        print("Saving graph to disk.")
        G_data = json_graph.node_link_data(self.G)
        with open(os.path.join(self.path_persistent, self.prefix + "-G.json"),
                  "w") as f:
            f.write(json.dumps(G_data))

        # Create and save class map
        self.label_encoder = OneHotEncoder(handle_unknown='ignore',
                                           sparse=False,
                                           dtype=int)
        data = pd.concat((df_train, df_validation), ignore_index=True)
        labels = data.conferenceseries.unique()
        labels = labels.reshape(-1, 1)
        self.label_encoder.fit(labels)
        self._create_class_map(data)

        # Create and save id map
        self._create_id_map()

        # Create and save features
        self._create_features()

        # Perform and save random walks
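        # (The walks are restricted to the subgraph of training-only nodes,
        # so validation/test nodes never appear as context; in the GraphSAGE
        # setup the resulting co-occurrence pairs act as positive examples
        # for the unsupervised loss.)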
        nodes = [
            n for n in list(self.G.nodes())
            if not self.G.nodes[n]["val"] and not self.G.nodes[n]["test"]
        ]
        subgraph = self.G.subgraph(nodes)
        self._run_random_walks(subgraph, nodes, num_walks)

        print("Finished creating training files.")
        self.timer.toc()

        # print some statistics
        self._get_stats()

        # Plot degree histogram
        self._degree_histogram()

    def test_data(self,
                  df_test,
                  G_train,
                  authors_df=None,
                  class_map=None,
                  normalize=True):
        # TODO: Add case for authors
        self.prefix = "test"
        print("Preprocessing data...")
        self.G = G_train
        print("Training graph has {} nodes and {} edges.\n".format(
            self.G.number_of_nodes(), self.G.number_of_edges()))

        # Add nodes and edges
        print("Adding test nodes.")
        self._add_nodes(df_test, test=True, val=False)

        print("Adding test edges.")
        if self.graph_type == "citations" or self.graph_type == "authors":
            if self.graph_type == "authors":
                if authors_df is not None:
                    df_test = pd.merge(df_test,
                                       authors_df,
                                       how="left",
                                       on="chapter")
                else:
                    raise ValueError("Chapter authors are missing.")
            self._add_edges(df_test)
        elif self.graph_type == "citations_authors_het_edges":
            # Adding heterogeneous edges
            # Add citation edges
            self._add_weighted_edges_citations(df_test)

            # Add author edges
            if authors_df is not None:
                df_test = pd.merge(df_test,
                                   authors_df,
                                   how="left",
                                   on="chapter")
            else:
                raise ValueError("Chapter authors are missing.")
            self._add_weighted_edges_authors(df_test)

            # Remove edges with weight lower than threshold
            remove_edges = [
                (u, v) for u, v, e in self.G.edges(data=True)
                if "weight" in e.keys() and e["weight"] < self.threshold
            ]
            self.G.remove_edges_from(remove_edges)

            # Clear edge attributes
            for n1, n2, d in self.G.edges(data=True):
                d.clear()
            print("Edges in graph: {}.\n".format(self.G.number_of_edges()))
        else:
            raise KeyError("Graph type unknown.")

        print("Removing nodes without features.")
        for node in list(self.G.nodes()):
            if "feature" not in self.G.nodes[node].keys():
                self.G.remove_node(node)
        print("Nodes in graph: {}, edges in graph: {}.\n".format(
            self.G.number_of_nodes(), self.G.number_of_edges()))

        # Remove all nodes that do not have val/test annotations
        broken_count = 0
        for node in list(self.G.nodes()):
            if ('val' not in self.G.nodes[node]
                    or 'test' not in self.G.nodes[node]):
                self.G.remove_node(node)
                broken_count += 1
        print(
            "Removed {} nodes that lacked proper annotations due to networkx versioning issues."
            .format(broken_count))

        # Make sure the graph has edge train_removed annotations
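        # (In the GraphSAGE reference implementation, edges flagged
        # train_removed=True -- i.e. touching a val/test node -- are skipped
        # when the training adjacency is built, so held-out nodes do not
        # influence neighbourhood sampling during training.)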
        for edge in self.G.edges():
            if (self.G.nodes[edge[0]]['val'] or self.G.nodes[edge[1]]['val']
                    or self.G.nodes[edge[0]]['test']
                    or self.G.nodes[edge[1]]['test']):
                self.G[edge[0]][edge[1]]['train_removed'] = True
            else:
                self.G[edge[0]][edge[1]]['train_removed'] = False

        # Create and process id map
        id_map = self._create_id_map()

        if isinstance(list(self.G.nodes)[0], int):
            conversion = lambda n: int(n)
        else:
            conversion = lambda n: n
        id_map = {conversion(k): int(v) for k, v in id_map.items()}

        # Create and process features
        features = self._create_features()

        if normalize:
            train_ids = np.array([
                id_map[n] for n in self.G.nodes()
                if not self.G.nodes[n]['val'] and not self.G.nodes[n]['test']
            ])
            train_feats = features[train_ids]
            scaler = StandardScaler()
            scaler.fit(train_feats)
            features = scaler.transform(features)
        print("Finished preprocessing data.")

        # print some statistics
        self._get_stats()

        # Plot degree histogram
        self._degree_histogram()

        # Add "fake" temporary classes for test nodes in class map
        if class_map is not None:
            test_nodes = [n for n in self.G.nodes() if self.G.nodes[n]['test']]
            for test_node in test_nodes:
                class_map[test_node] = np.zeros(
                    (len(class_map[list(class_map.keys())[0]]), ), dtype=int)
            return self.G, features, id_map, class_map

        return self.G, features, id_map

    def _add_nodes(self, data, test=False, val=False):
        with tqdm(desc="Adding nodes: ", total=len(data), unit="node") as pbar:
            for idx in range(len(data)):
                self.G.add_node(
                    data.chapter.iloc[idx],
                    test=test,
                    feature=np.concatenate(
                        (self.embeddings_parser.embed_sequence(
                            data.chapter_title.iloc[idx], self.embedding_type),
                         self.embeddings_parser.embed_sequence(
                             data.chapter_abstract.iloc[idx],
                             self.embedding_type)),
                        axis=0).tolist(),
                    val=val)
                pbar.update(1)
        print("Nodes in graph: {}.\n".format(self.G.number_of_nodes()))

    def _add_edges(self, data):
        if self.graph_type == "citations":
            self._add_edges_citations(data)
        elif self.graph_type == "authors":
            self._add_edges_authors(data)
        else:
            raise KeyError("Graph type unknown.")

    def _add_edges_citations(self, data):
        """Adds edges between papers that share a citation.
        """
        with tqdm(desc="Adding edges: ", total=len(data)) as pbar:
            for idx in range(len(data)):
                self.G.add_edges_from([
                    (data.chapter.iloc[idx],
                     data.chapter_citations.iloc[idx][i])
                    for i in range(len(data.chapter_citations.iloc[idx]))
                ])
                pbar.update(1)
        print("Edges in graph: {}.\n".format(self.G.number_of_edges()))

    def _add_weighted_edges_citations(self, data):
        """Adds edges between papers that share a citation.
        """
        with tqdm(desc="Adding edges: ", total=len(data)) as pbar:
            for idx in range(len(data)):
                self.G.add_edges_from(
                    [(data.chapter.iloc[idx],
                      data.chapter_citations.iloc[idx][i])
                     for i in range(len(data.chapter_citations.iloc[idx]))],
                    weight=100)
                pbar.update(1)
        print("Edges in graph: {}.\n".format(self.G.number_of_edges()))

    def _add_edges_authors(self, data):
        """Adds edges between papers sharing an author.
        """
        data_grouped = data.groupby("author_name")["chapter"].agg(
            list).reset_index()
        with tqdm(desc="Adding edges: ", total=len(data_grouped)) as pbar:
            for idx in range(len(data_grouped)):
                self.G.add_edges_from(
                    combinations(data_grouped.iloc[idx].chapter, 2))
                pbar.update(1)
        print("Edges in graph: {}.\n".format(self.G.number_of_edges()))

    def _add_weighted_edges_authors(self, data):
        """Adds edges between papers sharing an author.
        """
        data_grouped = data.groupby("author_name")["chapter"].agg(
            list).reset_index()
        with tqdm(desc="Adding edges: ", total=len(data_grouped)) as pbar:
            for idx in range(len(data_grouped)):
                edges = combinations(data_grouped.iloc[idx].chapter, 2)
                for edge in edges:
                    if self.G.has_edge(edge[0], edge[1]):
                        self.G[edge[0]][edge[1]]["weight"] += 1
                    else:
                        self.G.add_edge(edge[0], edge[1], weight=1)
                pbar.update(1)
        print("Edges in graph: {}.\n".format(self.G.number_of_edges()))

    def _create_class_map(self, data):
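        """Maps every graph node (chapter id) to its one-hot label vector.

        The resulting dict has the form {chapter_id: [0, ..., 1, ..., 0]},
        with one position per known conferenceseries; it is written to
        <prefix>-class_map.json together with the fitted label encoder.
        """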
        print("Creating class map.")
        nodes = list(self.G.nodes)
        class_map = {
            nodes[i]: [
                int(j) for j in list(
                    self.label_encoder.transform(
                        np.array(data[data.chapter == nodes[i]].
                                 conferenceseries).reshape(-1, 1))[0])
            ]
            for i in range(len(nodes))
        }
        print("Saving class map to disk.")
        with open(
                os.path.join(self.path_persistent,
                             self.prefix + "-class_map.json"), "w") as f:
            f.write(json.dumps(class_map))

        with open(os.path.join(self.path_persistent, "label_encoder.pkl"),
                  "wb") as f:
            pickle.dump(self.label_encoder, f)

    def _create_id_map(self):
        if self.prefix == "train_val":
            print("Creating id map.")

        nodes = list(self.G.nodes)
        id_map = {nodes[i]: i for i in range(len(nodes))}

        if self.prefix == "test":
            return id_map
        else:
            print("Saving id map to disk.")
            with open(
                    os.path.join(self.path_persistent,
                                 self.prefix + "-id_map.json"), "w") as f:
                f.write(json.dumps(id_map))

    def _create_features(self):
        if self.prefix == "train_val":
            print("Creating features.")

        features = np.array(
            [self.G.nodes[node]["feature"] for node in list(self.G.nodes)])

        if self.prefix == "test":
            return features
        else:
            print("Saving features to disk.")
            np.save(
                os.path.join(self.path_persistent, self.prefix + "-feats.npy"),
                features)

    def _run_random_walks(self, graph, nodes, num_walks):
        print("Running random walks.")
        walks = run_random_walks(graph, nodes, num_walks=num_walks)
        print("Saving random walks to disk.")
        with open(
                os.path.join(self.path_persistent, self.prefix + "-walks.txt"),
                "w") as fp:
            fp.write("\n".join([str(w[0]) + "\t" + str(w[1]) for w in walks]))

    def _get_stats(self):
        degree_sequence = sorted([d for n, d in self.G.degree()], reverse=True)
        degree_count = Counter(degree_sequence)

        with open(
                os.path.join(self.path_persistent, self.prefix + "-stats.txt"),
                "w") as fp:
            self._print(
                "Number of nodes in the graph: {}\n".format(
                    self.G.number_of_nodes()), fp)
            self._print(
                "Number of edges in the graph: {}\n".format(
                    self.G.number_of_edges()), fp)
            self._print(
                "The graph is connected: {}\n".format(nx.is_connected(self.G)),
                fp)
            self._print(
                "Number of connected components: {}\n".format(
                    nx.number_connected_components(self.G)), fp)
            self._print(
                "Number of self-loops: {}\n".format(
                    nx.number_of_selfloops(self.G)), fp)
            self._print("Maximum degree: {}\n".format(max(degree_count)), fp)
            self._print("Minimum degree: {}\n".format(min(degree_count)), fp)
            self._print(
                "Average degree: {}\n".format(
                    sum(degree_sequence) / len(self.G)), fp)

    def _degree_histogram(self):
        # Plot degree histogram
        degree_sequence = sorted([d for n, d in self.G.degree()], reverse=True)
        degree_count = Counter(degree_sequence)
        deg, cnt = zip(*degree_count.items())

        fig, ax = plt.subplots()
        plt.bar(deg, cnt, width=0.80, color='b')
        plt.title("Degree Histogram")
        plt.ylabel("Count")
        plt.xlabel("Degree")
        ax.set_xticks([d + 0.4 for d in deg])
        ax.set_xticklabels(deg)

        plt.savefig(os.path.join(self.path_persistent,
                                 self.prefix + "-degree_histogram.png"),
                    bbox_inches="tight")

    def _print(self, text, f):
        print(text)
        f.write(text)

def main():
    parser = argparse.ArgumentParser(
        description='Arguments for data preprocessing.')
    parser.add_argument('embedding_type',
                        choices=[
                            "AVG_L", "AVG_2L", "AVG_SUM_L4", "AVG_SUM_ALL",
                            "MAX_2L", "CONC_AVG_MAX_2L",
                            "CONC_AVG_MAX_SUM_L4", "SUM_L", "SUM_2L"
                        ],
                        help="Type of embedding.")
    parser.add_argument('dataset',
                        help='Name of the object file that stores the ' +
                        'training data.')
    parser.add_argument('--threshold',
                        type=int,
                        default=2,
                        help='Threshold for edge weights in ' +
                        'heterogeneous graph.')
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='Which gpu to use.')
    args = parser.parse_args()
    print("Starting...")
    from preprocess_data import Processor
    processor = Processor(args.embedding_type, args.dataset,
                          args.threshold, args.gpu)
    processor.training_data()
    print("Finished.")


if __name__ == "__main__":
    main()
Beispiel #15
0
class Processor:
    def __init__(self, embedding_type, dataset, gpu=0):
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

        self.embedding_type = embedding_type
        self.dataset = dataset
        self.embeddings_parser = EmbeddingsParser(gpu)
        self.timer = Timer()
        self.path_persistent = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "..", "..", "..",
            "data", "interim", "gat", self.embedding_type, self.dataset)
        if not os.path.exists(self.path_persistent):
            os.makedirs(self.path_persistent)

    def training_data(self):
        self.timer.tic()
        print("Creating training files.\n")

        # Load training and validation data
        d_train = DataLoader()
        df_train = d_train.training_data_with_abstracts_citations().data

        d_val = DataLoader()
        df_validation = d_val.validation_data_with_abstracts_citations().data

        train_val_data = pd.concat((df_train, df_validation),
                                   axis=0).reset_index(drop=True)

        # Create file with feature vectors for both training and validation
        # data (as a scipy.sparse.csr.csr_matrix object)
        print("Creating feature vectors for training and validation data.")
        train_val_features = self._create_features(train_val_data)
        print("Created.")
        print("Saving to disk...")
        allx_file = os.path.join(self.path_persistent,
                                 "ind." + self.dataset + ".allx")
        with open(allx_file, "wb") as f:
            pickle.dump(train_val_features, f)
        print("Saved.\n")

        # Create file with feature vectors only for training data
        # (as a scipy.sparse.csr.csr_matrix object)
        print("Creating feature vectors for training data.")
        train_features = train_val_features[:len(df_train)]
        print("Created.")
        print("Saving to disk...")
        x_file = os.path.join(self.path_persistent,
                              "ind." + self.dataset + ".x")
        with open(x_file, "wb") as f:
            pickle.dump(train_features, f)
        print("Saved.\n")

        # Create file with the labels for the training and validation data
        # (as a numpy.ndarray object)
        print("Creating labels for training and validation data.")
        self._train_label_encoder(train_val_data)
        train_val_labels = self.label_encoder.transform(
            np.array(train_val_data.conferenceseries).reshape(-1, 1))
        print("Created.")
        print("Saving to disk...")
        ally_file = os.path.join(self.path_persistent,
                                 "ind." + self.dataset + ".ally")
        with open(ally_file, "wb") as f:
            pickle.dump(train_val_labels, f)
        print("Saved.\n")

        # Create file with the labels for the training data
        # (as a numpy.ndarray object)
        print("Creating labels for training data.")
        train_labels = train_val_labels[:len(df_train)]
        print("Created.")
        print("Saving to disk...")
        y_file = os.path.join(self.path_persistent,
                              "ind." + self.dataset + ".y")
        with open(y_file, "wb") as f:
            pickle.dump(train_labels, f)
        print("Saved.\n")

        # Create a dict in the format {index: [index_of_neighbor_nodes]}
        # (as a collections.defaultdict object)
        print("Creating dictionary of neighbours.")
        graph = defaultdict(list)
        with tqdm(desc="Adding neighbours: ",
                  total=len(train_val_data)) as pbar:
            for idx in range(len(train_val_data)):
                citations_indices = [
                    train_val_data[train_val_data.chapter ==
                                   citation].index.tolist()
                    for citation in train_val_data.chapter_citations.iloc[idx]
                ]
                neighbours = [c[0] for c in citations_indices if c]
                graph[idx].extend(neighbours)
                for node in neighbours:
                    graph[node].append(idx)
                pbar.update(1)
        with tqdm(desc="Removing duplicates: ",
                  total=len(graph.keys())) as pbar:
            for idx in range(len(graph.keys())):
                graph[idx] = list(set(graph[idx]))
                pbar.update(1)
        print("Saving to disk...")
        graph_file = os.path.join(self.path_persistent,
                                  "ind." + self.dataset + ".graph")
        with open(graph_file, "wb") as f:
            pickle.dump(graph, f)
        print("Saved.\n")

        print("Statistics")
        print("\tTraining data features: {}.".format(train_features.shape))
        print("\tTraining data labels: {}.".format(len(train_labels)))
        print("\tTraining and validation data features: {}.".format(
            train_val_features.shape))
        print("\tTraining and validation data labels: {}.".format(
            len(train_val_labels)))
        print("\tGraph size: {}.".format(len(graph)))

    def _create_features(self, data):
        features = []
        with tqdm(desc="Creating features: ", total=len(data)) as pbar:
            for idx in range(len(data)):
                features.append(
                    np.concatenate((self.embeddings_parser.embed_sequence(
                        data.chapter_title.iloc[idx], self.embedding_type),
                                    self.embeddings_parser.embed_sequence(
                                        data.chapter_abstract.iloc[idx],
                                        self.embedding_type)),
                                   axis=0).tolist())
                pbar.update(1)
        return sp.csr_matrix(np.array(features))

    def _train_label_encoder(self, data):
        self.label_encoder = OneHotEncoder(handle_unknown='ignore',
                                           sparse=False,
                                           dtype=int)
        labels = data.conferenceseries.unique()
        labels = labels.reshape(-1, 1)
        self.label_encoder.fit(labels)
        with open(os.path.join(self.path_persistent, "label_encoder.pkl"),
                  "wb") as f:
            pickle.dump(self.label_encoder, f)

    def test_data(self, df_test, train_features, train_labels,
                  train_val_features, train_val_labels, graph):
        print("Preprocessing data...")
        # Load training and validation data
        d_train = DataLoader()
        df_train = d_train.training_data_with_abstracts_citations().data

        d_val = DataLoader()
        df_validation = d_val.validation_data_with_abstracts_citations().data

        train_val_data = pd.concat((df_train, df_validation),
                                   axis=0).reset_index(drop=True)

        # Create the indices of test instances in graph (as a list object)
        test_indices = list(df_test.index)

        # Create "fake" temporary labels for test data
        test_labels = np.zeros((len(df_test), len(train_val_labels[0])),
                               dtype=int)

        # Update graph with test data
        print("Updating graph information...")
        with tqdm(desc="Adding neighbours: ", total=len(df_test)) as pbar:
            for idx in list(df_test.index):
                citations_indices = [
                    train_val_data[train_val_data.chapter ==
                                   citation].index.tolist()
                    for citation in df_test.chapter_citations.loc[idx]
                ]
                neighbours = [c[0] for c in citations_indices if c]
                graph[idx].extend(neighbours)
                for node in neighbours:
                    graph[node].append(idx)
                pbar.update(1)
        with tqdm(desc="Removing duplicates: ",
                  total=len(graph.keys())) as pbar:
            for idx in range(len(graph.keys())):
                graph[idx] = list(set(graph[idx]))
                pbar.update(1)
        print("Updated.")

        # Create feature vectors of test instances
        print("Creating features for test data...")
        test_features = self._create_features(df_test)
        print("Created.")

        max_degree = len(max(graph.values(), key=len))
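        # The re-ordering below mirrors the Planetoid-style loaders used by
        # the original GCN code: test rows are moved into sorted-index order
        # in both the feature and label matrices (if df_test.index is already
        # sorted, these assignments are effectively no-ops).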
        test_idx_range = np.sort(test_indices)
        features = sp.vstack((train_val_features, test_features)).tolil()
        features[test_indices, :] = features[test_idx_range, :]
        adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

        labels = np.vstack((train_val_labels, test_labels))
        labels[test_indices, :] = labels[test_idx_range, :]

        idx_test = test_idx_range.tolist()
        idx_train = range(len(train_labels))
        idx_val = range(len(train_labels), len(train_val_labels))

        train_mask = sample_mask(idx_train, labels.shape[0])
        val_mask = sample_mask(idx_val, labels.shape[0])
        test_mask = sample_mask(idx_test, labels.shape[0])

        y_train = np.zeros(labels.shape)
        y_val = np.zeros(labels.shape)
        y_test = np.zeros(labels.shape)
        y_train[train_mask, :] = labels[train_mask, :]
        y_val[val_mask, :] = labels[val_mask, :]
        y_test[test_mask, :] = labels[test_mask, :]
        print("Finished preprocessing data.\n")

        print("Adjacency matrix shape: {}.".format(adj.shape))
        print("Features matrix shape: {}.".format(features.shape))
        print("Graph size: {}.".format(len(graph)))
        print("Max degree: {}.\n".format(max_degree))

        dataset = [adj, features, y_train, y_test, train_mask, test_mask]
        prepared_test_data = self._prepare_test_data(dataset, max_degree)
        return prepared_test_data, max_degree

    def _prepare_test_data(self, dataset, max_degree):
        print("Preparing test data...")
        adj, features, y_train, y_test, train_mask, test_mask = dataset
        train_index = np.where(train_mask)[0]
        adj_train = adj[train_index, :][:, train_index]
        y_train = y_train[train_index]
        test_index = np.where(test_mask)[0]
        y_test = y_test[test_index]

        num_train = adj_train.shape[0]
        input_dim = features.shape[1]

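        # The helpers below come from the sampling-based GCN utilities this
        # project builds on; presumably nontuple_preprocess_features
        # row-normalises the feature matrix, nontuple_preprocess_adj returns
        # the normalised adjacency, and compute_adjlist builds fixed-width
        # (max_degree) adjacency lists padded with a dummy node index, which
        # is why a zero feature row is appended to train_features below.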
        features = nontuple_preprocess_features(features).todense()
        train_features = features[train_index]

        norm_adj_train = nontuple_preprocess_adj(adj_train)
        norm_adj = nontuple_preprocess_adj(adj)

        adj_train, adj_val_train = compute_adjlist(norm_adj_train, max_degree)
        train_features = np.concatenate(
            (train_features, np.zeros((1, input_dim))))
        print("Prepared.\n")
        return norm_adj, adj_train, adj_val_train, features, train_features, y_train, y_test, test_index

def main():
    parser = argparse.ArgumentParser(
        description='Arguments for data preprocessing.')
    parser.add_argument('embedding_type',
                        choices=[
                            "AVG_L", "AVG_2L", "AVG_SUM_L4", "AVG_SUM_ALL",
                            "MAX_2L", "CONC_AVG_MAX_2L",
                            "CONC_AVG_MAX_SUM_L4", "SUM_L", "SUM_2L"
                        ],
                        help="Type of embedding.")
    parser.add_argument('dataset',
                        help='Name of the object file that stores the ' +
                        'training data.')
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='Which gpu to use.')
    args = parser.parse_args()
    print("Starting...")
    from preprocess_data import Processor
    processor = Processor(args.embedding_type, args.dataset, args.gpu)
    processor.training_data()
    print("Finished.")


if __name__ == "__main__":
    main()
class Processor:
    def __init__(self,
                 embedding_type,
                 dataset,
                 graph_type="directed",
                 threshold=2,
                 gpu=0):
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

        self.embedding_type = embedding_type
        self.dataset = dataset
        self.graph_type = graph_type
        self.threshold = threshold
        self.embeddings_parser = EmbeddingsParser(gpu)
        self.timer = Timer()
        self.path_persistent = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "..", "..", "..",
            "data", "interim", "gat", self.embedding_type, self.dataset)
        if not os.path.exists(self.path_persistent):
            os.makedirs(self.path_persistent)

    def training_data(self):
        self.timer.tic()
        print("Creating training files.\n")

        # Load training and validation data
        d_train = DataLoader()
        df_train = d_train.training_data_with_abstracts_citations().data

        d_val = DataLoader()
        df_validation = d_val.validation_data_with_abstracts_citations().data

        train_val_data = pd.concat((df_train, df_validation),
                                   axis=0).reset_index(drop=True)

        # Create file with feature vectors for both training and validation
        # data (as a scipy.sparse.csr.csr_matrix object)
        print("Creating feature vectors for training and validation data.")
        train_val_features = self._create_features(train_val_data)
        print("Created.")
        print("Saving to disk...")
        allx_file = os.path.join(self.path_persistent,
                                 "ind." + self.dataset + ".allx")
        with open(allx_file, "wb") as f:
            pickle.dump(train_val_features, f)
        print("Saved.\n")

        # Create file with feature vectors only for training data
        # (as a scipy.sparse.csr.csr_matrix object)
        print("Creating feature vectors for training data.")
        train_features = train_val_features[:len(df_train)]
        print("Created.")
        print("Saving to disk...")
        x_file = os.path.join(self.path_persistent,
                              "ind." + self.dataset + ".x")
        with open(x_file, "wb") as f:
            pickle.dump(train_features, f)
        print("Saved.\n")

        # Create file with the labels for the training and validation data
        # (as a numpy.ndarray object)
        print("Creating labels for training and validation data.")
        self._train_label_encoder(train_val_data)
        train_val_labels = self.label_encoder.transform(
            np.array(train_val_data.conferenceseries).reshape(-1, 1))
        print("Created.")
        print("Saving to disk...")
        ally_file = os.path.join(self.path_persistent,
                                 "ind." + self.dataset + ".ally")
        with open(ally_file, "wb") as f:
            pickle.dump(train_val_labels, f)
        print("Saved.\n")

        # Create file with the labels for the training data
        # (as a numpy.ndarray object)
        print("Creating labels for training data.")
        train_labels = train_val_labels[:len(df_train)]
        print("Created.")
        print("Saving to disk...")
        y_file = os.path.join(self.path_persistent,
                              "ind." + self.dataset + ".y")
        with open(y_file, "wb") as f:
            pickle.dump(train_labels, f)
        print("Saved.\n")

        # Create a dict in the format {index: [index_of_neighbor_nodes]}
        # (as a collections.defaultdict object)
        if self.dataset == "citations":
            if self.graph_type == "directed":
                graph = self._create_directed_graph(train_val_data)
            else:
                graph = self._create_undirected_graph(train_val_data)
        elif self.dataset == "citations_authors_het_edges":
            df_train_authors = d_train.author_names().data
            df_val_authors = d_val.author_names().data
            train_val_authors_data = pd.concat(
                (df_train_authors, df_val_authors),
                axis=0).reset_index(drop=True)
            data_authors = train_val_authors_data.groupby(
                "author_name")["chapter"].agg(list).reset_index()
            if self.graph_type == "directed":
                graph = self._create_heterogeneous_directed_graph(
                    train_val_data, data_authors)
            else:
                raise ValueError("Graph type incompatible. Only a directed " +
                                 "graph is supported.")
        print("Finished creating training files.\n")

        print("Statistics")
        print("\tTraining data features: {}.".format(train_features.shape))
        print("\tTraining data labels: {}.".format(len(train_labels)))
        print("\tTraining and validation data features: {}.".format(
            train_val_features.shape))
        print("\tTraining and validation data labels: {}.".format(
            len(train_val_labels)))
        print("\tGraph size: {}.".format(len(graph)))
        print("\tMax node degree: {}.".format(len(max(graph.values(),
                                                      key=len))))

    def _create_directed_graph(self, train_val_data):
        print("Creating dictionary of neighbours.")
        graph = defaultdict(list)
        with tqdm(desc="Adding neighbours: ",
                  total=len(train_val_data)) as pbar:
            for idx in range(len(train_val_data)):
                citations_indices = [
                    train_val_data[train_val_data.chapter ==
                                   citation].index.tolist()
                    for citation in train_val_data.chapter_citations.iloc[idx]
                ]
                graph[idx] = list(set([i[0] for i in citations_indices if i]))
                pbar.update(1)
        print("Created.")
        print("Saving to disk...")
        graph_file = os.path.join(self.path_persistent,
                                  "ind." + self.dataset + ".graph_directed")
        with open(graph_file, "wb") as f:
            pickle.dump(graph, f)
        print("Saved.\n")
        return graph

    def _create_heterogeneous_directed_graph(self, train_val_data,
                                             data_authors):
        print("Creating dictionary of neighbours.")
        graph = defaultdict(list)
        # Add citation edges between papers
        with tqdm(desc="Adding citation neighbours: ",
                  total=len(train_val_data)) as pbar:
            for idx in range(len(train_val_data)):
                citations_indices = [
                    train_val_data[train_val_data.chapter ==
                                   citation].index.tolist()
                    for citation in train_val_data.chapter_citations.iloc[idx]
                ]
                graph[idx] = [(i[0], 100) for i in citations_indices if i]
                pbar.update(1)

        # Add edges between papers if they share an author
        with tqdm(desc="Adding author neighbours: ",
                  total=len(data_authors)) as pbar:
            for idx in range(len(data_authors)):
                authors_indices = [
                    train_val_data[train_val_data.chapter ==
                                   paper].index.tolist()
                    for paper in data_authors.chapter.iloc[idx]
                ]
                authors_indices = [i[0] for i in authors_indices if i]
                edges = [i for i in combinations(authors_indices, 2)]
                for edge in edges:
                    graph[edge[0]].append((edge[1], 1))
                pbar.update(1)

        # Remove edges with weights below the threshold
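        # Each neighbour can appear several times as an (index, weight) pair:
        # weight 100 for a citation link and weight 1 per shared author.
        # Summing per neighbour and keeping totals >= self.threshold means
        # citation neighbours always survive, while author-only neighbours
        # need at least `threshold` shared authors.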
        for key in graph.keys():
            d = defaultdict(int)
            for x, y in graph[key]:
                d[x] += y
            graph[key] = [k for k, v in d.items() if v >= self.threshold]
        print("Created.")

        print("Saving to disk...")
        graph_file = os.path.join(self.path_persistent,
                                  "ind." + self.dataset + ".graph_directed")
        with open(graph_file, "wb") as f:
            pickle.dump(graph, f)
        print("Saved.\n")
        return graph

    def _create_undirected_graph(self, train_val_data):
        print("Creating dictionary of neighbours.")
        graph = defaultdict(list)
        with tqdm(desc="Adding neighbours: ",
                  total=len(train_val_data)) as pbar:
            for idx in range(len(train_val_data)):
                citations_indices = [
                    train_val_data[train_val_data.chapter ==
                                   citation].index.tolist()
                    for citation in train_val_data.chapter_citations.iloc[idx]
                ]
                neighbours = [c[0] for c in citations_indices if c]
                graph[idx].extend(neighbours)
                for node in neighbours:
                    graph[node].append(idx)
                pbar.update(1)
        with tqdm(desc="Removing duplicates: ",
                  total=len(graph.keys())) as pbar:
            for idx in range(len(graph.keys())):
                graph[idx] = list(set(graph[idx]))
                pbar.update(1)
        print("Created.")
        print("Saving to disk...")
        graph_file = os.path.join(self.path_persistent,
                                  "ind." + self.dataset + ".graph")
        with open(graph_file, "wb") as f:
            pickle.dump(graph, f)
        print("Saved.\n")
        return graph

    def _create_features(self, data):
        features = []
        with tqdm(desc="Creating features: ", total=len(data)) as pbar:
            for idx in range(len(data)):
                features.append(
                    np.concatenate((self.embeddings_parser.embed_sequence(
                        data.chapter_title.iloc[idx], self.embedding_type),
                                    self.embeddings_parser.embed_sequence(
                                        data.chapter_abstract.iloc[idx],
                                        self.embedding_type)),
                                   axis=0).tolist())
                pbar.update(1)
        return sp.csr_matrix(np.array(features))

    def _train_label_encoder(self, data):
        self.label_encoder = OneHotEncoder(handle_unknown='ignore',
                                           sparse=False,
                                           dtype=int)
        labels = data.conferenceseries.unique()
        labels = labels.reshape(-1, 1)
        self.label_encoder.fit(labels)
        with open(os.path.join(self.path_persistent, "label_encoder.pkl"),
                  "wb") as f:
            pickle.dump(self.label_encoder, f)

    def _update_directed_graph(self, graph, train_val_data, df_test):
        with tqdm(desc="Adding neighbours: ", total=len(df_test)) as pbar:
            for idx in list(df_test.index):
                citations_indices = [
                    train_val_data[train_val_data.chapter ==
                                   citation].index.tolist()
                    for citation in df_test.chapter_citations.loc[idx]
                ]
                graph[idx] = list(set([i[0] for i in citations_indices if i]))
                pbar.update(1)
        return graph

    def _update_heterogeneous_directed_graph(self, graph, train_val_data,
                                             df_test, data_authors):
        with tqdm(desc="Adding citation neighbours: ",
                  total=len(df_test)) as pbar:
            for idx in list(df_test.index):
                citations_indices = [
                    train_val_data[train_val_data.chapter ==
                                   citation].index.tolist()
                    for citation in df_test.chapter_citations.loc[idx]
                ]
                graph[idx] = [(i[0], 100) for i in citations_indices if i]
                pbar.update(1)

        with tqdm(desc="Adding author neighbours: ",
                  total=len(data_authors)) as pbar:
            for idx in range(len(data_authors)):
                authors_indices = [
                    train_val_data[train_val_data.chapter ==
                                   paper].index.tolist()
                    for paper in data_authors.chapter.iloc[idx]
                ]
                authors_indices = [i[0] for i in authors_indices if i]
                edges = [i for i in combinations(authors_indices, 2)]
                for edge in edges:
                    graph[edge[0]].append((edge[1], 1))
                pbar.update(1)

        for key in graph.keys():
            d = defaultdict(int)
            for e in reversed(graph[key]):
                if type(e) is tuple:
                    if e[0] in d.keys():
                        d[e[0]] += e[1]
                    else:
                        d[e[0]] = e[1]
                graph[key].remove(e)
            graph[key].extend([k for k, v in d.items() if v >= self.threshold])

        return graph

    def _update_undirected_graph(self, graph, train_val_data, df_test):
        with tqdm(desc="Adding neighbours: ", total=len(df_test)) as pbar:
            for idx in list(df_test.index):
                citations_indices = [
                    train_val_data[train_val_data.chapter ==
                                   citation].index.tolist()
                    for citation in df_test.chapter_citations.loc[idx]
                ]
                neighbours = [c[0] for c in citations_indices if c]
                graph[idx].extend(neighbours)
                for node in neighbours:
                    graph[node].append(idx)
                pbar.update(1)
        with tqdm(desc="Removing duplicates: ",
                  total=len(graph.keys())) as pbar:
            for idx in range(len(graph.keys())):
                graph[idx] = list(set(graph[idx]))
                pbar.update(1)
        return graph

    def test_data(self,
                  df_test,
                  train_features,
                  train_labels,
                  train_val_features,
                  train_val_labels,
                  graph,
                  authors_df=None):
        print("Preprocessing data...")
        # Load training and validation data
        d_train = DataLoader()
        df_train = d_train.training_data_with_abstracts_citations().data

        d_val = DataLoader()
        df_validation = d_val.validation_data_with_abstracts_citations().data

        train_val_data = pd.concat((df_train, df_validation),
                                   axis=0).reset_index(drop=True)

        # Create the indices of test instances in graph (as a list object)
        test_indices = list(df_test.index)

        # Create "fake" temporary labels for test data
        test_labels = np.zeros((len(df_test), len(train_val_labels[0])),
                               dtype=int)

        # Update graph with test data
        print("Updating graph information...")
        if self.dataset == "citations":
            if self.graph_type == "directed":
                graph = self._update_directed_graph(graph, train_val_data,
                                                    df_test)
            else:
                graph = self._update_undirected_graph(graph, train_val_data,
                                                      df_test)
        elif self.dataset == "citations_authors_het_edges":
            data_authors = authors_df.groupby("author_name")["chapter"].agg(
                list).reset_index()
            if self.graph_type == "directed":
                graph = self._update_heterogeneous_directed_graph(
                    graph, train_val_data, df_test, data_authors)
            else:
                raise ValueError("Graph type incompatible. Only a directed " +
                                 "graph is supported.")
        print("Updated.")

        # Create feature vectors of test instances
        print("Creating features for test data...")
        test_features = self._create_features(df_test)
        print("Created.")

        test_idx_range = np.sort(test_indices)
        features = sp.vstack((train_val_features, test_features)).tolil()
        features[test_indices, :] = features[test_idx_range, :]
        adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

        labels = np.vstack((train_val_labels, test_labels))
        labels[test_indices, :] = labels[test_idx_range, :]

        idx_test = test_idx_range.tolist()
        idx_train = range(len(train_labels))
        idx_val = range(len(train_labels), len(train_val_labels))

        train_mask = sample_mask(idx_train, labels.shape[0])
        val_mask = sample_mask(idx_val, labels.shape[0])
        test_mask = sample_mask(idx_test, labels.shape[0])

        y_train = np.zeros(labels.shape)
        y_val = np.zeros(labels.shape)
        y_test = np.zeros(labels.shape)
        y_train[train_mask, :] = labels[train_mask, :]
        y_val[val_mask, :] = labels[val_mask, :]
        y_test[test_mask, :] = labels[test_mask, :]
        print("Finished preprocessing data.")

        print("Adjacency matrix shape: {}.".format(adj.shape))
        print("Features matrix shape: {}.".format(features.shape))
        print("Graph size: {}.".format(len(graph)))

        return adj, features, y_train, y_test, train_mask, test_mask

def main():
    parser = argparse.ArgumentParser(
        description='Arguments for data preprocessing.')
    parser.add_argument('embedding_type',
                        choices=[
                            "AVG_L", "AVG_2L", "AVG_SUM_L4", "AVG_SUM_ALL",
                            "MAX_2L", "CONC_AVG_MAX_2L",
                            "CONC_AVG_MAX_SUM_L4", "SUM_L", "SUM_2L"
                        ],
                        help="Type of embedding.")
    parser.add_argument('dataset',
                        help='Name of the object file that stores the ' +
                        'training data.')
    parser.add_argument('--graph_type',
                        choices=["directed", "undirected"],
                        default="directed",
                        help='The type of graph used ' +
                        '(directed vs. undirected).')
    parser.add_argument('--threshold',
                        type=int,
                        default=2,
                        help='Threshold for edge weights in ' +
                        'heterogeneous graph.')
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='Which gpu to use.')
    args = parser.parse_args()
    print("Starting...")
    from preprocess_data import Processor
    processor = Processor(args.embedding_type, args.dataset,
                          args.graph_type, args.threshold, args.gpu)
    processor.training_data()
    print("Finished.")


if __name__ == "__main__":
    main()
class Processor:
    def __init__(self, embedding_type, gpu=0):
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

        self.embedding_type = embedding_type
        self.embeddings_parser = EmbeddingsParser(gpu)
        self.timer = Timer()
        self.path_persistent = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "..", "..", "..",
            "data", "interim", "han", self.embedding_type)
        if not os.path.exists(self.path_persistent):
            os.makedirs(self.path_persistent)

    def training_data(self):
        self.timer.tic()
        print("Creating training files.\n")

        # Load training and validation data
        d_train = DataLoader()
        df_train = d_train.training_data_with_abstracts_citations().data

        d_val = DataLoader()
        df_validation = d_val.validation_data_with_abstracts_citations().data

        train_val_data = pd.concat((df_train, df_validation),
                                   axis=0).reset_index(drop=True)

        print("Creating index files for training and validation data.")
        train_idx = np.asarray(list(train_val_data.index))[:len(df_train)]
        train_idx = np.asarray([train_idx])
        val_idx = np.asarray(list(train_val_data.index))[len(df_train):]
        val_idx = np.asarray([val_idx])
        print("Created.")
        print("Saving to disk...")
        train_idx_file = os.path.join(self.path_persistent, "train_idx.pkl")
        val_idx_file = os.path.join(self.path_persistent, "val_idx.pkl")
        with open(train_idx_file, "wb") as f:
            pickle.dump(train_idx, f)
        with open(val_idx_file, "wb") as f:
            pickle.dump(val_idx, f)
        print("Saved.")

        print("Creating labels for training and validation data.")
        self._train_label_encoder(train_val_data)
        train_val_labels = self.label_encoder.transform(
            np.array(train_val_data.conferenceseries).reshape(-1, 1))
        print("Created.")
        print("Saving to disk...")
        labels_file = os.path.join(self.path_persistent, "labels.pkl")
        with open(labels_file, "wb") as f:
            pickle.dump(train_val_labels, f)
        print("Saved.\n")

        print("Creating feature vectors for training and validation data.")
        train_val_features = self._create_features(train_val_data)
        print("Created.")
        print("Saving to disk...")
        features_file = os.path.join(self.path_persistent, "features.pkl")
        with open(features_file, "wb") as f:
            pickle.dump(train_val_features, f)
        print("Saved.\n")

        df_train_authors = d_train.author_names().data
        df_val_authors = d_val.author_names().data
        train_val_authors_data = pd.concat((df_train_authors, df_val_authors),
                                           axis=0).reset_index(drop=True)
        data_authors = train_val_authors_data.groupby(
            "author_name")["chapter"].agg(list).reset_index()

        print("Creating adjacency matrices...")
        PCP = self._create_PCP_adjacency(train_val_data)
        PAP = self._create_PAP_adjacency(train_val_data, data_authors)
        print("Created.")

        print("Finished creating training files.\n")

        print("Statistics")
        print("\tTraining and validation data features: {}.".format(
            train_val_features.shape))
        print("\tTraining and validation data labels: {}.".format(
            train_val_labels.shape))
        print("\tPCP graph size: {}.".format(len(PCP)))
        print("\tMax node degree: {}.".format(len(max(PCP.values(), key=len))))
        print("\tPAP graph size: {}.".format(len(PAP)))
        print("\tMax node degree: {}.".format(len(max(PAP.values(), key=len))))

    def test_data(self, df_test, authors_df, train_idx, features, labels, PCP,
                  PAP):
        print("Preprocessing data...")
        # Load training and validation data
        d_train = DataLoader()
        df_train = d_train.training_data_with_abstracts_citations().data

        d_val = DataLoader()
        df_validation = d_val.validation_data_with_abstracts_citations().data

        train_val_data = pd.concat((df_train, df_validation),
                                   axis=0).reset_index(drop=True)
        data_authors = authors_df.groupby("author_name")["chapter"].agg(
            list).reset_index()

        # Create the indices of test instances in graph (as a list object)
        test_idx = np.asarray(list(df_test.index))
        test_idx = np.asarray([test_idx])

        # Create "fake" temporary labels for test data
        test_labels = np.zeros((len(df_test), len(labels[0])), dtype=int)
        labels = np.vstack((labels, test_labels))

        train_mask = sample_mask(train_idx, labels.shape[0])
        test_mask = sample_mask(test_idx, labels.shape[0])
        y_train = np.zeros(labels.shape)
        y_test = np.zeros(labels.shape)
        y_train[train_mask, :] = labels[train_mask, :]
        y_test[test_mask, :] = labels[test_mask, :]
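        # sample_mask is assumed to follow the usual GCN-style helper: given a
        # list of indices and a total length, it returns a boolean mask of that
        # length. A minimal sketch of what is assumed here:
        #
        #     def sample_mask(idx, length):
        #         mask = np.zeros(length)
        #         mask[idx] = 1
        #         return np.array(mask, dtype=bool)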

        # Update graph with test data
        print("Updating graph information...")
        PCP_graph = self._update_PCP_adjacency(PCP, train_val_data, df_test)
        PAP_graph = self._update_PAP_adjacency(PAP, train_val_data, df_test,
                                               data_authors)
        print("Updated.")
        PAP = nx.adjacency_matrix(nx.from_dict_of_lists(PAP_graph))
        PCP = nx.adjacency_matrix(nx.from_dict_of_lists(PCP_graph))
        row_networks = [PCP, PAP]
        print("PCP: {}; PAP: {}".format(PCP.shape, PAP.shape))

        # Create feature vectors of test instances
        print("Creating features for test data...")
        test_features = self._create_features(df_test)
        features = np.vstack((features, test_features))
        print("Features: {}".format(features.shape))
        print("Created.")

        print("Finished preprocessing data.")
        print("y_train: {}, y_test: {}, train_idx: {}, test_idx: {}".format(
            y_train.shape, y_test.shape, train_idx.shape, test_idx.shape))

        features_list = [features, features, features]
        return row_networks, features_list, y_train, y_test, train_mask, test_mask

    def _create_PCP_adjacency(self, data):
        print("Creating paper-citation-paper adjacency lists.")
        graph = defaultdict(list)
        with tqdm(desc="Adding neighbours: ", total=len(data)) as pbar:
            for idx in range(len(data)):
                citations_indices = [
                    data[data.chapter == citation].index.tolist()
                    for citation in data.chapter_citations.iloc[idx]
                ]
                graph[idx] = list(set([i[0] for i in citations_indices if i]))
                pbar.update(1)
        print("Created.")
        print("Saving to disk...")
        graph_file = os.path.join(self.path_persistent, "PCP.pkl")
        with open(graph_file, "wb") as f:
            pickle.dump(graph, f)
        print("Saved.\n")
        return graph

    def _update_PCP_adjacency(self, graph, data, df_test):
        print("Updating paper-citation-paper adjacency lists.")
        with tqdm(desc="Adding neighbours: ", total=len(df_test)) as pbar:
            for idx in list(df_test.index):
                citations_indices = [
                    data[data.chapter == citation].index.tolist()
                    for citation in df_test.chapter_citations.loc[idx]
                ]
                graph[idx] = list(set([i[0] for i in citations_indices if i]))
                pbar.update(1)
        print("Updated.")
        return graph

    def _create_PAP_adjacency(self, data, data_authors):
        print("Creating paper-author-paper adjacency lists.")
        graph = defaultdict()
        for idx in data.index:
            graph[idx] = []
        # Add edges between papers if they share an author
        with tqdm(desc="Adding neighbours: ", total=len(data_authors)) as pbar:
            for idx in range(len(data_authors)):
                authors_indices = [
                    data[data.chapter == paper].index.tolist()
                    for paper in data_authors.chapter.iloc[idx]
                ]
                authors_indices = [i[0] for i in authors_indices if i]
                edges = [i for i in combinations(authors_indices, 2)]
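                # Only one direction of each co-author pair is recorded here;
                # nx.from_dict_of_lists (used later in test_data) builds an
                # undirected graph, so the adjacency matrix ends up symmetric.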
                for edge in edges:
                    graph[edge[0]].append(edge[1])
                pbar.update(1)
        print("Created.")
        print("Saving to disk...")
        graph_file = os.path.join(self.path_persistent, "PAP.pkl")
        with open(graph_file, "wb") as f:
            pickle.dump(graph, f)
        print("Saved.\n")
        return graph

    def _update_PAP_adjacency(self, graph, data, df_test, data_authors):
        print("Updating paper-author-paper adjacency lists.")
        for idx in df_test.index:
            graph[idx] = []
        with tqdm(desc="Adding neighbours: ", total=len(data_authors)) as pbar:
            for idx in range(len(data_authors)):
                authors_indices = [
                    data[data.chapter == paper].index.tolist()
                    for paper in data_authors.chapter.iloc[idx]
                ]
                authors_indices = [i[0] for i in authors_indices if i]
                edges = [i for i in combinations(authors_indices, 2)]
                for edge in edges:
                    graph[edge[0]].append(edge[1])
                pbar.update(1)
        print("Updated.")
        return graph

    def _create_features(self, data):
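        # Each feature vector is the concatenation of the title embedding and
        # the abstract embedding, so its width is twice the dimensionality
        # produced by the chosen embedding_type.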
        features = []
        with tqdm(desc="Creating features: ", total=len(data)) as pbar:
            for idx in range(len(data)):
                features.append(
                    np.concatenate((self.embeddings_parser.embed_sequence(
                        data.chapter_title.iloc[idx], self.embedding_type),
                                    self.embeddings_parser.embed_sequence(
                                        data.chapter_abstract.iloc[idx],
                                        self.embedding_type)),
                                   axis=0).tolist())
                pbar.update(1)
        return np.asarray(features)

    def _train_label_encoder(self, data):
        self.label_encoder = OneHotEncoder(handle_unknown='ignore',
                                           sparse=False,
                                           dtype=int)
        labels = data.conferenceseries.unique()
        labels = labels.reshape(-1, 1)
        self.label_encoder.fit(labels)
        with open(os.path.join(self.path_persistent, "label_encoder.pkl"),
                  "wb") as f:
            pickle.dump(self.label_encoder, f)
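        # The fitted encoder is persisted so that, at prediction time, one-hot
        # rows can be mapped back to conference series, e.g. (illustrative):
        #
        #     with open(os.path.join(self.path_persistent,
        #                            "label_encoder.pkl"), "rb") as f:
        #         encoder = pickle.load(f)
        #     series = encoder.inverse_transform(predicted_rows)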

    def main():
        parser = argparse.ArgumentParser(
            description='Arguments for data preprocessing.')
        parser.add_argument('embedding_type',
                            choices=[
                                "AVG_L", "AVG_2L", "AVG_SUM_L4", "AVG_SUM_ALL",
                                "MAX_2L", "CONC_AVG_MAX_2L",
                                "CONC_AVG_MAX_SUM_L4", "SUM_L", "SUM_2L"
                            ],
                            help="Type of embedding.")
        parser.add_argument('--gpu',
                            type=int,
                            default=0,
                            help='Which gpu to use.')
        args = parser.parse_args()
        print("Starting...")
        from preprocess_data import Processor
        processor = Processor(args.embedding_type, args.gpu)
        processor.training_data()
        print("Finished.")

    if __name__ == "__main__":
        main()
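    # Typical invocation of this preprocessing script (illustrative; the module
    # is assumed to be saved as preprocess_data.py, as the import above implies):
    #
    #     python preprocess_data.py AVG_L --gpu 0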
Beispiel #18
0
    def predict(self, test_data, model_checkpoint, gpu_mem_fraction=None):
        timer = Timer()
        timer.tic()

        G = test_data[0]
        features = test_data[1]
        id_map = test_data[2]

        if features is not None:
            # pad with dummy zero vector
            features = np.vstack([features, np.zeros((features.shape[1], ))])

        context_pairs = test_data[3] if self.random_context else None
        placeholders = self._construct_placeholders()
        minibatch = EdgeMinibatchIterator(G,
                                          id_map,
                                          placeholders,
                                          batch_size=self.batch_size,
                                          max_degree=self.max_degree,
                                          num_neg_samples=self.neg_sample_size,
                                          context_pairs=context_pairs)

        adj_info_ph = tf.compat.v1.placeholder(tf.int32,
                                               shape=minibatch.adj.shape)
        adj_info = tf.Variable(adj_info_ph, trainable=False, name="adj_info")

        model = self._create_model(placeholders, features, adj_info, minibatch)

        config = tf.compat.v1.ConfigProto(
            log_device_placement=self.log_device_placement)
        if gpu_mem_fraction is not None:
            config.gpu_options.per_process_gpu_memory_fraction = gpu_mem_fraction
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True

        # Initialize session
        sess = tf.compat.v1.Session(config=config)
        merged = tf.compat.v1.summary.merge_all()
        #        summary_writer = tf.compat.v1.summary.FileWriter(self._log_dir(),
        #                                                         sess.graph)

        # Initialize model saver
        saver = tf.compat.v1.train.Saver()

        # Init variables
        sess.run(tf.compat.v1.global_variables_initializer(),
                 feed_dict={adj_info_ph: minibatch.adj})

        val_adj_info = tf.compat.v1.assign(adj_info, minibatch.test_adj)

        # Restore model
        print("Restoring trained model.")
        checkpoint_file = os.path.join(self._log_dir(), model_checkpoint)
        if tf.compat.v1.train.checkpoint_exists(checkpoint_file):
            saver.restore(sess, checkpoint_file)
            print("Model restored.")
        else:
            print("This model checkpoint does not exist. The model might " +
                  "not be trained yet or the checkpoint is invalid.")

        # Infer embeddings
        sess.run(val_adj_info.op)
        print("Computing embeddings...")
        val_embeddings = []
        finished = False
        seen = set([])
        nodes = []
        iter_num = 0
        while not finished:
            feed_dict_val, finished, edges = minibatch.incremental_embed_feed_dict(
                self.validate_batch_size, iter_num)
            iter_num += 1
            outs_val = sess.run([model.loss, model.mrr, model.outputs1],
                                feed_dict=feed_dict_val)
            for i, edge in enumerate(edges):
                if not edge[0] in seen:
                    val_embeddings.append(outs_val[-1][i, :])
                    nodes.append(edge[0])
                    seen.add(edge[0])

        val_embeddings = np.vstack(val_embeddings)
        if self.save_embeddings:
            print("Saving embeddings...")
            if not os.path.exists(self._log_dir()):
                os.makedirs(self._log_dir())
            np.save(self._log_dir() + "inferred_embeddings.npy",
                    val_embeddings)
            with open(self._log_dir() + "inferred_embeddings_ids.txt",
                      "w") as fp:
                fp.write("\n".join(map(str, nodes)))
            print("Embeddings saved.\n")

        # Return only the embeddings of the test nodes
        test_embeddings_ids = {}
        for i, node in enumerate(nodes):
            test_embeddings_ids[node] = i
        test_nodes = [n for n in G.nodes() if G.nodes[n]['test']]
        test_embeddings = val_embeddings[[
            test_embeddings_ids[id] for id in test_nodes
        ]]
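        # test_nodes and the rows of test_embeddings are aligned by
        # construction, so a lookup table can be built directly, e.g.
        # (illustrative):
        #     node_to_embedding = dict(zip(test_nodes, test_embeddings))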

        sess.close()
        tf.compat.v1.reset_default_graph()
        timer.toc()
        return test_nodes, test_embeddings
Beispiel #19
0
    def train(self):
        print("Loading data...")
        adj_list, features_list, y_train, y_val, train_mask, val_mask = load_data(
            self.embedding_type)
        print("Loaded.")

        nb_nodes = features_list[0].shape[0]
        ft_size = features_list[0].shape[1]
        nb_classes = y_train.shape[1]

        features_list = [features[np.newaxis] for features in features_list]
        y_train = y_train[np.newaxis]
        y_val = y_val[np.newaxis]
        train_mask = train_mask[np.newaxis]
        val_mask = val_mask[np.newaxis]
        biases_list = [preprocess_adj_bias(adj) for adj in adj_list]

        print("Training model...")
        timer = Timer()
        timer.tic()

        print(
            "Parameters: batch size={}, nb_nodes={}, ft_size={}, nb_classes={}\n"
            .format(self.batch_size, nb_nodes, ft_size, nb_classes))

        model = HAN(self.model,
                    self.hid_units,
                    self.n_heads,
                    nb_classes,
                    nb_nodes,
                    l2_coef=self.weight_decay,
                    ffd_drop=self.ffd_drop,
                    attn_drop=self.attn_drop,
                    activation=self.nonlinearity,
                    residual=self.residual)

        vlss_mn = np.inf
        vacc_mx = 0.0
        curr_step = 0

        train_loss_avg = 0
        train_acc_avg = 0
        val_loss_avg = 0
        val_acc_avg = 0

        train_losses = []
        val_losses = []
        train_accuracies = []
        val_accuracies = []

        for epoch in range(self.epochs):
            print("\nEpoch {}".format(epoch))

            # Training
            tr_step = 0
            tr_size = features_list[0].shape[0]
            while tr_step * self.batch_size < tr_size:
                feats_list = [
                    features[tr_step * self.batch_size:(tr_step + 1) *
                             self.batch_size] for features in features_list
                ]

                _, train_embed, att_val, acc_tr, loss_value_tr = self._train(
                    model=model,
                    inputs_list=feats_list,
                    bias_mat_list=biases_list,
                    lbl_in=y_train[tr_step * self.batch_size:(tr_step + 1) *
                                   self.batch_size],
                    msk_in=train_mask[tr_step * self.batch_size:(tr_step + 1) *
                                      self.batch_size])

                train_loss_avg += loss_value_tr
                train_acc_avg += acc_tr
                tr_step += 1

            # Validation
            vl_step = 0
            vl_size = features_list[0].shape[0]

            while vl_step * self.batch_size < vl_size:
                feats_list = [
                    features[vl_step * self.batch_size:(vl_step + 1) *
                             self.batch_size] for features in features_list
                ]

                _, val_embed, att_val, acc_vl, loss_value_vl = self.evaluate(
                    model=model,
                    inputs_list=feats_list,
                    bias_mat_list=biases_list,
                    lbl_in=y_val[vl_step * self.batch_size:(vl_step + 1) *
                                 self.batch_size],
                    msk_in=val_mask[vl_step * self.batch_size:(vl_step + 1) *
                                    self.batch_size])

                val_loss_avg += loss_value_vl
                val_acc_avg += acc_vl
                vl_step += 1

            print(
                'Training: loss = %.5f, acc = %.5f | Val: loss = %.5f, acc = %.5f'
                % (train_loss_avg / tr_step, train_acc_avg / tr_step,
                   val_loss_avg / vl_step, val_acc_avg / vl_step))
            train_losses.append(train_loss_avg / tr_step)
            val_losses.append(val_loss_avg / vl_step)
            train_accuracies.append(train_acc_avg / tr_step)
            val_accuracies.append(val_acc_avg / vl_step)

            # Early Stopping
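            # The model is checkpointed only when validation accuracy and
            # validation loss improve simultaneously; the patience counter is
            # reset whenever either metric matches or beats its best value.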
            if val_acc_avg / vl_step >= vacc_mx or val_loss_avg / vl_step <= vlss_mn:
                if val_acc_avg / vl_step >= vacc_mx and val_loss_avg / vl_step <= vlss_mn:
                    vacc_early_model = val_acc_avg / vl_step
                    vlss_early_model = val_loss_avg / vl_step
                    working_weights = model.get_weights()
                    print(
                        "Minimum validation loss ({}), maximum accuracy ({}) so far  at epoch {}."
                        .format(val_loss_avg / vl_step, val_acc_avg / vl_step,
                                epoch))
                    self._save_model(model)
                vacc_mx = np.max((val_acc_avg / vl_step, vacc_mx))
                vlss_mn = np.min((val_loss_avg / vl_step, vlss_mn))
                curr_step = 0
            else:
                curr_step += 1
                if curr_step == self.patience:
                    print("Early stop! Min loss: {}, Max accuracy: {}".format(
                        vlss_mn, vacc_mx))
                    print("Early stop model validation loss: {}, accuracy: {}".
                          format(vlss_early_model, vacc_early_model))
                    model.set_weights(working_weights)
                    break

            train_loss_avg = 0
            train_acc_avg = 0
            val_loss_avg = 0
            val_acc_avg = 0

        print("Training finished.")

        training_time = timer.toc()
        train_losses = [x.numpy() for x in train_losses]
        val_losses = [x.numpy() for x in val_losses]
        train_accuracies = [x.numpy() for x in train_accuracies]
        val_accuracies = [x.numpy() for x in val_accuracies]
        self._plot_losses(train_losses, val_losses)
        self._plot_accuracies(train_accuracies, val_accuracies)
        self._print_stats(train_losses, val_losses, train_accuracies,
                          val_accuracies, training_time)
    def train(self, train_data, sampler_name='Uniform'):
        print("Training model...")
        timer = Timer()
        timer.tic()

        G = train_data[0]
        features = train_data[1]
        id_map = train_data[2]

        if features is not None:
            # pad with dummy zero vector
            features = np.vstack([features, np.zeros((features.shape[1], ))])

        context_pairs = train_data[3] if self.random_context else None
        placeholders = self._construct_placeholders()
        minibatch = EdgeMinibatchIterator(G,
                                          id_map,
                                          placeholders,
                                          batch_size=self.batch_size,
                                          max_degree=self.max_degree,
                                          num_neg_samples=self.neg_sample_size,
                                          context_pairs=context_pairs)

        adj_info_ph = tf.compat.v1.placeholder(tf.int32,
                                               shape=minibatch.adj.shape)
        adj_info = tf.Variable(adj_info_ph, trainable=False, name="adj_info")
        adj_shape = adj_info.get_shape().as_list()

        model = self._create_model(sampler_name, placeholders, features,
                                   adj_info, minibatch)

        config = tf.compat.v1.ConfigProto(
            log_device_placement=self.log_device_placement)
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True

        # Initialize session
        sess = tf.compat.v1.Session(config=config)
        merged = tf.compat.v1.summary.merge_all()
        #        summary_writer = tf.compat.v1.summary.FileWriter(
        #                self._log_dir(sampler_name), sess.graph)

        # Initialize model saver
        saver = tf.compat.v1.train.Saver(max_to_keep=self.epochs)

        # Init variables
        sess.run(tf.compat.v1.global_variables_initializer(),
                 feed_dict={adj_info_ph: minibatch.adj})

        # Restore params of ML sampler model
        if sampler_name == 'ML' or sampler_name == 'FastML':
            sampler_vars = tf.compat.v1.get_collection(
                tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope="MLsampler")
            saver_sampler = tf.compat.v1.train.Saver(var_list=sampler_vars)
            sampler_model_path = self._sampler_model_path()
            saver_sampler.restore(sess, sampler_model_path + 'model.ckpt')

        # Loss node path
        loss_node_path = self._loss_node_path(sampler_name)
        if not os.path.exists(loss_node_path):
            os.makedirs(loss_node_path)

        # Train model
        train_shadow_mrr = None
        shadow_mrr = None

        total_steps = 0
        avg_time = 0.0
        epoch_val_costs = []

        train_adj_info = tf.compat.v1.assign(adj_info, minibatch.adj)
        val_adj_info = tf.compat.v1.assign(adj_info, minibatch.test_adj)

        train_losses = []
        validation_losses = []

        val_cost_ = []
        val_mrr_ = []
        shadow_mrr_ = []
        duration_ = []

        ln_acc = sparse.csr_matrix((adj_shape[0], adj_shape[0]),
                                   dtype=np.float32)
        lnc_acc = sparse.csr_matrix((adj_shape[0], adj_shape[0]),
                                    dtype=np.int32)
        ln_acc = ln_acc.tolil()
        lnc_acc = lnc_acc.tolil()

        for epoch in range(self.epochs):
            minibatch.shuffle()

            iter = 0
            print('Epoch: %04d' % (epoch))
            epoch_val_costs.append(0)
            train_loss_epoch = []
            validation_loss_epoch = []

            while not minibatch.end():
                # Construct feed dictionary
                feed_dict = minibatch.next_minibatch_feed_dict()
                feed_dict.update({placeholders['dropout']: self.dropout})
                t = time.time()

                # Training step
                outs = sess.run([
                    merged, model.opt_op, model.loss, model.ranks,
                    model.aff_all, model.mrr, model.outputs1, model.loss_node,
                    model.loss_node_count
                ],
                                feed_dict=feed_dict)
                train_cost = outs[2]
                train_mrr = outs[5]
                train_loss_epoch.append(train_cost)

                if train_shadow_mrr is None:
                    train_shadow_mrr = train_mrr
                else:
                    train_shadow_mrr -= (1 - 0.99) * (train_shadow_mrr -
                                                      train_mrr)
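                    # Exponential moving average with decay 0.99, equivalent to
                    # train_shadow_mrr = 0.99 * train_shadow_mrr + 0.01 * train_mrr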

                if iter % self.validate_iter == 0:
                    # Validation
                    sess.run(val_adj_info.op)
                    val_cost, ranks, val_mrr, duration = self._evaluate(
                        sess, model, minibatch, size=self.validate_batch_size)
                    sess.run(train_adj_info.op)
                    epoch_val_costs[-1] += val_cost
                    validation_loss_epoch.append(val_cost)

                if shadow_mrr is None:
                    shadow_mrr = val_mrr
                else:
                    shadow_mrr -= (1 - 0.99) * (shadow_mrr - val_mrr)

                val_cost_.append(val_cost)
                val_mrr_.append(val_mrr)
                shadow_mrr_.append(shadow_mrr)
                duration_.append(duration)

                #                if total_steps % self.print_every == 0:
                #                    summary_writer.add_summary(outs[0], total_steps)

                # Print results
                avg_time = (avg_time * total_steps + time.time() -
                            t) / (total_steps + 1)

                if total_steps % self.print_every == 0:
                    print(
                        "Iter: %04d" % iter,
                        "train_loss={:.5f}".format(train_cost),
                        "train_mrr={:.5f}".format(train_mrr),
                        # exponential moving average
                        "train_mrr_ema={:.5f}".format(train_shadow_mrr),
                        "val_loss={:.5f}".format(val_cost),
                        "val_mrr={:.5f}".format(val_mrr),
                        # exponential moving average
                        "val_mrr_ema={:.5f}".format(shadow_mrr),
                        "time={:.5f}".format(avg_time))

                ln = outs[7].values
                ln_idx = outs[7].indices
                ln_acc[ln_idx[:, 0], ln_idx[:, 1]] += ln

                lnc = outs[8].values
                lnc_idx = outs[8].indices
                lnc_acc[lnc_idx[:, 0], lnc_idx[:, 1]] += lnc

                iter += 1
                total_steps += 1

                if total_steps > self.max_total_steps:
                    break

            # Keep track of train and validation losses per epoch
            train_losses.append(sum(train_loss_epoch) / len(train_loss_epoch))
            validation_losses.append(
                sum(validation_loss_epoch) / len(validation_loss_epoch))

            # If the epoch has the lowest validation loss so far
            if validation_losses[-1] == min(validation_losses):
                print(
                    "Minimum validation loss so far ({}) at epoch {}.".format(
                        validation_losses[-1], epoch))
                # Save loss node and count
                loss_node = sparse.save_npz(loss_node_path + 'loss_node.npz',
                                            sparse.csr_matrix(ln_acc))
                loss_node_count = sparse.save_npz(
                    loss_node_path + 'loss_node_count.npz',
                    sparse.csr_matrix(lnc_acc))
                # Save embeddings
                if self.save_embeddings and sampler_name != "Uniform":
                    sess.run(val_adj_info.op)
                    self._save_embeddings(sess, model, minibatch,
                                          self.validate_batch_size,
                                          self._log_dir(sampler_name))

            # Save model at each epoch
            print("Saving model at epoch {}.".format(epoch))
            saver.save(
                sess,
                os.path.join(self._log_dir(sampler_name),
                             "model_epoch_" + str(epoch) + ".ckpt"))

            if total_steps > self.max_total_steps:
                break

        print("Optimization Finished!")

        training_time = timer.toc()
        self._plot_losses(train_losses, validation_losses, sampler_name)
        self._print_stats(train_losses, validation_losses, training_time,
                          sampler_name)
Beispiel #21
0
class FileParser:

    path_raw = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "..", "..", "data", "raw")
    path_persistent = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                   "..", "..", "data", "interim", "parsed_data"
                                   )

    def __init__(self):
        self.timer = Timer()
        self.persistent = {}
        self.processes = {
                # Old datasets
                "old_books": {
                        "filename": os.path.join(self.path_raw,
                                                 old_books_file),
                        "process_line": "_process_line_old_books",
                        "persistent_file": os.path.join(self.path_persistent,
                                                        "old_books.pkl"),
                        "persistent_variable": [],
                        "dataset_format": "ntriples"
                        },
                "old_books_new_books": {
                        "filename": os.path.join(self.path_raw,
                                                 old_books_file),
                        "process_line": "_process_line_old_books_new_books",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "old_books_new_books.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "old_books_conferences": {
                        "filename": os.path.join(self.path_raw,
                                                 old_books_file),
                        "process_line": "_process_line_old_books_conferences",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "old_books_conferences.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferences": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences",
                        "persistent_file": os.path.join(self.path_persistent,
                                                        "conferences.pkl"),
                        "persistent_variable": [],
                        "dataset_format": "ntriples"
                        },
                "conferences_name": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences_name",
                        "persistent_file": os.path.join(
                                self.path_persistent, "conferences_name.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferences_acronym": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences_acronym",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "conferences_acronym.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferences_city": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences_city",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "conferences_city.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferences_country": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences_country",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "conferences_country.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferences_year": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences_year",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "conferences_year.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferences_datestart": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences_datestart",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "conferences_datestart.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferences_dateend": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences_dateend",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "conferences_dateend.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferences_conferenceseries": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences_conferenceseries",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "conferences_conferenceseries.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferenceseries": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferenceseries",
                        "persistent_file": os.path.join(
                                self.path_persistent, "conferenceseries.pkl"),
                        "persistent_variable": [],
                        "dataset_format": "ntriples"
                        },
                "conferenceseries_name": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferenceseries_name",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "conferenceseries_name.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },

                # New datasets
                "books": {
                        "filename": os.path.join(self.path_raw, books_file),
                        "process_line": "_process_line_books",
                        "persistent_file": os.path.join(self.path_persistent,
                                                        "books.pkl"),
                        "persistent_variable": [],
                        "dataset_format": "json"
                        },
                "isbn_books": {
                        "filename": os.path.join(self.path_raw, books_file),
                        "process_line": "_process_line_isbn_books",
                        "persistent_file": os.path.join(self.path_persistent,
                                                        "isbn_books.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "authors_name": {
                        "filename": os.path.join(self.path_raw, authors_file),
                        "process_line": "_process_line_authors_name",
                        "persistent_file": os.path.join(self.path_persistent,
                                                        "authors_name.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters",
                        "persistent_file": os.path.join(self.path_persistent,
                                                        "chapters.pkl"),
                        "persistent_variable": [],
                        "dataset_format": "json"
                        },
                "chapters_title": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_title",
                        "persistent_file": os.path.join(self.path_persistent,
                                                        "chapters_title.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters_year": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_year",
                        "persistent_file": os.path.join(self.path_persistent,
                                                        "chapters_year.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters_language": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_language",
                        "persistent_file": os.path.join(
                                self.path_persistent, "chapters_language.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters_abstract": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_abstract",
                        "persistent_file": os.path.join(
                                self.path_persistent, "chapters_abstract.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters_authors": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_authors",
                        "persistent_file": os.path.join(
                                self.path_persistent, "chapters_authors.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters_authors_name": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_authors_name",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "chapters_authors_name.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters_all_citations": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_all_citations",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "chapters_all_citations.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters_keywords": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_keywords",
                        "persistent_file": os.path.join(
                                self.path_persistent, "chapters_keywords.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters_books_isbns": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_books_isbns",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "chapters_books_isbns.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                }
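        # Each entry above maps a logical dataset name to its raw source file,
        # the line handler that parses it, the pickle it is cached in, and the
        # empty container ([] or {}) its results are accumulated into.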

    def get_data(self, process):
        # Check if the data is already present
        if (process in self.persistent):
            return self.persistent[process]

        print("Process '{}' not in memory yet.".format(process))

        # Load from persistent file if data already processed
        if os.path.isfile(self.processes[process]["persistent_file"]):
            with open(self.processes[process]["persistent_file"],
                      "rb") as f:
                self.persistent[process] = pickle.load(f)
                return self.persistent[process]

        print("Process '{}' not persistent yet. Processing.".format(
                process))

        # Process the raw data
        self.persistent[process] = self.processes[process][
                "persistent_variable"]
        self._parse_file(
                self.processes[process]["filename"],
                self.processes[process]["process_line"],
                self.persistent[process],
                self.processes[process]["dataset_format"]
                )
        with open(self.processes[process]["persistent_file"], "wb") as f:
            pickle.dump(self.persistent[process], f)

        return self.persistent[process]
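    # Illustrative usage (assuming the raw dumps are present under data/raw):
    #
    #     parser = FileParser()
    #     chapter_titles = parser.get_data("chapters_title")  # {chapter id: title}
    #     conference_ids = parser.get_data("conferences")     # list of conference URIs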

    def _parse_file(self, filename, process_line, results, dataset_format):
        if dataset_format == "json":
            self._process_json_file(filename, process_line, results)
        else:
            self._process_ntriples_file(filename, process_line, results)

    def _process_json_file(self, filename, process_line, results):
        print("Computing number of json files.")
        with tarfile.open(filename, "r:gz", encoding="utf-8") as tar:
            count_files = len(tar.getnames())
        print("Finished computing number of files: {}.\n".format(
                count_files))

        print("Start processing file.\n")
        self.timer.tic()
        process_line_function = self.__getattribute__(process_line)
        with tqdm(desc="Processing files: ", total=count_files,
                  unit="file") as pbar:
            with tarfile.open(filename, "r:gz", encoding="utf-8") as tar:
                for member in tar.getmembers():
                    if "jsonl" in member.name:
                        file = tar.extractfile(member)
                        content = [json.loads(line) for line in
                                   file.readlines()]
                        for line in content:
                            process_line_function(line, results)
                    pbar.update(1)
        self.timer.toc()
        print("Finished processing file.\n\n")

    def _process_ntriples_file(self, filename, process_line, results):
        print("Computing file size.")
        with gzip.open(filename, mode="rt", encoding="utf-8") as f:
            file_size = f.seek(0, io.SEEK_END)
        print("Finished computing file size: {} bytes.\n".format(
                file_size))

        print("Start processing file.\n")
        self.timer.tic()
        process_line_function = self.__getattribute__(process_line)
        with tqdm(desc="Processing file: ", total=file_size,
                  unit="bytes") as pbar:
            with gzip.open(filename, mode="rt", encoding="utf-8") as f:
                for line in f:
                    process_line_function(line, results)
                    pbar.update(len(line))
        self.timer.toc()
        print("Finished processing file.\n\n")

    # Processes implementations
    def _process_line_old_books(self, line, results):
        line = line.rstrip(" .\n").split(maxsplit=2)
        if line[1] == nt_has_conference:
            if line[0].startswith(nt_book):
                if line[0] not in results:
                    results.append(line[0])
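        # The handlers in this class all split an n-triples line into a
        # (subject, predicate, object) triple first, e.g. (illustrative):
        #
        #     '<s> <p> <o> .\n'.rstrip(" .\n").split(maxsplit=2)
        #     # -> ['<s>', '<p>', '<o>']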

    def _process_line_old_books_new_books(self, line, results):
        line = line.rstrip(" .\n").split(maxsplit=2)
        if line[1] == nt_webpage:
            if line[0].startswith(nt_book):
                if line[0] in self.get_data("old_books"):
                    new_book_id = "sg:pub." + line[2].split(
                            ".com/")[-1].rsplit(">")[0]
                    results[line[0]] = new_book_id

    def _process_line_old_books_conferences(self, line, results):
        line = line.rstrip(" .\n").split(maxsplit=2)
        if line[1] == nt_has_conference:
            if line[0].startswith(nt_book):
                results[line[0]] = line[2]

    def _process_line_conferences(self, line, results):
        line = line.rstrip(" .\n").split(maxsplit=2)
        if line[0].startswith(nt_conferences):
            if line[0] not in results:
                results.append(line[0])

    def _process_line_conferences_name(self, line, results):
        line = line.rstrip(" .\n").split(maxsplit=2)
        if line[1] == nt_name:
            if line[0].startswith(nt_conferences):
                results[line[0]] = line[2]

    def _process_line_conferences_acronym(self, line, results):
        line = line.rstrip(" .\n").split(maxsplit=2)
        if line[1] == nt_acronym:
            if line[0].startswith(nt_conferences):
                results[line[0]] = line[2]

    def _process_line_conferences_city(self, line, results):
        line = line.rstrip(" .\n").split(maxsplit=2)
        if line[1] == nt_city:
            if line[0].startswith(nt_conferences):
                results[line[0]] = line[2]

    def _process_line_conferences_country(self, line, results):
        line = line.rstrip(" .\n").split(maxsplit=2)
        if line[1] == nt_country:
            if line[0].startswith(nt_conferences):
                results[line[0]] = line[2]

    def _process_line_conferences_year(self, line, results):
        line = line.rstrip(" .\n").split(maxsplit=2)
        if line[1] == nt_year:
            if line[0].startswith(nt_conferences):
                results[line[0]] = line[2]

    def _process_line_conferences_datestart(self, line, results):
        line = line.rstrip(" .\n").split(maxsplit=2)
        if line[1] == nt_datestart:
            if line[0].startswith(nt_conferences):
                results[line[0]] = line[2]

    def _process_line_conferences_dateend(self, line, results):
        line = line.rstrip(" .\n").split(maxsplit=2)
        if line[1] == nt_dateend:
            if line[0].startswith(nt_conferences):
                results[line[0]] = line[2]

    def _process_line_conferences_conferenceseries(self, line, results):
        line = line.rstrip(" .\n").split(maxsplit=2)
        if line[1] == nt_has_conference_series:
            results[line[0]] = line[2]

    def _process_line_conferenceseries(self, line, results):
        line = line.rstrip(" .\n").split(maxsplit=2)
        if line[0].startswith(nt_conference_series):
            if line[0] not in results:
                results.append(line[0])

    def _process_line_conferenceseries_name(self, line, results):
        line = line.rstrip(" .\n").split(maxsplit=2)
        if line[1] == nt_name:
            if line[0].startswith(nt_conference_series):
                results[line[0]] = line[2]

    def _process_line_books(self, line, results):
        new_books = list(self.get_data("old_books_new_books").values())
        if line["id"] not in results:
            if line["id"] in new_books:
                results.append(line["id"])

    def _process_line_isbn_books(self, line, results):
        if "isbn" in line.keys():
            if line["id"] in self.get_data("books"):
                isbn_list = line["isbn"]
                for isbn in isbn_list:
                    results[isbn] = line["id"]

    def _process_line_authors_name(self, line, results):
        family_name = line["familyName"] if "familyName" in line.keys() else ""
        given_name = line["givenName"] if "givenName" in line.keys() else ""
        if not family_name == "Not available":
            author_name = family_name + " " + given_name
        else:
            author_name = ""
        results[line["id"]] = author_name

    def _process_line_chapters(self, line, results):
        if "isPartOf" in line.keys():
            if line["id"] not in results:
                book = line["isPartOf"]
                if "isbn" in book.keys():
                    isbn_list = book["isbn"]
                    for isbn in isbn_list:
                        if isbn in self.get_data("isbn_books"):
                            results.append(line["id"])

    def _process_line_chapters_title(self, line, results):
        if "name" in line.keys():
            if line["id"] in self.get_data("chapters"):
                results[line["id"]] = line["name"]

    def _process_line_chapters_year(self, line, results):
        if "datePublished" in line.keys():
            if line["id"] in self.get_data("chapters"):
                year = line["datePublished"].split("-")[0]
                results[line["id"]] = year

    def _process_line_chapters_language(self, line, results):
        if "inLanguage" in line.keys():
            if line["id"] in self.get_data("chapters"):
                results[line["id"]] = line["inLanguage"][0]

    def _process_line_chapters_abstract(self, line, results):
        if "description" in line.keys():
            if line["id"] in self.get_data("chapters"):
                results[line["id"]] = line["description"]

    def _process_line_chapters_authors(self, line, results):
        if "author" in line.keys():
            if line["id"] in self.get_data("chapters"):
                authors = line["author"]
                authors_id = [authors[i]["id"] for i in
                              range(len(authors)) if "id" in
                              authors[i].keys()]
                results[line["id"]] = authors_id

    def _process_line_chapters_authors_name(self, line, results):
        if "author" in line.keys():
            if line["id"] in self.get_data("chapters"):
                authors = line["author"]
                author_names = list()
                for i in range(len(authors)):
                    family_name = authors[i]["familyName"] if \
                        "familyName" in authors[i].keys() else ""
                    given_name = authors[i]["givenName"] if "givenName" \
                        in authors[i].keys() else ""
                    author_names.append(family_name + " " + given_name)
                results[line["id"]] = author_names

    def _process_line_chapters_all_citations(self, line, results):
        if "citation" in line.keys():
            if line["id"] in self.get_data("chapters"):
                citations = line["citation"]
                citations_id = [citations[i]["id"] for i in range(
                        len(citations))]
                results[line["id"]] = citations_id

    def _process_line_chapters_keywords(self, line, results):
        if "keywords" in line.keys():
            if line["id"] in self.get_data("chapters"):
                results[line["id"]] = line["keywords"]

    def _process_line_chapters_books_isbns(self, line, results):
        if "isPartOf" in line.keys():
            if line["id"] in self.get_data("chapters"):
                book = line["isPartOf"]
                if "isbn" in book.keys():
                    isbn_list = book["isbn"]
                    results[line["id"]] = isbn_list