Example 1
import argparse


def main():
    parser = argparse.ArgumentParser(
        description='Arguments for data preprocessing.')
    parser.add_argument('embedding_type',
                        choices=[
                            "AVG_L", "AVG_2L", "AVG_SUM_L4", "AVG_SUM_ALL",
                            "MAX_2L", "CONC_AVG_MAX_2L",
                            "CONC_AVG_MAX_SUM_L4", "SUM_L", "SUM_2L"
                        ],
                        help="Type of embedding.")
    parser.add_argument('dataset',
                        help='Name of the object file that stores the ' +
                        'training data.')
    parser.add_argument('--threshold',
                        type=int,
                        default=2,
                        help='Threshold for edge weights in the ' +
                        'heterogeneous graph.')
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='Which GPU to use.')
    args = parser.parse_args()
    print("Starting...")
    from preprocess_data import Processor
    processor = Processor(args.embedding_type, args.dataset,
                          args.threshold, args.gpu)
    processor.training_data()
    print("Finished.")


if __name__ == "__main__":
    main()
Example 2
    def __init__(self, embedding_type, dataset, model_name, max_degree=696,
                 learning_rate=0.001, weight_decay=5e-4, dropout=0.0,
                 epochs=300, early_stopping=30, hidden1=16, rank=128, skip=0,
                 var=0.5, sampler_device="cpu", gpu=None, recs=10):

        self.embedding_type = embedding_type
        self.dataset = dataset
        self.model_name = model_name
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.dropout = dropout
        self.epochs = epochs
        self.early_stopping = early_stopping
        self.hidden1 = hidden1
        self.rank = rank
        self.skip = skip
        self.var = var
        self.sampler_device = sampler_device
        self.recs = recs

        self.preprocessor = Processor(self.embedding_type, self.dataset, gpu)
        self.training_data = self._load_training_data()

        if not self._load_label_encoder():
            print("The label encoder does not exist.")
Example 3
    def infer_embeddings(self, query, query_authors, graph_type, model,
                         model_checkpoint, queue):
        df_test = pd.DataFrame(query,
                               columns=[
                                   "chapter", "chapter_title",
                                   "chapter_abstract", "chapter_citations"
                               ])

        # Load the training graph
        print("Loading {} training graph...".format(graph_type))
        G_train = self._load_training_graph(graph_type)
        print("Loaded.")

        # Load the training walks
        print("Loading {} training walks...".format(graph_type))
        walks = self._load_training_walks(graph_type, G_train)
        print("Loaded.")

        print("Preprocessing {} test data...".format(graph_type))
        from preprocess_data import Processor
        preprocessor = Processor(self.embedding_type, graph_type, self.gpu)
        graph, features, id_map = preprocessor.test_data(
            df_test, G_train, authors_df=query_authors)
        print("Preprocessed.")

        print("Inferring embeddings...")
        embeddings = model.predict([graph, features, id_map, walks],
                                   model_checkpoint)[1]
        print("Inferred.")

        queue.put(embeddings)
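The queue parameter suggests that infer_embeddings is meant to run in a worker process. Below is a minimal driver sketch under that assumption; model (an instance of the enclosing class), query, query_authors, unsup_model, and checkpoint are placeholders for objects constructed elsewhere.

from multiprocessing import Process, Queue

# Run inference in a separate process and collect the embeddings through
# the queue; every name passed in here is a placeholder.
queue = Queue()
worker = Process(target=model.infer_embeddings,
                 args=(query, query_authors, "citations", unsup_model,
                       checkpoint, queue))
worker.start()
embeddings = queue.get()  # blocks until the worker calls queue.put()
worker.join()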
Example 4
    def __init__(self,
                 classifier,
                 embedding_type,
                 graph_type,
                 model_checkpoint,
                 train_prefix,
                 model_name,
                 model_size="small",
                 learning_rate=0.00001,
                 epochs=10,
                 dropout=0.0,
                 weight_decay=0.0,
                 max_degree=100,
                 samples_1=25,
                 samples_2=10,
                 dim_1=128,
                 dim_2=128,
                 random_context=True,
                 neg_sample_size=20,
                 batch_size=512,
                 identity_dim=0,
                 save_embeddings=False,
                 base_log_dir='../../../data/processed/graphsage/',
                 validate_iter=5000,
                 validate_batch_size=256,
                 gpu=0,
                 print_every=50,
                 max_total_steps=10**10,
                 log_device_placement=False,
                 recs=10):

        self.classifier = classifier
        self.embedding_type = embedding_type
        self.graph_type = graph_type
        self.model_checkpoint = model_checkpoint
        self.recs = recs

        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

        self.graphsage_model = UnsupervisedModel(
            train_prefix, model_name, model_size, learning_rate, epochs,
            dropout, weight_decay, max_degree, samples_1, samples_2, dim_1,
            dim_2, random_context, neg_sample_size, batch_size, identity_dim,
            save_embeddings, base_log_dir, validate_iter, validate_batch_size,
            gpu, print_every, max_total_steps, log_device_placement)
        self.preprocessor = Processor(self.embedding_type, self.graph_type,
                                      gpu)

        self.classifier_file = os.path.join(
            self.graphsage_model._log_dir(),
            self.classifier.__class__.__name__ + ".pkl")

        if not self._load_training_graph():
            print("The training graph does not exist.")

        if not self._load_training_walks():
            print("The walks do not exist.")
Example 5
    def __init__(self,
                 embedding_type,
                 graph_type,
                 train_prefix,
                 model_name,
                 model_size="small",
                 learning_rate=0.001,
                 epochs=10,
                 dropout=0.0,
                 weight_decay=0.0,
                 max_degree=100,
                 samples_1=25,
                 samples_2=10,
                 samples_3=0,
                 dim_1=128,
                 dim_2=128,
                 batch_size=512,
                 sigmoid=False,
                 identity_dim=0,
                 base_log_dir='../../../data/processed/graphsage/',
                 validate_iter=5000,
                 validate_batch_size=256,
                 gpu=0,
                 print_every=5,
                 max_total_steps=10**10,
                 log_device_placement=False,
                 recs=10,
                 threshold=2):

        self.embedding_type = embedding_type
        self.graph_type = graph_type
        self.recs = recs

        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

        self.graphsage_model = SupervisedModel(
            train_prefix, model_name, model_size, learning_rate, epochs,
            dropout, weight_decay, max_degree, samples_1, samples_2, samples_3,
            dim_1, dim_2, batch_size, sigmoid, identity_dim, base_log_dir,
            validate_iter, validate_batch_size, gpu, print_every,
            max_total_steps, log_device_placement)
        self.preprocessor = Processor(self.embedding_type, self.graph_type,
                                      threshold, gpu)

        if not self._load_training_graph():
            print("The training graph does not exist.")

        if not self._load_training_class_map():
            print("The training class map dows not exist.")

        if not self._load_label_encoder():
            print("The label encoder does not exist.")
Example 6
import argparse


def main():
    parser = argparse.ArgumentParser(
        description='Arguments for data preprocessing.')
    parser.add_argument('embedding_type',
                        choices=[
                            "AVG_L", "AVG_2L", "AVG_SUM_L4", "AVG_SUM_ALL",
                            "MAX_2L", "CONC_AVG_MAX_2L",
                            "CONC_AVG_MAX_SUM_L4", "SUM_L", "SUM_2L"
                        ],
                        help="Type of embedding.")
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='Which GPU to use.')
    args = parser.parse_args()
    print("Starting...")
    from preprocess_data import Processor
    processor = Processor(args.embedding_type, args.gpu)
    processor.training_data()
    print("Finished.")


if __name__ == "__main__":
    main()
Example 7
    def __init__(self,
                 embedding_type,
                 graph_type,
                 train_prefix,
                 model_name,
                 nonlinear_sampler=True,
                 fast_ver=False,
                 allhop_rewards=False,
                 model_size="small",
                 learning_rate=0.001,
                 epochs=10,
                 dropout=0.0,
                 weight_decay=0.0,
                 max_degree=100,
                 samples_1=25,
                 samples_2=10,
                 samples_3=0,
                 dim_1=512,
                 dim_2=512,
                 dim_3=0,
                 batch_size=128,
                 sigmoid=False,
                 identity_dim=0,
                 base_log_dir='../../../data/processed/graphsage_rl/',
                 validate_iter=5000,
                 validate_batch_size=128,
                 gpu=0,
                 print_every=5,
                 max_total_steps=10**10,
                 log_device_placement=False,
                 recs=10,
                 threshold=2):

        self.embedding_type = embedding_type
        self.graph_type = graph_type
        self.fast_ver = fast_ver
        self.recs = recs

        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        if gpu is not None:
            os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)
        else:
            os.environ["CUDA_VISIBLE_DEVICES"] = ""

        self.graphsage_model = SupervisedModelRL(
            train_prefix=train_prefix,
            model_name=model_name,
            nonlinear_sampler=nonlinear_sampler,
            fast_ver=self.fast_ver,
            allhop_rewards=allhop_rewards,
            model_size=model_size,
            learning_rate=learning_rate,
            epochs=epochs,
            dropout=dropout,
            weight_decay=weight_decay,
            max_degree=max_degree,
            samples_1=samples_1,
            samples_2=samples_2,
            samples_3=samples_3,
            dim_1=dim_1,
            dim_2=dim_2,
            dim_3=dim_3,
            batch_size=batch_size,
            sigmoid=sigmoid,
            identity_dim=identity_dim,
            base_log_dir=base_log_dir,
            validate_iter=validate_iter,
            validate_batch_size=validate_batch_size,
            gpu=None,
            print_every=print_every,
            max_total_steps=max_total_steps,
            log_device_placement=log_device_placement)
        self.preprocessor = Processor(self.embedding_type, self.graph_type,
                                      threshold, gpu)

        if not self._load_training_graph():
            print("The training graph does not exist.")

        if not self._load_training_class_map():
            print("The training class map dows not exist.")

        if not self._load_label_encoder():
            print("The label encoder does not exist.")
Example 8
class GraphSAGERLModel(AbstractModel):
    def __init__(self,
                 embedding_type,
                 graph_type,
                 train_prefix,
                 model_name,
                 nonlinear_sampler=True,
                 fast_ver=False,
                 allhop_rewards=False,
                 model_size="small",
                 learning_rate=0.001,
                 epochs=10,
                 dropout=0.0,
                 weight_decay=0.0,
                 max_degree=100,
                 samples_1=25,
                 samples_2=10,
                 samples_3=0,
                 dim_1=512,
                 dim_2=512,
                 dim_3=0,
                 batch_size=128,
                 sigmoid=False,
                 identity_dim=0,
                 base_log_dir='../../../data/processed/graphsage_rl/',
                 validate_iter=5000,
                 validate_batch_size=128,
                 gpu=0,
                 print_every=5,
                 max_total_steps=10**10,
                 log_device_placement=False,
                 recs=10,
                 threshold=2):

        self.embedding_type = embedding_type
        self.graph_type = graph_type
        self.fast_ver = fast_ver
        self.recs = recs

        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        if gpu is not None:
            os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)
        else:
            os.environ["CUDA_VISIBLE_DEVICES"] = ""

        self.graphsage_model = SupervisedModelRL(
            train_prefix=train_prefix,
            model_name=model_name,
            nonlinear_sampler=nonlinear_sampler,
            fast_ver=self.fast_ver,
            allhop_rewards=allhop_rewards,
            model_size=model_size,
            learning_rate=learning_rate,
            epochs=epochs,
            dropout=dropout,
            weight_decay=weight_decay,
            max_degree=max_degree,
            samples_1=samples_1,
            samples_2=samples_2,
            samples_3=samples_3,
            dim_1=dim_1,
            dim_2=dim_2,
            dim_3=dim_3,
            batch_size=batch_size,
            sigmoid=sigmoid,
            identity_dim=identity_dim,
            base_log_dir=base_log_dir,
            validate_iter=validate_iter,
            validate_batch_size=validate_batch_size,
            gpu=None,
            print_every=print_every,
            max_total_steps=max_total_steps,
            log_device_placement=log_device_placement)
        self.preprocessor = Processor(self.embedding_type, self.graph_type,
                                      threshold, gpu)

        if not self._load_training_graph():
            print("The training graph does not exist.")

        if not self._load_training_class_map():
            print("The training class map dows not exist.")

        if not self._load_label_encoder():
            print("The label encoder does not exist.")

    def query_single(self, query):
        """Queries the model and returns a list of recommendations.

        Args:
            query (list): The query as needed by the model, in the form
            [chapter_title, chapter_abstract, list(chapter_citations)].

        Returns:
            list: ids of the conferences
            double: confidence scores
        """
        # Generate an ID for the query
        query_id = "new_node_id:" + "-".join(
            [str(i) for i in random.sample(range(0, 10000), 5)])

        if self.graph_type == "citations":
            if len(query) < 3:
                raise ValueError("The input does not contain enough data; " +
                                 "chapter  title chapter abstract, and " +
                                 "chapter citations are required.")
            return self.query_batch([(query_id, query[0], query[1], query[2])])
        elif self.graph_type == "citations_authors_het_edges":
            if len(query) < 4:
                raise ValueError(
                    "The input does not contain enough data; " +
                    "chapter title, chapter abstract, chapter " +
                    "citations, and chapter authors are required.")
            authors_df = pd.DataFrame({
                "author_name": query[3],
                "chapter": [query_id] * len(query[3])
            })
            return self.query_batch(
                ([(query_id, query[0], query[1], query[2])], authors_df))
        else:
            raise ValueError("Graph type not recognised.")

    def query_batch(self, batch):
        """Queries the model and returns a lis of recommendations.

        Args:
            batch (list of n-tuples): The list of queries as needed
            by the model. The n-tuples are in the form (chapter_id,
            chapter_title, chapter_abstract, list(chapter_citations)).

        Returns:
            list: ids of the conferences
            double: confidence scores
        """
        if self.graph_type == "citations":
            df_test = pd.DataFrame(batch,
                                   columns=[
                                       "chapter", "chapter_title",
                                       "chapter_abstract", "chapter_citations"
                                   ])

            # Preprocess the data
            graph, features, id_map, class_map = self.preprocessor.test_data(
                df_test, self.G_train, class_map=self.class_map_train)

        elif self.graph_type == "citations_authors_het_edges":
            df_test = pd.DataFrame(batch[0],
                                   columns=[
                                       "chapter", "chapter_title",
                                       "chapter_abstract", "chapter_citations"
                                   ])
            authors_df = batch[1]
            # Preprocess the data
            graph, features, id_map, class_map = self.preprocessor.test_data(
                df_test,
                self.G_train,
                authors_df=authors_df,
                class_map=self.class_map_train)
        else:
            raise ValueError("Graph type not recognised.")

        # Inference on test data
        if self.fast_ver:
            sampler_name = "FastML"
        else:
            sampler_name = "ML"
        predictions = self.graphsage_model.inference(
            [graph, features, id_map, None, class_map], sampler_name)[1]

        # Compute predictions
        sorted_predictions = (-predictions).argsort(axis=1)
        conferences = list()
        confidences = list()

        for i in range(len(predictions)):
            one_hot_preds = np.zeros((self.recs, len(predictions[0])),
                                     dtype=int)
            for j in range(self.recs):
                one_hot_preds[j][sorted_predictions[i, j]] = 1
            conferences.append(
                list(
                    self.label_encoder.inverse_transform(
                        one_hot_preds).flatten()))
            confidences.append(
                list(predictions[i, sorted_predictions[:, :self.recs][i]]))

        results = [conferences, confidences]
        return results

    def train(self):
        pass

    def _load_training_graph(self):
        graph_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                  "..", "..", "..", "data", "interim",
                                  "graphsage", self.embedding_type,
                                  self.graph_type, "train_val-G.json")
        if os.path.isfile(graph_file):
            print("Loading training graph...")
            with open(graph_file) as f:
                self.G_train = json_graph.node_link_graph(json.load(f))
            print("Loaded.")
            return True
        return False

    def _load_training_class_map(self):
        class_map_file = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "..", "..", "..",
            "data", "interim", "graphsage", self.embedding_type,
            self.graph_type, "train_val-class_map.json")
        self.class_map_train = {}
        if isinstance(list(self.G_train.nodes)[0], int):
            conversion = lambda n: int(n)
        else:
            conversion = lambda n: n
        if os.path.isfile(class_map_file):
            print("Loading training class map...")
            with open(class_map_file) as f:
                self.class_map_train = json.load(f)
            if isinstance(list(self.class_map_train.values())[0], list):
                lab_conversion = lambda n: n
            else:
                lab_conversion = lambda n: int(n)
            self.class_map_train = {
                conversion(k): lab_conversion(v)
                for k, v in self.class_map_train.items()
            }
            print("Loaded.")
            return True
        return False

    def _load_label_encoder(self):
        label_encoder_file = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "..", "..", "..",
            "data", "interim", "graphsage", self.embedding_type,
            self.graph_type, "label_encoder.pkl")
        if os.path.isfile(label_encoder_file):
            with open(label_encoder_file, "rb") as f:
                print("Loading label encoder.")
                self.label_encoder = pickle.load(f)
            print("Loaded.")
            return True
        return False
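For reference, a hypothetical usage sketch of GraphSAGERLModel with the "citations" graph type; train_prefix, model_name, and the query contents below are placeholders that depend on the local data layout.

# Hypothetical usage; all paths and names below are placeholders.
model = GraphSAGERLModel("AVG_2L", "citations",
                         train_prefix="../../data/train_val",
                         model_name="graphsage_mean")
conferences, confidences = model.query_single(
    ["Some chapter title", "Some chapter abstract",
     ["citation_id_1", "citation_id_2"]])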
Example 9
    def __init__(self,
                 embedding_type,
                 graph_type,
                 model_checkpoint,
                 train_prefix,
                 model_name,
                 model_size="small",
                 learning_rate=0.00001,
                 epochs=10,
                 dropout=0.0,
                 weight_decay=0.0,
                 max_degree=100,
                 samples_1=25,
                 samples_2=10,
                 dim_1=128,
                 dim_2=128,
                 random_context=True,
                 neg_sample_size=20,
                 batch_size=512,
                 identity_dim=0,
                 save_embeddings=False,
                 base_log_dir='../../../data/processed/graphsage/',
                 validate_iter=5000,
                 validate_batch_size=256,
                 gpu=0,
                 print_every=50,
                 max_total_steps=10**10,
                 log_device_placement=False,
                 recs=10):

        self.embedding_type = embedding_type
        self.graph_type = graph_type
        self.model_checkpoint = model_checkpoint
        self.recs = recs
        self.gpu = gpu

        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(self.gpu)

        self.graphsage_model = UnsupervisedModel(
            train_prefix, model_name, model_size, learning_rate, epochs,
            dropout, weight_decay, max_degree, samples_1, samples_2, dim_1,
            dim_2, random_context, neg_sample_size, batch_size, identity_dim,
            save_embeddings, base_log_dir, validate_iter, validate_batch_size,
            gpu, print_every, max_total_steps, log_device_placement)
        self.preprocessor = Processor(self.embedding_type, "citations",
                                      self.gpu)

        # Prepare the training data
        d_train = DataLoader()
        self.df_train = d_train.training_data_with_abstracts_citations().data

        print("Loading the training embeddings...")
        if not self._load_train_embeddings():
            print("The pretrained embeddings are missing.")
        else:
            print("Loaded.")

        training_ids = list(self.df_train.chapter)
        self.training_embeddings = self.pretrained_embeddings[[
            self.pretrained_embeddings_id_map[id] for id in training_ids
        ]]
        self.sim = Similarities(self.training_embeddings, training_ids)

        print("Loading training graph...")
        if not self._load_training_graph():
            print("The training graph does not exist.")
        else:
            print("Loaded.")

        print("Loading training walks...")
        if not self._load_training_walks():
            print("The walks do not exist.")
        else:
            print("Loaded.")
Example 10
class GraphSAGENeighbourModel(AbstractModel):
    def __init__(self,
                 embedding_type,
                 graph_type,
                 model_checkpoint,
                 train_prefix,
                 model_name,
                 model_size="small",
                 learning_rate=0.00001,
                 epochs=10,
                 dropout=0.0,
                 weight_decay=0.0,
                 max_degree=100,
                 samples_1=25,
                 samples_2=10,
                 dim_1=128,
                 dim_2=128,
                 random_context=True,
                 neg_sample_size=20,
                 batch_size=512,
                 identity_dim=0,
                 save_embeddings=False,
                 base_log_dir='../../../data/processed/graphsage/',
                 validate_iter=5000,
                 validate_batch_size=256,
                 gpu=0,
                 print_every=50,
                 max_total_steps=10**10,
                 log_device_placement=False,
                 recs=10):

        self.embedding_type = embedding_type
        self.graph_type = graph_type
        self.model_checkpoint = model_checkpoint
        self.recs = recs
        self.gpu = gpu

        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(self.gpu)

        self.graphsage_model = UnsupervisedModel(
            train_prefix, model_name, model_size, learning_rate, epochs,
            dropout, weight_decay, max_degree, samples_1, samples_2, dim_1,
            dim_2, random_context, neg_sample_size, batch_size, identity_dim,
            save_embeddings, base_log_dir, validate_iter, validate_batch_size,
            gpu, print_every, max_total_steps, log_device_placement)
        self.preprocessor = Processor(self.embedding_type, "citations",
                                      self.gpu)

        # Prepare the training data
        d_train = DataLoader()
        self.df_train = d_train.training_data_with_abstracts_citations().data

        print("Loading the training embeddings...")
        if not self._load_train_embeddings():
            print("The pretrained embeddings are missing.")
        else:
            print("Loaded.")

        training_ids = list(self.df_train.chapter)
        self.training_embeddings = self.pretrained_embeddings[[
            self.pretrained_embeddings_id_map[id] for id in training_ids
        ]]
        self.sim = Similarities(self.training_embeddings, training_ids)

        print("Loading training graph...")
        if not self._load_training_graph():
            print("The training graph does not exist.")
        else:
            print("Loaded.")

        print("Loading training walks...")
        if not self._load_training_walks():
            print("The walks do not exist.")
        else:
            print("Loaded.")

    def query_single(self, query):
        """Queries the model and returns a list of recommendations.

        Args:
            query (list): The query as needed by the model, in the form
            [chapter_title, chapter_abstract, list(chapter_citations)].

        Returns:
            list: ids of the conferences
            double: confidence scores
        """
        if len(query) < 3:
            raise ValueError("The input does not contain enough data; " +
                             "chapter title chapter abstract, and chapter " +
                             "citations are required.")
        # Generate an ID for the query
        query_id = "new_node_id:" + "-".join(
            [str(i) for i in random.sample(range(0, 10000), 5)])
        return self.query_batch([(query_id, query[0], query[1], query[2])])

    def query_batch(self, batch):
        """Queries the model and returns a lis of recommendations.

        Args:
            batch (list of n-tuples): The list of queries as needed
            by the model. The n-tuples are in the form (chapter_id,
            chapter_title, chapter_abstract, list(chapter_citations)).

        Returns:
            list: ids of the conferences
            double: confidence scores
        """

        df_test = pd.DataFrame(batch,
                               columns=[
                                   "chapter", "chapter_title",
                                   "chapter_abstract", "chapter_citations"
                               ])

        # Preprocess the data
        graph, features, id_map = self.preprocessor.test_data(
            df_test, self.G_train)

        # Infer embeddings
        test_nodes, test_embeddings = self.graphsage_model.predict(
            [graph, features, id_map, self.walks], self.model_checkpoint)

        # Obtain the most similar neighbours
        similarities = []
        with tqdm(desc="Computing similarities",
                  total=len(test_embeddings)) as pbar:
            for vector in test_embeddings:
                similarities.append(
                    self.sim.similar_by_vector(vector, topn=self.recs * 10))
                pbar.update(1)

        # Map similar papers to conferences
        conferenceseries = []
        confidences = []
        with tqdm(desc="Computing conference predicitons.",
                  total=len(similarities)) as pbar:
            for similarity in similarities:
                conferences = set()
                scores = []
                for idx in range(len(similarity)):
                    conferences_length = len(conferences)
                    if conferences_length < self.recs:
                        conferences.add(
                            list(self.df_train[self.df_train.chapter ==
                                               similarity[idx]
                                               [0]].conferenceseries)[0])
                        if len(conferences) != conferences_length:
                            scores.append(similarity[idx][1])
                conferenceseries.append(list(conferences))
                confidences.append(scores)
                pbar.update(1)

        results = [conferenceseries, confidences]
        return results

    def train(self):
        pass

    def _load_train_embeddings(self):
        embeddings_file = os.path.join(self.graphsage_model._log_dir(),
                                       "embeddings.npy")
        embeddings_ids_file = os.path.join(self.graphsage_model._log_dir(),
                                           "embeddings_ids.txt")
        if os.path.isfile(embeddings_file) and os.path.isfile(
                embeddings_ids_file):
            self.pretrained_embeddings = np.load(embeddings_file)
            self.pretrained_embeddings_id_map = {}
            with open(embeddings_ids_file) as f:
                for i, line in enumerate(f):
                    self.pretrained_embeddings_id_map[line.strip()] = i
            return True
        return False

    def _load_training_graph(self):
        graph_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                  "..", "..", "..", "data", "interim",
                                  "graphsage", self.embedding_type,
                                  self.graph_type, "train_val-G.json")
        if os.path.isfile(graph_file):
            with open(graph_file) as f:
                self.G_train = json_graph.node_link_graph(json.load(f))
            return True
        return False

    def _load_training_walks(self):
        walks_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                  "..", "..", "..", "data", "interim",
                                  "graphsage", self.embedding_type,
                                  self.graph_type, "train_val-walks.txt")
        self.walks = []
        if isinstance(list(self.G_train.nodes)[0], int):
            conversion = lambda n: int(n)
        else:
            conversion = lambda n: n

        if os.path.isfile(walks_file):
            with open(walks_file) as f:
                for line in f:
                    self.walks.append(list(map(conversion, line.split())))
            return True
        return False
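A similar hypothetical sketch for the neighbour-based model above; the checkpoint, prefix, and names are placeholders.

# Hypothetical usage; checkpoint, prefix, and names are placeholders.
model = GraphSAGENeighbourModel("AVG_2L", "citations",
                                model_checkpoint="model.ckpt",
                                train_prefix="../../data/train_val",
                                model_name="graphsage_mean")
conferenceseries, confidences = model.query_single(
    ["Some chapter title", "Some chapter abstract",
     ["citation_id_1", "citation_id_2"]])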
Example 11
class GraphSAGEClassifierModel(AbstractModel):
    def __init__(self,
                 classifier,
                 embedding_type,
                 graph_type,
                 model_checkpoint,
                 train_prefix,
                 model_name,
                 model_size="small",
                 learning_rate=0.00001,
                 epochs=10,
                 dropout=0.0,
                 weight_decay=0.0,
                 max_degree=100,
                 samples_1=25,
                 samples_2=10,
                 dim_1=128,
                 dim_2=128,
                 random_context=True,
                 neg_sample_size=20,
                 batch_size=512,
                 identity_dim=0,
                 save_embeddings=False,
                 base_log_dir='../../../data/processed/graphsage/',
                 validate_iter=5000,
                 validate_batch_size=256,
                 gpu=0,
                 print_every=50,
                 max_total_steps=10**10,
                 log_device_placement=False,
                 recs=10):

        self.classifier = classifier
        self.embedding_type = embedding_type
        self.graph_type = graph_type
        self.model_checkpoint = model_checkpoint
        self.recs = recs

        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

        self.graphsage_model = UnsupervisedModel(
            train_prefix, model_name, model_size, learning_rate, epochs,
            dropout, weight_decay, max_degree, samples_1, samples_2, dim_1,
            dim_2, random_context, neg_sample_size, batch_size, identity_dim,
            save_embeddings, base_log_dir, validate_iter, validate_batch_size,
            gpu, print_every, max_total_steps, log_device_placement)
        self.preprocessor = Processor(self.embedding_type, self.graph_type,
                                      gpu)

        self.classifier_file = os.path.join(
            self.graphsage_model._log_dir(),
            self.classifier.__class__.__name__ + ".pkl")

        if not self._load_training_graph():
            print("The training graph does not exist.")

        if not self._load_training_walks():
            print("The walks do not exist.")

    def query_single(self, query):
        """Queries the model and returns a list of recommendations.

        Args:
            query (list): The query as needed by the model, in the form
            [chapter_title, chapter_abstract, list(chapter_citations)].

        Returns:
            list: ids of the conferences
            double: confidence scores
        """
        # Generate an ID for the query
        query_id = "new_node_id:" + "-".join(
            [str(i) for i in random.sample(range(0, 10000), 5)])

        if self.graph_type == "citations":
            if len(query) < 3:
                raise ValueError("The input does not contain enough data; " +
                                 "chapter  title chapter abstract, and " +
                                 "chapter citations are required.")
            return self.query_batch([(query_id, query[0], query[1], query[2])])
        elif self.graph_type == "authors":
            if len(query) < 4:
                raise ValueError(
                    "The input does not contain enough data; " +
                    "chapter title, chapter abstract, chapter " +
                    "citations, and chapter authors are required.")
            authors_df = pd.DataFrame({
                "author_name": query[3],
                "chapter": [query_id] * len(query[3])
            })
            return self.query_batch(
                ([(query_id, query[0], query[1], query[2])], authors_df))
        else:
            raise ValueError("Graph type not recognised.")

    def query_batch(self, batch):
        """Queries the model and returns a lis of recommendations.

        Args:
            batch (list of n-tuples): The list of queries as needed
            by the model. The n-tuples are in the form (chapter_id,
            chapter_title, chapter_abstract, list(chapter_citations)).

        Returns:
            list: ids of the conferences
            double: confidence scores
        """
        if self.graph_type == "citations":
            df_test = pd.DataFrame(batch,
                                   columns=[
                                       "chapter", "chapter_title",
                                       "chapter_abstract", "chapter_citations"
                                   ])

            # Preprocess the data
            graph, features, id_map = self.preprocessor.test_data(
                df_test, self.G_train)
        elif self.graph_type == "authors":
            df_test = pd.DataFrame(batch[0],
                                   columns=[
                                       "chapter", "chapter_title",
                                       "chapter_abstract", "chapter_citations"
                                   ])
            authors_df = batch[1]
            # Preprocess the data
            graph, features, id_map = self.preprocessor.test_data(
                df_test, self.G_train, authors_df=authors_df)
        else:
            raise ValueError("Graph type not recognised.")

        # Infer embeddings
        test_embeddings = self.graphsage_model.predict(
            [graph, features, id_map, self.walks], self.model_checkpoint)[1]

        # Compute predictions
        predictions = self.classifier.predict_proba(test_embeddings)
        sorted_predictions = np.argsort(-np.array(predictions))

        conferenceseries = list()
        confidences = list()

        for index, order in enumerate(sorted_predictions):
            conferences = list()
            scores = list()
            i = 0
            while len(conferences) < self.recs:
                conf = self.label_encoder.inverse_transform(
                    [order[i]]).tolist()[0]
                if conf not in conferences:
                    conferences.append(conf)
                    scores.append(predictions[index][order][i])
                i += 1
            conferenceseries.append(conferences)
            confidences.append(scores)

        results = [conferenceseries, confidences]
        return results

    def train(self, data):
        if not self._load_model_classifier():
            print("Classifier not trained yet. Training now...")
            timer = Timer()
            timer.tic()

            print("Loading the training embeddings...")
            if not self._load_train_embeddings():
                print("The pretrained embeddings are missing.")
            else:
                print("Loaded.")
            training_ids = list(data.chapter)
            training_embeddings = self.pretrained_embeddings[[
                self.pretrained_embeddings_id_map[id] for id in training_ids
            ]]

            self.label_encoder = LabelEncoder()
            self.labels = self.label_encoder.fit_transform(
                data.conferenceseries)
            self.classifier.fit(training_embeddings, self.labels)
            self._save_model_classifier()

            print("Training finished.")
            timer.toc()

    def _load_training_graph(self):
        graph_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                  "..", "..", "..", "data", "interim",
                                  "graphsage", self.embedding_type,
                                  self.graph_type, "train_val-G.json")
        if os.path.isfile(graph_file):
            print("Loading training graph...")
            with open(graph_file) as f:
                self.G_train = json_graph.node_link_graph(json.load(f))
            print("Loaded.")
            return True
        return False

    def _load_training_walks(self):
        walks_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                  "..", "..", "..", "data", "interim",
                                  "graphsage", self.embedding_type,
                                  self.graph_type, "train_val-walks.txt")
        self.walks = []
        if isinstance(list(self.G_train.nodes)[0], int):
            conversion = lambda n: int(n)
        else:
            conversion = lambda n: n

        if os.path.isfile(walks_file):
            print("Loading training walks...")
            with open(walks_file) as f:
                for line in f:
                    self.walks.append(list(map(conversion, line.split())))
            print("Loaded.")
            return True
        return False

    def _load_train_embeddings(self):
        embeddings_file = os.path.join(self.graphsage_model._log_dir(),
                                       "embeddings.npy")
        embeddings_ids_file = os.path.join(self.graphsage_model._log_dir(),
                                           "embeddings_ids.txt")
        if os.path.isfile(embeddings_file) and os.path.isfile(
                embeddings_ids_file):
            self.pretrained_embeddings = np.load(embeddings_file)
            self.pretrained_embeddings_id_map = {}
            with open(embeddings_ids_file) as f:
                for i, line in enumerate(f):
                    self.pretrained_embeddings_id_map[line.strip()] = i
            return True
        return False

    def _load_model_classifier(self):
        if os.path.isfile(self.classifier_file):
            print("Loading classifier...")
            with open(self.classifier_file, "rb") as f:
                self.label_encoder, self.labels, self.classifier = pickle.load(
                    f)
                print("Loaded.")
                return True
        return False

    def _save_model_classifier(self):
        with open(self.classifier_file, "wb") as f:
            pickle.dump([self.label_encoder, self.labels, self.classifier],
                        f,
                        protocol=4)

    def _has_persistent_model(self):
        if os.path.isfile(self.classifier_file):
            return True
        return False
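GraphSAGEClassifierModel fits its classifier lazily, so train must be called with the training DataFrame (which needs chapter and conferenceseries columns) before the first query. A hypothetical sketch; the choice of KNeighborsClassifier and every constructor argument are assumptions.

from sklearn.neighbors import KNeighborsClassifier

# Hypothetical usage; all constructor arguments are placeholders.
model = GraphSAGEClassifierModel(KNeighborsClassifier(n_neighbors=30),
                                 "AVG_2L", "citations",
                                 model_checkpoint="model.ckpt",
                                 train_prefix="../../data/train_val",
                                 model_name="graphsage_mean")
model.train(df_train)  # df_train: columns `chapter`, `conferenceseries`
conferences, confidences = model.query_single(
    ["Some chapter title", "Some chapter abstract", ["citation_id_1"]])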
Example 12
class ASGCNModel(AbstractModel):

    def __init__(self, embedding_type, dataset, model_name, max_degree=696,
                 learning_rate=0.001, weight_decay=5e-4, dropout=0.0,
                 epochs=300, early_stopping=30, hidden1=16, rank=128, skip=0,
                 var=0.5, sampler_device="cpu", gpu=None, recs=10):

        self.embedding_type = embedding_type
        self.dataset = dataset
        self.model_name = model_name
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.dropout = dropout
        self.epochs = epochs
        self.early_stopping = early_stopping
        self.hidden1 = hidden1
        self.rank = rank
        self.skip = skip
        self.var = var
        self.sampler_device = sampler_device
        self.recs = recs

        self.preprocessor = Processor(self.embedding_type, self.dataset, gpu)
        self.training_data = self._load_training_data()

        if not self._load_label_encoder():
            print("The label encoder does not exist.")

    def query_single(self, query):
        """Queries the model and returns a list of recommendations.

        Args:
            query (list): The query as needed by the model, in the form
            [chapter_title, chapter_abstract, list(chapter_citations)].

        Returns:
            list: ids of the conferences
            double: confidence scores
        """
        if len(query) < 3:
            raise ValueError("The input does not contain enough data; " +
                             "chapter title, chapter abstract, and chapter " +
                             "citations are required.")
        return self.query_batch([(query[0], query[1], query[2])])

    def query_batch(self, batch):
        """Queries the model and returns a lis of recommendations.

        Args:
            batch (list of n-tuples): The list of queries as needed
            by the model. The n-tuples are in the form (chapter_id,
            chapter_title, chapter_abstract, list(chapter_citations)).

        Returns:
            list: ids of the conferences
            double: confidence scores
        """
        if len(batch[0]) == 3:
            df_test = pd.DataFrame(batch,
                                   columns=["chapter_title",
                                            "chapter_abstract",
                                            "chapter_citations"])
        else:
            df_test_extended = pd.DataFrame(batch,
                                            columns=["chapter",
                                                     "chapter_title",
                                                     "chapter_abstract",
                                                     "chapter_citations"])
            df_test = df_test_extended[["chapter_title", "chapter_abstract",
                                        "chapter_citations"]]
        (train_features, train_labels, train_val_features,
         train_val_labels, graph) = self.training_data

        # Reindex test dataframe such that indices follow those from the
        # train data
        df_test.index = range(train_val_features.shape[0],
                              train_val_features.shape[0] + len(df_test))

        # Preprocess the data
        test_data, max_degree = self.preprocessor.test_data(
                df_test, train_features, train_labels, train_val_features,
                train_val_labels, graph)

        # Inference on test data
        asgcn_model = ASGCN(
                self.embedding_type, self.dataset, self.model_name, max_degree,
                self.learning_rate, self.weight_decay, self.dropout,
                self.epochs, self.early_stopping, self.hidden1, self.rank,
                self.skip, self.var, self.sampler_device, gpu=None)
        predictions = asgcn_model.test(test_data)

        # Compute predictions
        sorted_predictions = (-predictions).argsort(axis=1)
        conferences = list()
        confidences = list()

        for i in range(len(predictions)):
            one_hot_preds = np.zeros((self.recs, len(predictions[0])),
                                     dtype=int)
            for j in range(self.recs):
                one_hot_preds[j][sorted_predictions[i, j]] = 1
            conferences.append(list(self.label_encoder.inverse_transform(
                    one_hot_preds).flatten()))
            confidences.append(list(
                    predictions[i, sorted_predictions[:, :self.recs][i]]))

        results = [conferences, confidences]
        return results

    def train(self):
        pass

    def _load_training_data(self):
        print("Loading training data.")
        path_persistent = os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                "..", "..", "..", "data", "interim", "gat",
                self.embedding_type, self.dataset)
        names = ['x', 'y', 'allx', 'ally', 'graph']
        objects = []
        for i in range(len(names)):
            with open(path_persistent + "/ind.{}.{}".format(
                    self.dataset, names[i]), 'rb') as f:
                objects.append(pickle.load(f, encoding='latin1'))
        x, y, allx, ally, graph = tuple(objects)
        print("Loaded.")
        return x, y, allx, ally, graph

    def _load_label_encoder(self):
        label_encoder_file = os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                "..", "..", "..", "data", "interim", "gat",
                self.embedding_type, self.dataset, "label_encoder.pkl")
        if os.path.isfile(label_encoder_file):
            with open(label_encoder_file, "rb") as f:
                print("Loading label encoder.")
                self.label_encoder = pickle.load(f)
            print("Loaded.")
            return True
        return False
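Finally, a hypothetical usage sketch for ASGCNModel; the embedding type, dataset name, and model name are placeholders.

# Hypothetical usage; all constructor arguments are placeholders.
model = ASGCNModel("AVG_2L", "citations", "asgcn")
conferences, confidences = model.query_single(
    ["Some chapter title", "Some chapter abstract",
     ["citation_id_1", "citation_id_2"]])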