def main():
    parser = argparse.ArgumentParser(
            description='Arguments for data preprocessing.')
    parser.add_argument('embedding_type',
                        choices=["AVG_L", "AVG_2L", "AVG_SUM_L4",
                                 "AVG_SUM_ALL", "MAX_2L",
                                 "CONC_AVG_MAX_2L", "CONC_AVG_MAX_SUM_L4",
                                 "SUM_L", "SUM_2L"],
                        help="Type of embedding.")
    parser.add_argument('dataset',
                        help='Name of the object file that stores the '
                        + 'training data.')
    parser.add_argument('--threshold',
                        type=int,
                        default=2,
                        help='Threshold for edge weights in '
                        + 'heterogeneous graph.')
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='Which gpu to use.')
    args = parser.parse_args()
    print("Starting...")
    from preprocess_data import Processor
    processor = Processor(args.embedding_type, args.dataset, args.threshold,
                          args.gpu)
    processor.training_data()
    print("Finished.")
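# Entry-point guard: a minimal sketch, assuming this script is executed
# directly and that `argparse` is imported at module level (not shown in
# this fragment). Example invocation (script and dataset names are
# hypothetical):
#   python preprocess.py AVG_2L citations --threshold 2 --gpu 0
if __name__ == "__main__":
    main()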
def infer_embeddings(self, query, query_authors, graph_type, model,
                     model_checkpoint, queue):
    df_test = pd.DataFrame(query,
                           columns=["chapter", "chapter_title",
                                    "chapter_abstract",
                                    "chapter_citations"])

    # Load the training graph
    print("Loading {} training graph...".format(graph_type))
    G_train = self._load_training_graph(graph_type)
    print("Loaded.")

    # Load the training walks
    print("Loading {} training walks...".format(graph_type))
    walks = self._load_training_walks(graph_type, G_train)
    print("Loaded.")

    print("Preprocessing {} test data...".format(graph_type))
    from preprocess_data import Processor
    preprocessor = Processor(self.embedding_type, graph_type, self.gpu)
    graph, features, id_map = preprocessor.test_data(
            df_test, G_train, authors_df=query_authors)
    print("Preprocessed.")

    print("Inferring embeddings...")
    embeddings = model.predict([graph, features, id_map, walks],
                               model_checkpoint)[1]
    print("Inferred.")
    queue.put(embeddings)
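# A minimal sketch of how infer_embeddings might be driven from a separate
# process, inferred from its `queue` parameter; the recommender, model, and
# checkpoint objects here are hypothetical placeholders.
def _example_infer_in_subprocess(recommender, query, query_authors, model,
                                 model_checkpoint):
    from multiprocessing import Process, Queue

    queue = Queue()
    p = Process(target=recommender.infer_embeddings,
                args=(query, query_authors, "citations_authors_het_edges",
                      model, model_checkpoint, queue))
    p.start()
    embeddings = queue.get()
    p.join()
    return embeddings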
def __init__(self, embedding_type, graph_type, train_prefix, model_name,
             model_size="small", learning_rate=0.001, epochs=10,
             dropout=0.0, weight_decay=0.0, max_degree=100, samples_1=25,
             samples_2=10, samples_3=0, dim_1=128, dim_2=128,
             batch_size=512, sigmoid=False, identity_dim=0,
             base_log_dir='../../../data/processed/graphsage/',
             validate_iter=5000, validate_batch_size=256, gpu=0,
             print_every=5, max_total_steps=10**10,
             log_device_placement=False, recs=10, threshold=2):

    self.embedding_type = embedding_type
    self.graph_type = graph_type
    self.recs = recs

    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

    self.graphsage_model = SupervisedModel(
            train_prefix, model_name, model_size, learning_rate, epochs,
            dropout, weight_decay, max_degree, samples_1, samples_2,
            samples_3, dim_1, dim_2, batch_size, sigmoid, identity_dim,
            base_log_dir, validate_iter, validate_batch_size, gpu,
            print_every, max_total_steps, log_device_placement)
    self.preprocessor = Processor(self.embedding_type, self.graph_type,
                                  threshold, gpu)

    if not self._load_training_graph():
        print("The training graph does not exist.")

    if not self._load_training_class_map():
        print("The training class map does not exist.")

    if not self._load_label_encoder():
        print("The label encoder does not exist.")
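# A minimal construction sketch for the initializer above; the enclosing
# class name is not shown in this fragment, so `GraphSAGEClassifier` is a
# hypothetical stand-in, as are the argument values.
def _example_build_supervised(train_prefix, model_name):
    return GraphSAGEClassifier(embedding_type="AVG_2L",
                               graph_type="citations",
                               train_prefix=train_prefix,
                               model_name=model_name,
                               gpu=0,
                               threshold=2)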
def main():
    parser = argparse.ArgumentParser(
            description='Arguments for data preprocessing.')
    parser.add_argument('embedding_type',
                        choices=["AVG_L", "AVG_2L", "AVG_SUM_L4",
                                 "AVG_SUM_ALL", "MAX_2L",
                                 "CONC_AVG_MAX_2L", "CONC_AVG_MAX_SUM_L4",
                                 "SUM_L", "SUM_2L"],
                        help="Type of embedding.")
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='Which gpu to use.')
    args = parser.parse_args()
    print("Starting...")
    from preprocess_data import Processor
    processor = Processor(args.embedding_type, args.gpu)
    processor.training_data()
    print("Finished.")
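# Entry-point guard, as above: a minimal sketch assuming direct execution.
# Example invocation (hypothetical script name):
#   python preprocess.py AVG_2L --gpu 0
if __name__ == "__main__":
    main()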
class GraphSAGERLModel(AbstractModel):

    def __init__(self, embedding_type, graph_type, train_prefix, model_name,
                 nonlinear_sampler=True, fast_ver=False, allhop_rewards=False,
                 model_size="small", learning_rate=0.001, epochs=10,
                 dropout=0.0, weight_decay=0.0, max_degree=100, samples_1=25,
                 samples_2=10, samples_3=0, dim_1=512, dim_2=512, dim_3=0,
                 batch_size=128, sigmoid=False, identity_dim=0,
                 base_log_dir='../../../data/processed/graphsage_rl/',
                 validate_iter=5000, validate_batch_size=128, gpu=0,
                 print_every=5, max_total_steps=10**10,
                 log_device_placement=False, recs=10, threshold=2):

        self.embedding_type = embedding_type
        self.graph_type = graph_type
        self.fast_ver = fast_ver
        self.recs = recs

        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        if gpu is not None:
            os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)
        else:
            os.environ["CUDA_VISIBLE_DEVICES"] = ""

        self.graphsage_model = SupervisedModelRL(
                train_prefix=train_prefix, model_name=model_name,
                nonlinear_sampler=nonlinear_sampler, fast_ver=self.fast_ver,
                allhop_rewards=allhop_rewards, model_size=model_size,
                learning_rate=learning_rate, epochs=epochs, dropout=dropout,
                weight_decay=weight_decay, max_degree=max_degree,
                samples_1=samples_1, samples_2=samples_2, samples_3=samples_3,
                dim_1=dim_1, dim_2=dim_2, dim_3=dim_3, batch_size=batch_size,
                sigmoid=sigmoid, identity_dim=identity_dim,
                base_log_dir=base_log_dir, validate_iter=validate_iter,
                validate_batch_size=validate_batch_size, gpu=None,
                print_every=print_every, max_total_steps=max_total_steps,
                log_device_placement=log_device_placement)
        self.preprocessor = Processor(self.embedding_type, self.graph_type,
                                      threshold, gpu)

        if not self._load_training_graph():
            print("The training graph does not exist.")

        if not self._load_training_class_map():
            print("The training class map does not exist.")

        if not self._load_label_encoder():
            print("The label encoder does not exist.")

    def query_single(self, query):
        """Queries the model and returns a list of recommendations.

        Args:
            query (list): The query as needed by the model, in the form
            [chapter_title, chapter_abstract, list(chapter_citations)].

        Returns:
            list: ids of the conferences
            double: confidence scores
        """
        # Generate an ID for the query
        query_id = "new_node_id:" + "-".join(
                [str(i) for i in random.sample(range(0, 10000), 5)])
        if self.graph_type == "citations":
            if len(query) < 3:
                raise ValueError("The input does not contain enough data; "
                                 + "chapter title, chapter abstract, and "
                                 + "chapter citations are required.")
            return self.query_batch([(query_id, query[0], query[1],
                                      query[2])])
        elif self.graph_type == "citations_authors_het_edges":
            if len(query) < 4:
                raise ValueError("The input does not contain enough data; "
                                 + "chapter title, chapter abstract, chapter "
                                 + "citations, and chapter authors are "
                                 + "required.")
            authors_df = pd.DataFrame({
                    "author_name": query[3],
                    "chapter": [query_id] * len(query[3])})
            return self.query_batch(([(query_id, query[0], query[1],
                                       query[2])], authors_df))
        else:
            raise ValueError("Graph type not recognised.")

    def query_batch(self, batch):
        """Queries the model and returns a list of recommendations.

        Args:
            batch (list of ntuples): The list of queries as needed by the
            model. The ntuples are in the form (chapter_id, chapter_title,
            chapter_abstract, list(chapter_citations)).

        Returns:
            list: ids of the conferences
            double: confidence scores
        """
        if self.graph_type == "citations":
            df_test = pd.DataFrame(batch,
                                   columns=["chapter", "chapter_title",
                                            "chapter_abstract",
                                            "chapter_citations"])
            # Preprocess the data
            graph, features, id_map, class_map = self.preprocessor.test_data(
                    df_test, self.G_train, class_map=self.class_map_train)
        elif self.graph_type == "citations_authors_het_edges":
            df_test = pd.DataFrame(batch[0],
                                   columns=["chapter", "chapter_title",
                                            "chapter_abstract",
                                            "chapter_citations"])
            authors_df = batch[1]
            # Preprocess the data
            graph, features, id_map, class_map = self.preprocessor.test_data(
                    df_test, self.G_train, authors_df=authors_df,
                    class_map=self.class_map_train)
        else:
            raise ValueError("Graph type not recognised.")

        # Inference on test data
        if self.fast_ver:
            sampler_name = "FastML"
        else:
            sampler_name = "ML"
        predictions = self.graphsage_model.inference(
                [graph, features, id_map, None, class_map], sampler_name)[1]

        # Compute predictions
        sorted_predictions = (-predictions).argsort(axis=1)
        conferences = list()
        confidences = list()

        for i in range(len(predictions)):
            one_hot_preds = np.zeros((self.recs, len(predictions[0])),
                                     dtype=int)
            for j in range(self.recs):
                one_hot_preds[j][sorted_predictions[i, j]] = 1
            conferences.append(list(self.label_encoder.inverse_transform(
                    one_hot_preds).flatten()))
            confidences.append(list(
                    predictions[i, sorted_predictions[:, :self.recs][i]]))

        results = [conferences, confidences]
        return results

    def train(self):
        pass

    def _load_training_graph(self):
        graph_file = os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                "..", "..", "..", "data", "interim", "graphsage",
                self.embedding_type, self.graph_type, "train_val-G.json")
        if os.path.isfile(graph_file):
            print("Loading training graph...")
            with open(graph_file) as f:
                self.G_train = json_graph.node_link_graph(json.load(f))
            print("Loaded.")
            return True
        return False

    def _load_training_class_map(self):
        class_map_file = os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                "..", "..", "..", "data", "interim", "graphsage",
                self.embedding_type, self.graph_type,
                "train_val-class_map.json")
        self.class_map_train = {}
        if isinstance(list(self.G_train.nodes)[0], int):
            conversion = lambda n: int(n)
        else:
            conversion = lambda n: n
        if os.path.isfile(class_map_file):
            print("Loading training class map...")
            self.class_map_train = json.load(open(class_map_file))
            if isinstance(list(self.class_map_train.values())[0], list):
                lab_conversion = lambda n: n
            else:
                lab_conversion = lambda n: int(n)
            self.class_map_train = {conversion(k): lab_conversion(v)
                                    for k, v in self.class_map_train.items()}
            print("Loaded.")
            return True
        return False

    def _load_label_encoder(self):
        label_encoder_file = os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                "..", "..", "..", "data", "interim", "graphsage",
                self.embedding_type, self.graph_type, "label_encoder.pkl")
        if os.path.isfile(label_encoder_file):
            with open(label_encoder_file, "rb") as f:
                print("Loading label encoder.")
                self.label_encoder = pickle.load(f)
            print("Loaded.")
            return True
        return False
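# A minimal usage sketch for GraphSAGERLModel, assuming this module's
# imports and a preprocessed "citations" graph; train_prefix, model_name,
# and the citation ids are hypothetical placeholders.
def _example_query_rl(train_prefix, model_name):
    model = GraphSAGERLModel(embedding_type="AVG_2L",
                             graph_type="citations",
                             train_prefix=train_prefix,
                             model_name=model_name,
                             gpu=0)
    conferences, confidences = model.query_single(
            ["Some chapter title", "Some chapter abstract",
             ["sg:pub.12345", "sg:pub.67890"]])
    return conferences, confidences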
class GraphSAGENeighbourModel(AbstractModel):

    def __init__(self, embedding_type, graph_type, model_checkpoint,
                 train_prefix, model_name, model_size="small",
                 learning_rate=0.00001, epochs=10, dropout=0.0,
                 weight_decay=0.0, max_degree=100, samples_1=25,
                 samples_2=10, dim_1=128, dim_2=128, random_context=True,
                 neg_sample_size=20, batch_size=512, identity_dim=0,
                 save_embeddings=False,
                 base_log_dir='../../../data/processed/graphsage/',
                 validate_iter=5000, validate_batch_size=256, gpu=0,
                 print_every=50, max_total_steps=10**10,
                 log_device_placement=False, recs=10):

        self.embedding_type = embedding_type
        self.graph_type = graph_type
        self.model_checkpoint = model_checkpoint
        self.recs = recs
        self.gpu = gpu

        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(self.gpu)

        self.graphsage_model = UnsupervisedModel(
                train_prefix, model_name, model_size, learning_rate, epochs,
                dropout, weight_decay, max_degree, samples_1, samples_2,
                dim_1, dim_2, random_context, neg_sample_size, batch_size,
                identity_dim, save_embeddings, base_log_dir, validate_iter,
                validate_batch_size, gpu, print_every, max_total_steps,
                log_device_placement)
        self.preprocessor = Processor(self.embedding_type, "citations",
                                      self.gpu)

        # Prepare the training data
        d_train = DataLoader()
        self.df_train = d_train.training_data_with_abstracts_citations().data

        print("Loading the training embeddings...")
        if not self._load_train_embeddings():
            print("The pretrained embeddings are missing.")
        else:
            print("Loaded.")

        training_ids = list(self.df_train.chapter)
        self.training_embeddings = self.pretrained_embeddings[[
                self.pretrained_embeddings_id_map[id]
                for id in training_ids]]
        self.sim = Similarities(self.training_embeddings, training_ids)

        print("Loading training graph...")
        if not self._load_training_graph():
            print("The training graph does not exist.")
        else:
            print("Loaded.")

        print("Loading training walks...")
        if not self._load_training_walks():
            print("The walks do not exist.")
        else:
            print("Loaded.")

    def query_single(self, query):
        """Queries the model and returns a list of recommendations.

        Args:
            query (list): The query as needed by the model, in the form
            [chapter_title, chapter_abstract, list(chapter_citations)].

        Returns:
            list: ids of the conferences
            double: confidence scores
        """
        if len(query) < 3:
            raise ValueError("The input does not contain enough data; "
                             + "chapter title, chapter abstract, and chapter "
                             + "citations are required.")
        # Generate an ID for the query
        query_id = "new_node_id:" + "-".join(
                [str(i) for i in random.sample(range(0, 10000), 5)])
        return self.query_batch([(query_id, query[0], query[1], query[2])])

    def query_batch(self, batch):
        """Queries the model and returns a list of recommendations.

        Args:
            batch (list of ntuples): The list of queries as needed by the
            model. The ntuples are in the form (chapter_id, chapter_title,
            chapter_abstract, list(chapter_citations)).

        Returns:
            list: ids of the conferences
            double: confidence scores
        """
        df_test = pd.DataFrame(batch,
                               columns=["chapter", "chapter_title",
                                        "chapter_abstract",
                                        "chapter_citations"])

        # Preprocess the data
        graph, features, id_map = self.preprocessor.test_data(df_test,
                                                              self.G_train)

        # Infer embeddings
        test_nodes, test_embeddings = self.graphsage_model.predict(
                [graph, features, id_map, self.walks], self.model_checkpoint)

        # Obtain the most similar neighbours
        similarities = []
        with tqdm(desc="Computing similarities",
                  total=len(test_embeddings)) as pbar:
            for vector in test_embeddings:
                similarities.append(
                        self.sim.similar_by_vector(vector,
                                                   topn=self.recs * 10))
                pbar.update(1)

        # Map similar papers to conferences
        conferenceseries = []
        confidences = []
        with tqdm(desc="Computing conference predictions",
                  total=len(similarities)) as pbar:
            for similarity in similarities:
                conferences = set()
                scores = []
                for idx in range(len(similarity)):
                    conferences_length = len(conferences)
                    if conferences_length < self.recs:
                        conferences.add(list(
                                self.df_train[self.df_train.chapter ==
                                              similarity[idx][0]]
                                .conferenceseries)[0])
                        if len(conferences) != conferences_length:
                            scores.append(similarity[idx][1])
                conferenceseries.append(list(conferences))
                confidences.append(scores)
                pbar.update(1)

        results = [conferenceseries, confidences]
        return results

    def train(self):
        pass

    def _load_train_embeddings(self):
        embeddings_file = os.path.join(self.graphsage_model._log_dir(),
                                       "embeddings.npy")
        embeddings_ids_file = os.path.join(self.graphsage_model._log_dir(),
                                           "embeddings_ids.txt")
        if os.path.isfile(embeddings_file) and os.path.isfile(
                embeddings_ids_file):
            self.pretrained_embeddings = np.load(embeddings_file)
            self.pretrained_embeddings_id_map = {}
            with open(embeddings_ids_file) as f:
                for i, line in enumerate(f):
                    self.pretrained_embeddings_id_map[line.strip()] = i
            return True
        return False

    def _load_training_graph(self):
        graph_file = os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                "..", "..", "..", "data", "interim", "graphsage",
                self.embedding_type, self.graph_type, "train_val-G.json")
        if os.path.isfile(graph_file):
            with open(graph_file) as f:
                self.G_train = json_graph.node_link_graph(json.load(f))
            return True
        return False

    def _load_training_walks(self):
        walks_file = os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                "..", "..", "..", "data", "interim", "graphsage",
                self.embedding_type, self.graph_type, "train_val-walks.txt")
        self.walks = []
        if isinstance(list(self.G_train.nodes)[0], int):
            conversion = lambda n: int(n)
        else:
            conversion = lambda n: n
        if os.path.isfile(walks_file):
            with open(walks_file) as f:
                for line in f:
                    self.walks.append(map(conversion, line.split()))
            return True
        return False
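# A minimal usage sketch for GraphSAGENeighbourModel, assuming a trained
# unsupervised GraphSAGE checkpoint; all argument values and citation ids
# are hypothetical placeholders.
def _example_query_neighbour(model_checkpoint, train_prefix, model_name):
    model = GraphSAGENeighbourModel(embedding_type="AVG_2L",
                                    graph_type="citations",
                                    model_checkpoint=model_checkpoint,
                                    train_prefix=train_prefix,
                                    model_name=model_name)
    conferenceseries, confidences = model.query_single(
            ["Some chapter title", "Some chapter abstract",
             ["sg:pub.12345"]])
    return conferenceseries, confidences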
class GraphSAGEClassifierModel(AbstractModel):

    def __init__(self, classifier, embedding_type, graph_type,
                 model_checkpoint, train_prefix, model_name,
                 model_size="small", learning_rate=0.00001, epochs=10,
                 dropout=0.0, weight_decay=0.0, max_degree=100, samples_1=25,
                 samples_2=10, dim_1=128, dim_2=128, random_context=True,
                 neg_sample_size=20, batch_size=512, identity_dim=0,
                 save_embeddings=False,
                 base_log_dir='../../../data/processed/graphsage/',
                 validate_iter=5000, validate_batch_size=256, gpu=0,
                 print_every=50, max_total_steps=10**10,
                 log_device_placement=False, recs=10):

        self.classifier = classifier
        self.embedding_type = embedding_type
        self.graph_type = graph_type
        self.model_checkpoint = model_checkpoint
        self.recs = recs

        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

        self.graphsage_model = UnsupervisedModel(
                train_prefix, model_name, model_size, learning_rate, epochs,
                dropout, weight_decay, max_degree, samples_1, samples_2,
                dim_1, dim_2, random_context, neg_sample_size, batch_size,
                identity_dim, save_embeddings, base_log_dir, validate_iter,
                validate_batch_size, gpu, print_every, max_total_steps,
                log_device_placement)
        self.preprocessor = Processor(self.embedding_type, self.graph_type,
                                      gpu)
        self.classifier_file = os.path.join(
                self.graphsage_model._log_dir(),
                self.classifier.__class__.__name__ + ".pkl")

        if not self._load_training_graph():
            print("The training graph does not exist.")

        if not self._load_training_walks():
            print("The walks do not exist.")

    def query_single(self, query):
        """Queries the model and returns a list of recommendations.

        Args:
            query (list): The query as needed by the model, in the form
            [chapter_title, chapter_abstract, list(chapter_citations)].

        Returns:
            list: ids of the conferences
            double: confidence scores
        """
        # Generate an ID for the query
        query_id = "new_node_id:" + "-".join(
                [str(i) for i in random.sample(range(0, 10000), 5)])
        if self.graph_type == "citations":
            if len(query) < 3:
                raise ValueError("The input does not contain enough data; "
                                 + "chapter title, chapter abstract, and "
                                 + "chapter citations are required.")
            return self.query_batch([(query_id, query[0], query[1],
                                      query[2])])
        elif self.graph_type == "authors":
            if len(query) < 4:
                raise ValueError("The input does not contain enough data; "
                                 + "chapter title, chapter abstract, chapter "
                                 + "citations, and chapter authors are "
                                 + "required.")
            authors_df = pd.DataFrame({
                    "author_name": query[3],
                    "chapter": [query_id] * len(query[3])})
            # Pass a single (batch, authors_df) tuple, matching how
            # query_batch unpacks batch[0] and batch[1] below
            return self.query_batch(([(query_id, query[0], query[1],
                                       query[2])], authors_df))
        else:
            raise ValueError("Graph type not recognised.")

    def query_batch(self, batch):
        """Queries the model and returns a list of recommendations.

        Args:
            batch (list of ntuples): The list of queries as needed by the
            model. The ntuples are in the form (chapter_id, chapter_title,
            chapter_abstract, list(chapter_citations)).

        Returns:
            list: ids of the conferences
            double: confidence scores
        """
        if self.graph_type == "citations":
            df_test = pd.DataFrame(batch,
                                   columns=["chapter", "chapter_title",
                                            "chapter_abstract",
                                            "chapter_citations"])
            # Preprocess the data
            graph, features, id_map = self.preprocessor.test_data(
                    df_test, self.G_train)
        elif self.graph_type == "authors":
            df_test = pd.DataFrame(batch[0],
                                   columns=["chapter", "chapter_title",
                                            "chapter_abstract",
                                            "chapter_citations"])
            authors_df = batch[1]
            # Preprocess the data
            graph, features, id_map = self.preprocessor.test_data(
                    df_test, self.G_train, authors_df=authors_df)
        else:
            raise ValueError("Graph type not recognised.")

        # Infer embeddings
        test_embeddings = self.graphsage_model.predict(
                [graph, features, id_map, self.walks],
                self.model_checkpoint)[1]

        # Compute predictions
        predictions = self.classifier.predict_proba(test_embeddings)
        sorted_predictions = np.argsort(-np.array(predictions))

        conferenceseries = list()
        confidences = list()
        for index, order in enumerate(sorted_predictions):
            conferences = list()
            scores = list()
            i = 0
            while len(conferences) < self.recs:
                conf = self.label_encoder.inverse_transform(
                        [order[i]]).tolist()[0]
                if conf not in conferences:
                    conferences.append(conf)
                    scores.append(predictions[index][order][i])
                i += 1
            conferenceseries.append(conferences)
            confidences.append(scores)

        results = [conferenceseries, confidences]
        return results

    def train(self, data):
        if not self._load_model_classifier():
            print("Classifier not trained yet. Training now...")
            timer = Timer()
            timer.tic()

            print("Loading the training embeddings...")
            if not self._load_train_embeddings():
                print("The pretrained embeddings are missing.")
            else:
                print("Loaded.")

            training_ids = list(data.chapter)
            training_embeddings = self.pretrained_embeddings[[
                    self.pretrained_embeddings_id_map[id]
                    for id in training_ids]]
            self.label_encoder = LabelEncoder()
            self.labels = self.label_encoder.fit_transform(
                    data.conferenceseries)
            self.classifier.fit(training_embeddings, self.labels)
            self._save_model_classifier()

            print("Training finished.")
            timer.toc()

    def _load_training_graph(self):
        graph_file = os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                "..", "..", "..", "data", "interim", "graphsage",
                self.embedding_type, self.graph_type, "train_val-G.json")
        if os.path.isfile(graph_file):
            print("Loading training graph...")
            with open(graph_file) as f:
                self.G_train = json_graph.node_link_graph(json.load(f))
            print("Loaded.")
            return True
        return False

    def _load_training_walks(self):
        walks_file = os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                "..", "..", "..", "data", "interim", "graphsage",
                self.embedding_type, self.graph_type, "train_val-walks.txt")
        self.walks = []
        if isinstance(list(self.G_train.nodes)[0], int):
            conversion = lambda n: int(n)
        else:
            conversion = lambda n: n
        if os.path.isfile(walks_file):
            print("Loading training walks...")
            with open(walks_file) as f:
                for line in f:
                    self.walks.append(map(conversion, line.split()))
            print("Loaded.")
            return True
        return False

    def _load_train_embeddings(self):
        embeddings_file = os.path.join(self.graphsage_model._log_dir(),
                                       "embeddings.npy")
        embeddings_ids_file = os.path.join(self.graphsage_model._log_dir(),
                                           "embeddings_ids.txt")
        if os.path.isfile(embeddings_file) and os.path.isfile(
                embeddings_ids_file):
            self.pretrained_embeddings = np.load(embeddings_file)
            self.pretrained_embeddings_id_map = {}
            with open(embeddings_ids_file) as f:
                for i, line in enumerate(f):
                    self.pretrained_embeddings_id_map[line.strip()] = i
            return True
        return False

    def _load_model_classifier(self):
        if os.path.isfile(self.classifier_file):
            print("Loading classifier...")
            with open(self.classifier_file, "rb") as f:
                self.label_encoder, self.labels, self.classifier = \
                        pickle.load(f)
            print("Loaded.")
            return True
        return False

    def _save_model_classifier(self):
        with open(self.classifier_file, "wb") as f:
            pickle.dump([self.label_encoder, self.labels, self.classifier],
                        f, protocol=4)

    def _has_persistent_model(self):
        if os.path.isfile(self.classifier_file):
            return True
        return False
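# A minimal usage sketch for GraphSAGEClassifierModel, assuming any
# scikit-learn-style classifier with fit/predict_proba; the
# KNeighborsClassifier choice and all argument values are hypothetical.
def _example_train_and_query_classifier(df_train, model_checkpoint,
                                        train_prefix, model_name):
    from sklearn.neighbors import KNeighborsClassifier

    model = GraphSAGEClassifierModel(KNeighborsClassifier(n_neighbors=10),
                                     embedding_type="AVG_2L",
                                     graph_type="citations",
                                     model_checkpoint=model_checkpoint,
                                     train_prefix=train_prefix,
                                     model_name=model_name)
    # df_train must provide `chapter` and `conferenceseries` columns
    model.train(df_train)
    return model.query_single(["Some chapter title", "Some chapter abstract",
                               ["sg:pub.12345"]])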
class ASGCNModel(AbstractModel):

    def __init__(self, embedding_type, dataset, model_name, max_degree=696,
                 learning_rate=0.001, weight_decay=5e-4, dropout=0.0,
                 epochs=300, early_stopping=30, hidden1=16, rank=128, skip=0,
                 var=0.5, sampler_device="cpu", gpu=None, recs=10):

        self.embedding_type = embedding_type
        self.dataset = dataset
        self.model_name = model_name
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.dropout = dropout
        self.epochs = epochs
        self.early_stopping = early_stopping
        self.hidden1 = hidden1
        self.rank = rank
        self.skip = skip
        self.var = var
        self.sampler_device = sampler_device
        self.recs = recs

        self.preprocessor = Processor(self.embedding_type, self.dataset, gpu)
        self.training_data = self._load_training_data()

        if not self._load_label_encoder():
            print("The label encoder does not exist.")

    def query_single(self, query):
        """Queries the model and returns a list of recommendations.

        Args:
            query (list): The query as needed by the model, in the form
            [chapter_title, chapter_abstract, list(chapter_citations)].

        Returns:
            list: ids of the conferences
            double: confidence scores
        """
        if len(query) < 3:
            raise ValueError("The input does not contain enough data; "
                             + "chapter title, chapter abstract, and chapter "
                             + "citations are required.")
        return self.query_batch([(query[0], query[1], query[2])])

    def query_batch(self, batch):
        """Queries the model and returns a list of recommendations.

        Args:
            batch (list of ntuples): The list of queries as needed by the
            model. The ntuples are in the form (chapter_title,
            chapter_abstract, list(chapter_citations)) or (chapter_id,
            chapter_title, chapter_abstract, list(chapter_citations)).

        Returns:
            list: ids of the conferences
            double: confidence scores
        """
        # Distinguish 3-tuples (no chapter id) from 4-tuples by the arity
        # of the first query, not by the number of queries
        if len(batch[0]) == 3:
            df_test = pd.DataFrame(batch,
                                   columns=["chapter_title",
                                            "chapter_abstract",
                                            "chapter_citations"])
        else:
            df_test_extended = pd.DataFrame(batch,
                                            columns=["chapter",
                                                     "chapter_title",
                                                     "chapter_abstract",
                                                     "chapter_citations"])
            df_test = df_test_extended[["chapter_title", "chapter_abstract",
                                        "chapter_citations"]]

        (train_features, train_labels, train_val_features, train_val_labels,
         graph) = self.training_data

        # Reindex test dataframe such that indices follow those from the
        # train data
        df_test.index = range(train_val_features.shape[0],
                              train_val_features.shape[0] + len(df_test))

        # Preprocess the data
        test_data, max_degree = self.preprocessor.test_data(
                df_test, train_features, train_labels, train_val_features,
                train_val_labels, graph)

        # Inference on test data
        asgcn_model = ASGCN(self.embedding_type, self.dataset,
                            self.model_name, max_degree, self.learning_rate,
                            self.weight_decay, self.dropout, self.epochs,
                            self.early_stopping, self.hidden1, self.rank,
                            self.skip, self.var, self.sampler_device,
                            gpu=None)
        predictions = asgcn_model.test(test_data)

        # Compute predictions
        sorted_predictions = (-predictions).argsort(axis=1)
        conferences = list()
        confidences = list()

        for i in range(len(predictions)):
            one_hot_preds = np.zeros((self.recs, len(predictions[0])),
                                     dtype=int)
            for j in range(self.recs):
                one_hot_preds[j][sorted_predictions[i, j]] = 1
            conferences.append(list(self.label_encoder.inverse_transform(
                    one_hot_preds).flatten()))
            confidences.append(list(
                    predictions[i, sorted_predictions[:, :self.recs][i]]))

        results = [conferences, confidences]
        return results

    def train(self):
        pass

    def _load_training_data(self):
        print("Loading training data.")
        path_persistent = os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                "..", "..", "..", "data", "interim", "gat",
                self.embedding_type, self.dataset)
        names = ['x', 'y', 'allx', 'ally', 'graph']
        objects = []
        for i in range(len(names)):
            with open(path_persistent + "/ind.{}.{}".format(
                    self.dataset, names[i]), 'rb') as f:
                objects.append(pickle.load(f, encoding='latin1'))
        x, y, allx, ally, graph = tuple(objects)
        print("Loaded.")
        return x, y, allx, ally, graph

    def _load_label_encoder(self):
        label_encoder_file = os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                "..", "..", "..", "data", "interim", "gat",
                self.embedding_type, self.dataset, "label_encoder.pkl")
        if os.path.isfile(label_encoder_file):
            with open(label_encoder_file, "rb") as f:
                print("Loading label encoder.")
                self.label_encoder = pickle.load(f)
            print("Loaded.")
            return True
        return False
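# A minimal usage sketch for ASGCNModel, assuming the ind.* training files
# and label encoder exist under data/interim/gat (as produced by the
# preprocessing main() above); dataset, model_name, and the citation ids
# are hypothetical placeholders.
def _example_query_asgcn(dataset, model_name):
    model = ASGCNModel(embedding_type="AVG_2L", dataset=dataset,
                       model_name=model_name)
    conferences, confidences = model.query_single(
            ["Some chapter title", "Some chapter abstract",
             ["sg:pub.12345"]])
    return conferences, confidences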