def __init__(self,
                 classifier,
                 embedding_type,
                 graph_type,
                 model_checkpoint,
                 train_prefix,
                 model_name,
                 model_size="small",
                 learning_rate=0.00001,
                 epochs=10,
                 dropout=0.0,
                 weight_decay=0.0,
                 max_degree=100,
                 samples_1=25,
                 samples_2=10,
                 dim_1=128,
                 dim_2=128,
                 random_context=True,
                 neg_sample_size=20,
                 batch_size=512,
                 identity_dim=0,
                 save_embeddings=False,
                 base_log_dir='../../../data/processed/graphsage/',
                 validate_iter=5000,
                 validate_batch_size=256,
                 gpu=0,
                 print_every=50,
                 max_total_steps=10**10,
                 log_device_placement=False,
                 recs=10):

        self.classifier = classifier
        self.embedding_type = embedding_type
        self.graph_type = graph_type
        self.model_checkpoint = model_checkpoint
        self.recs = recs

        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

        self.graphsage_model = UnsupervisedModel(
            train_prefix, model_name, model_size, learning_rate, epochs,
            dropout, weight_decay, max_degree, samples_1, samples_2, dim_1,
            dim_2, random_context, neg_sample_size, batch_size, identity_dim,
            save_embeddings, base_log_dir, validate_iter, validate_batch_size,
            gpu, print_every, max_total_steps, log_device_placement)
        self.preprocessor = Processor(self.embedding_type, self.graph_type,
                                      gpu)

        self.classifier_file = os.path.join(
            self.graphsage_model._log_dir(),
            self.classifier.__class__.__name__ + ".pkl")

        if not self._load_training_graph():
            print("The training graph does not exist.")

        if not self._load_training_walks():
            print("The walks do not exist.")
Example #2
def main(args):
    # global parser, args, dev_triplets, test_triplets, we_wrapper, data_handler, model
    parser = argparse.ArgumentParser(description='Train word2vec model.')
    parser.add_argument('dev_file', help='dev input file')
    parser.add_argument('test_file', help='test input file')
    parser.add_argument('we_file', help='word embeddings normed model file')
    # parser.add_argument('output_folder', help='path to the output folder')
    parser.add_argument(
        'org_we_file',
        help='path to the original we model file - before adjectives clustering'
    )
    parser.add_argument('-s',
                        '--supervised',
                        default=False,
                        action='store_true',
                        help='train and evaluate also the supervised model')
    args = parser.parse_args(args)
    dev_triplets = read_HeiPLAS_data(args.dev_file)
    test_triplets = read_HeiPLAS_data(args.test_file)
    # load pre-trained, normalized word2vec
    we_wrapper = MultiSenseWE(args.org_we_file, args.we_file)
    we_wrapper.set_model()
    data_handler = DataHandler(we_wrapper)
    data_handler.run(dev_triplets, test_triplets)
    if args.supervised:
        model = SupervisedModel(data_handler)
        model.run()
    model = UnsupervisedModel(data_handler)
    model.run()
    logger.info("Done!!!!!")
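
A minimal, hypothetical invocation sketch for the main() above; the script name and file paths are placeholders, not taken from the original project.

if __name__ == "__main__":
    import sys
    # e.g.: python train_heiplas.py dev.txt test.txt we_normed.bin we_orig.bin -s
    main(sys.argv[1:])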
Example #3
    def pretrain(self, train_set, validation_set=None):
        """Perform Unsupervised pretraining of the autoencoder."""
        self.do_pretrain = True

        def set_params_func(rbmmachine, rbmgraph):
            params = rbmmachine.get_model_parameters(graph=rbmgraph)
            self.encoding_w_.append(params['W'])
            self.encoding_b_.append(params['bh_'])

        return UnsupervisedModel.pretrain_procedure(
            self,
            self.rbms,
            self.rbm_graphs,
            set_params_func=set_params_func,
            train_set=train_set,
            validation_set=validation_set)
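
A hedged usage sketch for the pretrain() method above; model is assumed to be an instance of the surrounding autoencoder class, and trX / vlX are assumed NumPy arrays of training and validation examples.

# Hypothetical call: greedy layer-wise RBM pretraining before fine-tuning.
model.pretrain(train_set=trX, validation_set=vlX)
# Afterwards, encoding_w_ and encoding_b_ hold one W and bh_ per RBM layer.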
Example #4
    def __init__(self,
                 num_hidden,
                 visible_unit_type='bin',
                 main_dir='rbm/',
                 models_dir='models/',
                 data_dir='data/',
                 summary_dir='logs/',
                 model_name='rbm',
                 dataset='mnist',
                 loss_func='mean_squared',
                 l2reg=5e-4,
                 regtype='none',
                 gibbs_sampling_steps=1,
                 learning_rate=0.01,
                 batch_size=10,
                 num_epochs=10,
                 stddev=0.1,
                 D=[],
                 verbose=0):
        """Constructor.

        :param num_hidden: number of hidden units
        :param loss_func: type of loss function
        :param visible_unit_type: type of the visible units (bin, gauss or rsm)
        :param gibbs_sampling_steps: optional, default 1
        :param stddev: default 0.1. Ignored if visible_unit_type is not 'gauss'
        :param D: default []. Optional documents dimensions array. Used only if
            visible_unit_type is 'rsm'
        :param verbose: level of verbosity. optional, default 0
        """
        UnsupervisedModel.__init__(self, model_name, main_dir, models_dir,
                                   data_dir, summary_dir)

        self._initialize_training_parameters(loss_func=loss_func,
                                             learning_rate=learning_rate,
                                             num_epochs=num_epochs,
                                             batch_size=batch_size,
                                             dataset=dataset,
                                             regtype=regtype,
                                             l2reg=l2reg)

        self.num_hidden = num_hidden
        self.visible_unit_type = visible_unit_type
        self.gibbs_sampling_steps = gibbs_sampling_steps
        self.stddev = stddev
        self.D = D
        self.verbose = verbose

        self.W = None
        self.bh_ = None
        self.bv_ = None

        self.w_upd8 = None
        self.bh_upd8 = None
        self.bv_upd8 = None

        self.cost = None

        self.input_data = None
        self.hrand = None
        self.vrand = None
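
A hedged instantiation sketch for the RBM constructor above; the class name RBM and all numeric values are illustrative, not taken from the original project.

# Gaussian-visible RBM with 250 hidden units and 3 Gibbs steps per update.
rbm_model = RBM(num_hidden=250,
                visible_unit_type='gauss',
                gibbs_sampling_steps=3,
                learning_rate=0.01,
                batch_size=64,
                num_epochs=15,
                stddev=0.1,
                verbose=1)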
Example #5
if __name__ == "__main__":

    data = pd.read_csv("../tp2_training_dataset.csv",
                       header=None)  #.to_numpy()
    config = yaml.load(open("./config.yml"), Loader=yaml.FullLoader)

    label = data[0].to_numpy()
    dataset = data.drop(columns=[0]).to_numpy()

    model = UnsupervisedModel(
        dataset,
        dataset.shape[-1],
        config["output"],
        error=0.001,
        # error=config["output"],
        max_epochs=config["max_epochs"],
        lr=float(config["lr"]),
        algorithm=config["algorithm"],
        normalize=True,
        normal_params=(config["normal_params"]["mean"],
                       config["normal_params"]["var"]))

    train = True
    model_name = config["model_name"] + "_" + config["algorithm"]
    for f in glob.glob("*.npy"):
        if model_name + ".npy" == f:
            train = False
            break

    if train or config["force_train"]:
        print(model)
Example #6
    def __init__(self,
                 embedding_type,
                 graph_type,
                 model_checkpoint,
                 train_prefix,
                 model_name,
                 model_size="small",
                 learning_rate=0.00001,
                 epochs=10,
                 dropout=0.0,
                 weight_decay=0.0,
                 max_degree=100,
                 samples_1=25,
                 samples_2=10,
                 dim_1=128,
                 dim_2=128,
                 random_context=True,
                 neg_sample_size=20,
                 batch_size=512,
                 identity_dim=0,
                 save_embeddings=False,
                 base_log_dir='../../../data/processed/graphsage/',
                 validate_iter=5000,
                 validate_batch_size=256,
                 gpu=0,
                 print_every=50,
                 max_total_steps=10**10,
                 log_device_placement=False,
                 recs=10):

        self.embedding_type = embedding_type
        self.graph_type = graph_type
        self.model_checkpoint = model_checkpoint
        self.recs = recs
        self.gpu = gpu

        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(self.gpu)

        self.graphsage_model = UnsupervisedModel(
            train_prefix, model_name, model_size, learning_rate, epochs,
            dropout, weight_decay, max_degree, samples_1, samples_2, dim_1,
            dim_2, random_context, neg_sample_size, batch_size, identity_dim,
            save_embeddings, base_log_dir, validate_iter, validate_batch_size,
            gpu, print_every, max_total_steps, log_device_placement)
        self.preprocessor = Processor(self.embedding_type, "citations",
                                      self.gpu)

        # Prepare the training data
        d_train = DataLoader()
        self.df_train = d_train.training_data_with_abstracts_citations().data

        print("Loading the training embeddings...")
        if not self._load_train_embeddings():
            print("The pretrained embeddings are missing.")
        else:
            print("Loaded.")

        training_ids = list(self.df_train.chapter)
        self.training_embeddings = self.pretrained_embeddings[[
            self.pretrained_embeddings_id_map[id] for id in training_ids
        ]]
        self.sim = Similarities(self.training_embeddings, training_ids)

        print("Loading training graph...")
        if not self._load_training_graph():
            print("The training graph does not exist.")
        else:
            print("Loaded.")

        print("Loading training walks...")
        if not self._load_training_walks():
            print("The walks do not exist.")
        else:
            print("Loaded.")
Example #7
class GraphSAGENeighbourModel(AbstractModel):
    def __init__(self,
                 embedding_type,
                 graph_type,
                 model_checkpoint,
                 train_prefix,
                 model_name,
                 model_size="small",
                 learning_rate=0.00001,
                 epochs=10,
                 dropout=0.0,
                 weight_decay=0.0,
                 max_degree=100,
                 samples_1=25,
                 samples_2=10,
                 dim_1=128,
                 dim_2=128,
                 random_context=True,
                 neg_sample_size=20,
                 batch_size=512,
                 identity_dim=0,
                 save_embeddings=False,
                 base_log_dir='../../../data/processed/graphsage/',
                 validate_iter=5000,
                 validate_batch_size=256,
                 gpu=0,
                 print_every=50,
                 max_total_steps=10**10,
                 log_device_placement=False,
                 recs=10):

        self.embedding_type = embedding_type
        self.graph_type = graph_type
        self.model_checkpoint = model_checkpoint
        self.recs = recs
        self.gpu = gpu

        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(self.gpu)

        self.graphsage_model = UnsupervisedModel(
            train_prefix, model_name, model_size, learning_rate, epochs,
            dropout, weight_decay, max_degree, samples_1, samples_2, dim_1,
            dim_2, random_context, neg_sample_size, batch_size, identity_dim,
            save_embeddings, base_log_dir, validate_iter, validate_batch_size,
            gpu, print_every, max_total_steps, log_device_placement)
        self.preprocessor = Processor(self.embedding_type, "citations",
                                      self.gpu)

        # Prepare the training data
        d_train = DataLoader()
        self.df_train = d_train.training_data_with_abstracts_citations().data

        print("Loading the training embeddings...")
        if not self._load_train_embeddings():
            print("The pretrained embeddings are missing.")
        else:
            print("Loaded.")

        training_ids = list(self.df_train.chapter)
        self.training_embeddings = self.pretrained_embeddings[[
            self.pretrained_embeddings_id_map[id] for id in training_ids
        ]]
        self.sim = Similarities(self.training_embeddings, training_ids)

        print("Loading training graph...")
        if not self._load_training_graph():
            print("The training graph does not exist.")
        else:
            print("Loaded.")

        print("Loading training walks...")
        if not self._load_training_walks():
            print("The walks do not exist.")
        else:
            print("Loaded.")

    def query_single(self, query):
        """Queries the model and returns a list of recommendations.

        Args:
            query (list): The query as needed by the model, in the form
                [chapter_title, chapter_abstract, list(chapter_citations)].

        Returns:
            list: ids of the conferences
            double: confidence scores
        """
        if len(query) < 3:
            raise ValueError("The input does not contain enough data; " +
                             "chapter title chapter abstract, and chapter " +
                             "citations are required.")
        # Generate an ID for the query
        query_id = "new_node_id:" + "-".join(
            [str(i) for i in random.sample(range(0, 10000), 5)])
        return self.query_batch([(query_id, query[0], query[1], query[2])])

    def query_batch(self, batch):
        """Queries the model and returns a lis of recommendations.

        Args:
            batch (list of ntuples): The list of queries as needed
            by the model. The ntuples are in the form (chapter_id,
            chapter_title, chapter_abstract, list(chapter_citations)).

        Returns
            list: ids of the conferences
            double: confidence scores
        """

        df_test = pd.DataFrame(batch,
                               columns=[
                                   "chapter", "chapter_title",
                                   "chapter_abstract", "chapter_citations"
                               ])

        # Preprocess the data
        graph, features, id_map = self.preprocessor.test_data(
            df_test, self.G_train)

        # Infer embeddings
        test_nodes, test_embeddings = self.graphsage_model.predict(
            [graph, features, id_map, self.walks], self.model_checkpoint)

        # Obtain the most similar neighbours
        similarities = []
        with tqdm(desc="Computing similarities",
                  total=len(test_embeddings)) as pbar:
            for vector in test_embeddings:
                similarities.append(
                    self.sim.similar_by_vector(vector, topn=self.recs * 10))
                pbar.update(1)

        # Map similar papers to conferences
        conferenceseries = []
        confidences = []
        with tqdm(desc="Computing conference predicitons.",
                  total=len(similarities)) as pbar:
            for similarity in similarities:
                conferences = set()
                scores = []
                for idx in range(len(similarity)):
                    conferences_length = len(conferences)
                    if conferences_length < self.recs:
                        conferences.add(
                            list(self.df_train[self.df_train.chapter ==
                                               similarity[idx]
                                               [0]].conferenceseries)[0])
                        if len(conferences) != conferences_length:
                            scores.append(similarity[idx][1])
                conferenceseries.append(list(conferences))
                confidences.append(scores)
                pbar.update(1)

        results = [conferenceseries, confidences]
        return results

    def train(self):
        pass

    def _load_train_embeddings(self):
        embeddings_file = os.path.join(self.graphsage_model._log_dir(),
                                       "embeddings.npy")
        embeddings_ids_file = os.path.join(self.graphsage_model._log_dir(),
                                           "embeddings_ids.txt")
        if os.path.isfile(embeddings_file) and os.path.isfile(
                embeddings_ids_file):
            self.pretrained_embeddings = np.load(embeddings_file)
            self.pretrained_embeddings_id_map = {}
            with open(embeddings_ids_file) as f:
                for i, line in enumerate(f):
                    self.pretrained_embeddings_id_map[line.strip()] = i
            return True
        return False

    def _load_training_graph(self):
        graph_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                  "..", "..", "..", "data", "interim",
                                  "graphsage", self.embedding_type,
                                  self.graph_type, "train_val-G.json")
        if os.path.isfile(graph_file):
            with open(graph_file) as f:
                self.G_train = json_graph.node_link_graph(json.load(f))
            return True
        return False

    def _load_training_walks(self):
        walks_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                  "..", "..", "..", "data", "interim",
                                  "graphsage", self.embedding_type,
                                  self.graph_type, "train_val-walks.txt")
        self.walks = []
        if isinstance(list(self.G_train.nodes)[0], int):
            conversion = lambda n: int(n)
        else:
            conversion = lambda n: n

        if os.path.isfile(walks_file):
            with open(walks_file) as f:
                for line in f:
                    # Materialize the map so each walk is a reusable list
                    # under Python 3.
                    self.walks.append(list(map(conversion, line.split())))
            return True
        return False
class GraphSAGEClassifierModel(AbstractModel):
    def __init__(self,
                 classifier,
                 embedding_type,
                 graph_type,
                 model_checkpoint,
                 train_prefix,
                 model_name,
                 model_size="small",
                 learning_rate=0.00001,
                 epochs=10,
                 dropout=0.0,
                 weight_decay=0.0,
                 max_degree=100,
                 samples_1=25,
                 samples_2=10,
                 dim_1=128,
                 dim_2=128,
                 random_context=True,
                 neg_sample_size=20,
                 batch_size=512,
                 identity_dim=0,
                 save_embeddings=False,
                 base_log_dir='../../../data/processed/graphsage/',
                 validate_iter=5000,
                 validate_batch_size=256,
                 gpu=0,
                 print_every=50,
                 max_total_steps=10**10,
                 log_device_placement=False,
                 recs=10):

        self.classifier = classifier
        self.embedding_type = embedding_type
        self.graph_type = graph_type
        self.model_checkpoint = model_checkpoint
        self.recs = recs

        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

        self.graphsage_model = UnsupervisedModel(
            train_prefix, model_name, model_size, learning_rate, epochs,
            dropout, weight_decay, max_degree, samples_1, samples_2, dim_1,
            dim_2, random_context, neg_sample_size, batch_size, identity_dim,
            save_embeddings, base_log_dir, validate_iter, validate_batch_size,
            gpu, print_every, max_total_steps, log_device_placement)
        self.preprocessor = Processor(self.embedding_type, self.graph_type,
                                      gpu)

        self.classifier_file = os.path.join(
            self.graphsage_model._log_dir(),
            self.classifier.__class__.__name__ + ".pkl")

        if not self._load_training_graph():
            print("The training graph does not exist.")

        if not self._load_training_walks():
            print("The walks do not exist.")

    def query_single(self, query):
        """Queries the model and returns a list of recommendations.

        Args:
            query (list): The query as needed by the model, in the form
                [chapter_title, chapter_abstract, list(chapter_citations)].

        Returns:
            list: ids of the conferences
            double: confidence scores
        """
        # Generate an ID for the query
        query_id = "new_node_id:" + "-".join(
            [str(i) for i in random.sample(range(0, 10000), 5)])

        if self.graph_type == "citations":
            if len(query) < 3:
                raise ValueError("The input does not contain enough data; " +
                                 "chapter  title chapter abstract, and " +
                                 "chapter citations are required.")
            return self.query_batch([(query_id, query[0], query[1], query[2])])
        elif self.graph_type == "authors":
            if len(query) < 4:
                raise ValueError(
                    "The input does not contain enough data; " +
                    "chapter title chapter abstract, chapter " +
                    "citations, and chapter authors are required.")
            authors_df = pd.DataFrame({
                "author_name": query[3],
                "chapter": [query_id] * len(query[3])
            })
            # Pass the rows and the authors frame together as a single batch,
            # matching the "authors" branch of query_batch below.
            return self.query_batch(
                ([(query_id, query[0], query[1], query[2])], authors_df))
        else:
            raise ValueError("Graph type not recognised.")

    def query_batch(self, batch):
        """Queries the model and returns a lis of recommendations.

        Args:
            batch (list of ntuples): The list of queries as needed
            by the model. The ntuples are in the form (chapter_id,
            chapter_title, chapter_abstract, list(chapter_citations)).

        Returns
            list: ids of the conferences
            double: confidence scores
        """
        if self.graph_type == "citations":
            df_test = pd.DataFrame(batch,
                                   columns=[
                                       "chapter", "chapter_title",
                                       "chapter_abstract", "chapter_citations"
                                   ])

            # Preprocess the data
            graph, features, id_map = self.preprocessor.test_data(
                df_test, self.G_train)
        elif self.graph_type == "authors":
            df_test = pd.DataFrame(batch[0],
                                   columns=[
                                       "chapter", "chapter_title",
                                       "chapter_abstract", "chapter_citations"
                                   ])
            authors_df = batch[1]
            # Preprocess the data
            graph, features, id_map = self.preprocessor.test_data(
                df_test, self.G_train, authors_df=authors_df)
        else:
            raise ValueError("Graph type not recognised.")

        # Infer embeddings
        test_embeddings = self.graphsage_model.predict(
            [graph, features, id_map, self.walks], self.model_checkpoint)[1]

        # Compute predictions
        predictions = self.classifier.predict_proba(test_embeddings)
        sorted_predictions = np.argsort(-np.array(predictions))

        conferenceseries = list()
        confidences = list()

        for index, order in enumerate(sorted_predictions):
            conferences = list()
            scores = list()
            i = 0
            while len(conferences) < self.recs:
                conf = self.label_encoder.inverse_transform([order[i]
                                                             ]).tolist()[0]
                if conf not in conferences:
                    conferences.append(conf)
                    scores.append(predictions[index][order][i])
                i += 1
            conferenceseries.append(conferences)
            confidences.append(scores)

        results = [conferenceseries, confidences]
        return results

    def train(self, data):
        if not self._load_model_classifier():
            print("Classifier not trained yet. Training now...")
            timer = Timer()
            timer.tic()

            print("Loading the training embeddings...")
            if not self._load_train_embeddings():
                print("The pretrained embeddings are missing.")
            else:
                print("Loaded.")
            training_ids = list(data.chapter)
            training_embeddings = self.pretrained_embeddings[[
                self.pretrained_embeddings_id_map[id] for id in training_ids
            ]]

            self.label_encoder = LabelEncoder()
            self.labels = self.label_encoder.fit_transform(
                data.conferenceseries)
            self.classifier.fit(training_embeddings, self.labels)
            self._save_model_classifier()

            print("Training finished.")
            timer.toc()

    def _load_training_graph(self):
        graph_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                  "..", "..", "..", "data", "interim",
                                  "graphsage", self.embedding_type,
                                  self.graph_type, "train_val-G.json")
        if os.path.isfile(graph_file):
            print("Loading training graph...")
            with open(graph_file) as f:
                self.G_train = json_graph.node_link_graph(json.load(f))
            print("Loaded.")
            return True
        return False

    def _load_training_walks(self):
        walks_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                  "..", "..", "..", "data", "interim",
                                  "graphsage", self.embedding_type,
                                  self.graph_type, "train_val-walks.txt")
        self.walks = []
        if isinstance(list(self.G_train.nodes)[0], int):
            conversion = lambda n: int(n)
        else:
            conversion = lambda n: n

        if os.path.isfile(walks_file):
            print("Loading training walks...")
            with open(walks_file) as f:
                for line in f:
                    # Materialize the map so each walk is a reusable list
                    # under Python 3.
                    self.walks.append(list(map(conversion, line.split())))
            print("Loaded.")
            return True
        return False

    def _load_train_embeddings(self):
        embeddings_file = os.path.join(self.graphsage_model._log_dir(),
                                       "embeddings.npy")
        embeddings_ids_file = os.path.join(self.graphsage_model._log_dir(),
                                           "embeddings_ids.txt")
        if os.path.isfile(embeddings_file) and os.path.isfile(
                embeddings_ids_file):
            self.pretrained_embeddings = np.load(embeddings_file)
            self.pretrained_embeddings_id_map = {}
            with open(embeddings_ids_file) as f:
                for i, line in enumerate(f):
                    self.pretrained_embeddings_id_map[line.strip()] = i
            return True
        return False

    def _load_model_classifier(self):
        if os.path.isfile(self.classifier_file):
            print("Loading classifier...")
            with open(self.classifier_file, "rb") as f:
                self.label_encoder, self.labels, self.classifier = pickle.load(
                    f)
                print("Loaded.")
                return True
        return False

    def _save_model_classifier(self):
        with open(self.classifier_file, "wb") as f:
            pickle.dump([self.label_encoder, self.labels, self.classifier],
                        f,
                        protocol=4)

    def _has_persistent_model(self):
        if os.path.isfile(self.classifier_file):
            return True
        return False
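
A hedged usage sketch for the GraphSAGE recommendation models above; the classifier, checkpoint name, and query contents are placeholders, and df_train is assumed to be a DataFrame with chapter and conferenceseries columns as produced by the project's DataLoader.

from sklearn.neighbors import KNeighborsClassifier

# Hypothetical instantiation of the classifier-based model.
model = GraphSAGEClassifierModel(KNeighborsClassifier(n_neighbors=30),
                                 "AVG_2L", "citations",
                                 model_checkpoint="model_epoch_9.ckpt",
                                 train_prefix="train_val",
                                 model_name="graphsage_mean")
model.train(df_train)  # trains (or loads) the conference classifier

# A single query: [chapter_title, chapter_abstract, list(chapter_citations)].
conferences, confidences = model.query_single(
    ["A Chapter Title",
     "The chapter abstract text.",
     ["citation_id_1", "citation_id_2"]])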
Example #9
    def __init__(self,
                 layers,
                 model_name='srbm',
                 main_dir='srbm/',
                 models_dir='models/',
                 data_dir='data/',
                 summary_dir='logs/',
                 num_epochs=[10],
                 batch_size=[10],
                 dataset='mnist',
                 learning_rate=[0.01],
                 gibbs_k=[1],
                 loss_func=['mean_squared'],
                 momentum=0.5,
                 finetune_dropout=1,
                 verbose=1,
                 finetune_loss_func='cross_entropy',
                 finetune_enc_act_func=[tf.nn.relu],
                 finetune_dec_act_func=[tf.nn.sigmoid],
                 finetune_opt='gradient_descent',
                 finetune_learning_rate=0.001,
                 l2reg=5e-4,
                 finetune_num_epochs=10,
                 noise=['gauss'],
                 stddev=0.1,
                 finetune_batch_size=20,
                 do_pretrain=False,
                 tied_weights=False,
                 regtype=['none'],
                 finetune_reg_type='none'):
        """Constructor.

        :param layers: list containing the hidden units for each layer
        :param finetune_loss_func: Loss function for the softmax layer.
            string, default 'cross_entropy'.
            Possible values: ['cross_entropy', 'mean_squared']
        :param finetune_dropout: dropout parameter
        :param finetune_learning_rate: learning rate for the finetuning.
            float, default 0.001
        :param finetune_enc_act_func: activation function for the encoder
            finetuning phase
        :param finetune_dec_act_func: activation function for the decoder
            finetuning phase
        :param finetune_opt: optimizer for the finetuning phase
        :param finetune_num_epochs: Number of epochs for the finetuning.
            int, default 10
        :param finetune_batch_size: Size of each mini-batch for the finetuning.
            int, default 20
        :param verbose: Level of verbosity. 0 - silent, 1 - print accuracy.
            int, default 1
        :param do_pretrain: True: uses variables from pretraining,
            False: initialize new variables.
        """
        # WARNING! This must be the first expression in the function or else it
        # will send other variables to expanded_args()
        # This function takes all the passed parameters that are lists and
        # expands them to the number of layers, if the number
        # of layers is more than the list of the parameter.
        expanded_args = utilities.expand_args(**locals())

        UnsupervisedModel.__init__(self, model_name, main_dir, models_dir,
                                   data_dir, summary_dir)

        self._initialize_training_parameters(
            loss_func=finetune_loss_func,
            learning_rate=finetune_learning_rate,
            regtype=finetune_reg_type,
            num_epochs=finetune_num_epochs,
            batch_size=finetune_batch_size,
            l2reg=l2reg,
            dropout=finetune_dropout,
            dataset=dataset,
            opt=finetune_opt,
            momentum=momentum)

        self.do_pretrain = do_pretrain
        self.layers = layers
        self.tied_weights = tied_weights
        self.verbose = verbose

        self.finetune_enc_act_func = expanded_args['finetune_enc_act_func']
        self.finetune_dec_act_func = expanded_args['finetune_dec_act_func']

        self.input_ref = None

        # Model parameters
        self.encoding_w_ = []  # list of matrices of encoding weights per layer
        self.encoding_b_ = []  # list of arrays of encoding biases per layer

        self.decoding_w = []  # list of matrices of decoding weights per layer
        self.decoding_b = []  # list of arrays of decoding biases per layer

        self.reconstruction = None
        self.rbms = []
        self.rbm_graphs = []

        for l, layer in enumerate(layers):
            rbm_str = 'rbm-' + str(l + 1)
            new_rbm = rbm.RBM(model_name=self.model_name + '-' + rbm_str,
                              loss_func=expanded_args['loss_func'][l],
                              models_dir=os.path.join(self.models_dir,
                                                      rbm_str),
                              data_dir=os.path.join(self.data_dir, rbm_str),
                              summary_dir=os.path.join(self.tf_summary_dir,
                                                       rbm_str),
                              visible_unit_type=expanded_args['noise'][l],
                              stddev=stddev,
                              num_hidden=expanded_args['layers'][l],
                              main_dir=self.main_dir,
                              learning_rate=expanded_args['learning_rate'][l],
                              gibbs_sampling_steps=expanded_args['gibbs_k'][l],
                              num_epochs=expanded_args['num_epochs'][l],
                              batch_size=expanded_args['batch_size'][l],
                              verbose=self.verbose,
                              regtype=expanded_args['regtype'][l])
            self.rbms.append(new_rbm)
            self.rbm_graphs.append(tf.Graph())
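
A hedged instantiation sketch for the stacked-RBM autoencoder constructor above; the class name DeepAutoencoder is assumed, and the per-layer lists illustrate how list-valued parameters are expanded to one value per layer.

# Three RBM layers; list parameters shorter than the number of layers are
# expanded by expand_args to one entry per layer.
srbm = DeepAutoencoder(layers=[512, 256, 128],
                       num_epochs=[10, 10, 5],
                       learning_rate=[0.01],
                       gibbs_k=[1],
                       noise=['gauss', 'bin', 'bin'],
                       finetune_num_epochs=20,
                       do_pretrain=True)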
Example #10
    def __init__(self,
                 n_components,
                 name='dae',
                 loss_func='mse',
                 enc_act_func=tf.nn.tanh,
                 dec_act_func=None,
                 num_epochs=10,
                 batch_size=10,
                 opt='sgd',
                 learning_rate=0.01,
                 momentum=0.9,
                 corr_type='none',
                 corr_frac=0.,
                 regtype='none',
                 regcoef=5e-4):
        """Constructor.

        Parameters
        ----------

        n_components : int
            Number of hidden units.

        name : str, optional (default = "dae")
            Model name (used for save/load from disk).

        loss_func : str, optional (default = "mse")
            Loss function. ['mse', 'cross_entropy']

        enc_act_func : tf.nn.[activation]
            Activation function for the encoder.

        dec_act_func : tf.nn.[activation]
            Activation function for the decoder.

        num_epochs : int, optional (default = 10)
            Number of epochs.

        batch_size : int, optional (default = 10)
            Size of each mini-batch.

        opt : str, optional (default = "sgd")
            Which tensorflow optimizer to use.
            Possible values: ['sgd', 'momentum', 'adagrad', 'adam']

        learning_rate : float, optional (default = 0.01)
            Initial learning rate.

        momentum : float, optional (default = 0.9)
            Momentum parameter (only used if opt = "momentum").

        corr_type : str, optional (default = "none")
            Type of input corruption.
            Can be one of: ["none", "masking", "salt_and_pepper"]

        corr_frac : float, optional (default = 0.0)
            Fraction of the input to corrupt.

        regtype : str, optional (default = "none")
            Type of regularization to apply.
            Can be one of: ["none", "l1", "l2"].

        regcoef : float, optional (default = 5e-4)
            Regularization parameter. If 0, no regularization.
            Only considered if regtype != "none".
        """
        print("correct dae file")
        UnsupervisedModel.__init__(self, name)

        self.loss_func = loss_func
        self.learning_rate = learning_rate
        self.opt = opt
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.momentum = momentum
        self.regtype = regtype
        self.regcoef = regcoef

        self.loss = Loss(self.loss_func)
        self.trainer = Trainer(opt,
                               learning_rate=learning_rate,
                               momentum=momentum)

        self.n_components = n_components
        self.enc_act_func = enc_act_func
        self.dec_act_func = dec_act_func
        self.corr_type = corr_type
        self.corr_frac = corr_frac

        self.input_data_orig = None
        self.input_data = None

        self.W_ = None
        self.bh_ = None
        self.bv_ = None
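
A hedged instantiation sketch for the denoising autoencoder constructor above; the class name DenoisingAutoencoder is assumed and the values are illustrative.

import tensorflow as tf

# Masking-noise autoencoder with 256 hidden units trained with Adam.
dae = DenoisingAutoencoder(n_components=256,
                           name='dae_mnist',
                           loss_func='cross_entropy',
                           enc_act_func=tf.nn.sigmoid,
                           dec_act_func=tf.nn.sigmoid,
                           corr_type='masking',
                           corr_frac=0.3,
                           opt='adam',
                           learning_rate=0.001,
                           num_epochs=25)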
Example #11
    def main():
        parser = argparse.ArgumentParser(
            description='Arguments for unsupervised GraphSAGE model.')
        parser.add_argument('train_prefix',
                            help='Name of the object file that stores the ' +
                            'training data.')
        parser.add_argument("model_name",
                            choices=[
                                "graphsage_mean", "gcn", "graphsage_seq",
                                "graphsage_maxpool", "graphsage_meanpool"
                            ],
                            help="Model names.")
        parser.add_argument('--model_size',
                            choices=["small", "big"],
                            default="small",
                            help="Can be big or small; model specific def'ns")
        parser.add_argument('--learning_rate',
                            type=float,
                            default=0.00001,
                            help='Initial learning rate.')
        parser.add_argument('--epochs',
                            type=int,
                            default=10,
                            help='Number of epochs to train.')
        parser.add_argument('--dropout',
                            type=float,
                            default=0.0,
                            help='Dropout rate (1 - keep probability).')
        parser.add_argument('--weight_decay',
                            type=float,
                            default=0.0,
                            help='Weight for l2 loss on embedding matrix.')
        parser.add_argument('--max_degree',
                            type=int,
                            default=100,
                            help='Maximum node degree.')
        parser.add_argument('--samples_1',
                            type=int,
                            default=25,
                            help='Number of samples in layer 1.')
        parser.add_argument('--samples_2',
                            type=int,
                            default=10,
                            help='Number of samples in layer 2.')
        parser.add_argument('--dim_1',
                            type=int,
                            default=128,
                            help='Size of output dim ' +
                            '(final is 2x this, if using concat)')
        parser.add_argument('--dim_2',
                            type=int,
                            default=128,
                            help='Size of output dim ' +
                            '(final is 2x this, if using concat)')
        parser.add_argument('--random_context',
                            action="store_false",
                            default=True,
                            help='Whether to use random context or direct ' +
                            'edges.')
        parser.add_argument('--neg_sample_size',
                            type=int,
                            default=20,
                            help='Number of negative samples.')
        parser.add_argument('--batch_size',
                            type=int,
                            default=512,
                            help='Minibatch size.')
        parser.add_argument('--identity_dim',
                            type=int,
                            default=0,
                            help='Set to positive value to use identity ' +
                            'embedding features of that dimension.')
        parser.add_argument('--save_embeddings',
                            action="store_false",
                            default=True,
                            help='Whether to save embeddings for all nodes ' +
                            'after training')
        parser.add_argument('--base_log_dir',
                            default='../../../data/processed/graphsage/',
                            help='Base directory for logging and saving ' +
                            'embeddings')
        parser.add_argument('--validate_iter',
                            type=int,
                            default=5000,
                            help='How often to run a validation minibatch.')
        parser.add_argument('--validate_batch_size',
                            type=int,
                            default=256,
                            help='How many nodes per validation sample.')
        parser.add_argument('--gpu',
                            type=int,
                            default=0,
                            help='Which gpu to use.')
        parser.add_argument('--print_every',
                            type=int,
                            default=50,
                            help='How often to print training info.')
        parser.add_argument('--max_total_steps',
                            type=int,
                            default=10**10,
                            help='Maximum total number of iterations.')
        parser.add_argument('--log_device_placement',
                            action="store_true",
                            default=False,
                            help='Whether to log device placement.')
        args = parser.parse_args()

        print("Starting...")
        print("Loading training data..")
        train_data = load_data(args.train_prefix, load_walks=True)
        print("Done loading training data..\n")
        from unsupervised_model import UnsupervisedModel
        model = UnsupervisedModel(
            args.train_prefix, args.model_name, args.model_size,
            args.learning_rate, args.epochs, args.dropout, args.weight_decay,
            args.max_degree, args.samples_1, args.samples_2, args.dim_1,
            args.dim_2, args.random_context, args.neg_sample_size,
            args.batch_size, args.identity_dim, args.save_embeddings,
            args.base_log_dir, args.validate_iter, args.validate_batch_size,
            args.gpu, args.print_every, args.max_total_steps,
            args.log_device_placement)
        model.train(train_data)
        print("Finished.")
    def main():
        parser = argparse.ArgumentParser(
            description='Arguments for GraphSAGE concatenated ' +
            'classifier model evaluation.')
        parser.add_argument(
            "classifier_name",
            choices=["KNN", "MLP", "MultinomialLogisticRegression"],
            help="The name of the classifier.")
        parser.add_argument('embedding_type',
                            choices=[
                                "AVG_L", "AVG_2L", "AVG_SUM_L4", "AVG_SUM_ALL",
                                "MAX_2L", "CONC_AVG_MAX_2L",
                                "CONC_AVG_MAX_SUM_L4", "SUM_L", "SUM_2L"
                            ],
                            help="Type of embedding.")
        parser.add_argument('model_checkpoint_citations',
                            help='Name of the GraphSAGE model checkpoint ' +
                            'for the citations graph.')
        parser.add_argument('model_checkpoint_authors',
                            help='Name of the GraphSAGE model checkpoint ' +
                            'for the authors graph.')
        parser.add_argument('train_prefix_citations',
                            help='Name of the object file that stores the ' +
                            'citations training data.')
        parser.add_argument('train_prefix_authors',
                            help='Name of the object file that stores the ' +
                            'authors training data.')
        parser.add_argument('model_name',
                            choices=[
                                "graphsage_mean", "gcn", "graphsage_seq",
                                "graphsage_maxpool", "graphsage_meanpool"
                            ],
                            help="Model names.")
        parser.add_argument('--model_size',
                            choices=["small", "big"],
                            default="small",
                            help="Can be big or small; model specific def'ns")
        parser.add_argument('--learning_rate',
                            type=float,
                            default=0.00001,
                            help='Initial learning rate.')
        parser.add_argument('--epochs',
                            type=int,
                            default=10,
                            help='Number of epochs to train.')
        parser.add_argument('--dropout',
                            type=float,
                            default=0.0,
                            help='Dropout rate (1 - keep probability).')
        parser.add_argument('--weight_decay',
                            type=float,
                            default=0.0,
                            help='Weight for l2 loss on embedding matrix.')
        parser.add_argument('--max_degree',
                            type=int,
                            default=100,
                            help='Maximum node degree.')
        parser.add_argument('--samples_1',
                            type=int,
                            default=25,
                            help='Number of samples in layer 1.')
        parser.add_argument('--samples_2',
                            type=int,
                            default=10,
                            help='Number of samples in layer 2.')
        parser.add_argument('--dim_1',
                            type=int,
                            default=128,
                            help='Size of output dim ' +
                            '(final is 2x this, if using concat)')
        parser.add_argument('--dim_2',
                            type=int,
                            default=128,
                            help='Size of output dim ' +
                            '(final is 2x this, if using concat)')
        parser.add_argument('--random_context',
                            action="store_false",
                            default=True,
                            help='Whether to use random context or direct ' +
                            'edges.')
        parser.add_argument('--neg_sample_size',
                            type=int,
                            default=20,
                            help='Number of negative samples.')
        parser.add_argument('--batch_size',
                            type=int,
                            default=512,
                            help='Minibatch size.')
        parser.add_argument('--identity_dim',
                            type=int,
                            default=0,
                            help='Set to positive value to use identity ' +
                            'embedding features of that dimension.')
        parser.add_argument('--save_embeddings',
                            action="store_true",
                            default=False,
                            help='Whether to save embeddings for all nodes ' +
                            'after training')
        parser.add_argument('--base_log_dir',
                            default='../../../data/processed/graphsage/',
                            help='Base directory for logging and saving ' +
                            'embeddings')
        parser.add_argument('--validate_iter',
                            type=int,
                            default=5000,
                            help='How often to run a validation minibatch.')
        parser.add_argument('--validate_batch_size',
                            type=int,
                            default=256,
                            help='How many nodes per validation sample.')
        parser.add_argument('--gpu',
                            type=int,
                            default=0,
                            help='Which gpu to use.')
        parser.add_argument('--print_every',
                            type=int,
                            default=50,
                            help='How often to print training info.')
        parser.add_argument('--max_total_steps',
                            type=int,
                            default=10**10,
                            help='Maximum total number of iterations.')
        parser.add_argument('--log_device_placement',
                            action="store_true",
                            default=False,
                            help='Whether to log device placement.')
        parser.add_argument('--recs',
                            type=int,
                            default=10,
                            help='Number of recommendations.')
        args = parser.parse_args()

        print("Starting evaluation...")
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)
        print("Using GPU {}.".format(str(args.gpu)))

        from GraphSAGEClassifierConcatEvaluation import GraphSAGEClassifierConcatEvaluation
        evaluation_model = GraphSAGEClassifierConcatEvaluation(
            args.classifier_name, args.embedding_type, args.model_name,
            args.model_size, args.learning_rate, args.gpu, args.recs)

        # Initialize GraphSAGE models
        graphsage_model_citations = UnsupervisedModel(
            args.train_prefix_citations, args.model_name, args.model_size,
            args.learning_rate, args.epochs, args.dropout, args.weight_decay,
            args.max_degree, args.samples_1, args.samples_2, args.dim_1,
            args.dim_2, args.random_context, args.neg_sample_size,
            args.batch_size, args.identity_dim, args.save_embeddings,
            args.base_log_dir, args.validate_iter, args.validate_batch_size,
            args.gpu, args.print_every, args.max_total_steps,
            args.log_device_placement)
        graphsage_model_authors = UnsupervisedModel(
            args.train_prefix_authors, args.model_name, args.model_size,
            args.learning_rate, args.epochs, args.dropout, args.weight_decay,
            args.max_degree, args.samples_1, args.samples_2, args.dim_1,
            args.dim_2, args.random_context, args.neg_sample_size,
            args.batch_size, args.identity_dim, args.save_embeddings,
            args.base_log_dir, args.validate_iter, args.validate_batch_size,
            args.gpu, args.print_every, args.max_total_steps,
            args.log_device_placement)

        # Train model if needed:
        if not evaluation_model._has_persistent_model():
            print("Classifier not trained yet. Training now...")
            timer = Timer()
            timer.tic()
            evaluation_model.train(graphsage_model_citations,
                                   graphsage_model_authors)
            print("Training finished.")
            timer.toc()
        else:
            evaluation_model._load_model_classifier()

        # Load test data
        print("Loading test data...")
        query_test, query_test_authors, truth = evaluation_model.load_data()
        print("Loaded.")

        # Infer embeddings
        print("Inferring embeddings for citations graph.")
        queue_citations = mp.Queue()
        process_citations = mp.Process(
            target=evaluation_model.infer_embeddings,
            args=(query_test, None, "citations", graphsage_model_citations,
                  args.model_checkpoint_citations, queue_citations))
        process_citations.start()
        embeddings_citations = queue_citations.get()
        process_citations.join()
        process_citations.terminate()

        print("Inferring embeddings for authors graphs.")
        queue_authors = mp.Queue()
        process_authors = mp.Process(target=evaluation_model.infer_embeddings,
                                     args=(query_test, query_test_authors,
                                           "authors", graphsage_model_authors,
                                           args.model_checkpoint_authors,
                                           queue_authors))
        process_authors.start()
        embeddings_authors = queue_authors.get()
        process_authors.join()
        process_authors.terminate()

        # Concatenate embeddings
        test_embeddings = np.concatenate(
            (embeddings_citations, embeddings_authors), axis=1)

        print("Computing predictions...")
        recommendation = evaluation_model.compute_predictions(test_embeddings)
        print("Predictions computed.")

        # Evaluate
        print("Evaluating...")
        evaluation = EvaluationContainer()
        evaluation.evaluate(recommendation, truth)
        print("Finished.")