# Assumed imports and constants for these snippets (the originals are not shown
# in this excerpt); the import path for AnonymousWalks in particular depends on
# the repository layout.
import math
import os
import random
import re
import time

import numpy as np
import tensorflow as tf

from AnonymousWalkKernel import AnonymousWalks  # assumption: module providing AnonymousWalks

SEED = 2018  # assumption: module-level random seed used in _init_graph

class AWE(object):
    '''
    Computes distributed Anonymous Walk Embeddings.
    '''
    def __init__(self,
                 dataset='imdb_b',
                 batch_size=128,
                 window_size=8,
                 concat=False,
                 embedding_size_w=64,
                 embedding_size_d=64,
                 loss_type='sampled_softmax',
                 num_samples=64,
                 optimize='Adagrad',
                 learning_rate=1.0,
                 root='../',
                 ext='graphml',
                 steps=7,
                 epochs=1,
                 batches_per_epoch=1,
                 candidate_func=None,
                 graph_labels=None,
                 regenerate_corpus=False,
                 neighborhood_size=1):
        '''
        Initialize AWE model.
        :param dataset: name of the dataset and corresponding name of the folder.
        :param batch_size: number of training samples (context/target pairs) per batch.
        :param window_size: number of context words.
        :param concat: Concatenate context words or not.
        :param embedding_size_w: embedding size of word
        :param embedding_size_d: embedding size of document
        :param loss_type: sampled softmax or nce
        :param num_samples: number of (negative) samples for every target word.
        :param optimize: SGD or Adagrad
        :param learning_rate: learning rate of the model
        :param root: root folder of the dataset
        :param ext: extension of files with graphs (e.g. graphml)
        :param steps: length of anonymous walk
        :param epochs: number of training epochs
        :param batches_per_epoch: number of batches per epoch for each graph
        :param candidate_func: None (loguniform by default) or uniform
        :param graph_labels: None, 'nodes', 'edges', or 'edges_nodes'
        :param regenerate_corpus: if True, rebuild the corpus files even if they already exist
        :param neighborhood_size: neighborhood size passed to write_corpus when generating the corpus
        '''

        # bind params to class
        self.batch_size = batch_size
        self.window_size = window_size
        self.concat = concat
        self.embedding_size_w = embedding_size_w
        self.embedding_size_d = embedding_size_d
        self.loss_type = loss_type
        self.num_samples = num_samples
        self.optimize = optimize
        self.learning_rate = learning_rate
        self.candidate_func = candidate_func
        self.graph_labels = graph_labels

        self.ROOT = root
        self.ext = ext
        self.steps = steps
        self.epochs = epochs
        self.dataset = dataset

        self.batches_per_epoch = batches_per_epoch

        # switch to have batches_per_epoch = N for every graph with N nodes
        self.flag2iterations = False
        if batches_per_epoch is None:
            self.flag2iterations = True

        # get all graph filenames (document size)
        self.folder = self.ROOT + self.dataset + '/'
        folder_graphs = filter(lambda g: g.endswith(self.ext),
                               os.listdir(self.folder))

        self.sorted_graphs = sorted(
            folder_graphs, key=lambda g: int(re.findall(r'\d+', g)[0]))
        self.document_size = len(self.sorted_graphs)
        print('Number of graphs: {}'.format(self.document_size))

        print('Generating corpus... ', end='')
        self.corpus_fn_name = '{}.corpus'
        self.regenerate_corpus = regenerate_corpus
        self.neiborhood_size = neighborhood_size
        start2gen = time.time()
        self.generate_corpus()
        print('Finished {}'.format(time.time() - start2gen))

        self.vocabulary_size = max(self.walk_ids.values()) + 1
        print('Number of words: {}'.format(self.vocabulary_size))

        # init all variables in a tensorflow graph
        self._init_graph()

        # create a session
        self.sess = tf.Session(graph=self.graph)

    def generate_corpus(self):
        # get all AW (vocabulary size)
        self.g2v = AnonymousWalks()
        if self.graph_labels is None:
            self.g2v._all_paths(self.steps, keep_last=True)
        elif self.graph_labels == 'nodes':
            self.g2v._all_paths_nodes(self.steps, keep_last=True)
        elif self.graph_labels == 'edges':
            self.g2v._all_paths_edges(self.steps, keep_last=True)
        elif self.graph_labels == 'edges_nodes':
            self.g2v._all_paths_edges_nodes(self.steps, keep_last=True)

        self.walk_ids = dict()
        for i, path in enumerate(self.g2v.paths[self.steps]):
            self.walk_ids[tuple(path)] = i
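        # walk_ids now maps every anonymous walk (as a tuple) to an integer "word" id;
        # the number of distinct walks defines the model's vocabulary size.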

        self.nodes_per_graphs = dict()

        label_suffix = ''
        if self.graph_labels is not None:
            label_suffix = '_' + self.graph_labels

        if self.regenerate_corpus or not os.path.exists(
                self.ROOT + self.dataset + '_corpus' + label_suffix):
            if not os.path.exists(self.ROOT + self.dataset + '_corpus' +
                                  label_suffix):
                os.mkdir(self.ROOT + self.dataset + '_corpus' + label_suffix)

            for en, graph_fn in enumerate(self.sorted_graphs):
                if en > 0 and not en % 100:
                    print(f"Graph {en}")
                g2v = AnonymousWalks()
                g2v.read_graphml(self.folder + graph_fn)
                self.nodes_per_graphs[en] = len(g2v.graph)

                g2v.write_corpus(
                    self.neiborhood_size, self.walk_ids, self.steps,
                    self.graph_labels,
                    self.ROOT + self.dataset + '_corpus{}/{}'.format(
                        label_suffix, self.corpus_fn_name.format(en)))
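
    # Illustrative sketch (not part of the original code; the name is hypothetical):
    # an anonymous walk is a walk whose nodes are relabelled by order of first
    # appearance, e.g. (a, b, a, c) -> (0, 1, 0, 2). The helper below enumerates all
    # such label sequences with `steps` edges, which is what the vocabulary built in
    # generate_corpus is assumed to enumerate when graph_labels is None.
    @staticmethod
    def _example_all_anonymous_walks(steps):
        walks = [(0,)]
        for _ in range(steps):
            extended = []
            for w in walks:
                # either revisit an already-seen label or introduce a new one,
                # but never stay on the current node
                for nxt in range(max(w) + 2):
                    if nxt != w[-1]:
                        extended.append(w + (nxt,))
            walks = extended
        return walks
    # AWE._example_all_anonymous_walks(3) yields the 5 anonymous walks with 3 edges:
    # (0,1,0,1), (0,1,0,2), (0,1,2,0), (0,1,2,1), (0,1,2,3)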

    def _init_graph(self):
        '''
        Init a tensorflow Graph containing:
        input data, variables, model, loss function, optimizer
        '''
        self.graph = tf.Graph()
        with self.graph.as_default(), tf.device('/cpu:0'):
            tf.set_random_seed(SEED)

            self.train_dataset = tf.placeholder(
                tf.int32, shape=[self.batch_size, self.window_size + 1])
            self.train_labels = tf.placeholder(tf.int32,
                                               shape=[self.batch_size, 1])
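            # Each row of train_dataset holds window_size context walk ids followed by
            # the graph (document) id in its last column; train_labels holds the id of
            # the target walk to predict, PV-DM style.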

            # embeddings for anonymous walks
            self.word_embeddings = tf.Variable(
                tf.random_uniform(
                    [self.vocabulary_size, self.embedding_size_w], -1.0, 1.0))

            # embedding for graphs
            self.doc_embeddings = tf.Variable(
                tf.random_uniform([self.document_size, self.embedding_size_d],
                                  -1.0, 1.0))

            if self.concat:  # concatenating word vectors and doc vector
                combined_embed_vector_length = self.embedding_size_w * self.window_size + self.embedding_size_d
            else:  # concatenating the average of word vectors and the doc vector
                combined_embed_vector_length = self.embedding_size_w + self.embedding_size_d

            # softmax weights, W and D vectors should be concatenated before applying softmax
            self.weights = tf.Variable(
                tf.truncated_normal(
                    [self.vocabulary_size, combined_embed_vector_length],
                    stddev=1.0 / math.sqrt(combined_embed_vector_length)))
            # softmax biases
            self.biases = tf.Variable(tf.zeros([self.vocabulary_size]))

            # collect embedding matrices, each of shape (batch_size, embedding_size)
            embed = []
            if self.concat:
                for j in range(self.window_size):
                    embed_w = tf.nn.embedding_lookup(self.word_embeddings,
                                                     self.train_dataset[:, j])
                    embed.append(embed_w)
            else:
                # averaging word vectors
                embed_w = tf.zeros([self.batch_size, self.embedding_size_w])
                for j in range(self.window_size):
                    embed_w += tf.nn.embedding_lookup(self.word_embeddings,
                                                      self.train_dataset[:, j])
                embed.append(embed_w)

            embed_d = tf.nn.embedding_lookup(
                self.doc_embeddings, self.train_dataset[:, self.window_size])
            embed.append(embed_d)
            # concat word and doc vectors
            self.embed = tf.concat(embed, 1)

            # choosing negative sampling function
            sampled_values = None  # log uniform by default
            if self.candidate_func == 'uniform':  # change to uniform
                sampled_values = tf.nn.uniform_candidate_sampler(
                    true_classes=tf.to_int64(self.train_labels),
                    num_true=1,
                    num_sampled=self.num_samples,
                    unique=True,
                    range_max=self.vocabulary_size)

            # Compute the loss, using a sample of the negative labels each time.
            loss = None
            if self.loss_type == 'sampled_softmax':
                loss = tf.nn.sampled_softmax_loss(
                    self.weights,
                    self.biases,
                    self.train_labels,
                    self.embed,
                    self.num_samples,
                    self.vocabulary_size,
                    sampled_values=sampled_values)
            elif self.loss_type == 'nce':
                loss = tf.nn.nce_loss(self.weights,
                                      self.biases,
                                      self.train_labels,
                                      self.embed,
                                      self.num_samples,
                                      self.vocabulary_size,
                                      sampled_values=sampled_values)

            self.loss = tf.reduce_mean(loss)

            # Optimizer.
            if self.optimize == 'Adagrad':
                self.optimizer = tf.train.AdagradOptimizer(
                    self.learning_rate).minimize(loss)
            elif self.optimize == 'SGD':
                self.optimizer = tf.train.GradientDescentOptimizer(
                    self.learning_rate).minimize(loss)

            # Normalize embeddings
            norm_w = tf.sqrt(
                tf.reduce_sum(tf.square(self.word_embeddings),
                              1,
                              keep_dims=True))
            self.normalized_word_embeddings = self.word_embeddings / norm_w

            norm_d = tf.sqrt(
                tf.reduce_sum(tf.square(self.doc_embeddings),
                              1,
                              keep_dims=True))
            self.normalized_doc_embeddings = self.doc_embeddings / norm_d

            self.init_op = tf.global_variables_initializer()
            self.saver = tf.train.Saver()

    def _train_thread_body(self):
        '''Train model on random anonymous walk batches.'''
        label_suffix = ''
        if self.graph_labels is not None:
            label_suffix = '_' + self.graph_labels

        while True:
            batch_data, batch_labels = self.g2v.generate_file_batch(
                self.batch_size, self.window_size, self.doc_id,
                self.ROOT + self.dataset + '_corpus{}/{}'.format(
                    label_suffix, self.corpus_fn_name.format(self.doc_id)),
                self.nodes_per_graphs[self.doc_id])
            # batch_data, batch_labels = self.g2v.generate_random_batch(batch_size=self.batch_size,
            #                                                         window_size=self.window_size,
            #                                                         steps=self.steps, walk_ids=self.walk_ids,
            #                                                         doc_id=self.doc_id,
            #                                                         graph_labels = self.graph_labels)
            feed_dict = {
                self.train_dataset: batch_data,
                self.train_labels: batch_labels
            }
            op, l = self.sess.run([self.optimizer, self.loss],
                                  feed_dict=feed_dict)
            self.sample += 1
            self.global_step += 1

            self.average_loss += l
            # The average loss is an estimate of the loss over the last 100 batches.
            # if self.global_step % 100 == 0:
            # print('Average loss at step %d: %f' % (self.global_step, self.average_loss))
            # self.average_loss = 0

            if self.sample >= self.batches_per_epoch:
                break

    def train(self):
        '''Train the model.'''
        session = self.sess

        session.run(self.init_op)

        self.average_loss = 0
        self.global_step = 0
        print('Initialized')
        random_order = list(range(len(self.sorted_graphs)))
        random.shuffle(random_order)
        for ep in range(self.epochs):
            print('Epoch: {}'.format(ep))
            time2epoch = time.time()
            for rank_id, doc_id in enumerate(random_order):
                # for doc_id, graph_fn in enumerate(self.sorted_graphs):
                #     graph_fn = self.sorted_graphs[doc_id]

                time2graph = time.time()
                self.sample = 0
                self.doc_id = doc_id
                # self.g2v.read_graphml(self.folder + graph_fn)
                # self.g2v.create_random_walk_graph()

                # print('{}-{}. Graph-{}: {} nodes'.format(ep, rank_id, doc_id, len(self.g2v.rw_graph)))
                # if self.flag2iterations == True: # take sample of N words per each graph with N nodes
                #     self.batches_per_epoch = len(self.g2v.rw_graph)

                self._train_thread_body()

                if rank_id > 0 and not rank_id % 100:
                    print('Graph {}-{}: {:.2f}'.format(
                        ep, rank_id,
                        time.time() - time2graph))
            print('Time for epoch {}: {:.2f}'.format(ep,
                                                     time.time() - time2epoch))
            # save temporary embeddings
            if not ep % 10:
                self.graph_embeddings = session.run(
                    self.normalized_doc_embeddings)
                np.savez_compressed(
                    'doc2vec_results/{}/tmp/embeddings_{}'.format(
                        self.dataset, ep),
                    E=self.graph_embeddings)

        self.graph_embeddings = session.run(self.normalized_doc_embeddings)

        return self
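
# A minimal usage sketch (an illustration, not from the original repository). It
# assumes the dataset folder '../imdb_b/' exists and contains files named like
# '0.graphml', '1.graphml', ..., as expected by the constructor above.
if __name__ == '__main__':
    awe = AWE(dataset='imdb_b', root='../', steps=7, window_size=8,
              epochs=1, batches_per_epoch=100)
    awe.train()
    # one row of normalized embeddings per graph
    print(awe.graph_embeddings.shape)  # (document_size, embedding_size_d)
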
Example #3
    def __init__(self,
                 dataset='imdb_b',
                 batch_size=128,
                 window_size=8,
                 concat=False,
                 embedding_size_w=64,
                 embedding_size_d=64,
                 loss_type='sampled_softmax',
                 num_samples=64,
                 optimize='Adagrad',
                 learning_rate=1.0,
                 root='../',
                 ext='graphml',
                 steps=7,
                 epochs=1,
                 batches_per_epoch=1,
                 candidate_func=None,
                 graph_labels=None,
                 graph_idxs=None):
        '''
        Initialize AWE model.
        :param dataset: name of the dataset and corresponding name of the folder.
        :param batch_size: number of training samples (context/target pairs) per batch.
        :param window_size: number of context words.
        :param concat: Concatenate context words or not.
        :param embedding_size_w: embedding size of word
        :param embedding_size_d: embedding size of document
        :param loss_type: sampled softmax or nce
        :param num_samples: number of (negative) samples for every target word.
        :param optimize: SGD or Adagrad
        :param learning_rate: learning rate of the model
        :param root: root folder of the dataset
        :param ext: extension of files with graphs (e.g. graphml)
        :param steps: length of anonymous walk
        :param epochs: number of training epochs
        :param batches_per_epoch: number of batches per epoch for each graph
        :param candidate_func: None (loguniform by default) or uniform
        :param graph_labels: None, 'nodes', 'edges', or 'edges_nodes'
        :param graph_idxs: optional indices of graphs to train on (all graphs by default)
        '''

        # bind params to class
        self.batch_size = batch_size
        self.window_size = window_size
        self.concat = concat
        self.embedding_size_w = embedding_size_w
        self.embedding_size_d = embedding_size_d
        self.loss_type = loss_type
        self.num_samples = num_samples
        self.optimize = optimize
        self.learning_rate = learning_rate
        self.candidate_func = candidate_func
        self.graph_labels = graph_labels

        self.ROOT = root
        self.ext = ext
        self.steps = steps
        self.epochs = epochs
        self.dataset = dataset

        self.batches_per_epoch = batches_per_epoch

        # switch to have batches_per_epoch = N for every graph with N nodes
        self.flag2iterations = False
        if batches_per_epoch is None:
            self.flag2iterations = True

        # get all graph filenames (document size)
        self.folder = self.ROOT + self.dataset + '/'
        folder_graphs = filter(lambda g: g.endswith(self.ext),
                               os.listdir(self.folder))

        self.sorted_graphs = sorted(
            folder_graphs, key=lambda g: int(re.findall(r'\d+', g)[0]))
        if graph_idxs is None:
            graph_idxs = range(len(self.sorted_graphs))
        self.sorted_graphs = np.array(self.sorted_graphs)[graph_idxs]
        self.document_size = len(self.sorted_graphs)
        print('Number of graphs: {}'.format(self.document_size))

        # get all AW (vocabulary size)
        self.g2v = AnonymousWalks()
        if self.graph_labels is None:
            self.g2v._all_paths(self.steps, keep_last=True)
        elif self.graph_labels == 'nodes':
            self.g2v._all_paths_nodes(self.steps, keep_last=True)
        elif self.graph_labels == 'edges':
            self.g2v._all_paths_edges(self.steps, keep_last=True)
        elif self.graph_labels == 'edges_nodes':
            self.g2v._all_paths_edges_nodes(self.steps, keep_last=True)

        self.walk_ids = dict()
        for i, path in enumerate(self.g2v.paths[self.steps]):
            self.walk_ids[tuple(path)] = i

        self.vocabulary_size = max(self.walk_ids.values()) + 1
        print('Number of words: {}'.format(self.vocabulary_size))

        # init all variables in a tensorflow graph
        self._init_graph()

        # create a session
        self.sess = tf.Session(graph=self.graph)
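
    # Usage sketch for this variant (illustration only; assumes the enclosing class
    # is also named AWE and trained the same way as above). graph_idxs restricts the
    # model to a subset of the graphs, e.g.:
    #
    #     awe = AWE(dataset='imdb_b', graph_idxs=range(100))
    #     awe.train()
    #
    # so document embeddings are learned only for the first 100 graphs in sorted order.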