Example #1
    def __init__(self):
        print("reading graphs...")
        self.n_node, self.graph = utils.read_edges(config.train_filename,
                                                   config.test_filename)
        self.root_nodes = [i for i in range(self.n_node)]

        print("reading initial embeddings...")
        self.node_embed_init_d = utils.read_embeddings(
            filename=config.pretrain_emb_filename_d,
            n_node=self.n_node,
            n_embed=config.n_emb)
        self.node_embed_init_g = utils.read_embeddings(
            filename=config.pretrain_emb_filename_g,
            n_node=self.n_node,
            n_embed=config.n_emb)

        # construct or read BFS-trees
        self.trees = None
        if os.path.isfile(config.cache_filename):
            print("reading BFS-trees from cache...")
            pickle_file = open(config.cache_filename, 'rb')
            self.trees = pickle.load(pickle_file)
            pickle_file.close()
        else:
            print("constructing BFS-trees...")
            pickle_file = open(config.cache_filename, 'wb')
            if config.multi_processing:
                self.construct_trees_with_mp(self.root_nodes)
            else:
                self.trees = self.construct_trees(self.root_nodes)
            pickle.dump(self.trees, pickle_file)
            pickle_file.close()

        print("building GAN model...")
        self.discriminator = None
        self.generator = None
        self.build_generator()
        self.build_discriminator()

        self.latest_checkpoint = tf.train.latest_checkpoint(config.model_log)
        self.saver = tf.compat.v1.train.Saver()

        self.config = tf.compat.v1.ConfigProto()
        # self.config.gpu_options.allow_growth = True
        self.init_op = tf.group(tf.compat.v1.global_variables_initializer(),
                                tf.compat.v1.local_variables_initializer())
        self.sess = tf.compat.v1.Session(config=self.config)
        self.sess.run(self.init_op)
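
Example #1 builds a BFS tree for every root node and caches the result with pickle. The helper it calls, construct_trees, is not shown; below is a minimal sketch of what it could look like, assuming self.graph maps each node id to a list of neighbours and that each tree stores, for every node, its parent followed by its children (both assumptions, not taken from the example).

import collections

def construct_trees(self, nodes):
    """Sketch only: one BFS tree per root; trees[root][node] = [parent, child_1, ...]."""
    trees = {}
    for root in nodes:
        trees[root] = {root: [root]}              # the root acts as its own parent
        visited = {root}
        queue = collections.deque([root])
        while queue:
            cur = queue.popleft()
            for neighbor in self.graph[cur]:
                if neighbor not in visited:
                    visited.add(neighbor)
                    trees[root][cur].append(neighbor)   # register child under its parent
                    trees[root][neighbor] = [cur]       # first entry is the parent
                    queue.append(neighbor)
    return trees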
Example #2
    def __init__(self):

        t = time.time()
        print "reading graph..."
        self.n_node, self.n_relation, self.graph = utils.read_graph(
            config.graph_filename)
        self.node_list = list(self.graph.keys())  # range(0, self.n_node)
        print('[%.2f] reading graph finished. #node = %d #relation = %d' % (
            time.time() - t, self.n_node, self.n_relation))

        t = time.time()
        print "read initial embeddings..."
        self.node_embed_init_d = utils.read_embeddings(
            filename=config.pretrain_node_emb_filename_d,
            n_node=self.n_node,
            n_embed=config.n_emb)
        self.node_embed_init_g = utils.read_embeddings(
            filename=config.pretrain_node_emb_filename_g,
            n_node=self.n_node,
            n_embed=config.n_emb)

        #self.rel_embed_init_d = utils.read_embeddings(filename=config.pretrain_rel_emb_filename_d,
        #                                              n_node=self.n_node,
        #                                              n_embed=config.n_emb)
        #self.rel_embed_init_g = utils.read_embeddings(filename=config.pretrain_rel_emb_filename_g,
        #                                              n_node=self.n_node,
        #                                              n_embed=config.n_emb)
        print "[%.2f] read initial embeddings finished." % (time.time() - t)

        print "build GAN model..."
        self.discriminator = None
        self.generator = None
        self.build_generator()
        self.build_discriminator()

        self.latest_checkpoint = tf.train.latest_checkpoint(config.model_log)
        self.saver = tf.train.Saver()

        self.dblp_evaluation = DBLP_evaluation()
        self.yelp_evaluation = Yelp_evaluation()
        self.aminer_evaluation = Aminer_evaluation()

        self.config = tf.ConfigProto()
        self.config.gpu_options.allow_growth = True
        self.init_op = tf.group(tf.global_variables_initializer(),
                                tf.local_variables_initializer())
        self.sess = tf.Session(config=self.config)
        self.sess.run(self.init_op)

        self.show_config()
Example #3
def build_classifier(data_path):
  print('Constructing classifier...')

  classes = sorted(get_classes_from_data(data_path))

  if len(classes) <= 1:
    print("Not building classifier since we don't have enough faces")
    return

  class_to_num = {x: i for i, x in enumerate(classes)}
  X = None
  Y = None

  for i, aclass in enumerate(classes):
    embeddings = read_embeddings(os.path.join(data_path, aclass, 'embedding.h5'))
    X = embeddings if X is None else np.concatenate((X, embeddings), axis=0)
    labels = np.repeat(class_to_num[aclass], len(embeddings))
    Y = labels if Y is None else np.concatenate((Y, labels), axis=0)

  model = svm.SVC(kernel='linear', probability=True)
  model.fit(X, Y)

  write_classifier(os.path.join(data_path, 'classifier.pickle'), model, classes)

  return (model, classes, X, Y)
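
Example #3 hands persistence off to write_classifier, whose body is not part of the snippet. One plausible minimal implementation, assuming it simply pickles the fitted SVM together with the ordered class names (an assumption, not the example's actual helper):

import pickle

def write_classifier(path, model, classes):
    # Sketch: store the fitted model and the class-name list side by side.
    with open(path, 'wb') as f:
        pickle.dump((model, classes), f)

def read_classifier(path):
    # Counterpart used at prediction time.
    with open(path, 'rb') as f:
        model, classes = pickle.load(f)
    return model, classes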
Example #4
    def __init__(self, embed_filename, test_filename, test_neg_filename, n_node, n_embed):
        self.embed_filename = embed_filename  # each line: node_id, embeddings (dim: n_embed)
        self.test_filename = test_filename  # each line: node_id1, node_id2
        self.test_neg_filename = test_neg_filename  # each line: node_id1, node_id2
        self.n_node = n_node
        self.n_embed = n_embed
        self.emd = utils.read_embeddings(embed_filename, n_node=n_node, n_embed=n_embed)
Example #5
    def load_embed(self):
        self.emb = utils.read_embeddings(self.embed_filename,
                                         n_node=self.n_node,
                                         n_embed=self.n_embed)
        epsilon = 1e-8  # ref. BIGCLAM
        threshold = math.sqrt(-math.log(1 - epsilon))  # ref. BIGCLAM
        self.emb = self.emb > threshold
        self.embed_m = sp.csr_matrix(self.emb.T, dtype=np.uint32)
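
Most of the graph examples call utils.read_embeddings(filename, n_node, n_embed), and Example #4's comments document the expected file format: one line per node, the node id followed by its n_embed-dimensional vector. A minimal reader for that format could look like the sketch below; it is an illustration, not the real utils implementation, and the random fallback for nodes missing from the file is an assumption.

import numpy as np

def read_embeddings(filename, n_node, n_embed):
    """Sketch: parse 'node_id v_1 ... v_{n_embed}' lines into an (n_node, n_embed) array."""
    embeddings = np.random.rand(n_node, n_embed)    # fallback for nodes absent from the file
    with open(filename, 'r') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) != n_embed + 1:           # skip header or malformed lines
                continue
            node_id = int(parts[0])
            embeddings[node_id] = [float(v) for v in parts[1:]]
    return embeddings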
Example #6
def build_graph(graph, embedding_size=100, embedding_path=None, token2idx=None,
                input_dropout_rate=0.25, dropout_rate=0.5, l1=None, l2=None,
                convolutional_kernels=16, filter_extensions=[3, 4, 5], fix_embeddings=False,
                max_features=100000, max_len=100, output_dim=80):
    '''
    Builds Keras Graph model that, given a query (in the form of a list of indices), returns a vector of output_dim
    non-negative weights that sum up to 1.
    The Convolutional Neural Network architecture is inspired by the following paper:
    Yoon Kim - Convolutional Neural Networks for Sentence Classification - arXiv:1408.5882v2
    '''
    regularizer = utils.get_regularizer(l1, l2)

    graph.add_input(name='input_query', input_shape=(None,), dtype='int32')

    E = None
    if embedding_path is not None:
        E = utils.read_embeddings(embedding_path, token2idx=token2idx, max_features=max_features)

    embedding_layer = Embedding(input_dim=max_features, output_dim=embedding_size, input_length=max_len, weights=E)

    if fix_embeddings is True:
        embedding_layer.params = []
        embedding_layer.updates = []

    graph.add_node(embedding_layer, name='embedding', input='input_query')

    graph.add_node(Dropout(input_dropout_rate), name='embedding_dropout', input='embedding')

    flatten_layer_names = []
    for w_size in filter_extensions:
        convolutional_layer = Convolution1D(input_dim=embedding_size, nb_filter=convolutional_kernels,
                                            filter_length=w_size, border_mode='valid', activation='relu',
                                            W_regularizer=regularizer, subsample_length=1)

        convolutional_layer_name = 'convolutional' + str(w_size)
        graph.add_node(convolutional_layer, name=convolutional_layer_name, input='embedding_dropout')

        pool_length = convolutional_layer.output_shape[1]
        pooling_layer = MaxPooling1D(pool_length=pool_length)

        pooling_layer_name = 'pooling' + str(w_size)
        graph.add_node(pooling_layer, name=pooling_layer_name, input=convolutional_layer_name)

        flatten_layer_name = 'flatten' + str(w_size)
        flatten_layer = Flatten()
        graph.add_node(flatten_layer, name=flatten_layer_name, input=pooling_layer_name)
        flatten_layer_names += [flatten_layer_name]

    graph.add_node(Dropout(dropout_rate), name='dropout', inputs=flatten_layer_names, merge_mode='concat')

    dense_layer = Dense(output_dim=output_dim, W_regularizer=regularizer)
    graph.add_node(dense_layer, name='dense', input='dropout')

    softmax_layer = Activation('softmax')
    graph.add_node(softmax_layer, name='softmax', input='dense')

    return graph
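
Example #6 is written against the long-removed Keras Graph container. For orientation, the same Kim-style architecture (embedding, parallel 1-D convolutions of several widths, max-pooling over time, concatenation, dropout, softmax) can be sketched with today's functional API roughly as follows; names and defaults are illustrative, not the original author's code, and the pretrained-embedding and regularizer options are omitted.

from tensorflow import keras
from tensorflow.keras import layers

def build_model(max_features=100000, embedding_size=100, max_len=100,
                filter_widths=(3, 4, 5), n_kernels=16,
                input_dropout_rate=0.25, dropout_rate=0.5, output_dim=80):
    inputs = keras.Input(shape=(max_len,), dtype='int32', name='input_query')
    x = layers.Embedding(max_features, embedding_size)(inputs)
    x = layers.Dropout(input_dropout_rate)(x)
    pooled = []
    for w in filter_widths:
        conv = layers.Conv1D(n_kernels, w, padding='valid', activation='relu')(x)
        pooled.append(layers.GlobalMaxPooling1D()(conv))   # max over time per filter width
    x = layers.Concatenate()(pooled)
    x = layers.Dropout(dropout_rate)(x)
    outputs = layers.Dense(output_dim, activation='softmax')(x)
    return keras.Model(inputs, outputs)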
Example #7
def main(args):
    G_train = nx.read_weighted_edgelist(args.train,
                                        nodetype=int,
                                        create_using=nx.Graph())
    G_test = nx.read_weighted_edgelist(args.test,
                                       nodetype=int,
                                       create_using=nx.Graph())
    vector = read_node_vectors(args.embedding, G_test)

    print("=====Compute AUC====")
    auc = []
    for node in tqdm(list(G_test.nodes())):
        try:
            auc.append(AUC_MR.compute(G_test, node, vector))
        except ValueError:
            continue
    auc_mean = float(sum(auc) / len(auc))

    print("=====Compute MR====")
    sequence_order = AUC_MR.result_rank(G_test, vector)
    mr = []
    for node in tqdm(G_test.nodes()):
        try:
            mr.append(AUC_MR.mean_rank(G_test, node, sequence_order))
        except ValueError:
            continue
    Mean_Rank = sum(mr) / len(mr)

    print("=====Compute ACC====")
    n_node = len(G_train.nodes())

    neg_sample_link = []
    for node in tqdm(G_test.nodes()):
        neg_sample_link.append(
            [node,
             ACC.generate_neg_link(G_test, args.negative_num, node)])
    np.savetxt("temp/negtive_link.txt",
               np.asarray(neg_sample_link),
               fmt="%s",
               newline="\n",
               delimiter="\t")

    test_edge = utils.read_edges_from_file(args.test)
    test_edge_neg = utils.read_edges_from_file("temp/negtive_link.txt")
    test_edge.extend(test_edge_neg)
    EMB, EMBMAP = utils.read_embeddings(args.embedding, n_node,
                                        args.dimensions)
    acc = ACC.eval_link_prediction(test_edge, EMB, EMBMAP)

    print("=====Show Results====")
    dataset_name = args.train.split("/")[-1].split(".")[0]
    tb = pt.PrettyTable()
    tb.field_names = ["dataset", "AUC", "MR", "ACC"]
    tb.add_row([dataset_name, auc_mean, Mean_Rank, acc])
    print(tb)
Example #8
    def __init__(self):
        t = time.time()
        print('reading graph...')
        self.graph, self.n_node, self.node_list, self.node_list_s, self.egs = utils.read_graph(config.train_file)
        self.node_emd_shape = [2, self.n_node, config.n_emb]
        print('[%.2f] reading graph finished. #node = %d' % (time.time() - t, self.n_node))

        self.dis_node_embed_init = None
        self.gen_node_embed_init = None
        if config.pretrain_dis_node_emb:
            t = time.time()
            print('reading initial embeddings...')
            self.dis_node_embed_init = np.array(
                [utils.read_embeddings(filename=x, n_node=self.n_node, n_embed=config.n_emb)
                 for x in [config.pretrain_dis_node_emb]])
            self.gen_node_embed_init = np.array(
                [utils.read_embeddings(filename=x, n_node=self.n_node, n_embed=config.n_emb)
                 for x in [config.pretrain_gen_node_emb]])
            print('[%.2f] read initial embeddings finished.' % (time.time() - t))

        print('building DGGAN model...')
        self.discriminator = None
        self.generator = None
        self.build_generator()
        self.build_discriminator()
        if config.experiment == 'link_prediction':
            self.link_prediction = evaluation.LinkPrediction(config)

        self.config = tf.ConfigProto()
        self.config.gpu_options.allow_growth = True
        self.sess = tf.Session(config = self.config)
        self.saver = tf.train.Saver(max_to_keep=0)
        if config.pretrain_ckpt:
            print('restore...')
            pretrain_ckpt = tf.train.latest_checkpoint(config.pretrain_ckpt)
            self.saver.restore(self.sess, pretrain_ckpt)
        else:
            print('initial...')
            self.init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
            self.sess.run(self.init_op)
Example #9
def main():
    if len(sys.argv) > 1:
        emb_path = sys.argv[1]
        if not os.path.exists(emb_path):
            print('Error. Embeddings file is not found')
            return
    else:
        print('Error. Specify path to embeddings file')
        return
    embeddings, words2ids = read_embeddings(emb_path)
    embeddings = normalize_embeddings(embeddings)
    print('SIMILARITY test:')
    human_vs_cos_sim_correlation('datasets/tt_similarity.csv', embeddings,
                                 words2ids)
    print('RELATEDNESS test:')
    human_vs_cos_sim_correlation('datasets/tt_relatedness.csv', embeddings,
                                 words2ids)
    print('ANALOGIES test:')
    top_k = 10
    answer_analogy_questions('datasets/tt_analogies.txt', embeddings,
                             words2ids, top_k)
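
Example #9 L2-normalizes the embeddings before the similarity, relatedness and analogy tests. Below is a small numpy sketch of that normalization and of a single analogy query ("a is to b as c is to ?"); the helper names are illustrative, not the example's own functions.

import numpy as np

def normalize_embeddings(embeddings):
    # Row-wise L2 normalization, so a dot product equals cosine similarity.
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    return embeddings / np.maximum(norms, 1e-12)

def analogy(a, b, c, embeddings, words2ids, top_k=10):
    # Rank all words by cosine similarity to the offset vector (b - a + c).
    ids = [words2ids[w] for w in (a, b, c)]
    query = embeddings[ids[1]] - embeddings[ids[0]] + embeddings[ids[2]]
    query /= np.linalg.norm(query)
    scores = embeddings @ query
    scores[ids] = -np.inf            # exclude the three query words themselves
    return np.argsort(-scores)[:top_k]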
Example #10
#graph_filename="graphs/ca-netscience.txt"
#graph_filename="graphs/test1.txt"
#embedding_filename="output_ca_netscience.txt"
graph_filename = "graphs/web-google.txt"
#graph_filename="graphs/CA-GrQc.txt"
#graph_filename="graphs/test1.txt"
#embedding_filename="output_ca_netscience.txt"
embedding_filename = "output.txt"

adj_matrix, vertex_map, edge_count = utils.generate_adj_matrix(graph_filename)
test_edges, test_edges_neg = utils.generate_edges(adj_matrix, len(vertex_map),
                                                  edge_count)
test_edges.extend(test_edges_neg)

embeddings = utils.read_embeddings(embedding_filename)

score_res = []
for i in range(len(test_edges)):
    score_res.append(
        np.dot(embeddings[test_edges[i][0]], embeddings[test_edges[i][1]]))
test_label = np.array(score_res)
bar = np.median(test_label)
ind_pos = test_label >= bar
ind_neg = test_label < bar
test_label[ind_pos] = 1
test_label[ind_neg] = 0
true_label = np.zeros(test_label.shape)
true_label[0:len(true_label) // 2] = 1

accuracy = accuracy_score(true_label, test_label)
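
Example #10 binarizes the dot-product scores at their median and then compares them with a half-positive / half-negative ground truth, so accuracy_score effectively checks whether the positive edges score above the median. Assuming the same layout (first half of test_edges positive, second half negative), a threshold-free alternative is to feed the raw scores to roc_auc_score; this is a sketch, not part of the original script.

from sklearn.metrics import roc_auc_score

# AUC over the raw dot-product scores, using the same 50/50 ground-truth labels.
auc = roc_auc_score(true_label, np.asarray(score_res))
print("link-prediction AUC: %.4f" % auc)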
Example #11
    return accuracies

# --- --- ---


if __name__ == '__main__':
    """
    Trains baseline and 2 Bi-LSTM layer models singularly for each dataset.
    """
    resources_path = parse_args().resources_path
    emb_path_1grams = resources_path + "/train/embeddings_1grams.utf8"
    emb_path_2grams = resources_path + "/train/embeddings_2grams.utf8"
    emb_path_3grams = resources_path + "/train/embeddings_3grams.utf8"

    word_to_idx_1grams, idx_to_word_1grams, emb_matrix_1grams = u.read_embeddings(emb_path_1grams)
    word_to_idx_2grams, idx_to_word_2grams, emb_matrix_2grams = u.read_embeddings(emb_path_2grams)
    word_to_idx_3grams, idx_to_word_3grams, emb_matrix_3grams = u.read_embeddings(emb_path_3grams)

    labels_to_idx, idx_to_labels = u.get_label_dictionaries()

    #grid_search(resources_path)

    # Train on AS dataset
    tf.reset_default_graph()

    train_baseline_model(train_datasets=[resources_path + "/train/as_training_simpl_input.utf8", resources_path + "/train/as_training_simpl_label.utf8"],
                         dev_datasets=["../resources/dev/as_dev_inputs.utf8", resources_path + "/dev/as_dev_labels.utf8"],
                         test_datasets=[resources_path + "/dev/as_test_inputs.utf8", resources_path + "/dev/as_test_labels.utf8"],
                         model_path=resources_path + "/base_model_as/base_model.ckpt",
                         model_ID=0)
Example #12
def predict(input_path, output_path, resources_path):
    """
    This is the skeleton of the prediction function.
    The predict function will build your model, load the weights from the checkpoint and write a new file (output_path)
    with your predictions in the BIES format.
    
    The resources folder should contain everything you need to make the predictions. It is the "resources" folder in your submission.
    
    N.B. DO NOT HARD CODE PATHS IN HERE. Use resource_path instead, otherwise we will not be able to run the code.

    :param input_path: the path of the input file to predict.
    :param output_path: the path of the output file (where you save your predictions)
    :param resources_path: the path of the resources folder containing your model and stuff you might need.
    :return: None
    """

    print("Loading embeddings...")
    emb_path_1grams = resources_path + "/train/embeddings_1grams.utf8"
    emb_path_2grams = resources_path + "/train/embeddings_2grams.utf8"

    word_to_idx_1grams, idx_to_word_1grams, emb_matrix_1grams = u.read_embeddings(
        emb_path_1grams)
    word_to_idx_2grams, idx_to_word_2grams, emb_matrix_2grams = u.read_embeddings(
        emb_path_2grams)

    labels_to_idx, idx_to_labels = u.get_label_dictionaries()

    print("Done.")

    tf.reset_default_graph()

    x_1grams, x_2grams, y, \
        keep_pr, recurrent_keep_pr, \
        lengths, train, \
        loss, preds = m.get_layered_model(pretrained_emb_1grams=emb_matrix_1grams,
                                          pretrained_emb_2grams=emb_matrix_2grams,
                                          hidden_size=96,
                                          layers=1,
                                          y_size=len(labels_to_idx),
                                          learning_rate=0.005)

    model_path = resources_path + "/2layers_model_cityu/base_model.ckpt"
    print("Loading model saved in path: %s" % model_path)

    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, model_path)

        # Truncate the output file so the append-mode writes below start from an empty file.
        with open(output_path, mode='w', encoding='utf-8') as preds_file:
            pass

        print("\nGenerating predictions...")
        predictions = []
        with open(output_path, mode='a', encoding='utf-8') as preds_file:
            for batch_inputs, \
                batch_labels, \
                batch_lengths in u.generate_batches(dataset_input=input_path,
                                                    dataset_label="",
                                                    batch_size=32,
                                                    label_to_idx=labels_to_idx,
                                                    ngram_features=[1, 2],
                                                    word_to_idx=[word_to_idx_1grams, word_to_idx_2grams],
                                                    to_shuffle=False,
                                                    testing=True):
                preds_val = sess.run(
                    [preds],
                    feed_dict={
                        x_1grams: batch_inputs[0],
                        x_2grams: batch_inputs[1],
                        lengths: batch_lengths,
                        keep_pr: 1.0,
                        recurrent_keep_pr: 1.0
                    })

                for p in preds_val[0]:
                    p = p[1:np.count_nonzero(p) - 1]
                    p = p.tolist()

                    # default to "S" if some special tag (either '-' or '<PAD>') is predicted
                    p = [
                        idx_to_labels[c] if c > 1 else idx_to_labels[5]
                        for c in p
                    ]
                    predictions.append(p)

                if len(predictions) == 128:
                    preds_file.writelines("%s\n" % ''.join(p)
                                          for p in predictions)
                    predictions = []

            if len(predictions) > 0:
                preds_file.writelines("%s\n" % ''.join(p) for p in predictions)

    print("Done.\nYour predictions have been stored in path: %s" % output_path)