def main(args):
    '''
    Entry point: dump sentences for the corpus graphs and train the skipgram model.

    Steps performed, in order:
    1. verify that the corpus path exists and collect its '.gexf' file names
    2. call wlk_relabel_and_dump_memory_version on those files with max_h=args.wlk_h
    3. call train_skipgram with the hyper-parameters read from args

    :param args: option object providing corpus, output_dir, batch_size, epochs,
                 embedding_size, num_negsample, learning_rate, wlk_h,
                 label_filed_name and class_labels_file_name
    :return: None
    :raises AssertionError: when the corpus path does not exist
    '''
    # Read every option up front, so a missing attribute fails before any work starts.
    corpus_path = args.corpus
    out_path = args.output_dir
    n_batch = args.batch_size
    n_epochs = args.epochs
    emb_dim = args.embedding_size
    n_negatives = args.num_negsample
    lr = args.learning_rate
    wl_depth = args.wlk_h
    node_label_attr = args.label_filed_name
    labels_fname = args.class_labels_file_name

    # Extension tag handed to train_skipgram, e.g. 'g2v2' for wlk_h == 2.
    sentence_extn = 'g2v' + str(wl_depth)

    # Only the corpus location is validated here; the output directory is not checked.
    assert os.path.exists(corpus_path), "File {} does not exist".format(corpus_path)

    gexf_paths = get_files(dirname=corpus_path, extn='.gexf', max_files=0)
    logging.info('Loaded {} graph file names form {}'.format(len(gexf_paths), corpus_path))

    # Stage 1: relabel the graphs and dump the resulting sentences (timed).
    started = time()
    wlk_relabel_and_dump_memory_version(
        gexf_paths,
        max_h=wl_depth,
        node_label_attr_name=node_label_attr,
    )
    elapsed = time() - started
    logging.info('dumped sg2vec sentences in {} sec.'.format(elapsed))

    # Stage 2: train the skipgram model on the dumped sentences.
    t0 = time()
    embedding_fname = train_skipgram(
        corpus_path, sentence_extn, lr, emb_dim, n_negatives,
        n_epochs, n_batch, out_path, labels_fname,
    )
# Example #2
# 0
def main(args):
    '''
    Entry point: dump subgraph2vec sentences, train skipgram, then classify.

    Steps performed, in order:
    1. verify that the corpus path exists and create the output directory if missing
    2. run dump_subgraph2vec_sentences over every '.gexf' file using joblib
       with n_jobs=args.n_cpus
    3. call train_skipgram with the hyper-parameters read from args
    4. call perform_classification with the embedding file name returned by
       train_skipgram and the class labels file name

    :param args: option object providing corpus, output_dir, batch_size, epochs,
                 embedding_size, num_negsample, learning_rate, valid_size, n_cpus,
                 wlk_h, label_filed_name and class_labels_file_name
    :return: None
    :raises AssertionError: when the corpus path does not exist
    '''
    # Read every option up front, so a missing attribute fails before any work starts.
    corpus_path = args.corpus
    out_path = args.output_dir
    n_batch = args.batch_size
    n_epochs = args.epochs
    emb_dim = args.embedding_size
    n_negatives = args.num_negsample
    lr = args.learning_rate
    n_valid = args.valid_size
    n_workers = args.n_cpus
    wl_depth = args.wlk_h
    node_label_attr = args.label_filed_name
    labels_fname = args.class_labels_file_name

    # Extension tag handed to the training and classification steps, e.g. 'WL2'.
    sentence_extn = 'WL' + str(wl_depth)

    assert os.path.exists(corpus_path), "File {} does not exist".format(corpus_path)
    if not os.path.exists(out_path):
        os.makedirs(out_path)

    gexf_paths = get_files(dirname=corpus_path, extn='.gexf', max_files=0)
    logging.info('Loaded {} graph file names form {}'.format(len(gexf_paths), corpus_path))

    # Stage 1: one dump task per graph file, dispatched through joblib.
    # A plain serial loop over gexf_paths calling dump_subgraph2vec_sentences
    # is the drop-in alternative.
    started = time.time()
    pool = Parallel(n_jobs=n_workers)
    dump_tasks = (
        delayed(dump_subgraph2vec_sentences)(path, wl_depth, node_label_attr)
        for path in gexf_paths
    )
    pool(dump_tasks)
    dump_secs = round(time.time() - started)
    logging.info('Dumped subgraph2vec sentences for all {} graphs in {} in {} sec'.format(
        len(gexf_paths), corpus_path, dump_secs))

    # Stage 2: train the skipgram model on the dumped sentences.
    started = time.time()
    embedding_fname = train_skipgram(
        corpus_path, sentence_extn, lr, emb_dim, n_negatives,
        n_epochs, n_batch, out_path, n_valid,
    )
    train_secs = round(time.time() - started, 2)
    logging.info('Trained the skipgram model in {} sec.'.format(train_secs))

    # Stage 3: classification using the embedding file produced by stage 2.
    perform_classification(corpus_path, sentence_extn, embedding_fname, labels_fname)
                             max_h=wlk_h,
                             node_label_attr_name=label_field_name)
        print("Generated Graph Document Corpus in %s seconds" %
              (round(time() - t0, 2)))

        #############################################
        #### Neural Language Model Training
        #############################################
        print("SKIPGRAM LEARNING PHASE")
        # train the skipgram architecture
        t0 = time()
        embedding_fname = train_skipgram(corpus_dir,
                                         wl_extension,
                                         learning_rate,
                                         embedding_size,
                                         num_negsample,
                                         epochs,
                                         batch_size,
                                         wlk_h,
                                         output_dir,
                                         min_count=minCount)
        print("Trained the skipgram model in %s seconds" %
              (round(time() - t0, 2)))
        print("SKIPGRAM LEARNING DONE")

        #############################################
        #### Classification Phase on Learned Embeddings
        #############################################
        print("DOING classification")
        # perform single classification and evaluation
        # classify_scores = perform_classification(corpus_dir, wl_extension, embedding_fname, class_labels_fname)
        # acc, prec, recall, f_score = classify_scores