def test_skipgrams():
    """Smoke-test ``sequence.skipgrams`` with binary and categorical labels."""
    # Default window, binary labels: both members of every couple must be
    # one of the indices 0, 1 or 2.
    allowed = (0, 1, 2)
    pairs, _ = sequence.skipgrams(np.arange(3), vocabulary_size=3)
    assert all(pair[0] in allowed and pair[1] in allowed for pair in pairs)

    # Window of one word, categorical labels.
    pairs, label_vectors = sequence.skipgrams(
        np.arange(5),
        vocabulary_size=5,
        window_size=1,
        categorical=True,
    )
    # The first index of a couple may exceed the second by at most 3.
    assert all(pair[0] - pair[1] <= 3 for pair in pairs)
    # Every categorical label must have exactly two entries.
    assert all(len(vector) == 2 for vector in label_vectors)
def test_skipgrams():
    """Check index membership and label width of ``sequence.skipgrams`` output."""
    # --- binary labels, default window size ---
    vocabulary = (0, 1, 2)
    word_pairs, _unused_labels = sequence.skipgrams(np.arange(3),
                                                    vocabulary_size=3)
    for word_pair in word_pairs:
        # Checked one side at a time; the second lookup only happens when
        # the first one passed.
        assert word_pair[0] in vocabulary
        assert word_pair[1] in vocabulary

    # --- categorical labels, window size of one ---
    options = dict(vocabulary_size=5, window_size=1, categorical=True)
    word_pairs, label_vectors = sequence.skipgrams(np.arange(5), **options)
    for word_pair in word_pairs:
        gap = word_pair[0] - word_pair[1]
        assert gap <= 3
    for label_vector in label_vectors:
        assert len(label_vector) == 2
def generate_skipgrams(graph_filename, algorithm):
    """
    Create dataset with skipgrams from random walking.

    The graph is loaded with ``load_graph_from_csv``, random walks are
    simulated with the selected algorithm, the walks are encoded with
    ``to_integers`` and skipgram couples/labels are generated for every
    encoded walk.  The accumulated ``(couples, labels)`` tuple is pickled to
    ``<graph_filename>_<algorithm>.pickle``.

    :param graph_filename: name of file containing graph; also used as the
        prefix of the output pickle file name
    :param algorithm: algorithm for random walking. Available values:
        'node2vec', 'metapath2vec', 'multimetapath2vec'
    :raises ValueError: if ``algorithm`` is not one of the available values
    """
    g = load_graph_from_csv(graph_filename)

    # Passed straight through to each algorithm's ``simulate_walks``.
    num_walks = 10
    walk_length = 80

    if algorithm == "node2vec":
        ng = node2vec.Graph(g, is_directed=False, p=1., q=1.)
        ng.preprocess_transition_probs()
        walks = ng.simulate_walks(num_walks, walk_length)
    elif algorithm == "metapath2vec":
        walks = metapath2vec.Graph(g).simulate_walks(
            num_walks,
            walk_length,
            metapath=["JCH", "O", "NO", "O", "WO", "O", "JCH"])
    elif algorithm == "multimetapath2vec":
        walks = multimetapath2vec.Graph(g).simulate_walks(
            num_walks,
            walk_length,
            metapaths=[["JCH", "O", "NO", "O", "JCH"],
                       ["JCH", "O", "WO", "O", "JCH"]])
    else:
        # Fail loudly instead of silently writing an empty dataset for a
        # misspelled or unsupported algorithm name.
        raise ValueError(
            "Unknown algorithm %r; expected 'node2vec', 'metapath2vec' "
            "or 'multimetapath2vec'" % (algorithm,))

    print("Encoding to integers")
    walks_encoded = to_integers(g.nodes, walks)

    print("Generating skipgrams")

    # Loop-invariant, so computed once.
    # NOTE(review): the +1 presumably leaves room for a reserved index 0 --
    # confirm against to_integers.
    vocabulary_size = len(g.nodes) + 1

    all_couples = []
    all_labels = []
    for walk_encoded in walks_encoded:
        couples, labels = skipgrams(sequence=walk_encoded,
                                    vocabulary_size=vocabulary_size)
        all_couples.extend(couples)
        all_labels.extend(labels)

    print(len(all_couples))
    print(len(all_labels))

    print("Saving dataset")

    output_filename = graph_filename + "_" + algorithm + ".pickle"
    # The context manager closes (and therefore flushes) the file even if
    # pickling raises, instead of leaking the handle.
    with open(output_filename, mode='wb') as output_file:
        pickle.dump((all_couples, all_labels), output_file)
Example #4
0
    #shuffle(unpadded_x)

    #flat_list = [item for sublist in unpadded_x for item in sublist]
    flat_list = []
    for sublist in unpadded_x:
        flat_list.extend(sublist)
        flat_list.extend([0]*window_size)

    print(f'start generating skip-grams | len:{len(flat_list)}')

    #ITERATIONS = 30000

    #grams_x = []
    #grams_y = []
    #for i, doc in enumerate(unpadded_x):
    sampling_table = sequence.make_sampling_table(vocab_size)
    data, labels = skipgrams(sequence=flat_list, vocabulary_size=vocab_size, window_size=window_size,
                             negative_samples=1., sampling_table=sampling_table)
    #grams_x.extend(data)
    #grams_y.extend(labels)

    #    if i % 1000 == 0:
    #        print(f'progress: {i / ITERATIONS*100}%')

    #    if i == ITERATIONS:
    #        break

    print(f'generated {len(data)} samples')
    save_pickle(data, 'tokenized/learn/grams_x.pickle')
    save_pickle(labels, 'tokenized/learn/grams_y.pickle')