def test_skipgrams():
    """Smoke-test sequence.skipgrams(): default binary mode, then categorical."""
    # Default window size, binary labels: every generated pair must stay
    # inside the 3-word vocabulary.
    couples, labels = sequence.skipgrams(np.arange(3), vocabulary_size=3)
    valid_ids = (0, 1, 2)
    for first, second in couples:
        assert first in valid_ids and second in valid_ids

    # window_size=1 with categorical labels: each label is a 2-element one-hot.
    couples, labels = sequence.skipgrams(
        np.arange(5), vocabulary_size=5, window_size=1, categorical=True)
    for first, second in couples:
        assert first - second <= 3
    for label in labels:
        assert len(label) == 2
def test_skipgrams():
    """Exercise sequence.skipgrams with default and categorical settings.

    NOTE(review): this definition is byte-identical to an earlier
    test_skipgrams in this file; the later definition shadows the earlier
    one at import time -- confirm whether the duplicate is intentional.
    """
    # No window size given, binary labels: pairs are confined to the vocabulary.
    couples, labels = sequence.skipgrams(np.arange(3), vocabulary_size=3)
    assert all(c[0] in (0, 1, 2) and c[1] in (0, 1, 2) for c in couples)

    # window_size=1 and categorical=True: labels come back as length-2 vectors.
    couples, labels = sequence.skipgrams(
        np.arange(5), vocabulary_size=5, window_size=1, categorical=True)
    assert all(c[0] - c[1] <= 3 for c in couples)
    assert all(len(label) == 2 for label in labels)
def generate_skipgrams(graph_filename, algorithm):
    """Create and pickle a skipgram dataset built from random walks on a graph.

    The dataset is written to ``<graph_filename>_<algorithm>.pickle`` as a
    pickled ``(couples, labels)`` tuple.

    :param graph_filename: name of file containing graph (CSV, loaded via
        ``load_graph_from_csv``)
    :param algorithm: algorithm for random walking. Available values:
        'node2vec', 'metapath2vec', 'multimetapath2vec'
    :raises ValueError: if ``algorithm`` is not one of the supported values
        (previously an unknown value silently produced an empty dataset)
    """
    g = load_graph_from_csv(graph_filename)
    num_walks = 10
    walk_length = 80

    if algorithm == "node2vec":
        ng = node2vec.Graph(g, is_directed=False, p=1., q=1.)
        ng.preprocess_transition_probs()
        walks = ng.simulate_walks(num_walks, walk_length)
    elif algorithm == "metapath2vec":
        walks = metapath2vec.Graph(g).simulate_walks(
            num_walks, walk_length,
            metapath=["JCH", "O", "NO", "O", "WO", "O", "JCH"])
    elif algorithm == "multimetapath2vec":
        walks = multimetapath2vec.Graph(g).simulate_walks(
            num_walks, walk_length,
            metapaths=[["JCH", "O", "NO", "O", "JCH"],
                       ["JCH", "O", "WO", "O", "JCH"]])
    else:
        raise ValueError("Unknown algorithm: " + algorithm)

    print("Encoding to integers")
    walks_encoded = to_integers(g.nodes, walks)

    print("Generating skipgrams")
    all_couples = []
    all_labels = []
    # vocabulary_size is len(nodes) + 1 -- presumably because node ids are
    # encoded starting at 1 (0 reserved); verify against to_integers.
    for walk_encoded in walks_encoded:
        couples, labels = skipgrams(sequence=walk_encoded,
                                    vocabulary_size=len(g.nodes) + 1)
        all_couples += couples
        all_labels += labels
    print(len(all_couples))
    print(len(all_labels))

    print("Saving dataset")
    # Fix: close the output file deterministically; the original passed a
    # bare open(...) to pickle.dump and leaked the handle.
    with open(graph_filename + "_" + algorithm + ".pickle", mode='wb') as out:
        pickle.dump((all_couples, all_labels), out)
# Flatten the unpadded documents into a single token stream, then append
# window_size zeros of right padding before generating skip-gram pairs.
flat_list = [token for doc in unpadded_x for token in doc]
flat_list += [0] * window_size

print(f'start generating skip-grams | len:{len(flat_list)}')

sampling_table = sequence.make_sampling_table(vocab_size)
data, labels = skipgrams(sequence=flat_list,
                         vocabulary_size=vocab_size,
                         window_size=window_size,
                         negative_samples=1.,
                         sampling_table=sampling_table)

print(f'generated {len(data)} samples')
save_pickle(data, 'tokenized/learn/grams_x.pickle')
save_pickle(labels, 'tokenized/learn/grams_y.pickle')