def test_loading_unordered_feature_lists(tmpdir):
    """Round-trip vectors whose features are listed in different orders.

    The three entries below describe the same vector; only the order in
    which their features are listed differs. After writing to TSV and
    reading back, the columns must match and each entry's vector must be
    unchanged.
    """
    # three identical vectors, features given in different orders
    feature_lists = {
        'a/N': [('f1', 1), ('f2', 2), ('f3', 3)],
        'b/N': [('f3', 3), ('f1', 1), ('f2', 2)],
        'c/N': [('f3', 3), ('f2', 2), ('f1', 1)],
    }
    original = Vectors(feature_lists)

    tsv_path = str(tmpdir.join('outfile.txt'))
    original.to_tsv(tsv_path)
    reloaded = original.from_tsv(tsv_path)

    # rows can be in any order, but columns need to be sorted
    assert original.columns == reloaded.columns

    for entry in feature_lists:
        expected = original.get_vector(entry).A
        actual = reloaded.get_vector(entry).A
        assert_array_equal(expected, actual)
def write_gensim_vectors_to_tsv(model, output_path, vocab=None):
    """Wrap the vectors of a gensim model in ``Vectors`` and write them out.

    Every word's vector is converted to a list of ``(feature_name, value)``
    pairs, with feature names ``f00``, ``f01``, ... (one per dimension).
    The resulting ``Vectors`` object is written with ``Vectors.to_tsv`` and
    then returned.

    :param model: object supporting ``model[word]`` lookups; when `vocab`
        is not supplied it must also expose ``model.vocab`` with a
        ``keys()`` method listing the words
    :param output_path: destination passed straight to ``Vectors.to_tsv``
    :param vocab: iterable of words to export; if falsy (``None`` or
        empty), all of ``model.vocab.keys()`` is used
    :return: the ``Vectors`` object that was written
    :raises StopIteration: if there are no words to export
    """
    if not vocab:
        vocab = model.vocab.keys()
    # Materialise once: a one-shot iterator would otherwise lose its first
    # word to the dimensionality probe below and never export it.
    words = list(vocab)

    dims = len(model[next(iter(words))])  # vector dimensionality
    dimension_names = ['f%02d' % i for i in range(dims)]

    # NOTE: per the original author, words that are not valid
    # DocumentFeatures (or that contain non-ascii characters) break to_tsv.
    # Filtering them out here was disabled because it assumed every word
    # carries a PoS tag, so all words are currently passed through.
    #
    # zip() is wrapped in list() so each value is a reusable sequence of
    # pairs rather than a single-pass iterator on Python 3.
    features_by_word = {
        word: list(zip(dimension_names, model[word])) for word in words
    }

    vectors = Vectors(features_by_word)
    vectors.to_tsv(
        output_path,
        gzipped=True,
        enforce_word_entry_pos_format=True,
        entry_filter=lambda _: True,
        dense_hd5=True,
    )
    return vectors