import gzip
from os import makedirs, path
from sys import stderr

from numpy import zeros
from scipy.spatial import distance
from sklearn.manifold import TSNE

# Project-local names (WordEmbedding, embeddingpath, grampath, word_to_label,
# create_word_embedding, config, default_embeddingconfig, spelling_changes,
# tense_changes) are assumed to be imported from the package's own modules.


def iter_XY(config):
    """Yield a (gram vector, label) pair for every usable line of the gram file."""
    (embeddingconfig, gramconfig, runconfig) = config
    embedding = WordEmbedding(embeddingpath(embeddingconfig))

    # If unknown words are not filtered out, embed them as the zero vector.
    unknown_embedding = None
    if not runconfig.filter_unknown:
        unknown_embedding = zeros(embedding.values.shape[1])

    def embed(gram):
        return embedding.embed(gram, unknown_embedding)

    with gzip.open(grampath(gramconfig), mode='rt') as f:
        for line in f:
            line = line.split()
            if len(line) == gramconfig.gram_size:
                # No skip word on this line; label the gram with the empty string.
                gram = line
                lbl = word_to_label(gramconfig.skipwords, '')
            elif len(line) != gramconfig.gram_size + 1:
                print('Skip length > 1 not supported. Dropping Gram:', *line,
                      file=stderr)
                continue
            else:
                # Exactly one skip word: cut it out of the gram and label by it.
                gram = line[:gramconfig.skippos] + line[gramconfig.skippos + 1:]
                skip = line[gramconfig.skippos]
                lbl = word_to_label(gramconfig.skipwords, skip)
            try:
                gramvec = embed(gram)
                yield gramvec, lbl
            except KeyError:
                # filter_unknown is set and the gram contains an unknown word.
                pass
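

# Minimal consumption sketch: iter_XY is a generator, so grams are embedded
# lazily and only materialised here. The helper name _demo_iter_XY is
# hypothetical; the config triple is the same (embeddingconfig, gramconfig,
# runconfig) tuple that iter_XY unpacks.
def _demo_iter_XY(config):
    from numpy import array
    X, Y = [], []
    for gramvec, lbl in iter_XY(config):
        X.append(gramvec)
        Y.append(lbl)
    # Stack into arrays suitable for feeding a classifier.
    return array(X), array(Y)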


def main():
    embedding = WordEmbedding(embeddingpath(default_embeddingconfig))

    # Show the nearest neighbours of each old spelling.
    for old, new in spelling_changes:
        print(old, '--', new)
        print(embedding.nearest_words([old]))
        print()
    print()

    # Derive a "tense direction" from the first pair and test it on the rest.
    war, ist = tense_changes[0]
    tensediff = embedding[ist] - embedding[war]
    for past, present in tense_changes[1:]:
        print(past, '+ tensediff:',
              *embedding.nearest_words([embedding[past] + tensediff]))
        print('Should be:', present)
        print()

    # word_diffs = [embedding[new] - embedding[old] for (old, new) in word_changes]
    spelling_diffs = [embedding[new] - embedding[old]
                      for (old, new) in spelling_changes[10:20]]
    tense_diffs = [embedding[present] - embedding[past]
                   for (past, present) in tense_changes]

    # Cosine distance can come out slightly negative through floating-point
    # error; clamp it to zero, since TSNE requires non-negative distances.
    def metric(u, v):
        return max(distance.cosine(u, v), 0)

    # TSNE can occasionally fail with a custom metric; retry until a run succeeds.
    while True:
        try:
            model = TSNE(n_components=2, metric=metric)
            reduced = model.fit_transform(spelling_diffs + tense_diffs)
            print(reduced)
            return
        except Exception:
            pass
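

# The 2-D TSNE output is presumably meant for plotting. A sketch with
# matplotlib (not used elsewhere in this module); the split at 10 rows mirrors
# the spelling_changes[10:20] slice above, and the helper name is hypothetical.
def _plot_diffs(reduced):
    import matplotlib.pyplot as plt
    n_spelling = 10  # spelling_diffs contributes the first 10 rows
    plt.scatter(reduced[:n_spelling, 0], reduced[:n_spelling, 1], label='spelling')
    plt.scatter(reduced[n_spelling:, 0], reduced[n_spelling:, 1], label='tense')
    plt.legend()
    plt.show()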


def create_embedding(embeddingconfig):
    makedirs('data/embeddings', exist_ok=True)
    corpuspath = config.corpuspath(embeddingconfig.corpus)
    embeddingpath = config.embeddingpath(embeddingconfig)
    # The embedding has already been trained; nothing to do.
    if path.isfile(embeddingpath):
        return
    create_word_embedding(infile=corpuspath,
                          outfile=embeddingpath,
                          size=embeddingconfig.dimension,
                          estimator=embeddingconfig.estimator,
                          negative=embeddingconfig.negative,
                          downsample=embeddingconfig.downsampling,
                          min_count=embeddingconfig.min_count)
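

# Sketch of a grid sweep over embedding hyperparameters. The make_config
# callable and the dimension/estimator values below are hypothetical
# stand-ins for the project's real config type; because create_embedding
# skips configs whose output file already exists, the sweep is safe to re-run.
def _sweep_embeddings(make_config):
    from itertools import product
    for dimension, estimator in product((100, 300), ('skipgram', 'cbow')):
        create_embedding(make_config(dimension=dimension, estimator=estimator))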