Example #1
import gzip
from os import makedirs, path
from sys import stderr

from numpy import zeros
from scipy.spatial import distance
from sklearn.manifold import TSNE

# Project-local names (WordEmbedding, embeddingpath, grampath, word_to_label,
# create_word_embedding, config, default_embeddingconfig, spelling_changes,
# tense_changes) are assumed to be imported from the surrounding package.


def iter_XY(config):
    """Yield (gram vector, label) pairs for one (embedding, gram, run) config."""
    (embeddingconfig, gramconfig, runconfig) = config

    embedding = WordEmbedding(embeddingpath(embeddingconfig))

    # Unless unknown grams are filtered out, embed them as the zero vector.
    unknown_embedding = None
    if not runconfig.filter_unknown:
        unknown_embedding = zeros(embedding.values.shape[1])

    def embed(gram):
        return embedding.embed(gram, unknown_embedding)

    with gzip.open(grampath(gramconfig), mode='rt') as f:
        for line in f:
            line = line.split()
            if len(line) == gramconfig.gram_size:
                # No skip word in this line: label it with the empty word.
                gram = line
                lbl = word_to_label(gramconfig.skipwords, '')
            elif len(line) != gramconfig.gram_size + 1:
                print('Skip length > 1 not supported. Dropping gram:', *line, file=stderr)
                continue
            else:
                # Cut the skip word out of the gram and use it as the label.
                gram = line[:gramconfig.skippos] + line[gramconfig.skippos + 1:]
                skip = line[gramconfig.skippos]
                lbl = word_to_label(gramconfig.skipwords, skip)

            try:
                gramvec = embed(gram)
                yield gramvec, lbl
            except KeyError:
                # A gram word is missing from the embedding and unknown words
                # are filtered: drop the gram.
                pass
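
# A hypothetical usage sketch (not part of the original module): materialize
# the (gram vector, label) stream into dense arrays, e.g. for a scikit-learn
# style classifier. `collect_XY` is an assumed name.
from numpy import array

def collect_XY(config):
    pairs = list(iter_XY(config))
    X = array([vec for vec, _ in pairs])
    y = [lbl for _, lbl in pairs]
    return X, y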


def main():
    embedding = WordEmbedding(embeddingpath(default_embeddingconfig))

    # Print each old spelling next to its nearest neighbours in the embedding.
    for old, new in spelling_changes:
        print(old, '--', new)
        print(embedding.nearest_words([old]))
        print()

    print()
    # Offset from past to present tense, estimated from the first pair
    # ('war', 'ist' -- German for 'was', 'is').
    war, ist = tense_changes[0]
    tensediff = embedding[ist] - embedding[war]
    for past, present in tense_changes[1:]:
        print(past, '+ tensediff:', *embedding.nearest_words([embedding[past] + tensediff]))
        print('Should be:', present)
        print()

    # word_diffs = [embedding[new] - embedding[old] for (old, new) in word_changes]

    spelling_diffs = [embedding[new] - embedding[old] for (old, new) in spelling_changes[10:20]]
    tense_diffs = [embedding[present] - embedding[past] for (past, present) in tense_changes]

    def metric(u, v):
        # Cosine distance, clamped at zero: floating-point error can make
        # scipy's value slightly negative, which TSNE rejects.
        return max(distance.cosine(u, v), 0)

    # t-SNE with the custom metric can fail sporadically; retry until a run
    # succeeds, but surface the error instead of swallowing it silently.
    while True:
        try:
            model = TSNE(n_components=2, metric=metric)
            reduced = model.fit_transform(spelling_diffs + tense_diffs)
            print(reduced)
            return
        except Exception as e:
            print('TSNE failed, retrying:', e, file=stderr)
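
# For reference, a minimal sketch of a cosine nearest-word lookup like the
# one `embedding.nearest_words` presumably performs. The flat
# (vectors, words) layout is an assumption about WordEmbedding's internals.
from numpy import argsort
from numpy.linalg import norm

def nearest_words_sketch(vectors, words, query, k=3):
    # Cosine similarity of every vocabulary vector against the query vector.
    sims = vectors @ query / (norm(vectors, axis=1) * norm(query) + 1e-12)
    # Indices of the k most similar vectors, most similar first.
    return [words[i] for i in argsort(-sims)[:k]]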


def create_embedding(embeddingconfig):
    makedirs('data/embeddings', exist_ok=True)

    corpuspath = config.corpuspath(embeddingconfig.corpus)
    embeddingpath = config.embeddingpath(embeddingconfig)

    # The embedding was already created in an earlier run.
    if path.isfile(embeddingpath):
        return

    create_word_embedding(infile=corpuspath,
                          outfile=embeddingpath,
                          size=embeddingconfig.dimension,
                          estimator=embeddingconfig.estimator,
                          negative=embeddingconfig.negative,
                          downsample=embeddingconfig.downsampling,
                          min_count=embeddingconfig.min_count)
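
# Hypothetical usage: the namedtuple and its field values are assumptions
# for illustration; the real embedding config comes from the project's
# config module.
from collections import namedtuple

EmbeddingConfig = namedtuple(
    'EmbeddingConfig',
    'corpus dimension estimator negative downsampling min_count')

if __name__ == '__main__':
    create_embedding(EmbeddingConfig(corpus='news2007',
                                     dimension=300,
                                     estimator='skipgram',
                                     negative=5,
                                     downsampling=1e-4,
                                     min_count=5))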