def rare_words():
    """Split the default corpus vocabulary into rare and frequent words.

    Loads the vocabulary of the corpus named by
    ``config.default_config.embedding.corpus`` and partitions its words by
    comparing each word's count against ``embedding.min_count``. A short
    summary of the three sizes is printed to stdout.

    Returns:
        A ``(rare, frequent)`` tuple of alphabetically sorted word lists:
        ``rare`` holds the words with ``count < min_count``, ``frequent``
        holds the words with ``count >= min_count``.
    """
    c = config.default_config
    # Resolve the corpus location through ``config`` like the sibling
    # functions in this module do, rather than via a bare ``corpuspath``.
    vocab = load_vocabulary(config.corpuspath(c.embedding.corpus))
    min_count = c.embedding.min_count

    # Single pass over the vocabulary; every word lands in exactly one list.
    rare, frequent = [], []
    for word, count in vocab.items():
        (rare if count < min_count else frequent).append(word)
    rare.sort()
    frequent.sort()

    print("Total number of words:", len(vocab))
    print("Number of words with embedding:", len(frequent))
    print("Number of words without embeddings:", len(rare))
    return rare, frequent
def create_embedding(embeddingconfig):
    """Build the embedding file for ``embeddingconfig`` if it is missing.

    Ensures the ``data/embeddings`` directory exists, then derives the corpus
    and embedding file locations from ``config``. When a file is already
    present at the embedding location nothing else happens; otherwise the
    work is delegated to ``create_word_embedding``.

    Args:
        embeddingconfig: Settings object providing ``corpus``, ``dimension``,
            ``estimator``, ``negative``, ``downsampling`` and ``min_count``.
    """
    makedirs('data/embeddings', exist_ok=True)

    source = config.corpuspath(embeddingconfig.corpus)
    target = config.embeddingpath(embeddingconfig)

    # An existing file counts as an already-built embedding: skip the work.
    if not path.isfile(target):
        create_word_embedding(
            infile=source,
            outfile=target,
            size=embeddingconfig.dimension,
            estimator=embeddingconfig.estimator,
            negative=embeddingconfig.negative,
            downsample=embeddingconfig.downsampling,
            min_count=embeddingconfig.min_count,
        )
def create_grams(gramconfig):
    """Generate the shuffled, compressed gram file for ``gramconfig``.

    Ensures ``data/grams`` exists and returns immediately when a file is
    already present at ``config.grampath(gramconfig)``. Otherwise every line
    of the gzip-compressed corpus is expanded with ``iter_grams``; each
    resulting gram is written as one space-separated line with the skipped
    words re-inserted at position ``skippos``. The plain-text file is then
    shuffled in place with ``/usr/bin/shuf`` and handed to ``compress``.

    Args:
        gramconfig: Settings object providing ``corpus``, ``gram_size``,
            ``skipwords``, ``skippos`` and ``filter_skips``.

    Raises:
        subprocess.CalledProcessError: If ``shuf`` exits with a non-zero
            status. Failing here keeps an unshuffled file from being
            compressed and later mistaken for a finished result.
    """
    makedirs('data/grams', exist_ok=True)
    corpuspath = config.corpuspath(gramconfig.corpus)
    grampath = config.grampath(gramconfig)
    if path.isfile(grampath):
        return

    # Same location as grampath minus its final suffix.
    # NOTE(review): presumably grampath ends in ".gz" and compress() produces
    # exactly that file -- confirm against compress().
    target = Path(grampath)
    uncompressed_grampath = str(target.parent / target.stem)

    c = gramconfig
    with gzip.open(corpuspath, mode='rt') as inf, \
            open(uncompressed_grampath, mode='wt') as outf:  # may overwrite
        for line in inf:
            for gram, skip in iter_grams(line, c.gram_size, c.skipwords, c.skippos, c.filter_skips):
                # Splice the skipped words back in at position skippos.
                completegram = ' '.join(chain(gram[:c.skippos], skip, gram[c.skippos:]))
                print(completegram, file=outf)

    # check=True: abort instead of silently compressing an unshuffled file.
    subprocess.run(
        ['/usr/bin/shuf', '-o', uncompressed_grampath, uncompressed_grampath],
        check=True,
    )
    compress(uncompressed_grampath)