def rare_words():
    """Partition the corpus vocabulary by the configured embedding min_count.

    Loads the vocabulary of the default embedding corpus and splits it into
    words that occur fewer than ``min_count`` times (no embedding will be
    trained for them) and words at or above the threshold.

    Returns:
        (rare, frequent): two sorted lists of words, below / at-or-above
        the ``min_count`` threshold respectively.
    """
    c = config.default_config
    vocab = load_vocabulary(corpuspath(c.embedding.corpus))
    min_count = c.embedding.min_count
    # Single pass over the vocabulary instead of two comprehensions;
    # also avoids shadowing this function's own name with a local.
    rare, frequent = [], []
    for word, count in vocab.items():
        (rare if count < min_count else frequent).append(word)
    rare.sort()
    frequent.sort()
    print("Total number of words:", len(vocab))
    print("Number of words with embedding:", len(frequent))
    print("Number of words without embeddings:", len(rare))
    return rare, frequent
def create_embedding(embeddingconfig):
    """Train and persist a word embedding described by *embeddingconfig*.

    Resolves the corpus and target embedding paths from the config, and
    returns early (no retraining) if the embedding file already exists.
    The actual training is delegated to ``create_word_embedding``.
    """
    makedirs('data/embeddings', exist_ok=True)
    corpuspath = config.corpuspath(embeddingconfig.corpus)
    embeddingpath = config.embeddingpath(embeddingconfig)
    if path.isfile(embeddingpath):
        # Embedding already built; nothing to do.
        return
    # Idiomatic call formatting: inside parentheses no backslash
    # continuations are needed, and trailing commas are conventional.
    create_word_embedding(
        infile=corpuspath,
        outfile=embeddingpath,
        size=embeddingconfig.dimension,
        estimator=embeddingconfig.estimator,
        negative=embeddingconfig.negative,
        downsample=embeddingconfig.downsampling,
        min_count=embeddingconfig.min_count,
    )
def create_grams(gramconfig):
    """Extract skip-grams from a gzipped corpus into a shuffled, compressed gram file.

    Resolves corpus and target gram paths from the config and returns early
    if the gram file already exists. Otherwise writes one complete gram per
    line to an uncompressed temp file, shuffles it in place with ``shuf``,
    and compresses the result.
    """
    makedirs('data/grams', exist_ok=True)
    corpuspath = config.corpuspath(gramconfig.corpus)
    grampath = config.grampath(gramconfig)
    if path.isfile(grampath):
        # Gram file already built; nothing to do.
        return
    # Write grams uncompressed first so the external shuffle can operate on
    # plain text; the final compress step produces the target file.
    uncompressed_grampath = str(Path(grampath).parent / Path(grampath).stem)
    c = gramconfig  # hoisted: was rebound on every line of the corpus
    with gzip.open(corpuspath, mode='rt') as inf:
        with open(uncompressed_grampath, mode='wt') as outf:  # may overwrite
            for line in inf:
                for gram, skip in iter_grams(line, c.gram_size, c.skipwords,
                                             c.skippos, c.filter_skips):
                    # Re-insert the skipped words at their original position
                    # inside the gram before writing it out.
                    completegram = ' '.join(
                        chain(gram[:c.skippos], skip, gram[c.skippos:]))
                    print(completegram, file=outf)
    # check=True: fail loudly if shuf is missing or errors instead of
    # silently compressing an unshuffled file.
    subprocess.run(['/usr/bin/shuf', '-o', uncompressed_grampath,
                    uncompressed_grampath], check=True)
    compress(uncompressed_grampath)