import codecs
import logging
import multiprocessing
import sys
import time
from os.path import abspath, basename, dirname, exists, join

import nltk
from gensim import corpora
from gensim.models import TfidfModel, Word2Vec

import druid          # project-local: DRUID multiword expression dictionary
import wikidump2text  # project-local: Wikipedia XML dump to plain-text converter


def train_w2v(data_directory, corpus_path, wiki_text_output_path, word2vec_output_path,
              w2v_dim, multiwords=True, druid_cutoff_score=0.4):
    start_time = time.time()

    # Convert the Wikipedia XML dump into .txt format, unless already done
    if not exists(wiki_text_output_path):
        logger.info('Converting ' + str(corpus_path) + ' into plain text file: ' + wiki_text_output_path)
        wikidump2text.convert(corpus_path, wiki_text_output_path)

    # Load multiword expressions as a dictionary
    stopwords_path = join(data_directory, 'stopwords_en.txt')
    if multiwords:
        logger.info('Using druid_en.bz2 in ' + data_directory + ' as multiword dictionary.')
        druid_path = join(data_directory, 'druid_en.bz2')
        druid_dict = druid.DruidDictionary(druid_path, stopwords_path, cutoff_score=druid_cutoff_score)
        logger.info('Loaded DRUID with cutoff ' + str(druid_cutoff_score))
        # Train the word2vec model on DRUID multiword tokens
        sentences = MySentences(wiki_text_output_path, druid_dict, multiwords=True)  # a memory-friendly iterator
    else:
        logger.info('Using no multiword dictionary, just single words.')
        sentences = MySentences(wiki_text_output_path, None, multiwords=False)

    # bigram_transformer = Phrases(sentences)
    # logger.info("Finished transforming bigrams. Time needed: " + str(time.time() - start_time))

    logger.info("Starting model training, will save to: " + word2vec_output_path)
    model = Word2Vec(sentences, size=w2v_dim, window=5, min_count=5,
                     workers=multiprocessing.cpu_count())

    # Trim unneeded model memory = use (much) less RAM
    model.init_sims(replace=True)

    logger.info("Saving to the following path: " + word2vec_output_path)
    model.save(word2vec_output_path, ignore=[])
    logger.info("Finished building Word2Vec model. Time needed: " + str(time.time() - start_time))
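
# For reference, a minimal sketch of the kind of memory-friendly sentence
# iterator gensim expects above. This is a hypothetical stand-in, not the
# project's MySentences (defined elsewhere), which additionally rewrites
# tokens via the DRUID multiword dictionary: the idea is to stream one
# tokenized sentence per line so Word2Vec can make several passes over a
# corpus that does not fit in RAM.
class PlainLineSentences(object):
    def __init__(self, filename):
        self.filename = filename

    def __iter__(self):
        # Re-open the file on every pass; gensim iterates the corpus
        # multiple times (vocabulary scan, then training epochs).
        with codecs.open(self.filename, 'r', encoding='utf-8') as corpus:
            for line in corpus:
                yield line.lower().split()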
def build_tfidf_model(data_directory, corpus_path, wiki_text_output_path, model_output_path,
                      multiwords=True, druid_cutoff_score=0.3):
    stemmer = nltk.stem.PorterStemmer()
    tokenid_dictionary = corpora.Dictionary()

    # Convert the Wikipedia XML dump into .txt format, unless already done
    if not exists(wiki_text_output_path):
        logger.info('Converting ' + str(corpus_path) + ' into plain text file: ' + wiki_text_output_path)
        wikidump2text.convert(corpus_path, wiki_text_output_path)
    else:
        logger.info('Found ' + wiki_text_output_path + ', not converting from the raw bz2 file.')

    # Load multiword expressions as a dictionary
    stopwords_path = join(data_directory, 'stopwords_en.txt')
    if multiwords:
        druid_path = join(data_directory, 'druid_en.bz2')
        druid_dict = druid.DruidDictionary(druid_path, stopwords_path, cutoff_score=druid_cutoff_score)
        logger.info('Loaded DRUID with cutoff ' + str(druid_cutoff_score))
    else:
        druid_dict = None

    logger.info("Building tfidf model...")
    start_time = time.time()

    if multiwords:
        logger.info('Using druid_en.bz2 in ' + data_directory + ' as multiword dictionary.')
        articles = TextCorpus(wiki_text_output_path, druid_dict, multiwords=True)  # a memory-friendly iterator
    else:
        logger.info('Using no multiword dictionary, just single words.')
        articles = TextCorpus(wiki_text_output_path, None, multiwords=False)

    tokenid_dictionary.add_documents(articles)

    model = TfidfModel(BowCorpus(wiki_text_output_path, druid_dict, tokenid_dictionary, multiwords=multiwords),
                       id2word=tokenid_dictionary)
    model.save(model_output_path)
    logger.info("Finished building tfidf model. Time needed: " + str(time.time() - start_time))
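
# A minimal usage sketch (a hypothetical helper, not part of the original
# module) showing how the two persisted models can be read back with
# gensim's standard .load() counterparts to .save():
def load_models(word2vec_path, tfidf_path):
    w2v_model = Word2Vec.load(word2vec_path)
    tfidf_model = TfidfModel.load(tfidf_path)
    return w2v_model, tfidf_model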
program = basename(sys.argv[0])
logger = logging.getLogger(program)


def data_directory():
    return join(dirname(dirname(abspath(__file__))), 'data')


corpus_path = join(data_directory(), 'enwiki-latest-pages-articles12.xml-p001825001p002425000.bz2')
wiki_text_output_path = join(data_directory(), 'enwiki-latest-pages-articles12.txt')
model_output_path = join(data_directory(), 'wiki.tfidf')

stemmer = nltk.stem.PorterStemmer()
dictionary = corpora.Dictionary()

# Convert Wikipedia XML dump into .txt format
wikidump2text.convert(corpus_path, wiki_text_output_path)

# Load multiword expressions as a dictionary
stopwords_path = join(data_directory(), 'stopwords_en.txt')
druid_path = join(data_directory(), 'druid_en.bz2')
druid_dict = druid.DruidDictionary(druid_path, stopwords_path, cutoff_score=0.0)

logger.info("Building tfidf model...")
start_time = time.time()


class TextCorpus(object):
    def __init__(self, filename):
        self.corpus = codecs.open(filename, 'r', encoding='utf-8')