    output.write('%s\n' % doc)  # write each doc on its own line

corpus_source = Corpus()
if Amazon_reviews:
    corpus_source.load_text(urpath + "temporary_files/docs_filtered_source.txt",
                            valid_split=1 - rate_usageOfData_source_Amazon)
else:
    corpus_source.load_text(urpath + "temporary_files/docs_filtered_source.txt",
                            valid_split=1 - rate_usageOfData_source_Novel)
corpus_source.process_wordvectors(joint_modelvw)
words2vec_nyOfWord2vec_format = corpus_source.convert_dictionary_to_words2vec(
    fname=urpath + 'temporary_files/train_source.txt')

model_source = GLDA(n_topics=n_topic, corpus=corpus_source.index2doc,
                    words2vec_ny=corpus_source.words2vec_ny,
                    words2vec=joint_modelvw, vocab_ny=corpus_source.vocab_ny,
                    alpha=learning_rate)
model_source.fit(iterations=n_gibbs_iteration_source)

for k in range(n_topic):
    """ top words according to cosine similarity in the positive direction (standard direction) """
    print("TOPIC {0} w2v pos:".format(k),
          list(zip(*model_source.words2vec.most_similar(
              positive=[model_source.topic_params[k]["Topic Mean"]],
              topn=20)))[0])
    print('\n')

time_source2 = time.time()
print(
else:
    corpus.load_text(urpath + "temporary_files/docs_filtered.txt",
                     valid_split=1 - rate_usageOfData_novels)

if pre_trained_embedding:
    """
    These files contain pre-trained embeddings and must be downloaded
    manually from the links given below.
    """
    gensim_file = urpath + 'data/wiki.en.simple.vec'  # English https://fasttext.cc/docs/en/pretrained-vectors.html
    """
    Alternatively: other pre-trained embeddings
    gensim_file = urpath + 'data/wiki-news-300d-1M.vec'  # English https://fasttext.cc/docs/en/english-vectors.html
    gensim_file = urpath + 'data/wiki.en.vec'  # English https://fasttext.cc/docs/en/pretrained-vectors.html
    gensim_file = urpath + 'data/wiki.sv.vec'  # Swedish https://fasttext.cc/docs/en/pretrained-vectors.html
    gensim_file = urpath + 'data/wiki.fa.vec'  # Persian https://fasttext.cc/docs/en/pretrained-vectors.html
    """
    corpus.process_wordvectors(filepath=gensim_file)
    shrinkedEmbedding_OfWord2vecFormat = corpus.convert_dictionary_to_words2vec(
        fname=urpath + 'temporary_files/train.txt')
    model = GLDA(n_topics=n_topic, corpus=corpus.index2doc,
                 words2vec_ny=corpus.words2vec_ny, words2vec=corpus.words2vec,
                 vocab_ny=corpus.vocab_ny, alpha=learning_rate)
else:
    """ train a word embedding on the current data with Facebook's fastText """
    modelvw = FastText(corpus.docs, size=dim_trained_embedding, window=3,
                       min_count=1, workers=5, alpha=0.1,
                       iter=10, sg=1, word_ngrams=1)
    model = GLDA(n_topics=n_topic, corpus=corpus.index2doc,
                 words2vec_ny=modelvw, words2vec=modelvw, vocab_ny=corpus.vocab,
                 alpha=learning_rate)

start = timeit.default_timer()
model.fit(iterations=n_gibbs_iteration)
stop = timeit.default_timer()
print('Time for fitting the model (minutes):', (stop - start) / 60)

""" print the top words (results) of each topic via most_similar """
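# The loop below is a minimal sketch of the printing step announced above,
# not part of the original script: it mirrors the source-domain loop used
# for model_source earlier, assuming that `model` exposes the same
# `topic_params` and `words2vec` attributes. For each topic it lists the
# 20 vocabulary words closest (by cosine similarity) to the topic mean.
for k in range(n_topic):
    print("TOPIC {0} w2v pos:".format(k),
          list(zip(*model.words2vec.most_similar(
              positive=[model.topic_params[k]["Topic Mean"]],
              topn=20)))[0])
    print('\n')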