def get_model(self, n_topics=50, n_workers=6, recalculate=False, from_scratch=True):
    """Return the LDA model for ``n_topics``, training it when necessary.

    The model file location comes from ``self.paths.get_lda_filepath``. An
    existing file is loaded unless ``recalculate`` is set; otherwise a model
    is trained from ``self.get_corpus_dict()`` and
    ``self.get_trigram_bow_corpus()`` and saved to that location.

    Args:
        n_topics: Number of topics; also selects the model file.
        n_workers: Forwarded to ``LdaMulticore`` as ``workers``.
        recalculate: Retrain even when a saved model file exists.
        from_scratch: Whether training a new model is permitted.

    Returns:
        The loaded or freshly trained ``LdaMulticore`` model.

    Raises:
        ValueError: If a model has to be trained but ``from_scratch`` is False.
    """
    filepath = self.paths.get_lda_filepath(n_topics)
    model_exists = os.path.isfile(filepath)

    if model_exists and not recalculate:
        print('Loading LDA model (n_topics={})...'.format(n_topics))
        return LdaMulticore.load(filepath)

    if not from_scratch:
        # Report the real reason: the file may exist and only be rejected
        # because a recalculation was requested.
        if model_exists:
            raise ValueError('recalculate is True but from_scratch is False')
        raise ValueError('No LDA file exists but from_scratch is False')

    trigram_dictionary = self.get_corpus_dict()
    trigram_bow_corpus = self.get_trigram_bow_corpus(trigram_dictionary)

    print('Building LDA model...')
    lda = LdaMulticore(trigram_bow_corpus,
                       num_topics=n_topics,
                       id2word=trigram_dictionary,
                       workers=n_workers)
    lda.save(filepath)
    print('LDA model (n_topics={}) written to {}'.format(n_topics, filepath))
    return lda
def create_LDA_dict():
    """One-time helper: build and save the dictionary, BoW corpus and LDA model.

    Reads the trigram-transformed reviews, learns and prunes a gensim
    ``Dictionary``, serializes the bag-of-words corpus in Matrix Market
    format, then trains and saves a 30-topic ``LdaMulticore`` model.
    All input and output paths are hard-coded.
    """
    reviews_path = '../Dataset/trigram_transformed_reviews_all.txt'
    dictionary_path = '../Dataset/trigram_dict_all.dict'
    bow_path = '../Models/trigram_bow_corpus_all.mm'
    # Other names used during experiments: lda_model_all_30, lda_model_10topic
    model_path = '../Models/lda_model_all'

    # Learn the vocabulary by streaming over every review.
    trigram_dictionary = Dictionary(LineSentence(reviews_path))

    # Drop very rare / very common tokens, then close the gaps in the ids.
    trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
    trigram_dictionary.compactify()
    trigram_dictionary.save(dictionary_path)
    print('LDA dict saved.')

    MmCorpus.serialize(bow_path, trigram_bow_generator(reviews_path))
    trigram_bow_corpus = MmCorpus(bow_path)

    # Models with 10, 30 and 50 topics were tried; 30 gave the best result.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        lda = LdaMulticore(
            trigram_bow_corpus,
            num_topics=30,
            id2word=trigram_dictionary,
            workers=8)
    lda.save(model_path)
    print('LDA model saved.')
def createlda(num_topics, filename):
    """Train an LDA model on the pickled corpus and persist it.

    Calls ``dumppick(filename)`` and then ``loadpcik()`` to obtain the
    corpus, its TF-IDF form and the dictionary, trains ``LdaMulticore`` with
    ``num_topics`` topics, writes the printed topics to
    ``./ldamd/<num_topics>tpc-tpc`` and saves the model under
    ``./ldamd/<num_topics>tpc+<filename[9:18]>``.

    Args:
        num_topics: Number of topics to learn. This value is now honoured;
            previously it was silently overwritten with 50.
        filename: Passed to ``dumppick``; characters 9..17 also become part
            of the saved model's file name.

    Returns:
        Tuple ``(lda, texts, texts_tf_idf, dictionary)``.
    """
    dumppick(filename)
    texts, texts_tf_idf, dictionary = loadpcik()

    # An LSI-based topic model (models.lsimodel.LsiModel, 20 topics) and a
    # held-out perplexity sweep were tried here earlier and are disabled.

    # Topic classification with LDA.
    print("**************LDA*************")
    lda = LdaMulticore(corpus=texts,
                       iterations=1000,
                       id2word=dictionary,
                       num_topics=num_topics,
                       passes=200,
                       per_word_topics=True)

    # Use a context manager so the topics file is flushed and closed.
    with open("./ldamd/{}tpc-tpc".format(num_topics), mode="w",
              encoding="utf8") as out:
        print(lda.print_topics(num_topics=num_topics, num_words=10), file=out)

    lda.save("./ldamd/{}tpc+{}".format(num_topics, filename[9:18]))
    return lda, texts, texts_tf_idf, dictionary
def run_model(self, collection_name, num_topics, save_dir=None, save_file=None, alpha=0.1, beta=0.01, iterations=800, passes=1):
    """Train an LDA model on ``self.corpus`` and save it to disk.

    Args:
        collection_name: Used (lower-cased, spaces replaced by underscores)
            to fill the default directory and file name templates.
        num_topics: Number of topics to learn.
        save_dir: Target directory; defaults to ``Constants.SAVE_DIR``
            formatted with the collection name. Created if missing.
        save_file: Target file name; defaults to
            ``Constants.SAVE_FILE_FORMAT`` formatted with the collection
            name, ``num_topics``, ``alpha``, ``beta`` and ``iterations``.
        alpha: Forwarded to ``LdaMulticore`` as ``alpha``.
        beta: Forwarded to ``LdaMulticore`` as ``eta``.
        iterations: Forwarded to ``LdaMulticore``.
        passes: Forwarded to ``LdaMulticore``.

    Returns:
        The trained model.
    """
    lda_settings = {
        'corpus': self.corpus,
        'id2word': self.dictionary,
        'num_topics': num_topics,
        'alpha': alpha,
        'eta': beta,
        'iterations': iterations,
        'passes': passes,
    }
    model = LdaMulticore(**lda_settings)

    if save_dir is None:
        dir_slug = collection_name.lower().replace(' ', '_')
        save_dir = Constants.SAVE_DIR.format(dir_slug)
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    if save_file is None:
        file_slug = collection_name.lower().replace(' ', '_')
        save_file = Constants.SAVE_FILE_FORMAT.format(
            file_slug, num_topics, alpha, beta, iterations)

    logging.info(save_dir)
    target = os.path.join(save_dir, save_file)
    model.save(target)
    return model
def build_model(self, fname=None, save_to=None):
    """Build (or reuse) the LDA model file and load it into ``self.model``.

    The model file name is prompted for when ``fname`` is not given and is
    resolved through ``self.__dest``. If the file is missing, or the user
    confirms a rebuild, the training parameters are prompted for
    interactively, the model is trained and saved. The model is then always
    loaded back from disk.

    Args:
        fname: Model file name; prompted for when falsy.
        save_to: Not used by this method.

    Returns:
        The ``LdaMulticore`` model loaded from ``fname``.
    """
    id2word = self.id2word or self.build_id2word()
    corpus = self.corpus or self.build_corpus()

    # read model.lda file
    if not fname:
        fname = click.prompt('model file name', type=str, default='model.lda')
    fname = self.__dest(fname)

    # if there is no model file or the user wants to rebuild, build .model
    if not os.path.isfile(fname) or click.confirm(
            'There already is %s. Do you want to re run lda?' % fname):
        num_procs = click.prompt('Number of processes to launch',
                                 type=int,
                                 default=multiprocessing.cpu_count())
        num_epochs = click.prompt('Number of epochs to run', type=int, default=20)
        num_topics = click.prompt('Number of topics', type=int, default=100)

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('start building model')
        start = time()
        model = LdaMulticore(corpus,
                             id2word=id2word,
                             num_topics=num_topics,
                             workers=num_procs,
                             passes=num_epochs)
        model.save(fname)
        print('building model takes: %s' % LdaUtils.human_readable_time(
            time() - start))

    self.model = LdaMulticore.load(fname)
    return self.model
def lda_trainer(sentences, modelPath=None, nb_topics=190, multicore=False):
    '''
    Train a gensim LDA model on tokenized sentences and optionally save it.

    NOTE(review): ``sentences`` is iterated twice (dictionary, then corpus),
    so a one-shot generator would yield an empty corpus; pass a re-iterable.

    @param sentences: iterable of token lists
    @param modelPath: where to save the model; nothing is saved when None.
        The dictionary goes next to it with '.topic' replaced by '.dict'
        (or '.dict' appended when the path has no '.topic').
    @param nb_topics: number of topics to learn
    @param multicore: use LdaMulticore instead of LdaModel
    @return: lda_model: the lda_model model trained by gensim,
             dictionary: all terms dictionary in lda_model model
    '''
    # load doc2bow
    dictionary = corpora.Dictionary(sentences)
    print('finish load dictionary!')
    corpus = [dictionary.doc2bow(text) for text in sentences]
    print('finish load doc2bow corpus!')

    # train lda_model model
    print('training lda_model model...')
    if multicore:
        # can just use in linux
        # very hard for CPU, cautiously use it
        lda_model = LdaMulticore(corpus=corpus, num_topics=nb_topics, id2word=dictionary)
    else:
        lda_model = LdaModel(corpus=corpus, num_topics=nb_topics, id2word=dictionary)
    print('finished lda_model model training, nb terms: %d' % lda_model.num_terms)

    # save lda_model model on disk
    if modelPath is not None:
        lda_model.save(fname=modelPath)
        dict_path = modelPath.replace('.topic', '.dict')
        if dict_path == modelPath:
            # No '.topic' in the path: without this the dictionary would
            # overwrite the model file that was just written.
            dict_path = modelPath + '.dict'
        dictionary.save(fname_or_handle=dict_path)
        print(
            'producing lda_model & dictionary model ... ok! model store in {0}(.dict)'
            .format(modelPath))

    return lda_model, dictionary
def train_lda(args):
    """Train an LDA model from an ``LDAReader`` corpus and save it.

    Args:
        args: Object providing ``ds`` and ``max_sent`` (passed to
            ``LDAReader``), ``dim`` (number of topics), ``workers`` and
            ``out`` (path the model is saved to).
    """
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("[LDA > n_topics: %d ]" % args.dim)
    lda_reader = LDAReader(args.ds, max_sent=args.max_sent)
    ldazito = LdaMulticore(lda_reader,
                           id2word=lda_reader.idx2wrd,
                           num_topics=args.dim,
                           workers=args.workers)
    ldazito.save(args.out)
def guidedLDA_Model(topics, cores=11):
    """
    Serialize the trigram bag-of-words corpus, then train and save an LDA model.

    ``topics`` is the desired number of LDA topics; ``cores`` is the worker
    count and should be the number of physical cores minus one. Both should
    be integers. Returns ``(trigram_bow_corpus, lda)``.
    """
    bow_path = './models2/trigram_bow_corpus.nm'

    # The finished dictionary is read from disk.
    trigram_dictionary = Dictionary.load('./models2/trigram_dict_all.dict')

    # Write the bag-of-words form of every review as a matrix, then read it
    # back as a corpus.
    bow_stream = trigram_bow_generator('./models2/trigram_transformed_reviews.txt')
    MmCorpus.serialize(bow_path, bow_stream)
    trigram_bow_corpus = MmCorpus(bow_path)

    # LdaMulticore gets the corpus, the dictionary, the number of topics to
    # learn and the worker count that sets the parallelism.
    lda = LdaMulticore(trigram_bow_corpus,
                       num_topics=topics,
                       id2word=trigram_dictionary,
                       workers=cores)
    lda.save('./models2/lda_model')

    return trigram_bow_corpus, lda
def lda(corpus, num_topics=5, save_as=None, load=None, verbose=True):
    """Train or load an LDA model and prepare a pyLDAvis visualization.

    Args:
        corpus: Object with an ``apply`` method whose items are
            space-separated strings (presumably a pandas Series — confirm
            against callers).
        num_topics: Number of topics when a new model is trained.
        save_as: File name (inside the ``models`` directory next to this
            module) to save the model under; nothing is saved when falsy.
        load: File name inside the same directory to load instead of
            training; only used when it is a string.
        verbose: Print progress messages.

    Returns:
        Tuple ``(model, vis)`` with the LDA model and the prepared
        visualization data.
    """
    # os.path.join keeps the path relative when dirname(__file__) is '' —
    # plain concatenation would have produced the absolute path "/models".
    model_path = os.path.join(os.path.dirname(__file__), "models")

    if verbose:
        print("prepare data")
    corpus = corpus.apply(lambda x: x.split(" "))
    dictionary = Dictionary(corpus)
    bow = [dictionary.doc2bow(doc) for doc in corpus]

    if isinstance(load, str):
        if verbose:
            print("loading lda")
        model = LdaMulticore.load(os.path.join(model_path, load))
    else:
        if verbose:
            print("training lda")
        model = LdaMulticore(bow, num_topics=num_topics)

    if save_as:
        # Create the directory if needed; an existing one is fine, any other
        # failure is reported instead of being silently swallowed.
        os.makedirs(model_path, exist_ok=True)
        model.save(os.path.join(model_path, save_as))

    if verbose:
        print("generate visualization")
    vis = pyLDAvis.gensim.prepare(model, bow, dictionary)
    return model, vis
def createLDAModel(docs, dictionary, num_topics = 100, iterations = NUM_ITERATIONS, passes = NUM_PASSES, workers = 3, output = 'lda_model'):
    """Creates the LDA model for the given documents.

    The model is saved to
    ``<output>_i<iterations>_p<passes>_T<num_topics>.lda``.

    Args:
        docs (lst): List of tokenized documents
        dictionary (lst): The dictionary
        num_topics (int): The number of topics to discover
        iterations (int): The number of iterations of the LDA method
        passes (int): The number of passes of the LDA method
        workers (int): The number of workers employed in the creation of the model
        output (str): Prefix used to store the model in a set of files

    Returns:
        ldamodel: The LDA model
    """
    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in docs]

    # generate LDA model; use the caller's num_topics (not the module-level
    # NUM_TOPICS) so the model matches the "_T<num_topics>" file name below
    ldamodel = LdaMulticore(corpus,
                            id2word = dictionary,
                            num_topics = num_topics,
                            iterations = iterations,
                            passes = passes,
                            workers = workers)

    ldamodel.save(output + '_i' + str(iterations) + '_p' + str(passes) + '_T' + str(num_topics) + '.lda')

    return ldamodel
def train_lda(args):
    """Train an LDA model from an ``LDAReader`` corpus and save it.

    Args:
        args: Object providing ``ds`` and ``max_sent`` (passed to
            ``LDAReader``), ``dim`` (number of topics), ``workers`` and
            ``out`` (path the model is saved to).
    """
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("[LDA > n_topics: %d ]" % args.dim)
    lda_reader = LDAReader(args.ds, max_sent=args.max_sent)
    ldazito = LdaMulticore(lda_reader,
                           id2word=lda_reader.idx2wrd,
                           num_topics=args.dim,
                           workers=args.workers)
    ldazito.save(args.out)
def fit_universal_models(self):
    """Fit the shared LDA, word2vec/k-means and TF-IDF models.

    Results are stored in ``self.d['all']`` and ``self.d['labeled']`` under
    the keys ``'_probas'`` (LDA inference output), ``'_t'`` (TF-IDF
    matrices) and ``'_kmeans'`` (output of ``kmeans_word2vecify``); the
    fitted TF-IDF vectorizer is kept in ``self.tfidf``. The LDA model is
    cached in the file ``lda.modl`` in the working directory.
    """
    vec = CountVectorizer(stop_words='english', max_features=10000)
    vec_t = vec.fit_transform(' '.join(x) for x in self.all_sentences)
    # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
    id2word = {v: k for k, v in vec.vocabulary_.items()}
    vec_corpus = gensim.matutils.Sparse2Corpus(vec_t.T)

    # Reuse a previously trained LDA model when one is on disk.
    if os.path.isfile('lda.modl'):
        lda = LdaMulticore.load('lda.modl')
    else:
        lda = LdaMulticore(corpus=vec_corpus,
                           id2word=id2word,
                           iterations=200,
                           num_topics=2,
                           passes=10,
                           workers=4)
        lda.save('lda.modl')

    all_counts = vec.transform(' '.join(x) for x in self.all_sentences)
    self.d['all']['_probas'] = np.array(
        lda.inference(gensim.matutils.Sparse2Corpus(all_counts.T))[0])
    labeled_counts = vec.transform(' '.join(x) for x in self.X)
    self.d['labeled']['_probas'] = np.array(
        lda.inference(gensim.matutils.Sparse2Corpus(labeled_counts.T))[0])

    w2vmodel = Word2Vec(self.all_sentences,
                        size=100,
                        window=5,
                        min_count=3,
                        workers=4)

    # Keep the centroids of the run with the lowest SSE.
    best_centroids = None
    best_score = None
    for _ in range(10):  # todo -- implement kmeans++ instead of best of 10
        km = Kmeans(50)
        km.fit(w2vmodel.syn0)
        score = km.compute_sse(w2vmodel.syn0)
        if best_score is None or score < best_score:
            best_score = score
            best_centroids = km.centroids
    km.centroids = best_centroids

    self.tfidf = TfidfVectorizer(stop_words=set(stopwords.words()))
    self.d['all']['_t'] = self.tfidf.fit_transform(
        ' '.join(x) for x in self.all_sentences)
    self.d['labeled']['_t'] = self.tfidf.transform(' '.join(x) for x in self.X)

    self.d['all']['_kmeans'] = np.array(
        kmeans_word2vecify(self.all_sentences, w2vmodel, km,
                           self.d['all']['_t'], self.tfidf))
    self.d['labeled']['_kmeans'] = np.array(
        kmeans_word2vecify(self.X, w2vmodel, km,
                           self.d['labeled']['_t'], self.tfidf))
def train_lda(args):
    """Train an LDA model from an ``LDAReader`` corpus; save model and vocabulary.

    Args:
        args: Object providing ``input`` and ``max_sent`` (passed to
            ``LDAReader``), ``dim`` (number of topics), ``workers`` and
            ``output`` (model path). The vocabulary is saved next to the
            model as ``<output without extension>_idx.pkl``.
    """
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("[LDA > n_topics: %d ]" % args.dim)
    lda_reader = LDAReader(args.input, max_sent=args.max_sent)
    lda_reader.compute_vocabulary()
    lda_model = LdaMulticore(lda_reader,
                             id2word=lda_reader.idx2wrd,
                             num_topics=args.dim,
                             workers=args.workers)
    lda_model.save(args.output)

    idx_path = os.path.splitext(args.output)[0]+"_idx.pkl"
    lda_reader.save_vocabulary(idx_path)
def get_lda_model(corpus, dictionary, num_topics, SAVE_FILE=OUT_FILE, passes=20, iterations=100):
    """Load the LDA model stored at ``SAVE_FILE + '.lda'`` or train and save it.

    Args:
        corpus: Bag-of-words corpus used for training.
        dictionary: Forwarded to ``LdaMulticore`` as ``id2word``.
        num_topics: Number of topics to learn.
        SAVE_FILE: Path prefix of the model file ('.lda' is appended).
        passes: Forwarded to ``LdaMulticore``.
        iterations: Forwarded to ``LdaMulticore``.

    Returns:
        The loaded or newly trained model.
    """
    model_path = SAVE_FILE + '.lda'

    # An existing model file is loaded rather than retrained.
    if os.path.exists(model_path):
        print('LDA model for the file:{} already exists.. loading..'.format(SAVE_FILE))
        return LdaMulticore.load(model_path)

    print('creating lda model for the {} file..'.format(SAVE_FILE))
    print('num_topics: {}'.format(num_topics))
    trained = LdaMulticore(corpus=corpus,
                           id2word=dictionary,
                           num_topics=num_topics,
                           passes=passes,
                           iterations=iterations,
                           chunksize=2500)
    trained.save(model_path)
    return trained
def train_lda():
    """
    Train the LDA model incrementally on article windows and save it.

    generate_dictionary() must be called before this method, because the
    dictionary is loaded from cfg.LDA_DICTIONARY_FILEPATH.
    """
    for banner_line in ("------------------", "Training LDA model", "------------------"):
        print(banner_line)

    # The dictionary is the one produced by generate_dictionary().
    print("Loading dictionary...")
    dictionary = gensim.corpora.dictionary.Dictionary.load(cfg.LDA_DICTIONARY_FILEPATH)

    # Invert token -> id into id -> token.
    print("Generating id2word...")
    id2word = {token_id: word for word, token_id in dictionary.token2id.items()}

    # The model starts without a corpus and is fed in batches below.
    print("Initializing LDA...")
    lda_model = LdaMulticore(corpus=None,
                             num_topics=cfg.LDA_COUNT_TOPICS,
                             id2word=id2word,
                             workers=LDA_COUNT_WORKERS,
                             chunksize=LDA_CHUNK_SIZE)

    print("Training...")
    batch = []
    update_every_n_windows = 25000
    windows = load_windows(load_articles(cfg.ARTICLES_FILEPATH),
                           cfg.LDA_WINDOW_SIZE,
                           only_labeled_windows=True)
    for i, window in enumerate(windows):
        # Each window becomes one bag-of-words document.
        lowered = [token.word.lower() for token in window.tokens]
        batch.append(dictionary.doc2bow(lowered))

        if len(batch) >= update_every_n_windows:
            print("Updating (at window %d of max %d)..." % (i, COUNT_EXAMPLES_FOR_LDA))
            # This call is where the model is actually trained.
            lda_model.update(batch)
            batch = []

        if i >= COUNT_EXAMPLES_FOR_LDA:
            print("Reached max of %d windows." % (COUNT_EXAMPLES_FOR_LDA,))
            break

    # The leftover partial batch is deliberately NOT used for a final update:
    # the original author was unsure whether an update step's result is
    # skewed by the number of examples it receives.

    print("Saving...")
    lda_model.save(cfg.LDA_MODEL_FILEPATH)
def run_model(modelname, opts, corpus, outdir):
    """Run an LDA model with specified options.

    Train ``LdaMulticore`` with the keyword options ``opts`` on a
    ``ShuffledCorpus`` built from ``corpus['corpus']`` (with
    ``corpus['id2word']`` as vocabulary), save it to
    ``<outdir>/<modelname>.pickle`` and pass it to ``model_to_csv``.
    ``modelname`` prefixes the result file names.
    """
    LOGGER.info(f"Running model {modelname}")
    shuffled = ShuffledCorpus(corpus['corpus'])
    vocabulary = corpus['id2word']
    mod = LdaMulticore(shuffled, **opts, id2word=vocabulary)

    filename = os.path.join(outdir, modelname + ".pickle")
    LOGGER.info(f"Saving to {filename}")
    mod.save(filename)

    model_to_csv(mod, corpus, modelname, outdir)
def create_topics(lda_model_filepath, trigram_bow_corpus, trigram_dictionary):
    """
    Train a 50-topic LDA model on the given corpus and save it to
    ``lda_model_filepath``.
    """
    training_options = {
        'num_topics': 50,
        'id2word': trigram_dictionary,
        # workers sets the parallelism; the recommendation is the number of
        # physical cores minus one.
        'workers': 3,
    }
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        lda = LdaMulticore(trigram_bow_corpus, **training_options)
    lda.save(lda_model_filepath)
def train_lda(corpus, dictionary, lda_model_filepath, num_topics, run_or_load_flag):
    """Train and save an LDA model, or load a previously saved one.

    Args:
        corpus: Bag-of-words corpus used for training.
        dictionary: Forwarded to ``LdaMulticore`` as ``id2word``.
        lda_model_filepath: Where the model is saved to / loaded from.
        num_topics: Number of topics when training.
        run_or_load_flag: Truthy to train and save, falsy to load.

    Returns:
        The trained or loaded model.
    """
    if not run_or_load_flag:
        return LdaMulticore.load(lda_model_filepath)

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        # workers sets the parallelism; the recommendation is the number of
        # physical cores minus one.
        lda = LdaMulticore(corpus,
                           num_topics=num_topics,
                           id2word=dictionary,
                           workers=3)
    lda.save(lda_model_filepath)
    return lda
def perform(self, option="load"):
    """
    Run the LDA analysis: train a model on ``self.corpus``, save it to
    ``self.lda_out_file_name`` and store the top ``self.nword`` entries of
    every topic in ``self.topics`` under the keys ``topic0``, ``topic1``, ...

    ``option`` is accepted but not used by this method.
    """
    logging.info("Start Lda analysis")
    ldamodel = LdaMulticore(self.corpus,
                            num_topics=self.ntopic,
                            id2word=self.dictionary,
                            passes=self.iteration)
    logging.info("LDA multicore modeling done")
    ldamodel.save(self.lda_out_file_name)

    self.topics = {}
    for topic_id in range(self.ntopic):
        key = "topic{}".format(topic_id)
        self.topics[key] = ldamodel.show_topic(topic_id, topn=self.nword)
        logging.info("Topic{}".format(topic_id))
        # The second element of each entry is logged as the word.
        # NOTE(review): element order of show_topic depends on the gensim
        # version — confirm this is the token and not the probability.
        words = [entry[1] for entry in self.topics[key]]
        logging.info(words)
def LDAmulticoreModel(df, num_topics=10):
    """Train and save an ``LdaMulticore`` model on ``df['nlp_abstract']``.

    A second dictionary built from ``df['nlp_abstract']`` is saved to
    ``gensim_dict_fromNLPAbstract.gensim``; the model is saved to
    ``../models/lda_multicoremodel.gensim``.

    NOTE(review): the dictionary used for the corpus and the model is built
    from ``df['AbstractNarration']`` via ``get_tokens``, while the one
    written to disk is built from ``df['nlp_abstract']`` — confirm this is
    intended.

    Args:
        df: Table with ``AbstractNarration`` and ``nlp_abstract`` columns.
        num_topics: Number of topics to learn.

    Returns:
        The trained model.
    """
    import warnings

    # (An unused nested helper that only emitted a DeprecationWarning was
    # removed; it was never called.)
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        dictionary = Dictionary(df['AbstractNarration'].apply(get_tokens))
        dictionary_from_nlpAbstract = Dictionary(df['nlp_abstract'])
        dictionary_from_nlpAbstract.save('gensim_dict_fromNLPAbstract.gensim')
        corpus = [dictionary.doc2bow(text) for text in df['nlp_abstract']]

        # multicore model; numpy's global generator is seeded beforehand
        np.random.seed(44)
        lda_multicore = LdaMulticore(corpus, num_topics, id2word=dictionary, workers=4)
        lda_multicore.save('../models/lda_multicoremodel.gensim')

    return lda_multicore
def main(argv):
    """Train (or reuse) an LDA model and write tagged posts to a CSV file.

    Expects two positional arguments: the input CSV file and an output
    prefix. Files ``<prefix>.id2word``, ``<prefix>.bow2mm`` and
    ``<prefix>.lda`` are created when the model does not exist yet; the
    result goes to ``<prefix>.csv``.
    """
    cli_parser = make_cli_parser()
    opts, args = cli_parser.parse_args(argv)
    if len(args) != 2:
        cli_parser.error("Please provide an input/output file")

    if not os.path.isfile(args[1] + '.lda'):
        if os.path.isfile(args[1] + '.bow2mm') and os.path.isfile(args[1] + '.id2word'):
            id2word = corpora.Dictionary.load(args[1] + '.id2word')
        else:
            id2word = corpora.Dictionary(iter_file(args[0], opts.numlines))
            # ignore words that appear in less than 5 documents or more than 20% documents
            # when we do filtering, some vector becomes empty! it generates a huge problem!!
            # id2word.filter_extremes(no_below=5, no_above=0.2, keep_n=None)

            # save dictionary
            id2word.save(args[1] + '.id2word')
            # save doc2bow vector
            corpora.MmCorpus.serialize(
                args[1] + '.bow2mm',
                iter_doc2bow(args[0], opts.numlines, id2word))

        mm_corpus = corpora.MmCorpus(args[1] + '.bow2mm')
        model = LdaMulticore(mm_corpus,
                             id2word=id2word,
                             num_topics=opts.numtopics,
                             workers=opts.numprocs,
                             passes=opts.numepochs)
        model.save(args[1] + '.lda')
    else:
        # A model already exists: load it and its dictionary so the tagging
        # loop below has both (previously they were left undefined here).
        id2word = corpora.Dictionary.load(args[1] + '.id2word')
        model = LdaMulticore.load(args[1] + '.lda')

    # Context managers close both files even if a row fails to process.
    with open(args[0]) as infile, open(args[1] + '.csv', "w") as outfile:
        out_csvfile = csv.writer(outfile, delimiter=',')
        in_csvfile = csv.reader(infile, delimiter=',')
        for row in in_csvfile:
            # NOTE(review): csv.reader yields strings, so this comparison
            # with the integer 0 never matches — confirm the intended check.
            if row[0] == 0:
                break
            processed_post = preprocess(row[3]).split()
            if len(processed_post) == 0:
                # skip 0~2 word documents (quite useless)
                continue
            result_list = row[1:3]
            result_list.extend(query_tag(id2word, model, processed_post))
            out_csvfile.writerow(result_list)
def create_LDA_model(self):
    """Build the dictionary and BoW corpus, then train and save a 20-topic LDA model.

    Reads ``self.trigram_articles_filepath``; writes the dictionary (as
    text) to ``self.trigram_dictionary_filepath``, the corpus to
    ``self.trigram_bow_filepath`` and the model to
    ``self.lda_model_filepath``.
    """
    articles_path = self.trigram_articles_filepath

    # Learn the vocabulary, prune extremes and close the id gaps.
    trigram_dictionary = Dictionary(LineSentence(articles_path))
    trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
    trigram_dictionary.compactify()
    trigram_dictionary.save_as_text(self.trigram_dictionary_filepath)
    # (Reloading via Dictionary.load(self.trigram_dictionary_filepath) is disabled.)

    bow_stream = self.trigram_bow_generator(articles_path, trigram_dictionary)
    MmCorpus.serialize(self.trigram_bow_filepath, bow_stream)
    trigram_bow_corpus = MmCorpus(self.trigram_bow_filepath)
    print(trigram_bow_corpus)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        lda = LdaMulticore(trigram_bow_corpus,
                           num_topics=20,
                           id2word=trigram_dictionary,
                           workers=3)
    lda.save(self.lda_model_filepath)
def main():
    """Prune the review dictionary, train an LDA model and save it.

    All files live under ``args.prefix``: ``review.dict`` and
    ``review.json`` are read, ``review.ldamodel`` is written.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    args = parse_args()

    dictionary_path = os.path.join(args.prefix, 'review.dict')
    dictionary = corpora.Dictionary.load(dictionary_path)

    logging.info('Pruning dictionary')
    dictionary.filter_extremes(no_below=args.no_below, no_above=args.no_above)

    reviews_path = os.path.join(args.prefix, 'review.json')
    corpus = ReviewCorpus(reviews_path, dictionary)

    logging.info('Computing LDA model')
    lda = LdaMulticore(corpus,
                       num_topics=args.num_topics,
                       id2word=dictionary,
                       workers=args.workers)

    logging.info('Persisting LDA model')
    model_path = os.path.join(args.prefix, 'review.ldamodel')
    lda.save(model_path)
class LDAembedding(InputEmbedding):
    """Input embedding that represents a text by its LDA topic distribution."""

    def __init__(self, workdir="./embedding-models", name="lda-embedding"):
        """
        Create the embedding; the vocabulary is built by calling ``pretrain``.

        :param workdir: passed to the base class (``self._workdir`` is
            presumably derived from it — confirm in ``InputEmbedding``)
        :param name: passed to the base class; ``self._name`` is used in the
            persisted file names
        """
        super(LDAembedding, self).__init__(workdir=workdir, name=name)
        self._normalizer = TweetNormalisation()

    def _load(self):
        """Load the persisted model and dictionary.

        :return: False when no saved model exists, True after a successful
            load (previously None, which was indistinguishable from a
            failure in a truthiness check)
        """
        modeldir = self._workdir.joinpath("ldamodel_{}".format(self._name))
        if not modeldir.exists():
            return False
        self._lda = LdaMulticore.load(str(modeldir))
        self._dictionary = Dictionary.load(
            str(self._workdir.joinpath("dictionary_{}.gz".format(self._name))))
        return True

    def pretrain(self, texts: typing.Iterable[typing.Text]):
        """Build the dictionary and a 50-topic LDA model from ``texts`` and save both."""
        texts = [self._normalizer(text).split() for text in tqdm(texts)]
        self._dictionary = Dictionary(texts, prune_at=200000)
        corpus = [self._dictionary.doc2bow(text) for text in tqdm(texts)]
        self._lda = LdaMulticore(corpus=corpus,
                                 id2word=self._dictionary,
                                 workers=15,
                                 num_topics=50)
        self._dictionary.save(
            str(self._workdir.joinpath("dictionary_{}.gz".format(self._name))))
        self._lda.save(
            str(self._workdir.joinpath("ldamodel_{}".format(self._name))))

    def get_train_data(self, texts: typing.Iterable[typing.Text]) -> np.array:
        """Return one row of topic probabilities per text, stacked into an array."""

        def to_array(bow):
            # minimum_probability=0 asks for every topic, not only the
            # likely ones, so each row covers the same topics.
            return np.array([
                v for _, v in self._lda.get_document_topics(
                    bow, minimum_probability=0)
            ])

        return np.stack([
            to_array(self._dictionary.doc2bow(self._normalizer(text).split()))
            for text in texts
        ])
def build_model(self, fname=None, save_to=None):
    """Build (or reuse) the LDA model file and load it into ``self.model``.

    The model file name is prompted for when ``fname`` is not given and is
    resolved through ``self.__dest``. If the file is missing, or the user
    confirms a rebuild, the training parameters are prompted for
    interactively, the model is trained and saved. The model is then always
    loaded back from disk.

    Args:
        fname: Model file name; prompted for when falsy.
        save_to: Not used by this method.

    Returns:
        The ``LdaMulticore`` model loaded from ``fname``.
    """
    id2word = self.id2word or self.build_id2word()
    corpus = self.corpus or self.build_corpus()

    # read model.lda file
    if not fname:
        fname = click.prompt('model file name', type=str, default='model.lda')
    fname = self.__dest(fname)

    # if there is no model file or the user wants to rebuild, build .model
    if not os.path.isfile(fname) or click.confirm('There already is %s. Do you want to re run lda?' % fname):
        num_procs = click.prompt('Number of processes to launch',
                                 type=int,
                                 default=multiprocessing.cpu_count())
        num_epochs = click.prompt('Number of epochs to run', type=int, default=20)
        num_topics = click.prompt('Number of topics', type=int, default=100)

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('start building model')
        start = time()
        model = LdaMulticore(corpus, id2word=id2word,
                             num_topics=num_topics,
                             workers=num_procs,
                             passes=num_epochs)
        model.save(fname)
        print('building model takes: %s' % LdaUtils.human_readable_time(time() - start))

    self.model = LdaMulticore.load(fname)
    return self.model
def create_LDA_model(coursesList):
    """Train, evaluate, save and visualise an LDA model of course descriptions.

    The model is saved to ``./best_model.lda`` and the pyLDAvis page to
    ``topics.html``. The c_v coherence and the average pairwise
    ``linear_kernel`` value between topic-term rows are printed. Python
    warnings are silenced process-wide as a side effect.

    Args:
        coursesList: Table whose ``description`` column holds
            space-separated text; also passed to ``create_vector_topics``.

    Returns:
        Tuple ``(lda_model, id2word, bigrams, trigrams)``.
    """
    warnings.filterwarnings('ignore')

    # Tokenize on single spaces, then extend the documents with n-grams.
    tokenized = [description.split(' ') for description in coursesList['description']]
    bigrams, trigrams = create_n_grams(tokenized)
    tokenized = add_n_grams(tokenized, bigrams, trigrams)

    id2word = Dictionary(tokenized)
    id2word.filter_extremes(no_below=5, no_above=0.45)
    corpus = [id2word.doc2bow(document) for document in tokenized]

    num_topics = config.num_lda_topic
    lda_model = LDA(corpus=corpus,
                    id2word=id2word,
                    num_topics=num_topics,
                    random_state=42,
                    alpha='asymmetric',
                    passes=25)
    lda_model.save("./best_model.lda")

    coherence_model_c_v = CoherenceModel(model=lda_model,
                                         texts=tokenized,
                                         dictionary=id2word,
                                         coherence='c_v')
    c_v = coherence_model_c_v.get_coherence()

    # Sum the kernel value of every topic with each later topic, then
    # average over the number of pairs.
    term_topic_mat = lda_model.get_topics()
    aver_cosine_similarities = 0
    for topic_index in range(num_topics - 1):
        current_row = term_topic_mat[topic_index].reshape(1, -1)
        later_rows = term_topic_mat[topic_index + 1:]
        pair_values = linear_kernel(current_row, later_rows).flatten()
        aver_cosine_similarities += sum(pair_values)
    if num_topics != 1:
        pair_count = num_topics * (num_topics - 1) / 2
        aver_cosine_similarities = aver_cosine_similarities / pair_count

    print(c_v)
    print(aver_cosine_similarities)

    create_vector_topics(lda_model, corpus, id2word, coursesList)

    visual_data = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    pyLDAvis.save_html(visual_data, 'topics.html')

    return lda_model, id2word, bigrams, trigrams
def fit_numtopics(train_corpus, test_corpus, id2word, num_topics_list, iters, workers, chunksize, logfilename, save=True):
    """
    Fit one LDA model per requested topic count and log its statistics.

    For every entry of ``num_topics_list`` a model is trained on
    ``train_corpus``, optionally saved, and its perplexity on
    ``test_corpus``, training time and topics are written to the log file.

    Args:
        train_corpus: corpus the models are trained on
        test_corpus: corpus used for the perplexity estimate
        id2word: vocabulary mapping forwarded to LdaMulticore
        num_topics_list = list of number of topics, a model will be fitted for each
        iters: forwarded to LdaMulticore as ``iterations``
        workers: forwarded to LdaMulticore
        chunksize: forwarded to LdaMulticore
        logfilename: path of the log file (overwritten)
        save: indicates whether model should be saved
    Returns:
        topics_dict = a dictionary of topics lists, where the key is the number of topics
    """
    topics_dict = {}
    # The context manager closes the log even if training raises.
    with open(logfilename, 'w') as logfile:
        for num_topics in num_topics_list:
            print('training', num_topics)
            np.random.seed(NUM)
            start_time = time.time()
            model = LdaMulticore(corpus=train_corpus,
                                 id2word=id2word,
                                 num_topics=num_topics,
                                 iterations=iters,
                                 eval_every=None,
                                 workers=workers,
                                 chunksize=chunksize)
            end_time = time.time()

            if save:
                fname = 'data\\orig_' + str(num_topics) + 'topics.lda'
                model.save(fname)

            per_word_bound = model.log_perplexity(test_corpus)
            perplexity = np.exp2(-1.0 * per_word_bound)

            logfile.write('\n' + 'num_topics: ' + str(num_topics) + '\n')
            logfile.write('perplexity: ' + str(perplexity) + '\n')
            logfile.write('train_time: ' + str(end_time - start_time) + '\n' + 'Topics: \n')

            topics = model.show_topics(num_topics=num_topics, num_words=20)
            topics_dict[str(num_topics)] = topics

            for topic in topics:
                # NOTE(review): this assumes show_topics yields plain strings
                # and Python 2 string semantics — confirm for the gensim and
                # Python versions in use.
                logfile.write('\n\t' + topic.encode('ascii', 'ignore') + '\n')
    return topics_dict
def build_lda_model(self, topics: int=20):
    """Train an LDA model on the board's phrase documents and save it.

    The dictionary is loaded from ``<input_dir>/<board>.dictionary``;
    documents come from ``ReadThreads`` with words on the ignore list
    removed before conversion to bag-of-words. The model is saved to
    ``<input_dir>/<board>.lda``.

    Args:
        topics: Number of topics to learn.

    Returns:
        The trained model.
    """
    # A frozenset gives constant-time membership tests; the filter below
    # runs once per token of every document.
    ignore_words = frozenset([
        'like', 'know', 'f**k', 'f*****g', 'want', 'shit', 'know', 'sure',
        'isn', 'CHANBOARD', 'think', 'people', 'good', 'time', 'going',
        'WEBLINK', 'got', 'way', ''
    ])

    filename = op.join(self.input_dir, f'{self.board}.dictionary')
    dictionary: Dictionary = Dictionary.load(filename)

    documents = ReadThreads(
        self.board,
        input_dir=self.input_dir,
        file_type='phrases',
        return_func=lambda x, y: dictionary.doc2bow(
            [w for w in y.split() if w not in ignore_words]
        )
    )

    lda = LdaMulticore(
        documents, id2word=dictionary, num_topics=topics, iterations=2)

    filename = op.join(self.input_dir, f'{self.board}.lda')
    lda.save(filename)

    return lda
def _lda_(gensim_dictionary,
          corpus_path=fpathroot + fpathappend + '_serialized.mm',
          lda_model_filepath=fpathroot + fpathappend + '_lda_' + str(numtopics),
          returnlda=True,
          numtopics=numtopics,
          passes=1,
          iterations=50,
          args=None):
    """
    Run Gensim LDA, optional return of model

    Args:
        gensim_dictionary: Forwarded to ``LdaMulticore`` as ``id2word``.
        corpus_path: Path of a serialized Matrix Market corpus, or an
            already-loaded corpus object (used as-is when not a string).
        lda_model_filepath: Where the trained model is saved.
        returnlda: Return the model when truthy; otherwise return None.
        numtopics: Number of topics to learn.
        passes: Forwarded to ``LdaMulticore``.
        iterations: Forwarded to ``LdaMulticore``.
        args: Optional dict of extra ``LdaMulticore`` keyword arguments.
    """
    # `unicode` only exists on Python 2; fall back to `str` alone elsewhere.
    try:
        path_types = (str, unicode)  # noqa: F821
    except NameError:
        path_types = (str,)

    if isinstance(corpus_path, path_types):
        corpus = MmCorpus(corpus_path)
    else:
        corpus = corpus_path

    extra_options = {} if args is None else args

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        # workers => sets the parallelism, and should be
        # set to your number of physical cores minus one
        lda = LdaMulticore(corpus,
                           num_topics=numtopics,
                           id2word=gensim_dictionary,
                           workers=n_threads,
                           passes=passes,
                           iterations=iterations,
                           **extra_options)

    lda.save(lda_model_filepath)
    return lda if returnlda else None
def generate_lda_topics(self):
    """Build dictionary, corpus and a 3-topic LDA model, then write the pyLDAvis page.

    Reads ``self.trigram_sentences_filepath``; writes
    ``self.trigram_dictionary_filepath``, ``self.trigram_bow_filepath``,
    ``self.lda_model_filepath`` and ``self.LDAvis_html_filepath``.
    """
    from gensim.corpora import Dictionary, MmCorpus
    from gensim.models.ldamulticore import LdaMulticore
    import pyLDAvis
    import pyLDAvis.gensim
    import warnings
    import _pickle as pickle

    sentences_path = self.trigram_sentences_filepath

    trigram_dictionary = Dictionary(LineSentence(sentences_path))
    # Extreme-frequency filtering (no_below=10, no_above=0.4) is disabled.
    trigram_dictionary.compactify()
    trigram_dictionary.save(self.trigram_dictionary_filepath)

    # Lazily convert every sentence of the file to bag-of-words.
    bow_stream = (trigram_dictionary.doc2bow(sentence)
                  for sentence in LineSentence(sentences_path))
    MmCorpus.serialize(self.trigram_bow_filepath, bow_stream)
    trigram_bow_corpus = MmCorpus(self.trigram_bow_filepath)

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        lda = LdaMulticore(trigram_bow_corpus,
                           num_topics=3,
                           id2word=trigram_dictionary,
                           workers=3)
    lda.save(self.lda_model_filepath)

    # Round-trip through disk, then query each of the three topics
    # (the results are not used).
    lda = LdaMulticore.load(self.lda_model_filepath)
    for topic_id in (0, 1, 2):
        lda.show_topic(topic_id)

    LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus,
                                              trigram_dictionary)
    pyLDAvis.save_html(LDAvis_prepared, self.LDAvis_html_filepath)
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
import pyLDAvis
import pyLDAvis.gensim


def bow(filepath, d):
    """Yield the bag-of-words form (via ``d.doc2bow``) of each line in ``filepath``."""
    for tokens in LineSentence(filepath):
        yield d.doc2bow(tokens)


# Learn the vocabulary from real.txt, prune it and persist it.
real_sent = LineSentence('real.txt')
real_dict = Dictionary(real_sent)
real_dict.filter_extremes(no_below=5, no_above=0.2)
real_dict.compactify()
real_dict.save('real.dict')

# Reload the dictionary from disk and serialize the corpus with it.
real_dict = Dictionary.load('real.dict')
MmCorpus.serialize('real.mm', bow('real.txt', real_dict))
real_corpus = MmCorpus('real.mm')

# Train and save a 10-topic model.
real_lda = LdaMulticore(real_corpus,
                        num_topics=10,
                        id2word=real_dict,
                        workers=2)
real_lda.save('./real_lda_model')
#==============================================================================
# No need to run LDA every time; the model has been stored.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(res)
vocab = vectorizer.get_feature_names()

start_time = time.time()
model = LdaMulticore(
    matutils.Sparse2Corpus(X, documents_columns=False),
    num_topics=9,
    passes=10,
    chunksize=5000,
    # map each feature index to its vocabulary term
    id2word=dict(enumerate(vocab)),
    workers=7,
)
print("--- %s seconds ---" % (time.time() - start_time))

fname = '/Users/royyang/Desktop/trending_project/re_categorization_ls/LDA_9topics'
model.save(fname)

# Load a pretrained model
model = LdaModel.load(fname, mmap='r')
type(model)

#==============================================================================
# # Get all topics from training
# topic_number, number_of_aritcles, top_words
#==============================================================================


def get_topic(n):
    """Scan the topic distribution of document ``n`` for its highest probability.

    NOTE(review): only the part of this function visible here is
    reproduced; it ends without a return statement and may continue beyond
    this excerpt.
    """
    doc_lda = model[doc_list[n]]
    current_prob = 0
    for var in doc_lda:
        if var[1] > current_prob:
            current_prob = var[1]
# Training is a bit time consuming - make the condition below True
# if you want to train the LDA model yourself.
if False:
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        # workers sets the parallelism; the recommendation is the number of
        # physical cores minus one
        lda = LdaMulticore(trigram_bow_corpus,
                           num_topics=5,
                           id2word=trigram_dictionary,
                           workers=3)
    lda.save(lda_model_filepath)

# The finished LDA model is always read back from disk.
lda = LdaMulticore.load(lda_model_filepath)

explore_topic(topic_number=0)

# Hand-assigned labels for the five topics.
topic_names = dict([
    (0, 'looking_at_websites_for_info'),
    (1, 'doesnt_have_the_negative_exercise_effect'),
    (2, 'spend_time_looking_on_websites'),
    (3, 'games_and_information'),
    (4, 'bad_if_kids_spend_too_much_time'),
])

topic_names_filepath = os.path.join(intermediate_directory, 'topic_names.pkl')
vocab = vectorizer.get_feature_names()

# single LDA
topic_number = 15
start_time = time.time()
model = LdaMulticore(
    matutils.Sparse2Corpus(X, documents_columns=False),
    num_topics=topic_number,
    passes=10,
    chunksize=5000,
    # map each feature index to its vocabulary term
    id2word=dict(enumerate(vocab)),
    workers=7,
)
print("--- %s seconds ---" % (time.time() - start_time))

fname = folder_name + 'LDA' + str(topic_number) + 'topics'
model.save(fname)

# Load a pretrained model
model = LdaModel.load(fname, mmap='r')
type(model)

# perplexity
perplexity = model.log_perplexity(
    matutils.Sparse2Corpus(X, documents_columns=False),
    total_docs=None)

# batch LDA
# NOTE(review): the loop body appears to continue beyond this excerpt; only
# the visible statements are reproduced.
model_eval = []
for k in range(2, 21):
    topic_number = k
    start_time = time.time()
def LDA_Machine(lst_dict, handle_lst):
    """Clean every handle's tweets, fit one LDA model per handle, pickle the result.

    Pass 1 lemmatises each raw tweet with the module-level ``nlp`` pipeline,
    dropping whitespace, punctuation, stop words, digits and tokens of length
    one or two, and stores the non-empty cleaned tweets under
    ``'tokenized_tweets'`` in ``lst_dict`` (mutated in place).

    Pass 2 builds a gensim dictionary and a serialised bag-of-words corpus
    per remaining handle, trains an ``LdaMulticore`` model, saves it, and
    stores a ``Counter`` of the topic terms under ``'LDA'``.

    Side effects: writes ``<handle>.mm`` and ``lda_model_<handle>`` files
    under ``file_path_corpus``, prints progress, and finally hands the
    resulting structure to ``pickle_object``.

    Args:
        lst_dict: list whose i-th element is a dict keyed by
            ``handle_lst[i]``; each value must hold a ``'content'`` list of
            raw tweets.
        handle_lst: list of handles, positionally aligned with ``lst_dict``.

    Raises:
        AssertionError: if either argument is not a list.
    """
    assert isinstance(lst_dict, list), "Please enter a list of dictionary's"
    assert isinstance(handle_lst, list), "Please enter a list of handles"

    file_path_corpus = "/home/igabr/new-project-4/mm_corpus/"
    num_topics = 10  # used both for training and when reading topics back

    # ---- Pass 1: lemmatise and filter the tweets of every handle ----------
    for position, handle in enumerate(handle_lst):
        handle_tweets = lst_dict[position][handle]['content']
        if handle_tweets == []:
            # No tweets: leave 'tokenized_tweets' unset so the row is
            # dropped from the dataframe below.
            continue

        clean_tweet_list = []
        for raw_tweet in handle_tweets:
            lemmas = [
                str(token.lemma_)
                for token in nlp(raw_tweet)
                if not (token.is_space or token.is_punct or token.is_stop
                        or token.is_digit or len(token) in (1, 2))
            ]
            clean_tweet = " ".join(lemmas).strip()
            if clean_tweet != "":
                clean_tweet_list.append(clean_tweet)

        lst_dict[position][handle]['tokenized_tweets'] = clean_tweet_list
        print("{} tokenized_tweets inserted!".format(handle))
        print()

    # ---- Drop handles that ended up without tokenized tweets --------------
    master_df = make_df(lst_dict)
    to_remove = list(master_df[master_df['tokenized_tweets'].isnull()].index)
    # Build the lookup set once; the previous version rebuilt it for every
    # element of handle_lst.
    index_to_remove = frozenset(handle_lst.index(i) for i in to_remove)
    new_handle_list = [
        v for i, v in enumerate(handle_lst) if i not in index_to_remove
    ]

    master_df.dropna(subset=['tokenized_tweets'], inplace=True)
    master_df = filtration(master_df, "tokenized_tweets")
    clean_lst_dict = dataframe_to_dict(master_df)
    print()
    print("Cleaning of master dataframe complete!")

    # ---- Pass 2: one corpus and one LDA model per remaining handle --------
    for position, handle in enumerate(new_handle_list):
        try:
            list_of_tweets = clean_lst_dict[position][handle]['tokenized_tweets']
        except KeyError:
            continue

        gensim_format_tweets = [tweet.split() for tweet in list_of_tweets]

        gensim_dictionary = Dictionary(gensim_format_tweets)
        gensim_dictionary.filter_extremes(no_below=10, no_above=0.4)
        gensim_dictionary.compactify()  # remove gaps after words that were removed

        corpus_path = file_path_corpus + "{}.mm".format(handle)
        MmCorpus.serialize(
            corpus_path,
            bag_of_words_generator(gensim_format_tweets, gensim_dictionary))
        corpus = MmCorpus(corpus_path)  # loading the corpus from disk

        if corpus.num_terms == 0:
            # Every term was filtered out; there is nothing to model.
            continue

        # NOTE(review): workers=100 is kept from the original; confirm it
        # suits the core count of the machine this runs on.
        lda = LdaMulticore(corpus,
                           num_topics=num_topics,
                           id2word=gensim_dictionary,
                           passes=100,
                           workers=100)
        lda.save(file_path_corpus + "lda_model_{}".format(handle))
        print("LDA model for {} saved!".format(handle))

        # Count how often each term appears among the top 100 terms of the
        # topics, ignoring terms with zero weight.
        LDA_Counter = Counter()
        for topic_id in range(num_topics):
            LDA_Counter.update(
                term
                for term, frequency in lda.show_topic(topic_id, topn=100)
                if frequency != 0)

        clean_lst_dict[position][handle]['LDA'] = LDA_Counter
        print("Inserted LDA Counter into {} dictionary".format(handle))

    pickle_object(clean_lst_dict, "2nd_degree_connections_LDA_complete")
    print("Script Complete")
# Train one LDA model for the current (data, n_topics, random_state)
# configuration, save the model and its topics, and record the run time.
starttime = datetime.datetime.now()
print('dataset:', data, 'num_topics:', n_topics, 'random_state:', random_state)

data_dir = './%s_data' % data
dictionary = Dictionary.load(os.path.join(data_dir, 'ne_nedf_weighting.dict'))
# Substitute into the file name BEFORE joining, so a '%' inside data_dir
# cannot be misread as a format directive.
bow_news = load_model(
    os.path.join(data_dir, 'ne8_nedf_%s_weighting.bow' % (topn_concepts)))
dict_id2token = dict(dictionary.items())

lda = LdaMulticore(bow_news,
                   id2word=dict_id2token,
                   num_topics=n_topics,
                   passes=passes,
                   iterations=iterations,
                   eval_every=eval_every,
                   workers=workers,
                   random_state=random_state)

name = 'ne8_nedf_%s_topic%s_passes%s_iteration%s_random%s' % (
    topn_concepts, n_topics, passes, iterations, random_state)
result_dir = os.path.join(data_dir, name)
# exist_ok avoids the check-then-create race of exists() + mkdir().
os.makedirs(result_dir, exist_ok=True)
lda.save(os.path.join(result_dir, 'lda_model'))

# Dump the top 20 words of every topic as "word: weight" lines.
topics = lda.show_topics(num_topics=n_topics, num_words=20, log=False,
                         formatted=False)
with open(os.path.join(result_dir, 'topics.txt'), 'w', encoding='utf-8') as f:
    for topic in topics:
        f.write('topic ' + str(topic[0]) + ':\n')
        for t in topic[1]:
            f.write(t[0] + ': ' + str(t[1]) + '\n')
        f.write('\n')

endtime = datetime.datetime.now()
# timedelta.seconds discards whole days, under-reporting runs longer than
# 24 hours; total_seconds() covers the full span. int() keeps the whole-second
# value that was reported before.
duration = int((endtime - starttime).total_seconds())
duration_list.append(duration)
print('Totol running for ', duration, ' seconds.')
# Mean duration over the runs recorded so far.
print(sum(duration_list)/len(duration_list))
trigram_users_bow_corpus = MmCorpus(trigram_users_bow_file) lda_threads_model_file = "lda_threads_model" lda_users_model_file = "lda_users_model" for i in range(5, 50, 5): print("Starting to process with " + str(i) + " topics") t0 = time.time() with warnings.catch_warnings(): warnings.simplefilter('ignore') lda_threads = LdaMulticore(trigram_threads_bow_corpus, num_topics=i, id2word=trigram_dictionary, workers=4) t1 = time.time() lda_threads.save("models/" + lda_threads_model_file + str(i)) print("Time to generate lda_threads " + str(i) + " : " + str(t1 - t0)) ''' Starting to process with 5 topics Time to generate lda_threads 5 : 53.75977849960327 Starting to process with 10 topics Time to generate lda_threads 10 : 75.05263686180115 Starting to process with 15 topics Time to generate lda_threads 15 : 99.37945866584778 Starting to process with 20 topics Time to generate lda_threads 20 : 118.13127422332764 Starting to process with 25 topics Time to generate lda_threads 25 : 138.435448884964 Starting to process with 30 topics Time to generate lda_threads 30 : 166.0134561061859 Starting to process with 35 topics
def learn(corpus):
    """Train an LDA model on *corpus*, print its topics and save it.

    The id-to-word mapping is loaded from ``lda.dict``; the model is trained
    with ``NUM_TOPICS`` topics, a chunk size of 10000 and 5 passes, and is
    written to ``lda.gensim``. Nothing is returned.

    Args:
        corpus: the corpus handed to ``LdaMulticore``.
    """
    dictionary = Dictionary.load('lda.dict')
    lda = LdaMulticore(corpus=corpus,
                       id2word=dictionary,
                       num_topics=NUM_TOPICS,
                       chunksize=10000,
                       passes=5)
    for line in lda.print_topics(NUM_TOPICS):
        # Call form works on Python 2 and 3; the bare `print line` statement
        # is a SyntaxError on Python 3.
        print(line)
    lda.save('lda.gensim')
from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus
from gensim.models import TfidfModel, LdaModel
from gensim.utils import smart_open, simple_preprocess
from gensim.corpora.wikicorpus import _extract_pages, filter_wiki
from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore

# Train a 300-topic LDA model on a pre-built Wikipedia corpus and save it.
# All progress messages use the call form of print, which behaves the same
# on Python 2 and 3; the bare `print "..."` statements previously used here
# are SyntaxErrors on Python 3.

wiki_corpus = MmCorpus('Wiki_Corpus.mm')  # Loading the corpus
print(".... successfully loaded the corpus")

wiki_dict = Dictionary.load('WikiDictionary200k.dict')  # Loading the dictionary
print(".... successfully loaded the dictionary")

lda = LdaMulticore(corpus=wiki_corpus,
                   id2word=wiki_dict,
                   num_topics=300,
                   chunksize=10000,
                   passes=2)
print(".... successfully extracted the topics; saving the model")

lda.save('WikiLDA_300.lda')
print("finished ....")