def do_nmf(run_id, no_processes=16):
    """
    Fit a topic model for the given run (NMF or LDA, depending on the
    RunStats settings) and, if requested, write the topics, topic terms
    and document-topic scores back to the database.
    """
    stat = RunStats.objects.get(run_id=run_id)
    qid = stat.query.id
    K = stat.K

    # Clear any results left over from a previous attempt at this run
    TopicTerm.objects.filter(run_id=run_id).delete()
    DocTopic.objects.filter(run_id=run_id).delete()
    Topic.objects.filter(run_id=run_id).delete()
    stat.term_set.clear()

    alpha = stat.alpha
    n_features = stat.max_features
    if n_features == 0:
        n_features = 100000000000  # effectively no cap on the vocabulary size
    limit = stat.limit
    ng = stat.ngram

    # if stat.method=="LD" and stat.lda_library!=RunStats.WARP:
    #     if stat.max_iter == 200:
    #         stat.max_iter = 10
    #     if stat.max_iter > 100:
    #         stat.max_iter = 90

    n_samples = stat.max_iter

    stat.process_id = os.getpid()
    stat.status = 1
    stat.save()

    if stat.fulltext:
        docs = Doc.objects.filter(query=qid, fulltext__iregex=r'\w')
    else:
        docs = Doc.objects.filter(query=qid, content__iregex=r'\w')

    # if we are limiting, probably for testing, then do that
    if limit > 0:
        docs = docs[:limit]

    print('\n###############################'
          '\n## Topic modeling (method: {}, library: {}) on query {} '
          'with {} documents and {} topics (run_id: {})\n'.format(
              stat.method, stat.lda_library, qid, docs.count(), K, run_id))

    # Get the docs into lists
    abstracts, docsizes, ids, citations = proc_docs(
        docs, stoplist, stat.fulltext, stat.citations)

    # Scale citation counts so they can later be used to weight document vectors
    scaled_citations = 1 + RobustScaler(with_centering=False).fit_transform(
        np.array(citations).reshape(-1, 1))

    # Train a word2vec model on the documents; used later to score topic coherence
    sentences = [get_sentence(x) for x in abstracts]
    w2v = gensim.models.Word2Vec(sentences)
    validation_measure = WithinTopicMeasure(ModelSimilarity(w2v))

    if stat.fancy_tokenization:
        ######################################
        ## A fancy tokenizer

        from nltk import wordpunct_tokenize
        from nltk import WordNetLemmatizer
        from nltk import sent_tokenize
        from nltk import pos_tag
        from nltk.corpus import stopwords as sw
        from nltk.corpus import wordnet as wn

        punct = set(string.punctuation)
        stopwords = set(sw.words('english'))
        if stat.extra_stopwords:
            stopwords = stopwords | set(stat.extra_stopwords)

        def lemmatize(token, tag):
            tag = {
                'N': wn.NOUN,
                'V': wn.VERB,
                'R': wn.ADV,
                'J': wn.ADJ
            }.get(tag[0], wn.NOUN)
            return WordNetLemmatizer().lemmatize(token, tag)

        # Frequent multi-word keywords from the query's documents: these are
        # matched first and yielded as single hyphenated tokens
        kws = Doc.objects.filter(
            query=stat.query,
            kw__text__iregex=r'\w+[\-\ ]'
        ).values('kw__text').annotate(
            n=Count('pk')
        ).filter(n__gt=len(abstracts) // 200).order_by('-n')

        kw_text = set([x['kw__text'].replace('-', ' ') for x in kws])
        kw_ws = set(
            [x['kw__text'].replace('-', ' ').split()[0] for x in kws]
        ) - stopwords

        def fancy_tokenize(X):
            common_words = set([x.lower() for x in X.split()]) & kw_ws
            for w in list(common_words):
                w = w.replace('(', '').replace(')', '')
                wpat = r"({}\W*\w*)".format(w)
                w_matches = [
                    x.lower().replace('-', ' ')
                    for x in re.findall(wpat, X, re.IGNORECASE)
                ]
                kw_matches = set(w_matches) & kw_text
                if len(kw_matches) > 0:
                    for m in kw_matches:
                        insensitive_m = re.compile(m, re.IGNORECASE)
                        X = insensitive_m.sub(' ', X)
                        yield m.replace(" ", "-")

            for sent in sent_tokenize(X):
                for token, tag in pos_tag(wordpunct_tokenize(sent)):
                    token = token.lower().strip()
                    if token in stopwords:
                        continue
                    if all(char in punct for char in token):
                        continue
                    if len(token) < 3:
                        continue
                    if all(char in string.digits for char in token):
                        continue
                    lemma = lemmatize(token, tag)
                    yield lemma

        tokenizer = fancy_tokenize
    else:
        tokenizer = snowball_stemmer()

    #######################################

    #############################################
    # Use tf-idf features for NMF.
    print("Extracting tf-idf features ...")
    tfidf_vectorizer = TfidfVectorizer(max_df=stat.max_df,
                                       min_df=stat.min_freq,
                                       max_features=n_features,
                                       ngram_range=(ng, ng),
                                       tokenizer=tokenizer,
                                       stop_words=stoplist)

    count_vectorizer = CountVectorizer(max_df=stat.max_df,
                                       min_df=stat.min_freq,
                                       max_features=n_features,
                                       ngram_range=(ng, ng),
                                       tokenizer=tokenizer,
                                       stop_words=stoplist)

    t0 = time()
    if stat.method == "NM":
        # NMF works on tf-idf weighted vectors
        tfidf = tfidf_vectorizer.fit_transform(abstracts)
        vectorizer = tfidf_vectorizer
    else:
        # LDA works on raw term counts
        tfidf = count_vectorizer.fit_transform(abstracts)
        vectorizer = count_vectorizer
    print("done in %0.3fs." % (time() - t0))
    stat.tfidf_time = time() - t0
    stat.save()

    if citations is not False:
        tfidf = tfidf.multiply(scaled_citations)

    del abstracts
    gc.collect()

    if stat.db:
        vocab = vectorizer.get_feature_names()
        vocab_ids = []
        pool = Pool(processes=no_processes)
        vocab_ids.append(pool.map(partial(add_features, run_id=run_id), vocab))
        pool.terminate()
        #del vocab
        vocab_ids = vocab_ids[0]

        ## Make some topics
        django.db.connections.close_all()
        topic_ids = db.add_topics(K, run_id)
        gc.collect()

    # Fit the model
    print("Fitting the model with tf-idf features, "
          "n_samples=%d and n_features=%d..." % (n_samples, n_features))
    t0 = time()

    if stat.method == "NM":
        model = NMF(n_components=K,
                    random_state=1,
                    alpha=alpha,
                    l1_ratio=.1,
                    verbose=True,
                    init='nndsvd',
                    max_iter=n_samples).fit(tfidf)
        dtm = csr_matrix(model.transform(tfidf))
        components = csr_matrix(model.components_)
    else:
        if stat.lda_library == RunStats.LDA_LIB:
            model = lda.LDA(
                n_topics=K,
                alpha=stat.alpha,
                eta=stat.alpha,
                n_iter=stat.max_iter * 10,
            ).fit(tfidf)
            dtm = model.doc_topic_
            components = csr_matrix(model.components_)
        elif stat.lda_library == RunStats.WARP:
            # Export warp lda
            try:
                warp_path = settings.WARP_LDA_PATH
                os.chdir(warp_path)
            except (AttributeError, OSError):
                print("warplda is not installed, or its path is not defined "
                      "in settings, exiting....")
                return
            fname = wpu.export_warp_lda(ids, tfidf, vocab, run_id)
            # preformat
            os.system(f'./format -input {fname} -prefix {run_id} train')
            # Run warp lda
            runcmd = f'./warplda --prefix {run_id} --k {stat.K}'
            if stat.alpha:
                runcmd += f' -alpha {stat.alpha}'
            if stat.beta:
                runcmd += f' -beta {stat.beta}'
            else:
                stat.beta = 0.01  # default beta value
                stat.save()
            if stat.max_iter:
                runcmd += f' --niter {stat.max_iter}'
            runcmd += ' train.model'
            print("Running warplda.")
            os.system(runcmd)
            print("Finished running warplda, importing results.")

            # warplda sorts the vocabulary, so map its word ids back to ours
            warp_vocab = np.loadtxt(f'{run_id}.vocab', dtype=str)
            warp_translate = np.argsort(warp_vocab).argsort()

            # Import warp lda as matrices
            with open(f'{run_id}.model', 'r') as f:
                for i, l in enumerate(f):
                    if i == 0:
                        M = int(l.split()[0])
                        N = int(l.split()[1])
                        components = lil_matrix((N, M))
                    else:
                        largs = l.split('\t')[1].strip().split()
                        for la in largs:
                            wid = warp_translate[i - 1]
                            t, n = la.split(':')
                            components[int(t), wid] = int(n)

            # Smooth and normalise the topic-word counts into probabilities
            components = components.todense()
            for k in range(components.shape[0]):
                components[k, :] = (components[k, :] + stat.beta) / (
                    components[k, :].sum() + stat.K * stat.beta)
            components = csr_matrix(components)

            # Rebuild the document-topic matrix from the topic assignments
            dtm = lil_matrix((len(ids), N))
            with open(f'{run_id}.z.estimate', 'r') as f:
                for i, l in enumerate(f):
                    largs = l.split(' ', maxsplit=1)[1].strip().split()
                    for la in largs:
                        w, t = la.split(':')
                        dtm[i, int(t)] += 1

            # Smooth and normalise the document-topic counts into probabilities
            theta = dtm.todense()
            for i in range(dtm.shape[0]):
                theta[i, :] = (theta[i, :] + stat.alpha) / (
                    theta[i, :].sum() + stat.K * stat.alpha)
            dtm = csr_matrix(theta)
        else:
            model = LDA(
                n_components=K,
                doc_topic_prior=stat.alpha,
                topic_word_prior=stat.beta,
                learning_method=stat.get_lda_learning_method_display().lower(),
                max_iter=stat.max_iter,
                n_jobs=2
            ).fit(tfidf)
            dtm = csr_matrix(model.transform(tfidf))
            components = csr_matrix(model.components_)

    print("done in %0.3fs." % (time() - t0))
    stat.nmf_time = time() - t0

    if stat.db:
        ## Add topic terms
        print("Adding topicterms to db")
        t0 = time()
        ldalambda = find(components)
        topics = range(len(ldalambda[0]))
        tts = []
        pool = Pool(processes=no_processes)

        tts.append(
            pool.map(
                partial(db.f_lambda,
                        m=ldalambda,
                        v_ids=vocab_ids,
                        t_ids=topic_ids,
                        run_id=run_id),
                topics))

        pool.terminate()
        tts = flatten(tts)
        gc.collect()
        sys.stdout.flush()
        django.db.connections.close_all()
        TopicTerm.objects.bulk_create(tts)
        print("done in %0.3fs." % (time() - t0))
        stat.db_time = stat.db_time + time() - t0

        ## Add topic-docs
        print("Adding DocTopics")
        gamma = find(dtm)
        glength = len(gamma[0])

        chunk_size = 100000

        parallel_add = True

        all_dts = []

        make_t = 0
        add_t = 0

        t0 = time()
        ### Go through in chunks
        for i in range(glength // chunk_size + 1):
            dts = []
            values_list = []
            f = i * chunk_size
            l = (i + 1) * chunk_size
            if l > glength:
                l = glength
            docs = range(f, l)
            doc_batches = []
            for p in range(no_processes):
                doc_batches.append([x for x in docs if x % no_processes == p])
            pool = Pool(processes=no_processes)
            make_t0 = time()
            values_list.append(
                pool.map(
                    partial(db.f_gamma_batch,
                            gamma=gamma,
                            docsizes=docsizes,
                            docUTset=ids,
                            topic_ids=topic_ids,
                            run_id=run_id),
                    doc_batches))
            #dts.append(pool.map(partial(f_gamma, gamma=gamma,
            #                docsizes=docsizes,docUTset=ids,topic_ids=topic_ids),doc_batches))
            pool.terminate()
            make_t += time() - make_t0
            print(make_t)
            django.db.connections.close_all()

            add_t0 = time()
            values_list = [item for sublist in values_list for item in sublist]
            pool = Pool(processes=no_processes)
            pool.map(insert_many, values_list)
            pool.terminate()
            add_t += time() - add_t0
            print(add_t)
            gc.collect()
            sys.stdout.flush()

        stat.db_time = stat.db_time + time() - t0
        print("done in %0.3fs." % (time() - t0))

    # Count topics to which no document was assigned
    em = 0
    for i in range(K):
        if dtm[:, i].nnz == 0:
            em += 1
    stat.empty_topics = em

    if stat.method == "NM":
        stat.error = model.reconstruction_err_
        stat.errortype = "Frobenius"
    elif stat.method == "LD":
        if stat.lda_library == RunStats.LDA_LIB:
            stat.error = model.loglikelihood()
            stat.errortype = "Log likelihood"
            stat.iterations = model.n_iter
        elif stat.lda_library == RunStats.WARP:
            pass  # warplda does not report an error metric here
        else:
            stat.error = model.perplexity(tfidf)
            stat.errortype = "Perplexity"
            stat.iterations = model.n_iter_

    stat.last_update = timezone.now()
    stat.status = 3
    stat.save()

    if stat.db:
        # Score topic coherence on the top 50 terms of each topic
        term_rankings = []
        topics = Topic.objects.filter(run_id=run_id)
        for topic in topics:
            term_ranking = list(
                Term.objects.filter(topicterm__topic=topic).order_by(
                    '-topicterm__score').values_list('title', flat=True)[:50])
            term_rankings.append(term_ranking)

        stat.coherence = validation_measure.evaluate_rankings(term_rankings)
        stat.save()

    if stat.db:
        management.call_command('update_run', run_id)
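
# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original pipeline). It assumes
# the surrounding module's Django setup and models are available, that a
# Query row with the given primary key already exists, and that any RunStats
# fields not set here have sensible model defaults. The parameter values are
# placeholders, not recommended settings; do_nmf() above reads them from the
# RunStats row it is given.
# ---------------------------------------------------------------------------
def example_nmf_run(query_id, n_topics=50):
    """Create a RunStats entry for an existing query and fit an NMF model."""
    stat = RunStats(
        query_id=query_id,   # primary key of an existing Query (placeholder)
        K=n_topics,          # number of topics
        method="NM",         # "NM" = NMF, "LD" = LDA (see do_nmf above)
        max_iter=200,        # used as max_iter for NMF / n_iter for LDA
        max_df=0.9,          # vectorizer document-frequency cutoff
        min_freq=2,          # vectorizer minimum document frequency
        ngram=1,             # unigram features
        alpha=0.1,           # regularisation (NMF) or prior (LDA)
        db=True,             # store topics, terms and doc-topic scores
    )
    stat.save()
    do_nmf(stat.run_id, no_processes=8)
    return stat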