from sklearn.decomposition import NMF, LatentDirichletAllocation


def generate_model(method, no_topics, word_embedding):
    if method == 'nmf':
        # Run NMF
        model = NMF(n_components=no_topics,
                    random_state=1,
                    alpha=.1,
                    l1_ratio=.5,
                    init='nndsvd',
                    verbose=1).fit(word_embedding)
    elif method == 'lda':
        # Run LDA (n_topics was renamed to n_components in scikit-learn 0.19)
        model = LatentDirichletAllocation(n_components=no_topics,
                                          max_iter=50,
                                          learning_method='online',
                                          learning_offset=50.,
                                          random_state=0,
                                          verbose=1,
                                          evaluate_every=1).fit(word_embedding)
        print("Log Likelihood: ", model.score(word_embedding))
        print("Perplexity: ", model.perplexity(word_embedding))
    return model
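
# Example usage (a minimal sketch, not from the original source: the toy corpus
# and topic count below are illustrative, and `generate_model` must be in scope):
#
#     from sklearn.feature_extraction.text import TfidfVectorizer
#
#     corpus = ["the cat sat on the mat", "dogs chase cats", "the dog barked"]
#     tfidf = TfidfVectorizer(stop_words="english").fit_transform(corpus)
#     nmf_model = generate_model('nmf', no_topics=2, word_embedding=tfidf)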

import copy
from collections import defaultdict

import gensim
import pandas as pd
from corextopic import corextopic
from pewtils import is_not_null
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


class TopicModel(object):
    """
    A wrapper around various topic modeling algorithms and libraries, intended to provide a standardized way to train \
    and apply models. When you initialize a ``TopicModel``, it will fit a vectorizer, and split the data into a train \
    and test set if ``holdout_pct`` is provided. For more information about the available implementations, refer to the \
    documentation for the ``fit()`` method below.

    :param df: A :py:class:`pandas.DataFrame`
    :param text_col: Name of the column containing text
    :type text_col: str
    :param method: The topic model implementation to use. Options are: sklearn_lda, sklearn_nmf, gensim_lda, \
    gensim_hdp, corex
    :type method: str
    :param num_topics: The number of topics to extract. Required for every method except ``gensim_hdp``.
    :type num_topics: int
    :param max_ngram_size: Maximum ngram size (2=bigrams, 3=trigrams, etc.)
    :type max_ngram_size: int
    :param holdout_pct: Proportion of the documents to hold out for goodness-of-fit scoring
    :type holdout_pct: float
    :param use_tfidf: Whether to use raw term counts or a TF-IDF representation
    :type use_tfidf: bool
    :param vec_kwargs: All remaining keyword arguments get passed to TfidfVectorizer or CountVectorizer

    Usage::

        from pewanalytics.text.topics import TopicModel

        import nltk
        import pandas as pd

        nltk.download("movie_reviews")
        reviews = [{"fileid": fileid, "text": nltk.corpus.movie_reviews.raw(fileid)}
                   for fileid in nltk.corpus.movie_reviews.fileids()]
        df = pd.DataFrame(reviews)

        >>> model = TopicModel(df, "text", "sklearn_nmf", num_topics=5, min_df=25, max_df=.5, use_tfidf=False)
        Initialized sklearn_nmf topic model with 3285 features
        1600 training documents, 400 testing documents

        >>> model.fit()

        >>> model.print_topics()
        0: bad, really, know, don, plot, people, scene, movies, action, scenes
        1: star, trek, star trek, effects, wars, star wars, special, special effects, movies, series
        2: jackie, films, chan, jackie chan, hong, master, drunken, action, tarantino, brown
        3: life, man, best, characters, new, love, world, little, does, great
        4: alien, series, aliens, characters, films, television, files, quite, mars, action

        >>> doc_topics = model.get_document_topics(df)

        >>> doc_topics
               topic_0   topic_1   topic_2   topic_3   topic_4
        0     0.723439  0.000000  0.000000  0.000000  0.000000
        1     0.289801  0.050055  0.000000  0.000000  0.000000
        2     0.375149  0.000000  0.030691  0.059088  0.143679
        3     0.152961  0.010386  0.000000  0.121412  0.015865
        4     0.294005  0.100426  0.000000  0.137630  0.051241
        ...        ...       ...       ...       ...       ...
        1995  0.480983  0.070431  0.135178  0.256951  0.000000
        1996  0.139986  0.000000  0.000000  0.107430  0.000000
        1997  0.141545  0.005990  0.081986  0.387859  0.057025
        1998  0.029228  0.023342  0.043713  0.280877  0.107551
        1999  0.044863  0.000000  0.000000  0.718677  0.000000
    """

    def __init__(
        self,
        df,
        text_col,
        method,
        num_topics=None,
        max_ngram_size=2,
        holdout_pct=0.25,
        use_tfidf=False,
        **vec_kwargs
    ):

        self.df = df
        self.text_col = text_col
        self.method = method
        self.num_topics = num_topics

        self.train_df = df.sample(int(round(len(df) * (1.0 - holdout_pct))))
        self.train_df = self.train_df.dropna(subset=[self.text_col])
        self.test_df = df[~df.index.isin(self.train_df.index)]
        self.test_df = self.test_df.dropna(subset=[self.text_col])

        if "stop_words" not in vec_kwargs:
            vec_kwargs["stop_words"] = "english"
        if use_tfidf:
            vec = TfidfVectorizer
        else:
            vec = CountVectorizer
        self.vectorizer = vec(
            ngram_range=(1, max_ngram_size), decode_error="ignore", **vec_kwargs
        )
        self.vectorizer = self.vectorizer.fit(self.train_df[self.text_col])
        self.ngrams = self.vectorizer.get_feature_names()

        if self.method in ["gensim_lda", "gensim_hdp"]:
            self.train_features = self.get_features(self.train_df, keep_sparse=True)
            self.test_features = self.get_features(self.test_df, keep_sparse=True)
            if self.method == "gensim_hdp":
                self.topic_ids = None
                if num_topics:
                    raise Exception(
                        "You cannot specify the number of topics for an HDP model"
                    )
        else:
            self.train_features = self.get_features(self.train_df)
            self.test_features = self.get_features(self.test_df)

        self.model = None

        print(
            "Initialized {} topic model with {} features".format(
                self.method, len(self.ngrams)
            )
        )
        try:
            print(
                "{} training documents, {} testing documents".format(
                    len(self.train_features), len(self.test_features)
                )
            )
        except TypeError:
            print(
                "{} training documents, {} testing documents".format(
                    self.train_features.shape[0], self.test_features.shape[0]
                )
            )

    def get_features(self, df, keep_sparse=False):
        """
        Uses the trained vectorizer to process a :py:class:`pandas.DataFrame` and return a feature matrix.

        :param df: The :py:class:`pandas.DataFrame` to vectorize (must have ``self.text_col`` in it)
        :param keep_sparse: Whether or not to keep the feature matrix in sparse format (default=False)
        :type keep_sparse: bool
        :return: A :py:class:`pandas.DataFrame` of features or a sparse matrix, depending on the value of \
        ``keep_sparse``
        """

        subset_df = df.dropna(subset=[self.text_col])
        features = self.vectorizer.transform(subset_df[self.text_col])
        if keep_sparse:
            return features
        else:
            return pd.DataFrame(
                features.todense(), columns=self.ngrams, index=subset_df.index
            )

    def get_fit_params(self, **kwargs):
        """
        Internal helper function to set defaults depending on the specified model.
        :param kwargs: Arguments passed to ``self.fit()``
        :return: Arguments to pass to the model
        """

        defaults = {
            "sklearn_lda": {
                "alpha": 1.0,
                "beta": 1.0,
                "learning_decay": 0.7,
                "learning_offset": 50,
                "learning_method": "online",
                "max_iter": 500,
                "batch_size": 128,
                "verbose": False,
            },
            "sklearn_nmf": {
                "alpha": 0.0,
                "l1_ratio": 0.5,
                "tol": 0.00001,
                "max_iter": 500,
                "shuffle": True,
            },
            "gensim_lda": {
                "chunksize": 1000,
                "passes": 10,
                "decay": 0.8,
                "offset": 1,
                "workers": 2,
                "alpha": None,
                "beta": "auto",
                "use_multicore": False,
            },
            "gensim_hdp": {
                "max_chunks": None,
                "max_time": None,
                "chunksize": 256,
                "kappa": 1.0,
                "tau": 64.0,
                "T": 150,
                "K": 15,
                "alpha": 1,
                "beta": 0.01,
                "gamma": 1,
                "scale": 1.0,
                "var_converge": 0.0001,
            },
            "corex": {"anchors": [], "anchor_strength": 3},
        }

        for k, v in kwargs.items():
            if k not in defaults[self.method].keys():
                raise Exception(
                    "Unknown keyword argument for method '{}': {}. Accepted parameters are: {}".format(
                        self.method, k, defaults[self.method].keys()
                    )
                )
        fit_params = copy.deepcopy(defaults[self.method])
        fit_params.update(kwargs)

        if self.method == "sklearn_lda":
            fit_params["verbose"] = int(fit_params["verbose"])
            if "alpha" in fit_params.keys():
                fit_params["doc_topic_prior"] = fit_params["alpha"] / float(
                    self.num_topics
                )
                del fit_params["alpha"]
            if "beta" in fit_params.keys():
                fit_params["topic_word_prior"] = fit_params["beta"] / float(
                    self.num_topics
                )
                del fit_params["beta"]

        if self.method == "gensim_lda":
            if not fit_params["alpha"]:
                if fit_params["use_multicore"]:
                    fit_params["alpha"] = "symmetric"
                else:
                    fit_params["alpha"] = "auto"

        if self.method in ["gensim_lda", "gensim_hdp"]:
            if "beta" in fit_params.keys():
                fit_params["eta"] = fit_params["beta"]
                del fit_params["beta"]

        return fit_params

    def fit(self, df=None, **kwargs):
        """
        Fits a model using the method specified when initializing the ``TopicModel``. Details on model-specific \
        parameters are below:

        **sklearn_lda**

        Fits a model using :py:class:`sklearn.decomposition.LatentDirichletAllocation`. For more information on \
        available parameters, please refer to the official documentation: \
        https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html

        :param df: The :py:class:`pandas.DataFrame` to train the model on (must contain ``self.text_col``)
        :param alpha: Represents document-topic density. When values are higher, documents will be comprised of more \
        topics; when values are lower, documents will be comprised primarily of only a few topics. This parameter is \
        used instead of the ``doc_topic_prior`` sklearn parameter, and will be passed along to sklearn using the \
        formula: ``doc_topic_prior = alpha / num_topics``
        :param beta: Represents topic-word density. When values are higher, topics will be comprised of more words; \
        when values are lower, only a few words will be loaded onto each topic. This parameter is used instead of the \
        ``topic_word_prior`` sklearn parameter, and will be passed along to sklearn using the formula: \
        ``topic_word_prior = beta / num_topics``
        :param learning_decay: See sklearn documentation.
        :param learning_offset: See sklearn documentation.
        :param learning_method: See sklearn documentation.
        :param max_iter: See sklearn documentation.
        :param batch_size: See sklearn documentation.
        :param verbose: See sklearn documentation.

        **sklearn_nmf**

        Fits a model using :py:class:`sklearn.decomposition.NMF`. \
        For more information on available parameters, \
        please refer to the official documentation: \
        https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html

        :param df: The :py:class:`pandas.DataFrame` to train the model on (must contain ``self.text_col``)
        :param alpha: See sklearn documentation.
        :param l1_ratio: See sklearn documentation.
        :param tol: See sklearn documentation.
        :param max_iter: See sklearn documentation.
        :param shuffle: See sklearn documentation.

        **gensim_lda**

        Fits an LDA model using :py:class:`gensim.models.LdaModel` or \
        :py:class:`gensim.models.ldamulticore.LdaMulticore`. When ``use_multicore`` is set to True, the multicore \
        implementation will be used; otherwise, the standard LDA implementation will be used. \
        For more information on available parameters, please refer to the official documentation below:

            - use_multicore=True: https://radimrehurek.com/gensim/models/ldamulticore.html
            - use_multicore=False: https://radimrehurek.com/gensim/models/ldamodel.html

        :param df: The :py:class:`pandas.DataFrame` to train the model on (must contain ``self.text_col``)
        :param alpha: Represents document-topic density. When values are higher, documents will be comprised of \
        more topics; when values are lower, documents will be comprised primarily of only a few topics. Note that \
        gensim's accepted values differ from sklearn's; refer to the gensim documentation.
        :param beta: Represents topic-word density. When values are higher, topics will be comprised of more words; \
        when values are lower, only a few words will be loaded onto each topic. Note that gensim's accepted values \
        differ from sklearn's; refer to the gensim documentation. Gensim calls this parameter ``eta``; we renamed \
        it to be consistent with the sklearn implementations.
        :param chunksize: See gensim documentation.
        :param passes: See gensim documentation.
        :param decay: See gensim documentation.
        :param offset: See gensim documentation.
        :param workers: Number of cores to use (if using multicore)
        :param use_multicore: Whether or not to use multicore

        **gensim_hdp**

        Fits an HDP model using the gensim implementation. Unlike LDA and NMF, HDP attempts to determine the number \
        of topics automatically. In practice, it actually fits ``T`` topics (default is 150), but many are extremely \
        rare or occur in only a very small number of documents. To identify the topics that are actually useful, \
        this function passes the original :py:class:`pandas.DataFrame` through the trained model after fitting, and \
        identifies topics that compose at least 1% of a document in at least 1% of all documents in the corpus. In \
        other words, a topic is thrown out if fewer than 1% of documents contain it at a rate of at least 1%. \
        Subsequent use of the model will only make use of topics that meet this threshold. For more information on \
        available parameters, please refer to the official documentation: \
        https://radimrehurek.com/gensim/models/hdpmodel.html

        :param df: The :py:class:`pandas.DataFrame` to train the model on (must contain ``self.text_col``)
        :param max_chunks: See gensim documentation.
        :param max_time: See gensim documentation.
        :param chunksize: See gensim documentation.
        :param kappa: See gensim documentation.
        :param tau: See gensim documentation.
        :param T: See gensim documentation.
        :param K: See gensim documentation.
        :param alpha: See gensim documentation.
        :param beta: See gensim documentation.
        :param gamma: See gensim documentation.
        :param scale: See gensim documentation.
        :param var_converge: See gensim documentation.

        **corex**

        Fits a CorEx topic model. Anchors can be provided in the form of a list of lists, with each item \
        corresponding to a set of words to be used to seed a topic. For example:

        .. code-block:: python

            anchors=[
                ['cat', 'kitten'],
                ['dog', 'puppy']
            ]

        The list of anchors cannot be longer than the specified number of topics, and all of the words must exist \
        in the vocabulary. The ``anchor_strength`` parameter determines the degree to which the model is able to \
        override the suggested words based on the data; higher values are a way of "insisting" more strongly that \
        the model keep the provided words together in a single topic. For more information on available \
        parameters, please refer to the official documentation: https://github.com/gregversteeg/corex_topic

        :param df: The :py:class:`pandas.DataFrame` to train the model on (must contain ``self.text_col``)
        :param anchors: A list of lists that contain words that the model should try to group together into topics
        :param anchor_strength: The degree to which the provided anchors should be preserved regardless of the data
        """

        fit_params = self.get_fit_params(**kwargs)

        if self.method in ["sklearn_lda", "sklearn_nmf"]:
            if self.method == "sklearn_lda":
                self.model = LatentDirichletAllocation(
                    n_components=self.num_topics, **fit_params
                )
            if self.method == "sklearn_nmf":
                self.model = NMF(n_components=self.num_topics, **fit_params)
            if is_not_null(df):
                features = self.get_features(df)
            else:
                features = self.train_features
            self.model.fit(features)

        elif self.method in ["gensim_lda", "gensim_hdp"]:

            vocab_dict = dict([(i, s) for i, s in enumerate(self.ngrams)])
            if is_not_null(df):
                features = self.get_features(df, keep_sparse=True)
            else:
                features = self.train_features
            matrix = gensim.matutils.Sparse2Corpus(features, documents_columns=False)

            if self.method == "gensim_lda":
                fit_params["num_topics"] = self.num_topics
                fit_params["id2word"] = vocab_dict
                if fit_params["use_multicore"]:
                    model_class = gensim.models.ldamulticore.LdaMulticore
                else:
                    model_class = gensim.models.LdaModel
                    # LdaModel does not accept the multicore-only workers argument
                    del fit_params["workers"]
                del fit_params["use_multicore"]
                self.model = model_class(**fit_params)
                self.model.update(matrix)

            elif self.method == "gensim_hdp":
                model_class = gensim.models.hdpmodel.HdpModel
                self.model = model_class(matrix, vocab_dict, **fit_params)
                doc_topics = self.get_document_topics(self.df)
                # Keep topics that account for >= 1% of a document in >= 1% of documents
                topics = ((doc_topics >= 0.01).astype(int).mean() >= 0.01).astype(int)
                self.topic_ids = [
                    int(col.split("_")[-1])
                    for col in topics[topics == 1].index
                    if col.startswith("topic_")
                ]
                self.num_topics = len(self.topic_ids)

        elif self.method == "corex":
            if is_not_null(df):
                features = self.get_features(df, keep_sparse=True)
            else:
                features = self.get_features(self.train_df, keep_sparse=True)
            self.model = corextopic.Corex(n_hidden=self.num_topics)
            self.model.fit(features, words=self.ngrams, **fit_params)

    def get_score(self):
        """
        Returns goodness-of-fit scores for certain models, based on the holdout documents.
        .. note:: The following scores are available for the following methods:

            - perplexity: (sklearn_lda only) The model's perplexity
            - score: (sklearn_lda only) The model's log-likelihood score
            - total_correlation: (corex only) The model's total correlation score

        :return: A dictionary of goodness-of-fit scores
        :rtype: dict
        """

        if self.model:
            if self.method == "sklearn_lda":
                return {
                    "perplexity": self.model.perplexity(self.test_features),
                    "score": self.model.score(self.test_features),
                }
            elif self.method == "corex":
                return {"total_correlation": self.model.tc}
            else:
                return {}

    def get_document_topics(self, df, **kwargs):
        """
        Takes a :py:class:`pandas.DataFrame` and returns a document-topic :py:class:`pandas.DataFrame` \
        (rows=documents, columns=topics)

        :param df: The :py:class:`pandas.DataFrame` to process (must have ``self.text_col`` in it)
        :param min_probability: (gensim_lda with use_multicore=False only) Topics with a probability lower than \
        this threshold will be filtered out (default=0.0)
        :type min_probability: float
        :return: A document-topic matrix
        """

        if self.method in ["sklearn_lda", "sklearn_nmf"]:
            features = self.get_features(df)
            doc_topics = self.model.transform(features)
            topic_matrix = pd.DataFrame(
                doc_topics,
                columns=["topic_{}".format(i) for i in range(0, self.num_topics)],
                index=features.index,
            )
            return topic_matrix

        elif self.method in ["gensim_lda", "gensim_hdp"]:
            features = self.get_features(df, keep_sparse=True)
            matrix = gensim.matutils.Sparse2Corpus(features, documents_columns=False)
            rows = []
            for index, bow in zip(df.dropna(subset=[self.text_col]).index, matrix):
                if self.method == "gensim_lda":
                    if "min_probability" not in kwargs:
                        kwargs["min_probability"] = 0.0
                    try:
                        doc_topics = self.model.get_document_topics(bow, **kwargs)
                    except TypeError:
                        # Fall back if the model does not accept min_probability
                        del kwargs["min_probability"]
                        doc_topics = self.model.get_document_topics(bow, **kwargs)
                elif self.method == "gensim_hdp":
                    doc_topics = self.model[bow]
                row = {"index": index}
                for topic, weight in doc_topics:
                    if self.method == "gensim_lda" or (
                        not self.topic_ids or topic in self.topic_ids
                    ):
                        row["topic_{}".format(topic)] = weight
                rows.append(row)
            df = pd.DataFrame(rows).fillna(0)
            df = df.set_index(df["index"])
            del df["index"]
            return df

        elif self.method == "corex":
            features = self.get_features(df, keep_sparse=True)
            doc_topics = self.model.transform(features)
            topic_matrix = pd.DataFrame(
                doc_topics,
                columns=["topic_{}".format(i) for i in range(0, self.num_topics)],
                index=df.index,
            )
            return topic_matrix

    def get_topics(self, include_weights=False, top_n=10, **kwargs):
        """
        Returns a dictionary mapping each topic ID to a list of words or word-weight tuples.
        :param include_weights: Whether or not to include weights along with the ngrams
        :type include_weights: bool
        :param top_n: The number of words to include for each topic
        :type top_n: int
        :return: A dictionary mapping topic IDs to lists of ngrams or ngram-weight tuples
        """

        if self.method in ["sklearn_lda", "sklearn_nmf"]:
            topic_features = self.model.components_
            topics = defaultdict(list)
            for topic_id, topic in enumerate(topic_features):
                top_ngram_index = sorted(
                    [
                        (ngram_id, float(ngram_value))
                        for ngram_id, ngram_value in enumerate(topic)
                    ],
                    key=lambda x: x[1],
                    reverse=True,
                )
                topics[topic_id] = [
                    self.ngrams[ngram_id]
                    if not include_weights
                    else (self.ngrams[ngram_id], ngram_value)
                    for ngram_id, ngram_value in top_ngram_index[:top_n]
                ]
            return topics

        elif self.method in ["gensim_lda", "gensim_hdp"]:
            topics = defaultdict(list)
            if self.method == "gensim_hdp":
                topic_ids = self.topic_ids
            else:
                topic_ids = range(self.num_topics)
            for i in topic_ids:
                for ngram, weight in self.model.show_topic(i, topn=top_n):
                    if include_weights:
                        topics[i].append((ngram, weight))
                    else:
                        topics[i].append(ngram)
            return topics

        elif self.method == "corex":
            topics = defaultdict(list)
            for topic_id, topic_ngrams in enumerate(
                self.model.get_topics(n_words=top_n)
            ):
                for ngram, weight in topic_ngrams:
                    if include_weights:
                        topics[topic_id].append((ngram, weight))
                    else:
                        topics[topic_id].append(ngram)
            return topics

    def print_topics(self, include_weights=False, top_n=10):
        """
        Prints the top words for each topic from a trained model.

        :param include_weights: Whether or not to include weights along with the ngrams
        :type include_weights: bool
        :param top_n: The number of words to include for each topic
        :type top_n: int
        """

        for i, topic in self.get_topics(
            include_weights=include_weights, top_n=top_n
        ).items():
            print("{}: {}".format(i, ", ".join(topic)))
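
# Example usage (a minimal sketch, not from the original source; it assumes the
# movie-review DataFrame ``df`` built in the class docstring above, that the
# corextopic package is installed, and that the anchor words below survive the
# vectorizer's vocabulary filters):
#
#     model = TopicModel(df, "text", "corex", num_topics=5, min_df=25, max_df=0.5)
#     model.fit(anchors=[["star", "wars"], ["jackie", "chan"]], anchor_strength=3)
#     model.print_topics()
#     print(model.get_score())  # {'total_correlation': ...}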

def do_nmf(run_id, no_processes=16):
    stat = RunStats.objects.get(run_id=run_id)
    qid = stat.query.id
    K = stat.K

    TopicTerm.objects.filter(run_id=run_id).delete()
    DocTopic.objects.filter(run_id=run_id).delete()
    Topic.objects.filter(run_id=run_id).delete()
    stat.term_set.clear()

    alpha = stat.alpha
    n_features = stat.max_features
    if n_features == 0:
        n_features = 100000000000
    limit = stat.limit
    ng = stat.ngram

    # if stat.method=="LD" and stat.lda_library!=RunStats.WARP:
    #     if stat.max_iter == 200:
    #         stat.max_iter = 10
    #     if stat.max_iter > 100:
    #         stat.max_iter = 90

    n_samples = stat.max_iter

    stat.process_id = os.getpid()
    stat.status = 1
    stat.save()

    if stat.fulltext:
        docs = Doc.objects.filter(query=qid, fulltext__iregex=r'\w')
    else:
        docs = Doc.objects.filter(query=qid, content__iregex=r'\w')

    # if we are limiting, probably for testing, then do that
    if limit > 0:
        docs = docs[:limit]

    print('\n###############################\
\n## Topic modeling (method: {}, library: {}) on query {} with {} documents \
and {} topics (run_id: {})\n'.format(stat.method, stat.lda_library, qid,
                                     docs.count(), K, run_id))

    # Get the docs into lists
    abstracts, docsizes, ids, citations = proc_docs(docs, stoplist,
                                                    stat.fulltext,
                                                    stat.citations)

    scaled_citations = 1 + RobustScaler(with_centering=False).fit_transform(
        np.array(citations).reshape(-1, 1))

    sentences = [get_sentence(x) for x in abstracts]
    w2v = gensim.models.Word2Vec(sentences)
    validation_measure = WithinTopicMeasure(ModelSimilarity(w2v))

    if stat.fancy_tokenization:
        ######################################
        ## A fancy tokenizer

        from nltk import wordpunct_tokenize
        from nltk import WordNetLemmatizer
        from nltk import sent_tokenize
        from nltk import pos_tag
        from nltk.corpus import stopwords as sw
        from nltk.corpus import wordnet as wn
        punct = set(string.punctuation)
        stopwords = set(sw.words('english'))

        if stat.extra_stopwords:
            stopwords = stopwords | set(stat.extra_stopwords)

        def lemmatize(token, tag):
            tag = {
                'N': wn.NOUN,
                'V': wn.VERB,
                'R': wn.ADV,
                'J': wn.ADJ
            }.get(tag[0], wn.NOUN)
            return WordNetLemmatizer().lemmatize(token, tag)

        kws = Doc.objects.filter(
            query=stat.query,
            kw__text__iregex=r'\w+[\-\ ]').values('kw__text').annotate(
                n=Count('pk')).filter(n__gt=len(abstracts) // 200).order_by('-n')

        kw_text = set([x['kw__text'].replace('-', ' ') for x in kws])
        kw_ws = set([x['kw__text'].replace('-', ' ').split()[0]
                     for x in kws]) - stopwords

        def fancy_tokenize(X):

            common_words = set([x.lower() for x in X.split()]) & kw_ws

            for w in list(common_words):
                w = w.replace('(', '').replace(')', '')
                wpat = r"({}\W*\w*)".format(w)
                wn = [
                    x.lower().replace('-', ' ')
                    for x in re.findall(wpat, X, re.IGNORECASE)
                ]
                kw_matches = set(wn) & kw_text
                if len(kw_matches) > 0:
                    for m in kw_matches:
                        insensitive_m = re.compile(m, re.IGNORECASE)
                        X = insensitive_m.sub(' ', X)
                        yield m.replace(" ", "-")

            for sent in sent_tokenize(X):
                for token, tag in pos_tag(wordpunct_tokenize(sent)):
                    token = token.lower().strip()
                    if token in stopwords:
                        continue
                    if all(char in punct for char in token):
                        continue
                    if len(token) < 3:
                        continue
                    if all(char in string.digits for char in token):
                        continue
                    lemma = lemmatize(token, tag)
                    yield lemma

        tokenizer = fancy_tokenize
    else:
        tokenizer = snowball_stemmer()

    #######################################

    #############################################
    # Use tf-idf features for NMF.
print("Extracting tf-idf features ...") tfidf_vectorizer = TfidfVectorizer(max_df=stat.max_df, min_df=stat.min_freq, max_features=n_features, ngram_range=(ng, ng), tokenizer=tokenizer, stop_words=stoplist) count_vectorizer = CountVectorizer(max_df=stat.max_df, min_df=stat.min_freq, max_features=n_features, ngram_range=(ng, ng), tokenizer=tokenizer, stop_words=stoplist) t0 = time() if stat.method == "NM": tfidf = tfidf_vectorizer.fit_transform(abstracts) vectorizer = tfidf_vectorizer else: tfidf = count_vectorizer.fit_transform(abstracts) vectorizer = count_vectorizer print("done in %0.3fs." % (time() - t0)) stat.tfidf_time = time() - t0 stat.save() if citations is not False: tfidf = tfidf.multiply(scaled_citations) del abstracts gc.collect() if stat.db: vocab = vectorizer.get_feature_names() vocab_ids = [] pool = Pool(processes=no_processes) vocab_ids.append(pool.map(partial(add_features, run_id=run_id), vocab)) pool.terminate() #del vocab vocab_ids = vocab_ids[0] ## Make some topics django.db.connections.close_all() topic_ids = db.add_topics(K, run_id) gc.collect() # Fit the NMF model print("Fitting the model with tf-idf features, " "n_samples=%d and n_features=%d..." % (n_samples, n_features)) t0 = time() if stat.method == "NM": model = NMF(n_components=K, random_state=1, alpha=alpha, l1_ratio=.1, verbose=True, init='nndsvd', max_iter=n_samples).fit(tfidf) dtm = csr_matrix(model.transform(tfidf)) components = csr_matrix(model.components_) else: if stat.lda_library == RunStats.LDA_LIB: model = lda.LDA( n_topics=K, alpha=stat.alpha, eta=stat.alpha, n_iter=stat.max_iter * 10, ).fit(tfidf) dtm = model.doc_topic_ components = csr_matrix(model.components_) elif stat.lda_library == RunStats.WARP: # Export warp lda try: warp_path = settings.WARP_LDA_PATH os.chdir(warp_path) except: print( "warplda is not installed, or its path is not defined in settings, exiting...." 
                return
            fname = wpu.export_warp_lda(ids, tfidf, vocab, run_id)
            # preformat
            os.system(f'./format -input {fname} -prefix {run_id} train')
            # Run warp lda
            runcmd = f'./warplda --prefix {run_id} --k {stat.K}'
            if stat.alpha:
                runcmd += f' -alpha {stat.alpha}'
            if stat.beta:
                runcmd += f' -beta {stat.beta}'
            else:
                stat.beta = 0.01  # default beta value
                stat.save()
            if stat.max_iter:
                runcmd += f' --niter {stat.max_iter}'
            runcmd += ' train.model'
            print("Running warplda.")
            os.system(runcmd)
            print("Finished running warplda, importing results.")

            warp_vocab = np.loadtxt(f'{run_id}.vocab', dtype=str)
            warp_translate = np.argsort(warp_vocab).argsort()

            # Import warp lda as matrices
            with open(f'{run_id}.model', 'r') as f:
                for i, l in enumerate(f):
                    if i == 0:
                        M = int(l.split()[0])
                        N = int(l.split()[1])
                        components = lil_matrix((N, M))
                    else:
                        largs = l.split('\t')[1].strip().split()
                        for la in largs:
                            wid = warp_translate[i - 1]
                            t, n = la.split(':')
                            components[int(t), wid] = int(n)

            components = components.todense()
            for k in range(components.shape[0]):
                components[k, :] = (components[k, :] + stat.beta) / (
                    components[k, :].sum() + stat.K * stat.beta)
            components = csr_matrix(components)

            dtm = lil_matrix((len(ids), N))
            with open(f'{run_id}.z.estimate', 'r') as f:
                for i, l in enumerate(f):
                    largs = l.split(' ', maxsplit=1)[1].strip().split()
                    for la in largs:
                        w, t = la.split(':')
                        dtm[i, int(t)] += 1

            theta = dtm.todense()
            for i in range(dtm.shape[0]):
                theta[i, :] = (theta[i, :] + stat.alpha) / (
                    theta[i, :].sum() + stat.K * stat.alpha)
            dtm = csr_matrix(theta)

        else:
            model = LDA(
                n_components=K,
                doc_topic_prior=stat.alpha,
                topic_word_prior=stat.beta,
                learning_method=stat.get_lda_learning_method_display().lower(),
                max_iter=stat.max_iter,
                n_jobs=2).fit(tfidf)
            dtm = csr_matrix(model.transform(tfidf))
            components = csr_matrix(model.components_)

    print("done in %0.3fs." % (time() - t0))
    stat.nmf_time = time() - t0

    if stat.db:
        ## Add topic terms
        print("Adding topicterms to db")
        t0 = time()
        ldalambda = find(components)
        topics = range(len(ldalambda[0]))
        tts = []
        pool = Pool(processes=no_processes)

        tts.append(
            pool.map(
                partial(db.f_lambda,
                        m=ldalambda,
                        v_ids=vocab_ids,
                        t_ids=topic_ids,
                        run_id=run_id), topics))

        pool.terminate()
        tts = flatten(tts)
        gc.collect()
        sys.stdout.flush()
        django.db.connections.close_all()
        TopicTerm.objects.bulk_create(tts)
        print("done in %0.3fs." % (time() - t0))
        stat.db_time = stat.db_time + time() - t0

        ## Add topic-docs
        print("Adding DocTopics")
        gamma = find(dtm)
        glength = len(gamma[0])

        chunk_size = 100000

        parallel_add = True

        all_dts = []

        make_t = 0
        add_t = 0

        t0 = time()

        ### Go through in chunks
        for i in range(glength // chunk_size + 1):
            dts = []
            values_list = []
            f = i * chunk_size
            l = (i + 1) * chunk_size
            if l > glength:
                l = glength
            docs = range(f, l)
            doc_batches = []
            for p in range(no_processes):
                doc_batches.append([x for x in docs if x % no_processes == p])
            pool = Pool(processes=no_processes)
            make_t0 = time()
            values_list.append(
                pool.map(
                    partial(db.f_gamma_batch,
                            gamma=gamma,
                            docsizes=docsizes,
                            docUTset=ids,
                            topic_ids=topic_ids,
                            run_id=run_id), doc_batches))
            # dts.append(pool.map(partial(f_gamma, gamma=gamma,
            #                             docsizes=docsizes, docUTset=ids,
            #                             topic_ids=topic_ids), doc_batches))
            pool.terminate()
            make_t += time() - make_t0
            print(make_t)
            django.db.connections.close_all()

            add_t0 = time()
            values_list = [item for sublist in values_list for item in sublist]
            pool = Pool(processes=no_processes)
            pool.map(insert_many, values_list)
            pool.terminate()
            add_t += time() - add_t0
            print(add_t)
            gc.collect()
            sys.stdout.flush()

        stat.db_time = stat.db_time + time() - t0
        print("done in %0.3fs." % (time() - t0))

    em = 0
    for i in range(K):
        if dtm[:, i].nnz == 0:
            em += 1

    stat.empty_topics = em
    if stat.method == "NM":
        stat.error = model.reconstruction_err_
        stat.errortype = "Frobenius"
    elif stat.method == "LD":
        if stat.lda_library == RunStats.LDA_LIB:
            stat.error = model.loglikelihood()
            stat.errortype = "Log likelihood"
            stat.iterations = model.n_iter
        elif stat.lda_library == RunStats.WARP:
            pass
        else:
            stat.error = model.perplexity(tfidf)
            stat.errortype = "Perplexity"
            stat.iterations = model.n_iter_

    stat.last_update = timezone.now()
    stat.status = 3
    stat.save()

    if stat.db:
        term_rankings = []
        topics = Topic.objects.filter(run_id=run_id)
        for topic in topics:
            term_ranking = list(
                Term.objects.filter(topicterm__topic=topic).order_by(
                    '-topicterm__score').values_list('title', flat=True)[:50])
            term_rankings.append(term_ranking)

        stat.coherence = validation_measure.evaluate_rankings(term_rankings)
        stat.save()

    if stat.db:
        management.call_command('update_run', run_id)
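
# Example invocation (a minimal sketch, not from the original source; it assumes
# a configured Django environment and an existing RunStats row whose run_id is
# passed in; the run_id value here is hypothetical):
#
#     do_nmf(run_id=42, no_processes=8)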

def run_tm(s_id,
           K,
           language="german",
           verbosity=1,
           method='NM',
           max_features=0,
           max_df=0.95,
           min_df=5,
           alpha=0.01,
           extra_stopwords=set(),
           top_chain_var=None,
           rng_seed=None,
           max_iter=200,
           **kwargs):

    if method in ['BD', 'BleiDTM'] and top_chain_var is None:
        top_chain_var = 0.005

    s = Search.objects.get(pk=s_id)

    stat = RunStats(psearch=s,
                    K=K,
                    min_freq=min_df,
                    max_df=max_df,
                    method=method.upper()[0:2],
                    max_features=max_features,
                    max_iter=max_iter,
                    alpha=alpha,
                    extra_stopwords=list(extra_stopwords),
                    top_chain_var=top_chain_var,
                    status=1,
                    language=language)
    stat.save()
    django.db.connections.close_all()

    if method in ['DT', 'dnmf']:
        print("Running dynamic NMF algorithm")
        run_dynamic_nmf(stat, **kwargs)
        return 0
    elif method in ['BD', 'BleiDTM']:
        print("Running Blei DTM algorithm")
        if rng_seed:
            stat.rng_seed = rng_seed
        else:
            stat.rng_seed = 1
        stat.save()
        run_blei_dtm(stat, **kwargs)
        return 0

    print("starting topic model for runstat with settings:")
    for field in stat._meta.fields:
        field_value = getattr(stat, field.name)
        if field_value:
            print("{}: {}".format(field.name, field_value))

    start_time = time.time()
    start_datetime = timezone.now()

    stat.status = 1  # 3 = finished
    stat.save()
    run_id = stat.run_id

    if s.search_object_type == 1:
        ps = Paragraph.objects.filter(search_matches=s)
        docs = ps.filter(text__iregex=r'\w')
        texts, docsizes, ids = process_texts(docs)
    elif s.search_object_type == 2:
        uts = Utterance.objects.filter(search_matches=s)
        texts, docsizes, ids = merge_utterance_paragraphs(uts)
    else:
        print("search object type invalid")
        return 1

    if stat.max_features == 0:
        n_features = 10000000
    else:
        n_features = stat.max_features

    if stat.language == "german":
        stemmer = SnowballStemmer("german")
        tokenizer = german_stemmer()
        stopword_list = [stemmer.stem(t) for t in stopwords.words("german")]
    elif stat.language == "english":
        stemmer = SnowballStemmer("english")
        stopword_list = [stemmer.stem(t) for t in stopwords.words("english")]
        tokenizer = snowball_stemmer()
    else:
        print("Language not recognized.")
        return 1

    if stat.extra_stopwords:
        stopword_list = list(set(stopword_list) | set(stat.extra_stopwords))

    if method in ["NM", "nmf"]:
        if verbosity > 0:
            print("creating term frequency-inverse document frequency "
                  "matrix ({})".format(time.time() - start_time))
        # get term frequency-inverse document frequency matrix (using log weighting)
        # and min/max document frequency (min_df, max_df)
        tfidf_vectorizer = TfidfVectorizer(max_df=stat.max_df,
                                           min_df=stat.min_freq,
                                           max_features=n_features,
                                           ngram_range=(1, stat.ngram),
                                           tokenizer=tokenizer,
                                           stop_words=stopword_list)
        tfidf = tfidf_vectorizer.fit_transform(texts)
        vectorizer = tfidf_vectorizer
        vocab = vectorizer.get_feature_names()

    elif method in ["LD", "lda"]:
        if verbosity > 0:
            print("creating term frequency matrix ({})".format(
                time.time() - start_time))
        # Use tf (raw term count) features for LDA.
        tf_vectorizer = CountVectorizer(max_df=stat.max_df,
                                        min_df=stat.min_freq,
                                        max_features=n_features,
                                        ngram_range=(1, stat.ngram),
                                        tokenizer=tokenizer,
                                        stop_words=stopword_list)
        tf = tf_vectorizer.fit_transform(texts)
        vectorizer = tf_vectorizer
        vocab = vectorizer.get_feature_names()
    else:
        print("method not implemented")
        return 1

    if verbosity > 0:
        print("save terms to db ({})".format(time.time() - start_time))

    parallelized = True
    if parallelized:
        vocab_ids = []
        # multiprocessing: add vocabulary as Term
        pool = Pool(processes=8)
        vocab_ids.append(pool.map(partial(db.add_features, run_id=run_id), vocab))
        pool.terminate()
        del vocab
        vocab_ids = vocab_ids[0]
    else:
        print("without multiprocessing for storing terms")
        # without multiprocessing
        objects = [Term(title=term_title) for term_title in vocab]
        # TODO: if some of the objects already exist, duplicates are created:
        # use uniqueness of field 'title'
        Term.objects.bulk_create(objects)
        runstats = RunStats.objects.get(run_id=run_id)
        runstats.term_set.add(*objects)
        runstats.save()

    ## Make some topics
    django.db.connections.close_all()
    topic_ids = db.add_topics(K, run_id)
    gc.collect()

    if verbosity > 1:
        v = True
    else:
        v = False

    if method in ["NM", "nmf"]:
        if verbosity > 0:
            print("running matrix factorization with NMF ({})".format(
                time.time() - start_time))
        # NMF = non-negative matrix factorization, initialized with Nonnegative
        # Double Singular Value Decomposition (nndsvd)
        model = NMF(n_components=K,
                    random_state=1,
                    alpha=stat.alpha,
                    l1_ratio=.1,
                    verbose=v,
                    init='nndsvd',
                    max_iter=stat.max_iter).fit(tfidf)
        print("Reconstruction error of nmf: {}".format(
            model.reconstruction_err_))
        stat.error = model.reconstruction_err_
        stat.errortype = "Frobenius"

        # document topic matrix
        dtm = csr_matrix(model.transform(tfidf))

    elif method in ["LD", "lda"]:
        if verbosity > 0:
            print("running Latent Dirichlet Allocation ({})".format(
                time.time() - start_time))
        model = LDA(
            n_components=K,
            # concentration parameter of the Dirichlet distribution of topics
            # in documents
            doc_topic_prior=stat.alpha,
            # concentration parameter of the Dirichlet distribution of words
            # in topics; if None, this defaults to 1/n
            topic_word_prior=stat.beta,
            max_iter=stat.max_iter,
            # using 'batch' instead could lead to memory problems
            learning_method='online',
            learning_offset=50.,
            # n_jobs=6
        ).partial_fit(tf)

        stat.error = model.perplexity(tf)
        stat.errortype = "Perplexity"

        dtm = csr_matrix(model.transform(tf))

    else:
        print("Method {} not available.".format(method))
        return 1

    # term topic matrix
    # find returns the indices and values of the nonzero elements of a matrix
    ldalambda = find(csr_matrix(model.components_))
    topics = range(len(ldalambda[0]))
    tts = []
    # multiprocessing: add TopicTerms and scores
    pool = Pool(processes=8)
    tts.append(
        pool.map(
            partial(db.f_lambda,
                    m=ldalambda,
                    v_ids=vocab_ids,
                    t_ids=topic_ids,
                    run_id=run_id), topics))
    pool.terminate()
    tts = flatten(tts)
    gc.collect()
    sys.stdout.flush()
    django.db.connections.close_all()
    TopicTerm.objects.bulk_create(tts)

    if verbosity > 0:
        print("saving document topic matrix to db ({})".format(
            time.time() - start_time))

    # document topic matrix
    gamma = find(dtm)
    glength = len(gamma[0])

    chunk_size = 100000
    no_cores = 16

    parallel_add = True
    all_dts = []
    make_t = 0
    add_t = 0

    ### Go through in chunks
    for i in range(glength // chunk_size + 1):
        values_list = []
        f = i * chunk_size
        l = (i + 1) * chunk_size
        if l > glength:
            l = glength
        docs = range(f, l)
        doc_batches = []
        for p in range(no_cores):
            doc_batches.append([x for x in docs if x % no_cores == p])
        pool = Pool(processes=no_cores)
        values_list.append(
            pool.map(
                partial(db.f_gamma_batch,
                        gamma=gamma,
                        docsizes=docsizes,
                        docUTset=ids,
                        topic_ids=topic_ids,
                        run_id=run_id), doc_batches))
        pool.terminate()
        django.db.connections.close_all()
        print("... created document topic matrix for saving iteration {}".format(i))

        values_list = [item for sublist in values_list for item in sublist]
        pool = Pool(processes=no_cores)
        if s.search_object_type == 1:
            pool.map(db.insert_many_pars, values_list)
        elif s.search_object_type == 2:
            pool.map(db.insert_many_utterances, values_list)
        pool.terminate()
        gc.collect()
        sys.stdout.flush()
        print("... saved document topic matrix iteration {}".format(i))

    stat.iterations = model.n_iter_
    stat.status = 3  # 3 = finished
    stat.last_update = timezone.now()
    stat.runtime = timezone.now() - start_datetime
    stat.save()
    update_topic_titles(run_id)
    update_topic_scores(run_id)

    if verbosity > 0:
        print("topic model run done ({})".format(time.time() - start_time))

    return 0
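
# Example invocation (a minimal sketch, not from the original source; it assumes
# a configured Django environment and an existing Search object; the pk and
# parameter values here are hypothetical):
#
#     run_tm(1, K=20, language="english", method='NM', max_df=0.9, min_df=10)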