from sklearn.decomposition import NMF, LatentDirichletAllocation


def generate_model(method, no_topics, word_embedding):

    if method == 'nmf':
        # Run NMF
        model = NMF(n_components=no_topics,
                    random_state=1,
                    alpha=.1,
                    l1_ratio=.5,
                    init='nndsvd',
                    verbose=1).fit(word_embedding)

    elif method == 'lda':
        # Run LDA
        model = LatentDirichletAllocation(n_components=no_topics,
                                          max_iter=50,
                                          learning_method='online',
                                          learning_offset=50.,
                                          random_state=0,
                                          verbose=1,
                                          evaluate_every=1).fit(word_embedding)

        print("Log Likelihood: ", model.score(word_embedding))
        print("Perplexity: ", model.perplexity(word_embedding))

    return model
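
# A hedged usage sketch: it assumes a scikit-learn version in which NMF still accepts the
# ``alpha``/``l1_ratio`` arguments used above, and it builds a small illustrative TF-IDF
# matrix to pass in as ``word_embedding``.
from sklearn.feature_extraction.text import TfidfVectorizer

example_docs = ["the cat sat on the mat", "dogs chase cats", "matrix factorization of text"]
tfidf_matrix = TfidfVectorizer(stop_words="english").fit_transform(example_docs)
nmf_model = generate_model('nmf', no_topics=2, word_embedding=tfidf_matrix)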
Example no. 2
class TopicModel(object):
    """
    A wrapper around various topic modeling algorithms and libraries, intended to provide a standardized way to train \
    and apply models. When you initialize a ``TopicModel``, it will fit a vectorizer, and split the data into a train \
    and test set if ``holdout_pct`` is provided. For more information about the available implementations, refer to the \
    documentation for the ``fit()`` method below.

    :param df: A :py:class:`pandas.DataFrame`
    :param text_col: Name of the column containing text
    :type text_col: str
    :param method: The topic model implementation to use. Options are: sklearn_lda, sklearn_nmf, gensim_lda, \
    gensim_hdp, corex
    :type method: str
    :param num_topics: The number of topics to extract. Required for every method except ``gensim_hdp``.
    :type num_topics: int
    :param max_ngram_size: Maximum ngram size (2=bigrams, 3=trigrams, etc)
    :type max_ngram_size: int
    :param holdout_pct: Proportion of the documents to hold out for goodness-of-fit scoring
    :type holdout_pct: float
    :param use_tfidf: Whether to use raw term counts (``CountVectorizer``) or a TF-IDF representation (``TfidfVectorizer``)
    :type use_tfidf: bool
    :param vec_kwargs: All remaining arguments get passed to TfidfVectorizer or CountVectorizer

    Usage::

        from pewanalytics.text.topics import TopicModel

        import nltk
        import pandas as pd
        nltk.download("movie_reviews")
        reviews = [{"fileid": fileid, "text": nltk.corpus.movie_reviews.raw(fileid)} for fileid in nltk.corpus.movie_reviews.fileids()]
        df = pd.DataFrame(reviews)

        >>> model = TopicModel(df, "text", "sklearn_nmf", num_topics=5, min_df=25, max_df=.5, use_tfidf=False)
        Initialized sklearn_nmf topic model with 3285 features
        1600 training documents, 400 testing documents

        >>> model.fit()

        >>> model.print_topics()
        0: bad, really, know, don, plot, people, scene, movies, action, scenes
        1: star, trek, star trek, effects, wars, star wars, special, special effects, movies, series
        2: jackie, films, chan, jackie chan, hong, master, drunken, action, tarantino, brown
        3: life, man, best, characters, new, love, world, little, does, great
        4: alien, series, aliens, characters, films, television, files, quite, mars, action

        >>> doc_topics = model.get_document_topics(df)

        >>> doc_topics
               topic_0   topic_1   topic_2   topic_3   topic_4
        0     0.723439  0.000000  0.000000  0.000000  0.000000
        1     0.289801  0.050055  0.000000  0.000000  0.000000
        2     0.375149  0.000000  0.030691  0.059088  0.143679
        3     0.152961  0.010386  0.000000  0.121412  0.015865
        4     0.294005  0.100426  0.000000  0.137630  0.051241
        ...        ...       ...       ...       ...       ...
        1995  0.480983  0.070431  0.135178  0.256951  0.000000
        1996  0.139986  0.000000  0.000000  0.107430  0.000000
        1997  0.141545  0.005990  0.081986  0.387859  0.057025
        1998  0.029228  0.023342  0.043713  0.280877  0.107551
        1999  0.044863  0.000000  0.000000  0.718677  0.000000

    """
    def __init__(self,
                 df,
                 text_col,
                 method,
                 num_topics=None,
                 max_ngram_size=2,
                 holdout_pct=0.25,
                 use_tfidf=False,
                 **vec_kwargs):

        self.df = df
        self.text_col = text_col
        self.method = method
        self.num_topics = num_topics
        self.train_df = df.sample(int(round(len(df) * (1.0 - holdout_pct))))
        self.train_df = self.train_df.dropna(subset=[self.text_col])
        self.test_df = df[~df.index.isin(self.train_df.index)]
        self.test_df = self.test_df.dropna(subset=[self.text_col])
        if "stop_words" not in vec_kwargs:
            vec_kwargs["stop_words"] = "english"

        if use_tfidf:
            vec = TfidfVectorizer
        else:
            vec = CountVectorizer
        self.vectorizer = vec(ngram_range=(1, max_ngram_size),
                              decode_error="ignore",
                              **vec_kwargs)

        self.vectorizer = self.vectorizer.fit(self.train_df[self.text_col])
        self.ngrams = self.vectorizer.get_feature_names()
        if self.method in ["gensim_lda", "gensim_hdp"]:
            self.train_features = self.get_features(self.train_df,
                                                    keep_sparse=True)
            self.test_features = self.get_features(self.test_df,
                                                   keep_sparse=True)
            if self.method == "gensim_hdp":
                self.topic_ids = None
                if num_topics:
                    raise Exception(
                        "You cannot specify the number of topics for an HDP model"
                    )
        else:
            self.train_features = self.get_features(self.train_df)
            self.test_features = self.get_features(self.test_df)

        self.model = None

        print("Initialized {} topic model with {} features".format(
            self.method, len(self.ngrams)))
        try:
            print("{} training documents, {} testing documents".format(
                len(self.train_features), len(self.test_features)))
        except TypeError:
            print("{} training documents, {} testing documents".format(
                self.train_features.shape[0], self.test_features.shape[0]))

    def get_features(self, df, keep_sparse=False):
        """
        Uses the trained vectorizer to process a :py:class:`pandas.DataFrame` and return a feature matrix.

        :param df: The :py:class:`pandas.DataFrame` to vectorize (must have ``self.text_col`` in it)
        :param keep_sparse: Whether or not to keep the feature matrix in sparse format (default=False)
        :type keep_sparse: bool
        :return: A :py:class:`pandas.DataFrame` of features or a sparse matrix, depending on the value of \
        ``keep_sparse``
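
        A hedged usage sketch (``model`` is an already-initialized ``TopicModel``)::

            dense = model.get_features(model.df)                     # pandas DataFrame
            sparse = model.get_features(model.df, keep_sparse=True)  # scipy sparse matrix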
        """

        subset_df = df.dropna(subset=[self.text_col])
        features = self.vectorizer.transform(subset_df[self.text_col])
        if keep_sparse:
            return features
        else:
            return pd.DataFrame(features.todense(),
                                columns=self.ngrams,
                                index=subset_df.index)

    def get_fit_params(self, **kwargs):
        """
        Internal helper function to set defaults depending on the specified model.

        :param kwargs: Arguments passed to ``self.fit()``
        :return: Arguments to pass to the model
        """

        defaults = {
            "sklearn_lda": {
                "alpha": 1.0,
                "beta": 1.0,
                "learning_decay": 0.7,
                "learning_offset": 50,
                "learning_method": "online",
                "max_iter": 500,
                "batch_size": 128,
                "verbose": False,
            },
            "sklearn_nmf": {
                "alpha": 0.0,
                "l1_ratio": 0.5,
                "tol": 0.00001,
                "max_iter": 500,
                "shuffle": True,
            },
            "gensim_lda": {
                "chunksize": 1000,
                "passes": 10,
                "decay": 0.8,
                "offset": 1,
                "workers": 2,
                "alpha": None,
                "beta": "auto",
                "use_multicore": False,
            },
            "gensim_hdp": {
                "max_chunks": None,
                "max_time": None,
                "chunksize": 256,
                "kappa": 1.0,
                "tau": 64.0,
                "T": 150,
                "K": 15,
                "alpha": 1,
                "beta": 0.01,
                "gamma": 1,
                "scale": 1.0,
                "var_converge": 0.0001,
            },
            "corex": {
                "anchors": [],
                "anchor_strength": 3
            },
        }

        for k, v in kwargs.items():
            if k not in defaults[self.method].keys():
                raise Exception(
                    "Unknown keyword argument for method '{}': {}. Accepted parameters are: {}"
                    .format(self.method, k, defaults[self.method].keys()))
        fit_params = copy.deepcopy(defaults[self.method])
        fit_params.update(kwargs)

        if self.method == "sklearn_lda":
            fit_params["verbose"] = int(fit_params["verbose"])
            if "alpha" in fit_params.keys():
                fit_params["doc_topic_prior"] = fit_params["alpha"] / float(
                    self.num_topics)
                del fit_params["alpha"]
            if "beta" in fit_params.keys():
                fit_params["topic_word_prior"] = fit_params["beta"] / float(
                    self.num_topics)
                del fit_params["beta"]

        if self.method == "gensim_lda":
            if not fit_params["alpha"]:
                if fit_params["use_multicore"]:
                    fit_params["alpha"] = "symmetric"
                else:
                    fit_params["alpha"] = "auto"

        if self.method in ["gensim_lda", "gensim_hdp"]:
            if "beta" in fit_params.keys():
                fit_params["eta"] = fit_params["beta"]
                del fit_params["beta"]

        return fit_params

    def fit(self, df=None, **kwargs):
        """
        Fits a model using the method specified when initializing the ``TopicModel``. Details on model-specific \
        parameters are below:

        **sklearn_lda**

        Fits a model using :py:class:`sklearn.decomposition.LatentDirichletAllocation`. For more information on \
        available parameters, please refer to the official documentation: \
        https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html

        :param df: The :py:class:`pandas.DataFrame` to train the model on (must contain ``self.text_col``)
        :param alpha: Represents document-topic density. When values are higher, documents will be comprised of more \
        topics; when values are lower, documents will be primarily comprised of only a few topics. This parameter is \
        used instead of the doc_topic_prior sklearn parameter, and will be passed along to sklearn using the formula: \
        ``doc_topic_prior = alpha / num_topics``
        :param beta: Represents topic-word density. When values are higher, topics will be comprised of more words; \
        when values are lower, only a few words will be loaded onto each topic. This parameter is used instead of the \
        topic_word_prior sklearn parameter, and will be passed along to sklearn using the formula: \
        ``topic_word_prior = beta / num_topics``.
        :param learning_decay: See sklearn documentation.
        :param learning_offset: See sklearn documentation.
        :param learning_method: See sklearn documentation.
        :param max_iter: See sklearn documentation.
        :param batch_size: See sklearn documentation.
        :param verbose: See sklearn documentation.
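
        A hedged sketch of a call (parameter values here are purely illustrative):

        .. code-block:: python

            model = TopicModel(df, "text", "sklearn_lda", num_topics=5, min_df=25)
            model.fit(alpha=0.5, beta=0.1, max_iter=100)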

        **sklearn_nmf**

        Fits a model using :py:class:`sklearn.decomposition.NMF`. For more information on available parameters, \
        please refer to the official documentation: \
        https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html

        :param df: The :py:class:`pandas.DataFrame` to train the model on (must contain ``self.text_col``)
        :param alpha: See sklearn documentation.
        :param l1_ratio: See sklearn documentation.
        :param tol: See sklearn documentation.
        :param max_iter: See sklearn documentation.
        :param shuffle: See sklearn documentation.

        **gensim_lda**

        Fits an LDA model using :py:class:`gensim.models.LdaModel` or \
        :py:class:`gensim.models.ldamulticore.LdaMulticore`. \
        When ``use_multicore`` is set to True, the multicore implementation will be used, otherwise the standard \
        LDA implementation will be used. \
        For more information on available parameters, please refer to the official documentation below:

            - use_multicore=True: https://radimrehurek.com/gensim/models/ldamulticore.html
            - use_multicore=False: https://radimrehurek.com/gensim/models/ldamodel.html

        :param df: The :py:class:`pandas.DataFrame` to train the model on (must contain ``self.text_col``)
        :param alpha: Represents document-topic density. When values are higher, documents will be comprised of \
        more topics; when values are lower, documents will be primarily comprised of only a few topics. Gensim \
        options are a bit different than sklearn though; refer to the documentation for the accepted values here.
        :param beta: Represents topic-word density. When values are higher, topics will be comprised of more words; \
        when values are lower, only a few words will be loaded onto each topic. Gensim options are a bit different \
        than sklearn though; refer to the documentation for the accepted values here. Gensim calls this parameter \
        ``eta``. We renamed it to be consistent with the sklearn implementations.
        :param chunksize: See gensim documentation.
        :param passes: See gensim documentation.
        :param decay: See gensim documentation.
        :param offset: See gensim documentation.
        :param workers: Number of cores to use (if using multicore)
        :param use_multicore: Whether or not to use multicore
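
        A hedged sketch (parameter values here are purely illustrative):

        .. code-block:: python

            model = TopicModel(df, "text", "gensim_lda", num_topics=5, min_df=25)
            model.fit(passes=5, alpha="auto", beta="auto")
            # or, with the multicore implementation:
            # model.fit(use_multicore=True, workers=4, alpha="symmetric")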

        **gensim_hdp**

        Fits an HDP model using the gensim implementation. Unlike LDA and NMF, HDP attempts to detect the correct \
        number of topics automatically. In practice, it fits ``T`` topics (default is 150), but many of them are \
        extremely rare or occur in only a handful of documents. To identify the topics that are actually useful, \
        this function passes the original :py:class:`pandas.DataFrame` through the trained model after fitting and \
        identifies topics that compose at least 1% of a document in at least 1% of all documents in the corpus. In \
        other words, a topic is discarded if the share of documents in which it accounts for at least 1% of the \
        document is itself below 1%. Subsequent use of the model will only make use of topics that meet this \
        threshold. For more information on available parameters, please refer to the official documentation: \
        https://radimrehurek.com/gensim/models/hdpmodel.html
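
        Illustratively, the filtering rule amounts to the following (a sketch; the actual filtering happens inside
        ``fit()``):

        .. code-block:: python

            doc_topics = model.get_document_topics(df)
            keep = (doc_topics >= 0.01).mean() >= 0.01  # one boolean per topic column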

        :param df: The :py:class:`pandas.DataFrame` to train the model on (must contain ``self.text_col``)
        :param max_chunks: See gensim documentation.
        :param max_time: See gensim documentation.
        :param chunksize: See gensim documentation.
        :param kappa: See gensim documentation.
        :param tau: See gensim documentation.
        :param T: See gensim documentation.
        :param K: See gensim documentation.
        :param alpha: See gensim documentation.
        :param beta: See gensim documentation.
        :param gamma: See gensim documentation.
        :param scale: See gensim documentation.
        :param var_converge: See gensim documentation.

        **corex**

        Fits a CorEx topic model. Anchors can be provided in the form of a list of lists, with each item
        corresponding to a set of words to be used to seed a topic. For example:

        .. code-block:: python

            anchors=[
                ['cat', 'kitten'],
                ['dog', 'puppy']
            ]

        The list of anchors cannot be longer than the specified number of topics, and all of the words must \
        exist in the vocabulary. The ``anchor_strength`` parameter determines the degree to which the model is able to \
        override the suggested words based on the data; providing higher values is a way of "insisting" more strongly \
        that the model keep the provided words together in a single topic. For more information on available \
        parameters, please refer to the official documentation: https://github.com/gregversteeg/corex_topic

        :param df: The :py:class:`pandas.DataFrame` to train the model on (must contain ``self.text_col``)
        :param anchors: A list of lists that contain words that the model should try to group together into topics
        :param anchor_strength: The degree to which the provided anchors should be preserved regardless of the data
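
        A hedged usage sketch (the anchor words are illustrative and must exist in the fitted vocabulary):

        .. code-block:: python

            model = TopicModel(df, "text", "corex", num_topics=5, min_df=25)
            model.fit(anchors=[["action", "scenes"], ["love", "life"]], anchor_strength=4)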

        """

        fit_params = self.get_fit_params(**kwargs)

        if self.method in ["sklearn_lda", "sklearn_nmf"]:

            if self.method == "sklearn_lda":
                self.model = LatentDirichletAllocation(
                    n_components=self.num_topics, **fit_params)
            if self.method == "sklearn_nmf":
                self.model = NMF(n_components=self.num_topics, **fit_params)

            if is_not_null(df):
                features = self.get_features(df)
            else:
                features = self.train_features
            self.model.fit(features)

        elif self.method in ["gensim_lda", "gensim_hdp"]:

            vocab_dict = dict([(i, s) for i, s in enumerate(self.ngrams)])
            if is_not_null(df):
                features = self.get_features(df, keep_sparse=True)
            else:
                features = self.train_features
            matrix = gensim.matutils.Sparse2Corpus(features,
                                                   documents_columns=False)

            if self.method == "gensim_lda":
                fit_params["num_topics"] = self.num_topics
                fit_params["id2word"] = vocab_dict
                if fit_params["use_multicore"]:
                    model_class = gensim.models.ldamulticore.LdaMulticore
                else:
                    model_class = gensim.models.LdaModel
                    del fit_params["workers"]
                del fit_params["use_multicore"]
                self.model = model_class(**fit_params)
                self.model.update(matrix)
            elif self.method == "gensim_hdp":
                model_class = gensim.models.hdpmodel.HdpModel
                self.model = model_class(matrix, vocab_dict, **fit_params)
                doc_topics = self.get_document_topics(self.df)
                topics = ((doc_topics >= 0.01).astype(int).mean() >=
                          0.01).astype(int)
                self.topic_ids = [
                    int(col.split("_")[-1])
                    for col in topics[topics == 1].index
                    if col.startswith("topic_")
                ]
                self.num_topics = len(self.topic_ids)

        elif self.method == "corex":

            if is_not_null(df):
                features = self.get_features(df, keep_sparse=True)
            else:
                features = self.get_features(self.train_df, keep_sparse=True)
            self.model = corextopic.Corex(n_hidden=self.num_topics)
            self.model.fit(features, words=self.ngrams, **fit_params)

    def get_score(self):
        """
        Returns goodness-of-fit scores for certain models, based on the holdout documents.

        .. note:: The following scores are available for the following methods:

                - perplexity: (sklearn_lda only) The model's perplexity
                - score: (sklearn_lda only) The model's log-likelihood score
                - total_correlation: (corex only) The model's total correlation score

        :return: A dictionary with goodness-of-fit scores
        :rtype: dict

        """

        if self.model:
            if self.method == "sklearn_lda":
                return {
                    "perplexity": self.model.perplexity(self.test_features),
                    "score": self.model.score(self.test_features),
                }
            elif self.method == "corex":
                return {"total_correlation": self.model.tc}
            else:
                return {}

    def get_document_topics(self, df, **kwargs):
        """
        Takes a :py:class:`pandas.DataFrame` and returns a document-topic :py:class:`pandas.DataFrame` \
        (rows=documents, columns=topics)

        :param df: The :py:class:`pandas.DataFrame` to process (must have ``self.text_col`` in it)
        :param min_probability: (gensim_lda use_multicore=False only) Topics with a probability lower than this \
        threshold will be filtered out (Default=0.0)
        :type min_probability: float
        :return: A document-topic matrix
        """

        if self.method in ["sklearn_lda", "sklearn_nmf"]:

            features = self.get_features(df)
            doc_topics = self.model.transform(features)
            topic_matrix = pd.DataFrame(
                doc_topics,
                columns=[
                    "topic_{}".format(i) for i in range(0, self.num_topics)
                ],
                index=features.index,
            )
            return topic_matrix

        elif self.method in ["gensim_lda", "gensim_hdp"]:

            features = self.get_features(df, keep_sparse=True)
            matrix = gensim.matutils.Sparse2Corpus(features,
                                                   documents_columns=False)
            rows = []
            for index, bow in zip(
                    df.dropna(subset=[self.text_col]).index, matrix):
                if self.method == "gensim_lda":
                    if "min_probability" not in kwargs:
                        kwargs["min_probability"] = 0.0
                    try:
                        doc_topics = self.model.get_document_topics(
                            bow, **kwargs)
                    except TypeError:
                        del kwargs["min_probability"]
                        doc_topics = self.model.get_document_topics(
                            bow, **kwargs)
                elif self.method == "gensim_hdp":
                    doc_topics = self.model[bow]
                row = {"index": index}
                for topic, weight in doc_topics:
                    if self.method == "gensim_lda" or (
                            not self.topic_ids or topic in self.topic_ids):
                        row["topic_{}".format(topic)] = weight
                rows.append(row)
            df = pd.DataFrame(rows).fillna(0)
            df = df.set_index(df["index"])
            del df["index"]
            return df

        elif self.method == "corex":

            features = self.get_features(df, keep_sparse=True)
            doc_topics = self.model.transform(features)
            topic_matrix = pd.DataFrame(
                doc_topics,
                columns=[
                    "topic_{}".format(i) for i in range(0, self.num_topics)
                ],
                index=df.index,
            )
            return topic_matrix

    def get_topics(self, include_weights=False, top_n=10, **kwargs):
        """
        Returns a dictionary with one entry per topic, where each value is a list of words or word-weight
        tuples.

        :param include_weights: Whether or not to include weights along with the ngrams
        :type include_weights: bool
        :param top_n: The number of words to include for each topic
        :type top_n: int
        :return: A dictionary mapping each topic ID to a list of ngrams or ngram-weight tuples
        """

        if self.method in ["sklearn_lda", "sklearn_nmf"]:

            topic_features = self.model.components_
            topics = defaultdict(list)
            for topic_id, topic in enumerate(topic_features):
                top_ngram_index = sorted(
                    [(ngram_id, float(ngram_value))
                     for ngram_id, ngram_value in enumerate(topic)],
                    key=lambda x: x[1],
                    reverse=True,
                )
                topics[topic_id] = [
                    self.ngrams[ngram_id] if not include_weights else
                    (self.ngrams[ngram_id], ngram_value)
                    for ngram_id, ngram_value in top_ngram_index[:top_n]
                ]
            return topics

        elif self.method in ["gensim_lda", "gensim_hdp"]:

            topics = defaultdict(list)
            if self.method == "gensim_hdp":
                topic_ids = self.topic_ids
            else:
                topic_ids = range(self.num_topics)
            for i in topic_ids:
                for ngram, weight in self.model.show_topic(i, topn=top_n):
                    if include_weights:
                        topics[i].append((ngram, weight))
                    else:
                        topics[i].append(ngram)
            return topics

        elif self.method == "corex":

            topics = defaultdict(list)
            for topic_id, topic_ngrams in enumerate(
                    self.model.get_topics(n_words=top_n)):
                for ngram, weight in topic_ngrams:
                    if include_weights:
                        topics[topic_id].append((ngram, weight))
                    else:
                        topics[topic_id].append(ngram)
            return topics

    def print_topics(self, include_weights=False, top_n=10):
        """
        Prints the top words for each topic from a trained model.

        :param include_weights: Whether or not to include weights along with the ngrams
        :type include_weights: bool
        :param top_n: The number of words to include for each topic
        :type top_n: int
        """

        for i, topic in self.get_topics(include_weights=include_weights,
                                        top_n=top_n).items():
            print("{}: {}".format(i, ", ".join(topic)))
Example no. 3
def do_nmf(run_id, no_processes=16):
    stat = RunStats.objects.get(run_id=run_id)
    qid = stat.query.id
    K = stat.K

    TopicTerm.objects.filter(run_id=run_id).delete()
    DocTopic.objects.filter(run_id=run_id).delete()
    Topic.objects.filter(run_id=run_id).delete()

    stat.term_set.clear()

    alpha = stat.alpha
    n_features = stat.max_features
    if n_features == 0:
        n_features = 100000000000
    limit = stat.limit
    ng = stat.ngram

    # if stat.method=="LD" and stat.lda_library!=RunStats.WARP:
    #     if stat.max_iter == 200:
    #         stat.max_iter = 10
    #     if stat.max_iter > 100:
    #         stat.max_iter = 90

    n_samples = stat.max_iter

    stat.process_id = os.getpid()
    stat.status = 1
    stat.save()

    if stat.fulltext:
        docs = Doc.objects.filter(query=qid, fulltext__iregex='\w')
    else:
        docs = Doc.objects.filter(query=qid, content__iregex='\w')

    # if we are limiting, probably for testing, then do that
    if limit > 0:
        docs = docs[:limit]

    print('\n###############################'
          '\n## Topic modeling (method: {}, library: {}) on query {} with {} documents '
          'and {} topics (run_id: {})\n'.format(stat.method, stat.lda_library, qid,
                                                docs.count(), K, run_id))

    # Get the docs into lists
    abstracts, docsizes, ids, citations = proc_docs(docs, stoplist,
                                                    stat.fulltext,
                                                    stat.citations)

    scaled_citations = 1 + RobustScaler(with_centering=False).fit_transform(
        np.array(citations).reshape(-1, 1))

    sentences = [get_sentence(x) for x in abstracts]
    w2v = gensim.models.Word2Vec(sentences)
    validation_measure = WithinTopicMeasure(ModelSimilarity(w2v))

    if stat.fancy_tokenization:
        ######################################
        ## A fancy tokenizer

        from nltk import wordpunct_tokenize
        from nltk import WordNetLemmatizer
        from nltk import sent_tokenize
        from nltk import pos_tag
        from nltk.corpus import stopwords as sw
        punct = set(string.punctuation)
        from nltk.corpus import wordnet as wn
        stopwords = set(sw.words('english'))

        if stat.extra_stopwords:
            stopwords = stopwords | set(stat.extra_stopwords)

        def lemmatize(token, tag):
            tag = {
                'N': wn.NOUN,
                'V': wn.VERB,
                'R': wn.ADV,
                'J': wn.ADJ
            }.get(tag[0], wn.NOUN)
            return WordNetLemmatizer().lemmatize(token, tag)

        kws = Doc.objects.filter(
            query=stat.query,
            kw__text__iregex='\w+[\-\ ]').values('kw__text').annotate(
                n=Count('pk')).filter(n__gt=len(abstracts) //
                                      200).order_by('-n')

        kw_text = set([x['kw__text'].replace('-', ' ') for x in kws])
        kw_ws = set([x['kw__text'].replace('-', ' ').split()[0]
                     for x in kws]) - stopwords

        def fancy_tokenize(X):

            common_words = set([x.lower() for x in X.split()]) & kw_ws
            for w in list(common_words):
                w = w.replace('(', '').replace(')', '')
                wpat = "({}\W*\w*)".format(w)
                # use a distinct name here to avoid shadowing the wordnet alias `wn` imported above
                w_variants = [
                    x.lower().replace('-', ' ')
                    for x in re.findall(wpat, X, re.IGNORECASE)
                ]
                kw_matches = set(w_variants) & kw_text
                if len(kw_matches) > 0:
                    for m in kw_matches:
                        insensitive_m = re.compile(m, re.IGNORECASE)
                        X = insensitive_m.sub(' ', X)
                        yield m.replace(" ", "-")

            for sent in sent_tokenize(X):
                for token, tag in pos_tag(wordpunct_tokenize(sent)):
                    token = token.lower().strip()
                    if token in stopwords:
                        continue
                    if all(char in punct for char in token):
                        continue
                    if len(token) < 3:
                        continue
                    if all(char in string.digits for char in token):
                        continue
                    lemma = lemmatize(token, tag)
                    yield lemma

        tokenizer = fancy_tokenize
    else:
        tokenizer = snowball_stemmer()

    #######################################

    #############################################
    # Use tf-idf features for NMF.
    print("Extracting tf-idf features ...")
    tfidf_vectorizer = TfidfVectorizer(max_df=stat.max_df,
                                       min_df=stat.min_freq,
                                       max_features=n_features,
                                       ngram_range=(ng, ng),
                                       tokenizer=tokenizer,
                                       stop_words=stoplist)

    count_vectorizer = CountVectorizer(max_df=stat.max_df,
                                       min_df=stat.min_freq,
                                       max_features=n_features,
                                       ngram_range=(ng, ng),
                                       tokenizer=tokenizer,
                                       stop_words=stoplist)

    t0 = time()
    if stat.method == "NM":
        tfidf = tfidf_vectorizer.fit_transform(abstracts)
        vectorizer = tfidf_vectorizer
    else:
        tfidf = count_vectorizer.fit_transform(abstracts)
        vectorizer = count_vectorizer
    print("done in %0.3fs." % (time() - t0))
    stat.tfidf_time = time() - t0
    stat.save()

    if citations is not False:
        tfidf = tfidf.multiply(scaled_citations)

    del abstracts
    gc.collect()

    if stat.db:
        vocab = vectorizer.get_feature_names()
        vocab_ids = []
        pool = Pool(processes=no_processes)
        vocab_ids.append(pool.map(partial(add_features, run_id=run_id), vocab))
        pool.terminate()
        #del vocab
        vocab_ids = vocab_ids[0]

        ## Make some topics
        django.db.connections.close_all()
        topic_ids = db.add_topics(K, run_id)
        gc.collect()

    # Fit the NMF model
    print("Fitting the model with tf-idf features, "
          "n_samples=%d and n_features=%d..." % (n_samples, n_features))
    t0 = time()
    if stat.method == "NM":
        model = NMF(n_components=K,
                    random_state=1,
                    alpha=alpha,
                    l1_ratio=.1,
                    verbose=True,
                    init='nndsvd',
                    max_iter=n_samples).fit(tfidf)
        dtm = csr_matrix(model.transform(tfidf))
        components = csr_matrix(model.components_)

    else:
        if stat.lda_library == RunStats.LDA_LIB:
            model = lda.LDA(
                n_topics=K,
                alpha=stat.alpha,
                eta=stat.alpha,
                n_iter=stat.max_iter * 10,
            ).fit(tfidf)
            dtm = csr_matrix(model.doc_topic_)  # keep sparse so the empty-topic check below can use .nnz
            components = csr_matrix(model.components_)
        elif stat.lda_library == RunStats.WARP:
            # Export warp lda
            try:
                warp_path = settings.WARP_LDA_PATH
                os.chdir(warp_path)
            except Exception:
                print(
                    "warplda is not installed, or its path is not defined in settings; exiting."
                )
                return
            fname = wpu.export_warp_lda(ids, tfidf, vocab, run_id)
            # preformat
            os.system(f'./format -input {fname} -prefix {run_id} train')
            # Run warp lda
            runcmd = f'./warplda --prefix {run_id} --k {stat.K}'
            if stat.alpha:
                runcmd += f' -alpha {stat.alpha}'
            if stat.beta:
                runcmd += f' -beta {stat.beta}'
            else:
                stat.beta = 0.01  # default beta value
                stat.save()
            if stat.max_iter:
                runcmd += f' --niter {stat.max_iter}'
            runcmd += ' train.model'
            print("Running warplda.")
            os.system(runcmd)
            print("Finished running warplda, importing results.")

            warp_vocab = np.loadtxt(f'{run_id}.vocab', dtype=str)
            warp_translate = np.argsort(warp_vocab).argsort()
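            # warp_vocab lists tokens in warplda's own order; argsort().argsort() gives each
            # token's rank in alphabetical order, which is used below to line warplda word ids
            # up with the vectorizer vocabulary (get_feature_names() returns terms in sorted order)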
            # Import warp lda as matrices
            with open(f'{run_id}.model', 'r') as f:
                for i, l in enumerate(f):
                    if i == 0:
                        M = int(l.split()[0])
                        N = int(l.split()[1])
                        components = lil_matrix((N, M))
                    else:
                        largs = l.split('\t')[1].strip().split()
                        for la in largs:
                            wid = warp_translate[i - 1]
                            t, n = la.split(':')
                            components[int(t), wid] = int(n)
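
            # The raw counts read above are smoothed into probabilities below, row by row:
            # topic-word rows become (count + beta) / (row_sum + K * beta), and the document-topic
            # rows further down are smoothed the same way with alpha.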

            components = components.todense()
            for k in range(components.shape[0]):
                components[k, :] = (components[k, :] + stat.beta) / (
                    components[k, :].sum() + stat.K * stat.beta)
            components = csr_matrix(components)

            dtm = lil_matrix((len(ids), N))
            with open(f'{run_id}.z.estimate', 'r') as f:
                for i, l in enumerate(f):
                    largs = l.split(' ', maxsplit=1)[1].strip().split()
                    for la in largs:
                        w, t = la.split(':')
                        dtm[i, int(t)] += 1

            theta = dtm.todense()
            for i in range(dtm.shape[0]):
                theta[i, :] = (theta[i, :] + stat.alpha) / (
                    theta[i, :].sum() + stat.K * stat.alpha)

            dtm = csr_matrix(theta)

        else:
            model = LDA(
                n_components=K,
                doc_topic_prior=stat.alpha,
                topic_word_prior=stat.beta,
                learning_method=stat.get_lda_learning_method_display().lower(),
                max_iter=stat.max_iter,
                n_jobs=2).fit(tfidf)

            dtm = csr_matrix(model.transform(tfidf))
            components = csr_matrix(model.components_)

    print("done in %0.3fs." % (time() - t0))
    stat.nmf_time = time() - t0

    if stat.db:
        ## Add topics terms
        print("Adding topicterms to db")
        t0 = time()
        ldalambda = find(components)
        topics = range(len(ldalambda[0]))
        tts = []
        pool = Pool(processes=no_processes)

        tts.append(
            pool.map(
                partial(db.f_lambda,
                        m=ldalambda,
                        v_ids=vocab_ids,
                        t_ids=topic_ids,
                        run_id=run_id), topics))
        pool.terminate()
        tts = flatten(tts)
        gc.collect()
        sys.stdout.flush()
        django.db.connections.close_all()
        TopicTerm.objects.bulk_create(tts)
        print("done in %0.3fs." % (time() - t0))
        stat.db_time = stat.db_time + time() - t0

        ## Add topic-docs
        print("Adding DocTopics")
        gamma = find(dtm)
        glength = len(gamma[0])

        chunk_size = 100000

        parallel_add = True

        all_dts = []

        make_t = 0
        add_t = 0

        t0 = time()
        ### Go through in chunks
        for i in range(glength // chunk_size + 1):
            dts = []
            values_list = []
            f = i * chunk_size
            l = (i + 1) * chunk_size
            if l > glength:
                l = glength
            docs = range(f, l)
            doc_batches = []
            for p in range(no_processes):
                doc_batches.append([x for x in docs if x % no_processes == p])
            pool = Pool(processes=no_processes)
            make_t0 = time()
            values_list.append(
                pool.map(
                    partial(db.f_gamma_batch,
                            gamma=gamma,
                            docsizes=docsizes,
                            docUTset=ids,
                            topic_ids=topic_ids,
                            run_id=run_id), doc_batches))
            #dts.append(pool.map(partial(f_gamma, gamma=gamma,
            #                docsizes=docsizes,docUTset=ids,topic_ids=topic_ids),doc_batches))
            pool.terminate()
            make_t += time() - make_t0
            print(make_t)
            django.db.connections.close_all()

            add_t0 = time()
            values_list = [item for sublist in values_list for item in sublist]
            pool = Pool(processes=no_processes)
            pool.map(insert_many, values_list)
            pool.terminate()
            add_t += time() - add_t0
            print(add_t)
            gc.collect()
            sys.stdout.flush()

        stat.db_time = stat.db_time + time() - t0
        print("done in %0.3fs." % (time() - t0))

    em = 0
    for i in range(K):
        if dtm[:, i].nnz == 0:
            em += 1

    stat.empty_topics = em
    if stat.method == "NM":
        stat.error = model.reconstruction_err_
        stat.errortype = "Frobenius"
    elif stat.method == "LD":
        if stat.lda_library == RunStats.LDA_LIB:
            stat.error = model.loglikelihood()
            stat.errortype = "Log likelihood"
            stat.iterations = model.n_iter
        elif stat.lda_library == RunStats.WARP:
            pass
        else:
            stat.error = model.perplexity(tfidf)
            stat.errortype = "Perplexity"
            stat.iterations = model.n_iter_
    stat.last_update = timezone.now()
    stat.status = 3

    stat.save()

    if stat.db:
        term_rankings = []

        topics = Topic.objects.filter(run_id=run_id)

        for topic in topics:
            term_ranking = list(
                Term.objects.filter(topicterm__topic=topic).order_by(
                    '-topicterm__score').values_list('title', flat=True)[:50])
            term_rankings.append(term_ranking)

        stat.coherence = validation_measure.evaluate_rankings(term_rankings)
        stat.save()
        if stat.db:
            management.call_command('update_run', run_id)
Example no. 4
def run_tm(s_id,
           K,
           language="german",
           verbosity=1,
           method='NM',
           max_features=0,
           max_df=0.95,
           min_df=5,
           alpha=0.01,
           extra_stopwords=set(),
           top_chain_var=None,
           rng_seed=None,
           max_iter=200,
           **kwargs):

    if method in ['BD', 'BleiDTM'] and top_chain_var is None:
        top_chain_var = 0.005

    s = Search.objects.get(pk=s_id)
    stat = RunStats(psearch=s,
                    K=K,
                    min_freq=min_df,
                    max_df=max_df,
                    method=method.upper()[0:2],
                    max_features=max_features,
                    max_iter=max_iter,
                    alpha=alpha,
                    extra_stopwords=list(extra_stopwords),
                    top_chain_var=top_chain_var,
                    status=1,
                    language=language)
    stat.save()
    django.db.connections.close_all()

    if method in ['DT', 'dnmf']:
        print("Running dynamic NMF algorithm")
        run_dynamic_nmf(stat, **kwargs)
        return 0
    elif method in ['BD', 'BleiDTM']:
        print("Running Blei DTM algorithm")
        if rng_seed:
            stat.rng_seed = rng_seed
        else:
            stat.rng_seed = 1
        stat.save()
        run_blei_dtm(stat, **kwargs)
        return 0

    print("starting topic model for runstat with settings:")
    for field in stat._meta.fields:
        field_value = getattr(stat, field.name)
        if field_value:
            print("{}: {}".format(field.name, field_value))

    start_time = time.time()
    start_datetime = timezone.now()

    stat.status = 1  # 1 = running (3 = finished)

    stat.save()
    run_id = stat.run_id

    if s.search_object_type == 1:
        ps = Paragraph.objects.filter(search_matches=s)
        docs = ps.filter(text__iregex='\w')
        texts, docsizes, ids = process_texts(docs)

    elif s.search_object_type == 2:
        uts = Utterance.objects.filter(search_matches=s)
        texts, docsizes, ids = merge_utterance_paragraphs(uts)
    else:
        print("search object type invalid")
        return 1

    if stat.max_features == 0:
        n_features = 10000000
    else:
        n_features = stat.max_features

    if stat.language == "german":
        stemmer = SnowballStemmer("german")
        tokenizer = german_stemmer()
        stopword_list = [stemmer.stem(t) for t in stopwords.words("german")]

    elif stat.language == "english":
        stemmer = SnowballStemmer("english")
        stopword_list = [stemmer.stem(t) for t in stopwords.words("english")]
        tokenizer = snowball_stemmer()
    else:
        print("Language not recognized.")
        return 1

    if stat.extra_stopwords:
        stopword_list = list(set(stopword_list) | set(stat.extra_stopwords))

    if method in ["NM", "nmf"]:
        if verbosity > 0:
            print(
                "creating term frequency-inverse document frequency matrix ({})"
                .format(time.time() - start_time))
        # get term frequency-inverse document frequency matrix (using log weighting)
        # and min/max document frequency (min_df, max_df)
        tfidf_vectorizer = TfidfVectorizer(max_df=stat.max_df,
                                           min_df=stat.min_freq,
                                           max_features=n_features,
                                           ngram_range=(1, stat.ngram),
                                           tokenizer=tokenizer,
                                           stop_words=stopword_list)

        tfidf = tfidf_vectorizer.fit_transform(texts)
        vectorizer = tfidf_vectorizer
        vocab = vectorizer.get_feature_names()

    elif method in ["LD", "lda"]:
        if verbosity > 0:
            print("creating term frequency matrix ({})".format(time.time() -
                                                               start_time))
        #  Use tf (raw term count) features for LDA.
        tf_vectorizer = CountVectorizer(max_df=stat.max_df,
                                        min_df=stat.min_freq,
                                        max_features=n_features,
                                        ngram_range=(1, stat.ngram),
                                        tokenizer=tokenizer,
                                        stop_words=stopword_list)
        tf = tf_vectorizer.fit_transform(texts)
        vectorizer = tf_vectorizer
        vocab = vectorizer.get_feature_names()
    else:
        print("method not implemented")
        return 1

    if verbosity > 0:
        print("save terms to db ({})".format(time.time() - start_time))

    parallelized = True
    if parallelized:
        vocab_ids = []
        # multiprocessing: add vocabulary as Term
        pool = Pool(processes=8)
        vocab_ids.append(
            pool.map(partial(db.add_features, run_id=run_id), vocab))
        pool.terminate()
        del vocab
        vocab_ids = vocab_ids[0]

    else:
        print("without multiprocessing for storing terms")
        # without multiprocessing
        objects = [Term(title=term_title) for term_title in vocab]

        # TODO: if some of the objects already exist, duplicates are created: use uniqueness of field 'title'
        Term.objects.bulk_create(objects)
        runstats = RunStats.objects.get(run_id=run_id)
        runstats.term_set.add(*objects)
        runstats.save()

    ## Make some topics
    django.db.connections.close_all()
    topic_ids = db.add_topics(K, run_id)
    gc.collect()

    if verbosity > 1:
        v = True
    else:
        v = False

    if method in ["NM", "nmf"]:
        if verbosity > 0:
            print("running matrix factorization with NMF ({})".format(
                time.time() - start_time))
        # NMF = non-negative matrix factorization
        model = NMF(n_components=K,
                    random_state=1,
                    alpha=stat.alpha,
                    l1_ratio=.1,
                    verbose=v,
                    init='nndsvd',
                    max_iter=stat.max_iter).fit(tfidf)
        # initialization with Nonnegative Double Singular Value Decomposition (nndsvd)
        print("Reconstruction error of nmf: {}".format(
            model.reconstruction_err_))

        stat.error = model.reconstruction_err_
        stat.errortype = "Frobenius"

        # document topic matrix
        dtm = csr_matrix(model.transform(tfidf))

    elif method in ["LD", "lda"]:
        if verbosity > 0:
            print(
                "running Latent Dirichlet Allocation ({})".format(time.time() -
                                                                  start_time))
        model = LDA(
            n_components=K,
            # concentration parameter of the Dirichlet prior over topics in documents
            doc_topic_prior=stat.alpha,
            # concentration parameter of the Dirichlet prior over words in topics
            # (if None, sklearn defaults to 1 / n_components)
            topic_word_prior=stat.beta,
            max_iter=stat.max_iter,
            learning_method='online',  # using 'batch' instead could lead to memory problems
            learning_offset=50.,
            # n_jobs=6
        ).fit(tf)  # fit() (rather than partial_fit) so max_iter applies and n_iter_ is set for use below

        stat.error = model.perplexity(tf)
        stat.errortype = "Perplexity"

        dtm = csr_matrix(model.transform(tf))

    else:
        print("Method {} not available.".format(method))
        return 1

    # term topic matrix
    ldalambda = find(csr_matrix(model.components_))
    # find returns the indices and values of the nonzero elements of a matrix
    topics = range(len(ldalambda[0]))
    tts = []
    # multiprocessing: add TopicTerms and scores
    pool = Pool(processes=8)
    tts.append(
        pool.map(
            partial(db.f_lambda,
                    m=ldalambda,
                    v_ids=vocab_ids,
                    t_ids=topic_ids,
                    run_id=run_id), topics))
    pool.terminate()

    tts = flatten(tts)
    gc.collect()
    sys.stdout.flush()
    django.db.connections.close_all()
    TopicTerm.objects.bulk_create(tts)

    if verbosity > 0:
        print("saving document topic matrix to db ({})".format(time.time() -
                                                               start_time))

    #document topic matrix
    gamma = find(dtm)
    glength = len(gamma[0])

    chunk_size = 100000

    no_cores = 16
    parallel_add = True

    all_dts = []

    make_t = 0
    add_t = 0

    ### Go through in chunks
    for i in range(glength // chunk_size + 1):
        values_list = []
        f = i * chunk_size
        l = (i + 1) * chunk_size
        if l > glength:
            l = glength
        docs = range(f, l)
        doc_batches = []
        for p in range(no_cores):
            doc_batches.append([x for x in docs if x % no_cores == p])
        pool = Pool(processes=no_cores)
        values_list.append(
            pool.map(
                partial(db.f_gamma_batch,
                        gamma=gamma,
                        docsizes=docsizes,
                        docUTset=ids,
                        topic_ids=topic_ids,
                        run_id=run_id), doc_batches))
        pool.terminate()
        django.db.connections.close_all()
        print(
            "... created document topic matrix for saving iteration {}".format(
                i))

        values_list = [item for sublist in values_list for item in sublist]
        pool = Pool(processes=no_cores)
        if s.search_object_type == 1:
            pool.map(db.insert_many_pars, values_list)
        elif s.search_object_type == 2:
            pool.map(db.insert_many_utterances, values_list)
        pool.terminate()
        gc.collect()
        sys.stdout.flush()
        print("... saved document topic matrix iteration {}".format(i))

    stat.iterations = model.n_iter_
    stat.status = 3  # 3 = finished
    stat.last_update = timezone.now()
    stat.runtime = timezone.now() - start_datetime
    stat.save()
    update_topic_titles(run_id)
    update_topic_scores(run_id)

    if verbosity > 0:
        print("topic model run done ({})".format(time.time() - start_time))

    return 0