Example #1
def _base_topic_modelling(
    corpus: List[str],
    n_topics: int,
    decomposition,
    max_df: float = 0.95,
    min_df: int = 2,
    ngram: Tuple[int, int] = (1, 3),
    vectorizer: str = 'bow',
    stemming=sastrawi,
    cleaning: Callable = simple_textcleaning,
    stop_words: List[str] = None,
    **kwargs,
):
    if stemming is not None and not callable(stemming):
        raise ValueError('stemming must be a callable type or None')
    vectorizer = vectorizer.lower()
    if vectorizer not in ['tfidf', 'bow', 'skip-gram']:
        raise ValueError(
            "vectorizer must be in ['tfidf', 'bow', 'skip-gram']")
    if min_df < 1:
        raise ValueError('min_df must be bigger than 0')
    if not (0 < max_df <= 1):
        raise ValueError(
            'max_df must be bigger than 0 and less than or equal to 1')
    if len(corpus) < n_topics:
        raise ValueError(
            'length of corpus must be bigger than or equal to n_topics')

    if cleaning:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    if stemming:
        for i in range(len(corpus)):
            corpus[i] = stemming(corpus[i])
    if vectorizer == 'tfidf':
        Vectorizer = TfidfVectorizer
    elif vectorizer == 'bow':
        Vectorizer = CountVectorizer
    else:
        # `vectorizer` was validated above, so only 'skip-gram' is left
        Vectorizer = SkipGramVectorizer
    tf_vectorizer = Vectorizer(
        max_df=max_df,
        min_df=min_df,
        ngram_range=ngram,
        stop_words=stop_words,
        **kwargs,
    )
    tf = tf_vectorizer.fit_transform(corpus)
    # scikit-learn >= 1.2 removed `get_feature_names()`; use `get_feature_names_out()` there
    tf_features = tf_vectorizer.get_feature_names()
    compose = decomposition(n_topics).fit(tf)
    return _TOPIC(
        tf_features,
        compose,
        [classification_textcleaning(c) for c in corpus],
        compose.transform(tf),
        tf_vectorizer,
        tf,
    )
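
A minimal usage sketch for the helper above. The three-document corpus, the choice of LatentDirichletAllocation, and passing stemming=None / cleaning=None (to sidestep the sastrawi dependency) are illustrative assumptions; inside malaya this helper is reached through the public topic-model constructors rather than called directly.

from sklearn.decomposition import LatentDirichletAllocation

docs = [
    'kerajaan umum bajet baharu untuk rakyat',
    'ekonomi negara dijangka berkembang tahun depan',
    'pasukan bola sepak negara menang perlawanan',
]
# `decomposition` is instantiated as decomposition(n_topics), so any
# sklearn decomposer whose first argument is the component count fits
lda = _base_topic_modelling(
    docs,
    n_topics=2,
    decomposition=LatentDirichletAllocation,
    min_df=1,        # the default of 2 would empty this tiny corpus
    stemming=None,   # skip the sastrawi stemmer in this sketch
    cleaning=None,   # skip text cleaning as well
)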
Example #2
def _base_topic_modelling(
    corpus: List[str],
    n_topics: int,
    decomposition,
    max_df: float = 0.95,
    min_df: int = 2,
    ngram: Tuple[int, int] = (1, 3),
    vectorizer: str = 'bow',
    stemming=sastrawi,
    cleaning=simple_textcleaning,
    stop_words: List[str] = None,
    **kwargs,
):
    if not isinstance(stemming, Callable) and stemming is not None:
        raise ValueError('stemming must be a callable type or None')
    if not isinstance(cleaning, Callable) and cleaning is not None:
        raise ValueError('cleaning must be a callable type or None')

    if min_df < 1:
        raise ValueError('min_df must be bigger than 0')
    if not (0 < max_df <= 1):
        raise ValueError(
            'max_df must be bigger than 0 and less than or equal to 1')
    if len(corpus) < n_topics:
        raise ValueError(
            'length of corpus must be bigger than or equal to n_topics')

    if cleaning:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    if stemming:
        for i in range(len(corpus)):
            corpus[i] = stemming(corpus[i])

    Vectorizer = vectorizer_mapping.get(vectorizer)
    if not Vectorizer:
        raise ValueError(
            'vectorizer is not supported, please check supported vectorizers from malaya.topic_model.available_vectorizer()'
        )
    tf_vectorizer = Vectorizer(
        max_df=max_df,
        min_df=min_df,
        ngram_range=ngram,
        stop_words=stop_words,
        **kwargs,
    )
    tf = tf_vectorizer.fit_transform(corpus)
    # scikit-learn >= 1.2 removed `get_feature_names()`; use `get_feature_names_out()` there
    tf_features = tf_vectorizer.get_feature_names()
    compose = decomposition(n_topics).fit(tf)
    return TOPIC(
        tf_features,
        compose,
        [classification_textcleaning(c) for c in corpus],
        compose.transform(tf),
        tf_vectorizer,
        tf,
    )
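
Example #2 replaces Example #1's if/elif chain with a registry lookup. The real vectorizer_mapping is defined elsewhere in malaya; a dict shaped like the following (class names assumed from Example #1) is all the .get() call requires, and it keeps the supported-vectorizer list in one place.

vectorizer_mapping = {
    'tfidf': TfidfVectorizer,
    'bow': CountVectorizer,
    'skip-gram': SkipGramVectorizer,
}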
Example #3
    def stem(self, string: str):
        """
        Stem a string.

        Parameters
        ----------
        string : str

        Returns
        -------
        result : str
            stemmed string.
        """
        # clean the string, tokenize, and map tokens to encoder indices
        token_strings = classification_textcleaning(string, True).split()
        idx = stemmer_str_idx(token_strings, self._dicts['dictionary_from'])
        # run the seq2seq graph on a padded batch of index sequences
        predicted = self._sess.run(
            self._logits, feed_dict={self._x: pad_sentence_batch(idx, PAD)[0]})
        results = []
        for word in predicted:
            # map predicted ids back to characters, dropping the special
            # GO / PAD / EOS / UNK tokens
            results.append(''.join([
                self._dicts['rev_dictionary_to'][c] for c in word
                if c not in [GO, PAD, EOS, UNK]
            ]))
        return ' '.join(results)
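
A hedged usage sketch for the method above. The deep_model accessor name is an assumption based on how malaya historically exposed this seq2seq stemmer; the sample sentence is arbitrary Malay text.

import malaya

stemmer = malaya.stem.deep_model()  # assumed accessor for this class
print(stemmer.stem('saya suka makan ayam goreng'))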