import numpy as np
import tensorflow as tf
from sklearn import pipeline
from sklearn.feature_extraction.text import (CountVectorizer, HashingVectorizer,
                                              TfidfTransformer, TfidfVectorizer)
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import Normalizer
from tensorflow.keras.preprocessing import text


def build_tokenizer(dat, tokenizer_type="svm", **kwargs):
    """Build a text vectorizer for sparse ("svm") or sequence ("nn") models."""
    if tokenizer_type == "svm":
        if kwargs.get("tfidf"):
            # Fitted TF-IDF vectorizer over unigrams.
            tokenizer = TfidfVectorizer(ngram_range=(1, 1),
                                        max_features=kwargs["max_features"],
                                        analyzer=kwargs["analyzer"])
            tokenizer.fit(dat)
            return tokenizer
        elif kwargs.get("hashing"):
            # Stateless hashing vectorizer; no fitting step is needed.
            tokenizer = HashingVectorizer(ngram_range=(1, 1),
                                          n_features=kwargs["max_features"],
                                          analyzer=kwargs["analyzer"])
            return tokenizer

    elif tokenizer_type == "nn":
        # Integer-index tokenizer for neural sequence models.
        tokenizer = text.Tokenizer(num_words=kwargs["num_words"])
        tokenizer.fit_on_texts(dat)
        return tokenizer
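

# Hypothetical usage sketch (not from the original source): shows how
# build_tokenizer might be called for its "svm" and "nn" branches. The
# function name _demo_build_tokenizer and the toy corpus are assumptions.
def _demo_build_tokenizer():
    docs = ["the quick brown fox", "jumps over the lazy dog"]

    # Sparse TF-IDF vectorizer for an SVM-style model.
    svm_tok = build_tokenizer(docs, tokenizer_type="svm",
                              tfidf=True, hashing=False,
                              max_features=1000, analyzer="word")
    print(svm_tok.transform(docs).shape)

    # Integer-index tokenizer for a neural model.
    nn_tok = build_tokenizer(docs, tokenizer_type="nn", num_words=1000)
    print(nn_tok.texts_to_sequences(docs))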
# build_dataframe, text_col, digit_col, affix_punct_tokenize and
# remove_stopwords_and_articles are project-local helpers used below.
def data_docs_to_matrix(data_docs,
                        mode="count",
                        max_features=10000,
                        ngram_range=(2, 6),
                        simple_clean=False):
    """Encode documents according to `mode`; returns (encoded_docs, tokenizer, maxlen).

    maxlen is None for the matrix_* modes; modes not handled below
    (including the default "count") fall through and return None.
    """

    if simple_clean:
        # Light preprocessing before vectorization.
        data_docs = [remove_stopwords_and_articles(x) for x in data_docs]

    if mode == "matrix_word":
        tokenizer = TfidfVectorizer(stop_words="english",
                                    strip_accents="ascii",
                                    analyzer="word",
                                    max_features=max_features,
                                    ngram_range=ngram_range)
        tokenizer = tokenizer.fit(data_docs)
        encoded_docs = tokenizer.transform(data_docs)
        return (encoded_docs, tokenizer, None)

    if mode == "matrix_char":
        tokenizer = TfidfVectorizer(stop_words="english",
                                    strip_accents="ascii",
                                    analyzer="char",
                                    max_features=max_features,
                                    ngram_range=ngram_range)
        tokenizer = tokenizer.fit(data_docs)
        encoded_docs = tokenizer.transform(data_docs)
        return (encoded_docs, tokenizer, None)

    if mode == "matrix_pan":
        df_data = build_dataframe(data_docs)

        tfidf_unigram = TfidfVectorizer(ngram_range=(1, 1),
                                        sublinear_tf=True,
                                        min_df=10,
                                        max_df=0.8)
        tfidf_bigram = TfidfVectorizer(ngram_range=(2, 2),
                                       sublinear_tf=False,
                                       min_df=20,
                                       max_df=0.5)
        #tfidf_pos = TfidfVectorizer(ngram_range=(2, 2), sublinear_tf=True, min_df=0.1, max_df=0.6, lowercase=False)
        character_vectorizer = CountVectorizer(analyzer='char_wb',
                                               ngram_range=(4, 4),
                                               lowercase=False,
                                               min_df=4,
                                               max_df=0.8)
        tfidf_ngram = TfidfVectorizer(ngram_range=(1, 1),
                                      sublinear_tf=True,
                                      min_df=0.1,
                                      max_df=0.8)
        tfidf_transformer = TfidfTransformer(sublinear_tf=True)
        tfidf_affix_punct = TfidfVectorizer(ngram_range=(1, 1),
                                            sublinear_tf=True,
                                            min_df=0.1,
                                            max_df=0.8,
                                            tokenizer=affix_punct_tokenize)

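        # Each feature family below is a (name, pipeline) pair; the FeatureUnion
        # further down concatenates their outputs column-wise, scaled by the
        # per-family values in the `weights` dict.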
        features = [
            ('cst', digit_col()),
            ('unigram',
             pipeline.Pipeline([('s1', text_col(key='no_stopwords')),
                                ('tfidf_unigram', tfidf_unigram)])),
            ('bigram',
             pipeline.Pipeline([('s2', text_col(key='no_punctuation')),
                                ('tfidf_bigram', tfidf_bigram)])),
            #                   ('tag', pipeline.Pipeline([('s4', text_col(key='pos_tag')), ('tfidf_pos', tfidf_pos)])),
            ('character',
             pipeline.Pipeline([('s5', text_col(key='text_clean')),
                                ('character_vectorizer', character_vectorizer),
                                ('tfidf_character', tfidf_transformer)])),
            ('affixes',
             pipeline.Pipeline([('s5', text_col(key='affixes')),
                                ('tfidf_ngram', tfidf_ngram)])),
            ('affix_punct',
             pipeline.Pipeline([('s5', text_col(key='affix_punct')),
                                ('tfidf_affix_punct', tfidf_affix_punct)])),
        ]
        weights = {
            'cst': 0.3,
            'unigram': 0.8,
            'bigram': 0.1,
            #                 'tag': 0.2,
            'character': 0.8,
            'affixes': 0.4,
            'affix_punct': 0.1,
        }

        matrix = pipeline.Pipeline([('union',
                                     FeatureUnion(transformer_list=features,
                                                  transformer_weights=weights,
                                                  n_jobs=1)),
                                    ('scale', Normalizer())])

        tokenizer = matrix.fit(df_data)
        print(df_data.shape, df_data.columns)
        encoded_docs = tokenizer.transform(df_data)
        print('Matrix shape: ', encoded_docs.shape)

        return (encoded_docs, tokenizer, None)

    if mode == "index_char":

        tokenizer = tf.keras.preprocessing.text.Tokenizer(
            num_words=None,
            filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
            lower=True,
            split=' ',
            char_level=True,
            oov_token=None)

        tokenizer.fit_on_texts(data_docs)
        sequences = tokenizer.texts_to_sequences(data_docs)
        maxlen = np.max([len(x) for x in sequences])
        padded_docs = tf.keras.preprocessing.sequence.pad_sequences(
            sequences, maxlen=maxlen, padding='post')

        return (padded_docs, tokenizer, maxlen)

    if mode == "index_word":

        tokenizer = tf.keras.preprocessing.text.Tokenizer(
            num_words=None,
            filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
            lower=True,
            split=" ",
            char_level=False,
            oov_token=None)

        tokenizer.fit_on_texts(data_docs)

        sequences = tokenizer.texts_to_sequences(data_docs)
        maxlen = np.max([len(x) for x in sequences])
        padded_docs = tf.keras.preprocessing.sequence.pad_sequences(
            sequences, maxlen=maxlen, padding='post')

        return (padded_docs, tokenizer, maxlen)
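

# Hypothetical usage sketch (not from the original source): encodes a toy
# corpus with the word-level TF-IDF mode and with the character-index mode.
# The function name _demo_data_docs_to_matrix and the documents are assumptions.
def _demo_data_docs_to_matrix():
    docs = ["A first toy document.", "A second, slightly longer toy document."]

    # Sparse TF-IDF matrix over word n-grams.
    X, vec, _ = data_docs_to_matrix(docs, mode="matrix_word",
                                    max_features=100, ngram_range=(1, 2))
    print(X.shape)

    # Padded integer sequences at the character level.
    padded, tok, maxlen = data_docs_to_matrix(docs, mode="index_char")
    print(padded.shape, maxlen)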