def build_tokenizer(dat, tokenizer_type="svm", **kwargs):
    """Build a tokenizer for either sparse (SVM) or neural (NN) pipelines."""
    if tokenizer_type == "svm":
        if kwargs["tfidf"]:
            # TF-IDF features over word unigrams; fitted on the input corpus.
            tokenizer = TfidfVectorizer(ngram_range=(1, 1),
                                        max_features=kwargs["max_features"],
                                        analyzer=kwargs["analyzer"])
            tokenizer.fit(dat)
            return tokenizer
        elif kwargs["hashing"]:
            # Hashing vectorizer is stateless, so no fitting is required.
            tokenizer = HashingVectorizer(ngram_range=(1, 1),
                                          n_features=kwargs["max_features"],
                                          analyzer=kwargs["analyzer"])
            return tokenizer
    elif tokenizer_type == "nn":
        # Keras word-index tokenizer for neural models.
        tokenizer = text.Tokenizer(num_words=kwargs["num_words"])
        tokenizer.fit_on_texts(dat)
        return tokenizer
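# Usage sketch (illustrative only): the expected kwargs differ per branch.
# `docs` below is a hypothetical list of raw text strings, and the parameter
# values are assumptions, not values prescribed by this module.
#
#   docs = ["first example document ...", "second example document ..."]
#
#   svm_tok = build_tokenizer(docs, tokenizer_type="svm",
#                             tfidf=True, hashing=False,
#                             max_features=10000, analyzer="word")
#   X = svm_tok.transform(docs)            # sparse TF-IDF matrix
#
#   nn_tok = build_tokenizer(docs, tokenizer_type="nn", num_words=10000)
#   seqs = nn_tok.texts_to_sequences(docs)  # lists of word indices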
def data_docs_to_matrix(data_docs, mode="count", max_features=10000,
                        ngram_range=(2, 6), simple_clean=False):
    """Convert raw documents to a feature matrix ("matrix_*" modes) or to
    padded integer sequences ("index_*" modes), depending on `mode`."""
    if simple_clean:
        data_docs = [remove_stopwords_and_articles(x) for x in data_docs]

    if mode == "matrix_word":
        # Sparse TF-IDF matrix over word n-grams.
        tokenizer = TfidfVectorizer(stop_words="english",
                                    strip_accents="ascii",
                                    analyzer="word",
                                    max_features=max_features,
                                    ngram_range=ngram_range)
        tokenizer = tokenizer.fit(data_docs)
        encoded_docs = tokenizer.transform(data_docs)
        return (encoded_docs, tokenizer, None)

    if mode == "matrix_char":
        # Sparse TF-IDF matrix over character n-grams.
        tokenizer = TfidfVectorizer(stop_words="english",
                                    strip_accents="ascii",
                                    analyzer="char",
                                    max_features=max_features,
                                    ngram_range=ngram_range)
        tokenizer = tokenizer.fit(data_docs)
        encoded_docs = tokenizer.transform(data_docs)
        return (encoded_docs, tokenizer, None)

    if mode == "matrix_pan":
        # PAN-style stylometric feature union: word unigrams/bigrams, character
        # 4-grams, affix and affix-punctuation features, plus hand-crafted
        # numeric columns, each combined with its own weight.
        df_data = build_dataframe(data_docs)
        tfidf_unigram = TfidfVectorizer(ngram_range=(1, 1), sublinear_tf=True,
                                        min_df=10, max_df=0.8)
        tfidf_bigram = TfidfVectorizer(ngram_range=(2, 2), sublinear_tf=False,
                                       min_df=20, max_df=0.5)
        # tfidf_pos = TfidfVectorizer(ngram_range=(2, 2), sublinear_tf=True,
        #                             min_df=0.1, max_df=0.6, lowercase=False)
        character_vectorizer = CountVectorizer(analyzer='char_wb',
                                               ngram_range=(4, 4),
                                               lowercase=False,
                                               min_df=4, max_df=0.8)
        tfidf_ngram = TfidfVectorizer(ngram_range=(1, 1), sublinear_tf=True,
                                      min_df=0.1, max_df=0.8)
        tfidf_transformer = TfidfTransformer(sublinear_tf=True)
        tfidf_affix_punct = TfidfVectorizer(ngram_range=(1, 1), sublinear_tf=True,
                                            min_df=0.1, max_df=0.8,
                                            tokenizer=affix_punct_tokenize)
        features = [
            ('cst', digit_col()),
            ('unigram', pipeline.Pipeline([('s1', text_col(key='no_stopwords')),
                                           ('tfidf_unigram', tfidf_unigram)])),
            ('bigram', pipeline.Pipeline([('s2', text_col(key='no_punctuation')),
                                          ('tfidf_bigram', tfidf_bigram)])),
            # ('tag', pipeline.Pipeline([('s4', text_col(key='pos_tag')),
            #                            ('tfidf_pos', tfidf_pos)])),
            ('character', pipeline.Pipeline([('s5', text_col(key='text_clean')),
                                             ('character_vectorizer', character_vectorizer),
                                             ('tfidf_character', tfidf_transformer)])),
            ('affixes', pipeline.Pipeline([('s5', text_col(key='affixes')),
                                           ('tfidf_ngram', tfidf_ngram)])),
            ('affix_punct', pipeline.Pipeline([('s5', text_col(key='affix_punct')),
                                               ('tfidf_affix_punct', tfidf_affix_punct)])),
        ]
        weights = {
            'cst': 0.3,
            'unigram': 0.8,
            'bigram': 0.1,
            # 'tag': 0.2,
            'character': 0.8,
            'affixes': 0.4,
            'affix_punct': 0.1,
        }
        matrix = pipeline.Pipeline([('union', FeatureUnion(transformer_list=features,
                                                           transformer_weights=weights,
                                                           n_jobs=1)),
                                    ('scale', Normalizer())])
        tokenizer = matrix.fit(df_data)
        print(df_data.shape, df_data.columns)
        encoded_docs = tokenizer.transform(df_data)
        print('Matrix shape: ', encoded_docs.shape)
        return (encoded_docs, tokenizer, None)

    if mode == "index_char":
        # Character-level integer sequences, padded to the longest document.
        tokenizer = tf.keras.preprocessing.text.Tokenizer(
            num_words=None,
            filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~',
            lower=True, split=' ', char_level=True, oov_token=None)
        tokenizer.fit_on_texts(data_docs)
        sequences = tokenizer.texts_to_sequences(data_docs)
        maxlen = np.max([len(x) for x in sequences])
        padded_docs = tf.keras.preprocessing.sequence.pad_sequences(
            sequences, maxlen=maxlen, padding='post')
        return (padded_docs, tokenizer, maxlen)

    if mode == "index_word":
        # Word-level integer sequences, padded to the longest document.
        tokenizer = tf.keras.preprocessing.text.Tokenizer(
            num_words=None,
            filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~',
            lower=True, split=" ", char_level=False, oov_token=None)
        tokenizer.fit_on_texts(data_docs)
        sequences = tokenizer.texts_to_sequences(data_docs)
        maxlen = np.max([len(x) for x in sequences])
        padded_docs = tf.keras.preprocessing.sequence.pad_sequences(
            sequences, maxlen=maxlen, padding='post')
        return (padded_docs, tokenizer, maxlen)
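# Usage sketch (illustrative only): "matrix_*" modes return a sparse feature
# matrix and the fitted vectorizer/pipeline, while "index_*" modes return
# padded integer sequences, the fitted Keras tokenizer, and the pad length.
# `docs` and the parameter values below are hypothetical.
#
#   docs = ["first example document ...", "second example document ..."]
#
#   X, vec, _ = data_docs_to_matrix(docs, mode="matrix_word",
#                                   max_features=5000, ngram_range=(1, 2))
#
#   padded, tok, maxlen = data_docs_to_matrix(docs, mode="index_word")
#   # padded.shape == (len(docs), maxlen)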