def test_combined_features():
    """Verify token counts, lexicon counts, and their FeatureUnion.

    Fits a `std_prep` + count-vectorizer pipeline and a `lex_prep` +
    count-vectorizer pipeline on the training texts, checks each count
    matrix, then checks that the FeatureUnion of both yields the two
    matrices side by side. Finally inspects `tokens_from_lexicon` on the
    fitted lexicon preprocessing step.
    """
    # Only the training split is needed here.
    train_X, train_y, _, _ = train_test_data()

    # Token-level counts.
    token_features = Pipeline([
        ('prep', preprocessing.std_prep()),
        ('frm', representation.count_vectorizer({'min_df': 1})),
    ])
    token_counts = token_features.fit_transform(train_X)
    token_expected = np.array(
        [[1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0],
         [0, 0, 0, 0, 1, 0, 1, 1, 0, 2, 0, 1]],
        np.int64,
    )
    assert (token_counts.toarray() == token_expected).all()

    # Counts over the lexicon-preprocessed texts.
    polarity_features = Pipeline([
        ('prep', preprocessing.lex_prep()),
        ('frm', representation.count_vectorizer({'min_df': 1})),
    ])
    polarity_counts = polarity_features.fit_transform(train_X)
    polarity_expected = np.array([[1, 7], [0, 7]], np.int64)
    assert (polarity_counts.toarray() == polarity_expected).all()

    # Union of both feature sets: token columns followed by lexicon columns.
    combined_features = FeatureUnion([
        ('token_features', token_features),
        ('polarity_features', polarity_features),
    ])
    combined_counts = combined_features.fit_transform(train_X, train_y)
    actual = combined_counts.toarray()
    combined_expected = np.array(
        [[1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 7],
         [0, 0, 0, 0, 1, 0, 1, 1, 0, 2, 0, 1, 0, 7]],
        np.int64,
    )
    assert (actual == combined_expected).all()

    # Second transformer of the union -> its first step (the 'prep' stage).
    polarity_pipeline = combined_features.transformer_list[1][1]
    lexicon_prep = polarity_pipeline.steps[0][1]
    assert lexicon_prep.tokens_from_lexicon == 1
def cnn(name):
    """Return the CNN model registered under *name*.

    Args:
        name: 'cnn_raw' for a CNN built with no arguments, or 'cnn_prep'
            for a CNN built with the `std_prep` preprocessor.

    Raises:
        ValueError: if *name* is neither of the two known names.
    """
    if name == 'cnn_raw':
        return CNN()
    if name == 'cnn_prep':
        return CNN(preprocessing.std_prep())
    raise ValueError("pipeline name is unknown.")
def test_preprocessing():
    """Check the first training text after `std_prep` and after `lex_prep`."""
    # Only the training texts are used by this test.
    train_X, _, _, _ = train_test_data()

    std_output = preprocessing.std_prep().fit_transform(train_X)
    assert std_output[0] == "they twats all deserve an ass kicking ."

    lex_output = preprocessing.lex_prep().fit_transform(train_X)
    expected_lex = (
        "NEUTRAL HATE NEUTRAL NEUTRAL NEUTRAL NEUTRAL NEUTRAL NEUTRAL"
    )
    assert lex_output[0] == expected_lex
def svm_sigmoid_embed():
    """Assemble `std_prep` -> 'glove' text embeddings -> sigmoid-kernel SVC."""
    preprocessor = preprocessing.std_prep()
    embeddings = representation.text2embeddings('glove')
    classifier = svm.SVC(kernel='sigmoid', gamma='scale')
    return pipeline(preprocessor, embeddings, classifier)
def svm_libsvc_embed():
    """Assemble `std_prep` -> 'wiki-news' text embeddings -> LinearSVC."""
    preprocessor = preprocessing.std_prep()
    embeddings = representation.text2embeddings('wiki-news')
    classifier = svm.LinearSVC(max_iter=10000, dual=False, C=0.1)
    return pipeline(preprocessor, embeddings, classifier)
def svm_libsvc_tfidf():
    """Assemble `std_prep` -> tf-idf vectorizer -> LinearSVC."""
    preprocessor = preprocessing.std_prep()
    vectorizer = representation.tfidf_vectorizer()
    classifier = svm.LinearSVC(max_iter=10000, dual=False, C=0.1)
    return pipeline(preprocessor, vectorizer, classifier)
def svm_libsvc_counts_bigram():
    """Assemble `std_prep` -> count vectorizer (ngram_range (2, 2)) -> LinearSVC."""
    preprocessor = preprocessing.std_prep()
    vectorizer_params = {'min_df': 1, 'ngram_range': (2, 2)}
    vectorizer = representation.count_vectorizer(vectorizer_params)
    classifier = svm.LinearSVC(max_iter=10000, dual=False, C=0.1)
    return pipeline(preprocessor, vectorizer, classifier)
def naive_bayes_counts_trigram():
    """Assemble `std_prep` -> count vectorizer (ngram_range (3, 3)) -> MultinomialNB."""
    preprocessor = preprocessing.std_prep()
    vectorizer_params = {'min_df': 1, 'ngram_range': (3, 3)}
    vectorizer = representation.count_vectorizer(vectorizer_params)
    classifier = MultinomialNB()
    return pipeline(preprocessor, vectorizer, classifier)
def naive_bayes_tfidf():
    """Assemble `std_prep` -> tf-idf vectorizer -> MultinomialNB."""
    preprocessor = preprocessing.std_prep()
    vectorizer = representation.tfidf_vectorizer()
    classifier = MultinomialNB()
    return pipeline(preprocessor, vectorizer, classifier)
def naive_bayes_counts():
    """Assemble `std_prep` -> count vectorizer (min_df=1) -> MultinomialNB."""
    preprocessor = preprocessing.std_prep()
    vectorizer = representation.count_vectorizer({'min_df': 1})
    classifier = MultinomialNB()
    return pipeline(preprocessor, vectorizer, classifier)
def random_forest_tfidf():
    """Assemble `std_prep` -> tf-idf vectorizer -> RandomForestClassifier.

    The forest is built with 1000 estimators and `random_state=42`.
    """
    preprocessor = preprocessing.std_prep()
    vectorizer = representation.tfidf_vectorizer()
    classifier = RandomForestClassifier(random_state=42, n_estimators=1000)
    return pipeline(preprocessor, vectorizer, classifier)
def random_forest_embed():
    """Assemble `std_prep` -> 'glove' text embeddings -> RandomForestClassifier.

    The forest is built with 1000 estimators and `random_state=42`.
    """
    preprocessor = preprocessing.std_prep()
    embeddings = representation.text2embeddings('glove')
    classifier = RandomForestClassifier(random_state=42, n_estimators=1000)
    return pipeline(preprocessor, embeddings, classifier)
def naive_bayes_counts_lex():
    """Assemble a CombinedFeaturesPipeline feeding MultinomialNB.

    Two (preprocessor, count vectorizer) pairs are passed in order: the
    first uses `std_prep`, the second uses `lex_prep`; both vectorizers
    are configured with `min_df=1`.
    """
    token_prep = preprocessing.std_prep()
    token_vectorizer = representation.count_vectorizer({'min_df': 1})
    lexicon_prep = preprocessing.lex_prep()
    lexicon_vectorizer = representation.count_vectorizer({'min_df': 1})
    classifier = MultinomialNB()
    return CombinedFeaturesPipeline(
        token_prep,
        token_vectorizer,
        lexicon_prep,
        lexicon_vectorizer,
        classifier,
    )