def test_combined_features():
    """Check that token and polarity features concatenate correctly in a FeatureUnion."""
    train_X, train_y, test_X, test_y = train_test_data()

    # Token-count branch on its own.
    token_features = Pipeline([
        ('prep', preprocessing.std_prep()),
        ('frm', representation.count_vectorizer({'min_df': 1})),
    ])
    token_matrix = token_features.fit_transform(train_X)
    token_expected = np.array(
        [[1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0],
         [0, 0, 0, 0, 1, 0, 1, 1, 0, 2, 0, 1]], np.int64)
    assert np.array_equal(token_matrix.toarray(), token_expected)

    # Lexicon-polarity branch on its own.
    polarity_features = Pipeline([
        ('prep', preprocessing.lex_prep()),
        ('frm', representation.count_vectorizer({'min_df': 1})),
    ])
    polarity_matrix = polarity_features.fit_transform(train_X)
    polarity_expected = np.array([[1, 7], [0, 7]], np.int64)
    assert np.array_equal(polarity_matrix.toarray(), polarity_expected)

    # Union of both branches: output should be the horizontal concatenation
    # of the two blocks above.
    combined_features = FeatureUnion([
        ('token_features', token_features),
        ('polarity_features', polarity_features),
    ])
    combined_matrix = combined_features.fit_transform(train_X, train_y).toarray()
    combined_expected = np.array(
        [[1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 7],
         [0, 0, 0, 0, 1, 0, 1, 1, 0, 2, 0, 1, 0, 7]], np.int64)
    assert np.array_equal(combined_matrix, combined_expected)

    # Reach into the fitted union to inspect the lexicon-based preprocessor:
    # transformer_list[1] is the polarity branch, steps[0] its 'prep' stage.
    lex_preprocessor = combined_features.transformer_list[1][1].steps[0][1]
    assert lex_preprocessor.tokens_from_lexicon == 1
def test_count_vectorizer():
    """Verify the vocabulary and count matrix produced by the count vectorizer."""
    train_X, train_y, test_X, test_y = train_test_data()
    cv = representation.count_vectorizer()
    X = cv.fit_transform(train_X)
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; get_feature_names_out() is the replacement.  It returns an ndarray,
    # hence the list() wrapper for the comparison.
    assert list(cv.get_feature_names_out()) == [
        'all', 'an', 'ass', 'deserve', 'hope', 'kicking', 'later', 'talk',
        'they', 'to', 'twats', 'you'
    ]
    result = X.toarray()
    expected = np.array([[1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0],
                         [0, 0, 0, 0, 1, 0, 1, 1, 0, 2, 0, 1]], np.int64)
    assert (result == expected).all()
def test_representation():
    """End-to-end check: load Offenseval, preprocess, and vectorize the train split."""
    task = of.Offenseval()
    task.load(offenseval_data_dir)
    train_X, train_y, test_X, test_y = utils.get_instances(
        task, split_train_dev=True, proportion_train=0.1, proportion_dev=0.01)

    preprocessor = preprocessing.Preprocessor(
        tokenize=True, normalize_tweet=False, lowercase=False, lemmatize=False)
    tokenized = preprocessor.transform(train_X)

    vectorizer = representation.count_vectorizer()
    vectors = vectorizer.fit_transform(tokenized, train_y)
    # After vectorization the rows must be numeric feature vectors, not raw strings.
    assert not isinstance(vectors[0], str)
def svm_libsvc_counts_bigram():
    """Build a linear-SVM pipeline over bigram token counts."""
    prep = preprocessing.std_prep()
    # Bigrams only: ngram_range of (2, 2).
    frm = representation.count_vectorizer({'min_df': 1, 'ngram_range': (2, 2)})
    clf = svm.LinearSVC(max_iter=10000, dual=False, C=0.1)
    return pipeline(prep, frm, clf)
def svm_libsvc_counts():
    """Build a linear-SVM pipeline over default (unigram) token counts."""
    prep = preprocessing.std_prep()
    frm = representation.count_vectorizer()
    clf = svm.LinearSVC(max_iter=10000, dual=False, C=0.1)
    return pipeline(prep, frm, clf)
def naive_bayes_counts_lex():
    """Build a multinomial naive-Bayes pipeline over lexicon-preprocessed counts."""
    prep = preprocessing.lex_prep()
    frm = representation.count_vectorizer({'min_df': 1})
    clf = MultinomialNB()
    return pipeline(prep, frm, clf)
def naive_bayes_counts_trigram():
    """Build a multinomial naive-Bayes pipeline over trigram token counts."""
    prep = preprocessing.std_prep()
    # Trigrams only: ngram_range of (3, 3).
    frm = representation.count_vectorizer({'min_df': 1, 'ngram_range': (3, 3)})
    clf = MultinomialNB()
    return pipeline(prep, frm, clf)