def classify_with_expert_knowledge(paths):
    """Classify the test data using the stemmed TARGET_FEATURES and report the outcome.

    Loads sentences, labels and class names via ``load_test_data(paths)``,
    extracts features and a vocabulary from the sentences, stems every entry
    of the module-level ``TARGET_FEATURES`` and hands the result to
    ``classify`` together with the dense feature array and the vocabulary.
    The predictions are then passed to ``evaluate_classification``.

    Nothing is returned; the result of ``evaluate_classification`` is discarded.
    """
    test_sentences, true_labels, label_names = load_test_data(paths)
    feature_matrix, vocab, _ = extract_features_and_vocabulary(test_sentences)

    stemmed_targets = list(map(stem_words, TARGET_FEATURES))
    # classify receives the feature matrix in dense form.
    dense_features = feature_matrix.toarray()
    predictions = classify(stemmed_targets, dense_features, vocab)

    print('EXPERT KNOWLEDGE:')
    evaluate_classification(predictions, true_labels, test_sentences, label_names)
def test_extract_features(input_sentences, expected_vocabulary):
    """Check the vocabulary and feature matrix produced for ``input_sentences``.

    The vocabulary returned by ``extract_features_and_vocabulary`` must equal
    ``expected_vocabulary``, and the returned features must match the
    hard-coded 2 x 10 matrix element by element.
    """
    expected_matrix = np.array([
        [1, 1, 2, 0, 1, 1, 1, 1, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 1, 1],
    ])

    extracted_features, extracted_vocabulary, _ = extract_features_and_vocabulary(
        input_sentences
    )

    assert extracted_vocabulary == expected_vocabulary
    assert (extracted_features == expected_matrix).all()
def classify_with_tf_idf(paths):
    """Run a 3-fold stratified cross-validation on TF-IDF features and report it.

    Loads sentences, labels and class names via ``load_test_data(paths)``.
    For every fold, features are extracted from the training sentences,
    converted with ``transform_to_tfidf`` and passed to ``predict_with_svc``
    together with the training labels, the test sentences and the vectorizer
    returned by the extraction step. Each fold is evaluated with
    ``evaluate_classification``; the per-fold average precision and recall are
    collected and finally handed to ``evaluate_complete_classification``.

    Nothing is returned.
    """
    sentences, labels, class_names = load_test_data(paths)
    # Converted to arrays so the fold index arrays can select the subsets.
    sentences = np.array(sentences)
    labels = np.array(labels)

    average_precisions = []
    average_recalls = []
    folds = sklearn.cross_validation.StratifiedKFold(labels, n_folds=3)
    for train_index, test_index in folds:
        sentences_train, sentences_test = sentences[train_index], sentences[test_index]
        labels_train, labels_test = labels[train_index], labels[test_index]

        # The vocabulary is not needed here; only the features and the
        # vectorizer are used.
        features_train, _, count_vectorizer = extract_features_and_vocabulary(
            sentences_train
        )
        tfidf_features_train = transform_to_tfidf(features_train)
        predicted = predict_with_svc(
            tfidf_features_train, labels_train, sentences_test, count_vectorizer
        )

        print('TF-IDF')
        # Pass the test-fold sentences (not the full data set) so that they
        # line up with `predicted` and `labels_test`, which only cover this fold.
        average_precision, average_recall = evaluate_classification(
            predicted, labels_test, sentences_test, class_names
        )
        average_precisions.append(average_precision)
        average_recalls.append(average_recall)

    evaluate_complete_classification(average_precisions, average_recalls)
def vocabulary(input_sentences):
    """Return the vocabulary that ``extract_features_and_vocabulary`` yields for ``input_sentences``."""
    _, extracted_vocabulary, _ = extract_features_and_vocabulary(input_sentences)
    return extracted_vocabulary
def features(input_sentences):
    """Return the features extracted from ``input_sentences`` as a dense array.

    The features are the first element returned by
    ``extract_features_and_vocabulary``, converted with ``.toarray()``.
    """
    feature_matrix, _, _ = extract_features_and_vocabulary(input_sentences)
    return feature_matrix.toarray()