def classify_with_tf_idf(paths):
    """Cross-validate a TF-IDF + linear SVC sentence classifier.

    Loads sentences, labels and class names with ``load_test_data(paths)``,
    splits them into 3 stratified folds and, for every fold:

    1. builds count features and a fitted vectorizer from the training
       sentences (``extract_features_and_vocabulary``),
    2. converts the training counts with ``transform_to_tfidf``,
    3. predicts the held-out sentences with ``predict_with_svc``,
    4. scores the fold with ``evaluate_classification``.

    The per-fold average precision and recall values are collected and
    finally handed to ``evaluate_complete_classification``.

    Args:
        paths: Passed unchanged to ``load_test_data``.

    Returns:
        None. The function prints a ``'TF-IDF'`` banner per fold and
        delegates all further reporting to the evaluation helpers.
    """
    sentences, labels, class_names = load_test_data(paths)
    # Arrays are needed so the fold index arrays can be used for fancy indexing.
    sentences = np.array(sentences)
    labels = np.array(labels)

    average_precisions = []
    average_recalls = []

    # NOTE(review): ``sklearn.cross_validation`` is the legacy module
    # (deprecated in scikit-learn 0.18, removed in 0.20). On newer versions
    # this needs ``sklearn.model_selection.StratifiedKFold(n_splits=3)
    # .split(sentences, labels)`` instead; left as-is here because the
    # file's imports / pinned version are not visible from this block.
    folds = sklearn.cross_validation.StratifiedKFold(labels, n_folds=3)
    for train_index, test_index in folds:
        sentences_train, sentences_test = sentences[train_index], sentences[test_index]
        labels_train, labels_test = labels[train_index], labels[test_index]

        # The vocabulary returned by the helper is not used in this function.
        features_train, _vocabulary, count_vectorizer = extract_features_and_vocabulary(sentences_train)
        tfidf_features_train = transform_to_tfidf(features_train)

        predicted = predict_with_svc(tfidf_features_train, labels_train, sentences_test, count_vectorizer)

        print('TF-IDF')
        # Pass the held-out sentences (not the full corpus) so that the
        # sentence argument has the same length and ordering as
        # ``predicted`` and ``labels_test``.
        average_precision, average_recall = evaluate_classification(
            predicted, labels_test, sentences_test, class_names)
        average_precisions.append(average_precision)
        average_recalls.append(average_recall)

    evaluate_complete_classification(average_precisions, average_recalls)
def predict_with_svc(tfidf_features_train, labels_train, sentences_test, count_vectorizer):
    """Fit a ``LinearSVC`` on training features and predict test sentences.

    Args:
        tfidf_features_train: Training feature matrix passed to
            ``LinearSVC.fit``.
        labels_train: Training labels passed to ``LinearSVC.fit``.
        sentences_test: Sentences to classify; turned into features with
            ``extract_features_for_testing``.
        count_vectorizer: Vectorizer forwarded to
            ``extract_features_for_testing`` together with the sentences.

    Returns:
        The result of ``LinearSVC.predict`` on the transformed test features.
    """
    classifier = LinearSVC()
    classifier.fit(tfidf_features_train, labels_train)

    test_counts = extract_features_for_testing(sentences_test, count_vectorizer)
    # NOTE(review): the test features go through a separate
    # ``transform_to_tfidf`` call; if that helper fits a fresh transformer
    # each time, test weights are not derived from the training data -
    # confirm against the helper's implementation.
    test_tfidf = transform_to_tfidf(test_counts)

    return classifier.predict(test_tfidf)