def __init__(self, clf):
    """Keep ``clf`` on the instance and fit it on every labeled tweet.

    The training data comes from ``extract.extract_labeled_tweets()``; each
    record is unpacked as ``(id, label, text)``. Labels are converted with
    ``convert_labels_to_binary`` so that 'X' (non-negative) becomes 1 and
    every other label (negative) becomes 0 before fitting.

    Args:
        clf: object exposing ``fit(X, y)``; it receives an array of the raw
            tweet texts and the binary label array.
            # NOTE(review): presumably a scikit-learn style pipeline that
            # vectorizes text itself -- confirm against callers.
    """
    self.clf = clf

    rows = list(extract.extract_labeled_tweets())
    # Transpose the (id, label, text) records into three parallel tuples;
    # the ids are not needed for training.
    _, raw_labels, raw_texts = zip(*rows)

    # 'X' -> 1, anything else -> 0
    targets = convert_labels_to_binary(np.asarray(raw_labels), ['X'])
    self.clf.fit(np.asarray(raw_texts), targets)
def evaluate_revised(scorer):
    """Evaluate a scorer using revised labels.

    Every labeled tweet yielded by ``extract.extract_labeled_tweets()`` is
    scored with ``scorer.get_document_score(text, normalize=False)``. A score
    below zero is predicted as '-', anything else as 'X'. The predictions are
    compared with the stored labels and one summary line is printed.

    Args:
        scorer: object exposing ``get_document_score(text, normalize=False)``
            whose result can be compared against 0.

    Returns:
        None. The metrics are printed, not returned.
    """
    def get_score(text):
        # Map the raw (un-normalized) score onto the two label strings.
        score = scorer.get_document_score(text, normalize=False)
        return '-' if score < 0 else 'X'

    res = [(label, get_score(text))
           for _id, label, text in extract.extract_labeled_tweets()]
    # Transpose (true, predicted) pairs into two parallel tuples.
    y_true, y_pred = zip(*res)

    p, r, f, _support = precision_recall_fscore_support(
        y_true, y_pred, pos_label='-')

    correct = sum(1 for label, pred in zip(y_true, y_pred) if label == pred)
    accuracy = float(correct) / len(y_true)

    # Index 0 selects the first class in the per-class result arrays.
    # NOTE(review): presumably the '-' class, since '-' sorts before 'X' --
    # confirm against the installed scikit-learn version.
    # The single-argument parenthesised form prints identically on Python 2
    # and is valid syntax on Python 3.
    print('precision=%.3f recall=%.3f fscore=%.3f accuracy=%.3f' % (
        p[0], r[0], f[0], accuracy))
def convert_labels_to_binary(Y, one_label_list):
    """Convert string labels to zero or ones.

    Args:
        Y: one-dimensional array (or sequence) of labels.
        one_label_list: labels that should map to 1; every label not listed
            maps to 0. An empty list yields an all-zero result.

    Returns:
        Integer NumPy array of length ``len(Y)`` holding 1 where the label is
        in ``one_label_list`` and 0 elsewhere.
    """
    Y = np.asarray(Y)
    # Start from "nothing is positive" so that any number of labels,
    # including none, is handled by the same loop.
    pos = np.zeros(Y.shape[0], dtype=bool)
    for label in one_label_list:
        pos |= (Y == label)
    # Builtin ``int`` is what the removed ``np.int`` alias referred to.
    return pos.astype(int)