def classifier_for_twitter_corpus(): svc = LogisticRegression(class_weight='auto', penalty='l2') f_measure = [] entities = [] predicted = [] right = [] train_data = Bunch() test_data = Bunch() try: for i in range(1, 6): f_train = open("input/twitter_corpus/" + str(i) + "/train.txt") f_test = open("input/twitter_corpus/" + str(i) + "/test.txt") train_data.reviews = load_data(f_train) test_data.reviews = load_data(f_test) train_data.labels = extract_labels(train_data.reviews) test_data.labels = extract_labels(test_data.reviews) train_data.entities = extract_entities(train_data.reviews) features_train = extract_features_for_twitter_corpus( train_data.reviews, True) svc.fit(numpy.array(features_train), numpy.array(train_data.labels)) features_test = extract_features_for_twitter_corpus( test_data.reviews, False) predicted_block = svc.predict(numpy.array(features_test)) predicted.extend(predicted_block) right.extend(test_data.labels) print metrics.f1_score(test_data.labels, predicted_block, average='macro') f_measure.append( metrics.f1_score(test_data.labels, predicted_block, average='macro')) entities.extend(test_data.reviews) print str(f_measure) print classification_report(right, predicted, digits=3) print metrics.precision_score(right, predicted, average='macro') print metrics.recall_score(right, predicted, average='macro') print metrics.f1_score(right, predicted, average='macro') except FileNotFoundError: print "Please download Twitter corpus and put it into input/twitter_corpus folder"
def classifier_for_cadec_corpus(): svc = LinearSVC(class_weight='auto', penalty='l2') f_measure = [] entities = [] predicted = [] right = [] train_data = Bunch() test_data = Bunch() for i in range(1, 6): print i f_train = open("input/cadec_corpus/" + str(i) + "/train.txt") f_test = open("input/cadec_corpus/" + str(i) + "/test.txt") train_data.reviews = load_data(f_train) test_data.reviews = load_data(f_test) train_data.labels = extract_labels(train_data.reviews) test_data.labels = extract_labels(test_data.reviews) train_data.entities = extract_entities(train_data.reviews) features_train = extract_features_for_cadec_corpus( train_data.reviews, True) svc.fit(numpy.array(features_train), numpy.array(train_data.labels)) features_test = extract_features_for_cadec_corpus( test_data.reviews, False) predicted_block = svc.predict(numpy.array(features_test)) predicted.extend(predicted_block) right.extend(test_data.labels) print metrics.f1_score(test_data.labels, predicted_block, average='macro') f_measure.append( metrics.f1_score(test_data.labels, predicted_block, average='macro')) entities.extend(test_data.reviews) print str(f_measure) print classification_report(right, predicted, digits=3) print metrics.precision_score(right, predicted, average='macro') print metrics.recall_score(right, predicted, average='macro') print metrics.f1_score(right, predicted, average='macro')
def gen_tfidf(inputfile, output):
    """Vectorize the corpus in *inputfile* with TF-IDF and persist the result.

    Loads a Bunch from inputfile, builds a new Bunch carrying the
    category/label mapping, the per-document labels, the TF-IDF matrix of
    the document contents, and the fitted vocabulary, then writes it to
    *output*.
    """
    source = _read_file(inputfile)
    result = Bunch(category_labels={}, labels=[], tfidf=[], vocabulary={})
    result.category_labels = source.category_label  # category <-> label mapping
    result.labels = source.labels  # one label per document
    vectorizer = TfidfVectorizer(stop_words=get_stop_words(),
                                 sublinear_tf=True, max_df=0.8)
    result.tfidf = vectorizer.fit_transform(source.contents)  # document TF-IDF matrix
    result.vocabulary = vectorizer.vocabulary_  # term -> column-index dictionary
    # Persist the bunch. NOTE(review): the helper name '_wirte_file' looks
    # misspelled but is defined elsewhere in this module — kept as-is.
    _wirte_file(output, result)