def classifier_for_twitter_corpus():
    svc = LogisticRegression(class_weight='auto', penalty='l2')
    f_measure = []
    entities = []
    predicted = []
    right = []
    train_data = Bunch()
    test_data = Bunch()
    try:
        for i in range(1, 6):
            f_train = open("input/twitter_corpus/" + str(i) + "/train.txt")
            f_test = open("input/twitter_corpus/" + str(i) + "/test.txt")
            train_data.reviews = load_data(f_train)
            test_data.reviews = load_data(f_test)
            train_data.labels = extract_labels(train_data.reviews)
            test_data.labels = extract_labels(test_data.reviews)
            train_data.entities = extract_entities(train_data.reviews)
            features_train = extract_features_for_twitter_corpus(
                train_data.reviews, True)
            svc.fit(numpy.array(features_train),
                    numpy.array(train_data.labels))
            features_test = extract_features_for_twitter_corpus(
                test_data.reviews, False)
            predicted_block = svc.predict(numpy.array(features_test))
            predicted.extend(predicted_block)
            right.extend(test_data.labels)

            print metrics.f1_score(test_data.labels,
                                   predicted_block,
                                   average='macro')
            f_measure.append(
                metrics.f1_score(test_data.labels,
                                 predicted_block,
                                 average='macro'))

            entities.extend(test_data.reviews)

        print str(f_measure)
        print classification_report(right, predicted, digits=3)
        print metrics.precision_score(right, predicted, average='macro')
        print metrics.recall_score(right, predicted, average='macro')
        print metrics.f1_score(right, predicted, average='macro')
    except FileNotFoundError:
        print "Please download Twitter corpus and put it into input/twitter_corpus folder"
def classifier_for_cadec_corpus():
    svc = LinearSVC(class_weight='auto', penalty='l2')
    f_measure = []
    entities = []
    predicted = []
    right = []
    train_data = Bunch()
    test_data = Bunch()
    for i in range(1, 6):
        print i
        f_train = open("input/cadec_corpus/" + str(i) + "/train.txt")
        f_test = open("input/cadec_corpus/" + str(i) + "/test.txt")
        train_data.reviews = load_data(f_train)
        test_data.reviews = load_data(f_test)
        train_data.labels = extract_labels(train_data.reviews)
        test_data.labels = extract_labels(test_data.reviews)
        train_data.entities = extract_entities(train_data.reviews)
        features_train = extract_features_for_cadec_corpus(
            train_data.reviews, True)
        svc.fit(numpy.array(features_train), numpy.array(train_data.labels))
        features_test = extract_features_for_cadec_corpus(
            test_data.reviews, False)
        predicted_block = svc.predict(numpy.array(features_test))
        predicted.extend(predicted_block)
        right.extend(test_data.labels)

        print metrics.f1_score(test_data.labels,
                               predicted_block,
                               average='macro')
        f_measure.append(
            metrics.f1_score(test_data.labels,
                             predicted_block,
                             average='macro'))

        entities.extend(test_data.reviews)

    print str(f_measure)
    print classification_report(right, predicted, digits=3)
    print metrics.precision_score(right, predicted, average='macro')
    print metrics.recall_score(right, predicted, average='macro')
    print metrics.f1_score(right, predicted, average='macro')
def gen_tfidf(inputfile, output):
    """Build TF-IDF features for a corpus file and persist them.

    Reads the corpus bunch from `inputfile`, vectorizes its contents
    with a TfidfVectorizer (stop words removed, sublinear TF,
    max_df=0.8), and writes the resulting bunch to `output`.
    """
    source = _read_file(inputfile)
    vectorizer = TfidfVectorizer(stop_words=get_stop_words(),
                                 sublinear_tf=True,
                                 max_df=0.8)
    result = Bunch(category_labels={}, labels=[], tfidf=[], vocabulary={})
    result.category_labels = source.category_label  # category-to-label mapping
    result.labels = source.labels                   # per-document labels
    result.tfidf = vectorizer.fit_transform(source.contents)  # TF-IDF matrix of the documents
    result.vocabulary = vectorizer.vocabulary_      # term-to-index dictionary
    # Persist the vectorized bunch.
    _wirte_file(output, result)