# Imports assumed for the snippets below; project-local helpers such as
# log_state, logger and dump_picle come from the repo's own modules.
from sklearn import linear_model, svm
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier


def kNN(train_data, train_labels, test):
    log_state('Use kNN classifier')
    clf = KNeighborsClassifier(n_neighbors=5)
    clf.fit(train_data, train_labels)
    predict_labels = clf.predict(test)
    dump_picle(predict_labels, './data/predict_labels/predict_labels.p')
    logger.info('kNN classifier training complete, saved predict labels to pickle')

def logit(train_data, train_labels, test):
    log_state('Use logistic regression classifier')
    clf = linear_model.LogisticRegression(C=1e5)
    clf.fit(train_data, train_labels)
    predict_labels = clf.predict(test)
    dump_picle(predict_labels, './data/predict_labels/predict_labels.p')
    # MaxEnt is the NLP name for (multinomial) logistic regression
    logger.info('MaxEnt classifier training complete, saved predict labels to pickle')

def svm_classify(train_data, train_labels, test):
    log_state('Use SVM classifier')
    clf = svm.SVC(C=5.0, kernel='linear')
    clf.fit(train_data, train_labels)
    predict_labels = clf.predict(test)
    dump_picle(predict_labels, './data/predict_labels/predict_labels.p')
    logger.info('SVM classifier training complete, saved predict labels to pickle')
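
# A minimal, self-contained sketch (added, not from the original repo) of
# the fit/predict pattern the three helpers above share; toy numeric
# features stand in for the real TF-IDF matrices they normally receive.
import numpy as np

demo_X_train = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 1.0], [0.0, 0.0]])
demo_y_train = np.array([1, 0, 1, 0])
demo_X_test = np.array([[0.9, 0.1], [0.2, 0.8]])

demo_knn = KNeighborsClassifier(n_neighbors=3)
demo_knn.fit(demo_X_train, demo_y_train)
print(demo_knn.predict(demo_X_test))  # predicted labels for the two test rows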

def gNB(train_data, train_labels, test, save_result=False):
    log_state('Use Gaussian Naive Bayes classifier')
    clf = GaussianNB()
    clf.fit(train_data, train_labels)
    predict_labels = clf.predict(test)
    predict_proba = clf.predict_proba(test)
    if save_result:
        dump_picle(predict_labels, './data/predict_labels/predict_labels.p')
        dump_picle(predict_proba, './data/predict_labels/predict_proba.p')
        logger.info('Classifier training complete, saved predict labels to pickle')
    return predict_labels
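
# A hedged sketch (added, not from the original repo) of using gNB's
# probability output: predict_proba returns one row per test sample,
# with columns ordered like clf.classes_.
import numpy as np
from sklearn.naive_bayes import GaussianNB

demo_gnb = GaussianNB().fit(np.array([[1.0], [1.2], [3.0], [3.3]]),
                            np.array([0, 0, 1, 1]))
print(demo_gnb.classes_)                          # [0 1]
print(demo_gnb.predict_proba(np.array([[2.0]])))  # [[p(class 0), p(class 1)]]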

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer


def TFIDF_estimator():
    log_state('Start generating features')

    class StemmedTfidfVectorizer(TfidfVectorizer):
        def build_analyzer(self):
            # Stem each token with NLTK's Snowball stemmer
            english_stemmer = nltk.stem.SnowballStemmer('english')
            analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
            return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))

    vectorizer_param = {
        'preprocessor': preprocessor,
        'ngram_range': parameters['ngram_range'],
        'analyzer': 'word',
        'min_df': parameters['min_df'],
        'max_df': parameters['max_df'],
        'binary': parameters['TF_binary'],
        'norm': parameters['norm'],
        'sublinear_tf': parameters['sublinear_tf'],
        'max_features': parameters['max_features'],
    }
    log_state(sorted(vectorizer_param.items()))
    log_state('Test data size: ' + str(parameters['test_data_size']))
    return StemmedTfidfVectorizer(**vectorizer_param)
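
# Standalone illustration (added; toy documents assumed) of the stemming
# trick above: inflected forms share one vocabulary entry after stemming.
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer


class _StemmedTfidf(TfidfVectorizer):
    def build_analyzer(self):
        stemmer = nltk.stem.SnowballStemmer('english')
        analyzer = super(_StemmedTfidf, self).build_analyzer()
        return lambda doc: (stemmer.stem(w) for w in analyzer(doc))


print(sorted(_StemmedTfidf().fit(['the runner runs', 'he was running']).vocabulary_))
# 'runs' and 'running' both collapse into the single stem 'run'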

from sklearn.pipeline import FeatureUnion

from vectorizers import punctuation_estimator
from logger_manager import log_state
# Assumes the anew_vectorizer module also exposes a same-named vectorizer
# class, since one is instantiated below; preprocessor, parameters,
# StemmedTfidfVectorizer, load_train_data, load_test_data and dump_picle
# are likewise assumed to come from the repo's other modules (see above).
from anew_vectorizer import anew_vectorizer, strength_vectorizer, avg_affective_vectorizer

vectorizer_param = {'preprocessor': preprocessor, 'ngram_range': parameters['ngram_range'], 'analyzer': 'word',
                    'min_df': parameters['min_df'], 'max_df': parameters['max_df'],
                    'binary': parameters['TF_binary'], 'norm': parameters['norm'],
                    'sublinear_tf': parameters['sublinear_tf'], 'max_features': parameters['max_features']}

if __name__ == "__main__":
    unigram = StemmedTfidfVectorizer(**vectorizer_param)
    anew = anew_vectorizer()
    pct = punctuation_estimator()
    strength = strength_vectorizer()
    avg_strength = avg_affective_vectorizer()
    log_state('combine unigram and avg strength features')
    combined_features = FeatureUnion([('unigram', unigram), ('avg_strength', avg_strength)])
    # log_state('combine unigram and strength features')
    # combined_features = FeatureUnion([('unigram', unigram), ('strength', strength)])
    # log_state('combine unigram and anew features')
    # combined_features = FeatureUnion([('unigram', unigram), ('anew', anew)])
    # log_state('combine unigram and punctuation features')
    # combined_features = FeatureUnion([('unigram', unigram), ('pct', pct)])
    texts, _ = load_train_data('Sentiment140')

    transformed_train = combined_features.fit_transform(texts)

    testdata, _ = load_test_data()
    transformed_test = combined_features.transform(testdata)

    dump_picle(combined_features.get_feature_names(), './data/features/feature_names.p')
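
    # From here the transformed matrices would typically feed the classifiers
    # defined above. A hedged sketch (train_labels is assumed to be kept by the
    # loader instead of discarded; GaussianNB needs dense input, hence .toarray()):
    #
    #     predicted = gNB(transformed_train.toarray(), train_labels,
    #                     transformed_test.toarray(), save_result=True)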