Code Example #1
def testClient(labeledset, fracLearn=0.8, LearnRate=0.027, printing=True, SVM=False):
    import random
    from nltk import SklearnClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import LinearSVC
    
    random.shuffle(labeledset)
    length = len(labeledset)
    trainset = labeledset[:int(length*fracLearn)]
    testset = labeledset[int(length*fracLearn):]
    
    if SVM:
        # Despite its name, LearnRate is passed as sklearn's C, the inverse
        # regularization strength (earlier tuned values: LogisticRegression
        # C=0.0012, LinearSVC C=0.0007).
        clf = SklearnClassifier(LinearSVC(C=LearnRate))
    else:
        clf = SklearnClassifier(LogisticRegression(C=LearnRate))
    clf.train(trainset)
    
    correct = 0
    for film in testset:
        if clf.classify(film[0]) == film[1]:
            correct += 1
    testAcc = correct / float(len(testset))
    if printing: print('Accuracy on test set: ' + str(testAcc))
    correct = 0
    for film in trainset:
        if clf.classify(film[0]) == film[1]:
            correct += 1
    trainAcc = correct / float(len(trainset))
    if printing: print('Accuracy on train set: ' + str(trainAcc))
    if not printing: return testAcc
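A minimal usage sketch for testClient (the input format is inferred from how film[0] and film[1] are used: a list of NLTK-style (feature_dict, label) pairs; the films and critics below are hypothetical):

films = [
    ({'Critic A': 'fresh', 'Critic B': 'fresh'}, 'fresh'),
    ({'Critic A': 'rotten'}, 'rotten'),
    ({'Critic B': 'fresh', 'Critic C': 'fresh'}, 'fresh'),
    ({'Critic C': 'rotten', 'Critic B': 'rotten'}, 'rotten'),
    ({'Critic A': 'fresh'}, 'fresh'),
    # ...a real call needs enough films that both labels appear in the train split
]
acc = testClient(films, fracLearn=0.8, LearnRate=0.027, printing=False)
print('held-out accuracy:', acc)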
Code Example #2
def train_classifier(collection, filename_start, **kwargs):
    main_cat_name = kwargs["main_cat_name"]
    main_cat_filenames = kwargs["main_cat_filenames"]
    opposite_cat_name = kwargs["opposite_cat_name"]
    opposite_cat_filenames = kwargs["opposite_cat_filenames"]

    main_cat_ids = get_training_ids_ms(*main_cat_filenames)
    main_cat = TrainTextPreprocessor(ids_list=main_cat_ids,
                                     db_collection=collection,
                                     category=main_cat_name)
    main_cat.process_data()
    # main_cat_features = main_cat.get_most_frequent(50)
    main_cat_tokens = main_cat.category_tokens
    main_cat_tweets = main_cat.tweets_lemmas_categorized

    opposite_cat_ids = get_training_ids_ms(*opposite_cat_filenames)
    opposite_cat = TrainTextPreprocessor(ids_list=opposite_cat_ids,
                                         db_collection=collection,
                                         category=opposite_cat_name)
    opposite_cat.process_data()
    # opposite_cat_features = opposite_cat.get_most_frequent(50)
    opposite_cat_tokens = opposite_cat.category_tokens
    opposite_cat_tweets = opposite_cat.tweets_lemmas_categorized
    # print(opposite_cat_tokens)

    # Compute TF-IDF over the combined corpus (the scores are currently unused).
    corpus = main_cat.tweets_lemmas + opposite_cat.tweets_lemmas
    tf_idf_range = compute_tfidf(corpus)

    documents = combine_and_shuffle(main_cat_tweets, opposite_cat_tweets)
    word_features = combine_and_shuffle(main_cat_tokens, opposite_cat_tokens)

    featuresets = []
    for tweet, category in documents:
        featuresets.append((find_features(tweet, word_features), category))

    train_index, test_index = get_indexes_80_20(len(featuresets))
    training_set = featuresets[:train_index]
    testing_set = featuresets[test_index:]

    # NAIVE BAYES CLASSIFIER
    # NBC_classifier = nltk.NaiveBayesClassifier.train(training_set)
    # print("Original NB classifier accuracy percent:", (nltk.classify.accuracy(NBC_classifier, testing_set)) * 100)
    #
    # NBC_classifier.show_most_informative_features(50)

    LinearSVC_classifier = SklearnClassifier(LinearSVC())
    LinearSVC_classifier.train(training_set)
    print("Linear SVC classifier accuracy percent:",
          (nltk.classify.accuracy(LinearSVC_classifier, testing_set)) * 100)
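find_features is referenced here but not defined in any of these snippets; a minimal sketch of the usual NLTK bag-of-words variant it appears to follow (an assumption; Code Examples #12 and #14 call a one-argument version that reads word_features from a global):

def find_features(tweet, word_features):
    # Presence/absence of each candidate word: the dict-of-features format
    # that SklearnClassifier's internal DictVectorizer expects.
    words = set(tweet)
    return {w: (w in words) for w in word_features}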
Code Example #3
def train_mnb_clf(training_set, testing_set):
    """
    accuracy: 73.28
    """
    mnb_classifier = SklearnClassifier(MultinomialNB())
    mnb_classifier.train(training_set)
    print("Multinomial NB Classifier accuracy:",
          (classify.accuracy(mnb_classifier, testing_set)) * 100)

    pickle_as = os.path.join(utils.get_project_root(),
                             'data/classifiers/mnb_classifier_5k.pickle')
    with open(pickle_as, 'wb') as f:
        pickle.dump(mnb_classifier, f)
Code Example #4
def train_linear_svc_clf(training_set, testing_set):
    """
    accuracy: 72.01
    """
    linear_svc_classifier = SklearnClassifier(LinearSVC())
    linear_svc_classifier.train(training_set)
    print("LinearSVC Classifier accuracy:",
          (classify.accuracy(linear_svc_classifier, testing_set)) * 100)

    pickle_as = os.path.join(
        utils.get_project_root(),
        'data/classifiers/linear_svc_classifier_5k.pickle')
    with open(pickle_as, 'wb') as f:
        pickle.dump(linear_svc_classifier, f)
Code Example #5
def train_bernoulli_nb_clf(training_set, testing_set):
    """
    accuracy: 74.64
    """
    bernoulli_nb_classifier = SklearnClassifier(BernoulliNB())
    bernoulli_nb_classifier.train(training_set)
    print("Bernoulli NB Classifier accuracy:",
          (classify.accuracy(bernoulli_nb_classifier, testing_set)) * 100)

    pickle_as = os.path.join(
        utils.get_project_root(),
        'data/classifiers/bernoulli_nb_classifier_5k.pickle')
    with open(pickle_as, 'wb') as f:
        pickle.dump(bernoulli_nb_classifier, f)
Code Example #6
def train_logistic_regression_clf(training_set, testing_set):
    """
    accuracy: 74.59
    """
    logistic_regression_classifier = SklearnClassifier(LogisticRegression())
    logistic_regression_classifier.train(training_set)
    print('Logistic Regression Classifier accuracy:',
          (classify.accuracy(logistic_regression_classifier, testing_set)) *
          100)

    pickle_as = os.path.join(
        utils.get_project_root(),
        'data/classifiers/logistic_regression_classifier_5k.pickle')
    with open(pickle_as, 'wb') as f:
        pickle.dump(logistic_regression_classifier, f)
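All four train_* helpers above pickle the trained SklearnClassifier; a sketch of loading one back for prediction (the path matches the helper above; the feature dict is hypothetical):

import os
import pickle

path = os.path.join(utils.get_project_root(),
                    'data/classifiers/logistic_regression_classifier_5k.pickle')
with open(path, 'rb') as f:
    clf = pickle.load(f)
print(clf.classify({'great': True, 'boring': False}))  # hypothetical features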
Code Example #7
def suggestions(labeledset, featureset, num=20):
    from nltk import SklearnClassifier
    from sklearn.linear_model import LogisticRegression

    clf = SklearnClassifier(LogisticRegression(C=0.024))
    clf.train(labeledset)

    filmsSeen = []
    for film in labeledset:
        filmsSeen.append(film[0]['title'])

    suggestions = []
    for film in featureset:
        if film['title'] in filmsSeen:
            continue
        suggestions.append((film['title'], clf.prob_classify(film).prob('fresh')))

    suggestions.sort(key=lambda x: x[1], reverse=True)
    return suggestions[:num]
Code Example #8
def train(isis_path, general_path, out_path, for_production=True):
    import codecs
    import random
    from nltk import SklearnClassifier, classify
    from sklearn.linear_model import LogisticRegression
    import joblib  # on older scikit-learn: from sklearn.externals import joblib

    # Load data
    with codecs.open(isis_path, 'r', 'utf-8-sig') as f:
        isis_tweets = tuple(f)
    with codecs.open(general_path, 'r', 'utf-8-sig') as f:
        general_tweets = tuple(f)

    # Build datasets
    #   Label & shuffle lines
    labeled_lines = ([(line, 'isis') for line in isis_tweets] + [(line, 'general') for line in general_tweets])
    random.shuffle(labeled_lines)
    #   Tokenize into words
    entire_set = [(tweet_features(n), tweet_class) for (n, tweet_class) in labeled_lines]
    cls = SklearnClassifier(LogisticRegression())
    train_set = test_set = entire_set  # in production, train on everything
    if not for_production:
        train_set = entire_set[500:]
        test_set = entire_set[:500]  # hold out the first 500 shuffled tweets
    cls.train(train_set)
    # In production mode test_set == train_set, so this is training accuracy;
    # otherwise it is held-out accuracy.
    print("accuracy: " + str(classify.accuracy(cls, test_set)))
    joblib.dump(cls, out_path)
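tweet_features is not shown in this snippet; a plausible minimal version (an assumption, consistent with the "Tokenize into words" comment) maps each lowercased token of the line to True:

from nltk import word_tokenize

def tweet_features(line):
    # Assumed feature extractor: token-presence dict for SklearnClassifier.
    return {w.lower(): True for w in word_tokenize(line)}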
Code Example #9
def nearCrits(crit, critset, filmset, fracTrain=0.7, learnRate=0.024):
    import random
    from nltk import SklearnClassifier
    from sklearn.linear_model import LogisticRegression
    
    labeledset = makeLabeledset(crit, filmset, critset)
    random.shuffle(labeledset)
    trainset = labeledset[:int(len(labeledset)*fracTrain)]
    
    clf = SklearnClassifier(LogisticRegression(C=learnRate))
    clf.train(trainset)
    
    critdist = []
    baseline = clf.prob_classify({}).prob('fresh')  # probability given no features
    for c in critset:  # each candidate critic (distinct from the crit parameter)
        dist = clf.prob_classify({c: 'fresh'}).prob('fresh') - baseline
        critdist.append((c, dist))
    critdist.sort(key=lambda x: x[1], reverse=True)
    # print the 10 lowest-scoring critics, then the 10 highest-scoring ones
    for i in range(-10, 10):
        print(critdist[i])
Code Example #10
def entrenar_recomendacion(feature_labels):
    # Run 10-fold cross-validation to see how well the classifier is doing,
    # i.e. whether some parameter configuration gives a more accurate result.
    cv = cross_validation.KFold(len(feature_labels), n_folds=10)
    sum_accuracy = 0
    k = 0
    for traincv, testcv in cv:
        #        classifier = NaiveBayesClassifier.train([feature_labels[i] for i in traincv])
        #        classifier = MaxentClassifier.train([feature_labels[i] for i in traincv])
        # Chosen classifier: a support vector machine for classification.
        # Index with the fold's index array (KFold yields index arrays, and
        # the training indices are not contiguous, so slicing is unsafe).
        classifier = SklearnClassifier(
            SVC(kernel='linear', probability=True)
        ).train([feature_labels[i] for i in traincv])
        #        classifier = SklearnClassifier(knn()).train([feature_labels[i] for i in traincv])
        y_true = []
        y_pred = []
        for i in range(len(testcv)):
            y_true.append(feature_labels[testcv[i]][1])
            y_pred.append(classifier.classify(feature_labels[testcv[i]][0]))
        # Accuracy of this fold's predictions against the held-out labels.
        acc = metrics.accuracy_score(y_true, y_pred)
        sum_accuracy += acc  # accumulate total accuracy
        k += 1
        print(str(k) + ') accuracy: ' + str(acc))
        print('True classes: ' + str(y_true))
        print('Predictions: ' + str(y_pred))
        print('')
    print('ACCURACY: ' + str(sum_accuracy / k))
    # Retrain on the full data set before returning.
    classifier.train(feature_labels)
    return classifier
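sklearn.cross_validation was removed in scikit-learn 0.20; under the current API the same loop would begin roughly like this (a sketch, keeping the names used above):

from sklearn.model_selection import KFold

cv = KFold(n_splits=10)
for traincv, testcv in cv.split(feature_labels):
    # traincv and testcv are index arrays, as in the old iterator API
    classifier = SklearnClassifier(
        SVC(kernel='linear', probability=True)
    ).train([feature_labels[i] for i in traincv])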
Code Example #11
# testing_set is the first 100 feature sets; training_set (used below) is
# assumed to be defined earlier in the script, e.g. featuresset[100:].
testing_set = featuresset[:100]

save_featuresset = open("pickleDocuments/reviewDocumentFeaturesset.pickle", "wb")
pickle.dump(featuresset, save_featuresset)
save_featuresset.close()

naive_bays_classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Naive Bayes classifier accuracy :", nltk.classify.accuracy(naive_bays_classifier, testing_set))
naive_bays_classifier.show_most_informative_features()  # prints directly and returns None

save_original_naive_bays = open("pickleAlgos/original_naive_bays_classifier.pickle", "wb")
pickle.dump(naive_bays_classifier, save_original_naive_bays)
save_original_naive_bays.close()

multinomial_naive_bays_classifier = SklearnClassifier(MultinomialNB())
multinomial_naive_bays_classifier.train(training_set)
print("Multinominal Naive Bays classifier accuracy :", nltk.classify.accuracy(multinomial_naive_bays_classifier, testing_set))

save_multinomial_naive_bays_classifier = open("pickleAlgos/multinomial_naive_bays_classifier.pickle", "wb")
pickle.dump(multinomial_naive_bays_classifier, save_multinomial_naive_bays_classifier)
save_multinomial_naive_bays_classifier.close()

bernoulli_naive_bays_classifier = SklearnClassifier(BernoulliNB())
bernoulli_naive_bays_classifier.train(training_set)
print("Bernoulli Naive Bays classifier accuracy :",
      nltk.classify.accuracy(bernoulli_naive_bays_classifier, testing_set))

save_bernoulli_naive_bays_classifier = open("pickleAlgos/bernoulli_naive_bays_classifier.pickle", "wb")
pickle.dump(bernoulli_naive_bays_classifier, save_bernoulli_naive_bays_classifier)
save_bernoulli_naive_bays_classifier.close()
Code Example #12
def calc_model():
    global word_features, classifier
    documents = []
    pos = 0
    neg = 0
    with open("data.csv") as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for record in csv_reader:
            # Strip @mentions and URLs, collapse whitespace, then tokenize.
            ap = ' '.join(
                re.sub(r"(@[A-Za-z0-9]+)|(\w+://\S+)", " ",
                       record[1]).split())
            ap = word_tokenize(ap)
            documents.append((ap, record[0]))
            if '0' == record[0]:
                neg = neg + 1
            elif '1' == record[0]:
                pos = pos + 1

    print("neg ", neg)
    print("pos ", pos)

    shuffle(documents)

    all_words = []
    for tweet in documents:
        for w in tweet[0]:
            all_words.append(w.lower())

    all_words = nltk.FreqDist(all_words)
    print("getting features")
    # Take the 1000 most frequent words; plain .keys() is insertion-ordered,
    # not frequency-ordered.
    word_features = [w for w, _ in all_words.most_common(1000)]

    save_pickle(pickle_word_features, word_features)
    print("saved word features")

    print("setting features per tweet")
    feature_sets = np.array([[find_features(tweet), category]
                             for (tweet, category) in documents])

    data = feature_sets[:, 0]

    k = 10
    cv = KFold(k)
    accur = []
    pos_precision = []
    pos_recall = []
    neg_precision = []
    neg_recall = []
    i = 0
    for train_index, test_index in cv.split(data):
        print("starting split " + str(i + 1))
        training_this_round = feature_sets[train_index]
        testing_this_round = feature_sets[test_index]
        linear_svc_classifier = SklearnClassifier(LinearSVC())
        classifier = linear_svc_classifier.train(training_this_round)
        accur.insert(
            i, nltk.classify.util.accuracy(classifier, testing_this_round))
        print('accuracy:', accur[i])
        i = i + 1
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)

        for j, (feats, label) in enumerate(testing_this_round):
            refsets[label].add(j)
            observed = classifier.classify(feats)
            testsets[observed].add(j)

        cv_pos_precision = precision(refsets['1'], testsets['1'])
        cv_pos_recall = recall(refsets['1'], testsets['1'])
        cv_neg_precision = precision(refsets['0'], testsets['0'])
        cv_neg_recall = recall(refsets['0'], testsets['0'])

        print('Precision:', cv_pos_precision)
        print('Recall:', cv_pos_recall)
        print('Precision neg:', cv_neg_precision)
        print('Recall neg:', cv_neg_recall)
        pos_precision.append(cv_pos_precision)
        pos_recall.append(cv_pos_recall)
        neg_precision.append(cv_neg_precision)
        neg_recall.append(cv_neg_recall)

    print('LinearSVC_classifier average accuracy:', sum(accur) / len(accur))
    print('precision',
          (sum(pos_precision) / len(accur) + sum(neg_precision) / len(accur)) /
          2)
    print('recall',
          (sum(pos_recall) / len(accur) + sum(neg_recall) / len(accur)) / 2)

    save_pickle(pickle_model, classifier)
Code Example #13
    clf = pickle.load(open(path + '/classifier.pkl', 'rb'))
    # predict = clf.prob_classify_many(feature_extractor.extract_features(test_review, best_words))
    # print('saving prediction results')
    # feature_extractor.store_predict_result(path, predict)
    # print('prediction finished')
    # SVM
    predict = clf.classify_many(
        feature_extractor.extract_features(test_review, best_words))
    print("saving prediction results")
    p_file = open(path + '/result/great_SVMfinal.txt', 'w')
    for pre in predict:
        p_file.write(pre + '\n')
    p_file.close()

    svmclassifier = SklearnClassifier(LinearSVC())
    svmclassifier.train(train_set)
    predict = svmclassifier.classify_many(test)
    print("SVM overall accuracy: " + str(accuracy_score(tag_test, predict)))
    predict = svmclassifier.classify_many(pos)
    print("SVM pos accuracy: " + str(accuracy_score(tag_pos, predict)))
    predict = svmclassifier.classify_many(neg)
    print("SVM neg accuracy: " + str(accuracy_score(tag_neg, predict)))

    nbclassifier = SklearnClassifier(MultinomialNB())
    nbclassifier.train(train_set)
    predict = nbclassifier.classify_many(test)
    print("NB overall accuracy: " + str(accuracy_score(tag_test, predict)))
    predict = nbclassifier.classify_many(pos)
    print("NB pos accuracy: " + str(accuracy_score(tag_pos, predict)))
    predict = nbclassifier.classify_many(neg)
    print("NB neg accuracy: " + str(accuracy_score(tag_neg, predict)))
Code Example #14
def calc_model():
    global word_features, classifier, word_features_2gram
    # documents = [(list(movie_reviews.words(fileid)), category)
    #              for category in movie_reviews.categories()
    #              for fileid in movie_reviews.fileids(category)]

    documents = []
    documents2gram = []

    with open("positive.txt", 'r') as csv_file:
        pos = 1
        for record in csv_file:
            documents.append((word_tokenize(record), pos))
            # sixgrams = get_ngrams(record, 2)
            # documents2gram.append((get_ngrams(record, 2), pos))

    with open("negative.txt", 'r') as csv_file:
        for record in csv_file:
            documents.append((word_tokenize(record), 0))

            # documents2gram.append((get_ngrams(record, 2), 0))


    random.shuffle(documents)
    # random.shuffle(documents2gram)

    all_words = []
    for lst in documents:
        for w in lst[0]:
            all_words.append(w.lower())

    # all_words_2gram = []
    # for lst in documents2gram:
    #     for w in lst[0]:
    #         all_words_2gram.append(w.lower())

    all_words = nltk.FreqDist(all_words)
    print("getting features")
    # Take the 5000 most frequent words; plain .keys() is insertion-ordered,
    # not frequency-ordered.
    word_features = [w for w, _ in all_words.most_common(5000)]

    # all_words_2gram = nltk.FreqDist(all_words_2gram)
    # print("getting features")
    # word_features_2gram = list(all_words_2gram.keys())[:5000]

    save_pickle(pickle_word_features, word_features)
    print("saved word features")

    print("setting features per tweet")
    feature_sets = [(find_features(rev), category) for (rev, category) in documents]
    # feature_sets_2gram = [(find_features(rev), category) for (rev, category) in documents2gram]



    testing_set = feature_sets[1900:]  # + feature_sets_2gram[1900:]
    training_set = feature_sets[:1900]  # + feature_sets_2gram[:1900]

    linear_svc_classifier = SklearnClassifier(LinearSVC())
    # classifier = nltk.NaiveBayesClassifier.train(training_set)
    # Train on the training split and score on the held-out split.
    classifier = linear_svc_classifier.train(training_set)
    accuracy = nltk.classify.util.accuracy(classifier, testing_set)

    print('LinearSVC_classifier accuracy:', accuracy)
Code Example #15
    # Our data set holds 10k examples which, based on research, is a
    # recommended size for small-to-medium problems; we train on the first
    # 3,000 and test on the last 3,000.
    train_data = dataset[:3000]
    test_data = dataset[7000:]

    # From this section on we run the project algorithms against 7 classifiers,
    # each printing its individual accuracy. To speed up later loading of the
    # trained algorithms, we save each one to a pickle file so it can be
    # reloaded in later stages when required.
    NB_classifier = NaiveBayesClassifier.train(train_data)
    print("Naive Bayes accuracy is:", (classify.accuracy(NB_classifier, test_data)) * 100, "%")
    save_classifier = open("TrainedAlgorithms/NB_classifier.pickle", "wb")
    pickle.dump(NB_classifier, save_classifier)
    save_classifier.close()

    MNB_classifier = SklearnClassifier(MultinomialNB())
    MNB_classifier.train(train_data)
    print("Multinomial Naive Bayes accuracy is:", (nltk.classify.accuracy(MNB_classifier, test_data)) * 100, "%")
    save_classifier = open("TrainedAlgorithms/MNB_classifier.pickle", "wb")
    pickle.dump(MNB_classifier, save_classifier)
    save_classifier.close()

    BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
    BernoulliNB_classifier.train(train_data)
    print("Bernoulli Naive Bayes accuracy is:", (nltk.classify.accuracy(BernoulliNB_classifier, test_data)) * 100, "%")
    save_classifier = open("TrainedAlgorithms/BernoulliNB_classifier.pickle", "wb")
    pickle.dump(BernoulliNB_classifier, save_classifier)
    save_classifier.close()

    LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
    LogisticRegression_classifier.train(train_data)
    print("Logistic Regression accuracy is:",