def testClient(labeledset, fracLearn=0.8, LearnRate=0.027, printing=True, SVM=False):
    import random
    from nltk import SklearnClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import LinearSVC

    # Shuffle, then split into train/test sets.
    random.shuffle(labeledset)
    length = len(labeledset)
    trainset = labeledset[:int(length * fracLearn)]
    testset = labeledset[int(length * fracLearn):]

    # "LearnRate" is really the inverse regularization strength C
    # (LogisticRegression C=0.0012 and LinearSVC C=0.0007 also worked well).
    if SVM:
        clf = SklearnClassifier(LinearSVC(C=LearnRate))
    else:
        clf = SklearnClassifier(LogisticRegression(C=LearnRate))
    clf.train(trainset)

    # Accuracy on the held-out test set.
    correct = 0
    for film in testset:
        if clf.classify(film[0]) == film[1]:
            correct += 1
    testAcc = correct / float(len(testset))
    if printing:
        print('Accuracy on test set: ' + str(testAcc))

    # Accuracy on the training set.
    correct = 0
    for film in trainset:
        if clf.classify(film[0]) == film[1]:
            correct += 1
    trainAcc = correct / float(len(trainset))
    if printing:
        print('Accuracy on train set: ' + str(trainAcc))

    if not printing:
        return testAcc
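# A minimal usage sketch: the films and labels below are hypothetical; the real
# labeledset is a list of (feature-dict, label) pairs, the format
# nltk's SklearnClassifier trains on.
example_set = [({'Critic A': 'fresh', 'Critic B': 'fresh'}, 'fresh'),
               ({'Critic A': 'rotten'}, 'rotten')] * 10
acc = testClient(example_set, printing=False)
print('held-out test accuracy: ' + str(acc))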
def train_classifier(collection, filename_start, **kwargs):
    main_cat_name = kwargs["main_cat_name"]
    main_cat_filenames = kwargs["main_cat_filenames"]
    opposite_cat_name = kwargs["opposite_cat_name"]
    opposite_cat_filenames = kwargs["opposite_cat_filenames"]

    # Preprocess tweets for the main category.
    main_cat_ids = get_training_ids_ms(*main_cat_filenames)
    main_cat = TrainTextPreprocessor(ids_list=main_cat_ids,
                                     db_collection=collection,
                                     category=main_cat_name)
    main_cat.process_data()
    main_cat_tokens = main_cat.category_tokens
    main_cat_tweets = main_cat.tweets_lemmas_categorized

    # Preprocess tweets for the opposite category.
    opposite_cat_ids = get_training_ids_ms(*opposite_cat_filenames)
    opposite_cat = TrainTextPreprocessor(ids_list=opposite_cat_ids,
                                         db_collection=collection,
                                         category=opposite_cat_name)
    opposite_cat.process_data()
    opposite_cat_tokens = opposite_cat.category_tokens
    opposite_cat_tweets = opposite_cat.tweets_lemmas_categorized

    # Compute TF-IDF over the combined corpus (currently unused).
    corpus = main_cat.tweets_lemmas + opposite_cat.tweets_lemmas
    tf_idf_range = compute_tfidf(corpus)

    documents = combine_and_shuffle(main_cat_tweets, opposite_cat_tweets)
    word_features = combine_and_shuffle(main_cat_tokens, opposite_cat_tokens)

    featuresets = [(find_features(tweet, word_features), category)
                   for tweet, category in documents]

    # 80/20 train/test split.
    train_index, test_index = get_indexes_80_20(len(featuresets))
    training_set = featuresets[:train_index]
    testing_set = featuresets[test_index:]

    # Alternative: NLTK's own Naive Bayes classifier.
    # NBC_classifier = nltk.NaiveBayesClassifier.train(training_set)
    # print("Original NB classifier accuracy percent:",
    #       nltk.classify.accuracy(NBC_classifier, testing_set) * 100)
    # NBC_classifier.show_most_informative_features(50)

    LinearSVC_classifier = SklearnClassifier(LinearSVC())
    LinearSVC_classifier.train(training_set)
    print("Linear SVC classifier accuracy percent:",
          nltk.classify.accuracy(LinearSVC_classifier, testing_set) * 100)
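# combine_and_shuffle is a project helper defined elsewhere; a minimal sketch
# consistent with how it is called above (an assumption, not the project's
# actual implementation):
import random

def combine_and_shuffle_sketch(first, second):
    # Concatenate the two lists and shuffle the result in place.
    merged = list(first) + list(second)
    random.shuffle(merged)
    return merged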
def train_mnb_clf(training_set, testing_set):
    """accuracy: 73.28"""
    mnb_classifier = SklearnClassifier(MultinomialNB())
    mnb_classifier.train(training_set)
    print("Multinomial NB Classifier accuracy:",
          classify.accuracy(mnb_classifier, testing_set) * 100)

    # Persist the trained classifier.
    pickle_as = os.path.join(utils.get_project_root(),
                             'data/classifiers/mnb_classifier_5k.pickle')
    with open(pickle_as, 'wb') as f:
        pickle.dump(mnb_classifier, f)
def train_linear_svc_clf(training_set, testing_set):
    """accuracy: 72.01"""
    linear_svc_classifier = SklearnClassifier(LinearSVC())
    linear_svc_classifier.train(training_set)
    print("LinearSVC Classifier accuracy:",
          classify.accuracy(linear_svc_classifier, testing_set) * 100)

    # Persist the trained classifier.
    pickle_as = os.path.join(
        utils.get_project_root(),
        'data/classifiers/linear_svc_classifier_5k.pickle')
    with open(pickle_as, 'wb') as f:
        pickle.dump(linear_svc_classifier, f)
def train_bernoulli_nb_clf(training_set, testing_set):
    """accuracy: 74.64"""
    bernoulli_nb_classifier = SklearnClassifier(BernoulliNB())
    bernoulli_nb_classifier.train(training_set)
    print("Bernoulli NB Classifier accuracy:",
          classify.accuracy(bernoulli_nb_classifier, testing_set) * 100)

    # Persist the trained classifier.
    pickle_as = os.path.join(
        utils.get_project_root(),
        'data/classifiers/bernoulli_nb_classifier_5k.pickle')
    with open(pickle_as, 'wb') as f:
        pickle.dump(bernoulli_nb_classifier, f)
def train_logistic_regression_clf(training_set, testing_set):
    """accuracy: 74.59"""
    logistic_regression_classifier = SklearnClassifier(LogisticRegression())
    logistic_regression_classifier.train(training_set)
    print('Logistic Regression Classifier accuracy:',
          classify.accuracy(logistic_regression_classifier, testing_set) * 100)

    # Persist the trained classifier.
    pickle_as = os.path.join(
        utils.get_project_root(),
        'data/classifiers/logistic_regression_classifier_5k.pickle')
    with open(pickle_as, 'wb') as f:
        pickle.dump(logistic_regression_classifier, f)
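# A minimal sketch of reloading one of the pickled classifiers above (assumes
# the same data/classifiers/ layout; the feature dict passed to classify() is
# a hypothetical bag-of-words example):
def load_clf(filename):
    pickle_path = os.path.join(utils.get_project_root(),
                               'data/classifiers', filename)
    with open(pickle_path, 'rb') as f:
        return pickle.load(f)

# clf = load_clf('bernoulli_nb_classifier_5k.pickle')
# print(clf.classify({'good': True, 'boring': False}))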
def suggestions(labeledset, featureset, num=20):
    from nltk import SklearnClassifier
    from sklearn.linear_model import LogisticRegression

    # Train a logistic regression recommender on the labeled films.
    clf = SklearnClassifier(LogisticRegression(C=0.024))
    clf.train(labeledset)

    # Skip films the user has already seen.
    filmsSeen = [film[0]['title'] for film in labeledset]

    # Rank unseen films by predicted probability of being 'fresh'.
    suggestions = []
    for film in featureset:
        if film['title'] in filmsSeen:
            continue
        suggestions.append((film['title'], clf.prob_classify(film).prob('fresh')))
    suggestions.sort(key=lambda x: x[1], reverse=True)
    return suggestions[:num]
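# A toy call with hypothetical films: train on the films already seen, then
# rank the unseen candidates by predicted probability of 'fresh'.
seen = [({'title': 'A', 'c1': 'fresh'}, 'fresh'),
        ({'title': 'B', 'c1': 'rotten'}, 'rotten')] * 5
candidates = [{'title': 'C', 'c1': 'fresh'}, {'title': 'D', 'c1': 'rotten'}]
print(suggestions(seen, candidates, num=2))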
def train(isis_path, general_path, out_path, for_production=True):
    # Load data.
    isis_tweets = tuple(codecs.open(isis_path, 'r', 'utf-8-sig'))
    general_tweets = tuple(codecs.open(general_path, 'r', 'utf-8-sig'))

    # Build the data set: label and shuffle lines.
    labeled_lines = ([(line, 'isis') for line in isis_tweets] +
                     [(line, 'general') for line in general_tweets])
    random.shuffle(labeled_lines)

    # Tokenize each tweet into a feature dict.
    entire_set = [(tweet_features(n), tweet_class)
                  for (n, tweet_class) in labeled_lines]

    cls = SklearnClassifier(LogisticRegression())

    # In production, train on everything; otherwise hold out 500 tweets so the
    # reported accuracy is measured on unseen data.
    train_set = test_set = entire_set
    if not for_production:
        train_set = entire_set[500:]
        test_set = entire_set[:500]

    cls.train(train_set)
    print("accuracy: " + str(classify.accuracy(cls, test_set)))
    joblib.dump(cls, out_path)
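# tweet_features is defined elsewhere in this project; a minimal sketch that
# matches how it is called above (an assumption, not the project's actual
# extractor) is a bag-of-words feature dict:
def tweet_features_sketch(tweet):
    # Mark each lowercased whitespace-separated token as a present feature.
    return {token: True for token in tweet.lower().split()}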
def nearCrits(crit, critset, filmset, fracTrain=0.7, learnRate=0.024):
    import random
    from nltk import SklearnClassifier
    from sklearn.linear_model import LogisticRegression

    # Build and shuffle a labeled set for this critic, then train on a fraction.
    labeledset = makeLabeledset(crit, filmset, critset)
    random.shuffle(labeledset)
    trainset = labeledset[:int(len(labeledset) * fracTrain)]
    clf = SklearnClassifier(LogisticRegression(C=learnRate))
    clf.train(trainset)

    # Score each critic by the shift in P('fresh') that a lone 'fresh' rating
    # from them produces, relative to the empty-feature baseline.
    critdist = []
    baseline = clf.prob_classify({}).prob('fresh')
    for other in critset:  # 'other' avoids shadowing the crit parameter
        dist = clf.prob_classify({other: 'fresh'}).prob('fresh') - baseline
        critdist.append((other, dist))
    critdist.sort(key=lambda x: x[1], reverse=True)

    # Print the ten most negatively and ten most positively aligned critics.
    for i in range(-10, 10):
        print(critdist[i])
def entrenar_recomendacion(feature_labels):
    from nltk import SklearnClassifier
    from sklearn import metrics
    from sklearn.model_selection import KFold
    from sklearn.svm import SVC

    # Cross-validate to see how our classifier is doing, i.e. whether some
    # parameter configuration gives us a more accurate result.
    cv = KFold(n_splits=10)
    sum_accuracy = 0
    k = 0
    for traincv, testcv in cv.split(feature_labels):
        # Our classification algorithm: a support vector machine.
        # (Alternatives tried: NaiveBayesClassifier, MaxentClassifier, kNN.)
        classifier = SklearnClassifier(SVC(kernel='linear', probability=True)).train(
            [feature_labels[i] for i in traincv])

        y_true = [feature_labels[i][1] for i in testcv]
        y_pred = [classifier.classify(feature_labels[i][0]) for i in testcv]

        # Accuracy of this fold on its held-out test indices.
        acc = metrics.accuracy_score(y_true, y_pred)
        sum_accuracy += acc
        k += 1
        print(str(k) + ') accuracy: ' + str(acc))
        print('True classes: ' + str(y_true))
        print('Predictions: ' + str(y_pred))
        print('')

    print('ACCURACY: ' + str(sum_accuracy / k))

    # Retrain on the full data set before returning.
    classifier.train(feature_labels)
    return classifier
testing_set = featuresset[:100]

# Persist the feature sets for later runs.
save_featuresset = open("pickleDocuments/reviewDocumentFeaturesset.pickle", "wb")
pickle.dump(featuresset, save_featuresset)
save_featuresset.close()

# NLTK's own Naive Bayes classifier.
naive_bayes_classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Naive Bayes classifier accuracy:",
      nltk.classify.accuracy(naive_bayes_classifier, testing_set))
# show_most_informative_features prints its table itself.
naive_bayes_classifier.show_most_informative_features()

save_original_naive_bayes = open("pickleAlgos/original_naive_bays_classifier.pickle", "wb")
pickle.dump(naive_bayes_classifier, save_original_naive_bayes)
save_original_naive_bayes.close()

# Multinomial Naive Bayes via scikit-learn.
multinomial_naive_bayes_classifier = SklearnClassifier(MultinomialNB())
multinomial_naive_bayes_classifier.train(training_set)
print("Multinomial Naive Bayes classifier accuracy:",
      nltk.classify.accuracy(multinomial_naive_bayes_classifier, testing_set))

save_multinomial_naive_bayes = open("pickleAlgos/multinomial_naive_bays_classifier.pickle", "wb")
pickle.dump(multinomial_naive_bayes_classifier, save_multinomial_naive_bayes)
save_multinomial_naive_bayes.close()

# Bernoulli Naive Bayes via scikit-learn.
bernoulli_naive_bayes_classifier = SklearnClassifier(BernoulliNB())
bernoulli_naive_bayes_classifier.train(training_set)
print("Bernoulli Naive Bayes classifier accuracy:",
      nltk.classify.accuracy(bernoulli_naive_bayes_classifier, testing_set))

save_bernoulli_naive_bayes = open("pickleAlgos/bernoulli_naive_bays_classifier.pickle", "wb")
pickle.dump(bernoulli_naive_bayes_classifier, save_bernoulli_naive_bayes)
save_bernoulli_naive_bayes.close()
def calc_model():
    global word_features, classifier

    # Load tweets, strip @mentions and URLs, tokenize, and count each class.
    documents = []
    pos = 0
    neg = 0
    with open("data.csv") as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for record in csv_reader:
            ap = ' '.join(re.sub(r"(@[A-Za-z0-9]+)|(\w+:\/\/\S+)", " ", record[1]).split())
            ap = word_tokenize(ap)
            documents.append((ap, record[0]))
            if record[0] == '0':
                neg += 1
            elif record[0] == '1':
                pos += 1
    print("neg ", neg)
    print("pos ", pos)
    shuffle(documents)

    # Build the vocabulary and keep the 1000 most common words as features.
    all_words = []
    for tweet in documents:
        for w in tweet[0]:
            all_words.append(w.lower())
    all_words = nltk.FreqDist(all_words)
    print("getting features")
    word_features = list(all_words.keys())[:1000]
    save_pickle(pickle_word_features, word_features)
    print("saved word features")

    print("setting features per tweet")
    feature_sets = np.array([[find_features(tweet), category]
                             for (tweet, category) in documents])
    data = feature_sets[:, 0]

    # 10-fold cross-validation.
    k = 10
    cv = KFold(k)
    accur = []
    pos_precision = []
    pos_recall = []
    neg_precision = []
    neg_recall = []
    i = 0
    for train_index, test_index in cv.split(data):
        print("starting split " + str(i + 1))
        training_this_round = feature_sets[train_index]
        testing_this_round = feature_sets[test_index]

        linear_svc_classifier = SklearnClassifier(LinearSVC())
        classifier = linear_svc_classifier.train(training_this_round)
        accur.insert(i, nltk.classify.util.accuracy(classifier, testing_this_round))
        print('accuracy:', accur[i])
        i += 1

        # Collect reference and predicted label sets for precision/recall.
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        for j, (feats, label) in enumerate(testing_this_round):
            refsets[label].add(j)
            observed = classifier.classify(feats)
            testsets[observed].add(j)

        cv_pos_precision = precision(refsets['1'], testsets['1'])
        cv_pos_recall = recall(refsets['1'], testsets['1'])
        cv_neg_precision = precision(refsets['0'], testsets['0'])
        cv_neg_recall = recall(refsets['0'], testsets['0'])
        print('Precision:', cv_pos_precision)
        print('Recall:', cv_pos_recall)
        print('Precision neg:', cv_neg_precision)
        print('Recall neg:', cv_neg_recall)
        pos_precision.append(cv_pos_precision)
        pos_recall.append(cv_pos_recall)
        neg_precision.append(cv_neg_precision)
        neg_recall.append(cv_neg_recall)

    # Macro-averaged metrics across the folds.
    print('LinearSVC_classifier average accuracy:', sum(accur) / len(accur))
    print('precision', (sum(pos_precision) / len(accur) + sum(neg_precision) / len(accur)) / 2)
    print('recall', (sum(pos_recall) / len(accur) + sum(neg_recall) / len(accur)) / 2)
    save_pickle(pickle_model, classifier)
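# find_features is assumed to be defined at module level alongside
# word_features; a minimal sketch consistent with how it is called here
# (an illustration, not necessarily this repo's actual implementation):
def find_features_sketch(tweet_tokens):
    tokens = set(w.lower() for w in tweet_tokens)
    # One boolean feature per vocabulary word: is it present in the tweet?
    return {w: (w in tokens) for w in word_features}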
# Load a previously pickled classifier.
clf = pickle.load(open(path + '/classifier.pkl', 'rb'))

# predict = clf.prob_classify_many(feature_extractor.extract_features(test_review, best_words))
# print('Storing prediction results')
# feature_extractor.store_predict_result(path, predict)
# print('Finished predicting')

# SVM: predict labels for the test reviews and write them out.
predict = clf.classify_many(
    feature_extractor.extract_features(test_review, best_words))
print("Storing prediction results")
p_file = open(path + '/result/great_SVMfinal.txt', 'w')
for pre in predict:
    p_file.write(pre + '\n')
p_file.close()

# Train a fresh LinearSVC and report accuracy overall and per class.
svmclassifier = SklearnClassifier(LinearSVC())
svmclassifier.train(train_set)
predict = svmclassifier.classify_many(test)
print("SVM overall accuracy: " + str(accuracy_score(tag_test, predict)))
predict = svmclassifier.classify_many(pos)
print("SVM pos accuracy: " + str(accuracy_score(tag_pos, predict)))
predict = svmclassifier.classify_many(neg)
print("SVM neg accuracy: " + str(accuracy_score(tag_neg, predict)))

# Same evaluation for Multinomial Naive Bayes.
nbclassifier = SklearnClassifier(MultinomialNB())
nbclassifier.train(train_set)
predict = nbclassifier.classify_many(test)
print("NB overall accuracy: " + str(accuracy_score(tag_test, predict)))
predict = nbclassifier.classify_many(pos)
print("NB pos accuracy: " + str(accuracy_score(tag_pos, predict)))
predict = nbclassifier.classify_many(neg)
print("NB neg accuracy: " + str(accuracy_score(tag_neg, predict)))
def calc_model():
    global word_features, classifier

    # Load positive and negative examples and tokenize them.
    # (A parallel bigram pipeline via get_ngrams(record, 2) is left disabled.)
    documents = []
    with open("positive.txt", 'r') as pos_file:
        for record in pos_file:
            documents.append((word_tokenize(record), 1))
    with open("negative.txt", 'r') as neg_file:
        for record in neg_file:
            documents.append((word_tokenize(record), 0))
    random.shuffle(documents)

    # Keep the 5000 most common lowercased words as features.
    all_words = []
    for lst in documents:
        for w in lst[0]:
            all_words.append(w.lower())
    all_words = nltk.FreqDist(all_words)
    print("getting features")
    word_features = list(all_words.keys())[:5000]
    save_pickle(pickle_word_features, word_features)
    print("saved word features")

    print("setting features per review")
    feature_sets = [(find_features(rev), category) for (rev, category) in documents]

    # Single hold-out split: first 1900 examples train, the rest test.
    training_set = feature_sets[:1900]
    testing_set = feature_sets[1900:]

    # Train on the training split and evaluate on the held-out split.
    linear_svc_classifier = SklearnClassifier(LinearSVC())
    classifier = linear_svc_classifier.train(training_set)
    print('LinearSVC_classifier accuracy:',
          nltk.classify.util.accuracy(classifier, testing_set))
# Our data set holds 10k examples, a size the literature recommends for small
# to medium problems; we train on the first 3,000 and test on the last 3,000.
train_data = dataset[:3000]
test_data = dataset[7000:]

# From this section we run our data through 7 classifiers, each reporting its
# individual accuracy. To speed up later runs, each trained classifier is saved
# to a pickle file so it can be loaded again in later stages when required.
NB_classifier = NaiveBayesClassifier.train(train_data)
print("Naive Bayes accuracy is:", (classify.accuracy(NB_classifier, test_data)) * 100, "%")
save_classifier = open("TrainedAlgorithms/NB_classifier.pickle", "wb")
pickle.dump(NB_classifier, save_classifier)
save_classifier.close()

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(train_data)
print("Multinomial Naive Bayes accuracy is:", (nltk.classify.accuracy(MNB_classifier, test_data)) * 100, "%")
save_classifier = open("TrainedAlgorithms/MNB_classifier.pickle", "wb")
pickle.dump(MNB_classifier, save_classifier)
save_classifier.close()

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(train_data)
print("Bernoulli Naive Bayes accuracy is:", (nltk.classify.accuracy(BernoulliNB_classifier, test_data)) * 100, "%")
save_classifier = open("TrainedAlgorithms/BernoulliNB_classifier.pickle", "wb")
pickle.dump(BernoulliNB_classifier, save_classifier)
save_classifier.close()

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(train_data)
print("Logistic Regression accuracy is:", (nltk.classify.accuracy(LogisticRegression_classifier, test_data)) * 100, "%")
save_classifier = open("TrainedAlgorithms/LogisticRegression_classifier.pickle", "wb")
pickle.dump(LogisticRegression_classifier, save_classifier)
save_classifier.close()
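# A minimal sketch of the "later stages" load the comment above refers to
# (assumes the same TrainedAlgorithms/ layout used when saving):
with open("TrainedAlgorithms/MNB_classifier.pickle", "rb") as f:
    restored_mnb = pickle.load(f)
print("Restored MNB accuracy is:",
      nltk.classify.accuracy(restored_mnb, test_data) * 100, "%")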