def testClient(labeledset, fracLearn=0.8, LearnRate = 0.027, printing = True, SVM = False):
    """Train a classifier on a random train/test split and report accuracy.

    Args:
        labeledset: list of (features, label) pairs (NLTK classifier format).
        fracLearn: fraction of the data used for training (rest is held out).
        LearnRate: inverse-regularization strength ``C`` passed to the model.
        printing: when True, print train/test accuracy to stdout.
        SVM: when True use a LinearSVC, otherwise LogisticRegression.

    Returns:
        Test-set accuracy as a float.
    """
    import random
    from nltk import SklearnClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import LinearSVC

    # Bug fix: shuffle a copy so the caller's list is not reordered in place.
    data = list(labeledset)
    random.shuffle(data)
    split = int(len(data) * fracLearn)
    trainset, testset = data[:split], data[split:]

    if SVM:
        # Tuning note from the original author: LR C=0.0012, LinearSVC C=0.0007
        clf = SklearnClassifier(LinearSVC(C=LearnRate))
    else:
        clf = SklearnClassifier(LogisticRegression(C=LearnRate))
    clf.train(trainset)

    def _accuracy(dataset):
        # Fraction of (features, label) pairs the trained model classifies correctly.
        correct = sum(1 for feats, label in dataset if clf.classify(feats) == label)
        return correct / float(len(dataset))

    testAcc = _accuracy(testset)
    if printing:
        print('Accuracy on test set: ' + str(testAcc))
    trainAcc = _accuracy(trainset)
    if printing:
        print('Accuracy on train set: ' + str(trainAcc))
    # Bug fix: the original returned None when printing=True; always return the
    # test accuracy now (backward-compatible — old callers ignored the None).
    return testAcc
def classify(tweetFile, algorithm):
    """Classify every tweet in a CSV file and dump the results as JSON.

    Args:
        tweetFile: path to a CSV with at least 'text' and 'timestamp' columns.
        algorithm: 0 for Naive Bayes, 1 for a linear SVM.

    Raises:
        ValueError: if ``algorithm`` is neither 0 nor 1 (previously this fell
            through with ``model=None`` and crashed later with AttributeError).

    Side effects:
        Writes ../gen/<basename>.json and prints accuracy / PRF metrics.
        Relies on module globals: training_data, neg_data, pos_data,
        negative, positive, format_sentence, pd, nltk, json, etc.
    """
    if algorithm == 0:
        model = NaiveBayesClassifier.train(training_data)
        algo_name = 'Naive Bayes'
    elif algorithm == 1:
        model = SklearnClassifier(LinearSVC(), sparse=False).train(training_data)
        algo_name = 'SVM'
    else:
        raise ValueError('algorithm must be 0 (Naive Bayes) or 1 (SVM), got %r' % (algorithm,))

    data = pd.read_csv(tweetFile)
    datafile = '../gen/' + tweetFile.split('.')[0] + '.json'

    entries = [
        {'tweet': txt, 'timestamp': ts,
         'classifier': {'name': algo_name, 'score': model.classify(format_sentence(txt))}}
        for txt, ts in zip(data['text'], data['timestamp'])
    ]
    # Bug fix: the original json.dump'ed each entry separately into one file,
    # producing concatenated objects (not parseable JSON). Write one array.
    with open(datafile, 'w') as json_file:
        json.dump(entries, json_file, indent=4)

    # NOTE(review): this is accuracy on the TRAINING data, not held-out data —
    # it overstates real performance; kept as-is to preserve behavior.
    accuracy = nltk.classify.accuracy(model, training_data)
    print(accuracy)

    # Evaluate on the first 1/5 of the negative and positive corpora.
    n_neg = int(round(len(neg_data) * 1 / 5))
    n_pos = int(round(len(pos_data) * 1 / 5))
    y_true = [0] * n_neg + [1] * n_pos
    y_pred = ([model.classify(format_sentence(txt)) for txt in negative[:n_neg]] +
              [model.classify(format_sentence(txt)) for txt in positive[:n_pos]])
    print(precision_recall_fscore_support(y_true, y_pred, average='macro'))
def entrenar_recomendacion(feature_labels):
    """Train a recommendation classifier, reporting 10-fold CV accuracy first.

    Args:
        feature_labels: list of (features, label) pairs in NLTK format.

    Returns:
        An NLTK SklearnClassifier (linear SVC with probability estimates)
        retrained on the full data set after cross-validation.
    """
    # Cross-validate to see how the classifier configuration performs.
    cv = cross_validation.KFold(len(feature_labels), n_folds=10)

    sum_accuracy = 0
    k = 0
    classifier = None
    for traincv, testcv in cv:
        # Bug fix: the original sliced feature_labels[traincv[0]:traincv[-1]],
        # which drops the fold's last sample (exclusive end) and silently
        # assumes the fold indices are contiguous. Index each sample instead.
        train_fold = [feature_labels[i] for i in traincv]
        # Linear SVM; probability=True so the model can score recommendations.
        classifier = SklearnClassifier(SVC(kernel='linear', probability=True)).train(train_fold)

        y_true = [feature_labels[i][1] for i in testcv]
        y_pred = [classifier.classify(feature_labels[i][0]) for i in testcv]

        # Accuracy of this fold's held-out predictions.
        acc = metrics.accuracy_score(y_true, y_pred)
        sum_accuracy += acc
        k += 1
        print(str(k) + ')exactitud: ' + str(acc))
        print('Clases utilizadas: ' + str(y_true))
        print('Predicciones: ' + str(y_pred))
        print('')

    # Mean accuracy across the k folds.
    print('EXACTITUD: ' + str(sum_accuracy / k))
    # Retrain on ALL the data to produce the final model that is returned.
    classifier.train(feature_labels)
    return classifier
# Train, evaluate, and pickle three scikit-learn-backed NLTK classifiers,
# then classify two sample sentences with the Multinomial NB model.
# Relies on module globals: training_set, testing_set, find_features,
# multinomial_naive_bays_classifier, logistic_regression_classifier, etc.

logistic_regression_classifier.train(training_set)
print("Logistic regression classifier accuracy :",
      nltk.classify.accuracy(logistic_regression_classifier, testing_set))
# `with` guarantees each pickle file is closed even if dump() raises
# (the original left files open on error).
with open("pickleAlgos/logistic_regression_classifier.pickle", "wb") as pickle_file:
    pickle.dump(logistic_regression_classifier, pickle_file)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGD classifier accuracy :",
      nltk.classify.accuracy(SGDClassifier_classifier, testing_set))
with open("pickleAlgos/SGDClassifier_classifier.pickle", "wb") as pickle_file:
    pickle.dump(SGDClassifier_classifier, pickle_file)

linearSVC_classifier = SklearnClassifier(LinearSVC())
linearSVC_classifier.train(training_set)
# Typo fix in the printed label: "Liner" -> "Linear".
print("Linear SVC classifier accuracy :",
      nltk.classify.accuracy(linearSVC_classifier, testing_set))
with open("pickleAlgos/linearSVC_classifier.pickle", "wb") as pickle_file:
    pickle.dump(linearSVC_classifier, pickle_file)

# Sample sentences kept byte-for-byte (they are model INPUT, typos included).
print(multinomial_naive_bays_classifier.classify(
    find_features(word_tokenize("This is just another terifying movie. I don't like this. Stupidity Completely"))))
print(multinomial_naive_bays_classifier.classify(
    find_features(word_tokenize("I love this car. This is so beautiful and fast.It has so many gears either"))))