def testClient(labeledset, fracLearn=0.8, LearnRate=0.027, printing=True, SVM=False):
    import random
    from nltk.classify import SklearnClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import LinearSVC

    # Shuffle, then split into train/test sets according to fracLearn.
    random.shuffle(labeledset)
    length = len(labeledset)
    trainset = labeledset[:int(length * fracLearn)]
    testset = labeledset[int(length * fracLearn):]

    # Note: LearnRate is passed as C, the inverse regularization strength
    # (smaller C = stronger regularization), not a gradient-descent learning rate.
    if SVM:
        clf = SklearnClassifier(LinearSVC(C=LearnRate))  # good values found: LR C=0.0012, LinearSVC C=0.0007
    else:
        clf = SklearnClassifier(LogisticRegression(C=LearnRate))
    clf.train(trainset)

    # Accuracy on the held-out test set.
    correct = 0
    for film in testset:
        if clf.classify(film[0]) == film[1]:
            correct += 1
    testAcc = correct / float(len(testset))
    if printing:
        print('Accuracy on test set: ' + str(testAcc))

    # Accuracy on the training set, to gauge overfitting.
    correct = 0
    for film in trainset:
        if clf.classify(film[0]) == film[1]:
            correct += 1
    trainAcc = correct / float(len(trainset))
    if printing:
        print('Accuracy on train set: ' + str(trainAcc))
    if not printing:
        return testAcc
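
# A minimal usage sketch, assuming `labeledset` is a list of (feature_dict, label)
# pairs in NLTK's classifier format; the toy data below is illustrative only,
# not the author's original data.
if __name__ == '__main__':
    toy_set = [({'good': True, 'fun': True}, 'pos'),
               ({'bad': True, 'boring': True}, 'neg')] * 50
    testClient(toy_set, fracLearn=0.8, SVM=False)        # prints both accuracies
    acc = testClient(toy_set, printing=False, SVM=True)  # returns test accuracy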
def classify(tweetFile, algorithm):
    import json
    import nltk
    import pandas as pd
    from nltk.classify import NaiveBayesClassifier, SklearnClassifier
    from sklearn.svm import LinearSVC
    from sklearn.metrics import precision_recall_fscore_support

    # training_data, format_sentence, neg_data/pos_data and negative/positive
    # are assumed to be defined at module level.
    model = None
    if algorithm == 0:
        model = NaiveBayesClassifier.train(training_data)
    elif algorithm == 1:
        model = SklearnClassifier(LinearSVC(), sparse=False).train(training_data)

    data = pd.read_csv(tweetFile)
    text = data['text']
    timestamp = data['timestamp']

    datafile = '../gen/' + tweetFile.split('.')[0] + '.json'
    name = 'Naive Bayes' if algorithm == 0 else 'SVM'

    # Collect all entries and dump them once, so the output file is a single
    # valid JSON document rather than concatenated objects.
    entries = []
    for txt, ts in zip(text, timestamp):
        score = model.classify(format_sentence(txt))
        entries.append({'tweet': txt, 'timestamp': ts,
                        'classifier': {'name': name, 'score': score}})

    with open(datafile, 'w') as json_file:
        json.dump(entries, json_file, indent=4)

    # Note: this accuracy is measured on the training data itself, so it
    # overstates real performance.
    accuracy = nltk.classify.accuracy(model, training_data)
    print(accuracy)

    # Evaluate on the held-out fifth of each corpus.
    n_neg = int(round(len(neg_data) / 5))
    n_pos = int(round(len(pos_data) / 5))
    y_true = [0] * n_neg + [1] * n_pos
    y_pred = ([model.classify(format_sentence(txt)) for txt in negative[:n_neg]] +
              [model.classify(format_sentence(txt)) for txt in positive[:n_pos]])

    print(precision_recall_fscore_support(y_true, y_pred, average='macro'))
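
# The `format_sentence` helper is not shown in this snippet. A common
# definition in NLTK sentiment tutorials -- an assumption here, not the
# author's confirmed code -- maps each token to True, producing the
# feature dict that NaiveBayesClassifier / SklearnClassifier expect:
from nltk.tokenize import word_tokenize

def format_sentence(sentence):
    # {token: True} bag-of-words features for an NLTK classifier.
    return {word: True for word in word_tokenize(sentence.lower())}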
def entrenar_recomendacion(feature_labels):
    from nltk.classify import SklearnClassifier
    from sklearn import metrics
    from sklearn.model_selection import KFold
    from sklearn.svm import SVC

    # 10-fold cross-validation to see how our classifier is doing, i.e.
    # whether some parameter configuration gives us a more accurate result.
    cv = KFold(n_splits=10)
    sum_accuracy = 0
    k = 0
    for traincv, testcv in cv.split(feature_labels):
        # Build the training fold from its indices: slicing from the first
        # to the last train index would wrongly include the held-out fold.
        train_fold = [feature_labels[i] for i in traincv]
        #        classifier = NaiveBayesClassifier.train(train_fold)
        #        classifier = MaxentClassifier.train(train_fold)
        # Our chosen classification algorithm: a support vector machine.
        classifier = SklearnClassifier(
            SVC(kernel='linear', probability=True)
        ).train(train_fold)
        #        classifier = SklearnClassifier(knn()).train(train_fold)
        y_true = []
        y_pred = []
        for i in testcv:
            y_true.append(feature_labels[i][1])
            y_pred.append(classifier.classify(feature_labels[i][0]))
        # Accuracy of this fold's predictions against the held-out labels.
        acc = metrics.accuracy_score(y_true, y_pred)
        sum_accuracy += acc  # accumulate the total accuracy
        k += 1
        print(str(k) + ') accuracy: ' + str(acc))
        print('True classes: ' + str(y_true))
        print('Predictions: ' + str(y_pred))
        print('')
    print('ACCURACY: ' + str(sum_accuracy / k))
    # Finally, retrain on the full data set before returning the classifier.
    classifier.train(feature_labels)
    return classifier
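
# Illustrative call -- `feature_labels` must be a list of (feature_dict, label)
# pairs; this toy data is an assumption, not the author's corpus.
feature_labels = ([({'likes_scifi': True}, 'recommend')] * 30 +
                  [({'likes_scifi': False}, 'skip')] * 30)
clf = entrenar_recomendacion(feature_labels)
print(clf.classify({'likes_scifi': True}))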
logistic_regression_classifier.train(training_set)
print("Logistic regression classifier accuracy :",
      nltk.classify.accuracy(logistic_regression_classifier, testing_set))

# Persist each trained model so it can be reloaded without retraining.
with open("pickleAlgos/logistic_regression_classifier.pickle", "wb") as f:
    pickle.dump(logistic_regression_classifier, f)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGD classifier accuracy :",
      nltk.classify.accuracy(SGDClassifier_classifier, testing_set))

with open("pickleAlgos/SGDClassifier_classifier.pickle", "wb") as f:
    pickle.dump(SGDClassifier_classifier, f)

linearSVC_classifier = SklearnClassifier(LinearSVC())
linearSVC_classifier.train(training_set)
print("Linear SVC classifier accuracy :",
      nltk.classify.accuracy(linearSVC_classifier, testing_set))

with open("pickleAlgos/linearSVC_classifier.pickle", "wb") as f:
    pickle.dump(linearSVC_classifier, f)
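
# To reuse a pickled classifier later, load it back with pickle.load -- a
# brief sketch using the same path saved above.
import pickle

with open("pickleAlgos/linearSVC_classifier.pickle", "rb") as f:
    linearSVC_classifier = pickle.load(f)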

print(multinomial_naive_bays_classifier.classify(
    find_features(word_tokenize("This is just another terrifying movie. I don't like this. Complete stupidity."))))
print(multinomial_naive_bays_classifier.classify(
    find_features(word_tokenize("I love this car. It is so beautiful and fast. It has so many gears too."))))