Ejemplo n.º 1
0
class SKClassifier:
    """NLTK wrapper around a scikit-learn classifier selected by name."""

    classifier = None

    def __init__(self, cls='SVC'):
        """Build the wrapped classifier.

        Args:
            cls: One of 'SVC', 'LogisticRegression', 'BernoulliNB'.
                 Unknown names fall back to SVC.
        """
        # Map names to factories so only the requested estimator is built.
        # Using .get() makes the SVC fallback actually reachable: the
        # original indexed the dict directly, which raised KeyError before
        # its `if not self.classifier` guard could ever run.
        factories = {
            'SVC': SVC,
            'LogisticRegression': LogisticRegression,
            'BernoulliNB': BernoulliNB,
        }
        self.classifier = SklearnClassifier(factories.get(cls, SVC)())

    def train(self, trainset):
        """Train on a list of (featureset, label) pairs."""
        self.classifier.train(trainset)

    def test(self, tagged, featuresets):
        """Predict `featuresets` and return accuracy against `tagged`."""
        predict = self.classifier.classify_many(featuresets)
        print(predict)  # was a Python 2 print statement
        return accuracy_score(tagged, predict)

    def classify(self, featureset):
        """Classify a single featureset."""
        return self.classifier.classify(featureset)

    def classify_many(self, featuresets):
        """Classify a batch of featuresets."""
        return self.classifier.classify_many(featuresets)
Ejemplo n.º 2
0
class chatBot(object):
    """NPS-chat-corpus bot: classifies the user's dialogue act with a
    LinearSVC and replies with a random post from a mapped category."""

    def __init__(self):
        self.posts = nltk.corpus.nps_chat.xml_posts()
        self.categories = ['Emotion', 'ynQuestion', 'yAnswer', 'Continuer',
                'whQuestion', 'System', 'Accept', 'Clarify', 'Emphasis',
                'nAnswer', 'Greet', 'Statement', 'Reject', 'Bye', 'Other']
        # mapper[predicted class index] -> index of the category whose
        # stored posts are used as responses.
        self.mapper = [0, 2, 6, 3, 11, 5, 8, 1, 8, 3, 10, 11, 13, 13, 13]
        self.responses = {}
        self.featuresets = []
        self.train = []
        self.test = []
        self.testSet = []
        self.testSetClass = []
        self.classif = SklearnClassifier(LinearSVC())
        for i in range(0, 15):
            self.responses[i] = []
        for post in self.posts:
            self.featuresets.append((self.tokenize(post.text),self.categories.index(post.get('class'))))
            self.temp = self.responses[self.categories.index(post.get('class'))]
            self.temp.append(post.text)

    def tokenize(self, sentence):
        """Extract a bag-of-words feature dict from a message."""
        features = {}
        tokens = nltk.word_tokenize(sentence)
        for t in tokens:
            features['contains(%s)' % t.lower()] = True
        return features

    def talk(self):
        """Interactive loop: read a line, classify it, print a response."""
        while 1:
            inp = input("YOU: ")  # raw_input() is Python 2 only
            features = self.tokenize(inp)
            # classify_many expects a *list* of featuresets; passing the
            # dict itself would iterate over its keys.
            pp = int(self.classif.classify_many([features])[0])
            m = self.mapper[pp]
            r = self.responses[m]
            # randint is inclusive at both ends, so randint(0, len(r))
            # could index one past the end of r.
            val = randint(0, len(r) - 1)
            print("BOT: "+r[val])

    def trainSet(self):
        """Shuffle featuresets, train on 90%, classify the 10% test split."""
        shuffle(self.featuresets)
        size = int(len(self.featuresets) * .1) # 10% is used for the test set
        self.train = self.featuresets[size:]
        self.test = self.featuresets[:size]
        self.classif.train(self.train)

        self.testSet = []
        self.testSetClass = []
        for i in self.test:
            self.testSet.append(i[0])
            self.testSetClass.append(i[1])
        self.batch = self.classif.classify_many(self.testSet)

    def statistics(self):
        """Print a per-category classification report for the test split."""
        print (classification_report(self.testSetClass, self.batch, labels=list(set(self.testSetClass)),target_names=self.categories))
Ejemplo n.º 3
0
def ClassAccuracy(classifier, train_set, test_set):
    """Wrap a scikit-learn estimator for NLTK, train it, and score it.

    Args:
        classifier: Any scikit-learn estimator, e.g. BernoulliNB().
        train_set: Labeled, feature-extracted training data, e.g.
            [({'bad': -5, 'good': 5, ...}, '+1')].
        test_set: Labeled test data in the same form as train_set.

    Returns:
        Accuracy of the trained classifier on the test set.
    """
    wrapped = SklearnClassifier(classifier)
    wrapped.train(train_set)

    gold = [label for _, label in test_set]
    predicted = wrapped.classify_many([feats for feats, _ in test_set])
    return accuracy_score(gold, predicted)
def clf_score(classifier):
    """Train `classifier` (wrapped for NLTK) on the module-level train_set
    and return its accuracy on the module-level test/tag_test data."""
    wrapped = SklearnClassifier(classifier)
    wrapped.train(train_set)
    predictions = wrapped.classify_many(test)
    return accuracy_score(tag_test, predictions)
Ejemplo n.º 5
0
def score(classifier):
    """Wrap `classifier` for NLTK, train it on the module-level `train`
    data, and report precision/recall/F-measure for the pos and neg splits.

    Writes the metrics to the module-level `savePath` file and returns
    (pos_precision, neg_precision, pos_recall, neg_recall).
    """
    model = SklearnClassifier(classifier)
    model.train(train)
    pos_predictions = model.classify_many(data_pos)
    neg_predictions = model.classify_many(data_neg)
    # Count per-class hits against the gold tags.
    pos_hits = sum(1 for i in range(0, pos_len)
                   if pos_predictions[i] == tag_pos[i])
    neg_hits = sum(1 for i in range(0, neg_len)
                   if neg_predictions[i] == tag_neg[i])
    pos_precision = pos_hits / (pos_hits + neg_len - neg_hits)
    pos_recall = pos_hits / pos_len
    neg_precision = neg_hits / (neg_hits + pos_len - pos_hits)
    neg_recall = neg_hits / neg_len
    pos_F = 2 * pos_precision * pos_recall / (pos_precision + pos_recall)
    neg_F = 2 * neg_precision * neg_recall / (neg_precision + neg_recall)
    print(pos_F, neg_F)
    savePath.write('正面情感准确率为: %.2f\n' % pos_precision)
    savePath.write('负面情感准确率为: %.2f\n' % neg_precision)
    savePath.write('正面情感召回率为: %.2f\n' % pos_recall)
    savePath.write('负面情感召回率为: %.2f\n' % neg_recall)
    savePath.write('正面F-Measure值为: %.2f\n' % pos_F)
    savePath.write('负面F-Measure值为: %.2f\n' % neg_F)
    return pos_precision, neg_precision, pos_recall, neg_recall
Ejemplo n.º 6
0
def score(classifier):
    """Train `classifier` on the module-level `train` data and return the
    precision of the 'neg' class on `data` against the gold labels `tag`
    (correct 'neg' predictions over all 'neg' predictions)."""
    model = SklearnClassifier(classifier)
    model.train(train)
    predictions = model.classify_many(data)

    correct_neg = 0
    predicted_neg = 0
    for idx in range(len(predictions)):
        if predictions[idx] == 'neg':
            predicted_neg += 1
            if predictions[idx] == tag[idx]:
                correct_neg += 1

    return float(correct_neg) / float(predicted_neg)
Ejemplo n.º 7
0
def score(trainset, testset, classifier):
    """Train `classifier` on `trainset` and return its accuracy on
    `testset`, a sequence of (featureset, tag) pairs."""
    model = SklearnClassifier(classifier)
    # Disable sorting of feature names in the wrapped vectorizer.
    model._vectorizer.sort = False
    model.train(trainset)
    features, gold = zip(*testset)
    predictions = model.classify_many(features)
    return accuracy_score(gold, predictions)
    def buildAndTrainClassifier(self, X, Y):
        """Run 10-fold cross-validation training of a MultinomialNB model.

        Args:
            X: Sequence of feature dicts.
            Y: Sequence of labels aligned with X.

        Returns:
            The model trained on the *last* fold; the metrics printed below
            are for that fold only (matching the original behaviour).
        """
        print('Building and Training Classifier')

        n_folds = 10
        kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
        # Hoist the ndarray conversions out of the fold loop: the original
        # rebuilt np.array(X) / np.array(Y) twice each per fold.
        X_arr = np.array(X)
        Y_arr = np.array(Y)
        fold = 1
        for train, test in kf.split(X_arr):
            print('Training on cross validation set ', fold)

            train_X = X_arr[train]
            test_X = X_arr[test]
            train_y = Y_arr[train]
            test_y = Y_arr[test]
            print('Training size: ', len(train_y))
            print('Test size: ', len(test_y))

            labeled_features = list(zip(train_X, train_y))
            model = SklearnClassifier(MultinomialNB()).train(labeled_features)
            predicted = model.classify_many(test_X)
            fold += 1

        # NOTE(review): these scores reflect only the final fold; averaging
        # across folds may have been intended — confirm with the author.
        print('Confusion matrix =', confusion_matrix(test_y, predicted))
        print('Precision score =',
              precision_score(predicted, test_y, average=None))
        print('Recall score =', recall_score(predicted, test_y, average=None))
        print('Accuracy score =', accuracy_score(predicted, test_y))
        print('Training score =', f1_score(predicted, test_y, average=None))

        return model
Ejemplo n.º 9
0
def SVM(training_set, test_set):
    """Train a LinearSVC (wrapped for NLTK) and evaluate it on `test_set`.

    Returns:
        (runTrained, accuracy, predictedLabels, trueLabels) where runTrained
        is a closure that classifies further data with the trained model.
    """
    classifier = SklearnClassifier(LinearSVC())
    print("Training a new SVM classifier")
    classifier.train(training_set)
    print("Accuracy of SVM in training:",nltk.classify.accuracy(classifier, test_set))
#     classifier.show_most_informative_features(5)
    #print("Running new Decision Tree classifier")
    # NOTE: accuracy is recomputed here after already being printed above.
    accuracy = nltk.classify.accuracy(classifier, test_set)
    trueLabels = [l for d, l in test_set]
    predictedLabels = classifier.classify_many([d for d,t in test_set])
    #print("Accuracy:",accuracy)
#     classifier.show_most_informative_features(MIF)
    def runTrained(test_set, hasTags=False):
        """Classify `test_set` with the already-trained model.

        When hasTags is True, the set is (data, tag) pairs: accuracy is
        printed and returned alongside the (data, prediction) pairs;
        otherwise only the pairs are returned.
        """
        #print("Running pre-trained Decision Tree classifier")
        if hasTags:
            tagglessTest_set = [data for data, tag in test_set]
            acc = nltk.classify.accuracy(classifier, test_set)
            print("Accuracy:", acc)
            predictions = classifier.classify_many(tagglessTest_set)
            return ([e for e in zip(tagglessTest_set, predictions)], acc)
        else:
            tagglessTest_set = test_set         
        predictions = classifier.classify_many(tagglessTest_set)
        #print("Predicted Labels:",predictions)
        return [e for e in zip(tagglessTest_set, predictions)]
    return (runTrained, accuracy, predictedLabels, trueLabels) 
Ejemplo n.º 10
0
def learn_model(data,target):
    """Train a MultinomialNB on jieba-segmented text restricted to the
    `best_of_words` vocabulary and evaluate it on a held-out split.

    Returns:
        (classifier, bestwords): the trained NLTK-wrapped classifier and
        the vocabulary used for feature filtering.
    """
    bestwords = best_of_words(data, target)
    # preparing data for split validation. 90% training, 10% test
    # (the original comment said 80/20, but test_size=0.1 is a 90/10 split)
    data_train,data_test,target_train,target_test = cross_validation.train_test_split(data,target,test_size=0.1,random_state=43)

    def _features(text):
        # One boolean feature per segmented word that survives the
        # bestwords filter.
        return {word: True for word in jieba.cut(text, cut_all=False)
                if word in bestwords}

    train_feature = [[_features(d), l] for d, l in zip(data_train, target_train)]
    test_feature = [_features(d) for d in data_test]

    classifier = SklearnClassifier(MultinomialNB())
    classifier.train(train_feature)

    predicted = classifier.classify_many(test_feature)

    evaluate_model(target_test,predicted)

    return classifier, bestwords
Ejemplo n.º 11
0
def build_classifier_score(train_set, test_set, classifier):
    """Train the wrapped `classifier` on `train_set` and return its accuracy
    on `test_set`, a sequence of (featureset, tag) pairs."""
    features, gold = zip(*test_set)
    model = SklearnClassifier(classifier)
    model.train(train_set)
    return accuracy_score(gold, model.classify_many(features))
Ejemplo n.º 12
0
def SVM(training_set, test_set):
    """Train a LinearSVC wrapped for NLTK and evaluate it on `test_set`.

    Returns:
        (runTrained, accuracy, predictedLabels, trueLabels); runTrained is a
        closure that classifies additional data with the trained model.
    """
    classifier = SklearnClassifier(LinearSVC())
    print("Training a new SVM classifier")
    classifier.train(training_set)
    print("Accuracy of SVM in training:",
          nltk.classify.accuracy(classifier, test_set))
    accuracy = nltk.classify.accuracy(classifier, test_set)
    trueLabels = [label for _, label in test_set]
    predictedLabels = classifier.classify_many([feats for feats, _ in test_set])

    def runTrained(test_set, hasTags=False):
        """Classify `test_set`; when hasTags, also print/return accuracy."""
        if not hasTags:
            untagged = test_set
            preds = classifier.classify_many(untagged)
            return [pair for pair in zip(untagged, preds)]
        untagged = [feats for feats, _ in test_set]
        acc = nltk.classify.accuracy(classifier, test_set)
        print("Accuracy:", acc)
        preds = classifier.classify_many(untagged)
        return ([pair for pair in zip(untagged, preds)], acc)

    return (runTrained, accuracy, predictedLabels, trueLabels)
Ejemplo n.º 13
0
def score(trainset, testset, classifier):
    """Train the wrapped `classifier` on `trainset` and return accuracy on
    `testset`, a sequence of (featureset, tag) pairs."""
    classifier = SklearnClassifier(classifier)
    # Keep the wrapped vectorizer's feature ordering unsorted.
    classifier._vectorizer.sort = False
    classifier.train(trainset)
    (test, tag_test) = zip(*testset)
    pred = classifier.classify_many(test)
    return accuracy_score(tag_test, pred)
def get_recall(classifier):
    """Train on the module-level `trainset` and derive a pseudo-recall.

    NOTE(review): this does not compute true recall — it returns accuracy
    minus a fixed 0.05 offset; confirm whether a real recall computation
    was intended.
    """
    classifier = SklearnClassifier(classifier)
    classifier.train(trainset)
    pred = classifier.classify_many(test)
    acc_score = accuracy_score(tag_test, pred)
    recall_s = acc_score - 0.05
    return recall_s
Ejemplo n.º 15
0
def linear_model(train_df, cv_df):
    """Train an SGD-based logistic-regression classifier, report CV
    accuracy, pickle the model, plot a learning curve, and write a
    classification report to performance.txt."""
    train_tokenized_data = get_data_for_nltk(train_df)
    cv_tokenized_data = get_data_for_nltk(cv_df)

    # Set up SGD Classifier via scikitlearn wrapper
    # log loss makes sure its logistic regression
    # NOTE(review): loss='log' was renamed to 'log_loss' in scikit-learn 1.1
    # and removed in 1.3 — confirm the pinned sklearn version.
    classifier = SklearnClassifier(SGDClassifier(loss='log', max_iter=5))
    print("Training Linear Classifier...")
    classifier.train(train_tokenized_data)
    print("Training Done")

    print("Saving Linear Classifier")
    save_pickle(classifier, "linear_model.pickle")
    cv_accuracy = nltk.classify.accuracy(classifier, cv_tokenized_data)
    print("Cross Validation Accuracy of Linear Classifier {}%".format(
        round(cv_accuracy * 100, 2)))
    predictions = classifier.classify_many([x for (x, y) in cv_tokenized_data])

    print("Plotting Learning Curve...")
    build_learning_curve(classifier, train_tokenized_data, cv_tokenized_data)

    print("Saving precision, recall and F-scores in performance.txt")
    with open('performance.txt', 'w') as text_file:
        text_file.write('Linear Model Expected Performance Below:')
        text_file.write('\n')
        text_file.write(
            classification_report(cv_df['label'].tolist(), predictions))
Ejemplo n.º 16
0
def performCrossValidation(featureset, labels, foldsCount, sklearnclassifier, uniqLabels):
    """Run stratified k-fold cross-validation of `sklearnclassifier`.

    Prints average accuracy plus per-label average precision, recall, and
    F-score over `foldsCount` folds.
    """
    accuracySum = 0.0
    precisionSums = defaultdict(float)
    recallSums = defaultdict(float)
    fscoreSums = defaultdict(float)
    crossValidationIterations = cross_validation.StratifiedKFold(labels, n_folds=foldsCount)
    for train, test in crossValidationIterations:
        trainset = [featureset[i] for i in train]
        testset = [featureset[i] for i in test]
        print("before train")
        classifier = SklearnClassifier(sklearnclassifier).train(trainset)

        true = [label for features, label in testset]
        predicted = classifier.classify_many([features for features, label in testset])

        precisions, recalls, fscores, support = precision_recall_fscore_support(true, predicted, pos_label=None, labels=uniqLabels)
        accuracy = accuracy_score(true, predicted)
        accuracySum += accuracy

        for label, value in zip(uniqLabels, precisions):
            precisionSums[label] += value
        for label, value in zip(uniqLabels, recalls):
            recallSums[label] += value
        for label, value in zip(uniqLabels, fscores):
            fscoreSums[label] += value

    print("Average accurancy: {0:.3f}".format(accuracySum/foldsCount))
    # Loop variable renamed from `sum`, which shadowed the builtin.
    measures = {label: (precSum/foldsCount, recallSums.get(label)/foldsCount, fscoreSums.get(label)/foldsCount) for label, precSum in precisionSums.items()}
    for label, (prec, recall, fscore) in measures.items():
        print("Average precision for {0}: {1:.3f}".format(label, prec))
        print("Average recall for {0}: {1:.3f}".format(label, recall))
        print("Average f score for {0}: {1:.3f}".format(label, fscore))
def score(classifier):
    """Train on the module-level `trainset` and return accuracy on the
    module-level `test` / `tag_test` data."""
    model = SklearnClassifier(classifier)
    model.train(trainset)
    predictions = model.classify_many(test)
    return accuracy_score(tag_test, predictions)
Ejemplo n.º 18
0
def score(classifier):
    """Wrap `classifier` for NLTK, train it on the module-level `train_set`,
    and return accuracy on the module-level `test_set` of
    (featureset, tag) pairs."""
    model = SklearnClassifier(classifier)
    model.train(train_set)
    predictions = model.classify_many([fea for fea, _ in test_set])
    gold = [tag for _, tag in test_set]
    return accuracy_score(gold, predictions)
Ejemplo n.º 19
0
def score(classifier):
    """Train on the module-level `train` data and return the accuracy of
    predictions for the module-level `dev` set against `tag_dev`."""
    model = SklearnClassifier(classifier)
    model.train(train)
    predictions = model.classify_many(dev)
    return accuracy_score(tag_dev, predictions)
def score(classifier):
    """Train the wrapped `classifier` on the module-level `trainset` and
    return its accuracy on the module-level `test` / `tag_test` data."""
    classifier = SklearnClassifier(classifier)
    classifier.train(trainset)

    # pred = classifier.batch_classify(test)
    pred = classifier.classify_many(test)
    return accuracy_score(tag_test, pred)
Ejemplo n.º 21
0
def score(classifier, train, testSet, tag_test):
    """Wrap `classifier` for NLTK, train it on `train`, and return its
    accuracy on `testSet` against the gold labels `tag_test`."""
    model = SklearnClassifier(classifier)
    model.train(train)
    predictions = model.classify_many(testSet)
    return accuracy_score(tag_test, predictions)
Ejemplo n.º 22
0
def main3():
    """Train a hinge-loss LinearSVC on the module-level trainData, then
    print accuracy, a classification report, and a confusion matrix for
    testData."""
    from nltk.classify.scikitlearn import SklearnClassifier
    from sklearn.svm import LinearSVC
    from sklearn.metrics import confusion_matrix
    from matplotlib import pyplot

    svm = SklearnClassifier(LinearSVC(loss="hinge"))
    svm.train(trainData)
    print("SVM: ", nltk.classify.accuracy(svm, testData))
    results = svm.classify_many(item[0] for item in testData)

    print(results)
    from sklearn.metrics import classification_report

    # getting a full report
    # NOTE(review): target_names=t_test_skl passes the raw label sequence as
    # display names — a deduplicated ordered name list is probably intended.
    print(classification_report(t_test_skl, results, labels=list(set(t_test_skl)), target_names=t_test_skl))

    # Compute confusion matrix
    import numpy as np
    cmm = confusion_matrix([x[1] for x in testData], results)

    print(cmm)
    # np.float was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin float is the documented replacement.
    cmm = np.array(cmm, dtype=float)
    print(cmm.shape)

    # Show confusion matrix in a separate window
    print(pyplot.imshow(cmm, interpolation='nearest'))
Ejemplo n.º 23
0
def score(classifier):
    """Train `classifier` on the module-level `train` data and return the
    precision of the 'neg' class on `data` against the gold labels `tag`
    (correct 'neg' predictions over all 'neg' predictions)."""
    model = SklearnClassifier(classifier)
    model.train(train)
    predictions = model.classify_many(data)

    correct_neg = 0
    total_neg = 0
    for idx in range(len(predictions)):
        if predictions[idx] == 'neg':
            total_neg += 1
            if predictions[idx] == tag[idx]:
                correct_neg += 1

    return float(correct_neg) / float(total_neg)
Ejemplo n.º 24
0
def classify_train_test():
    """Interactive QA loop: train a linear-SVM question classifier, then
    repeatedly classify typed questions and dispatch them to
    Tell_Me_Alfred until the user enters 'q!'."""
    # This function reads train data sets and calls TokenizePosChunk function to create
    # featureset and then uses the SVM classifier to train the model
    # It then reads test data set and calls TokenizePosChunk function to create
    # features for test data set and uses SVM classifier to predict class labels

    #    trainDataFileList = ["train_1000.label", "train_2000.label", \
    #                      "train_3000.label", "train_4000.label", \
    #                      "train_5500.label"]
    trainDataFileList = ["train_1000.label"]
    #

    # These globals are shared with TokenizePosChunk / TokenizePosChunkTest,
    # which populate them as a side effect.
    global features, cls_set, featuresets, searchChunk
    features = {}
    cls_set = []
    featuresets = []
    searchChunk = ""

    # process all train data sets and build featureset
    for trainDataFile in trainDataFileList:
        for line in open(trainDataFile, encoding="ISO-8859-1"):
            TokenizePosChunk(line)
    train = featuresets  #store featureset as train

    # SVM with a Linear Kernel and default parameters
    classif = SklearnClassifier(LinearSVC())
    classif.train(train)

    quest = " "
    quest = input("enter question (q! to quit)==> ")

    while (quest != "q!"):
        # initialize global variables for test dataset processing
        features = {}
        #cls_set = [] # no need to initialize class set for test
        featuresets = []
        testques = []
        searchChunk = ""

        testques.append(quest)
        TokenizePosChunkTest(quest)
        test = features  #store features as test now
        #print ("search chunk aft call ", searchChunk)

        # NOTE(review): `test` is a dict here, so classify_many iterates its
        # keys; p[0] is then used as an index into cls_set below — confirm
        # this shape is what TokenizePosChunkTest produces.
        p = classif.classify_many(test)

        chunkVar = searchChunk
        classVar = q_dict[cls_set[p[0]]]

        print("Searching for  => ", chunkVar, "\nQuestion Class => ", classVar)
        Tell_Me_Alfred(chunkVar, classVar)

        #print (cls_set)
        #print("\n")
        #print(p)
        print("\n")

        quest = input("enter question (q! to quit)==> ")
Ejemplo n.º 25
0
def coem(L1, L2, U1, U2):
    """Co-EM-style co-training over two views.

    L1/L2 are the labeled sets for views A and B; U1/U2 are the unlabeled
    sets. The two classifiers alternately label U for each other for 25
    iterations.

    Returns:
        Classifier 1's final labels for U1.
    """
    pipeline = Pipeline([('tfidf', TfidfTransformer()),
                         ('chi2', SelectKBest(chi2, k=100)),
                         ('nb', MultinomialNB())])
    classifier1 = SklearnClassifier(pipeline)
    classifier1.train(L1)

    # Predict on U using 1st classifier
    U1_labels = classifier1.classify_many(U1)

    # Classifier B now learns from L plus A's labels on U.
    iterations = 0

    while iterations < 25:
        classifier2 = SklearnClassifier(pipeline)
        # Copy L2 before extending it: the original aliased the caller's
        # list, so the pseudo-labeled pairs accumulated across iterations
        # (and leaked back to the caller).
        L2_train = list(L2)
        # Add everything in U with labels from A
        for i, sub_bow in enumerate(U2):
            L2_train.append((sub_bow, U1_labels[i]))
        classifier2.train(L2_train)
        # Now, label U with classifier 2.
        U2_labels = classifier2.classify_many(U2)

        # Classifier 1 starts again: all of L1 (copied, same aliasing fix)
        # plus B's labels on U.
        L1_train = list(L1)
        for i, mail_bow in enumerate(U1):
            L1_train.append((mail_bow, U2_labels[i]))

        # Train it
        classifier1 = SklearnClassifier(pipeline)
        classifier1.train(L1_train)
        U1_labels = classifier1.classify_many(U1)
        # Python 2 print statement modernized to the print() function.
        print(labels_find_intersection(U1_labels, U2_labels))
        iterations += 1

    return U1_labels
def get_fmeasure(classifier):
    """Train on the module-level `trainset` and derive a pseudo-F-measure.

    NOTE(review): "recall" here is just accuracy minus a fixed 0.05, so the
    harmonic mean below is not a true F-measure — confirm intent.
    """
    classifier = SklearnClassifier(classifier)
    classifier.train(trainset)

    pred = classifier.classify_many(test)
    acc_score = accuracy_score(tag_test, pred)
    recall_s = acc_score - 0.05
    f_m = (2 * acc_score * recall_s) / (acc_score + recall_s)
    return f_m
Ejemplo n.º 27
0
def score(classifier):
    """Train on the module-level `train_set` and return plain accuracy of
    the predictions for `data` against the gold labels `tag`."""
    model = SklearnClassifier(classifier)
    model.train(train_set)
    predictions = model.classify_many(data)
    total = len(predictions)
    hits = sum(1 for i in range(total) if predictions[i] == tag[i])
    return float(hits) / float(total)
Ejemplo n.º 28
0
def score(classifier):
    """Train on the module-level `train_set` and return the fraction of
    predictions for `words` matching the gold labels `tag`."""
    model = SklearnClassifier(classifier)
    model.train(train_set)
    predictions = model.classify_many(words)
    total = len(predictions)
    hits = sum(1 for i in range(total) if predictions[i] == tag[i])
    return hits / total
def buildClassifier_score(trainSet,devtestSet,classifier):
    """Train the wrapped `classifier` on `trainSet` and return its accuracy
    on `devtestSet`, a sequence of (featureset, tag) pairs."""
    #print devtestSet
    from nltk import compat
    dev, tag_dev = zip(*devtestSet) # split the dev set into data and gold tags
    classifier = SklearnClassifier(classifier) # use a scikit-learn estimator through NLTK
    #x,y in  list(compat.izip(*trainSet))
    classifier.train(trainSet) # train the classifier
    #help('SklearnClassifier.batch_classify')
    pred = classifier.classify_many(dev) # predict tags for the dev data
    return accuracy_score(tag_dev, pred) # compare predictions with gold tags
def buildClassifier_score(trainSet, devtestSet, classifier):
    """Train the wrapped `classifier` on `trainSet` and return its accuracy
    on `devtestSet`, a sequence of (featureset, tag) pairs."""
    from nltk import compat
    features, gold = zip(*devtestSet)
    model = SklearnClassifier(classifier)
    model.train(trainSet)
    predictions = model.classify_many(features)
    return accuracy_score(gold, predictions)
Ejemplo n.º 31
0
def logisticClassify(train_data, test_data, language):
    """Train a logistic-regression classifier on feature vectors built from
    `train_data` and return (accuracy, {tweet: predicted_class}) for
    `test_data`."""
    training_vector = buildFeatureVector(train_data, language)
    test_vector = buildFeatureVector(test_data, language)
    model = SklearnClassifier(LogisticRegression())
    model.train(training_vector)
    accuracy = nltk.classify.accuracy(model, test_vector)
    # Strip the labels: keep only the featuresets for batch classification.
    unlabeled = list(zip(*test_vector))[0]
    classes = model.classify_many(unlabeled)
    test_tweets = test_data.index.values
    return accuracy, dict(zip(test_tweets, classes))
Ejemplo n.º 32
0
def score(classifier, train, test):
    """Train the wrapped `classifier` on `train`, classify `test`, print
    the accuracy against the module-level gold labels `tag`, and return
    (trained_classifier, accuracy)."""
    model = SklearnClassifier(classifier)
    model.train(train)
    predictions = model.classify_many(test)
    total = len(predictions)
    hits = sum(1 for i in range(total) if predictions[i] == tag[i])
    result = hits / total
    print('准确度为: %f' % result)
    return model, result
Ejemplo n.º 33
0
def score(classifier):
    """Train on the module-level `train` data and return the fraction of
    predictions for `data` that match the gold labels `tag`."""
    model = SklearnClassifier(classifier)
    model.train(train)

    predictions = model.classify_many(data)
    hits = 0
    for idx in range(len(predictions)):
        if predictions[idx] == tag[idx]:
            hits += 1

    return hits / len(predictions)
def linear_model(train_df, cv_df):
    """Train an SGD logistic-regression classifier on `train_df`, pickle
    it, and print cross-validation accuracy plus a classification report
    for `cv_df`."""
    train_tokenized_data = get_data_for_nltk(train_df)
    cv_tokenized_data = get_data_for_nltk(cv_df)

    print(train_tokenized_data[0], train_tokenized_data[100], train_tokenized_data[10000])
    # NOTE(review): loss='log' was renamed 'log_loss' in scikit-learn 1.1
    # and removed in 1.3 — confirm the pinned sklearn version.
    classifier = SklearnClassifier(SGDClassifier(loss='log', max_iter=5)) # Set up SGD Classifier via scikitlearn wrapper
    print("Training Classifier...")
    classifier.train(train_tokenized_data)
    print("Training Done")

    save_pickle(classifier, "linear_model.pickle")
    cv_accuracy = nltk.classify.accuracy(classifier, cv_tokenized_data)
    print("Cross Validation Accuracy", cv_accuracy)
    print(classification_report(cv_df['label'].tolist(), classifier.classify_many([x for (x,y) in cv_tokenized_data])))
Ejemplo n.º 35
0
def performTestValidation(trainset, testset, sklearnclassifier, uniqLabels):
    """Train `sklearnclassifier` on `trainset` and print test accuracy plus
    per-label precision/recall/F-score for `testset`."""
    model = SklearnClassifier(sklearnclassifier).train(trainset)
    gold = [label for _, label in testset]
    predicted = model.classify_many([features for features, _ in testset])

    precisions, recalls, fscores, _ = precision_recall_fscore_support(
        gold, predicted, pos_label=None, labels=uniqLabels)
    accuracy = accuracy_score(gold, predicted)

    print("Test accuracy: {0:.3f}".format(accuracy))
    for label, prec, recall, fscore in zip(uniqLabels, precisions, recalls, fscores):
        print("Precision for {0}: {1:.3f}".format(label, prec))
        print("Recall for {0}: {1:.3f}".format(label, recall))
        print("F score for {0}: {1:.3f}".format(label, fscore))
Ejemplo n.º 36
0
def score(classifier, num):
    """Wrap `classifier`, build a train/test split via train_and_test(num),
    train, and return plain accuracy on the test data."""
    model = SklearnClassifier(classifier)
    train, data, tag = train_and_test(num)
    model.train(train)
    predictions = model.classify_many(data)

    total = len(predictions)
    hits = sum(1 for i in range(total) if predictions[i] == tag[i])
    return hits / total
Ejemplo n.º 37
0
    def score(cls, mclassifier, x_train, x_test):
        """Train `mclassifier` on `x_train` and evaluate it on `x_test`.

        x_test is a sequence of (featureset, tag) pairs whose tags are
        'pos'/'neg'.

        Returns:
            (accuracy, pos_precision, pos_recall, pos_f1,
             neg_precision, neg_recall, neg_f1)
        """
        data, tag = zip(*x_test)  # split the test set into data and gold tags
        classifier = SklearnClassifier(mclassifier)
        classifier.train(x_train)
        print('===================================》》 Train done!')
        pos_index = []
        neg_index = []
        for i in range(0, len(tag)):
            if tag[i] == 'pos':  # pos
                pos_index.append(i)  # remember pos positions for precision/recall
            else:
                neg_index.append(i)

        pred = classifier.classify_many(data)  # predicted tags
        print(type(pred), len(pred), len(tag))
        n = 0
        s = len(pred)
        for i in range(0, s):
            if pred[i] == tag[i]:
                n = n + 1
        accu = n / s  # classifier accuracy
        print(accu)
        # print(pos_index, tag)
        tp = 0  # positives predicted as positive
        fn = 0  # positives predicted as negative
        fp = 0  # negatives predicted as positive
        tn = 0  # negatives predicted as negative

        for i in pos_index:
            if pred[i] == tag[i]:
                tp = tp + 1
            else:
                fn = fn + 1
        for i in neg_index:
            if pred[i] == tag[i]:
                tn = tn + 1
            else:
                fp = fp + 1
        print(tp, '--', fn, '--', fp, '--', tn)
        pos_precision = tp / (tp + fp)  # precision of the pos class
        pos_recall = tp / (tp + fn)  # recall of the pos class
        pos_f1 = (2 * pos_precision * pos_recall) / (pos_precision + pos_recall
                                                     )  # F1 of the pos class

        neg_precision = tn / (tn + fn)  # precision of the neg class
        neg_recall = tn / (tn + fp)  # recall of the neg class
        neg_f1 = (2 * neg_precision * neg_recall) / (neg_precision + neg_recall
                                                     )  # F1 of the neg class

        return accu, pos_precision, pos_recall, pos_f1, neg_precision, neg_recall, neg_f1
Ejemplo n.º 38
0
def train_model(classifier, name, printout = False):
    """Train the wrapped `classifier` on the module-level trainData and
    return its accuracy on testSam/testTag.

    When `printout` is true, also print a report and pickle the model to
    data_path + name + ".pkl".
    """
    classifier = SklearnClassifier(classifier)
    classifier.train(trainData)
    predict = classifier.classify_many(testSam)
    accuracy = accuracy_score(testTag, predict)
    if printout:
        # Python 2 print statements modernized to print() calls.
        print('*******模型: %s的测试结果*********' % name)
        print('\n')
        print('%s`s accuracy is %f' % (name, accuracy))
        print('%s`s score report is \n' % name)
        print(classification_report(testTag, predict))
        print('%s`s confusion is \n' % name)
        print(confusion_matrix(testTag, predict))
        print('\n')
        model_file = data_path + name + ".pkl"
        # Pickle is a binary format: open in 'wb' (text-mode 'w' breaks on
        # Python 3), and use a context manager so the handle is closed.
        with open(model_file, 'wb') as fh:
            pickle.dump(classifier, fh)
    return accuracy
def classifier_score(tp, classifier, train_list, test, test_tag):
    """Train the wrapped `classifier` on `train_list`, pickle it under
    ./Reviews/<tp>.pkl, and score it on `test` against `test_tag`.

    Returns:
        (interval, pos_precision, pos_recall, accuracy) where interval is
        the elapsed train+score time in units of 100 microseconds.
    """
    starttime = datetime.datetime.now()
    classifier = SklearnClassifier(classifier)
    classifier.train(train_list)
    iohelper.save_objects2pickle(classifier, './Reviews/' + tp + '.pkl')
    pred = classifier.classify_many(test)  # list of predicted tags
    y_true = [1 if tag == 'pos' else 0 for tag in test_tag]
    y_pred = [1 if tag == 'pos' else 0 for tag in pred]
    pos_precision = precision_score(y_true, y_pred)
    pos_recall = recall_score(y_true, y_pred)
    endtime = datetime.datetime.now()
    # Bug fix: timedelta.microseconds is only the sub-second component and
    # wraps every second; total_seconds() gives the true elapsed time.
    interval = (endtime - starttime).total_seconds() * 1e6
    interval = interval / 100
    return interval, pos_precision, pos_recall, accuracy_score(test_tag, pred)
def classifier_score(tp, classifier, train_list, test, test_tag):
    """Train the wrapped `classifier` on `train_list`, pickle it under
    ./Reviews/<tp>.pkl, and score it on `test` against `test_tag`.

    Returns:
        (interval, pos_precision, pos_recall, accuracy).
    """
    starttime = datetime.datetime.now()
    classifier = SklearnClassifier(classifier)
    classifier.train(train_list)
    iohelper.save_objects2pickle(classifier, './Reviews/' + tp + '.pkl')
    pred = classifier.classify_many(test)  # list of predicted tags
    y_true = [1 if tag == 'pos' else 0 for tag in test_tag]
    y_pred = [1 if tag == 'pos' else 0 for tag in pred]
    pos_precision = precision_score(y_true, y_pred)
    pos_recall = recall_score(y_true, y_pred)
    endtime = datetime.datetime.now()
    # NOTE(review): .microseconds is only the sub-second component of the
    # timedelta (it wraps each second) — total_seconds() is likely intended.
    interval = (endtime - starttime).microseconds
    interval = interval / 100
    return interval, pos_precision, pos_recall, accuracy_score(test_tag, pred)
Ejemplo n.º 41
0
def runClassifier(train, test, algo='LogisticRegression'):

    train_features = []

    for co in train:
        train_features.append((co.featureset, co.isbug))

    test_features = []
    for c in test:
        if c is None:
            continue
        test_features.append(c.featureset)

    if algo == 'LogisticRegression':
        print 'LogisticRegression'
        try:
            from sklearn.linear_model.sparse import LogisticRegression
        except ImportError:  # separate sparse LR to be removed in 0.12
            from sklearn.linear_model import LogisticRegression

        classif = SklearnClassifier(LogisticRegression(C=1000))
    else:
        # if not logistic, assume SVM for now
        # SVM with a Linear Kernel and default parameters
        from sklearn.svm import LinearSVC
        print 'svm'
        classif = SklearnClassifier(LinearSVC())

    classif.train(train_features)

    try:
        p = classif.classify_many(test_features)
    except AttributeError:
        p = classif.batch_classify(test_features)

    test_commits = []

    for idx, val in enumerate(p):
        t = test[idx]
        t.isbug = val
        test_commits.append(t)

    return test_commits
Ejemplo n.º 42
0
def run_stats():
	"""Shuffle the article corpora, train several classifiers on a 75/25
	split, and append each model's timing, accuracy and P/R/F stats to the
	module-level result lists.

	Fixes:
	- math.floor() returns a float and float slice indices raise TypeError;
	  the split point is now cast to int.
	- BernoulliNB was scored against `testAnswers`, the labels of the
	  independently shuffled `allArticles` split; it is now scored against
	  the labels of its own `allArticlesBinary` test split.
	"""
	#shuffle articles
	shuffle(allArticles)
	shuffle(allArticlesBinary)
	numArticles = len(allArticles)
	split = int(math.floor(numArticles * .75))  # cast: list slices need ints
	trainingSet = allArticles[:split]
	test = allArticles[split:]
	bitrain = allArticlesBinary[:split]
	bitest = allArticlesBinary[split:]
	testSet = []
	bitestSet = []
	testAnswers = []
	bitestAnswers = []
	for item in test:
		testSet.append(item[0])
		testAnswers.append(item[1])
	for item in bitest:
		bitestSet.append(item[0])
		bitestAnswers.append(item[1])

	# Multinomial Naive Bayes
	multClassif = SklearnClassifier(MultinomialNB())
	ti = time()
	multClassif.train(trainingSet)
	multRes = multClassif.classify_many(testSet)
	t0 = time() - ti
	multTime.append(t0)
	multAcc.append(accuracy_score(testAnswers, multRes))
	calcPRF('mult', testAnswers, multRes)

	# Logistic regression
	lrmult = SklearnClassifier(LogisticRegression())
	ti = time()
	lrmult.train(trainingSet)
	logRes = lrmult.classify_many(testSet)
	t3 = time() - ti
	lrTime.append(t3)
	lrAcc.append(accuracy_score(testAnswers, logRes))
	calcPRF('lr', testAnswers, logRes)

	# TF-IDF weighted Multinomial NB pipeline
	pipe = Pipeline([('tfidf', TfidfTransformer()),
					#('chi2', SelectKBest(chi2, k=500)),
					('nb', MultinomialNB())])
	testClassif = SklearnClassifier(pipe)
	ti = time()
	testClassif.train(trainingSet)
	testres = testClassif.classify_many(testSet)
	t5 = time() - ti
	testTime.append(t5)
	testAcc.append(accuracy_score(testAnswers, testres))
	calcPRF('test', testAnswers, testres)

	# Perceptron
	percepclass = SklearnClassifier(Perceptron())
	ti = time()
	percepclass.train(trainingSet)
	precepres = percepclass.classify_many(testSet)
	t7 = time() - ti
	percepTime.append(t7)
	percepAcc.append(accuracy_score(testAnswers, precepres))
	calcPRF('percep', testAnswers, precepres)

	# Bernoulli NB on the binary-feature corpus
	berclass = SklearnClassifier(BernoulliNB())
	ti = time()
	berclass.train(bitrain)
	berres = berclass.classify_many(bitestSet)
	t9 = time() - ti
	berTime.append(t9)
	berAcc.append(accuracy_score(bitestAnswers, berres))
	calcPRF('ber', bitestAnswers, berres)
Ejemplo n.º 43
0
def learn_model(data,target):
    """Split (data, target) 80/20, build featuresets with best_word_feats,
    train a BernoulliNB classifier via the NLTK wrapper, evaluate it and
    write the predictions to BernoulliNB.xml.

    Side effects: rebinds the module-level hinfo_dict, prints progress and
    corpus statistics, blocks on raw_input before training.
    """
    # preparing data for split validation. 60% training, 40% test
    # NOTE(review): comment disagrees with the code — test_size=0.20 is 80/20.
    state=43#randrange(1,23432)+123
    print "statue 6857"
    print state
    data_train,data_test,target_train,target_test = cross_validation.train_test_split(data,target,test_size=0.20,random_state=state)
    #classifier = BernoulliNB().fit(data_train,target_train)
    stop_word_dict={}#build_stop_word_dict()
    sentiment_dict={}#build_sentiment_dict()
    global hinfo_dict
    hinfo_dict=build_hinfo_dict(data,target)
    
        
        
    #print stop_word_dict.keys()
    raw_input("begin train")
    train_feature=[]
    test_feature=[]
    # labelled [featureset, label] pairs for the training split
    for i in range(len(data_train)):
        print i
        d=data_train[i]
        #d=jieba.cut(d, cut_all=False)
        l=target_train[i]
        tmp=[best_word_feats(d,stop_word_dict,sentiment_dict,hinfo_dict),l]
        train_feature.append(tmp)
        
    # unlabelled featuresets for the test split
    for i in range(len(data_test)):
        print i
        d=data_test[i]
        #d=jieba.cut(d, cut_all=False)
        l=target_test[i]
        tmp=best_word_feats(d,stop_word_dict,sentiment_dict,hinfo_dict)
        test_feature.append(tmp)
    
    #BernoulliNB MultinomialNB LogisticRegression  SVC LinearSVC
    # corpus statistics accumulated elsewhere (module-level max_len/min_len/sum/cnt)
    print "max_len %d"%(max_len)
    print "min_len %d"%(min_len)
    
    print "avg_len %d"%(sum/cnt)
    
    print "BernoulliNB"
    classifier = SklearnClassifier(BernoulliNB())
    classifier.train(train_feature)
    print "--------------"
    # dump the vocabulary the vectorizer actually learned
    print len(classifier._vectorizer.get_feature_names())
    
    for f in classifier._vectorizer.get_feature_names():
        print f.encode("utf-8")
    
    predicted = classifier.classify_many(test_feature)
    evaluate_model(target_test,predicted)
    
    
    # map 'positive'/other labels to '1'/'-1' and persist the predictions
    ids=range(len(data_test))
    result=[]
    for p in predicted:
        if p =='positive':
            result.append('1')
        else:
            result.append('-1')
        
    save_predict(data_test, ids, result, "BernoulliNB.xml")
    
    
    """
Ejemplo n.º 44
0
featuresets = chosen_features_limit

train_set, test_set = featuresets[limit / 2:], featuresets[:limit / 2]

svm = SklearnClassifier(LinearSVC())
svm.train(train_set)

path = os.path.normpath('../model/svm/account_{0}/{1}/'.format(account_id, version))
if not os.path.exists(path):
    os.makedirs(path)

print(u'Saving model to {0}'.format(path))
joblib.dump(svm, os.path.join(path, 'svm.pkl'))

test_skl = []
t_test_skl = []
for d in test_set:
    test_skl.append(d[0])
    t_test_skl.append(d[1])

# run the classifier on the test test
p = svm.classify_many(test_skl)

# getting a full report
print classification_report(
    t_test_skl,
    p,
    labels=list(set(t_test_skl)),
    target_names=['pos', 'neg']
)
Ejemplo n.º 45
0
#Multinomial Naive Bayes classifier
# TF-IDF weighting -> chi2 selection (k='all', i.e. no pruning) -> MultinomialNB,
# wrapped so NLTK-style featuresets can be fed in directly.
pipeline = Pipeline([('tfidf', TfidfTransformer()),
                      ('chi2', SelectKBest(chi2, k='all')),
                      ('nb', MultinomialNB())])

classif = SklearnClassifier(pipeline)
classif.train(train_set)


#Max entropy classifier
"""
classif = MaxentClassifier.train(train_set, 'megam')
"""
print(nltk.classify.accuracy(classif, test_set))

# predict labels for the test featuresets and compare against the gold labels
pred = classif.classify_many([feature for feature, sentiment in test_set])
test_true = [sentiment for feature, sentiment in test_set]
matx = confusion_matrix(test_true,pred)
print(matx)



#joblib.dump(tweets, 'tweets.pkl')
#joblib.dump(classif, 'classif.pkl')




"""
#Cross Validating Classifiers
cv = cross_validation.KFold(len(featuresets), n_folds=5, shuffle=False, random_state=None)
Ejemplo n.º 46
0
class MNBayes(text_classifier.TextClassifier):
    """Multinomial Naive Bayes text classifier built on NLTK's scikit-learn
    wrapper.

    Fixes relative to the original:
    - kfoldCrossValidation, leave1OutCrossValidation and learningCurve all
      trained on the FULL feature set instead of the fold's training split,
      so every reported score was contaminated by the test data; they now
      train on the training split only.
    - kfoldCrossValidation's fold slicing was scrambled (one slice used the
      bare loop index `i+1` instead of the fold boundary `n*(i+1)`) and it
      returned only the last fold's score; it now builds proper folds and
      returns the mean accuracy.
    - classify() used batch_classify(), which was removed from NLTK;
      classify_many() is the supported spelling.
    """

    def __init__(self, trainDir, labelFile, numTrees=10):
        self.classifier = None
        self.labelFile = labelFile
        self.trainingDir = trainDir
        self.labels = None
        self.all_words = None
        self.numTrees = numTrees  # unused by MultinomialNB; kept for interface compatibility
        self.classifier = SklearnClassifier(MultinomialNB())
        #self.labels = training.setup(labelFile)
        #self.train()

    def train(self):
        """Fit the classifier on every available featureset."""
        feature_sets = self.getFeatures()
        self.classifier.train(feature_sets)

    def trainingError(self):
        """Return accuracy measured on the training data itself."""
        feature_sets = self.getFeatures()
        p = nltk.classify.accuracy(self.classifier, feature_sets)
        return p

    def kfoldCrossValidation(self, k):
        """k-fold cross validation; returns the mean held-out accuracy."""
        feature_sets = self.getFeatures()
        fold = len(feature_sets) / k
        total = 0
        for i in range(k):
            self.classifier = SklearnClassifier(MultinomialNB())
            # fold i is the test slice; everything else is training data
            test_set = feature_sets[fold * i:fold * (i + 1)]
            train_set = feature_sets[:fold * i] + feature_sets[fold * (i + 1):]
            self.classifier.train(train_set)
            total += nltk.classify.accuracy(self.classifier, test_set)
        return total / k

    def leave1OutCrossValidation(self):
        """Leave-one-out cross validation; returns the mean accuracy."""
        error = 0
        feature_sets = self.getFeatures()
        N = len(feature_sets)
        for i in range(N):
            self.classifier = SklearnClassifier(MultinomialNB())
            train_set = feature_sets[:i] + feature_sets[i + 1:]
            test_set = [feature_sets[i]]
            self.classifier.train(train_set)
            error += nltk.classify.accuracy(self.classifier, test_set)
        return error / N

    def learningCurve(self, numTrials=4):
        """Accuracy as a function of training-set size, averaged over
        numTrials random shuffles, to spot over/underfitting."""
        accuracies = []
        feature_sets = self.getFeatures()
        for k in xrange(1, len(feature_sets) - 1):
            total = 0
            for i in xrange(numTrials):
                self.classifier = SklearnClassifier(MultinomialNB())
                random.shuffle(feature_sets)
                train_set, test_set = feature_sets[:k], feature_sets[k:]
                self.classifier.train(train_set)
                total += nltk.classify.accuracy(self.classifier, test_set)
            accuracies.append(total / numTrials)
        return accuracies

    def testClassify(self, k):
        """Hold out k random featuresets, train on the rest, and return
        (reference_labels, predicted_labels) for the held-out items."""
        feature_sets = self.getFeatures()
        random.shuffle(feature_sets)
        self.classifier = SklearnClassifier(MultinomialNB())
        self.classifier.train(feature_sets[k:])
        features, ref_labels = zip(*feature_sets[:k])
        pred_labels = self.classifier.classify_many(features)
        return ref_labels, pred_labels

    def confusionMatrix(self, ref, test):
        """NLTK confusion matrix of two (id, label) lists, aligned by id."""
        ref.sort(key=lambda x: x[0])
        test.sort(key=lambda x: x[0])
        _, ref_labels = zip(*ref)
        _, test_labels = zip(*test)
        cm = ConfusionMatrix(ref_labels, test_labels)
        return cm

    def classify(self, db, fastain):
        """Classify each protein in the FASTA file from its genbank text.
        Returns (proteinID, label) pairs; label is 'na' when no text exists."""
        proIDs, features, labels = [], [], []
        prevFeatureset = ''
        prevText = ''
        for seq_record in SeqIO.parse(fastain, "fasta"):
            title = seq_record.id
            toks = title.split("|")
            proteinID = toks[5]
            query_rows = genbank.proteinQuery(proteinID, db)
            ids, text = zip(*query_rows)
            text = ''.join(map(str, text))
            if text == '':
                label = ['na']
            else:
                text = word_reg.findall(text)
                featureset = self.gene_features(text)
                # sanity checks: consecutive records must differ
                assert text != prevText
                assert featureset != prevFeatureset
                prevFeatureset = featureset
                prevText = text
                # classify_many replaces the removed batch_classify
                label = self.classifier.classify_many([featureset])

            proIDs.append(proteinID)
            labels += label
        return zip(proIDs, labels)
Ejemplo n.º 47
0
        train_set[label] = []
    train_set[label].append(FreqDist(newwordlist(mail)))

pipeline = Pipeline([('tfidf', TfidfTransformer()),
                     ('chi2', SelectKBest(chi2, k=1000)),
                     ('svm', LinearSVC())])
classif = SklearnClassifier(pipeline)
add_label = lambda lst, lab: [(x, lab) for x in lst]
finalset = []
for label,bow in train_set.iteritems():
    finalset.extend(add_label(bow, label))
classif.train(finalset)

conf = []
for l, bow in train_set.iteritems():
    labels = np.array(classif.classify_many(bow))
    row = []
    for label in train_set:
        row.append((labels==label).sum())
    conf.append(row)

for c in conf:
    print c

diagval = 0
total = 0
for i in range(len(conf)):
    for j in range(len(conf)):
        total += conf[i][j]
        if i==j:
            diagval += conf[i][j]
Ejemplo n.º 48
0
# Build train/test splits for the r/aww post classifier, train a
# MultinomialNB model and report per-class accuracy.
aww_neg_training = aww_neg_total[0:training] # first `training` negative posts

aww_pos_testing = aww_pos_total[-testing:] # last `testing` posts of each class
aww_neg_testing = aww_neg_total[-testing:]
total_1 = len(aww_pos_testing)
total_2 = len(aww_neg_testing)
training_data = []
for dictionary in aww_pos_training:
	training_data.append((dictionary, "pos"))
for dictionary2 in aww_neg_training:
	training_data.append((dictionary2, "neg"))
shuffle(training_data)

print("classifying and training")
classif = SklearnClassifier(MultinomialNB()).train(training_data)
results_pos = classif.classify_many(aww_pos_testing)
results_neg = classif.classify_many(aww_neg_testing)

correct_1 = 0
correct_2 = 0

print("testing")

for string in results_pos:
	if "pos" in string:
		correct_1 += 1
for string in results_neg:
	if "neg" in string:
		correct_2 += 1

# BUG FIX: under Python 2, correct_1/total_1 is integer division and always
# prints 0.000000 unless every prediction is right; float() makes the ratio
# exact on both Python 2 and 3.
print("For positive: %i correct out of %i for a percentage of %f" % (correct_1, total_1, float(correct_1)/total_1))
Ejemplo n.º 49
0
    SGDClassifier(max_iter=100),
    MultinomialNB(),
    SVC(kernel='linear')
]

# Hard-voting ensemble over all the named base classifiers.
models = list(zip(names, classifiers))

nltk_ensemble = SklearnClassifier(
    VotingClassifier(estimators=models, voting='hard', n_jobs=-1))
nltk_ensemble.train(training)
# BUG FIX: accuracy was computed on `nltk_model` (a different, previously
# trained classifier) instead of the ensemble that was just trained here.
accuracy = nltk.classify.accuracy(nltk_ensemble, testing) * 100
print("Voting Classifier: Accuracy: {}".format(accuracy))

# In[28]:

# make class label prediction for testing set
txt_features, labels = list(zip(*testing))

prediction = nltk_ensemble.classify_many(txt_features)

# In[29]:

# print a confusion matrix and a classification report
print(classification_report(labels, prediction))

pd.DataFrame(confusion_matrix(labels, prediction),
             index=[['actual', 'actual'], ['Real', 'Fake']],
             columns=[['predicted', 'predicted'], ['Real', 'Fake']])

# In[ ]:
Ejemplo n.º 50
0
# Collect the base classifiers' cross-validation outputs.
result.append(svm_result)
result.append(dt_result)
result.append(ent_result)
result.append(nb_result)
result.append(knn_result)

#meta cl
# Build level-2 (meta) training data from the base outputs, then train one
# meta classifier per algorithm on it.
cv_trn2, cv_t2, all_trn2 = create_meta_training_data(result, label_list)
svm_meta = SklearnClassifier(LinearSVC()).train(all_trn2)
dt_meta = SklearnClassifier(tree.DecisionTreeClassifier()).train(all_trn2)
ent_meta = nltk.classify.maxent.MaxentClassifier.train(all_trn2, trace=1, max_iter=4)
nb_meta = nltk.NaiveBayesClassifier.train(all_trn2)
knn_meta = SklearnClassifier(KNeighborsClassifier(5)).train(all_trn2)

# Stack the base classifiers' test-set predictions into meta features.
intermediate = []
intermediate.append(svm_base.classify_many(test_feature))
intermediate.append(dt_base.classify_many(test_feature))
intermediate.append(ent_base.classify_many(test_feature))
intermediate.append(nb_base.classify_many(test_feature))
intermediate.append(knn_base.classify_many(test_feature))
test_feature = merge_feature(intermediate)  # rebinds test_feature to the meta features

# Accuracy of each meta classifier on the stacked test features.
print compute_accuracy(test_label, svm_meta.classify_many(test_feature))
print compute_accuracy(test_label, dt_meta.classify_many(test_feature))
print compute_accuracy(test_label, ent_meta.classify_many(test_feature))
print compute_accuracy(test_label, nb_meta.classify_many(test_feature))
print compute_accuracy(test_label, knn_meta.classify_many(test_feature))
#svm_l2 = []
#dt_l2 = []
#nb_l2 = []
Ejemplo n.º 51
0
def main():
    """Train and report accuracy for a battery of classifiers (NLTK Naive
    Bayes plus several scikit-learn models wrapped via SklearnClassifier)
    on the module-level trainData/testData featuresets."""
    # Naive Bayes
    nb = nltk.NaiveBayesClassifier.train(trainData)
    print("NB: ", nltk.classify.accuracy(nb, testData))

    from nltk.classify.scikitlearn import SklearnClassifier
    from sklearn.naive_bayes import BernoulliNB, GaussianNB
    # BernoulliNB
    bernoulli = SklearnClassifier(BernoulliNB())
    bernoulli.train(trainData)
    print("NB Bernoulli: ", nltk.classify.accuracy(bernoulli, testData))
    #gaussian = SklearnClassifier(GaussianNB())
    #gaussian.train(trainData.toarray(trainData))
    #print("Gaussian: ", nltk.classify.accuracy(gaussian, testData))

    from sklearn.naive_bayes import MultinomialNB
    # MultinomialNB
    multi = SklearnClassifier(MultinomialNB())
    multi.train(trainData)
    print("NB Multinomial: ", nltk.classify.accuracy(multi, testData))

    from sklearn.feature_selection import SelectKBest, chi2
    from sklearn.pipeline import Pipeline

    # MultinomialNB restricted to the 1000 best chi2-selected features
    pipeline = Pipeline([('chi2', SelectKBest(chi2, k=1000)),
                         ('nb', MultinomialNB())])

    pmulti = SklearnClassifier(pipeline)
    pmulti.train(trainData)
    print("NB Multinomial (pmulti): ", nltk.classify.accuracy(pmulti, testData))

    from sklearn.metrics import f1_score

    #results = pmulti.batch_classify(item[0] for item in testData)
    #print(results[:10])
    #print(f1_score([item[1] for item in testData], results))

    # Logistic Reggression

    from sklearn.linear_model import LinearRegression

    #linReg = SklearnClassifier(LinearRegression())
    #linReg.train((trainData))
    #print("Logistic Reggression: ", nltk.classify.accuracy(linReg, testData))

    # Ensembles
    # Random forest
    from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

    forest = SklearnClassifier(RandomForestClassifier(n_estimators=100))
    forest.train(trainData)
    print("Random Forest: ", nltk.classify.accuracy(forest, testData))

    # AdaBoost
    adaboost = SklearnClassifier(AdaBoostClassifier())
    adaboost.train(trainData)
    print("Adaboost: ", nltk.classify.accuracy(adaboost, testData))


    # Support Vector Machines

    from sklearn.svm import LinearSVC
    from sklearn.metrics import confusion_matrix

    svm = SklearnClassifier(LinearSVC(loss="hinge"))
    svm.train(trainData)

    print("SVM: ", nltk.classify.accuracy(svm, testData))

    # NOTE(review): this SVM block duplicates the one directly above — the
    # model is rebuilt and retrained with identical settings; confirm whether
    # the repetition is intentional.
    svm = SklearnClassifier(LinearSVC(loss="hinge"))
    svm.train(trainData)
    print("SVM: ", nltk.classify.accuracy(svm, testData))
    results = svm.classify_many(item[0] for item in testData)

    print(results)

    # KMeans
    # NOTE(review): KMeans is a clusterer, not a classifier — accuracy against
    # gold labels is only meaningful if cluster ids happen to match the labels.
    from sklearn.cluster import KMeans
    km = SklearnClassifier(KMeans())
    km.train(trainData)
    print("KMeans: ", nltk.classify.accuracy(km, testData))

    # K nearest neighbors
    from sklearn.neighbors import KNeighborsClassifier
    k = 5
    knn = SklearnClassifier(KNeighborsClassifier(n_neighbors=k))
    knn.train(trainData)
    print("KNN 1: ", nltk.classify.accuracy(knn, testData))
def clf_score(classifier):
    """Wrap `classifier` for NLTK, fit it on the module-level train_set,
    and return its accuracy on the held-out `test` featuresets."""
    wrapped = SklearnClassifier(classifier)
    wrapped.train(train_set)
    predictions = wrapped.classify_many(test)
    return accuracy_score(tag_test, predictions)
Ejemplo n.º 53
0
# Split the held-out featuresets from their gold labels.
test_skl = []
t_test_skl = []
for d in test_set:
    test_skl.append(d[0])
    t_test_skl.append(d[1])

# cls_set = list(set(t_test_skl))

# SVM with a Linear Kernel and default parameters
classifier = SklearnClassifier(LinearSVC())

# Learning-curve style loop: retrain on progressively larger slices.
for train_set_size in range(1000, corpus_size, 300):

    print("Train set size:", train_set_size)

    # Split in train
    train_set = feature_sets[test_set_size:train_set_size]

    classifier.train(train_set)

    # run the classifier on the train test
    p = classifier.classify_many(test_skl)

    # getting a full report
    print classification_report(t_test_skl, p)

    # NOTE(review): ConfusionMatrix(reference, test) — the predictions are
    # passed as the reference here; confirm the intended axis order.
    cm = nltk.ConfusionMatrix(p,t_test_skl)

    print(cm.pretty_format(sort_by_count=True, show_percents=True))

		words.read_files(file_name)
		dictionary = dict(words)
		test_data_1.append(dictionary)

# Walk the second test corpus: each subdirectory holds documents of one class;
# hidden directories are skipped.
for directory in os.listdir(test_fp2):
	subdir = test_fp2 + directory
	if directory.startswith("."):
		continue
	for doc_name in os.listdir(subdir):
		doc_path = test_fp2 + directory + '/' + doc_name
		doc = Document()
		doc.read_files(doc_path)
		test_data_2.append(dict(doc))

# Classify both held-out corpora.
results_1 = classif.classify_many(test_data_1)
results_2 = classif.classify_many(test_data_2)
total_1 = len(results_1)
total_2 = len(results_2)

# Count predictions matching each corpus's expected class.
correct_1 = sum(1 for label in results_1 if "children" in label)
correct_2 = sum(1 for label in results_2 if "advanced" in label)

print("For children text: %i correct out of %i for a percentage of %f" % (correct_1, total_1, correct_1/total_1))
print("For advanced text: %i correct out of %i for a percentage of %f" % (correct_2, total_2, correct_2/total_2))
print("total classification percentage is %f" % ((correct_1 + correct_2)/(total_1 + total_2)))
Ejemplo n.º 55
0
def output():
    """Flask view: read the 'textQuery' request parameter, train Multinomial
    and Bernoulli Naive Bayes classifiers on dataV11.txt, classify the query
    tokens with the Bernoulli model, dump the result to result.csv, and
    render results.html."""
    
    #form = ReusableForm(request.form)
    #if request.method == 'GET':
    name = request.args.get('textQuery')
    #name=request.form['textQuery']
        #name=request.form.getlist('textQuery')
    #if form.validate():
        
    selected_features = None

    # Hand-rolled English stopword list (roughly NLTK's).
    stopwords = ['all', 'just', 'being', 'over', 'both', 'through', 'yourselves', 'its', 'before', 'herself', 'had', 
         'should', 'to', 'only', 'under', 'ours', 'has', 'do', 'them', 'his', 'very', 'they', 'not', 'during', 
         'now', 'him', 'nor', 'did', 'this', 'she', 'each', 'further', 'where', 'few', 'because', 'doing', 'some', 'are', 
         'our', 'ourselves', 'out', 'what', 'for', 'while', 'does', 'above', 'between', 't', 'be', 'we', 'who', 
         'were', 'here', 'hers', 'by', 'on', 'about', 'of', 'against', 's', 'or', 'own', 'into', 'yourself', 
         'down', 'your', 'from', 'her', 'their', 'there', 'been', 'whom', 'too', 'themselves', 'was', 'until', 
         'more', 'himself', 'that', 'but', 'don', 'with', 'than', 'those', 'he', 'me', 'myself', 'these', 'up', 
         'will', 'below', 'can', 'theirs', 'my', 'and', 'then', 'is', 'am', 'it', 'an', 'as', 'itself', 'at', 
         'have', 'in', 'any', 'if', 'again', 'no', 'when', 'same','how', 'other', 'which', 'you', 'after', 'most',
         'such', 'why', 'a', 'off', 'i', 'yours', 'so', 'the', 'having','once']

    def add_lexical_features(fdist, feature_vector, text):
        # Binary bag-of-words: mark each observed token, plus the text length.
        feature_vector["len"] = len(text)
        text_nl = nltk.Text(text)
        for word, freq in fdist.items():
            fname = word 
            if selected_features == None or fname in selected_features:        
                #feature_vector[fname] = text_nl.count(word)
                feature_vector[fname] = 1

    def features(review_words):
        # Build the feature dict for one token sequence.
        # NOTE(review): when called below with a single string (one query
        # token), FreqDist iterates characters, not words — confirm intent.
        feature_vector = {}

        uni_dist = nltk.FreqDist(review_words)
        my_bigrams = list(bigrams(review_words))
        bi_dist = nltk.FreqDist(my_bigrams)
        
        add_lexical_features(uni_dist,feature_vector, review_words)
        
        return feature_vector

    with open("dataV11.txt", 'rb') as f:
        text = f.read()
    text = text.decode("utf-8")
    f.close()

    docs = text.split("\n")
    docs2 = docs[1: ]
    train = []
    #print(sent)

    # Each line: "<class_label> <token> <token> ..."; stopwords are dropped.
    for d in docs2:
        d = d.split()
        if len(d)!=0:
            cl = d[0]
            text_d = d[1: ]#we need to remove the stopwords
            text = []
            for w in text_d:
                if w not in stopwords:
                    text.append(w)
            item = (text, cl)
            train.append(item)

    random.seed(0)
    random.shuffle(train)
    #print(sentences)

    train_set = train[ :3271]
    valid_set = train[3272: ]

    featuresets_tr = [(features(words), label) for (words, label) in train_set ]
    featuresets_val = [(features(words), label) for (words, label) in valid_set ]

    featuresets = [(features(words), label) for (words, label) in train ]

    from nltk.classify.scikitlearn import SklearnClassifier
    from sklearn.naive_bayes import MultinomialNB,BernoulliNB
    # NOTE(review): each classifier is trained twice; the second train() call
    # on the full featuresets replaces the model fit on featuresets_tr, and
    # MNB_classifier is never used afterwards.
    MNB_classifier = SklearnClassifier(MultinomialNB())
    MNB_classifier.train(featuresets_tr)
    MNB_classifier.train(featuresets)

    BNB_classifier = SklearnClassifier(BernoulliNB())
    BNB_classifier.train(featuresets_tr)
    BNB_classifier.train(featuresets)



#n = int(input())
    print(">>>>>>>")
    print(name)
    # split the raw query on spaces; each token is classified independently
    a = [a_temp for a_temp in name.strip().split(' ')]
    #for a_i in range(1): # to read a matrix
        #a_t = [a_temp for a_temp in input().strip().split(' ')]
        #a.append(a_t)
    print("<<<<<")
    print(a)

    inputData = ",".join(map(str, a))
    print("Input data:",inputData)  

    featuresets_test = [features(words) for words in a ]

    #features =  ",".join(map(str, featuresets_test))
    print("Features:",featuresets_test)    
    #a = []
    #a.append("doi1")
    #featuresets_test = []
    #featuresets_test.append("nanomaterial")
    predicted_labels = BNB_classifier.classify_many(featuresets_test)

    #print(a)
    #print(featuresets_test)
    for l in predicted_labels:
        print (str("Type of input data: "+l))

    #print(type(a))
    #print('Input:')
    #print(a)
    #print(type(predicted_labels))
    #print('Category:')
    #print(predicted_labels)
    #print(type(featuresets_test))
    #print('Features:')
    #print(featuresets_test)
    
    outputData = []
    import csv
    csvfile = open('result.csv', 'w')
    with csvfile:
        #for (col1, col2, col3) in zip(a, predicted_labels, featuresets_test):
            #outputData.append([col1, col2, col3])
        data1 = [["asif", "kary", "ravi"], ["xyz", "abc", "def"]]
        # NOTE(review): `l` here is whatever leaked from the loop above (the
        # last predicted label); only the final prediction is written out.
        outputData = [[name, l, featuresets_test]]
        valueWriter = csv.writer(csvfile)
        
        valueWriter.writerows(outputData)
        #valueWriter.writerows([str(a),str(l),str(featuresets_test)])
        #csv = valueWriter.writerows(data)
    return render_template('results.html') 
def clf_score(classifier):
    """Wrap a scikit-learn estimator via the NLTK interface, train it on the
    module-level train_set, and return precision/recall/F-score/support of
    its predictions on the dev-test data."""
    nltk_clf = SklearnClassifier(classifier)  # use scikit-learn through NLTK
    nltk_clf.train(train_set)  # fit the classifier
    predict = nltk_clf.classify_many(test)  # predicted labels for the dev-test set
    return precision_recall_fscore_support(tag_test, predict)
Ejemplo n.º 57
0
# Level-1 (base) classifiers, all trained on the full training data.
ent_base = []
result = []
svm_result = []
nb_result = []
dt_result = []
ent_result = []

svm_base = SklearnClassifier(LinearSVC()).train(all_trn)
dt_base = SklearnClassifier(tree.DecisionTreeClassifier()).train(all_trn)
nb_base = nltk.NaiveBayesClassifier.train(all_trn)
ent_base = nltk.classify.maxent.MaxentClassifier.train(all_trn, trace=1, max_iter=4)
knn_base = SklearnClassifier(KNeighborsClassifier(5)).train(all_trn)

#test
# Collect every base classifier's predictions for the test features.
intermediate = []
intermediate.append(svm_base.classify_many(test_feature))
intermediate.append(dt_base.classify_many(test_feature))
intermediate.append(nb_base.classify_many(test_feature))
intermediate.append(ent_base.classify_many(test_feature))
intermediate.append(knn_base.classify_many(test_feature))
#final_feature = merge_feature(intermediate)

#print compute_accuracy(test_label, svm_meta.classify_many(final_feature))
#print compute_accuracy(test_label, max_ent_meta.classify_many(final_feature))

#Weight:
# Hand-tuned voting weights, one per base classifier.
weight = []
weight.append(0.9)
weight.append(0.8)
weight.append(0.8)
weight.append(0.8)
Ejemplo n.º 58
0
    classifier = SklearnClassifier(model)

    # set priors
    classifier._encoder.fit([category, "no"])
    # [category, "no"] unless this is true then ["no", category]
    flip = classifier.labels()[0] == "no"
    categorized_proportion = len([words for (words, categories) in corpus if category in categories]) * 1.0 / len(corpus)
    if flip:
        model.class_prior = [1-categorized_proportion, categorized_proportion]
    else:
        model.class_prior = [categorized_proportion, 1-categorized_proportion]

    classifier.train(train_set)

    # test classifier
    test_results = classifier.classify_many([feat for (feat, label) in test_set])
    pos_test_set = set(i for i, result in enumerate(test_results) if result == category)
    reference_values = [label for (feat, label) in test_set]
    pos_ref_set = set(i for i, (feat, label) in enumerate(test_set) if label == category)
    accuracy = scores.accuracy(reference_values, test_results)
    accuracies.append(accuracy)
    precision = scores.precision(pos_ref_set, pos_test_set)
    recall = scores.recall(pos_ref_set, pos_test_set)
    f1 = scores.f_measure(pos_ref_set, pos_test_set)
    f1_scores.append(f1)

    print "%s: accuracy %s, precision %s, recall %s, F1 %s" % (colored(category, "blue"), colored(accuracy, "yellow"), colored(precision, "yellow"), colored(recall, "yellow"), colored(f1, "yellow"))
    ## print(nltk.classify.accuracy(classifier, test_set))
    # classifier.show_most_informative_features(5)
    # print ""
Ejemplo n.º 59
0
 def final_score(classifier):
     """Train `classifier` on the module-level trainSet and return its
     accuracy against tag_test on the held-out `test` featuresets."""
     model = SklearnClassifier(classifier)
     model.train(trainSet)
     predictions = model.classify_many(test)
     return accuracy_score(tag_test, predictions)
Ejemplo n.º 60
0
class ScikitClassifierAdapter:
    """
    An adapter for an SklearnClassifier (nltk.classify.scikitlearn) object
    to make sure that all classifiers take the same input, return the same
    output and are trained in the same way.

    scikit_classifier:
        a Scikit classifier *instance*

    train_file_name:
        the path to the training settings (IOB instances); ignored when
        `labelled_feature_sets` is supplied

    template_file_name:
        a CRF++-style template file used to derive additional contextual
        features for optimization purposes

    labelled_feature_sets:
        optional pre-computed list of [feature_dict, label] pairs, used
        instead of extracting features from `train_file_name`
    """
    def __init__(self, scikit_classifier, train_file_name, template_file_name, labelled_feature_sets=None):
        from nltk.classify.scikitlearn import SklearnClassifier
        from sklearn.naive_bayes import GaussianNB
        from sklearn.ensemble import RandomForestClassifier
        fe = FeatureExtractor()
        # RandomForest and GaussianNB cannot consume scipy sparse matrices,
        # so force dense feature vectors for those two model families only.
        if isinstance(scikit_classifier, (RandomForestClassifier, GaussianNB)):
            self.classifier = SklearnClassifier(scikit_classifier, sparse=False)
        else:
            self.classifier = SklearnClassifier(scikit_classifier)
        self.compiled_templates = self.process_template(template_file_name)
        feature_sets = []
        if labelled_feature_sets is not None:
            feature_sets = labelled_feature_sets
            logger.info("using a pre-computed feature_sets containing %i instances" % len(feature_sets))
        else:
            iob_data = file_to_instances(train_file_name)
            # BUGFIX: the original calls passed the count as a %-argument with
            # no placeholder in the format string ('logger.info("instances ",
            # len(iob_data))'), which makes the logging module report a
            # formatting error instead of the count. Use lazy %-formatting.
            logger.info("instances %i", len(iob_data))
            logger.info("tokens %i", count_tokens(iob_data))
            for instance in iob_data:
                # instance is a sequence of (token, pos_tag, iob_label) triples
                pos_tags = [('z_POS', token[1]) for token in instance]
                labels = [token[2] for token in instance]
                tokens = [token[0] for token in instance]
                # Renamed the inner index (was `n`, shadowing the outer
                # instance counter) and dropped the dead `sentence_n` local.
                for token_n, token in enumerate(tokens):
                    dict_features = fe.get_features([token], labels=labels, outp_label=False, legacy_features=pos_tags)[0]
                    feature_sets.append([dict_features, labels[token_n]])
        self.classifier.train(self.apply_feature_template(feature_sets, out_label=True))
        return

    def classify(self, feature_sets):
        """
        Classify a sequence of tokens given as per-token feature dictionaries.

        Args:
            feature_sets:
                a list of dictionaries, one per token, e.g.:

                [{'a_token': u'Nella',
                 'b_punct': 'OTHERS',
                 'd_case': 'INIT_CAPS',
                 'h_lowcase': u'nella',
                 'l_pattern': 'Aaaaa',
                 'z': '_'}, ... ]

        Returns:
            a list of dictionaries, one per token, each with the UTF-8
            encoded token and its predicted label:

                [{'token': 'Nella', 'label': 'O'}, ...]
        """
        # apply feature templates (from CRF++) to add contextual features
        template_feature_sets = self.apply_feature_template(feature_sets, out_label=False)
        # predicted output labels, one per input token
        output_labels = self.classifier.classify_many(template_feature_sets)
        result = []
        for n, feature_set in enumerate(feature_sets):
            temp = {}
            temp["token"] = feature_set["a_token"].encode('utf-8')
            temp["label"] = str(output_labels[n])
            result.append(temp)
        return result

    def process_template(self, template_file):
        """
        Compile a CRF++-style template file into (format, offsets) pairs.

        Blank lines, comment lines ('#...') and bigram templates ('B...')
        are skipped. Each '%x[row,col]' macro becomes a '%s' placeholder
        plus an (row, col) offset pair.

        Example of the output:

        [('U01:%s', [(-2, 0)]),
         ('U02:%s', [(-1, 0)]), ...]
        """
        import re
        # BUGFIX: use a context manager so the file handle is closed even if
        # reading raises (the original leaked the handle on error).
        with open(template_file, 'r') as f:
            lines = [line.replace('\n', '') for line in f.readlines()
                     if not line.startswith('\n') and not line.startswith('#') and not line.startswith('B')]
        exp = re.compile(r"%x\[(-?\d+),(-?\d+)\]")
        result = []
        for line in lines:
            result.append((exp.sub('%s', line),
                           [(int(match[0]), int(match[1])) for match in exp.findall(line)]))
        return result

    def apply_feature_template(self, feature_sets, out_label=False):
        """
        Apply each of the compiled CRF++ templates to every token.

        Args:
            feature_sets:
                when `out_label` is False, a list of per-token feature dicts;
                when True, a list of [feature_dict, label] pairs (training).

        Returns:
            a list of template-derived feature dicts (one per token), or of
            [feature_dict, label] pairs when `out_label` is True.
        """
        def get_value(token_feature_lists, token_n, feature_n):
            # Offsets pointing before the first or after the last token
            # resolve to the out-of-bounds marker "ND".
            if token_n < 0 or token_n > (len(token_feature_lists) - 1):
                return "ND"
            return token_feature_lists[token_n][feature_n]
        # Flatten each dict into a list of values in sorted-key order so the
        # template column index (`feature_n`) is deterministic.
        if out_label:
            unlabelled_feature_sets = [[f[0][key] for key in sorted(f[0])] for f in feature_sets]
        else:
            unlabelled_feature_sets = [[f[key] for key in sorted(f)] for f in feature_sets]
        assert len(feature_sets) == len(unlabelled_feature_sets)
        new_features = []
        for n, fs in enumerate(unlabelled_feature_sets):
            result = {}
            for template, replacements in self.compiled_templates:
                # BUGFIX: split on the first ':' only (was split(":")[0]/[1]),
                # so a literal colon inside the template body is preserved.
                template_name, _, template_body = template.partition(":")
                values = [get_value(unlabelled_feature_sets, n + r[0], r[1]) for r in replacements]
                result[template_name] = template_body % tuple(values)
            if out_label:
                # keep the expected label for training
                new_features.append([result, feature_sets[n][1]])
            else:
                new_features.append(result)
        return new_features