Example 1
def set_classifier(chosen_classifier, train_set, sentence):
    classifier = SklearnClassifier(chosen_classifier)
    classifier.train(train_set)

    neg = 0
    pos = 0
    # print('Classifier:', str(chosen_classifier))

    for word in sentence:
        classResult = classifier.classify(word_feats(word))
        # print(word_feats(word))
        # print(classResult)
        if classResult == 'neg':
            neg = neg + 1
        if classResult == 'pos':
            pos = pos + 1

    posPercent = str(float(pos) / len(sentence))
    negPercent = str(float(neg) / len(sentence))

    # print ('Accuracy:', nltk.classify.util.accuracy(classifier, sentence))
    # classifier.show_most_informative_features()
    # print('Score:', score)

    # print('Positive: ' + posPercent)
    # print('Negative: ' + negPercent)
    # print('Pos', pos)
    # print('Neg', neg)

    return posPercent, negPercent, pos, neg
    def getSubjObj(self, text):
        words = Text(text.split(" "))
        bigrams = self.getBigrams(words)
        subjclassifier = self.loadSOClsssifier()
        posnegclassifier = self.loadPNClsssifier()

        subj_or_obj = subjclassifier.classify(bigrams)
        if subj_or_obj == "objective":
            return "neutral"

        pos_or_neg = posnegclassifier.classify(bigrams)

        if pos_or_neg == "negative":
            return "negative"
        else:
            return "positive"
def evaluate(classifier_alo):
    
    classifier = SklearnClassifier(classifier_alo)  # use a scikit-learn estimator through the NLTK wrapper
    classifier.train(trainFeatures)  # train the classifier
    
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)	
    i = 0
    for item in testFeatures:
        referenceSets[item[1]].add(i)
        predicted = classifier.classify(item[0])
        testSets[predicted].add(i)	
        i += 1
    
    pos_pre = nltk.metrics.precision(referenceSets['pos'], testSets['pos'])
    pos_recall = nltk.metrics.recall(referenceSets['pos'], testSets['pos'])
    neg_pre =  nltk.metrics.precision(referenceSets['neg'], testSets['neg'])
    neg_recall = nltk.metrics.recall(referenceSets['neg'], testSets['neg'])
    
    pos_f1 = 2 * float(pos_pre) * float(pos_recall) / (float(pos_pre) + float(pos_recall))
    neg_f1 = 2 * float(neg_pre) * float(neg_recall) / (float(neg_pre) + float(neg_recall))
    print('  '.join('{0:.3f}'.format(v) for v in
                    (float(pos_pre), float(pos_recall), float(neg_pre), float(neg_recall), pos_f1, neg_f1)))
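A possible way to drive evaluate, assuming trainFeatures and testFeatures are module-level lists of (featureset, label) pairs as the function body implies:

from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import LinearSVC

# compare several scikit-learn estimators through the same NLTK wrapper;
# each call prints precision, recall and F1 for both classes
for algo in (MultinomialNB(), BernoulliNB(), LinearSVC()):
    evaluate(algo)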
Example 4
class SKClassifier:

    classifier = None

    def __init__(self, cls='SVC'):
        # .get() provides the SVC fallback; the original checked
        # `if not self.classifier` after a plain dict lookup, which would
        # already have raised KeyError for an unknown name
        self.classifier = SklearnClassifier({
            'SVC': SVC(),
            'LogisticRegression': LogisticRegression(),
            'BernoulliNB': BernoulliNB()
        }.get(cls, SVC()))

    def train(self, trainset):
        self.classifier.train(trainset)

    def test(self, tagged, featuresets):
        predict = self.classifier.classify_many(featuresets)
        print(predict)
        return accuracy_score(tagged, predict)

    def classify(self, featureset):
        return self.classifier.classify(featureset)

    def classify_many(self, featuresets):
        return self.classifier.classify_many(featuresets)
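A possible usage of SKClassifier, with toy NLTK-style (featureset, label) training pairs (illustrative data only):

# hypothetical toy data in the (dict, label) format SklearnClassifier expects
train_set = [({'great': True}, 'pos'), ({'awful': True}, 'neg')]

clf = SKClassifier('BernoulliNB')
clf.train(train_set)
print(clf.classify({'great': True}))          # single featureset
print(clf.classify_many([{'awful': True}]))   # list of featuresets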
Example 5
def classification(value):
    trainingSet = pd.read_csv("ner_dataset.csv")
    dataSet = pd.read_csv("ner_test.csv")
    print("Done it")
    featureSet = obtain_training_set(trainingSet)
    testSet = obtain_testset(dataSet)

    #classifier = nltk.DecisionTreeClassifier.train(featureSet[:5000])
    #accuracy1 = nltk.classify.accuracy(classifier, featureSet[size:])
    #print("Accuracy of Decision Tree classifier: ", accuracy1)
    #secondClassifier = nltk.NaiveBayesClassifier.train(featureSet[:5000])
    #accuracy2 = nltk.classify.accuracy(secondClassifier, featureSet[size:])
    #print("Accuracy of Naive Bayes Classifier: ", accuracy2)
    #thirdClassifier = SklearnClassifier(KNeighborsClassifier()).train(featureSet[:5000])
    #accuracy3 = nltk.classify.accuracy(thirdClassifier, featureSet[size:])
    #print("Accuracy of K-neighbour classifier", accuracy3)
    # information = CountVectorizer(analyzer=obtain_features(trainingSet, value), lowercase=False)
    # information.fit_transform(trainingSet['Word']).toarray()
    firstClassifier = SklearnClassifier(SGDClassifier()).train(featureSet)
    # note: accuracy here is measured on the training data itself
    accuracy4 = nltk.classify.accuracy(firstClassifier, featureSet)
    print("Accuracy of linear model", accuracy4)
    solution = []
    for index in range(0, len(dataSet['Word'])):
        predictTag = str(firstClassifier.classify(testSet[index]))
        solution.append((dataSet['Word'][index], predictTag))
    return 0, 0, 0, accuracy4, solution
def randomforests(num_folds, featuresets, label_list):
    subset_size = int(len(featuresets) / num_folds)
    # overall gold labels for each instance (reference) and predicted labels (test)
    reflist = []
    testlist = []
    accuracy_list = []
    print("Random Forests Classifier")
    # iterate over the folds
    for i in range(num_folds):
        print('Start Fold', i)
        test_this_round = featuresets[i * subset_size:][:subset_size]
        train_this_round = featuresets[:i * subset_size] + featuresets[
            (i + 1) * subset_size:]
        # train using train_this_round
        classifier = SklearnClassifier(RandomForestClassifier())
        classifier.train(train_this_round)
        # evaluate against test_this_round and save accuracy
        accuracy_this_round = nltk.classify.accuracy(classifier,
                                                     test_this_round)
        print(i, accuracy_this_round)
        accuracy_list.append(accuracy_this_round)

        # add the gold labels and predicted labels for this round to the overall lists
        for (features, label) in test_this_round:
            reflist.append(label)
            testlist.append(classifier.classify(features))

    print('Done with cross-validation')
    # call the evaluation measures function
    print('mean accuracy-', sum(accuracy_list) / num_folds)
    (precision_list, recall_list) = eval_measures(reflist, testlist,
                                                  label_list)
    print_evaluation(precision_list, recall_list, label_list)
    print(" ")
Example 7
def main():

    posts = nltk.corpus.nps_chat.xml_posts()

    print(len(posts))
    print(sorted(nltk.FreqDist(p.attrib['class'] for p in posts).keys()))

    featuresets = []
    prev_ = None

    for post in posts:
        featuresets.append((dialogue_act_features(post.text,
                                                  prev_), post.get('class')))

        if post.get('class') != 'Statement':
            prev_ = post.get('class')

    # hold out the first 1% of posts for testing
    size = int(len(featuresets) * 0.01)
    train_set, test_set = featuresets[size:], featuresets[:size]

    # Linear Support vector classification
    classif = SklearnClassifier(LinearSVC())
    classif.train(train_set)

    # Logistic Regression method
    # classif = SklearnClassifier(LogisticRegression())
    # classif.train(train_set)

    dialog_Act_A = []
    print("Accuracy : ", nltk.classify.accuracy(classif, test_set) * 100)

    classAprev = None
    book = xlwt.Workbook()
    sh1 = book.add_sheet('Group A')
    index = 0
    openFile = open("output.txt", "a", encoding='utf-8')

    with open('test-inputs.txt', 'r', encoding='utf-8') as groupA:
        for text in groupA:
            class_ = classif.classify(dialogue_act_features(text, classAprev))
            if class_ != 'Statement':
                # update the previous act only for non-Statements, matching the training loop
                classAprev = class_
                if class_.find('Question') != -1:
                    class_ = "1"
                else:
                    class_ = "0"
            openFile.write(text.rstrip() + ", " + class_ + "\n")
            sh1.write(index, 0, text)
            sh1.write(index, 1, classAprev)
            index = index + 1
            dialog_Act_A.append(class_)

    book.save('QuestionAnalysis.xls')
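Example 7 assumes a dialogue_act_features extractor. A minimal sketch consistent with how it is called above (bag of tokens plus the previous act); the exact features are an assumption:

def dialogue_act_features(text, prev_class=None):
    # hypothetical extractor: lowercased token-presence features
    # plus the dialogue act of the previous non-Statement post
    features = {'contains({})'.format(w.lower()): True
                for w in nltk.word_tokenize(text)}
    features['prev_class'] = prev_class
    return features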
Example 8
def mnb_classifier(dataset):

    label_feats = label_feats_from_data(dataset, bag_of_non_stopwords)
    train_feats, test_feats = train_test_split(label_feats,
                                               train_size=0.7,
                                               test_size=0.3)
    mnb_classify = SklearnClassifier(MultinomialNB())
    mnb_classify.train(train_feats)
    # classify expects a single featureset; classify_many handles the list
    # (assuming test_feats holds (featureset, label) pairs)
    result = mnb_classify.classify_many([feats for feats, _ in test_feats])

    generate_report(result, 'bow_mnb', class_list)
def create_bnb_classifier(trainingset, testingset):
    x = 0
    y = 0
    print("\nBernoulli Naive Bayes classifier is being trained and created...")
    BNB_classifier = SklearnClassifier(BernoulliNB())
    BNB_classifier.train(trainingset)
    for t in testingset:
        y = y + 1
        l = BNB_classifier.classify(t[0])
        if (l == t[1]):
            x = x + 1
    accuracy = x / y * 100
    print("BernoulliNB accuracy percent = " + str(accuracy))
    return BNB_classifier
def create_logistic_regression_classifier(trainingset, testingset):
    x = 0
    y = 0
    print("\nLogistic Regression classifier is being trained and created...")
    LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
    LogisticRegression_classifier.train(trainingset)
    for t in testingset:
        y = y + 1
        l = LogisticRegression_classifier.classify(t[0])
        if (l == t[1]):
            x = x + 1
    accuracy = x / y * 100
    print("Logistic Regression classifier accuracy = " + str(accuracy))
    return LogisticRegression_classifier
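The counting loop in these two helpers recomputes what nltk.classify.accuracy already provides; inside either function the loop could be replaced by the following sketch:

# equivalent replacement for the manual counting loop above
accuracy = nltk.classify.accuracy(LogisticRegression_classifier, testingset) * 100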
Example 11
    def handle(self, *args, **options):
        trains = get_train_tweets()
        if not trains:
            raise CommandError('No train data, please add some from the admin page!')

        train_count = trains.count()
        train_set = generate_trainset(trains)
        nb_classifier = nltk.NaiveBayesClassifier.train(train_set)
        sci_classifier = SklearnClassifier(LinearSVC())
        sci_classifier.train(train_set)

        while True:
            unclassified_tweets = Tweet.objects.filter(train=False, klass=None)
            total_count = unclassified_tweets.count()
            if total_count > 0:
                print('Classifying %d tweets...' % total_count)
                counts_nb = defaultdict(int)
                counts_svm = defaultdict(int)
                start_time = time.time()
                for tweet in unclassified_tweets:
                    feature_vect = get_feature_vector(process_tweet(tweet.body))
                    features = extract_features(feature_vect)
                    sentiment_nb = nb_classifier.classify(features)
                    sentiment_svm = sci_classifier.classify(features)
                    counts_nb[sentiment_nb] += 1
                    counts_svm[sentiment_svm] += 1
                    tweet.klass = sentiment_nb
                    tweet.klass_svm = sentiment_svm
                    msg_nb = ['%d %s' % (counts_nb[k], v) for k, v in Tweet.CLASSES]
                    msg_svm = ['%d %s' % (counts_svm[k], v) for k, v in Tweet.CLASSES]
                    print('\rNB: ' + ', '.join(msg_nb) + ';\tSVM: ' + ', '.join(msg_svm), end='')
                    # print('\r' + ', '.join(msg_nb), end='')
                    tweet.save()
                    if settings.DEBUG:
                        db.reset_queries()
                elapsed = int(time.time() - start_time)
                print('\nClassifying finished in %d seconds.' % elapsed)

            new_trains = get_train_tweets()
            if new_trains.count() != train_count:
                print('Train set has been changed, retraining...')
                trains = new_trains
                train_count = new_trains.count()
                train_set = generate_trainset(trains)
                nb_classifier = nltk.NaiveBayesClassifier.train(train_set)
                sci_classifier = SklearnClassifier(LinearSVC())
                sci_classifier.train(train_set)
            else:
                print('Waiting...')
                time.sleep(3)
def create_mnb_classifier(trainingset, testingset):
    x = 0
    y = 0
    print(
        "\nMultinomial Naive Bayes classifier is being trained and created...")
    MNB_classifier = SklearnClassifier(MultinomialNB())
    MNB_classifier.train(trainingset)
    for t in testingset:
        y = y + 1
        l = MNB_classifier.classify(t[0])
        if (l == t[1]):
            x = x + 1
    accuracy = x / y * 100
    print("MultinomialNB Classifier accuracy = " + str(accuracy))
    return MNB_classifier
Example 13
class Bernoulli:
    def __init__(self):
        self.classifier = None
        self.word_features = None

    def train(self, listaTweets, listaTweets2):
        selected_tweets = listaTweets
        rejected_tweets = listaTweets2
        self.word_features = self.features(selected_tweets, rejected_tweets)
        training_set = self.get_training_set(selected_tweets, rejected_tweets)
        self.classifier = SklearnClassifier(BernoulliNB())
        self.classifier.train(training_set)

    def features(self, selected_tweets, rejected_tweets):
        selected_tweets = np.array(selected_tweets, dtype=object)
        selected_tweets = np.hstack(selected_tweets.flat)
        rejected_tweets = np.array(rejected_tweets, dtype=object)
        rejected_tweets = np.hstack(rejected_tweets.flat)
        wordlist1 = nltk.FreqDist(selected_tweets)
        wordlist2 = nltk.FreqDist(rejected_tweets)
        word_features1, v = zip(*wordlist1.most_common())
        word_features2, g = zip(*wordlist2.most_common())
        return word_features1 + word_features2

    def extract_features(self, tweet):
        if self.word_features is not None:
            tweet_words = set(tweet)
            features = {}
            for word in self.word_features:
                features['contains(%s)' % word] = (word in tweet_words)
            return features
        else:
            print("Bernoulli  must be trained before classifying")
            sys.exit(1)

    def get_training_set(self, selected_tweets, rejected_tweets):
        training_set = []
        for tweet in selected_tweets:
            training_set.append((self.extract_features(tweet), "selected"))

        for tweet in rejected_tweets:
            training_set.append((self.extract_features(tweet), "rejected"))
        return training_set

    def classify(self, inputs):
        if self.classifier is not None:
            return self.classifier.classify(self.extract_features(inputs))
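A possible usage of the Bernoulli wrapper, with toy token lists standing in for real tweets (illustrative data only):

model = Bernoulli()
# each argument is a list of tokenized tweets
model.train([['great', 'phone'], ['love', 'it']],
            [['terrible', 'battery'], ['hate', 'it']])
print(model.classify(['great', 'battery']))  # 'selected' or 'rejected'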
    def train_Classifier(self, posfeats, negfeats, index):
        """The training set percentage should be passed as an argument.
        """

        # divide dataset into train and validation sets
        posCutoff = int(math.floor(len(posfeats) * 7 / 10))
        negCutoff = int(math.floor(len(negfeats) * 7 / 10))
        trainFeatures = posfeats[:posCutoff] + negfeats[:negCutoff]
        testFeatures = posfeats[posCutoff:] + negfeats[negCutoff:]

        referenceSets = collections.defaultdict(set)
        testSets = collections.defaultdict(set)

        classifiername = ''

        if (index == 0):
            classifier = nltk.classify.maxent.MaxentClassifier.train(
                trainFeatures,
                'GIS',
                trace=3,
                encoding=None,
                labels=None,
                gaussian_prior_sigma=0,
                max_iter=5)
            classifiername = 'Maximum Entropy'
        elif (index == 1):
            classifier = SklearnClassifier(BernoulliNB())
            classifier.train(trainFeatures)
            classifiername = 'Bernoulli Naive Bayes'
        else:
            classifier = SklearnClassifier(LogisticRegression())
            classifier.train(trainFeatures)
            classifiername = 'LogisticRegression'

        for i, (features, label) in enumerate(testFeatures):
            referenceSets[label].add(i)
            predicted = classifier.classify(features)
            testSets[predicted].add(i)
        #
        # print 'train on %d instances, test on %d instances' % (len(trainFeatures), len(testFeatures))
        # print 'accuracy:', nltk.classify.util.accuracy(classifier, testFeatures)
        # print 'pos precision:', nltk.metrics.precision(referenceSets['pos'], testSets['pos'])
        # print 'pos recall:', nltk.metrics.recall(referenceSets['pos'], testSets['pos'])
        # print 'neg precision:', nltk.metrics.precision(referenceSets['neg'], testSets['neg'])
        # print 'neg recall:', nltk.metrics.recall(referenceSets['neg'], testSets['neg'])
        #classifier.show_most_informative_features(10)
        return classifier
def logistic_classifier(file):
    file = str(file)
    logistic_model = SklearnClassifier(LogisticRegression())

    # train the model on the training data
    logistic_model.train(train_data)
    accuracy = nltk.classify.accuracy(logistic_model, test_data) * 100
    print("Logistic Regression Classifier Accuracy: {}".format(accuracy))

    # Tag the test file.
    with open(file, 'r') as fin:
        for test_sentence in fin:
            # Tokenize the line.
            doc = word_tokenize(test_sentence.lower())
            featurized_doc = {i: (i in doc) for i in word_features}
            tagged_label = logistic_model.classify(featurized_doc)
            results.write(str(tagged_label) + '\n')
def naive_classifier(file):
    file = str(file)
    naive_bayes_model = SklearnClassifier(MultinomialNB())

    # train the model on the training data
    naive_bayes_model.train(train_data)

    accuracy = nltk.classify.accuracy(naive_bayes_model, test_data) * 100
    print("Naive Bayes Classifier Accuracy: {}".format(accuracy))

    # Tag the test file.
    with open(file, 'r') as fin:
        for test_sentence in fin:
            # Tokenize the line.
            doc = word_tokenize(test_sentence.lower())
            featurized_doc = {i: (i in doc) for i in word_features}
            tagged_label = naive_bayes_model.classify(featurized_doc)
            results.write(str(tagged_label) + '\n')
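Both taggers lean on module-level state defined elsewhere: train_data, test_data, word_features, and an open results file handle. A minimal setup sketch (names come from the usage above; the toy values are assumptions):

word_features = ['good', 'bad', 'plot']   # vocabulary used to featurize lines
train_data = [({'good': True, 'bad': False, 'plot': False}, 'pos'),
              ({'good': False, 'bad': True, 'plot': True}, 'neg')]
test_data = list(train_data)
results = open('results.txt', 'w')        # both taggers write labels here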
    def ImplementBNB(self):

        print("~~~~~~~~~~~~~~~ BernoulliNB Classifier ~~~~~~~~~~~~~~~\n")
        #         classifier = NaiveBayesClassifier.train(trainFeatures)
        classifier = SklearnClassifier(BernoulliNB())
        classifier.train(trainFeatures)

        print("BernoulliNB Classifier Training Completed")

        #initiates referenceSets and testSets
        referenceSets = collections.defaultdict(set)
        testSets = collections.defaultdict(set)
        expected_array = []
        predicted_array = []

        #puts correctly labeled sentences in referenceSets and the predictively labeled version in testsets
        for i, (features, label) in enumerate(testFeatures):
            referenceSets[label].add(i)
            expected_array.append(label)
            predicted = classifier.classify(features)
            predicted_array.append(predicted)
            testSets[predicted].add(i)

        #prints metrics to show how well the feature selection did
        print("BernoulliNB Classifier Test Results ")
        print("")
        print("Length of Training Features" + str(len(trainFeatures)))
        print("Length of Test Features" + str(len(testFeatures)))
        print('Accuracy:' +
              str(nltk.classify.util.accuracy(classifier, testFeatures)))
        print('Positive precision:',
              str(precision(referenceSets['Positive'], testSets['Positive'])))
        print('Positive recall:',
              str(recall(referenceSets['Positive'], testSets['Positive'])))
        print('Negative precision:',
              str(precision(referenceSets['Negative'], testSets['Negative'])))
        print('Negative recall:',
              str(recall(referenceSets['Negative'], testSets['Negative'])))
        print("~~~~~~~~~~~~~~~Classification report~~~~~~~~~~~~~~~\n",
              classification_report(expected_array, predicted_array))
        print("~~~~~~~~~~~~~~~Confusion matrix~~~~~~~~~~~~~~~\n",
              confusion_matrix(expected_array, predicted_array))
        print("")
Example 18
def multinomial_bayes_nltk_wrapper(corpus, documents_training, documents_test, words_features, smoothing, kbest):
    """
    Multinomial Naive Bayes Algorithm using wrapper NLTK SklearnClassifier
    Memory problems can occur with very large datasets
    :param corpus:
    :param documents_training:
    :param documents_test:
    :param words_features:
    :param smoothing:
    :param kbest:
    :return:
    """

    print()
    print("----- Multinomial Naive Bayes with NLTK wrapper ------")
    print("Creating Training Feature Vectors...")
    array_features_training = []
    for (id, original_category, annotations) in documents_training:
        array_features_training.append((util_classify.transform_document_in_dict(annotations, words_features, corpus), original_category))
    # array_features_training = apply_features(extract_document_features,documents_training)
    print "Training algorithm..."
    # ('chi2', SelectKBest(chi2, k=3000)),
    if kbest == 0:
        kbest = "all"
    pipeline = Pipeline([('chi2', SelectKBest(chi2, k=kbest)), ('tfidf', TfidfTransformer()),
                         ('nb', MultinomialNB(alpha=smoothing))])

    # pipeline = Pipeline([('nb', MultinomialNB(alpha=smoothing))])

    classifier = SklearnClassifier(pipeline)
    classifier.train(array_features_training)

    print "Calculating metrics ..."
    categories = util_classify.get_categories(corpus)
    estimated_categories = []
    original_categories = []               

    for (id, cat_original, annotations) in documents_test:
        cat_estimated = classifier.classify(util_classify.transform_document_in_dict(annotations, words_features, corpus))
        estimated_categories.append(categories.index(cat_estimated))
        original_categories.append(categories.index(cat_original))
    return original_categories, estimated_categories
Example 19
class Classifier:
    """The Classifier"""

    #############################################
    def train(self, trainfile):
        """Trains the classifier model on the training set stored in file trainfile"""
        train = pd.read_csv(trainfile,
                            delimiter='\t',
                            names=[
                                'polarity_label', 'aspect_category', 'term',
                                'char_term_offset', 'sentence'
                            ])
        train, feat_list = preprocessor(train)
        feat_set = nltk_compatible(train, feat_list)
        split = int(len(feat_set) * 0.75)
        feat_train = feat_set[:split]
        feat_test = feat_set[split:]
        #self.main_classifier = SklearnClassifier(RandomForestClassifier())
        #self.main_classifier = SklearnClassifier(MultinomialNB())
        #self.main_classifier = SklearnClassifier(BernoulliNB())
        #self.main_classifier = SklearnClassifier(LogisticRegression())
        self.main_classifier = SklearnClassifier(svm.LinearSVC())
        self.main_classifier.train(feat_train)

    def predict(self, datafile):
        """Predicts class labels for the input instances in file 'datafile'
        Returns the list of predicted labels
        """
        devdata = pd.read_csv(datafile,
                              delimiter='\t',
                              names=[
                                  'polarity_label', 'aspect_category', 'term',
                                  'char_term_offset', 'sentence'
                              ])
        devdata, test_feat = preprocessor(devdata)
        test_set = nltk_compatible(devdata, test_feat)
        labels = []
        for (sentence, label) in test_set:
            predict = self.main_classifier.classify(sentence)
            labels.append(predict)
        return labels
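A possible end-to-end run of this Classifier, assuming tab-separated train/dev files in the five-column format read above (the paths are hypothetical):

clf = Classifier()
clf.train('traindata.csv')
labels = clf.predict('devdata.csv')
print(labels[:10])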
Example 20
def linear_support_vector_machines_tf_idf(corpus, documents_training, documents_test, words_features, kbest):
    """
    Linear Support Vector Machines Algorithm. The Support Vector Machines algorithm with a linear kernel and using TF/IDF
    :param corpus:
    :param documents_training:
    :param documents_test:
    :param words_features:
    :param kbest:
    :return:
    """

    print()
    print("----- Linear Support Vector Machines with TF-IDF algorithm ------")
    print("Creating Training Feature Vectors...")
    categories = util_classify.get_categories(corpus)
    array_features_training = []

    for (id, original_category, annotations) in documents_training:
        array_features_training.append((util_classify.transform_document_in_dict(annotations, words_features, corpus), original_category))

    print "Training algorithm..."

    if kbest == 0:
        kbest = "all"

    pipeline = Pipeline([('chi2', SelectKBest(chi2, k=kbest)), ('tfidf', TfidfTransformer()),
                         ('svc', LinearSVC())])

    classifier = SklearnClassifier(pipeline)
    classifier.train(array_features_training)

    print "Calculating metrics..."
    estimated_categories = []
    original_categories = []

    for (id, cat_original, annotations) in documents_test:
        cat_estimated = classifier.classify(util_classify.transform_document_in_dict(annotations, words_features, corpus))
        estimated_categories.append(categories.index(cat_estimated))
        original_categories.append(categories.index(cat_original))
    return original_categories, estimated_categories
def train_Classifier(posfeats,negfeats,index):
    # divide dataset into train and validation sets
    posCutoff = int(math.floor(len(posfeats)*7/10))
    negCutoff = int(math.floor(len(negfeats)*7/10))
    trainFeatures = posfeats[:posCutoff] + negfeats[:negCutoff]
    testFeatures = posfeats[posCutoff:] + negfeats[negCutoff:]

    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)

    classifiername = ''

    if (index == 0):
        classifier = nltk.classify.maxent.MaxentClassifier.train(trainFeatures, 'GIS', trace=3, encoding=None, labels=None, gaussian_prior_sigma=0, max_iter=5)
        classifiername = 'Maximum Entropy'
    elif (index == 1):
        classifier = SklearnClassifier(BernoulliNB())
        classifier.train(trainFeatures)
        classifiername = 'Bernoulli Naive Bayes'
    else:
        classifier = SklearnClassifier(LogisticRegression())
        classifier.train(trainFeatures)
        classifiername = 'LogisticRegression'

    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)

    print('train on %d instances, test on %d instances' % (len(trainFeatures), len(testFeatures)))
    print('accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
    print('pos precision:', nltk.metrics.precision(referenceSets['pos'], testSets['pos']))
    print('pos recall:', nltk.metrics.recall(referenceSets['pos'], testSets['pos']))
    print('neg precision:', nltk.metrics.precision(referenceSets['neg'], testSets['neg']))
    print('neg recall:', nltk.metrics.recall(referenceSets['neg'], testSets['neg']))
    #classifier.show_most_informative_features(10)
    return classifier
Example 22
def main():
    features = load_features()
    
    train_set = features[9500:]
    test_set = features[:1406]
        
    test = processTweet("Just started using @zoho email client on #ios and must admit that it's much better than @gmail from @Google.Better #UI, #UX and faster sync")
    test2 = processTweet("What the hell, @firefox and @Apple? Implement damn date/time inputs. Chrome has supported for 5 years, Opera for 8.\u2026 https://t.co/ZiyAQH8sBt")
    test3 = processTweet("Lovely @google celebration of Iraqi architect Zaha Hadid today https://t.co/FrsJUt3RF5 via @/google.com/doodles")
    test4 = processTweet("#Apple pay usage peaked in March 2015. Adoption rate is declining. One of the major concerns: security (despite Apple Pay being very secure)")
    
    global word_features
    word_features = get_word_features(get_words_in_tweets(train_set))
    training_set = nltk.classify.apply_features(extract_features, train_set)
    classifier = SklearnClassifier(MultinomialNB())
    classifier.train(training_set)
    # print(classifier.show_most_informative_features(40))
    print(classifier.classify(extract_features(test)))
    print(classifier.classify(extract_features(test2)))
    print(classifier.classify(extract_features(test3)))
    print(classifier.classify(extract_features(test4)))
    testing_set = nltk.classify.apply_features(extract_features, test_set)
    print("MNB_classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)
Example 23
class Swinger(object):
    """docstring for Swinger"""
    BASEDIR = os.path.dirname(__file__)
    classifier_table = {
        'SVC': SVC(probability=False),
        'LinearSVC': LinearSVC(),
        'NuSVC': NuSVC(probability=False),
        'MultinomialNB': MultinomialNB(),
        'BernoulliNB': BernoulliNB(),
        'LogisticRegression': LogisticRegression()
    }

    def __init__(self):
        self.train = []
        self.test = []
        self.classifier = ''

    def load(self,
             model,
             useDefault=True,
             pos=None,
             neg=None,
             BestFeatureVec=700):
        BestFeatureVec = int(BestFeatureVec)

        if useDefault:
            print('load default bestMainFeatures')
            self.bestMainFeatures = pickle.load(
                open(
                    os.path.join(
                        self.BASEDIR,
                        'bestMainFeatures.pickle.{}'.format(BestFeatureVec)),
                    'rb'))
            print('load default bestMainFeatures success!!')

            self.classifier = pickle.load(
                open(
                    os.path.join(self.BASEDIR,
                                 '{}.pickle.{}'.format(model, BestFeatureVec)),
                    'rb'))
            print("load model from {}".format(model))
        else:
            try:
                print('load local bestMainFeatures')
                self.bestMainFeatures = pickle.load(
                    open('bestMainFeatures.pickle.{}'.format(BestFeatureVec),
                         'rb'))
                print('load local bestMainFeatures success!!')

                self.classifier = pickle.load(
                    open('{}.pickle.{}'.format(model, BestFeatureVec), 'rb'))
                print("load model from {}".format(model))
            except Exception as e:
                # build best features.
                print(
                    'load bestMainFeatures failed!!\nstart creating bestMainFeatures ...'
                )

                self.pos_origin = json.load(open(pos, 'r'))
                self.neg_origin = json.load(open(neg, 'r'))
                shuffle(self.pos_origin)
                shuffle(self.neg_origin)
                poslen = len(self.pos_origin)
                neglen = len(self.neg_origin)

                # build train and test data.
                self.pos_review = self.pos_origin[:int(poslen * 0.9)]
                self.pos_test = self.pos_origin[int(poslen * 0.9):]
                self.neg_review = self.neg_origin[:int(neglen * 0.9)]
                self.neg_test = self.neg_origin[int(neglen * 0.9):]

                self.bestMainFeatures = create_Mainfeatures(
                    pos_data=self.pos_review,
                    neg_data=self.neg_review,
                    BestFeatureVec=BestFeatureVec)  # use words and word-bigram collocations as features

                # build model
                print('start building {} model!!!'.format(model))

                self.classifier = SklearnClassifier(
                    self.classifier_table[model])  # NLTK wrapper around the scikit-learn estimator
                if len(self.train) == 0:
                    print('build training data')
                    posFeatures = self.emotion_features(
                        self.best_Mainfeatures, self.pos_review, 'pos')
                    negFeatures = self.emotion_features(
                        self.best_Mainfeatures, self.neg_review, 'neg')
                    self.train = posFeatures + negFeatures
                self.classifier.train(self.train)  # train the classifier
                pickle.dump(
                    self.classifier,
                    open('{}.pickle.{}'.format(model, BestFeatureVec), 'wb'))

    def buildTestData(self, pos_test, neg_test):
        pos_test = json.load(open(pos_test, 'r'))
        neg_test = json.load(open(neg_test, 'r'))
        posFeatures = self.emotion_features(self.best_Mainfeatures, pos_test,
                                            'pos')
        negFeatures = self.emotion_features(self.best_Mainfeatures, neg_test,
                                            'neg')
        return posFeatures + negFeatures

    def best_Mainfeatures(self, word_list):
        return {
            word: True
            for word in word_list if word in self.bestMainFeatures
        }

    def score(self, pos_test, neg_test):
        from sklearn.metrics import precision_recall_curve
        from sklearn.metrics import roc_curve
        from sklearn.metrics import auc
        # build test data set
        if len(self.test) == 0:
            # self.test = self.buildTestData(self.pos_test, self.neg_test)
            self.test = self.buildTestData(pos_test, neg_test)

        test, test_tag = zip(*self.test)
        pred = list(
            map(lambda x: 1 if x == 'pos' else 0,
                self.classifier.classify_many(test)))  # classify the test set and collect the predicted labels
        tag = list(map(lambda x: 1 if x == 'pos' else 0, test_tag))
        # ROC AUC
        fpr, tpr, _ = roc_curve(tag, pred, pos_label=1)
        print("ROC AUC: %.2f" % auc(fpr, tpr))
        return auc(fpr, tpr)

    def emotion_features(self, feature_extraction_method, data, emo):
        return list(map(lambda x: [feature_extraction_method(x), emo],
                        data))  # attach the emotion label (e.g. "pos") to each featureset

    def swing(self, sentence):
        sentence = self.best_Mainfeatures(CutAndrmStopWords(sentence))
        return self.classifier.classify(sentence)

    def swingList(self, sentenceList):
        sentence = self.best_Mainfeatures(sentenceList)
        return self.classifier.classify(sentence)
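A possible Swinger session, assuming the pre-trained pickles shipped with the package are available (the model name and feature-vector size follow the defaults above):

sw = Swinger()
sw.load('LogisticRegression', useDefault=True, BestFeatureVec=700)
print(sw.swing('這部電影真的很好看'))  # expected 'pos' or 'neg'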
Example 24
def SingleFold(train_group, k=8):
    """Do a single fold of different classifiers

    For classifiers, I've written my own NaiveBayes classifier and also considered
    several classifiers available in nltk and sklearn, namely
    ['Maximum Entropy', 'DecisionTree', 'BernoulliNB', 'LogisticRegression', 'SVC', 'LinearSVC', 'NuSVC'].
    The goal is to compare the performance of these classifiers and output their accuracy, precision, recall, and F1.

    Args:
        train_group: The original training set, containing all the news related to the stock and its label.
            For example:
            ([[title1],[content1],[title2],[content2],...],'+1')
        k: Title's weight

    Returns:
        It doesn't return anything; instead it prints the result for each classifier, for example:
            ---------------------------------------
            SINGLE FOLD RESULT (NaiveBayes)
            ---------------------------------------
            accuracy: 0.6479463537300922
            precision 0.6505853139411139
            recall 0.965771458662454
            f-measure 0.7774480712166171
    """
    print('Preparing...')
    random.shuffle(train_group)
    cutoff = int(math.floor(len(train_group) * 3 / 4))
    train_set, test_set = PrepareSets(train_group[cutoff:],
                                      train_group[:cutoff], k)

    classifier_list = [
        'NaiveBayes', 'BernoulliNB', 'LogisticRegression', 'SVC', 'LinearSVC',
        'NuSVC'
    ]  # 'Maximum Entropy', 'DecisionTree'
    for cl in classifier_list:
        if cl == 'NaiveBayes':
            print('Training...')
            classifier = nltk.NaiveBayesClassifier.train(train_set)
        # elif cl == 'Maximum Entropy':
        #     print('Training...')
        #     classifier = nltk.MaxentClassifier.train(train_set, 'GIS', trace=0)
        elif cl == 'BernoulliNB':
            classifier = SklearnClassifier(BernoulliNB())
            print('Training...')
            classifier.train(train_set)
        elif cl == 'LogisticRegression':
            classifier = SklearnClassifier(LogisticRegression())
            print('Training...')
            classifier.train(train_set)
        elif cl == 'SVC':
            # note: this branch also trains a LinearSVC; swap in SVC() for a kernel SVM
            classifier = SklearnClassifier(LinearSVC())
            print('Training...')
            classifier.train(train_set)
        elif cl == 'LinearSVC':
            classifier = SklearnClassifier(LinearSVC())
            print('Training...')
            classifier.train(train_set)
        else:
            classifier = SklearnClassifier(NuSVC())
            print('Training...')
            classifier.train(train_set)
        # else:
        #     print('Training...')
        #     classifier = nltk.DecisionTreeClassifier.train(train_set)

        # print(classifier.show_most_informative_features(10))

        print('Testing...')
        TP = 0
        FN = 0
        FP = 0
        TN = 0
        for i, (feats, label) in enumerate(test_set):
            observed = classifier.classify(feats)
            if label == '+1' and observed == '+1':
                TP += 1
            elif label == '-1' and observed == '+1':
                FP += 1
            elif label == '+1' and observed == '-1':
                FN += 1
            elif label == '-1' and observed == '-1':
                TN += 1

        accuracy = (TP + TN) / len(test_set)
        recall = TP / (TP + FN)
        precision = TP / (TP + FP)
        F1 = 2 * precision * recall / (precision + recall)

        pickle.dump(classifier, open('./' + cl + '.pkl', 'wb'))

        print('')
        print('---------------------------------------')
        print('SINGLE FOLD RESULT ' + '(' + cl + ')')
        print('---------------------------------------')
        print('accuracy:', accuracy)
        print('precision', precision)
        print('recall', recall)
        print('f-measure', F1)
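SingleFold pickles every trained classifier; a sketch of reloading one for later predictions (the file name follows the dump call above, the featureset is illustrative):

import pickle

with open('./BernoulliNB.pkl', 'rb') as f:
    clf = pickle.load(f)
print(clf.classify({'profit': True, 'rally': True}))  # '+1' or '-1'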
Example 25
classifier.train(train_set)

# classifier = NaiveBayesClassifier.train(train_set)
# print ('accuracy:', nltk.classify.util.accuracy(classifier, test_set))
# classifier.show_most_informative_features()

neg = 0
pos = 0
# sentence = "I feel terrible today."
sentence = sentence.lower()
print(sentence)
words = nltk.word_tokenize(sentence)

print(words)
for word in words:
    classResult = classifier.classify(word_feats(word))
    print(word_feats(word))
    print(classResult)
    if classResult == 'neg':
        neg = neg + 1
    if classResult == 'pos':
        pos = pos + 1

print('Positive: ' + str(float(pos) / len(words)))
print('Negative: ' + str(float(neg) / len(words)))
print('Pos', pos)
print('Neg', neg)

# print("BernoulliNB's accuracy is %f" % score(BernoulliNB()))
# print("MultinomialNB's accuracy is %f" % score(MultinomialNB()))
# print("LogisticRegression's accuracy is %f" % score(LogisticRegression()))
Example 26
def linear_support_vector_machines_cross_language_tf_idf(corpus_training, corpus_test, documents_training, documents_test, words_features, kbest):
    """
    Cross Language linear Support Vector Machines algorithm. The Support Vector Machines algorithm with a linear kernel.
    An implementation of linear SVM to conduct cross-language experiments.
    :param corpus_training:
    :param corpus_test:
    :param documents_training:
    :param documents_test:
    :param words_features:
    :param kbest:
    :return:
    """

    print()
    print("----- Cross-Language Support Vector Machines algorithm ------")
    print("Creating Training Vectors...")
    categories = util_classify.get_categories(corpus_training)
    ids_documents_test = []
    original_cats = []
    array_cats_names = []
    array_features_training = []
    array_vector_training = []
    array_categories = []

    for (id, original_category, annotations) in documents_training:
        array_features_training.append((util_classify.transform_document_in_dict(annotations, words_features, corpus_training), original_category))
        array_categories.append(util_classify.get_categories(corpus_training).index(original_category))

    for x in array_categories:
        array_cats_names.append(categories[x])

    print "Training algorithm..."

    if kbest == 0:
        kbest = "all"

    pipeline = Pipeline([('chi2', SelectKBest(chi2, k=kbest)), ('tfidf', TfidfTransformer()), ('svc', LinearSVC())])

    classifier = SklearnClassifier(pipeline)
    classifier.train(array_features_training)

    print "Calculating metrics..."
    estimated_categories = []
    original_categories = []

    categories = util_classify.get_categories(corpus_test)

    for (id, cat_original, annotations) in documents_test:
        cat_estimated = classifier.classify(util_classify.transform_document_in_dict(annotations, words_features, corpus_test))
        estimated_categories.append(categories.index(cat_estimated))
        original_categories.append(categories.index(cat_original))

    '''
    categories_names = util_classify.get_categories(corpus_test)

    array_cats_names = []
    for x in estimated_categories:
        array_cats_names.append(categories_names[x])

    # Storage process predicted categories in DB
    util_classify.set_database_session(corpus_test)
    for document in Session.query(Document):
        if document.id in ids_documents_test:
            pos = ids_documents_test.index(document.id)
            document.classified_in_category = array_cats_names[pos]
    Session.commit()
    # End storage process predicted categories in DB
    '''

    return original_categories, estimated_categories
Example 27
testing_set = nltk.classify.apply_features(extract_features, test_tweets)

for (tweet, sentiment) in test_tweets:
    print(classifier.classify(extract_features(tweet)))

print(nltk.classify.accuracy(classifier, testing_set))

classifier.show_most_informative_features(5)



"""
pipeline = Pipeline([('tfidf', TfidfTransformer()),
                      ('chi2', SelectKBest(chi2, k='all')),
                      ('nb', MultinomialNB())])
"""
pipeline = Pipeline([('tfidf', TfidfTransformer()),
                      ('chi2', SelectKBest(chi2, k='all')),
                      ('nb', MultinomialNB())])

classif = SklearnClassifier(pipeline)

classif.train(training_set)

print(classif.labels())
for (tweet, sentiment) in test_tweets:
    print(classif.classify(extract_features(tweet)))

print(nltk.classify.accuracy(classif, testing_set))
class YoutubeVideoClassifier(Utility):
    """ Use the collected data as training set and classify test data"""

    def __init__(self):
        Utility.__init__(self)
        self.nb_output_file_name = self.config.get("GLOBAL", "nb_output_file")
        self.svm_output_file_name = self.config.get("GLOBAL", "svm_output_file")
        self.nb_output = os.path.join(self.output_dir, self.nb_output_file_name)
        self.svm_output = os.path.join(self.output_dir, self.svm_output_file_name)

        self.train_features = []
        self.stopwords_set = set(stopwords.words("english"))

    def run_main(self):
        self.pre_processing()
        self.feature_extraction()
        self.classification()
        self.testing()

    def pre_processing(self):
        self.load_data()

    def load_data(self):
        self.load_movies()
        self.load_actors()
        self.load_tvshows()
        self.load_test_data()

    def load_movies(self):
        self.movies_list = []
        movies_fd = codecs.open(self.movies_file)

        for movie in movies_fd.readlines():
            if not movie:
                continue
            self.movies_list.append(movie)
        movies_fd.close()

    def load_actors(self):
        self.actors_list = []
        actors_fd = codecs.open(self.actors_file)

        for actor in actors_fd.readlines():
            if not actor:
                continue
            self.actors_list.append(actor)
        actors_fd.close()

    def load_tvshows(self):
        self.tvshows_list = []
        tvshows_fd = codecs.open(self.tvshows_file)

        for tvshow in tvshows_fd.readlines():
            if not tvshow:
                continue
            self.tvshows_list.append(tvshow)
        tvshows_fd.close()

    def load_test_data(self):
        json_data = open(self.test_file)
        self.test_data = json.load(json_data)

    def feature_selection(self, features_list):
        selected_features = []

        for feat in features_list:
            if feat and feat.strip() and feat.lower() not in self.stopwords_set:
                selected_features.append((feat.strip().lower(), True))
        return dict(selected_features)

    def feature_extraction(self):
        for item in self.tvshows_list:
            if not item:
                continue
            selected_features = self.feature_selection(item.replace("_", " ").split(" "))
            self.train_features.append((selected_features, "tvshow"))

        for item in self.movies_list:
            if not item:
                continue
            selected_features = self.feature_selection(item.replace("_", " ").split(" "))
            self.train_features.append((selected_features, "movie"))

        for item in self.actors_list:
            if not item:
                continue
            selected_features = self.feature_selection(item.replace("_", " ").split(" "))
            self.train_features.append((selected_features, "celebrity"))

    def classification(self):

        # Training NB Classifier
        self.nb_classifier = NaiveBayesClassifier.train(self.train_features)

        # Training SVM classifier
        self.svm_classifier = SklearnClassifier(LinearSVC())
        self.svm_classifier.train(self.train_features)

    def testing(self):
        nb_fd = codecs.open(self.nb_output, "w", "utf-8")
        svm_fd = codecs.open(self.svm_output, "w", "utf-8")

        for instance in self.test_data:
            try:
                if not instance:
                    continue
                test_features = instance.get("title").split(" ")
                test_features.extend(instance.get("description").split(" "))
                selected_features = self.feature_selection(test_features)

                label = self.nb_classifier.classify(selected_features)
                nb_fd.write("%s\n" % (label))

                label = self.svm_classifier.classify(selected_features)
                svm_fd.write("%s\n" % (label))
            except Exception:
                logging.info("Exception in test data")
                continue

        nb_fd.close()
        svm_fd.close()
neg_train, neg_test = neg_tweets[:negcutoff], neg_tweets[negcutoff:]

neg_feats_train = get_train_features_from_tweets(neg_train, 'neg')
pos_feats_train = get_train_features_from_tweets(pos_train, 'pos')

train_feats = neg_feats_train + pos_feats_train

svm_classifier = SklearnClassifier(LinearSVC())
svm_classifier.train(train_feats)

# Evaluation
correct, wrong = 0, 0

for tweet in neg_test:
    features = get_features_from_tweet(tweet)
    result = svm_classifier.classify(features)
    if result == "neg":
        correct += 1
    else:
        wrong += 1


for tweet in pos_test:
    features = get_features_from_tweet(tweet)
    result = svm_classifier.classify(features)
    if result == "pos":
        correct += 1
    else:
        wrong += 1

print "Accuracy: {}".format(correct / float(correct + wrong))
Example 30
def AccuracyByClassifier(classifier_model, pos_wordlist, neg_wordlist, mode='normal', best_topwords=list()):
	is_network = False
	# pick the underlying estimator
	if classifier_model in ('svm', 'SVM'):
		# linear-kernel SVM
		classifier_model = LinearSVC()
	elif classifier_model in ('mb', 'MB'):
		# multinomial naive Bayes
		classifier_model = MultinomialNB()
	elif classifier_model in ('bb', 'BB'):
		# Bernoulli naive Bayes
		classifier_model = BernoulliNB()
	elif classifier_model in ('dt', 'DT'):
		# decision tree
		classifier_model = DecisionTreeClassifier(criterion='entropy')
	# elif classifier_model in ('gbdt', 'GBDT'):
	#	# gradient-boosted decision trees (GBDT)
	# 	classifier_model = GradientBoostingClassifier()
	elif classifier_model in ('nn', 'NN'):
		# neural network
		is_network = True
		# [len(best_topwords), 30, 2], 30, 5, 0.3 -> 82.6% accuracy
		# Network arguments: neurons per layer, epochs, mini-batch size, learning rate
		classifier_model = Network([len(best_topwords), 30, 2], 30, 10, 0.3, best_topwords)
	else:
		# default to logistic regression
		classifier_model = LogisticRegression()

	classifier = SklearnClassifier(classifier_model) if not is_network else classifier_model
	tp = fp = tn = fn = 0
	if len(mode) == 2 and mode[0]=='k-cross':
		knum = int(mode[1])
		all_wordlist = pos_wordlist + neg_wordlist
		shuffle(all_wordlist)
		precision = recall = F_measure = accuracy = 0.0
		real = list()
		pred = list()
		for i in range(knum):
			piece_len = int(len(all_wordlist)/knum)
			train_set = all_wordlist[:piece_len*i] + all_wordlist[piece_len*(i+1):]
			test_set = all_wordlist[piece_len*i:piece_len*(i+1)]
			classifier.train(train_set)
			for each in test_set:
				pre = classifier.classify(each[0])
				real.append(int(each[1]))
				pred.append(int(pre))
				if int(each[1]) == int(pre) and int(each[1]) == 1:
					tp += 1
				elif int(each[1]) == int(pre) and int(each[1]) == 0:
					tn += 1
				elif int(each[1]) != int(pre) and int(each[1]) == 1:
					fn += 1
				elif int(each[1]) != int(pre) and int(each[1]) == 0:
					fp += 1
			cur_precision = float(tp)/(tp+fp)
			precision += cur_precision
			cur_recall = float(tp)/(tp+fn)
			recall += cur_recall
			F_measure += 2.0/((1/cur_precision) + (1/cur_recall))
			accuracy += float(tp + tn) / (tp + fp + tn + fn)

		# DrawPrecisionRecallCurve(real, pred)

		return (precision/knum, recall/knum, F_measure/knum, accuracy/knum)
	elif mode == 'normal':
		# split into training and test sets; the original tested mode[0] == 'normal',
		# which never matches the default argument 'normal' (mode[0] is just 'n')
		pos_len = len(pos_wordlist)
		neg_len = len(neg_wordlist)
		# shuffle the datasets
		shuffle(pos_wordlist)
		shuffle(neg_wordlist)
		train_set = pos_wordlist[:int(0.7*pos_len)] + neg_wordlist[:int(0.7*neg_len)]
		# devtest_set = pos_wordlist[int(0.6*pos_len):int(0.7*pos_len)] + neg_wordlist[int(0.6*neg_len):int(0.7*neg_len)]
		test_set = pos_wordlist[int(0.7*pos_len):] + neg_wordlist[int(0.7*neg_len):]
		classifier.train(train_set)
		real = list()
		pred = list()
		for each in test_set:
			pre = classifier.classify(each[0])
			real.append(int(each[1]))
			pred.append(int(pre))
			if int(each[1]) == int(pre) and int(each[1]) == 1:
				tp += 1
			elif int(each[1]) == int(pre) and int(each[1]) == 0:
				tn += 1
			elif int(each[1]) != int(pre) and int(each[1]) == 1:
				fn += 1
			elif int(each[1]) != int(pre) and int(each[1]) == 0:
				fp += 1
		precision = float(tp)/(tp+fp)
		recall = float(tp)/(tp+fn)
		F_measure = 2.0/((1/precision) + (1/recall))
		accuracy = float(tp + tn) / (tp + fp + tn + fn)
		# print tp, fp, tn, fn
		# DrawPrecisionRecallCurve(real, pred)
		return (precision, recall, F_measure, accuracy, classifier)
	else:
		return (0, 0, 0, 0, 0)
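A possible call for each of the two mode branches above; pos_feats and neg_feats are hypothetical lists of (featureset, label) pairs with 0/1 labels:

# 5-fold cross-validation with a linear SVM
p, r, f1, acc = AccuracyByClassifier('svm', pos_feats, neg_feats,
                                     mode=('k-cross', 5))

# single 70/30 split with the default logistic regression
p, r, f1, acc, clf = AccuracyByClassifier('lr', pos_feats, neg_feats)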
Example 31
	print "creating feature sets..."
	tweetlist = tweetTest.loadTwitterCSV('trainingandtestdata/testdata.csv')
	labeld_features = label_feats_from_tweets(tweetlist)
	#labeld_features = label_feats_from_corpus(movie_reviews)
	training_set, test_set = split_label_feats(labeld_features)

	# tweetlist = tweetTest.loadTwitterCSV('trainingandtestdata/training.1600000.processed.noemoticon.csv')
	# training_set = label_feats_from_tweets(tweetlist)
	# training_set, garbage = split_label_feats(training_set, 1.0)
	# test_set, garbage = split_label_feats(labeld_features, 1.0)

	print "training set length: %i  test set length: %i" % (len(training_set), len(test_set))
	print prettifyFeatureSet(test_set)
	print "training classifier..."
	#classifier = NaiveBayesClassifier.train(training_set)
	#classifier = MaxentClassifier.train(training_set, algorithm='iis', max_iter=99, min_lldelta=0.01)
	#classifier = MaxentClassifier.train(training_set)
	classifier = SklearnClassifier(LogisticRegression()).train(training_set)
	print "calculating accuracy..."
	print 'accuracy:', nltk.classify.util.accuracy(classifier, test_set)
	#classifier.show_most_informative_features(30)

	negfeat = bag_of_words(['the', 'plot', 'was', 'ludicrous'])
	print(classifier.classify(negfeat))
	probdist = classifier.prob_classify(negfeat)
	print("pos: ", probdist.prob('pos'), " neg: ", probdist.prob('neg'))
	print(classifier.labels())
	classify_tweet(classifier, "I love this movie!", True)
	classify_tweet(classifier, "!!!", True)

Example 32
class TestCorpus():
	
	# static variables common to all instances

	feature_words = stopwords.words('english')

	feature_types = {'BOOLEAN':0, 'FREQUENCY':1, 'FREQUENCY_NORMALIZED':2}
	# default feature type is FREQUENCY_NORMALIZED
	feature_type = 2

	classifier_types = {'NAIVE_BAYES':0, 'SVM_LINEAR':1, 'SVM_POLY':2}
	# default classifier is linear SVM
	classifier_type = 1


	# boolean values (feature word occurs or does not occur in text)
	@classmethod
	def features_boolean(cls, text, features=[]):
		if not features:
			features = cls.feature_words
		return dict((word, int(word in text)) for word in features)

	# frequency values, normalized (how many times a feature word occurs in text, normalized by text length)
	@classmethod
	def features_frequency_normalized(cls, text, features=[]):
		if not features:
			features = cls.feature_words
		# multiply normalized frequency count by 1000 to avoid very small numbers
		return dict((word, 1000.0*text.count(word)/float(len(text))) for word in features)

	# frequency values, not normalized
	@classmethod
	def features_frequency(cls, text, features=[]):
		if not features:
			features = cls.feature_words
		return dict((word, text.count(word)) for word in features)


	# takes as input lists of (text, label) pairs for each class (1/2), for training/testing
	def __init__(self, train_set_class1, train_set_class2, test_set_class1, test_set_class2):
		self.train_set_class1 = train_set_class1
		self.train_set_class2 = train_set_class2
		self.test_set_class1 = test_set_class1
		self.test_set_class2 = test_set_class2
		self.train_set = self.train_set_class1 + self.train_set_class2
		self.test_set = self.test_set_class1 + self.test_set_class2

		# use default feature type to compute list of features and labels for training set and test set
		self.train_feature_set = [(self.features_frequency_normalized(word_tokenize(text)), label) for (text,label) in self.train_set]
		self.test_feature_set = [(self.features_frequency_normalized(word_tokenize(text)), label) for (text,label) in self.test_set]	

		# custom feature_sets initialized with defaut
		self.train_feature_set_custom = self.train_feature_set
		self.test_feature_set_custom = self.test_feature_set
		self.feature_words_custom = self.feature_words

	# recompute featuresets with current parameters
	def compute_featuresets(self):
		if (self.feature_type == 0):
			self.train_feature_set_custom = [(self.features_boolean(word_tokenize(text), self.feature_words_custom), label) for (text,label) in self.train_set]
			self.test_feature_set_custom = [(self.features_boolean(word_tokenize(text), self.feature_words_custom), label) for (text,label) in self.test_set]	
		if (self.feature_type == 1):
			self.train_feature_set_custom = [(self.features_frequency(word_tokenize(text), self.feature_words_custom), label) for (text,label) in self.train_set]
			self.test_feature_set_custom = [(self.features_frequency(word_tokenize(text), self.feature_words_custom), label) for (text,label) in self.test_set]	
		if (self.feature_type == 2):
			self.train_feature_set_custom = [(self.features_frequency_normalized(word_tokenize(text), self.feature_words_custom), label) for (text,label) in self.train_set]
			self.test_feature_set_custom = [(self.features_frequency_normalized(word_tokenize(text), self.feature_words_custom), label) for (text,label) in self.test_set]

	# compute featuresets with current parameters separately for each (training) class
	def compute_class_featuresets(self):
		if (self.feature_type == 0):
			self.train_feature_set_class1_custom = [(self.features_boolean(word_tokenize(text), self.feature_words_custom), label) for (text,label) in self.train_set_class1]
			self.train_feature_set_class2_custom = [(self.features_boolean(word_tokenize(text), self.feature_words_custom), label) for (text,label) in self.train_set_class2]
		if (self.feature_type == 1):
			self.train_feature_set_class1_custom = [(self.features_frequency(word_tokenize(text), self.feature_words_custom), label) for (text,label) in self.train_set_class1]
			self.train_feature_set_class2_custom = [(self.features_frequency(word_tokenize(text), self.feature_words_custom), label) for (text,label) in self.train_set_class2]
		if (self.feature_type == 2):
			self.train_feature_set_class1_custom = [(self.features_frequency_normalized(word_tokenize(text), self.feature_words_custom), label) for (text,label) in self.train_set_class1]
			self.train_feature_set_class2_custom = [(self.features_frequency_normalized(word_tokenize(text), self.feature_words_custom), label) for (text,label) in self.train_set_class2]
	
	# mean frequency of feature words in texts
	def mean_features(self, featureset=[]):
		mean_features = {}
		if not featureset:
			featureset = self.train_feature_set + self.test_feature_set
		nr_ex = len(featureset)
		for stopword in featureset[0][0]:
			mean_features[stopword] = 0
			for train_example in featureset:
				mean_features[stopword] += train_example[0][stopword]/float(nr_ex)
				
		return mean_features

	# top feature words occurring in whole corpus
	def top_features(self, how_many=127):
		features = self.mean_features()
		# return same format dictionary only with top occurring features (according to mean_features result)
		return dict(sorted(features.iteritems(), key=operator.itemgetter(1), reverse = True)[:how_many])

	def set_nr_features(self, nr_features):
		top_swords = self.top_features(nr_features)
		self.feature_words_custom = top_swords.keys()
		# recompute featuresets for new feature vector
		self.compute_featuresets()
		print 'Nr of features: ', len(self.feature_words_custom)

	def set_feature_type(self, feature_type):
		self.feature_type = feature_type
		self.compute_featuresets()
		print 'Feature type: ', [name for name, value in self.feature_types.iteritems() if value==feature_type][0]

	def train_classifier(self, trainset=[], svm_param=1.0):
		# default train set is class field (all train files)
		if not trainset:
			trainset = self.train_feature_set_custom

		if (self.classifier_type == 0):
			self.classifier = SklearnClassifier(MultinomialNB())
			print "Training Naive Bayes classifier..."
		if (self.classifier_type == 1):
			self.classifier = SklearnClassifier(LinearSVC(penalty='l2', loss='l2', dual=False, C=svm_param, class_weight='auto'))
			print "Training Linear SVM classifier..."
		if (self.classifier_type == 2):
			self.classifier = SklearnClassifier(SVC(kernel='poly', C=svm_param, class_weight='auto'))
			print "Training Polynomial SVM classifier..."

		self.classifier.train(trainset)


	def testall_accuracy(self, testset=[]):
		# default test set is class field (all test files)
		if not testset:
			testset = self.test_feature_set_custom
		print 'Measuring classifier performance...'
		acc = accuracy(self.classifier, testset)
		print 'Overall accuracy:', acc
		
		return acc

	def results_per_file(self, filenames=[]):
		# if no filenames are given as parameters just use numbers from 1 to nr_of_files
		if not filenames:
			filenames = range(1, len(self.test_feature_set_custom) + 1)
		print 'Results per file:'
		findex = 0
		# first index - element to be tested
		# second index - 0 = index of feature dictionary
		for text in self.test_feature_set_custom:
			predicted_label = self.classifier.classify(text[0])
			actual_label = text[1]
			print filenames[findex], predicted_label, predicted_label == actual_label
			findex += 1

	def classify_this(self, text):
		return self.classifier.classify(text)

	# TODO: easier computation of top_features and set_feature_nr. \
	# maybe just get first elements of sorted featuresets, not compute them again every time

	def leave_one_out(self, feature_type=2, classifier_type=1, C=1.0, nr_features=127):
		print '\nCross-validating with leave-one-out...'

		# set parameters

		# if (nr_features != 127):
		# 	self.set_nr_features(nr_features)
		# if (feature_type !=2):
		# 	self.set_feature_type(feature_type)

		# faster: don't recompute featuresets every time:

		self.feature_type = feature_type
		if (nr_features != 127):
			top_swords = self.top_features(nr_features)
			self.feature_words_custom = top_swords.keys()
		if (nr_features != 127 or feature_type != 2):
			self.compute_featuresets()
		print '\nNr features: ', nr_features
		print 'Feature type: ', \
		[name for name, value in self.feature_types.iteritems() if value==feature_type][0], '(%d)'%feature_type, '\n'
		self.classifier_type = classifier_type


		# cross-validate

		nrcorrect = 0
		total = len(self.train_feature_set_custom)
		for i in range (total):
			trainset = self.train_feature_set_custom[:i] + self.train_feature_set_custom[i+1:]
			self.train_classifier(trainset=trainset, svm_param=C)
			label = self.classify_this(self.train_feature_set_custom[i][0])
			print 'Testing on: file', i+1
			print 'actual: ', self.train_feature_set_custom[i][1]
			print 'predicted: ', label
			print "--------------------------------"
			if (label == self.train_feature_set_custom[i][1]):
				nrcorrect += 1
		print 'Correctly classified: ', nrcorrect, '/', total, '\n'
		return float(nrcorrect)/total

	# cross-validate results with leave-one-out for different parameters
	def cross_validate(self, validate_type=0):

		# validate_type =
		#				0: nr of features
		#				1: feature_type
		#				2: classifier_type
		#				3: classifier_parameter

		if (validate_type==0):
			# cross-validate for nr of features:
			# [stopwords, accuracies] = self.nrstopwords_experiment(True)
			# results = dict((stopwords[i], accuracies[i]) for i in range(len(stopwords)))
			nr_stopwords = range(1,100,10)
			results = dict((nr,0) for nr in nr_stopwords)
			for nr in nr_stopwords:
				acc = self.leave_one_out(nr_features=nr)
				results[nr] = acc

		# TODO: strange results for this? too accurate; weak methods too successful
		if (validate_type==1):
			# cross-validate for feature type
			results = dict((feat,0) for feat in self.feature_types)
			for feat in self.feature_types:
				acc = self.leave_one_out(feature_type=self.feature_types[feat])
				results[feat] = acc

		if (validate_type==2):
			# cross-validate for classifier type
			results = dict((cl, 0) for cl in self.classifier_types)
			for cl in self.classifier_types:
				acc = self.leave_one_out(classifier_type=self.classifier_types[cl])
				results[cl] = acc

		if (validate_type==3):
			# cross-validate for classifier parameter
			Cs = [10**(-10), 10**(-5), 10**(-3), 10**(-1), 1.0, 1.5, 10, 100, 1000, 10**5, 10**10]
			results = dict((C, 0) for C in Cs)
			for C in Cs:
				acc = self.leave_one_out(C=C)
				results[C] = acc

		return results
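
	# Illustrative return shape (added; numbers made up): cross_validate(3)
	# yields a dict mapping each tried C to its leave-one-out accuracy,
	# e.g. {0.001: 0.72, 1.0: 0.88, 100: 0.85, ...}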


	# accuracy vs number of stopwords used
	def nrstopwords_experiment(self, validate=False):
		#TODO: not sure about these results, maybe test some more
		# denser sampling for small feature counts, sparser afterwards
		stopwords = range(1,10) + range(10,40,5) + range(40,127,25)
		accuracies = []
		for nr_stopwords in stopwords:
			if (validate):
				acc = self.leave_one_out(nr_features=nr_stopwords)
			else:
				self.set_nr_features(nr_stopwords)
				self.train_classifier()
				acc = self.testall_accuracy()
			accuracies.append(acc)
			print nr_stopwords, acc

		return [stopwords, accuracies]


	def plot_stopwords_vs_accuracy(self, validate=False):
		[stopwords, accuracies] = self.nrstopwords_experiment(validate)
		plt.plot(stopwords, accuracies, label='Circle')
		plt.xlabel('Nr of stopwords')
		plt.ylabel('Accuracy')
		plt.title('Performance of algorithm versus number of stopwords used in classification')
		plt.show()
		# save to disk
		#plt.savefig('stopwords_experiment2.png')

	def plot_featureword_distribution(self, nr_swords=25):
		self.compute_class_featuresets()
		# for test set and each class of train set
		Utils.bar_graph(self.mean_features(self.test_feature_set_custom), graph_title='%d stop words for test set - mean occurrences'%nr_swords, output_name='test%d.png'%nr_swords)
		Utils.bar_graph(self.mean_features(self.train_feature_set_custom), graph_title='%d stop words for train set - mean occurrences'%nr_swords, output_name='train%d.png'%nr_swords)
		Utils.bar_graph(self.mean_features(self.train_feature_set_class1_custom), graph_title='%(nr)d stop words for %(class)s set - mean occurrences'%{'class':self.train_feature_set_class1_custom[0][1], 'nr':nr_swords}, output_name='hamilton%d.png'%nr_swords)
		Utils.bar_graph(self.mean_features(self.train_feature_set_class2_custom), graph_title='%(nr)d stop words for %(class)s set - mean occurrences'%{'class':self.train_feature_set_class2_custom[0][1], 'nr':nr_swords}, output_name='madison%d.png'%nr_swords)
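
# A minimal self-contained sketch (added; toy texts, not from the original
# project) of the pipeline the class above wraps: normalized stopword
# frequencies fed to a linear SVM through nltk's SklearnClassifier.
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import LinearSVC

feature_words = ['the', 'of', 'and', 'upon']

def freq_features(tokens):
	return dict((w, 1000.0 * tokens.count(w) / len(tokens)) for w in feature_words)

train = [(freq_features('the cause of the war'.split()), 'hamilton'),
         (freq_features('upon the powers and upon the people'.split()), 'madison')]
clf = SklearnClassifier(LinearSVC())
clf.train(train)
print clf.classify(freq_features('upon the union and the states'.split()))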
Example no. 33
neg_train, neg_test = neg_tweets[:negcutoff], neg_tweets[negcutoff:]
pos_train, pos_test = pos_tweets[:poscutoff], pos_tweets[poscutoff:]

neg_feats_train = get_train_features_from_tweets(neg_train, 'neg')
pos_feats_train = get_train_features_from_tweets(pos_train, 'pos')

train_feats = neg_feats_train + pos_feats_train

svm_classifier = SklearnClassifier(LinearSVC())
svm_classifier.train(train_feats)

# Evaluation
correct, wrong = 0, 0

for tweet in neg_test:
    features = get_features_from_tweet(tweet)
    result = svm_classifier.classify(features)
    if result == "neg":
        correct += 1
    else:
        wrong += 1


for tweet in pos_test:
    features = get_features_from_tweet(tweet)
    result = svm_classifier.classify(features)
    if result == "pos":
        correct += 1
    else:
        wrong += 1

print "Accuracy: {}".format(correct / float(correct + wrong))
""" This is a demo of the Scikit-learn Classifier from the NLTK
    package using the movie reviews corpus  """
from nltk.corpus import movie_reviews
from featx import *
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.linear_model import LogisticRegression
from nltk.classify.util import accuracy
from nltk import word_tokenize
lfeats = label_feats_from_corpus(movie_reviews) # extracts the features and label (neg/pos) for each review
train_feats,test_feats = split_label_feats(lfeats, split = 0.75) # splits labeled feature sets into training and test feats see featx.py
sk_classifier = SklearnClassifier(LogisticRegression()) # wrap the scikit-learn model
sk_classifier.train(train_feats) # train the classifier
print("The associated accuracy for this classifier on the data is :" )
print(accuracy(sk_classifier,test_feats))
while True:
    text = input("Enter your fake tweet (words only): \n")
    test = bag_of_words(word_tokenize(text)) # converts text into a bag of words see featx.py
    print("Sentiment:")
    print(sk_classifier.classify(test))
    control = input("press any key to continue, 'q' to quit:")
    if (control == "q"):
        break
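
# Note (added): featx.bag_of_words, per the NLTK Cookbook this demo follows,
# is presumably the standard presence-feature helper -- a sketch:
# def bag_of_words(words):
#     return dict((word, True) for word in words)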
Example no. 35
 classif = SklearnClassifier(pipeline)
 classif.train(zip(trainData, trainLabels))
 if USE_CHI_SQUARE:
     cf = open("nb_classifier_"+str(gram)+"gram_"+str(size)+"_large", "wb")  # binary mode for pickle
 else:
     cf = open("nb_classifier_"+str(gram)+"gram_"+str(size)+"_large_nochi", "wb")
 pickle.dump(classif, cf)
 cf.close()
 
 
 
 matches = 0
 mismatches = 0
 scores = {1:0, 2:0, 3:0, 4:0, 5:0}
 for i in range(len(testLabels)):
     label = classif.classify(testData[i])
     log("test data id: "+str(i),f)
     if label == testLabels[i]:
         matches += 1
         log("matched: label: "+str(label),f)
     else:
         mismatches += 1
         log("mismatched: label: "+str(label)+" was supposed to be: "+str(testLabels[i]),f)
     scores[int(label)]+=1
 log("summary of results for: gram: "+str(gram) +" size: "+str(size),f)
 log("matches = "+str(matches),f)        
 log("mismatches = "+str(mismatches),f)
 log("guesses = "+repr(scores),f)
 log("="*20,f)
 log("="*20,f)
 log("="*20,f)
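
 # Loading the pickled classifier back (added sketch; the filename is
 # hypothetical -- substitute whichever file the code above wrote):
 # import pickle
 # with open("nb_classifier_2gram_1000_large", "rb") as cf:
 #     classif = pickle.load(cf)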
Example no. 36
#  # post.get('class') is the label of the current post
#  featuresets.append((dialogue_act_features(post.text),cls_set.index(post.get('class'))))

#  print featuresets[0]


import nltk
from nltk.corpus import stopwords
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import LinearSVC


def preprocess(sentence):
    tokens = nltk.word_tokenize(sentence)
    tokens = [w for w in tokens if w not in stopwords.words("english")]
    features = {}
    for token in tokens:
        features[token] = tokens.count(token)
    return features


featureset = []
sentences = [
    "hello there, how are you?  Are you very happy??",
    "Yammering on all the time, what a loser"
]
for sentence in sentences:
    features = preprocess(sentence)
    featureset.append(features)

cls = SklearnClassifier(LinearSVC())
featuresets = []
featuresets.append((featureset[0], "first"))
featuresets.append((featureset[1], "second"))
cls.train(featuresets)
print cls.classify(preprocess("hello there, friends"))
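
# The tokens.count(token) loop in preprocess above is quadratic in sentence
# length; an equivalent linear-time version (added sketch) uses collections.Counter:
# from collections import Counter
# def preprocess(sentence):
#     tokens = [w for w in nltk.word_tokenize(sentence)
#               if w not in stopwords.words("english")]
#     return dict(Counter(tokens))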
Example no. 37
class RForests(text_classifier.TextClassifier):
    def __init__(self,trainDir,labelFile,numTrees=10,numJobs=1):
        self.classifier = None
        self.labelFile = labelFile
        self.trainingDir = trainDir
        self.labels = None
        self.all_words = None
        self.numTrees = numTrees
        self.numJobs = numJobs
        self.classifier = SklearnClassifier(RandomForestClassifier(
                                            n_estimators=self.numTrees,
                                            n_jobs=numJobs),sparse=False)
        #self.labels = training.setup(labelFile)
        #self.train()
    
    def train(self):
        feature_sets = self.getFeatures()
        self.classifier.train(feature_sets)
        
    """ Determines training error"""
    def trainingError(self):
        feature_sets = self.getFeatures()
        p = nltk.classify.accuracy(self.classifier,feature_sets)
        return p
        
    """ Make sure that the algorithm works on training data using a k fold 
        cross validation scheme """
    def kfoldCrossValidation(self,k):
        feature_sets = self.getFeatures()
        total = 0.0
        n = len(feature_sets)/k
        for i in range(k):
            self.classifier = SklearnClassifier(RandomForestClassifier(
                                                n_estimators=self.numTrees),sparse=False)
            # hold out the i-th fold for testing, train on the rest
            test_set = feature_sets[n*i:n*(i+1)]
            train_set = feature_sets[:n*i] + feature_sets[n*(i+1):]
            self.classifier.train(train_set)
            total += nltk.classify.accuracy(self.classifier,test_set)
        return total/k
    """ Make sure that the algorithm works on training data using a leave one out 
        cross validation scheme """
    def leave1OutCrossValidation(self):
        error = 0
        feature_sets = self.getFeatures()
        N = len(feature_sets)
        for i in range(N):
            self.classifier = SklearnClassifier(RandomForestClassifier(
                                                n_estimators=self.numTrees),sparse=False)
            train_set1,test_set,train_set2 = feature_sets[:i],feature_sets[i],feature_sets[i+1:]
            train_set = train_set1+train_set2
            test_set = [test_set]
            self.classifier.train(train_set)
            p = nltk.classify.accuracy(self.classifier,test_set)
            error+=p
        return error/N
            
    """ Construct a learning curve to see if there is overfitting"""
    def learningCurve(self,numTrials=4):
        accuracies = []
        feature_sets = self.getFeatures()
        for k in xrange(1,len(feature_sets)-1):
            total = 0
            for i in xrange(numTrials):
                self.classifier = SklearnClassifier(RandomForestClassifier(
                                                    n_estimators=self.numTrees),
                                                    sparse=False)
                random.shuffle(feature_sets)
                train_set,test_set = feature_sets[:k],feature_sets[k:]
                self.classifier.train(train_set)
                p = nltk.classify.accuracy(self.classifier,test_set)
                print len(train_set),len(test_set),p
                total+=p
            accuracies.append(total/numTrials)
        return accuracies
    
    """ Train on only k features and return training labels and predicted labels """
    def testClassify(self,k):
        feature_sets = self.getFeatures()
        random.shuffle(feature_sets)
        self.classifier = SklearnClassifier(RandomForestClassifier(
                                            n_estimators=self.numTrees),sparse=False)
        
        self.classifier.train(feature_sets[k:])
        features,ref_labels = zip(*feature_sets[:k])
        pred_labels = self.classifier.batch_classify(features)   
        return ref_labels,pred_labels
    
    """ nltk confusion matrix """
    def confusionMatrix(self,ref,test):
        ref.sort(key=lambda x: x[0])
        test.sort(key=lambda x: x[0])
        _,ref_labels = zip(*ref)
        _,test_labels = zip(*test)
        cm = ConfusionMatrix(ref_labels, test_labels)
        return cm

    def prob_classify(self,db,fastain):
        proIDs,pds,labels = [],[],[]
        prevFeatureset = ''
        prevText = ''
        for seq_record in SeqIO.parse(fastain, "fasta"):
            title = seq_record.id
            toks = title.split("|")
            proteinID = toks[5]
            query_rows = genbank.proteinQuery(proteinID,db)
            ids,text = zip(*query_rows)
            text = ''.join(map(str,text))
            if text=='': 
                label = ['na']
                pd = None
            else:
                text = word_reg.findall(text)
                
            
                featureset = self.gene_features(text)
                assert text!=prevText
                assert featureset!=prevFeatureset
                prevFeatureset = featureset
                prevText = text
                label = [self.classifier.classify(featureset)]
                pd = self.classifier.prob_classify(featureset)
                    
            proIDs.append(proteinID)  
            pds.append(pd)
            labels+=label
        return proIDs,labels,pds

    def classifyPickle(self,pickle,fastain):
        proIDs,features,labels = [],[],[]
        prevFeatureset = ''
        prevText = ''
        gbkTable = genbank.GenBankTable()
        gbkTable.load(pickle)
        for seq_record in SeqIO.parse(fastain, "fasta"):
            title = seq_record.id
            toks = title.split("|")
            locus_tag = toks[5]
            text = gbkTable.getLocusText(locus_tag)
            if text=='': 
                label = 'na'
            else:
                text = word_reg.findall(text)
                featureset = self.gene_features(text)
                #assert text!=prevText
                #assert featureset!=prevFeatureset
                prevFeatureset = featureset
                prevText = text
                label = self.classifier.classify(featureset)    
                #print label,text
            proIDs.append(locus_tag)  
            labels.append(label)
        return zip(proIDs,labels)
        
    """ Classifies proteins based on its text from sqlite3 database"""
    def classifyDB(self,db,fastain):
        proIDs,features,labels = [],[],[]
        prevFeatureset = ''
        prevText = ''
        for seq_record in SeqIO.parse(fastain, "fasta"):
            title = seq_record.id
            toks = title.split("|")
            locus_tag = toks[5]
            locus_rows = genbank_sqlite3.locusQuery(locus_tag,db)
            protein_rows = []
            for row in locus_rows:
                locus,proteinID = row
                query_rows = genbank_sqlite3.proteinQuery(proteinID,db)
                protein_rows+=query_rows
            #print len(protein_rows),locus_tag
            if len(protein_rows)==0:
                label = 'na'
            else:
                ids,text = zip(*protein_rows)
                text = ''.join(map(str,text))
                if text=='': 
                    label = 'na'
                else:
                    text = word_reg.findall(text)
                    featureset = self.gene_features(text)
                    #assert text!=prevText
                    #assert featureset!=prevFeatureset
                    prevFeatureset = featureset
                    prevText = text
                    label = self.classifier.classify(featureset)    
                    #print label,text
            proIDs.append(locus_tag)  
            labels.append(label)
        return zip(proIDs,labels)

    def classify(self,dbin,fastain,type='sqlite3'):
        if type=='sqlite3':
            return self.classifyDB(dbin,fastain)
        else:
            return self.classifyPickle(dbin,fastain)
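
# Self-contained sketch (added; toy features, not from the original project)
# of the wrapped model the class above builds: nltk's SklearnClassifier feeds
# the feature dicts through a DictVectorizer into the scikit-learn forest.
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.ensemble import RandomForestClassifier

train = [({'kinase': True, 'membrane': False}, 'enzyme'),
         ({'kinase': False, 'membrane': True}, 'structural')]
rf = SklearnClassifier(RandomForestClassifier(n_estimators=10), sparse=False)
rf.train(train)
print rf.classify({'kinase': True, 'membrane': True})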
Example no. 39
  elif(x==1):
    str='Bernoulli Naive Bayes'
  elif(x==2):
    str='Logistic Regression'
  else:
    str='Support Vector'
  print(str,'classifier accuracy :',accuracy_score(ground_truth,predictions))
  print(str,'f1 score :',f1_score(ground_truth,predictions))

"""So we clearly see that Logistic regression classifier and Support vector perfectly classifies our dataset
But since SVM has more f1-score so we will make predictions based on SVM
"""

predictions=[]
for r in testing_set:
  predictions.append(SVC_clf.classify(r[0]))
print(predictions)

positive=0
negative=0
for p in predictions:
  if p==1:
    positive=positive+1
  else:
    negative=negative+1

print(positive,negative)

"""Sentimental Analysis"""

positive_percent=positive/(positive+negative)
Example no. 40

car_counter = 0
print("Started classification of youtube comments", datetime.utcnow())
results = {}
with open(os.path.join("..", "youtube-comments", "carwow-comments",
                       "all-comments.json"),
          "r",
          encoding="UTF-8") as f:
    cars = json.load(f)
    for car in cars:
        car_counter += 1
        results[car] = {}
        comments = cars[car]
        for comment in comments:
            category = classifier.classify(
                comment_to_feature_set(comment["text"]))
            if category in results[car]:
                results[car][category] += 1
            else:
                results[car][category] = 1
        print("(#" + str(car_counter) + ")", "Classification of comments for",
              car, "done")

# with open(os.path.join("..", "youtube-comments", "carwow-comments", "3-category-classification.json"), "w", encoding="UTF-8") as f:
with open(os.path.join("..", "youtube-comments", "carwow-comments",
                       "5-category-classification.json"),
          "w",
          encoding="UTF-8") as f:
    json.dump(results, f, indent=2, sort_keys=True)

print("Classification of youtube comments complete", datetime.utcnow())
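
# Shape of the resulting JSON (illustrative; the actual category names depend
# on the classifier's label set):
# { "some car": { "positive": 120, "negative": 45, ... }, ... }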
Example no. 41
def main():
    parser = get_argparser()
    args = parser.parse_args()

    util.DPRINT = args.dprint
    featureset_name = os.path.basename(args.featurefn).split('.')[0]
    features.load_featurefile(args.featurefn)

    ## default is 1e-4.
    THETOL = 1e-3
    classifier_pairs = []
    classifier_pairs.append(("MFS", learn.MFSClassifier()))

    classifier = SklearnClassifier(LogisticRegression(C=1,
                                   penalty='l2',
                                   tol=THETOL))
    classifier_pairs.append(("maxent-l2-c1", classifier))
    stamp = util.timestamp()

    for fn in glob(args.testset + "/*data"):
        problems = semeval_testset.extract_wsd_problems(fn)

        w = problems[0][0]
        assert w.endswith(".n")
        w = w[:-2]
        load_training_for_word(w, args.bitextfn, args.alignfn, args.annotatedfn)

        bestoutfn = args.outputdir + "/{0}.{1}.best".format(w, "es")
        oofoutfn = args.outputdir + "/{0}.{1}.oof".format(w, "es")
        if os.path.exists(bestoutfn):
            os.remove(bestoutfn)
        if os.path.exists(oofoutfn):
            os.remove(oofoutfn)

        training = None

        for problem in problems:
            w = problem[0]
            assert w.endswith(".n")
            w = w[:-2]
            print(problem)

            if training is None:
                training = trainingdata.trainingdata_for(w, nonnull=True)
                print("got {0} instances for {1}".format(len(training), w))
                labels = set(label for (feat,label) in training)
                if len(training) == 0:
                    print("no samples for", w)
                    break
                if len(labels) < 2:
                    print("there's only one sense for", w, " and it is ",
                          labels)
                    break
                classifier.train(training)

            rawtext = problem[2]
            surface, index = semeval_testset.head_surface_and_index(rawtext)
            replaced = re.sub(r"<head>(.*)</head>", " \\1 ", rawtext)
            annotated = preprocessing.preprocess(replaced, "en")
            sentence = [token.lemma for token in annotated]

            focus_index = find_head_token_index(annotated, surface, index)
            feats = features.extract_untagged(sentence, annotated, focus_index)

            bestoutfn = args.outputdir + "/{0}.{1}.best".format(w, "es")
            oofoutfn = args.outputdir + "/{0}.{1}.oof".format(w, "es")
            with open(bestoutfn, "a") as bestoutfile, \
                 open(oofoutfn, "a") as oofoutfile:

                answer = classifier.classify(feats)
                print(answer)
                dist = classifier.prob_classify(feats)
                oof_answers = topfive(dist)
                print(output_one_best(problem, "es", answer), file=bestoutfile)
                print(output_five_best(problem, "es", oof_answers),
                      file=oofoutfile)
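
# topfive (defined elsewhere) presumably ranks the sense distribution by
# probability; a sketch of one plausible definition over an nltk ProbDistI:
# def topfive(dist):
#     return sorted(dist.samples(), key=dist.prob, reverse=True)[:5]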
Example no. 42
def CrossValidation(train_group, n=5):
    """Do Cross Validation of different classifiers

    For classifiers, I've written my own NaiveBayes Classifier and I also considered 
    several available classifiers in nltk and sklearn like 
    ['Maximum Entropy', 'DecisionTree', 'BernoulliNB', 'LogisticRegression', 'SVC', 'LinearSVC', 'NuSVC'].
    I want to compare the performance of these classifiers and output their accuracy, precision, recall and F1.
    
    Unlike a single train/test split, cross-validation gives a more reliable estimate and helps avoid overfitting.

    Args:
        train_group: The original training set contains all the news related with the stock and its label. 
            For example:
            ([[title1],[content1],[title2],[content2],...],'+1')
        n: How many folds you want. Default: 5.

    Returns:
        It doesn't return things, instead it prints the result. For each classifier, for example:
            ---------------------------------------
            N-FOLD CROSS VALIDATION RESULT (NaiveBayes)
            ---------------------------------------
            accuracy: 0.6479463537300922
            precision 0.6505853139411139
            recall 0.965771458662454
            f-measure 0.7774480712166171
    """
    print('Preparing...')
    random.shuffle(train_group)

    classifier_list = [
        'NaiveBayes', 'BernoulliNB', 'LogisticRegression', 'SVC', 'LinearSVC',
        'NuSVC'
    ]  # 'Maximum Entropy', 'DecisionTree']
    for cl in classifier_list:
        subset_size = int(math.floor(len(train_group) / n))

        accuracy = []
        precision = []
        recall = []
        F1 = []
        classifier = SklearnClassifier(NuSVC())

        for i in range(n):
            testing_this_round = train_group[i * subset_size:][:subset_size]
            training_this_round = train_group[:i * subset_size] + train_group[
                (i + 1) * subset_size:]
            train_set, test_set = PrepareSets(training_this_round,
                                              testing_this_round)

            if cl == 'NaiveBayes':
                print('Training ' + cl + ' ' + str(i) + ' fold')
                classifier = nltk.NaiveBayesClassifier.train(train_set)
            # elif cl == 'Maximum Entropy':
            #     print('Training ' + cl + ' ' + str(i) + ' fold')
            #     classifier = nltk.MaxentClassifier.train(train_set, 'GIS', trace=0)
            elif cl == 'BernoulliNB':
                classifier = SklearnClassifier(BernoulliNB())
                print('Training ' + cl + ' ' + str(i) + ' fold')
                classifier.train(train_set)
            elif cl == 'LogisticRegression':
                classifier = SklearnClassifier(LogisticRegression())
                print('Training ' + cl + ' ' + str(i) + ' fold')
                classifier.train(train_set)
            elif cl == 'SVC':
                classifier = SklearnClassifier(SVC())
                print('Training ' + cl + ' ' + str(i) + ' fold')
                classifier.train(train_set)
            elif cl == 'LinearSVC':
                classifier = SklearnClassifier(LinearSVC())
                print('Training ' + cl + ' ' + str(i) + ' fold')
                classifier.train(train_set)
            else:  # cl == 'NuSVC':
                classifier = SklearnClassifier(NuSVC())
                print('Training ' + cl + ' ' + str(i) + ' fold')
                classifier.train(train_set)
            # else:
            #     print('Training ' + cl + ' ' + str(i) + ' fold')
            #     classifier = nltk.DecisionTreeClassifier.train(train_set)

            # print(classifier.show_most_informative_features(10))

            print('Testing...')
            TP = 0
            FN = 0
            FP = 0
            TN = 0
            for (feats, label) in test_set:
                observed = classifier.classify(feats)
                if label == '+1' and observed == '+1':
                    TP += 1
                elif label == '-1' and observed == '+1':
                    FP += 1
                elif label == '+1' and observed == '-1':
                    FN += 1
                elif label == '-1' and observed == '-1':
                    TN += 1

            p = TP / (TP + FP)
            r = TP / (TP + FN)
            accuracy.append((TP + TN) / len(test_set))
            recall.append(r)
            precision.append(p)
            F1.append(2 * p * r / (p + r))  # harmonic mean of precision and recall

        pickle.dump(classifier, open('./' + cl + '.pkl', 'wb'))

        print('')
        print('---------------------------------------')
        print('N-FOLD CROSS VALIDATION RESULT ' + '(' + cl + ')')
        print('---------------------------------------')
        print('accuracy:', np.mean(accuracy))
        print('precision', np.mean(precision))
        print('recall', np.mean(recall))
        print('f-measure', np.mean(F1))
        print('\n')
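
# Worked example (added) of the metrics computed above: with TP=30, FP=10,
# FN=20, TN=40 over 100 test items, accuracy = (30+40)/100 = 0.70,
# precision = 30/(30+10) = 0.75, recall = 30/(30+20) = 0.60,
# F1 = 2*0.75*0.60/(0.75+0.60) ~= 0.667.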
Example no. 43
    columns = [['predicted', 'predicted'], ['not mortality', 'mortality']]))
df_confusion = pd.crosstab(labels, prediction) 
df_norm = df_confusion.values / df_confusion.sum(axis=1).values[:,None]

ax = sn.heatmap(df_norm, annot=True, annot_kws={"size": 20}, cmap="YlGnBu")
plt.xlabel('Predicted label', fontsize=20)
plt.ylabel('True label', fontsize=20)
plt.title('Confusion Matrix, w/Normalization', fontsize=20)
plt.show()


test_true, test_pred = [], []

for i, (features, label) in enumerate(testing):
    test_true.append(label)
    observed = nltk_ensemble.classify(features)
    test_pred.append(observed)

# need to use precision and recall instead to see false-positive rates
average_precision = metrics.average_precision_score(test_true, test_pred)

precision, recall, thresholds = metrics.precision_recall_curve(test_true, test_pred)
f1 = metrics.f1_score(test_true, test_pred)
auc = metrics.auc(recall, precision)

print('Average precision-recall score: {0:0.2f}'.format(average_precision))
print('F1 score: {0:0.2f}'.format(f1))
print('AUC: {0:0.2f}'.format(auc))
plt.plot([0,1], [0.5,0.5], linestyle='--')
plt.plot(recall, precision, marker='.')
plt.xlabel('Recall')
statsNB = {
    'actor': (float(actor) / len(words)),
    'plot': (float(plot) / len(words)),
    'theme': (float(theme) / len(words))
}
maximumNB = max(statsNB.items(), key=operator.itemgetter(1))[0]
print(maximumNB, statsNB.pop(maximumNB))

print(
    '--------------------------------------------------------------------------------'
)
print('LogisticRegression Classifier')
actor = 0
plot = 0
theme = 0

for word in new_words:
    LRResult = LR_classifier.classify(word_feats(word))
    # print(word,classResultSK)
    if LRResult == 'actor':
        actor = actor + 1
    if LRResult == 'plot':
        plot = plot + 1
    if LRResult == 'theme':
        theme = theme + 1

statsLR = {
    'actor': (float(actor) / len(words)),
    'plot': (float(plot) / len(words)),
    'theme': (float(theme) / len(words))
}
maximumLR = max(statsLR.items(), key=operator.itemgetter(1))[0]
print(maximumLR, statsLR.pop(maximumLR))
Example no. 45
#%% Now we can see how we are doing via the various metrics

classifier = SklearnClassifier(knn(n_neighbors=17))
classifier.train(train)

referenceSets = {}
referenceSets['pos'] = set()
referenceSets['neg'] = set()
testSets = {}
testSets['pos'] = set()
testSets['neg'] = set()

shuffle(test)
for i, (features, label) in enumerate(test):
    referenceSets[label].add(i)
    predicted = classifier.classify(features)
    testSets[predicted].add(i)
print 'After training on %d samples, start to test on %d instances:' % (
    len(train), len(test))
print 'accuracy:        %.2f' % nltk.classify.util.accuracy(classifier, test)
print 'pos precision:   %.2f' % nltk.precision(referenceSets['pos'],
                                               testSets['pos'])
print 'neg precision:   %.2f' % nltk.precision(referenceSets['neg'],
                                               testSets['neg'])
print 'pos recall:      %.2f' % nltk.recall(referenceSets['pos'],
                                            testSets['pos'])
print 'neg recall:      %.2f' % nltk.recall(referenceSets['neg'],
                                            testSets['neg'])

testSets['neg'].clear()
testSets['pos'].clear()
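
# Note (added): nltk.precision and nltk.recall compare *sets of instance
# indices*, which is why the loop above collects referenceSets[label] and
# testSets[predicted] via enumerate(test) rather than comparing raw label lists.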
Example no. 46
 correct, total = 0., 0.
 no_class_pos = 0.
 no_class_neg = 0.
 no_class_neu = 0.
 no_result_pos = 0.
 no_result_neg = 0.
 no_result_neu = 0.
 true_pos = 0.
 true_neg = 0.
 true_neu = 0.
 correct_class_pos = 0.
 correct_class_neg = 0.
 correct_class_neu = 0.
 
 for (tweet, class_type, original_tweet_string) in test_tweet_list:
     result = classifier.classify(get_feature_mapping(tweet))
     total += 1
     if class_type == result: correct += 1
     if class_type == 'positive':
         no_class_pos += 1
         if result == 'positive':
             correct_class_pos += 1
     if result == 'positive':
         no_result_pos += 1
         if class_type == 'positive':
             true_pos += 1
     if class_type == 'negative':
         no_class_neg += 1
         if result == 'negative':
             correct_class_neg += 1
     if result == 'negative':
Example no. 47
class DocumentClassifier():
    '''
    Train a classifier with labeled documents and classify new documents 
    into one of the labeled classes.
    We call the document set provided for training the classifier the
    'dev docs'. These 'dev docs' are split into two subsets, 'train docs'
    and 'test docs', which are used to train and test the machine learning
    model respectively.

    Parameters
    ----------
    train_p : float, 0.8 by default
        The proportion of the 'dev docs' used as 'train docs'.
        Use values greater than 0 and lower than 1.
        The remaining docs will be used as 'test docs'.
    
    eq_label_num : boolean, True by default
        If true, 'train docs' will have equal number of documents for each
        class. This number will be the lowest label count.
    
    complete_p : boolean, True by default
        Used when eq_label_num is True, but the lowest label count is not
        enough for getting the train_p proportion of 'train docs'. If this 
        attribute is True, more documents from 'test docs' will be moved
        to 'train docs' until we get train_p

    n_folds : integer, 10 by default
        Number of folds to be used in k-fold cross validation technique for
        choosing different sets as 'train docs'

    vocab_size : integer, 500 by default
        This is the size of the vocabulary set that will be used for extracting
        features out of the docs

    t_classifier : string, 'NB' by default
        This is the type of classifier model used. Available types are 'NB' 
        (Naive Bayes), 'DT' (decision tree), 'RF' (Random Forest), and 'SVM'
        (Support Vector Machine)

    language: string, 'english' by default
        Language on which documents are written

    stem: boolean, False by default
        If True, stemming is applied to feature extraction

    train_method: string, 'all_class_train' by default
        Choose the method to train the classifier. There are two options:
        'all_class_train' and 'cross_validation'
    '''
    def __init__(self,
                 train_p=0.8,
                 eq_label_num=True,
                 complete_p=True,
                 n_folds=10,
                 vocab_size=250,
                 t_classifier="NB",
                 language="english",
                 stem=False,
                 train_method="all_class_train"):
        self.train_p = train_p
        self.eq_label_num = eq_label_num
        self.complete_p = complete_p
        self.n_folds = n_folds
        self.vocab_size = vocab_size
        self.t_classifier = t_classifier
        self.language = language
        self.stem = stem
        self.train_method = train_method
        self._vocab = []
        self._classified_docs = []
        self._classifier = None
        self._accuracy = 0
        self._precision = {}
        self._recall = {}
        self._f_measure = {}
        self._train_docs = []
        self._test_docs = []

    def split_train_and_test(self, docs):
        '''
        Split the 'dev docs' set into the 'train docs' and 'test docs' subsets

        Parameters
        ----------
        docs: iterable
            An iterable which yields a list of strings

        '''

        categories_count = self.count_categories(docs)
        label_limit = min([c for (k, c) in categories_count.items()])
        labeled_docs = {}
        train_docs = []
        test_docs = []
        # Split docs by label
        for (cat, count) in categories_count.items():
            labeled_docs[cat] = shuffled([t for (t, k) in docs if k == cat])
        if self.eq_label_num:
            # Select the same number of doc for all labels
            for cat, cat_docs in labeled_docs.items():
                cat_limit = label_limit
                cat_train_docs = cat_docs[:cat_limit]
                cat_test_docs = cat_docs[cat_limit:]
                train_docs += [(doc, cat) for doc in cat_train_docs]
                test_docs += [(doc, cat) for doc in cat_test_docs]
            l_train = len(train_docs)
            l_docs = len(docs)
            l_test = len(test_docs)
            actual_p = l_train / l_docs
            # If the training proportion was not reached, move extra docs
            # from 'test docs' to 'train docs' until we get train_p
            if self.complete_p == True and actual_p < self.train_p:
                shuffled_extra = shuffled(test_docs)
                extra_i = 0
                while (actual_p < self.train_p and extra_i < l_test):
                    aux_l_train = l_train + extra_i
                    actual_p = aux_l_train / l_docs
                    extra_i += 1
                train_docs += shuffled_extra[:extra_i]
                test_docs = shuffled_extra[extra_i:]
        else:
            label_limit = int(self.train_p * len(docs))
            shuffled_docs = shuffled(docs)
            train_docs = shuffled_docs[:label_limit]
            test_docs = shuffled_docs[label_limit:]
        self._train_docs = train_docs
        self._test_docs = test_docs

    def cross_validation_train(self, dev_docs):
        '''
        Applies the k-fold cross validation technique to split the docs into
        different pairs of training and testing sets. For each pair, it trains
        and evaluates a classifier, keeping the one with the best accuracy

        Parameters
        ----------
        dev_docs: iterable
            An iterable which yields a list of strings

        '''
        dev_docs = shuffled(dev_docs)
        accuracies = []
        best_accuracy = 0
        subset_size = int(len(dev_docs) / self.n_folds)

        for i in range(self.n_folds):
            classifier_list = []
            train_docs = (dev_docs[(i + 1) * subset_size:] + \
                          dev_docs[:i * subset_size])
            test_docs = dev_docs[i * subset_size:(i + 1) * subset_size]
            train_set = apply_features(self.get_doc_features, train_docs)
            if self.t_classifier == "NB":
                classifier = NaiveBayesClassifier.train(train_set)
            elif self.t_classifier == "DT":
                classifier = DecisionTreeClassifier.train(train_set)
            elif self.t_classifier == "RF":
                classifier = SklearnClassifier(RandomForestClassifier())\
                                                       .train(train_set)
            elif self.t_classifier == "SVM":
                classifier = SklearnClassifier(LinearSVC(), sparse=False)\
                                                         .train(train_set)

            classifier_list.append(classifier)
            test_set = apply_features(self.get_doc_features, test_docs, True)
            accuracies.append((accuracy(classifier, test_set)) * 100)

            if accuracies[-1] > best_accuracy:
                best_accuracy = accuracies[-1]
                self._classifier = classifier
                self._train_docs = train_docs
                self._test_docs = test_docs

    def all_class_train(self, dev_docs):
        '''
        Train classifier with train_p percentage of all classes. The remaining
        docs of each class are used for testing.

        Parameters
        ----------
        dev_docs: iterable
            An iterable which yields a list of strings
        '''
        categories_count = self.count_categories(dev_docs)

        labeled_docs = {}
        for (cat, count) in categories_count.items():
            labeled_docs[cat] = shuffled(
                [t for (t, k) in dev_docs if k == cat])

        train_docs = []
        test_docs = []

        for cat, l in labeled_docs.items():
            cat_limit = int(self.train_p * len(l))
            train_docs += [(t, cat) for t in l[:cat_limit]]
            test_docs += [(t, cat) for t in l[cat_limit:]]

        self._train_docs = train_docs
        self._test_docs = test_docs

        train_set = apply_features(self.get_doc_features, self._train_docs)
        # create and train the classification model according to t_classifier
        if self.t_classifier == "NB":
            self._classifier = NaiveBayesClassifier.train(train_set)
        elif self.t_classifier == "DT":
            self._classifier = DecisionTreeClassifier.train(train_set)
        elif self.t_classifier == "RF":
            self._classifier = SklearnClassifier(RandomForestClassifier())\
                                                         .train(train_set)
        elif self.t_classifier == "SVM":
            self._classifier = SklearnClassifier(LinearSVC(), sparse=False)\
                                                          .train(train_set)

    def count_categories(self, docs):
        '''
        Count how many documents of each class are in the 'dev docs' set
        
        Parameters
        ----------
        docs: iterable
            An iterable which yields a list of strings

        Returns
        -------
        counters: dictionary
            A dictionary where each item is the number of docs for a class
        '''

        categories = set([c for (t, c) in docs])
        counters = {}
        for cat in categories:
            counters[cat] = 0
        for (text, cat) in docs:
            counters[cat] += 1
        self._categories = sorted(categories)
        return counters

    def get_doc_features(self, doc):
        '''
        Extract features of a document, checking the presence of the words
        in the vocabulary

        Parameters
        ----------
        doc: string
            The doc from which features will be extracted

        Returns
        -------
        features: dictionary
            A dictionary where each item indicates the presence of a
            word from the vocabulary in the input doc
        '''

        features = {}
        for word in self._vocab:
            features['contains({})'.format(word)] = (word in doc)
        return features

    def train_classifier(self, dev_docs):
        '''
        Create the features vocabulary from 'dev docs', 
        Split 'dev docs', train the classifier with 'train docs',
        Evaluate accuracy with 'test docs'

        Parameters
        ----------
        dev_docs: iterable
            An iterable which yields a list of strings
        '''
        # create vocabulary for feature extraction
        ce = ConceptExtractor(num_concepts=self.vocab_size,
                              language=self.language,
                              pos_vec=['NN', 'NNP', 'NNS', 'NNPS'])
        ce.extract_concepts([t for (t, c) in dev_docs])
        self._vocab = sorted([c for (c, f) in ce.common_concepts],
                             key=str.lower)
        if (self.stem):
            self._vocab = [tokenize_and_stem(w, language=self.language)[0] \
                                                    for w in self._vocab]

        if self.train_method == "cross_validation":
            self.cross_validation_train(dev_docs)
        elif self.train_method == "all_class_train":
            self.all_class_train(dev_docs)

    def eval_classifier(self):
        '''
        Test the model and calculates the metrics of accuracy, precision,
        recall and f-measure
        '''
        test_set = apply_features(self.get_doc_features, self._test_docs, True)
        self._accuracy = accuracy(self._classifier, test_set)
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)

        for i, (feats, label) in enumerate(test_set):
            refsets[label].add(i)
            observed = self._classifier.classify(feats)
            testsets[observed].add(i)
        self.count_categories(self._train_docs)
        for cat in self._categories:
            self._precision[cat] = precision(refsets[cat], testsets[cat])
            self._recall[cat] = recall(refsets[cat], testsets[cat])
            self._f_measure[cat] = f_measure(refsets[cat], testsets[cat])

    def classify_docs(self, docs):
        '''
        First train the classifier with the labeled data.
        Then classifies the unlabeled data.

        Parameters
        ----------
        docs: iterable
            An iterable which yields a list of strings
        '''

        dev_docs = [(t, c) for (t, c) in docs if c != ""]
        unlabeled_docs = [t for (t, c) in docs if c == ""]
        self.train_classifier(dev_docs)
        self.eval_classifier()
        results = []
        for doc in unlabeled_docs:
            doc_feats = self.get_doc_features(doc)
            result = self._classifier.classify(doc_feats)
            results.append((doc, result))
        self._classified_docs = results
        self._final_cat_count = self.count_categories(dev_docs + results)

    @property
    def classified_docs(self):
        return self._classified_docs

    @property
    def accuracy(self):
        return self._accuracy

    @property
    def precision(self):
        return self._precision

    @property
    def recall(self):
        return self._recall

    @property
    def f_measure(self):
        return self._f_measure

    @property
    def category_count(self):
        return self._final_cat_count
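
# Hypothetical usage sketch (added; the docs format is inferred from
# classify_docs: (text, label) pairs, with "" marking unlabeled docs):
# docs = [("the striker scored twice", "sports"),
#         ("parliament passed the bill", "politics"),
#         ("midfielder injured before the final", "")]
# dc = DocumentClassifier(t_classifier="SVM")
# dc.classify_docs(docs)
# print(dc.classified_docs, dc.accuracy)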
Example no. 48
    train_set = data[:slice]
    test_set = data[slice:]

    # train classification models
    print 'Training models on', len(train_set), 'data samples...'
    nb = NaiveBayesClassifier.train(train_set)
    lr = SklearnClassifier(LogisticRegression()).train(train_set)
    kwfc = KeywordFrequencyClassifier()
    kwfc.train(train_set)

    # calculate and report model accuracy
    print '\nKey Word Frequency Classifier accuracy based on', len(
        test_set), 'samples:'
    print kwfc.accuracy(test_set)

    print '\nNaive Bayes accuracy based on', len(test_set), 'samples:'
    print nltk.classify.util.accuracy(nb, test_set)

    print '\nLogistic Regression accuracy based on', len(test_set), 'samples:'
    print nltk.classify.util.accuracy(lr, test_set)

    # an example
    sample_post = 'How many numbers less than 70 are relatively prime to it?'
    test = util.features(sample_post)

    # attempt to classify sample sentence
    print '\nAn Example:\n', sample_post
    print 'Naive Bayes:', nb.classify(test)
    print 'Keyword Classifier', kwfc.predict(test)
    print 'Logistic Regression:', lr.classify(test)
class BookClassifier:

    def __init__(self):
        config = ConfigParser.ConfigParser()
        config.read("BookClassifier.config")
        cur_dir = os.getcwd()
        
        #Config parameters
        data_dir = config.get('GLOBAL', 'data_dir')
        op_dir = config.get('GLOBAL', 'output_dir')
        train_file = config.get('GLOBAL', 'train_file_name')
        train_file_2 = config.get('GLOBAL', 'train_file_name_2')
        self.bigram_threshold = int(config.get('GLOBAL', 'bigram_threshold'))
        self.k_fold = int(config.get('GLOBAL', 'k_fold'))
        self.unigram_threshold = int(config.get('GLOBAL', 'unigram_threshold'))

        self.data_dir = os.path.join(cur_dir, data_dir)
        self.output_dir = os.path.join(cur_dir, op_dir)
        self.train_file = os.path.join(self.data_dir, train_file)
        self.train_file_2 = os.path.join(self.data_dir, train_file_2)        
        self.logger_file = os.path.join(self.output_dir, "BookClassifier.log") 
        self.mode = int(sys.argv[1])        
        
        if self.mode == 1:
            output_file = config.get('GLOBAL', 'output_file_1') 
        elif self.mode == 2:
            output_file = config.get('GLOBAL', 'output_file_2') 
        self.output_file = os.path.join(self.output_dir, output_file)
        
        #Data structures 
        self.stopwords_set = set(stopwords.words('english'))    
        self.toc_list = []
        self.training_feats = []
        self.test_cases = []
        self.book_instances = []
        self.selected_features = []
        self.book_category_set = set()
        self.bookid_to_toc_dict = {}   #toc - table of contents
       
        self.train_file_fd = None 
        self.train_file_2_fd = None
        self.output_file_fd = None

        #classifiers
        self.nb_classifier = None
        self.svm_classifier = None

    def initialize_logger(self):
        logging.basicConfig(filename=self.logger_file, level=logging.INFO)
        logging.info("Initialized logger")
        self.logging = logging

    def run_main(self):
        self.preprocessing()
        self.feature_selection()
        self.feature_extraction()       
        self.classification()
        self.testing()
        self.cross_validation() 
        self.close_files()        

    def clean_book_title(self, title):
        return nltk.word_tokenize(title.translate(None, string.punctuation))

    def clean_author_name(self, author):
        return author.split(";")

    def feature_extraction(self):
        #Features are extracted 
        for instance in self.book_instances:
            try:
                raw_data = instance and instance.strip() and instance.strip().split("\t")
                if raw_data and len(raw_data) == 4:
                    bookid = raw_data[0]
                    features = []
                    features.extend(self.clean_book_title(raw_data[2]))
                    features.extend(self.clean_author_name(raw_data[3]))
                    features.extend(self.bookid_to_toc_dict.get(raw_data[1], []))
                    train_feats_list = []
                    for feat in features:
                        if feat and feat.lower() in self.selected_features and feat.lower() not in self.stopwords_set:
                            train_feats_list.append((feat.lower(), True))
                    train_feats_list.extend(self.get_bigram([pair[0] for pair in train_feats_list if pair]))
                elif raw_data and len(raw_data) == 3:
                    self.test_cases.append(instance)
                    continue  # test instances must not fall through to the training append below
                else:
                    continue
                self.training_feats.append((dict(train_feats_list), bookid))            
            except:
                self.logging.info("Exception while running this instance %s\n" % instance)
                continue

    def get_bigram(self, features_list):
        #Top-scoring bigrams (up to bigram_threshold) are selected
        score = BigramAssocMeasures.chi_sq
        all_bigrams = BigramCollocationFinder.from_words(features_list)
        best_bigrams = all_bigrams.nbest(score, self.bigram_threshold)
        selected_bigrams = [(bigram, True) for bigram in best_bigrams]
        return selected_bigrams
        
    def classification(self):
        #Training NB classifier
        self.nb_classifier = NaiveBayesClassifier.train(self.training_feats)         
        
        #Training SVM classifier
        self.svm_classifier = SklearnClassifier(LinearSVC()) 
        self.svm_classifier.train(self.training_feats)
        
    def testing(self):
        #Predicting output for the test instances
        for instance in self.test_cases:
            try:
                raw_data = instance and instance.strip() and instance.strip().split("\t")
                if raw_data:
                    features = []
                    train_feats_list = []
                    features.extend(self.clean_book_title(raw_data[1]))
                    features.extend(self.clean_author_name(raw_data[2]))
                    for feat in features:
                        if feat and feat.lower() not in self.stopwords_set and feat.lower() in self.selected_features:
                            train_feats_list.append((feat.lower(),True)) 
                    train_feats_list.extend(self.get_bigram([pair[0] for pair in train_feats_list if pair]))
                
                label = self.svm_classifier.classify(dict(train_feats_list))
                self.output_file_fd.write("%s\t%s\n" % (raw_data[0], label))
            except:
                self.logging.info("Exception while running this instance %s\n" % instance)
                

    def cross_validation(self):
        #10 fold cross validation is performed
        train_feats_count = int(len(self.training_feats))
        fold_size = int(train_feats_count / self.k_fold)
        nb_accuracy_list = []
        svm_accuracy_list = []
        nb_f_val_list = []
        svm_f_val_list = []

        for a in range(self.k_fold):
            start_index = a * fold_size
            end_index = start_index + fold_size

            train_features = self.training_feats[:start_index] + self.training_feats[end_index:]
            test_features  = self.training_feats[start_index:end_index] 
            
            self.nb_classifier = NaiveBayesClassifier.train(train_features)         
            nb_acc = nltk.classify.util.accuracy(self.nb_classifier, test_features) 
            nb_accuracy_list.append(nb_acc)
       
            self.svm_classifier = SklearnClassifier(LinearSVC()) 
            self.svm_classifier.train(train_features)
            svm_acc = nltk.classify.util.accuracy(self.svm_classifier, test_features) 
            svm_accuracy_list.append(svm_acc)

            #Find F-Measure
            nb_f_val = self.compute_measures(test_features, self.nb_classifier)
            nb_f_val_list.append(nb_f_val)
            svm_f_val = self.compute_measures(test_features, self.svm_classifier)
            svm_f_val_list.append(svm_f_val)

        self.logging.info('Average accuracy of Naive Bayes Classifier %s\n' % (float(sum(nb_accuracy_list)/len(nb_accuracy_list))))
        self.logging.info('Average accuracy of SVM Classifier %s\n' % (float(sum(svm_accuracy_list)/len(svm_accuracy_list))))
        self.logging.info('Average F measure of Naive Bayes Classifier %s\n' % (float(sum(nb_f_val_list)/len(nb_f_val_list))))
        self.logging.info('Average F measure of SVM Classifier %s\n' % (float(sum(svm_f_val_list)/len(svm_f_val_list))))

    def compute_measures(self, test_features, classifier):
        #Average F measure calculation 
        actual_labels, predicted_labels = self.get_actual_and_predicted_labels(test_features, classifier)
        precision = self.find_precision(actual_labels, predicted_labels)
        recall = self.find_recall(actual_labels, predicted_labels)
        f_val = self.find_f_measure(precision, recall)
        return f_val

    def find_precision(self, actual_labels, predicted_labels):
        if not actual_labels and not predicted_labels:
            return 0
        precision_list = []
        for category in self.book_category_set:
            if not predicted_labels.get(category):
                continue
            precision = nltk.metrics.precision(actual_labels.get(category, set()), predicted_labels.get(category, set()))
            precision_list.append(precision)
        if not precision_list:
            return 0
        return float(sum(precision_list)/len(precision_list))

    def find_recall(self, actual_labels, predicted_labels):
        if not actual_labels and not predicted_labels:
            return 0
        recall_list = []
        for category in self.book_category_set:
            if not actual_labels.get(category):
                continue
            recall = nltk.metrics.recall(actual_labels.get(category, set()), predicted_labels.get(category, set()))
            recall_list.append(recall)
        if not recall_list:
            return 0
        return float(sum(recall_list)/len(recall_list))
         
    def find_f_measure(self, precision, recall):
        if precision == 0 and recall == 0:
            return 0
        f_val = 2 * (precision * recall) / float(precision + recall)
        return f_val

    def get_actual_and_predicted_labels(self, test_features, classifier):
        actual_labels = {}
        predicted_labels = {}
        for i, (features, label) in enumerate(test_features):
            actual_labels.setdefault(label, set()).add(i)
            predicted_label = classifier.classify(features)
            predicted_labels.setdefault(predicted_label, set()).add(i)
        return (actual_labels, predicted_labels)

    def preprocessing(self):
        self.initialize_logger()
        self.open_files()
        self.load_data()

    def feature_selection(self):
        self.clean_and_structure_toc_data()
        self.clean_train_data_and_find_best_features()
    
    def clean_train_data_and_find_best_features(self):
        #Top n best unigram features are selected
        freq_dist_obj = FreqDist()
        cond_freq_dist_obj = ConditionalFreqDist()
        self.book_category_set = set() 

        for instance in self.book_instances:
            try:
                raw_data = instance and instance.strip() and instance.strip().split("\t") 
                if not raw_data or len(raw_data) != 4: continue
                bookid  = raw_data[0]
                self.book_category_set.add(bookid)
                features = []
                features.extend(self.clean_book_title(raw_data[2]))
                features.extend(self.clean_author_name(raw_data[3]))
                features.extend(self.bookid_to_toc_dict.get(raw_data[1], []))
                for feat in features:
                    freq_dist_obj.inc(feat)
                    cond_freq_dist_obj[bookid].inc(feat)
            except:
                self.logging.info("Exception while running this instance %s \n" % instance)
                
        total_word_count = 0    
        for bookid in self.book_category_set:
            total_word_count += cond_freq_dist_obj[bookid].N()

        word_score_dict = {}
        for word, freq in freq_dist_obj.iteritems():
            score = 0
            if word and word.lower() in self.stopwords_set: continue
            for bookid in self.book_category_set:
                score += BigramAssocMeasures.chi_sq(cond_freq_dist_obj[bookid][word], (freq, cond_freq_dist_obj[bookid].N()), total_word_count)
            word_score_dict[word] = score
        self.select_top_n_best_features(word_score_dict)
        
    def select_top_n_best_features(self, word_score_dict):
        self.selected_features =  sorted(word_score_dict.iteritems(), key=operator.itemgetter(1), reverse=True)
        total_select_count = int(len(self.selected_features) * self.unigram_threshold/float(100))
        self.selected_features = self.selected_features[:total_select_count]
        self.selected_features = set([pair[0].lower() for pair in self.selected_features if pair[0]])

    def clean_book_toc(self, toc):
        return [word for word in re.sub("[^a-zA-Z]", " ", toc).split(" ") if word]

    def clean_and_structure_toc_data(self):
        #Extra training data (tables of contents) is cleaned and structured
        for instance in self.toc_list:
            raw_data = instance and instance.strip() and instance.strip().replace("↵","")
            if not raw_data: continue
            bookid = raw_data.split("\t")[0]
            clean_data = self.clean_book_toc(raw_data)
            self.bookid_to_toc_dict.setdefault(bookid, []).extend(clean_data[1:])
            
    def open_files(self):
        self.train_file_fd = open(self.train_file, 'r') 
        self.train_file_2_fd = open(self.train_file_2, 'r')
        self.output_file_fd = open(self.output_file, 'w')

    def load_data(self):
        self.load_train_data()
        if self.mode == 2:   #Load more train data only when running as problem 2.
            self.load_more_train_data()

    def load_train_data(self):
        #Train data loaded 
        self.book_instances = []
        for instance in self.train_file_fd.readlines():
            self.book_instances.append(instance) 
        self.book_instances = self.book_instances[1:]

    def load_more_train_data(self):
        #More training data are loaded for problem 2
        for instance in self.train_file_2_fd.readlines():
            self.toc_list.append(instance)
        self.toc_list = self.toc_list[1:]

    def close_files(self):
        self.train_file_fd.close() 
        self.train_file_2_fd.close()
        self.output_file_fd.close()
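
The chi-squared unigram scoring in clean_train_data_and_find_best_features follows a standard NLTK recipe: count each word globally and per category, then sum the word's chi-squared association across all categories. A minimal self-contained sketch of that recipe, using a hypothetical two-category toy corpus and the NLTK 3 API (item assignment replaces the older FreqDist.inc call used above):

from nltk import FreqDist, ConditionalFreqDist
from nltk.collocations import BigramAssocMeasures

#Hypothetical toy corpus of (category, tokens) pairs
docs = [("cooking", ["stir", "fry", "garlic", "pan"]),
        ("math", ["prove", "lemma", "garlic", "set"])]

freq_dist = FreqDist()
cond_freq_dist = ConditionalFreqDist()
for category, words in docs:
    for word in words:
        freq_dist[word] += 1                  #NLTK 3 equivalent of .inc(word)
        cond_freq_dist[category][word] += 1

total_word_count = sum(cond_freq_dist[c].N() for c in cond_freq_dist)

word_scores = {}
for word, freq in freq_dist.items():
    word_scores[word] = sum(
        BigramAssocMeasures.chi_sq(cond_freq_dist[c][word],
                                   (freq, cond_freq_dist[c].N()),
                                   total_word_count)
        for c in cond_freq_dist)

#"garlic" occurs in both categories, so it scores as the least discriminative word
print(sorted(word_scores, key=word_scores.get, reverse=True))
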
    pickle.dump(featureList, save_featureList)
    save_featureList.close()
    # Create featuresets ------------------------------------------------
    count = 0
    print("Extract feature vector for all tweets in one shoot")
    training_set = nltk.classify.util.apply_features(extract_features_2, tweet_dicts)
    print(training_set)

    """
    # Train the classifier
    NBClassifier = nltk.NaiveBayesClassifier.train(training_set)
    # Test the classifier
    testTweet = 'Congrats @ravikiranj, i heard you wrote a new tech post on sentiment analysis'
    pre_process_result = pre_process_tweet(testTweet)
    processedTestTweet = process_tweet(pre_process_result)['featureVector']
    #print( NBClassifier.classify(extract_features(processedTestTweet)))
    #print(NBClassifier.show_most_informative_features(10))
    """
    LinearSVC_classifier = SklearnClassifier(LinearSVC())
    LinearSVC_classifier.train(training_set)
    save_LinearSVC_classifier = open("pickled/LinearSVC_classifier_3_ways.pickled", "wb")
    pickle.dump(LinearSVC_classifier, save_LinearSVC_classifier)
    save_LinearSVC_classifier.close()

    #Note: accuracy is measured on the training set itself, so it overstates real performance
    LSVC_accuracy = nltk.classify.accuracy(LinearSVC_classifier, training_set)
    print(LSVC_accuracy)
    print(LinearSVC_classifier.classify(extract_features_2(process_tweet(Tweet7))))
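
A quick round-trip check of the classifier pickled above; a minimal sketch, assuming the pickle file written by the snippet exists and using a hypothetical feature dict in the same {token: True} format the classifier was trained on:

import pickle

with open("pickled/LinearSVC_classifier_3_ways.pickled", "rb") as f:
    reloaded_classifier = pickle.load(f)

#Hypothetical feature vector in the training format
print(reloaded_classifier.classify({"congrats": True, "tech": True}))
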

    

file_path_test = os.path.join(
    'C:/Users/DComp2/Desktop/python learn/get_data/training_data',
    'test_data.txt')
text_file_test = open(file_path_test, "rU")
raw_data_test = text_file_test.readlines()
text_file_test.close()

question_test, coarse_label_test, fine_label_test = get_labels(raw_data_test)
labeled_data_test = zip(question_test, coarse_label_test)

test_data = question_test
print("test_data_length", len(test_data))
predict_labels = []
for que in test_data:
    test_features = combined_features(que)
    predicted = SVC_classifier.classify(test_features)
    predict_labels.append(predicted)

import numpy as np
true_label = np.array(coarse_label_test)
predicted_label = np.array(predict_labels)
from sklearn.metrics import precision_recall_fscore_support
precision, recall, fscore, support = precision_recall_fscore_support(
    true_label, predicted_label)
from collections import Counter
counts = Counter(coarse_label_test)
all_set = zip(precision, recall, fscore, support)
print
for precision, recall, fscore, support in all_set:
    print 'Precision:', round(precision, 2)
    print 'Recall:', round(recall, 2)
    print 'F-score:', round(fscore, 2)
    print 'Support:', support
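
For reference, precision_recall_fscore_support returns one entry per label, ordered by sorted label name; a toy illustration with made-up coarse labels:

from sklearn.metrics import precision_recall_fscore_support
#Hypothetical gold labels vs. predictions
p, r, f, s = precision_recall_fscore_support(["LOC", "HUM", "LOC"],
                                             ["LOC", "LOC", "LOC"])
print zip(p, r, f, s)   #one (precision, recall, fscore, support) tuple per label
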
Esempio n. 52
0


#   print(wordlist.most_common(10))
# print(classifier.show_most_informative_features(32))
# print(extract_features())
# tweet = "'Love-cheat' Daniel Radcliffe splits with girlfriend Rosie Coker: London, Oct 19: Daniel Radcliffe has split wit... http://tinyurl.com/8oxx2ns "
# print(classifier.classify(extract_features(tweet.split())))


with open("/Users/Jaaksi/Documents/Github/learnpython/harkkatyo/test_data.tsv", "r") as testfile, open("/Users/Jaaksi/Documents/Github/learnpython/harkkatyo/evalfile.tsv", "w") as evalfile:
    tsvreader = csv.reader(testfile, dialect='excel-tab', delimiter="\t")
    evalwriter = csv.writer(evalfile, dialect='excel-tab', delimiter='\t')
    for line in tsvreader:
        tweet = line[3]
        result = classifier.classify(extract_features(tweet.split()))
        evalwriter.writerow([line[0], line[1], result, line[3]])

evaluator.evaluate("/Users/Jaaksi/Documents/Github/learnpython/harkkatyo/test_data.tsv", "/Users/Jaaksi/Documents/Github/learnpython/harkkatyo/evalfile.tsv")

# print(classifier.show_most_informative_features(15))


def evaluate_features(feature_select):

    posFeatures = []
    negFeatures = []
    
    training = []
    #process positive dataset "processed_pro_GMO.txt"
    for i in short_pos.split('\n'):
        posWords = word_tokenize(i)
        posWords_tag = [feature_select(posWords),"pos"]
        #tag each tokenized line as "pos" in the positive dataset
        posFeatures.append(posWords_tag)
       
    #process negative dataset "processed_anti_GMO.txt"
    for i in short_neg.split('\n'):
        negWords = word_tokenize(i)
        negWords_tag = [feature_select(negWords),"neg"]
        negFeatures.append(negWords_tag)

    #6-fold cross validation for accuracy, recall, and precision
    num_folds = 6
    training = posFeatures + negFeatures
    cv = cross_validation.KFold(len(training), n_folds=num_folds, shuffle=True, random_state=None)

    Naive_Accu = 0
    neg_Precision = 0
    neg_recall = 0
    pos_Precision = 0
    pos_recall = 0

    SVC_Accu = 0
    Regression_Accu = 0
    testFeatures = []

    precision = dict()
    recall = dict()
    average_Precision = dict()

    for traincv, testcv in cv:
        #KFold with shuffle=True yields non-contiguous index arrays, so build the
        #folds by explicit indexing instead of slicing between the first and last index
        train_split = [training[i] for i in traincv]
        test_split = [training[i] for i in testcv]

        #BasedNaiveClassifier
        BasedNaiveClassifier = NaiveBayesClassifier.train(train_split)
        accuracy = (nltk.classify.util.accuracy(BasedNaiveClassifier, test_split))*100
        Naive_Accu += accuracy
        BasedNaiveClassifier.show_most_informative_features(10)

        save_classifier = open("GMO_Hanzhe/BasedNaiveClassifier10k.pickle","wb")
        pickle.dump(BasedNaiveClassifier, save_classifier)
        save_classifier.close()

        #LogisticRegression
        LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
        LogisticRegression_classifier.train(train_split)
        Regression_Accuracy = (nltk.classify.util.accuracy(LogisticRegression_classifier, test_split))*100
        Regression_Accu += Regression_Accuracy

        save_classifier = open("GMO_Hanzhe/LogisticRegression_classifier10k.pickle","wb")
        pickle.dump(LogisticRegression_classifier, save_classifier)
        save_classifier.close()

        #LinearSVC
        LinearSVC_classifier = SklearnClassifier(LinearSVC())
        LinearSVC_classifier.train(train_split)
        SVC_Accuracy = (nltk.classify.util.accuracy(LinearSVC_classifier, test_split))*100
        SVC_Accu += SVC_Accuracy

        save_classifier = open("GMO_Hanzhe/LinearSVC_classifier10k.pickle","wb")
        pickle.dump(LinearSVC_classifier, save_classifier)
        save_classifier.close()

        #initiates referenceSets and testSets
        referenceSets = collections.defaultdict(set)
        testSets = collections.defaultdict(set)

        testFeatures.extend(test_split)
        #puts correctly labeled sentences in referenceSets and the predictively labeled version in testSets
        for i, (features, label) in enumerate(testFeatures):
            referenceSets[label].add(i)
            predicted = LogisticRegression_classifier.classify(features)
            testSets[predicted].add(i)
#7/5/2015        
##        pos_Precision += (nltk.metrics.precision(referenceSets["pos"], testSets["pos"]))*100     
##        pos_recall += (nltk.metrics.recall(referenceSets["pos"], testSets["pos"]))*100 
##        neg_Precision += (nltk.metrics.precision(referenceSets["neg"], testSets["neg"]))*100
##        neg_recall += (nltk.metrics.recall(referenceSets["neg"], testSets["neg"]))*100
##
##        precision["pos"] = nltk.metrics.precision(referenceSets["pos"], testSets["pos"])     
##        recall["pos"] = nltk.metrics.recall(referenceSets["pos"], testSets["pos"]) 
##        precision["neg"] = nltk.metrics.precision(referenceSets["neg"], testSets["neg"])
##        recall["neg"] = nltk.metrics.recall(referenceSets["neg"], testSets["neg"])
##
##        save_classifier = open("GMOHedging/BasedNaiveClassifier.pickle","wb")
##        pickle.dump(BasedNaiveClassifier, save_classifier)
##        save_classifier.close()
###    average_precision["pos"] = precision["pos"]

    #get Average score for Accuracy, Precision and Recall
    accu = Naive_Accu/num_folds
#7/5/2015
##    pos_Precision = pos_Precision/num_folds
##    pos_recall = pos_recall/num_folds
##    neg_Precision = neg_Precision/num_folds
##    neg_recall = neg_recall/num_folds
    print("Average Naive Bayes Accuracy is:", accu)
#7/5/2015
##    print("Average LinearSVC_classifier Positive Precision is:", pos_Precision)
##    print("Average LinearSVC_classifier Positive Recall is:", pos_recall)
##    print("Average LinearSVC_classifier Negative Precision is:", neg_Precision)
##    print("Average LinearSVC_classifier Negative Recall is:", neg_recall)

    Regression_Accu = Regression_Accu/num_folds
    print("LogisticRegression_classifier accuracy percent:", Regression_Accu)

    SVC_Accu = SVC_Accu/num_folds
    print("LinearSVC_classifier accuracy percent:", SVC_Accu)
Esempio n. 54
0
sa = SentimentAnalyzer(validation_sample)
validation_set = sa.bow 
ground_truth = [r[1] for r in validation_set]

#print("Training MultinomialNB")
#MNB_clf = SklearnClassifier(MultinomialNB())
#MNB_clf.train(training_set)
#MNB_pred = [MNB_clf.classify(r[0]) for r in validation_set]
##for i in range(len(MNB_pred)):
##	if MNB_pred[i] == 5:
##		print(validation_sample.review_body[i])
##		print(validation_reviews[i],"\n+++++++++++++++++++++++++++++++")
#print("Got F1 score of", precision_score(ground_truth, MNB_pred, average='micro'))
#
#print("Training BernoulliNB")
#BNB_clf = SklearnClassifier(BernoulliNB())
#BNB_clf.train(training_set)
#BNB_pred = [BNB_clf.classify(r[0]) for r in validation_set]
#print("Got F1 score of", precision_score(ground_truth, BNB_pred, average='micro'))

print("Training LogisticRegression")
LogReg_clf = SklearnClassifier(LogisticRegression())
LogReg_clf.train(training_set)
LogReg_pred = [LogReg_clf.classify(r[0]) for r in validation_set]
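# micro-averaged precision, recall, and F1 all coincide in the single-label multiclass case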
print("Got F1 score of", precision_score(ground_truth, LogReg_pred, average='micro'))

#print("Training SGD")
#SGD_clf = SklearnClassifier(SGDClassifier())
#SGD_clf.train(training_set)
#SGD_pred = [SGD_clf.classify(r[0]) for r in validation_set]
#print("Got F1 score of", precision_score(ground_truth, SGD_pred, average='micro'))#
Esempio n. 55
0
print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))

classif = SklearnClassifier(LinearSVC())
classif.train(trainfeats)
print classif.labels()
test_skl = []
t_test_skl = []
for d in testfeats:
    test_skl.append(d[0])
    t_test_skl.append(d[1])

print(set(t_test_skl))

result = []
for item in test_skl:
    p = classif.classify(item)
    result.append(p)

print len(result)
print len(t_test_skl)

score = 0.0
for i in range(0, len(result)):
    if result[i] == t_test_skl[i]:
        score = score + 1.0

print score/len(result)

from sklearn.metrics import classification_report
# getting a full report
# labels are sorted so their order lines up with cls_set (assumed to be in sorted order)
print classification_report(t_test_skl, result, labels=sorted(set(t_test_skl)), target_names=cls_set)
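
The manual scoring loop above computes plain accuracy; sklearn's accuracy_score returns the same number in one call:

from sklearn.metrics import accuracy_score
print accuracy_score(t_test_skl, result)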