def trainClassifiers(tweets):
    """Train Naive Bayes and LinearSVC classifiers on tweet feature sets.

    Args:
        tweets: labelled tweets consumed by ``extract_features`` through
            ``nltk.classify.util.apply_features``.

    Returns:
        (training_set, NBClassifier, linear_SVC_classifier)
    """
    # Generate the (lazy) training set
    training_set = nltk.classify.util.apply_features(extract_features, tweets)
    print("Training set created!")

    # Train and save the Naive Bayes classifier to a file.
    # ``with`` guarantees the pickle file is closed even if dump() raises.
    NBClassifier = nltk.NaiveBayesClassifier.train(training_set)
    with open('data/trained_classifiers/NBClassifier.pickle', 'wb') as f:
        pickle.dump(NBClassifier, f, 1)
    print("NBClassifier Classifier Trained")

    # Train linear SVC
    linear_SVC_classifier = SklearnClassifier(LinearSVC())
    linear_SVC_classifier.train(training_set)

    return (training_set, NBClassifier, linear_SVC_classifier)
Exemple #2
0
def classifier_for_lemma(lemma, filenames):
    """Train a Random Forest classifier for one *lemma*.

    Loads the bitext/alignment/annotation files named in *filenames*,
    builds the non-null training instances for the lemma, frees the
    sentence data, then trains.  Returns None when there is no usable
    training data (fewer than two distinct labels, or no instances).
    """
    # XXX: always doing non-null and Random Forest for initial version
    # sparse=False: RandomForest cannot consume sparse feature matrices.
    classifier = SklearnClassifier(RandomForestClassifier(), sparse=False)
    print("loading training data for", lemma)
    load_training_for_word(lemma, filenames.bitextfn, filenames.alignfn,
                           filenames.annotatedfn)

    training = trainingdata.trainingdata_for(lemma, nonnull=True)
    print("got {0} instances for {1}".format(len(training), lemma))

    # delete the sentences themselves; we have the instances
    trainingdata.set_examples([], [])
    trainingdata.set_sl_annotated([])
    gc.collect()

    # Cap the instance count so the dense feature matrix fits in memory.
    if len(training) > (20 * 1000):
        print("capping to 20k instances to fit in memory")
        training = training[: 20 * 1000]

    labels = set(label for (feat,label) in training)
    print("loaded training data for", lemma)
    # Need at least one instance and at least two classes to train.
    if (not training) or len(labels) < 2:
        return None
    classifier.train(training)
    return classifier
Exemple #3
0
def validate(data, params, d):
    """Probe learned tree representations with a logistic regression.

    For each tree in the (train, val) splits of *data*, attaches the
    d-dimensional embedding of each node from the lookup table L, runs a
    label-free forward pass, averages the normalized representations of all
    non-root, non-stopword nodes into one feature vector, then trains
    LogisticRegression on the train features.

    Returns:
        (train_acc, val_acc) accuracies of the probe classifier.
    """
    stop = stopwords.words("english")

    (rel_dict, Wv, b, L) = params

    # Attach each node's embedding column from the lookup table L.
    print("validating, adding lookup")
    for split in data:
        for tree in split:
            for node in tree.get_nodes():
                node.vec = L[:, node.ind].reshape((d, 1))

    train_feats = []
    val_feats = []

    for tt, split in enumerate(data):

        if tt == 0:
            print("processing train")
        else:
            print("processing val")

        for tree in split:

            # Forward pass without label supervision.
            forward_prop(None, params, tree, d, labels=False)

            # Average normalized node vectors, skipping the root (index 0)
            # and stopwords.  (Removed the unused words/wcount/word_list
            # accumulators from the original.)
            ave = zeros((d, 1))
            count = 0
            for ex, node in enumerate(tree.get_nodes()):
                if ex != 0 and node.word not in stop:
                    ave += node.p_norm
                    count += 1

            ave = ave / count
            featvec = ave.flatten()

            # One named feature per dimension for the nltk wrapper.
            curr_feats = {}
            for dim, val in ndenumerate(featvec):
                curr_feats["_" + str(dim)] = val

            if tt == 0:
                train_feats.append((curr_feats, tree.ans))
            else:
                val_feats.append((curr_feats, tree.ans))

    print("training")
    classifier = SklearnClassifier(LogisticRegression(C=10))
    classifier.train(train_feats)

    print("predicting...")
    train_acc = nltk.classify.util.accuracy(classifier, train_feats)
    val_acc = nltk.classify.util.accuracy(classifier, val_feats)
    return train_acc, val_acc
def learn_model(data,target):
    """Train a MultinomialNB sentiment classifier over jieba-segmented text.

    Selects the best words with ``best_of_words``, splits *data*/*target*
    90/10, builds boolean bag-of-best-words features, trains, evaluates
    against the held-out split, and returns (classifier, bestwords).
    """
    bestwords = best_of_words(data, target)
    # split validation: 90% training, 10% test (test_size=0.1)
    data_train,data_test,target_train,target_test = cross_validation.train_test_split(data,target,test_size=0.1,random_state=43)
    #classifier = BernoulliNB().fit(data_train,target_train)
    train_feature=[]
    test_feature=[]
    # Training: (feature-dict, label) pairs, keeping only the best words.
    for d, l in zip(data_train, target_train):
        segmented = jieba.cut(d, cut_all=False)
        train_feature.append([{word: True for word in segmented if word in bestwords}, l])

    # Test: bare feature dicts (labels kept separately in target_test).
    for d in data_test:
        segmented = jieba.cut(d, cut_all=False)
        test_feature.append({word: True for word in segmented if word in bestwords})

    classifier = SklearnClassifier(MultinomialNB())
    classifier.train(train_feature)

    predicted = classifier.classify_many(test_feature)

    evaluate_model(target_test,predicted)

    return classifier, bestwords
def score(trainset, testset, classifier):
    """Wrap *classifier* in an nltk SklearnClassifier, train it on
    *trainset*, and return its accuracy over *testset*."""
    wrapped = SklearnClassifier(classifier)
    # Keep the vectorizer's feature order stable (no alphabetical sort).
    wrapped._vectorizer.sort = False
    wrapped.train(trainset)
    feats, gold = zip(*testset)
    predictions = wrapped.classify_many(feats)
    return accuracy_score(gold, predictions)
def SVM(training_set, test_set):
    """Train a LinearSVC on *training_set* and evaluate it on *test_set*.

    Returns (runTrained, accuracy, predictedLabels, trueLabels) where
    ``runTrained`` is a closure that classifies further (optionally
    tagged) test sets with the already-trained model.
    """
    classifier = SklearnClassifier(LinearSVC())
    print("Training a new SVM classifier")
    classifier.train(training_set)
    print("Accuracy of SVM in training:",nltk.classify.accuracy(classifier, test_set))
#     classifier.show_most_informative_features(5)
    #print("Running new Decision Tree classifier")
    # NOTE(review): accuracy is recomputed here, duplicating the call above.
    accuracy = nltk.classify.accuracy(classifier, test_set)
    trueLabels = [l for d, l in test_set]
    predictedLabels = classifier.classify_many([d for d,t in test_set])
    #print("Accuracy:",accuracy)
#     classifier.show_most_informative_features(MIF)
    def runTrained(test_set, hasTags=False):
        """Classify *test_set* with the trained model.

        When *hasTags* is true, *test_set* is (data, tag) pairs: accuracy
        is printed and returned alongside the (data, prediction) pairs.
        Otherwise returns just the (data, prediction) pairs.
        """
        #print("Running pre-trained Decision Tree classifier")
        if hasTags:
            tagglessTest_set = [data for data, tag in test_set]
            acc = nltk.classify.accuracy(classifier, test_set)
            print("Accuracy:", acc)
            predictions = classifier.classify_many(tagglessTest_set)
            return ([e for e in zip(tagglessTest_set, predictions)], acc)
        else:
            tagglessTest_set = test_set         
        predictions = classifier.classify_many(tagglessTest_set)
        #print("Predicted Labels:",predictions)
        return [e for e in zip(tagglessTest_set, predictions)]
    return (runTrained, accuracy, predictedLabels, trueLabels) 
Exemple #7
0
class chatBot(object):
    """NPS-chat dialogue-act chatbot.

    Classifies each user utterance into one of 15 dialogue-act categories
    with a LinearSVC and replies with a random post drawn from the mapped
    response category.
    """

    def __init__(self):
        self.posts = nltk.corpus.nps_chat.xml_posts()
        self.categories = ['Emotion', 'ynQuestion', 'yAnswer', 'Continuer',
                'whQuestion', 'System', 'Accept', 'Clarify', 'Emphasis',
                'nAnswer', 'Greet', 'Statement', 'Reject', 'Bye', 'Other']
        # category index -> index of the category to draw the reply from
        self.mapper = [0, 2, 6, 3, 11, 5, 8, 1, 8, 3, 10, 11, 13, 13, 13]
        self.responses = {}
        self.featuresets = []
        self.train = []
        self.test = []
        self.testSet = []
        self.testSetClass = []
        self.classif = SklearnClassifier(LinearSVC())
        for i in range(0, 15):
            self.responses[i] = []
        # Build (features, category-index) training pairs and bucket the
        # raw post texts by category for later use as canned responses.
        for post in self.posts:
            self.featuresets.append((self.tokenize(post.text),self.categories.index(post.get('class'))))
            self.temp = self.responses[self.categories.index(post.get('class'))]
            self.temp.append(post.text)

    def tokenize(self, sentence):
        """
            Extracts a set of features from a message.
        """
        features = {}
        tokens = nltk.word_tokenize(sentence)
        for t in tokens:
            features['contains(%s)' % t.lower()] = True
        return features

    def talk(self):
        """Interactive loop: classify each input line and print a reply."""
        while 1:
            inp = raw_input("YOU: ")  # NOTE(review): Python 2 builtin
            features = self.tokenize(inp)
            # classify() takes a single featureset; the original passed the
            # dict to classify_many(), which expects a list of featuresets.
            pp = int(self.classif.classify(features))
            m = self.mapper[pp]
            r = self.responses[m]
            # randint is inclusive at both ends: the original upper bound
            # len(r) could raise IndexError one time in len(r)+1.
            val = randint(0, len(r) - 1)
            print("BOT: "+r[val])

    def trainSet(self):
        """Shuffle the feature sets, train on 90%, classify the held-out 10%."""
        shuffle(self.featuresets)
        size = int(len(self.featuresets) * .1) # 10% is used for the test set
        self.train = self.featuresets[size:]
        self.test = self.featuresets[:size]
        self.classif.train(self.train)

        self.testSet = []
        self.testSetClass = []
        for i in self.test:
            self.testSet.append(i[0])
            self.testSetClass.append(i[1])
        self.batch = self.classif.classify_many(self.testSet)

    def statistics(self):
        """Print a per-category report for the last trainSet() run."""
        print (classification_report(self.testSetClass, self.batch, labels=list(set(self.testSetClass)),target_names=self.categories))
Exemple #8
0
def main3():
    """Train/evaluate a LinearSVC on the module-level trainData/testData,
    printing accuracy, a classification report, and a confusion matrix."""
    from nltk.classify.scikitlearn import SklearnClassifier
    from sklearn.svm import LinearSVC
    from sklearn.metrics import confusion_matrix
    from matplotlib import pyplot

    svm = SklearnClassifier(LinearSVC(loss="hinge"))
    svm.train(trainData)
    print("SVM: ", nltk.classify.accuracy(svm, testData))
    results = svm.classify_many(item[0] for item in testData)

    print(results)
    from sklearn.metrics import classification_report

    # getting a full report
    # NOTE(review): target_names=t_test_skl passes the raw label sequence,
    # not a list of display names — confirm this is intended.
    print(classification_report(t_test_skl, results, labels=list(set(t_test_skl)), target_names=t_test_skl))

    # Compute confusion matrix
    import numpy as np
    cmm = confusion_matrix([x[1] for x in testData], results)

    print(cmm)
    # np.float is deprecated in modern NumPy — presumably written for an
    # older release; verify before upgrading.
    cmm = np.array(cmm, dtype = np.float)
    print(cmm.shape)

    #f=figure()
    #ax = f.add_subplot(111)
    #show()
    #%pylab inline

    # Show confusion matrix in a separate window
    print(pyplot.imshow(cmm, interpolation='nearest'))
Exemple #9
0
def performCrossValidation(featureset, labels, foldsCount, sklearnclassifier, uniqLabels):
    """Run stratified k-fold cross-validation of *sklearnclassifier*.

    Trains a fresh nltk-wrapped classifier per fold and prints the average
    accuracy plus per-label average precision, recall and F-score over the
    *foldsCount* folds.
    """
    accuracySum = 0.0
    precisionSums = defaultdict(float)
    recallSums = defaultdict(float)
    fscoreSums = defaultdict(float)
    crossValidationIterations = cross_validation.StratifiedKFold(labels, n_folds=foldsCount)
    for train, test in crossValidationIterations:
        trainset = [featureset[i] for i in train]
        testset = [featureset[i] for i in test]
        print("before train")
        classifier = SklearnClassifier(sklearnclassifier).train(trainset)

        true = [label for features, label in testset]
        predicted = classifier.classify_many([features for features, label in testset])

        precisions, recalls, fscores, support = precision_recall_fscore_support(true, predicted, pos_label=None, labels=uniqLabels)
        accuracy = accuracy_score(true, predicted)
        accuracySum += accuracy

        # Accumulate per-label sums to average after the loop.
        for label, value in zip(uniqLabels, precisions):
            precisionSums[label] += value
        for label, value in zip(uniqLabels, recalls):
            recallSums[label] += value
        for label, value in zip(uniqLabels, fscores):
            fscoreSums[label] += value

    print("Average accurancy: {0:.3f}".format(accuracySum/foldsCount))
    # Average each measure over the folds.  (Loop variable renamed from
    # ``sum``, which shadowed the builtin.)
    measures = {label: (precisionSum/foldsCount, recallSums.get(label)/foldsCount, fscoreSums.get(label)/foldsCount)
                for label, precisionSum in precisionSums.items()}
    for label, (prec, recall, fscore) in measures.items():
        print("Average precision for {0}: {1:.3f}".format(label, prec))
        print("Average recall for {0}: {1:.3f}".format(label, recall))
        print("Average f score for {0}: {1:.3f}".format(label, fscore))
Exemple #10
0
def sentiment_classifier(debug):
	"""Train a LinearSVC sentiment classifier from training.csv.

	Each tweet's text is reduced to features via ``tweet_to_words``; when
	*debug* is true, progress is printed every 1000 tweets.  Returns the
	trained nltk-wrapped classifier.  (Python 2 code: xrange, print
	statements.)
	"""
	# trainingfp = open('training.csv', 'rb')
	train = pd.read_csv( 'training.csv', delimiter=',', quotechar='"', escapechar='\\',header=0 )
	num_tweets = train['TweetText'].size
	
	# Build (features, sentiment-label) pairs for every tweet.
	cleantweets = []
	for i in xrange(0, num_tweets):
		if debug and ( (i+1)%1000 == 0 ):
			print "Tweet %d of %d\n" % ( i+1, num_tweets )          
		cleantweets.append((tweet_to_words(train['TweetText'][i]), train['Sentiment'][i]))

	# vectorizer = CountVectorizer(analyzer = "word",   \
 #                             tokenizer = None,    \
 #                             preprocessor = None, \
 #                             stop_words = None,   \
 #                             max_features = 5000) 

	# train_data_features = vectorizer.fit_transform([t for (t,_) in cleantweets])
	
	# feature_labels = [(m,l) for ((f,l),m) in zip(cleantweets, train_data_features)]

	# forest = RandomForestClassifier(n_estimators = sensitivity)
	# forest = forest.fit(train_data_features, train['Sentiment'])
	classif = SklearnClassifier(LinearSVC())
	classif.train(cleantweets)

	return (classif)
def train(cleanedDataCollection, tagPool):
	"""Train a MultinomialNB "useful vs trash" classifier.

	Features come from ``extractFeatures(document, tagPool)``.  The first
	ten samples of each class are held out as the test set.  Prints
	accuracy plus precision/recall/F-measure for the "useful" class and
	returns the trained classifier.  (Python 2 print statements.)
	"""
	posSamples = []
	negSamples = []

	featuresets = [(extractFeatures(d,tagPool), c) for (d,c) in cleanedDataCollection]
	for sample in featuresets:
		if sample[1] == "trash":
			negSamples.append(sample)
		else:
			posSamples.append(sample)

	# Hold out the first 10 of each class for testing.
	train_set = negSamples[10:]+posSamples[10:]
	test_set = negSamples[:10]+posSamples[:10]


	# classifier = nltk.NaiveBayesClassifier.train(train_set)
	# print(nltk.classify.accuracy(classifier, test_set))
	# classifier.show_most_informative_features(5) 
	# return classifier

	sk_classifier = SklearnClassifier(MultinomialNB())
	sk_classifier.train(train_set)
	print "accuracy is: %s" % (accuracy(sk_classifier, test_set))

	precision, recall, fMeasure = precision_recall_fmeasure(sk_classifier,  test_set, "useful")

	print "precision is: %s" % (precision)
	print "recall is: %s" % (recall)
	print "F-measure is: %s" % (fMeasure)
	return sk_classifier
def svm(train_data,preprocessing=True):
    """Train a LinearSVC on preprocessed (text, label) pairs.

    Each item of *train_data* is passed as ``preprocess(item[0],
    label=item[1])``.  (The *preprocessing* flag is currently unused.)
    Returns the trained nltk-wrapped classifier.
    """
    prepared = [preprocess(item[0], label=item[1]) for item in train_data]
    cl = SklearnClassifier(LinearSVC())
    cl.train(prepared)
    return cl
Exemple #13
0
class SKClassifier:

    classifier = None

    def __init__(self, cls='SVC'):
        self.classifier = SklearnClassifier({
            'SVC': SVC(),
            'LogisticRegression': LogisticRegression(),
            'BernoulliNB': BernoulliNB()
        }[cls])
        if not self.classifier:
            self.classifier = SklearnClassifier(SVC())

    def train(self, trainset):
        self.classifier.train(trainset)

    def test(self, tagged, featuresets):
        predict = self.classifier.classify_many(featuresets)
        print predict
        return accuracy_score(tagged, predict)

    def classify(self, featureset):
        return self.classifier.classify(featureset)

    def classify_many(self, featuresets):
        return self.classifier.classify_many(featuresets)
def score(classifier):
    """Train *classifier* on the module-level ``trainset`` and return its
    accuracy over the module-level ``test``/``tag_test`` split."""
    model = SklearnClassifier(classifier)
    model.train(trainset)
    # pred = classifier.batch_classify(test)
    return accuracy_score(tag_test, model.classify_many(test))
def evaluate(classifier_alo):
    """Train *classifier_alo* on the module-level trainFeatures and print
    pos/neg precision, recall and F1 measured on testFeatures."""
    classifier = SklearnClassifier(classifier_alo) # use scikit-learn through the nltk wrapper
    classifier.train(trainFeatures) # train the classifier
    
    # Per-label sets of instance indices: gold (reference) vs predicted.
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)	
    i = 0
    for item in testFeatures:
        referenceSets[item[1]].add(i)
        predicted = classifier.classify(item[0])
        testSets[predicted].add(i)	
        i += 1
    
    pos_pre = nltk.metrics.precision(referenceSets['pos'], testSets['pos'])
    pos_recall = nltk.metrics.recall(referenceSets['pos'], testSets['pos'])
    neg_pre =  nltk.metrics.precision(referenceSets['neg'], testSets['neg'])
    neg_recall = nltk.metrics.recall(referenceSets['neg'], testSets['neg'])
    
    # Columns: pos precision, pos recall, neg precision, neg recall,
    # pos F1, neg F1 (F1 computed inline from precision/recall).
    print (str('{0:.3f}'.format(float(pos_pre))) + "  "
    +str('{0:.3f}'.format(float(pos_recall))) + "  "
    +str('{0:.3f}'.format(float(neg_pre))) + "  "
    +str( '{0:.3f}'.format(float(neg_recall))) + "  "
    +str('{0:.3f}'.format(2*(float(pos_pre)*float(pos_recall)) / (float(pos_recall)+float(pos_pre)))) + "  "
    +str('{0:.3f}'.format(2*(float(neg_pre)*float(neg_recall)) / (float(neg_recall)+float(neg_pre)))))
Exemple #16
0
def evaluate(train_qs, test_qs, params, d):
    """Evaluate learned word representations with a logistic-regression probe.

    For every question, maintains a running sum of the embedding columns
    (from lookup table L) of the words seen so far, averages them, pushes
    the average through a 3-layer relu MLP (W,b,W2,b2,W3,b3), and uses the
    top-layer activations as features.  Trains LogisticRegression on the
    train split, prints train/test accuracy, and pickles the classifier.
    (Python 2 code: print statements, cPickle.)
    """
    data = [train_qs, test_qs]
    (W, b, W2, b2, W3, b3, L) = params

    train_feats = []
    test_feats = []

    for tt, split in enumerate(data):

        for qs, ans in split:

            prev_qs = zeros((d, 1))
            prev_sum = zeros((d, 1))
            count = 0.
            history = []

            for dist in qs:

                sent = qs[dist]

                # input is average of all nouns in sentence
                # av = average(L[:, sent], axis=1).reshape((d, 1))
                # Running sum/average over all words seen so far.
                history += sent
                prev_sum += sum(L[:, sent], axis=1).reshape((d, 1))
                if len(history) == 0:
                    av = zeros((d, 1))
                else:
                    av = prev_sum / len(history)

                # apply non-linearity
                p = relu(W.dot(av) + b)
                p2 = relu(W2.dot(p) + b2)
                p3 = relu(W3.dot(p2) + b3)

                # One named feature per top-layer dimension.
                curr_feats = {}
                for dim, val in ndenumerate(p3):
                    curr_feats['__' + str(dim)] = val

                if tt == 0:
                    train_feats.append( (curr_feats, ans[0]) )

                else:
                    test_feats.append( (curr_feats, ans[0]) )

    print 'total training instances:', len(train_feats)
    print 'total testing instances:', len(test_feats)
    random.shuffle(train_feats)

    # can modify this classifier / do grid search on regularization parameter using sklearn
    classifier = SklearnClassifier(LogisticRegression(C=10))
    classifier.train(train_feats)

    print 'accuracy train:', nltk.classify.util.accuracy(classifier, train_feats)
    print 'accuracy test:', nltk.classify.util.accuracy(classifier, test_feats)
    print ''

    print 'dumping classifier'
    cPickle.dump(classifier, open('data/deep/classifier', 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)
def get_performance(clf_sel, train_features, test_features):
    """Train *clf_sel* (a scikit-learn estimator) and report its metrics.

    On success prints one CSV line:
    ``class,accuracy,pos_precision,pos_recall,neg_precision,neg_recall``.
    Training failures are tolerated (best-effort): metrics are skipped.
    """
    ref_set = collections.defaultdict(set)
    test_set = collections.defaultdict(set)
    classification_error = False
    classifier = None  # stays None when training fails

    clf = SklearnClassifier(clf_sel)
    try:
        classifier = clf.train(train_features)
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # still propagate.
        classification_error = True
        # print (str(clf_sel.__class__),'NA')

    # Only pickle when training actually produced a classifier — the
    # original evaluated this branch unconditionally and hit a NameError
    # on ``classifier`` after a training failure.
    if not classification_error and str(clf_sel.__class__) == "<class 'sklearn.naive_bayes.MultinomialNB'>":
        pickle_cls(classifier, 'MultinomialNB')

    if not classification_error:
        clf_acc = nltk.classify.accuracy(classifier, test_features)

        # Per-label index sets: gold (ref_set) vs predicted (test_set).
        for i, (features, label) in enumerate(test_features):
            ref_set[label].add(i)
            predicted = classifier.classify(features)
            test_set[predicted].add(i)

        pos_precision = precision(ref_set['pos'], test_set['pos'])
        pos_recall = recall(ref_set['pos'], test_set['pos'])
        neg_precision = precision(ref_set['neg'], test_set['neg'])
        neg_recall = recall(ref_set['neg'], test_set['neg'])

        print(
            "{0},{1},{2},{3},{4},{5}".format(clf_sel.__class__, clf_acc, pos_precision, pos_recall, neg_precision,
                                             neg_recall))
def clf_score(classifier):
    """Train *classifier* on the module-level ``train_set`` and return its
    accuracy against the module-level ``test``/``tag_test`` data."""
    wrapped = SklearnClassifier(classifier)
    wrapped.train(train_set)
    # nltk.classify.scikitlearn(BernoulliNB())
    predictions = wrapped.classify_many(test)
    # classifier.prob_classify_many()
    return accuracy_score(tag_test, predictions)
Exemple #19
0
	def __init__(self, scikit_classifier, train_file_name,template_file_name,labelled_feature_sets=None):
		"""Build and train the wrapped citation classifier.

		Wraps *scikit_classifier* in an nltk SklearnClassifier (dense input
		for RandomForest/GaussianNB, sparse otherwise), compiles feature
		templates from *template_file_name*, then trains either on the given
		pre-computed *labelled_feature_sets* or on features extracted from
		the IOB instances read from *train_file_name*.
		"""
		from nltk.classify.scikitlearn import SklearnClassifier
		from sklearn.ensemble import AdaBoostClassifier
		from sklearn.naive_bayes import GaussianNB
		from sklearn.ensemble import RandomForestClassifier
		fe = FeatureExtractor()
		#self.classifier = SklearnClassifier(scikit_classifier,sparse=False)
		# RandomForest and GaussianNB cannot consume sparse matrices.
		if(isinstance(scikit_classifier,RandomForestClassifier)):
			self.classifier = SklearnClassifier(scikit_classifier,sparse=False) 
		elif(isinstance(scikit_classifier,GaussianNB)):
			self.classifier = SklearnClassifier(scikit_classifier,sparse=False) 
		else:
			self.classifier = SklearnClassifier(scikit_classifier)
		self.compiled_templates = self.process_template(template_file_name)
		feature_sets = []
		if(labelled_feature_sets is not None):
			feature_sets = labelled_feature_sets
			logger.info("using a pre-computed feature_sets containing %i instances"%len(feature_sets))
		else:
			iob_data = 	file_to_instances(train_file_name)
			logger.info("instances ",len(iob_data))
			logger.info("tokens",count_tokens(iob_data))
			# One feature dict per token, labelled with the token's IOB tag.
			# NOTE(review): the inner loop reuses the name ``n`` from the
			# outer enumerate — confirm sentence_n is not needed later.
			for n,instance in enumerate(iob_data):
			    sentence_n = n
			    pos_tags = [('z_POS',token[1]) for token in instance]
			    labels = [token[2] for token in instance]
			    tokens = [token[0] for token in instance]
			    for n,token in enumerate(tokens):
			        dict_features = fe.get_features([token],labels=labels,outp_label=False,legacy_features=pos_tags)[0]
			        feature_sets.append([dict_features, labels[n]])
		self.classifier.train(self.apply_feature_template(feature_sets,out_label=True))
		return
Exemple #20
0
 def __init__(self, cls='SVC'):
     """Create an SklearnClassifier for *cls* ('SVC', 'LogisticRegression'
     or 'BernoulliNB')."""
     self.classifier = SklearnClassifier({
         'SVC': SVC(),
         'LogisticRegression': LogisticRegression(),
         'BernoulliNB': BernoulliNB()
     }[cls])
     # NOTE(review): dead code — the [cls] lookup above raises KeyError
     # for unknown names, so this SVC fallback can never trigger.
     if not self.classifier:
         self.classifier = SklearnClassifier(SVC())
def cross_validate(data,model=None):
    training_set = nltk.classify.apply_features(preprocess,data)
    cv = cross_validation.KFold(len(training_set), n_folds=10, shuffle=False, random_state=None)
    if model == "svm" or model=="SVM":
        svm = SklearnClassifier(LinearSVC())
        for traincv, testcv in cv:
            classifier = svm.train(training_set[traincv[0]:traincv[len(traincv)-1]])
            print 'accuracy:', nltk.classify.util.accuracy(classifier, training_set[testcv[0]:testcv[len(testcv)-1]])
Exemple #22
0
def svm(trainfeats, testfeats):
	"""Train a LinearSVC (C=0.032) on *trainfeats* and print its accuracy
	on *testfeats*.  (Python 2 print statements; ``accuracy`` is unused.)"""
	y = []
	accuracy = []
	classif = SklearnClassifier(LinearSVC(C=0.032))
	classif.train(trainfeats)
	print "SVM output"
	print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))
	y.append( nltk.classify.util.accuracy(classif, testfeats))
	print y
def svm(total_train_feats,total_test_feats):
    """Train a LinearSVC (C=0.032), print its test accuracy, and append the
    single-element accuracy list to the module-level ``all_results``."""
    y = []
    accuracy = []
    classifier = SklearnClassifier(LinearSVC(C=0.032))
    classifier.train(total_train_feats)
    print 'train on %d instances, test on %d instances' % (len(total_train_feats), len(total_test_feats))
    y.append( nltk.classify.util.accuracy(classifier, total_test_feats))
    print y
    # release the model before recording the run's result
    del classifier
    all_results.append(y)
Exemple #24
0
def train_classifier(trainfile):
    '''Training the classifier '''
    products,scores,reviews=load_text_from_file(trainfile)
    train_set=extract_features(reviews,scores)
    clf=SklearnClassifier(LinearSVC())
    #trainlen=int(len(train_set)*0.9)
    model=clf.train(train_set)
    #model=nltk.NaiveBayesClassifier.train(train_set)
    pk.dump(model,open('classifier.p','wb'))
    print 'Accuracy for the training set: ',nltk.classify.accuracy(model,train_set)
def buildClassifier_score(trainSet,devtestSet,classifier):
    """Train *classifier* on *trainSet* and return its accuracy over
    *devtestSet*, a sequence of (features, label) pairs."""
    #print devtestSet
    from nltk import compat
    dev, tag_dev = zip(*devtestSet) # split the featurized dev/test set into data and gold labels
    classifier = SklearnClassifier(classifier) # use the scikit-learn interface from nltk
    #x,y in  list(compat.izip(*trainSet))
    classifier.train(trainSet) # train the classifier
    #help('SklearnClassifier.batch_classify')
    pred = classifier.classify_many(dev) # predict labels for the dev/test data
    return accuracy_score(tag_dev, pred) # compare predictions against the gold labels
def store_classifier(clf, trainset, filepath):
    """Train *clf* on *trainset* and write per-sentence pos/neg
    probabilities (tab-separated, one line per entry of the module-level
    ``sen_cur``) to *filepath*."""
    classifier = SklearnClassifier(clf)
    classifier.train(trainset)

    pred = classifier.prob_classify_many(extract_features(sentiment))
    # Write results; ``with`` closes the file even on error (the original
    # handle was only closed on the success path).
    with open(filepath,'w+') as p_file:
        # for i in pred:
        #     p_file.write(str(i.prob('pos'))+' '+str(i.prob('neg')))
        for (i,j) in zip(pred,sen_cur):
            p_file.write(str(i.prob('pos'))+'\t'+str(i.prob('neg'))+'\t'+j + '\n')
Exemple #27
0
def train():
    """Train a tf-idf + chi2 feature selection + MultinomialNB pipeline on
    the useful/not line corpora and return the nltk-wrapped classifier."""
    pipeline = Pipeline([('tfidf', TfidfTransformer()),
                         ('chi2', SelectKBest(chi2, k=40)),
                         ('nb', MultinomialNB())])
    classif = SklearnClassifier(pipeline)

    # One FreqDist per line of each corpus file.  ``with`` closes the
    # handles (the originals were leaked).
    # NOTE(review): paths are hard-coded to one machine.
    with open('/home/mel/workspace/datascience/assignment5_kaggle/data/useful.txt', 'r') as fh:
        pos = [FreqDist(i) for i in fh.readlines()]
    with open('/home/mel/workspace/datascience/assignment5_kaggle/data/not.txt', 'r') as fh:
        neg = [FreqDist(i) for i in fh.readlines()]
    add_label = lambda lst, lab: [(x, lab) for x in lst]
    classif.train(add_label(pos, 'pos') + add_label(neg, 'neg'))
    return classif
Exemple #28
0
	def learn_model(self,featuresets):
		"""
		trains and tests the logistic regression classifier on the data:
		shuffles *featuresets*, splits 3:1 train:test, and prints the test
		accuracy as a percentage (Python 2 print statement)
		"""
		random.shuffle(featuresets)
	
		limit = int(0.75*len(featuresets)) #partitioning 3:1 for train:test
		train_set = featuresets[:limit]
		test_set = featuresets[limit:]
	
		lr_classifier = SklearnClassifier(LogisticRegression())
		lr_classifier.train(train_set)
		
		print 'Logistic classifier Accuracy : ',str(nltk.classify.accuracy(lr_classifier,test_set)*100)
Exemple #29
0
def performTestValidation(trainset, testset, sklearnclassifier, uniqLabels):
    """Train *sklearnclassifier* on *trainset* and report test metrics.

    Prints the overall accuracy on *testset* followed by precision, recall
    and F-score for every label in *uniqLabels*.
    """
    classifier = SklearnClassifier(sklearnclassifier).train(trainset)
    gold = [lab for _feats, lab in testset]
    predicted = classifier.classify_many([feats for feats, _lab in testset])

    precisions, recalls, fscores, _support = precision_recall_fscore_support(
        gold, predicted, pos_label=None, labels=uniqLabels)
    accuracy = accuracy_score(gold, predicted)

    print("Test accuracy: {0:.3f}".format(accuracy))
    for label, prec, rec, fsc in zip(uniqLabels, precisions, recalls, fscores):
        print("Precision for {0}: {1:.3f}".format(label, prec))
        print("Recall for {0}: {1:.3f}".format(label, rec))
        print("F score for {0}: {1:.3f}".format(label, fsc))
    def getSubjObj(self, text):
        """Classify *text* as "neutral", "positive" or "negative".

        First runs the subjectivity/objectivity classifier on the bigram
        features; objective text is "neutral".  Otherwise the polarity
        classifier decides positive vs negative.
        """
        words = Text(text.split(" "))
        bigrams = self.getBigrams(words)
        subjclassifier = self.loadSOClsssifier()
        posnegclassifier = self.loadPNClsssifier()

        # Call classify() on the instances directly instead of the old
        # unbound SklearnClassifier.classify(instance, feats) form.
        if subjclassifier.classify(bigrams) == "objective":
            return "neutral"

        if posnegclassifier.classify(bigrams) == "negative":
            return "negative"
        return "positive"
# Shuffle and split the feature sets: first 10k train, remainder test.
random.shuffle(featuresets)
print(len(featuresets))

testing_set = featuresets[10000:]
training_set = featuresets[:10000]

classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Original Naive Bayes Algo accuracy percent:",
      (nltk.classify.accuracy(classifier, testing_set)) * 100)
classifier.show_most_informative_features(15)
###############
# ``with`` closes each pickle file deterministically — the original left
# the BernoulliNB file unclosed entirely.
with open("pickled_algos/originalnaivebayes5k.pickle", "wb") as save_classifier:
    pickle.dump(classifier, save_classifier)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:",
      (nltk.classify.accuracy(MNB_classifier, testing_set)) * 100)

with open("pickled_algos/MNB_classifier5k.pickle", "wb") as save_classifier:
    pickle.dump(MNB_classifier, save_classifier)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:",
      (nltk.classify.accuracy(BernoulliNB_classifier, testing_set)) * 100)

with open("pickled_algos/BernoulliNB_classifier5k.pickle", "wb") as save_classifier:
    pickle.dump(BernoulliNB_classifier, save_classifier)
def trainAndPickleAllClassifiers(training_set):
    """Train the NLTK NB baseline plus six sklearn-wrapped classifiers on
    *training_set* and pickle each one under picklefiles_eng/."""

    classifier = nltk.NaiveBayesClassifier.train(training_set)
    savePickle(classifier, "picklefiles_eng/basicClassifier.pickle")
    print("Basic classifier saved")

    # (estimator, pickle path, short name used in the progress message) —
    # replaces six copy-pasted train/pickle/print stanzas.
    sk_models = [
        (MultinomialNB(), "picklefiles_eng/MNBClassifier.pickle", "MNB"),
        (BernoulliNB(), "picklefiles_eng/BNBClassifier.pickle", "BNB"),
        (LogisticRegression(solver='liblinear'),
         "picklefiles_eng/LRClassifier.pickle", "LRC"),
        (SGDClassifier(), "picklefiles_eng/SGDClassifier.pickle", "SGD"),
        (LinearSVC(max_iter=10000),
         "picklefiles_eng/LinearSVCClassifier.pickle", "LinearSVC"),
        (NuSVC(gamma='auto'),
         "picklefiles_eng/NUSVCClassifier.pickle", "NuSVC"),
    ]
    for estimator, path, name in sk_models:
        wrapped = SklearnClassifier(estimator)
        wrapped.train(training_set)
        savePickle(wrapped, path)
        print("{} classifier saved".format(name))
    def train(self):
        """Build adjective features from the positive/negative corpora,
        train six classifiers, and pickle everything under pickle/."""
        self.pos = open("data/positive.txt", "r").read()
        self.neg = open("data/negative.txt", "r").read()
        self.words = []
        self.doc = []

        # Collect (sentence, label) pairs and every adjective (POS tag J*).
        for p in self.pos.split('\n'):
            self.doc.append((p, "pos"))
            words = word_tokenize(p)
            pos = nltk.pos_tag(words)
            for w in pos:
                if w[1][0] in ["J"]:
                    self.words.append(w[0].lower())

        for p in self.neg.split('\n'):
            self.doc.append((p, "neg"))
            words = word_tokenize(p)
            pos = nltk.pos_tag(words)
            for w in pos:
                if w[1][0] in ["J"]:
                    self.words.append(w[0].lower())

        pickle.dump(self.doc, open("pickle/doc.pickle", "wb"))
        self.words = nltk.FreqDist(self.words)
        # Top 5000 adjectives form the feature vocabulary.  (The original
        # comprehension unpacked into ``selfi``/``self.c`` and read the
        # never-assigned ``self.i`` -> AttributeError.)
        self.wordFeat = [w for (w, c) in self.words.most_common(5000)]
        pickle.dump(self.wordFeat, open("pickle/wordFeat.pickle", "wb"))
        self.featSet = [(trainClassifier().featureFind(self.rev,
                                                       self.wordFeat),
                         self.category)
                        for (self.rev, self.category) in self.doc]
        random.shuffle(self.featSet)
        self.testSet = self.featSet[10000:]
        self.triainSet = self.featSet[:10000]  # sic: name kept for callers
        pickle.dump(self.featSet, open("pickle/featSet.pickle", "wb"))
        ONB = nltk.NaiveBayesClassifier.train(self.triainSet)
        # ``nltk.clify`` does not exist; the accuracy helper lives in
        # ``nltk.classify`` (fixed in all six evaluations below).
        print("Original Naive Bayes Algo accuracy:",
              round((nltk.classify.accuracy(ONB, self.testSet)) * 100, 2), "%")
        pickle.dump(ONB, open("pickle/ONB.pickle", "wb"))
        MNB = SklearnClassifier(MultinomialNB())
        MNB.train(self.triainSet)
        print("MultinomialNB accuracy:",
              round((nltk.classify.accuracy(MNB, self.testSet)) * 100, 2), "%")
        pickle.dump(MNB, open("pickle/MNB.pickle", "wb"))
        BNB = SklearnClassifier(BernoulliNB())
        BNB.train(self.triainSet)
        print("BernoulliNB accuracy percent:",
              round((nltk.classify.accuracy(BNB, self.testSet)) * 100, 2), "%")
        pickle.dump(BNB, open("pickle/BNB.pickle", "wb"))
        LR = SklearnClassifier(LogisticRegression())
        LR.train(self.triainSet)
        print("LogisticRegression accuracy:",
              round((nltk.classify.accuracy(LR, self.testSet)) * 100, 2), "%")
        pickle.dump(LR, open("pickle/LR.pickle", "wb"))
        LSVC = SklearnClassifier(LinearSVC())
        LSVC.train(self.triainSet)
        print("LinearSVC accuracy:",
              round((nltk.classify.accuracy(LSVC, self.testSet)) * 100, 2), "%")
        pickle.dump(LSVC, open("pickle/LSVC.pickle", "wb"))
        SGDC = SklearnClassifier(SGDClassifier())
        SGDC.train(self.triainSet)
        print("SGDClassifier accuracy:",
              round(nltk.classify.accuracy(SGDC, self.testSet) * 100, 2), "%")
        pickle.dump(SGDC, open("pickle/SGDC.pickle", "wb"))
def LogisticRegressionAlgorithm(training_set):
    """Fit a logistic-regression model on `training_set`.

    Args:
        training_set: iterable of (feature-dict, label) pairs as produced
            by the feature extractors in this module.

    Returns:
        The trained nltk ``SklearnClassifier`` wrapper around
        ``sklearn.linear_model.LogisticRegression``.
    """
    wrapped = SklearnClassifier(LogisticRegression())
    wrapped.train(training_set)
    return wrapped
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter=100),
    MultinomialNB(),
    SVC(kernel='linear')
     
    ]

# Pair each human-readable model name with its (untrained) estimator.
# NOTE(review): `zipper` is presumably a project helper equivalent to
# zip(names, classifiers) — confirm it is defined/imported elsewhere.
models = zipper(names,classifiers)

#Wrap models in nltk and find their accuracy then select best method
from nltk.classify.scikitlearn import SklearnClassifier

# Train every candidate on `training` and report its accuracy on `testing`
# (both featureset lists are defined elsewhere in this module).
for name,model in models:
    nltk_model=SklearnClassifier(model)
    nltk_model.train(training)
    accuracy=nltk.classify.accuracy(nltk_model,testing)*100
    print('{}: Accuracy: {}'.format(name,accuracy))
    
#ensemble method -- Voting Classifier for better accuracy
    
from sklearn.ensemble import VotingClassifier

names=['K Nearest Neighbors','Decision Tree','Random Forest','Logistic Regression','SGD Classifier','Naive Bayes','SVM Linear']
classifiers=[
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter=100),
def main():
  """Train and pickle sentiment classifiers for Arabic tweets.

  Pipeline: load and preprocess stopwords, read the SemEval training file
  plus the numbered Positive/Negative document files, build a 4000-word
  feature vocabulary, then train, evaluate and pickle a NaiveBayes and a
  LinearSVC classifier.  Relies on module-level helpers `preprocess`,
  `find_features`, `train_test_split`, `sentAnalysis` and `sentAnalysisSVM`.
  """
  # --- stopwords --------------------------------------------------------
  stopwords = []
  with open('stopwords.txt', 'r', encoding="utf-8") as sw:
    for s_word in sw:
      stopwords.append(s_word.strip())

  # We also need to preprocess stopwords to get rid of zairs, zabars, shadds, etc.
  stopwords = ' '.join(stopwords)
  stopwords = preprocess([stopwords], stopwords, True)

  pos = []
  neg = []
  neu = []
  # SemEval file: each line is "<id>\t<sentiment>\t<tweet>".
  with open('dataset/SemEval2017-task4-train.subtask-A.arabic.txt',
            encoding="utf-8") as semeval:
    for line in semeval:
      data = line.split("\t")
      tweet = data[2]
      sentiment = data[1]
      if sentiment == 'positive':
        pos.append(tweet)
      elif sentiment == 'neutral':
        neu.append(tweet)
      else:
        neg.append(tweet)

  def _read_numbered(prefix, count, bucket):
    # Read `prefix`<i>.txt for i in 1..count-1 line by line into `bucket`,
    # silently skipping missing/unreadable files.
    # BUG FIX: the original did `num = num + 1` when a file was missing,
    # but mutating the bound of an already-created range() is a no-op, so
    # the increment is dropped here.
    for i in range(1, count):
      try:
        with open(prefix + str(i) + '.txt', encoding='utf-8') as fh:
          bucket.extend(fh)
      except OSError:
        pass

  _read_numbered('dataset/Positive/positive', 728, pos)
  _read_numbered('dataset/Negative/negative', 338, neg)

  print("Positive:", len(pos))
  print("Negative:", len(neg))
  print("Neutral:", len(neu))

  print("Preprocessing All Files -->")
  all_words = []
  documents = []

  def _collect(texts, label):
    # Label every usable text, preprocess it, and pour its tokens into
    # all_words.  Degenerate preprocess() output is skipped (best effort,
    # mirroring the original bare `except: pass`).
    for text in texts:
      if len(text) > 2:
        documents.append((text, label))
        words = preprocess([text], stopwords)
        try:
          for w in words[0]:
            all_words.append(w)
        except (IndexError, TypeError):
          pass

  _collect(pos, 1)
  _collect(neg, -1)
  _collect(neu, 0)

  print("Creating Word Features for all Documents-->")

  # Frequency Distribution.  NOTE: the original keeps the first 4000
  # insertion-ordered keys, not the 4000 most common; preserved as-is.
  all_words = nltk.FreqDist(all_words)
  word_features = list(all_words.keys())[:4000]

  # Pickling the word features
  with open("word_features.pickle", "wb") as save_word_features:
    pickle.dump(word_features, save_word_features)

  # Pickling stopwords
  with open("stopwords.pickle", "wb") as save_stopwords:
    pickle.dump(stopwords, save_stopwords)

  featuresets = [(find_features(rev, stopwords, word_features), category)
                 for (rev, category) in documents]
  print("Preparing the Testing/Training Dataset")
  random.shuffle(featuresets)
  training_set, testing_set = train_test_split(featuresets)

  # Training and successive pickling of the classifiers.
  print("Training the NaiveBayes Classifier-->")
  nbclassifier = nltk.NaiveBayesClassifier.train(training_set)
  print("Original Naive Bayes Algo accuracy percent:",
        (nltk.classify.accuracy(nbclassifier, testing_set)) * 100)
  nbclassifier.show_most_informative_features(15)
  # BUG FIX: the original opened this handle and never closed it.
  with open("nbclassifier.pickle", "wb") as save_nb_classifier:
    pickle.dump(nbclassifier, save_nb_classifier)

  t = documents[14][0]
  pp, p = sentAnalysis(t, word_features, nbclassifier, stopwords)
  print(pp.samples())
  print("NaiveBayes Prediction: ", p)

  print("Training the Linear SVM Classifier-->")
  LinearSVC_classifier = SklearnClassifier(LinearSVC())
  LinearSVC_classifier.train(training_set)

  print("Accuracy the Linear SVM Classifier-->")
  print("LinearSVC_classifier accuracy percent:",
        (nltk.classify.accuracy(LinearSVC_classifier, testing_set)) * 100)

  with open("LinearSVC_classifier5k.pickle", "wb") as save_classifier:
    pickle.dump(LinearSVC_classifier, save_classifier)

  t = documents[0][0]
  p = sentAnalysisSVM(t, word_features, LinearSVC_classifier, stopwords)
  print("SVM Prediction: ", p)
Exemple #37
0
    def createClassifiers(self):
        """Build featuresets from trainingData/*.txt and train a classifier suite.

        Each .txt file name (sans extension) is used as the category label for
        every line it contains.  Trains NaiveBayes plus five scikit-learn
        models, pickling each one under pickled_algos/.

        Returns:
            dict mapping classifier name -> accuracy percentage on the
            held-out half of the shuffled featuresets.
        """
        if not os.path.exists('pickled_algos'):
            os.makedirs('pickled_algos')
        logger.debug('list file %s', glob.glob("trainingData/*.txt"))
        trainingFiles = glob.glob("trainingData/*.txt")
        # Keep only nouns/verbs/adjectives (first letter of the POS tag).
        allowed_word_types = ['N', 'V', 'J']
        otherStopWords = [
            'for', 'best', 's', 'amp', 'a', 'the', 'me', 'at', 'here',
            'chennai'
        ]
        stop_words = set(stopwords.words('english'))
        classifierResult = {}
        for trFile in trainingFiles:
            # Category label = file name without directory or extension.
            category = os.path.splitext(os.path.basename(trFile))[0]
            with open(trFile, "r") as fh:  # original leaked this handle
                short_pos = fh.read()
            for p in short_pos.split('\n'):
                # Strip @mentions, URLs and non-alphanumeric characters.
                sent = ' '.join(
                    re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)",
                           " ", p).split())
                self.documents.append((sent, category))
                for token, tag in nltk.pos_tag(word_tokenize(sent)):
                    word = token.lower()
                    if tag[0] in allowed_word_types and word not in stop_words \
                            and word not in otherStopWords:
                        self.all_words.append(word)

        with open("pickled_algos/documents.pickle", "wb") as save_documents:
            pickle.dump(self.documents, save_documents)
        self.all_words = nltk.FreqDist(self.all_words)
        # First 5000 insertion-ordered keys (NOT most_common) — kept as-is.
        self.word_features = list(self.all_words.keys())[:5000]
        with open("pickled_algos/word_features5k.pickle", "wb") as save_word_features:
            pickle.dump(self.word_features, save_word_features)

        featuresets = [(self.find_features(rev), category)
                       for (rev, category) in self.documents]
        logger.debug('length of feature sets %s', len(featuresets))
        random.shuffle(featuresets)
        with open("pickled_algos/featuresets.pickle", "wb") as save_feature_sets:
            pickle.dump(featuresets, save_feature_sets)
        print(len(featuresets))
        # BUG FIX: len(...)/2 is a float in Python 3 and float slice indices
        # raise TypeError; use integer floor division.
        splitLength = len(featuresets) // 2
        testing_set = featuresets[splitLength:]
        training_set = featuresets[:splitLength]

        classifier = nltk.NaiveBayesClassifier.train(training_set)
        print("Original Naive Bayes Algo accuracy percent:",
              (nltk.classify.accuracy(classifier, testing_set)) * 100)
        classifier.show_most_informative_features(15)

        with open("pickled_algos/originalnaivebayes5k.pickle", "wb") as save_classifier:
            pickle.dump(classifier, save_classifier)

        def _train_and_save(result_key, estimator, pickle_name, label):
            # Wrap one sklearn estimator for NLTK, train it, report/record
            # its accuracy once (the original recomputed it twice), and
            # pickle the trained wrapper.
            clf = SklearnClassifier(estimator)
            clf.train(training_set)
            accuracy = nltk.classify.accuracy(clf, testing_set) * 100
            print(label + " accuracy percent:", accuracy)
            classifierResult[result_key] = accuracy
            with open("pickled_algos/" + pickle_name, "wb") as out:
                pickle.dump(clf, out)

        _train_and_save('MNB_classifier', MultinomialNB(),
                        'MNB_classifier5k.pickle', 'MNB_classifier')
        _train_and_save('BernoulliNB_classifier', BernoulliNB(),
                        'BernoulliNB_classifier5k.pickle',
                        'BernoulliNB_classifier')
        _train_and_save('LogisticRegression_classifier', LogisticRegression(),
                        'LogisticRegression_classifier5k.pickle',
                        'LogisticRegression_classifier')
        _train_and_save('LinearSVC_classifier', LinearSVC(),
                        'LinearSVC_classifier5k.pickle',
                        'LinearSVC_classifier')
        # NuSVC was intentionally disabled in the original; kept out.
        _train_and_save('SGDClassifier', SGDClassifier(),
                        'SGDC_classifier5k.pickle', 'SGDClassifier')
        return classifierResult
Exemple #38
0
                                         test_results[label])
        metrics_results['F-measure [{0}]'.format(label)] = f_measure_score

    for result in sorted(metrics_results):
        print('{0}: {1}'.format(result, metrics_results[result]))

    if output:
        output_markdown(output,
                        Approach='Vader',
                        Dataset='labeled_tweets',
                        Instances=n_instances,
                        Results=metrics_results)


if __name__ == '__main__':
    # Demo entry point: build three "trainer" callables and run the tweet demo.
    from nltk.classify import NaiveBayesClassifier, MaxentClassifier
    from nltk.classify.scikitlearn import SklearnClassifier
    from sklearn.svm import LinearSVC

    # Each name below is a train() callable: it takes labelled featuresets
    # and returns a fitted classifier.
    naive_bayes = NaiveBayesClassifier.train
    svm = SklearnClassifier(LinearSVC()).train
    maxent = MaxentClassifier.train

    # Only the tweets demo runs by default; the rest are kept for reference.
    demo_tweets(naive_bayes)
    # demo_movie_reviews(svm)
    # demo_subjectivity(svm)
    # demo_sent_subjectivity("she's an artist , but hasn't picked up a brush in a year . ")
    # demo_liu_hu_lexicon("This movie was actually neither that funny, nor super witty.", plot=True)
    # demo_vader_instance("This movie was actually neither that funny, nor super witty.")
    # demo_vader_tweets()
Exemple #39
0
# Build one (feature-dict, label) pair per labelled document.
# IDIOM FIX: replaced the manual for/append loop with a comprehension.
featuresets = [(find_features(rev), category) for (rev, category) in documents]

# Fixed split: first 1900 examples train, the remainder test.
training_set = featuresets[:1900]
testing_set = featuresets[1900:]

# posterior = prior occurence x liklihood / evidence

classifier = nltk.NaiveBayesClassifier.train(training_set)

print("Original Naive Bayes Algo accuracy percent:",
      (nltk.classify.accuracy(classifier, testing_set)) * 100)
classifier.show_most_informative_features(15)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:",
      (nltk.classify.accuracy(MNB_classifier, testing_set)) * 100)

# ERROR — GaussianNB disabled: presumably it fails on the sparse feature
# vectors the wrapper produces (it needs dense arrays); confirm before
# re-enabling.
#GNB_classifier = SklearnClassifier(GaussianNB())
#GNB_classifier.train(training_set)
#print("GNB_classifier accuracy percent:", (nltk.classify.accuracy(GNB_classifier, testing_set)) * 100)

BNB_classifier = SklearnClassifier(BernoulliNB())
BNB_classifier.train(training_set)
print("BNB_classifier accuracy percent:",
      (nltk.classify.accuracy(BNB_classifier, testing_set)) * 100)

##LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
    for w in word_features:
        features[w] = (w in words)
    return features


# Build (feature-dict, label) pairs: every review is tokenised, its verbs
# lemmatised, re-joined into a plain string, and converted to features.
featuresets = [
    (find_features(' '.join(lemmatize_verbs(word_tokenize(rev))).strip()),
     category)
    for (rev, category) in documents
]

random.shuffle(featuresets)

# 70:30 ratio of 10664 data
training_set = featuresets[:7465]
testing_set = featuresets[7465:]

classifier = SklearnClassifier(
    LogisticRegression(solver='lbfgs')).train(training_set)

# Persist the trained model; the context manager closes the handle for us.
with open("pickled/logisticreg.pickle", "wb") as save_classifier:
    pickle.dump(classifier, save_classifier, pickle.HIGHEST_PROTOCOL)

print('Logistic Regression Accuracy: ',
      (nltk.classify.accuracy(classifier, testing_set)) * 100)
voted_labels = Classify(classifier)
print("Classification:", voted_labels.classify(testing_set[0][0]))
Exemple #41
0
def train(trainfeats, testfeats, nlt=True, skl=True, most=0):
    """Train NLTK and/or scikit-learn classifiers and summarise their scores.

    Args:
        trainfeats: labelled featuresets used for training.
        testfeats: labelled featuresets used for evaluation.
        nlt: when True, train/evaluate an NLTK NaiveBayes classifier.
        skl: when True, train/evaluate the scikit-learn suite plus the
            VoteClassifier ensemble.
        most: how many most-informative features to show for NaiveBayes.

    Returns:
        (nltk_output, sklearn_output) — comma-separated summary strings,
        or "none" for a branch that was skipped.
    """
    nltk_output = "none"
    sklearn_output = "none"

    if nlt:
        my_classifier = NaiveBayesClassifier.train(trainfeats)
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)

        # Bucket test instances by gold label (refsets) and predicted
        # label (testsets) so precision/recall can be computed per class.
        for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            testsets[my_classifier.classify(feats)].add(i)

        # Accuracy plus per-class precision/recall, as percentages
        # rounded to one decimal place.
        accuracy = round(
            nltk.classify.util.accuracy(my_classifier, testfeats) * 100, 1)
        pos_prec = round(precision(refsets['pos'], testsets['pos']) * 100, 1)
        pos_rec = round(recall(refsets['pos'], testsets['pos']) * 100, 1)
        neg_prec = round(precision(refsets['neg'], testsets['neg']) * 100, 1)
        neg_rec = round(recall(refsets['neg'], testsets['neg']) * 100, 1)

        my_classifier.show_most_informative_features(most)

        nltk_output = ("nlt, " + str(accuracy) + ", " + str(pos_prec) + ", " +
                       str(neg_prec) + ", " + str(pos_rec) + ", " +
                       str(neg_rec) + "\n")

    if skl:
        def _fit(estimator):
            # Wrap one sklearn estimator for NLTK, train it, and print +
            # return its rounded accuracy.  (The original repeated this
            # stanza five times.)
            clf = SklearnClassifier(estimator)
            # NOTE(review): disabling the vectorizer's sort mirrors the
            # original; presumably it keeps feature order stable — confirm.
            clf._vectorizer.sort = False
            clf.train(trainfeats)
            acc = round(nltk.classify.accuracy(clf, testfeats) * 100, 1)
            print(acc)
            return clf, acc

        MNB_classifier, mnb = _fit(MultinomialNB())
        BernoulliNB_classifier, bnb = _fit(BernoulliNB())
        LogisticRegression_classifier, lr = _fit(LogisticRegression())
        LinearSVC_classifier, lsvc = _fit(LinearSVC())
        NuSVC_classifier, nsvc = _fit(NuSVC())

        # Majority-vote ensemble over the five trained models.
        voted_classifier = VoteClassifier(NuSVC_classifier,
                                          LinearSVC_classifier, MNB_classifier,
                                          BernoulliNB_classifier,
                                          LogisticRegression_classifier)
        voted = round(
            nltk.classify.accuracy(voted_classifier, testfeats) * 100, 1)
        print(voted)

        sklearn_output = ("skl, " + str(mnb) + ", " + str(bnb) + ", " +
                          str(lr) + ", " + str(lsvc) + ", " + str(nsvc) +
                          ", " + str(voted) + "\n")

    return (nltk_output, sklearn_output)
Exemple #42
0
def MultinomialNaiveBayes():
    """Fit a multinomial Naive Bayes model on the module-level training_set.

    Returns:
        The trained ``SklearnClassifier`` wrapper.
    """
    clf = SklearnClassifier(MultinomialNB())
    clf.train(training_set)
    return clf
Exemple #43
0
# Build (feature-dict, label) pairs for the first 15000 tweets.
featuresets = [(find_features(text), category)
               for (text, category) in twitterDataset[:15000]]

random.shuffle(featuresets)
training_set = featuresets[:12000]
# BUG FIX: the original used featuresets[:5000], which overlaps the
# training slice and leaks training data into evaluation; hold out the
# remaining 3000 examples instead.
testing_set = featuresets[12000:]
print("Done with generating training_set and testing_set")

# pickle classifiers for easier use in the future
NB_classifier = nltk.NaiveBayesClassifier.train(training_set)
with open("classifiers/naive_bayes.pickle", "wb") as save_classifier1:
    pickle.dump(NB_classifier, save_classifier1)
print("Done training and pickling NB_classifier")

MNB_classifier = SklearnClassifier(MultinomialNB()).train(training_set)
with open("classifiers/multinomial_naive_bayes.pickle", "wb") as save_classifier2:
    pickle.dump(MNB_classifier, save_classifier2)
print("Done training and pickling MNB_classifier")

BNB_classifier = SklearnClassifier(BernoulliNB()).train(training_set)
with open("classifiers/bernoulli_naive_bayes.pickle", "wb") as save_classifier3:
    pickle.dump(BNB_classifier, save_classifier3)
print("Done training and pickling BNB_classifier")

LinearSVC_classifier = SklearnClassifier(LinearSVC()).train(training_set)
# BUG FIX: this handle was never closed in the original excerpt.
with open("classifiers/linear_support_vector_classification.pickle",
          "wb") as save_classifier4:
    pickle.dump(LinearSVC_classifier, save_classifier4)
Exemple #44
0
    save_word_features = open("word_features_sentiment_lda.pickle", "wb")
    pickle.dump(word_features_lda, save_word_features)
    save_word_features.close()

    featuresets = [(find_features(rev, word_features_lda), categorie)
                   for (rev, categorie) in documents
                   ]  # Retourne une liste de dict ou chaque mot est une clé
    #print("featuresets : ", featuresets)

    random.shuffle(featuresets)
    print("nombre de mot pertinant : ", len(featuresets))
    nbset = int(len(featuresets) / 2)
    testing_set = featuresets[nbset:]
    training_set = featuresets[:nbset]

    LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
    LogisticRegression_classifier.train(training_set)
    #print("sklearn classifier créer en LogisticRegression : \n",LogisticRegression_classifier)
    #LogisticRegression_classifier.fit(training_set)
    #print(LogisticRegression_classifier)
    print(
        "LogisticRegression_classifier accuracy percent:",
        (nltk.classify.accuracy(LogisticRegression_classifier, testing_set)) *
        100)

    print("Labels :", LogisticRegression_classifier.labels())
    '''
    dictum = [tupl[0] for tupl in testing_set]            
    try :
        print("classify many:" , LogisticRegression_classifier.classify_many(dictum)) 
    except :
Exemple #45
0
]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    MultinomialNB(),
    SVC(kernel='linear')
]

models = list(zip(names, classifiers))

# check accuracy for each model
for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing) * 100
    print("{} Accuracy: {}".format(name, accuracy))
"""
# Ensemble methods - Voting classifier
from sklearn.ensemble import VotingClassifier

names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
Exemple #46
0
def main():

    df = pd.read_csv('/Users/sid/Desktop/gender-classifier-DFE-791531.csv',
                     encoding='latin1')
    #df = shuffle(shuffle(shuffle(df)))
    print(df.head(10))

    all_descriptions = df['description']
    all_tweets = df['text']
    all_genders = df['gender']
    all_gender_confidence = df['gender:confidence']
    description_tweet_gender = []

    # comment out if running the program for the first time in order to download the stopwords data file
    '''
    try:
        _create_unverified_https_context = ssl._create_unverified_context
    except AttributeError:
        pass
    else:
        ssl._create_default_https_context = _create_unverified_https_context

    nltk.download() 
    '''
    # Creation of bag of words for description
    bag_of_words_male = []
    bag_of_words_female = []
    c = 0  # for the index of the row
    stop = stopwords.words('english')
    for tweet in all_tweets:
        description = all_descriptions[c]
        gender = all_genders[c]
        gender_confidence = all_gender_confidence[c]

        if str(tweet) == 'nan':
            tweet = ''
        if str(description) == 'nan':
            description = ''

        # removal of punctuations
        for punct in string.punctuation:
            if punct in tweet:
                tweet = tweet.replace(punct, " ")
            if punct in description:
                description = description.replace(punct, " ")

        # remove the rows which has an empty tweet and description
        # remove the rows with unknown or empty gender
        # remove the rows which have gender:confidence < 80%
        if (str(tweet) == 'nan'
                and str(description) == 'nan') or str(gender) == 'nan' or str(
                    gender) == 'unknown' or float(gender_confidence) < 0.8:
            c += 1
            continue

        if str(gender) == 'male':
            bag_of_words_male = add_to_bag(bag_of_words_male, tweet,
                                           description, stop)
            description_tweet_gender.append(
                (tweet + " " + description, gender))

        elif str(gender) == 'female':
            bag_of_words_female = add_to_bag(bag_of_words_female, tweet,
                                             description, stop)
            description_tweet_gender.append(
                (tweet + " " + description, gender))

        c += 1

    print(len(bag_of_words_male))
    print(len(bag_of_words_female))

    common_words_ratio = []
    common_words = list(
        set(bag_of_words_male).intersection(bag_of_words_female))
    uniquewords_male = list(
        set(bag_of_words_male).difference(set(bag_of_words_female)))
    uniquewords_female = list(
        set(bag_of_words_female).difference(set(bag_of_words_male)))

    bag_of_words = nltk.FreqDist(common_words)
    top_words = []
    for word in bag_of_words.most_common(100):
        top_words.append(word[0])
    print("Number of common words", top_words)

    bag_of_words = nltk.FreqDist(uniquewords_male)
    top_words = []
    for word in bag_of_words.most_common(100):
        top_words.append(word[0])
    print("Number of common words", top_words)

    bag_of_words = nltk.FreqDist(uniquewords_female)
    top_words = []
    for word in bag_of_words.most_common(100):
        top_words.append(word[0])
    print("Number of common words", top_words)

    print("Number of common words", len(common_words))
    print("Number of unique words", len(uniquewords_male))
    print("Number of unique words", len(uniquewords_female))

    c = 0
    for word in common_words:
        r = bag_of_words_male.count(word) / bag_of_words_female.count(word)
        if r > 1:
            common_words_ratio.append(r)
        else:
            common_words_ratio.append(-1 / r)
        c += 1

    print("Maximum ratio", max(common_words_ratio))
    print("Minimum ratio", min(common_words_ratio))

    N_common_words_ratio = np.array(common_words_ratio)

    mu = np.mean(N_common_words_ratio)  # mean of distribution
    sigma = np.std(N_common_words_ratio)  # standard deviation of distribution
    num_bins = 35

    fig, ax = plt.subplots()

    # the histogram of the data
    n, bins, patches = ax.hist(N_common_words_ratio, num_bins)

    # add a 'best fit' line
    y = ((1 / (np.sqrt(2 * np.pi) * sigma)) * np.exp(-0.5 * (1 / sigma *
                                                             (bins - mu))**2))
    ax.plot(bins, y, '--')
    ax.set_xlabel('ratio_of_common_words_count used by Male & Female')
    ax.set_ylabel('frequency')
    ax.set_title("Histogram: $\mu=$" + str(mu) + " $\sigma=$" + str(sigma))

    # Tweak spacing to prevent clipping of ylabel
    fig.tight_layout()
    #plt.show()

    c = 0
    for ratio in common_words_ratio:
        if ratio < 1.3:
            del common_words[c]
        else:
            c += 1

    male_wordset = list(set(uniquewords_male).union(set(common_words)))
    female_wordset = list(set(uniquewords_female).union(set(common_words)))

    # creating the feature set, training set and the testing set
    feature_set = [(find_features(male_wordset, female_wordset, text), gender)
                   for (text, gender) in description_tweet_gender]
    training_set = feature_set[:int(len(feature_set) * 4 / 5)]
    testing_set = feature_set[int(len(feature_set) * 4 / 5):]

    print("Size of feature set", len(feature_set))
    print("Size of training set", len(training_set))
    print("Size of testing set", len(testing_set))

    LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
    LogisticRegression_classifier.train(training_set)
    accuracy = nltk.classify.accuracy(LogisticRegression_classifier,
                                      testing_set) * 100
    print("Logistic Regression classifier accuracy =", accuracy)

    X = []
    y = []

    # Creating a different version of the feature set
    c = 0
    for (dicti, gender) in feature_set:
        X.append([])
        a = dicti['mset_count']
        b = dicti['fset_count']
        X[c].append(a)
        X[c].append(b)
        y.append(gender)
        c += 1

    np_X = np.array(X)

    binary_y = []

    for result in y:
        if result == "male":
            binary_y.append(1)
        else:
            binary_y.append(0)

    # Using a contour plot and scatter diagram to plot the data points and  the decisssion boundary

    clf = LogisticRegression(random_state=0,
                             solver='lbfgs',
                             multi_class='multinomial').fit(np_X, binary_y)

    xx, yy = np.mgrid[0:80:1, 0:80:1]
    grid = np.c_[xx.ravel(), yy.ravel()]
    probs = clf.predict_proba(grid)[:, 1].reshape(xx.shape)

    f, ax = plt.subplots(figsize=(8, 6))
    contour = ax.contourf(xx, yy, probs, 25, cmap="RdBu", vmin=0, vmax=1)
    ax_c = f.colorbar(contour)
    ax_c.set_label("$P(y = 1)$")
    ax_c.set_ticks([0, .25, .5, .75, 1])

    ax.scatter(np_X[45:, 0],
               np_X[45:, 1],
               c=binary_y[45:],
               s=35,
               cmap="RdBu",
               vmin=-.2,
               vmax=1.2,
               edgecolor="white",
               linewidth=1)

    ax.set(aspect="equal",
           xlim=(0, 60),
           ylim=(0, 60),
           xlabel="Num_words in male_wordset",
           ylabel="Num_words in female_wordset")

    ax.set_title("Decision Boundary")

    plt.show()

    # code for predicting whether someone is male or female on the basis of
    '''
Exemple #47
0
def LinearSupportVectorClassification():
    """Fit a linear SVM on the module-level training_set.

    Returns:
        The trained ``SklearnClassifier`` wrapper around ``LinearSVC``.
    """
    clf = SklearnClassifier(LinearSVC())
    clf.train(training_set)
    return clf
Exemple #48
0
#!/usr/bin/env python

"""
SVC

Train a support-vector classifier (default RBF kernel) on the training set
shared by main.naive_bayes and report its accuracy on the shared testing set.
"""

__author__ = "Manan Kalra"
__email__ = "*****@*****.**"


import nltk
from nltk.classify.scikitlearn import SklearnClassifier  # wrapper to include the scikit-learn algorithms
from main import naive_bayes as original
from sklearn.svm import SVC

svc_classifier = SklearnClassifier(SVC())
svc_classifier.train(original.training_set)
svc_accuracy = nltk.classify.accuracy(svc_classifier, original.testing_set) * 100

# BUG FIX: the original label said "Bernoulli NB Accuracy" (copy-paste
# leftover) although the value reported is the SVC accuracy.
print("\nSVC Accuracy: ", svc_accuracy)
Exemple #49
0
def BernoulliNaiveBayes():
    """Fit a Bernoulli Naive Bayes model on the module-level training_set.

    Returns:
        The trained ``SklearnClassifier`` wrapper.
    """
    clf = SklearnClassifier(BernoulliNB())
    clf.train(training_set)
    return clf
Exemple #50
0
# Restore the previously pickled NaiveBayes model from `classifier_file`
# (opened above this excerpt).  NOTE(review): pickle.load on untrusted
# files can execute arbitrary code — only load pickles you created.
Classifier = pickle.load(classifier_file)
classifier_file.close()

print("Checking Original Naive Bayes Algo Accuracy Percentage :",
      nltk.classify.accuracy(Classifier, testing_set) * 100)
Classifier.show_most_informative_features(20)

#Saving the classifier using pickle
#In pickle write should be in bytes after python 3 and so on
save_classifier = open("naivebayes.pickle", "wb")
#which you need to save where
pickle.dump(Classifier, save_classifier)
#closing
save_classifier.close()

# Multinomial Naive Bayes via the NLTK -> scikit-learn wrapper.
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("Checking MNB Classifier Accuracy Percentage :",
      nltk.classify.accuracy(MNB_classifier, testing_set) * 100)

# GaussianNB left disabled — presumably it fails on the sparse feature
# vectors this wrapper produces (needs dense arrays); confirm before
# re-enabling.
#GaussianNB_classifier = SklearnClassifier(GaussianNB())
#GaussianNB_classifier.train(training_set)
#print("Checking GaussianNB Classifier Accuracy Percentage :", nltk.classify.accuracy(GaussianNB_classifier,testing_set)*100)

# Bernoulli Naive Bayes (boolean features).
BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("Checking BernoulliNB Classifier Accuracy Percentage :",
      nltk.classify.accuracy(BernoulliNB_classifier, testing_set) * 100)

# Logistic regression classifier (evaluation continues past this excerpt).
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
def testdocumenttester(Test_data):
    """Train six classifiers, combine them in a VoteClassifier ensemble, and
    report how many newline-separated sentences of *Test_data* the ensemble
    labels 'Good' (or 'ExceptGood').

    NOTE(review): relies on module-level globals Train_data, VoteClassifier
    and feature_extraction defined elsewhere in this file; Test_data is the
    only explicit input. Each training step is timed with time.time().
    """

    print("Evaluating Linear SVC Classifier... ")
    start = time.time()
    classifier6 = SklearnClassifier(LinearSVC())
    classifier6.train(Train_data)
    end = time.time()
    t_min = (end - start)  # elapsed seconds (name suggests minutes; value is seconds)
    print("Complete successfully...", t_min, "secs")

    print("Evaluating Logistic Regression Classifier...")
    start = time.time()
    Logistic_regression_classifier = SklearnClassifier(LogisticRegression())
    Logistic_regression_classifier.train(Train_data)
    end = time.time()
    t_min = (end - start)
    print("Complete successfully...", t_min, "secs")

    print("Evaluating Naive Bayes  Classifier..")
    start = time.time()
    Naive_classifier = naivebayes.NaiveBayesClassifier.train(Train_data)
    end = time.time()
    t_min = (end - start)
    print("Complete successfully...", t_min, "secs")

    print("Evaluating Multinomial Classifier")
    start = time.time()
    Multi_Classifier = SklearnClassifier(MultinomialNB())
    Multi_Classifier.train(Train_data)
    end = time.time()
    t_min = (end - start)
    print("Complete successfully...", t_min, "secs")

    print("Evaluating SDG Classifier")
    start = time.time()
    SGD_Classifier = SklearnClassifier(SGDClassifier())
    SGD_Classifier.train(Train_data)
    end = time.time()
    t_min = (end - start)
    print("Complete successfully...", t_min, "secs")

    print("Evaluating Bernoulli Classifier")
    start = time.time()
    BernoulliNB_Classifier = SklearnClassifier(BernoulliNB())
    BernoulliNB_Classifier.train(Train_data)
    end = time.time()
    t_min = (end - start)
    print("Complete successfully...", t_min, "secs")

    # Majority-vote ensemble over all six trained classifiers.
    Voter_Classifier = VoteClassifier(classifier6,
                                      Logistic_regression_classifier,
                                      Naive_classifier, Multi_Classifier,
                                      SGD_Classifier, BernoulliNB_Classifier)

    count = 0         # sentences the ensemble judged correct
    uncount = 0       # sentences the ensemble judged incorrect
    conf_uncount = 0  # NOTE(review): never incremented or read — dead variable
    calsen = Test_data.split('\n')
    total = len(calsen)
    st = 0  # sentences processed so far (drives the progress display)
    for sentence in Test_data.split('\n'):  # re-splits; could reuse calsen
        feature = feature_extraction(sentence)
        vote_result = Voter_Classifier.classify(feature)
        if vote_result != 'ExceptGood':
            confidence = Voter_Classifier.confidence(feature)
            # NOTE(review): both branches below count 'Good' as correct, so
            # the confidence >= 0.6 threshold has no effect on the tallies —
            # this looks like a bug; confirm the intended behaviour before
            # trusting the "conf >= 0.6" figure printed at the end.
            if confidence >= 0.6 and vote_result == 'Good':
                count = count + 1
            else:
                if vote_result == 'Good':
                    count = count + 1
                else:
                    uncount = uncount + 1
        else:
            # 'ExceptGood' answers bypass the confidence check entirely.
            count = count + 1
        st = st + 1
        per = (st / total) * 100
        per2 = int(per)
        print('\rCompleted....', per2, '%', end='', flush=True)

    print("\n\t\t\tAccuracy Status For Numeric\t\t\t")
    print("Number of Statements:\t\t\t", (count + uncount))
    print("Number of Good Answers(conf >= 0.6):\t\t\t", count)
    print("Number of Bad Answers:\t\t\t", uncount)
    print("Confidence Level:\t\t\t>0.6(60%)")
    print("Total Accuracy:\t\t\t", count / (count + uncount) * 100)
# BUGFIX: was a Python-2 print statement (a SyntaxError under Python 3).
# Reports the held-out score of the classifier picked on the dev set
# (NuSVC; BernoulliNB was the alternative candidate).
print(final_score(NuSVC()))
"""
#二、把分类器存储下来
#(存储分类器和前面没有区别,只是使用了更多的训练数据以便分类器更为准确)
word_scores = create_word_bigram_scores()
best_words = find_best_words(word_scores, 1500)

posFeatures = pos_features(best_word_features)
negFeatures = neg_features(best_word_features)

#trainSet = posFeatures + negFeatures
trainSet = posFeatures[:25]+negFeatures[:25]
testSet = posFeatures[25:35]+negFeatures[25:35]
test = posFeatures[35:50]+negFeatures[35:50]

NuSVC_classifier = SklearnClassifier(NuSVC(probability=True))
NuSVC_classifier.train(trainSet)
pickle.dump(NuSVC_classifier, open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/NuSVC_classifier.pkl','w'))

#在存储了分类器之后,就可以使用该分类器来进行分类了。
#三、使用分类器进行分类,并给出概率值
#给出概率值的意思是用分类器判断一条评论文本的积极概率和消极概率。给出类别也是可以的,也就是可以直接用分类器判断一条评论文本是积极的还是消极的,但概率可以提供更多的参考信息,对以后判断评论的效用也是比单纯给出类别更有帮助。

#1. 把文本变为特征表示的形式
#要对文本进行分类,首先要把文本变成特征表示的形式。而且要选择和分类器一样的特征提取方法。
#moto = pickle.load(open('D:/code/review_set/senti_review_pkl/moto_senti_seg.pkl','r')) #载入文本数据
#moto = test
def extract_features(data):
    """Map each document in *data* to its best-word feature representation.

    Applies the module-level best_word_features() to every item and returns
    the resulting list (one feature dict per document).
    """
    feat = []
    for i in data:
        feat.append(best_word_features(i))
    # BUGFIX: the original built `feat` but fell off the end and returned
    # None, discarding the work.
    return feat
# Restore the previously pickled Naive Bayes classifier from the open
# handle `classifier_f`.
# NOTE(review): unpickling executes arbitrary code — trusted files only.
classifier = pickle.load(classifier_f)
classifier_f.close()

accuracy = nltk.classify.accuracy(classifier, test_set)

# Decent range: 65% - 85%
print('Original Naive Bayes Algorithm accuracy percentage: {}%'.format(
    accuracy * 100))
classifier.show_most_informative_features(15)

# Re-pickle the (already trained) classifier so later runs can skip training.
save_classifier = open('naivebayes.pickle', 'wb')
pickle.dump(classifier, save_classifier)
save_classifier.close()

# SklearnClassifier(x) is a nltk wrapper for SK-learn x
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(train_set)
print('MNB_classifier accuracy percent: {}%'.format(
    nltk.classify.accuracy(MNB_classifier, test_set) * 100))

# GaussianNB kept disabled — presumably it fails on the sparse matrices
# the wrapper emits by default; confirm before re-enabling.
# GaussianNB_classifier = SklearnClassifier(GaussianNB())
# GaussianNB_classifier.train(train_set)
# print(
#     'GaussianNB_classifier accuracy percent: {}%'.format(nltk.classify.accuracy(GaussianNB_classifier, test_set) * 100))

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(train_set)
print('BernoulliNB_classifier accuracy percent: {}%'.format(
    nltk.classify.accuracy(BernoulliNB_classifier, test_set) * 100))

# Logistic Regression wrapper; its training call lies outside this excerpt.
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
Exemple #54
0
def StochasticGradientDescent():
    """Train an SGD linear classifier on the module-level training_set.

    Returns the trained NLTK SklearnClassifier wrapper (max_iter=1000,
    tol=None, matching the original configuration).
    """
    wrapped = SklearnClassifier(SGDClassifier(max_iter=1000, tol=None))
    wrapped.train(training_set)
    return wrapped
# Linear SV Classifier.
# NOTE(review): accuracy below is computed on the same `data` the model was
# trained on, so it is a resubstitution (training) score, not an estimate of
# generalisation — confirm whether a held-out set was intended.
lsvc = SklearnClassifier(LinearSVC())
lsvc.train(data)
print('Linear SV Classifier trained.')
print("Linear SV Classifier: {:.2f}%".format(accuracy(lsvc, data) * 100))
save_model(lsvc, './models2.7/lsvc.pickle')

# NuSV Classifier
nusvc = SklearnClassifier(NuSVC())
nusvc.train(data)
print('NuSV Classifier trained.')
print("NuSV Classifier: {:.2f}%".format(accuracy(nusvc, data) * 100))
save_model(nusvc, './models2.7/nusvc.pickle')
"""

# Decision Tree Classifier.
# NOTE(review): as with the classifiers above, accuracy is measured on the
# training data itself (resubstitution score).
dtc = SklearnClassifier(DecisionTreeClassifier())
dtc.train(data)
print('Decision Tree Classifier trained.')
print("Decision Tree Classifier: {:.2f}%".format(accuracy(dtc, data) * 100))
save_model(dtc, './models2.7/dtc.pickle')

# Random Forest Classifier
rfc = SklearnClassifier(RandomForestClassifier())
rfc.train(data)
print('Random Forest Classifier trained.')
print("Random Forest Classifier: {:.2f}%".format(accuracy(rfc, data) * 100))
save_model(rfc, './models2.7/rfc.pickle')

# End
end = timer()
print("Time taken:", end - start)  # total wall-clock time for the run
Exemple #56
0
# Pair every document's feature dict with its category label; the first 1900
# pairs train the models, the remainder evaluates them.
featuresets = [(find_features(rev), category) for (rev, category) in documents
               ]  #making a set for training and testing with categories

training_set = featuresets[:1900]  #training set
testing_set = featuresets[1900:]  #testing set

# Finally, making a classifier object with naive bayes to train and test the dataset. Also printing the first 15 most informative features. Also, using the GaussianNB, MultinomialNB and BernoulliNB classifiers from sklearn library.

classifier = nltk.NaiveBayesClassifier.train(
    training_set)  #making classifier object with naive bayes
print("Original Naive Bayes Algo Accuray percent:",
      (nltk.classify.accuracy(classifier, testing_set)) *
      100)  #prining the accuracy of testing set
classifier.show_most_informative_features(
    15)  #getting first 15 most informative features

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier Algo Accuray percent:",
      (nltk.classify.accuracy(MNB_classifier, testing_set)) * 100)

BNB_classifier = SklearnClassifier(BernoulliNB())
BNB_classifier.train(training_set)
print("BNB_classifier Algo Accuray percent:",
      (nltk.classify.accuracy(BNB_classifier, testing_set)) * 100)

# BUGFIX: GaussianNB cannot consume the sparse matrices NLTK's
# SklearnClassifier produces by default (training raises a TypeError);
# sparse=False makes the wrapper densify the features first.
GNB_classifier = SklearnClassifier(GaussianNB(), sparse=False)
GNB_classifier.train(training_set)
print("GNB_classifier Algo Accuray percent:",
      (nltk.classify.accuracy(GNB_classifier, testing_set)) * 100)
Exemple #57
0
# for the Bayes model
# Per-fold metric accumulators for num_folds-fold cross-validation.
foldAccuracies = []
foldNegativePrecisions = []
foldNegativeRecalls = []
foldNegativeFScores = []
foldPositivePrecisions = []
foldPositiveRecalls = []
foldPositiveFScores = []

for i in range(num_folds):
    # Fold i: one subset_size slice is the test set; the rest trains.
    cv_test = featuresets[i * subset_size:][:subset_size]
    cv_train = featuresets[:i * subset_size] + featuresets[(i + 1) *
                                                           subset_size:]
    # use NB classifier
    classifier = nltk.NaiveBayesClassifier.train(cv_train)
    LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
    #lg = LogisticRegression_classifier.train(cv_train)

    print('  ')
    print('FOLD ' + str(i))
    print('For this fold:')
    #print('Accuracy on Fold Test Set: ' + str(nltk.classify.accuracy(LogisticRegression_classifier, cv_test)))
    # NOTE(review): accuracy is computed twice (print + append); caching the
    # value would halve the evaluation work per fold.
    print('Accuracy on Fold Test Set: ' +
          str(nltk.classify.accuracy(classifier, cv_test)))
    foldAccuracies.append(str(nltk.classify.accuracy(classifier, cv_test)))
    # most informative features
    # now get fold stats such as precision, recall, f score
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    # NOTE(review): this inner loop reuses the name `i`, shadowing the fold
    # index of the outer loop — rename one of them.
    for i, (feats, label) in enumerate(cv_test):
def cat():
    """Train and pickle four classifiers for six-way e-mail categorisation.

    Reads every ``.txt`` file in six hard-coded category directories,
    collects each document's raw text with its label plus all lower-cased
    tokens, builds find_features() feature sets, then trains, pickles and
    reports test accuracy for Naive Bayes, Multinomial NB, Logistic
    Regression and Linear SVM classifiers.
    """
    base = "C:/Users/pravi/Anaconda3/PROGRAMS/Mini_Project/preprocessed_dataset/"
    # (directory, label) pairs — one per target category. The original
    # repeated an identical read-loop six times; it is factored into
    # _load_category below.
    categories = [
        (base + "Business/", "business"),
        (base + "Document editing or checking/", "document_edit"),
        (base + "Employment arrangements/", "employment_arrangements"),
        (base + "Logistic Arrangements/", "logistic_arrangements"),
        (base + "Personal but in professional context/", "personal_prof"),
        (base + "Purely Personal/", "purely_personal"),
    ]

    documents = []   # list of (full_text, label)
    all_words = []   # every token from every document, lower-cased
    for directory, label in categories:
        _load_category(directory, label, documents, all_words)

    # Preserve the original's final working directory (two levels above
    # "Purely Personal/") so the pickle files below land in the same place.
    os.chdir(os.path.join(base, ".."))

    # NOTE(review): this keeps the first 5000 *tokens* (duplicates included),
    # not the 5000 most frequent words; an nltk.FreqDist was probably
    # intended — kept as-is to preserve behaviour.
    word_features = list(all_words)[:5000]

    featuresets = [(find_features(mail, word_features), category)
                   for (mail, category) in documents]

    random.shuffle(featuresets)

    train_num = int(0.85 * (len(featuresets)))  # 85% of featuresets is used for training
    test_num = int(0.15 * (len(featuresets)))   # 15% of featuresets is used for testing
    training_set = featuresets[:train_num]
    testing_set = featuresets[-test_num:]

    classifier = nltk.NaiveBayesClassifier.train(training_set)
    _pickle_to('my_classifier.pickle', classifier)
    print("Naive Bayes accuracy percent: ", (nltk.classify.accuracy(classifier, testing_set))*100)

    MNB_classifier = SklearnClassifier(MultinomialNB())
    MNB_classifier.train(training_set)
    _pickle_to('my_MNB_classifier.pickle', MNB_classifier)
    print("Multinomial NB accuracy percent: ",(nltk.classify.accuracy(MNB_classifier, testing_set))*100)

    LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
    LogisticRegression_classifier.train(training_set)
    _pickle_to('my_logisticRegression_classifier.pickle', LogisticRegression_classifier)
    # return LogisticRegression_classifier.classify(find_features(text))
    print("Logistic Regression accuracy percent: ",(nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

    LinearSVM_classifier = SklearnClassifier(LinearSVC())
    LinearSVM_classifier.train(training_set)
    _pickle_to('my_linear_svc_classifier.pickle', LinearSVM_classifier)
    print("Linear SVM accuracy percent: ",(nltk.classify.accuracy(LinearSVM_classifier, testing_set))*100)


def _load_category(directory, label, documents, all_words):
    """Append (text, label) for every .txt file in *directory* to *documents*
    and every lower-cased token to *all_words* (both lists are mutated)."""
    for entry in os.listdir(os.fsencode(directory)):
        filename = os.fsdecode(entry)
        if filename.endswith(".txt"):
            # 'with' guarantees the handle is closed even if read() raises
            # (the original used bare open()/close()). Full paths replace the
            # original's os.chdir() dance.
            with open(os.path.join(directory, filename), "r") as handle:
                text = handle.read()
            all_words.extend(w.lower() for w in word_tokenize(text))
            documents.append((text, label))


def _pickle_to(path, obj):
    """Serialise *obj* to *path* with pickle, closing the file reliably."""
    with open(path, 'wb') as sink:
        pickle.dump(obj, sink)
Exemple #59
0
    def train_different_model(self):
        """Train several scikit-learn classifiers on self.training_set and
        print each one's accuracy (scaled to percent) on self.testing_set.

        BUGFIX: the original mixed a Python-2 ``print`` statement (for the
        MultinomialNB result, which was also reported without the *100
        scaling) with Python-3 ``print()`` calls — a SyntaxError on Python 3
        and inconsistent output on Python 2. All models are now evaluated
        uniformly via one data-driven loop.
        """
        training_set = self.training_set
        testing_set = self.testing_set

        # Each estimator is wrapped in NLTK's SklearnClassifier so it can be
        # trained directly on NLTK-style (featureset, label) pairs.
        estimators = [
            ("MultinomialNB", MultinomialNB()),
            ("BernoulliNB_classifier", BernoulliNB()),
            ("LogisticRegression_classifier", LogisticRegression()),
            ("SGDClassifier_classifier", SGDClassifier()),
            ("LinearSVC_classifier", LinearSVC()),
            ("NuSVC_classifier", NuSVC()),
        ]
        for name, estimator in estimators:
            wrapper = SklearnClassifier(estimator)
            wrapper.train(training_set)
            print(name + " accuracy percent:",
                  nltk.classify.accuracy(wrapper, testing_set) * 100)