Example #1
 def test_bernoullinb_returns_correct_result(self):
     train_data = [({
         "a": 4,
         "b": 1,
         "c": 0
     }, "ham"), ({
         "a": 5,
         "b": 2,
         "c": 1
     }, "ham"), ({
         "a": 0,
         "b": 3,
         "c": 4
     }, "spam"), ({
         "a": 5,
         "b": 1,
         "c": 1
     }, "ham"), ({
         "a": 1,
         "b": 4,
         "c": 3
     }, "spam")]
     classif = SklearnClassifier(BernoulliNB()).train(train_data)
     test_data = [{"a": 3, "b": 2, "c": 1}, {"a": 0, "b": 3, "c": 7}]
     ccm = classif.classify_many(test_data)
     self.assertEqual(ccm, ['ham', 'spam'])
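This test (and the SVC variant in the next example) relies on imports from the surrounding NLTK test module; a minimal standalone sketch of the same check, using nothing beyond what the test itself exercises:

from nltk.classify import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB

train_data = [({"a": 4, "b": 1, "c": 0}, "ham"),
              ({"a": 5, "b": 2, "c": 1}, "ham"),
              ({"a": 0, "b": 3, "c": 4}, "spam"),
              ({"a": 5, "b": 1, "c": 1}, "ham"),
              ({"a": 1, "b": 4, "c": 3}, "spam")]
classif = SklearnClassifier(BernoulliNB()).train(train_data)
print(classif.classify_many([{"a": 3, "b": 2, "c": 1},
                             {"a": 0, "b": 3, "c": 7}]))
# expected: ['ham', 'spam']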
Example #2
 def test_svc_returns_correct_result(self):
     train_data = [({
         "a": 4,
         "b": 1,
         "c": 0
     }, "ham"), ({
         "a": 5,
         "b": 2,
         "c": 1
     }, "ham"), ({
         "a": 0,
         "b": 3,
         "c": 4
     }, "spam"), ({
         "a": 5,
         "b": 1,
         "c": 1
     }, "ham"), ({
         "a": 1,
         "b": 4,
         "c": 3
     }, "spam")]
     classif = SklearnClassifier(SVC(), sparse=False).train(train_data)
     test_data = [{"a": 3, "b": 2, "c": 1}, {"a": 0, "b": 3, "c": 7}]
     ccm = classif.classify_many(test_data)
     self.assertEqual(ccm, ['ham', 'spam'])
Example #3
# imports this example relies on (not shown in the original excerpt)
from collections import OrderedDict

import nltk
from gensim.models import word2vec
from nltk.classify import SklearnClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB


def read(filename):
    fp = open(filename, "r")
    f = fp.readlines()
    vocab = [s.split() for s in f]  # the original's s.encode('utf-8') would yield bytes under Python 3
    # print(vocab)
    voc_vec = word2vec.Word2Vec(vocab, min_count=1, size=4)
    # print(voc_vec.syn0.shape)
    # print(type(voc_vec['yav']))
    # Opening the test data file
    fp.close()
    fp = open("test_data.txt", "r")
    f = fp.read()
    tokens = nltk.word_tokenize(f)
    D = OrderedDict()
    sentences = []
    # print(len(tokens))
    for word in tokens[0:200]:
        D[word.split("|")[0]] = word.split("|")[1]
        sentences.append(word.split("|")[0])
    # print(D)

    train_data = []

    for key in D:
        l = voc_vec[key]
        x = {}
        x['a'] = l[0]
        x['b'] = l[1]
        x['c'] = l[2]
        x['d'] = l[3]
        train_data.append((x, D[key]))
    classif = SklearnClassifier(BernoulliNB()).train(train_data)
    # print(train_data)

    test_data = []
    D2 = OrderedDict()
    for word in tokens[200:300]:
        D2[word.split("|")[0]] = word.split("|")[1]
    expected_list = []
    for key in D2:
        l = voc_vec[key]
        x = {}
        x['a'] = l[0]
        x['b'] = l[1]
        x['c'] = l[2]
        x['d'] = l[3]
        test_data.append(x)
        expected_list.append(D2[key])
    predicted = classif.classify_many(test_data)
    print(len(predicted))
    print(len(expected_list))
    print(accuracy_score(expected_list, predicted, normalize=False))
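A caveat for reusing this example today: it targets the pre-4.0 gensim API. A short sketch of the same calls under gensim >= 4.0, where the constructor argument and vector lookup changed:

# gensim >= 4.0: `size` was renamed `vector_size`, and per-word vectors
# moved behind the `wv` attribute
from gensim.models import Word2Vec

voc_vec = Word2Vec(vocab, min_count=1, vector_size=4)
l = voc_vec.wv[key]  # instead of voc_vec[key]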
Example #4
do_evaluation(pairs)
do_evaluation(pairs, pos_cls='neg')

#%% Other classifier : SVM ###################################################

# http://www.nltk.org/howto/classify.html
# Run example

from nltk.classify import SklearnClassifier
from sklearn.svm import SVC

t0 = time.time()
classif = SklearnClassifier(SVC(), sparse=False).train(train_set)
print(round(time.time()-t0,2))

classif.classify_many([test_set[0][0]])  # classify_many expects a list of featuresets

sizeTrain = [800]  # the first 100, the first 300 ,etc
testDoc = [800, 1000] # 800 to 999

classif.classify_many([test_set[0][0]])

#%% SVM Class ################################################################

from nltk.classify import SklearnClassifier
from sklearn.svm import SVC
    
class SVM:
    """ SVM
    
    data = dict (key = pos or neg), value = list of filenames
Example #5
ngram_truncate(ngram_records, 1000)
print('done truncation')
train_max = 55000
train_data4 = [(x, y)
               for x, y in zip(ngram_records[:train_max], labels[:train_max])]

# for C in [1,0.8,0.6,0.4,0.2]:
# for C in [0.1, 0.08, 0.06, 0.04, 0.02]:
for C in [1, 0.8, 0.6, 0.4, 0.2, 0.1, 0.08, 0.06, 0.04, 0.02]:
    # for C in [0.2, 0.1, 0.08, 0.06, 0.04, 0.02]:
    print('C=', C)
    # note: newer scikit-learn only accepts penalty='l1' with
    # solver='liblinear' or solver='saga'
    classifier4 = SklearnClassifier(LogisticRegression(C=C, penalty='l1'),
                                    sparse=False).train(train_data4)

    val_labels4 = classifier4.classify_many(ngram_records[train_max:])
    aa = [x == y for x, y in zip(val_labels4, labels[train_max:])]
    print(np.mean(aa))
    accu.append(np.mean(aa))

    train_labels4 = classifier4.classify_many(ngram_records[:train_max])
    aatr = [x == y for x, y in zip(train_labels4, labels[:train_max])]
    print(np.mean(aatr))
    classifiers.append(classifier4)

    # with open('classifiers_temp_b0p03_v5_n6_l1_sweep_th1000_scale.pkl','wb') as f:
    #     pickle.dump(classifiers,f)

    with open('DEBU_classifiers_temp_b0p1_v5_n6_l1_VFB_sweep_th1000_scale.pkl',
              'wb') as f:
        pickle.dump(classifiers, f)
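The lists accu and classifiers are populated in the sweep above but defined outside the excerpt, as are ngram_records and labels. A minimal, assumed preamble that would make the sweep self-contained (these names come from non-excerpted code, not from the author):

# Assumed preamble for the C sweep above:
import pickle
import numpy as np
from nltk.classify import SklearnClassifier
from sklearn.linear_model import LogisticRegression

accu = []         # validation accuracy for each C
classifiers = []  # trained classifier for each C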
Example #6
                    isNER = False
                    w = lancaster_stemmer.stem(word[0])
                    features['{}'.format(w.lower()), '{}'.format(word[1]),
                             '{}'.format(isNER)] = 'inference'
    return features


trainFeaturesets0 = [(get_features_basic(post), post.get('value'))
                     for post in trainRoot]
testFeaturesets0 = [(get_features_basic(post), post.get('value'))
                    for post in testRoot]

classifier0 = SklearnClassifier(BernoulliNB()).train(trainFeaturesets0)

actual = [t[1] for t in testFeaturesets0]
prediction = classifier0.classify_many([fs for (fs, l) in testFeaturesets0])

result = zip(actual, prediction)
truePositive = 0
falseNegative = 0
falsePositive = 0
trueNegative = 0

for a in result:
    if a[0] == 'TRUE':
        if a[1] == 'TRUE':
            truePositive = truePositive + 1
        else:
            falseNegative = falseNegative + 1
    else:
        if a[1] == 'TRUE':
            falsePositive = falsePositive + 1
        else:
            trueNegative = trueNegative + 1
Example #7
    #region SVMClassifier
    WriteLog("\nEntering SVM", ClassificationLogFile)
    trainD = list()
    testD = list()
    gTruth = list()

    #Formatting the Data
    for dictPair in training_set:
        trainD.append(dictPair)
    for dictPair in testing_set:
        testD.append(dictPair[0])
        gTruth.append(dictPair[1])

    WriteLog("Starting SVM Training", ClassificationLogFile)
    SVMClassifier = SklearnClassifier(SVC(), sparse=False).train(trainD)
    SVMPredictions = SVMClassifier.classify_many(testD)

    WriteLog("SVM Training Set Accuracy:", ClassificationLogFile)
    WriteLog(str(accuracy_score(gTruth, SVMPredictions, normalize=True, sample_weight=None)), ClassificationLogFile)

    #SVM Classification
    WriteLog("SVM Classification", ClassificationLogFile)
    DoClassify(SVMClassifier, SVMtopicResultsTxt, topicTweetsLDATxt)

    #SVM Predictions
    WriteLog("SVM Predictions:", ClassificationLogFile)
    WriteLog(str(SVMPredictions), ClassificationLogFile)
    #endregion

    #region NaiveBayes
    WriteLog("\nNaive Bayes Training", ClassificationLogFile)
Example #8
 def LinSVC():
     classifierLinSVC = SklearnClassifier(LinearSVC(),
                                          sparse=False).train(train_set)
     return classifierLinSVC.classify_many(test)
Example #9
rating_names = [student['name'] for student in ratings]
data_names = list(set([student['Name'] for student in data]))
#cleans text for classifying
for i, student in enumerate(data):
    text = tech.cleanse(student['Student Comment'])
    data[i]['Student Comment'] = text

#split into testing and training sets
n = len(data)
test_idx = random.sample(range(n), int(n * 0.5))
train_idx = set(range(n)) - set(test_idx)

test_set = [item for item in map(extract_featurelabel, [data[i] for i in test_idx]) if item[1]]
train_set = [item for item in map(extract_featurelabel, [data[i] for i in train_idx]) if item[1]]

#classifier = NaiveBayesClassifier.train(train_set)
classif.train(train_set)  # was test_set; fitting on the test split looks like a bug
#Compute accuracy
test_data,test_label = zip(*test_set)
train_data,train_label = zip(*train_set)

predictions = classif.classify_many(test_data)

print(confusion_matrix(test_label, predictions))
print(matthews_corrcoef(test_label, predictions))
'''
#Only work if using built-in NLTK classifier
print ('Accuracy: {0:.2f}%'.format(100 * nltk.classify.accuracy(classif, test_set)))
classif.show_most_informative_features(20)
'''
Example #10
 def NBtfidf():
     classifierTF = SklearnClassifier(pipeline).train(train_set)
     return classifierTF.classify_many(test)
Example #11
 def LinSVC():
     classifierLinSVC = SklearnClassifier(LinearSVC(), sparse=False).train(train_set)
     return classifierLinSVC.classify_many(test)
    """
	Linear (Bernoulli) SVC
	Implementation of Support Vector Machine classifier using libsvm: 
	the kernel can be non-linear but its SMO algorithm does not scale to
	 large number of samples as LinearSVC does.
	"""

    from nltk.classify import SklearnClassifier
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.svm import SVC

    print " "
    print "============================="
    print "Bernoulli SVC Classifier:"
    classifierBi = SklearnClassifier(BernoulliNB()).train(train_set)
    classifierBi.classify_many(test)

    for pdist in classifierBi.prob_classify_many(test):
        print(pdist.prob("human"), pdist.prob("auto"))

    predictions = classifierBi.classify_many(test)
    for label in predictions:
        print(label)

    classifierSVC = SklearnClassifier(SVC(), sparse=True).train(train_set)
    classifierSVC.classify_many(test)

    # svc = nltk.classify.accuracy(classifierSVC, test_set)
    # print 'accuracy is %.2f' %round(svc*100,4), '%'
    # renamed from `SVC`, which shadowed sklearn.svm.SVC; the original also
    # trained a BernoulliNB it never used
    def run_svc():
        classifierSVC = SklearnClassifier(SVC(), sparse=True).train(train_set)
        return classifierSVC.classify_many(test)
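The docstring above contrasts SVC (libsvm's SMO solver) with LinearSVC (liblinear). A short sketch of the two interchangeable setups for a linear kernel, assuming the same train_set of (feature-dict, label) pairs:

from nltk.classify import SklearnClassifier
from sklearn.svm import SVC, LinearSVC

# libsvm-based: supports non-linear kernels, but training cost grows
# super-linearly with the number of samples
svc_clf = SklearnClassifier(SVC(kernel='linear')).train(train_set)

# liblinear-based: linear kernel only, scales to much larger sample counts
linsvc_clf = SklearnClassifier(LinearSVC()).train(train_set)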
Example #13
class MyClassifier:
    def __init__(self, load_clf=False, load_tr_data=False):
        self.features = self.__load_support_vector_features()
        self.training_data = []
        self.n_samples = 0
        self.all_tweets = self.__load_tweets_from_file()  # list not dict

        # Classifier loading
        if load_clf:
            self.load_clf()
        else:
            self.clf = SklearnClassifier(SVC(), sparse=False)

        # Training Data loading
        if load_tr_data:
            self.__load_training_data()

    def __load_tweets_from_file(self):
        # open the latest raw-tweets file; use a context manager so the
        # handle is closed (the original left it open)
        list_of_files = glob.glob(
            "datasets_twitter/twitter_training_data_raw*.txt")
        latest_file = max(list_of_files, key=os.path.getctime)

        tweet_list = []
        with open(latest_file, "r", encoding="UTF-8") as f:
            for line in f:
                parts = line.split("%\t%")
                tweet_text, tweet_id = parts[0], parts[1]
                tweet_list.append((tweet_text, tweet_id))

        return tweet_list

    def __load_support_vector_features(self):
        feature_f = open("verifiability_features.txt", "r")

        # get all features
        support_vector_features = []
        for line_f in feature_f:
            support_vector_features.append(line_f.replace("\n", ""))

        feature_f.close()

        return support_vector_features

    def __get_sample(self, text_str):
        """
        Changes the text_str into a sample of data in the form of [0, 0, 0, ...]
        This is to be used by the classifier, when
            1) Assembling Training Data, and
            2) Testing data.
        It returns a list of int, which is basically a count of how many of each feature existed in text_str.

        :param text_str: a string of text which is to be verified
        :return: curr_sample, a list of int, sort of mapped to self.features
        """
        tokens = pos_tag(word_tokenize(text_str))

        curr_sample = [0] * len(
            self.features)  # list of n_features of 0s ex. [0, 0, 0, ..]

        for token in tokens:  # each token is a (word, POS-tag) pair
            t_text, t_feature = token[0], token[1]
            try:
                for index in range(len(self.features)):
                    if t_feature == self.features[index]:
                        # when found, increment/decrement sample vector's  value

                        if self.features[index] == self.features[-1]:
                            # checking if there is a "?" in the text
                            if token[0] == "?":
                                # decrement
                                curr_sample[index] -= 1
                                break
                        else:
                            curr_sample[index] += 1
                            break

            except IndexError:
                # if the feature isn't in the sv_features list
                pass

        return curr_sample

    def __get_training_target(self, sample):
        """
        Returns the label depending on the sample given.

        :param sample: int[] from self.__get_sample()
        :return: "VER" or "NVER", representing the two labels Verifiable and Non-Verifiable
        """
        # check sample if VER or NVER
        t_sum = 0
        for v in sample:
            if v < 0:
                # if there is a "?" in the sample text
                # (the only reason a negative value would appear in curr_sample)
                t_sum = -1
                break

            t_sum += v

        if t_sum > 0:
            return "VER"
        else:
            return "NVER"

    def __assemble_training_data(self):
        """
        Construct the training data using the twitter training data set.

        To be used directly prior to training the Classifier

        :return:
        """

        for tweet in self.all_tweets:
            # get the sample and target for each tweet
            tweet_text = tweet[0]
            curr_sample = self.__get_sample(tweet_text)
            curr_target = self.__get_training_target(curr_sample)

            # change the above into training data
            tr_dict = {}
            for i in range(len(self.features)):
                tr_dict[self.features[i]] = curr_sample[i]

            tup = (tr_dict, curr_target)

            # add to self.training_data
            self.training_data.append(tup)

        # repeat

    def __save_training_data(self):
        timestamp = '{:%Y_%m_%d_%H_%M_%S}'.format(datetime.datetime.now())
        f = open(
            "datasets_twitter/twitter_training_dataset" + timestamp + ".json",
            "w+")
        json_data = json.dumps(self.training_data)
        f.write(json_data)
        f.close()

    def __load_training_data(self):
        list_of_files = glob.glob(
            "datasets_twitter/twitter_training_dataset*.json")
        latest_file = max(list_of_files, key=os.path.getctime)
        with open(latest_file, "r") as f:
            s = f.readline()
        js = json.loads(s)
        for i in js:
            tup = (i[0], i[1])  # sample, target
            self.training_data.append(tup)

    def train_with_svc(self):
        # make the training data
        self.__assemble_training_data()

        # Train the classifier
        self.clf.train(self.training_data)

        # save classifier as soon as it is trained
        self.save_clf()

    def predict_single(self, test_text):
        """
        Predict a single sample. Then based on user's input, add the sample to the training data with the correct label.

        :param test_text:
        :return:
        """

        test_sample = self.__get_sample(test_text)
        test_dict = {}
        for index in range(len(self.features)):
            test_dict[self.features[index]] = test_sample[index]

        pred = self.clf.classify_many([test_dict])
        return (pred[0], test_sample)

    def predict_multiple(self, test_list):
        """
        Predict more than one sample at a time.

        :param test_list:
        :return:
        """
        # translate test_list into clf passable data format
        test_data = []
        for i in test_list:
            curr_test_sample = self.__get_sample(i)
            test_dict = {}
            for index in range(len(self.features)):
                test_dict[self.features[index]] = curr_test_sample[index]

            test_data.append(test_dict)

        # predict
        pred = self.clf.classify_many(test_data)
        return pred

    def update_pred_into_training(self, test_tweet, pred_val):
        """
        Adds predicted ( {feat:sample}, target ) to training data
        then saves the training data

        if test_text already exists in the training data
            update the target value instead
            then save the training data


        :param test_tweet: a tweet in the form of (tweet_text, tweet_id)
        :param pred_val: the value of the prediction made by the classifier
        :return:
        """
        # a flag to make sure only one part of the code is run
        updated = False

        # localise
        test_tweet_text = test_tweet[0]

        # if text exists in training data already, update the target for this tweet
        for i in range(len(self.all_tweets)):
            tweet = self.all_tweets[i]

            if test_tweet_text == tweet[0]:  # if found
                test_sample = self.__get_sample(test_tweet_text)

                # make into trainable data format
                test_dict = {}
                for j in range(len(self.features)):
                    test_dict[self.features[j]] = test_sample[j]
                test_target = pred_val

                tup = (test_dict, test_target)

                # get the current tup for the test_text and replace
                self.training_data[i] = tup

                # there should only be one tweet with the same text
                updated = True
                break

        # if test_text is not in the training data already
        if not updated:
            # make into trainable data format
            test_sample = self.__get_sample(test_tweet_text)
            test_dict = {}
            for j in range(len(self.features)):
                test_dict[self.features[j]] = test_sample[j]
            test_target = pred_val

            tup = (test_dict, test_target)

            # add tweet to all_tweets and training data
            # get tweet_id
            self.all_tweets.append(test_tweet)
            self.training_data.append(tup)

            # consistency
            updated = True

        # save the training data to file
        self.__save_training_data()
        # train the classifier again
        self.train_with_svc()

    def load_clf(self):
        """
        Load a previously trained and saved classifier.
        :return:
        """
        self.clf = joblib.load("twitterClassifier.pkl")

    def save_clf(self):
        """
        Save the current classifier to file
        :return:
        """
        joblib.dump(self.clf, "twitterClassifier.pkl")
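A plausible end-to-end use of the class above (a sketch; it assumes the feature and tweet files the constructor reads actually exist):

# Hypothetical driver for MyClassifier, using only methods defined above.
clf = MyClassifier()          # loads features and the latest raw tweets
clf.train_with_svc()          # assembles training data, trains, saves the model
labels = clf.predict_multiple(["You can check this on the record.",
                               "Best pizza ever?"])
print(labels)                 # e.g. ['VER', 'NVER']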
Example #14
 def NBtfidf():
     classifierTF = SklearnClassifier(pipeline).train(train_set)
     return classifierTF.classify_many(test)
Example #15
## 0-Suffix, 1-Previous number, 2-Next number, 3-Previous wordform, 4-Next wordform, 5-Postposition, 6-Present wordform, 7-POS ##

from preprocess_train import features, number
from preprocess_test import features_test, number_test
from nltk.classify import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
train_data=[[x for x in range(2)] for y in range(357)]
test_data=[[x for x in range(1)] for y in range(11)]
for i in range(0,357):
    train_data[i][0] = {'Suffix': features[i][0], 'Previous morph': features[i][1],
                        'Next morph': features[i][2], 'Previous wordform': features[i][3],
                        'Next wordform': features[i][4], 'postposition': features[i][5],
                        'wordform': features[i][6], 'pos': features[i][7]}
    train_data[i][1] = number[i]
for i in range(0,11):
    # the original read 'wordform' and 'pos' from `features` (the training
    # array); `features_test` is almost certainly what was intended
    test_data[i] = {'Suffix': features_test[i][0], 'Previous morph': features_test[i][1],
                    'Next morph': features_test[i][2], 'Previous wordform': features_test[i][3],
                    'Next wordform': features_test[i][4], 'postposition': features_test[i][5],
                    'wordform': features_test[i][6], 'pos': features_test[i][7]}
classif = SklearnClassifier(SVC(), sparse=False).train(train_data)
result=classif.classify_many(test_data)
classif1 = SklearnClassifier(BernoulliNB()).train(train_data)
result1=classif1.classify_many(test_data)
print(result1)
Example #16
            label_train.append(X_label[item])
        print("test")
        for item in test_indices:
            list_test_data.append(X[item])
            label_test.append(X_label[item])
        break

    print(list_train_data)
    print(label_train)
    print(list_test_data)
    print(label_test)

    # build the feature-vector representation of the input
    train_data = buildVectorTrainData(list_feature, list_train_data,
                                      label_train)
    # example train_data = [({"stupid": 0, "lovely": 1, "dog": 2,"cat":0}, "positive_dog"),
    #               ({"stupid": 1, "lovely": 0, "dog": 0, "cat": 2}, "negative_cat"),
    #               ({"stupid": 0, "lovely": 0, "dog": 0, "cat": 0}, "normal")]
    test_data = buildVectorTestData(list_feature, list_test_data)

    # models and measure
    classif = SklearnClassifier(BernoulliNB()).train(train_data)
    # classif = SklearnClassifier(SVC(C=1.0, kernel='rbf', degree=3), sparse=False).train(train_data)
    # measure accuracy
    y_pred = classif.classify_many(test_data)
    y_true = label_test
    print(accuracy_score(y_true, y_pred))
    # y_true = [0, 1, -1, -1, 0]
    # y_pred = [0, 0, -1, 1, 0]
    # target_names = ['class 0', 'class 1', 'class 2']
    # print(classification_report(y_true, y_pred, target_names=target_names))
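buildVectorTrainData and buildVectorTestData are not part of the excerpt; judging from the commented sample train_data above, they map each document to per-feature counts. A hypothetical sketch consistent with that format (names and behavior assumed, not the author's code):

def buildVectorTrainData(list_feature, docs, labels):
    # pair a {feature: count} dict with its label, as in the commented example
    data = []
    for doc, label in zip(docs, labels):
        fs = {feat: doc.count(feat) for feat in list_feature}
        data.append((fs, label))
    return data


def buildVectorTestData(list_feature, docs):
    # same featurization, without labels
    return [{feat: doc.count(feat) for feat in list_feature} for doc in docs]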
Example #17
}, "ham"), ({
    "a": 0,
    "b": 3,
    "c": 4
}, "spam"), ({
    "a": 5,
    "b": 1,
    "c": 1
}, "ham"), ({
    "a": 1,
    "b": 4,
    "c": 3
}, "spam")]
classif = SklearnClassifier(BernoulliNB()).train(train_data)
test_data = [{"a": 3, "b": 2, "c": 1}, {"a": 0, "b": 3, "c": 7}]
classif.classify_many(test_data)
classif = SklearnClassifier(SVC(), sparse=False).train(train_data)
classif.classify_many(test_data)


def print_maxent_test_header():
    print(' ' * 11 +
          ''.join(['      test[%s]  ' % i for i in range(len(test))]))
    print(' ' * 11 + '     p(x)  p(y)' * len(test))
    print('-' * (11 + 15 * len(test)))


def test_maxent(algorithm):
    print('%11s' % algorithm)
    try:
        classifier = nltk.classify.MaxentClassifier.train(train,
Example #18
    """
	Linear (Bernoulli) SVC
	Implementation of Support Vector Machine classifier using libsvm: 
	the kernel can be non-linear but its SMO algorithm does not scale to
	 large number of samples as LinearSVC does.
	"""

    from nltk.classify import SklearnClassifier
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.svm import SVC

    print(' ')
    print('=============================')
    print('Bernoulli SVC Classifier:')
    classifierBi = SklearnClassifier(BernoulliNB()).train(train_set)
    classifierBi.classify_many(test)

    for pdist in classifierBi.prob_classify_many(test):
        print(pdist.prob('human'), pdist.prob('auto'))

    predictions = classifierBi.classify_many(test)
    for label in predictions:
        print(label)

    classifierSVC = SklearnClassifier(SVC(), sparse=True).train(train_set)
    classifierSVC.classify_many(test)

    # svc = nltk.classify.accuracy(classifierSVC, test_set)
    # print 'accuracy is %.2f' %round(svc*100,4), '%'
    # renamed from `SVC`, which shadowed sklearn.svm.SVC; the original also
    # trained a BernoulliNB it never used
    def run_svc():
        classifierSVC = SklearnClassifier(SVC(), sparse=True).train(train_set)
        return classifierSVC.classify_many(test)