Example no. 1
class ExpenseClassifier:

    def __init__(self):
        training_data = self._load_data("data")
        self.category_classifier = NaiveBayesClassifier([(x[0], x[1]) for x in training_data])
        self.avoidability_classifier = NaiveBayesClassifier([(x[0], x[2]) for x in training_data])
        self.ordinary_classifier = NaiveBayesClassifier([(x[0], x[3]) for x in training_data])

    def classify(self, description):
        res = {}
        res['category'] = self.category_classifier.classify(description)
        res['avoidable'] = self.avoidability_classifier.classify(description)
        res['ordinary'] = self.ordinary_classifier.classify(description)
        return res

    def accuracy(self):
        test_data = self._load_data("test")
        res = {}
        res['category'] = self.category_classifier.accuracy([(x[0], x[1]) for x in test_data])
        res['avoidable'] = self.avoidability_classifier.accuracy([(x[0], x[2]) for x in test_data])
        res['ordinary'] = self.ordinary_classifier.accuracy([(x[0], x[3]) for x in test_data])
        return res

    def _load_data(self, folder):
        data = []
        for f in glob.glob(folder + "/*.csv"):
            with open(f) as csvfile:
                spamreader = csv.reader(csvfile, delimiter=',')
                for row in spamreader:
                    if row[DESCRIPTION] and row[CATEGORY] and row[AVOIDABLE] and row[ORDINARY]:
                        data.append((norm(row[DESCRIPTION]), row[CATEGORY], row[AVOIDABLE], row[ORDINARY]))
        return data
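
The class above assumes module-level names that are not shown: the imports, the CSV column-index constants, and a norm() text normalizer. A minimal sketch of those assumptions (names and indices are illustrative, not the original author's values):

import csv
import glob

from textblob.classifiers import NaiveBayesClassifier

# Hypothetical column layout of the CSV files; the original module defines these elsewhere.
DESCRIPTION, CATEGORY, AVOIDABLE, ORDINARY = 0, 1, 2, 3

def norm(text):
    # Hypothetical normalizer: lowercase and collapse whitespace.
    return " ".join(text.lower().split())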
Example no. 2
def train_n_test(file_path):
    documents = load_data(file_path)
    random.shuffle(documents)
    generate_bigrams(data.wordlist)  # `data` is assumed to be defined elsewhere in the source module
    train = documents[0:110]
    test = documents[110:]
    #classifier = NaiveBayesClassifier(train)
    #classifier = NaiveBayesClassifier(train, feature_extractor=get_features)
    classifier = NaiveBayesClassifier(train, feature_extractor=get_feats)
    print(classifier.accuracy(test))
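
The call above passes feature_extractor=get_feats, which is defined elsewhere. A TextBlob feature extractor is any callable that maps a document to a dict of features; a minimal sketch of what get_feats might look like (the real one may differ):

def get_feats(document):
    # Hypothetical bag-of-words extractor: mark which words occur in the document.
    tokens = document.split() if isinstance(document, str) else document
    return {"contains({0})".format(w.lower()): True for w in tokens}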
    def generate_model(self):
        print("Gathering and processing tweets...")
        # Shuffle list of username-label tuples
        tuple_list = usermapping.data_tuples.items()

        # Split and grab tweets for users
        results = utils.flatten([ self.fetch_data(t)
                                  for t in tuple_list ])
         
        # TODO: Cross-validation generation
        trn_ratio = int(len(results) * 0.85)
        shuffle(results)
        print(len(results))
        print(trn_ratio)
        train = results[:trn_ratio]
        test = results[trn_ratio:]

        # Instantiate and train classifier
        print("Training...")
        cl = NaiveBayesClassifier(train)
        cl.train()
        
        # Save model
        print("Saving model...")
        utils.save_model(cl)

        # Classify test
        print("Testing...")
        print("Accuracy: {0}".format(cl.accuracy(test)))
        return cl
def create_sentiment():
    """
        Train sentiment model and save.

        Input type: None 
        Output: Model as pickle 
    """

    random.seed(1)

    test = [
        ("The dude presenting Unravel seems like one of the most genuine game developers Ive ever seen I really hope this game works out for him",'pos'),
        ("His hands are shaking Dude looks so stoked and scared at the same time",'pos'),
        ("Right I just felt like I was watching his dream come true It was nice The game looks very well done as well Good for him",'pos'),
        ("Seriously Unravel looks really good actually and honestly seeing him so happy about what hes made is contagious I want to see more of Unravel ",'pos'),
        ("He was so nervous shaking all over his voice quivering",'neg'),
        ("The game looked nice too very cute art style ",'pos'),
        ("You could tell he genuinely wanted to be there it looked like he was even shaking from the excitement  I hope it works out for them aswell",'pos'),
        ("However following that up with the weird PvZ thing was odd To say the least",'neg'),
        ("Haha The game did look nice though Im definitely going to keep an eye on it I enjoy supporting such hopeful developers",'pos'),
        ("Very personable This looks like a buy for me As a dev in a other sector I appreciate this passion",'pos'),
        ("I want to give him a cookie",'pos'),
        ("Im getting a copy Im gonna support my indie devs",'pos'),
        ("The twitch leak was accurate It was like a play by play you start speaking French then switch to English",'neg'),
        ("yep exactly what i was thinking lol its important to note that the twitch leak never had them saying it was Dishonored 2 but that they were honored to be here very different",'neg'),
        ("Honored  Im 100 sure that was intentional",'neg'),
        ("oh yea for sure but wasnt solid enough evidence imo to be like dishonored 2 confirmed just based off that",'neg'),
        ("The confirmation was who was talking not what they were talking about ",'neg'),
        ("How awkward is it for a pop singer to perform at a video game conference",'neg'),
        ("Oh god did they warn him that he will get zero reaction",'neg'),
        ("I really hope so",'pos'),
        ("Almost as bad as Aisha f*****g up her dialogue constantly Shes doing alright though E3 is really becoming a mainstream media event Hollywood has nothing like this ComicCon is the only comparison and they dont dazzle it up like E3",'neg')
        ]


    # Grab review data
    reviews = [
        (list(movie_reviews.words(fileid)), category)
        for category in movie_reviews.categories()
        for fileid in movie_reviews.fileids(category)
        ]
    random.shuffle(reviews)

    # Hold out the last 100 of the 2,000 reviews as a test split
    new_train, new_test = reviews[:1900], reviews[1900:]

    # Train the NB classifier on the train split
    cl = NaiveBayesClassifier(new_train)

    # Compute accuracy
    accuracy = cl.accuracy(test + new_test)
    print("Accuracy: {0}".format(accuracy))

    # Show 5 most informative features
    cl.show_informative_features(5)

    # Save model for use in creating social model sentiment
    with open('sentiment_clf_full.pkl', 'wb') as pk:
        pickle.dump(cl, pk)
    print('done saving model')
Example no. 5
def main():
    data = []
    with open('hellopeter_labelled.csv', newline='') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',')
        for row in spamreader:
            if row[13] == 'strongly positive':
                data.append((row[8], 'pos'))
            elif row[13] == 'positive':
                data.append((row[8], 'pos'))
            elif row[13] == 'neutral':
                data.append((row[8], 'neu'))
            elif row[13] == 'negative':
                data.append((row[8], 'neg'))
            elif row[13] == 'strongly negative':
                data.append((row[8], 'neg'))

    train = data[:1000]
    test = data[1000:]  # was data[1001:], which silently skipped item 1000

    for innf in test:
        print(innf)

    cl = NaiveBayesClassifier(train)

    for tnew in test:
        print('%%%%%%%')
        print(' ')
        print(tnew[0])
        print(tnew[1])
        print('%%%%%%%')
        print('#######')
        prob_class = cl.prob_classify(tnew[0])
        print('----max prob---')
        print(prob_class.max())
        print('-----+ve-----')
        print(prob_class.prob("pos"))
        print('-----neutral-----')
        print(prob_class.prob("neu"))
        print('------ve-----')
        print(prob_class.prob("neg"))

    print(cl.accuracy(test))
Example no. 6
class LanguageDetector(object):
    def __init__(self, train=SAMPLE_TRAIN, feature_extractor=FeatureExtractors.last_word_extractor()):
        self.train = train
        self.classifier = NaiveBayesClassifier(self.train, feature_extractor)
    
    def accuracy(self, test_set=SAMPLE_TEST):
        return self.classifier.accuracy(test_set)

    def show_features(self):
        return self.classifier.show_informative_features(5)
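
FeatureExtractors.last_word_extractor() is defined elsewhere in this project and apparently returns a feature-extractor callable. A plausible sketch, modeled on the single-word extractor in Example no. 13 below (the actual implementation may differ):

class FeatureExtractors:
    @staticmethod
    def last_word_extractor():
        def extractor(text):
            # Hypothetical: use only the last word of the text as a feature.
            last_word = text.split()[-1]
            return {"last_word({0})".format(last_word.lower()): True}
        return extractor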
Example no. 7
def run_test(train, test, name):
    print("Training...")
    cll = NaiveBayesClassifier(train)
    print("Done training\n")
    accuracy = cll.accuracy(test)
    print("Accuracy: " + str(accuracy))

    # get matching lists of predicted and true labels
    pred_labels = []
    true_labels = []
    for obj in test:
        prob_label = cll.prob_classify(obj[0]).max()
        true_labels.append(obj[1])
        pred_labels.append(prob_label)

    # map each label to a number
    labels = cll.labels()
    label_num = {label: i for i, label in enumerate(labels)}

    # match our predicted and true labels with the number representations
    true_label_nums = [label_num[l] for l in true_labels]
    pred_label_nums = [label_num[l] for l in pred_labels]

    cm = confusion_matrix(true_label_nums, pred_label_nums)
    print(cm)
    print("\n")

    with open("test_results.txt", "a") as tr:
        tr.write(str(name) + "\n")
        tr.write(str(accuracy) + "\n")
        tr.write(str(cm))
        tr.write("\n\n")

    import matplotlib.pyplot as plt
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(cm)
    plt.title("Confusion Matrix For " + name)
    fig.colorbar(cax)
    ax.set_xticklabels([''] + labels)
    ax.set_yticklabels([''] + labels)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.savefig('plots/' + name + '.pdf', bbox_inches='tight')
def main():
    print("This is Naive Bayes' Classifier...")

    #read training data
    #training_data = open("training_data").readlines()
    training_data = open("training_data_final").readlines()
    #load training data
    training_tuples = loadData(training_data)

    training_tuples_api = make_api_tuples(training_tuples)
    print(training_tuples_api)

    #display tuples
    #for t in training_tuples:
    #    t.show()

    #gather classes
    classes = filterClasses(training_tuples)
    #print("classes = ", classes)

    #gather vocab
    vocab = getVocab(training_tuples)
    #print(vocab)

    #generate prior
    prior = generatePrior(training_tuples, classes)
    #print(prior)

    #generate likelihood
    likelihood = generateLikelihood(training_tuples, vocab, classes)
    #print(likelihood)

    #read test data
    #test_data = open("test_data").readlines()
    test_data = open("test_data_final").readlines()
    #load test data
    test_tuples = loadData(test_data)

    test_tuples_api = make_api_tuples(test_tuples)
    #calculate C-MAP
    posterior = predict(test_tuples, classes, prior, likelihood)
    showResults(training_data, test_data, posterior)

    #calculate accuracy
    evaluateAccuracy(test_tuples, posterior)

    #Naive Bayes API
    cl = NaiveBayesClassifier(training_tuples_api)
    # Compute accuracy
    print("Accuracy: {0}".format(cl.accuracy(test_tuples_api)))
Example no. 9
def train(pos_examples, neg_examples, train_fraction=0.6):
    """Train a classifier, holding out train_fraction of pos_examples and neg_examples as a test set.
    Return the tuple:
        
        (the classifier, accuracy, positive test example list, negative test example list, )

    """

    pos_split = int(train_fraction * len(pos_examples))
    pos_train, pos_test = pos_examples[0:pos_split], pos_examples[pos_split:]
    neg_split = int(train_fraction * len(neg_examples))
    neg_train, neg_test = neg_examples[0:neg_split], neg_examples[neg_split:]

    cl = NaiveBayesClassifier(pos_train + neg_train)
    return cl, cl.accuracy(pos_test + neg_test), pos_test, neg_test
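
A quick usage sketch (pos_examples and neg_examples are lists of (text, label) tuples; the names here are illustrative):

cl, acc, pos_test, neg_test = train(pos_examples, neg_examples, train_fraction=0.6)
print("held-out accuracy:", acc)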
def create_sentiment_model():

    random.seed(1)

    # Grab some movie review data
    reviews = [(list(movie_reviews.words(fileid)), category)
                  for category in movie_reviews.categories()
                  for fileid in movie_reviews.fileids(category)]
    random.shuffle(reviews)
    new_train, new_test = reviews[:1900], reviews[1900:]

    cl = NaiveBayesClassifier(new_train)

    # Compute accuracy
    accuracy = cl.accuracy(new_test)
    print("Accuracy: {0}".format(accuracy))

    # Show 5 most informative features
    cl.show_informative_features(5)

    with open('sentiment_clf_full.pkl', 'wb') as pk:
        dill.dump(cl, pk)
    print('done saving model')
Example no. 11
test = data_sets.subte_test

#tx_cl = "I feel amazing!"
#tx_prob = "This one's a doozy."
tx_cl = "El subte esta demorado"
tx_prob = "El subte funciona bien"

cl = NaiveBayesClassifier(train)
print(cl.classify(tx_cl))
print(cl.classify("El subte funciona bien"))
prob_dist = cl.prob_classify(tx_prob)
print(prob_dist.max())
print(round(prob_dist.prob("pos"), 2))
print(round(prob_dist.prob("neg"), 2))

print(cl.accuracy(data_sets.en_test))  # note: `test` (data_sets.subte_test) assigned above is never used
cl.show_informative_features(5)

# Using TextBlob
blob = TextBlob("No funca por que hay obras para mejorar la cosa", classifier=cl)
print(blob.sentiment)
print(blob.classify())

blob = TextBlob("El subte funciona normal", classifier=cl)
print(blob.sentiment)
print(blob.classify())

blob = TextBlob("Se realizan obras en el subte A", classifier=cl)
print(blob.sentiment)
print(blob.classify())
Example no. 12
training = [
    ('its scary how coordinated the entire status quo is to make sure brexit happens no matter the cost',
     'neg'),
    ('turns out  has pulled out of two television debates this week after the shocker with mr neil',
     'pos'),
    ('greens  kicks off bbc 7way debate with a zinger about johnson on brexit  his ovenready meal is made',
     'pos'),
    ('we offered proof the tories were trying to sell the nhs  we offered proof they were lowering health  safety standards  we ',
     'neg'),
    ('my greatest fear is that the election of a johnson government and the hard brexit that will follow would lead to corruption o…',
     'neg'),
    ('at last a proper nights sleep the loony corbynistas destroyed brexit secured and a nice cup of tea all is well in the wo…',
     'pos'),
    ('brexit closure  johnson wins commanding victory in uk election ', 'pos')
]

testing = [
    ('the democratic pay fulfills its true function during the primary election not the general once it succeeds in crushing',
     'pos'),
    ('two years ago days before the general election there was a terrorist attack in london bridge borough market now two weeks',
     'neg'),
    ('an annual reminder that both our nonpropoional voting system and split left vote are utter trash garbage  144m for',
     'neg'),
    ('o your kids know everyone thinks youre a degenerate', 'neg'),
    ('talking to residents in bulwell market today where the messages were clear "get brexit done" and "we cant vote for c…',
     'pos')
]

cl = NaiveBayesClassifier(training)

print(cl.accuracy(testing))
print(cl.show_informative_features(5))
Example no. 13
def extractor(word):
    feats = {}
    last_letter = word[-1]
    feats["last_letter({0})".format(last_letter)] = True
    return feats


if __name__ == "__main__":
    # customDicts = {'./texts/wordsEn.txt':'english','./texts/wordsEs.txt':'spanish','./texts/wordsEs2.txt':'spanish'}
    """ customDicts = {'./texts/wordsEn.txt':'english','./texts/wordsEs2.txt':'spanish'} 
	for customDictFilename, customDictLang in customDicts.items(): 
		currentDict = open(customDictFilename,'r') 
		for line in currentDict: 
			wordTrain = (line.replace('\r','').replace('\n',''),customDictLang) 
			train.append(wordTrain) 
		currentDict.close() """
    # print train
    lang_detector = NaiveBayesClassifier(train, feature_extractor=extractor)
    #  lang_detector = NaiveBayesClassifier(train)
    print(lang_detector.accuracy(test))
    lang_detector.show_informative_features(5)
    while True:
        try:
            line = sys.stdin.readline()
            # print(line)
            print(lang_detector.classify(line))
        except KeyboardInterrupt:
            break
        if not line:
            break

def get_list_tuples(read_file):
    list_tuples = []
    with open(read_file, "r") as r:
        c = 0
        for line in r:
            tabsep = line.strip().split("\t")
            msg = TextBlob(tabsep[1])
            try:
                words = msg.words
            except:
                continue
            for word in words:
                if word not in stopwords.words() and not word.isdigit():
                    list_tuples.append((word.lower(), tabsep[0]))
            c += 1  # limiting factor begins
            if c == 500:  # limiting factor ends
                return list_tuples
    return list_tuples

a = time.time()
entire_data = get_list_tuples("SMSSpamCollection.txt")
print("It took " + str(time.time() - a) + " seconds to import data")
print('data imported')
random.seed(1)
random.shuffle(entire_data)
train = entire_data[:250]
test = entire_data[250:500]  # was [251:500], which skipped item 250
print('training data')
a = time.time()
cl = NaiveBayesClassifier(train)
print("It took " + str(time.time() - a) + " seconds to train data")
print('data trained, now checking accuracy:')
accuracy = cl.accuracy(test)
print("accuracy: " + str(accuracy))
print(cl.classify("Hey bud, what's up"))  # ham
print(cl.classify("Get a brand new mobile phone by being an agent of The Mob! Plus loads more goodies! For more info just text MAT to 87021"))  # spam


Example no. 15
# train the Naive Bayes Classifier
print("Training Naive Bayes Classifier...")
print()
sys.stdout.flush()
nbc = NaiveBayesClassifier(traindata)

# show the most informative features used for classification
nbc.show_informative_features(5)
print()
sys.stdout.flush()

# test the Naive Bayes Classifier
print("Testing Naive Bayes Classifier...")
sys.stdout.flush()
acc = nbc.accuracy(testdata)
print("Accuracy:", round(acc, 4))
print()

# print the confusion matrix
print("Printing Confusion Matrix...")
print()
sys.stdout.flush()

conf = []
for row in testdata:
	conf.append((row[1], nbc.classify(row[0])))
print(Counter(conf))  # tally (actual, predicted) pairs; the original discarded this result

print("Total size of test data :\t%d" % len(testdata))
print("Original(>) Predicted (V)")
Example no. 16
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import training
import timeit
from stemstop import stem_words

factory = StemmerFactory()
stemmer = factory.create_stemmer()

cl = NaiveBayesClassifier(training.train)

print("akurasi data training %f\n" %cl.accuracy(training.test))

text = """Jombang - Nurul Huda (47), pria asal Kelurahan Kranggan, Kecamatan Prajurit Kulon, Kota Mojokerto ini diringkus polisi lantaran diduga mencabuli, menyiksa, dan memeras seorang santri di sebuah pondok pesantren di Jombang. Untuk melancarkan aksinya, tersangka menyamar menjadi putra kiai (biasa dipanggil Gus) yang bisa mengajarkan ilmu kebatinan kepada korban.

Kasat Reskrim Polres Jombang AKP Wahyu Hidayat mengatakan korban adalah remaja laki-laki asal Kabupaten Malang yang merupakan santri. Sementara tersangka juga biasa mengaji di pondok pesantren itu.

Perbuatan bejat Nurul bermula pada 2014 lalu. Saat itu, korban berusia sekitar 16 tahun atau masih di bawah umur. Kepada korban, pria yang sehari-hari berjualan pulsa ini mengaku sebagai putra kiai dan biasa dipanggil Gus Nurul.

"Korban diajak tersangka ke musala di dekat pondok. Saat kondisi sepi dan hanya berdua dengan korban, tersangka mencabuli korban," kata Wahyu kepada wartawan, Minggu (20/3/2016).

Lantaran menganggap Nurul sebagai Gus, korban pun tak berani menolak permintaan tersangka. Terlebih lagi, tersangka membujuk korban bahwa perbuatan cabul itu untuk memasukkan ilmu kebatinan ke tubuh korban.

"Tersangka berdalih untuk mengajari korban ilmu tasawuf. Nyatanya itu hanya untuk memuluskan niat tersangka agar bisa mencabuli korban," ungkapnya.

Menurut Wahyu, perbuatan cabul itu dilakukan tersangka kepada korban berulang kali selama 2 tahun terakhir. Bahkan korban diminta membayar uang kepada tersangka setiap kali usai melakukan pencabulan. Nilainya antara Rp 200.000 hingga jutaan rupiah.

"Tersangka juga meminta uang dari korban berulang kali. Total kerugian korban Rp 40 juta," sebutnya.

Tak tahan dengan perbuatan Nurul, lanjut Wahyu, korban pun memutuskan buka mulut ke teman sesama santri. Mendapat dukungan dari teman-temannya, korban memberanikan diri melapor ke Polres Jombang, Kamis (17/3).
Example no. 17
     'Network issue'),
    ('The work outstanding is Needs pushing through across carrier poles',
     'Network issue'),
    ('The work outstanding is There is a hr in if network', 'Network issue'),
    ('The work outstanding is Nlp requires new d side', 'Network issue')
]

test_corpus = [
    ('requires pushing through to DP and pushing to PCP, on D3',
     'Network issue'),
    ('Task Completed Successfully', 'Task Completed Successfully'),
    ('The work outstanding is Prove pair from the pcp', 'Network issue'),
    ('because Hoist were required which could not be obtained on the day',
     'Assistance- Hoist required'),
    ('The work outstanding is D/w to run by 2 hoists near HV cables pr still to divert but nte5c fitted',
     'Assistance- Hoist required')
]

model = NBC(training_corpus)
accuracy = model.accuracy(test_corpus)


def apply_nlp(data):
    print("in apply nlp method")
    print(data)
    return model.classify(data)


def get_accuracy():
    return accuracy
Example no. 19
    #############################################################
    # SAVING THE MODEL CALIBRATED
    #############################################################
    import pickle
    f = open(model1_name, 'wb')
    pickle.dump(model1, f)
    f.close()

finally:
    print("model1 has just been loaded and is ready to be used.")
    print("#################################################")
    print("##################  model1  ######################")

with open('test.json', 'r') as test_file:
    model1_accuracy = model1.accuracy(test_file, format=None)
    print("model1 accuracy = '%s' " % model1_accuracy)


###############################################################################
# CREATING A NEUTRAL CLASS FROM POSITIVE AND NEGATIVE
###############################################################################

print("#################################################")


text3 = "We did not like his results."
#probability_classification_chosen = 'neg'
#probability_positive = '0.11'
#probability_negative = '0.89'
Example no. 20
def final_utterance_appreciation_analysis(final_utterance):
	"""
	Input: A list of final utterances by the user.
	Output: The percentage of the people expressing appreciation at the end of the conversation.

	Algorithm:
	1. Create a training set and a validation set of conversations that are manually classified into "appreciation" and "nonappreciation".
	   The differentiation criterion is the presence of words of gratitude.
	2. Train the Naive Bayes classifier using the training set.
	3. If the accuracy of the classifier on the validation set exceeds 90%, update the classifier with the validation set and
	   apply it to every utterance in the list final_utterance using a for loop.
	4. Use a dictionary data structure during the loop to store the number of people who express gratitude and who do not.
	5. Calculate the percentage of people who express gratitude.

	How the Naive Bayes Classifier from the TextBlob Package Works:

	For the training dataset:
	In order to find the probability for classifying a sentence with a label of "appreciation" or "nonappreciation",
	the algorithm first removes all the meaningless stop words such as "the" and "a" in the sentence.
	Then it calculates the frequency of the remaining tokens and creates a likelihood table that maps the tokens (which are the features)
	to the probability of the token being labelled as "appreciation" or "nonappreciation".

	For a new sentence, it removes all the meaningless stop words and calculates the probability of the sentence being "appreciation"
	or "nonappreciation" based on the 'naive' assumption that all features are independent, given the label:
	|                       P(label) * P(f1|label) * ... * P(fn|label)
	|  P(label|features) = --------------------------------------------
	|                                         P(features)

	"""

	classified_dict = {"appreciation": 0, "non-appreciation": 0}

	train = [('Very well. How about the price for the trip to Essen?', 'nonappreciation'),
	         ("I'd like to book the Cairo package. Thank you!", 'appreciation'),
	         ('oh heck yeah!! economy - I need the money', 'nonappreciation'),
	         ('Then I will take it!', 'nonappreciation'),
	         ('Awesome!!! Thanks!!!', 'appreciation'),
	         ('What??? :disappointed:', 'nonappreciation'),
	         ('Yes do that', 'nonappreciation'),
	         ('Thank you kindly!', 'appreciation'),
	         ('Ok, thank you for your time anyways', 'appreciation'),
	         ('thank you very much for your patience you are an absolute gem','appreciation'),
	         ('Thank you so much!', 'appreciation'),
	         ('Lots of swanky hotels to choose from! Well, based on length of trip, that one to SL sounds like a great deal. I think I wanna go ahead with booking that', 'nonappreciation'),
	         ('Uh huh', 'nonappreciation'),
	         ('Jerusalem to Kingston. I swear if I have to repeat myself again then I will sue', 'nonappreciation'),
	         ('Ok, thanks anyway','appreciation'),
	         ('Looking to go from San Francisco to MArseille. ', 'nonappreciation'),
	         ('Book me for September 18 to 22. Let me know if its more than 2800 because thats all I can afford', 'nonappreciation'),
	         ('duuuude. ah\nwhat about Ciudad Juarez', 'nonappreciation'),
	         ('Well what if I leave the 8th', 'nonappreciation'),
	         ('Ok :+1: we out', 'nonappreciation'),
	         ('Yes!!!!!', 'nonappreciation'),
	         ('ok fine lets do it, business class please', 'nonappreciation'),
	         ('WOE IS ME, FOR I HAVE NOT', 'nonappreciation'),
	         ('ah damn', 'nonappreciation'),
	         ('okay bye', 'nonappreciation'),
	         ('Yikes. Ok Buenos Aires it is\nBook it please\nBusiness class', 'nonappreciation'),
	         ('shit yassss we goin in. Book it for us, please.', 'nonappreciation'),
	         ('well, this is rather disappointing we cannot spend our family vacation near the airport. i wont be booking anything today in this case, goodbye', 'nonappreciation'),
	         ('Thanks! Very excited!', 'appreciation'),
	         ('NOT GOOD', 'nonappreciation'),
	         ("you're a lifesaver", "appreciation"),
	         ('ah. if i could book, i would book this one. well thanks for your time, ill come back next year and save my vacation days for a trip to San Diego.', "appreciation"),
	         ('Great, thanks a lot!', "appreciation"),
	         ("WHAT!?!?! Ugh, kill me now. Okkay fine. I'll look somewhere else.", "nonappreciation"),
	         ("I guess that sound okay, I'll take it", "nonappreciation"),
	         ("Ok, that's fine\nBook it", "nonappreciation"),
	         ('I like the sound of that one. Heart of the city would be better than near a mall.\nLets book business class in Buenos Aires.', "nonappreciation"),
	         ('cool bye', "nonappreciation"),
	         ("let's book :wink:", "nonappreciation"),
	         ('Done, booked! Thanks!', 'appreciation'),
	         ('Okay will consider it and get back to you, thanks!', 'appreciation'),
	         ('DOPE. book it', 'nonappreciation'),
	         ('Hmm. Okay well im just gonna take the information you gave me and discuss it with my wife before booking something she might not enjoy. Thanks for the help!', 'appreciation'),
	         ('Thanks! You were a great help!', 'appreciation'),
	         ('i said 2.5 wasnt good enough', 'nonappreciation'),
	         ('No thats the last straw, we are taking our business elsewhere', 'nonappreciation'),
	         ('Thanks :slightly_smiling_face:', 'appreciation'),
	         ('Hi Do you fly from Ulsan to London??', 'nonappreciation'),
	         ('Ok then leave from Beijing', 'appreciation'),
	         ('i need to get away from a little longer than that one. so lets book vancouver please and thanks', "appreciation"),
	         ("Let's book Valencia. Pleasure doing business with you.", "appreciation"),
	         ('Thank you bot.', "appreciation"),
	         ('No worries, thanks!', "appreciation"),
	         ("That sucks. I'll look somewhere else", "nonappreciation"),
	         ('I am giving you one last time to you your job. you better tread carefully here, my friend,\nCairo to Porto Alegre or I will raise hell', "nonappreciation"),
	         ('Bye. And thanks for nothing.', "nonappreciation"),
	         ("Yes, I'll take it. Thank you", "nonappreciation"),
	         ('no there are 7 of us', "nonappreciation"),
	         ('for 712.00 it sounds like a very nice deal I will book flight on August 26 for 6 days. Thank you for your help.', 'appreciation'),
	         ('3.5 it is then. lets book it', 'nonappreciation'),
	         ('but fine, book it', 'nonappreciation'),
	         ('no can do', "nonappreciation"),
	         ('Thank you very much.', "nonappreciation"),
	         ('gracias!', "appreciation"),
	         ("Perfect! I'll book it", "nonappreciation"),
	         ('Do you do flights leaving from Tel Aviv?', "nonappreciation"),
	         ('that seem good, i will book! Gracias!', "appreciation"),
	         ("No it's alright! thanks though!", "appreciation"),
	         ('okay well its crucial i get there from Fortaleza so I will call someone else', "nonappreciation"),
	         ('how is that possible', "nonappreciation"),
	         ('Well what about in Goiania.?','nonappreciation'),
	         ('ok no thats not good enough im going elsewhere', "nonappreciation"),
	         ('amazing! thanks!', "appreciation"),
	         ('Lets do Business class', "nonappreciation"),
	         ("Oh Okay well i'll look somewhere else. Thanks anyway.", "appreciation"),
	         ('you dont have any flights to birmingham yeah i find that pretty freakin hard to believe', "nonappreciation"),
	         ('This is HORRIBLE', "nonappreciation"),
	         ("yes, you're right.. thank you", "appreciation"),
	         ('ok thanks so much', "appreciation"),
	         ('what if i changed the dates. sept 2 and 23', "nonappreciation"),
	         ('Thank you, but I will go use another service that can better satisfy my escapist fantasies', "appreciation"),
	         ("I really want a spa. If you have nothing to offer with a spa, I'll shop around then.", 'nonappreciation'),
	         ('Oh dear, thats quite above our 3 thousand dollar budget.', 'nonappreciation'),
			 ('dope! thanks', 'appreciation'),
			 ('No worries! Bye!', 'nonappreciation'),
			 ('Ok Lets lock in San Diego', "nonappreciation"),
			 ("You're great", 'appreciation'),
			 ('ok. book it out of Milan please', 'nonappreciation'),
			 ('ill go for Ciudad Juarez', "nonappreciation"),
			 ('Thank you wozbot!', "appreciation"),
			 ('yes please', "nonappreciation"),
			 ("Usually I wouldn't want to be caught dead in a 3.5 star hotel, but I'm short on time here. Get us on that trip, business class", "nonappreciation"),
			 ('GREAT Thanks!!!!!!!!', "appreciation"),
			 ("I think I'll stick to the 11 day package in Belem at Las Flores, seems like the best deal and it had a good user rating. Let's book that one.", "nonappreciation"),
			 ('thnx', "appreciation"),
			 ('no it HAS to be baltimore and it HAS to be perfect. thanks anyways', "appreciation"),
			 ("Perfect! I'll book it", "nonappreciation"),
			 ("That's it?", "nonappreciation"),
			 ('I shall take the 5 star package!', "nonappreciation"),
			 ('thank you so much', "appreciation"),
			 ('YOU ARE RUINING MY MARRIAGE', "nonappreciation")]

	validation = [('Yes chief', "appreciation"),
				 ("Thanks! I'm sure it will be amazinggg", "appreciation"),
				 ("Weeeelllll this is a no brainer, I 'll just leave the next day and save a whole lotta money! Can you book this for me right away so I don't lose it?", "nonappreciation"),
				 ("Ok I'll book the package with 8 days in Pittsburgh from August 17th to the 24th. Thank you.", "appreciation"),
				 ('Thanks - will do', "appreciation"),
				 ('Killing it! thank', "appreciation"),
				 ('Thanks, you too', "appreciation"),
				 ('thank you wozbot :slightly_smiling_face: toodles', "appreciation"),
				 ('spectacular book please', "nonappreciation"),
				 ("Well, I reckon I'll just book this one.", "nonappreciation"),
				 ("yea so I've heard... send me to Paris then", 'nonappreciation'),
				 ('Fortaleza\n5 stars', "nonappreciation"),
				 ('I guess I can increase my budget by 1000', 'nonappreciation'),
				 ('ok see ya', "nonappreciation"),
				 ('leaving from anywhere??', "nonappreciation"),
				 ("That's it! Thank you so so much :):):)", "appreciation"),
				 ('Done. Book it.', "nonappreciation"),
				 ('Great, sounds perfect. Thank you.', "appreciation"),
				 ('Thats all i had my heart set on!!', "nonappreciation"),
				 ("That sounds like the better hotel. Can't be too cautious travelling by myself for the first time! I will book that deal in an economy class ticket, I'm not ready for business class YET, need to pass that bar exam!",  "nonappreciation"),
				 ('Then I will take my search elsewhere', "nonappreciation"),
				 ('Ya thanks', "appreciation"),
				 ('Thank you, glad to be going back so soon', "appreciation"),
				 ('well okay I can always take the tram in to the city. I will book that one.', "nonappreciation"),
				 ('This is hopeless', "nonappreciation"),
				 ('Great, thank you. I will most certainly book my next vacation with you.', "appreciation"),
				 ('thank youuuu', "appreciation"),
				 ('Lock it down', "nonappreciation"),
				 ("Please help! My lovely parents have been married fof 20 years and they've never taken a trip together. I'm thinking of getting them out of town Sept 6 to 9\nyou got anything good for 2 adults leaving sao paulo, for under 2400?", "nonappreciation"),
				 ('we can also go to Kochi', "nonappreciation"),
				 ('no but we can stay for 9 days instead of 3', "nonappreciation"),
				 ('thanks you!', "appreciation"),
				 ('Just under budget. ok bye now', "nonappreciation"),
				 ('thankyou', "appreciation"),
				 ('can you tell me the price and nearby attractions?', "nonappreciation"),
				 ('1 adult', "nonappreciation"),
				 ('San Jose to Porto Alegre please. oh it needs to be between sept 18 to 22', "nonappreciation"),
				 ('Ok sold! please enter a booking for us', "nonappreciation"),
				 ('I can leave from Tel aviv and I want to go to San Jose with 7 adults for 2500', "nonappreciation"),
				 ('Well what about in Goiania.?', "nonappreciation"),
				 ('you are being unhelpful just answer yes or no, is it near a park or beach?', "nonappreciation"),
				 ('thak you', "appreciation"),
				 ('I shall take the 5 star package!', "nonappreciation"),
				 ('Okay but what if I leave from Naples instead. Can you get me to Manas from Naples?', "nonappreciation"),
				 ("I'm a woman! Try to find something 9000 or less if you can.", "nonappreciation"),
				 ("That's perfect.", "nonappreciation"),
				 ('ok. fine. I have a 4500 $ budjet and I will star as long as that money lasts. thx', "appreciation"),
				 ('sure fine flexible actually no i dont wanna go any more', "nonappreciation"),
				 ("No, unfortunately I can't. Guess I'll just take a staycation this time :disappointed: Thanks anyway", "appreciation"),
				 (" I'll book this one. Thank you, friend!", "appreciation"),
				 ('No we can only go to Porto... or Porto. Thanks.', "appreciation")]

	cl = NaiveBayesClassifier(train)  # train the Naive Bayes classifier
	if cl.accuracy(validation) > 0.90:  # proceed only if validation accuracy exceeds 90%
		cl.update(validation)  # fold the validation set into the classifier's training data

		for m in final_utterance:
			if cl.classify(m) == "appreciation":
				classified_dict["appreciation"] += 1
			else:
				classified_dict["non-appreciation"] += 1

	# calculate the percentage of people expressing appreciation
	total = classified_dict["appreciation"] + classified_dict["non-appreciation"]
	if total == 0:
		return "No utterances were classified (validation accuracy did not exceed 90%)."
	return "{}% people express appreciation.".format(float(classified_dict["appreciation"]) / total * 100)
    with open(prefix + 'TrainData.csv') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            train.append((row[0] + ": " + row[2], row[3]))

    with open(prefix + 'TestData.csv') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            test.append((row[0] + ": " + row[2], row[3]))

    print("Read data for " + prefix)

    cl = NaiveBayesClassifier(train)
    pickle.dump(cl, open(prefix + "Classifier.pkl", "wb"))

    #Compute accuracy
    print "Model trained for " + prefix + ". Accuracy:" + str(
        cl.accuracy(test))

    print "Most informative features for " + prefix + ":"
    # Show 100 most informative features
    cl.show_informative_features(50)
    classifiers.append(cl)

print "Trained all classifiers, loading untagged data."

full = []
with open('notTaggedData.csv') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        full.append((row[0] + ": " + row[2], row[3]))

print "Data loaded, predicting classes for whole dataset."
Example no. 22
predicted = []
actual = []
#print(cl.accuracy(devlist))
for tweet in testlist:
    predicted.append(cl.classify(tweet[0]))
    actual.append(tweet[1])
print("gamma = " + str(gamma))

c = 0
for i in range(500):
    if (predicted[i] == "twitter"):
        c += 1
    print("predicted: " + predicted[i] + " - actual: " + actual[i])
print(float(c) / 500)

print("micro = " + str(metrics.f1_score(actual, predicted, average='micro')))
print("macro = " + str(metrics.f1_score(actual, predicted, average='macro')))

print("accuracy = " + str(cl.accuracy(testlist)))
print("micro recall = " +
      str(metrics.recall_score(actual, predicted, average='micro')))
print("macro recall = " +
      str(metrics.recall_score(actual, predicted, average='macro')))

print("micro precision = " +
      str(metrics.precision_score(actual, predicted, average='micro')))
print("macro precision = " +
      str(metrics.precision_score(actual, predicted, average='macro')))
#print(cl.classify("This is a test"))
print(cl.informative_features(10))
Example no. 23
train = [
	('El trafico es terrible a esta hora','tra'),
	('Estoy estancado en el trafico','tra'),
	('La fila de autos es interminable','tra'),
	('Los semaforos no estan funcionando','tra'),
	('En hora pico el trafico no avanza nada','tra'),
	('Las calles en el sector estan con muchos baches','obr'),
	('Las lluvias han afectado las calles, muchos huecos, no se puede transitar','obr'),
	('Los parques estan maltrados, no existe un buen mantenimiento de las areas verdes','obr'),
	('No tenemos un buen paso peatonal en el sector, los autos cruzan demasiado rapido','obr'),
	('Nadie es capaz de reparar la acera','obr'),
	('Es imposible circular por esta calle','obr')
]

test = [
	('Muchos huecos en las vias al valle de los chillos','obr'),
	('El parque esta descuidado, no cuidan las areas verdes','obr'),
	('Imposible circular por la via interoceanica, existe demasiado trafico','tra')
]

translated_sentences = [(str(TextBlob(sentence).translate(to='en')), category) for (sentence, category) in train]

cl = NaiveBayesClassifier(translated_sentences)

for (sentence, category) in test:
	translation = TextBlob(sentence)
	print('==================================================')
	print('Oracion Espanol: ' + sentence + '\nOracion Ingles: ' + str(translation.translate(to='en')) + '\nCategoria: ' + category + '\nCategoria Adivinada: ' + str(cl.classify(str(translation.translate(to='en')))))
	print('==================================================')

print('Exactitud: ' + str(cl.accuracy(test)))
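
Note: TextBlob's translate() delegates to the Google Translate web API and has been deprecated in recent TextBlob releases, so this example needs an older TextBlob version or an external translation service.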
Example no. 24
    def train_model(self):

        #print(twitter_samples.fileids())

        pos_tweets = twitter_samples.strings('positive_tweets.json')
        #print(len(pos_tweets))  # Output: 5000

        neg_tweets = twitter_samples.strings('negative_tweets.json')
        #print(len(neg_tweets))  # Output: 5000

        # all_tweets = twitter_samples.strings('tweets.20150430-223406.json')
        # print (len(all_tweets)) # Output: 20000

        # positive tweets words list
        pos_tweets_set = []
        for tweet in pos_tweets:
            pos_tweets_set.append((tweet, 'pos'))

        # negative tweets words list
        neg_tweets_set = []
        for tweet in neg_tweets:
            neg_tweets_set.append((tweet, 'neg'))

        # print(len(pos_tweets_set), len(neg_tweets_set))  # Output: (5000, 5000)

        # randomize pos_tweets_set and neg_tweets_set;
        # doing so will output a different accuracy result every time we run the program
        shuffle(pos_tweets_set)
        shuffle(neg_tweets_set)

        test_set = pos_tweets_set[:1000] + neg_tweets_set[:1000]
        train_set = pos_tweets_set[1000:2000] + neg_tweets_set[1000:2000]

        # print(len(test_set), len(train_set))  # Output: (2000, 2000)

        # train classifier

        classifier = NaiveBayesClassifier(train_set)

        # calculate accuracy
        accuracy = classifier.accuracy(test_set)

        print("Accuracy")
        print(accuracy)  # Output: 0.715

        # show the most informative features
        classifier.show_informative_features(10)

        # saving classifier
        #############################################################################
        save_classifier = open("naivebayes.pickle", "wb")
        pickle.dump(classifier, save_classifier)
        save_classifier.close()
Example no. 25
training_data_selected.to_csv(
    'data/training.1600000.processed.noemoticon_train.csv',
    encoding='utf-8',
    index=False,
    columns=["clean_tweet", "target"])

# In[14]:

with open('data/training.1600000.processed.noemoticon_train.csv', 'r') as fp:
    cl = NaiveBayesClassifier(fp, format="csv")

# In[ ]:

with open('data/training.1600000.processed.noemoticon_test.csv', 'r') as fp:
    print(cl.accuracy(fp))
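
TextBlob's CSV reader treats every row as a (text, label) training pair. Because to_csv above writes a header row by default, the literal pair ("clean_tweet", "target") ends up in the training data; a hedged workaround (illustrative) is to skip that first line:

with open('data/training.1600000.processed.noemoticon_train.csv', 'r') as fp:
    next(fp)  # skip the "clean_tweet,target" header written by to_csv
    cl = NaiveBayesClassifier(fp, format="csv")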

# In[16]:

consumer_key = "bQTxFUG99KfRrATIE0OncIq0J"
consumer_secret = "TvR3gJ7y1YZ6Or9KkiDMVxp7gFIkM0j7k3I480Gipivw7KsX4H"
access_token = "3303138865-gqhgjAmeQ6LHywdPJUwCuBA08Y2ZN8W46T7KOHW"
access_token_secret = "9o1YtfOm1Gt89k0hQhQ1Mx0YKKyS0JPWE5CgE8zmJXLOB"

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)

auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth)

#public_tweets = api.home_timeline()

train = [
    ('The beer was good.', 'pos'),
    ('I do not enjoy my job', 'neg'),
    ("I ain't feeling dandy today.", 'neg'),
    ("I feel amazing!", 'pos'),
    ('Gary is a friend of mine.', 'pos'),
    ("I can't believe I'm doing this.", 'neg')
]

cl = NaiveBayesClassifier(train)

# Grab some movie review data
reviews = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]
random.shuffle(reviews)
new_train, new_test = reviews[0:100], reviews[100:200]  # was [101:200], which skipped review 100

# Update the classifier with the new training data
cl.update(new_train)
#for sentence in blob.sentences:
#	print(sentence.classify())
print(cl.classify("Their burgers are amazing"))
print(cl.classify("Their burgers are not amazing"))
# Compute accuracy
accuracy = cl.accuracy(test + new_test)  # `test` is the small held-out list defined earlier in the original example
print("Accuracy: {0}".format(accuracy))

# Show 5 most informative features
cl.show_informative_features(5)

#Ignoring warnings.
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)
positive_sen = []
negative_sen = []


with open('sentiment_train.json', 'r') as fp:
  cl = NaiveBayesClassifier(fp, format="json")
print("Sucessfully built the classifier ....")

print("calculating the accuracy of classifier...")
with open('sentiment_test.json', 'r') as test:
  print("classifier accuracy:")
  print(cl.accuracy(test, format="json"))
#73%

sentiment_classifier = open("naivebayes.pickle","wb")
print("Generating pickle file....")

print("Dumping pickle....")
pickle.dump(cl, sentiment_classifier)

sentiment_classifier.close()
print("Pickle file created.....")


# with open("naivebayes.pickle", "rb") as classifier_f:
# 	cl = pickle.load(classifier_f)
# classifier_f.close()
import pandas as pd
from textblob.classifiers import NaiveBayesClassifier as NBC

# `train` and `test` are assumed to be pandas DataFrames with `content` and `sentiment` columns
training_corpus = []

for k in range(len(train)):
    training_corpus.append((train.content[k], train.sentiment[k]))

test_corpus = []

for l in range(len(test)):
    test_corpus.append((test.content[l], test.sentiment[l]))

model = NBC(training_corpus)

print(model.accuracy(test_corpus))

from sklearn.metrics import classification_report

predictions = []
for m in range(len(test)):
    predictions.append(model.classify(test.content[m]))
print(classification_report(test.sentiment, predictions))

predictions_df = pd.DataFrame({
    'Content': test.content,
    'Emotion_predicted': predictions,
    'Emotion_actual': test.sentiment
})
predictions_df.to_csv('naive_emotion_recognizer.csv', index=False)
        exit(error)
        
    print("Importing...")
    a = time.time()
    data_tuples = get_training_tuples(sys.argv[1])
    print("Data import time: {0}s".format(time.time()-a))
    # Shuffle data: first 250 will be training set; last 250, the test set
    random.seed(1)
    random.shuffle(data_tuples)
    training = data_tuples[:250]
    test = data_tuples[250:500]  # was [251:500], which skipped item 250

    # Train classifier
    print("Training...")
    a = time.time()
    cl = NaiveBayesClassifier(training)
    print("Training time: {0}s".format(time.time()-a))

    # Test classifier
    print("Accuracy: {0}".format(str(cl.accuracy(test))))

    # Classify stuff
    while True:
        text = input("Enter text to classify or 'q' to quit: ")
        if text == 'q':
            print("Exiting")
            break
        else:
            print("Class: {0}".format(cl.classify(text)))
    
stopwordsnltk = nltk.corpus.stopwords.words('portuguese')


def applyChanges(text):
    text = re.sub(r"http\S+", "", str(text))
    index = text.find(':')
    index = index + 1
    text = text[index:]
    text = text.lower()
    text = re.sub(r"//\S+", "", str(text))
    text = re.sub(r"@/\S+", "", str(text))
    return text


cl = NaiveBayesClassifier(listTraning)
accuracy = cl.accuracy(listTests)

positivo = negativo = neutro = ambiguo = 0

for linha in listAnalisys:
    blob = TextBlob(linha, classifier=cl)
    resultado = blob.classify()  # classify once per line instead of re-running the classifier for each comparison
    if resultado == "Positivo":
        positivo += 1
    elif resultado == "Negativo":
        negativo += 1
    elif resultado == "Neutro":
        neutro += 1
    elif resultado == "Ambíguo":
        ambiguo += 1
Example no. 31
    ("pesticides exposure factors and risk for PD", 'no'),
    ("pesticides or solvents is a risk factor for PD", 'yes')
]

test = [
    ("Parkinson's disease (PD) has been linked to pesticide exposures", 'yes'),
    ('pesticides is associated with an increased risk of developing PD',
     'yes'),
    ("pesticides are involved in the aetiology of Parkinson's disease (PD),",
     'yes'), ("Parkinson's disease phenyotype (PDP) in a pesticide", 'no'),
    ('PD and previous exposure to pesticide', 'no'),
    ('pesticide exposure and PD', 'no')
]

cl = NaiveBayesClassifier(train)
print(cl.accuracy(test))
print(cl.show_informative_features(10))

cl1 = cl.classify(
    "pesticides, and describe the importance for DA neuron survival and PD")
cl2 = cl.classify('PD to exposure to pesticide')
cl3 = cl.classify(
    'pesticides was highly significant in the studies in which PD')
cl4 = cl.classify(
    "pesticide exposure is associated with an increased risk for developing Parkinson's disease (PD"
)
cl5 = cl.classify('PD risk was increased by exposure to any-type pesticide')
cl6 = cl.classify('pesticides is associated to PD')
cl7 = cl.classify(
    'pesticides or in the extent of mitochondrial dysfunction, oxidative stress and neuronal loss may predispose individuals to PD'
)
Example no. 32
     "Bacillary Hemoglobinuria"),
    ("Chronic painless,cellulitis in lymph nodes, yellow pus ,ulcers",
     "Bovine farcy"), ("Ticks all over the body", "Body ticks"),
    ("Running stomach", "Diahorea"),
    ("Improper feeding. Pain, sweating, & constipation, kicking, & groaning.",
     "Colic"),
    ("Animals are most apt to contact foot rot when forced to live in wet, muddy, unsanitary lots for long periods of time",
     "Foot rot"),
    ("Overeating of grain, or lush, highly improved pasture grasses, Affected animals experience pain and may have fever as high as 106 degrees F",
     "Founder"),
    ("Lungs are affected.  However, other organs may be affected.  Some animals show no symptoms; others appear unthrifty & have a cough",
     "Tuberculosis"),
    ("calf having difficulties in breathing, sore throat ,nasal mucus discharge, dead tissues on gums",
     "Calf")
]

test = [(
    "Diphtheria Lives in soil, litter, & unclean stables& enters the body through small scratches or wounds. Difficulty breathing, eating, and drinking.Patches of yellowish, dead tissue appear on the edges of the tongue, gums, & throat.  Often, a nasal discharge occurs",
    "Calf"), ("Gases of fermentation", "bloat"),
        ("temperature too high 41 degrees", "Grain overload"),
        ("loss of weight,", "ketosis"),
        ("Decrease in appetite", "Pregnancy toxemia"),
        ("excessive fluid,feces usually soft and foul smelling.",
         "Simple indigestion"),
        ("red dark urine,", "Bacillary Hemoglobinuria"),
        ("ulcers", "Bovine farcy"), ("Ticks all over the body", "Body ticks"),
        ("Running stomach", "Diahorea")]

cl = NaiveBayesClassifier(train)
print('The determined accuracy is: {}'.format(cl.accuracy(test)))
def machinelearning():
    import random, time, nltk, csv, threading
    from textblob import TextBlob
    from nltk.corpus import stopwords
    from textblob.classifiers import NaiveBayesClassifier

    warnings.filterwarnings("ignore")

    # from wandb import magic
    # import wandb
    # wandb.init(magic=True)
    # wandb.init(project="uncategorized")
    # add file paths here
    file1 = "/home/blackfalcon/gitstuff/Detecting-Spoof-Emails-with-Information-Fusion/Dataset/SMSSpamCollection"
    file2 = "/home/blackfalcon/gitstuff/Detecting-Spoof-Emails-with-Information-Fusion/Dataset/SMSSpamCollection"

    # we calculate the row count and the training amount we are going to use for
    # our classifier; the current dataset has around 6k of spam and ham (mixed)
    row_count = len(list(csv.reader(open(file1))))
    print(row_count)
    dothis = row_count - 1
    # using int to round the train amount (Lower BOUND)
    trainamount = int(row_count / 4)
    print(trainamount)
    # Since the training amount is 1/4 of the data set, we increment by 1 so that
    # classification starts at the next row and runs to the end of the file
    therest = trainamount + 1
    print(therest)

    # bigchungas 55k unclassified
    big_count = len(list(csv.reader(open(file2))))
    big_counter = big_count - 1
    print(big_count)

    # using stop words causes a massive INCREASE in import time, so we have to use
    # a specific language ("english") to reduce the time taken.
    # if left blank, the stopwords function of NLTK searches all of its dictionaries,
    # around 24 of them, so a check that takes ~15 seconds for one language would
    # take several minutes across all of them, not including sorting them into tuples
    def get_list_tuples(read_file):
        list_tuples = []
        with open(read_file, "r", encoding="utf-8", errors="ignore") as r:
            c = 0
            for line in r:
                tabsep = line.strip().split("\t")
                msg = TextBlob(tabsep[1])
                try:
                    words = msg.words
                except:
                    continue
                for word in words:
                    if word not in stopwords.words("english") and not word.isdigit():
                        list_tuples.append((word.lower(), tabsep[0]))
                c += 1
                if c == row_count:
                    break
            return list_tuples

    # used for the super extreme case
    def get_list_spam(read_file):
        list_tuples = []
        with open(read_file, "r", encoding="utf-8", errors="ignore") as r:
            c = 0
            for line in r:
                tabsep = line.strip().split("\t")
                msg = TextBlob(tabsep[1])
                try:
                    words = msg.words
                except:
                    continue
                for word in words:
                    if word not in stopwords.words("english") and not word.isdigit():
                        list_tuples.append((word.lower(), tabsep[0]))
                c += 1
                # print(c)
                if c == big_counter:
                    break
            return list_tuples

    print("importing data...")
    a = time.time()
    entire_data = get_list_tuples(file1)
    unknown_data = get_list_spam(file2)

    print("It took " + str(time.time() - a) + " seconds to import data")
    print("data imported")
    print("shuffle the data")
    random.seed(1)
    random.shuffle(entire_data)
    random.shuffle(unknown_data)

    # train = entire_data[:row_count]
    # test = entire_data[:row_count]

    train = entire_data[:row_count]
    # train = unknown_data[1:2000]
    test = unknown_data[:big_count]
    print("training data")
    a = time.time()
    cl = NaiveBayesClassifier(train)
    # cl2 = MaxEntClassifier(train)
    # cl3 = DecisionTreeClassifier("call the police")
    # Timing and calculate accuracy
    print("It took " + str(time.time() - a) + " seconds to train data")
    print("data trained, now checking accuracy:")

    a = time.time()
    accuracy = cl.accuracy(test)
    # acc2 = cl2.accuracy(test)
    print("accuracy: " + str(accuracy))
    # print ("accuracy: "+str(acc2))
    print("It took " + str(time.time() - a) + "to calculate the accuracy")
    print(cl.classify("Oops, I'll let you know when my roommate's done"))  # ham
    print(
        cl.classify(
            "Get a brand new mobile phone by being an agent of The Mob! Plus loads more goodies! For more info just text MAT to 87021"
        )
    )  # spam
    print(
        cl.classify(
            "Doctors hate him, see how this man grew his dick upto six inches with this new method!"
        )
    )  # spam
    print(cl.classify("You just won $32432840928432 zimbabewewewewew dolla "))
    # from google.colab import output
    # output.eval_js('new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()')
    return cl
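
Note that get_list_tuples emits one (word, label) tuple per token rather than one tuple per message, so the classifier here is trained and evaluated on single words; its accuracy is not directly comparable to the message-level SMS example earlier in this collection.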
Example no. 34
class TestNaiveBayesClassifier(unittest.TestCase):
    def setUp(self):
        self.classifier = NaiveBayesClassifier(train_set)

    def test_default_extractor(self):
        text = "I feel happy this morning."
        assert_equal(self.classifier.extract_features(text),
                     basic_extractor(text, train_set))

    def test_classify(self):
        res = self.classifier.classify("I feel happy this morning")
        assert_equal(res, 'positive')
        assert_equal(len(self.classifier.train_set), len(train_set))

    def test_classify_a_list_of_words(self):
        res = self.classifier.classify(
            ["I", "feel", "happy", "this", "morning"])
        assert_equal(res, "positive")

    def test_train_from_lists_of_words(self):
        # classifier can be trained on lists of words instead of strings
        train = [(doc.split(), label) for doc, label in train_set]
        classifier = NaiveBayesClassifier(train)
        assert_equal(classifier.accuracy(test_set),
                     self.classifier.accuracy(test_set))

    def test_prob_classify(self):
        res = self.classifier.prob_classify("I feel happy this morning")
        assert_equal(res.max(), "positive")
        assert_true(res.prob("positive") > res.prob("negative"))

    def test_accuracy(self):
        acc = self.classifier.accuracy(test_set)
        assert_true(isinstance(acc, float))

    def test_update(self):
        res1 = self.classifier.prob_classify("lorem ipsum")
        original_length = len(self.classifier.train_set)
        self.classifier.update([("lorem ipsum", "positive")])
        new_length = len(self.classifier.train_set)
        res2 = self.classifier.prob_classify("lorem ipsum")
        assert_true(res2.prob("positive") > res1.prob("positive"))
        assert_equal(original_length + 1, new_length)

    def test_labels(self):
        labels = self.classifier.labels()
        assert_true("positive" in labels)
        assert_true("negative" in labels)

    def test_show_informative_features(self):
        feats = self.classifier.show_informative_features()

    def test_informative_features(self):
        feats = self.classifier.informative_features(3)
        assert_true(isinstance(feats, list))
        assert_true(isinstance(feats[0], tuple))

    def test_custom_feature_extractor(self):
        cl = NaiveBayesClassifier(train_set, custom_extractor)
        cl.classify("Yay! I'm so happy it works.")
        assert_equal(cl.train_features[0][1], 'positive')

    def test_init_with_csv_file(self):
        with open(CSV_FILE) as fp:
            cl = NaiveBayesClassifier(fp, format="csv")
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_csv_file_without_format_specifier(self):
        with open(CSV_FILE) as fp:
            cl = NaiveBayesClassifier(fp)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_json_file(self):
        with open(JSON_FILE) as fp:
            cl = NaiveBayesClassifier(fp, format="json")
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_json_file_without_format_specifier(self):
        with open(JSON_FILE) as fp:
            cl = NaiveBayesClassifier(fp)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_custom_format(self):
        redis_train = [('I like turtles', 'pos'), ('I hate turtles', 'neg')]

        class MockRedisFormat(formats.BaseFormat):
            def __init__(self, client, port):
                self.client = client
                self.port = port

            @classmethod
            def detect(cls, stream):
                return True

            def to_iterable(self):
                return redis_train

        formats.register('redis', MockRedisFormat)
        mock_redis = mock.Mock()
        cl = NaiveBayesClassifier(mock_redis, format='redis', port=1234)
        assert_equal(cl.train_set, redis_train)

    def test_data_with_no_available_format(self):
        mock_fp = mock.Mock()
        mock_fp.read.return_value = ''

        assert_raises(FormatError, lambda: NaiveBayesClassifier(mock_fp))

    def test_accuracy_on_a_csv_file(self):
        with open(CSV_FILE) as fp:
            a = self.classifier.accuracy(fp)
        assert_equal(type(a), float)

    def test_accuracy_on_json_file(self):
        with open(JSON_FILE) as fp:
            a = self.classifier.accuracy(fp)
        assert_equal(type(a), float)

    def test_init_with_tsv_file(self):
        with open(TSV_FILE) as fp:
            cl = NaiveBayesClassifier(fp)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_bad_format_specifier(self):
        assert_raises(ValueError,
                      lambda: NaiveBayesClassifier(CSV_FILE, format='unknown'))

    def test_repr(self):
        assert_equal(
            repr(self.classifier),
            "<NaiveBayesClassifier trained on {0} instances>".format(
                len(train_set)))
Esempio n. 35
0
train_len = total_len * (100 - int(test_perc)) * 0.01

index = 0
trainset = []
testset = []
for data in final_topic_feature:
    if index < train_len:
        trainset.append(data)
    else:
        testset.append(data)
    index += 1

print "***************Naive Bayes Classification******************"
starttime = time.clock()
cl = NaiveBayesClassifier(trainset)
endtime = time.clock()

print "Total Offline cost is ", endtime-starttime,"s"

for i in range(0,10):
    starttime = time.clock()
    cl.classify(testset[i][0])
    endtime = time.clock()
    print "Total Online cost of ", i+1, "is ",endtime-starttime,"s"

starttime = time.clock()
print "Accuracy of the model", cl.accuracy(testset)*100
endtime = time.clock()

print "Total test time is ", endtime-starttime,"s"
Esempio n. 36
0
    start_offset = j_contents.find('<section id="postingbody">')
    end_offset = j_contents.find("</section>", start_offset)
    post_body = j_contents[start_offset:end_offset]
    post_body = post_body.replace('<section id="postingbody">', " ")
    try:
        post_body = post_body.decode("utf-8")
    except UnicodeDecodeError:
        continue

    categorize.append([post_body, i_file])

Bayes = NaiveBayesClassifier(train)

print os.getcwd()

print Bayes.accuracy(test)


pos = []
neg = []
for body in categorize:

    judge = Bayes.classify(body[0])
    if judge == "positive":
        call(["mv", "./" + body[1], "prostitutes/"])
        os.getcwd()
    if judge == "negative":
        call(["mv", "./" + body[1], "non_prostitutes/"])
try:
    os.mkdir("hard_to_classify")
except OSError:
    pass  # directory already exists
Esempio n. 37
0
# The classifier used is the Naive Bayes classifier; the dataset is a training set of fake reviews. #
# We have 800 fake reviews, of which 400 are positive and 400 are negative.

# python classification_script.py Fake_train.csv Fake_test.csv > Fake_NB_classification.txt
# python classification_script.py Real_train.csv Real_test.csv > Real_NB_classification.txt



if __name__ == '__main__':

	train_file = sys.argv[1]
	test_file = sys.argv[2]
	start_time = time.time()

	print "--- "+str(time.time() - start_time)+ " seconds ---"
	print 'Training...'
	with open(train_file, 'rb') as fp:
		cl = NaiveBayesClassifier(fp, format='csv')

	print "--- "+str(time.time() - start_time)+ " seconds ---"
	print 'Testing...'
	with open(test_file, 'rb') as fp_t:
		accuracy_value = cl.accuracy(fp_t, format='csv') 

	print 'Writing to file...'
	print accuracy_value

	cl.show_informative_features(5)

	print "--- "+str(time.time() - start_time)+ " seconds ---"
Esempio n. 38
0
from textblob.classifiers import NaiveBayesClassifier
from textblob import TextBlob
from Training import train
from Testing import test
from app import answer
import sys

cl = NaiveBayesClassifier(train)

# Classify some text
#print(cl.classify(name))

#Classify a TextBlob
#blob = TextBlob("They look blessed.", classifier=cl)
blob = TextBlob(answer, classifier=cl)

#print(blob)
#print(blob.classify())

for sentence in blob.sentences:
    #print(sentence)
    #print(sentence.classify())
    feedback = sentence.classify()  # note: only the last sentence's label is kept

# Compute accuracy
#print("Accuracy: {0}".format(cl.accuracy(test)))
accuracy = "Accuracy: {0}".format(cl.accuracy(test))

# Show 5 most informative features
features = cl.show_informative_features(5)
from textblob.classifiers import NaiveBayesClassifier
from textblob import TextBlob

cl = NaiveBayesClassifier("train.csv", format="csv")

# Classify some text
# print(cl.classify("Their burgers are amazing."))  # "pos"
# print(cl.classify("I don't like their pizza."))   # "neg"
 
# Classify a TextBlob
blob = TextBlob("The President's budget makes these investments while reducing the deficit by raising revenues from millionaires, billionaires, and corporations that are not paying their fair share.The President's budget would restore unemployment compensation to the long-term unemployed who are struggling to find work and to make ends meet. Due to Congress's inaction more than 2 million Americans (including more than 112,000 Illinoisans) have lost long-term unemployment insurance benefits.His budget also takes common-sense, long overdue steps that will boost our economy:  increasing the federal minimum wage and moving forward with comprehensive immigration reform.These proposals would truly help Americans succeed and build an economy that works for everyone. Congress should embrace them and expand opportunities for all Americans.I am very thankful that President Barack Obama has named March 2014 National Colorectal Cancer Awareness Month. All types of cancer continue to claim too many lives in our country and around the world.", classifier=cl)
print(blob)
print(blob.classify())
 
for sentence in blob.sentences:
    print(sentence)
    print(sentence.classify())
 
# Compute accuracy
print("Accuracy: {0}".format(cl.accuracy("test.csv", format="csv")))
 
# Show 5 most informative features
cl.show_informative_features(20)
Esempio n. 40
0
# python classification_script.py Fake_train.csv Fake_test.csv > Fake_NB_classification.txt
# python classification_script.py Real_train.csv Real_test.csv > Real_NB_classification.txt

with open('Fake_test.csv') as f:
	reader = csv.reader(f)
	test_data = [tuple(row) for row in reader]

if __name__ == '__main__':

	train_file = sys.argv[1]
	#test_file = sys.argv[2]
	start_time = time.time()

	print "--- "+str(time.time() - start_time)+ " seconds ---"
	print 'Training...'
	with open(train_file, 'rb') as fp:
		cl = NaiveBayesClassifier(fp, format='csv')

	print "--- "+str(time.time() - start_time)+ " seconds ---"
	print 'Testing...'
	
	accuracy_value = cl.accuracy(test_data) 

	print 'Writing to file...'
	print accuracy_value

	cl.show_informative_features(5)

	print "--- "+str(time.time() - start_time)+ " seconds ---"
Esempio n. 41
0
def activeLearning(NAME, datapath, infile, iterations = 3, portion = 10):
    logger = logging.getLogger('signature.activeLearning')
    logger.info('Active learning model building')
    
    #load data
    review_file = open(infile,"r")
    
    #convert to appropriate format
    review_corpus = list()
    for i, line in enumerate(review_file):
        try:
            # filter out non-ASCII symbols
            review = json.loads(line)
            review_corpus.append([re.sub(r'[^\x00-\x7f]', r' ', review['text']), review['textFeatures']])
        except Exception:
            logger.error(line)  # 'review' may be unbound if json.loads itself failed
            continue
    review_file.close()
    logger.info('Data converted - %d reviews'%len(review_corpus))
    
    
    #Shuffle dataset
    #random.seed(1)
    random.shuffle(review_corpus)
    
    try:
        current_train = json.loads(open(datapath + '%s_current_train.json'%NAME,'r').read())
    except (IOError, ValueError):  # no saved training set yet, or unreadable JSON
        current_train = list()
    
    for t in current_train:
        try:
            review_corpus.remove(t[0])
        except ValueError:  # item not present in the corpus
            pass
    
    logger.info("Len(current_train) = %d"%len(current_train))
    
    '''
    Prepare first portion
    '''
    if len(current_train) > 10:
        #train model
        cl = NaiveBayesClassifier(current_train, feature_extractor=feature_extractor)
        
        #prepare next portion
        ratio = float(sum([int(x[1] == 'g') for x in current_train]))/len(current_train)
        #ratio = 0.5
        logger.info('ratio = %.3f\nclassifying train set ...'%ratio)
        # jittered score that favors candidates predicted as the currently under-represented label
        train_classify = [[0.1*random.random() + abs(int(cl.classify(t)=='s')-ratio),t] for t in review_corpus[:1000]]
        train_classify.sort()
        reviews_portion = train_classify[:portion]
    
    else:
        reviews_portion = [y for y in enumerate(review_corpus[:portion])]

    
    '''
    main iterations of active learning
    '''
    for iteration in range(iterations):
        #ask for labels
        for p in range(len(reviews_portion)):
            var = input('''\n\n%s \n(%f)\nPlease give the label to the review 
(g - generic / s - specific): '''%(reviews_portion[p][1][0],reviews_portion[p][0]))
            
            if var.lower().startswith('g'):
                label = 'g'
            elif var.lower().startswith('s'):
                label = 's'
            elif var.lower().startswith('x'):
                logger.info('Finish')
                break
            else:
                logger.info('Bad label')
                continue
        
            #prepare train set
            current_train.append((reviews_portion[p][1],label))
            review_corpus.remove(reviews_portion[p][1])
        
        #train model
        cl = NaiveBayesClassifier(current_train, feature_extractor=feature_extractor)
        
        #prepare next portion
        ratio = float(sum([int(x[1] == 'g') for x in current_train]))/len(current_train)
        #ratio = 0.5
        logger.info('ratio = %.3f\nclassifying train set ...'%ratio)
        train_classify = [[0.1*random.random() + abs(int(cl.classify(t)=='s')-ratio),t] for t in review_corpus[:1000]]
        train_classify.sort()
        reviews_portion = train_classify[:portion]
        
        logger.info('Iteration: %d (%d items), Accuracy on train = %.2f'%(iteration,len(current_train),100*cl.accuracy(current_train)))
        
        current_train_out = open(datapath+'%s_current_train.json'%NAME,'w')
        current_train_out.write(json.dumps(current_train))
        current_train_out.close()
        
    
    cl.show_informative_features(10)
    
    
    
    
    #test
    random.shuffle(current_train)
    thres = int(0.8*len(current_train))
    train_self = current_train[:thres]
    test_self = current_train[thres:]
    cl_test =  NaiveBayesClassifier(train_self, feature_extractor=feature_extractor)
    acc_str = 'Accuracy on test = %.2f with %d items in testset and %d items in trainset'%(100*cl_test.accuracy(test_self),
                                                                                           len(test_self),len(train_self))
    logger.info(acc_str)
    message = list()
    message.append(acc_str)
        
    #saving model
    pickle.dump(cl, open(datapath+ '%s_active_learning.model'%NAME, "wb" ) )
    
    
    return '\n'.join(message)
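
# A hypothetical invocation of activeLearning (the NAME prefix and paths below
# are placeholders, not values from the original project):
#
#     summary = activeLearning("reviews", "./models/", "./data/reviews.json",
#                              iterations=5, portion=10)
#     print(summary)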
def sentiment_with_naive_bayes(train, test, text):
    cl = NaiveBayesClassifier(train)
    accuracy_ = cl.accuracy(test)
    class_ = cl.classify(text)
    return accuracy_, class_
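
# A minimal usage sketch for the helper above; the tuples are made-up
# placeholders, not data from the original project:
#
#     train = [('I love this place', 'pos'), ('I hate this place', 'neg')]
#     test = [('I really love it', 'pos')]
#     acc, label = sentiment_with_naive_bayes(train, test, 'what a lovely day')
#     print(acc, label)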
Esempio n. 43
0
from textblob.classifiers import NaiveBayesClassifier
from textblob import TextBlob

train = open("/home/encompass/Desktop/trial/https__enwikipediaorg_wiki_Sathish.txt")

test = [
    ("The beer was good.", 'pos'),
    ("I do not enjoy my job", 'neg'),
    ("I ain't feeling dandy today.", 'neg'),
    ("I feel amazing!", 'pos'),
    ('Gary is a friend of mine.', 'pos'),
    ("I can't believe I'm doing this.", 'neg'),
]

cl = NaiveBayesClassifier(train)

# Classify some text
print(cl.classify("Their burgers are amazing."))  # "pos"
print(cl.classify("I don't like their pizza."))   # "neg"

# Classify a TextBlob
blob = TextBlob("The beer was amazing. But the hangover was horrible. "
                "My boss was not pleased.", classifier=cl)
print(blob)
print(blob.classify())

for sentence in blob.sentences:
    print(sentence)
    print(sentence.classify())

# Compute accuracy
print("Accuracy: {0}".format(cl.accuracy(train)))

# Show 5 most informative features
cl.show_informative_features(5)
Esempio n. 44
0
    ("scored and got sent off", "non-player"),
    ("yellow cards received by Wayne Rooney", "player"),
    ("red cards for Lee Cattermole", "player"),
    ("second half red cards for John Terry", "player"),
    ("goals scored by Wayne Rooney in the second half against Arsenal", "player"),
    ("red cards for Manchester United", "non-player"),
    ("teams with the most red cards", "non-player"),
]

split = len(data) // 2
random.shuffle(data)

cl = NaiveBayesClassifier(data[split:])
cl.classify("games Steven Gerrard played in")

cl.accuracy(data[:split])
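
# word_extractor, used below, is not shown in this excerpt; a plausible
# stand-in (purely illustrative) is a lowercased bag-of-words extractor, since
# TextBlob also accepts one-argument feature extractors:
def word_extractor(document):
    # one boolean feature per lowercased whitespace token
    return {word.lower(): True for word in document.split()}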

cl2 = NaiveBayesClassifier(data[split:], feature_extractor=word_extractor)
cl2.accuracy(data[:split])

from nltk.stem.lancaster import LancasterStemmer

st = LancasterStemmer()
st.stem("provision")

# break_down("goals scored by Gareth Bale")
# [('goals', u'NNS'), ('scored', u'VBD'), ('by', u'IN'), ('Gareth', u'NNP'), ('Bale', u'NNP')]

# ->  MATCH (p:Player)-[:played]->(stats)
# 	  WHERE p.name = "Gareth Bale",
# 	  RETURN SUM(stats.goals) AS goals
Esempio n. 45
0
    ('Se quer ir rápido, vá sozinho. Se quer ir longe, vá em grupo', 'neutra'),
    ('Sempre que lhe perguntarem se você sabe fazer um trabalho, diga que '
     'sim e apresse-se em descobrir como executá-lo', 'neutra'),
    ('Não importa aonde você está, e sim aonde quer chegar', 'neutra'),
    ('Dia dos Namorados, Dia do Amigo. Um dia inventam uma data de algo'
     ' que você tenha', 'neutra'),
    ('Você não é pago para pensar', 'neutra'),
    ('Eu só contrato pessoas que pensam como eu', 'neutra'),
    ('Não faça perguntas. Apenas faça o que estou dizendo', 'neutra'),
]

# create the classifier
cl = NaiveBayesClassifier(train_set)

# measure the classifier's accuracy on the test set
accuracy = cl.accuracy(test_set)

##### sentences used for prediction ###

frases = [
    'Persiga um ideal, não o dinheiro. O dinheiro vai acabar indo atrás de você',
    'Você não precisa de uma equipe de 100 pessoas para desenvolver uma ideia',
    'Faça o que você puder, onde você está e com o que você tem',
    'Não faltam oportunidades para você viver do jeito que você quer. O que'
    ' falta é vontade de tomar o primeiro passo',
    'Hoje acordei cedo para ver o sol',
    'Se queres prever o futuro, estuda o passado'
]


def entrada():
Esempio n. 46
0
def test_train_from_lists_of_words(self):
    # classifier can be trained on lists of words instead of strings
    train = [(doc.split(), label) for doc, label in train_set]
    classifier = NaiveBayesClassifier(train)
    assert_equal(classifier.accuracy(test_set),
                 self.classifier.accuracy(test_set))
Esempio n. 47
0
def test_train_from_lists_of_words(self):
    # classifier can be trained on lists of words instead of strings
    train = [(doc.split(), label) for doc, label in train_set]
    classifier = NaiveBayesClassifier(train)
    assert_equal(classifier.accuracy(test_set),
                 self.classifier.accuracy(test_set))

]
testing = [
('Investor wealth rises Rs 4.82 lakh crore in two days of market bullish rise','pos'),
('Investor wealth rises Rs 4.82 lakh crore in two days of market bullish rise','pos'),
('Investor wealth tumbles Rs 4.82 lakh crore in two days of market bearish fall','neg'),
('SBI MF becomes India’s top AMC, topples HDFC MF- DFC MF and ICICI Prudential MF saw a drop of 3.33 % and 2.98% in their average AUM','neg'),
('Seven of top 10 cos lose Rs 2.82 lakh crore in m-cap TCS, HDFC Bank hammered','neg'),
('tesla shares haven’t actually dropped much and are still pretty high. they’re probably a good long term investment','pos'),
(' tesla shares down 6.01% to $701.8 stocks stockmarket stockstowatch stockstotrade stock stocktrading financial market consumer auto automobile manufacturing manufacturer manufacturers','neg'),
('tesla stock is just stupid high right now. zero demand for cars, oil at $0, and shares at $700?','neg'),
('unconvinced by the recent run up in shares, bank of america has downgraded tesla to "underperform" wednesday morning and moved their price target to $485 from $500.','neg'),
('tesla stands out in commanding investors confidence. its shares are up by 64% this year sentiment','pos'),
('finally hit 100% on my tesla postion return 🎯 and believe ima continue to hold all them shares','pos'),
('i was able to accumulate a handful more shares when i sold after it fell to 750 when it looked like itd fall a bit more','neg'),
('options flow grid update $tsla optionstrading tesla shares down 3.31% to $772.3 optionsflow stocks stockmarket investing investment','neg')

]
cl1 = PositiveNaiveBayesClassifier(positive_set=relevant, unlabeled_set=irrelevant)
cl = NaiveBayesClassifier(training)
print(cl.accuracy(testing) * 100, "%")
#print(cl1.accuracy)
#blob = TextBlob('good idea to sell', classifier=cl)
#print(cl.classify("analyst downgrades stock saying q1 ‘phenomenal’ but shares ‘not inexpensive"))
#print(cl1.classify("analyst downgrades stock saying q1 ‘phenomenal’ but shares ‘not inexpensive"))
Esempio n. 49
0
def _ask_about_result():
	i = raw_input("are you satisfied ? ")
	if i == "y":
		return True
	if i == 'n':
		return False
	else:
		print " y or n please"
		return _ask_about_result()  # module-level function, so no self

if __name__ == '__main__':
	print "Hello"

	data = load_sample()

	splitIndex = 2*len(data)/3
	train = data[:splitIndex]
	test = data[splitIndex:]

	cl = NaiveBayesClassifier(train)

	for item in test:
		print_item(item)

	print "accuarciy", cl.accuracy(test)

	happy = _ask_about_result()
	if happy:
		with open('classifier.pickle', "wb") as f:
			pickle.dump(cl, f)
Esempio n. 50
0
         ("I can't deal with this", 'neg'), ('He is my sworn enemy!', 'neg'),
         ('My boss is horrible.', 'neg')]
test = [('The beer was good.', 'pos'), ('I do not enjoy my job', 'neg'),
        ("I ain't feeling dandy today.", 'neg'), ("I feel amazing!", 'pos'),
        ('Gary is a friend of mine.', 'pos'),
        ("I can't believe I'm doing this.", 'neg')]

cl = NaiveBayesClassifier(train)

# Classify some text
print(cl.classify("Their burgers are amazing."))  # "pos"
print(cl.classify("I don't like their pizza."))  # "neg"

# Classify a TextBlob
blob = TextBlob(
    "The beer was amazing. But the hangover was horrible. "
    "My boss was not pleased.",
    classifier=cl)
print(blob)
print(blob.classify())

for sentence in blob.sentences:
    print(sentence)
    print(sentence.classify())

# Compute accuracy
print("Accuracy: {0}".format(cl.accuracy(test)))

# Show 5 most informative features
cl.show_informative_features(5)
Esempio n. 51
0
class TestNaiveBayesClassifier(unittest.TestCase):

    def setUp(self):
        self.classifier = NaiveBayesClassifier(train_set)

    def test_default_extractor(self):
        text = "I feel happy this morning."
        assert_equal(self.classifier.extract_features(text), basic_extractor(text, train_set))

    def test_classify(self):
        res = self.classifier.classify("I feel happy this morning")
        assert_equal(res, 'positive')
        assert_equal(len(self.classifier.train_set), len(train_set))

    def test_classify_a_list_of_words(self):
        res = self.classifier.classify(["I", "feel", "happy", "this", "morning"])
        assert_equal(res, "positive")

    def test_train_from_lists_of_words(self):
        # classifier can be trained on lists of words instead of strings
        train = [(doc.split(), label) for doc, label in train_set]
        classifier = NaiveBayesClassifier(train)
        assert_equal(classifier.accuracy(test_set),
                        self.classifier.accuracy(test_set))

    def test_prob_classify(self):
        res = self.classifier.prob_classify("I feel happy this morning")
        assert_equal(res.max(), "positive")
        assert_true(res.prob("positive") > res.prob("negative"))

    def test_accuracy(self):
        acc = self.classifier.accuracy(test_set)
        assert_true(isinstance(acc, float))

    def test_update(self):
        res1 = self.classifier.prob_classify("lorem ipsum")
        original_length = len(self.classifier.train_set)
        self.classifier.update([("lorem ipsum", "positive")])
        new_length = len(self.classifier.train_set)
        res2 = self.classifier.prob_classify("lorem ipsum")
        assert_true(res2.prob("positive") > res1.prob("positive"))
        assert_equal(original_length + 1, new_length)

    def test_labels(self):
        labels = self.classifier.labels()
        assert_true("positive" in labels)
        assert_true("negative" in labels)

    def test_show_informative_features(self):
        feats = self.classifier.show_informative_features()

    def test_informative_features(self):
        feats = self.classifier.informative_features(3)
        assert_true(isinstance(feats, list))
        assert_true(isinstance(feats[0], tuple))

    def test_custom_feature_extractor(self):
        cl = NaiveBayesClassifier(train_set, custom_extractor)
        cl.classify("Yay! I'm so happy it works.")
        assert_equal(cl.train_features[0][1], 'positive')

    def test_init_with_csv_file(self):
        with open(CSV_FILE) as fp:
            cl = NaiveBayesClassifier(fp, format="csv")
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_csv_file_without_format_specifier(self):
        with open(CSV_FILE) as fp:
            cl = NaiveBayesClassifier(fp)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_json_file(self):
        with open(JSON_FILE) as fp:
            cl = NaiveBayesClassifier(fp, format="json")
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_json_file_without_format_specifier(self):
        with open(JSON_FILE) as fp:
            cl = NaiveBayesClassifier(fp)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_custom_format(self):
        redis_train = [('I like turtles', 'pos'), ('I hate turtles', 'neg')]

        class MockRedisFormat(formats.BaseFormat):
            def __init__(self, client, port):
                self.client = client
                self.port = port

            @classmethod
            def detect(cls, stream):
                return True

            def to_iterable(self):
                return redis_train

        formats.register('redis', MockRedisFormat)
        mock_redis = mock.Mock()
        cl = NaiveBayesClassifier(mock_redis, format='redis', port=1234)
        assert_equal(cl.train_set, redis_train)

    def test_data_with_no_available_format(self):
        mock_fp = mock.Mock()
        mock_fp.read.return_value = ''

        assert_raises(FormatError, lambda: NaiveBayesClassifier(mock_fp))

    def test_accuracy_on_a_csv_file(self):
        with open(CSV_FILE) as fp:
            a = self.classifier.accuracy(fp)
        assert_equal(type(a), float)

    def test_accuracy_on_json_file(self):
        with open(JSON_FILE) as fp:
            a = self.classifier.accuracy(fp)
        assert_equal(type(a), float)

    def test_init_with_tsv_file(self):
        with open(TSV_FILE) as fp:
            cl = NaiveBayesClassifier(fp)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_bad_format_specifier(self):
        assert_raises(ValueError,
            lambda: NaiveBayesClassifier(CSV_FILE, format='unknown'))

    def test_repr(self):
        assert_equal(repr(self.classifier),
            "<NaiveBayesClassifier trained on {0} instances>".format(len(train_set)))
Esempio n. 52
0
start = time.process_time()

with open(r'./temp/train_kmer.csv', mode='r',
          encoding='utf-8') as train_kmer:  #load train_kmer.csv as train set
    clsf = NaiveBayesClassifier(
        train_kmer)  #train naive Bayes classifier with train set

print('Training finished! Time consumption:')
mid = time.process_time()
print(str(mid - start))

with open(r'./temp/test_kmer.csv', mode='r',
          encoding='utf-8') as test_kmer:  #load test_kmer.csv as test set
    print('Classification accuracy:')
    print(clsf.accuracy(test_kmer))  #print accuracy performed on test set


def getkmer(s: str, k: int):
    result = ''
    for i in range(len(s) - k + 1):  # a string of length n has n-k+1 k-mers
        result = result + s[i:i + k] + ' '
    return result
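
# Illustrative check of getkmer: a string of length n yields n-k+1 overlapping
# windows, e.g. getkmer("ATCGGA", 3) returns "ATC TCG CGG GGA " (trailing space
# included, since a space is appended after every k-mer).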


classifile = open(r'./input/tokens.txt', mode='r', encoding='utf-8')
pinyin = open(r'./output/pinyin.txt', mode='w', encoding='utf-8')
words = open(r'./output/words.txt', mode='w', encoding='utf-8')

for line in classifile.readlines(
):  #classify words and pinyin according to predicted label
Esempio n. 53
0
    with open(name) as f:
        text = f.read()
        text = text.replace("\n", " ")
        text = unicode(text, "utf-8", errors="ignore")
        data.append((text, "pro"))
        i += 1


files = glob.glob(NonPropath)
for name in files:
    with open(name) as f:
        text = f.read()
        text = text.replace("\n", " ")
        text = unicode(text, "utf-8", errors="ignore")
        data.append((text, "non-pro"))


random.shuffle(data)
number_of_elements = len(data)
split = (number_of_elements // 3) * 2  # floor division keeps the slice index an int under Python 3 as well
train = data[:split]
test = data[split:]

# print 'content of line 5 ' , train[4]

cl = NaiveBayesClassifier(train)
cl.accuracy(test)
cl.classify(
    "Your symptoms may be caused due to a musculo-skeletal strain. I would advise you to take OTC pain-killers/NSAIDS and see if it helps. Rest and ice will also help to relieve the symptoms. If the pain does not get better, you may need to visit your doctor for a physical examination. X-rays will usually be normal in most cases."
)
Esempio n. 54
0
#!/usr/bin/env python  
# -*- coding: utf-8 -*-  
# @Time    : 2018/7/3 4:00 PM
# @Author  : Kaiyu  
# @Site    :   
# @File    : test.py

from textblob.classifiers import NaiveBayesClassifier as NBC
from textblob import TextBlob
import json

if __name__ == '__main__':
    with open('okoo-merged-labels.json', encoding='utf-8') as f:
        data = json.load(f)['all']
        data = [(item['text'], str(item['merged_label'])) for item in data]
        train_data = data[:-1000]
        test_data = data[-1000:]  # hold out the last 1000 items ([-1000:-1] would silently drop the final one)
        model = NBC(train_data)
        for test_item in test_data:
            label_ = model.classify(test_item[0])
            print('True: {} predict: {}'.format(str(test_item[1]), label_))
        print(model.accuracy(test_data))
Esempio n. 55
0
from textblob.classifiers import NaiveBayesClassifier

train = [
    ('I love this sandwich.', 'pos'),
    ('This is an amazing place!', 'pos'),
    ('I feel very good about these beers.', 'pos'),
    ('This is my best work.', 'pos'),
    ("What an awesome view", 'pos'),
    ('I do not like this restaurant', 'neg'),
    ('I am tired of this stuff.', 'neg'),
    ("I can't deal with this", 'neg'),
    ('He is my sworn enemy!', 'neg'),
    ('My boss is horrible.', 'neg'),
    ("I like big butts and I cannot lie","butts")
]
test = [
    ('The beer was good.', 'pos'),
    ('I do not enjoy my job', 'neg'),
    ("I ain't feeling dandy today.", 'neg'),
    ("I feel amazing!", 'pos'),
    ('Gary is a friend of mine.', 'pos'),
    ("I can't believe I'm doing this.", 'neg')
]

cl = NaiveBayesClassifier(train)
print cl.accuracy(test)
print cl.classify("Their burgers are amazing")  # "pos"
print cl.classify("I don't hate you.")  # "neg"
with open(filename, 'rb') as csvfile:
    reader = csv.reader(csvfile, delimiter=';', quotechar='|')

    print "> File opened successfully!"

    counter = 0
    for row in reader:
        selectTweets(row)
        counter += 1

    print "> Wait a sec for the results..."

    cl = NaiveBayesClassifier(trainTweets)

    print("Accuracy of the classifier: {0}".format(cl.accuracy(testTweets)))
    cl.show_informative_features(10)

    while True:
        tweetfile = str(raw_input("Please enter the file name of the data file(.json) "))
        with open(tweetfile) as tf:
            nop = 0
            non = 0
            for line in tf:
                tweetwords = []
                text = ""
                if line:
                    tweet = json.loads(line)
                    result = get_tweet_sentiment(tweet)
Esempio n. 57
0
    ("I can't deal with this", 'neg'),
    ('He is my sworn enemy!', 'neg'),
    ('My boss is horrible.', 'neg')
]
test = [
    ('The beer was good.', 'pos'),
    ('I do not enjoy my job', 'neg'),
    ("I ain't feeling dandy today.", 'neg'),
    ("I feel amazing!", 'pos'),
    ('Gary is a friend of mine.', 'pos'),
    ("I can't believe I'm doing this.", 'neg')
]

cl = NaiveBayesClassifier(train)

# Grab some movie review data
reviews = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]
random.shuffle(reviews)
new_train, new_test = reviews[0:100], reviews[100:200]  # contiguous split; [101:200] would skip review 100

# Update the classifier with the new training data
cl.update(new_train)

# Compute accuracy
accuracy = cl.accuracy(test + new_test)
print("Accuracy: {0}".format(accuracy))

# Show 5 most informative features
cl.show_informative_features(5)
Esempio n. 58
0
choice = input("Select one classifier number: ")

# for testing with different dataset sizes
# size = input("n: ")
# trains = []
# for i in range(int(size)):
#     trains.append(train[i])
# for i in range(250, int(size)+250):
#     trains.append(train[i])

trains = train

if choice == "1":
    print("\n" + "#NaiveBayesClassifier")
    cl1 = NaiveBayesClassifier(trains)
    print("Classifier: Naive Bayes -- Accuracy: ", cl1.accuracy(test), "\n")

elif choice == "2":
    print("\n" + "#DecisionTreeClassifier")
    cl2 = DecisionTreeClassifier(trains)
    print("Classifier: Decision Tree -- Accuracy: ", cl2.accuracy(test), "\n")

elif choice == "3":
    print("\n" + "#MaxEntClassifier")
    cl3 = MaxEntClassifier(trains)
    print("Classifier: Maximum Entropy -- Accuracy: ", cl3.accuracy(test),
          "\n")

elif choice == "4":
    print("\n" + "#NLTKClassifier")
    cl4 = NLTKClassifier(trains)
			for word in words:
				if word not in stopwords.words() and not word.isdigit():
					list_tuples.append((word.lower(),tabsep[0]))
			c+=1
			if c==500:
				break
	return list_tuples
print 'importing data...'
a = time.time()
entire_data = get_list_tuples("dataset.txt")
print "It took "+str(time.time()-a)+" seconds to import data"
print 'data imported'
random.seed(1)
random.shuffle(entire_data)
train = entire_data[:750]
test = entire_data[750:1500]
print 'training data'
a = time.time()
cl = NaiveBayesClassifier(train)
print "It took "+str(time.time()-a)+" seconds to train data"
print 'data trained, now checking accuracy:'
accuracy = cl.accuracy(test)
print "accuracy: "+str(accuracy)
cl.show_informative_features(5)

x = ""
while (x != "exit"):
	x = raw_input("enter a email to check if it is a spam email or not , type exit to exit \n")
	print cl.classify(x)
	
    ('it is so ugly', 'neg'),
    ('burning like hell', 'neg'),
    ("1 2 3 4 5 6 7 8 9 0", "error"),
    ('fine', 'pos'),
    ('i hate you', 'neg'),
    ('i love it', 'pos')
    ]

cl = NaiveBayesClassifier(train)
cl.classify("I feel amazing!")
cl.update(new_data)
cl.accuracy(new_data)


master = Tk()
e = Entry(master)
e.pack()

e.focus_set()

def callback():
    #print e.get() # This is the text you may want to use later
    blob = TextBlob(e.get(), classifier=cl)
    for s in blob.sentences:
        print(s)
        print(s.classify())