def train_data(ticker):
    """Train a news classifier for ``ticker`` and score it on the next year.

    The classifier is trained on the ``snippet`` / ``price change`` columns of
    ``../tmp/training_data/<ticker>2015-2016_data1.csv`` and evaluated on the
    same columns of ``<ticker>2016-2017_data1.csv``.  The predictions are
    stored in a new ``News Sent.`` column and written to
    ``../tmp/results/News/<ticker>_results.csv``.

    Args:
        ticker: Prefix of the CSV file names to load.

    Returns:
        None. Progress and accuracy are printed to stdout.
    """
    df = pd.read_csv('../tmp/training_data/' + ticker + '2015-2016_data1.csv')
    train_df = df[['snippet', 'price change']]
    print("Training News Dataset")
    print(train_df.head(5))
    # ``DataFrame.as_matrix()`` was removed from pandas; ``.values`` yields
    # the same array of (snippet, label) rows.
    cl = NaiveBayesClassifier(train_df.values)

    df = pd.read_csv('../tmp/training_data/' + ticker + '2016-2017_data1.csv')
    # Work on a copy so the new column below is not written into a slice.
    dataset = df[['snippet', 'price change']].copy()

    print("\nClassifying dataset\n")
    classified = [cl.classify(snippet) for snippet in dataset['snippet']]
    # Compare by position instead of relying on the index labels being 0..n-1.
    right = sum(
        1
        for actual, predicted in zip(dataset['price change'], classified)
        if actual == predicted
    )

    dataset['News Sent.'] = classified
    path = '../tmp/results/News/' + ticker + '_results.csv'
    dataset.to_csv(path, encoding='utf-8', index=False)
    print(dataset[['snippet', 'price change', 'News Sent.']].head(n=20))

    total = len(dataset['snippet'])
    print("\nCalculating ")
    print("\nRight %d, Total %d" % (right, total))
    # An empty evaluation file must not crash the report.
    percentage = (1.0 * right / total) * 100 if total else 0.0
    print("Correct percentage %.2f %%" % percentage)
    # show_informative_features() prints its own table; wrapping it in a
    # print would add a stray "None" line.
    cl.show_informative_features(10)
def generateClassifier():
    """Train the intent classifier and store it as a pickle.

    The training data comes from ``getIntentDataset()``; the five most
    informative features are displayed and the model is handed to
    ``saveTrainedClassifier`` under the name ``intent_classifier_2.pickle``.
    """
    classifier = NaiveBayesClassifier(getIntentDataset())
    classifier.show_informative_features(5)

    # Directory the trained models are saved into.
    model_dir = "/media/University/UniversityDisc/2-Master/MasterThesis/EjecucionTesis/Desarrollo/PythonProjects/QueryAnalyzer/Models/"
    saveTrainedClassifier(model_dir, classifier, "intent_classifier_2.pickle")
def generateIntentionalityClassifier():
    """Train an intentionality classifier on a small random sample.

    All documents of the ``training`` collection are loaded and shuffled, and
    only the first 1% is kept to reduce the number of records.  Each kept
    document contributes one sample: its tri-grams are joined with ``-`` and
    concatenated (each preceded by a space), labelled with the document's
    ``label``.

    Returns:
        The trained ``NaiveBayesClassifier`` (also saved as
        ``my_classifier_v6.pickle``).
    """
    records = list(dbClient().training.find())
    random.shuffle(records)

    # Keep only 1% of the shuffled records; this sample is what gets trained on.
    sample_size = int(len(records) * .01)
    sample = records[:sample_size]
    print("Test %s" % len(sample))

    data = []
    for record in sample:
        trigrams = record["triGram"]
        label = record["label"]
        # Every tri-gram becomes " w1-w2-w3", so the text starts with a space.
        text = "".join(" " + '-'.join(gram) for gram in trigrams)
        data.append((text, label))

    cl = NaiveBayesClassifier(data)
    cl.show_informative_features(30)

    model_dir = "/media/University/UniversityDisc/2-Master/MasterThesis/EjecucionTesis/Desarrollo/PythonProjects/QueryAnalyzer/Models/"
    saveTrainedClassifier(model_dir, cl, "my_classifier_v6.pickle")
    return cl
def create_sentiment():
    """Train the sentiment model on movie reviews and pickle it.

    Takes no arguments and returns nothing.  The trained classifier is
    written to ``sentiment_clf_full.pkl`` in the current directory.
    """
    random.seed(1)

    # Hand-labelled comments that are evaluated together with the held-out
    # movie reviews.
    hand_labelled = [
        ("The dude presenting Unravel seems like one of the most genuine game developers Ive ever seen I really hope this game works out for him", 'pos'),
        ("His hands are shaking Dude looks so stoked and scared at the same time", 'pos'),
        ("Right I just felt like I was watching his dream come true It was nice The game looks very well done as well Good for him", 'pos'),
        ("Seriously Unravel looks really good actually and honestly seeing him so happy about what hes made is contagious I want to see more of Unravel ", 'pos'),
        ("He was so nervous shaking all over his voice quivering", 'neg'),
        ("The game looked nice too very cute art style ", 'pos'),
        ("You could tell he genuinely wanted to be there it looked like he was even shaking from the excitement I hope it works out for them aswell", 'pos'),
        ("However following that up with the weird PvZ thing was odd To say the least", 'neg'),
        ("Haha The game did look nice though Im definitely going to keep an eye on it I enjoy supporting such hopeful developers", 'pos'),
        ("Very personable This looks like a buy for me As a dev in a other sector I appreciate this passion", 'pos'),
        ("I want to give him a cookie", 'pos'),
        ("Im getting a copy Im gonna support my indie devs", 'pos'),
        ("The twitch leak was accurate It was like a play by play you start speaking French then switch to English", 'neg'),
        ("yep exactly what i was thinking lol its important to note that the twitch leak never had them saying it was Dishonored 2 but that they were honored to be here very different", 'neg'),
        ("Honored Im 100 sure that was intentional", 'neg'),
        ("oh yea for sure but wasnt solid enough evidence imo to be like dishonored 2 confirmed just based off that", 'neg'),
        ("The confirmation was who was talking not what they were talking about ", 'neg'),
        ("How awkward is it for a pop singer to perform at a video game conference", 'neg'),
        ("Oh god did they warn him that he will get zero reaction", 'neg'),
        ("I really hope so", 'pos'),
        ("Almost as bad as Aisha f*****g up her dialogue constantly Shes doing alright though E3 is really becoming a mainstream media event Hollywood has nothing like this ComicCon is the only comparison and they dont dazzle it up like E3", 'neg'),
    ]

    # Collect every movie review as a (word list, category) pair.
    reviews = []
    for category in movie_reviews.categories():
        for fileid in movie_reviews.fileids(category):
            reviews.append((list(movie_reviews.words(fileid)), category))
    random.shuffle(reviews)

    # The first 1900 shuffled reviews train the model, the rest is held out.
    training_reviews = reviews[:1900]
    held_out_reviews = reviews[1900:]

    classifier = NaiveBayesClassifier(training_reviews)

    # Accuracy over the hand-labelled comments plus the held-out reviews.
    accuracy = classifier.accuracy(hand_labelled + held_out_reviews)
    print("Accuracy: {0}".format(accuracy))

    # Show 5 most informative features
    classifier.show_informative_features(5)

    # Save model for use in creating social model sentiment
    with open('sentiment_clf_full.pkl', 'wb') as model_file:
        pickle.dump(classifier, model_file)
    print('done saving model')
class NaiveBayesAnalyzer:
    """Text analyzer backed by a classifier trained from ``training_data.json``."""

    # Trained classifier; replaced per instance in __init__.
    cl = None

    def __init__(self):
        """Train the classifier from the JSON file and show its top 20 features."""
        with open("training_data.json", "r") as training_file:
            self.cl = NaiveBayesClassifier(training_file, format="json")
            self.cl.show_informative_features(20)

    def analyze(self, text):
        """Return the label the classifier assigns to ``text``."""
        label = self.cl.classify(text)
        return label
def train(self):
    """Build and return a classifier from the intents stored in ``self.data``.

    Every pattern of every intent becomes one ``(pattern, label)`` sample.
    The samples, the accuracy on those same samples and the five most
    informative features are printed before the classifier is returned.
    """
    samples = [
        (pattern, intent["label"])
        for intent in self.data["intents"]
        for pattern in intent["patterns"]
    ]
    pprint.pprint(samples)

    classifier = NaiveBayesClassifier(samples)
    # Measured on the training samples themselves, not on held-out data.
    print('Accuracity: ', classifier.accuracy(samples))
    classifier.show_informative_features(5)
    return classifier
def enginemongo(text):
    """Classify ``text`` with a classifier trained from the ``trainingset`` collection.

    Each document's ``question`` is used as the sample text and its ``answer``
    as the label.  Diagnostic output is printed along the way.

    Args:
        text: The text to classify.

    Returns:
        dict: ``{"answer_key": <best label>, "answer_prob": <probability>}``
        where the probability is rounded to two decimals.  The label is ``""``
        and the probability ``0`` when no label has a rounded probability
        above zero.
    """
    from textblob.classifiers import NaiveBayesClassifier

    tsarr = [(t["question"], t["answer"]) for t in db.trainingset.find()]
    print(tsarr)

    cl = NaiveBayesClassifier(tsarr)
    prob_dist = cl.prob_classify(text)
    print("TEST:", text, " ", prob_dist, " ", prob_dist.max())

    maxprob = 0
    maxanswer = ""
    for a in prob_dist.samples():
        # Round once and reuse; the name avoids shadowing the usual ``pd`` alias.
        prob = round(prob_dist.prob(a), 2)
        if prob > maxprob:
            maxprob = prob
            maxanswer = a
        print(a, ":", prob)

    # show_informative_features() prints its own table; printing its return
    # value would only add a stray "None" line.
    cl.show_informative_features()
    print("RISPOSTA:", maxanswer, " --- ", maxprob)
    print(cl.extract_features(text))
    print("---------------------------------------")
    return {"answer_key": maxanswer, "answer_prob": maxprob}
class LanguageDetector(object):
    """Naive Bayes language detector configurable with a feature extractor."""

    def __init__(self, train=SAMPLE_TRAIN,
                 feature_extractor=FeatureExtractors.last_word_extractor()):
        """Train the classifier on ``train`` using ``feature_extractor``."""
        self.train = train
        classifier = NaiveBayesClassifier(self.train, feature_extractor)
        self.classifier = classifier

    def accuracy(self, test_set=SAMPLE_TEST):
        """Return the classifier's accuracy on ``test_set``."""
        score = self.classifier.accuracy(test_set)
        return score

    def show_features(self):
        """Show the five most informative features of the classifier."""
        shown = self.classifier.show_informative_features(5)
        return shown
def train_NBC(filepath):
    """Train a Naive Bayes classifier from ``filepath`` and pickle it.

    The data returned by ``read_data(filepath)`` is converted to a list of
    rows, split 90/10 into training and test rows, and the classifier trained
    on the larger part is written to ``./models/cl_NBC.obj``.

    Args:
        filepath: Location of the data, passed on to ``read_data``.
    """
    new_df = read_data(filepath)
    new_train_test = new_df.values.tolist()
    x_train, x_test = train_test_split(new_train_test, test_size=0.1)

    cl = NaiveBayesClassifier(x_train)
    print("Acheived a test accuracy of : %s " % cl.accuracy(x_test))
    # details of classifier train
    cl.show_informative_features()

    if not os.path.isdir("./models"):
        os.mkdir("./models")

    # Save the trained model; ``with`` closes the file even if pickling fails
    # and avoids shadowing the ``file`` builtin.
    with open("./models/cl_NBC.obj", "wb") as model_file:
        pickle.dump(cl, model_file)
def main():
    """Train on ``SRC_TRAIN``, classify every article in ``SRC_TEST`` and report.

    Each prediction is passed to ``measure`` together with the true label.
    Accuracy and F1 are then computed from the module-level
    ``nTruePositive`` / ``nTrueNegative`` / ``nFalsePositive`` /
    ``nFalseNegative`` counters and printed.
    """
    print("Running!")

    # train textblob NaiveBayesClassifier
    with open(SRC_TRAIN, encoding='utf-8', mode='r') as train:
        cl = NaiveBayesClassifier(train, format="json")
    cl.show_informative_features(10)

    # classify each article in the test data
    with open(SRC_TEST, encoding='utf-8') as test:
        articles = json.load(test)
    print("to classify: " + str(len(articles)))

    for count, article in enumerate(articles, 1):
        givenLabel = cl.classify(article['text'])
        trueLabel = article['label']
        correct = measure(givenLabel, trueLabel)
        print(
            str(count) + " Classified:" + givenLabel + " Label:" + trueLabel +
            (" correct" if correct else " wrong"))

    # Guard both ratios: with no counted article (or no positives at all) the
    # denominators are zero and the report would crash instead of printing.
    total = nTruePositive + nTrueNegative + nFalsePositive + nFalseNegative
    accuracy = (nTruePositive + nTrueNegative) / total if total else 0.0
    f_denominator = (2 * nTruePositive) + nFalsePositive + nFalseNegative
    fMeasure = (2 * nTruePositive) / f_denominator if f_denominator else 0.0

    print("accuracy: {0}".format(accuracy))
    print("F1-Score: {0}".format(fMeasure))
    print("TP: {0} FP: {1} TN: {2} FN: {3}".format(nTruePositive,
                                                   nFalsePositive,
                                                   nTrueNegative,
                                                   nFalseNegative))
class XcLassify(object):
    """Text classifier trained from the first two columns of an Excel sheet."""

    def __init__(self):
        """Start with no classifier and no data loaded."""
        self.__cl = None
        self.__traindata = None
        self.__testdata = None

    def _fetch_clean(self, filepath):
        """Read ``filepath`` and return its first two columns as a list of tuples.

        The first column is passed through ``clean_str`` and then ``anycode``.
        """
        dframe = pandas.read_excel(filepath)
        dframe.iloc[:, 0] = dframe.iloc[:, 0].map(clean_str)
        dframe.iloc[:, 0] = dframe.iloc[:, 0].map(anycode)
        return dframe.iloc[:, 0:2].to_records(index=False).tolist()

    def _split_data(self, datalist, test_ratio):
        """Split ``datalist``, remember both parts and return ``(train, test)``."""
        self.__traindata, self.__testdata = train_test_split(datalist, test_size=test_ratio)
        return self.__traindata, self.__testdata

    def data_from_excel(self, filepath, test_ratio=0.24):
        """Load, clean and split the data found in ``filepath``.

        Args:
            filepath: Excel file to read.
            test_ratio: Share of the rows reserved for testing.

        Returns:
            The ``(train, test)`` pair produced by ``_split_data``.
        """
        datalist = self._fetch_clean(filepath)
        # Honour the caller's ratio; it used to be silently replaced by 0.2.
        return self._split_data(datalist, test_ratio)

    def train(self, update=False, new_data=None):
        """Train a new classifier, or update the current one with ``new_data``.

        The existing classifier is updated only when ``update`` is true and
        ``new_data`` is non-empty; otherwise a fresh classifier is trained on
        the stored training data.
        """
        if update and new_data:
            self.__cl.update(new_data)
        else:
            self.__cl = NaiveBayesClassifier(self.__traindata)

    def classify(self, text):
        """Clean ``text`` the same way as the training data and return its label."""
        text = clean_str(text, post_func=anycode)
        return self.__cl.classify(text)

    def benchmark(self, show_best_features=False):
        """Print the accuracy on the stored test data.

        Args:
            show_best_features: Also show the most informative features.
        """
        print('\nAccuracy: %0.3f\n' % self.__cl.accuracy(self.__testdata))
        if show_best_features:
            self.__cl.show_informative_features()
def main():
    """Train the tweet sentiment classifier and stream "bitcoin" tweets forever.

    Training and test samples are collected with ``getData`` from the text
    files under ``data/``.  The trained classifier is published as the
    module-level ``cl``.  The loop then repeatedly shows the classifier's
    statistics, opens a stream, waits and disconnects it again.
    """
    global cl

    auth = OAuthHandler(consumerKey, consumerSecret)
    auth.set_access_token(accessToken, accessSecret)

    train = []
    test = []
    getData('data/pos.txt', 'pos', train)  # get data from txt file
    getData('data/neg.txt', 'neg', train)
    getData('data/neut.txt', 'neut', train)
    getData('data/test/testNeut.txt', 'neut', test)
    getData('data/test/testPos.txt', 'pos', test)
    getData('data/test/testNeg.txt', 'neg', test)

    cl = NaiveBayesClassifier(train)

    while True:  # get tweets from twitter
        cl.show_informative_features(5)
        print(cl.accuracy(test))
        twitterStream = Stream(auth, listener())
        # ``async`` became a reserved word in Python 3.7, which makes the old
        # ``async=True`` spelling a syntax error; tweepy renamed the keyword
        # to ``is_async``.
        # NOTE(review): requires a tweepy release that has ``is_async`` - confirm.
        twitterStream.filter(track=["bitcoin"], is_async=True, stall_warnings=True)
        time.sleep(6000)  # check
        twitterStream.disconnect()
def main(argv=0):
    """Train on reviews of the first ten businesses and write predictions.

    For every ``(sentence, rating)`` in ``nBObj.testSentences`` one line
    ``<rating>\\t<classifier output>`` is written to ``naiveBayesResult.txt``;
    afterwards ``nBObj.calcAccuracy()`` is called.

    Args:
        argv: Unused.
    """
    nBObj = naiveBayes()
    businessId = nBObj.deriveBusinessId('yelp_academic_dataset_business.json')
    print(len(businessId))
    businessId = businessId[:10]

    train = nBObj.getTrainData('yelp_academic_dataset_review.json', businessId)
    print(train)

    cl = NaiveBayesClassifier(train)
    # show_informative_features() prints its own table; printing its return
    # value would only add a stray "None" line.
    cl.show_informative_features(20)

    print("Opening the file...")
    # ``with`` closes the result file even if a sentence fails to classify.
    with open("naiveBayesResult.txt", 'w') as target:
        for (sentence, rating) in nBObj.testSentences:
            clOutput = nBObj.testSentence(sentence, cl)
            strToWrite = str(rating) + "\t" + clOutput
            target.write(strToWrite)
            target.write("\n")

    nBObj.calcAccuracy()
def main(argv=0):
    """Train on reviews of the first ten businesses and write predictions.

    For every ``(sentence, rating)`` in ``nBObj.testSentences`` one line
    ``<rating>\\t<classifier output>`` is written to ``naiveBayesResult.txt``;
    afterwards ``nBObj.calcAccuracy()`` is called.

    Args:
        argv: Unused.
    """
    nBObj = naiveBayes()
    businessId = nBObj.deriveBusinessId('yelp_academic_dataset_business.json')
    print(len(businessId))
    businessId = businessId[:10]

    train = nBObj.getTrainData('yelp_academic_dataset_review.json', businessId)
    print(train)

    cl = NaiveBayesClassifier(train)
    # show_informative_features() prints its own table; printing its return
    # value would only add a stray "None" line.
    cl.show_informative_features(20)

    print("Opening the file...")
    # ``with`` closes the result file even if a sentence fails to classify.
    with open("naiveBayesResult.txt", 'w') as target:
        for (sentence, rating) in nBObj.testSentences:
            clOutput = nBObj.testSentence(sentence, cl)
            strToWrite = str(rating) + "\t" + clOutput
            target.write(strToWrite)
            target.write("\n")

    nBObj.calcAccuracy()
def create_sentiment_model():
    """Train a movie-review sentiment classifier and save it with dill.

    The reviews are shuffled with a fixed seed; the first 1900 train the
    model and the remainder measures its accuracy.  The model is written to
    ``sentiment_clf_full.pkl``.
    """
    random.seed(1)

    # Grab some movie review data
    reviews = [(list(movie_reviews.words(fileid)), category)
               for category in movie_reviews.categories()
               for fileid in movie_reviews.fileids(category)]
    random.shuffle(reviews)
    new_train, new_test = reviews[:1900], reviews[1900:]

    cl = NaiveBayesClassifier(new_train)

    # Compute accuracy
    accuracy = cl.accuracy(new_test)
    print("Accuracy: {0}".format(accuracy))

    # Show 5 most informative features.  The method prints the table itself;
    # printing its return value would only add a stray "None" line.
    cl.show_informative_features(5)

    with open('sentiment_clf_full.pkl', 'wb') as pk:
        dill.dump(cl, pk)
    print('done saving model')
def create_sentiment_model():
    """Train a movie-review sentiment classifier and save it with dill.

    The reviews are shuffled with a fixed seed; the first 1900 train the
    model and the remainder measures its accuracy.  The model is written to
    ``sentiment_clf_full.pkl``.
    """
    random.seed(1)

    # Grab some movie review data
    reviews = [(list(movie_reviews.words(fileid)), category)
               for category in movie_reviews.categories()
               for fileid in movie_reviews.fileids(category)]
    random.shuffle(reviews)
    new_train, new_test = reviews[:1900], reviews[1900:]

    cl = NaiveBayesClassifier(new_train)

    # Compute accuracy
    accuracy = cl.accuracy(new_test)
    print("Accuracy: {0}".format(accuracy))

    # Show 5 most informative features.  The method prints the table itself;
    # printing its return value would only add a stray "None" line.
    cl.show_informative_features(5)

    with open('sentiment_clf_full.pkl', 'wb') as pk:
        dill.dump(cl, pk)
    print('done saving model')
def train():
    """Train the threat classifier and pickle it.

    The model is written to ``threat_classifierOntology.pkl``.  Training is
    best effort: any error is recorded (date, exception type and message) in
    ``traingErrors.txt`` instead of being raised.
    """
    try:
        print("Training has begun")
        trainingData = createTrainingData()
        classifier = NaiveBayesClassifier(trainingData)
        # show_informative_features() prints its own table; printing its
        # return value would only add a stray "None" line.
        classifier.show_informative_features()
        with open('threat_classifierOntology.pkl', 'wb') as output:
            pickle.dump(classifier, output, pickle.HIGHEST_PROTOCOL)
        print("Training has completed")
    except Exception as exc:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # interpreter shutdown are not swallowed and logged as errors.
        now = time.strftime("%x")
        error = type(exc).__name__ + ': ' + str(exc)
        with open("traingErrors.txt", "w") as error_log:
            error_log.write(now)
            error_log.write('\n')
            error_log.write(error)
def engine(text):
    """Classify ``text`` with a classifier trained from ``train.csv``.

    ``train.csv`` is downloaded from ``url_train`` the first time and reused
    afterwards.  Diagnostic output is printed along the way.

    Args:
        text: The text to classify.

    Returns:
        The label assigned to ``text``.

    Raises:
        IOError: If the training data has to be downloaded and the server
            does not answer with a success status.
    """
    from textblob.classifiers import NaiveBayesClassifier

    url_train = "https://"
    file_train = "train.csv"

    if not os.path.isfile(file_train):
        print("Train loaded from Request:", url_train)
        response = requests.get(url_train, stream=True)
        if not response.ok:
            # Fail before the cache file exists; otherwise the error page
            # would be stored and used as training data from then on.
            raise IOError("Could not download training data from %s (HTTP %s)"
                          % (url_train, response.status_code))
        try:
            with open(file_train, 'wb') as handle:
                for block in response.iter_content(1024):
                    handle.write(block)
        except Exception:
            # Do not leave a truncated download behind as the cache.
            if os.path.isfile(file_train):
                os.remove(file_train)
            raise
        print("Request DONE")
    else:
        print("Train loaded from cache:", file_train)

    with open(file_train, 'r', encoding="utf8") as fp:
        cl = NaiveBayesClassifier(fp)

    prob_dist = cl.prob_classify(text)
    print("TEST:", text, " ", prob_dist, " ", prob_dist.max())
    for a in prob_dist.samples():
        print(a, ":", round(prob_dist.prob(a), 2))

    # show_informative_features() prints its own table; printing its return
    # value would only add a stray "None" line.
    cl.show_informative_features()
    print(cl.extract_features(text))
    print("---------------------------------------")
    return cl.classify(text)
class TestNaiveBayesClassifier(unittest.TestCase):
    """Tests for ``NaiveBayesClassifier`` using the module-level fixtures."""

    def setUp(self):
        # Every test starts from a classifier trained on the shared train set.
        self.classifier = NaiveBayesClassifier(train_set)

    def _assert_trained_from_file(self, cl):
        """Check the label and the text type of a classifier built from a file."""
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        first_sentence = cl.train_set[0][0]
        assert_true(isinstance(first_sentence, unicode))

    def test_default_extractor(self):
        sentence = "I feel happy this morning."
        extracted = self.classifier.extract_features(sentence)
        assert_equal(extracted, basic_extractor(sentence, train_set))

    def test_classify(self):
        label = self.classifier.classify("I feel happy this morning")
        assert_equal(label, 'positive')
        assert_equal(len(self.classifier.train_set), len(train_set))

    def test_classify_a_list_of_words(self):
        words = ["I", "feel", "happy", "this", "morning"]
        assert_equal(self.classifier.classify(words), "positive")

    def test_train_from_lists_of_words(self):
        # A classifier may be trained on word lists instead of strings.
        tokenized = [(doc.split(), label) for doc, label in train_set]
        from_words = NaiveBayesClassifier(tokenized)
        assert_equal(from_words.accuracy(test_set),
                     self.classifier.accuracy(test_set))

    def test_prob_classify(self):
        dist = self.classifier.prob_classify("I feel happy this morning")
        assert_equal(dist.max(), "positive")
        assert_true(dist.prob("positive") > dist.prob("negative"))

    def test_accuracy(self):
        assert_true(isinstance(self.classifier.accuracy(test_set), float))

    def test_update(self):
        before = self.classifier.prob_classify("lorem ipsum")
        size_before = len(self.classifier.train_set)
        self.classifier.update([("lorem ipsum", "positive")])
        size_after = len(self.classifier.train_set)
        after = self.classifier.prob_classify("lorem ipsum")
        assert_true(after.prob("positive") > before.prob("positive"))
        assert_equal(size_before + 1, size_after)

    def test_labels(self):
        labels = self.classifier.labels()
        for expected in ("positive", "negative"):
            assert_true(expected in labels)

    def test_show_informative_features(self):
        # Only checks that the call completes.
        self.classifier.show_informative_features()

    def test_informative_features(self):
        top = self.classifier.informative_features(3)
        assert_true(isinstance(top, list))
        assert_true(isinstance(top[0], tuple))

    def test_custom_feature_extractor(self):
        custom = NaiveBayesClassifier(train_set, custom_extractor)
        custom.classify("Yay! I'm so happy it works.")
        assert_equal(custom.train_features[0][1], 'positive')

    def test_init_with_csv_file(self):
        self._assert_trained_from_file(
            NaiveBayesClassifier(CSV_FILE, format="csv"))

    def test_init_with_csv_file_without_format_specifier(self):
        self._assert_trained_from_file(NaiveBayesClassifier(CSV_FILE))

    def test_init_with_json_file(self):
        self._assert_trained_from_file(
            NaiveBayesClassifier(JSON_FILE, format="json"))

    def test_init_with_json_file_without_format_specifier(self):
        self._assert_trained_from_file(NaiveBayesClassifier(JSON_FILE))

    def test_accuracy_on_a_csv_file(self):
        assert_true(isinstance(self.classifier.accuracy(CSV_FILE), float))

    def test_accuracy_on_json_file(self):
        assert_true(isinstance(self.classifier.accuracy(JSON_FILE), float))

    def test_init_with_tsv_file(self):
        self._assert_trained_from_file(NaiveBayesClassifier(TSV_FILE))

    def test_init_with_bad_format_specifier(self):
        assert_raises(ValueError, NaiveBayesClassifier, CSV_FILE,
                      format='unknown')

    def test_repr(self):
        expected = "<NaiveBayesClassifier trained on {0} instances>".format(
            len(train_set))
        assert_equal(repr(self.classifier), expected)
import pickle
from textblob.classifiers import NaiveBayesClassifier
from nltk.corpus import twitter_samples

# Train a tweet sentiment classifier on the NLTK twitter samples, report its
# accuracy and save it to "naivebayes.pickle".
pos_tweets = twitter_samples.strings('positive_tweets.json')
neg_tweets = twitter_samples.strings('negative_tweets.json')

# positive tweets words list
pos_tweets_set = []
for tweet in pos_tweets:
    pos_tweets_set.append((tweet, 'pos'))

# negative tweets words list
neg_tweets_set = []
for tweet in neg_tweets:
    neg_tweets_set.append((tweet, 'neg'))

# The first 1000 tweets of each class are the test set and the next 1000 the
# training set.  The negative test slice used to end at 10000, which pulled
# the negative training tweets into the test set.
test_set = pos_tweets_set[:1000] + neg_tweets_set[:1000]
train_set = pos_tweets_set[1000:2000] + neg_tweets_set[1000:2000]

__NaiveBayesClassifier = NaiveBayesClassifier(train_set)
print("Accuracy: {}".format(__NaiveBayesClassifier.accuracy(test_set)))
# show_informative_features() prints its own table; printing its return value
# would only add a stray "None" line.
__NaiveBayesClassifier.show_informative_features(10)

# save model for later use; ``with`` makes sure the file is flushed and closed
with open("naivebayes.pickle", "wb") as model_file:
    pickle.dump(__NaiveBayesClassifier, model_file)
# Try the classifier on a few Spanish sentences about subway service.
#tx_cl = "I feel amazing!"
#tx_prob = "This one's a doozy."
tx_cl = "El subte esta demorado"
tx_prob = "El subte funciona bien"

cl = NaiveBayesClassifier(train)
print(cl.classify(tx_cl))
print(cl.classify("El subte funciona bien"))

prob_dist = cl.prob_classify(tx_prob)
print(prob_dist.max())
print(round(prob_dist.prob("pos"), 2))
print(round(prob_dist.prob("neg"), 2))

print(cl.accuracy(data_sets.en_test))
# show_informative_features() prints its own table; printing its return value
# would only add a stray "None" line.
cl.show_informative_features(5)

#Using TextBlob
blob = TextBlob("No funca por que hay obras para mejorar la cosa", classifier=cl)
print(blob.sentiment)
print(blob.classify())

blob = TextBlob("El subte funciona normal", classifier=cl)
print(blob.sentiment)
print(blob.classify())

blob = TextBlob("Se realizan obras en el subte A", classifier=cl)
print(blob.sentiment)
print(blob.classify())

blob = TextBlob("No funciona, anda averiguar por que. Quizas hay un accidente", classifier=cl)
# Read every "decider" file under un/data/ and keep its full text.
for decider in deciders:
    decider = 'un/data/' + decider
    with open(decider) as f_input:
        decider_files.append(f_input.read())

# Flatten the texts into one list holding every sentence.
new_decider = []
for speech in decider_files:
    sentences = sent_tokenize(speech)
    for sentence in sentences:
        new_decider.append(sentence)

# create classifier
# 90% of `speeches` trains the model, 10% is kept for the accuracy check.
train, test = train_test_split(speeches, test_size=0.1)
# Each row becomes a plain tuple.
# presumably (text, label) pairs - verify against how `speeches` is built
train_set = list(train.itertuples(index=False, name=None))
test_set = list(test.itertuples(index=False, name=None))
my_classifier = NaiveBayesClassifier(train_set)

# try out the deciders
# NOTE(review): the result of this first classify() call is discarded.
my_classifier.classify(new_decider[0])
prob_dist = my_classifier.prob_classify(new_decider[1])
print(prob_dist.max())
print(round(prob_dist.prob('SOV'), 2))
print(round(prob_dist.prob('OPN'), 2))

# test accuracy
accuracy = my_classifier.accuracy(test_set)
print(accuracy)
# NOTE(review): confirm show_informative_features() returns a value worth
# keeping; it appears to print its table itself.
informative = my_classifier.show_informative_features(5)
# Headlines to classify.  These are plain strings: the parentheses that used
# to wrap each entry did not make them tuples.
test_data = [
    "Fluggastdatenspeicherung: EU-Parlament votiert für PNR-Datenbank",
    "Chipmaschinen-Hersteller: ASML liefert sechs EUV-Belichtungsmaschinen an Intel aus",
    "Apple: iCloud löscht unter Umständen Daten unwiederbringlich",
    "Spionagesoftware: Hacking Team nutzt UEFI-Rootkit",
    "Mobilfunk: 5G soll für Nutzer wie ein unbegrenztes System sein",
    "Mobilfunknetzbetreiber: Kostenloses WLAN für Regionalzüge kommt",
    "Kickstarter: Kerze lädt Smartphone",
    "Hacking Team: Carabinieri kapern mal kurz das Internet",
    "Nach Hackerangriff: OPM-Chefin Katherine Archuleta tritt zurück",
    "Smartphone-Hersteller: Geeksphone hört auf",
    "Systemverschlüsselung: Yubikeys Zwei-Faktor-Authentifizierung unter Linux nutzen",
    "Kritik an Dieter Nuhr: Wir alle sind der Shitstorm",
    "Navigationsgerät: Autofahrer verursacht wegen Navi schweren Unfall",
    "Until Dawn angespielt: Das Horrorhaus der tödlichen Entscheidungen",
    "Satoru Iwata: Nintendo-Chef im Alter von 55 Jahren gestorben",
    "Call of Duty: Zombies à la Film noir",
]

nbc = NaiveBayesClassifier(train_data, lang='de_DE')
for data in test_data:
    print(nbc.classify(data))

# Accuracy is measured on the training data itself, not on held-out data.
print(nbc.accuracy(train_data))
# show_informative_features() prints its own table; printing its return value
# would only add a stray "None" line.
nbc.show_informative_features(5)
# preproccess training and testing datasets # without preprocessing, classification will typically take longer and have lower accuracy print("Preprocessing datasets...") print() sys.stdout.flush() traindata = preprocessData(traindata, minwordlen=4) testdata = preprocessData(testdata, minwordlen=4) # train the Naive Bayes Classifier print("Training Naive Bayes Classifier...") print() sys.stdout.flush() nbc = NaiveBayesClassifier(traindata) # show the most informative features used for classification nbc.show_informative_features(5) print() sys.stdout.flush() # test the Naive Bayes Classifier print("Testing Naive Bayes Classifier...") sys.stdout.flush() acc = nbc.accuracy(testdata) print("Accuracy:", round(acc, 4)) print() # print the confusion matrix print("Printing Confusion Matrix...") print() sys.stdout.flush()
# Compare a classifier trained on the manual data with one trained on
# features extracted from the NLTK movie review corpus.
print('Training models...')
neg_ids = nltk.corpus.movie_reviews.fileids('neg')
pos_ids = nltk.corpus.movie_reviews.fileids('pos')
neg_feats = [(NaiveBayesAnalyzer().feature_extractor(
    nltk.corpus.movie_reviews.words(fileids=[f])), 'neg') for f in neg_ids]
pos_feats = [(NaiveBayesAnalyzer().feature_extractor(
    nltk.corpus.movie_reviews.words(fileids=[f])), 'pos') for f in pos_ids]

# split into train and test
#train_manual = data[:69]
train_manual = neg_m_data[105:] + pos_m_data[105:]
train_mrc = neg_feats + pos_feats  #+ train_manual
#test_data = data[69:]
test_data = neg_m_data[:105] + pos_m_data[:105]

# create model
print('Testing models...')
cl = NaiveBayesClassifier(train_manual)
cl_2 = NaiveBayesClassifier(train_mrc)

# calculate score (both models are scored on the same manual test data)
score = round(cl.accuracy(test_data) * 100, 3)
score_2 = round(cl_2.accuracy(test_data) * 100, 3)

print('Classifier 1 (EmotionPix) is', str(score) + '% accurate.')
print('Informative Features:')
# show_informative_features() prints its own table; printing its return value
# would only add a stray "None" line.
cl.show_informative_features(10)
print('Classifier 2 (NaiveBayes w/ movie review corpus) is',
      str(score_2) + '% accurate.')
print('Informative Features:')
cl_2.show_informative_features(10)
class TestNaiveBayesClassifier(unittest.TestCase):
    """Tests for ``NaiveBayesClassifier`` using the module-level fixtures."""

    def setUp(self):
        # Every test starts from a classifier trained on the shared train set.
        self.classifier = NaiveBayesClassifier(train_set)

    def _assert_trained_from_file(self, cl):
        """Check the label and the text type of a classifier built from a file."""
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        first_sentence = cl.train_set[0][0]
        assert_true(isinstance(first_sentence, unicode))

    def test_default_extractor(self):
        sentence = "I feel happy this morning."
        extracted = self.classifier.extract_features(sentence)
        assert_equal(extracted, basic_extractor(sentence, train_set))

    def test_classify(self):
        label = self.classifier.classify("I feel happy this morning")
        assert_equal(label, 'positive')
        assert_equal(len(self.classifier.train_set), len(train_set))

    def test_classify_a_list_of_words(self):
        words = ["I", "feel", "happy", "this", "morning"]
        assert_equal(self.classifier.classify(words), "positive")

    def test_train_from_lists_of_words(self):
        # A classifier may be trained on word lists instead of strings.
        tokenized = [(doc.split(), label) for doc, label in train_set]
        from_words = NaiveBayesClassifier(tokenized)
        assert_equal(from_words.accuracy(test_set),
                     self.classifier.accuracy(test_set))

    def test_prob_classify(self):
        dist = self.classifier.prob_classify("I feel happy this morning")
        assert_equal(dist.max(), "positive")
        assert_true(dist.prob("positive") > dist.prob("negative"))

    def test_accuracy(self):
        assert_true(isinstance(self.classifier.accuracy(test_set), float))

    def test_update(self):
        before = self.classifier.prob_classify("lorem ipsum")
        size_before = len(self.classifier.train_set)
        self.classifier.update([("lorem ipsum", "positive")])
        size_after = len(self.classifier.train_set)
        after = self.classifier.prob_classify("lorem ipsum")
        assert_true(after.prob("positive") > before.prob("positive"))
        assert_equal(size_before + 1, size_after)

    def test_labels(self):
        labels = self.classifier.labels()
        for expected in ("positive", "negative"):
            assert_true(expected in labels)

    def test_show_informative_features(self):
        # Only checks that the call completes.
        self.classifier.show_informative_features()

    def test_informative_features(self):
        top = self.classifier.informative_features(3)
        assert_true(isinstance(top, list))
        assert_true(isinstance(top[0], tuple))

    def test_custom_feature_extractor(self):
        custom = NaiveBayesClassifier(train_set, custom_extractor)
        custom.classify("Yay! I'm so happy it works.")
        assert_equal(custom.train_features[0][1], 'positive')

    def test_init_with_csv_file(self):
        self._assert_trained_from_file(
            NaiveBayesClassifier(CSV_FILE, format="csv"))

    def test_init_with_csv_file_without_format_specifier(self):
        self._assert_trained_from_file(NaiveBayesClassifier(CSV_FILE))

    def test_init_with_json_file(self):
        self._assert_trained_from_file(
            NaiveBayesClassifier(JSON_FILE, format="json"))

    def test_init_with_json_file_without_format_specifier(self):
        self._assert_trained_from_file(NaiveBayesClassifier(JSON_FILE))

    def test_accuracy_on_a_csv_file(self):
        assert_true(isinstance(self.classifier.accuracy(CSV_FILE), float))

    def test_accuracy_on_json_file(self):
        assert_true(isinstance(self.classifier.accuracy(JSON_FILE), float))

    def test_init_with_tsv_file(self):
        self._assert_trained_from_file(NaiveBayesClassifier(TSV_FILE))

    def test_init_with_bad_format_specifier(self):
        assert_raises(ValueError, NaiveBayesClassifier, CSV_FILE,
                      format='unknown')

    def test_repr(self):
        expected = "<NaiveBayesClassifier trained on {0} instances>".format(
            len(train_set))
        assert_equal(repr(self.classifier), expected)
for word in words: if word not in stopwords.words() and not word.isdigit(): list_tuples.append((word.lower(),tabsep[0])) c+=1 if c==500: break return list_tuples print 'importing data...' a = time.time() entire_data = get_list_tuples("dataset.txt") print "It took "+str(time.time()-a)+" seconds to import data" print 'data imported' random.seed(1) random.shuffle(entire_data) train = entire_data[:750] test = entire_data[751:1500] print 'training data' a = time.time() cl = NaiveBayesClassifier(train) print "It took "+str(time.time()-a)+" seconds to train data" print 'data trained, now checking accuracy:' accuracy = cl.accuracy(test) print "accuracy: "+str(accuracy) cl.show_informative_features(5) x = "" while (x != "exit"): x = raw_input("enter a email to check if it is a spam email or not , type exit to exit \n") print cl.classify(x)
def encode_tweet(tweet):
    """Clean one CSV row in place and file it under training or test data.

    The first column is split on whitespace, every word is decoded with
    undecodable bytes dropped, and the words are joined with single spaces.
    Rows seen while the module-level ``counter`` is above 100 go to
    ``test_tweets``; all others go to ``train_tweets``.

    Args:
        tweet: One row from the CSV reader; ``tweet[0]`` is the tweet text.
    """
    words = [unicode(word, errors='ignore') for word in tweet[0].split()]
    tweet[0] = " ".join(words)

    # training and testing dataset
    # Each row goes to exactly one of the two lists.  It used to be appended
    # to train_tweets unconditionally as well, which duplicated the training
    # rows and leaked every test row into the training data.
    if counter > 100:
        test_tweets.append(tweet)
    else:
        train_tweets.append(tweet)


with open("tweets1.csv", 'rb') as data_file:
    data = csv.reader(data_file, delimiter=',')
    for tweet in data:
        encode_tweet(tweet)
        counter += 1

classifier = NaiveBayesClassifier(train_tweets)
print("Accuracy of the classifier: {0}".format(classifier.accuracy(test_tweets)))
classifier.show_informative_features(10)
print("Training complete")

test = raw_input("Enter the string:")
# Labels read through csv.reader are strings, so comparing with the integer 0
# could never match and every input was reported as positive.
if str(classifier.classify(test)) == "0":
    print("Sentiment: negative")
else:
    print("Sentiment: positive")
#
# w= Word('running')
# print w.lemmatize()

#Text Classify
train = [('I love this sandwich.', 'pos'),
         ('this is an amazing place!', 'pos'),
         ('I feel very good about these beers.', 'pos'),
         ('this is my best work.', 'pos'),
         ("what an awesome view", 'pos'),
         ('I do not like this restaurant', 'neg'),
         ('I am tired of this stuff.', 'neg'),
         ("I can't deal with this", 'neg'),
         ('he is my sworn enemy!', 'neg'),
         ('my boss is horrible.', 'neg')]
test = [('the beer was good.', 'pos'),
        ('I do not enjoy my job', 'neg'),
        ("I ain't feeling dandy today.", 'neg'),
        ("I feel amazing!", 'pos'),
        ('Gary is a friend of mine.', 'pos'),
        ("I can't believe I'm doing this.", 'neg')]

cl = NaiveBayesClassifier(train)
print(cl.classify("This is an amazing library!"))
print(cl.accuracy(test))
# show_informative_features() prints its own table; printing its return value
# would only add a stray "None" line.
cl.show_informative_features(5)

prob_dist = cl.prob_classify("This one's a doozy.")
print(prob_dist.max())
print(c.classify('seu lindo')) # A print('labels:', c.labels()) # labels: ['A', 'B'] test = [('Voce e muito gato', 'A'), ('Voce e muito feio', 'B')] print('acuracia:', c.accuracy(test)) # acuracia: 0.5 test = [('Voce e muito lindo', 'A'), ('Voce e muito feio', 'B')] print('acuracia:', c.accuracy(test)) # acuracia: 1.0 print('features:', c.extract_features('Eu sou horroroso')) # features: {'contains(Eu)': True, 'contains(sou)': True, 'contains(lindo)': False, 'contains(feio)': False} c.show_informative_features() # Most Informative Features # contains(sou) = True B : A = 1.0 : 1.0 # contains(Eu) = True B : A = 1.0 : 1.0 # # So que o pacote textblob eh mais do que # classificacao de texto. Vejamos. # from textblob import TextBlob text = TextBlob( "I went home. Because I'm happy. Clap along if you feel like a room without a roof." ) print('text:', text) # text: I went home. Because I'm happy. Clap along if you feel like a room without a roof.
data = read_target_data() print("Predicting Sentiment...") # predict sentiment on data's text data[['pred', 'p_pos', 'p_neg']] = data.text.apply(lambda x: pd.Series(get_sentiment(x))) print("Calculating Ratios...") # use predicted sentiment to fit a dummy model # allowing us to get pos:neg ratios dummy_train = data[['text', 'pred']].values.tolist() dummy_cl = NaiveBayesClassifier(dummy_train) # shove the ratio structure into a file sys.stdout = open('./data/mif.txt', 'w') dummy_cl.show_informative_features(100) sys.stdout = sys.__stdout__ print("Preparing Report...") # parse the raw ratios file and create dataframe with open("./data/mif.txt") as f: mif = f.read().split('\n')[1:] mif_df = pd.DataFrame([parse_mif(i) for i in mif]) # create percentages from ratios N = (mif_df.neg + mif_df.pos) mif_df['pct_neg'] = mif_df.neg / N mif_df['pct_pos'] = mif_df.pos / N # save data sorted by the ratios mif_df.dropna().sort_values("high", ascending=False).head(100).to_csv(
except UnicodeDecodeError: continue test.append([post_body, i_file]) Bayes = NaiveBayesClassifier(train) print os.getcwd() pos = [] neg = [] for body in test: judge = Bayes.classify(body[0]) if judge == "positive": call(['mv', "./" + body[1], "john/"]) os.getcwd() if judge == "negative": call(['mv', "./" + body[1], "non_john/"]) os.mkdir("hard_to_classify") remaining = glob.glob("*.html") for doc in remaining: call(['mv', "./" + doc, "hard_to_classify/"]) #print Bayes.accuracy(test) print Bayes.show_informative_features(10) # #advanced feature extraction - slang and misspellings
print(cl.classify("This is an amazing library!")) # get the label probability distribution prob_dist = cl.prob_classify("This one's a doozy.") print(prob_dist.max()) print(round(prob_dist.prob("pos"), 2)) print(round(prob_dist.prob("neg"), 2)) # classifying textblob blob = TextBlob("The beer is good. But the hangover is horrible.", classifier=cl) print(blob.classify()) for s in blob.sentences: print(s) print(s.classify()) # evaluating classifiers print(cl.accuracy(test)) print(cl.show_informative_features( 5)) # displaying a listing of the most informative features # updating classifiers wth new data new_data = [('She is my best friend.', 'pos'), ("I'm happy to have a new friend.", 'pos'), ("Stay thirsty, my friend.", 'pos'), ("He ain't from around here.", 'neg')] print(cl.update(new_data)) print(cl.accuracy(test)) # feature extractors # creating a feature extractor that just uses the first and last words of a document as its features def end_word_extractor(document): tokens = document.split() first_word, last_word = tokens[0], tokens[-1] feats = {}
class TestNaiveBayesClassifier(unittest.TestCase):
    """Tests for NaiveBayesClassifier.

    Covers classification, probability output, accuracy, updating, feature
    introspection, and construction from in-memory data, CSV/TSV/JSON file
    handles and a custom registered format.
    """

    def setUp(self):
        # A fresh classifier trained on the module-level train_set per test.
        self.classifier = NaiveBayesClassifier(train_set)

    def test_default_extractor(self):
        text = "I feel happy this morning."
        assert_equal(self.classifier.extract_features(text),
                     basic_extractor(text, train_set))

    def test_classify(self):
        res = self.classifier.classify("I feel happy this morning")
        assert_equal(res, 'positive')
        assert_equal(len(self.classifier.train_set), len(train_set))

    def test_classify_a_list_of_words(self):
        res = self.classifier.classify(["I", "feel", "happy", "this", "morning"])
        assert_equal(res, "positive")

    def test_train_from_lists_of_words(self):
        # classifier can be trained on lists of words instead of strings
        train = [(doc.split(), label) for doc, label in train_set]
        classifier = NaiveBayesClassifier(train)
        assert_equal(classifier.accuracy(test_set),
                     self.classifier.accuracy(test_set))

    def test_prob_classify(self):
        res = self.classifier.prob_classify("I feel happy this morning")
        assert_equal(res.max(), "positive")
        assert_true(res.prob("positive") > res.prob("negative"))

    def test_accuracy(self):
        acc = self.classifier.accuracy(test_set)
        assert_true(isinstance(acc, float))

    def test_update(self):
        res1 = self.classifier.prob_classify("lorem ipsum")
        original_length = len(self.classifier.train_set)
        self.classifier.update([("lorem ipsum", "positive")])
        new_length = len(self.classifier.train_set)
        res2 = self.classifier.prob_classify("lorem ipsum")
        assert_true(res2.prob("positive") > res1.prob("positive"))
        assert_equal(original_length + 1, new_length)

    def test_labels(self):
        labels = self.classifier.labels()
        assert_true("positive" in labels)
        assert_true("negative" in labels)

    def test_show_informative_features(self):
        # Smoke test: the call only has to complete without raising.
        self.classifier.show_informative_features()

    def test_informative_features(self):
        feats = self.classifier.informative_features(3)
        assert_true(isinstance(feats, list))
        assert_true(isinstance(feats[0], tuple))

    def test_custom_feature_extractor(self):
        cl = NaiveBayesClassifier(train_set, custom_extractor)
        cl.classify("Yay! I'm so happy it works.")
        assert_equal(cl.train_features[0][1], 'positive')

    def test_init_with_csv_file(self):
        with open(CSV_FILE) as fp:
            cl = NaiveBayesClassifier(fp, format="csv")
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_csv_file_without_format_specifier(self):
        with open(CSV_FILE) as fp:
            cl = NaiveBayesClassifier(fp)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_json_file(self):
        with open(JSON_FILE) as fp:
            cl = NaiveBayesClassifier(fp, format="json")
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_json_file_without_format_specifier(self):
        with open(JSON_FILE) as fp:
            cl = NaiveBayesClassifier(fp)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_custom_format(self):
        redis_train = [('I like turtles', 'pos'), ('I hate turtles', 'neg')]

        class MockRedisFormat(formats.BaseFormat):
            # Stand-in format that accepts any stream and yields redis_train.
            def __init__(self, client, port):
                self.client = client
                self.port = port

            @classmethod
            def detect(cls, stream):
                return True

            def to_iterable(self):
                return redis_train

        formats.register('redis', MockRedisFormat)
        mock_redis = mock.Mock()
        cl = NaiveBayesClassifier(mock_redis, format='redis', port=1234)
        assert_equal(cl.train_set, redis_train)

    def test_data_with_no_available_format(self):
        mock_fp = mock.Mock()
        mock_fp.read.return_value = ''
        assert_raises(FormatError, lambda: NaiveBayesClassifier(mock_fp))

    def test_accuracy_on_a_csv_file(self):
        with open(CSV_FILE) as fp:
            a = self.classifier.accuracy(fp)
        assert_equal(type(a), float)

    def test_accuracy_on_json_file(self):
        # Previously opened CSV_FILE (copy/paste slip), so accuracy on a JSON
        # file was never actually exercised.
        with open(JSON_FILE) as fp:
            a = self.classifier.accuracy(fp)
        assert_equal(type(a), float)

    def test_init_with_tsv_file(self):
        with open(TSV_FILE) as fp:
            cl = NaiveBayesClassifier(fp)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_bad_format_specifier(self):
        assert_raises(ValueError,
                      lambda: NaiveBayesClassifier(CSV_FILE, format='unknown'))

    def test_repr(self):
        assert_equal(repr(self.classifier),
                     "<NaiveBayesClassifier trained on {0} instances>".format(len(train_set)))
class LogicProc:
    """Filter queued messages with a Bayesian classifier and relay the good ones to Slack.

    The classifier is seeded from a pre-classified CSV file, then updated from
    the database and from Slack reactions. Two timers poll the message queue
    (every 5 s) and the Slack reactions (every 60 s).
    """

    def __init__(self, preclassified_file, channel, slack_token):
        """Train the classifier, resolve the Slack channel and start both timers.

        @param preclassified_file  Path of the CSV training file.
        @param channel             Slack channel name to post to / read reactions from.
        @param slack_token         Token handed to the Slack interface.

        If `preclassified_file` is missing a message is printed and the
        subsequent open() raises as before.
        """
        if not os.path.isfile(preclassified_file):
            print('"' + preclassified_file + '" does not exist!')
        with open(preclassified_file, 'r') as train_set:
            print('training from ' + preclassified_file)
            self.spam_classifier = NaiveBayesClassifier(train_set, format="csv")
        self.slack_client = slack_interface.SlackInterface(slack_token)
        self.message_queue = []
        self.last_message_ts = None
        self.channel = self.slack_client.get_channel_id(channel)
        if self.channel is None:
            print('Could not find channel ' + channel)
        self.db_interface = database_interface.DB()
        training = self.db_interface.get_training_data()
        self.spam_classifier.update(training)
        self.update_classifer_from_slack(self.channel)
        self.spam_classifier.show_informative_features()
        self.check_twitter_msgs = infinite_timer.InfiniteTimer(5.0, self.proc_messages)
        self.check_slack_msgs = infinite_timer.InfiniteTimer(
            60.0, self.update_classifer_from_slack, self.channel)
        self.check_twitter_msgs.start()
        self.check_slack_msgs.start()

    def add_new_message(self, msg, source):
        """
        Callback from Twitter when there is a new message
        @param msg The Twitter message, with all its attributes
        @param source Where the message came from. Right now should only be 'twitter'
        """
        self.message_queue.append({'source': source, 'message': msg})

    def proc_messages(self):
        """Classify every queued message, post/store it, and drop it from the queue."""
        # Iterate over a snapshot: removing from the list that is being
        # iterated makes the iterator skip the element following each
        # removal, which left every second message unprocessed.
        for msg in list(self.message_queue):
            if msg['source'] == 'twitter':
                message = msg['message']
                if self.quality_filter(message.text):
                    print('GOOD: ' + message.text.encode('utf-8'))
                    self.post_to_slack(message, self.channel)
                    self.store_message(message.text, True)
                else:
                    print('BAD: ' + message.text.encode('utf-8'))
                    self.store_message(message.text, False)
            self.message_queue.remove(msg)

    def run_loop(self):
        """
        Not sure what this was originally intended to do..
        It only sleeps in one-second steps forever; the actual polling is
        done by the timers started in __init__.
        """
        while True:
            # sleep between polling queue
            time.sleep(1)

    def quality_filter(self, message_text):
        """Return False when the classifier labels the text 'neg', True otherwise."""
        # -filter useless hashtag announcements "Prayers for Irma! Use #IrmaSoS"
        # -filter outside the geobounds
        # -filter duplicates
        # -bayesian filter
        result = self.spam_classifier.classify(message_text)
        return result != 'neg'

    def post_to_slack(self, msg, channel):
        """Post the text of `msg` to the given Slack channel."""
        self.slack_client.post_message(msg.text, channel)

    def update_classifer_from_slack(self, channel):
        """Feed Slack thumbs-up/down reactions back into the DB and the classifier."""
        slack_msgs = self.slack_client.get_slack_reactions(channel, self.last_message_ts)
        if len(slack_msgs) > 0:
            # Remember the newest message so the next poll starts after it.
            self.last_message_ts = slack_msgs[-1]['ts']
        bayesian_update_data = []
        for m in slack_msgs:
            user_feedback = self.is_slack_reaction_pos(m['reactions'])
            text = m['text']
            if user_feedback is None:
                # No +1 / -1 reaction: nothing to learn from this message.
                continue
            bayesian_update_data.append((text, 'pos' if user_feedback else 'neg'))
        # update for better results if we can
        if len(bayesian_update_data) > 0:
            print('updating db...')
            # update classification in DB
            self.db_interface.update(bayesian_update_data)
            # update classifier
            print('updating classifier...')
            self.spam_classifier.update(bayesian_update_data)
            print('done...')
            self.spam_classifier.show_informative_features()

    def is_slack_reaction_pos(self, reactions):
        """Return True/False for the first '+1'/'-1' reaction, or None if there is neither."""
        for t in reactions:
            name = t['name']
            if name == '-1':
                return False
            if name == '+1':
                return True
        return None

    def store_message(self, message, filter_classification, source='twitter'):
        """Persist a message together with its filter verdict and source."""
        self.db_interface.add(message, filter_classification, source)

    def bayesian_search(self, query):
        """Return the search results for `query` whose text is not flagged as spam."""
        # NOTE(review): neither self.api nor self.is_spam is defined anywhere
        # in this class - confirm where they are supposed to come from.
        results = self.api.search(query)
        filtered_results = [r for r in results if self.is_spam(r.text) == 0]
        return filtered_results
continue test.append([post_body, i_file]) Bayes = NaiveBayesClassifier(train) print os.getcwd() pos = [] neg = [] for body in test: judge = Bayes.classify(body[0]) if judge == "positive": call(["mv", "./" + body[1], "john/"]) os.getcwd() if judge == "negative": call(["mv", "./" + body[1], "non_john/"]) os.mkdir("hard_to_classify") remaining = glob.glob("*.html") for doc in remaining: call(["mv", "./" + doc, "hard_to_classify/"]) # print Bayes.accuracy(test) print Bayes.show_informative_features(10) # #advanced feature extraction - slang and misspellings
# Preprocess both datasets first: skipping this step typically makes
# classification slower and less accurate.
print("Preprocessing datasets...", end="\n\n", flush=True)
traindata = preprocessData(traindata, minwordlen=4)
testdata = preprocessData(testdata, minwordlen=4)

# Fit the Naive Bayes classifier on the training split.
print("Training Naive Bayes Classifier...", end="\n\n", flush=True)
nbc = NaiveBayesClassifier(traindata)

# List the features that weigh most in the classification.
nbc.show_informative_features(5)
print(flush=True)

# Measure accuracy on the held-out split.
print("Testing Naive Bayes Classifier...", flush=True)
acc = nbc.accuracy(testdata)
print("Accuracy:", round(acc, 4), end="\n\n")

# Announce the confusion matrix.
print("Printing Confusion Matrix...", end="\n\n", flush=True)
"Korban diajak tersangka ke musala di dekat pondok. Saat kondisi sepi dan hanya berdua dengan korban, tersangka mencabuli korban," kata Wahyu kepada wartawan, Minggu (20/3/2016). Lantaran menganggap Nurul sebagai Gus, korban pun tak berani menolak permintaan tersangka. Terlebih lagi, tersangka membujuk korban bahwa perbuatan cabul itu untuk memasukkan ilmu kebatinan ke tubuh korban. "Tersangka berdalih untuk mengajari korban ilmu tasawuf. Nyatanya itu hanya untuk memuluskan niat tersangka agar bisa mencabuli korban," ungkapnya. Menurut Wahyu, perbuatan cabul itu dilakukan tersangka kepada korban berulang kali selama 2 tahun terakhir. Bahkan korban diminta membayar uang kepada tersangka setiap kali usai melakukan pencabulan. Nilainya antara Rp 200.000 hingga jutaan rupiah. "Tersangka juga meminta uang dari korban berulang kali. Total kerugian korban Rp 40 juta," sebutnya. Tak tahan dengan perbuatan Nurul, lanjut Wahyu, korban pun memutuskan buka mulut ke teman sesama santri. Mendapat dukungan dari teman-temannya, korban memberanikan diri melapor ke Polres Jombang, Kamis (17/3). Pada hari yang sama, polisi memutuskan menjebak tersangka. "Saat korban menyerahkan uang yang terakhir kepada tersangka, saat itu tersangka langsung kami tangkap," jelasnya. Akibat perbuatannya, kini Nurul harus mendekam di Rutan Polres Jombang. Tersangka dijerat dengan Pasal 80 ayat (1) juncto Pasal 82 ayat (1) UU RI No 35 Tahun 2014 tentang Perlindungan Anak dengan ancaman pidana maksimal 15 tahun penjara. "Kalau ada yang merasa menjadi korban perbuatan tersangka ini, jangan malu melapor, akan kami jaga identitasnya. Karena itu bisa memberatkan tersangka," pungkasnya. 
""" tic = timeit.default_timer() renum = ''.join([i for i in text if not i.isdigit()]) text = stem_words(renum) print("text diatas setelah diklasifikasi yaitu %s\n" % cl.classify(text)) toc = timeit.default_timer() print ("waktu klasifikasi : ") print(toc-tic) print(cl.show_informative_features(20)) # classifier = TextBlob(stemstop_output, classifier=cl) # print(classifier.classify())
df = pd.DataFrame({"labels": trainLabels, "trainData": trainData}) train, test = train_test_split(df, test_size = 0.25) training = zip(train["trainData"].tolist() , train["labels"].tolist()) testing = zip(test["trainData"].tolist() , test["labels"].tolist()) ## training model %time model = NBC(training) %time print(model.accuracy(training)) ## getting accuracy of 90% ## Shows important features for detecting intent print model.show_informative_features() #Most Informative Features # contains(please) = True Yes : No = 10.0 : 1.0 # contains(ve) = True No : Yes = 9.9 : 1.0 # contains(verifi) = True Yes : No = 9.3 : 1.0 # contains(sale) = True No : Yes = 9.3 : 1.0 # contains(moment) = True Yes : No = 8.6 : 1.0 # contains(compani) = True No : Yes = 7.3 : 1.0 # contains(deliveri) = True No : Yes = 6.9 : 1.0 # contains(unsubscrib) = True No : Yes = 6.9 : 1.0 # contains(experi) = True No : Yes = 6.9 : 1.0 # contains(remov) = True No : Yes = 6.9 : 1.0 %time print(model.accuracy(testing)) ## getting accuracy of 70.3%
def _next_review_portion(current_train, review_corpus, portion, logger):
    """Train on current_train and pick the next `portion` reviews to label.

    Returns (classifier, reviews_portion) where reviews_portion is a list of
    [score, review] pairs with the lowest scores among the first 1000
    unlabeled reviews.
    """
    cl = NaiveBayesClassifier(current_train, feature_extractor=feature_extractor)
    # Share of 'g' labels in the current training data.
    ratio = float(sum([int(x[1] == 'g') for x in current_train]))/len(current_train)
    #ratio = 0.5
    logger.info('ratio = %.3f\nclassifying train set ...'%ratio)
    # The small random term breaks ties between reviews with the same prediction.
    train_classify = [[0.1*random.random() + abs(int(cl.classify(t)=='s')-ratio), t]
                      for t in review_corpus[:1000]]
    # Order by score only, so equal scores never fall back to comparing reviews.
    train_classify.sort(key=lambda scored: scored[0])
    return cl, train_classify[:portion]


def activeLearning(NAME, datapath, infile, iterations = 3, portion = 10):
    """Interactively build a generic/specific review classifier by active learning.

    NAME        prefix of the files read/written under datapath
    datapath    directory prefix for '<NAME>_current_train.json' and
                '<NAME>_active_learning.model'
    infile      file with one JSON review per line ('text', 'textFeatures')
    iterations  number of label-then-retrain rounds
    portion     number of reviews presented per round

    Returns a string reporting accuracy on a 20% self-test split.
    """
    logger = logging.getLogger('signature.activeLearning')
    logger.info('Active learning model building')

    #load data and convert to appropriate format
    review_corpus = list()
    with open(infile, "r") as review_file:
        for line in review_file:
            try:
                review = json.loads(line)
                #filter out non-ascii symbols
                review_corpus.append([re.sub(r'[^\x00-\x7f]', r' ', review['text']),
                                      review['textFeatures']])
            except (ValueError, KeyError, TypeError):
                # Log the raw line: `review` may be unbound or still hold the
                # previous record when parsing fails.
                logger.error('Skipping malformed review line: %r', line)
                continue
    logger.info('Data converted - %d reviews'%len(review_corpus))

    #Shuffle dataset
    #random.seed(1)
    random.shuffle(review_corpus)

    # Resume from previously labeled data when it exists and is readable.
    try:
        with open(datapath + '%s_current_train.json'%NAME, 'r') as train_file:
            current_train = json.load(train_file)
    except (IOError, ValueError):
        current_train = list()

    # Already labeled reviews must not be offered for labeling again.
    for t in current_train:
        try:
            review_corpus.remove(t[0])
        except (ValueError, IndexError, TypeError):
            pass
    logger.info("Len(current_train) = %d"%len(current_train))

    # Prepare first portion
    if len(current_train) > 10:
        cl, reviews_portion = _next_review_portion(current_train, review_corpus, portion, logger)
    else:
        reviews_portion = [y for y in enumerate(review_corpus[:portion])]

    # main iterations of active learning
    for iteration in range(iterations):
        #ask for labels
        for p in range(len(reviews_portion)):
            var = input('''\n\n%s \n(%f)\nPlease give the label to the review (g - generic / s - specific): '''%(reviews_portion[p][1][0],reviews_portion[p][0]))
            if var.lower().startswith('g'):
                label = 'g'
            elif var.lower().startswith('s'):
                label = 's'
            elif var.lower().startswith('x'):
                # NOTE(review): this only ends the current labeling round;
                # training and the remaining iterations still run.
                logger.info('Finish')
                break
            else:
                logger.info('Bad label')
                continue
            #prepare train set
            current_train.append((reviews_portion[p][1],label))
            review_corpus.remove(reviews_portion[p][1])

        #train model and prepare next portion
        cl, reviews_portion = _next_review_portion(current_train, review_corpus, portion, logger)
        logger.info('Iteration: %d (%d items), Accuracy on train = %.2f'%(iteration,len(current_train),100*cl.accuracy(current_train)))
        with open(datapath+'%s_current_train.json'%NAME,'w') as current_train_out:
            current_train_out.write(json.dumps(current_train))

    cl.show_informative_features(10)

    #test on a random 80/20 split of the labeled data
    random.shuffle(current_train)
    thres = int(0.8*len(current_train))
    train_self = current_train[:thres]
    test_self = current_train[thres:]
    cl_test = NaiveBayesClassifier(train_self, feature_extractor=feature_extractor)
    acc_str = 'Accuracy on test = %.2f with %d items in testset and %d items in trainset'%(100*cl_test.accuracy(test_self), len(test_self),len(train_self))
    logger.info(acc_str)
    message = list()
    message.append(acc_str)

    #saving model
    with open(datapath+ '%s_active_learning.model'%NAME, "wb") as model_file:
        pickle.dump(cl, model_file)
    return '\n'.join(message)
def extractor(word):
    """Return a feature dict keyed on the last character of `word`.

    An empty `word` has no last character and yields an empty feature dict
    instead of raising IndexError.
    """
    if not word:
        return {}
    return {"last_letter({0})".format(word[-1]): True}


if __name__ == "__main__":
    # customDicts = {'./texts/wordsEn.txt':'english','./texts/wordsEs.txt':'spanish','./texts/wordsEs2.txt':'spanish'}
    # customDicts = {'./texts/wordsEn.txt':'english','./texts/wordsEs2.txt':'spanish'}
    # for customDictFilename, customDictLang in customDicts.items():
    #     currentDict = open(customDictFilename,'r')
    #     for line in currentDict:
    #         wordTrain = (line.replace('\r','').replace('\n',''),customDictLang)
    #         train.append(wordTrain)
    #     currentDict.close()
    # print train
    lang_detector = NaiveBayesClassifier(train, feature_extractor=extractor)
    # lang_detector = NaiveBayesClassifier(train)
    print(lang_detector.accuracy(test))
    lang_detector.show_informative_features(5)
    while True:
        try:
            line = sys.stdin.readline()
            if not line:
                # EOF: stop before trying to classify an empty string.
                break
            # Drop the line terminator; otherwise the "last letter" feature
            # is always the newline and every input classifies the same.
            print(lang_detector.classify(line.rstrip('\r\n')))
        except KeyboardInterrupt:
            break
# Classify a TextBlob as a whole, then each of its sentences, using class1.
blob = TextBlob("The beer is good. But the hangover is horrible.", classifier=class1)
blob.classify()
for s in blob.sentences:
    print(s)
    print(s.classify())

# ### Evaluating Classifiers
class1.accuracy(test)

# ### Display a Listing of the Most Informative Features
class1.show_informative_features(5)

# ### Updating Classifiers with New Data
new_data = [('She is my best friend.', 'pos'),
            ("I'm happy to have a new friend.", 'pos'),
            ("Stay thirsty, my friend.", 'pos'),
            ("He ain't from around here.", 'neg')]
class1.update(new_data)
# Re-check accuracy on the same test data after the update.
class1.accuracy(test)

# ### Feature Extractors
def end_word_extractor(document):
'what are you working on', 'what you making') experience_utterances = [(x, 'experience') for x in experience_utterances] environment_utterances = [(x, 'enivornment') for x in environment_utterances] working_on_utterances = [(x, 'working') for x in working_on_utterances] # FIXME: find better way to flatten lists together training_set = [] training_set.extend(experience_utterances) training_set.extend(environment_utterances) training_set.extend(working_on_utterances) classifier = NaiveBayesClassifier(training_set) print(classifier.show_informative_features(), classifier.labels()) bogus_utterances = ( 'if you going to use nltk u may want to check this out spacy .io', 'sup people? I see the weather\'s getting better over there, Ben.', 'i had the same problem your having so thats my i made my own.', 'try http, instead of https' ) # TODO: Figure out how to make this stronger dual_utterance = ('how long have you been coding and what IDE do you use',) test_utterances = ('what are you making', 'hey that nyancat is cool, how do you get that?') for t in test_utterances:
nltk.download('punkt')
nltk.download('brown')

#Version Check#
print(pd.__version__)
print(tb.__version__)
print(nltk.__version__)

#Data Cleanse and Shuffle#
train = pd.read_csv("training.txt", sep="\t", header=None)
train = train.sample(frac=1, random_state=int(input("Random State? ")))
train.columns = ["Sentiment", "Raw_Text"]
train_data = list(zip(train["Raw_Text"], train["Sentiment"]))

#Train Classifier#
# Only the first 1000 shuffled rows are used for training.
cl = NaiveBayesClassifier(train_data[:1000])

#Test Classifier#
# NOTE(review): the guess is computed for every row, including the 1000
# training rows, so the accuracy below is not a pure hold-out figure.
train["Guess"] = train["Raw_Text"].apply(cl.classify)

#Results#
# .size counts cells; both frames have the same columns, so the ratio equals
# the share of correctly guessed rows.
Accuracy = round(
    (train[train["Sentiment"] == train["Guess"]].size / train.size) * 100, 2)
print(f"\nAccuracy: {Accuracy}%\n")
# Prints the table itself and returns None - do not wrap in print().
cl.show_informative_features()
for line in f: line = line.replace("\n", "") a, b = line.split('|') train.append((a, b)) f.close() return train data = get_csv(file_name[cur_file], must_contain[cur_file]) text = get_text(data) # train model if train_enable: train = get_train_set(file_name[cur_file]) cl = NaiveBayesClassifier(train) cl.show_informative_features(1000) if test_enable: data['sentiment'] = 0.0 for index, row in data.iterrows(): #print(row) #print(cl.prob_classify(row.review_body).prob("pos")) val1 = TextBlob(str(row.review_body)).sentiment.polarity val2 = (cl.prob_classify(str(row.review_body)).prob("pos") - 0.5) // 0.5 data.at[index, 'sentiment'] = 0.3*val1 + 0.7*val2 #data.at[index, 'sentiment'] = TextBlob(str(row.review_body)).sentiment.polarity #data.at[index, 'sentiment'] = round(cl.prob_classify(str(row.review_body)).prob("pos"), 5) data.to_csv(file_name[cur_file]+'_sentiment.csv') if word_cloud_enable: f = open(file_name[cur_file]+'_feature.txt', 'r') pos_words = {}
Created on Tue Jul 11 09:47:27 2017 @author: mzent """ from textblob.classifiers import NaiveBayesClassifier from textblob import TextBlob import csv import pickle train = [] test = [] with open('trainingData5.csv', newline='', encoding="latin-1") as csvfile: reader = csv.reader(csvfile) for row in reader: train.append((row[0] + ": " + row[2], row[3])) with open('testData5.csv', newline='', encoding="latin-1") as csvfile: reader = csv.reader(csvfile) for row in reader: test.append((row[0] + ": " + row[2], row[3])) print("read data") cl = NaiveBayesClassifier(train) print("created classifier") # Compute accuracy print("Accuracy: {0}".format(cl.accuracy(test))) # Show 5 most informative features cl.show_informative_features(25) print(pickle.dump(cl))
("I need shelter", "shelter"), ("I need shelter", "shelter"), ("The hospitals here are all full", "healthcare"), ("I need shelter", "shelter"), ("I need healthcare", "healthcare"), ("I am concerned about my children's meantal well being and health, the earthquake has caused big stress", "healthcare") ] # BASIC TESTING FRAMEWORK USING NAIVE BAYES CLASSIFIER MODEL print("Beginning training set") classy = NaiveBayesClassifier(train) print("Training set ended") print("Beginning testing set") totalCorrect = 0 totalTestPoints = len(test) for testPoint in test: print("Checking: {}".format(testPoint)) print("Focus on shelter probability: {}".format( classy.prob_classify(testPoint[0]).prob("shelter"))) print("Focus on healthcare probability {}".format( classy.prob_classify(testPoint[0]).prob("healthcare"))) prediction = classy.classify(testPoint[0]) if (prediction == testPoint[1]): totalCorrect += 1 print() print("=================") print() print("Overall Test Accuracy: {0:.2f}%".format( (totalCorrect / totalTestPoints) * 100)) classy.show_informative_features(8)
def SA():
    """Train a sentiment classifier from CSV, then classify one line of user input.

    The input is stripped of non-letters and English stop words, reduced to
    RAKE keyword phrases, and classified; 1 is printed for "pos", 0 otherwise.
    """
    r = Rake()

    # Train the Naive Bayes classifier from the CSV training data.
    with open("datasets/trainingData.csv", 'r') as trainingdata:
        classifier = NaiveBayesClassifier(trainingdata, format="csv")
    print("Training Data")
    classifier.show_informative_features(15)

    # Report accuracy on the testing data.
    # Not needed for final product
    with open("datasets/testingData.csv", 'r') as testingdata:
        print("Testing data accuracy", classifier.accuracy(testingdata))

    # Asks for user input
    userInput = input("Please provide a test input: ")

    # Removes all non letter characters (spaces are kept)
    regex = re.compile('[^a-zA-Z ]')
    punctuationRemoved = regex.sub('', userInput)
    print("Punctuation removed: ", punctuationRemoved)

    # Tokenize and drop English stop words. This was previously computed
    # twice (a comprehension whose result was discarded, then a loop).
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(punctuationRemoved)
    filtered_sentence = [w for w in word_tokens if w not in stop_words]
    print("Stopwords removed: ", filtered_sentence)

    # Extract and rank keywords from the filtered sentence.
    stringWithoutStopwords = ' '.join(
        [str(elem) for elem in filtered_sentence])
    r.extract_keywords_from_text(stringWithoutStopwords)
    ranked_phrases = r.get_ranked_phrases()
    print("Keywords extracted: ", ranked_phrases)

    # Classify the keyword string once and reuse the label for both the
    # report line and the binary flag.
    listToStr = ' '.join([str(elem) for elem in ranked_phrases])
    finalString = TextBlob(listToStr, classifier=classifier)
    classification = finalString.classify()
    print("String followed by classification: ", finalString,
          classification)
    binaryClassify = 1 if classification == "pos" else 0
    print(binaryClassify)
] if __name__ == "__main__": # print "Initiallizing classifier... (training...)" # train_positive() # train_negative() # print train_set # classifier = NaiveBayesClassifier(train_set) # with open('./texts/words.txt', 'r') as fp: # classifier = NaiveBayesClassifier(fp, format="csv") # print classifier.accuracy(test_set) # print classifier.show_informative_features() classifier = NaiveBayesClassifier(train_set) print train_set print classifier.accuracy(test_set) print classifier.show_informative_features() print "Ready " while 1: try: line = sys.stdin.readline() prob_dist = classifier.prob_classify(line.lower()) print prob_dist.max() print "PROB POS: " + str(round(prob_dist.prob("pos"), 2)) print "PROB NEG: " + str(round(prob_dist.prob("neg"), 2)) except KeyboardInterrupt: break if not line: break