def train(self, training_corpus):
    """Train the Naive Bayes classifier on the non-denied corpus entries.

    ``training_corpus`` must be a list or tuple whose first element is a
    dict (both are checked with ``assert``). Each entry is read through its
    ``"text"``, ``"polarity"`` and ``"denied"`` keys; only entries whose
    ``"denied"`` value equals ``0`` are used. The trained model is stored
    on ``self.classifier``.
    """
    assert isinstance(training_corpus, (list, tuple))
    assert isinstance(training_corpus[0], dict)

    featureset = []
    for entry in training_corpus:
        if entry["denied"] == 0:
            sample = (twit_features(entry["text"]), entry["polarity"])
            featureset.append(sample)

    self.classifier = NaiveBayesClassifier.train(featureset)
def get_sentiment_data(query, training_set):
    """Return, per tweet timestamp, the fraction of tweets classified '1'.

    A Naive Bayes classifier is trained from the tab-separated file
    ``training/<training_set>/training.txt`` (first column is the label,
    second column is the text). Tweets returned by ``grab_tweets(query)``
    are classified and grouped by ``created_at``; the result maps each
    ``created_at`` value to ``count('1') / count(all)`` as a float.
    """
    samples = []
    with open('training/' + training_set + '/training.txt') as f:
        for line in f:
            columns = line.split('\t')
            samples.append((get_features(columns[1]), columns[0]))
    clf = NaiveBayesClassifier.train(samples)

    tweets = grab_tweets(query)
    print("HERE")

    # Group the predicted labels by the tweet's creation timestamp.
    classified = {}
    for tweet in tweets:
        label = clf.classify(get_features(tweet.text))
        classified.setdefault(tweet.created_at, []).append(label)
    print(classified)

    returndata = {}
    for key in classified:
        labels = classified[key]
        ones = sum(1 for v in labels if v == '1')
        returndata[key] = float(ones) / len(labels)
    print(returndata)
    return returndata
def nltk_model():
    """Fit nltk's naive Bayes classifier on the names dataset and report it.

    Names are read from ``MALE_FILE`` and ``FEMALE_FILE`` (one per line),
    shuffled, featurized with ``nltk_featurize`` and split at ``TRAIN_PCT``.
    The test-set accuracy (as an integer percentage) is printed, followed by
    the ten most informative features.
    """
    # Each element is a (name, gender) tuple; male names are read first.
    all_names = []
    for path, gender in ((MALE_FILE, "male"), (FEMALE_FILE, "female")):
        with open(path, "r") as handle:
            for line in handle:
                all_names.append((line.rstrip(), gender))

    # Sanity check on the size of the combined dataset.
    assert len(all_names) == 7944

    random.shuffle(all_names)

    # Each element is a ({'feature_type': feature_value}, gender) tuple.
    features = [(nltk_featurize(name), gender) for name, gender in all_names]
    split_pt = int(TRAIN_PCT * len(features))
    train_set = features[:split_pt]
    test_set = features[split_pt:]

    nb = NaiveBayesClassifier.train(train_set)
    percent = int(100 * nltk.classify.accuracy(nb, test_set))
    print("accuracy = {0} %".format(percent))
    nb.show_most_informative_features(10)
def test_raw_mail(org_email):
    """Return a ``{token: True}`` feature dict for one email text.

    The text is tokenized with ``word_tokenize``; every token is lower-cased
    and lemmatized with ``word_limit`` and kept only if it is not in
    ``stpwords``.
    """
    lemmas = [word_limit.lemmatize(token.lower())
              for token in word_tokenize(org_email)]
    return {lemma: True for lemma in lemmas if lemma not in stpwords}


# Extract the features (tokenized, lemmatized, stop words removed) from
# every email.
# NOTE(review): this script calls ``raw_mail``, which is not defined in this
# block -- confirm it exists elsewhere and is not meant to be test_raw_mail.
feature_sets = [(raw_mail(n), g) for (n, g) in mail_shuffle]

# Hold out the first 10% of the feature sets for testing; train on the rest.
size_feature = int(len(feature_sets) * 0.10)
test_set = feature_sets[:size_feature]
train_set = feature_sets[size_feature:]
classifier = NaiveBayesClassifier.train(train_set)

# Report the accuracy on the held-out emails.
print ('accuracy of the machine: ', (classify.accuracy(classifier,test_set))*100)

# Show the top 50 features.
classifier.show_most_informative_features(50)

# Show the labels known to the classifier.
print ('labels:',classifier.labels())

# Classify emails typed in by the user, forever.
while True:
    featset = raw_mail(input("Enter text to classify: "))
    print (classifier.classify(featset))
def __init__(self, chatbot, **kwargs):
    """Set up the adapter and train its "is this a time question?" classifier.

    Keyword arguments:
        positive: optional list of example sentences that ask for the time.
        negative: optional list of example sentences that do not.

    Negative examples are labelled ``0`` and positive examples ``1``; each
    sentence is converted with ``self.time_question_features`` and the
    resulting Naive Bayes model is stored on ``self.classifier``.
    """
    super().__init__(chatbot, **kwargs)
    from nltk import NaiveBayesClassifier

    self.positive = kwargs.get('positive', [
        'what time is it',
        'hey what time is it',
        'do you have the time',
        'do you know the time',
        'do you know what time it is',
        'what is the time',
    ])

    # Each example must be its own list element: without the comma after
    # '...to do all this', Python concatenates it with 'what is it' into a
    # single bogus training sentence.
    self.negative = kwargs.get('negative', [
        'it is time to go to sleep',
        'what is your favorite color',
        'i had a great time',
        'thyme is my favorite herb',
        'do you have time to look at my essay',
        'how do you have the time to do all this',
        'what is it',
    ])

    labeled_data = (
        [(name, 0) for name in self.negative] +
        [(name, 1) for name in self.positive]
    )

    train_set = [
        (self.time_question_features(text), n) for (text, n) in labeled_data
    ]

    self.classifier = NaiveBayesClassifier.train(train_set)
def __init_naive_bayes( self ):
    """ Create and train the NaiveBayes classifier.

    Every file under ``corpora/corpus2/spam`` and ``corpora/corpus2/ham`` is
    turned into a labelled feature set via ``self.__make_featured_set`` and
    the combined set is used to train ``self.classifier``.

    Raises:
        Exception: wraps any ordinary error raised while building the
            training set; its args carry a fixed prefix, the original
            exception's class name, the file name and line number of the
            frame that handled it, and the original message.
    """
    try:
        # The corpus is fixed; it used to be selectable interactively.
        corpus = 'corpus2'
        path = os.path.join('corpora/', corpus)
        spam_path = os.path.join(path, 'spam')
        ham_path = os.path.join(path, 'ham')

        train_spam_filelist = [os.path.join(spam_path, f)
                               for f in os.listdir(spam_path)]
        train_ham_filelist = [os.path.join(ham_path, f)
                              for f in os.listdir(ham_path)]

        train_spam_set = self.__make_featured_set(train_spam_filelist, 'spam')
        train_ham_set = self.__make_featured_set(train_ham_filelist, 'ham')
        train_set = train_spam_set + train_ham_set

        self.classifier = NaiveBayesClassifier.train( train_set )

    # Catch Exception rather than everything, so KeyboardInterrupt and
    # SystemExit are no longer converted into a generic Exception.
    except Exception as exc:
        traceback = sys.exc_info()[2]
        # str(exc) replaces the ``.message`` attribute, which is missing on
        # Python 3 and empty for many exception types on Python 2.
        raise Exception( "Unexpected error in SpamFilter: __spamFilter:",
                         type(exc).__name__,
                         os.path.basename( traceback.tb_frame.f_code.co_filename ),
                         traceback.tb_lineno,
                         str(exc) )
def check_classifier(feature_extractor, **kwargs):
    '''
    Train the classifier on the training spam and ham, then check its
    accuracy on the test data, and show the classifier's most informative
    features.

    feature_extractor and **kwargs are forwarded unchanged to
    make_train_test_sets, which supplies the (features, label) data.
    Nothing is returned; all results are printed.
    '''
    # Make training and testing sets of (features, label) data
    train_set, test_spam, test_ham = \
        make_train_test_sets(feature_extractor, **kwargs)

    # Train on the combined training set, then score spam and ham separately
    classifier = NaiveBayesClassifier.train(train_set)
    spam_accuracy = nltk.classify.accuracy(classifier, test_spam)
    ham_accuracy = nltk.classify.accuracy(classifier, test_ham)

    # How accurate is the classifier on the test sets?
    print('Test Spam accuracy: {0:.2f}%'.format(100 * spam_accuracy))
    print('Test Ham accuracy: {0:.2f}%'.format(100 * ham_accuracy))

    # Show the top 20 informative features. The method prints its own
    # report and returns None, so wrapping it in print only added a stray
    # "None" line.
    classifier.show_most_informative_features(20)
def train_nltk(data, labels):
    '''
    Returns the trained nltk.NaiveBayesClassifier with the best fold accuracy

    Inputs
    ---------
    data -- np.array of tuples
    labels -- indexed with the same fold index arrays as data

    The data is split into N_FOLDS shuffled folds; one model is trained per
    fold, its accuracy on the held-out part is printed, and the first model
    reaching the highest accuracy is kept. That model's 30 most informative
    features are shown before it is returned.
    '''
    # Folds are shuffled: for now only the post text itself is assumed to
    # matter for the offensiveness measure, not who is talking to whom.
    folds = cv.KFold(n=len(data), n_folds=N_FOLDS, shuffle=True)

    best_model = None
    max_acc = float('-inf')
    for train_index, test_index in folds:
        features_train = bulk_extract_features(data[train_index])
        features_test = bulk_extract_features(data[test_index])

        train_set = zip(features_train, labels[train_index])
        test_set = zip(features_test, labels[test_index])

        model = nbc.train(train_set)
        acc = nltk.classify.accuracy(model, test_set)
        print(str(acc))

        if acc > max_acc:
            max_acc, best_model = acc, model

    best_model.show_most_informative_features(30)
    return best_model
def __init__(self, **kwargs):
    """Set up the adapter and train its "is this a time question?" classifier.

    Sentences in ``self.negative`` are labelled ``0`` and sentences in
    ``self.positive`` are labelled ``1``. Each sentence is converted with
    ``self.time_question_features`` and the trained Naive Bayes model is
    stored on ``self.classifier``.
    """
    super(TimeLogicAdapter, self).__init__(**kwargs)
    from nltk import NaiveBayesClassifier

    self.positive = [
        'what time is it',
        'do you know the time',
        'do you know what time it is',
        'what is the time',
    ]

    self.negative = [
        'it is time to go to sleep',
        'what is your favorite color',
        'i had a great time',
        'what is',
    ]

    # Negative examples come first, then positive ones.
    train_set = []
    for label, sentences in ((0, self.negative), (1, self.positive)):
        for sentence in sentences:
            train_set.append((self.time_question_features(sentence), label))

    self.classifier = NaiveBayesClassifier.train(train_set)
def category_by_movie():
    """Train a Naive Bayes classifier on movie reviews and print its accuracy.

    Each review in the ``movie_reviews`` corpus becomes a feature dict with
    one ``contains(<word>)`` boolean per word in ``word_features``. The
    first 100 shuffled documents are held out as the test set and the
    accuracy on that held-out set is printed.
    """
    from nltk.corpus import movie_reviews as mr
    from nltk import FreqDist
    from nltk import NaiveBayesClassifier
    from nltk import classify
    import random

    documents = [(list(mr.words(f)), c)
                 for c in mr.categories()
                 for f in mr.fileids(c)]
    random.shuffle(documents)

    all_words = FreqDist(w.lower() for w in mr.words())
    # NOTE(review): this takes the first 2000 keys in the FreqDist's own key
    # order -- confirm that order is "most frequent first" for the installed
    # nltk version.
    word_features = list(all_words.keys())[:2000]

    def document_features(document):
        """Return {'contains(word)': bool} for every word in word_features."""
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    features = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = features[100:], features[:100]
    classifier = NaiveBayesClassifier.train(train_set)

    # Score on the held-out documents; scoring on train_set (as before)
    # measures memorisation and left test_set unused.
    print(classify.accuracy(classifier, test_set))
def train(self):
    """Train the classifier from the catalog's noun terms and subjects.

    For every catalogued object present in both the ``noun_terms`` and the
    ``Subject`` index, a noun-presence dict (1 for the object's nouns, 0 for
    every other known noun) is paired with each of the object's subject
    tags. ``self.classifier`` is only (re)assigned when at least one such
    pair exists.
    """
    catalog = getToolByName(self, "portal_catalog")

    # Template with every known noun marked absent.
    presentNouns = dict.fromkeys(catalog.uniqueValuesFor("noun_terms"), 0)

    subjectIndex = catalog._catalog.getIndex("Subject")
    nounTermsIndex = catalog._catalog.getIndex("noun_terms")

    # Internal catalog ids of the objects that have noun terms / subjects.
    nounTermIndexIds = IISet(nounTermsIndex._unindex.keys())
    subjectIndexIds = IISet(subjectIndex._unindex.keys())

    trainingData = []
    for cid in intersection(subjectIndexIds, nounTermIndexIds):
        nounPresence = presentNouns.copy()
        for noun in nounTermsIndex._unindex[cid]:
            nounPresence[noun] = 1
        # The same presence dict is paired with every tag of the object.
        trainingData.extend(
            (nounPresence, tag) for tag in subjectIndex._unindex[cid])

    if trainingData:
        self.classifier = NaiveBayesClassifier.train(trainingData)
def buildclassifiers(featureslist, SAMPLE_PROPORTION, n):
    """Train and evaluate three classifier types, n runs each.

    For each of 'Naive Bayes', 'Logistic Regression' and 'Linear SCV' the
    feature list is shuffled in place, split with ``buildsets`` and a
    classifier is trained and evaluated ``n`` times. Performance measures
    are summed element-wise over the runs and handed to
    ``printperformance``.

    Args:
        featureslist: labelled feature sets; shuffled in place on every run.
        SAMPLE_PROPORTION: forwarded to ``buildsets``.
        n: number of runs per classifier type; must be at least 1.

    Returns:
        A list with the last classifier trained for each type, in the order
        listed above.

    Raises:
        ValueError: if ``n`` is smaller than 1 (previously this surfaced as
            an unbound-variable error after the empty inner loop).
    """
    if n < 1:
        raise ValueError("n must be at least 1, got %r" % (n,))

    classnames = ['Naive Bayes', 'Logistic Regression', 'Linear SCV']
    allclassifiers = []
    for name in classnames:
        for i in range(n):
            random.shuffle(featureslist)
            train_set, test_set = buildsets(featureslist, SAMPLE_PROPORTION)

            if name == 'Naive Bayes':
                spamclassifier = NaiveBayesClassifier.train(train_set)
            elif name == 'Logistic Regression':
                spamclassifier = SklearnClassifier(LogisticRegression())
                spamclassifier.train(train_set)
            elif name == 'Linear SCV':
                spamclassifier = SklearnClassifier(LinearSVC(C=0.01))
                spamclassifier.train(train_set)

            perfmeasures_i = evaluate(train_set, test_set, spamclassifier, name)
            if i == 0:
                perfmeasures_n = perfmeasures_i
            else:
                # Materialise the sums: a bare map() is a lazy one-shot
                # iterator on Python 3 and would nest run after run.
                perfmeasures_n = list(map(add, perfmeasures_n, perfmeasures_i))

        # Store last classifier built per model
        allclassifiers.append(spamclassifier)
        # Print performance measures per classifier
        printperformance(name, perfmeasures_n, n)
    return allclassifiers
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 when denominator is 0."""
    if denominator == 0:
        return 0.0
    return numerator / float(denominator)


def get_matrix(spam_set, ham_set, num_folds):
    '''
    Average precision, recall, F1 and per-class accuracy over K folds.

    The folds come from utils.get_kfold_data(spam_set, ham_set, num_folds).
    Spam is the positive class: a spam test item counts as a true positive
    when the classifier predicts label 0, a ham test item counts as a true
    negative when the classifier predicts label 1.

    Returns a 5-tuple:
        (precision, recall, F1, spam accuracy in %, ham accuracy in %),
    each averaged over num_folds. A metric whose denominator is zero in a
    fold (e.g. nothing was predicted as spam) contributes 0.0 for that fold
    instead of raising ZeroDivisionError.
    '''
    total_precision = total_recall = F1 = spam_accuracy = ham_accuracy = 0

    for train_set, test_spam_set, test_ham_set in \
            utils.get_kfold_data(spam_set, ham_set, num_folds):
        classifier = NaiveBayesClassifier.train(train_set)

        true_positive = false_positive = true_negative = false_negative = 0

        # test[0] is the feature part of each test item.
        for test in test_spam_set:
            if classifier.classify(test[0]) == 0:
                true_positive += 1
            else:
                false_negative += 1

        for test in test_ham_set:
            if classifier.classify(test[0]) == 1:
                true_negative += 1
            else:
                false_positive += 1

        precision = _safe_ratio(true_positive, true_positive + false_positive)
        recall = _safe_ratio(true_positive, true_positive + false_negative)

        F1 += _safe_ratio(2 * precision * recall, precision + recall)
        spam_accuracy += _safe_ratio(true_positive,
                                     true_positive + false_negative)
        ham_accuracy += _safe_ratio(true_negative,
                                    true_negative + false_positive)
        total_precision += precision
        total_recall += recall

    return (total_precision / num_folds,
            total_recall / num_folds,
            F1 / num_folds,
            spam_accuracy * 100 / num_folds,
            ham_accuracy * 100 / num_folds)
def train_classifiers(self):
    """Train one Naive Bayes classifier per word in ``self.senses``.

    ``self.senses[word]`` is read as a mapping from a sense id to an
    iterable of LSA vectors; each vector is converted with ``dict(...)`` and
    paired with its sense id. The trained model for each word is stored in
    ``self.classifiers[word]``.
    """
    for word in self.senses:
        vectors_by_sense = self.senses[word]
        train_set = [
            [dict(lsa_vector), senseId]
            for senseId in vectors_by_sense
            for lsa_vector in vectors_by_sense[senseId]
        ]
        self.classifiers[word] = NaiveBayesClassifier.train(train_set)
def training(features, method, proportion_training):
    """Split ``features`` and train a classifier on the leading part.

    Args:
        features: sequence of labelled feature sets.
        method: name of the classifier to train; only ``'NaiveBayes'`` is
            supported.
        proportion_training: fraction of ``features`` (taken from the
            front) used for training; the remainder becomes the test set.

    Returns:
        A ``(training_set, testing_set, classifier)`` tuple.

    Raises:
        ValueError: if ``method`` is not supported. Previously this case
            fell through to the return statement with ``classifier`` never
            assigned.
    """
    if method != 'NaiveBayes':
        raise ValueError("unsupported training method: %r" % (method,))

    split = int(proportion_training * len(features))
    training_set = features[:split]
    testing_set = features[split:]

    classifier = NaiveBayesClassifier.train(training_set)
    return training_set, testing_set, classifier
def train(self, foldPercent=.8):
    """Split the built features and train ``self.classifier`` on the front part.

    The first ``foldPercent`` share of ``self.buildFeatures()`` is stored as
    ``self.setTrain`` and used for training; the rest is stored as
    ``self.setTest``.
    """
    features = self.buildFeatures()
    cut = int(foldPercent * len(features))
    self.setTrain, self.setTest = features[:cut], features[cut:]
    self.classifier = nbc.train(self.setTrain)
def train(features, samples_proportion):
    """Split ``features`` and train a Naive Bayes classifier on the front part.

    The first ``samples_proportion`` share of ``features`` is the training
    set, the rest the test set; both sizes are printed. The classifier is
    trained on a tuple copy of the training set.

    Returns ``(train_set, test_set, classifier)``.
    """
    cut = int(len(features) * samples_proportion)
    train_set = features[:cut]
    test_set = features[cut:]

    for caption, subset in (('Training set size = ', train_set),
                            ('Test set size = ', test_set)):
        print (caption + str(len(subset)) + ' emails')

    classifier = NaiveBayesClassifier.train(tuple(train_set))
    return train_set, test_set, classifier
def textClass():
    """Train rating and usefulness classifiers from the reviews in all.txt.

    Up to 150000 records are read with ``readRec``. For each record the
    review words (stripped of surrounding punctuation) become a word feature
    dict that is paired once with the record's score and once with the
    class computed by ``parse4ftrs``.

    Returns:
        ``(rate_cl, use_cl)``: the Naive Bayes classifier for ratings and
        the one for usefulness.
    """
    ratings = list()      # (word features, rating) pairs
    usefulness = list()   # (word features, usefulness class) pairs
    tot_recs = 0          # number of records actually parsed
    len_tot = 0
    mlen = 0

    # The with-block closes the file even if a record fails to parse.
    with open("all.txt") as dbFile:
        # Parse the file and build the lists passed to the classifiers.
        while tot_recs < 150000:
            if tot_recs % 1000 == 0:
                print("num records: %s" % (tot_recs,))

            raw_rec = readRec(dbFile)
            if len(raw_rec) == 0:
                break
            # Count the record only once it is known to exist, so the
            # end-of-file read no longer inflates the total.
            tot_recs += 1

            review_text = [word.strip(punctuation) for word in raw_rec["text"]]
            rate_val = str( raw_rec["score"][0] )

            prs_rec = parse4ftrs(raw_rec)
            len_tot += prs_rec["length"]
            if prs_rec["length"] > mlen:
                mlen = prs_rec["length"]
            use_val = str( prs_rec["class"] )

            # Word feature dictionary shared by both training sets.
            wfd = word_feats(review_text)
            ratings.append( ( wfd , rate_val) )
            usefulness.append( ( wfd, use_val) )

    avg_len = len_tot / tot_recs if tot_recs else 0
    print("avg length: %s" % (avg_len,))
    print("max len: %s" % (mlen,))

    rate_cl = NaiveBayesClassifier.train(ratings)
    use_cl = NaiveBayesClassifier.train(usefulness)
    return rate_cl, use_cl
def evaluate_classifier(train_set, test_spam, test_ham):
    """
    Train NLTK's NaiveBayesClassifier on train_set (spam + ham), print its
    accuracy on the test spam and on the test ham, and finally show the 20
    most informative features. Nothing is returned.
    """
    classifier = NaiveBayesClassifier.train(train_set)

    spam_accuracy = nltk.classify.accuracy(classifier, test_spam)
    ham_accuracy = nltk.classify.accuracy(classifier, test_ham)
    print("Test Spam accuracy: {0:.2f} %".format(100 * spam_accuracy))
    print("Test Ham accuracy: {0:.2f} %".format(100 * ham_accuracy))

    # The method prints its own report and returns None; wrapping it in
    # print only added a stray "None" line.
    classifier.show_most_informative_features(20)
def train(features, samples_proportion):
    """Split ``features`` and train a Naive Bayes classifier.

    ``samples_proportion`` is the share of ``features`` (from the front)
    that forms the training set; the remainder is the test set. The size of
    each set is printed before training.

    Returns ``(train_set, test_set, classifier)``.
    """
    boundary = int(len(features) * samples_proportion)

    # Leading slice trains the model, trailing slice is kept for testing.
    train_set = features[:boundary]
    test_set = features[boundary:]
    n_train = str(len(train_set))
    n_test = str(len(test_set))
    print ('Training set size = ' + n_train + ' emails')
    print ('Test set size = ' + n_test + ' emails')

    classifier = NaiveBayesClassifier.train(train_set)
    return train_set, test_set, classifier
def buildClassifier(hamDir, spamDir):
    """Train a spam/ham classifier from two directories and save it.

    Every non-hidden file in ``spamDir`` is read as a 'spam' email and every
    non-hidden file in ``hamDir`` as a 'ham' email. The accuracy of a model
    trained on a shuffled 70:30 split is printed first; then a model trained
    on all emails is saved to ``full-classifier.pickle`` via
    ``saveClassifier``.
    """
    allEmails = []
    # glob instead of os.listdir so that hidden files are ignored.
    for directory, label in ((spamDir, 'spam'), (hamDir, 'ham')):
        for path in glob.glob(directory + "/*"):
            # The with-block closes the handle even if read() fails.
            with open(path) as f:
                allEmails.append((f.read(), label))

    # Shuffle so that the 70:30 accuracy check sees both labels on each side.
    random.shuffle(allEmails)

    # One feature set per email.
    features = [(emailFeatures(email), label) for (email, label) in allEmails]

    # 70:30 ratio for training:testing
    print("Using a 70:30 ratio for training:testing, the accuracy is as follows: ")
    totalSize = int(len(features) * 0.7)
    trainingEmails, testingEmails = features[:totalSize], features[totalSize:]
    print("training size: %d; testing size: %d" % (len(trainingEmails), len(testingEmails)))
    classifier = NaiveBayesClassifier.train(trainingEmails)
    print(classify.accuracy(classifier, testingEmails))

    print("Now creating and saving a full size classifier made up of %d emails..." % len(features))
    classifier = NaiveBayesClassifier.train(features)
    saveClassifier(classifier, "full-classifier.pickle")
def __init__(self,classifierType):
    """Train a document classifier from ``sfIsGood.csv``.

    The CSV file sits next to this module; its first row is skipped as a
    header. Column 0 is read as the title, column 3 as the body, column 6 is
    compared with ``'invalid'`` and column 10 is the driver label.

    Args:
        classifierType: ``'driver'`` trains on the rows that are not
            invalid, with the driver as label; ``'invalid'`` trains on
            every row, with ``'True'``/``'False'`` (is the row invalid) as
            label.

    Raises:
        ValueError: for any other ``classifierType`` (previously this
            surfaced later as an unbound ``documents`` variable).

    Sets ``self.word_features``, ``self.training_set`` and
    ``self.classifier``.
    """
    if classifierType not in ('driver', 'invalid'):
        raise ValueError("unknown classifierType: %r" % (classifierType,))

    dirname = os.path.dirname(__file__)
    with open(os.path.join(dirname,'sfIsGood.csv'), 'rb') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',')
        next(spamreader, None)  # skip the header row
        # (title, body, is_invalid, driver) per data row
        records = [(row[0], row[3], row[6] == 'invalid', row[10])
                   for row in spamreader]

    if classifierType == 'driver':
        labelled = [(title, body, driver)
                    for (title, body, invalid, driver) in records
                    if not invalid]
    else:
        # Use every row here. The old code took the row count from the
        # filtered (valid-only) list, silently dropping trailing rows.
        labelled = [(title, body, str(invalid))
                    for (title, body, invalid, driver) in records]

    words = []
    documents = []
    for title, body, label in labelled:
        tokens = nltk.word_tokenize(title) + nltk.word_tokenize(body)
        words += tokens
        documents.append((tokens, label))
    random.shuffle(documents)

    all_words = nltk.FreqDist(w.lower() for w in words)
    # NOTE(review): takes the first 500 keys in the FreqDist's own key
    # order -- confirm that order is "most frequent first" for the installed
    # nltk version.
    self.word_features = list(all_words.keys())[:500]
    self.training_set = [(self.document_features(d), c) for (d,c) in documents]
    self.classifier = NaiveBayesClassifier.train(self.training_set)
def naives_classifier(self, training_set, dev_set, log=0):
    """Train a Naive Bayes classifier and report its accuracy on ``dev_set``.

    The accuracy is printed as a percentage. When ``log`` equals ``1`` the
    20 most informative features are shown as well. Returns the trained
    classifier.
    """
    model = NaiveBayesClassifier.train(training_set)
    dev_percent = classify.accuracy(model, dev_set) * 100
    print('Naive Bayes accuracy dev percent: ', dev_percent)

    show_features = (log == 1)
    if show_features:
        model.show_most_informative_features(20)
    return model
def _name_features(name):
    """Return the feature dict for a name: its final letter, lower-cased."""
    # name[-1:] (not name[-1]) so that an empty name yields '' instead of
    # raising IndexError.
    return {'last_letter': name[-1:].lower()}


def user_name_classify(user_name, classifier):
    """Infer a gender for a User given any name, using a Naive Bayes classifier.

    A classifier is trained on the ``names`` corpus ('male.txt' and
    'female.txt'), skipping the first 500 labelled names, and then applied
    to ``user_name``.

    Args:
        user_name: the name to classify.
        classifier: ignored; a fresh classifier is always trained. The
            parameter is kept so existing callers keep working.

    Returns:
        ``'male'`` or ``'female'``.

    NOTE(review): the previous code passed raw name strings to the
    classifier, which expects feature dicts, and it could not run at all
    because a local called ``names`` shadowed the corpus. The last-letter
    feature used here is a simple stand-in -- confirm it is the feature set
    that was intended.
    """
    # Distinct local name: rebinding ``names`` inside the function made the
    # corpus lookup on the right-hand side fail with UnboundLocalError.
    labeled_names = (
        [(name, 'male') for name in names.words('male.txt')] +
        [(name, 'female') for name in names.words('female.txt')]
    )
    features = [(_name_features(name), gender)
                for (name, gender) in labeled_names]

    # The first 500 entries stay held out, as before.
    training_set = features[500:]
    classifier = NaiveBayesClassifier.train(training_set)
    return classifier.classify(_name_features(user_name))
def train(positiveFile='positive.csv', negativeFile='negative.csv', nOccurrences=25, trainProportion=0.9):
    """Train a pos/neg sentiment classifier from two tab-separated files.

    The second column of every row is featurified with ``featurify``; rows
    from ``negativeFile`` are labelled "neg", all others "pos". Features
    seen fewer than ``nOccurrences`` times are dropped from the saved
    feature table (the training samples themselves are not filtered).

    Side effects: writes ``features.lst``, ``classifier.pickle`` and
    ``features.pickle`` in the current directory and prints progress, the
    test accuracy and the most informative features.

    Args:
        positiveFile: path of the positive examples.
        negativeFile: path of the negative examples.
        nOccurrences: minimum count for a feature to be kept.
        trainProportion: share of the shuffled samples used for training.
    """
    tweetfeats = []
    masterfeats = {}
    for fn in (positiveFile, negativeFile):
        theclass = "neg" if fn == negativeFile else "pos"
        # The handle gets its own name: it used to share ``f`` with the
        # feature loop below and was therefore never closed.
        with open(fn, 'r') as source:
            for line in csv.reader(source, delimiter='\t'):
                text = line[1]
                if len(line) != 9:
                    print(text)
                # break up into tokens removing all non-word chars
                feat = featurify(text)
                for token in feat:
                    # The first sighting counts as 1 (it used to be stored
                    # as 0, so every count was one too low).
                    masterfeats[token] = masterfeats.get(token, 0) + 1
                if len(feat) > 0:
                    tweetfeats.append((feat, theclass))

    # Keep only the features that occur often enough.
    masterfeats = dict((token, count)
                       for token, count in masterfeats.items()
                       if count >= nOccurrences)

    with open("features.lst", "w") as out:
        out.write('\n'.join(list(masterfeats.keys())))
    print("Number of Features = %i" % len(masterfeats))

    train_cut = int(len(tweetfeats) * trainProportion)
    random.shuffle(tweetfeats)
    trainfeats = tweetfeats[:train_cut]
    testfeats = tweetfeats[train_cut:]

    print("Training sentiment classifier...")
    sys.stdout.flush()
    classifier = NaiveBayesClassifier.train(trainfeats)
    print('accuracy: %s' % (nltk.classify.util.accuracy(classifier, testfeats),))
    classifier.show_most_informative_features()
    sys.stdout.flush()

    # SAVE the classifier & features. Pickles are binary data, so the files
    # are opened in binary mode.
    with open("classifier.pickle", 'wb') as out:
        pickle.dump(classifier, out)
    with open("features.pickle", 'wb') as out:
        pickle.dump(masterfeats, out)
def classify(text, sender=None, subject=None):
    """Classify ``text`` and return the name of the most probable category.

    A classifier is trained from ``load_training_set()`` on every call. The
    features are the bag of words of the text's bigrams; ``sender`` and
    ``subject``, when given, are added as extra features. The probability
    of every category is pretty-printed before the result is returned.
    """
    classifier = NaiveBayesClassifier.train(load_training_set())

    test_data = bag_of_words(extract_bigrams(text))
    for extra_feature in (sender, subject):
        if extra_feature is not None:
            test_data[extra_feature] = True

    distribution = classifier.prob_classify(test_data)
    probabilities = {}
    for sample in distribution.samples():
        probabilities[categories[sample]] = distribution.prob(sample)
    pprint(probabilities)
    return categories[distribution.max()]
def train(self, data):
    """Train the classifier on windows of the represented data.

    ``self._represent(data)`` is stored as ``self.result_string`` and its
    symbol frequencies as ``self.labels``. Windows of ``self.n_w`` symbols,
    taken every ``self.n_w - 1`` positions, are paired with the symbol that
    follows each window and converted with ``self._gen_feature``; the
    resulting samples train ``self.classifier``.
    """
    self.result_string = self._represent(data)
    # Assigned once: the earlier defaultdict placeholder was overwritten
    # before it could be read.
    self.labels = FreqDist(self.result_string)

    samples = []
    # The range stops n_w before the end so the "next symbol" index below
    # always exists.
    for start in range(0, len(self.result_string) - self.n_w, self.n_w - 1):
        window = self.result_string[start:start + self.n_w]
        x_key = self.result_string[start + self.n_w]
        samples.append(self._gen_feature(window, x_key))

    self.classifier = NaiveBayesClassifier.train(samples)
def buildRevClassifier(self, features, normalize, validity):
    """Train a classifier that predicts a review's reviewer.

    Args:
        features: callable mapping a review to its feature dict.
        normalize: currently unused.
        validity: currently unused.

    Returns:
        A Naive Bayes classifier trained on ``(features(rev), rev.reviewer)``
        for every review in ``self.values()``, in shuffled order.
    """
    # Copy into a list first: shuffle needs a mutable sequence, and the
    # shuffled order must be the one that is actually iterated (the old
    # code shuffled a throwaway list and then re-read self.values()).
    revs = list(self.values())
    random.shuffle(revs)
    featureSets = [(features(rev), rev.reviewer) for rev in revs]
    return NaiveBayesClassifier.train(featureSets)
def cross_validate():
    """Run 10-fold cross-validation and print per-fold and average accuracy.

    The training set from ``load_training_set()`` is shuffled once, then
    split into 10 consecutive folds. For every fold a classifier is trained
    on the other nine folds and scored on the held-out one.
    """
    n_folds = 10
    training_set = load_training_set()
    random.shuffle(training_set)
    average = 0
    cv = KFold(len(training_set), n_folds=n_folds, indices=True,
               shuffle=False, random_state=None)

    for traincv, evalcv in cv:
        # Select samples by index. The training indices are everything
        # except the held-out fold and so are not contiguous; slicing from
        # the first to the last index pulled the evaluation fold into the
        # training data and dropped the final sample.
        train_fold = [training_set[i] for i in traincv]
        eval_fold = [training_set[i] for i in evalcv]

        classifier = NaiveBayesClassifier.train(train_fold)
        acc = accuracy(classifier, eval_fold)
        print('Range:  %s to %s' % (evalcv[0], evalcv[len(evalcv) - 1]))
        print('Accuracy: %4.2f' % acc)
        average += acc

    print('Average accuracy: %4.2f' % (average / n_folds))
def _featurize(sentence, vocabulary):
    """Return {word: bool} telling which vocabulary words occur in sentence."""
    # Tokenise once per sentence (it used to happen once per vocabulary
    # word) and test membership against a set.
    tokens = set(word_tokenize(sentence.lower()))
    return {word: (word in tokens) for word in vocabulary}


# Only the first two tags are kept, so zip() below yields at most two
# (sentence, tag) pairs.
ts = ts[:2]

# Materialise the pairs: they are iterated twice below, which a lazy zip
# object would not survive.
training_data = list(zip(tl, ts))

# Every distinct lower-cased token of the training sentences.
vocabulary = set(chain(*[word_tokenize(i[0].lower()) for i in training_data]))

feature_set = [
    (_featurize(sentence, vocabulary), tag)
    for sentence, tag in training_data
]
classifier = nbc.train(feature_set)

# Classify one sentence with the trained model.
test_sentence = tl[1]
featurized_test_sentence = _featurize(test_sentence, vocabulary)
print("test_sent: %s" % (test_sentence,))
print("tag: %s" % (classifier.classify(featurized_test_sentence),))
def train_topic_classifier(self, train_set):
    """Return a Naive Bayes classifier trained on ``train_set``."""
    return NaiveBayesClassifier.train(train_set)
def train(self, trainingData):
    """Train a Naive Bayes classifier on ``trainingData`` and keep it.

    The trained model is stored on ``self.classifier``; nothing is returned.
    """
    trained_model = NaiveBayesClassifier.train(trainingData)
    self.classifier = trained_model
#而不是真正意义上的随机序列。 Seed就是这个算法开始计算的第一个值。所以就会出现只要seed是一样的,那么后续所有“随机”结果和顺序也都是完全一致的。 random.seed(5)#指定种子,按照特定算法生成固定的随机数 random.shuffle(data)#打乱序列顺序 #创建测试数据 input_names=['Alexander','Danielle','David','Cheryl'] #定义将被训练和测试数据的百分比 num_train = int(0.8*len(data)) #循环输入不同的长度,比较精确度 for i in range(1,6): print('\nNumber of end letters:',i) features = [(extract_features(n,i),gender) for (n,gender) in data] #将数据分成训练和测试 train_data,test_data = features[:num_train],features[num_train:] #使用训练数据构建朴素贝叶斯分类器 calssifier =NaiveBayesClassifier.train(train_data) #计算分类器的准确度 accuracy = round(100*nltk_accuracy(calssifier,test_data),2) print('Accuracy = '+str(accuracy)+'%') #使用训练的模型预测输入数据的输出 for name in input_names: print(name,'==>',calssifier.classify(extract_features(name,i)))
def main():
    """Train a tweet sentiment classifier and apply it to fetched tweets.

    Steps:
      1. Optionally download the nltk libraries (asked interactively).
      2. Load the cleaned training tokens from ``cache/`` when the negative
         cache file exists; otherwise clean and tokenize the training data
         and write both cache files.
      3. Train a Naive Bayes classifier on the first 70% of the shuffled
         data and print its accuracy on the remaining 30%.
      4. Classify the tweets from ``fetch_featured_tweets()``, print the
         positive ones and some summary statistics.

    Returns:
        0
    """
    should_download = input("Do you need to download nltk libraries? [y/n] ")
    if should_download == "y":
        download_nltk_libraries()

    analysis = SentimentAnalysis()

    # If the cleaned and tokenized data is already cached, pull from that
    if os.path.isfile('cache/cleaned_training_data_negative_cache.csv'):
        cleaned_positive_content = read_cache(
            'cache/cleaned_training_data_positive_cache.csv')
        cleaned_negative_content = read_cache(
            'cache/cleaned_training_data_negative_cache.csv')
        print("Read from cache")
    else:
        # Otherwise, clean and tokenize the data and then cache it.
        split_training_file()
        positive_tokens = analysis.tokenize_training_model(positive_tweets)
        negative_tokens = analysis.tokenize_training_model(negative_tweets)
        cleaned_positive_content = analysis.clean_content(positive_tokens)
        cleaned_negative_content = analysis.clean_content(negative_tokens)
        write_header('cache/cleaned_training_data_positive_cache.csv')
        write_header('cache/cleaned_training_data_negative_cache.csv')
        write_cache('cache/cleaned_training_data_positive_cache.csv',
                    cleaned_positive_content)
        write_cache('cache/cleaned_training_data_negative_cache.csv',
                    cleaned_negative_content)

    positive_content_for_model = analysis.prepare_content_for_model(
        cleaned_positive_content)
    negative_content_for_model = analysis.prepare_content_for_model(
        cleaned_negative_content)

    # Pair every feature dict with its label, as the trainer expects.
    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_content_for_model]
    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_content_for_model]

    # Recombine the positive and negative halves and randomize the order.
    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)

    # Train on the first 70%, test on the last 30%. The split point is
    # derived from the data size (1,120,000 for the full 1.6 million
    # tweets), so a smaller dataset no longer ends up with an empty test
    # set. Integer arithmetic avoids float rounding.
    split_index = len(dataset) * 7 // 10
    train_data = dataset[:split_index]
    test_data = dataset[split_index:]

    print("Training using dataset")
    classifier = NaiveBayesClassifier.train(train_data)
    print("Accuracy is:", classify.accuracy(classifier, test_data))
    # Prints its own report and returns None, so it is not wrapped in print.
    classifier.show_most_informative_features(10)

    # After training, we can repeat the process using real data.
    tokenized_tweets, og_tweets, num_original_tweets = fetch_featured_tweets()
    assert len(tokenized_tweets) == len(og_tweets)
    cleaned_drug_tokens = analysis.clean_content(tokenized_tweets)

    print("Running network on real tweets")
    num_positives = 0
    for idx, tokens in enumerate(cleaned_drug_tokens):
        original_tweet = og_tweets[idx]
        token_dict = dict([token, True] for token in tokens)
        try:
            # Classify each tweet and only output the Positive ones.
            classified = classifier.classify(token_dict)
            if classified == 'Positive':
                num_positives += 1
                print(original_tweet, "=>", classified)
        except Exception as exc:
            # Best effort: report the failing tweet and carry on.
            print("exception:", exc)

    num_drug_tweets = len(cleaned_drug_tokens)
    # Ratios fall back to 0.0 when there is nothing to divide by.
    drug_ratio = (num_drug_tweets / num_original_tweets
                  if num_original_tweets else 0.0)
    positive_ratio = (num_positives / num_drug_tweets
                      if num_drug_tweets else 0.0)

    print("\nTotal original tweets:", num_original_tweets)
    print("Total drug related tweets:", num_drug_tweets)
    print("Percent of original tweets that are drug related:", drug_ratio)
    print("Total number of positive sentiment tweets:", num_positives)
    print("Percent of drug related tweets with positive sentiment:",
          positive_ratio)
    return 0
positive_cleaned_tokens_list) negative_tokens_for_model = get_tweets_for_model( negative_cleaned_tokens_list) #Create list that contains lists that contains our dictionary sentences and the string "possitive" positive_dataset = [(tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model] #Create list that contains lists that contains our dictionary sentences and the string "negative" negative_dataset = [(tweet_dict, "Negative") for tweet_dict in negative_tokens_for_model] #Merging the list of data dataset = positive_dataset + negative_dataset #Randomize their position random.shuffle(dataset) #split dataset in 80% training and 20% as testing value = 0.8 * len(dataset) + 1 train_dataset = dataset[:int(value)] test_dataset = dataset[int(value):] #Call and train Naives Bayes classifier classifier = NaiveBayesClassifier.train(train_dataset) #Check and print the accuracy with the testing data print("Accuracy is:", classify.accuracy(classifier, test_dataset)) #Show the 10 more important words print(classifier.show_most_informative_features(10)) #Create and run a testing tweet custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again." custom_tokens = remove_noise(word_tokenize(custom_tweet)) print(custom_tweet, classifier.classify(dict([token, True] for token in custom_tokens)))
features = {} for w in word_features: features[w] = (w in words) return features # Creating features for each review featuresets = [(find_features(rev), category) for (rev, category) in documents] # Shuffling the documents random.shuffle(featuresets) training_set = featuresets[:20000] testing_set = featuresets[20000:] classifier = NaiveBayesClassifier.train(training_set) accuracy = classify.accuracy(classifier, testing_set) MNB_clf = SklearnClassifier(MultinomialNB()) MNB_clf.train(training_set) #print("MNB_classifier accuracy percent:", (classify.accuracy(MNB_clf, test_set))*100) BNB_clf = SklearnClassifier(BernoulliNB()) BNB_clf.train(training_set) #print("BernoulliNB_classifier accuracy percent:", (classify.accuracy(BNB_clf, test_set))*100) LogReg_clf = SklearnClassifier(LogisticRegression()) LogReg_clf.train(training_set) #print("LogisticRegression_classifier accuracy percent:", (classify.accuracy(LogReg_clf, test_set))*100) SGD_clf = SklearnClassifier(SGDClassifier())
features_data = np.array(sentences) features_data_test = np.array(testSentences) k_fold = KFold(n_splits=10, random_state=1992, shuffle=True) word_features = None accuracy_scores = [] accuracy_data_scores = [] for train_set, test_set in k_fold.split(features_data): word_features = get_word_features( get_words_in_sentences(features_data[train_set].tolist())) train_features = apply_features(extract_features, features_data[train_set].tolist()) test_features = apply_features(extract_features, features_data[test_set].tolist()) classifier = NaiveBayesClassifier.train(train_features) refsets = collections.defaultdict(set) testsets = collections.defaultdict(set) testdata_features = apply_features(extract_features, features_data_test.tolist()) refdatasets = collections.defaultdict(set) testdatasets = collections.defaultdict(set) for i, (feats, label) in enumerate(test_features): refsets[label].add(i) observed = classifier.classify(feats) testsets[observed].add(i) for i, (feats, label) in enumerate(testdata_features):
def train_model(self, data):
    """Train a Naive Bayes classifier on ``data`` and store it as ``self.model``."""
    fitted = NaiveBayesClassifier.train(data)
    self.model = fitted
def trainModel(self, train_data, test_data):
    """Return a Naive Bayes classifier trained on ``train_data``.

    ``test_data`` is accepted but not used by this method.
    """
    classifier = NaiveBayesClassifier.train(train_data)
    return classifier
def train(all_features, ratio):
    """Split ``all_features`` at ``ratio`` and train on the leading part.

    Returns ``(train_set, test_set, clf)`` where ``train_set`` is the first
    ``int(len(all_features) * ratio)`` items, ``test_set`` the remainder and
    ``clf`` the Naive Bayes classifier trained on ``train_set``.
    """
    boundary = int(len(all_features) * ratio)
    train_set = all_features[:boundary]
    test_set = all_features[boundary:]
    return train_set, test_set, NaiveBayesClassifier.train(train_set)
def main():
    """Train a news-sentiment Naive Bayes model and label recent business news.

    Steps, in order:
      1. Read ``newsSentiment.csv`` from ``INPUT_PATH``. Column 0 is used as
         the label and column 1 as the text, which is tokenized and cleaned
         with ``remove_noise``.
      2. Load the last seven days of news with ``load_news_data`` and keep
         the rows whose ``category`` equals ``'business'``.
      3. Shuffle the labeled features, train on the first three quarters and
         print the accuracy measured on the remainder.
      4. Classify every news ``title`` with ``classify_string`` and write the
         ``title`` / ``sentiment`` columns to ``data3.csv`` under
         ``TEST_OUTPUT_PATH``.

    Raises:
        KeyError: If a CSV row carries a label other than ``positive``,
            ``neutral`` or ``negative``.
    """
    model_csv = INPUT_PATH + '/newsSentiment.csv'
    stop_words = stopwords.words('english')
    model_data = {'positive': [], 'neutral': [], 'negative': []}

    with open(model_csv, newline='', encoding="ISO-8859-1") as csv_file:
        for row in csv.reader(csv_file):
            tokens = remove_noise(word_tokenize(row[1]), stop_words)
            model_data[row[0]].append(tokens)

    # use UTC time
    to_datetime = datetime.utcnow()
    from_datetime = to_datetime - timedelta(days=7)

    news_data = load_news_data(from_datetime, to_datetime)
    # Take an explicit copy of the filtered rows: the 'sentiment' column is
    # added below, and assigning to a filtered selection is ambiguous
    # (pandas warns about writing to a possible view).
    # NOTE(review): assumes load_news_data returns a pandas DataFrame - confirm.
    news_data = news_data[news_data['category'] == 'business'].copy()

    dataset = []
    for key, token_list in model_data.items():
        dataset.extend(
            (tweet_dict, key) for tweet_dict in get_tweets_for_model(token_list)
        )

    random.shuffle(dataset)

    # First 75% for training, the rest for evaluation.
    partition_number = len(dataset) * 3 // 4
    train_data = dataset[:partition_number]
    test_data = dataset[partition_number:]

    classifier = NaiveBayesClassifier.train(train_data)

    print("Accuracy is:", classify.accuracy(classifier, test_data))
    # show_most_informative_features() prints its table itself and returns
    # None, so it must not be wrapped in print().
    classifier.show_most_informative_features(10)

    news_data['sentiment'] = news_data['title'].apply(
        classify_string, args=(classifier, ))

    save_path = TEST_OUTPUT_PATH + '/data3.csv'
    results = news_data[['title', 'sentiment']]
    results.to_csv(save_path, index=False, encoding='utf-8')
def main():
    """Train a tweet sentiment classifier, report its quality, label new tweets.

    The function works on module-level state: the token containers, the
    input/output paths and column indexes, ``output_list`` and
    ``temp_matrix`` are all globals defined outside this block.

    Steps, in order:
      1. Clean the positive and negative training tweets with
         ``clean_up_tweets`` and convert them to feature dicts.
      2. Shuffle the labeled data, train on the first 70% and evaluate on
         the remaining 30% (accuracy, precision, recall, F-measure).
      3. Print the confusion matrix, plot it with matplotlib, save the
         figure and show it.
      4. Clean the tweets to predict, classify each one, append the result
         to ``output_list`` and write it with ``write_csv``.
    """
    # Only temp_matrix is rebound in this function. The other module-level
    # containers are read or mutated in place, which needs no declaration.
    global temp_matrix

    # get cleaned up tokens
    print("......Cleaning up Dataset......")
    print("...tokenizing...")
    print("...normalizing...")
    print("...Lemmatizing...")
    print("...removing stop words...\n")
    clean_up_tweets(positive_input_file_dir, train_text_column_index,
                    positive_tokens, cleaned_positive_tokens)
    print("Done: clean up positive tweets")
    clean_up_tweets(negative_input_file_dir, train_text_column_index,
                    negative_tokens, cleaned_negative_tokens)
    print("Done: clean up negative tweets\n")

    # Converting Tokens to a Dictionary:
    positive_tokens_for_model = get_tweets_for_model(cleaned_positive_tokens)
    negative_tokens_for_model = get_tweets_for_model(cleaned_negative_tokens)
    print("Done: Convert tokens to dictionaries.\n")

    # create a dataset by joining the positive and negative tweets.
    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]
    dataset = positive_dataset + negative_dataset
    print("Done: Combine dataset by joining the positive and negative tweets.")

    # random shuffle
    random.shuffle(dataset)
    print(f"positive dataset: {len(positive_dataset)} tweets.")
    print(f"negative dataset: {len(negative_dataset)} tweets.")
    print(f"combine positive & negative dataset: {len(dataset)} tweets.\n")

    print("......Training Data......")
    # splits the shuffled data into a ratio of 7:3 for training and testing;
    # the cut index is computed once so both slices always agree.
    split_index = round(len(dataset) * 0.7)
    train_data = dataset[:split_index]
    test_data = dataset[split_index:]
    print(f"train data: {len(train_data)} tweets")
    print(f"test data: {len(test_data)} tweets\n")

    print("Build & Test Naive_Bayes_Classifier Model: ")
    classifier = NaiveBayesClassifier.train(train_data)
    print("=============Accuracy====================")
    print(f"Accuracy is:{classify.accuracy(classifier, test_data)}\n")
    # show_most_informative_features() prints its table itself and returns
    # None, so it must not be wrapped in print().
    classifier.show_most_informative_features(10)

    # Collect, per label, the indexes of the test items that truly carry it
    # (refsets) and that were predicted to carry it (testsets), plus the
    # parallel label lists needed for the confusion matrix.
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    labels = []
    tests = []
    for i, (feats, label) in enumerate(test_data):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
        labels.append(label)
        tests.append(observed)

    print("=============Precision and Recall====================")
    for label in ("Positive", "Negative"):
        print(f"{label} precision: {nltk.precision(refsets[label], testsets[label])}")
        print(f"{label} recall: {nltk.recall(refsets[label], testsets[label])}")
        print(f"{label} F-measure: {nltk.f_measure(refsets[label], testsets[label])}")

    print("=============Confusion Matrix====================")
    confusion_matrix_result = nltk.ConfusionMatrix(labels, tests)
    print(confusion_matrix_result)

    # =============Visualize Confusion Matrix====================
    # The matrix has to be a numpy array for imshow; the raw counts are
    # taken from the ConfusionMatrix's private ``_confusion`` attribute.
    confusion_matrix_result = np.array(confusion_matrix_result._confusion)
    temp_matrix = confusion_matrix_result
    classes = ["Negatives", "Positives"]

    plt.figure()
    plt.imshow(confusion_matrix_result, interpolation='nearest',
               cmap=plt.cm.Blues)
    plt.title("Confusion Matrix")
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    text_format = 'd'
    # Cells darker than half of the maximum get white text for contrast.
    thresh = confusion_matrix_result.max() / 2
    n_rows, n_columns = confusion_matrix_result.shape[0], confusion_matrix_result.shape[1]
    for row, column in itertools.product(range(n_rows), range(n_columns)):
        cell = confusion_matrix_result[row, column]
        plt.text(column, row, format(cell, text_format),
                 horizontalalignment='center',
                 color='white' if cell > thresh else "black")

    plt.ylabel("True Values")
    plt.xlabel("Predicted Values")
    plt.tight_layout()
    # needs a high resolution image
    # NOTE(review): the output path is hard-coded to one user's machine.
    plt.savefig("/Users/Han/Downloads/web project data/confusion_matrix.png",
                dpi=1200)
    plt.show()

    # ==================now predict new tweets==================
    print("......Now Cleaning up new Dataset......")
    print("...tokenizing...")
    print("...normalizing...")
    print("...Lemmatizing...")
    print("...removing stop words...\n")
    clean_up_tweets(predict_input_file_dir, predict_text_column_index,
                    predict_tokens, cleaned_predict_tokens)
    print("Done: clean up predict tweets\n")

    print("...Now Deploy Bayes Classifier on new dataset...")
    for current_tweet_tokens in cleaned_predict_tokens:
        features = {token: True for token in current_tweet_tokens}
        # Each prediction is wrapped in a one-element list (one CSV row).
        output_list.append([classifier.classify(features)])

    write_csv(output_list, output_file_dir)
    print("Done! ")
def train(self, train_set):
    """Train a Naive Bayes classifier and remember it on the instance.

    Args:
        train_set: Labeled feature sets passed to
            ``NaiveBayesClassifier.train``.

    Returns:
        The classifier now stored in ``self.classifier``.
    """
    fitted = NaiveBayesClassifier.train(train_set)
    self.classifier = fitted
    return self.classifier
# Show one labeled sample from each class before the two sets are merged.
print("Dictionary with Positive class : ", positiveReviewDataset[7])
print("Dictionary with Negative class : ", negativeReviewDataset[7])

dataset = positiveReviewDataset + negativeReviewDataset
print("Dataset[0] :", dataset[0])
print("Dataset length", len(dataset))

# Shuffle in place, then use the first 7000 items for training and
# everything after them for testing.
random.shuffle(dataset)
trainData, testData = dataset[:7000], dataset[7000:]

trainedModel = NaiveBayesClassifier.train(trainData)
print("Accuracy of the model : ",
      classify.accuracy(trainedModel, testData))

# Classify one hand-written review as a quick check.
review = "This is a bad product."
reviewTokens = noiseRemoval(word_tokenize(review))
print(review, " : ",
      trainedModel.classify({token: True for token in reviewTokens}))
for palavra_unica_base_tratada in palavras_unicas_base_tratada: resultado_linha_palavra['%s' % palavra_unica_base_tratada] = ( palavra_unica_base_tratada in palavras_unicas_da_frase) # print(f'{frase}: {palavras_unicas_da_frase} : {resultado_linha_palavra}\n') return resultado_linha_palavra # Base classificada base_classificada = classify.apply_features(extrator_linha_nltk, base_sem_stop_words_stemmed) # Constrói classificador de probabilidade do Naive Bayes classificador = NaiveBayesClassifier.train(base_classificada) # Estatísticas do Classificador print( f'As classes existentes na base classificada são {classificador.labels()}\n' ) print(f'As 5 principais características são:') classificador.show_most_informative_features(5) print_space_between_logs() # Utilizando o classificador print(f'Utilizando classificador Naive Bayes para obter a classe\n') def imprimir_classificacao_frase(frase):
def _sample_review_texts(score_filter, ordering, sample_size=100):
    """Return the content of up to ``sample_size`` randomly chosen reviews.

    Review ids matching ``score_filter`` are fetched in ``ordering``; a
    random sample of them is then looked up in ``Content``. Ids with no
    content row (or a ``None`` content) are skipped rather than crashing.
    """
    review_ids = [
        review.reviewid
        for review in session.query(Review).filter(score_filter).order_by(ordering).all()
    ]
    # random.sample raises ValueError when asked for more items than exist.
    wanted = min(sample_size, len(review_ids))
    texts = []
    for review_id in random.sample(review_ids, wanted):
        row = session.query(Content).filter(
            Content.reviewid == review_id).first()
        if row is None or row.content is None:
            continue
        texts.append(row.content)
    return texts


def _labelled_features(texts, label, stop_words):
    """Tokenize and clean ``texts``, returning ``(feature_dict, label)`` pairs."""
    cleaned = [
        remove_noise(nltk.word_tokenize(text), stop_words) for text in texts
    ]
    return [(word_dict, label) for word_dict in get_for_model(cleaned)]


def get_classifier():
    """Train a Positive/Negative Naive Bayes classifier from stored reviews.

    Up to 100 reviews with ``score >= 5`` are sampled as positive examples
    and up to 100 with ``score < 5`` as negative ones. Their text is
    tokenized with ``nltk.word_tokenize``, cleaned with ``remove_noise``
    (English stop words) and converted with ``get_for_model``. The whole
    shuffled dataset is used for training; no test split is held back.

    Returns:
        The classifier produced by ``NaiveBayesClassifier.train``.
    """
    stop_words = stopwords.words("english")

    positive_reviews = _sample_review_texts(
        Review.score >= 5, Review.score.desc())
    negative_reviews = _sample_review_texts(
        Review.score < 5, Review.score)

    positive_dataset = _labelled_features(
        positive_reviews, "Positive", stop_words)
    negative_dataset = _labelled_features(
        negative_reviews, "Negative", stop_words)

    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)

    train_data = dataset
    return NaiveBayesClassifier.train(train_data)
def mine_tweets(infile: str, tweetout: str, gramout: str) -> None:
    """Classify, prune, and atomize Tweets.

    A Naive Bayes classifier is trained on the NLTK ``twitter_samples``
    positive/negative corpora, then applied to every row of the CSV file
    ``infile``. Tweets whose positive and negative probabilities differ by
    more than ``SUBJECTIVITY_THRESHOLD`` are kept; the kept tweets are
    pickled to ``tweetout`` and their 1- to 4-gram counts to ``gramout``.

    Args:
        infile: Path of the CSV file holding the Tweets to classify.
        tweetout: Path the list of kept ``Tweet`` objects is pickled to.
        gramout: Path the n-gram count tables are pickled to.
    """
    logger = logging.getLogger("miner")

    logger.info("Gathering and tokenizing positive tweets")
    positive_tweet_tokens = twitter_samples.tokenized("positive_tweets.json")
    logger.info("Gathering and tokenizing negative tweets")
    negative_tweet_tokens = twitter_samples.tokenized("negative_tweets.json")

    logger.info("Cleaning model tokens")
    positive_cleaned_tokens_list = [
        normalize(tokens) for tokens in positive_tweet_tokens
    ]
    negative_cleaned_tokens_list = [
        normalize(tokens) for tokens in negative_tweet_tokens
    ]

    logger.info("Building Tweet corpus")
    positive_tokens_for_model = get_tweets_for_model(
        positive_cleaned_tokens_list)  # type: ignore
    negative_tokens_for_model = get_tweets_for_model(
        negative_cleaned_tokens_list)  # type: ignore

    # Label every feature dict with its class.
    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    # Create unified dataset and shuffle it
    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)

    # Train on the first 70% and test on the last 30%. The cut is derived
    # from the dataset size (7000 for a 10000-item dataset) so the split
    # matches the log message whatever the corpus size is.
    logger.info("70% training, 30% testing")
    split_index = len(dataset) * 7 // 10
    train_data = dataset[:split_index]
    test_data = dataset[split_index:]

    logger.info("Training...")
    classifier = NaiveBayesClassifier.train(train_data)
    logger.info("Accuracy is: %s", classify.accuracy(classifier, test_data))

    logger.info("Classifying Tweets")
    tweets = []
    # newline="" lets the csv module handle line endings inside quoted
    # fields itself, as the csv documentation requires.
    with open(infile, "r", newline="") as csv_file:
        logger.info("Opened %s", infile)
        csv_reader = csv.reader(csv_file, delimiter=",")
        logger.info("Attached CSV reader to %s successfully", infile)

        # Rows rejected for not being subjective enough.
        subject_reject: int = 0

        # ``counter`` is the number of rows fully processed so far.
        for counter, row in enumerate(csv_reader):
            # Progress report every DIVISION rows.
            if not counter % DIVISION:
                logger.info("Read in %s Tweets so far...", counter)
            # For debugging: stop after MAX_TWEETS rows.
            if counter == MAX_TWEETS:
                break

            new_tweet = Tweet(row)
            dist = classifier.prob_classify(
                {token: True
                 for token in new_tweet.cleaned_tokens}  # type: ignore
            )
            new_tweet.positivity = dist.prob("Positive")
            new_tweet.negativity = dist.prob("Negative")
            new_tweet.difference = abs(new_tweet.positivity -
                                       new_tweet.negativity)

            # Keep only Tweets the classifier is decisive about.
            if new_tweet.difference > SUBJECTIVITY_THRESHOLD:
                tweets.append(new_tweet)
            else:
                subject_reject += 1

    logger.info("Processed %s Tweets", len(tweets))
    logger.info("%s Tweets were rejected for not being subjective enough",
                subject_reject)

    # Pickle Tweets; the context manager guarantees the file is flushed and
    # closed (the handle was previously left open).
    with open(tweetout, "wb") as tweetout_fp:
        pickle.dump(tweets, tweetout_fp)
    logger.info("Pickled %s Tweets", len(tweets))

    # One count table per n; index 0 is never filled, so table ``i`` holds
    # the i-gram counts for i in 1..4.
    gram_scores: List[Dict[str, int]] = [{}, {}, {}, {}, {}]

    for i in range(1, 5):
        logger.info("Creating %s-grams", i)
        scores = gram_scores[i]
        for tweet in tweets:  # type: ignore
            for gram in ngrams(tweet.cleaned_tokens, i):  # type: ignore
                scores[gram] = scores.get(gram, 0) + 1

    # Serialize n-grams to file
    with open(gramout, "wb") as gramout_fp:
        pickle.dump(gram_scores, gramout_fp)
# Positive and negative reviews as (bag-of-words, label) pairs.
pos_features = [(bag_of_words(words), 'pos') for words in pos_reviews]
neg_features = [(bag_of_words(words), 'neg') for words in neg_reviews]

shuffle(pos_features)
shuffle(neg_features)

# The first 200 shuffled items of each class are held out for testing;
# everything else is used for training.
test_feature_set = pos_features[:200] + neg_features[:200]
train_feature_set = pos_features[200:] + neg_features[200:]

classifier = NBC.train(train_feature_set)

accuracy = classify.accuracy(classifier, test_feature_set)
print(accuracy)

# Interactive loop: classify reviews typed by the user until an empty line.
while True:
    custom_review = input(
        "Enter a custom movie review (Press ENTER key to exit):\n")
    if not custom_review:
        break
    custom_review_tokens = word_tokenize(custom_review)
    custom_feature_set = bag_of_words(custom_review_tokens)
    print(classifier.classify(custom_feature_set))
# Load the stop-word list once and keep it as a set. The list was previously
# reloaded and scanned linearly for every one of the 10000 candidate words.
english_stopwords = set(stopwords.words("english"))

# The 10000 most frequent words, minus stop words and punctuation.
common_words = [
    word for word, freq in words_freqs.most_common(10000)
    if (word not in english_stopwords) and (word not in ponctuation)
]

print(common_words[:100])


# -------Functions---------------------------------------------------------
def find_features(document, com_words=common_words):
    """Return a ``{word: bool}`` dict marking which common words occur.

    Args:
        document: Iterable of the words of one document.
        com_words: Words to test for; defaults to the ``common_words`` list
            as it exists when this function is defined.

    Returns:
        A dict with one key per entry of ``com_words`` whose value is True
        when that word appears in ``document``.
    """
    words = set(document)
    return {w: (w in words) for w in com_words}


# ---------------------------------------------------------------------------

feature_sets = [(find_features(text), category)
                for (text, category) in documents]

# First 1900 feature sets for training, the rest for testing.
data = {}
data["train"] = feature_sets[:1900]
data["test"] = feature_sets[1900:]

clf = NaiveBayesClassifier.train(data["train"])
# acc: 85.095
# acc = classify.accuracy(clf, data["test"])*100

clf.show_most_informative_features(10)

# ------TEST------------------------------------
# Classify one negative review; the result is not stored or printed.
rev_name = movie_reviews.fileids("neg")[11]
text = movie_reviews.words(rev_name)
clf.classify(find_features(text))
negative_cleaned_tokens_list) positive_dataset = [(tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model] negative_dataset = [(tweet_dict, "Negative") for tweet_dict in negative_tokens_for_model] dataset = positive_dataset + negative_dataset random.shuffle(dataset) train_data = dataset[:7000] test_data = dataset[7000:] sem_classifier = NaiveBayesClassifier.train(train_data) print("Accuracy is:", classify.accuracy(sem_classifier, test_data)) print(sem_classifier.show_most_informative_features(10)) custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again." custom_tokens = remove_noise(word_tokenize(custom_tweet)) print( custom_tweet, sem_classifier.classify(dict([token, True] for token in custom_tokens))) custom_tweet = "My daughter has been at MEM airport for almost 7 hours trying to fly #unitedAIRLINES to houston. #unitedair what are you going to do???"
### Get our texts into the format NLTK expects for its classifier
negative_featurized = [{word: True for word in review}
                       for review in negative_min_df]
positive_featurized = [{word: True for word in review}
                       for review in positive_min_df]

negative_tagged = [(review, 'negative') for review in negative_featurized]
positive_tagged = [(review, 'positive') for review in positive_featurized]

all_tagged = negative_tagged + positive_tagged

### Train the classifier
classifier = NaiveBayesClassifier.train(all_tagged)

### Import, process, featurize new set of movie reviews
ebert_path = 'movie_reviews/ebert/'
ebert_files = os.listdir(ebert_path)

# Read every review through a context manager so each file handle is
# closed as soon as it has been read.
ebert_reviews = []
for name in ebert_files:
    with open(ebert_path + name) as review_file:
        ebert_reviews.append(review_file.read())

ebert_tokenized = [word_tokenize(review.lower()) for review in ebert_reviews]
ebert_no_stops = [[word for word in review if word not in stopword_set]
                  for review in ebert_tokenized]
# Lemmatize the stop-word-filtered tokens. Lemmatizing ebert_tokenized, as
# was done before, silently discarded the stop-word removal above.
ebert_lemmatized = [[wnl.lemmatize(word) for word in review]
                    for review in ebert_no_stops]
ebert_set = [set(review) for review in ebert_lemmatized]
ebert_min_df = [[word for word in review if word in more_than_once_set]
                for review in ebert_set]
# A generator, as in the original: it can be consumed only once.
ebert_featurized = ({word: True for word in review}
                    for review in ebert_min_df)
features['topic'] = document[0] for word in document_words: # features['contains(%s)' % word] = (word in document_words) features[word] = (word in document_words) return features tweets = file_handler.load_data( settings.BASE_DIR + '/sentiment_app/analyzer/dataset/full-corpus-lite.csv') data_set = nltk.classify.apply_features(extract_features, tweets) # training_set = data_set[:len(data_set)/2] # testing_set = data_set[len(data_set)/2:] # make classifier classifier = NaiveBayesClassifier.train(data_set) def anaylze(tweet): print tweet # tweet = ("topic", "tweet string post") # accuracy & informative features # print nltk.classify.accuracy(classifier, testing_set) # print classifier.show_most_informative_features(30) # print classifier._labels # Test Classify data = preprocess(tweet[1]) feature = extract_features((tweet[0], data))
def train(self, language):
    """Train a Naive Bayes classifier on SentiWordNet glosses.

    Every row of ``./dataset/SentiWordNet_3.0.0.tsv`` becomes one training
    example: the tokens of its ``Gloss`` are the features and a score
    derived from ``PosScore`` / ``NegScore`` is the label. The shuffled
    examples are split at index 7000 into training and test data.

    Args:
        language: Language name passed to ``stopwords.words``.

    Side effects:
        Sets ``self.classifier``, ``self.total_accuracy``, ``self.refsets``
        and ``self.testsets`` (the last two as empty defaultdicts) and
        prints the accuracy.
    """
    df = read_csv("./dataset/SentiWordNet_3.0.0.tsv",
                  sep="\t",
                  header=0,
                  index_col='ID')

    labeled = []
    for _, row in df.iterrows():
        squashed_pos = numpy.tanh(row['PosScore'])
        neg_score = float(row['NegScore'])
        # tanh(PosScore) divided by NegScore when NegScore is positive,
        # tanh(PosScore) alone otherwise.
        if neg_score > 0:
            score = float(squashed_pos / neg_score)
        else:
            score = float(squashed_pos)

        try:
            tokenized = word_tokenize(row['Gloss'])
        except (TypeError, AttributeError):
            # Skip rows whose gloss is not text (e.g. a missing value);
            # other errors, such as missing tokenizer data, now surface
            # instead of being swallowed by a bare ``except``.
            continue

        # NLTK classifiers take each feature set as a dict (training
        # iterates ``featureset.items()``); a bare token list made training
        # fail, so convert the tokens to ``{token: True}``.
        features = {token: True for token in tokenized}
        labeled.append((features, score))

    # Not used below; the call is retained so its behavior for an unknown
    # ``language`` is unchanged.
    stop_words = stopwords.words(language)

    dataset = labeled
    random.shuffle(dataset)

    train_data = dataset[:7000]
    test_data = dataset[7000:]

    # NOTE(review): labels are raw float scores, so every distinct score is
    # its own class - confirm this is intended.
    self.classifier = NaiveBayesClassifier.train(train_data)
    self.total_accuracy = classify.accuracy(self.classifier, test_data)

    self.refsets = collections.defaultdict(set)
    self.testsets = collections.defaultdict(set)

    print('Total accuracy: ', self.total_accuracy)
def main():
    """Build, evaluate and return a tweet sentiment Naive Bayes classifier.

    The NLTK ``twitter_samples`` positive/negative tweets (plus optional
    custom tweets) are tokenized with ``casual_tokenize``, cleaned with
    ``helpers.remove_noise`` and turned into labeled feature dicts. The
    shuffled data is split 70:30 for training and testing.

    Returns:
        The trained classifier.
    """
    print('Building model...')
    print('Gathering training data...')

    # set nltk twitter samples as list of strings
    pos_sample_tweets = twitter_samples.strings('positive_tweets.json')
    neg_sample_tweets = twitter_samples.strings('negative_tweets.json')

    #### UPDATE HERE: Option to add your own tweet samples
    #### Remove the empty list, uncomment and update filepaths below
    pos_custom_tweets = []  ## helpers.import_csv('positive_tweets.csv')
    neg_custom_tweets = []  ## helpers.import_csv('negative_tweets.csv')

    # combine nltk twitter samples and custom tweets
    positive_tweets = pos_sample_tweets + pos_custom_tweets
    negative_tweets = neg_sample_tweets + neg_custom_tweets

    # tokenize tweets
    positive_tweet_tokens = [casual_tokenize(i) for i in positive_tweets]
    negative_tweet_tokens = [casual_tokenize(i) for i in negative_tweets]

    stop_words = stopwords.words('english')

    # clean the tokens of every tweet
    positive_cleaned_tokens_list = [
        helpers.remove_noise(tokens, stop_words)
        for tokens in positive_tweet_tokens
    ]
    negative_cleaned_tokens_list = [
        helpers.remove_noise(tokens, stop_words)
        for tokens in negative_tweet_tokens
    ]

    # convert tokens into iterable word lists
    all_pos_words = helpers.get_all_words(positive_cleaned_tokens_list)
    all_neg_words = helpers.get_all_words(negative_cleaned_tokens_list)

    # get frequency distribution of word lists
    freq_dist_pos = FreqDist(all_pos_words)
    freq_dist_neg = FreqDist(all_neg_words)

    # print top 10 positive and negative words
    print('Top 10 positive and negative words:')
    print(freq_dist_pos.most_common(10))
    print(freq_dist_neg.most_common(10))

    # convert tokens to a dictionary for modelling
    positive_tokens_for_model = helpers.get_tweets_for_model(
        positive_cleaned_tokens_list)
    negative_tokens_for_model = helpers.get_tweets_for_model(
        negative_cleaned_tokens_list)

    # assign a label to positive and negative tokens
    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    # set dataset and randomize to train model
    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)

    # Split the data 70:30. The cut follows the dataset size (7000 for the
    # default 10K sample tweets) so the ratio still holds once custom
    # tweets are added.
    split_index = len(dataset) * 7 // 10
    train_data = dataset[:split_index]
    test_data = dataset[split_index:]

    # train a Naive Bayes model
    classifier = NaiveBayesClassifier.train(train_data)

    # print model accuracy
    print("Model accuracy is:", classify.accuracy(classifier, test_data))
    # show_most_informative_features() prints its table itself and returns
    # None, so it must not be wrapped in print().
    classifier.show_most_informative_features(10)

    print('Model complete!\n')

    return classifier
return palavras def encontrarpalavrasunicas(frequencia): freq = frequencia.keys() return freq palavrasunicas = encontrarpalavrasunicas(buscafrequencia(palavras)) def extratorpalavras(documento): doc = set(documento) caracteristicas = {} for palavra in palavrasunicas: caracteristicas['%s' % palavra] = (palavra in doc) return caracteristicas classificador = NaiveBayesClassifier.train( apply_features(extratorpalavras, removestopwords(frases_padrao))) testestemming = [] stemmer = RSLPStemmer() for (palavrastreinamento) in sujeito.split(): comstem = [p for p in palavrastreinamento.split()] testestemming.append(str(stemmer.stem(comstem[0]))) print('individuo: %s - <reação da vitima = %s> ' % (sujeito, classificador.classify(extratorpalavras(testestemming))))
def _token_lists_to_features(token_lists):
    """Turn each token sequence into a ``{token: True}`` feature dict."""
    return [{token: True for token in tokens} for tokens in token_lists]


def _fit_sentiment_model(positive_token_lists, negative_token_lists,
                         dataType, save):
    """Train on labeled Positive/Negative token lists and optionally pickle."""
    pData = [(features, "Positive")
             for features in _token_lists_to_features(positive_token_lists)]
    nData = [(features, "Negative")
             for features in _token_lists_to_features(negative_token_lists)]

    dataSet = pData + nData
    random.shuffle(dataSet)
    classifier = NaiveBayesClassifier.train(dataSet)

    if save:
        # The file holds a pickle despite its ".txt" suffix.
        modelName = "./python/models/" + dataType + "BayesModel.txt"
        with open(modelName, 'wb') as f:
            pickle.dump(classifier, f)
    return classifier


def trainModel(dataType, save=True):
    """Train a Positive/Negative Naive Bayes model for the given data type.

    Args:
        dataType: ``"Twitter"`` trains on the NLTK ``twitter_samples``
            strings after ``preprocess``; ``"Movie"`` trains on the raw
            words of the ``movie_reviews`` corpus files.
        save: When true, pickle the classifier to
            ``./python/models/<dataType>BayesModel.txt``.

    Returns:
        The trained classifier, or ``None`` when ``dataType`` is neither
        ``"Twitter"`` nor ``"Movie"``.
    """
    if dataType == "Twitter":
        pTweets = twitter_samples.strings('positive_tweets.json')
        nTweets = twitter_samples.strings('negative_tweets.json')
        cleanPTweets = preprocess(pTweets, dataType)
        cleanNTweets = preprocess(nTweets, dataType)
        return _fit_sentiment_model(cleanPTweets, cleanNTweets,
                                    dataType, save)

    if dataType == "Movie":
        cleanPReviews = [movie_reviews.words(file)
                         for file in movie_reviews.fileids('pos')]
        cleanNReviews = [movie_reviews.words(file)
                         for file in movie_reviews.fileids('neg')]
        return _fit_sentiment_model(cleanPReviews, cleanNReviews,
                                    dataType, save)

    # Unknown data type: nothing is trained.
    return None
def nbtrain(train_set):
    """Return a Naive Bayes classifier trained on ``train_set``.

    Args:
        train_set: Labeled feature sets passed to
            ``NaiveBayesClassifier.train``.
    """
    return NaiveBayesClassifier.train(train_set)
def __init__(self, feat_sets):
    """Split ``feat_sets`` and set up the three Naive Bayes classifiers.

    Args:
        feat_sets: Sliceable sequence of labeled feature sets. The first
            9500 items become ``self.train_set`` and the rest
            ``self.test_set``.

    Only the NLTK classifier is trained here; the two scikit-learn wrappers
    are created untrained.
    """
    split_at = 9500
    self.train_set, self.test_set = feat_sets[:split_at], feat_sets[split_at:]

    multinomial = SklearnClassifier(MultinomialNB())
    self.Multinomial_classifier = multinomial
    bernoulli = SklearnClassifier(BernoulliNB())
    self.bernoulli_classifier = bernoulli
    self.naivebayes_classifier = NaiveBayesClassifier.train(self.train_set)
from nltk.corpus import names
from nltk import NaiveBayesClassifier
from nltk import classify

# Hand-labeled (name, gender) pairs.
# NOTE: this assignment rebinds ``names`` and so shadows the corpus imported
# above; the NLTK names corpus is not used below.
names = [('Aidar', 'boy'), ('Marat', 'boy'), ('Aslan', 'boy'),
         ('Nurbek', 'boy'), ('Nurlan', 'boy'), ('Rakhman', 'boy'),
         ('Rustam', 'boy'), ('Islam', 'boy'), ('Daulet', 'boy'),
         ('Yerkebulan', 'boy'), ('Gaziz', 'boy'), ('Aigerim', 'girl'),
         ('Aidana', 'girl'), ('Zhansaya', 'girl'), ('Karina', 'girl'),
         ('Zarina', 'girl'), ('Aiman', 'girl'), ('Sholpan', 'girl'),
         ('Kamshat', 'girl'), ('Aisulu', 'girl'), ('Alina', 'girl'),
         ('Rauan', 'boy'), ('Raikhan', 'girl')]


def gender_features(word):
    """Return the single feature used for classification: the last letter."""
    return {'last_letter': word[-1]}


featuresets = [(gender_features(n), g) for (n, g) in names]

# First 17 examples for training, the remaining ones for testing.
train_set, test_set = featuresets[:17], featuresets[17:]

nb_classifier = NaiveBayesClassifier.train(train_set)

print(nb_classifier.classify(gender_features('Leyla')))
print(classify.accuracy(nb_classifier, test_set))
# show_most_informative_features() prints its table itself and returns
# None, so it must not be wrapped in print().
nb_classifier.show_most_informative_features(5)
def category_by_pos():
    """Train a suffix-based POS tagger on Brown 'news' and print its accuracy.

    The 100 most frequent 1-, 2- and 3-character word endings of the whole
    Brown corpus are used as boolean ``endswith(...)`` features. The first
    10% of the tagged 'news' words form the test set and the remaining 90%
    the training set for a Naive Bayes classifier.

    Ported from Python 2 / NLTK 2: ``FreqDist.inc`` and slicing
    ``FreqDist.keys()`` no longer exist, and ``print`` is a function.
    """
    from nltk.corpus import brown
    from nltk import FreqDist
    from nltk import NaiveBayesClassifier
    from nltk import classify

    suffix_fdist = FreqDist()
    for word in brown.words():
        word = word.lower()
        # Count the last one, two and three characters of every word.
        for length in (1, 2, 3):
            suffix_fdist[word[-length:]] += 1

    # most_common() yields (suffix, count) pairs by decreasing frequency.
    common_suffixes = [suffix for suffix, _ in suffix_fdist.most_common(100)]

    def pos_features(word):
        """Map ``word`` to one boolean feature per common suffix."""
        lowered = word.lower()
        return {
            'endswith(%s)' % suffix: lowered.endswith(suffix)
            for suffix in common_suffixes
        }

    tagged_words = brown.tagged_words(categories='news')
    featuresets = [(pos_features(n), g) for (n, g) in tagged_words]

    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]

    classifier = NaiveBayesClassifier.train(train_set)
    print('NaiveBay %f' % classify.accuracy(classifier, test_set))