def generate_model(self):
    print("Gathering and processing tweets...")

    # List of (username, label) tuples
    tuple_list = usermapping.data_tuples.items()

    # Grab and flatten tweets for each user
    results = utils.flatten([self.fetch_data(t) for t in tuple_list])

    # TODO: Cross-validation generation
    # Shuffle, then split 85/15 into training and test sets
    trn_ratio = int(len(results) * 0.85)
    shuffle(results)
    print(len(results))
    print(trn_ratio)
    train = results[:trn_ratio]
    test = results[trn_ratio:]

    # Instantiate and train classifier
    print("Training...")
    cl = NaiveBayesClassifier(train)
    cl.train()

    # Save model
    print("Saving model...")
    utils.save_model(cl)

    # Classify the held-out test set
    print("Testing...")
    print("Accuracy: {0}".format(cl.accuracy(test)))
    return cl
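# utils.save_model is referenced above but not shown. A minimal sketch, assuming
# the classifier is simply pickled to disk (the helper body and default path are
# hypothetical, not the original implementation):
import pickle

def save_model(cl, path="model.pickle"):
    # Serialize the trained classifier so it can be reloaded without retraining
    with open(path, "wb") as f:
        pickle.dump(cl, f)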
from nltk import NaiveBayesClassifier
from nltk.tokenize import word_tokenize


def train_NLTK_NB(labeled_list_tr):
    # Vocabulary: every lowercased token across all (text, label) pairs
    all_words = set(word.lower()
                    for passage in labeled_list_tr
                    for word in word_tokenize(passage[0]))
    # One boolean word-presence feature per vocabulary word, per passage;
    # tokens are lowercased so presence checks match the lowercased vocabulary
    train_set = [({word: (word in [t.lower() for t in word_tokenize(x[0])])
                   for word in all_words}, x[1])
                 for x in labeled_list_tr]
    classifier = NaiveBayesClassifier.train(train_set)
    return classifier
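# A minimal usage sketch (the sample data is illustrative, not from the
# original): train on (text, label) tuples, then featurize new text with the
# same lowercased vocabulary the function builds internally.
labeled_list_tr = [
    ("I love this phone", "pos"),
    ("This movie was terrible", "neg"),
]
classifier = train_NLTK_NB(labeled_list_tr)

all_words = set(w.lower() for p in labeled_list_tr for w in word_tokenize(p[0]))
tokens = [t.lower() for t in word_tokenize("I love it")]
print(classifier.classify({word: (word in tokens) for word in all_words}))

# Note: all_words must be rebuilt here because train_NLTK_NB does not return
# its vocabulary; returning (classifier, all_words) would spare callers this step.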
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

positive_dataset = [(tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model]
negative_dataset = [(tweet_dict, "Negative") for tweet_dict in negative_tokens_for_model]

# Shuffle, then split at a fixed index: the first 71,601 examples train the model
dataset = positive_dataset + negative_dataset
random.shuffle(dataset)
train_data = dataset[:71601]
test_data = dataset[71601:]

classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))
classifier.show_most_informative_features(10)

# Per-year sentiment tallies
positiveNB2017 = negativeNB2017 = 0
positiveNB2018 = negativeNB2018 = 0
positiveNB2019 = negativeNB2019 = 0

for x in data2019['text']:
    custom_tokens = remove_noise(word_tokenize(str(x)))
    # Featurize with the same {token: True} dicts used for training
    # (completed from context; the original snippet was cut off mid-call)
    classification = classifier.classify(
        dict([token, True] for token in custom_tokens))
    if classification == "Positive":
        positiveNB2019 += 1
    else:
        negativeNB2019 += 1
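# get_tweets_for_model is assumed above but not shown. A minimal sketch,
# following the common NLTK convention of {token: True} feature dicts
# (the body is an assumption, not the original helper):
def get_tweets_for_model(cleaned_tokens_list):
    # Turn each tweet's token list into the feature dict that
    # NaiveBayesClassifier.train expects
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)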
def get_accuracy(test_data, classifier):
    # (signature reconstructed from the call below; the def line was missing)
    # Start from 100% and subtract one point per misclassified example,
    # printing the correctly classified ones along the way
    total = accuracy = float(len(test_data))
    for data in test_data:
        if classify_dataset(data[0]) != data[1]:
            accuracy -= 1
        else:
            print(data, classify_dataset(data[0]), data[1])
    print('Total accuracy: %f%% (%d/%d).' % (accuracy / total * 100, accuracy, total))
    final = accuracy / total * 100
    return final

# Create training and testing data: 90/10 split
sen = c[3]
emo = c[0]
l = len(c[3])
limit = (9 * l) // 10
sente = c[2]
Data = create_data(sen[:limit], emo[:limit])
test_data = create_test(sente[limit:], emo[limit:])

# Extract the word features from the training data
word_features = get_word_features(get_words_in_dataset(Data))

# Build the training set and train the Naive Bayes classifier
training_set = nltk.classify.util.apply_features(extract_features, Data)
classifier = NaiveBayesClassifier.train(training_set)

Naive_accu = get_accuracy(test_data, classifier)
print("Accuracy using Naive Bayes Component", Naive_accu, "%")
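# get_words_in_dataset, get_word_features, and extract_features are referenced
# above but not defined in this snippet. A minimal sketch, assuming bag-of-words
# presence features over the training vocabulary (all three bodies are
# assumptions, not the original helpers):
import nltk

def get_words_in_dataset(dataset):
    # Flatten every (words, label) pair into a single word list
    all_words = []
    for (words, sentiment) in dataset:
        all_words.extend(words)
    return all_words

def get_word_features(wordlist):
    # Vocabulary, ordered by frequency
    return list(nltk.FreqDist(wordlist).keys())

def extract_features(document):
    # One boolean presence feature per vocabulary word; relies on the
    # module-level word_features computed above
    document_words = set(document)
    return {'contains(%s)' % word: (word in document_words)
            for word in word_features}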