def getSentimentAnalyzer():
    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    all_pos_words = get_all_words(positive_cleaned_tokens_list)
    freq_dist_pos = FreqDist(all_pos_words)

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative") for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)

    train_data = dataset
    classifier = NaiveBayesClassifier.train(train_data)
    return classifier

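# NOTE: most snippets in this section call remove_noise (or removeNoise / cleanData / clear_data),
# get_all_words and get_tweets_for_model without defining them. The definitions below are a
# minimal sketch, assuming the common NLTK tweet-cleaning pattern (strip URLs and @-mentions,
# lemmatize by POS tag, drop punctuation and stop words, then map each tweet to a {token: True}
# dict for NaiveBayesClassifier). They match the call signatures above but are illustrative,
# not the original authors' code.
import re
import string

from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer


def remove_noise(tweet_tokens, stop_words=()):
    cleaned_tokens = []
    lemmatizer = WordNetLemmatizer()
    for token, tag in pos_tag(tweet_tokens):
        # Drop URLs and @-mentions.
        token = re.sub(r'https?://\S+|www\.\S+', '', token)
        token = re.sub(r'@[A-Za-z0-9_]+', '', token)
        # Map the Penn Treebank tag onto a WordNet POS for lemmatization.
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        token = lemmatizer.lemmatize(token, pos)
        if token and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens


def get_all_words(cleaned_tokens_list):
    # Flatten a list of token lists into a single stream of words (for FreqDist).
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token


def get_tweets_for_model(cleaned_tokens_list):
    # NaiveBayesClassifier expects one feature dict per example; mark each token as present.
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)
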
def startAnalysis():
    # tokenize positive_tweets
    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens))
    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens))

    pos_tweet_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    pos_dataset = [(tweet_dict, "Positive") for tweet_dict in pos_tweet_for_model]

    neg_tweet_for_model = get_tweets_for_model(negative_cleaned_tokens_list)
    neg_dataset = [(tweet_dict, "Negative") for tweet_dict in neg_tweet_for_model]

    dataset = pos_dataset + neg_dataset
    random.shuffle(dataset)

    train_data = dataset[:7000]
    test_data = dataset[7000:]

    classifier = NaiveBayesClassifier.train(train_data)
    print("Accuracy is:", classify.accuracy(classifier, test_data))
    classifier.show_most_informative_features(10)

def train(self):
    positive_tweet_tokens = twitter_samples.tokenized("positive_tweets.json")
    negative_tweet_tokens = twitter_samples.tokenized("negative_tweets.json")

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(self.clear_data(tokens, self.stop_words))
    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(self.clear_data(tokens, self.stop_words))

    positive_tokens_for_model = self.get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = self.get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative") for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)

    return NaiveBayesClassifier.train(dataset)

def fit(self, dataset=None):
    """
    Initialize training for the model using the sample data shipped with NLTK.
    :param dataset:
    :return: model object
    """
    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(cleanData(tokens, stop_words))
    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(cleanData(tokens, stop_words))

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative") for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)

    self.classifier = nltk.NaiveBayesClassifier.train(dataset)
    return self

def __init__(self): print("normalizing twitter samples...") self.stop_words = stopwords.words('english') self.positive_tweet_tokens = twitter_samples.tokenized( 'positive_tweets.json') self.negative_tweet_tokens = twitter_samples.tokenized( 'negative_tweets.json')
def getSentimentAnalyzer():
    stop_words = stopwords.words('english')

    posTokens = twitter_samples.tokenized('positive_tweets.json')
    negTokens = twitter_samples.tokenized('negative_tweets.json')

    posCleanedTokens = []
    negCleanedTokens = []

    for tokens in posTokens:
        posCleanedTokens.append(removeNoise(tokens, stop_words))
    for tokens in negTokens:
        negCleanedTokens.append(removeNoise(tokens, stop_words))

    allPosWords = get_all_words(posCleanedTokens)
    freqDistPos = FreqDist(allPosWords)
    print(freqDistPos.most_common(10))

    posTokensModel = get_tweets_for_model(posCleanedTokens)
    negTokensModel = get_tweets_for_model(negCleanedTokens)

    posData = [(tweet_dict, "Positive") for tweet_dict in posTokensModel]
    negData = [(tweet_dict, "Negative") for tweet_dict in negTokensModel]

    dataset = posData + negData
    random.shuffle(dataset)

    train_data = dataset
    classifier = NaiveBayesClassifier.train(train_data)
    return classifier

def preprocess_data(self):
    self.pos_tweets = twitter_samples.strings('positive_tweets.json')
    self.neg_tweets = twitter_samples.strings('negative_tweets.json')
    self.text = twitter_samples.strings('tweets.20150430-223406.json')

    self.pos_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    self.neg_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    self.pos_cleaned_tokens_list = [
        self.remove_noise(tokens) for tokens in self.pos_tweet_tokens
    ]
    self.neg_cleaned_tokens_list = [
        self.remove_noise(tokens) for tokens in self.neg_tweet_tokens
    ]

    self.all_pos_words = self.get_all_words(self.pos_cleaned_tokens_list)
    self.all_neg_words = self.get_all_words(self.neg_cleaned_tokens_list)
    self.freq_dist_pos = FreqDist(self.all_pos_words)
    self.freq_dist_neg = FreqDist(self.all_neg_words)

    self.pos_tokens_for_model = self.get_tweets_for_model(self.pos_cleaned_tokens_list)
    self.neg_tokens_for_model = self.get_tweets_for_model(self.neg_cleaned_tokens_list)

    self.pos_dataset = [(tweet_dict, "Positive") for tweet_dict in self.pos_tokens_for_model]
    self.neg_dataset = [(tweet_dict, "Negative") for tweet_dict in self.neg_tokens_for_model]

    self.dataset = self.pos_dataset + self.neg_dataset
    random.shuffle(self.dataset)

    mid = len(self.dataset) // 2
    self.train_data = self.dataset[:mid]
    self.test_data = self.dataset[mid:]

def prepare_sentiment_classifier():
    if not os.path.exists('./.venv/nltk_data'):
        os.makedirs('./.venv/nltk_data')
        nltk.download('twitter_samples', './.venv/nltk_data/')
        nltk.download('punkt', './.venv/nltk_data/')
        nltk.download('averaged_perceptron_tagger', './.venv/nltk_data/')
        nltk.download('wordnet', './.venv/nltk_data/')
        nltk.download('stopwords', './.venv/nltk_data/')

    # Normalization - conversion to canonical form:
    # POS-tag the tweet tokens and lemmatize them (strip past forms, endings, etc.),
    # then remove noise and stop words.
    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    all_pos_words = get_all_words(positive_cleaned_tokens_list)

    # List the most frequent positive words
    freq_dist_pos = FreqDist(all_pos_words)
    print(freq_dist_pos.most_common(10))

    # Prepare feature dictionaries for Naive Bayes
    positive_tokens_for_model = prepare_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = prepare_tweets_for_model(negative_cleaned_tokens_list)

    negative_sentiment_value = -10
    positive_sentiment_value = 10

    positive_dataset = [(tweet_dict, positive_sentiment_value)
                        for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, negative_sentiment_value)
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset

    # Shuffle the data to avoid bias
    random.shuffle(dataset)
    train_data = dataset[:7000]
    # The remaining 3k of the 10k tweets are used for testing
    test_data = dataset[7000:]

    # Train the model
    classifier = NaiveBayesClassifier.train(train_data)
    print("Accuracy is:", classify.accuracy(classifier, test_data))
    # print(classifier.show_most_informative_features(10))

    return classifier

def __init__(self): """ This class trains the data on 10000 tweets """ self.stop_words = stopwords.words('english') self.positive_cleaned_tokens_list = [] self.negative_cleaned_tokens_list = [] self.positive_tweets_tokens = twitter_samples.tokenized('positive_tweets.json') self.negative_tweets_tokens = twitter_samples.tokenized('negative_tweets.json')
def __init__(self):
    self.stop_words = stopwords.words('english')
    self.positive_cleaned_tokens_list = []
    self.negative_cleaned_tokens_list = []
    self.positive_tweets_tokens = twitter_samples.tokenized('positive_tweets.json')
    self.negative_tweets_tokens = twitter_samples.tokenized('negative_tweets.json')

def assign_cleans(self):
    print('Training AI...')
    print('Learning Positive Tweets')
    for tokens in twitter_samples.tokenized('positive_tweets.json'):
        self.clean_positive.append(remove_noise(tokens))
    print('Learning Negative Tweets')
    for tokens in twitter_samples.tokenized('negative_tweets.json'):
        self.clean_negative.append(remove_noise(tokens))

def load_raw_training_data(self):
    # Load the built-in NLTK training data. Use other data sets if necessary
    # to improve accuracy.
    positive_data_tokenized = twitter_samples.tokenized('positive_tweets.json')
    negative_data_tokenized = twitter_samples.tokenized('negative_tweets.json')
    return positive_data_tokenized, negative_data_tokenized

def driver():
    # String variables of the dataset
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    text = twitter_samples.strings('tweets.20150430-223406.json')
    stop_words = stopwords.words('english')

    # Tokenized variables of the dataset
    # tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    # Cleaning the noise in the tweet tokens
    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    # Frequency distribution of the cleaned words
    all_pos_words = get_all_words(positive_cleaned_tokens_list)
    freq_dist_pos = FreqDist(all_pos_words)

    # Creating positive and negative feature dictionaries
    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    # Creating the final dataset for training
    positive_dataset = [(tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative") for tweet_dict in negative_tokens_for_model]
    dataset = positive_dataset + negative_dataset

    # Shuffling for pseudo-randomness to avoid bias
    random.shuffle(dataset)

    # Dividing the shuffled data into train and test sets
    train_data = dataset[:7000]
    test_data = dataset[7000:]

    # Initializing the classifier
    classifier = NaiveBayesClassifier.train(train_data)

    custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."
    custom_tokens = remove_noise(word_tokenize(custom_tweet))
    print(custom_tweet, classifier.classify(dict([token, True] for token in custom_tokens)))

def train_model():
    """
    Trains a Naive Bayes sentiment classifier using the twitter_samples dataset from NLTK.
    Each tweet is tokenized and cleaned to produce a training dataset for the machine
    learning model.

    Parameters
    ----------

    Returns
    -------
    NaiveBayesClassifier
    """
    # Load dataset from nltk data
    positive_tweets = twitter_samples.strings("positive_tweets.json")
    negative_tweets = twitter_samples.strings("negative_tweets.json")

    # Retrieve English stop words
    stop_words = stopwords.words("english")

    # Tweet tokenization
    positive_tweet_tokens = twitter_samples.tokenized("positive_tweets.json")
    negative_tweet_tokens = twitter_samples.tokenized("negative_tweets.json")

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    # Token cleaning
    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    # Extract words from tokens
    all_pos_words = get_all_words(positive_cleaned_tokens_list)

    # Frequency distribution of words
    freq_dist_pos = FreqDist(all_pos_words)

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    # Create datasets
    positive_dataset = [(tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative") for tweet_dict in negative_tokens_for_model]

    # Merge the individual datasets into a single training set
    dataset = positive_dataset + negative_dataset
    train_data = dataset

    classifier = NaiveBayesClassifier.train(train_data)
    return classifier

def main_code(analysis_input):
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]
    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    all_pos_words = get_all_words(positive_cleaned_tokens_list)
    freq_dist_pos = FreqDist(all_pos_words)
    # print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative") for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)

    train_data = dataset[:7000]
    test_data = dataset[7000:]

    classifier = NaiveBayesClassifier.train(train_data)
    # print("Accuracy is:", classify.accuracy(classifier, test_data))
    # print(classifier.show_most_informative_features(10))

    custom_tokens = remove_noise(word_tokenize(analysis_input))
    pos_or_neg = str(classifier.classify(dict([token, True] for token in custom_tokens)))
    return pos_or_neg

def create_model():
    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
    # print(positive_tweet_tokens)
    # print(positive_cleaned_tokens_list)

    # FINDING WORD DISTRIBUTION
    # all_pos_words = get_all_words(positive_cleaned_tokens_list)
    # freq_dist_pos = FreqDist(all_pos_words)
    # print(freq_dist_pos.most_common(10))

    # MODEL PREPARATION
    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    # Attach a Positive or Negative label to each tweet
    positive_dataset = [(tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model]
    # print(positive_dataset)
    negative_dataset = [(tweet_dict, "Negative") for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)

    # 10000 tweets in total -- 70:30 ratio of training to testing
    train_data = dataset[:7000]
    test_data = dataset[7000:]

    # MODEL TRAINING
    classifier = NaiveBayesClassifier.train(train_data)

    # MODEL SAVING WITH PICKLE
    filename = 'model/model_pickle1.sav'
    pickle.dump(classifier, open(filename, 'wb'))

    # MODEL SAVING WITH cPICKLE
    filename = 'model/model_cpickle1.sav'
    cPickle.dump(classifier, open(filename, 'wb'))

    # MODEL SAVING WITH JOBLIB
    filename = 'model/model_joblib1.sav'
    joblib.dump(classifier, filename)

    # MODEL ACCURACY
    print("Accuracy is:", classify.accuracy(classifier, test_data))
    classifier.show_most_informative_features(10)

def calibrate():
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]
    stop_words = stopwords.words('english')

    # Files downloaded from setup.py, used to calibrate the classifier for sentiment analysis
    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    all_pos_words = get_all_words(positive_cleaned_tokens_list)
    freq_dist_pos = FreqDist(all_pos_words)
    # print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")  # calibrating positive
                        for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative")  # calibrating negative
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)

    train_data = dataset[:7000]
    test_data = dataset[7000:]

    global classifier
    classifier = NaiveBayesClassifier.train(train_data)  # trains the classifier
    print("Calibration complete!")
    print("Accuracy is:", classify.accuracy(classifier, test_data))

def train_social():
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    # tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]
    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    all_pos_words = get_all_words(positive_cleaned_tokens_list)
    freq_dist_pos = FreqDist(all_pos_words)
    print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative") for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)

    train_data = dataset[:7000]
    test_data = dataset[7000:]

    # Classifier - TODO Add persistence
    classifier = NaiveBayesClassifier.train(train_data)
    print("Accuracy is:", classify.accuracy(classifier, test_data))
    classifier.show_most_informative_features(100)

    custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."
    custom_tokens = remove_noise(word_tokenize(custom_tweet))
    print(custom_tweet, classifier.classify(dict([token, True] for token in custom_tokens)))

    return classifier

def main():
    custom_input = get_custom_input()
    stop_words = stopwords.words('english')
    custom_tokens = remove_noise(word_tokenize(custom_input), stop_words)
    nltk_downloader()

    sid = SentimentIntensityAnalyzer()
    print("")
    print("1: Sentiment Intensity Analysis: positive, negative, neutral percentages")

    print("Sentiment Intensity without noise removal")
    ss = sid.polarity_scores(custom_input)
    for k in sorted(ss):
        print('{0}: {1}, '.format(k, ss[k]), end='')
    print()

    print("Sentiment Intensity with noise removal")
    ss = sid.polarity_scores(" ".join(custom_tokens))
    for k in sorted(ss):
        print('{0}: {1}, '.format(k, ss[k]), end='')
    print()

    print("")
    print('2: Basic Sentiment Analysis: Only Positive or Negative')
    positive_tweets, negative_tweets, text = fetch_twitter_samples()

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = list()
    negative_cleaned_tokens_list = list()

    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    freq_dist_pos = FreqDist(get_all_words(positive_cleaned_tokens_list))
    freq_dist_neg = FreqDist(get_all_words(negative_cleaned_tokens_list))

    train_data, test_data = get_train_test_data(positive_cleaned_tokens_list,
                                                negative_cleaned_tokens_list)
    classifier = get_model_classifier(train_data, test_data, NaiveBayesClassifier)
    print("Result: ", classifier.classify(dict([token, True] for token in custom_tokens)))

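# NOTE: get_train_test_data and get_model_classifier are called above but not defined in this
# section. The sketch below is one plausible implementation, assuming the same 7000/3000
# train/test split and {token: True} feature dicts used by the other snippets; the helper names
# and signatures come from the calls above, everything else is an assumption.
import random

from nltk import classify


def get_train_test_data(positive_cleaned_tokens_list, negative_cleaned_tokens_list):
    # Build labelled feature dicts, shuffle, and split into train/test sets.
    positive_dataset = [(dict((token, True) for token in tokens), "Positive")
                        for tokens in positive_cleaned_tokens_list]
    negative_dataset = [(dict((token, True) for token in tokens), "Negative")
                        for tokens in negative_cleaned_tokens_list]
    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)
    return dataset[:7000], dataset[7000:]


def get_model_classifier(train_data, test_data, classifier_cls):
    # Train the given NLTK classifier class and report held-out accuracy.
    classifier = classifier_cls.train(train_data)
    print("Accuracy is:", classify.accuracy(classifier, test_data))
    return classifier
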
def train():
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]
    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    all_pos_words = get_all_words(positive_cleaned_tokens_list)
    freq_dist_pos = FreqDist(all_pos_words)
    print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative") for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)

    train_data = dataset[:7000]
    test_data = dataset[7000:]

    classifier = NaiveBayesClassifier.train(train_data)
    print("Accuracy is:", classify.accuracy(classifier, test_data))
    classifier.show_most_informative_features(10)

    f = open('classifier.pickle', 'wb')
    pickle.dump(classifier, f)
    f.close()

def train_twtr_classifier():
    if os.path.isfile(SAVED_CLSR_LOC):
        return load_classifier()

    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]
    stop_words = stopwords.words('english')

    pos_twt_toks = twitter_samples.tokenized('positive_tweets.json')
    neg_twt_toks = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = [remove_noise(toks, stop_words) for toks in pos_twt_toks]
    negative_cleaned_tokens_list = [remove_noise(toks, stop_words) for toks in neg_twt_toks]

    all_pos_words = get_all_words(positive_cleaned_tokens_list)
    freq_dist_pos = FreqDist(all_pos_words)
    print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative") for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)

    train_data = dataset[:7000]
    test_data = dataset[7000:]

    classifier = NaiveBayesClassifier.train(train_data)
    print("Accuracy is:", classify.accuracy(classifier, test_data))

    save_classifier(classifier)
    return classifier

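# NOTE: SAVED_CLSR_LOC, save_classifier and load_classifier are used above but not defined in
# this section. A minimal sketch, assuming simple pickle-based persistence at a fixed path; the
# constant and function names come from the calls above, the implementation itself is assumed.
import pickle

SAVED_CLSR_LOC = 'twtr_classifier.pickle'


def save_classifier(classifier, path=SAVED_CLSR_LOC):
    # Serialize the trained NLTK classifier to disk.
    with open(path, 'wb') as f:
        pickle.dump(classifier, f)


def load_classifier(path=SAVED_CLSR_LOC):
    # Restore a previously saved classifier.
    with open(path, 'rb') as f:
        return pickle.load(f)
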
def build_classifier():
    print('reading data')
    stop_words = stopwords.words('english')
    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    print('cleaning tokens')
    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []
    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    all_pos_words = get_all_words(positive_cleaned_tokens_list)
    print('building freq dist...')
    freq_dist_pos = FreqDist(all_pos_words)
    print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative") for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)

    train_data = dataset[:7000]
    test_data = dataset[7000:]

    print('training...')
    classifier = NaiveBayesClassifier.train(train_data)
    print("Accuracy is:", classify.accuracy(classifier, test_data))
    classifier.show_most_informative_features(10)
    return classifier

def __init__(self): """ Gather data """ positive = twitter_samples.strings('positive_tweets.json') negative = twitter_samples.strings('negative_tweets.json') self.stop_words = list(set(stopwords.words('english'))) positive_tokens = twitter_samples.tokenized('positive_tweets.json') negative_tokens = twitter_samples.tokenized('negative_tweets.json') """ Clean the data """ positive_clean = [] negative_clean = [] for token in positive_tokens: positive_clean.append(self.clean(token)) for token in negative_tokens: negative_clean.append(self.clean(token)) positive_model_tokens = self.final_token_generator(positive_clean) negative_model_tokens = self.final_token_generator(negative_clean) """ Use generator to make datasets """ positive_dataset = [(token, "Positive") for token in positive_model_tokens] negative_dataset = [(token, "Negative") for token in negative_model_tokens] dataset = positive_dataset + negative_dataset """ Shake it all about """ random.shuffle(dataset) random.shuffle(dataset) random.shuffle(dataset) """ Split them up """ training = dataset[:7000] testing = dataset[7000:] """ Train the classifier """ self.classifier = NaiveBayesClassifier.train(training) """
def read_input(infile, NUM_TRAIN, NUM_TEST):
    train = []
    test = []
    pos_tweets = 0
    neg_tweets = 0

    for line in twitter_samples.tokenized("positive_tweets.json"):
        sent = "Positive"
        # Remove usernames, urls
        for i, token in enumerate(line):
            line[i] = re.sub(r"@[\S]+", "USERNAME", line[i])
            line[i] = re.sub(r"www.[\S]+|https://[\S]+|http://[\S]+", "URL", line[i])
            newstr = ""
            for ch in line[i]:
                if ord(ch) > 128:
                    # Replace non-ASCII characters (emoji, etc.) with a stable token
                    newstr += "EMOJI_{0}".format(ord(ch))
                    # print [ch], ord(ch)
                else:
                    newstr += ch
            line[i] = newstr
        pos_tweets += 1
        if pos_tweets < NUM_TRAIN:
            train.append((line, sent))
        else:
            test.append((line, sent))

    for line in twitter_samples.tokenized("negative_tweets.json"):
        sent = "Negative"
        neg_tweets += 1
        # Remove usernames, urls
        for i, token in enumerate(line):
            line[i] = re.sub(r"@[\S]+", "USERNAME", line[i])
            line[i] = re.sub(r"www.[\S]+|https://[\S]+", "URL", line[i])
            newstr = ""
            for ch in line[i]:
                if ord(ch) > 128:
                    newstr += "EMOJI_{0}".format(ord(ch))
                    # print [ch], ord(ch)
                else:
                    newstr += ch
            line[i] = newstr
        if neg_tweets < NUM_TRAIN:
            train.append((line, sent))
        else:
            test.append((line, sent))

    return test, train

def __init__(self):
    self.stop_words = stopwords.words('english')
    self.positive_cleaned_tokens_list = []
    self.negative_cleaned_tokens_list = []
    self.positive_tweets_tokens = twitter_samples.tokenized('positive_tweets.json')
    self.negative_tweets_tokens = twitter_samples.tokenized('negative_tweets.json')

    self.non_abusive = (self.positive_tweets_tokens[:808]
                        + self.negative_tweets_tokens[:811])

    self.abusive_words = pd.read_csv('bad-words.csv')['jigaboo']
    self.abusive = []
    for word in self.abusive_words:
        self.abusive.append(word)

def parseTweets(isNewData, tweets, posOrNeg):
    print("inside parse")
    allCleanedTokens = []

    if isNewData:
        for tweet in tweets:
            tokenizedTweet = tokenizeTweet(tweet)
            cleanedTokens = removeNoise(tokenizedTweet, stopwords.words('english'))
            allCleanedTokens.append(cleanedTokens)
        # wordsAllTweets = get_all_words(allCleanedTokens)
        # print(FreqDist(wordsAllTweets).most_common(25))
        tokensForModel = get_tweets_for_model(allCleanedTokens)
    else:
        tweets = 'positive_tweets.json' if posOrNeg == "positive" else 'negative_tweets.json'
        tweet_tokens = twitter_samples.tokenized(tweets)
        for tokens in tweet_tokens:
            allCleanedTokens.append(removeNoise(tokens, stopwords.words('english')))
        tokensForModel = get_tweets_for_model(allCleanedTokens)
        # wordsAllTweets = get_all_words(positive_cleaned_tokens_list)
        # print(FreqDist(wordsAllTweets).most_common(25))

    return tokensForModel

def twitter_token():
    from nltk.corpus import twitter_samples
    from nltk.tag import pos_tag_sents

    tweets = twitter_samples.strings('positive_tweets.json')
    tweets_tokens = twitter_samples.tokenized('positive_tweets.json')
    tweets_tagged = pos_tag_sents(tweets_tokens)

    # JJ: adjective, NN: singular noun, NNS: plural noun
    JJ_count = 0
    NN_count = 0
    for tweet in tweets_tagged:
        for key, tag in tweet:
            # tag = pair[1]
            if tag == 'JJ':
                JJ_count += 1
            elif tag == 'NN':
                NN_count += 1

    print('Total number of adjectives = ', JJ_count)
    print('Total number of nouns = ', NN_count)

def corpusreader_demo():
    """
    Use :module:`TwitterCorpusReader` to read a file of tweets, and print out

    * some full tweets in JSON format;
    * some raw strings from the tweets (i.e., the value of the `text` field); and
    * the result of tokenising the raw strings.
    """
    from nltk.corpus import twitter_samples as tweets

    print()
    print("Complete tweet documents")
    print(SPACER)
    for tweet in tweets.docs("tweets.20150430-223406.json")[:1]:
        print(json.dumps(tweet, indent=1, sort_keys=True))

    print()
    print("Raw tweet strings:")
    print(SPACER)
    for text in tweets.strings("tweets.20150430-223406.json")[:15]:
        print(text)

    print()
    print("Tokenized tweet strings:")
    print(SPACER)
    for toks in tweets.tokenized("tweets.20150430-223406.json")[:15]:
        print(toks)

def validate(self, classifier):
    """Test the accuracy of a given classifier against a labelled test dataset.

    Args:
        classifier: (Bayesian, DecisionTree, SVC, LinearSVC) for use in classifying data

    Returns:
        None
    """
    tweets = twitter_samples.fileids()
    pos_tweets = twitter_samples.tokenized(tweets[1])
    neg_tweets = twitter_samples.tokenized(tweets[0])

    # Hold out the last eighth of each class for testing (integer division for slice indices).
    pos_testing = pos_tweets[len(pos_tweets) * 7 // 8:]
    neg_testing = neg_tweets[len(neg_tweets) * 7 // 8:]

    pos_test = [(self.train_feats(f), 'positive') for f in pos_testing]
    neg_test = [(self.train_feats(f), 'negative') for f in neg_testing]
    testfeats = pos_test + neg_test

    print("Classifier accuracy percent:", nltk.classify.accuracy(classifier, testfeats) * 100)

def create_and_train():
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]

    positive_cleaned_tokens_list = tokenize('positive_tweets.json')
    negative_cleaned_tokens_list = tokenize('negative_tweets.json')

    all_pos_words = get_all_words(positive_cleaned_tokens_list)
    freq_dist_pos = FreqDist(all_pos_words)

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative") for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)

    train_data = dataset[:7000]
    test_data = dataset[7000:]

    classifier = NaiveBayesClassifier.train(train_data)
    return classifier

def word_delegation():
    positive = twitter_samples.tokenized('positive_tweets.json')
    negative = twitter_samples.tokenized('negative_tweets.json')

    tweets, all_words = [], []
    for tweet in positive:
        tweets.append((tweet, 'pos'))
        for word in tweet:
            all_words.append(word.lower())
    for tweet in negative:
        tweets.append((tweet, 'neg'))
        for word in tweet:
            all_words.append(word.lower())

    word_features = list(nltk.FreqDist(all_words).keys())[:1000]
    return tweets, word_features

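# NOTE: word_delegation above returns (tweets, word_features), but this section does not show
# how they are consumed. A typical next step with NLTK (a sketch, not the original author's
# code) is to turn each tweet into a presence/absence feature dict over word_features and train
# a classifier on the result; the function names below are illustrative.
import random

import nltk


def find_features(tweet_tokens, word_features):
    # Mark, for every candidate feature word, whether it occurs in this tweet.
    tokens = set(word.lower() for word in tweet_tokens)
    return {word: (word in tokens) for word in word_features}


def build_feature_sets():
    tweets, word_features = word_delegation()
    feature_sets = [(find_features(tokens, word_features), label) for tokens, label in tweets]
    random.shuffle(feature_sets)
    train_set, test_set = feature_sets[:7000], feature_sets[7000:]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print("Accuracy:", nltk.classify.accuracy(classifier, test_set))
    return classifier
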
from collections import Counter

from nltk import bigrams, trigrams
from nltk.corpus import twitter_samples, stopwords

tokenizedWordsFromTwitter = twitter_samples.tokenized('tweets.20150430-223406.json')
tokenizedWordsFromTwitter += twitter_samples.tokenized('negative_tweets.json')
tokenizedWordsFromTwitter += twitter_samples.tokenized('positive_tweets.json')
# tokenizedWordsFromTwitter = twitter_samples.tokenized('positive_tweets.json')

for tweetIndex in range(len(tokenizedWordsFromTwitter)):
    tokenizedWordsFromTwitter[tweetIndex] = [
        item for item in tokenizedWordsFromTwitter[tweetIndex]
        if item not in [".", ",", ":", ";", "!", "?", "(", ")"]
    ]
    tokenizedWordsFromTwitter[tweetIndex] = [
        item for item in tokenizedWordsFromTwitter[tweetIndex]
        if item not in stopwords.words('english')
    ]

print("Number of Tweets: " + str(len(tokenizedWordsFromTwitter)))

twitter_unigrams = Counter({})
twitter_bigrams = Counter({})
twitter_trigrams = Counter({})

for i in tokenizedWordsFromTwitter:
    twitter_unigrams += Counter(i)
    individual_bigrams = bigrams(i)
    twitter_bigrams += Counter(individual_bigrams)
    individual_trigrams = trigrams(i)
    twitter_trigrams += Counter(individual_trigrams)

# NOTE: this snippet begins mid-function. Judging from the call in word_feats below, the
# enclosing definition is get_words_from_sentence and it receives an already tokenized tweet;
# the reconstructed header below is an inference, and 'stemmer' is referenced but never defined
# in this excerpt (presumably a PorterStemmer or similar instance).
def get_words_from_sentence(list_of_tokenized_words):
    list_of_tokenized_words = [
        each_word for each_word in list_of_tokenized_words
        if each_word not in [",", ".", "..", "...", ":", "?", "!", "\'", "\"", "#", "-", "_", "(", ")"]
    ]
    list_of_tokenized_words = [stemmer.stem(each_word) for each_word in list_of_tokenized_words]
    print(list_of_tokenized_words)
    return list_of_tokenized_words


def word_feats(words):
    return dict([(word, True) for word in get_words_from_sentence(words)])


def tokenize_sentence_into_list_of_words(sentence):
    tknzr = TweetTokenizer()
    list_of_tokenized_words = tknzr.tokenize(sentence)
    return list_of_tokenized_words


negative_tokenizedWordsFromTwitter = twitter_samples.tokenized('negative_tweets.json')
positive_tokenizedWordsFromTwitter = twitter_samples.tokenized('positive_tweets.json')

negfeats = [(word_feats(f), 'neg') for f in negative_tokenizedWordsFromTwitter]
posfeats = [(word_feats(f), 'pos') for f in positive_tokenizedWordsFromTwitter]

negcutoff = int(len(negfeats) * 3 / 4)
poscutoff = int(len(posfeats) * 3 / 4)

trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

classifier = NaiveBayesClassifier.train(trainfeats)
print('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))
classifier.show_most_informative_features()

from nltk.corpus import twitter_samples
from nltk.tag import pos_tag_sents

tweets = twitter_samples.strings('positive_tweets.json')
tweets_tokens = twitter_samples.tokenized('positive_tweets.json')

JJ_count = 0
NN_count = 0

tweets_tagged = pos_tag_sents(tweets_tokens)

for tweet in tweets_tagged:
    for pair in tweet:
        tag = pair[1]
        if tag == 'JJ':
            JJ_count += 1
        elif tag == 'NN':
            NN_count += 1

print('Total number of adjectives = ', JJ_count)
print('Total number of nouns = ', NN_count)

import nltk
import plotly.plotly as py
import plotly.graph_objs as go
from nltk.corpus import twitter_samples
from tqdm import tqdm

# These are corpus files. One positive, one negative and one mixed
print(twitter_samples.fileids())

# Empty dictionaries to store the collected tags
posTags = {}
negTags = {}

# The corpora are POS-tagged below.
# Load the positive tweets and record the POS tags in 'posTags'
tokenized = twitter_samples.tokenized('positive_tweets.json')
print("Loaded")
for toks in tqdm(tokenized):
    toks = nltk.pos_tag(toks)
    for word in toks:
        if word[1] in posTags:
            posTags[word[1]] += 1
        else:
            posTags[word[1]] = 1

# Load the negative tweets and record the POS tags in 'negTags'
tokenized = twitter_samples.tokenized('negative_tweets.json')
for toks in tqdm(tokenized):
    toks = nltk.pos_tag(toks)
    for word in toks:
        if word[1] in negTags:
            negTags[word[1]] += 1
        else:
            negTags[word[1]] = 1

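# NOTE: the plotly imports above suggest the collected tag counts are charted afterwards; the
# remainder of the original script is not shown here. Below is a minimal sketch of one way to
# plot the two POS-tag distributions with the legacy plotly.plotly API the snippet imports
# (py, go); the trace names and filename are illustrative assumptions.
tags = sorted(set(posTags) | set(negTags))
data = [
    go.Bar(x=tags, y=[posTags.get(t, 0) for t in tags], name='positive tweets'),
    go.Bar(x=tags, y=[negTags.get(t, 0) for t in tags], name='negative tweets'),
]
py.plot(data, filename='twitter-pos-tag-distribution')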