def count_tweets(result, tweets, ys):
    '''
    Input:
        result: a dictionary that will be used to map each pair to its frequency
        tweets: a list of tweets
        ys: a list corresponding to the sentiment of each tweet (either 0 or 1)
    Output:
        result: a dictionary mapping each pair to its frequency
    '''
    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    for y, tweet in zip(ys, tweets):
        for word in process_tweet(tweet):
            # define the key, which is the word and label tuple
            pair = (word, y)
            # if the key exists in the dictionary, increment the count
            if pair in result:
                result[pair] += 1
            # else, if the key is new, add it to the dictionary and set the count to 1
            else:
                result[pair] = 1
    ### END CODE HERE ###
    return result
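# A quick usage sketch for count_tweets (illustration only): it assumes, as the
# notes later in this section state, that process_tweet stems "excited" to
# "excit" and "happy" to "happi" and drops stopwords such as "i" and "am".
example_tweets = ["i am rather excited", "you are rather happy"]
example_ys = [1, 1]
example_freqs = count_tweets({}, example_tweets, example_ys)
# expected: {('rather', 1): 2, ('excit', 1): 1, ('happi', 1): 1}
print(example_freqs)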
def extract_features(tweet, freqs):
    '''
    Input:
        tweet: a list of words for one tweet
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
    Output:
        x: a feature vector of dimension (1,3)
    '''
    # process_tweet tokenizes, stems, and removes stopwords
    word_l = process_tweet(tweet)

    # 3 elements in the form of a 1 x 3 vector
    x = np.zeros((1, 3))

    # bias term is set to 1
    x[0, 0] = 1

    # loop through each word in the list of words
    for word in word_l:
        # increment the word count for the positive label 1
        x[0, 1] += freqs.get((word, 1.0), 0)
        # increment the word count for the negative label 0
        x[0, 2] += freqs.get((word, 0.0), 0)

    assert (x.shape == (1, 3))
    return x
def build_freqs(tweets, ys):
    """Build frequencies.
    Input:
        tweets: a list of tweets
        ys: an m x 1 array with the sentiment label of each tweet (either 0 or 1)
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its frequency
    """
    # Convert np array to list since zip needs an iterable.
    # The squeeze is necessary or the list ends up with one element.
    # Also note that this is just a NOP if ys is already a list.
    yslist = np.squeeze(ys).tolist()

    # Start with an empty dictionary and populate it by looping over all tweets
    # and over all processed words in each tweet.
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1

    return freqs
def naive_bayes_predict(tweet, logprior, loglikelihood):
    '''
    Input:
        tweet: a string
        logprior: a number
        loglikelihood: a dictionary of words mapping to numbers
    Output:
        p: the sum of all the loglikelihoods of each word in the tweet
           (if found in the dictionary) + logprior (a number)
    '''
    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    # process the tweet to get a list of words
    word_l = process_tweet(tweet)

    # initialize probability to zero
    p = 0

    # add the logprior
    p += logprior

    for word in word_l:
        # check if the word exists in the loglikelihood dictionary
        if word in loglikelihood:
            # add the log likelihood of that word to the probability
            p += loglikelihood[word]
    ### END CODE HERE ###

    return p
def get(self):
    if not self.request.get("page"):
        page = "1"
    else:
        page = self.request.get("page")

    if not self.request.get("since_id"):
        since_id = utils.get_max_tweet()
    else:
        since_id = self.request.get("since_id")

    query = "select * from json where url=\"http://search.twitter.com/search.json?q=%23bonnierhackday&page=" + page + "&since_id=" + since_id + "\""
    self.response.out.write(query)
    self.response.out.write("<br />")

    y = yql.Public()
    yqlresponse = y.execute(query)
    try:
        results = yqlresponse['query']['results']['json']['results']
        if len(results) > 0:
            for rawtweet in results:
                tweet = utils.process_tweet(rawtweet)
                self.response.out.write(tweet.text)
                self.response.out.write("<br />")
            self.response.out.write("more to scrape")
            self.response.out.write("<br />")
            taskqueue.add(url='/scrape/feed',
                          params={"page": str(int(page) + 1), "since_id": since_id},
                          method='GET')
    except:
        self.response.out.write(yqlresponse)
def extract_features(tweet, freqs):
    '''
    Input:
        tweet: a list of words for one tweet
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
    Output:
        x: a feature vector of dimension (1,3)
    '''
    # process_tweet tokenizes, stems, and removes stopwords
    word_l = process_tweet(tweet)

    # 3 elements in the form of a 1 x 3 vector
    x = np.zeros((1, 3))

    # bias term is set to 1
    x[0, 0] = 1

    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    # loop through each word in the list of words
    for word in word_l:
        # increment the word count for the positive label 1
        x[0, 1] += freqs.get((word, 1.0), 0)
        # increment the word count for the negative label 0
        x[0, 2] += freqs.get((word, 0.0), 0)
    ### END CODE HERE ###

    assert (x.shape == (1, 3))
    return x
def extract_features(tweet, freqs):
    word_l = process_tweet(tweet)
    x = np.zeros((1, 3))
    x[0, 0] = 1
    for word in word_l:
        x[0, 1] += freqs.get((word, 1.0), 0)
        x[0, 2] += freqs.get((word, 0.0), 0)
    assert (x.shape == (1, 3))
    return x
def error_analysis(test_x, test_y, logprior, loglikelihood):
    # Some error analysis done for you
    filename = 'error_analysis.txt'
    with open(filename, 'w', encoding='utf-8') as file_object:
        print("File %s is recording the misclassified tweets!" % filename)
        file_object.write("Misclassified tweets:\n")
        for x, y in zip(test_x, test_y):
            y_hat = naive_bayes_predict(x, logprior, loglikelihood)
            if y != (np.sign(y_hat) > 0):
                file_object.write(
                    '%d\t%0.2f\t%s\n' % (y, np.sign(y_hat) > 0, ' '.join(
                        process_tweet(x)).encode('ascii', 'ignore')))
    print("File %s has been written!" % filename)
class preprocessing:
    # select the lists of positive and negative tweets
    all_positive_tweets = twitter_samples.strings('positive_tweets.json')
    all_negative_tweets = twitter_samples.strings('negative_tweets.json')

    # concatenate the lists, 1st part is the positive tweets followed by the negative
    tweets = all_positive_tweets + all_negative_tweets

    # make a numpy array representing labels of the tweets
    labels = np.append(np.ones((len(all_positive_tweets))), np.zeros((len(all_negative_tweets))))

    processed_tweets = []
    for tweet in tweets:
        processed_tweets.append(process_tweet(tweet))  # Preprocess a given tweet

    # create frequency dictionary (build_freqs processes each raw tweet itself)
    freqs = build_freqs(tweets, labels)

    # list representing our table of word counts.
    # each element consists of a sublist with this pattern: [<word>, <positive_count>, <negative_count>]
    data = []

    # loop through our selected words (`keys` is assumed to be a predefined list of words to inspect)
    for word in keys:
        # initialize positive and negative counts
        pos = 0
        neg = 0

        # retrieve number of positive counts
        if (word, 1) in freqs:
            pos = freqs[(word, 1)]

        # retrieve number of negative counts
        if (word, 0) in freqs:
            neg = freqs[(word, 0)]

        # append the word counts to the table
        data.append([word, pos, neg])

    def get_preprocessed_data(self):
        return self.processed_tweets

    def get_freqs_dict(self):
        return self.freqs

    def get_freqs_table(self):
        return self.data
def extract_features(tweet, freqs):
    '''
    Input:
        tweet: a list of words for one tweet
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
    Output:
        x: a feature vector of dimension (1,3)
    '''
    word_l = process_tweet(tweet)
    x = np.zeros((1, 3))
    x[0, 0] = 1
    for word in word_l:
        x[0, 1] += freqs.get((word, 1), 0)
        x[0, 2] += freqs.get((word, 0), 0)
    assert (x.shape == (1, 3))
    return x
def classify(self, text):
    """
    :param text: str
    :return: str
    """
    good_text = process_tweet(text)
    features = get_features(good_text)
    if not features:
        return None

    pos_feature_matrix = [self.pos_feature_prob.get(feature, 1 / self.num_positive)
                          for feature in features]
    neg_feature_matrix = [self.neg_feature_prob.get(feature, 1 / self.num_negative)
                          for feature in features]

    positive_prob = self.pos_prob * functools.reduce(lambda x, y: x * y, pos_feature_matrix)
    negative_prob = self.neg_prob * functools.reduce(lambda x, y: x * y, neg_feature_matrix)

    # what if positive_prob == negative_prob?
    return 'Positive' if positive_prob > negative_prob else 'Negative'
def extract_features(tweet, freqs):
    word_l = process_tweet(tweet)
    x = np.zeros((1, 3))
    x[0, 0] = 1
    for word in word_l:
        key0 = (word, 0)
        key1 = (word, 1)
        # negative label frequency
        if key0 in freqs:
            x[0, 2] += freqs[key0]
        # positive label frequency
        if key1 in freqs:
            x[0, 1] += freqs[key1]
    assert (x.shape == (1, 3))  # check that x has shape (1, 3)
    return x
def get(self):
    if not self.request.get("page"):
        page = "1"
    else:
        page = self.request.get("page")

    url = "http://backtweets.com/search.json?q=www.dn.se&key=0d3f7e0f7874396cf456&itemsperpage=10&since_id=9436805794&page=" + page
    result = urlfetch.fetch(url)
    if result.status_code == 200:
        json = simplejson.loads(result.content)
        try:
            if len(json['tweets']) > 0:
                for rawtweet in json['tweets']:
                    tweet = utils.process_tweet(rawtweet)
                    self.response.out.write(tweet.text)
                taskqueue.add(url='/scrape/feed',
                              params={"page": str(int(page) + 1)},
                              method='GET')
        except:
            self.response.out.write("oops")
def get_document_embedding(tweet, en_embeddings):
    '''
    Input:
        - tweet: a string
        - en_embeddings: a dictionary of word embeddings
    Output:
        - doc_embedding: the sum of the word embeddings of all words in the
          processed tweet, a numpy array of shape (300,)
    '''
    doc_embedding = np.zeros(300)

    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    # process the document into a list of words (process the tweet)
    processed_doc = process_tweet(tweet)
    for word in processed_doc:
        # add the word embedding to the running total for the document embedding
        doc_embedding += en_embeddings.get(word, 0)
    ### END CODE HERE ###

    return doc_embedding
def extract_features(tweet, freqs):
    # process_tweet tokenizes, stems, and removes stopwords
    word_l = process_tweet(tweet)
    # 3 elements in the form of a 1 x 3 vector
    x = np.zeros((1, 3))
    # bias term is set to 1
    x[0, 0] = 1
    # loop through each word in the list of words
    for word in word_l:
        # increment the word count for the positive label 1
        x[0, 1] += freqs.get((word, 1.0), 0)
        # increment the word count for the negative label 0
        x[0, 2] += freqs.get((word, 0.0), 0)
    assert (x.shape == (1, 3))
    return x
def transform(self, data):
    for sentiment, text in data:
        tweet = process_tweet(text)
        features = get_features(tweet)
        if sentiment.lower() == 'positive':
            self.positive_counter.update(features)
        elif sentiment.lower() == 'negative':
            self.negative_counter.update(features)
        else:
            print('Unknown label {label}'.format(label=sentiment))

    for word, frequency in self.positive_counter.items():
        yield Feature(
            word=word,
            sentiment='Positive',
            frequency=frequency,
        )

    for word, frequency in self.negative_counter.items():
        yield Feature(
            word=word,
            sentiment='Negative',
            frequency=frequency,
        )
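# `Feature` is not defined in this excerpt; below is a minimal sketch of a record
# type consistent with how transform() constructs it above (an assumption, not
# the original definition).
from collections import namedtuple

Feature = namedtuple('Feature', ['word', 'sentiment', 'frequency'])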
clean_tweet = []
for word in tweet:
    if (word not in stopwords_english and word not in string.punctuation):
        clean_tweet.append(word)
print(clean_tweet)

# 4. Stemming
stemmer = PorterStemmer()
tweets_stem = []
for word in clean_tweet:
    stem = stemmer.stem(word)
    tweets_stem.append(stem)
print(tweets_stem)

# processing function
tweet2 = all_pos[2277]
tweet2_stem = process_tweet(tweet2)
print(tweet2_stem)
    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    # process the document into a list of words (process the tweet)
    processed_doc = process_tweet(tweet)
    for word in processed_doc:
        # add the word embedding to the running total for the document embedding
        doc_embedding += en_embeddings.get(word, 0)
    ### END CODE HERE ###

    return doc_embedding


# In[8]:

custom_tweet = data.loc[5, 'DESC']
print(process_tweet(custom_tweet))
tweet_embedding = get_document_embedding(custom_tweet, word_embeddings)
tweet_embedding[-5:]


# In[9]:

df = df.reset_index()
dfl = df.values.tolist()
dfl[5]


# In[10]:
print(tweets_stem)

# That's it! Now we have a set of words we can feed into the next stage of our machine learning project.

# ## process_tweet()
#
# As shown above, preprocessing consists of multiple steps before you arrive at the final list of words. We will not ask you to replicate these, however. In the week's assignment, you will use the function `process_tweet(tweet)` available in _utils.py_. We encourage you to open the file; you'll see that this function's implementation is very similar to the steps above.
#
# To obtain the same result as in the previous code cells, you only need to call the function `process_tweet()`. Let's do that in the next cell.

# In[15]:

from utils import process_tweet  # Import the process_tweet function

# choose the same tweet
tweet = all_positive_tweets[2277]

print()
print('\033[92m')
print(tweet)
print('\033[94m')

# call the imported function
tweets_stem = process_tweet(tweet)  # Preprocess a given tweet

print('preprocessed tweet:')
print(tweets_stem)  # Print the result

# That's it for this lab! You now know what is going on when you call the preprocessing helper function in this week's assignment. Hopefully, this exercise has also given you some insights into how to tweak this for other types of text datasets.
params = dict(
    appostrophe_handling=True,
    lemmatize=True,
    reduce_lengthenings=True,
    segment_words=False,
    correct_spelling=False)

print("Loading training data")
df_a = pd.read_csv('start-kit/training-v1/offenseval-training-v1.tsv', sep='\t')
df_a_trial = pd.read_csv('start-kit/trial-data/offenseval-trial.txt', sep='\t')
print("Done!")

print("Preprocessing...")
X = df_a['tweet'].apply(
    lambda x: process_tweet(x, **params, trial=False, sym_spell=None)).values
y = df_a['subtask_a'].replace({'OFF': 1, 'NOT': 0}).values
class_weights = sklearn.utils.class_weight.compute_class_weight(
    'balanced', np.unique(y), y.reshape(-1))

X_trial = df_a_trial['tweet'].apply(
    lambda x: process_tweet(x, **params, trial=True, sym_spell=None)).values
y_trial = df_a_trial['subtask_a'].replace({'OFF': 1, 'NOT': 0}).values
print("Done!")

if balance_dataset:
    X, y = under_sample(X, y)

print("EXAMPLES OF PROCESSED TWEETS [train/trial]")
print("_________________________________________________________________________________________________________")
train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

# build the frequencies dictionary, which counts how often each word in the
# tweets appears with the positive and negative labels
freqs = build_freqs(train_x, train_y)

# following the processing steps:
# tokenizing
# lowercasing
# removing stop words & punctuation
# stemming
print('This is an example of a positive tweet: \n', train_x[0])
print('\nThis is an example of the processed version of the tweet: \n', process_tweet(train_x[0]))

import math


def sigmoid(z):
    h = 1 / (1 + np.exp(-z))
    return h


if (sigmoid(0) == 0.5):
    print('SUCCESS!')
else:
    print('Oops!')

if (sigmoid(4.92) == 0.9927537604041685):
    print('CORRECT!')
# # For any machine learning project, once you've gathered the data, the first step is to process it to make useful inputs to your model.
# - **Remove noise**: You will first want to remove noise from your data -- that is, remove words that don't tell you much about the content. These include common words like 'I, you, are, is, etc...' that would not give us enough information about the sentiment.
# - We'll also remove stock market tickers, retweet symbols, hyperlinks, and hashtags, because they do not convey much information about the sentiment.
# - You also want to remove all the punctuation from a tweet. This is because we want to treat words with or without punctuation as the same word, instead of treating "happy", "happy?", "happy!", "happy," and "happy." as different words.
# - Finally, you want to use stemming to keep track of only one variation of each word. In other words, we'll treat "motivation", "motivated", and "motivate" similarly by grouping them within the same stem, "motiv-".
#
# We have given you the function `process_tweet()` that does this for you.

# In[4]:

custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"

# print cleaned tweet
print(process_tweet(custom_tweet))

# ## Part 1.1 Implementing your helper functions
#
# To help train your naive bayes model, you will need to build a dictionary where the keys are a (word, label) tuple and the values are the corresponding frequency. Note that the labels we'll use here are 1 for positive and 0 for negative.
#
# You will also implement a `lookup()` helper function that takes in the `freqs` dictionary, a word, and a label (1 or 0) and returns the number of times that word and label tuple appears in the collection of tweets (see the sketch after this cell).
#
# For example: given a list of tweets `["i am rather excited", "you are rather happy"]` and the label 1, the function will return a dictionary that contains the following key-value pairs:
#
#     {
#         ("rather", 1): 2,
#         ("happi", 1): 1,
#         ("excit", 1): 1
#     }
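# `lookup()` is described above but not shown in this section, so here is a
# minimal sketch of what it might look like, assuming `freqs` maps (word, label)
# tuples to counts as described. This is an illustration, not necessarily the
# assignment's reference implementation.
def lookup(freqs, word, label):
    # return the count stored for the (word, label) pair, or 0 if it is absent
    return freqs.get((word, label), 0)

# usage sketch: with the example dictionary above, lookup(freqs, 'rather', 1) == 2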
def is_this_hate_message(message):
    response = nlp(process_tweet(message))
    if (response[0]['label'] != 'LABEL_0'):
        return True
    return False
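# `nlp` is not defined in this excerpt. Judging by the response format
# (response[0]['label']), it behaves like a Hugging Face text-classification
# pipeline; a hypothetical setup might look like the following.
from transformers import pipeline

# "your-hate-speech-model" is a placeholder; the actual model is not named in
# this excerpt. Models whose raw labels are LABEL_0/LABEL_1 match the check above.
nlp = pipeline("text-classification", model="your-hate-speech-model")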
        'great', 'great great', 'great great great', 'great great great great'
]:
    # print('%s -> %f' % (tweet, naive_bayes_predict(tweet, logprior, loglikelihood)))
    p = naive_bayes_predict(tweet, logprior, loglikelihood)
    # print(f'{tweet} -> {p:.2f} ({p_category})')
    print(f'{tweet} -> {p:.2f}')

# Check the sentiment of another custom tweet
my_tweet = 'you are great'
print('custom tweet result:', naive_bayes_predict(my_tweet, logprior, loglikelihood))

# check ratio
print('ratio (happi): ', get_ratio(freqs, 'happi'))

# find negative words at or below a threshold
print(get_words_by_threshold(freqs, label=0, threshold=0.05))

# find positive words at or above a threshold
# Notice the difference between the positive and negative ratios.
# Emojis like :( and words like 'me' tend to have a negative connotation.
# Other words like 'glad', 'community', and 'arrives' tend to be found in the positive tweets.
get_words_by_threshold(freqs, label=1, threshold=10)

# Some error analysis
print('Truth Predicted Tweet')
for x, y in zip(test_x, test_y):
    y_hat = naive_bayes_predict(x, logprior, loglikelihood)
    if y != (np.sign(y_hat) > 0):
        print('%d\t%0.2f\t%s' % (y, np.sign(y_hat) > 0, ' '.join(
            process_tweet(x)).encode('ascii', 'ignore')))
test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

# Print the shapes of the train and test sets
print("train_y.shape = " + str(train_y.shape))
print("test_y.shape = " + str(test_y.shape))

# create frequency dictionary
freqs = build_freqs(train_x, train_y)

# check the output
print("type(freqs) = " + str(type(freqs)))
print("len(freqs) = " + str(len(freqs.keys())))

# test the function below
print('This is an example of a positive tweet: \n', train_x[0])
print('\nThis is an example of the processed version of the tweet: \n', process_tweet(train_x[0]))


# UNQ_C1 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
def sigmoid(z):
    '''
    Input:
        z: is the input (can be a scalar or an array)
    Output:
        h: the sigmoid of z
    '''
    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    # calculate the sigmoid of z
    h = 1 / (1 + np.exp(-z))
    ### END CODE HERE ###
    return h
print(f"The cost after training is {J:.8f}.") print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}") ##### train_accuracy = test_logistic_regression(train_x, train_y, freqs, theta) print(f"Logistic regression model's accuracy = {train_accuracy:.4f}") tmp_accuracy = test_logistic_regression(test_x, test_y, freqs, theta) print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}") ## Error Analysis # Some error analysis done for you print('Label Predicted Tweet') for x,y in zip(test_x,test_y): y_hat = predict_tweet(x, freqs, theta) if np.abs(y - (y_hat > 0.5)) > 0: print('THE TWEET IS:', x) print('THE PROCESSED TWEET IS:', process_tweet(x)) print('%d\t%0.8f\t%s' % (y, y_hat, ' '.join(process_tweet(x)).encode('ascii', 'ignore'))) ## predict with your own tweet my_tweet = 'This is a ridiculously bright movie. The plot was terrible and I was sad until the ending!' print(process_tweet(my_tweet)) y_hat = predict_tweet(my_tweet, freqs, theta) print(y_hat) if y_hat > 0.5: print('Positive sentiment') else: print('Negative sentiment')
# create frequency dictionary
freqs = build_freqs(train_x, train_y)

# check the output
print("type(freqs) = " + str(type(freqs)))
print("len(freqs) = " + str(len(freqs.keys())))

# #### Expected output
# ```
# type(freqs) = <class 'dict'>
# len(freqs) = 11346
# ```

# ### Process tweet
# The given function `process_tweet()` tokenizes the tweet into individual words, removes stop words and applies stemming.

print('This is an example of a positive tweet: \n', train_x[0])
print('\nThis is an example of the processed version of the tweet: \n', process_tweet(train_x[0]))

# Testing sigmoid function
if (sigmoid(0) == 0.5):
    print('sigmoid SUCCESS!')
else:
    print('Oops! 0 sigmoid error')

if (sigmoid(4.92) == 0.9927537604041685):
    print('CORRECT!')
else:
    print('Oops again! 4.92 sigmoid error')

# Construct a synthetic test case using numpy PRNG functions
np.random.seed(1)
# X input is 10 x 3 with ones for the bias terms
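# The excerpt above breaks off just as the synthetic test case begins. Below is
# a minimal sketch of how such a test case might be built, assuming the goal is
# a 10 x 3 input whose first column is the bias term; the exact values the
# original notebook uses are not shown in this excerpt.
np.random.seed(1)
# column 0 is all ones (bias); columns 1-2 are hypothetical random feature counts
tmp_X = np.append(np.ones((10, 1)), np.random.rand(10, 2) * 2000, axis=1)
print(tmp_X.shape)  # (10, 3)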