Example #1
def count_tweets(result, tweets, ys):
    '''
    Input:
        result: a dictionary that will be used to map each pair to its frequency
        tweets: a list of tweets
        ys: a list corresponding to the sentiment of each tweet (either 0 or 1)
    Output:
        result: a dictionary mapping each pair to its frequency
    '''

    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    for y, tweet in zip(ys, tweets):
        for word in process_tweet(tweet):
            # define the key, which is the word and label tuple
            pair = (word, y)

            # if the key exists in the dictionary, increment the count
            if pair in result:
                result[pair] += 1

            # else, if the key is new, add it to the dictionary and set the count to 1
            else:
                result[pair] = 1
    ### END CODE HERE ###

    return result
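A quick sanity check on toy data; this sketch assumes the course's `process_tweet` helper is importable, and the exact keys depend on its stemming:

result = count_tweets({}, ['i am happy', 'i am sad'], [1, 0])
print(result)
# e.g. {('happi', 1): 1, ('sad', 0): 1} after stopword removal and stemming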
Example #2
def extract_features(tweet, freqs):
    '''
    Input: 
        tweet: a string containing one tweet
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
    Output: 
        x: a feature vector of dimension (1,3)
    '''
    # process_tweet tokenizes, stems, and removes stopwords
    word_l = process_tweet(tweet)
    # 3 elements in the form of a 1 x 3 vector
    x = np.zeros((1, 3))

    #bias term is set to 1
    x[0, 0] = 1

    # loop through each word in the list of words
    for word in word_l:

        # increment the word count for the positive label 1
        x[0, 1] += freqs.get((word, 1.0), 0)

        # increment the word count for the negative label 0
        x[0, 2] += freqs.get((word, 0.0), 0)

    assert (x.shape == (1, 3))
    return x
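For example, with a hand-built toy frequency dictionary (hypothetical counts; 'happi' is the stem `process_tweet` typically produces for 'happy'):

toy_freqs = {('happi', 1.0): 3, ('happi', 0.0): 1}
print(extract_features('I am happy', toy_freqs))
# -> [[1. 3. 1.]]  (bias, positive count, negative count)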
Example #3
def build_freqs(tweets, ys):
    """Build frequencies.
    Input:
        tweets: a list of tweets
        ys: an m x 1 array with the sentiment label of each tweet
            (either 0 or 1)
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its
        frequency
    """
    # Convert np array to list since zip needs an iterable.
    # The squeeze is necessary or the list ends up with one element.
    # Also note that this is just a NOP if ys is already a list.
    yslist = np.squeeze(ys).tolist()

    # Start with an empty dictionary and populate it by looping over all tweets
    # and over all processed words in each tweet.
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1
    return freqs
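A minimal usage sketch; the exact keys depend on how `process_tweet` stems and filters:

tweets = ['i am happy', 'i am sad, sad']
ys = np.array([[1], [0]])
freqs = build_freqs(tweets, ys)
# expected: {('happi', 1): 1, ('sad', 0): 2}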
Example #4
def naive_bayes_predict(tweet, logprior, loglikelihood):
    '''
    Input:
        tweet: a string
        logprior: a number
        loglikelihood: a dictionary of words mapping to numbers
    Output:
        p: the sum of all the logliklihoods of each word in the tweet (if found in the dictionary) + logprior (a number)

    '''
    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    # process the tweet to get a list of words
    word_l = process_tweet(tweet)

    # initialize probability to zero
    p = 0

    # add the logprior
    p += logprior

    for word in word_l:

        # check if the word exists in the loglikelihood dictionary
        if word in loglikelihood:
            # add the log likelihood of that word to the probability
            p += loglikelihood[word]

    ### END CODE HERE ###

    return p
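A toy run with hypothetical log likelihoods; words missing from the dictionary simply contribute nothing:

toy_loglikelihood = {'happi': 1.2, 'sad': -1.4}  # made-up values
print(naive_bayes_predict('I am happy', 0.0, toy_loglikelihood))
# -> 1.2 (logprior 0.0 plus the log likelihood of 'happi'); p > 0 reads as positive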
Example #5
	def get(self):
		# default to the first results page unless one was passed in the request
		if not self.request.get("page"):
			page = "1"
		else:
			page = self.request.get("page")
		# resume from the most recent stored tweet unless since_id was given
		if not self.request.get("since_id"):
			since_id = utils.get_max_tweet()
		else:
			since_id = self.request.get("since_id")
		# YQL query that proxies the Twitter search API for the #bonnierhackday tag
		query = "select * from json where url=\"http://search.twitter.com/search.json?q=%23bonnierhackday&page=" + page + "&since_id=" + since_id + "\""
		self.response.out.write(query)
		self.response.out.write("<br />")
		y = yql.Public()
		yqlresponse = y.execute(query)
		try:
			results = yqlresponse['query']['results']['json']['results']
			if len(results) > 0:
				for rawtweet in results:
					tweet = utils.process_tweet(rawtweet)
					self.response.out.write(tweet.text)
					self.response.out.write("<br />")
				self.response.out.write("more to scrape")
				self.response.out.write("<br />")
				# queue the next results page so scraping continues
				taskqueue.add(url='/scrape/feed', params={"page": str(int(page) + 1), "since_id": since_id}, method='GET')
		except:
			# fall back to dumping the raw YQL response for debugging
			self.response.out.write(yqlresponse)
Example #6
def extract_features(tweet, freqs):
    '''
    Input: 
        tweet: a string containing one tweet
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
    Output: 
        x: a feature vector of dimension (1,3)
    '''
    # process_tweet tokenizes, stems, and removes stopwords
    word_l = process_tweet(tweet)

    # 3 elements in the form of a 1 x 3 vector
    x = np.zeros((1, 3))

    #bias term is set to 1
    x[0, 0] = 1

    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###

    # loop through each word in the list of words
    for word in word_l:

        # increment the word count for the positive label 1
        x[0, 1] += freqs.get((word, 1.0), 0)

        # increment the word count for the negative label 0
        x[0, 2] += freqs.get((word, 0.0), 0)

    ### END CODE HERE ###
    assert (x.shape == (1, 3))
    return x
Example #7
def extract_features(tweet, freqs):

    # tokenize/stem the tweet, then build a (bias, pos_count, neg_count) vector
    word_l = process_tweet(tweet)
    x = np.zeros((1, 3))
    x[0, 0] = 1

    for word in word_l:

        x[0, 1] += freqs.get((word, 1.0), 0)
        x[0, 2] += freqs.get((word, 0.0), 0)

    assert (x.shape == (1, 3))
    return x
Example #8
def error_analysis(test_x, test_y, logprior, loglikelihood):
    # Write every misclassified tweet to a file for later inspection
    filename = 'error_analysis.txt'
    with open(filename, 'w', encoding='utf-8') as file_object:
        print("File %s is recording the misclassified tweets!" % filename)
        file_object.write("Misclassified tweets:\n")
        for x, y in zip(test_x, test_y):
            y_hat = naive_bayes_predict(x, logprior, loglikelihood)
            if y != (np.sign(y_hat) > 0):
                file_object.write(
                    '%d\t%0.2f\t%s\n' % (y, np.sign(y_hat) > 0, ' '.join(
                        process_tweet(x)).encode('ascii', 'ignore')))
    print("File %s is finished!" % filename)
Example #9
class preprocessing:

    # select the lists of positive and negative tweets
    all_positive_tweets = twitter_samples.strings('positive_tweets.json')
    all_negative_tweets = twitter_samples.strings('negative_tweets.json')

    # concatenate the lists, 1st part is the positive tweets followed by the negative
    tweets = all_positive_tweets + all_negative_tweets

    # make a numpy array representing labels of the tweets
    labels = np.append(np.ones((len(all_positive_tweets))), np.zeros((len(all_negative_tweets))))

    # preprocess every tweet (tokenize, remove stopwords, stem)
    processed_tweets = []
    for tweet in tweets:
        processed_tweets.append(process_tweet(tweet))

    # create frequency dictionary
    freqs = build_freqs(processed_tweets, labels)

    # list representing our table of word counts.
    # each element consists of a sublist with this pattern: [<word>, <positive_count>, <negative_count>]
    data = []

    # loop through our selected words
    # NOTE: the original list of selected words was not shown in the source;
    # as a stand-in, iterate over every distinct word in the frequency dictionary
    keys = set(word for (word, label) in freqs)
    for word in keys:

        # initialize positive and negative counts
        pos = 0
        neg = 0

        # retrieve number of positive counts
        if (word, 1) in freqs:
            pos = freqs[(word, 1)]

        # retrieve number of negative counts
        if (word, 0) in freqs:
            neg = freqs[(word, 0)]

        # append the word counts to the table
        data.append([word, pos, neg])

    @staticmethod
    def get_preprocessed_data():
        return preprocessing.processed_tweets

    @staticmethod
    def get_freqs_dict():
        return preprocessing.freqs

    @staticmethod
    def get_freqs_table():
        return preprocessing.data
Example #10
def extract_features(tweet, freqs):
    '''
    Input: 
        tweet: a string containing one tweet
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
    Output: 
        x: a feature vector of dimension (1,3)
    '''
    word_l = process_tweet(tweet)
    x = np.zeros((1, 3))
    x[0, 0] = 1
    for word in word_l:
        x[0, 1] += freqs.get((word, 1), 0)
        x[0, 2] += freqs.get((word, 0), 0)
    assert (x.shape == (1, 3))
    return x
Example #11
    def classify(self, text):
        """
        :param text: str
        :return: str
        """
        good_text = process_tweet(text)
        features = get_features(good_text)
        if not features:
            return None
        pos_feature_matrix = [self.pos_feature_prob.get(feature, 1/self.num_positive) for feature in features]
        neg_feature_matrix = [self.neg_feature_prob.get(feature, 1/self.num_negative) for feature in features]
        positive_prob = self.pos_prob * functools.reduce(lambda x, y: x*y, pos_feature_matrix)
        negative_prob = self.neg_prob * functools.reduce(lambda x, y: x*y, neg_feature_matrix)

        # note: a tie (positive_prob == negative_prob) falls through to 'Negative'
        return 'Positive' if positive_prob > negative_prob else 'Negative'
Example #12
def extract_features(tweet, freqs):

    word_l = process_tweet(tweet)
    x = np.zeros((1, 3))
    x[0, 0] = 1

    for word in word_l:
        key0 = (word, 0)
        key1 = (word, 1)
        #negative label frequency
        if (key0 in freqs.keys()):
            x[0, 2] += freqs[key0]
        #positive label frequency
        if (key1 in freqs.keys()):
            x[0, 1] += freqs[key1]
    assert (x.shape == (1, 3))  #checking whether elements of x is of order 1*3
    return x
Example #13
	def get(self):
		if not self.request.get("page"):
			page = "1"
		else:
			page = self.request.get("page")
		url = "http://backtweets.com/search.json?q=www.dn.se&key=0d3f7e0f7874396cf456&itemsperpage=10&since_id=9436805794&page=" + page
		result = urlfetch.fetch(url)
		if result.status_code == 200:
			json = simplejson.loads(result.content)
			try:
				if len(json['tweets']) > 0:
					for rawtweet in json['tweets']:
						tweet = utils.process_tweet(rawtweet)
						self.response.out.write(tweet.text)
					taskqueue.add(url='/scrape/feed', params={"page":str(int(page) + 1)}, method='GET')
			except:
				self.response.out.write("oops")
Example #14
def get_document_embedding(tweet, en_embeddings):
    '''
    Input:
        - tweet: a string
        - en_embeddings: a dictionary of word embeddings
    Output:
        - doc_embedding: the sum of the embeddings of all words in the
          processed tweet (a numpy array of dimension 300)
    '''
    doc_embedding = np.zeros(300)

    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    # process the document into a list of words (process the tweet)
    processed_doc = process_tweet(tweet)
    for word in processed_doc:
        # add the word embedding to the running total for the document embedding
        doc_embedding += en_embeddings.get(word, 0)
    ### END CODE HERE ###
    return doc_embedding
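A toy check; the vectors must be 300-dimensional because the function hard-codes np.zeros(300), and the embedding values here are made up:

toy_embeddings = {'happi': np.ones(300), 'learn': 2 * np.ones(300)}
emb = get_document_embedding('I am happy to learn', toy_embeddings)
print(emb[:3])  # -> [3. 3. 3.] if process_tweet yields ['happi', 'learn']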
Example #15
def extract_features(tweet, freqs):

    # process_tweet tokenizes, stems, and removes stopwords
    word_l = process_tweet(tweet)
    # 3 elements in the form of a 1 x 3 vector
    x = np.zeros((1, 3))

    # bias term is set to 1
    x[0, 0] = 1

    # loop through each word in the list of words
    for word in word_l:
        # increment the word count for the positive label 1
        x[0, 1] += freqs.get((word, 1.0), 0)

        # increment the word count for the negative label 0
        x[0, 2] += freqs.get((word, 0.0), 0)

    assert (x.shape == (1, 3))
    return x
Example #16
    def transform(self, data):
        for sentiment, text in data:
            tweet = process_tweet(text)
            features = get_features(tweet)
            if sentiment.lower() == 'positive':
                self.positive_counter.update(features)
            elif sentiment.lower() == 'negative':
                self.negative_counter.update(features)
            else:
                print('Unknown label {label}'.format(label=sentiment))

        for word, frequency in self.positive_counter.items():
            yield Feature(
                word=word,
                sentiment='Positive',
                frequency=frequency,
            )

        for word, frequency in self.negative_counter.items():
            yield Feature(
                word=word,
                sentiment='Negative',
                frequency=frequency,
            )
Example #17
#3. Remove stopwords and punctuation (assumes `tweet` and `stopwords_english`
#   were set up in earlier cells)

clean_tweet = []

for word in tweet:
    if(word not in stopwords_english and word not in string.punctuation):
        clean_tweet.append(word)

print(clean_tweet)

#4. Stemming

stemmer = PorterStemmer()

tweets_stem = []

for word in clean_tweet:
    stem = stemmer.stem(word)
    tweets_stem.append(stem)

print(tweets_stem)

#processing function

tweet2 = all_pos[2277]

tweet2_stem = process_tweet(tweet2)

print(tweet2_stem)


Example #18
# NOTE: the opening of this function was truncated in the source; the header
# and initialization are reconstructed from the identical example above
def get_document_embedding(tweet, en_embeddings):
    doc_embedding = np.zeros(300)

    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    # process the document into a list of words (process the tweet)
    processed_doc = process_tweet(tweet)
    for word in processed_doc:
        # add the word embedding to the running total for the document embedding
        doc_embedding += en_embeddings.get(word, 0)
    ### END CODE HERE ###
    return doc_embedding


# In[8]:


custom_tweet = data.loc[5,'DESC']
print(process_tweet(custom_tweet))
tweet_embedding = get_document_embedding(custom_tweet, word_embeddings)
tweet_embedding[-5:]


# In[9]:


df = df.reset_index()
dfl = df.values.tolist()
dfl[5]


# In[10]:

Example #19
print(tweets_stem)

# That's it! Now we have a set of words we can feed into the next stage of our machine learning project.

# ## process_tweet()
#
# As shown above, preprocessing consists of multiple steps before you arrive at the final list of words. We will not ask you to replicate these, however. In the week's assignment, you will use the function `process_tweet(tweet)` available in _utils.py_. We encourage you to open the file; you'll see that this function's implementation is very similar to the steps above.
#
# To obtain the same result as in the previous code cells, you will only need to call the function `process_tweet()`. Let's do that in the next cell.

# In[15]:

from utils import process_tweet  # Import the process_tweet function

# choose the same tweet
tweet = all_positive_tweets[2277]

print()
print('\033[92m')
print(tweet)
print('\033[94m')

# call the imported function
tweets_stem = process_tweet(tweet)
# Preprocess a given tweet

print('preprocessed tweet:')
print(tweets_stem)  # Print the result

# That's it for this lab! You now know what is going on when you call the preprocessing helper function in this week's assignment. Hopefully, this exercise has also given you some insights on how to tweak this for other types of text datasets.
Example #20
# NOTE: the head of this call was truncated in the source; 'params = dict(' is
# reconstructed from the keyword arguments and the later **params usage
params = dict(appostrophe_handling=True,
              lemmatize=True,
              reduce_lengthenings=True,
              segment_words=False,
              correct_spelling=False)

print("Loading training data")
df_a = pd.read_csv('start-kit/training-v1/offenseval-training-v1.tsv',
                   sep='\t')
df_a_trial = pd.read_csv('start-kit/trial-data/offenseval-trial.txt', sep='\t')
print("Done!")

print("Preprocessing...")

X = df_a['tweet'].apply(
    lambda x: process_tweet(x, **params, trial=False, sym_spell=None)).values
y = df_a['subtask_a'].replace({'OFF': 1, 'NOT': 0}).values
class_weights = sklearn.utils.class_weight.compute_class_weight(
    class_weight='balanced', classes=np.unique(y), y=y.reshape(-1))

X_trial = df_a_trial['tweet'].apply(
    lambda x: process_tweet(x, **params, trial=True, sym_spell=None)).values
y_trial = df_a_trial['subtask_a'].replace({'OFF': 1, 'NOT': 0}).values
print("Done!")

if balance_dataset:
    X, y = under_sample(X, y)

print("EXAMPLES OF PROCESSED TWEETS [train/trial]")
print(
    "_________________________________________________________________________________________________________")
Example #21
# NOTE: the head of this np.append call was truncated in the source; reconstructed
train_y = np.append(np.ones((len(train_pos), 1)),
                    np.zeros((len(train_neg), 1)),
                    axis=0)
test_y = np.append(np.ones((len(test_pos), 1)),
                   np.zeros((len(test_neg), 1)),
                   axis=0)

# build the frequencies dictionary, which counts how often each word appears
# with the positive and with the negative label
freqs = build_freqs(train_x, train_y)
# process_tweet applies the following steps:
# tokenizing
# lowercasing
# removing stop words & punctuation
# stemming

print('This is an example of a positive tweet: \n', train_x[0])
print('\nThis is an example of the processed version of the tweet: \n',
      process_tweet(train_x[0]))
import math


def sigmoid(z):
    h = 1 / (1 + np.exp(-z))
    return h


if (sigmoid(0) == 0.5):
    print('SUCCESS!')
else:
    print('Oops!')

if (sigmoid(4.92) == 0.9927537604041685):
    print('CORRECT!')
Example #22
# 
# For any machine learning project, once you've gathered the data, the first step is to process it to make useful inputs to your model.
# - **Remove noise**: You will first want to remove noise from your data -- that is, remove words that don't tell you much about the content. These include common words like 'I', 'you', 'are', 'is', etc. that would not give us enough information about the sentiment.
# - We'll also remove stock market tickers, retweet symbols, hyperlinks, and hashtags because they do not tell you much about the sentiment.
# - You also want to remove all the punctuation from a tweet. The reason is that we want to treat words with or without punctuation as the same word, instead of treating "happy", "happy?", "happy!", "happy," and "happy." as different words.
# - Finally you want to use stemming to only keep track of one variation of each word. In other words, we'll treat "motivation", "motivated", and "motivate" similarly by grouping them within the same stem of "motiv-".
# 
# We have given you the function `process_tweet()` that does this for you.
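# For reference, a minimal sketch of what such a cleaning function can look
# like (an NLTK-based illustration, not the actual implementation in utils.py):

import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

def process_tweet_sketch(tweet):
    """Illustrative only: strip noise, drop stopwords/punctuation, then stem."""
    tweet = re.sub(r'\$\w*', '', tweet)            # stock market tickers
    tweet = re.sub(r'^RT[\s]+', '', tweet)         # retweet marker
    tweet = re.sub(r'https?://[^\s]+', '', tweet)  # hyperlinks
    tweet = re.sub(r'#', '', tweet)                # hash sign, keep the word
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    return [stemmer.stem(w) for w in tokenizer.tokenize(tweet)
            if w not in stopwords_english and w not in string.punctuation]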

# In[4]:


custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"

# print cleaned tweet
print(process_tweet(custom_tweet))


# ## Part 1.1 Implementing your helper functions
# 
# To help train your naive bayes model, you will need to build a dictionary where the keys are a (word, label) tuple and the values are the corresponding frequency.  Note that the labels we'll use here are 1 for positive and 0 for negative.
# 
# You will also implement a `lookup()` helper function that takes in the `freqs` dictionary, a word, and a label (1 or 0) and returns the number of times that word and label tuple appears in the collection of tweets.
# 
# For example: given a list of tweets `["i am rather excited", "you are rather happy"]` and the label 1, the function will return a dictionary that contains the following key-value pairs:
# 
# {
#     ("rather", 1): 2,
#     ("happi", 1): 1,
#     ("excit", 1): 1
# }
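#
# A minimal sketch of such a lookup() helper (the graded version may differ):

def lookup(freqs, word, label):
    # return how many times the (word, label) pair was seen; 0 if never
    return freqs.get((word, label), 0)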
Example #23
def is_this_hate_message(message):
    # the pipeline returns a list of {'label': ..., 'score': ...} dicts;
    # any label other than LABEL_0 is treated as hate speech
    response = nlp(process_tweet(message))
    return response[0]['label'] != 'LABEL_0'
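A usage sketch: `nlp` is assumed to be a Hugging Face text-classification pipeline over a fine-tuned hate-speech model whose LABEL_0 means "not hate", `process_tweet` is assumed available, and the model path is a placeholder:

from transformers import pipeline
nlp = pipeline('text-classification', model='path/to/hate-speech-model')  # placeholder path
print(is_this_hate_message('you are great'))  # -> False for benign text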
Example #24
# run the model on a list of test tweets; the leading list items were
# truncated in the source excerpt
for tweet in [
        'great', 'great great', 'great great great', 'great great great great'
]:
    # print( '%s -> %f' % (tweet, naive_bayes_predict(tweet, logprior, loglikelihood)))
    p = naive_bayes_predict(tweet, logprior, loglikelihood)
    #     print(f'{tweet} -> {p:.2f} ({p_category})')
    print(f'{tweet} -> {p:.2f}')

# Check the sentiment of another custom tweet
my_tweet = 'you are great'
print('custom tweet result:',
      naive_bayes_predict(my_tweet, logprior, loglikelihood))

# check ratio
print('ratio (happi): ', get_ratio(freqs, 'happi'))

# find negative words at or below a threshold
print(get_words_by_threshold(freqs, label=0, threshold=0.05))
# find positive words at or above a threshold
# Notice the difference between the positive and negative ratios. Emojis like
# :( and words like 'me' tend to have a negative connotation, while words like
# 'glad', 'community', and 'arrives' tend to be found in the positive tweets.
print(get_words_by_threshold(freqs, label=1, threshold=10))

# Some error analysis
print('Truth Predicted Tweet')
for x, y in zip(test_x, test_y):
    y_hat = naive_bayes_predict(x, logprior, loglikelihood)
    if y != (np.sign(y_hat) > 0):
        print('%d\t%0.2f\t%s' % (y, np.sign(y_hat) > 0, ' '.join(
            process_tweet(x)).encode('ascii', 'ignore')))
Example #25
test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

# Print the shape train and test sets
print("train_y.shape = " + str(train_y.shape))
print("test_y.shape = " + str(test_y.shape))

# create frequency dictionary
freqs = build_freqs(train_x, train_y)

# check the output
print("type(freqs) = " + str(type(freqs)))
print("len(freqs) = " + str(len(freqs.keys())))

# test the function below
print('This is an example of a positive tweet: \n', train_x[0])
print('\nThis is an example of the processed version of the tweet: \n', process_tweet(train_x[0]))

# UNQ_C1 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
def sigmoid(z): 
    '''
    Input:
        z: is the input (can be a scalar or an array)
    Output:
        h: the sigmoid of z
    '''
    
    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    # calculate the sigmoid of z
    h = 1 / (1 + np.exp(-z))
    ### END CODE HERE ###

    return h
Example #26
# That's it! Now we have a set of words we can feed into the next stage of our machine learning project.

# ## process_tweet()
# 
# As shown above, preprocessing consists of multiple steps before you arrive at the final list of words. We will not ask you to replicate these, however. In the week's assignment, you will use the function `process_tweet(tweet)` available in _utils.py_. We encourage you to open the file; you'll see that this function's implementation is very similar to the steps above.
# 
# To obtain the same result as in the previous code cells, you will only need to call the function `process_tweet()`. Let's do that in the next cell.

# In[16]:


from utils import process_tweet # Import the process_tweet function

# choose the same tweet
tweet = all_positive_tweets[2277]

print()
print('\033[92m')
print(tweet)
print('\033[94m')

# call the imported function
tweets_stem = process_tweet(tweet)  # Preprocess a given tweet

print('preprocessed tweet:')
print(tweets_stem) # Print the result


# That's it for this lab! You now know what is going on when you call the preprocessing helper function in this week's assignment. Hopefully, this exercise has also given you some insights on how to tweak this for other types of text datasets.
Example #27
print(f"The cost after training is {J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")

#####
train_accuracy = test_logistic_regression(train_x, train_y, freqs, theta)
print(f"Logistic regression model's accuracy on the training set = {train_accuracy:.4f}")

tmp_accuracy = test_logistic_regression(test_x, test_y, freqs, theta)
print(f"Logistic regression model's accuracy on the test set = {tmp_accuracy:.4f}")

## Error Analysis
# Some error analysis done for you
print('Label Predicted Tweet')
for x,y in zip(test_x,test_y):
    y_hat = predict_tweet(x, freqs, theta)

    if np.abs(y - (y_hat > 0.5)) > 0:
        print('THE TWEET IS:', x)
        print('THE PROCESSED TWEET IS:', process_tweet(x))
        print('%d\t%0.8f\t%s' % (y, y_hat, ' '.join(process_tweet(x)).encode('ascii', 'ignore')))

## predict with your own tweet
my_tweet = 'This is a ridiculously bright movie. The plot was terrible and I was sad until the ending!'
print(process_tweet(my_tweet))
y_hat = predict_tweet(my_tweet, freqs, theta)
print(y_hat)
if y_hat > 0.5:
    print('Positive sentiment')
else: 
    print('Negative sentiment')             
Example #28
# create frequency dictionary
freqs = build_freqs(train_x, train_y)

# check the output
print("type(freqs) = " + str(type(freqs)))
print("len(freqs) = " + str(len(freqs.keys())))
# #### Expected output
# ```
# type(freqs) = <class 'dict'>
# len(freqs) = 11346
# ```

# ### Process tweet
# The given function `process_tweet()` tokenizes the tweet into individual words, removes stop words and applies stemming.
print('This is an example of a positive tweet: \n', train_x[0])
print('\nThis is an example of the processed version of the tweet: \n', process_tweet(train_x[0]))

# Testing sigmoid function 
if (sigmoid(0) == 0.5):
    print('sigmoid SUCCESS!')
else:
    print('Oops! 0 sigmoid error')

if (sigmoid(4.92) == 0.9927537604041685):
    print('CORRECT!')
else:
    print('Oops again! 4.92 sigmoid error')

# Construct a synthetic test case using numpy PRNG functions
np.random.seed(1)
# X input is 10 x 3 with ones for the bias terms