Example #1
def get_score(candidate, sent_weight, twitter_positions, fb_positions, official_positions, nyt_positions):
    """
    " Get position similarity score between candidate and user. Computed
    " using cosine similarity.
    "
    " Args:
    "   candidate: candidate name
    "   sent_weight: weight given to the sentiment score vs. the relevance score
    "   twitter_positions: Elasticsearch results from twitter index
    "   fb_positions: Elasticsearch results from fb index
    "   official_positions: Elasticsearch results from official index
    "   nyt_positions: Elasticsearch results from NYT index
    " Returns:
    "    Similarity score between candidate and user
    "
    " Author: Kersing Huang <*****@*****.**>, Matthew Garber
    """
    from nltk.sentiment.vader import SentimentIntensityAnalyzer

    text = []
    by_candidate = lambda x: x['candidate'] == candidate and len(x['result']['hits']['hits']) > 0
    t = list(filter(by_candidate, twitter_positions))
    if len(t) > 0:
        text.append(t[0]['result']['hits']['hits'][0]['_source']['sm_text'])
    #end if
    fb = list(filter(by_candidate, fb_positions))
    if len(fb) > 0:
        text.append(fb[0]['result']['hits']['hits'][0]['_source']['sm_text'])
    #end if

    sia = SentimentIntensityAnalyzer()

    # get candidate polarity scores
    candidate_scores = sia.polarity_scores("".join(text))

    # get user polarity scores
    user_input = json.loads(web.data())
    user_scores = sia.polarity_scores(user_input['position'])

    # compute cosine similarity of these polarity scores
    u_len = vector_length(user_scores)
    c_len = vector_length(candidate_scores)
    sentiment_score = vector_dot(candidate_scores,user_scores)/(u_len*c_len)

    official = list(filter(by_candidate, official_positions))
    nyt = list(filter(by_candidate, nyt_positions))
    relevance_score_sum = 0
    for source in [t, fb, official, nyt]:
        if source:
            relevance_score_sum += source[0]['score']
    relevance_score = relevance_score_sum/4
    weighted_score = (sent_weight * sentiment_score) + ((1 - sent_weight) * relevance_score)
    return weighted_score
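Note: get_score above relies on vector_length and vector_dot helpers that are not shown in this example. A minimal sketch of what they might look like, assuming the VADER polarity dicts are treated as vectors over their keys (these names and implementations are assumptions, not part of the original source):

import math

def vector_length(scores):
    # Euclidean norm over the dict's values ('neg', 'neu', 'pos', 'compound')
    return math.sqrt(sum(v * v for v in scores.values()))

def vector_dot(a, b):
    # dot product over the keys the two score dicts share
    return sum(a[k] * b[k] for k in a if k in b)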
def nltk_sentiment(tweets):
	sentiment = []
	sid = SentimentIntensityAnalyzer()
	for tweet in tweets:
		st = sid.polarity_scores(tweet)
		sentiment.append(st['compound'])
	return sentiment
def analyze_sentiment_vader_lexicon(review, 
                                    threshold=0.1,
                                    verbose=False):
    # pre-process text
    review = normalize_accented_characters(review)
    review = html_parser.unescape(review)
    review = strip_html(review)
    # analyze the sentiment for review
    analyzer = SentimentIntensityAnalyzer()
    scores = analyzer.polarity_scores(review)
    # get aggregate scores and final sentiment
    agg_score = scores['compound']
    final_sentiment = 'positive' if agg_score >= threshold\
                                   else 'negative'
    if verbose:
        # display detailed sentiment statistics
        positive = str(round(scores['pos'], 2)*100)+'%'
        final = round(agg_score, 2)
        negative = str(round(scores['neg'], 2)*100)+'%'
        neutral = str(round(scores['neu'], 2)*100)+'%'
        sentiment_frame = pd.DataFrame([[final_sentiment, final, positive,
                                        negative, neutral]],
                                        columns=pd.MultiIndex(levels=[['SENTIMENT STATS:'], 
                                                                      ['Predicted Sentiment', 'Polarity Score',
                                                                       'Positive', 'Negative',
                                                                       'Neutral']], 
                                                              labels=[[0,0,0,0,0],[0,1,2,3,4]]))
        print sentiment_frame
    
    return final_sentiment
 def add_sentiment(self):
     print('Adding sentiment...', end=' ')
     sia = SentimentIntensityAnalyzer()
     for sentiment in ('pos', 'neg', 'neu', 'compound'):
         sentify = lambda s: sia.polarity_scores(s[:200])[sentiment]
         self.df['sentiment_' + sentiment] = self.df['story body'].apply(sentify)
     print('done')
def main():
    parser = argparse.ArgumentParser(description="Reads in output from " +
        "downloadGroupmeMessages and runs a sentiment analysis")
    parser.add_argument("inFile", help="The file containing the stored messages")
    parser.add_argument("--outFile", default="out.txt", help="Results go here")
    args = parser.parse_args()
    
    print("\nThis program prints the most negative and positive users of the chat, ranked according to their average score from the VADER sentiment intensity analyzer in NLTK. Not super accurate, but it's a fun conversation starter.")
    print("The program takes a few seconds to run, and requires that you have some of the NLTK corpora installed.")

    with open(args.inFile, 'r') as infile:
        infile.readline()
        analyzer = SentimentIntensityAnalyzer()
        negList = []
        positiveList = []
        counter = PostSentimentCounter()
        for line in infile:
            line = line.split('\t')
            message = line[3]
            id = line[0]
            name = line[1]
            
            sentDict = analyzer.polarity_scores(message)
            counter.countPost(id, name, sentDict)
        counter.printSentimentLeaderboards()
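The PostSentimentCounter class used by main() is not shown above. A minimal sketch under the assumption that it tracks each user's summed compound score and prints users ranked by their average (a hypothetical stand-in, not the original class):

from collections import defaultdict

class PostSentimentCounter:
    def __init__(self):
        self.totals = defaultdict(float)  # user id -> summed compound score
        self.counts = defaultdict(int)    # user id -> number of posts
        self.names = {}                   # user id -> display name

    def countPost(self, user_id, name, sent_dict):
        self.totals[user_id] += sent_dict['compound']
        self.counts[user_id] += 1
        self.names[user_id] = name

    def printSentimentLeaderboards(self):
        averages = {uid: self.totals[uid] / self.counts[uid] for uid in self.totals}
        ranked = sorted(averages, key=averages.get)
        print("Most negative users:")
        for uid in ranked[:5]:
            print(self.names[uid], round(averages[uid], 3))
        print("Most positive users:")
        for uid in reversed(ranked[-5:]):
            print(self.names[uid], round(averages[uid], 3))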
def analyze(posts):
  post_json = setup_json()
  #for post, replies in posts.iteritems()

  sid = SentimentIntensityAnalyzer()
  for key, value in posts.items():
    nustring = ' '.join(value[0]).replace("u'", "")
    ss = sid.polarity_scores(nustring)
    for k in sorted(ss):
      if k == "compound":
        entry = {}
        entry['name'] = int(ss[k]*len(nustring))
        entry['size'] = len(nustring)
        if ss[k] == 0.0:
          post_json['children'][1]['children'].append(entry)
        elif ss[k] < -0.8:
          post_json['children'][2]['children'][2]['children'].append(entry)
        elif ss[k] < -0.4:
          post_json['children'][2]['children'][1]['children'].append(entry)
        elif ss[k] < -0.0:
          post_json['children'][2]['children'][0]['children'].append(entry)
        elif ss[k] < 0.4:
          post_json['children'][0]['children'][0]['children'].append(entry)
        elif ss[k] < 0.8:
          post_json['children'][0]['children'][1]['children'].append(entry)
        else:
          post_json['children'][0]['children'][2]['children'].append(entry)
  return post_json
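analyze() assumes a setup_json() helper that builds the nested bucket structure its index arithmetic expects: children[0] holds three positive sub-buckets, children[1] a flat neutral bucket, and children[2] three negative sub-buckets. A possible sketch of that helper (the bucket names are illustrative guesses):

def setup_json():
    def bucket(name):
        return {'name': name, 'children': []}

    return {
        'name': 'posts',
        'children': [
            {'name': 'positive', 'children': [bucket('0 to 0.4'), bucket('0.4 to 0.8'), bucket('above 0.8')]},
            bucket('neutral'),
            {'name': 'negative', 'children': [bucket('0 to -0.4'), bucket('-0.4 to -0.8'), bucket('below -0.8')]},
        ],
    }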
    def sentiment_analysis_text(self, text_insert):

        text = text_insert

        token_text = tokenize.sent_tokenize(text)
        sid = SentimentIntensityAnalyzer()

        over_all_sentiment = 0
        count = 0

        for sentence in token_text:
            score = sid.polarity_scores(sentence)
            # Accumulate the overall sentiment score
            over_all_sentiment += score.get("compound")

            # If the sentence scores above the 0.1 compound threshold,
            # count it toward the average
            if (score.get("compound") > 0.1):
                count += 1

        # Calculate average sentiment
        if count > 0:
            average_sentiment = over_all_sentiment/count
        else:
            average_sentiment = over_all_sentiment

        return average_sentiment
Example #8
def add_sentiment_to_comments():
    sia = SentimentIntensityAnalyzer()
    for story_comment_list in comments.values():
        for comment in story_comment_list:
            if "text" in comment:
                comment["sentiment"] = sia.polarity_scores(comment["text"])
            print(comment)
Example #9
 def on_data(self, raw_data):
     tweet = loads(raw_data)
     try:
         text = tweet['text']
         if tweet.get('retweeted_status') is None and 'RT @' not in text:
             if tweet.get('coordinates') is None:
                 # TODO: Check for rate limit. If rate limited, then perform location inference
                 nouns = self._get_nouns(tweet_text=text)
                 # bf = BilateralFriends(user_id=tweet['user']['id'], twitter_api=self.api)
                 # loc_occurrence_count = bf.get_location_occurrence()
                 tweet_nouns = defaultdict(int)
                 for noun in nouns:
                     tweet_nouns[noun] += 1
                 self.corpus[tweet['user']['id']] = {'id': tweet['user']['id'],
                                                     'location': tweet['user']['location'],
                                                     # 'bilateral_friends_location_occurrences': loc_occurrence_count,
                                                     'text_nouns': tweet_nouns}
                 loc_inf = LocationInference(user=self.corpus[tweet['user']['id']], local_words=self.local_words,
                                             geo_words=self.geo_words)
                 inferred_location = loc_inf.get_location()
                 print(inferred_location)
                 print('Predicted location:', inferred_location[0])
                 tweet['coordinates'] = {'type': 'Point', 'coordinates': [LOCATIONS[inferred_location[0]][1],
                                                                          LOCATIONS[inferred_location[0]][0]]}
                 print(tweet['coordinates'])
             sentiment_analyzer = SentimentIntensityAnalyzer()
             sentiment_score = sentiment_analyzer.polarity_scores(text=text)['compound']
             tweet['sentiment'] = sentiment_score
             current_time_ms = int(round(time() * 1000))
             tweet['time_inserted'] = current_time_ms
             print(text, ': ', str(sentiment_score))
             STREAM_BUFFER.insert(tweet)
     except KeyError as v:
         print('KeyError: ', v)
Example #10
def get_tweets(q, today):
    r = api.request(
        "search/tweets", {"q": "%s since:%s" % (q, today), "count": "100", "result_type": "recent", "lang": "en"}
    )
    data = (json.loads(r.text))["statuses"]
    sid = SentimentIntensityAnalyzer()
    all_tweets = []
    for i in range(0, len(data)):
        text = data[i]["text"].encode("ascii", "ignore").decode("ascii")
        if "RT" in text:
            RT = True
        else:
            RT = False
        others = text.count("@")
        sent = TextBlob(text)
        valance = sent.sentiment.polarity
        NLTK = sid.polarity_scores(text)
        tweet_data = {
            "tweetID": data[i]["id"],
            "created_at": data[i]["created_at"],
            "text": text,
            "textblob": valance,
            "NLTK": NLTK["compound"],
            "RT": RT,
            "others": others,
        }
        # print(data[i])
        all_tweets.append(tweet_data)
    return all_tweets
Example #11
def sentiment_by_subreddit():
	phrase = urllib.quote(request.form["text"])
	year = urllib.quote(request.form["year"])

	sid = SentimentIntensityAnalyzer()

	year_str = str(year)
	if int(year) > 2014:
		year_str += "_01"

	query = '''SELECT subreddit, body, score FROM
	(SELECT subreddit, body, score, RAND() AS r1
	FROM [fh-bigquery:reddit_comments.''' + year_str + ''']
	WHERE REGEXP_MATCH(body, r'(?i:''' + phrase + ''')')
	AND subreddit IN (SELECT subreddit FROM (SELECT subreddit, count(*) AS c1 FROM [fh-bigquery:reddit_comments.''' + year_str + '''] WHERE REGEXP_MATCH(body, r'(?i:'''+phrase+''')') AND score > 1 GROUP BY subreddit ORDER BY c1 DESC LIMIT 10))
	ORDER BY r1
	LIMIT 5000)
	'''
	bigquery_service = build('bigquery', 'v2', credentials=credentials)
	try:
		query_request = bigquery_service.jobs()
		query_data = {
			'query': query,
			'timeoutMs': 30000
		}

		query_response = query_request.query(
			projectId=bigquery_pid,
			body=query_data).execute()

	except HttpError as err:
		print('Error: {}'.format(err.content))
		raise err
	
	subreddit_sentiments = defaultdict(list)
	subreddit_total = defaultdict(int)
	
	if 'rows' in query_response:
		rows = query_response['rows']
		sentiments = []
		for row in rows:
			subreddit = row['f'][0]['v']
			body = row['f'][1]['v']
			score = int(row['f'][2]['v'])
			sentiment_values = []
			
			lines_list = tokenize.sent_tokenize(body)
			for sentence in lines_list:
				if phrase.upper() in sentence.upper():#(regex.search(sentence)):
					s = sid.polarity_scores(sentence)
					sentiment_values.append(s['compound'])
		
			comment_sentiment = float(sum(sentiment_values))/len(sentiment_values)
			subreddit_sentiments[subreddit].append((comment_sentiment, score))
			subreddit_total[subreddit] += int(score)

	subreddit_sentiments = {subreddit:1 + float(sum([float(pair[0])*float(pair[1]) for pair in sentiment_list]))/subreddit_total[subreddit] for subreddit, sentiment_list in subreddit_sentiments.items()}
	result = sorted(subreddit_sentiments.items(), key = lambda(k,v): (-v,k))
	return json.dumps(result)
Example #12
 def get_unique_tweets(self, data_dict):
     # TODO: Implement filter to check if Tweet text starts with 'RT'
     """
     :param data_dict:
     :return:
     """
     flag = False
     try:
         text = data_dict['text'].encode('ascii', 'ignore').lower()
         # Check for 'retweeted_status' in metadata field to determine
         # if tweet is a retweet (1st check)
         if 'retweeted_status' not in data_dict:
             url_match = URL.match(text)
             # Check if link contains url
             if url_match:
                 match_group = url_match.group()
                 if len(self.key_list) > 0:
                     if any(match_group in item for item in self.key_list):
                         flag = True
                     if flag is False:
                         data_dict['text'] = match_group
                         print "Inserted text: " + data_dict['text'] + '\n'
                         self.key_list.append(match_group)
                         sid = SentimentIntensityAnalyzer()
                         ss = sid.polarity_scores(text)
                         print ss['compound']
                         score = ss['compound']
                         if score < 0:
                             score += (3 * score)
                         for w in GOOGLE:
                             if w in text and self.google_price >= 0:
                                 self.google_price = score
                                 self.google_text = text
                         for w in MICROSOFT:
                             if w in text and self.microsoft_price >= 0:
                                 self.microsoft_price = score
                                 self.microsoft_text = text
                         for w in FACEBOOK:
                             if w in text and self.facebook_price >= 0:
                                 self.facebook_price = score
                                 self.facebook_text = text
                         p.trigger('test_channel', 'my_event',
                                   {'google': self.google_price,
                                    'microsoft': self.microsoft_price,
                                    'facebook': self.facebook_price})
                         p.trigger('tweet_channel', 'my_event',
                                   {
                                       'google_text': self.google_text,
                                       'microsoft_text': self.microsoft_text,
                                       'facebook_text' : self.facebook_text
                                   })
                         self.google_price = 0
                         self.microsoft_price = 0
                         self.facebook_price = 0
                 else:
                     self.key_list.append(url_match.group())
     except TypeError, e:
         print >> sys.stderr, e
         self.log_error(str(e))
 def avg_message_sentiment_helper(self, message):
     sentences = tokenize.sent_tokenize(message)
     sid = SentimentIntensityAnalyzer()
     sentence_sentiments = []
     for sentence in sentences:
         ss = sid.polarity_scores(sentence)
         sentence_sentiments.append(ss['compound'])
     return np.mean(sentence_sentiments)
    def vader(self):
        sid = SentimentIntensityAnalyzer()
        results = {'neg': 0.0, 'pos': 0.0, 'neu': 0.0, 'compound': 0.0}
        ss = sid.polarity_scores(self.text)
        for k in sorted(ss):
            results[k] += ss[k]

        return results
Example #15
class Vader:

    def __init__(self):
        self.analyzer = SentimentIntensityAnalyzer()
    
    def __call__(self, text):
        scores = self.analyzer.polarity_scores(text)
        return scores['pos'] - scores['neg']
def sentimentScore(sentences):
	analyzer = SentimentIntensityAnalyzer()
	results = []
	for sentence in sentences:
		vs = analyzer.polarity_scores(sentence)
		print("vs: " + str(vs))
		results.append(vs)
	return results
    def get_mean_sentiment(self, exclude_neutral=True):
        sid = SentimentIntensityAnalyzer()
        tot_score = 0
        if exclude_neutral:
            message_count = 0
            for sentence in self.get_message_lst():
                ss = sid.polarity_scores(sentence)
                if ss['compound'] != 0:
                    tot_score += ss['compound']
                    message_count += 1
        else:
            for sentence in self.get_message_lst():
                ss = sid.polarity_scores(sentence)
                tot_score += ss['compound']
            message_count = len(self.get_message_lst())

        return tot_score / message_count
Example #18
    def computeVaderScore(self,sentence):
        sid = SentimentIntensityAnalyzer()
        ss = sid.polarity_scores(sentence)
        retList = []
        for k in sorted(ss):
            retList.append(ss[k])

        return retList
    def negative_msg_count(self):
        sid = SentimentIntensityAnalyzer()
        msg_count = 0
        for sentence in self.get_message_lst():
            ss = sid.polarity_scores(sentence)
            if ss['compound'] < 0:
                msg_count += 1

        return msg_count
def get_sentiment_score(sentence):
    score_dict = {}
    sid = SentimentIntensityAnalyzer()
    ss = sid.polarity_scores(sentence)
    for k in sorted(ss):
        # print('{0}: {1}, '.format(k, ss[k]), end='')
        score_dict[k] = ss[k]

    return score_dict
Example #21
def sentiment(sentence):
		sid = SentimentIntensityAnalyzer()
		ss = sid.polarity_scores(sentence)
		if float(ss['neg']) > float(ss['pos']):
			return -1 * float(ss['neg'])
		elif float(ss['neg']) < float(ss['pos']):
			return float(ss['pos'])
		else:
			return 0
	def process(self, tup):

		# extract the sentence
		sentence = tup.values[0]  

		sid = SentimentIntensityAnalyzer()
		ss = sid.polarity_scores(sentence)
		tuple_result = (str(ss['neg']),str(ss['pos']),str(ss['neu']))
		self.emit(tuple_result)
def getSentiments(sentences):
    from nltk import tokenize

    sid = SentimentIntensityAnalyzer()
    sentiments = [None] * len(sentences)
    i = 0
    for sentence in sentences:
        ss = sid.polarity_scores(sentence)
        sentiments[i] = ss["compound"]
        i = i + 1
    return sentiments
def vader_sentiment_scores(text_array):
    sid = SentimentIntensityAnalyzer()
    assert all([type(t) == type('') for t in text_array])
    vs_dict = {'neg': [], 'neu': [], 'pos': [], 'compound': []}
    for i, text in enumerate(text_array):
        if i % 10000 == 0:
            print(i)
        vs = sid.polarity_scores(text)
        for key, value in vs.items():
            vs_dict[key].append(value)
    return vs_dict
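A quick usage sketch for vader_sentiment_scores with a made-up list of strings (assumes the vader_lexicon resource has been downloaded):

texts = [
    "I love this product, it works great!",
    "This is the worst service I have ever used.",
    "The package arrived on Tuesday.",
]
scores = vader_sentiment_scores(texts)
print(scores['compound'])  # one compound score per input text, in order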
def personAvgSentiment(person, month = None, year = None):
	from nltk.sentiment.vader import SentimentIntensityAnalyzer
	comp = 0
	sid = SentimentIntensityAnalyzer()
	if month:
		msgLst = fullMessageListMonth(person, month, year)
	else:	
		msgLst = fullMessageList(person)
	for message in msgLst:
		sentimentDict = sid.polarity_scores(message)
		comp += sentimentDict['compound']
	return comp/len(msgLst) if len(msgLst) != 0 else 0
def sentiAnalyze():
    analyzer = SentimentIntensityAnalyzer()


    for candidate in tweets:
        for week in tweets[candidate]:
            for tweet in tweets[candidate][week]:
                if candidate not in result['sum']:
                    result['sum'][candidate] = {}
                if 'total' not in result['sum'][candidate]:
                    result['sum'][candidate]['total'] = 0
                if week not in result['sum'][candidate]:
                    result['sum'][candidate][week] = 0

                result['sum'][candidate][week] += 1
                result['sum'][candidate]['total'] += 1


                score = analyzer.polarity_scores(tweet[2])
                if (score['pos'] - score['neg']) > DISTINCT_THRESHOLD:
                    if candidate not in pos_tweets:
                        pos_tweets[candidate] = {}
                    if week not in pos_tweets[candidate]:
                        pos_tweets[candidate][week] = []
                    pos_tweets[candidate][week].append(tweet)
                   
                    if candidate not in result['pos']:
                        result['pos'][candidate] = {}
                    if 'total' not in result['pos'][candidate]:
                        result['pos'][candidate]['total'] = 0
                    if week not in result['pos'][candidate]:
                        result['pos'][candidate][week] = 0
                    result['pos'][candidate][week] += 1
                    result['pos'][candidate]['total'] += 1

                if (score['neg'] - score['pos']) > DISTINCT_THRESHOLD:
                    if candidate not in neg_tweets:
                        neg_tweets[candidate] = {}
                    if week not in neg_tweets[candidate]:
                        neg_tweets[candidate][week] = []
                    neg_tweets[candidate][week].append(tweet)

                    if candidate not in result['neg']:
                        result['neg'][candidate] = {}
                    if 'total' not in result['neg'][candidate]:
                        result['neg'][candidate]['total'] = 0
                    if week not in result['neg'][candidate]:
                        result['neg'][candidate][week] = 0
                    result['neg'][candidate][week] += 1
                    result['neg'][candidate]['total'] += 1

    return
def vader_sentiment(df):

        sith = SentimentIntensityAnalyzer()
        
        sentiment = []
        for sentence in df.Message:
                sent = sith.polarity_scores(sentence)
                #sent_total = sent['pos'] - sent['neg']

                sentiment.append(sent['compound'])
  
        df['sentiment'] = sentiment
        return df
def ProcessReviews(df, ptype):
    parse_type = ptype

    # Divide reviews into individual sentences
    sentences = df['text'].apply(tokenizetext)

    # Stick the sentences back into the dataframe
    df['sentlist'] = sentences
    d1, d2, d3 = [], [], []
    d4, d5, d6 = [], [], []

    # Initialize the sentiment vader analyzer
    sid = SentimentIntensityAnalyzer()

    # Loop over sentences and process them
    for i in range(0, df.shape[0]):
        sent_list = df['sentlist'][i]
        for sentence in sent_list:
            sent_raw = ''.join(sentence)
            sent_pro = strip_punctuation(sent_raw)
            sent_pro = rmstopwords(sent_pro)
            sent_pro = lemmatize(sent_pro)
            sentiment = sid.polarity_scores(sent_raw)['compound']
            if parse_type[0] == 'ngram':
                pos = ngrams(sent_pro, ptype[1])
            elif parse_type == 'chunk':
                pos = extract_candidate_chunks(sent_pro)
            elif parse_type == 'rake':
                pos = rake_object.run(sent_raw)
                pos = ['_'.join(word[0].split()) for word in pos]
            for j in pos:
                d1.append(df['date'][i])
                d2.append(df['location'][i])
                d3.append(df['rating'][i])
                d4.append(j)
                d5.append(sentiment)
                d6.append(sent_raw)

    # Put everything in a dataframe
    processed_df = pd.DataFrame()
    processed_df['date']      = d1
    processed_df['location']  = d2
    processed_df['rating']    = d3
    processed_df['aspects']   = d4
    processed_df['sentiment'] = d5
    processed_df['context']   = d6

    # Remove any entry where the sentence
    # was determined to be neutral
    processed_df = processed_df[(processed_df['sentiment'] != 0)]
    return processed_df
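ProcessReviews calls several preprocessing helpers (tokenizetext, strip_punctuation, rmstopwords, lemmatize) that are not included above. A rough sketch of what they might do, assuming the standard NLTK resources (punkt, stopwords, wordnet) are installed; these are illustrative stand-ins rather than the original implementations:

import string
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def tokenizetext(text):
    # split a review into sentences
    return sent_tokenize(text)

def strip_punctuation(sentence):
    return sentence.translate(str.maketrans('', '', string.punctuation))

def rmstopwords(sentence):
    stop = set(stopwords.words('english'))
    return ' '.join(w for w in word_tokenize(sentence) if w.lower() not in stop)

def lemmatize(sentence):
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(w) for w in word_tokenize(sentence))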
Example #29
 def get_news_sentiment(self,team):
     print '------------------------------'
     print 'Scanning sentiment for: ',team
     print '------------------------------'
     story_limit=5 
     neg=0
     pos=0
     visible_text=''
     base_url='https://www.google.co.uk/search?hl=en&gl=uk&tbm=nws&authuser=0&q=football+european+championships+'+team.replace(' ','%20')
     req = urllib2.Request(base_url,headers = {'User-Agent': 'Mozilla/5.0'} )
     print base_url
     page = urllib2.urlopen(req)
     soup = BeautifulSoup(page)
     # remove scripts and tags
     for script in soup(["script", "style"]):
             script.extract()    # rip it out
     # find the links
     sentence_count=0
     story_count=0
     score=0
     for a in soup.findAll('a'):
         if story_count<story_limit:
             # ignore internal google links and remove the tracking guff from the end of the URL
             if ('google' not in a.attrs['href'] and '/search?q=football+european+championships' not in a.attrs['href'] and 'http://' in a.attrs['href']):
                 print a.attrs['href'].replace('/url?q=','').split('&')[0]
                 story_req=urllib2.Request(a.attrs['href'].replace('/url?q=','').split('&')[0],headers={'User-Agent': 'Mozilla/5.0'})
                 try:
                     story_page=urllib2.urlopen(story_req,timeout=5)
                     story_soup=BeautifulSoup(story_page)
                     # analyse text
                     visible_text = story_soup.getText()
                     neg=0
                     pos=0
                     sentences = tokenize.sent_tokenize(visible_text)
                     sid = SentimentIntensityAnalyzer()
                     # for each sentence in the story, get the sentiment/polarity
                     for sentence in sentences:
                         ss = sid.polarity_scores(sentence)
                         neg=neg+ss['neg']
                         pos=pos+ss['pos']
                     sentence_count=sentence_count+ len(sentences)
                     score=score+pos-neg
                     # print out a story by story sentiment for logging
                     print 'Sentiment: ',(pos-neg)/len(sentences)
                 except socket.timeout as e:
                     print type(e)    
                 except urllib2.HTTPError:
                     print 'failed on url: ',a.attrs['href']
                 story_count=story_count+1 
     # return the average net polarity/sentiment across all stories
     return score/sentence_count
     
Example #30
def clean_df(user_df):
    user_df['simple_text'] = user_df['text'].apply(lambda x: remove_links(x))

    #A warning will appear if Twython is not installed
    sid = SentimentIntensityAnalyzer()
    user_df['sentiment'] = user_df['text'].apply(lambda x: sid.polarity_scores(x))

    #keep only fairly positive tweets
    pos_user_tweets = user_df[user_df['sentiment'].apply(lambda x: x['neg']) < .45]

    pos_user_tweets['stemmed_text'] = pos_user_tweets['simple_text'].apply(lambda tweet: remove_punctuation(tweet))

    pos_user_tweets['stripped_text'] = pos_user_tweets['stemmed_text'].apply(lambda tweet: remove_irrelevant_terms(tweet))
    return pos_user_tweets
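clean_df depends on remove_links, remove_punctuation, and remove_irrelevant_terms, which are not shown. A minimal sketch under the assumption that they strip URLs, punctuation, and a small custom term list (IRRELEVANT_TERMS here is a made-up placeholder):

import re
import string

IRRELEVANT_TERMS = {'rt', 'amp'}  # placeholder; the original term list is not known

def remove_links(text):
    return re.sub(r'https?://\S+', '', text)

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_irrelevant_terms(text):
    return ' '.join(w for w in text.split() if w.lower() not in IRRELEVANT_TERMS)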
Example #31
blanks = []  # start with an empty list

for i, lb, rv in df.itertuples():  # iterate over the DataFrame
    if type(rv) == str:  # avoid NaN values
        if rv.isspace():  # test 'review' for whitespace
            blanks.append(i)  # add matching index numbers to the list

df.drop(blanks, inplace=True)

df['label'].value_counts()

from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

df['scores'] = df['review'].apply(lambda review: sid.polarity_scores(review))

df['compound'] = df['scores'].apply(lambda score_dict: score_dict['compound'])

df['comp_score'] = df['compound'].apply(lambda c: 'pos' if c >= 0 else 'neg')

df.head()

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

accuracy_score(df['label'], df['comp_score'])

print(classification_report(df['label'], df['comp_score']))

print(confusion_matrix(df['label'], df['comp_score']))
len(dnc_doclist)

# make sure all tweets are unique
rnc_array = np.array(rnc_doclist)
len(np.unique(rnc_array))
dnc_array = np.array(dnc_doclist)
len(np.unique(dnc_array))

print(rnc_doclist[:1])

dnc_sentiment_list = list()
rnc_sentiment_list = list()

sid = SentimentIntensityAnalyzer()
for tweet in rnc_doclist:
    ss = sid.polarity_scores(tweet)
    rnc_sentiment_list.append(ss['compound'])
len(rnc_sentiment_list)
rnc_sentiment_list[:10]
rnc_avg_sentiment = np.mean(np.array(rnc_sentiment_list))
#0.1260588
rnc_stddev_sentiment = np.std(np.array(rnc_sentiment_list))
#0.43161457012311344
plt.hist(rnc_sentiment_list, bins=15, color='red')
plt.title("#RNCConvention2020 Tweet Sentiment Scores")
plt.xlabel('Sentiment Score')
plt.ylabel('Frequency')
plt.show()

for tweet in dnc_doclist:
    ss = sid.polarity_scores(tweet)
    dnc_sentiment_list.append(ss['compound'])
Example #33
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Create SentimentIntensityAnalyzer instance
sid = SentimentIntensityAnalyzer()

# Let's try it on one of our phone calls
call_2_text = transcribe_audio('call_2.wav')

# Display text and sentiment polarity scores
print(call_2_text)
print(sid.polarity_scores(call_2_text))

from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Create SentimentIntensityAnalyzer instance
sid = SentimentIntensityAnalyzer()

# Transcribe customer channel of call 2
call_2_channel_2_text = transcribe_audio('call_2_channel_2.wav')

# Display text and sentiment polarity scores
print(call_2_channel_2_text)
print(sid.polarity_scores(call_2_channel_2_text))

# Import sent_tokenize from nltk
from nltk import sent_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Create SentimentIntensityAnalyzer instance
sid = SentimentIntensityAnalyzer()
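The transcribe_audio helper called in these snippets is not defined here. A minimal sketch, assuming the SpeechRecognition package with the Google Web Speech backend (an assumption; the original helper may use a different service):

import speech_recognition as sr

def transcribe_audio(file_path):
    # load the full audio file and send it to the recognizer
    recognizer = sr.Recognizer()
    with sr.AudioFile(file_path) as source:
        audio = recognizer.record(source)
    return recognizer.recognize_google(audio)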
Example #34
for chapter in chapters[1:]:
    print("Chapter :", chapter[:2])

    # SentimentIntensityAnalyzer() is used to implement sentiment analysis
    # with NLTK to find the positive/negative meaning of the chapter.
    sid = SentimentIntensityAnalyzer()

    # If a stopword is found in the chapter, replace it with a blank so that it
    # is not counted when calculating the positive/negative values.
    for w in chapter.split(" "):
        if w in stopwords.words('english'):
            chapter = chapter.replace(w, " ", 1)

    # Calculate the positive, negative and neutral values using polarity_scores().
    ss = sid.polarity_scores(chapter)

    pos = ss["pos"]
    neg = ss["neg"]
    neu = ss["neu"]
    print("Positive ", pos, "Negative ", neg, "Neutral", neu)
	
    # Append the computed values to out, which stores a [positive, negative, neutral, line_no] entry per chapter.
    out.append([pos, neg, neu, line_no])
    line_no += 1

N = len(out)

# Considering only positive and negative values. 
p = [i[0] for i in out]
n = [i[1] for i in out]
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.downloader.download('vader_lexicon')
file = 'My_Posts.xlsx'
xl = pd.ExcelFile(file)
dfs = xl.parse(xl.sheet_names[0])
dfs = list(dfs['Your Posts'])
print(dfs)
sid = SentimentIntensityAnalyzer()
str1 = "AM"
str2 = "PM"
for data in dfs:
    a = data.find(str1)
    b = data.find(str2)
    if a == -1 and b == -1:
        ss = sid.polarity_scores(data)
        print(data)
        for k in ss:
            print(k, ss[k])
Example #36
class nltkWrapperClass:
    ## Initialize the class
    def __init__(self, out_dir):  ## private method
        nltk.download('stopwords')
        nltk.download('vader_lexicon')
        self.__out_dir = out_dir  ## private attribute
        self.sid = SentimentIntensityAnalyzer()  ## public attribute

    def __repr__(self):
        return repr(f'The output directory of this class is {self.__out_dir}')

    ## Public method to count word frequency
    def wordFreqCount(self, token_list):
        freq = nltk.FreqDist(token_list)
        return freq

    ## Public method to print out word frequency
    def wordFreqPrint(self, freq):
        for key, val in freq.items():
            print(f'key is: {key}, value is : {val}')

    ## Public method to plot out word frequency
    def wordFreqPlot(self, freq):
        freq.plot(20, cumulative=False)
        matplotlib.pyplot.show()

    ## Public method to clean up english stop words
    def stopwordClean(self, token_list):
        clean_token = []
        for token in token_list:
            if token not in stopwords.words('english'):
                clean_token.append(token)
        return clean_token

    ## Public method that scores each tweet in a list and returns a dictionary
    ## keyed by tweet id, where each value is a dictionary containing the tweet
    ## text and its sentiment score. Also takes a Verbose flag.
    def sentiScore(self, tweet_list, Verbose):
        sentiment_score_list = []
        Tweet_dict = {}
        for tweet in tweet_list:
            individual_tweet_dict = {}
            individual_tweet_dict['Text'] = tweet.text
            #Calculating sentiment scores
            individual_tweet_dict[
                'Sentiment_score'] = self.sid.polarity_scores(tweet.text)
            Tweet_dict[tweet.id] = individual_tweet_dict
            if Verbose:
                print(
                    f'Tweet is : {tweet.text}\nSentiment score is :{self.sid.polarity_scores(tweet.text)}\n\n'
                )
        return Tweet_dict

    ## Public method to filter positive tweets
    def posTweets(self, Tweet_dict, Verbose):
        Pos_Tweet_dict = {}
        for tweet in Tweet_dict:
            if Tweet_dict[tweet]['Sentiment_score']['compound'] > 0.05:
                Pos_Tweet_dict[tweet] = Tweet_dict[tweet]
                if Verbose:
                    print(
                        f"Positive tweet is : {Tweet_dict[tweet]['Text']}\nSentiment score is :{Tweet_dict[tweet]['Sentiment_score']}\n\n"
                    )
        return Pos_Tweet_dict

    ## Public method to filter negative tweets
    def negTweets(self, Tweet_dict, Verbose):
        Neg_Tweet_dict = {}
        for tweet in Tweet_dict:
            if Tweet_dict[tweet]['Sentiment_score']['compound'] < -0.05:
                Neg_Tweet_dict[tweet] = Tweet_dict[tweet]
                if Verbose:
                    print(
                        f"Negative tweet is : {Tweet_dict[tweet]['Text']}\nSentiment score is :{Tweet_dict[tweet]['Sentiment_score']}\n\n"
                    )
        return Neg_Tweet_dict

    ## Public method to save pos and neg tweet results
    def CataResultSave(self, Pos_Tweet_dict, Neg_Tweet_dict):
        pos_result = open(self.__out_dir + "Positive_Tweets_Results.txt", 'w')
        for tweet in Pos_Tweet_dict:
            pos_result.write(
                f"Positive tweet is : {Pos_Tweet_dict[tweet]['Text']}\nSentiment score is :{Pos_Tweet_dict[tweet]['Sentiment_score']}\n\n"
            )
        pos_result.close()

        neg_result = open(self.__out_dir + "Negative_Tweets_Results.txt", 'w')
        for tweet in Neg_Tweet_dict:
            neg_result.write(
                f"Negative tweet is : {Neg_Tweet_dict[tweet]['Text']}\nSentiment score is :{Neg_Tweet_dict[tweet]['Sentiment_score']}\n\n"
            )
        neg_result.close()
    def load_and_clean(self, data_path):
        # read data set
        df = pd.read_csv(data_path, sep='\t', header=None)
        df.columns = [
            'polarity', 'aspect_cat', 'target', 'offsets', 'sentence'
        ]

        # create label dictionaries to create column with category number

        aspect_lab = {
            "AMBIENCE#GENERAL": 0,
            "DRINKS#PRICES": 1,
            "DRINKS#QUALITY": 2,
            "DRINKS#STYLE_OPTIONS": 3,
            "FOOD#PRICES": 4,
            "FOOD#QUALITY": 5,
            "FOOD#STYLE_OPTIONS": 6,
            "LOCATION#GENERAL": 7,
            "RESTAURANT#GENERAL": 8,
            "RESTAURANT#MISCELLANEOUS": 9,
            "RESTAURANT#PRICES": 10,
            "SERVICE#GENERAL": 11
        }

        # do the same thing for polarity label

        pol_lab = {"positive": 0, "neutral": 1, "negative": 2}

        # create aspect category number column

        df.insert(loc=2,
                  column='cat_num',
                  value=df['aspect_cat'].map(aspect_lab))

        # create polarity category number column

        df.insert(loc=1, column='pol_num', value=df['polarity'].map(pol_lab))

        # create start and end column for indices of target words in sentence

        df.insert(loc=6,
                  column='start_end',
                  value=[
                      df.loc[i, "offsets"].split(':')
                      for i in range(np.shape(df)[0])
                  ])
        df["start"] = [
            int(df.loc[i, "start_end"][0]) for i in range(np.shape(df)[0])
        ]
        df["end"] = [
            int(df.loc[i, "start_end"][1]) for i in range(np.shape(df)[0])
        ]

        # split aspect categories and create column for each aspect category type

        df["aspect_cat"] = [
            df.loc[i, "aspect_cat"].split("#") for i in range(np.shape(df)[0])
        ]  # we split the 2 words.

        df["cat1"] = [
            df.loc[i, "aspect_cat"][0] for i in range(np.shape(df)[0])
        ]
        aspect_lab1 = {
            "AMBIENCE": 0,
            "DRINKS": 1,
            "FOOD": 2,
            "LOCATION": 3,
            "RESTAURANT": 4,
            "SERVICE": 5
        }
        df['cat1'] = df['cat1'].map(
            aspect_lab1
        )  # we fill 'cat1' with the numbers corresponding to each category.

        df["cat2"] = [
            df.loc[i, "aspect_cat"][1] for i in range(np.shape(df)[0])
        ]
        aspect_lab2 = {
            "GENERAL": 0,
            "PRICES": 1,
            "QUALITY": 2,
            "STYLE_OPTIONS": 3,
            "MISCELLANEOUS": 4
        }
        df['cat2'] = df['cat2'].map(
            aspect_lab2
        )  # we fill 'cat2' with the numbers corresponding to each category.

        # chop up sentences

        cut = ["but"]

        df["sentence_cut"] = df["sentence"]

        for i in range(np.shape(df)[0]):  # for every row of the dataset.
            for c in cut:
                df.loc[i, "sentence_cut"] = df.loc[i, "sentence_cut"].replace(
                    c, 'but')  # replace all the cut words by 'but'.

        df["sentence_cut"] = [
            df.loc[i, "sentence_cut"].split("but")
            for i in range(np.shape(df)[0])
        ]  # split each sentence at the but.

        list_sent = []  # create a list of all sentences.
        for i in range(np.shape(df)[0]):  # for every row of the dataset.
            list_sent.append([])
            for j in range(len(
                    df.loc[i,
                           "sentence_cut"])):  # for each fragment of the cut sentence.
                if df.loc[i, "target"] in df.loc[i, "sentence_cut"][j]:
                    list_sent[i].append(df.loc[i, "sentence_cut"][j])

        for i in range(np.shape(df)[0]):
            list_sent[i] = " ".join(list_sent[i])

        # list_sent = sum(list_sent, [])

        df["list_sent"] = list_sent

        ### Tokenization: we will generate BOW and ngrams for the whole sentences, then for the dependencies,
        ### and finally, for the words in the window of size 5 (distance from the target).
        ### We will also assign sentiment scores to each of these representations of the text.
        ### We first delete stop words but will test a model keeping them.
        ### In addition to word representations, we will generate POS variables.

        # load SpaCy for English

        nlp = spacy.load('en_core_web_sm')

        # number of positive words among the words that depend on the target

        pos_words = pd.read_excel(
            '/Users/philippehayat/Desktop/pos_words.xlsx')
        pos_words = list(pos_words.iloc[:, 0])

        # number of negative words among the words that depend on the target

        neg_words = pd.read_excel(
            '/Users/philippehayat/Desktop/neg_words.xlsx')
        neg_words = list(neg_words.iloc[:, 0])

        # create lemmatized list of words from whole sentence as well as list of pos

        lemma_list = []
        for i in range(np.shape(df)[0]):
            lemma_list.append([])
            for token in nlp(df.loc[i, 'list_sent']):
                if (token.is_punct == False) & (token.is_alpha == True):
                    lemma_list[i].append(
                        token.lemma_
                    )  # each words of every sentence of 'list_sent' is lemmatized.

        # number of positive words among all words in sentence

        pos_score2 = []
        for i in range(np.shape(df)[0]):
            pos_score2.append([])
            for j in lemma_list[i]:
                if j in pos_words:
                    pos_score2[i].append(1)

        pos_score2 = [
            sum(pos_score2[i]) for i in range(len(pos_score2))
        ]  # we count the number of positive words in each sentence.

        # number of negative words among all words in sentence

        neg_score2 = []
        for i in range(np.shape(df)[0]):
            neg_score2.append([])
            for j in lemma_list[i]:
                if j in neg_words:
                    neg_score2[i].append(1)

        neg_score2 = [
            sum(neg_score2[i]) for i in range(len(neg_score2))
        ]  # we count the number of negative words in each sentence.

        # create vocab set and POS set for 3 methods of selecting words

        vocab_all = list(set(sum(lemma_list, [])))
        #pos_cat_all = list(set(sum(pos_list, [])))

        # assign to columns

        df['lemma_list'] = lemma_list
        #df['pos_list'] = pos_list

        df['sentence2'] = [
            " ".join(df['lemma_list'][i]) for i in range(np.shape(df)[0])
        ]
        #df['pos_list2'] = [" ".join(df['pos_list'][i]) for i in range(np.shape(df)[0])]

        df['pos_score2'] = pos_score2

        df['neg_score2'] = neg_score2

        # BOW

        # BOW for all
        for i in vocab_all:
            df['_' + i] = df.sentence2.str.count(i)
        #for i in pos_cat_all:
        #df['_pos_' + i] = df.pos_list2.str.count(i)

        # Vader polarizer variable

        vader = SentimentIntensityAnalyzer(
        )  # vader gives a score between -1 and 1 to each sentence (that is why + 1).
        vader_all = [
            vader.polarity_scores(df['sentence2'][i])['compound'] + 1
            for i in range(df.shape[0])
        ]
        df['vader_all'] = vader_all

        return df
sqlContext = SQLContext(sc)
review = sqlContext.read.json("./dataset/review.json")
#review = review.withColumn("user_id", review.user_id.cast('string')).withColumn("business_id", review.business_id.cast('string')).withColumn("stars", review.stars.cast('float'))
#sentiment = coalesce((col("stars") >= 3.0).cast("int"), lit(1))
#review = review.withColumn("sentiment", sentiment)
#review = review.filter(review.user_id.isNotNull()).filter(review.business_id.isNotNull())

sentiments = sqlContext.sql(
    'SELECT *, case when stars <= 2.5 then 0 when stars >= 3.5 then 1 end as sentiment from review where stars<=2.5 or stars>= 3.5'
)

df1 = sentiments[['review_id', 'text', 'sentiment']]
#df1.show(1)
#df1.count()

#print(df1)
df2 = df1.rdd
#df2.take(1)

sid = SentimentIntensityAnalyzer()

#calculate the compound score for the review text and then use that to derive the sentiment.
#Then use this predicted sentiment and compare it with the actual sentiment to calculate accuracy.
df3 = df2.map(lambda x : (x[0],sid.polarity_scores(x[1])['compound'],x[2])).\
        map(lambda y: (y[0],1,y[2]) if y[1]>0 else (y[0],0,y[2])).\
        map(lambda z: 1 if(z[1] == z[2]) else 0)
#df3.count()
#df3.take(2)
total_count = df3.count()
accuracy = (df3.filter(lambda a: a == 1).count() * 1.0) / total_count
print(accuracy)
Example #39
#------------------------
#Getting scores of sentiments
#------------------------
# Scores of YouTube data-
data = pd.read_csv('youtube_data2_for_.csv')

compound = []
pos = []
neg = []
neu = []

for i in data.reviews_youtube:
    i = i.replace("\'", '')
    i = i.strip()
    #print(get_textBlob_score(i))
    ss = sid.polarity_scores(i)
    compound.append(ss['compound'])
    pos.append(ss['pos'])
    neg.append(ss['neg'])
    neu.append(ss['neu'])
    #print(ss)

data['compound'] = compound
data['pos'] = pos
data['neg'] = neg
data['neu'] = neu
#get_vader_score('bad')
maxValuesObj = data[['pos', 'neg']].idxmax(axis=1)
data['sentiments'] = maxValuesObj
data.to_csv('/Users/anilvyas/Desktop/Audace Labs/Rated_data/youtube_data.csv')
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# contains titles and release years associated with each ID
movie_titles = pd.read_csv('movie_titles.txt',
                           names=['col'],
                           engine='python',
                           sep='\t')
movie_titles = pd.DataFrame(movie_titles.col.str.split(',', 2).tolist(),
                            columns=['ID', 'Year', 'Name'])

# assign a sentiment score (-1 to +1) for each movie title
sid = SentimentIntensityAnalyzer()
movie_titles["Sentiment"] = ""
i = 0
for index, row in movie_titles.iterrows():
    title = row["Name"]
    score = sid.polarity_scores(title)['compound']
    # write back via .loc; assigning to the iterrows row copy would not modify the DataFrame
    movie_titles.loc[index, "Sentiment"] = score
    print(i)
    i = i + 1

# export to csv
movie_titles.to_csv('temp.csv')
accs_model2 = cross_val_score(logistic_regression, Xt, Y, scoring="accuracy", cv=5)
print("Accuracy of classifier two is " + str(round(np.mean(accs_model2), 3)))

#-----------------------------------------------------------------------------------------------------
# Model Three: use sentimentIntensityAnalyzer and DecisionTree to analysis the review text

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
 
# use Sentiment Intensity Analyzer to get polarity scores for each review text
reviews=df["Review Text"]
sentiment=[]
s=SentimentIntensityAnalyzer()
for x in reviews:
    score=s.polarity_scores(x)
    sentiment.append(score)
df_s=pd.DataFrame(sentiment)

# add polarity scores into the original data frame
df["pos_score"]=df_s["pos"]
df["neu_score"]=df_s["neu"]
df["neg_score"]=df_s["neg"]
df["comp_score"]=df_s["compound"]


predictors=["pos_score","neu_score","neg_score","comp_score"]
target="Rating_fresh"
cleaned_df = df.dropna()

# build the regression classifier
                                      sources='bbc-news,the-verge',
                                      domains='bbc.co.uk,techcrunch.com',
                                      # from_parameter=two_years_ago,
                                      # to=current_date,
                                      language='en',
                                      sort_by='relevancy',
                                      page=2)
	
	articles = newsdata['articles']
	articles_arr = []
	for article in articles:
		# use a fresh dict per article so earlier entries aren't overwritten in articles_arr
		article_dic = {}
		headline = article['title']
		article_dic['headline'] = headline
		# format of score is {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
		article_dic['headline_score'] = sia.polarity_scores(headline)
		description = article['description']
		article_dic['description'] = description
		article_dic['description_score'] = sia.polarity_scores(description)
		# format of date is 2018-04-13T00:46:59Z (UTC format)
		article_dic['publishedAt'] = article['publishedAt'] 
		article_dic['source'] = article['source']['name']

		comp_symb = company_symb[company].replace(".", "_").replace("-", "_")


		article_dic['stock_price_change'] = calculate_stock_price_change(comp_symb, article_dic['publishedAt'])
		
		
		# append article dict to array 
		articles_arr.append(article_dic)
Example #43
def move_NASDAQ():
    test_start_date = datetime.datetime.now() - datetime.timedelta(days=5)
    test_end_date = datetime.datetime.now()  #- datetime.timedelta(days = 1)
    df_stocks = pd.read_pickle('pickled_para_NDAQ.pkl')
    df_stocks['prices'] = df_stocks['close'].apply(np.int64)
    # selecting the prices and articles
    df_stocks = df_stocks[['prices', 'articles']]
    df_stocks['articles'] = df_stocks['articles'].map(lambda x: x.lstrip('.-'))
    df_stocks
    df = df_stocks[['prices']].copy()
    # Adding new columns to the data frame
    df["compound"] = ''
    df["neg"] = ''
    df["neu"] = ''
    df["pos"] = ''
    sid = SentimentIntensityAnalyzer()
    for date, row in df_stocks.T.iteritems():
        try:
            sentence = unicodedata.normalize('NFKD', df_stocks.loc[date,
                                                                   'articles'])
            ss = sid.polarity_scores(sentence)
            df.set_value(date, 'compound', ss['compound'])
            df.set_value(date, 'neg', ss['neg'])
            df.set_value(date, 'neu', ss['neu'])
            df.set_value(date, 'pos', ss['pos'])
        except TypeError:
            print(df_stocks.loc[date, 'articles'])
            print(date)

    test = df.ix[test_start_date:test_end_date]

    ##
    #test_start_date = '2018-03-09'
    #test_end_date = '2018-03-10'
    test = df.ix[test_start_date:test_end_date]

    ##

    # Calculating the sentiment score
    sentiment_score_list = []
    for date, row in test.T.iteritems():
        sentiment_score = np.asarray([
            df.loc[date, 'compound'], df.loc[date, 'neg'], df.loc[date, 'neu'],
            df.loc[date, 'pos']
        ])
        #sentiment_score = np.asarray([df.loc[date, 'neg'],df.loc[date, 'pos']])
        sentiment_score_list.append(sentiment_score)
        numpy_df_test = np.asarray(sentiment_score_list)

    filename = 'finalized_model_NDAQ.sav'
    loaded_model = pickle.load(open(filename, 'rb'))
    #loaded_model.fit(numpy_df_test,test['prices'])
    result = loaded_model.predict(numpy_df_test)

    difference = result[1] - result[0]
    if (difference > 0):
        sentence = "Stock Price will rise"
    else:
        sentence = "Stock Price will fall"
    param = {
        'q': ".IXIC",  # Stock symbol (ex: "AAPL")
        'i': "86400",  # Interval size in seconds ("86400" = 1 day intervals)
        'x':
        "INDEXNASDAQ",  # Stock exchange symbol on which stock is traded (ex: "NASD")
        'p': "1M"  # Period (Ex: "1Y" = 1 year)
    }
    df = get_price_data(param)
    df.to_csv('C:/Users/ansha/Anaconda3/FlaskApp/nasd1.csv')
    line = pd.read_csv("nasd1.csv", index_col=False)

    df = pd.DataFrame(data=line)

    dates = df['Unnamed: 0']

    o = df['Open']
    h = df['High']
    l = df['Low']
    c = df['Close']
    line_chart = pygal.Line(x_label_rotation=20,
                            x_labels_major_every=3,
                            show_minor_x_labels=False,
                            human_readable=True)
    line_chart.title = 'NASDAQ'
    line_chart.x_labels = map(str, dates)
    line_chart.add('Open', o)
    line_chart.add('High', h)
    line_chart.add('Low', l)
    line_chart.add('Close', c)
    graph_data = line_chart.render_data_uri()

    return render_template("an1.html",
                           data=sentence,
                           graph_data=graph_data,
                           data1=difference)
while (True):
    start_time = time.time()

    docs = src_coll.find({"nlp_flag": 1}).limit(1000)

    print("start processing 1000 items")

    for doc in docs:
        print("processing... {}".format(doc['search_word']))
        db.usa_tweets_collection.update_one({'_id': doc['_id']},
                                            {'$set': {
                                                'nlp_flag': 2
                                            }})

        ss = sid.polarity_scores(doc['mention'])

        neu_score = ss['neu']
        neg_score = ss['neg']
        pos_score = ss['pos']
        update_q = {
            '$inc': {
                'neu': neu_score,
                'pos': pos_score,
                'neg': neg_score
            }
        }

        # best_score = max(pos_score, neg_score, neu_score)
        # if neu_score == best_score:
        #     update_q = {'$inc': {'neu': 1}}
def tweet_data():
    try:
        # Open/Create a file to append data
        # If the file exists, then read the existing data from the CSV file.
        file_name = "tourism_" + datetime.now().strftime(
            "%d-%b-%Y") + "_data.csv"

        COLS = [
            'created_at', 'id', 'send_by', 'tweet_url', 'original_text',
            'trans', 'process', 'priority', 'type'
        ]

        if os.path.exists(file_name):
            df = pd.read_csv(file_name, header=0)
            pre_id = max(df["id"])
            print(pre_id)
        else:
            pre_id = 0
            df = pd.DataFrame(columns=COLS)
            print(pre_id)
        hndlr_lst = twitter_credential.handler_list
        # new_entry = []
        for name in hndlr_lst:
            for tweet in tweepy.Cursor(
                    api.search,
                    q=name,
                    count=100,
                    # lang="en",
                    since=datetime.now().strftime("%Y-%m-%d"),
                    since_id=pre_id,
                    # max_id = pre_id
                    # until= datetime.now().strftime("%Y-%m-%d")
            ).items():

                # # tweet URL
                tweet_url = f"https://twitter.com/{tweet.user.screen_name}/status/{tweet.id}"

                # google tranglater
                translator = Translator()
                trans = translator.translate(tweet.text).text

                # cleaning data
                process = p.clean(trans)
                process = re.sub(r':', '', process)
                process = re.sub(r'…', '', process)

                # vader
                sen_analyser = SentimentIntensityAnalyzer()
                polarity_scores = sen_analyser.polarity_scores(process)
                print(tweet.id)
                compnd = polarity_scores['compound']
                if compnd >= 0.05:
                    polarity = polarity_scores['pos']
                    polarity_type = "positive"
                elif compnd <= -0.05:
                    polarity = polarity_scores['neg']
                    polarity_type = "negative"
                else:
                    polarity = polarity_scores['neu']
                    polarity_type = "neutral"

                new_entry = [
                    tweet.created_at, tweet.id, tweet.user.screen_name,
                    tweet_url, tweet.text, trans, process, polarity,
                    polarity_type
                ]
                # print(new_entry)

                single_tweet_df = pd.DataFrame([new_entry], columns=COLS)
                df_final = df.append(single_tweet_df, ignore_index=True)
                df = pd.DataFrame(data=df_final, columns=COLS)
                df.to_csv(file_name)

        # print("Got all the tweet.")
    except tweepy.TweepError as e:
        print(str(e))
        print("Something went wrong.")
    def features(self, source, reply):
        tokenizer = PreprocessTwitter()
        featset = []
        hc = []
        c = 0


        if "text" not in source.keys():
            s_text_raw = source["full_text"].lower()
        else:
            s_text_raw = source["text"].lower()

        if "text" not in reply.keys():
            r_text_raw = reply["full_text"].lower()
        else:
            r_text_raw = reply["text"].lower()

        s_text = tokenizer.tokenize(s_text_raw)
        r_text = tokenizer.tokenize(r_text_raw)

        s_id = source["id"]
        r_id = reply["id"]

        s_words = s_text.split(" ")
        r_words = r_text.split(" ")

        s_vector = self.tweet_vector(s_words)
        r_vector = self.tweet_vector(r_words)


        #similarity between source and reply
        sourceSim = cosine(s_vector, r_vector)
        hc.append(sourceSim)


        ###has url? 
        url = 0.
        if "<url>" in r_text:
            url = 1.
        hc.append(url)

        ###ends with question mark
        ewqm = 0.
        if r_text[-1] == "?":
            ewqm = 1.
        hc.append(ewqm)

        ###is reply?
        is_reply = 0.
        if s_id != r_id:
            is_reply = 1.
        hc.append(is_reply)

        ###supporting similarity
        sup_vector = self.tweet_vector(self.support_terms)
        supSim = cosine(r_vector, sup_vector)
        hc.append(supSim)

        ###contains Wh question
        wh = 0.
        if "who" in r_words or "where" in r_words or "why" in r_words or "what" in r_words:
            wh = 1.
        hc.append(wh)

        ###dontyou
        dontyou = 0.
        if "don't you" in r_text:
            dontyou = 1.
        hc.append(dontyou)

        ###arentyou
        arentyou = 0.
        if "aren't you" in r_text:
            arentyou = 1.
        hc.append(arentyou)

        ###has replies
        has_replies = 0.
        if reply["in_reply_to_status_id"] != "null":
            has_replies = 1.
        hc.append(has_replies)

        ###sentiment features (Vader)

        sentAnalyser = SentimentIntensityAnalyzer()

        scores = sentAnalyser.polarity_scores(r_text_raw)

        hc.append(scores["neg"])
        hc.append(scores["pos"])
        hc.append(scores["neu"])
        hc.append(scores["compound"])

        scores = sentAnalyser.polarity_scores(s_text_raw)

        hc.append(scores["neg"])
        hc.append(scores["pos"])
        hc.append(scores["neu"])
        hc.append(scores["compound"])
        
        ###TODO: check negation features -- Stanford

        ###has negation

        ###average negation



        ###has slang/curse word
        hasVulgar = 0.

        ###has Google bad word
        hasGoogleBadWords = 0.

        ###has acronyms
        hasAcro = 0.

        ###average word length
        wordCount = 0.
        wordLen = 0.
        avWL = 0.

        for token in r_words:
            if token[0] != "<" and token[-1] != ">":
                wordLen = wordLen + float(len(token))
                wordCount = wordCount + 1.
            if token in self.acronyms:
                hasAcro = 1.
            if token in self.vulgarWords:
                hasVulgar = 1.
            if token in self.googleBadWords:
                hasGoogleBadWords = 1.
        if wordCount != 0.:
            avWL = wordLen/wordCount
        
        hc.append(avWL)
        hc.append(hasAcro)
        hc.append(hasVulgar)
        hc.append(hasGoogleBadWords)

        ###surprise score
        surprise_vector = self.tweet_vector(self.surpriseWords)
        surprise_score = cosine(r_vector, surprise_vector)
        hc.append(surprise_score)

        ###doubt Score
        doubt_vector = self.tweet_vector(self.doubtWords)
        doubt_score = cosine(r_vector, doubt_vector)
        hc.append(doubt_score)
        
        ###nodoubt score
        no_doubt_vector = self.tweet_vector(self.noDoubtWords)
        no_doubt_score = cosine(r_vector, no_doubt_vector)
        hc.append(no_doubt_score)

        ###has question mark
        ###number question mark
        numberQM = r_text_raw.count("?")
        hasQM = 0.
        if numberQM > 0:
            hasQM = 1.
        hc.append(hasQM)
        hc.append(float(numberQM))

        ###has exclamation mark
        ###number exclamation mark
        numberEM = r_text_raw.count("!")
        hasEM = 0.
        if numberEM > 0:
            hasEM = 1.
        hc.append(hasEM)
        hc.append(float(numberEM))

        ###has dot dot dot
        ###number dot dot dot
        numberDDD = r_text_raw.count("...")
        hasDDD = 0.
        if numberDDD > 0:
            hasDDD = 1.
        hc.append(hasDDD)
        hc.append(float(numberDDD))

        ###originality (tweets counts)
        tweet_count = float(reply["user"]["statuses_count"])
        hc.append(tweet_count)

        ###is Verified
        isVerified = 0.
        if reply["user"]["verified"] != "false":
            isVerified = 1.
        hc.append(isVerified)

        ###number of accounts the user follows (Twitter's "friends_count")
        followees = float(reply["user"]["friends_count"])
        hc.append(followees)

        ###role: ratio of followees to followers ("followers_count")
        followers = float(reply["user"]["followers_count"])
        role = 0.
        if followers != 0.:
            role = followees/followers
        hc.append(role)

        #TODO: understand engagement features
        #import time
        #res = time.strptime(reply["user"]["created_at"], "%a %b %d %H:%M:%S +0000 %Y")
        #print(res)

        ###engagement

        ###engagement favorite


        ###public list membership count
        public_list = float(reply["user"]["listed_count"])
        hc.append(public_list)

        ###has geo enabled
        geo = 0.
        if reply["user"]["geo_enabled"] != "false":
            geo = 1.
        hc.append(geo)


        ###has description
        description = reply["user"]["description"]
        hasDesc = 0.
        len_desc = 0.
        if description is not None and description.strip() != "":
            hasDesc = 1.
            len_desc = float(len(description.split(" ")))
        hc.append(hasDesc)


        ###length of description
        hc.append(len_desc)

        ###pattern 1
        pattern1 = Heuristics.pattern1(r_text_raw)
        hc.append(pattern1)
        
        ###pattern 2
        pattern2 = Heuristics.pattern2(r_text_raw)
        hc.append(pattern2)

        ###pattern 3
        pattern3 = Heuristics.pattern3(r_text_raw)
        hc.append(pattern3)

        ###pattern 4
        pattern4 = Heuristics.pattern4(r_text_raw)
        hc.append(pattern4)

        ###pattern 5
        pattern5 = Heuristics.pattern5(r_text_raw)
        hc.append(pattern5)

        ###pattern 6
        pattern6 = Heuristics.pattern6(r_text_raw)
        hc.append(pattern6)

        ###pattern 7
        pattern7 = Heuristics.pattern7(r_text_raw)
        hc.append(pattern7)

        ###pattern 8
        pattern8 = Heuristics.pattern8(r_text_raw)
        hc.append(pattern8)

        ###pattern 9
        pattern9 = Heuristics.pattern9(r_text_raw)
        hc.append(pattern9)

        ###pattern 10
        pattern10 = Heuristics.pattern10(r_text_raw)
        hc.append(pattern10)
        
        ###emoticons
        r_text_noURL = re.sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>", r_text_raw, flags=re.MULTILINE | re.DOTALL)

        for emo in self.emoticons.keys():
            if emo in r_text_noURL:
                self.emoticons_cat[self.emoticons[emo]] = 1.

        for emo_cat in self.emoticons_cat:
            hc.append(emo_cat)


        hc_vector = np.array(hc)


        aux_vector = np.append(s_vector, r_vector)
        final_vector = np.append(aux_vector, hc_vector)
        
        final_vector_scaler = self.scaler.transform(final_vector.reshape(1, -1))

        return final_vector_scaler
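features() above depends on helpers such as self.tweet_vector() and cosine() that are not part of this snippet. Under the assumption that tweets are represented by averaged word embeddings, a minimal sketch of what those helpers might look like is shown below (word_vectors and dim are hypothetical names):

import numpy as np
from scipy.spatial.distance import cosine as cosine_distance

def tweet_vector(words, word_vectors, dim=300):
    """Average the vectors of known words; return a zero vector if none are known."""
    vecs = [word_vectors[w] for w in words if w in word_vectors]
    return np.mean(vecs, axis=0) if vecs else np.zeros(dim)

def cosine(u, v):
    """Cosine similarity (1 = same direction, 0 = orthogonal or zero vector)."""
    if not np.any(u) or not np.any(v):
        return 0.0
    return 1.0 - cosine_distance(u, v)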
def sentiment_analysis(message):
    sia = SIA()
    p_score = sia.polarity_scores(message)
    p = p_score['compound']
    speedometer(p)
Exemple #48
0
def get_sentiments(text):

    analyzer = SentimentIntensityAnalyzer()
    return analyzer.polarity_scores(text)
    lambda x: " ".join(word for word in x.split() if word not in n_req))

## Subjectivity & polarity of each review row ##
from textblob import TextBlob

dataset['polarity'] = dataset['review'].apply(
    lambda x: TextBlob(x).sentiment.polarity)
dataset['subjectivity'] = dataset['review'].apply(
    lambda x: TextBlob(x).sentiment.subjectivity)

## Finding sentiment with the VADER sentiment analyzer ##
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
sid.polarity_scores(dataset.iloc[4]['review'])

dataset['vad_scores'] = dataset['review'].apply(
    lambda review: sid.polarity_scores(review))
dataset['vad_compound'] = dataset['vad_scores'].apply(lambda d: d['compound'])
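If the separate neg/neu/pos components are also wanted as columns, the score dictionaries created above can be expanded in the same style (a small sketch reusing the 'vad_scores' column):

for key in ['neg', 'neu', 'pos']:
    # bind key as a default argument so each lambda keeps its own component name
    dataset[f'vad_{key}'] = dataset['vad_scores'].apply(lambda d, k=key: d[k])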

### Finding correlations in the data ###
corrmat = dataset.corr()
print(corrmat)

## Finding the most common words in the corpus ##
review_str = " ".join(dataset.review)
text = review_str.split()

from collections import Counter
counter = Counter(text)
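Counter exposes the most frequent tokens directly, so the top of the vocabulary is one call away:

print(counter.most_common(10))  # ten most frequent words and their counts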
Exemple #50
0
def main():

    # Initialize objects
    gen = document_generator()
    nlp = spacy.load('en', disable=['tagger', 'parser', 'textcat'])
    dic = pyphen.Pyphen(lang='en')
    tok = RegexpTokenizer(r'\w+')
    sid = SentimentIntensityAnalyzer()
    genre_result = []

    # Tagsets
    POS_tags = {"''", '(', ')', ',', '--', '.', ':', 'CC', 'CD', 'DT', 'EX', 
                 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 
                 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 
                 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 
                 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', '``', '$', '#'}
    entity_tags = {'PERSON', 'NORP', 'FACILITY', 'ORG', 'GPE', 'LOC', 'PRODUCT', 
                  'EVENT', 'WORK_OF_ART', 'LAW', 'LANGUAGE', 'DATE', 'TIME',
                  'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL', 'FAC'}

    # Generate features
    # TODO When the generator yields 0 save the progress
    for file in gen:

        # Save file if all the documents from a genre are generated
        if 'genre' == file[0]:
            genre_result = np.array(genre_result)
            genre_result = pd.DataFrame(data=genre_result)
            genre_result.to_csv('/datastore/10814418/preprocessed_' + 
                                str(file[1]) + '.csv', index=False)
            genre_result = []
            continue

        # Check file if the file is non-empty and name variables
        text = file[0]
        info = np.array([file[2], file[1]])
        index = file[3]
        if len(text) < min_len:
            continue

        # POS-tags
        empty_counter = {key: 0 for key in POS_tags}
        tags = nltk.pos_tag(nltk.word_tokenize(text))
        tags_counter = Counter(tag for w,tag in tags)
        final_dict = {**empty_counter, **dict(tags_counter)}
        sorted_items = sorted(final_dict.items())
        keys = [item[0] for item in sorted_items]
        tag_count = np.array([item[1] for item in sorted_items])
        
        # Entities
        empty_counter = {key: 0 for key in entity_tags}
        doc = nlp(text)
        entity_counter = Counter(ent.label_ for ent in doc.ents)
        final_dict = {**empty_counter, **dict(entity_counter)}
        sorted_items = sorted(final_dict.items())
        ent_count = np.array([item[1] for item in sorted_items])

        # Sentence, word and syllable count
        n_sent = len(nltk.sent_tokenize(text)) 
        words = tok.tokenize(text)
        n_word = len(words)
        syllables = [dic.inserted(word) for word in words]
        syllable_list = [len(re.findall('-', word)) + 1 for word in syllables] 
        n_syl = sum(syllable_list)
        syntax_count = np.array([n_sent, n_word, n_syl])

        # Readability scores: Flesch reading ease and Flesch-Kincaid grade level
        flesch = 206.835-1.015*(n_word/n_sent)-84.6*(n_syl/n_word)
        flesch_kincaid = 0.39*(n_word/n_sent)+11.8*(n_syl/n_word)-15.59
        readability_score = np.array([flesch, flesch_kincaid])

        # Sentiment
        score_dic = sid.polarity_scores(text)
        sentiment = np.array([score_dic['neg'], score_dic['neu'], 
                              score_dic['pos'], score_dic['compound']])

        # Concat all features
        instance_result = np.concatenate([tag_count, ent_count, syntax_count, 
                                          readability_score, sentiment, info])
        genre_result.append(instance_result)
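The readability block in main() uses the standard Flesch reading ease and Flesch-Kincaid grade formulas. Pulled out into a stand-alone helper (a sketch, not part of the original script), the computation is just:

def readability_scores(n_sent, n_word, n_syl):
    """Flesch reading ease and Flesch-Kincaid grade level from raw counts."""
    reading_ease = 206.835 - 1.015 * (n_word / n_sent) - 84.6 * (n_syl / n_word)
    grade_level = 0.39 * (n_word / n_sent) + 11.8 * (n_syl / n_word) - 15.59
    return reading_ease, grade_level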
data['label'].value_counts()

data.dropna(inplace=True)

empty = []
for i, lb, rv in data.itertuples():
    if (type(rv) == str):
        if rv.isspace():
            empty.append(i)

data.drop(empty, inplace=True)

data['label'].value_counts()

sid.polarity_scores(data.loc[0]['review'])

data['scores'] = data['review'].apply(
    lambda review: sid.polarity_scores(review))
data.head()

data['compound'] = data['scores'].apply(
    lambda score_dict: score_dict['compound'])
data.head()

data['comp_score'] = data['compound'].apply(lambda c: 'pos'
                                            if c >= 0 else 'neg')
data.head()

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
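With the VADER-derived 'comp_score' column in place, the metrics imported above can be applied directly, assuming the 'label' column uses the same 'pos'/'neg' values (a sketch):

print(accuracy_score(data['label'], data['comp_score']))
print(confusion_matrix(data['label'], data['comp_score']))
print(classification_report(data['label'], data['comp_score']))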
Exemple #52
0
def getSentiment() : 
        
    with open("data/sample_twitter_data_2020-06-20_to_2020-08-06.pk",'rb') as f : 
        conora19_content = pickle.load(f)
        
    conora19_content = [str(doc) for doc in conora19_content]  ### convert every document to str
    print(type(conora19_content))


    #### "https://" 제거하기 
    pattern2 = re.compile(r"\b(https?:\/\/)?([\w.]+){1,2}(\.[\w]{2,4}){1,2}(.*)")
    clean_conora19_content = [pattern2.sub("", doc) for doc in conora19_content]


    #####  "\n"  제거하기 
    pattern1 = re.compile("\n")
    clean_conora19_content = [pattern1.sub("",doc) for doc in clean_conora19_content]

    #### "특수문자"  제거하기 ex> ?,*, !...... 
    pattern4 = re.compile("[\{\}\[\]\/?.,;:|\)*~`!^\-_+<>@\#$%&\\\=\(\'\"]") ##특수 문자 제거
    clean_conora19_content = [pattern4.sub("",doc) for doc in clean_conora19_content]



    #########  English stopwords
    stops = set(stopwords.words("english"))

    ######## also filter custom words (domain-specific noise, not just common stopwords)

    stops.add('The')
    stops.add('said')
    stops.add('people')
    stops.add('also')
    stops.add('would')
    stops.add('\n')
    stops.add('ANALYSIS/OPINION:') 

    # note: each element here is an entire document, so this only drops documents
    # that exactly match a stopword or are a single character long
    clean_conora19_content = [word for word in clean_conora19_content if word not in stops and len(word) > 1]

    clean_conora19 = []

    ##### convert uppercase to lowercase
    for text in clean_conora19_content :
        clean_conora19.append(text.lower()) ## lower() converts to lowercase
        
    print(clean_conora19[1])

    ########### sentiment classification

    nltk.download('vader_lexicon')  ## download the VADER sentiment lexicon
    sid = SentimentIntensityAnalyzer() ### create the NLTK sentiment analyzer

    sentiment = sid.polarity_scores(clean_conora19[1]) ## apply to a single article's contents
    print("neg sum:{}, neu sum:{}, pos sum:{}".format(sentiment['neg'],sentiment['neu'],sentiment['pos']))

    #### apply sentiment analysis to every article and store the results

    total_sentiment = []
    for content in clean_conora19 :
        total_sentiment.append(sid.polarity_scores(content))
        
    #### aggregate sentiment scores over all articles

    total_neg = 0.0
    total_neu = 0.0 
    total_pos = 0.0
    print("len:",len(total_sentiment))

    for sentiment in total_sentiment : 
        total_neg = total_neg + float(sentiment['neg'])
        total_neu = total_neu + float(sentiment['neu'])
        total_pos = total_pos + float(sentiment['pos'])

    print("neg sum:{}, neu sum:{}, pos sum:{}".format(total_neg/len(total_sentiment),total_neu/len(total_sentiment),total_pos/len(total_sentiment)))
    
    return [len(total_sentiment),total_neg/len(total_sentiment),total_neu/len(total_sentiment),total_pos/len(total_sentiment)]

#if __name__ == '__main__':

#     sentiment = getSentiment()
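A hypothetical caller could unpack the summary list returned by getSentiment() like this (a sketch; the commented-out __main__ guard above hints at this usage):

if __name__ == '__main__':
    n_docs, avg_neg, avg_neu, avg_pos = getSentiment()
    print(f"{n_docs} documents: neg={avg_neg:.3f}, neu={avg_neu:.3f}, pos={avg_pos:.3f}")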
    
    time_main = []
    time_sub = []

    for order in range(count, len(test)):
        # add the discussion time so the program does not fail when there are no comments
        time_main.append(covert_to_unixtime(test.iloc[order].creat_time))
        time_sub.append(covert_to_unixtime(test.iloc[order].creat_time))
        # deal with the main comment
        if (test.iloc[order].title == disgussion.title
                and test.iloc[order].creat_time == disgussion.creat_time
                and test.iloc[order]['main_comment'] != 0):
            # this print is for debugging, to show where a problem occurs; the same goes for the prints below
            print('round one', order)
            #get the score of the comment
            sentiment_mm.append(
                sid.polarity_scores(
                    test.iloc[order]['main_comment'])['compound'])
            # add the words into the mimic box
            row_mimic.append(test.iloc[order]['sub_comment'])
            main = main + ' ' + test.iloc[order].main_comment
            if (test.iloc[order]['main_creat'] != 0):
                time_main.append(
                    covert_to_unixtime(test.iloc[order].main_creat))
            else:
                time_main.append(
                    covert_to_unixtime(test.iloc[order].creat_time))
        #deal with the sub comment
        if (test.iloc[order].title == disgussion.title
                and test.iloc[order].creat_time == disgussion.creat_time
                and test.iloc[order]['sub_comment'] != 0):
            print('round two', order)
            # the score of the sub comment
def get_sentiments(text):
    nltk.download('vader_lexicon')
    analyzer = SentimentIntensityAnalyzer()
    return analyzer.polarity_scores(text)
        print(clean)
        clean_review.append(clean)

with open('Review.txt', "w+", encoding=('utf-8')) as filehandle:
    filehandle.writelines("%s\n" % review for review in clean_review)

import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

df = pd.DataFrame(clean_review, columns=['Reviews'])
reviews = df

reviews['neg'] = reviews['Reviews'].apply(
    lambda x: sia.polarity_scores(x)['neg'])
reviews['neu'] = reviews['Reviews'].apply(
    lambda x: sia.polarity_scores(x)['neu'])
reviews['pos'] = reviews['Reviews'].apply(
    lambda x: sia.polarity_scores(x)['pos'])

reviews['compound'] = reviews['Reviews'].apply(
    lambda x: sia.polarity_scores(x)['compound'])

star5 = [
    j for i, j in enumerate(reviews['Reviews'])
    if 1 >= reviews['compound'][i] > 0.6
]
star4 = [
    j for i, j in enumerate(reviews['Reviews'])
    if 0.6 >= reviews['compound'][i] > 0.2
Exemple #56
0
def store(tags):
    try:
        tso = TwitterSearchOrder()
        tso.set_keywords(tags, or_operator=True)

        sid = SentimentIntensityAnalyzer()

        # Provides the wrapper with the necessary data for making the calls and retrieving the data
        ts = TwitterSearch(consumer_key=key,
                           consumer_secret=secret,
                           access_token=token_key,
                           access_token_secret=token_secret)

        # store tweet ids so the Twitter search does not have to be repeated later
        tweet_id_array = []
        count = 0
        for tweet in ts.search_tweets_iterable(tso):
            count += 1

            if (tweet['user']['location'][0:8].lower() == 'edmonton'):
                tweet_id_array.append(tweet['id'])
                ss = sid.polarity_scores(tweet['text'])
                u = mod.User(tweet['user']['id'], tweet['user']['screen_name'],
                             tweet['user']['name'],
                             tweet['user']['followers_count'],
                             tweet['user']['favourites_count'],
                             tweet['user']['friends_count'],
                             tweet['user']['created_at'], timezone.now(),
                             tweet['user']['statuses_count'])

                mod.User.insert_user(tweet['user']['id'],
                                     tweet['user']['screen_name'],
                                     tweet['user']['name'],
                                     tweet['user']['followers_count'],
                                     tweet['user']['favourites_count'],
                                     tweet['user']['friends_count'],
                                     tweet['user']['created_at'],
                                     tweet['user']['statuses_count'])

                mod.Tweet.insert_tweet(
                    tweet['id'], tweet['text'], tweet['created_at'],
                    tweet['favorite_count'], tweet['retweet_count'],
                    tweet['in_reply_to_status_id'], tweet['lang'], u,
                    ss['compound'], ss['pos'], ss['neg'], ss['neu'],
                    get_sentiment_string(ss['compound']), 'retweeted_status'
                    in tweet)
                hashtags_list = tweet['entities']['hashtags']

                # Add the hashtags and duplicates are not added
                for hashtag in hashtags_list:
                    mod.Hashtag.insert_hashtag(tweet['id'],
                                               hashtag['text'].lower())

        # count and save the rep_count after all the tweet data is saved and updated in database
        for tweetid in tweet_id_array:
            rp_count = Tweet.objects.filter(tid_parent=tweetid).count()
            mod.Tweet.insert_replycount(tweetid, rp_count)

    except (TwitterSearchException, ConnectionError) as e:
        # handle errors raised by the search or the connection
        print('Exception:', e)

    print(count, 'tweets received')
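get_sentiment_string() is called above but not defined in this snippet. Assuming it maps the compound score to a coarse label with the usual VADER cutoffs, a minimal sketch could be:

def get_sentiment_string(compound):
    """Map a VADER compound score to a coarse sentiment label."""
    if compound >= 0.05:
        return 'positive'
    if compound <= -0.05:
        return 'negative'
    return 'neutral'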
        title = row.a.text
        date_data = row.td.text.split(' ')

        if len(date_data) == 1:
            time = date_data[0]
        else:
            date = date_data[0]
            time = date_data[1]

        parsed_data.append([ticker, date, time, title])

# print(parsed_data)

df = pd.DataFrame(parsed_data, columns=['ticker', 'date', 'time', 'title'])

vader = SentimentIntensityAnalyzer()

f = lambda title: vader.polarity_scores(title)['compound']
df['compound'] = df['title'].apply(f)
df['date'] = pd.to_datetime(df.date).dt.date

plt.figure(figsize=(10, 8))

mean_df = df.groupby(['ticker', 'date']).mean(numeric_only=True)
mean_df = mean_df.unstack()
mean_df = mean_df.xs('compound', axis="columns").transpose()
mean_df.plot(kind='bar')
plt.show()
#print(mean_df)
#print(vader.polarity_scores("I don't think Apple is a good company. I think they will do poorly this quarter."))
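The groupby/unstack/xs chain above reshapes the headline table into daily mean compound scores per ticker. The same reshape on a tiny hand-made frame (hypothetical values) makes the intent easier to see:

import pandas as pd

toy = pd.DataFrame({
    'ticker': ['AAPL', 'AAPL', 'MSFT'],
    'date': pd.to_datetime(['2021-01-04', '2021-01-04', '2021-01-04']).date,
    'compound': [0.4, 0.0, -0.2],
})
daily = toy.groupby(['ticker', 'date']).mean(numeric_only=True)
daily = daily.unstack().xs('compound', axis='columns').transpose()
print(daily)  # rows: dates, columns: tickers, values: mean compound score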
Exemple #58
0
def main():
    demo = 'd286f23fd3d3c4fbd6cc5768c2a6388d'

    #data = read_csv('/Users/alenshaju/Downloads/SP500_tickers_100.csv')
    #companies = data['Ticker'].to_list()[:10]

    consumer_companies = [
        'TJX', 'NKE', 'TGT', 'HD', 'LOW', 'PG', 'WMT', 'COST', 'MDLZ', 'EL',
        'KO', 'PEP', 'PM', 'MO', 'BKNG', 'MCD', 'SBUX'
    ]
    energy_companies = ['NEE', 'XOM', 'CVX']
    fig_companies = [
        'BLK', 'AXP', 'V', 'MA', 'PYPL', 'FIS', 'JPM', 'BAC', 'WFC', 'USB',
        'SPGI', 'MS', 'SCHW', 'GS', 'BRK.B', 'AMT'
    ]  #C
    healthcare_companies = [
        'ABBV', 'AMGN', 'GILD', 'ABT', 'DHR', 'MDT', 'SYK', 'ISRG', 'CVS',
        'CI', 'TMO', 'UNH', 'ANTM', 'JNJ', 'PFE', 'LLY', 'BMY'
    ]
    industrials_companies = [
        'BA', 'RTX', 'LMT', 'DE', 'UPS', 'TSLA', 'GM', 'CAT', 'HON', 'GE',
        'MMM', 'LIN', 'UNP'
    ]
    tech_companies = [
        'ADBE', 'CRM', 'INTU', 'GOOG', 'GOOG.L', 'FB', 'AMZN', 'ACN', 'IBM',
        'AMAT', 'LRCX', 'NVDA', 'INTC', 'AVGO', 'TXN', 'QCOM', 'MU', 'AMD',
        'MSFT', 'ORCL', 'NOW', 'AAPL'
    ]
    mt_companies = ['CMCS.A', 'CHTR', 'CSCO', 'VZ', 'T', 'DIS', 'NFLX']

    companies = ['UAL']

    past_call_dict = {}
    yec = YahooEarningsCalendar()

    for company in companies:
        print("Ticker:", company)
        past_calls_df = get_past_earnings_call(yec, company)
        past_call_dict[company] = past_calls_df
    df_returns_scores = pd.DataFrame(columns=['Return', 'Score'])
    sia = SentimentIntensityAnalyzer()

    d = {}
    with open(
            "/Users/alenshaju/Downloads/LoughranMcDonald_MasterDictionary_2018.txt"
    ) as f:
        for line in f:
            (key, val) = line.split()
            d[key] = float(val)
    sia.lexicon.update(d)
    excel_df = pd.DataFrame(
        columns=['Ticker', 'Quarter', 'Sentiment Score', 'Returns'])

    for company in companies:
        print("For company: ", company)
        for i, row in past_call_dict[company].iterrows():
            date = datetime.datetime.strptime(row['startdatetime'],
                                              '%Y-%m-%dT%H:%M:%S.%fZ')
            quarter = pd.Timestamp(date).quarter
            year = date.year
            if year <= datetime.datetime.now().year:
                if year == datetime.datetime.now().year:
                    if quarter >= pd.Timestamp(
                            datetime.datetime.now()).quarter:
                        continue
                transcript = requests.get(
                    f'https://financialmodelingprep.com/api/v3/earning_call_transcript/{company}?quarter={quarter}&year={year}&apikey={demo}'
                ).json()

                if len(transcript) == 0:
                    continue

                transcript = transcript[0]['content'].split('\n')
                if not bool(len(pd.bdate_range(date, date))):
                    date = date - BDay(1)
                if (date + BDay(1)) in get_trading_close_holidays(year):
                    end_date = date + BDay(1)
                else:
                    end_date = date

                stock = yf.download(company,
                                    start=date,
                                    end=end_date + BDay(1) +
                                    datetime.timedelta(1),
                                    progress=False)
                price_change_rate = (stock['Adj Close'][1] /
                                     stock['Adj Close'][0]) - 1
                price_change_percent = price_change_rate * 100
                sentiment_score = sia.polarity_scores(
                    transcript[0])['pos'] - sia.polarity_scores(
                        transcript[0])['neg']
                print(transcript)
                print('score: ', sia.polarity_scores(transcript[0]))
                print("price change: ", price_change_rate)

                df_returns_scores = df_returns_scores.append(
                    {
                        'Return': price_change_rate,
                        'Score': sentiment_score
                    },
                    ignore_index=True)
                excel_df = excel_df.append(
                    {
                        'Ticker': company,
                        "Date": date,
                        'Quarter': quarter,
                        'Sentiment Score': sentiment_score,
                        'Returns': price_change_rate
                    },
                    ignore_index=True)
            if i > 8:  # stop after roughly the last ten quarterly calls
                break

    excel_df.to_excel("/Users/alenshaju/Downloads/mt_excel_file_v1.xlsx")

    x = df_returns_scores.Score.values.reshape(-1, 1)
    y = df_returns_scores.Return.values.reshape(-1, 1)
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.4,
                                                        random_state=42)

    support_vector_reg_model = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1)
    support_vector_reg_model.fit(x_train, y_train)

    y_pred = support_vector_reg_model.predict(x_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2_data = r2_score(y_test, y_pred)
    print("Root mean square error: ", rmse)
    print("R^2 score: ", r2_data)

    train_test_label = ['Training Data', 'Testing Data']
    model_color = ['m', 'c', 'g']

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 10), sharey=True)

    ###### Training Data ##########
    axes[0].plot(x_test,
                 y_pred,
                 color=model_color[0],
                 lw=2,
                 label='{} model'.format(train_test_label[0]))
    axes[0].scatter(x_train[np.setdiff1d(np.arange(len(x_train)),
                                         support_vector_reg_model.support_)],
                    y_train[np.setdiff1d(np.arange(len(x_train)),
                                         support_vector_reg_model.support_)],
                    facecolor="none",
                    edgecolor=model_color[0],
                    s=50,
                    label='Training data')
    axes[0].legend(loc='upper center',
                   bbox_to_anchor=(0.5, 1.1),
                   ncol=1,
                   fancybox=True,
                   shadow=True)

    ####### Testing Data #########
    axes[1].plot(x_test,
                 y_pred,
                 color=model_color[1],
                 lw=2,
                 label='{} model'.format(train_test_label[1]))
    axes[1].scatter(x_test[np.setdiff1d(np.arange(len(x_test)),
                                        support_vector_reg_model.support_)],
                    y_pred[np.setdiff1d(np.arange(len(x_test)),
                                        support_vector_reg_model.support_)],
                    facecolor="none",
                    edgecolor=model_color[1],
                    s=50,
                    label='Testing data')
    axes[1].legend(loc='upper center',
                   bbox_to_anchor=(0.5, 1.1),
                   ncol=1,
                   fancybox=True,
                   shadow=True)
    fig.text(0.5, 0.04, 'data', ha='center', va='center')
    fig.text(0.06,
             0.5,
             'target',
             ha='center',
             va='center',
             rotation='vertical')
    fig.suptitle("Support Vector Regression", fontsize=14)
    plt.show()
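The Loughran-McDonald block in main() assumes a whitespace-separated "word score" file and merges it into VADER's lexicon. Factored into a small helper under that same assumption (a sketch, with a hypothetical lexicon_path argument):

from nltk.sentiment.vader import SentimentIntensityAnalyzer

def domain_analyzer(lexicon_path):
    """Return a SentimentIntensityAnalyzer whose lexicon is extended with domain terms."""
    sia = SentimentIntensityAnalyzer()
    extra = {}
    with open(lexicon_path) as f:
        for line in f:
            word, value = line.split()
            extra[word] = float(value)
    sia.lexicon.update(extra)
    return sia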
def initial():
    spotify = spotipy.Spotify(auth=token)
    spotify.current_user_recently_played = types.MethodType(
        current_user_recently_played, spotify)

    # creating .json file

    recentsongs = spotify.current_user_recently_played(limit=10)
    track_details = []
    # creating arrays for storing id and name

    for i in recentsongs['items']:
        temp = {'name': '', 'artist': ''}
        temp['name'] = i['track']['name']
        temp['artist'] = i['track']['artists'][0]['name']
        track_details.append(temp)
    lyrics = {}
    text = []
    compoundscore = []
    sid = SentimentIntensityAnalyzer()
    track_details = {frozenset(item.items()): item
                     for item in track_details}.values()
    print(track_details)
    for i in track_details:
        song = genius.search_song(i['name'], i['artist'])
        if song is None:  # skip tracks whose lyrics cannot be found on Genius
            continue
        songlyrics = song.lyrics.replace("\n", " ").replace("\\'", "\'")
        lyrics[i['name']] = songlyrics
        songlyrics = songlyrics.replace('(', '').replace(')', '')
        songlyrics = re.sub("[\\[].*?[\\]]", "", songlyrics)
        text.append(songlyrics)
        scores = sid.polarity_scores(songlyrics)
        compoundscore.append(scores['compound'])
    text = ' '.join(map(str, text))
    print(text.encode("utf-8"))
    stpwords = set(stopwords.words('english'))
    stpwords.update(["br", "href", "la", "yeah", "yuh", "wan", "i'm"])

    sentences = sent_tokenize(text)
    words = word_tokenize(text)
    words_no_punc = []
    for w in words:
        if w.isalpha():
            words_no_punc.append(w.lower())

    ps = PorterStemmer()
    clean_words = []
    for w in words_no_punc:
        if w not in stpwords:
            clean_words.append(ps.stem(w))
    fdist = FreqDist(clean_words)
    print(fdist.most_common(10))
    fdist.plot(10)
    words_string = ' '.join(map(str, clean_words))
    wordcloud = WordCloud(stopwords=stpwords).generate(words_string)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    p1 = plt.show()
    print(compoundscore)
    plt.plot(compoundscore)
    p2 = plt.show()
    pos_count = 0
    neg_count = 0
    for num in compoundscore:
        if num >= 0:
            pos_count += 1

        else:
            neg_count += 1
    plt.pie([pos_count, neg_count],
            labels=["Positive Songs", "Negative Songs"])
    p3 = plt.show()
    p1
    p2
    p3
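track_details is de-duplicated above with a frozenset-of-items dictionary trick. The same idea on a toy list of dicts (illustrative values only):

tracks = [{'name': 'A', 'artist': 'X'}, {'name': 'A', 'artist': 'X'}, {'name': 'B', 'artist': 'Y'}]
unique = {frozenset(t.items()): t for t in tracks}.values()
print(list(unique))  # the duplicate {'name': 'A', 'artist': 'X'} survives only once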
class FeatureExtractor:
    def __init__(self):
        nltk.download('vader_lexicon')
        nltk.download('punkt')
        self.tool = spacy.load("en_core_web_lg")
        self.sent_analyzer = SentimentIntensityAnalyzer()
        self.stemmer = PorterStemmer()
        self.badwords = []
        self.negative_smileys = []
        self.positive_smileys = []
        with open(settings.get_badwords(), 'r') as f:
            for line in f:
                self.badwords.append(line.strip().lower())

        with open(settings.get_negative_smileys(), 'r') as f:
            for line in f:
                self.negative_smileys.append(line.strip())

        with open(settings.get_positive_smileys(), 'r') as f:
            for line in f:
                self.positive_smileys.append(line.strip())

        self.negationwords = [
            'not', 'no', 'nobody', 'nothing', 'none', 'never', 'neither',
            'nor', 'nowhere', 'hardly', 'scarcely', 'barely', 'don', 'isn',
            'wasn', 'shouldn', 'wouldn', 'couldn', 'doesn'
        ]

        self.whwords = [
            'what', 'when', 'where', 'which', 'who', 'whom', 'whose', 'why',
            'how'
        ]
        self.entity_annotations = [
            '__PERSON__', '__NORP__', '__FAC__', '__ORG__', '__GPE__',
            '__LOC__', '__PRODUCT__', '__EVENT__', '__WORK_OF_ART__',
            '__LAW__', '__LANGUAGE__', '__DATE__', '__TIME__', '__PERCENT__',
            '__MONEY__', '__QUANTITY__', '__ORDINAL__', '__CARDINAL__'
        ]

    def extract_lemma(self):
        pass

    def extract_ne(self, text: str) -> str:
        '''
        Extracts the named entities and replaces them with their NE labels.
        :param text:
        :type text:
        :return:
        :rtype:
        '''
        doc = self.tool(text)
        for ent in doc.ents:
            text = text.replace(ent.text, ' __' + ent.label_ + '__ ')
        return text

    # TODO: this method is shadowed by the identical definition at the bottom of the class. Is this intended?
    def sentence_embeddings(self, text):
        text = self.preprocess(text)
        vector = self.tool(text).vector
        return vector

    def emoji(self, text: str) -> str:
        pass

    # @todo

    def filter_hashtag(self, text: str) -> str:
        '''
        Replaces hashtags with __hashtag__ and returns the text plus the hashtag count.
        :param text:
        :type text:
        :return:
        :rtype:
        '''
        text = re.sub(r"^#.*", '__hashtag__', text)
        return [text, text.count('__hashtag__')]

    # @todo

    def extract_mention(self, text: str) -> str:
        '''
        Replaces mentions with __usermention__.
        :param text:
        :type text:
        :return:
        :rtype:
        '''
        return re.sub(r"^@.*", ' __usermention__ ', text)

    def post_role(self, post):
        '''
        Return 1 (True) if the post is a source post, 0 (False) if it is a reply.
        :return:
        :rtype:
        '''
        return post.has_source_depth == 0

    # @todo make callback functions an input

    def preprocess(self, text: str) -> str:
        space_pattern = r'\s+'
        text = re.sub(space_pattern, ' ', text)
        text = self.filter_hashtag(text)[0]
        text = self.filter_url(text)[0]
        text = self.filter_mention(text)[0]
        text = re.sub('\'', ' ', text)
        text = re.sub('____', '__ __', text)
        text = self.extract_ne(text)
        return str(text)

    def tokenize(self, text):
        """Removes punctuation & excess whitespace, sets to lowercase,
        and stems tweets. Returns a list of stemmed tokens."""
        tweet = " ".join(re.split("[^a-zA-Z]*", text.lower())).strip()
        tokens = [self.stemmer.stem(t) for t in tweet.split()]
        return tokens

    def similarity(self, post_1_id, post_2_id, embeddings):
        dist = distance.cosine(embeddings[post_1_id], embeddings[post_2_id])
        return dist

    def filter_url(self, text):
        giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                           '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        text = re.sub(giant_url_regex, '__url__', text)
        return [text, text.count('__url__')]

    def filter_mention(self, text):
        mention_regex = '@[\w\-]+'
        text = re.sub(mention_regex, '__usermention__', text)
        return [text, text.count('__usermention__')]

    def has_badword(self, tokens):
        bad_words = 0
        for token in tokens:
            if token in self.badwords:
                bad_words += 1
        return bad_words / len(self.badwords)

    def has_negation(self, tokens):
        negation_words = 0
        for negationword in self.negationwords:
            if negationword in tokens:
                negation_words += 1
        return negation_words / len(self.negationwords)

    def has_smileys(self, text):
        if len(text) == 0:
            return [0, 0]
        positive_smileys = 0
        negative_smileys = 0
        for smiley in self.positive_smileys:
            if smiley in text:
                positive_smileys += text.count(smiley)
        for smiley in self.negative_smileys:
            if smiley in text:
                negative_smileys += text.count(smiley)
        return [positive_smileys / len(text), negative_smileys / len(text)]

    def has_whwords(self, tokens):
        wh_words = 0
        for token in tokens:
            if token in self.whwords:
                wh_words += 1
        return wh_words / len(self.whwords)

    def other_tokenizer(self, text):
        return nltk.word_tokenize(re.sub(r'([^\s\w]|_)+', '', text.lower()))

    def check_entities(self, text):
        entity_feats = []
        for annotation in self.entity_annotations:
            entity_feats.append(1 if annotation in text else 0)
        return entity_feats

    def extract_aux_feats(self, item, args, source_id, prev_id, embeddings,
                          post_type):
        aux_feats = []
        text = self.preprocess(item['text'])
        if 'post_role' in args:
            aux_feats.append(post_type)
        if 'sentiment_analyzer' in args:
            # compute the polarity scores once and reuse the dict
            scores = self.sent_analyzer.polarity_scores(text)
            aux_feats.append(scores['pos'])
            aux_feats.append(scores['neu'])
            aux_feats.append(scores['neg'])
            aux_feats.append(scores['compound'])
        if 'similarity' in args:
            if item['id'] == source_id:
                aux_feats.append(1)
            else:
                aux_feats.append(
                    self.similarity(item['id'], source_id, embeddings))
        if 'num_url' in args:
            aux_feats.append(self.filter_url(item['text'])[1])
        if 'num_mention' in args:
            aux_feats.append(self.filter_mention(item['text'])[1])
        if 'num_hashtag' in args:
            aux_feats.append(self.filter_hashtag(item['text'])[1])

        tokens = self.other_tokenizer(item['text'])
        if 'badwords' in args:
            aux_feats.append(self.has_badword(tokens))
        if 'hasnegation' in args:
            aux_feats.append(self.has_negation(tokens))
        if 'whwords' in args:
            aux_feats.append(self.has_whwords(tokens))
        if 'qmark' in args:
            aux_feats.append(1 if len(text) and '?' in text else 0)
        if 'excmark' in args:
            aux_feats.append(1 if len(text) and '!' in text else 0)
        if 'tripdot' in args:
            aux_feats.append(1 if len(text) and '...' in text else 0)
        if 'capital' in args:
            aux_feats.append(
                sum(1 for c in text if c.isupper()) / len(text) if len(text) > 0 else 0)
        if 'smileys' in args:
            # has_smileys returns [positive_ratio, negative_ratio]; extend keeps the feature vector flat
            aux_feats.extend(self.has_smileys(text))
        if 'named_entities' in args:
            for score in self.check_entities(text):
                aux_feats.append(score)
        return np.asarray(aux_feats)

    def sentence_embeddings(self, text):
        text = self.preprocess(text)
        return self.tool(text).vector
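A hypothetical call to extract_aux_feats on a toy post, assuming the settings files and the en_core_web_lg spaCy model are available (a sketch only; the item values are made up):

fe = FeatureExtractor()
item = {'id': 1, 'text': "Why would they say that?! http://example.com @user"}
feats = fe.extract_aux_feats(item,
                             ['sentiment_analyzer', 'qmark', 'num_url', 'num_mention'],
                             source_id=1, prev_id=None, embeddings=None, post_type=1)
print(feats.shape)  # seven auxiliary features for this configuration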