# fit_transform() does two things: first, it fits the model and learns the vocabulary;
# second, it transforms our training data into feature vectors. The input to fit_transform should be a list of strings.
train_data_features = vectorizer.fit_transform(clean_train_reviews)
# Numpy arrays are easy to work with, so convert the result to an array
train_data_features = train_data_features.toarray()

import numpy as np

vocab = vectorizer.get_feature_names_out()  # use get_feature_names() on scikit-learn < 1.0
print(vocab)

# Sum up the counts of each vocabulary word
dist = np.sum(train_data_features, axis=0)

# For each, print the vocabulary word and the number of times it 
# appears in the training set
for tag, count in zip(vocab, dist):
    print(count, tag)


from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
sid=SIA()

sentiment_score=[]
for i in range(0, num_reviews):
    # If the index is evenly divisible by 1000, print a progress message
    if (i + 1) % 1000 == 0:
        print("Review %d of %d\n" % (i + 1, num_reviews))
    sentiment_score.append(sid.polarity_scores(train["text"][i]))
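
# A minimal follow-up sketch (not part of the original snippet): collect the VADER
# dicts gathered above into a DataFrame and derive a coarse label from the compound
# score, using the same 0.2 / -0.2 cut-offs that the later examples use.
import pandas as pd

sentiment_df = pd.DataFrame(sentiment_score)  # columns: neg, neu, pos, compound
sentiment_df["label"] = 0
sentiment_df.loc[sentiment_df["compound"] > 0.2, "label"] = 1    # positive
sentiment_df.loc[sentiment_df["compound"] < -0.2, "label"] = -1  # negative
print(sentiment_df["label"].value_counts())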
	
Example #2
def twitter():
    if request.method == "POST":

        auth = tw.OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_token_secret)
        api = tw.API(auth, wait_on_rate_limit=True)

        query = request.form.get("search")
        date_since = "2019-11-20"
        new_search = query + " -filter:retweets"

        tweets = tw.Cursor(api.search,
                           lang="en",
                           q=new_search,
                           tweet_mode='extended',
                           since=date_since).items(100)
        users_locs = [[
            tweet.user.location, tweet.created_at,
            tweet.full_text.replace('\n', ' '), tweet.user.screen_name,
            [e['text'] for e in tweet._json['entities']['hashtags']],
            tweet.user.followers_count
        ] for tweet in tweets]
        tweet_text = pd.DataFrame(data=users_locs,
                                  columns=[
                                      "location", "Tweetcreated", 'tweet',
                                      "username", "Hastage", "follower_count"
                                  ])
        #tweet_text = pd.read_csv('tweet_test.csv')

        if "Unnamed: 0" in tweet_text.columns:
            tweet_text.drop("Unnamed: 0", axis=1, inplace=True)
        tweet_text.dropna(subset=['tweet'], axis=0, inplace=True)
        tweet_text.reset_index(drop=True, inplace=True)

        sia = SIA()
        tweet_text['compound'] = None
        tweet_text['label'] = 0

        for i in range(len(tweet_text.index)):
            pol_score = sia.polarity_scores(tweet_text["tweet"][i])
            tweet_text.iloc[i:i + 1, 6:7] = pol_score['compound']
            tweet_text.loc[tweet_text['compound'] > 0.2, 'label'] = 1
            tweet_text.loc[tweet_text['compound'] < -0.2, 'label'] = -1

        to_drop = tweet_text[tweet_text["label"] == 0].index
        tweet_text.drop(to_drop, inplace=True)
        tweet_text.reset_index(drop=True, inplace=True)

        tweet_text['norm_follower'] = None
        tweet_text['norm_follower'] = (tweet_text['follower_count'] -
                                       tweet_text['follower_count'].min()) / (
                                           tweet_text['follower_count'].max() -
                                           tweet_text['follower_count'].min())

        to_drop = tweet_text[tweet_text["norm_follower"] == 0].index
        tweet_text.drop(to_drop, inplace=True)
        tweet_text.reset_index(drop=True, inplace=True)

        tweet_text[
            'score'] = tweet_text['norm_follower'] * tweet_text['compound']
        tweet_text.sort_values(by=['score'], ascending=False, inplace=True)
        tweet_text["score"] = tweet_text['score'].map(lambda x: round(x, 5))

        #tweet_text["together_found"] = 0
        tweet_text.reset_index(inplace=True)

        tweet_pos = tweet_text[tweet_text["label"] == 1]
        tweet_neg = tweet_text[tweet_text["label"] == -1]

        #tweet_neg.sort_values(by=['score'],ascending=True,inplace=True)

        neg = tweet_neg[['tweet', 'username', 'score',
                         'location']].to_dict(orient='records')
        pos = tweet_pos[['tweet', 'username', 'score',
                         'location']].to_dict(orient='records')

        if (len(pos) > 15):
            tweet_display_pos = 15
        else:
            tweet_display_pos = len(pos)

        if (len(neg) > 15):
            tweet_display_neg = 15
        else:
            tweet_display_neg = len(neg)

        #raise Exception("hi",tweet_pos_score)
        return render_template("results_twitter.html",
                               pos=pos,
                               neg=neg,
                               tweet_display_pos=tweet_display_pos,
                               tweet_display_neg=tweet_display_neg)
    else:
        return render_template("twitter.html")
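
# A small refactoring sketch (not in the original): the labelling and follower
# normalisation done inline inside twitter(), pulled out as standalone helpers so
# they can be reused and unit-tested. The 0.2 / -0.2 thresholds mirror the code above.
def label_from_compound(compound, threshold=0.2):
    """Map a VADER compound score to 1 (positive), -1 (negative) or 0 (neutral)."""
    if compound > threshold:
        return 1
    if compound < -threshold:
        return -1
    return 0


def min_max_normalize(series):
    """Scale a pandas Series to [0, 1], as done for follower_count above."""
    return (series - series.min()) / (series.max() - series.min())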
Example #3
'''
Analyzing Beauty dataset for sentiment
'''

import pandas as pd
import gzip
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

analyser = SIA()


def parse(path):
    '''
    Extract records from the gzipped file, one JSON-like object per line
    :param path: path to the file
    :return: yields one parsed record (dict) per line
    '''
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)


def getDF(path):
    '''
    Creates a dataframe from original dataset and integrates all the reviews in one DF
    :param path:
    :return: dataframe df
    '''
    i = 0
    df = {}
    count = 0
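    # Assumed completion (the original excerpt is truncated here): gather every record
    # yielded by parse() into a dict keyed by row number, then build the DataFrame.
    for d in parse(path):
        df[i] = d
        i += 1
        count += 1
    return pd.DataFrame.from_dict(df, orient='index')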
Example #4
def CreateFeatures1(raw_data, use_timer=False):
    """ This processes the post data, and can be used as 
    a template for the upload data.
    """
    import pandas as pd
    from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

    if use_timer:
        my_timer = SimpleTimer()

    # Is this training data or web-app data? Training data has the wider column set and needs its UTC timestamp processed.
    if len(raw_data.columns) > 10:
        # some of these columns aren't being used to create features yet
        extract_keys = ['num_comments', 'created_utc', 'title', 'selftext']
        data = raw_data[sorted(extract_keys)].copy()

        # remove all rows without any valid text
        data = data[data.selftext.map(len) > 0]

        dates = pd.to_datetime(data.created_utc, unit="s")
        data['created_dayofweek'] = dates.dt.dayofweek
        data['created_hour'] = dates.dt.hour
        data['created_month'] = dates.dt.month
        data['created_year'] = dates.dt.year
        cut_off_year = 2011
        data = data[data.created_year > cut_off_year]
        print('Removing all posts from {} and earlier. '.format(cut_off_year), end='')
        print('Earliest post = {}'.format(
            pd.to_datetime(data.created_utc.min(), unit="s")))
        data = data.drop('created_utc', axis=1)
    else:
        data = raw_data

    data['post_char_len'] = data.selftext.apply(lambda x: len(x))
    data['post_num_qs'] = data.selftext.apply(lambda x: x.count('?'))
    data['title_char_len'] = data.title.apply(lambda x: len(x))
    data['title_num_qs'] = data.title.apply(lambda x: x.count('?'))

    data['post_word_len1'] = data.selftext.apply(lambda x: len(x.split()))
    data['title_word_len1'] = data.title.apply(lambda x: len(x.split()))

    if use_timer:
        my_timer.elapsed('Done with simple counts')

    def CountPostPunctuation(row):
        # count the number of punctuation in the selftext
        import string
        punc_set = set(string.punctuation)
        num_punc = 0
        for char in row['selftext']:
            if char in punc_set:
                num_punc += 1
        return num_punc

    def CountTitlePunctuation(row):
        # count the number of punctuation characters in the title
        import string
        punc_set = set(string.punctuation)
        num_punc = 0
        for char in row['title']:
            if char in punc_set:
                num_punc += 1
        return num_punc

    data['post_num_punc'] = data.apply(CountPostPunctuation, axis=1)
    data['title_num_punc'] = data.apply(CountTitlePunctuation, axis=1)
    data['post_perc_punc'] = data.post_num_punc / data.post_char_len
    data['title_perc_punc'] = data.title_num_punc / data.title_char_len
    data.post_perc_punc = data.post_perc_punc.fillna(0)

    if use_timer:
        my_timer.elapsed('Done with percentage counts')

    # Sentiment features
    sia_ps = SIA().polarity_scores

    posts_sent = []
    for post in data.selftext:
        temp = sia_ps(post)
        posts_sent.append([temp[k] for k in sorted(temp.keys())])

    titles_sent = []
    for title in data.title:
        temp = sia_ps(title)
        titles_sent.append([temp[k] for k in sorted(temp.keys())])

    data['title_compound'] = [t[0] for t in titles_sent]
    data['title_neg'] = [t[1] for t in titles_sent]
    data['title_neu'] = [t[2] for t in titles_sent]
    data['title_pos'] = [t[3] for t in titles_sent]

    data['post_compound'] = [t[0] for t in posts_sent]
    data['post_neg'] = [t[1] for t in posts_sent]
    data['post_neu'] = [t[2] for t in posts_sent]
    data['post_pos'] = [t[3] for t in posts_sent]

    if use_timer:
        my_timer.elapsed('Done')

    return data
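
# Hypothetical usage sketch (not in the original): a tiny synthetic frame with only the
# 'title' and 'selftext' columns, which takes the narrow (web-app) branch of the function.
import pandas as pd

example_posts = pd.DataFrame({
    "title": ["Is this normal?", "Need advice"],
    "selftext": ["I have a question about my results?", "Not sure what to do next."],
})
example_features = CreateFeatures1(example_posts)
print(example_features[["post_num_qs", "post_compound", "title_compound"]])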
def get_sentiments(text):
    sia = SIA()
    pol_score = sia.polarity_scores(text)
    return pol_score['compound'], pol_score['neg'], pol_score[
        'neu'], pol_score['pos']
Example #6
def polarityScores(headline, body):
    sid = SIA()
    h = sid.polarity_scores(headline)
    b = sid.polarity_scores(body)
    return abs(h['neg'] - b['neg']), abs(h['neu'] - b['neu']), abs(h['pos'] -
                                                                   b['pos'])
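
# Hypothetical usage sketch (not in the original): compare the sentiment of a headline
# against the body it introduces; the strings here are made-up examples.
neg_diff, neu_diff, pos_diff = polarityScores(
    "Markets rally to record highs",
    "Stocks fell sharply today amid fears of a global slowdown.")
print(neg_diff, neu_diff, pos_diff)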
def prepare_comments_df(all_comments_df):
    # apply cleaning on the comments
    all_comments_df[
        'comment_cleaned_with_emoji'] = all_comments_df.comment.apply(
            lambda row: process_comments(row, True))
    # apply cleaning on the comments
    all_comments_df[
        'comment_cleaned_without_emoji'] = all_comments_df.comment.apply(
            lambda row: process_comments(row, False))
    # get the length of the cleaned comment
    all_comments_df[
        'comment_cleaned_length_with_emoji'] = all_comments_df.comment_cleaned_with_emoji.apply(
            lambda row: len(row.split()))
    all_comments_df[
        'comment_cleaned_length_without_emoji'] = all_comments_df.comment_cleaned_without_emoji.apply(
            lambda row: len(row.split()))
    sia = SIA()
    all_comments_df['pos_sia_emoji'] = all_comments_df[
        'comment_cleaned_with_emoji'].apply(
            lambda comment: sia.polarity_scores(comment)['pos'])
    all_comments_df['neg_sia_emoji'] = all_comments_df[
        'comment_cleaned_with_emoji'].apply(
            lambda comment: sia.polarity_scores(comment)['neg'])
    all_comments_df['neu_sia_emoji'] = all_comments_df[
        'comment_cleaned_with_emoji'].apply(
            lambda comment: sia.polarity_scores(comment)['neu'])
    all_comments_df['agg_sia_emoji'] = all_comments_df[
        'comment_cleaned_with_emoji'].apply(
            lambda comment: sia.polarity_scores(comment)['compound'])
    all_comments_df['pos_sia_without_emoji'] = all_comments_df[
        'comment_cleaned_without_emoji'].apply(
            lambda comment: sia.polarity_scores(comment)['pos'])
    all_comments_df['neg_sia_without_emoji'] = all_comments_df[
        'comment_cleaned_without_emoji'].apply(
            lambda comment: sia.polarity_scores(comment)['neg'])
    all_comments_df['neu_sia_without_emoji'] = all_comments_df[
        'comment_cleaned_without_emoji'].apply(
            lambda comment: sia.polarity_scores(comment)['neu'])
    all_comments_df['agg_sia_without_emoji'] = all_comments_df[
        'comment_cleaned_without_emoji'].apply(
            lambda comment: sia.polarity_scores(comment)['compound'])
    all_comments_df['classification_with_emoji'] = all_comments_df.apply(
        lambda row: 'pos' if row['agg_sia_emoji'] >= 0 else 'neg', axis=1)
    all_comments_df['classification_without_emoji'] = all_comments_df.apply(
        lambda row: 'pos' if row['agg_sia_without_emoji'] >= 0 else 'neg',
        axis=1)
    all_comments_df['classification_with_emoji'].iplot(
        kind='hist',
        asFigure=True,
        theme='white',
        gridcolor='white',
        bargap=0.5,
        xTitle='classification',
        yTitle='count',
        title='Omega comments classification with emoji')
    all_comments_df['classification_without_emoji'].iplot(
        kind='hist',
        asFigure=True,
        theme='white',
        gridcolor='white',
        bargap=0.5,
        xTitle='classification',
        yTitle='count',
        title='Omega comments classification without emoji')
    # remove empty comments
    all_comments_df = all_comments_df[
        all_comments_df.comment_cleaned_length_without_emoji > 0]
    all_comments_df['comment date'] = pd.to_datetime(
        all_comments_df['comment date'])
    # remove hours, minutes, seconds
    all_comments_df["comment date day"] = all_comments_df[
        "comment date"].dt.strftime("%m-%d-%y")
    all_comments_df["comment date hour"] = all_comments_df[
        "comment date"].dt.strftime("%H:%M:%S")
    return all_comments_df
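
# Optional refactoring sketch (not in the original): prepare_comments_df() calls
# sia.polarity_scores() four times per comment, once per key. A helper like this
# computes the scores once per comment and expands the dict into the four columns.
import pandas as pd

def add_sia_columns(df, text_column, suffix, sia):
    scores = df[text_column].apply(sia.polarity_scores).apply(pd.Series)
    df['pos_sia_' + suffix] = scores['pos']
    df['neg_sia_' + suffix] = scores['neg']
    df['neu_sia_' + suffix] = scores['neu']
    df['agg_sia_' + suffix] = scores['compound']
    return df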
Example #8
def main(arg):
    """
    Initiation docstring
    """

    credentials = {}
    credentials['CLIENT_ID'] = CLIENT_ID
    credentials['CLIENT_SECRET'] = CLIENT_SECRET
    credentials['USER_AGENT'] = USER_AGENT
    credentials['USERNAME'] = USERNAME
    credentials['PASSWORD'] = PASSWORD

    # (1) If you haven't saved your json file, use this template first!
    # with open("reddit_credentials.json", "w") as file:
    #    json.dump(credentials, file)

    # (2) Use this after initializing your credentials
    with open("reddit_credentials.json", "r") as file:
        creds = json.load(file)

    # Initiate with credentials (you may have to comment this out when first initializing your json file)
    reddit = praw.Reddit(client_id=creds["CLIENT_ID"],
                         client_secret=creds["CLIENT_SECRET"],
                         user_agent=creds["USER_AGENT"],
                         username=creds["USERNAME"],
                         password=creds["PASSWORD"])

    # read suicide-related keywords in csv
    df = pd.read_csv("suicide_keywords.csv",
                     header=None,
                     dtype=str,
                     usecols=[i for i in range(1)],
                     sep='+',
                     encoding='latin-1')

    # dropping null value columns to avoid errors
    df.dropna(inplace=True)

    # process dataframe to be a list of keywords
    df[0] = df[0].astype(str)
    df[0] = df[0].str.strip()
    df[0] = df.apply(get_keywords, axis=1)
    df[0].apply(word_tokenize)
    df2 = pd.DataFrame(df[0].str.split(',').tolist()).stack()
    df2 = df2.reset_index()

    # SUBREDDIT(S)
    # Subreddit used: 'depression', 'suicidewatch', 'offmychest' TODO: singapore
    chosen_sub = arg
    subreddit = reddit.subreddit(chosen_sub)

    # Keywords with only unique values
    # remove empty values
    keywords = list(
        filter(None, set(df2[0].astype(str).values.flatten().tolist())))

    # Stemmatize keywords to be applied
    porter = PorterStemmer()
    stem_keys = [porter.stem(w) for w in keywords if w.isalpha()]

    # Getting top up-voted topics of all time (can be any amount from .hot, .top, etc)
    top_submissions = subreddit.top(limit=80)

    #############################################
    # TABLE DEFINITIONS (Users, User Submissions) 3NF
    #############################################

    users_dict = {"author_id": [],                  # UserID
                  "author": [],                       # User
                  "submissions": [],                  # Title of submission
                  "comment_karma": [],                # Number of comment karma
                  "link_karma": [],                   # Number of link karma
                  "created": []}                      # Creation date and time

    user_submissions_dict = {"keyword": [],  # keyword
                             "author_id": [],       # User ID
                             "sub_id": [],           # Sub ID
                             "submission": [],       # Title of submission
                             "comments": [],         # Comments of post
                             "subreddit": [],        # Subreddit of submission
                             "created": []}          # Daytime/Nighttime

    keyword_subs = defaultdict(list)                # Sub ID and Keywords
    keyword_freq = defaultdict(list)                # Keywords and Counter

    # Scraping variables
    # Each user (author) from the top posts saved to check their other submissions/replies
    top_post_users = []
    riskzone_users = []

    # Sentiment analysis variables
    results = []
    sia = SIA()

    #############################################
    ########## EVALUATION SUBMISSIONS ###########
    #############################################

    # Check each top submission to collect its author
    for submission in top_submissions:

        # Stemmatize title
        stem_words = stemmatize(submission.title)
        
        # Iterate through all our keywords
        for word in list(set(stem_keys)):

            # Include posts whose title contains any of the keywords
            if word in stem_words:

                # Adds each user to a list if not already in list
                if submission.author not in top_post_users:
                    top_post_users.append(submission.author)

    #############################################
    ####### EVALUATING USERS & SUBMISSIONS ######
    #############################################

    # Iterate through every account from our users
    for account in top_post_users:
        user = reddit.redditor(str(account))            # Redditor object
        # ListingGenerator object containing submissions
        user_submissions = user.submissions.new()

        # Iterate through every submission that user has made.
        try:
            for submission in user_submissions:
                for keyword in list(set(stem_keys)):

                    # Check if keyword exists in either title or body
                    if keyword in stemmatize(submission.title):
                        print(f'-- Stemmatized title: {stemmatize(submission.title)}')
                        print(f'---- {keyword} existed in title.')

                        # Sentiment analysis for each submission title
                        pol_score = sia.polarity_scores(submission.title)
                        pol_score['title'] = submission.title
                        results.append(pol_score)

                        # Add data to dictionary (in preparation for pandas to do its thing)

                        user_submissions_dict["keyword"].append(keyword)
                        user_submissions_dict["sub_id"].append(submission.id)
                        user_submissions_dict["author_id"].append(user.id)
                        user_submissions_dict["submission"].append(
                            submission.title)
                        user_submissions_dict["comments"].append(
                            submission.num_comments)
                        user_submissions_dict["subreddit"].append(
                            submission.subreddit)
                        user_submissions_dict["created"].append(
                            submission.created_utc)

                        # If user checks all flags: FLAG ACCOUNT AS RISKY
                        if user not in riskzone_users:
                            riskzone_users.append(user)

                        # Add keywords found in specific sub
                        keyword_subs["sub_id"].append(submission.id)
                        keyword_subs["keyword"].append(keyword)
        except Forbidden as e:
            print(f'Oops! Forbidden: {e}')

    # Prepare keyword data frequency
    k_count_freq = get_keyword_frequency(keyword_subs["keyword"])
    for k, val in k_count_freq.items():
        keyword_freq["keyword"].append(k)
        keyword_freq["counter"].append(val)

    # ADDS UNIQUE ACCOUNTS AND THEIR DATA
    for user in riskzone_users:
        users_dict["author_id"].append(user.id)
        users_dict["author"].append(user)
        users_dict["submissions"].append(len(list(user.submissions.new())))
        users_dict["comment_karma"].append(user.comment_karma)
        users_dict["link_karma"].append(user.link_karma)
        users_dict["created"].append(user.created_utc)

    # Convert dictionary to dataframe to make data more easily readable
    users_data = pd.DataFrame(users_dict)
    users_submissions_data = pd.DataFrame(user_submissions_dict)
    subs_keywords_data = pd.DataFrame(dict(keyword_freq))

    # Fixing column date for users (unix time to actual time)
    _timestamp_users = users_data["created"].apply(get_date)
    users_data = users_data.assign(timestamp=_timestamp_users)

    _timestamp_users_submissions_data = users_submissions_data["created"].apply(
        get_date)
    users_submissions_data = users_submissions_data.assign(
        timestamp=_timestamp_users_submissions_data)

    # Add a column flagging whether or not the title is risky
    df = pd.DataFrame.from_records(results)

    users_submissions_data['risk'] = 0
    users_submissions_data.loc[df['compound'] > 0.2, 'risk'] = 1
    users_submissions_data.loc[df['compound'] < -0.2, 'risk'] = -1

    # Check if directory exists
    directory = f'subreddits/{chosen_sub}/'
    if not os.path.exists(directory):
        os.makedirs(directory)

    with open(f'subreddits/{chosen_sub}/reddit_{chosen_sub}_users.csv', 'w+', encoding="utf-8", newline='') as file:
        users_data.to_csv(file, index=False)

    with open(f'subreddits/{chosen_sub}/reddit_{chosen_sub}_submissions.csv', 'w+', encoding="utf-8", newline='') as file:
        users_submissions_data.to_csv(file, index=False)

    with open(f'subreddits/{chosen_sub}/reddit_{chosen_sub}_keywords.csv', 'w+', encoding="utf-8", newline='') as file:
        subs_keywords_data.to_csv(file, index=False)
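
# Hypothetical entry point (not in the original excerpt): run the scraper for one of
# the subreddits mentioned above ('depression', 'suicidewatch', 'offmychest').
if __name__ == "__main__":
    main("depression")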
Example #9
def aspect_segmentaion(file, aspect_file):
    #Sentiment analysis
    sid = SIA()

    #INPUT
    # reviews: this algorithm needs all the reviews; preprocess the dataset first.
    reviews, all_ratings = load_file(file)

    #selection threshold
    p = 5
    #Iterations
    # I = 10
    I = 1

    #Create Vocabulary
    review_sent, review_actual, only_sent = parse_to_sentence(reviews)
    vocab, vocab_dict = create_vocab(only_sent)

    #Aspect Keywords
    aspect_terms = get_aspect_terms(aspect_file, vocab_dict)

    label_text = [
        'Value', 'Rooms', 'Location', 'Cleanliness', 'Check in/Front Desk',
        'Service', 'Business Service'
    ]
    # print aspect_terms

    #ALGORITHM
    review_labels = []
    k = len(aspect_terms)
    v = len(vocab)
    aspect_words = np.zeros((k, v))
    aspect_sent = np.zeros(k)
    num_words = np.zeros(v)

    for i in range(I):
        for r in review_sent:
            labels = []
            for s in r:
                count = np.zeros(len(aspect_terms))
                i = 0
                for a in aspect_terms:
                    for w in s:
                        if w in vocab_dict:
                            num_words[vocab_dict[w]] += 1
                            if w in a:
                                count[i] += 1
                    i = i + 1
                if max(count) > 0:
                    la = np.where(np.max(count) == count)[0].tolist()
                    labels.append(la)
                    for i in la:
                        aspect_sent[i] += 1
                        for w in s:
                            if w in vocab_dict:
                                aspect_words[i][vocab_dict[w]] += 1
                else:
                    labels.append([])
            review_labels.append(labels)
            # aspect_w_rank = chi_sq_mat()
            # new_labels = []
            # for na in aspect_w_rank:
            # 	x = np.argsort(na)[::-1][:p]
            # 	new_labels.append(x)
            # for k,v in vocab_dict.items():
            # 	if vocab_dict[k] in x:
            # 		print k
            # print
            # sys.exit()

    ratings_sentiment = []
    for r in review_actual:
        sentiment = []
        #aspect ratings based on sentiment
        for s in r:
            ss = sid.polarity_scores(s)
            sentiment.append(ss['compound'])
        ratings_sentiment.append(sentiment)

    #Aspect Ratings Per Review
    aspect_ratings = []
    for i, r in enumerate(review_labels):
        rating = np.zeros(7)
        count = np.zeros(7)
        rs = ratings_sentiment[i]
        for j, l in enumerate(r):
            for k in range(7):
                if k in l:
                    rating[k] += rs[j]
                    count[k] += 1
        # average the accumulated sentiment for each labelled aspect
        for k in range(7):
            if count[k] != 0:
                rating[k] /= count[k]
        # Map from [-1, 1] to a 1-5 scale
        for k in range(7):
            if rating[k] != 0:
                rating[k] = int(round((rating[k] + 1) * 5 / 2))
        aspect_ratings.append(rating)
    return aspect_ratings, all_ratings

    # n = 0
    #print (review_actual[n], '\n', review_labels[n])
    #print (ratings_sentiment[n], '\n', aspect_ratings[n])
    #print (len(all_ratings), len(reviews), all_ratings[0])
    # sys.exit()

    #print (sent[5:9], labels[5:9])
    #print (zip(actual_sent, labels)[:10])
    #print (zip(actual_sent, sentiment)[:10])
    return aspect_ratings
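
# Standalone sketch (not in the original) of the rescaling used above: shift a VADER
# compound score from [-1, 1] and stretch it onto a five-point scale, then round.
def rescale_compound(compound):
    """The same (compound + 1) * 5 / 2 rescaling used in aspect_segmentaion()."""
    return int(round((compound + 1) * 5 / 2))

# e.g. rescale_compound(-1.0) == 0, rescale_compound(0.0) == 2, rescale_compound(1.0) == 5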
Example #10
from prices import crypto_history

REMOVED = '[removed]'

with open('./secrets.yaml', 'r') as secrets_file:
    data = yaml.safe_load(secrets_file)

client_id = data['reddit']['api']['client_id']
client_secret = data['reddit']['api']['client_secret']

reddit = praw.Reddit(client_id=client_id,
                     client_secret=client_secret,
                     user_agent='testscript by /u/thegreatwarlo')

bitcoin_markets = reddit.subreddit('BitcoinMarkets')
vader = SIA()


def main():
    # print(crypto_history.gather('20170101', '20170102', ['ethereum']))
    with open('./data/bitcoin_markets_daily_discussion_v1.csv',
              'a') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(
            ['comment_id', 'date', 'body', 'parent_id', 'vader_scores'])
        for submission in get_daily_discussion(300):
            title = submission.title
            date = title.split(']')[-1].strip()
            print('\nSubmission Date: {0}'.format(date))
            non_removed_comment_count = 0
            submission.comments.replace_more(limit=None)
Example #11
def analyze(soup_object):

    #select only the headlines in each google search result
    base = soup_object.select("div.g.card h3")

    #declare empty list where I'm going to put all the headlines
    headlines = []

    #loop to get rid of all html and keep only headline text
    for row in base:
        clean = row.text
        headlines.append(clean)

    #print to verify headlines are clean
    #print(headlines)

    #empty list to store tokenized headlines
    tokens = []

    #loop to tokenize headlines by using clean_tokens method from earlier
    for each in headlines:
        clean = clean_tokens(each)
        tokens.append(clean)

    #print tokens to verify
    #print('tokens =',tokens)

    #create stopwords list from nltk and add fallout 76 as stopwords
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.append("fallout")
    stopwords.append('76')
    stopwords.append("'fallout")

    #remove stopwords from tokens
    filtered = []
    for token_list in tokens:
        kept = []
        for word in token_list:
            if word not in stopwords:
                kept.append(word)
        filtered.append(kept)

    #print to verify stopwords are gone
    #print("filtered = ", filtered)

    #declare empty list so I can put the tokens back into headlines without stopwords
    combined = []

    #put tokens back into headlines without stopwords
    for token_list in filtered:
        combined.append(" ".join(token_list))

    #import sentiment analyzer
    from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

    # create the SentimentIntensityAnalyzer and an empty list for the results
    sia = SIA()
    results = []

    #analyze sentiment of each headline
    for line in combined:
        pol_score = sia.polarity_scores(line)
        results.append(pol_score)

    #print to verify
    #for i in results:
    #print(i)

    #write sentiment analysis to csv file
    with open("resultsback.csv", 'a') as csv_file:
        writer = csv.writer(csv_file)
        for d in results:
            writer.writerow(['compound', d['compound']])
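
# Hypothetical usage sketch (not in the original): fetch a Google results page and hand the
# parsed soup to analyze(). Assumes requests and BeautifulSoup are installed and that
# clean_tokens() and the csv/nltk imports are already defined in this module.
import requests
from bs4 import BeautifulSoup

response = requests.get("https://www.google.com/search?q=fallout+76",
                        headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(response.text, "html.parser")
analyze(soup)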
Example #12
def process_user_text(user_text, goal_category):
    #put user input text string into a DataFrame
    clean_data = pd.DataFrame(user_text, columns = ["text"]) 
    clean_data["source"] = goal_category
    clean_data["subreddit"] = "placeholder"

    ## Starting with textstat  
    textstat_results = pd.DataFrame(columns = ['flesch_ease', 'flesch_grade','gfog',
               'auto_readability','cl_index','lw_formula','dcr_score', 
               'syll_count', 'lex_count'])

    for i in clean_data["text"]: 
        results = textstat_stats(str(i)) #textstat needs a string
        textstat_results = textstat_results.append(results, ignore_index=True) #so that index is continuous

    # Resetting indices here may be unnecessary
    textstat_results = textstat_results.reset_index(drop=True)

    combined_data = pd.concat([clean_data, textstat_results], axis = 1)

    ## Moving on to NLTK part-of-speech tagging
    combined_data_wordtokens = []
    for document in combined_data["text"]:
        tokens = nltk.word_tokenize(str(document))
        combined_data_wordtokens.append(tokens)

    combined_data_wordpos = []
    for document in combined_data_wordtokens:
        pos = nltk.pos_tag(document) #default is Penn Treebank tagset
        combined_data_wordpos.append(pos)
        
    pos_keys = ['#', '$', '“', '(', ')', ',', '.', ':', 'CC', 'CD', 'DT', 'EX', 
                'FW', 'IN', 'JJ', 'JJR', 'JJS','LS', 'MD', 'NN', 'NNP', 'NNPS', 
                'NNS', 'PDT', 'POS', 'PRP', 'PRP$','RB', 'RBR', 'RBS', 'RP', 
                'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN','VBP', 'VBZ', 
                'WDT', 'WP', 'WP$', 'WRB', '”']

    pos_counts = []

    for document in combined_data_wordpos:
        doc_length = len(document)
        mini_dict = Counter([pos for word,pos in document])
        for pos in pos_keys:
            if pos not in mini_dict:
                mini_dict[pos] = 0
        scaled_dict = {k: v / doc_length for k, v in mini_dict.items()}
        pos_counts.append(scaled_dict)

    pos_df = pd.DataFrame(pos_counts)
    pos_df = pos_df.fillna(0)

    combined_data = pd.concat([combined_data, pos_df], axis = 1)

    ## Add sentiment intensity
    sia = SIA()
    results = []

    for line in combined_data["text"]: # this returns a list of dicts
        pol_score = sia.polarity_scores(line)
        pol_score['text'] = line
        results.append(pol_score)

    sia_neg = []
    sia_pos = []
    sia_neu = []
    sia_comp = []
    for document in results:
        neg = document['neg']
        pos = document['pos']
        neu = document['neu']
        comp = document['compound']
        sia_neg.append(neg)
        sia_pos.append(pos)
        sia_neu.append(neu)
        sia_comp.append(comp)
        
    combined_data["SIA_neg"] = sia_neg
    combined_data["SIA_pos"] = sia_pos
    combined_data["SIA_neu"] = sia_neu
    combined_data["SIA_com"] = sia_comp


    ## Now for the custom features
    Google_Curses = pd.read_csv("./insight2019/flask_app/my_flask/model/RobertJGabriel_Google_swear_words.txt", header = None)
    bad_words = Google_Curses[0].tolist()

    any_bad = []
    for row in combined_data["text"]:
        if any(str(word) in str(row) for word in bad_words):
            any_bad.append(1)
        else: any_bad.append(0)

    combined_data["Google_curses"] = any_bad
    combined_data["Google_curses"].value_counts()

    emoji_counts = []
    for row in combined_data["text"]:
        emoji_num = len(emoji_counter(str(row)))
        emoji_counts.append(emoji_num)

    combined_data["Num_emoji"] = emoji_counts
    combined_data["Num_emoji"].value_counts()

    internet_yelling = []
    for row in combined_data["text"]:
        screams = scream_counter(str(row))
        internet_yelling.append(screams)

    combined_data["Yell_count"] = internet_yelling

    return combined_data
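
# Hypothetical usage sketch (not in the original): run one comment through the feature
# pipeline. Assumes the module's helpers (textstat_stats, emoji_counter, scream_counter)
# and the swear-word list referenced above are in place; 'advice' is a placeholder category.
example_features = process_user_text(["This is AMAZING, thank you so much!!!"], "advice")
print(example_features[["SIA_com", "Google_curses", "Num_emoji", "Yell_count"]])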
Example #13
def news():
    if request.method == "POST":
        query = request.form.get("search")
        keyword = query
        pageSize = 100
        lang = "en"
        apiKey = '673e46eca5794640b6c370ed313a5c80'
        sortBy = "popularity"
        head = "https://newsapi.org/v2/everything?q={}&pageSize={}&sortBy={}&language={}&apiKey={}".format(
            keyword, pageSize, sortBy, lang, apiKey)
        response = requests.get(head)
        response_json_string = json.dumps(response.json())
        response_dict = json.loads(response_json_string)
        articles_list = response_dict['articles']
        df_news = pd.read_json(json.dumps(articles_list))

        if "Unnamed: 0" in df_news.columns:
            df_news.drop("Unnamed: 0", axis=1, inplace=True)

        df_news.dropna(subset=['title'], axis=0, inplace=True)
        df_news.reset_index(drop=True, inplace=True)

        sia = SIA()
        df_news['score'] = None
        df_news['label'] = 0

        for i in range(len(df_news.index)):
            pol_score = sia.polarity_scores(df_news["title"][i])
            df_news.iloc[i:i + 1, 8:9] = pol_score['compound']
            df_news.loc[df_news['score'] > 0.2, 'label'] = 1
            df_news.loc[df_news['score'] < -0.2, 'label'] = -1

        df_news["cat"] = 0
        df_news.reset_index(inplace=True)

        searched = query.lower()
        test_list = searched.split()
        for i in range(len(df_news.index)):
            if (df_news["description"][i] != None):
                if (bool([
                        ele for ele in test_list
                        if (ele in (df_news["description"][i]).lower())
                ])):
                    df_news["cat"][i] = 1
            if (df_news["title"][i] != None):
                if (bool([
                        ele for ele in test_list
                        if (ele in (df_news["title"][i]).lower())
                ])):
                    df_news["cat"][i] = 1

        for i in range(len(df_news.index)):
            if (df_news["description"][i] != None):
                if (searched in (df_news["description"][i]).lower()):
                    df_news["cat"][i] = 2
            if (df_news["title"][i] != None):
                if (searched in (df_news["title"][i]).lower()):
                    df_news["cat"][i] = 2

        df_news.sort_values(by=["cat"], ascending=False, inplace=True)

        news_pos = df_news[df_news['label'] == 1]
        news_neu = df_news[df_news['label'] == 0]
        news_neg = df_news[df_news['label'] == -1]

        pos_n = news_pos[[
            'title', 'description', 'url', 'source', 'author', 'score'
        ]].to_dict(orient='records')
        neg_n = news_neg[[
            'title', 'description', 'url', 'source', 'author', 'score'
        ]].to_dict(orient='records')
        neu_n = news_neu[[
            'title', 'description', 'url', 'source', 'author', 'score'
        ]].to_dict(orient='records')

        if (len(pos_n) > 15):
            news_display_pos = 15
        else:
            news_display_pos = len(pos_n)

        if (len(neg_n) > 15):
            news_display_neg = 15
        else:
            news_display_neg = len(neg_n)

        if (len(neu_n) > 15):
            news_display_neu = 15
        else:
            news_display_neu = len(neu_n)

        return render_template("results_news.html",
                               pos_n=pos_n,
                               neg_n=neg_n,
                               neu_n=neu_n,
                               news_display_pos=news_display_pos,
                               news_display_neg=news_display_neg,
                               news_display_neu=news_display_neu)
    else:
        return render_template("news.html")
Example #14
def youtube():
    if request.method == "POST":
        os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1'
        service = get_authenticated_service()
        query = request.form.get("search")

        comments_from_youtube = search_videos_by_keyword(service,
                                                         q=query,
                                                         part='id,snippet',
                                                         eventType='completed',
                                                         type='video')
        df = pd.DataFrame(comments_from_youtube,
                          columns=[
                              'authorDisplayName', 'publishedAt', 'likeCount',
                              'viewerRating', 'totalReplyCount',
                              'authorChannelURL', 'comment'
                          ])
        #df = pd.read_csv('youtube_comments.csv')
        if "Unnamed: 0" in df.columns:
            df.drop("Unnamed: 0", axis=1, inplace=True)
        df.dropna(subset=["comment"], axis=0, inplace=True)
        df.reset_index(drop=True, inplace=True)
        df["pos"] = None
        df["neu"] = None
        df["neg"] = None
        df["comp"] = None
        df["label"] = 0
        sia = SIA()

        for i in range(len(df.index)):
            pol_score = sia.polarity_scores(df["comment"][i])
            df.iloc[i:i + 1, 7:8] = pol_score['pos']
            df.iloc[i:i + 1, 8:9] = pol_score['neu']
            df.iloc[i:i + 1, 9:10] = pol_score['neg']
            df.iloc[i:i + 1, 10:11] = pol_score['compound']
            if (pol_score['compound'] > 0.2):
                df.iloc[i:i + 1, 11:12] = 1
            elif (pol_score['compound'] < -0.2):
                df.iloc[i:i + 1, 11:12] = -1

        to_drop = df[df["label"] == 0].index
        df.drop(to_drop, inplace=True)
        df.reset_index(drop=True, inplace=True)
        df['likeCount-normalized'] = df['likeCount'] / df['likeCount'].max()
        df["SCORE"] = df['likeCount-normalized'] * df['comp']
        df.sort_values(by=['SCORE'], ascending=False, inplace=True)
        to_drop = df[df["SCORE"] == 0].index
        df.drop(to_drop, inplace=True)
        df.reset_index(drop=True, inplace=True)
        df["SCORE"] = df['SCORE'].map(lambda x: round(x, 5))

        df["cat"] = 0
        df.reset_index(inplace=True)

        searched = query.lower()
        test_list = searched.split()
        for i in range(len(df.index)):
            if (df["comment"][i] != None):
                if (bool([
                        ele for ele in test_list
                        if (ele in (df["comment"][i]).lower())
                ])):
                    df["cat"][i] = 1

        for i in range(len(df.index)):
            if (df["comment"][i] != None):
                if (searched in (df["comment"][i]).lower()):
                    df["cat"][i] = 2

        df.sort_values(by=["cat"], ascending=False, inplace=True)

        df_pos = df[df["label"] == 1]
        df_neg = df[df["label"] == -1]

        #df_neg.sort_values(by=['SCORE'],ascending=True,inplace=True)
        pos_yt = df_pos[[
            'authorDisplayName', 'comment', 'SCORE', 'authorChannelURL'
        ]].to_dict(orient='records')
        neg_yt = df_neg[[
            'authorDisplayName', 'comment', 'SCORE', 'authorChannelURL'
        ]].to_dict(orient='records')

        if (len(pos_yt) > 15):
            to_display_pos = 15
        else:
            to_display_pos = len(pos_yt)

        if (len(neg_yt) > 15):
            to_display_neg = 15
        else:
            to_display_neg = len(neg_yt)

        return render_template("results_youtube.html",
                               pos_yt=pos_yt,
                               neg_yt=neg_yt,
                               to_display_pos=to_display_pos,
                               to_display_neg=to_display_neg)
    else:
        return render_template("youtube.html")
def get_world_politics():
    sia = SIA()
    headlines = []
    post_score = []
    polarity_score = []
    pos_list = []
    neg_list = []

    request = make_international_request_using_cache(world_politics_url)
    json_data = json.loads(request)
    data_all = json_data['data']['children']
    num_of_posts = 0
    while len(data_all) <= 1000:
        time.sleep(2)
        last = data_all[-1]['data']['name']
        url = 'https://www.reddit.com/r/worldpolitics/.json?after=' + str(last)
        req = make_international_request_using_cache(url)
        data = json.loads(req)
        data_all += data['data']['children']
        if num_of_posts == len(data_all):
            break
        else:
            num_of_posts = len(data_all)

    for post in data_all:
        headlines.append(post['data']['title'])
        post_score.append(post['data']['score'])
        res = sia.polarity_scores(post['data']['title'])
        polarity_score.append(res['compound'])
        if res['compound'] > 0.2:
            pos_list.append(post['data']['title'])
        elif res['compound'] < -0.2:
            neg_list.append(post['data']['title'])

    with open("pos_news_titles_international.txt",
              "w",
              encoding='utf-8',
              errors='ignore') as f_pos:
        for post in pos_list:
            f_pos.write(post + "\n")

    with open("neg_news_titles_international.txt",
              "w",
              encoding='utf-8',
              errors='ignore') as f_neg:
        for post in neg_list:
            f_neg.write(post + "\n")

    try:
        conn = sqlite3.connect(DBNAME)
        cur = conn.cursor()
    except sqlite3.Error as e:
        print(e)

    # International Table: ID, Headline, Alpha2, Polarity Score, Upvote Score

    for post in data_all:
        insertion = (None, post['data']['title'], None, None,
                     post['data']['score'])
        statement = 'INSERT INTO "International" '
        statement += 'VALUES (?, ?, ?, ?, ?)'
        cur.execute(statement, insertion)

    for post in polarity_score:
        cur.execute('INSERT INTO International (Polarity) VALUES (?)',
                    (post, ))

    conn.commit()
    conn.close()
Example #16
def setup_sia():
    # Sentiment Intensity Analyser
    sia = SIA()
    populate_dictionary(sia)
    return sia
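
# A possible sketch (assumption, not from the original) of what populate_dictionary()
# might do: extend VADER's built-in lexicon with domain-specific terms and valences.
def populate_dictionary(sia):
    sia.lexicon.update({
        "hodl": 1.5,   # hypothetical domain terms and scores
        "rekt": -2.0,
        "moon": 2.0,
    })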
Example #17
def gatherFeaturesAndMakeSIA(start_date, end_date, showPlots):

    print("Gathering csv data for training. Please wait...")

    pd.options.mode.chained_assignment = None  # default='warn'

    # Get the market info for Bitcoin
    bitcoin_market_info = pd.read_html(
        "https://coinmarketcap.com/currencies/bitcoin/historical-data/?start=20130428&end="
        + time.strftime("%Y%m%d"))[0]
    bitcoin_market_info = bitcoin_market_info.assign(
        Date=pd.to_datetime(bitcoin_market_info['Date']))
    bitcoin_market_info.loc[bitcoin_market_info['Volume'] == "-", 'Volume'] = 0
    bitcoin_market_info['Volume'] = bitcoin_market_info['Volume'].astype(
        'int64')

    market_info = bitcoin_market_info

    # filter the requested data
    market_info = market_info[market_info['Date'] <= end_date]
    market_info = market_info[market_info['Date'] >= start_date]

    # calculate the no of days in between the given dates
    datetime_object = datetime.strptime(start_date, '%Y-%m-%d')
    datetime_object1 = datetime.strptime(end_date, '%Y-%m-%d')
    tmp = datetime_object1 - datetime_object
    days_in_between = tmp.days + 1

    # print('Days in between the given dates - ' + str(days_in_between))

    # Create Day Diff column with values
    kwargs = {
        'Day Diff': lambda x: (x['Open*'] - x['Close**']) / x['Open*'] * 10000
    }
    market_info = market_info.assign(**kwargs)

    # Create Close Off High and Volatility column
    kwargs = {
        'Close Off High':
        lambda x: ((2 * (x['High'] - x['Close**'])) /
                   (x['High'] - x['Low']) - 1) * 10000,
        'Volatility':
        lambda x: (x['High'] - x['Low']) / (x['Open*'])
    }
    market_info = market_info.assign(**kwargs)

    # add Movement column (initially random values)
    market_info = market_info.assign(
        Movement=pd.Series(np.random.randn(len(market_info['Open*']))).values)

    # reverse array for ascending order
    market_info = market_info.sort_values(by='Date')

    # Resetting the indexes
    market_info = market_info.reset_index(drop=True)

    # Replace Movement random values with actual values -> UP or DOWN
    i = 0
    while i < (
            len(market_info) -
            1):  # -1 because we cannot compare the last day with the next one

        market_info['Movement'][
            i] = market_info['Close**'][i] - market_info['Close**'][i + 1]
        x = market_info['Movement'][i]
        if (x > 0):
            market_info['Movement'][i] = 'Down'
        elif (x < 0):
            market_info['Movement'][i] = 'Up'
        i = i + 1

    # drop rest keep only what we need
    model_data = market_info[[
        'Date', 'Close**', 'Volume', 'Close Off High', 'Day Diff',
        'Volatility', 'Movement'
    ]]

    # Dropping the last row, which doesn't have a movement value
    model_data = model_data.drop(model_data.index[len(model_data) - 1])
    # print(model_data.to_string())

    # ----------------------------------------from 3 April ->June 2--------------------------------------------------------
    sia = SIA()

    # ------------------------SIA--------------------------------------
    fileCounter = days_in_between
    sentimentList = []
    list_dates = market_info['Date']
    list_prices = market_info['Close**']

    posValues = []
    negValues = []
    neutralValues = []

    # do not read file number 1 which is 2 June, because it was dropped before
    while fileCounter > 1:
        with open('redditCommentsBitcoinMarkets 3 April - 2 June/' +
                  str(fileCounter),
                  'r',
                  encoding='utf-8',
                  errors='ignore') as file:

            neutralCounter = 0
            positiveCounter = 0
            negativeCounter = 0

            for line in file:
                res = sia.polarity_scores(line)
                if res['compound'] > 0.2:
                    positiveCounter += 1
                elif res['compound'] < -0.2:
                    negativeCounter += 1
                else:
                    neutralCounter += 1

            posValues.append(positiveCounter)
            negValues.append(negativeCounter)
            neutralValues.append(neutralCounter)

            fileCounter -= 1

        totalSentiment = positiveCounter + negativeCounter + neutralCounter
        sentimentList.append(
            ((positiveCounter - negativeCounter) / totalSentiment) * 1000)

    if showPlots:
        # ---------------plot for SIA----------------------------------------------
        plt.plot(list_dates, posValues, color='g', label='positive')
        plt.plot(list_dates, negValues, color='red', label='negative')
        plt.plot(list_dates, neutralValues, color='orange', label='neutral')
        plt.plot(list_dates, list_prices, color='blue', label='price')
        plt.gcf().autofmt_xdate()
        plt.legend()
        plt.xlabel('Time line')
        plt.ylabel('No. of comments')
        plt.title('Positive, negative & neutral sentiment over time')
        plt.show()

    # ------------------------SIA2--------------------------------------

    fileCounter = days_in_between
    sentimentList2 = []

    posValues = []
    negValues = []
    neutralValues = []

    # do not read file number 1 which is 2 June, because it was dropped before
    while fileCounter > 1:

        with open('redditCommentBitcoin 3 April - 2 June/' + str(fileCounter),
                  'r',
                  encoding='utf-8',
                  errors='ignore') as file:

            neutralCounter = 0
            positiveCounter = 0
            negativeCounter = 0

            for line in file:
                res = sia.polarity_scores(line)
                if res['compound'] > 0.2:
                    positiveCounter += 1
                elif res['compound'] < -0.2:
                    negativeCounter += 1
                else:
                    neutralCounter += 1

            posValues.append(positiveCounter)
            negValues.append(negativeCounter)
            neutralValues.append(neutralCounter)

            fileCounter -= 1

        totalSentiment = positiveCounter + negativeCounter + neutralCounter
        sentimentList2.append(
            ((positiveCounter - negativeCounter) / totalSentiment) * 1000)

    if showPlots:
        # ---------------plot for SIA2----------------------------------------------
        plt.plot(list_dates, posValues, color='g', label='positive')
        plt.plot(list_dates, negValues, color='red', label='negative')
        plt.plot(list_dates, neutralValues, color='orange', label='neutral')
        plt.plot(list_dates, list_prices, color='blue', label='price')
        plt.gcf().autofmt_xdate()
        plt.legend()
        plt.xlabel('Time line')
        plt.ylabel('No. of comments')
        plt.title('Positive, negative & neutral sentiment over time')
        plt.show()

    with open('train.csv', 'w') as csvfile:
        fieldnames = [
            'Date', 'Close**', 'Volume', 'Close Off High', 'Day Diff',
            'Volatility', 'SIA', 'SIA2', 'Movement'
        ]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        i = 0
        while i < (len(market_info) - 1):

            writer.writerow({
                'Date': model_data['Date'][i],
                'Close**': model_data['Close**'][i],
                'Volume': model_data['Volume'][i],
                'Close Off High': model_data['Close Off High'][i],
                'Day Diff': model_data['Day Diff'][i],
                'Volatility': model_data['Volatility'][i],
                'SIA': sentimentList[i],
                'SIA2': sentimentList2[i],
                'Movement': model_data['Movement'][i]
            })
            i = i + 1

    print("Finished train data set! Output -> train.csv.")
Example #18

def get_features(track_id):
    features_results = sp.audio_features([track_id])
    json_features = json.dumps(features_results)
    features_data = json.loads(json_features)

    # Convert features dictionary to a list
    features_list = list(features_data[0].values())

    return features_list


client_credentials_manager = SpotifyClientCredentials()
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
sentiment_analyzer = SIA()

# IDs of monthly playlists from November 2016 to November 2017
playlist_ids = [
    "07zqCIPCroFMKSajvERGvE", "30PgYnoeT2PAgFNuYLR5qd",
    "1vS1nakUrLYkTd3W0yRMYe", "3scPGVlAn7d74uXRtFnmUC",
    "5LzBRPVAPYUssZ4ZESnRmH", "6hDHXewz8qBTezvONSqzyl",
    "00riJCYiVJ1yptAXtv2h6k", "0HxFI5dOlKztf38T9sa0cF",
    "7EFWm7Mjy6GLJHOEgKEblM", "6YAG0Li1BoUkmhc8iycY6l",
    "7Iw0yI71QX59zyFq0kAZTS", "69XTCqVzbSWPMLucSvzlLl",
    "7pRnKuQMkmntEj7Nnj94r0"
]

# Audio features
feature_names = [
    "danceability", "energy", "key", "loudness", "mode", "speechiness",
Example #19
def update_tweets(tweets_to_check, collection, language, searchQuery,
                  constituent, st):
    sia = SIA()

    tokenizer = TweetTokenizer(preserve_case=True,
                               reduce_len=True,
                               strip_handles=False)

    list_of_tweets = []
    for tweet in tweets_to_check:
        doc = tweet._json

        # skip tweets that are already stored (a pymongo Cursor is always truthy, so check for an actual document)
        if collection.find_one({"id_str": doc["id_str"]}, {"id_str": 1}):
            continue

        list_of_tweets.append(doc)

    if language != "en":
        if not do_translation(list_of_tweets):
            if not do_translation(list_of_tweets):
                return None

    for document in list_of_tweets:
        if language == 'en':
            document.update(preprocess_tweet(document['text']))
        else:
            document.update(preprocess_tweet(document['text_en']))

        document['search_term'] = searchQuery
        document['constituent'] = constituent
        document['language'] = language

        date = datetime.strptime(document['created_at'],
                                 '%a %b %d %H:%M:%S %z %Y')
        document['date'] = date

        #Update sentiment
        sentiment_score, sentiment = get_nltk_sentiment(
            document["semi_processed_text"], sia)
        document["nltk_sentiment_score"] = sentiment
        document["nltk_sentiment_number"] = sentiment_score

        #Update tags
        document["tag_LOCATION"] = list()
        document["tag_PERSON"] = list()
        document["tag_ORGANIZATION"] = list()
        document["tag_MONEY"] = list()
        document["tag_PERCENT"] = list()
        document["tag_DATE"] = list()
        document["tag_TIME"] = list()

        if language == 'en':
            text = document['text']
        else:
            text = document['text_en']

        for word, tag in get_tags(text, st, tokenizer):
            if tag != "O":
                document["tag_" + tag].append(word)

        if 'retweeted_status' in document:
            document.pop('retweeted_status', None)

    return list_of_tweets
Example #20
def main():
    # Authentication
    api = get_api_authentication()
    assert api

    # ----- Data storing -----

    # Users and their information
    users = {
        "user_id": [],  # .id
        "username": [],  # .screen_name
        "location": [],  # .location
        "created": [],  # .created_at
        "followers": [],  # .followers_count
        "following": [],  # .friends_count
        "tweets": []  # .statuses_count
    }

    # Tweets and its information
    tweets = {
        "user_id": [],  # .user.id
        "tweet_id": [],  # .id
        "tweet": [],  # .text/.full_text (normal/extended tweet)
        "is_retweet": [],  #  True/False
        "created": [],  # .created_at
        "retweets": [],  # .retweet_count
        "likes": []  # .favorite_count
    }

    tweet_keyword_counter = defaultdict(list)
    tweet_keyword_freq = defaultdict(list)

    # List definitions
    user_ids = []
    hashtags = []
    results = []

    # SIA Initializer
    sia = SIA()

    # Get dataframe with keywords
    df = retrieve_keyword_dataframe()

    # Process dataframe to be a list of keywords
    df2 = process_dataframe(df)

    # Keywords with only unique values
    keywords = list(
        filter(None, set(df2[0].astype(str).values.flatten().tolist())))

    # main algorithm (getting users from initial search) (2 tweets per keyword)
    for word in keywords:
        for status in tweepy.Cursor(api.search,
                                    q=word,
                                    tweet_mode='extended',
                                    lang="en").items(2):

            # Add to user account list to then iterate through
            if status.user.id not in user_ids:
                user_ids.append(status.user.id)

    # Check all users tweets
    # 1. For every account id that we've saved from initial search, add user
    # 2. For every tweet in that account (with a limit), add tweet
    # 3. For every keyword in our wordlist
    # 4. If that keyword exist in the current tweet: save information.
    for acc_id in user_ids:
        user = api.get_user(acc_id)

        # Add user information
        if user.id not in users["user_id"]:
            users["user_id"].append(user.id)
            users["username"].append(user.screen_name)
            users["location"].append(
                user.location)  #TODO: Check if correct location before adding
            users["created"].append(user.created_at)
            users["followers"].append(user.followers_count)
            users["following"].append(user.friends_count)
            users["tweets"].append(user.statuses_count)

        ret = 0
        for status in tweepy.Cursor(api.user_timeline,
                                    screen_name=user.screen_name,
                                    tweet_mode='extended').items(500):
            for word in keywords:
                if word in status.full_text:

                    # Polarity score
                    pol_score = sia.polarity_scores(status.full_text)
                    pol_score['tweet'] = status.full_text
                    results.append(pol_score)

                    # Add tweet information
                    if status.id not in tweets["tweet_id"]:
                        tweets["user_id"].append(status.user.id)
                        tweets["tweet_id"].append(status.id)
                        tweets["tweet"].append(status.full_text)
                        if hasattr(status, 'retweeted_status'):
                            tweets["is_retweet"].append(True)
                            ret += 1
                        else:
                            tweets["is_retweet"].append(False)
                        tweets["created"].append(status.created_at)
                        tweets["retweets"].append(status.retweet_count)
                        tweets["likes"].append(status.favorite_count)

                    # Add hashtags if exists
                    if hasattr(status, "entities"):
                        if "hashtags" in status.entities:
                            hashtag = [
                                ent["text"]
                                for ent in status.entities["hashtags"]
                                if "text" in ent and ent is not None
                            ]
                            if hashtag is not None:
                                hashtags.append(hashtag)

                    # Add keywords and their counters
                    tweet_keyword_counter["tweet_id"].append(status.id)
                    tweet_keyword_counter["keyword"].append(word)
                    tweet_keyword_counter["tweet_time"].append(
                        status.created_at)

    # Get keyword and each frequency
    k_count_freq = calculate_frequency(tweet_keyword_counter["keyword"])
    for k, val in k_count_freq.items():
        tweet_keyword_freq["keyword"].append(k)
        tweet_keyword_freq["counter"].append(val)

    # PANDAS AND CSV WRITING
    users_df = pd.DataFrame(users)
    tweets_df = pd.DataFrame(tweets)
    keys_df = pd.DataFrame(tweet_keyword_freq)

    # Add a column flagging whether or not the tweet is risky
    df = pd.DataFrame.from_records(results)
    tweets_df['risk'] = 0
    tweets_df.loc[df['compound'] > 0.2, 'risk'] = 1
    tweets_df.loc[df['compound'] < -0.2, 'risk'] = -1

    # Users
    with open(f'tweets/twitter_users.csv', 'w+', encoding="utf-8",
              newline='') as file:
        users_df.to_csv(file, index=False)

    # Tweets
    with open(f'tweets/twitter_tweets.csv', 'w+', encoding="utf-8",
              newline='') as file:
        tweets_df.to_csv(file, index=False)

    # Keywords with datetime
    with open(f'tweets/twitter_keywords.csv',
              'w+',
              encoding='utf-8',
              newline='') as file:
        keys_df.to_csv(file, index=False)
def calculate_sentiment(text):
    sia = SIA()
    return sia.polarity_scores(text)
Example #22
plt.show()

# Sentiment Analysis Submissions

sns.set(style='darkgrid', context='talk', palette='Blues_d')

headlines = set()

for submission in reddit.subreddit('Bitcoin').top(limit=1000):
    headlines.add(submission.title)
    display.clear_output()
    print(len(headlines))

from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

sia = SIA()

results = []

for line in headlines:
    ripple_score = sia.polarity_scores(line)
    ripple_score['headline'] = line
    results.append(ripple_score)

pprint(results[:3], width=100)

df = pd.DataFrame.from_records(results)

df.head()

df['label'] = 0
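
# A likely continuation (assumed, not in the original excerpt): label each headline from
# its compound score, mirroring the 0.2 / -0.2 thresholds used in the other examples.
df.loc[df['compound'] > 0.2, 'label'] = 1
df.loc[df['compound'] < -0.2, 'label'] = -1
print(df['label'].value_counts(normalize=True) * 100)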
Example #23
def polarity_calculator(cls, string_data):  # takes in string
    sia = SIA()
    strng = cls.cleaner(string_data)
    pol_score = sia.polarity_scores(strng)
    pol_score['article'] = string_data
    return pol_score  # returns dict
    q1_df = yelp[yelp["date"] > frame_q1]

    q2_df = yelp[(yelp["date"] < (frame_q1 + timedelta(60)))
                 & (yelp["date"] > (frame_q2 + timedelta(60)))]

    for frame, typer in zip([q1_df, q2_df], ["q1", "q2"]):
        yelp = frame
        from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

        yelp["positive"] = 0
        yelp["compound"] = 0.0
        yelp["negative"] = 0
        yelp["neutral"] = 0

        analyzer = SIA()
        for sentence, row in zip(yelp["review"], list(range(yelp.shape[0]))):
            vs = analyzer.polarity_scores(sentence)
            yelp["compound"][row] = float(vs["compound"])
            if vs["compound"] < -0.5:
                yelp["negative"][row] = 1
            elif vs["compound"] > 0.5:
                yelp["positive"][row] = 1
            else:
                yelp["neutral"][row] = 1
                # print("{:-<65} {}".format(sentence, str(vs)))

        worst = yelp[(yelp["rating"] < 3) & (yelp["compound"] < -.3)]
        worst = worst.sort_values("date", ascending=False).reset_index()

        best = yelp[(yelp["rating"] > 2) & (yelp["compound"] > 0)]
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

sia = SIA()
results = []

headlines = [
    "I am suffering from fever", "I am having a headache",
    "kill those bastards", "I want to strike an airplane to new WTC",
    "music fever", "My kid is suffering from fever", "meseals is growing fast",
    "सर दबग लोगों से मेरे जानमाल की सुरक्षा करा । इन दबग लोगो के सामने आज शास्त्री पार्क थाने कि पुलिस भी बेबस…",
    "I’m going to kill myself ... mariachi band playing in the restaurant when I have a headache",
    "I had a headache today so I couldn’t go anywhere and do much. I laced in bed 75% of the time.",
    "Wake up with a sinus headache aaand the guy across the street cutting the grass (from before 7)... Let's restart this day eh?"
]

for line in headlines:
    pol_score = sia.polarity_scores(line)
    pol_score['headline'] = line
    results.append(pol_score)

for i in results:
    print(i)
def sentiment_analysis(message):
    sia = SIA()
    p_score = sia.polarity_scores(message)
    p = p_score['compound']
    speedometer(p)