def base_model():
    """Train and evaluate a like-count classifier on account-level features.

    Loads every document of the ``twitter_new`` collection, derives a
    ``tweet_per_month`` feature, bins ``favorites`` into four classes,
    under-samples with NearMiss (version 1), runs ``k_fold_cv`` on the
    training split, evaluates a random forest on the held-out 30 % and
    finally fits a decision tree on the forest's own training predictions,
    which is handed to ``tree_feature_importance``.

    Returns:
        None. The results of ``k_fold_cv``, ``evaluation`` and
        ``tree_feature_importance`` are not captured here.
    """
    mongo_connect = MongoHandler()
    like_tweets = mongo_connect.retrieve_from_collection("twitter_new")
    df = pd.DataFrame(list(like_tweets))

    # Column / feature selection
    base = df[[
        'user_followers', 'user_friends', 'user_favourites', 'user_months',
        'user_statuses', 'user_verified', 'retweets'
    ]]
    # The +1 in the denominator keeps the ratio defined when user_months is 0.
    tweet_per_month = ((base['user_statuses'] + 1) /
                       (base['user_months'] + 1)).round(2)
    base = base.assign(tweet_per_month=tweet_per_month)
    columns = base.columns.values.tolist()

    # Transform the regression problem into a multi-class classification.
    # Classes: 0 = unchanged values (zero likes), 1 = low (1-5),
    # 2 = medium (6-10), 3 = high (11+).
    # All masks are computed from the raw counts and applied to a copy, so
    # the bins cannot cascade into each other and ``df`` is left untouched.
    favorites = df['favorites']
    target = favorites.copy()
    target[(favorites > 0) & (favorites < 6)] = 1
    target[(favorites >= 6) & (favorites < 11)] = 2
    target[favorites >= 11] = 3

    nm1 = NearMiss(version=1)  # Under-sampling technique
    base, target = nm1.fit_resample(base, target)

    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        base, target, random_state=1, test_size=0.3)

    # Other estimators that were tried in place of the random forest:
    # model = tree.DecisionTreeClassifier(criterion="entropy", random_state=5)  # class_weight="balanced")
    # model = linear_model.LogisticRegression(solver="lbfgs", random_state=5)
    # model = naive_bayes.MultinomialNB()
    # model = tree.ExtraTreeClassifier(random_state=5)
    # model = AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=10), n_estimators=1000, random_state=5)
    model = RandomForestClassifier(n_estimators=500, random_state=5)

    k_fold_cv(model, x_train, y_train, False)

    model.fit(x_train, y_train)
    y_predict = model.predict(x_test)
    evaluation(y_test, y_predict)

    # Feature importance / interpretability: a single decision tree is fitted
    # to the forest's predictions on the training data (not the true labels).
    new_y_train = model.predict(x_train)
    tree_model = tree.DecisionTreeClassifier(
        criterion="entropy", random_state=5)  # class_weight="balanced")
    tree_model.fit(x_train, new_y_train)
    tree_feature_importance(tree_model, columns,
                            x_train)  # calls function for interpretable ML
def read_tweets_and_instaposts(collection):
    """Load emotion-analysis features and binned like counts from Mongo.

    Args:
        collection: Name of the Mongo collection to read.

    Returns:
        A ``(tweets, y)`` tuple. ``tweets`` is the frame shuffled with a
        fixed seed and restricted to ``_id``, ``text``, the sentiment and
        emotion columns and the raw ``favorites`` count (the source column
        ``positive:`` is renamed to ``positive``). ``y`` is a one-column
        frame ``favorites`` holding the class id: 0 for 0-1 likes, 1 for
        2-5, 2 for 6-10 and 3 for more than 10. Values outside every bin
        are left as they are.
    """
    mongo_connect = MongoHandler()
    tweets = mongo_connect.retrieve_from_collection(
        collection)  # Retrieve tweets from collection
    tweets = pd.DataFrame(list(tweets))
    tweets = tweets.sample(frac=1, random_state=1)
    tweets = tweets[[
        '_id', 'text', 'negative', 'positive:', 's_anger', 's_disgust',
        's_fear', 's_joy', 's_sadness', 's_surprise', 'favorites'
    ]].rename(columns={"positive:": "positive"
                       })  # Keep emotion analysis features and y

    # Handle like prediction as a classification problem with 4 bins
    # (0-1, 2-5, 6-10, 11+). The masks come from the raw counts and are
    # written into an explicit copy, which avoids assigning into a frame
    # derived from ``tweets``.
    raw_likes = tweets['favorites']
    y = tweets[['favorites']].copy()
    y.loc[raw_likes.between(0, 1), 'favorites'] = 0
    y.loc[raw_likes.between(2, 5), 'favorites'] = 1
    y.loc[raw_likes.between(6, 10), 'favorites'] = 2
    y.loc[raw_likes > 10, 'favorites'] = 3
    return tweets, y
class TweetMiner(object):
    api = None
    connection = None

    def __init__(self):
        """Build the API client from the module-level OAuth credentials and
        create the Mongo handler used by the other methods."""
        credentials = OAuthHandler(consumer_key, consumer_secret)
        credentials.set_access_token(access_token, access_token_secret)

        # Both rate-limit flags are switched on for the client.
        rate_limit_options = {
            "wait_on_rate_limit": True,
            "wait_on_rate_limit_notify": True,
        }
        self.api = API(credentials, **rate_limit_options)
        self.connection = MongoHandler()

    # Retrieve tweets with a given tweet id
    def get_tweets_with_id(self):
        """Re-process stored tweets that are not yet in ``twitter_new``.

        An id from the ``twitter`` collection is selected when its document
        is not already in ``twitter_new``, its text does not start with
        ``RT @``, the text contains neither "promo" nor "giveaway", and the
        text has at least five whitespace-separated words. Each selected id
        is looked up again with ``get_with_id`` and every returned document
        is passed to ``preprocess_tweet``.

        Prints the number of processed documents and the number of ids for
        which ``TweepError`` was raised. Returns None.
        """
        old_posts = self.connection.retrieve_from_collection("twitter")
        new_posts = self.connection.retrieve_from_collection("twitter_new")

        # A set makes the "already processed" check O(1) per document.
        known_ids = {row["_id"] for row in new_posts}
        # The previous test, ("promo" or "giveaway") not in text, collapsed
        # to "promo" not in text; both words are now checked.
        banned_words = ("promo", "giveaway")

        ids_list = []
        for row in old_posts:
            if row["_id"] in known_ids:
                continue
            text = row["full_text"]
            if text.startswith("RT @"):
                continue
            if any(word in text for word in banned_words):
                continue
            if len(text.split()) < 5:
                continue
            ids_list.append(row["_id"])

        print("Starting...")
        count0 = 0  # ids whose processing raised TweepError
        count1 = 0  # documents handed to preprocess_tweet
        for tweet_id in ids_list:
            try:
                # tweet = self.api.get_status(tweet_id, tweet_mode="extended")._json
                tweets = self.connection.get_with_id("twitter",
                                                     {"_id": tweet_id})
                for tweet in tweets:
                    self.preprocess_tweet(tweet)
                    count1 += 1
            except TweepError:
                count0 += 1

        print("--------------------------------")
        print(f"Number of found: {count1}")
        print("--------------------------------")
        print(f"Number of not found: {count0}")

    # Preprocess tweet text
    def preprocess_tweet(self, tweet):
        """Flatten a raw tweet dict, store it in ``twitter_new`` and return it.

        Args:
            tweet: Mapping with the keys read below (``id``, ``created_at``,
                ``full_text``, ``entities``, ``user``, ``retweet_count``,
                ``favorite_count``, ``is_quote_status``).

        Returns:
            The flattened dict that was passed to ``store_to_collection``.
        """
        stamp_format = '%a %b %d %H:%M:%S +0000 %Y'

        record = {"_id": tweet["id"]}
        posted = time.strptime(tweet["created_at"], stamp_format)
        record["created_at"] = time.strftime('%Y-%m-%d', posted)
        record["text"] = preprocess_text(tweet["full_text"])

        entities = tweet["entities"]
        record["hashtags"] = [tag["text"] for tag in entities["hashtags"]]
        record["mentions"] = [
            mention["name"] for mention in entities["user_mentions"]
        ]
        record["urls"] = [link["url"] for link in entities["urls"]]

        author = tweet["user"]
        record.update(
            user_id=author["id"],
            user_name=author["name"],
            user_screen_name=author["screen_name"],
            user_location=author["location"],
            user_followers=author["followers_count"],
            user_friends=author["friends_count"],
            user_listed=author["listed_count"],
            user_favourites=author["favourites_count"],
        )

        # Account age in whole calendar months, measured against local "now".
        joined = time.strptime(author["created_at"], stamp_format)
        today = datetime.datetime.now()
        record["user_months"] = (today.year - joined.tm_year) * 12 + (
            today.month - joined.tm_mon)

        record["user_statuses"] = author["statuses_count"]
        record["user_verified"] = int(author["verified"])
        record["retweets"] = tweet["retweet_count"]
        record["favorites"] = tweet["favorite_count"]
        record["is_quoted"] = tweet["is_quote_status"]

        self.connection.store_to_collection(record, "twitter_new")
        return record

    # Retrieve new tweets
    def get_new_tweets(self):
        """Search for new English tweets and preprocess the ones that qualify.

        A search result qualifies when its text does not start with
        ``RT @``, contains neither "promo" nor "giveaway", and has at least
        five whitespace-separated words. Each qualifying result is passed to
        ``preprocess_tweet``. Prints how many qualified. Returns None.
        """
        # The previous test, ("promo" or "giveaway") not in text, collapsed
        # to "promo" not in text; both words are now checked.
        banned_words = ("promo", "giveaway")
        count = 0
        search = Cursor(self.api.search,
                        q="@#ClimateChange",
                        lang="en",
                        tweet_mode="extended")
        for tweet in search.items():
            text = tweet._json["full_text"]
            if text.startswith("RT @"):
                continue
            if any(word in text for word in banned_words):
                continue
            if len(text.split()) < 5:
                continue
            count += 1
            self.preprocess_tweet(tweet._json)
        print("--------------------------------")
        print(f"Number of found: {count}")

    # Get tweets from a particular user
    def get_user_tweets(self):
        """Collect keyword-matching English tweets for a slice of users.

        NOTE(review): two lines of this method (the ``print("User: "`` line
        and the ``print("Found "`` line) contain ``******`` where text
        appears to have been removed, so the method is not valid Python as
        it stands. The initialisation of ``user_tweets`` and
        ``count_tweets``, the call that produces ``statuses`` and any
        increment of ``count_users`` are not visible here.

        What the visible code does: for ``users[489:500]`` taken from
        ``profiling.get_user_names()``, every status whose text contains one
        of ``lexicons.keywords``, has at least five words and is detected as
        English is turned into a dict (id, author fields, date, cleaned
        text, sentiment, emotions, subjectivity). The dicts in
        ``user_tweets`` are then stored in ``twitter_profiles_1K``.

        Returns:
            ``re_list``. The only append to it is commented out, so the
            visible code returns an empty list.
        """
        re_list = []
        users = profiling.get_user_names()
        # for user in lexicons.deniers:
        # for user in lexicons.non_deniers:

        count_users = 0
        for user in users[489:500]:  # 363
            try:
                # NOTE(review): the next two lines are truncated in the source
                # (text replaced by ``******``).
                print("User: "******"en",
                                                      tweet_mode="extended")
                    for status in statuses:
                        if any(keyword in status.full_text for keyword in lexicons.keywords) \
                                and len(status.full_text.split()) >= 5 \
                                and detect(status.full_text) == 'en':
                            # and not status.full_text.startswith("RT @"):
                            status_dict = dict()
                            status_dict["_id"] = status.id
                            status_dict[
                                "user_name"] = status.author.screen_name
                            status_dict["location"] = status.author.location
                            status_dict["description"] = preprocess_text(
                                status.author.description)
                            status_dict[
                                'date'] = f"{status.created_at.year}-{status.created_at.month}-{status.created_at.day}"
                            # Strip a leading "RT @user:" marker before cleaning.
                            clean_text = preprocess_text(
                                re.sub(r'^RT\s@\w+:', r'', status.full_text))
                            status_dict["text"] = clean_text

                            # Sentiment is scored on the raw text, not clean_text.
                            status_dict["sentiment"] = round(
                                sentiment_analyzer_scores(
                                    status.full_text)['compound'], 3)

                            # The negative/positive scores are unpacked but not stored.
                            anger, anticipation, disgust, fear, joy, _negative, _positive, sadness, surprise, trust = get_emotions(
                                clean_text)
                            status_dict["anger"] = anger
                            status_dict["anticipation"] = anticipation
                            status_dict["disgust"] = disgust
                            status_dict["fear"] = fear
                            status_dict["joy"] = joy
                            status_dict["sadness"] = sadness
                            status_dict["surprise"] = surprise
                            status_dict["trust"] = trust

                            # Element 1 of the TextBlob sentiment is stored as
                            # "subjectivity".
                            subj = TextBlob(''.join(
                                status.full_text)).sentiment
                            status_dict["subjectivity"] = round(subj[1], 3)

                            # status_dict["label"] = 0 # non - denier
                            # status_dict["label"] = 1 # denier
                            user_tweets.append(status_dict)
                    # re_list.append(statuses)
                for status_dict in user_tweets:
                    try:
                        self.connection.store_to_collection(
                            status_dict, "twitter_profiles_1K"
                        )  # new_twitter_profiles for training data
                        count_tweets += 1
                    except pymongo.errors.DuplicateKeyError:
                        # print(status_dict.id)
                        print("exception")
                        continue
                # NOTE(review): the next line is truncated in the source; the
                # condition guarding the sleep below is not visible.
                print("Found ", count_tweets, " relevant tweets by the user: "******"test sleep!")
                    time.sleep(300)
                    print("test sleep ended!!!")
                if count_users > 1001:
                    print("break!")
                    break
            except tweepy.error.TweepError:
                print("Locked profile!")
                continue
            except langdetect.lang_detect_exception.LangDetectException:
                continue

        return re_list
# Example #4
# 0
from SentimentAnalysis.Supervised.feature_extraction import vectorize_test_dataframe, vectorize_train_dataframe, \
    preprocces_dataframe, preprocces_mongo_tweets_and_posts
import pandas as pd
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from DataCollection.mongo import MongoHandler

# Open the Mongo connection and pull the raw Instagram posts.
# (The equivalent retrieval from the "twitter" collection is disabled.)
mongo_connect = MongoHandler()
insta = mongo_connect.retrieve_from_collection("instagram")

# Shuffle the Instagram posts with a fixed seed and keep only id + caption.
shuffled_insta = pd.DataFrame(list(insta)).sample(frac=1, random_state=1)
mongo_insta = shuffled_insta[['_id', 'caption']]

# Load the tab-separated emotions training set and preprocess it.
data = pd.read_csv("2018-11 emotions-classification-train.txt", sep="\t")
emotions = preprocces_dataframe(data)
emotions_categories = [
# Example #5
# 0
class InstaMiner(object):
    """Collects ``climatechange`` Instagram posts and stores them in Mongo."""

    # Instaloader client used to iterate over hashtag posts.
    loader = None
    # Mongo handler used for every read and write.
    connection = None

    def __init__(self):
        """Create the Instaloader client (all media downloads disabled) and
        the Mongo handler."""
        # Fix: the client used to be bound to a local variable only, which
        # left self.loader as None and made get_new_posts() fail.
        self.loader = Instaloader(download_pictures=False,
                                  download_video_thumbnails=False,
                                  download_videos=False,
                                  compress_json=False,
                                  sleep=True)
        # self.loader.login(insta_username, insta_password)
        self.connection = MongoHandler()

    @staticmethod
    def _dig(node, *path):
        """Return ``node[path[0]][path[1]]...`` or None if any step fails.

        A step fails on a missing key, an out-of-range index, or a
        non-subscriptable intermediate value such as None.
        """
        try:
            for step in path:
                node = node[step]
        except (KeyError, IndexError, TypeError):
            return None
        return node

    # Retrieve new posts from Instagram
    def get_new_posts(self):
        """Fetch posts for the ``climatechange`` hashtag and store a trimmed
        copy of each one in the ``instagram`` collection.

        The post id is required (``KeyError`` if absent). Every other field
        is stored as None when it cannot be read from the post's raw node.
        The remaining raw node is printed as JSON for each post.
        """
        # Stored field name -> lookup path inside the raw node.
        wanted = (
            ("caption", ("edge_media_to_caption", "edges", 0, "node",
                         "text")),
            ("location", ("location", )),
            ("shortcode", ("shortcode", )),
            ("timestamp", ("taken_at_timestamp", )),
            ("liked_by", ("edge_liked_by", "count")),
            ("user_id", ("owner", "id")),
            ("username", ("owner", "username")),
            ("is_verified", ("owner", "is_verified")),
            ("is_private", ("owner", "is_private")),
        )

        for post in self.loader.get_hashtag_posts('climatechange'):
            node = post._node
            # Keeping only necessary k-v; "id" is removed from the raw node.
            new_post = {"_id": node.pop("id")}
            print(json.dumps(node, indent=4, sort_keys=True))
            for field, path in wanted:
                new_post[field] = self._dig(node, *path)

            self.connection.store_to_collection(new_post, "instagram")

    # Preprocesses instagram posts
    def preprocess_posts(self):
        """Clean the stored posts and copy the usable ones to ``instagram_new``.

        A post is usable when it has a non-empty caption of at least five
        whitespace-separated words that ``detect`` classifies as ``'en'``.
        A post whose processing raises is reported and skipped. Prints the
        number of posts stored. Returns None.
        """
        posts = self.connection.retrieve_from_collection("instagram")

        count = 0
        for post in posts:
            caption = post["caption"]
            if not caption:
                continue
            try:
                if len(caption.split()) < 5 or detect(caption) != 'en':
                    continue
                new_post = {
                    "_id": int(post["_id"]),
                    "hashtags": get_hashtags(caption),
                    "mentions": get_mentions(caption),
                    "caption": preprocess_text(caption),
                    "shortcode": post["shortcode"],
                    "user_id": post["user_id"],
                    "likes": post["liked_by"],
                    "created_at": datetime.datetime.fromtimestamp(
                        post["timestamp"]).strftime("%Y-%m-%d"),
                }
                self.connection.store_to_collection(new_post, "instagram_new")
                count += 1
            except Exception as exc:
                # Best effort: one bad post must not stop the run, but say
                # what went wrong instead of printing a bare "1".
                print(f"Skipping post: {exc!r}")

        print("--------------------------------")
        print(f"Number of found: {count}")
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from DataCollection.mongo import MongoHandler
import matplotlib.pyplot as plt
import seaborn as sns

# Load the stored Twitter profiles, shuffle them with a fixed seed and drop
# the Mongo id column.
mongo_connect = MongoHandler()
profiles = mongo_connect.retrieve_from_collection("twitter_profiles")

df = (
    pd.DataFrame(list(profiles))
    .sample(frac=1, random_state=1)
    .drop(columns='_id')
)

# Tweet text and label column, pulled out as separate Series.
text, Y = df['text'], df['label']

# data = df[['sentiment','subjectivity', 'label']]
# corr = data.corr()
# cor_plot = sns.heatmap(corr, annot=True)
# # plt.show()

# df[df['label'] == 0].hist()
# df[df['label'] == 1].hist()
# print(df[df['label'] == 0].describe())
# print(df[df['label'] == 1].describe())


def word_cloud(df):
    import wordcloud
    from wordcloud import WordCloud