def base_model():
    """Train and evaluate the baseline like-count classifier on account metadata.

    Steps, in order:

    1. Load every document of the ``twitter_new`` collection into a DataFrame.
    2. Keep the account/retweet features and add ``tweet_per_month``.
    3. Turn the ``favorites`` count into classes (1-5 -> 1, 6-10 -> 2,
       11+ -> 3; every other value, e.g. 0, is left as it is).
    4. Under-sample with NearMiss (version 1) and split 70/30.
    5. Cross-validate and fit a random forest, then evaluate on the test split.
    6. Fit a decision tree on the forest's own training predictions and hand
       it to ``tree_feature_importance`` for interpretation.

    Returns:
        None. Results are reported by ``k_fold_cv``, ``evaluation`` and
        ``tree_feature_importance``.
    """
    mongo_connect = MongoHandler()
    like_tweets = mongo_connect.retrieve_from_collection("twitter_new")
    df = pd.DataFrame(list(like_tweets))

    # Column / feature selection
    feature_names = [
        'user_followers', 'user_friends', 'user_favourites', 'user_months',
        'user_statuses', 'user_verified', 'retweets'
    ]
    # +1 on both sides keeps the ratio finite when user_months is 0.
    tweet_per_month = ((df['user_statuses'] + 1) /
                       (df['user_months'] + 1)).round(2)
    base = df[feature_names].assign(tweet_per_month=tweet_per_month)
    columns = base.columns.tolist()

    # Transform the regression problem into a multi-class classification.
    # All three conditions are computed on the raw counts, so one class never
    # feeds into the test for another, and df itself is left untouched.
    favorites = df['favorites']
    is_low = (favorites > 0) & (favorites < 6)
    is_medium = (favorites >= 6) & (favorites < 11)
    is_high = favorites >= 11
    target = favorites.mask(is_low, 1).mask(is_medium, 2).mask(is_high, 3)

    # NOTE(review): under-sampling runs before the split, so the test split is
    # drawn from the re-sampled data as well - confirm this is intended.
    nm1 = NearMiss(version=1)  # Under-sampling technique
    base, target = nm1.fit_resample(base, target)

    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        base, target, random_state=1, test_size=0.3)

    # Earlier experiments in this spot used a decision tree, logistic
    # regression, multinomial naive Bayes, an extra tree and AdaBoost.
    model = RandomForestClassifier(n_estimators=500, random_state=5)

    k_fold_cv(model, x_train, y_train, False)

    model.fit(x_train, y_train)
    y_predict = model.predict(x_test)
    evaluation(y_test, y_predict)

    # Feature importance / interpretability: a single tree is trained to
    # imitate the forest's predictions on the training data.
    new_y_train = model.predict(x_train)
    tree_model = tree.DecisionTreeClassifier(criterion="entropy",
                                             random_state=5)
    tree_model.fit(x_train, new_y_train)
    # calls function for interpretable ML
    tree_feature_importance(tree_model, columns, x_train)
def __init__(self):
    """Build the authenticated Twitter API client and the Mongo connection."""
    oauth = OAuthHandler(consumer_key, consumer_secret)
    oauth.set_access_token(access_token, access_token_secret)

    # Rate-limit handling is delegated to the API client via these flags.
    rate_limit_options = {
        "wait_on_rate_limit": True,
        "wait_on_rate_limit_notify": True,
    }
    self.api = API(oauth, **rate_limit_options)
    self.connection = MongoHandler()
def __init__(self):
    """Create the Instaloader client and the Mongo connection.

    The loader is configured to skip pictures, video thumbnails and videos,
    to leave JSON uncompressed and to keep Instaloader's ``sleep`` option on.
    No login is performed.
    """
    # Bug fix: the loader used to be bound to a local name only, so the
    # instance attribute ``loader`` was never set.
    self.loader = Instaloader(download_pictures=False,
                              download_video_thumbnails=False,
                              download_videos=False,
                              compress_json=False,
                              sleep=True)
    self.connection = MongoHandler()
def read_tweets_and_instaposts(collection):
    """Load a collection and return its emotion features plus binned likes.

    Args:
        collection: Name of the Mongo collection to read.

    Returns:
        A ``(tweets, y)`` tuple. ``tweets`` is the shuffled DataFrame
        (``random_state=1``) restricted to ``_id``, ``text``, the emotion
        analysis columns and ``favorites``, with the ``positive:`` column
        renamed to ``positive``. ``y`` is a one-column DataFrame
        (``favorites``) on the same index, holding the like class:
        0-1 -> 0, 2-5 -> 1, 6-10 -> 2, above 10 -> 3. Values that fall in
        none of these ranges are kept unchanged.
    """
    mongo_connect = MongoHandler()
    # Retrieve the documents from the collection
    documents = mongo_connect.retrieve_from_collection(collection)
    tweets = pd.DataFrame(list(documents))
    tweets = tweets.sample(frac=1, random_state=1)

    # Keep emotion analysis features and y
    tweets = tweets[[
        '_id', 'text', 'negative', 'positive:', 's_anger', 's_disgust',
        's_fear', 's_joy', 's_sadness', 's_surprise', 'favorites'
    ]].rename(columns={"positive:": "positive"})

    # Handle like prediction as a classification problem with four bins.
    # The bins are evaluated on the raw counts in a single pass and written
    # into a brand-new frame, so nothing is assigned into a slice of `tweets`.
    favorites = tweets['favorites']
    conditions = [
        favorites.between(0, 1).to_numpy(),
        favorites.between(2, 5).to_numpy(),
        favorites.between(6, 10).to_numpy(),
        (favorites > 10).to_numpy(),
    ]
    binned = np.select(conditions, [0, 1, 2, 3], default=favorites.to_numpy())
    y = pd.DataFrame({'favorites': binned}, index=tweets.index)

    return tweets, y
class TweetMiner(object):
    """Collects tweets through the Twitter API and stores processed copies in MongoDB.

    NOTE(review): two statements inside ``get_user_tweets`` are garbled
    (``"******"``) in this copy of the file; they are reproduced untouched
    and marked where they occur.
    """

    # Twitter API client; set per instance in __init__.
    api = None
    # Mongo handler used for all reads and writes; set per instance in __init__.
    connection = None

    def __init__(self):
        """Authenticate with the configured keys and open the Mongo connection."""
        auth = OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_token_secret)
        self.api = API(auth,
                       wait_on_rate_limit=True,
                       wait_on_rate_limit_notify=True)
        self.connection = MongoHandler()

    # Retrieve tweets with a given tweet id
    def get_tweets_with_id(self):
        """Re-process stored tweets that are not yet present in ``twitter_new``.

        Reads the ``twitter`` and ``twitter_new`` collections, keeps the ids
        of ``twitter`` rows that are missing from ``twitter_new``, do not
        start with ``"RT @"``, pass the keyword filter and have at least five
        whitespace-separated words, then runs ``preprocess_tweet`` on every
        document fetched for each id. Prints the two counters at the end.
        """
        old_posts = self.connection.retrieve_from_collection("twitter")
        new_posts = self.connection.retrieve_from_collection("twitter_new")
        new_ids_list = [row["_id"] for row in new_posts]
        # NOTE(review): ("promo" or "giveaway") evaluates to "promo", so only
        # "promo" is actually excluded here; "giveaway" is never checked.
        # NOTE(review): new_ids_list is a list, so each membership test below
        # scans it linearly.
        ids_list = [
            row["_id"] for row in old_posts
            if not row["_id"] in new_ids_list
            and not row["full_text"].startswith("RT @") and
            ("promo" or "giveaway") not in row["full_text"]
            and len(row["full_text"].split()) >= 5
        ]
        print("Starting...")
        count0 = 0  # incremented only when TweepError is raised
        count1 = 0  # documents passed through preprocess_tweet
        for tweet_id in ids_list:
            try:
                # tweet = self.api.get_status(tweet_id, tweet_mode="extended")._json
                tweets = self.connection.get_with_id("twitter", {"_id": tweet_id})
                for tweet in tweets:
                    _pre_tweet = self.preprocess_tweet(tweet)
                    # print(json.dumps(pre_tweet, indent=4, sort_keys=True))
                    count1 += 1
            # NOTE(review): the only Twitter API call in this try body is
            # commented out, so this handler looks left over - confirm.
            except TweepError:
                count0 += 1
        print("--------------------------------")
        print(f"Number of found: {count1}")
        print("--------------------------------")
        print(f"Number of not found: {count0}")

    # Preprocess tweet text
    def preprocess_tweet(self, tweet):
        """Flatten one raw tweet dict, store it in ``twitter_new`` and return it.

        Args:
            tweet: Raw tweet mapping; the keys read are ``id``,
                ``created_at``, ``full_text``, ``entities``, ``user``,
                ``retweet_count``, ``favorite_count`` and ``is_quote_status``.

        Returns:
            The flattened dict that was stored.
        """
        tweet_dict = dict()
        tweet_dict["_id"] = tweet["id"]
        # Re-format the creation timestamp as YYYY-MM-DD.
        created_at = time.strftime(
            '%Y-%m-%d',
            time.strptime(tweet["created_at"], '%a %b %d %H:%M:%S +0000 %Y'))
        tweet_dict["created_at"] = created_at
        tweet_dict["text"] = preprocess_text(tweet["full_text"])
        tweet_dict["hashtags"] = [
            hashtag["text"] for hashtag in tweet["entities"]["hashtags"]
        ]
        # The loop variable is still called `hashtag` here and for the urls
        # below, although it iterates over mentions / urls.
        tweet_dict["mentions"] = [
            hashtag["name"] for hashtag in tweet["entities"]["user_mentions"]
        ]
        # NOTE(review): "hashtags" is assigned a second time with the same
        # expression; this repeat looks redundant.
        tweet_dict["hashtags"] = [
            hashtag["text"] for hashtag in tweet["entities"]["hashtags"]
        ]
        tweet_dict["urls"] = [
            hashtag["url"] for hashtag in tweet["entities"]["urls"]
        ]
        tweet_dict["user_id"] = tweet["user"]["id"]
        tweet_dict["user_name"] = tweet["user"]["name"]
        tweet_dict["user_screen_name"] = tweet["user"]["screen_name"]
        tweet_dict["user_location"] = tweet["user"]["location"]
        tweet_dict["user_followers"] = tweet["user"]["followers_count"]
        tweet_dict["user_friends"] = tweet["user"]["friends_count"]
        tweet_dict["user_listed"] = tweet["user"]["listed_count"]
        tweet_dict["user_favourites"] = tweet["user"]["favourites_count"]
        # Account age: whole calendar months between the account's creation
        # month and the current month.
        ts = time.strftime(
            '%Y-%m',
            time.strptime(tweet["user"]["created_at"],
                          '%a %b %d %H:%M:%S +0000 %Y'))
        date_time_obj = datetime.datetime.strptime(ts, '%Y-%m')
        end_date = datetime.datetime.now()
        num_months = (end_date.year - date_time_obj.year) * 12 + (
            end_date.month - date_time_obj.month)
        tweet_dict["user_months"] = num_months
        tweet_dict["user_statuses"] = tweet["user"]["statuses_count"]
        tweet_dict["user_verified"] = int(tweet["user"]["verified"])
        tweet_dict["retweets"] = tweet["retweet_count"]
        tweet_dict["favorites"] = tweet["favorite_count"]
        tweet_dict["is_quoted"] = tweet["is_quote_status"]
        self.connection.store_to_collection(tweet_dict, "twitter_new")
        return tweet_dict

    # Retrieve new tweets
    def get_new_tweets(self):
        """Search for English tweets matching ``@#ClimateChange`` and store them.

        A tweet is passed to ``preprocess_tweet`` when it does not start with
        ``"RT @"``, passes the keyword filter and has at least five
        whitespace-separated words. Prints the number of tweets kept.
        """
        count = 0
        for tweet in Cursor(self.api.search,
                            q="@#ClimateChange",
                            lang="en",
                            tweet_mode="extended").items():
            # NOTE(review): same ("promo" or "giveaway") issue as in
            # get_tweets_with_id - only "promo" is tested.
            if not tweet._json["full_text"].startswith("RT @") and (
                    "promo" or "giveaway") not in tweet._json["full_text"] and len(
                        tweet._json["full_text"].split()) >= 5:
                count += 1
                self.preprocess_tweet(tweet._json)
        print("--------------------------------")
        print(f"Number of found: {count}")

    # Get tweets from a particular user
    def get_user_tweets(self):
        """Collect keyword-matching English tweets for a slice of profiled users.

        For every matching status a dict with author details, cleaned text,
        sentiment, emotion scores and subjectivity is built and stored in the
        ``twitter_profiles_1K`` collection.

        NOTE(review): in the visible code ``re_list`` is never appended to
        (the append is commented out), so the return value appears to be an
        empty list.
        """
        re_list = []
        users = profiling.get_user_names()
        # for user in lexicons.deniers:
        # for user in lexicons.non_deniers:
        count_users = 0
        for user in users[489:500]:  # 363
            try:
                # NOTE(review): the next statement is garbled ("******") in
                # this copy of the file. The code that defines `statuses`,
                # `user_tweets` and `count_tweets` is presumably part of the
                # missing text - restore it from version control.
                print("User: "******"en", tweet_mode="extended")
                for status in statuses:
                    if any(keyword in status.full_text for keyword in lexicons.keywords) \
                            and len(status.full_text.split()) >= 5 \
                            and detect(status.full_text) == 'en':  # and not status.full_text.startswith("RT @"):
                        status_dict = dict()
                        status_dict["_id"] = status.id
                        status_dict[
                            "user_name"] = status.author.screen_name
                        status_dict["location"] = status.author.location
                        status_dict["description"] = preprocess_text(
                            status.author.description)
                        status_dict[
                            'date'] = f"{status.created_at.year}-{status.created_at.month}-{status.created_at.day}"
                        # Drop a leading "RT @user:" prefix before cleaning.
                        clean_text = preprocess_text(
                            re.sub(r'^RT\s@\w+:', r'', status.full_text))
                        status_dict["text"] = clean_text
                        # Sentiment is scored on the raw text, emotions on
                        # the cleaned text.
                        status_dict["sentiment"] = round(
                            sentiment_analyzer_scores(
                                status.full_text)['compound'], 3)
                        anger, anticipation, disgust, fear, joy, _negative, _positive, sadness, surprise, trust = get_emotions(
                            clean_text)
                        status_dict["anger"] = anger
                        status_dict["anticipation"] = anticipation
                        status_dict["disgust"] = disgust
                        status_dict["fear"] = fear
                        status_dict["joy"] = joy
                        status_dict["sadness"] = sadness
                        status_dict["surprise"] = surprise
                        status_dict["trust"] = trust
                        subj = TextBlob(''.join(
                            status.full_text)).sentiment
                        # Element 1 of the sentiment result is stored as the
                        # subjectivity score.
                        status_dict["subjectivity"] = round(subj[1], 3)
                        # status_dict["label"] = 0  # non - denier
                        # status_dict["label"] = 1  # denier
                        user_tweets.append(status_dict)
                # re_list.append(statuses)
                for status_dict in user_tweets:
                    try:
                        self.connection.store_to_collection(
                            status_dict, "twitter_profiles_1K"
                        )  # new_twitter_profiles for training data
                        count_tweets += 1
                    except pymongo.errors.DuplicateKeyError:
                        # print(status_dict.id)
                        print("exception")
                        continue
                # NOTE(review): the next statement is garbled ("******") as
                # well. The nesting of the lines from here to `break` cannot
                # be determined from this copy - verify against the original.
                print("Found ", count_tweets, " relevant tweets by the user: "******"test sleep!")
                time.sleep(300)
                print("test sleep ended!!!")
                if count_users > 1001:
                    print("break!")
                    break
            except tweepy.error.TweepError:
                print("Locked profile!")
                continue
            except langdetect.lang_detect_exception.LangDetectException:
                continue
        return re_list
from SentimentAnalysis.Supervised.feature_extraction import vectorize_test_dataframe, vectorize_train_dataframe, \ preprocces_dataframe, preprocces_mongo_tweets_and_posts import pandas as pd from sklearn.multiclass import OneVsRestClassifier from sklearn.naive_bayes import MultinomialNB from sklearn.model_selection import train_test_split from DataCollection.mongo import MongoHandler # Connect to mongo to retrieve tweets and insta post mongo_connect = MongoHandler() # tweets = mongo_connect.retrieve_from_collection("twitter") # Retrieve tweets from collection insta = mongo_connect.retrieve_from_collection( "instagram") # Retrieve insta posts from collection # Read tweets from mongo # mongo_tweets = pd.DataFrame(list(tweets)) # print(mongo_tweets.columns) # mongo_tweets = mongo_tweets.sample(frac=1, random_state=1) # mongo_tweets = mongo_tweets[['_id','text']] # Read instagram posts from mongo mongo_insta = pd.DataFrame(list(insta)) mongo_insta = mongo_insta.sample(frac=1, random_state=1) mongo_insta = mongo_insta[['_id', 'caption']] # Read the dataset data = pd.read_csv("2018-11 emotions-classification-train.txt", sep="\t") # Preprocces the dataset emotions = preprocces_dataframe(data) emotions_categories = [
class InstaMiner(object):
    """Collects Instagram posts for the ``climatechange`` hashtag into MongoDB."""

    # Instaloader client; set per instance in __init__.
    loader = None
    # Mongo handler used for all reads and writes; set per instance in __init__.
    connection = None

    # (key in the stored document, lookup path inside the raw post node)
    _POST_FIELDS = (
        ("caption", ("edge_media_to_caption", "edges", 0, "node", "text")),
        ("location", ("location",)),
        ("shortcode", ("shortcode",)),
        ("timestamp", ("taken_at_timestamp",)),
        ("liked_by", ("edge_liked_by", "count")),
        ("user_id", ("owner", "id")),
        ("username", ("owner", "username")),
        ("is_verified", ("owner", "is_verified")),
        ("is_private", ("owner", "is_private")),
    )

    def __init__(self):
        """Create the Instaloader client and the Mongo connection.

        No login is performed.
        """
        # Bug fix: the loader used to be bound to a local name only, which
        # left self.loader as None and made get_new_posts fail.
        self.loader = Instaloader(download_pictures=False,
                                  download_video_thumbnails=False,
                                  download_videos=False,
                                  compress_json=False,
                                  sleep=True)
        self.connection = MongoHandler()

    @staticmethod
    def _dig(node, *path):
        """Follow *path* through nested dicts/lists; return None if a step is missing."""
        current = node
        for step in path:
            try:
                current = current[step]
            except (KeyError, IndexError, TypeError):
                return None
        return current

    # Retrieve new posts from Instagram
    def get_new_posts(self):
        """Store one trimmed document per ``climatechange`` post in ``instagram``.

        The post's ``id`` is removed from the raw node and becomes ``_id``;
        the rest of the node is printed as JSON. Every other field is looked
        up along its path in ``_POST_FIELDS`` and stored as ``None`` when
        that path does not exist.
        """
        for post in self.loader.get_hashtag_posts('climatechange'):
            node = post._node
            # Keeping only necessary k-v
            new_post = {"_id": node.pop("id")}
            print(json.dumps(node, indent=4, sort_keys=True))
            for field, path in self._POST_FIELDS:
                new_post[field] = self._dig(node, *path)
            self.connection.store_to_collection(new_post, "instagram")

    # Preprocesses instagram posts
    def preprocess_posts(self):
        """Clean stored posts and copy the usable ones to ``instagram_new``.

        A post is kept when it has a caption of at least five
        whitespace-separated words that ``detect`` reports as ``'en'``.
        A post whose processing raises is reported and skipped; the number
        of stored posts is printed at the end.
        """
        posts = self.connection.retrieve_from_collection("instagram")
        count = 0
        for post in posts:
            caption = post["caption"]
            if not caption:
                continue
            try:
                if len(caption.split()) < 5 or detect(caption) != 'en':
                    continue
                new_post = {
                    "_id": int(post["_id"]),
                    "hashtags": get_hashtags(caption),
                    "mentions": get_mentions(caption),
                    "caption": preprocess_text(caption),
                    "shortcode": post["shortcode"],
                    "user_id": post["user_id"],
                    "likes": post["liked_by"],
                    "created_at": datetime.datetime.fromtimestamp(
                        post["timestamp"]).strftime("%Y-%m-%d"),
                }
                self.connection.store_to_collection(new_post, "instagram_new")
                count += 1
            except Exception as exc:
                # Still best-effort per post, but no longer swallows
                # KeyboardInterrupt/SystemExit and now says what went wrong.
                print(f"Skipping post: {exc!r}")
        print("--------------------------------")
        print(f"Number of found: {count}")
import pandas as pd import numpy as np from sklearn.preprocessing import MinMaxScaler from DataCollection.mongo import MongoHandler import matplotlib.pyplot as plt import seaborn as sns mongo_connect = MongoHandler() profiles = mongo_connect.retrieve_from_collection("twitter_profiles") df = pd.DataFrame(list(profiles)) df = df.sample(frac=1, random_state=1) df = df.drop('_id', axis=1) text = df['text'] Y = df['label'] # data = df[['sentiment','subjectivity', 'label']] # corr = data.corr() # cor_plot = sns.heatmap(corr, annot=True) # # plt.show() # df[df['label'] == 0].hist() # df[df['label'] == 1].hist() # print(df[df['label'] == 0].describe()) # print(df[df['label'] == 1].describe()) def word_cloud(df): import wordcloud from wordcloud import WordCloud
import numpy as np from sklearn.feature_extraction.text import TfidfVectorizer from sklearn import metrics, model_selection from sklearn.linear_model import LogisticRegression from DataCollection.mongo import MongoHandler from secret_keys import insta_username, insta_password import pickle from DataCollection import lexicons from nltk.corpus import words from collections import Counter from geotext import GeoText # from visualization import word_cloud mongo_connect = MongoHandler() def dummy(doc): return doc # Opinion mining function. Classifies Tweeter users into "Deniers, Non-Deniers or Uncertain" # regarding the issue of Climate Change denialism def predict_denier_profiles(): # svm_load = pickle.load(open('final_models/final_svm.pickle', 'rb')) # unused SVM model # loading the trained Logistic Regression model LR = pickle.load(open('../final_models/final_LR.pickle', 'rb')) # loading the pre-trained Tf-Idf Lexicon