def friends_loop(self):
    """Endlessly work through ``data/friends.txt`` and store users' friends.

    Every queued line is split on ``":"``; the first field is used as the
    user id and the second is handed to ``calculate_days_diff_from_today``.
    Entries more than one day old are processed: if the stored user has no
    ``'friends'`` key, the friend ids are fetched through the REST client
    and inserted into the database.  Younger entries are appended back to
    the end of the file.  The loop never returns.
    """
    database = mongoController.mongoController()
    while True:
        print("REST API PROCESSING FRIENDS")
        # get user id from file
        friends_id_line = self.file_controller.get_and_remove_first_line(
            "data/friends.txt")
        # The helper answers False when it has no line to hand out
        # (presumably an empty file); back off for two minutes.
        if friends_id_line == False:
            time.sleep(60 * 2)
            continue

        split_friends_id_line = friends_id_line.split(":")
        user_id = split_friends_id_line[0]
        date_difference = self.calculate_days_diff_from_today(
            split_friends_id_line[1])

        if date_difference.days > 1:
            # NOTE(review): assumes the user document exists; a missing
            # user would make the membership test below raise - confirm.
            user = database.users.get_user_with_id(user_id)
            if 'friends' not in user:
                # get users friends and insert into db
                # Fix: the lookup previously received friends_id_line[0],
                # i.e. only the first character of the raw line, instead
                # of the full user id.
                friends = self.rest_client.get_friends_ids(user_id)
                database.friends.insert_friends(user_id, friends)
        else:
            # not old enough yet: push the entry to the back of the queue
            self.file_controller.append_one_line("data/friends.txt",
                                                 friends_id_line)
def retweets_group(self, group):
    """Count retweet relations for the tweets assigned to ``group``.

    Args:
        group: value compared against each tweet's ``'group'`` field.

    Returns:
        dict: keyed by the ``'user'`` value of every individual retweet
        entry; each value is a dict mapping the tweeting user's ``'idd'``
        to the number of such retweet entries.
    """
    database = mongoController.mongoController()
    retweet_map = {}
    for user in database.users.get_all_users():
        if 'tweets' not in user:
            continue
        for tweet in user['tweets']:
            if 'retweets' not in tweet:
                continue
            if not ('group' in tweet and tweet['group'] == group):
                continue
            for chain in tweet['retweets']:
                for single_retweet in chain['retweets']:
                    retweeter = single_retweet['user']
                    author = user['idd']
                    per_author = retweet_map.setdefault(retweeter, {})
                    per_author[author] = per_author.get(author, 0) + 1
    return retweet_map
def quotes_group(self, group):
    """Count quote relations for the tweets assigned to ``group``.

    Args:
        group: value compared against each tweet's ``'group'`` field.

    Returns:
        dict: keyed by the tweeting user's ``'idd'``; each value is a dict
        mapping a tweet's ``'quote_user'`` to how many matching tweets
        carrying a ``'quote_status'`` key name that user.
    """
    database = mongoController.mongoController()
    quote_map = {}
    for user in database.users.get_all_users():
        if 'tweets' not in user:
            continue
        for tweet in user['tweets']:
            if 'quote_status' not in tweet:
                continue
            if not ('group' in tweet and tweet['group'] == group):
                continue
            author = user['idd']
            quoted = tweet['quote_user']
            per_quoted = quote_map.setdefault(author, {})
            per_quoted[quoted] = per_quoted.get(quoted, 0) + 1
    return quote_map
def retweet_loop(self):
    """Endlessly work through ``data/retweets.txt`` and store retweets.

    Each queued line is split on ``':'``: the first field is the tweet id,
    the second is passed to ``calculate_days_diff_from_today``.  Entries
    older than one day have their retweets fetched, parsed and inserted;
    younger entries go back to the end of the file.  Never returns.
    """
    database = mongoController.mongoController()
    while True:
        print("REST API PROCESSING RETWEETS")

        # Take the next queued entry off the top of the file.
        queued_line = self.file_controller.get_and_remove_first_line(
            "data/retweets.txt")
        if queued_line == False:
            # No line available: pause for two minutes, then poll again.
            time.sleep(2 * 60)
            continue

        # How long ago was this tweet queued?
        fields = queued_line.split(':')
        tweet_id = fields[0]
        age = self.calculate_days_diff_from_today(fields[1])

        if age.days <= 1:
            # Too recent: re-queue at the bottom of the file.
            self.file_controller.append_one_line("data/retweets.txt",
                                                 queued_line)
            continue

        # Ready: fetch, parse and attach the retweets to the stored tweet.
        fetched = self.rest_client.get_retweets(tweet_id)
        parsed = self.retweet_parser.parse_retweet_chain(tweet_id, fetched)
        database.retweets.insert_retweets(tweet_id, parsed)
def parse_tweets(self, final_tweets_list, json_tweet):
    """Recursively convert a tweet (and the tweets it wraps) for storage.

    Tweets already known to the database are skipped.  An unknown author is
    created first.  A retweet is not stored itself: its original tweet is
    parsed instead and the original's id is queued in ``data/retweets.txt``
    together with today's date.  Any other tweet is converted with
    ``mongo_tweet_structure``; a quoted tweet, if present, is parsed before
    the converted tweet is appended.

    Args:
        final_tweets_list: list that collects the converted tweets; it is
            extended in place.
        json_tweet: dict describing one tweet.

    Returns:
        The same ``final_tweets_list`` object.
    """
    print("PARSING TWEET")
    file_controller = fileController.fileController()
    database = mongoController.mongoController()

    # Repeated tweet: nothing to do.
    if database.tweets.get_tweet_exists(json_tweet['user']['id'],
                                        json_tweet['id']):
        return final_tweets_list

    # Make sure the author is in the database.
    if not database.users.get_user_with_id(json_tweet['user']['id']):
        self.create_new_user(database, json_tweet['user'])

    if 'retweeted_status' in json_tweet:
        # Retweet: parse the original and queue it for retweet collection.
        self.parse_tweets(final_tweets_list, json_tweet['retweeted_status'])
        retweet_line = ':'.join([
            str(json_tweet['retweeted_status']['id']),
            str(datetime.date.today()),
        ])
        file_controller.append_one_line("data/retweets.txt", retweet_line)
        return final_tweets_list

    # Regular tweet: convert it, handling an embedded quoted tweet first.
    converted = self.mongo_tweet_structure(json_tweet)
    if 'quoted_status' in json_tweet:
        self.parse_tweets(final_tweets_list, json_tweet['quoted_status'])
    final_tweets_list.append(converted)
    return final_tweets_list
def kmeans(self):
    """Cluster all stored tweets with k-means and save each tweet's group.

    The tweet texts returned by ``self.process()`` are turned into TF-IDF
    vectors (English stop words removed), clustered into
    ``self.n_clusters`` groups, and the predicted cluster of every tweet is
    written to the database via ``insert_group_to_tweet``.
    """
    print("PROCESSING GROUPS")
    database = mongoController.mongoController()

    # get all tweets
    documents, ids = self.process()

    # turn tweets into TF-IDF representation
    vectorizer = TfidfVectorizer(
        stop_words=nltk.corpus.stopwords.words('english'))
    X = vectorizer.fit_transform(documents)

    model = KMeans(n_clusters=self.n_clusters,
                   init='k-means++',
                   max_iter=10000,
                   n_init=1)
    model.fit(X)

    # Header kept so the console output is unchanged.  The unused
    # order_centroids/terms computations that followed it were dropped:
    # they called vectorizer.get_feature_names(), which no longer exists
    # in scikit-learn >= 1.2 and would abort the whole run.
    print("Top terms per cluster:")

    # Assign groups to each tweet and insert into database.  X already
    # holds the TF-IDF row of every document, so one batched predict
    # replaces re-vectorising and predicting each tweet separately.
    predictions = model.predict(X)
    total = len(documents)
    for i, (tweet_id, group) in enumerate(zip(ids, predictions)):
        print("Processing Kmeans for tweet " + str(i) + "/" + str(total))
        database.tweets.insert_group_to_tweet(tweet_id, group)
def total_tweets(self):
    """Return how many tweets are stored in total.

    Counts every entry of the ``'tweets'`` collection of every document
    returned by ``database.tweets.get_all_tweets()``.
    """
    database = mongoController.mongoController()
    return sum(
        1
        for user_tweets in database.tweets.get_all_tweets()
        for _ in user_tweets['tweets']
    )
def total_retweets(self):
    """Return the summed length of the ``'retweets'`` field of all tweets.

    Tweets without a ``'retweets'`` key contribute nothing.
    """
    database = mongoController.mongoController()
    return sum(
        len(tweet['retweets'])
        for user_tweets in database.tweets.get_all_tweets()
        for tweet in user_tweets['tweets']
        if 'retweets' in tweet
    )
def total_quotes(self):
    """Return the number of stored tweets that have a ``'quote_status'`` key."""
    database = mongoController.mongoController()
    return sum(
        1
        for user_tweets in database.tweets.get_all_tweets()
        for tweet in user_tweets['tweets']
        if 'quote_status' in tweet
    )
def total_retweets_group(self, group):
    """Return the summed ``'retweets'`` length over the tweets of one group.

    Args:
        group: value compared against each tweet's ``'group'`` field.

    Returns:
        int: sum of ``len(tweet['retweets'])`` over all tweets whose
        ``'group'`` equals ``group`` and that have a ``'retweets'`` key.
    """
    database = mongoController.mongoController()
    return sum(
        len(tweet['retweets'])
        for user_tweets in database.tweets.get_all_tweets()
        for tweet in user_tweets['tweets']
        if 'group' in tweet and tweet['group'] == group
        and 'retweets' in tweet
    )
def total_tweets_group(self, group):
    """Return how many stored tweets belong to ``group``.

    Args:
        group: value compared against each tweet's ``'group'`` field.

    Returns:
        int: number of tweets that have a ``'group'`` key equal to
        ``group``.
    """
    database = mongoController.mongoController()
    return sum(
        1
        for user_tweets in database.tweets.get_all_tweets()
        for tweet in user_tweets['tweets']
        if 'group' in tweet and tweet['group'] == group
    )
def total_quote_group(self, group):
    """Return how many tweets of ``group`` have a ``'quote_status'`` key.

    Args:
        group: value compared against each tweet's ``'group'`` field.

    Returns:
        int: number of tweets whose ``'group'`` equals ``group`` and that
        contain a ``'quote_status'`` key.
    """
    database = mongoController.mongoController()
    return sum(
        1
        for user_tweets in database.tweets.get_all_tweets()
        for tweet in user_tweets['tweets']
        if 'group' in tweet and tweet['group'] == group
        and 'quote_status' in tweet
    )
def process(self):
    """Collect the cleaned text and id of every stored tweet.

    For each tweet the text is split on whitespace, words of three
    characters or fewer are dropped, the rest is re-joined with single
    spaces and anything matching ``http\\S+`` is removed.

    Returns:
        tuple: ``(documents, ids)`` - two parallel lists holding the
        cleaned texts and the tweets' ``'idd'`` values.
    """
    database = mongoController.mongoController()
    documents, ids = [], []
    for user_tweets in database.tweets.get_all_tweets():
        for entry in user_tweets['tweets']:
            long_words = filter(lambda word: len(word) > 3,
                                entry['text'].split())
            documents.append(re.sub(r'http\S+', '', ' '.join(long_words)))
            ids.append(entry['idd'])
    return documents, ids
def parse_followers(self, followers):
    """Register unknown followers as users and return all follower ids.

    Args:
        followers: iterable of objects exposing their raw data as
            ``_json``.

    Returns:
        list: the ``'id'`` of every follower, in input order.
    """
    database = mongoController.mongoController()
    follower_ids = []
    for follower in followers:
        # Round-trip through JSON to obtain a plain dict copy of the data.
        profile = json.loads(json.dumps(follower._json))

        # Create the user when the database does not know it yet.
        if not database.users.get_user_exists(profile['id']):
            database.users.insert_a_user(
                self.user_parser.parse_user(profile))

        follower_ids.append(profile['id'])
    return follower_ids
def timeline_loop(self):
    """Endlessly work through ``data/users.txt`` and store user timelines.

    Each queued line is passed unchanged to the REST client to fetch that
    user's tweets; the parsed result is inserted into the database one
    non-empty item at a time.  Never returns.
    """
    database = mongoController.mongoController()
    while True:
        print("REST API PROCESSING USERS TIMELINE")

        # Take the next queued user off the top of the file.
        queued_line = self.file_controller.get_and_remove_first_line(
            "data/users.txt")
        if queued_line == False:
            # No line available: pause for two minutes, then poll again.
            time.sleep(2 * 60)
            continue

        raw_tweets = self.rest_client.get_tweets_from_user(queued_line)

        # Parse, then insert every non-empty parsed item.
        parsed_tweets = self.tweet_parser.parse_rest_tweet_chain(raw_tweets)
        if parsed_tweets is None:
            continue
        for parsed_tweet in parsed_tweets:
            if len(parsed_tweet) == 0:
                continue
            database.tweets.insert_tweets(parsed_tweet)
def followers_loop(self):
    """Endlessly work through ``data/followers.txt`` and store followers.

    Each queued line is split on ``':'``: the first field is the user id,
    the second is passed to ``calculate_days_diff_from_today``.  Entries
    older than one day are processed when the stored user has no
    ``'followers'`` key, alternating between the two follower endpoints of
    the REST client on successive fetches.  Younger entries go back to the
    end of the file.  Never returns.
    """
    database = mongoController.mongoController()
    api_toggle = 0  # 0 -> full follower objects, 1 -> follower ids only
    while True:
        print("REST API PROCESSING FOLLOWERS")

        # Take the next queued user off the top of the file.
        queued_line = self.file_controller.get_and_remove_first_line(
            "data/followers.txt")
        if queued_line == False:
            # No line available: pause for two minutes, then poll again.
            time.sleep(2 * 60)
            continue

        fields = queued_line.split(':')
        age = self.calculate_days_diff_from_today(fields[1])
        if age.days <= 1:
            # Too recent: re-queue at the bottom of the file.
            self.file_controller.append_one_line("data/followers.txt",
                                                 queued_line)
            continue

        user_id = fields[0]
        user = database.users.get_user_with_id(user_id)
        if 'followers' in user:
            continue

        # Switch between both follower APIs.
        if api_toggle == 0:
            fetched = self.rest_client.get_followers(user_id)
            all_followers = self.follower_parser.parse_followers(fetched)
        else:
            all_followers = self.rest_client.get_followers_ids(user_id)

        # Insert followers to db, then flip the toggle for the next fetch.
        database.followers.insert_followers(user_id, all_followers)
        api_toggle = 1 - api_toggle
def hashtags_groups(self, group):
    """Count hashtag usage per user for the tweets assigned to ``group``.

    Args:
        group: value compared against each tweet's ``'group'`` field.

    Returns:
        dict: keyed by hashtag; each value is a dict mapping a tweet's
        ``'user'`` value to how often that hashtag occurs in that user's
        matching tweets.
    """
    database = mongoController.mongoController()
    hashtag_map = {}
    for user in database.users.get_all_users():
        if 'tweets' not in user:
            continue
        for tweet in user['tweets']:
            if not ('group' in tweet and tweet['group'] == group):
                continue
            for hashtag in tweet['hashtags']:
                per_user = hashtag_map.setdefault(hashtag, {})
                tweeter = tweet['user']
                per_user[tweeter] = per_user.get(tweeter, 0) + 1
    return hashtag_map
def hashtags(self):
    """Count hashtag usage per user across all stored tweets.

    Returns:
        dict: keyed by hashtag; each value is a dict mapping a tweet's
        ``'user'`` value to how often that hashtag occurs in that user's
        tweets.
    """
    database = mongoController.mongoController()
    # Only users that actually carry a 'tweets' field are considered.
    usable_users = [
        user for user in database.users.get_all_users() if 'tweets' in user
    ]
    hashtag_map = {}
    for user in usable_users:
        for tweet in user['tweets']:
            for hashtag in tweet['hashtags']:
                per_user = hashtag_map.setdefault(hashtag, {})
                tweeter = tweet['user']
                per_user[tweeter] = per_user.get(tweeter, 0) + 1
    return hashtag_map
def replies(self):
    """Count reply relations across all stored tweets.

    Returns:
        dict: keyed by the tweeting user's ``'idd'``; each value is a dict
        mapping a tweet's ``'response_user'`` to how many of the user's
        tweets carrying a ``'response_status'`` key name that user.
    """
    database = mongoController.mongoController()
    reply_map = {}
    for user in database.users.get_all_users():
        if 'tweets' not in user:
            continue
        for tweet in user['tweets']:
            if 'response_status' not in tweet:
                continue
            author = user['idd']
            target = tweet['response_user']
            per_target = reply_map.setdefault(author, {})
            per_target[target] = per_target.get(target, 0) + 1
    return reply_map
)
# Command-line dispatch: runs only when exactly one argument is given, and
# that argument selects the action.  An unrecognised argument does nothing.
if len(sys.argv) == 2:
    # Start both the streamer and the REST API workers.
    if (sys.argv[1]) == 'run':
        print("RUNNING STREAMER AND REST API")
        run()
    # Start the streamer only.
    if (sys.argv[1]) == 'run-stream':
        print("RUNNING STREAMER")
        stream()
    # Start the REST API workers only.
    if (sys.argv[1]) == 'run-api':
        print("RUNNING REST API")
        api()
    # Process the collected tweets.
    if (sys.argv[1]) == 'process':
        print("PROCESSING TWEETS")
        process_results()
        print("TWEETS PROCESSED")
    # Print status information.
    if (sys.argv[1]) == 'status':
        print("PRINTING STATUS")
        status()
    # Remove every user from the database, then print what is left.
    if (sys.argv[1] == 'purge'):
        print("PURGING DATABASE")
        database = mongoController.mongoController()
        database.users.remove_all_users()
        print(database.users.get_all_users())
        print("PURGED DATABASE")
    if (sys.argv[1] == "manual"):
        manual()

## Write to file for debug
# file_controller = fileController.fileController()
# file_controller.write_data_to_file("test.json", database.users.get_all_users())
def __init__(self):
    """Create the tweet parser, user parser and database controller.

    The three helpers are stored as ``self.tweet_parser``,
    ``self.user_parser`` and ``self.database``.
    """
    self.tweet_parser = tweetParser.tweetParser()
    self.user_parser = userParser.userParser()
    self.database = mongoController.mongoController()