def dump_retweets_job(news: NewsItem, config: Config, twython_connector: TwythonConnector):
    """Fetch retweets for every tweet of a news item, append their extracted
    features to the item's dataframe, and write the dataframe back to CSV.

    Parameters
    ----------
    news : NewsItem
        Carries `tweet_data` (a dataframe with `tweet_id`, `retweet_count`,
        `fake` columns — presumably pandas; TODO confirm) and `dir`, the
        output CSV path.
    config : Config
        Unused here; kept for job-signature uniformity with sibling jobs.
    twython_connector : TwythonConnector
        Pool of Twython connections keyed by endpoint name.
    """
    tweet_data = news.tweet_data
    output_path = news.dir  # renamed local: original shadowed the builtin `dir`
    for tweet_id, retweet_count in zip(tweet_data.tweet_id, tweet_data.retweet_count):
        if retweet_count == 0:
            continue
        # Re-initialize per tweet: the original left `retweets`/`connection`
        # unbound when the very first fetch raised (NameError in the loop and
        # in the second except-handler), and reused stale retweets afterwards.
        retweets = []
        connection = None
        try:
            connection = twython_connector.get_twython_connection("get_retweet")
            retweets = connection.get_retweets(id=tweet_id, count=100, cursor=-1)
        except TwythonRateLimitError:
            logging.exception(
                "Twython API rate limit exception - tweet id : {}".format(tweet_id))
        except Exception:
            logging.exception(
                "Exception in getting retweets for tweet id %d using connection %s"
                % (tweet_id, connection))
        for retweet in retweets:
            tweet_data = tweet_data.append(
                extract_retweet_features(retweet, tweet_id, tweet_data['fake'][0]),
                ignore_index=True)
    print('Saving ' + output_path)
    tweet_data.to_csv(output_path, index=False)
def dump_retweets_job(tweet: Tweet, config: Config, twython_connector: TwythonConnector):
    """Fetch up to 100 retweets for `tweet` and dump them as
    <dump_location>/<source>/<label>/<news_id>/retweets/<tweet_id>.json.

    On any fetch failure an empty retweet list is written (best-effort,
    matching the original behavior).
    """
    retweets = []
    connection = None
    try:
        connection = twython_connector.get_twython_connection("get_retweet")
        retweets = connection.get_retweets(id=tweet.tweet_id, count=100, cursor=-1)
    except TwythonRateLimitError:
        logging.exception(
            "Twython API rate limit exception - tweet id : {}".format(
                tweet.tweet_id))
    except Exception:
        logging.exception(
            "Exception in getting retweets for tweet id %d using connection %s"
            % (tweet.tweet_id, connection))

    retweet_obj = {"retweets": retweets}

    dump_dir = "{}/{}/{}/{}".format(config.dump_location, tweet.news_source,
                                    tweet.label, tweet.news_id)
    retweet_dir = "{}/retweets".format(dump_dir)
    create_dir(dump_dir)
    create_dir(retweet_dir)
    # `with` closes the handle deterministically — the original leaked it via
    # json.dump(obj, open(...)).
    with open("{}/{}.json".format(retweet_dir, tweet.tweet_id), "w") as out_file:
        json.dump(retweet_obj, out_file)
def dump_tweet_information(tweet_chunk: list, config: Config, twython_connector: TwythonConnector):
    """Collect info and dump info of tweet chunk containing at most 100 tweets.

    Looks up all tweet ids in one `lookup_status` call (map=True keys the
    response by stringified id) and writes each found tweet object to
    <dump_location>/<source>/<label>/<news_id>/tweets/<tweet_id>.json.
    Returns None; errors are logged, not raised.
    """
    tweet_list = [tweet.tweet_id for tweet in tweet_chunk]
    try:
        tweet_objects_map = twython_connector.get_twython_connection(
            Constants.GET_TWEET).lookup_status(id=tweet_list,
                                               include_entities=True,
                                               map=True)['id']
        for tweet in tweet_chunk:
            tweet_object = tweet_objects_map[str(tweet.tweet_id)]
            if tweet_object:  # map=True yields None for deleted/missing ids
                dump_dir = "{}/{}/{}/{}".format(config.dump_location, tweet.news_source,
                                                tweet.label, tweet.news_id)
                tweet_dir = "{}/tweets".format(dump_dir)
                create_dir(dump_dir)
                create_dir(tweet_dir)
                # `with` closes the handle — original leaked it via
                # json.dump(obj, open(...)).
                with open("{}/{}.json".format(tweet_dir, tweet.tweet_id), "w") as out_file:
                    json.dump(tweet_object, out_file)
    except TwythonRateLimitError:
        print("Twython API rate limit exception")
        logging.exception("Twython API rate limit exception")
    except Exception as ex:
        logging.exception("exception in collecting tweet objects")
        print("exception in collecting tweet objects:", str(ex))
    return None
def dump_retweets_job(
    tweet: Tweet, config: Config, twython_connector: TwythonConnector
):
    """Fetch up to 100 retweets for `tweet` (when `_should_fetch_retweets`
    allows it) and dump them under <dump_dir>/retweets/<tweet_id>.json.

    When fetching is skipped or fails, an empty retweet list is written.
    """
    retweets = []
    connection = None
    dump_dir = get_dump_dir(config, tweet)
    if _should_fetch_retweets(tweet, dump_dir):
        try:
            connection = twython_connector.get_twython_connection(Constants.GET_RETWEET)
            retweets = connection.get_retweets(id=tweet.tweet_id, count=100, cursor=-1)
        except TwythonRateLimitError:
            logging.exception(
                "Twython API rate limit exception - tweet id : {}".format(
                    tweet.tweet_id
                )
            )
        except Exception:
            logging.exception(
                "Exception in getting retweets for tweet id %d using connection %s"
                % (tweet.tweet_id, connection)
            )
    # NOTE(review): the dump is unconditional here (mirrors the sibling variant
    # without the fetch guard) — confirm it should not sit inside the `if` above,
    # which would avoid overwriting an existing file with an empty list.
    retweet_obj = {"retweets": retweets}
    retweet_dir = "{}/retweets".format(dump_dir)
    create_dir(dump_dir)
    create_dir(retweet_dir)
    # `with` closes the handle — original leaked it via json.dump(obj, open(...)).
    with open("{}/{}.json".format(retweet_dir, tweet.tweet_id), "w") as out_file:
        json.dump(retweet_obj, out_file)
def dump_tweet_information(tweet: Tweet, config: Config, twython_connector: TwythonConnector):
    """Fetch a single tweet object via `show_status` and dump it to
    <dump_location>/<source>/<label>/<news_id>/tweets/<tweet_id>.json.

    Returns None; rate-limit and other errors are logged, not raised.
    """
    try:
        tweet_object = twython_connector.get_twython_connection(
            Constants.GET_TWEET).show_status(id=tweet.tweet_id)
        if tweet_object:
            dump_dir = "{}/{}/{}/{}".format(config.dump_location, tweet.news_source,
                                            tweet.label, tweet.news_id)
            tweet_dir = "{}/tweets".format(dump_dir)
            create_dir(dump_dir)
            create_dir(tweet_dir)
            # `with` closes the handle — original leaked it via
            # json.dump(obj, open(...)).
            with open("{}/{}.json".format(tweet_dir, tweet.tweet_id), "w") as out_file:
                json.dump(tweet_object, out_file)
    except TwythonRateLimitError:
        logging.exception("Twython API rate limit exception")
    except Exception:
        # binding the exception (`as ex`) was unused in the original
        logging.exception("exception in collecting tweet objects")
    return None
def __init__(self, data_dir, data_collection_dir, tweet_keys_file, num_process):
    """Store crawler configuration and build the Twython connection pool.

    `data_dir` is where the source dataset lives; `data_collection_dir`
    is where collected dumps are written.
    """
    # Plain configuration attributes first.
    self.tweet_keys_file = tweet_keys_file
    self.num_process = num_process
    self.dataset_dir = data_dir
    self.dump_location = data_collection_dir
    # Connector talks to the local key-rotation service.
    self.twython_connector = TwythonConnector("localhost:5000", tweet_keys_file)
def dump_retweets_job(tweet: Tweet, config: Config, twython_connector: TwythonConnector):
    """Fetch retweets for one tweet of a diffusion hop and dump them as
    <news_dir>/retweets_<hop_index>/<tweet_id>.json, skipping tweets whose
    retweet file already exists on disk.
    """
    hop_index = tweet.hop_index
    news_dir = f"{config.dump_location}/{tweet.news_source}/{tweet.label}/{tweet.news_id}"
    retweet_dir = f"{news_dir}/retweets_{hop_index}"
    retweet_path = f"{retweet_dir}/{tweet.tweet_id}.json"
    # Idempotency guard: a prior run already saved this tweet's retweets.
    if os.path.exists(retweet_path):
        print("[PASSED] news:{}, hop index: {}".format(tweet.news_id, hop_index))
        return
    else:
        print("[NEW] news:{}, hop index: {}".format(tweet.news_id, hop_index))
    retweets = []
    connection = None
    try:
        connection = twython_connector.get_twython_connection("get_retweet")
        retweets = connection.get_retweets(id=tweet.tweet_id, count=100, cursor=-1)
    except TwythonRateLimitError:
        logging.exception(f"Twython API rate limit exception - tweet id : {tweet.tweet_id}")
    except Exception:
        logging.exception(
            "Exception in getting retweets for tweet id %d using connection %s"
            % (tweet.tweet_id, connection))
    retweet_obj = {"retweets": retweets}
    create_dir(news_dir)
    create_dir(retweet_dir)
    # `with` closes the handle — original leaked it via json.dump(obj, open(...)).
    with open(retweet_path, "w") as out_file:
        json.dump(retweet_obj, out_file)
def dump_user_following(user_id, save_location, twython_connector: TwythonConnector):
    """Fetch the ids a user follows and save them to
    <save_location>/<user_id>.json, skipping users already on disk.
    """
    save_path = "{}/{}.json".format(save_location, user_id)
    # Fetch and save user information if the file is not already present
    if not Path(save_path).is_file():
        try:
            user_following = fetch_user_friends_ids(
                user_id, twython_connector.get_twython_connection(GET_FRIENDS_ID))
            user_following_info = {USER_ID: user_id, FOLLOWING: user_following}
            # `with` closes the handle — original leaked it via
            # json.dump(obj, open(...)).
            with open(save_path, "w") as out_file:
                json.dump(user_following_info, out_file)
        except Exception:
            # narrowed from a bare `except:` (which would also swallow
            # KeyboardInterrupt/SystemExit)
            logging.exception("Exception in getting follower_ids for user : {}".format(user_id))
def dump_user_profile_job(user_id, save_location, twython_connector: TwythonConnector):
    """Fetch a user's profile via `show_user` and save it to
    <save_location>/<user_id>.json, skipping users already on disk.
    """
    profile_info = None
    save_path = "{}/{}.json".format(save_location, user_id)
    # Fetch and save user information if the file is not already present
    if not Path(save_path).is_file():
        try:
            profile_info = twython_connector.get_twython_connection(GET_USER).show_user(user_id=user_id)
        except TwythonRateLimitError:
            # binding the exception (`as ex`) was unused in the original
            logging.exception("Twython API rate limit exception")
        finally:
            # Only write when the fetch actually returned something.
            if profile_info:
                # `with` closes the handle — original leaked it via
                # json.dump(obj, open(...)).
                with open(save_path, "w") as out_file:
                    json.dump(profile_info, out_file)
def dump_retweets_job(tweet: Tweet, config: Config, twython_connector: TwythonConnector):
    """Fetch up to 100 retweets for `tweet` and dump them as
    <dump_dir>/retweets/<tweet_id>.json, skipping tweets whose retweet
    file already exists on disk.
    """
    dump_dir = "{}/{}/{}/{}".format(config.dump_location, tweet.news_source,
                                    tweet.label, tweet.news_id)
    retweet_dir = "{}/retweets".format(dump_dir)
    retweet_path = "{}/{}.json".format(retweet_dir, tweet.tweet_id)
    # Idempotency guard: a prior run already saved this tweet's retweets.
    if os.path.exists(retweet_path):
        print("[PASSED] source:{}, label:{}, news:{}, retweet: tweet{}".format(
            tweet.news_source, tweet.label, tweet.news_id, tweet.tweet_id))
        return
    else:
        print("[NEW] source:{}, label:{}, news:{}, retweet: tweet{}".format(
            tweet.news_source, tweet.label, tweet.news_id, tweet.tweet_id))
    retweets = []
    connection = None
    try:
        connection = twython_connector.get_twython_connection("get_retweet")
        retweets = connection.get_retweets(id=tweet.tweet_id, count=100, cursor=-1)
    except TwythonRateLimitError:
        logging.exception(
            "Twython API rate limit exception - tweet id : {}".format(
                tweet.tweet_id))
    except Exception:
        logging.exception(
            "Exception in getting retweets for tweet id %d using connection %s"
            % (tweet.tweet_id, connection))
    retweet_obj = {"retweets": retweets}
    create_dir(dump_dir)
    create_dir(retweet_dir)
    # `with` closes the handle — original leaked it via json.dump(obj, open(...)).
    with open(retweet_path, "w") as out_file:
        json.dump(retweet_obj, out_file)
def dump_user_recent_tweets_job(user_id, save_location, twython_connector: TwythonConnector):
    """Fetch up to 200 recent timeline tweets for a user and save them to
    <save_location>/<story_id>/<user_id>.json.

    `user_id` is indexed as a triple — presumably
    (twitter user id, max_id cursor, story id); TODO confirm against caller.
    """
    timeline = None
    save_path = "{}/{}/{}.json".format(save_location, user_id[2], user_id[0])
    # Fetch and save user information if the file is not already present
    if not Path(save_path).is_file():
        create_dir("{}/{}".format(save_location, user_id[2]))
        try:
            timeline = twython_connector.get_twython_connection(GET_USER_TWEETS).get_user_timeline(
                user_id=user_id[0],
                count=200,
                exclude_replies=False,
                include_rts=True,  # fixed kwarg typo: was `incude_rts`
                max_id=user_id[1])
        except TwythonRateLimitError:
            logging.exception("Twython API rate limit exception")
        finally:
            # Truthiness check also covers timeline is None after a failed
            # fetch — the original called len(None) here and raised TypeError.
            if timeline:
                logging.info("found {} tweets in timeline for user {}".format(len(timeline), user_id[0]))
                # `with` closes the handle — original leaked it via
                # json.dump(obj, open(...)).
                with open(save_path, "w") as out_file:
                    json.dump(timeline, out_file)
            else:
                logging.warning("couldn't retrieve the timeline of user {}".format(user_id[0]))
    else:
        logging.info("file for users and story already existis")
def dump_tweet_information(tweet_chunk: list, config: Config, twython_connector: TwythonConnector):
    """Collect info and dump info of tweet chunk containing at most 100 tweets.

    Tweets already on disk are skipped (their embedded user profile is still
    harvested); the rest are fetched in one `lookup_status` call and written
    to <dump_location>/<source>/<label>/<news_id>/tweets/<tweet_id>.json.
    Returns None; errors are logged, not raised.
    """

    def _paths(tweet):
        # Directory layout: <dump>/<source>/<label>/<news_id>/tweets/<id>.json
        dump_dir = "{}/{}/{}/{}".format(config.dump_location, tweet.news_source,
                                        tweet.label, tweet.news_id)
        tweet_dir = "{}/tweets".format(dump_dir)
        return dump_dir, tweet_dir, f"{tweet_dir}/{tweet.tweet_id}.json"

    def _save_user_profile(user_dict):
        # Persist the user object embedded in a tweet, once per user id.
        user_profiles_folder = f"{config.dump_location}/user_profiles"
        user_id = user_dict['id']
        user_profile_path = f"{user_profiles_folder}/{user_id}.json"
        if not os.path.exists(user_profile_path):
            print(f"[NEW] User profile: {user_id}")
            with open(user_profile_path, "w") as user_profile_file:
                user_profile_file.write(json.dumps(user_dict))

    # skip downloading tweets which have already been downloaded
    filtered_tweet_chunk = []
    for tweet in tweet_chunk:
        _, _, tweet_path = _paths(tweet)
        if os.path.exists(tweet_path):
            print(f"[PASSED] source:{tweet.news_source}, label:{tweet.label}, news:{tweet.news_id}")
            # save user profile stored in the already-downloaded tweet
            with open(tweet_path, "r") as tweet_file:
                tweet_dict = json.loads(tweet_file.read())
            _save_user_profile(tweet_dict['user'])
        else:
            print(f"[NEW] source:{tweet.news_source}, label:{tweet.label}, news:{tweet.news_id}")
            filtered_tweet_chunk.append(tweet)

    # original recomputed (and discarded) the dump paths in this loop too
    tweet_id_list = [tweet.tweet_id for tweet in filtered_tweet_chunk]
    try:
        tweet_objects_map = twython_connector.get_twython_connection(
            Constants.GET_TWEET).lookup_status(id=tweet_id_list,
                                               include_entities=True,
                                               map=True)['id']
        for tweet in filtered_tweet_chunk:
            tweet_object = tweet_objects_map[str(tweet.tweet_id)]
            if tweet_object:  # map=True yields None for deleted/missing ids
                dump_dir, tweet_dir, tweet_path = _paths(tweet)
                create_dir(dump_dir)
                create_dir(tweet_dir)
                # `with` closes the handle — original leaked it via
                # json.dump(obj, open(...)).
                with open(tweet_path, "w") as out_file:
                    json.dump(tweet_object, out_file)
                # save user profile stored in tweet
                _save_user_profile(tweet_object['user'])
    except TwythonRateLimitError:
        logging.exception("Twython API rate limit exception")
    except Exception:
        # binding the exception (`as ex`) was unused in the original
        logging.exception("exception in collecting tweet objects")
    return None
from os import path
import json

from util.TwythonConnector import TwythonConnector

# Smoke-test script: fetch retweets of one known tweet and print them.
TWEET_ID = 1309376185340538881

# read config file
config_path = path.abspath(path.join(path.dirname(__file__), '../config.json'))
# `with` closes the handle — original leaked it via json.load(open(...))
with open(config_path) as config_file:
    json_object = json.load(config_file)
tweet_keys_file = json_object["tweet_keys_file"]

# get twython connector
twython_connector = TwythonConnector("localhost:5000", tweet_keys_file)
connection = twython_connector.get_twython_connection("get_retweet")

# request target
retweets = connection.get_retweets(id=TWEET_ID, count=100, cursor=-1)
print(json.dumps(retweets))
#%% import os import json import time import random import argparse from collect_tweets import TweetCollector from util.TwythonConnector import TwythonConnector parser = argparse.ArgumentParser(description='crawl') parser.add_argument("--save_dir", type=str, default="../dataset") args = parser.parse_known_args()[0] tweet_keys_file = "./resources/tweet_keys_file.txt" connector = TwythonConnector(tweet_keys_file) # %% from news import News from collect_users import UserCollector user_root = os.path.join(args.save_dir, "user_network") user_collector = UserCollector(user_root, connector) user_collector.collect_user_follower_profiles() user_collector.collect_user_following_profiles() # %%
from os import path
import json

from util.TwythonConnector import TwythonConnector
from util import Constants

# Smoke-test script: bulk-look-up one known tweet id and print the result map.
TWEET_ID_LIST = [1309376185340538881]

# read config file
config_path = path.abspath(path.join(path.dirname(__file__), '../config.json'))
# `with` closes the handle — original leaked it via json.load(open(...))
with open(config_path) as config_file:
    json_object = json.load(config_file)
tweet_keys_file = json_object["tweet_keys_file"]

# get twython connector
twython_connector = TwythonConnector("localhost:5000", tweet_keys_file)
connection = twython_connector.get_twython_connection(Constants.GET_TWEET)

# request target; map=True keys the response by stringified tweet id
tweet_objects_map = connection.lookup_status(id=TWEET_ID_LIST,
                                             include_entities=True,
                                             map=True)['id']
print(json.dumps(tweet_objects_map))