def collect_user_profiles(config: Config, twython_connector: TwythonConnector):
    dump_location = config.dump_location

    all_user_ids = set()
    all_user_ids.update(get_user_ids_in_folder("{}/politifact/fake".format(dump_location)))
    all_user_ids.update(get_user_ids_in_folder("{}/politifact/real".format(dump_location)))
    all_user_ids.update(get_user_ids_in_folder("{}/gossipcop/fake".format(dump_location)))
    all_user_ids.update(get_user_ids_in_folder("{}/gossipcop/real".format(dump_location)))

    user_profiles_folder = "{}/{}".format(dump_location, "user_profiles")
    user_timeline_tweets_folder = "{}/{}".format(dump_location, "user_timeline_tweets")

    create_dir(user_profiles_folder)
    create_dir(user_timeline_tweets_folder)

    multiprocess_data_collection(dump_user_profile_job, all_user_ids,
                                 (user_profiles_folder, twython_connector), config)
    multiprocess_data_collection(dump_user_recent_tweets_job, all_user_ids,
                                 (user_timeline_tweets_folder, twython_connector), config)

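# `create_dir` is used throughout this file but defined elsewhere in the repo.
# A minimal sketch consistent with every call site (idempotent directory
# creation); this is an assumption, not the original helper:

import os

def create_dir(dir_name: str):
    """Create `dir_name`, including parents, if it does not already exist."""
    os.makedirs(dir_name, exist_ok=True)
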
def collect_user_followers(self, users):
    """`users` should be a set of user IDs to crawl."""
    create_dir(self.user_followers_dir)
    # users = self.get_own_user_id(self.user_profiles_dir)
    existed_id_set = self.get_own_user_id(self.user_followers_dir)
    new_users_set = users - existed_id_set
    print("We are adding {}/{} to {}".format(len(new_users_set), len(users),
                                             self.user_followers_dir))
    for i, id in enumerate(new_users_set):
        try:
            save_dir = "{}/{}.json".format(self.user_followers_dir, id)
            followers = self.twython_connector.get_twython_connection(
                Constants.GET_FOLLOWERS_ID).get_followers_ids(user_id=id, count=200)
            with open(save_dir, 'w') as out_file:
                json.dump(followers, out_file)
        except TwythonRateLimitError:
            print("Twython API rate limit exception")
        except TwythonAuthError:
            print("{} followers: authorization exception".format(id))
        except Exception:
            print("Other exception")
        if i % 15 == 0:
            print("{}/{} follower lists retrieved".format(i, len(new_users_set)))

def dump_tweet_information(tweet_chunk: list, config: Config,
                           twython_connector: TwythonConnector):
    """Collect and dump the tweet objects for a chunk of at most 100 tweets."""
    tweet_list = [tweet.tweet_id for tweet in tweet_chunk]
    try:
        # With map=True, lookup_status returns {'id': {tweet_id_str: tweet_object_or_None}}.
        tweet_objects_map = twython_connector.get_twython_connection(
            Constants.GET_TWEET).lookup_status(id=tweet_list,
                                               include_entities=True,
                                               map=True)['id']
        for tweet in tweet_chunk:
            tweet_object = tweet_objects_map[str(tweet.tweet_id)]
            if tweet_object:
                dump_dir = "{}/{}/{}/{}".format(config.dump_location, tweet.news_source,
                                                tweet.label, tweet.news_id)
                tweet_dir = "{}/tweets".format(dump_dir)
                create_dir(dump_dir)
                create_dir(tweet_dir)
                with open("{}/{}.json".format(tweet_dir, tweet.tweet_id), "w") as out_file:
                    json.dump(tweet_object, out_file)
    except TwythonRateLimitError:
        print("Twython API rate limit exception")
        logging.exception("Twython API rate limit exception")
    except Exception as ex:
        logging.exception("exception in collecting tweet objects")
        print("exception in collecting tweet objects:", str(ex))
    return None

def dump_retweets_job(tweet: Tweet, config: Config,
                      twython_connector: TwythonConnector):
    retweets = []
    connection = None
    dump_dir = get_dump_dir(config, tweet)

    if _should_fetch_retweets(tweet, dump_dir):
        try:
            connection = twython_connector.get_twython_connection(Constants.GET_RETWEET)
            retweets = connection.get_retweets(id=tweet.tweet_id, count=100, cursor=-1)
        except TwythonRateLimitError:
            logging.exception(
                "Twython API rate limit exception - tweet id : {}".format(tweet.tweet_id))
        except Exception:
            logging.exception(
                "Exception in getting retweets for tweet id %d using connection %s"
                % (tweet.tweet_id, connection))

    retweet_obj = {"retweets": retweets}
    retweet_dir = "{}/retweets".format(dump_dir)
    create_dir(dump_dir)
    create_dir(retweet_dir)
    with open("{}/{}.json".format(retweet_dir, tweet.tweet_id), "w") as out_file:
        json.dump(retweet_obj, out_file)

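# `get_dump_dir` and `_should_fetch_retweets` are not defined in this section.
# Minimal sketches inferred from the sibling jobs below, which build the path
# "{dump_location}/{news_source}/{label}/{news_id}" and skip tweets whose
# retweet dump already exists. `_should_skip_retweets`, used further down,
# would be the negation. These are assumptions, not the original helpers:

import os

def get_dump_dir(config: Config, tweet: Tweet) -> str:
    return "{}/{}/{}/{}".format(config.dump_location, tweet.news_source,
                                tweet.label, tweet.news_id)

def _should_fetch_retweets(tweet: Tweet, dump_dir: str) -> bool:
    # Fetch only if "{dump_dir}/retweets/{tweet_id}.json" is not on disk yet.
    return not os.path.exists("{}/retweets/{}.json".format(dump_dir, tweet.tweet_id))
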
def dump_retweets_job(tweet: Tweet, config: Config,
                      twython_connector: TwythonConnector):
    hop_index = tweet.hop_index
    news_dir = f"{config.dump_location}/{tweet.news_source}/{tweet.label}/{tweet.news_id}"
    retweet_dir = f"{news_dir}/retweets_{hop_index}"
    retweet_path = f"{retweet_dir}/{tweet.tweet_id}.json"

    if os.path.exists(retweet_path):
        print("[PASSED] news: {}, hop index: {}".format(tweet.news_id, hop_index))
        return
    else:
        print("[NEW] news: {}, hop index: {}".format(tweet.news_id, hop_index))

    retweets = []
    connection = None
    try:
        connection = twython_connector.get_twython_connection("get_retweet")
        retweets = connection.get_retweets(id=tweet.tweet_id, count=100, cursor=-1)
    except TwythonRateLimitError:
        logging.exception(f"Twython API rate limit exception - tweet id : {tweet.tweet_id}")
    except Exception:
        logging.exception(
            "Exception in getting retweets for tweet id %d using connection %s"
            % (tweet.tweet_id, connection))

    retweet_obj = {"retweets": retweets}
    create_dir(news_dir)
    create_dir(retweet_dir)
    with open(retweet_path, "w") as out_file:
        json.dump(retweet_obj, out_file)

def collect_retweets(news_list, news_source, label, config: Config):
    create_dir("{}/{}/raw".format(config.dump_location, news_source))

    news_list_to_process = []
    empty_data_objects = 0
    for news in news_list:
        news_dir = "{}/{}/{}/tweets/{}.csv".format(config.dump_location, news_source,
                                                   label, news.news_id)
        data = pd.read_csv(news_dir)
        raw_dir = "{}/{}/complete/{}.csv".format(config.dump_location, news_source,
                                                 news.news_id)
        if data.empty:
            empty_data_objects += 1
            continue
        if path.exists(raw_dir):
            continue
        else:
            news_list_to_process.append(NewsItem(data, raw_dir))

    print('Collecting retweets for ' + str(len(news_list_to_process)) + ' news stories.')
    print(str(empty_data_objects) + '/' + str(len(news_list)) +
          ' datasets were skipped because they were empty.')
    multiprocess_data_collection(dump_retweets_job, news_list_to_process,
                                 (config, config.twython_connector), config)

def collect_data(self, choices):
    use_id_from_profile = True

    # Collect user IDs
    if use_id_from_profile:
        # List object returned
        all_user_ids = get_user_ids_from_profile(self.config.dump_location)
    else:
        all_user_ids = set()
        for choice in choices:
            choice_dir = f"{self.config.dump_location}/{choice['news_source']}/{choice['label']}"
            # Set object returned
            all_user_ids.update(get_user_ids_in_folder(choice_dir))
        all_user_ids = list(all_user_ids)

    # Create dir to store user followers
    user_followers_folder = f"{self.config.dump_location}/user_followers"
    create_dir(user_followers_folder)

    multiprocess_data_collection(
        dump_user_followers, all_user_ids,
        (user_followers_folder, self.config.twython_connector), self.config)

def collect_user_recent_tweets(self, users):
    """`users` should be a set of user IDs to crawl."""
    create_dir(self.user_timelines_dir)
    # users = self.get_own_user_id(self.user_profiles_dir)
    existed_id_set = self.get_own_user_id(self.user_timelines_dir)
    new_users_set = users - existed_id_set
    print("We are adding {}/{} to user_timelines".format(len(new_users_set),
                                                         len(users)))
    for i, id in enumerate(new_users_set):
        try:
            time_lines = self.twython_connector.get_twython_connection(
                Constants.GET_USER_TWEETS).get_user_timeline(user_id=id, count=200)
            with open("{}/{}.json".format(self.user_timelines_dir, id), 'w') as out_file:
                json.dump(time_lines, out_file)
        except TwythonRateLimitError:
            print("Twython API rate limit exception")
        except TwythonAuthError:
            print("{} timelines: authorization exception".format(id))
        except Exception:
            print("Other exception")
        if i % 100 == 0:
            print("{} user timelines retrieved".format(i))

def dump_retweets_job(tweet: Tweet, config: Config,
                      twython_connector: TwythonConnector):
    retweets = []
    connection = None
    try:
        connection = twython_connector.get_twython_connection("get_retweet")
        retweets = connection.get_retweets(id=tweet.tweet_id, count=100, cursor=-1)
    except TwythonRateLimitError:
        logging.exception(
            "Twython API rate limit exception - tweet id : {}".format(tweet.tweet_id))
    except Exception:
        logging.exception(
            "Exception in getting retweets for tweet id %d using connection %s"
            % (tweet.tweet_id, connection))

    retweet_obj = {"retweets": retweets}
    dump_dir = "{}/{}/{}/{}".format(config.dump_location, tweet.news_source,
                                    tweet.label, tweet.news_id)
    retweet_dir = "{}/retweets".format(dump_dir)
    create_dir(dump_dir)
    create_dir(retweet_dir)
    with open("{}/{}.json".format(retweet_dir, tweet.tweet_id), "w") as out_file:
        json.dump(retweet_obj, out_file)

def collect_user_followings(self, users):
    create_dir(self.user_following_dir)
    # users = self.get_own_user_id(self.user_profiles_dir)
    existed_id_set = self.get_own_user_id(self.user_following_dir)
    new_users_set = users - existed_id_set
    print("We are adding {}/{} to {}".format(len(new_users_set), len(users),
                                             self.user_following_dir))
    for i, id in enumerate(new_users_set):
        try:
            save_dir = "{}/{}.json".format(self.user_following_dir, id)
            friends = self.twython_connector.get_twython_connection(
                Constants.GET_FRIENDS_ID).get_friends_ids(user_id=id, count=5000)
            with open(save_dir, 'w') as out_file:
                json.dump(friends, out_file)
        except TwythonRateLimitError:
            print("Twython API rate limit exception")
        except TwythonAuthError:
            print("{} followings: authorization exception".format(id))
        except Exception:
            print("Other exception")
        if i % 15 == 0:
            print("{}/{} following lists retrieved".format(i, len(new_users_set)))

def dump_tweet_information(tweet: Tweet, config: Config,
                           twython_connector: TwythonConnector):
    try:
        tweet_object = twython_connector.get_twython_connection(
            Constants.GET_TWEET).show_status(id=tweet.tweet_id)
        if tweet_object:
            dump_dir = "{}/{}/{}/{}".format(config.dump_location, tweet.news_source,
                                            tweet.label, tweet.news_id)
            tweet_dir = "{}/tweets".format(dump_dir)
            create_dir(dump_dir)
            create_dir(tweet_dir)
            with open("{}/{}.json".format(tweet_dir, tweet.tweet_id), "w") as out_file:
                json.dump(tweet_object, out_file)
    except TwythonRateLimitError:
        logging.exception("Twython API rate limit exception")
    except Exception:
        logging.exception("exception in collecting tweet objects")
    return None

def init(self):
    # Create output folder
    create_dir(self.output_path)

    # Save settings
    with open(os.path.join(self.output_path, 'args.txt'), 'w') as outfile:
        json.dump(vars(self.args), outfile, sort_keys=True, indent=4)

    # Copy label map (using the basename so a label map located outside the
    # output directory does not produce a nested, nonexistent path)
    copyfile(self.label_map,
             os.path.join(self.output_path, os.path.basename(self.label_map)))

    with open(self.label_map, 'r') as label_map_file:
        self.categories = json.load(label_map_file).get('classes')
    # Shift class ids so they start at 1
    self.categories = [{'id': cat['id'] + 1, 'name': cat['name']}
                       for cat in self.categories]
    self.org_categories = copy.deepcopy(self.categories)

    if self.included_classes is None:
        self._check_for_excluded_classes()
    else:
        self._check_for_included_classes()

    # Get included ids
    self.included_ids = [cat['id'] for cat in self.categories]

    if self.remap_labels:
        self._remap_labels()
    if self.args.rearrange_ids:
        self._rearrange_ids()
    self._write_label_map()

    if check_label_names_for_duplicates(self.categories):
        print('\nExiting! Please fix label map.')
        sys.exit(-1)

    self.cat2id = {cat['name']: cat['id'] for cat in self.categories}
    self.id2cat = {cat['id']: cat['name'] for cat in self.categories}
    self.gt_boxes = {cat['id']: {'name': cat['name'], 'num_gt_boxes': {}}
                     for cat in self.categories}

    self._fill_lists()
    assert validate_match(self.image_sets, self.images, self.label), \
        'Image and label files do not match.'

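# The label-map file loaded above is not shown in this section. Its layout is
# inferred from the reads: a top-level "classes" list of {id, name} entries
# (ids zero-based, since init() shifts them by one; supercategories may be
# encoded in a trailing "(...)" of the name, as convert() assumes). The values
# below are illustrative only:

import json

minimal_label_map = {
    "classes": [
        {"id": 0, "name": "car (vehicle)"},
        {"id": 1, "name": "pedestrian (person)"},
    ]
}

with open("label_map.json", "w") as f:
    json.dump(minimal_label_map, f, indent=4)
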
def convert(self):
    # Derive supercategories from the trailing "(...)" in each class name if
    # the label map does not provide them.
    if 'supercategory' not in self.categories[0]:
        for item in self.categories:
            name = item.get('name')
            idx = name.rfind('(')
            item['supercategory'] = name[idx + 1:-1]

    time.sleep(0.1)  # brief pauses, presumably so prints do not interleave with tqdm output
    print("\nCreating dataset...")

    # Make annotations output dir
    annotations_dir = os.path.join(self.output_path, "annotations")
    create_dir(annotations_dir)

    for image_set in self.image_sets:
        time.sleep(0.1)
        print("\tCreating {} set...".format(image_set))
        time.sleep(0.1)

        # Make image_set output dir
        image_set_dir = os.path.join(self.output_path, image_set)
        create_dir(image_set_dir)

        images, annotations = self._get_images_and_annotations(image_set)
        json_data = {
            "info": self.info,
            "licenses": self.licenses,
            "images": images,
            "annotations": annotations,
            "categories": self.categories
        }

        time.sleep(0.1)
        print('\tWriting annotations to disk...\n')
        time.sleep(0.1)

        annotation_file = os.path.join(self.output_path, "annotations",
                                       "instances_" + image_set + ".json")
        with open(annotation_file, "w") as jsonfile:
            json.dump(json_data, jsonfile, indent=4)

    for image_set in self.image_sets:
        print('\nTesting dataset {} ...'.format(image_set))
        annotation_file = os.path.join(self.output_path, "annotations",
                                       "instances_" + image_set + ".json")
        self._test_dataset(annotation_file)

    if self.args.show_not_verified:
        warning_not_verified_label_files(self.not_verified_label_files)

def collect_data(self, choices):
    all_user_ids = []
    for choice in choices:
        all_user_ids.extend(get_user_ids_in_folder(
            "{}/{}/{}".format(self.config.dump_location, choice["news_source"],
                              choice["label"])))

    user_timeline_tweets_folder = "{}/{}".format(self.config.dump_location,
                                                 "user_timeline_tweets")
    create_dir(user_timeline_tweets_folder)

    multiprocess_data_collection(dump_user_recent_tweets_job, list(all_user_ids),
                                 (user_timeline_tweets_folder,
                                  self.config.twython_connector), self.config)

def __init__(self, root_location, twython_connector):
    self.root_location = root_location
    create_dir(root_location)
    self.twython_connector = twython_connector
    self.user_profiles_dir = os.path.join(self.root_location, "user_profiles")
    self.user_followers_dir = os.path.join(self.root_location, "user_followers")
    self.user_following_dir = os.path.join(self.root_location, "user_following")
    self.user_timelines_dir = os.path.join(self.root_location, "user_timeline_tweets")

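# `get_own_user_id` (and the analogous `get_tweetID_from_dir` used further
# down) is not defined in this section. Every call site scans a dump directory
# of "{id}.json" files and subtracts the result from a set of IDs, so a
# minimal sketch under that assumption looks like this; the int() cast assumes
# the caller's sets also hold integer IDs:

def get_own_user_id(self, dump_dir):
    """Return the set of IDs that already have a JSON dump in `dump_dir`."""
    return {int(name[:-len(".json")])
            for name in os.listdir(dump_dir)
            if name.endswith(".json")}
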
def collect_data(self, choices):
    all_user_ids = set()
    for choice in choices:
        all_user_ids.update(get_user_ids_in_folder(
            "{}/{}/{}".format(self.config.dump_location, choice["news_source"],
                              choice["label"])))

    user_friends_folder = "{}/{}".format(self.config.dump_location, "user_following")
    create_dir(user_friends_folder)

    multiprocess_data_collection(dump_user_following, list(all_user_ids),
                                 (user_friends_folder,
                                  self.config.twython_connector), self.config)

def collect_tweets(news_list, news_source, label, config: Config):
    create_dir(config.dump_location)

    # Create dirs for tweets
    create_dir("{}/{}".format(config.dump_location, news_source))
    create_dir("{}/{}/{}".format(config.dump_location, news_source, label))

    # Create dir for user profiles
    create_dir(f"{config.dump_location}/user_profiles")

    tweet_list = []
    for news in news_list:
        # Skip news items whose article content was never downloaded.
        news_path = f"{config.dump_location}/{news_source}/{label}/{news.news_id}/news content.json"
        if not os.path.exists(news_path):
            # print(f"News {news.news_id} does not exist, skipping tweet download")
            continue
        for tweet_id in news.tweet_ids:
            tweet_list.append(Tweet(tweet_id, news.news_id, news_source, label))

    print(f"Total tweets to be downloaded: {len(tweet_list)}")

    # Batch the IDs in chunks of 100, the statuses/lookup limit.
    tweet_chunks = equal_chunks(tweet_list, 100)
    multiprocess_data_collection(dump_tweet_information, tweet_chunks,
                                 (config, config.twython_connector), config)

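# `equal_chunks` is imported from util.util and not shown here. A minimal
# sketch consistent with both call sites (split a list into consecutive chunks
# of at most `chunk_size` items, 100 matching the statuses/lookup batch limit);
# an assumption, not the original implementation:

def equal_chunks(items: list, chunk_size: int) -> list:
    """Split `items` into consecutive chunks of at most `chunk_size` elements."""
    return [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]
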
def collect_tweets2dir(self, dump_dir, tweet_ids):
    search_tweet_set = set(tweet_ids)
    if os.path.exists(dump_dir):
        existed_tweet_set = self.get_tweetID_from_dir(dump_dir)
        search_tweet_set = search_tweet_set - existed_tweet_set
    else:
        existed_tweet_set = set()
        create_dir(dump_dir)
    print("{} tweets will be added ({} requested, {} already on disk)".format(
        len(search_tweet_set), len(tweet_ids), len(existed_tweet_set)))

    from util.util import equal_chunks
    chunks = equal_chunks(list(search_tweet_set), chunk_size=100)
    for tweet_chunk in chunks:
        self.dump_tweet_information(tweet_chunk, dump_dir)

def _check_for_excluded_classes(self):
    create_dir(self.output_path)
    if self.excluded_classes:
        with open(os.path.join(self.output_path, "excluded_classes.txt"), 'w') as file:
            remaining_categories = []
            for cat in self.categories:
                if cat['id'] not in self.excluded_classes:
                    remaining_categories.append(cat)
                else:
                    file.write("{}\t: {}\n".format(cat['id'], cat['name']))
            self.categories = remaining_categories
    with open(os.path.join(self.output_path, "included_classes.txt"), 'w') as file:
        for cat in self.categories:
            file.write("{}\t: {}\n".format(cat['id'], cat['name']))

def collect_data(self, choices):
    if not os.path.exists(f"{self.config.dump_location}/all_user_id.json"):
        print("all_user_id.json not found")
        return

    print("Loading IDs to fetch from all_user_id.json")
    with open(f"{self.config.dump_location}/all_user_id.json", "r") as id_list_file:
        all_user_ids = json.loads(id_list_file.read())

    # Set up and create the destination directory.
    timeline_folder = f"{self.config.dump_location}/user_timeline_tweets"
    create_dir(timeline_folder)

    multiprocess_data_collection(
        dump_user_recent_tweets_job, all_user_ids,
        (timeline_folder, self.config.twython_connector), self.config)

def collect_news_articles(news_list, news_source, label, config: Config):
    create_dir(config.dump_location)
    create_dir("{}/{}".format(config.dump_location, news_source))
    create_dir("{}/{}/{}".format(config.dump_location, news_source, label))

    save_dir = "{}/{}/{}".format(config.dump_location, news_source, label)

    for news in tqdm(news_list):
        create_dir("{}/{}".format(save_dir, news.news_id))
        news_article = crawl_news_article(news.news_url)
        if news_article:
            with open("{}/{}/news content.json".format(save_dir, news.news_id),
                      "w", encoding="UTF-8") as out_file:
                json.dump(news_article, out_file)

def collect_data(self, choices):
    # Create dir to store user followers
    user_followers_folder = f"{self.config.dump_location}/rt_user_followers"
    create_dir(user_followers_folder)

    user_id_list_path = f"{self.config.dump_location}/rt_user_ids_1.json"  # the file number needs to be set manually

    final_user_id_list = []
    with open(user_id_list_path, "r") as id_file:
        id_list = json.loads(id_file.read())['users']
    for uid in id_list:
        if not os.path.exists(f"{user_followers_folder}/{uid}.json"):
            final_user_id_list.append(int(uid))

    multiprocess_data_collection(
        dump_user_followers, final_user_id_list,
        (user_followers_folder, self.config.twython_connector), self.config)

def collect_retweets(news_list, news_source, label, config: Config):
    create_dir(config.dump_location)
    create_dir("{}/{}".format(config.dump_location, news_source))
    create_dir("{}/{}/{}".format(config.dump_location, news_source, label))
    save_dir = "{}/{}/{}".format(config.dump_location, news_source, label)

    tweet_id_list = []
    for news in news_list:
        for tweet_id in news.tweet_ids:
            tweet_id_list.append(Tweet(tweet_id, news.news_id, news_source, label))

    filtered_tweet_id_list = [
        tweet for tweet in tweet_id_list
        if not _should_skip_retweets(tweet, get_dump_dir(config, tweet))
    ]

    multiprocess_data_collection(dump_retweets_job, filtered_tweet_id_list,
                                 (config, config.twython_connector), config)

def collect_retweets(news_list, news_source, label, config: Config):
    create_dir(config.dump_location)
    create_dir(f"{config.dump_location}/{news_source}")
    create_dir(f"{config.dump_location}/{news_source}/{label}")

    tweet_id_list = []
    for news in news_list:
        # Skip news items whose article content was never downloaded.
        news_path = f"{config.dump_location}/{news_source}/{label}/{news.news_id}/news content.json"
        if not os.path.exists(news_path):
            # print(f"News {news.news_id} does not exist, skipping retweet download")
            continue
        for tweet_id in news.tweet_ids:
            # Skip tweets that were never downloaded.
            tweet_path = f"{config.dump_location}/{news_source}/{label}/{news.news_id}/tweets/{tweet_id}.json"
            if not os.path.exists(tweet_path):
                # print(f"Tweet {tweet_id} does not exist, skipping retweet download")
                continue
            tweet_id_list.append(Tweet(tweet_id, news.news_id, news_source, label))

    multiprocess_data_collection(dump_retweets_job, tweet_id_list,
                                 (config, config.twython_connector), config)

def dump_retweets_job(tweet: Tweet, config: Config,
                      twython_connector: TwythonConnector):
    dump_dir = "{}/{}/{}/{}".format(config.dump_location, tweet.news_source,
                                    tweet.label, tweet.news_id)
    retweet_dir = "{}/retweets".format(dump_dir)
    retweet_path = "{}/{}.json".format(retweet_dir, tweet.tweet_id)

    if os.path.exists(retweet_path):
        print("[PASSED] source: {}, label: {}, news: {}, tweet: {}".format(
            tweet.news_source, tweet.label, tweet.news_id, tweet.tweet_id))
        return
    else:
        print("[NEW] source: {}, label: {}, news: {}, tweet: {}".format(
            tweet.news_source, tweet.label, tweet.news_id, tweet.tweet_id))

    retweets = []
    connection = None
    try:
        connection = twython_connector.get_twython_connection("get_retweet")
        retweets = connection.get_retweets(id=tweet.tweet_id, count=100, cursor=-1)
    except TwythonRateLimitError:
        logging.exception(
            "Twython API rate limit exception - tweet id : {}".format(tweet.tweet_id))
    except Exception:
        logging.exception(
            "Exception in getting retweets for tweet id %d using connection %s"
            % (tweet.tweet_id, connection))

    retweet_obj = {"retweets": retweets}
    create_dir(dump_dir)
    create_dir(retweet_dir)
    with open(retweet_path, "w") as out_file:
        json.dump(retweet_obj, out_file)

def calc_img_statistics(self):
    print("Calculating image statistics ...")
    mean_per_image = []
    for s in self.image_sets:
        time.sleep(0.1)
        print("\tCalculating mean and variance for images in {} ...".format(s))
        time.sleep(0.1)
        for i, image_filename in enumerate(
                tqdm(self.images[s], desc="\tProgress:", unit="files")):
            image_path = os.path.join(self.image_path, image_filename)
            image_arr = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
            mean_per_image.append(np.mean(image_arr, axis=(0, 1)))
            self.img_var.append(np.var(image_arr, axis=(0, 1)))

    self.img_mean = np.mean(mean_per_image, axis=0)
    # Average the per-image variances over the total number of images across
    # all image sets, then take the square root.
    self.img_std = np.sqrt(np.divide(np.sum(self.img_var, axis=0),
                                     len(self.img_var)))

    print("Mean (RGB) = {}, {}, {}\nStandard deviation = {}, {}, {}".format(
        self.img_mean[0], self.img_mean[1], self.img_mean[2],
        self.img_std[0], self.img_std[1], self.img_std[2]))

    create_dir(self.output_path)
    with open(os.path.join(self.output_path, "image_stats.txt"), 'w') as file:
        file.write("Image statistics per RGB channel\n")
        file.write("mean = [{}, {}, {}]\n".format(self.img_mean[0], self.img_mean[1],
                                                  self.img_mean[2]))
        file.write("std = [{}, {}, {}]\n".format(self.img_std[0], self.img_std[1],
                                                 self.img_std[2]))

def dump_tweet_information(self, tweet_chunk, dump_dir):
    """Collect and dump the tweet objects for a chunk of at most 100 tweets."""
    try:
        tweet_objects_map = self.twython_connector.get_twython_connection(
            Constants.GET_TWEET).lookup_status(id=tweet_chunk,
                                               include_entities=True,
                                               map=True)['id']
        for tweet_id in tweet_chunk:
            tweet_object = tweet_objects_map[str(tweet_id)]
            if tweet_object:
                create_dir(dump_dir)
                with open("{}/{}.json".format(dump_dir, tweet_id), "w") as out_file:
                    json.dump(tweet_object, out_file)
    except TwythonRateLimitError:
        logging.exception("Twython API rate limit exception")
    except Exception:
        logging.exception("exception in collecting tweet objects")
    return None

def dump_user_recent_tweets_job(user_id, save_location,
                                twython_connector: TwythonConnector):
    profile_info = None

    # Fetch and save the timeline only if the file is not already present.
    if not Path("{}/{}/{}.json".format(save_location, user_id[2], user_id[0])).is_file():
        create_dir("{}/{}".format(save_location, user_id[2]))
        try:
            profile_info = twython_connector.get_twython_connection(
                GET_USER_TWEETS).get_user_timeline(user_id=user_id[0],
                                                   count=200,
                                                   exclude_replies=False,
                                                   include_rts=True,
                                                   max_id=user_id[1])
        except TwythonRateLimitError:
            logging.exception("Twython API rate limit exception")
        finally:
            # profile_info may still be None if the request failed.
            if profile_info:
                logging.info("found {} tweets in timeline for user {}".format(
                    len(profile_info), user_id[0]))
                with open("{}/{}/{}.json".format(save_location, user_id[2],
                                                 user_id[0]), "w") as out_file:
                    json.dump(profile_info, out_file)
            else:
                logging.warning("couldn't retrieve the timeline of user {}".format(
                    user_id[0]))
    else:
        logging.info("file for user and story already exists")

def collect_user_profiles(self, users):
    dump_location = self.user_profiles_dir
    create_dir(dump_location)
    existed_id_set = self.get_own_user_id(dump_location)
    new_users_set = users - existed_id_set
    print("existed: {}, found: {}, add: {}".format(len(existed_id_set), len(users),
                                                   len(new_users_set)))
    print("We are adding {} user profiles to {}".format(len(new_users_set),
                                                        dump_location))

    user_chunks = equal_chunks(list(new_users_set), 100)
    number = 0
    for chunk in user_chunks:
        try:
            user_objects_map = self.twython_connector.get_twython_connection(
                Constants.GET_USER).lookup_user(user_id=chunk, include_entities=True)
            for user_object in user_objects_map:
                with open("{}/{}.json".format(dump_location, user_object["id"]),
                          "w") as out_file:
                    json.dump(user_object, out_file)
                number += 1
                print("{} has been added".format(number))
        # TwythonRateLimitError subclasses TwythonError, so it must be caught first.
        except TwythonRateLimitError:
            print("Twython API rate limit exception")
        except TwythonError:
            print("Twython error")
        except Exception:
            print("Exception")
    print("Finish")

def collect_tweets(news_list, news_source, label, config: Config):
    create_dir(config.dump_location)
    create_dir("{}/{}".format(config.dump_location, news_source))
    create_dir("{}/{}/tweets".format(config.dump_location, news_source))

    for news in news_list:
        print('Downloading ' + news_source + ' ' + label + ' ' + news.news_id + ' tweets')
        create_dir("{}/{}/{}/{}".format(config.dump_location, news_source, label,
                                        news.news_id))
        data = pd.DataFrame(columns=features)
        news_dir = "{}/{}/tweets/{}.csv".format(config.dump_location, news_source,
                                                news.news_id)
        if path.exists(news_dir):
            continue
        else:
            # `t` is a tweet hydration client defined elsewhere (see the sketch below).
            for tweet in t.hydrate(news.tweet_ids):
                data = data.append(extract_tweet_features(tweet, label),
                                   ignore_index=True)
            data.to_csv(news_dir, index=False)

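# `t`, `features`, and `extract_tweet_features` are defined outside this
# section. `t` is presumably a twarc (v1) client, whose `hydrate()` turns tweet
# IDs into full tweet objects in batches of up to 100 per API call; a setup
# sketch under that assumption, with placeholder credentials:

from twarc import Twarc

t = Twarc(consumer_key="...", consumer_secret="...",
          access_token="...", access_token_secret="...")

# hydrate() yields one tweet dict per resolvable ID.
for tweet in t.hydrate(["20"]):
    print(tweet["id_str"], tweet.get("full_text", tweet.get("text", ""))[:50])
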