def get_subreddits_links_to_build_task(self):
    """Return the subreddit URLs that still have to be downloaded.

    Builds the URL list for every subreddit between ``self.st_dt`` and
    ``self.end_dt``. When the resume file at ``self.resume_file`` reports
    finished entries, those entries are removed and a progress summary is
    printed.

    Returns:
        The value returned by ``Extract.get_urls_for_all_subreddits`` when the
        resume file reports nothing; otherwise a de-duplicated list of the
        not-yet-downloaded URLs, kept in their original order.
    """
    base_ = Base()
    extract_ = Extract()
    list_subreddits_data = base_.get_data_list_subreddits()
    downloaded_subs = base_.check_resume_file(file_path=self.resume_file)
    urls = extract_.get_urls_for_all_subreddits(
        subreddits=list_subreddits_data,
        start_date=self.st_dt,
        end_date=self.end_dt,
    )
    if len(downloaded_subs) > 0:
        # NOTE(review): assumes resume-file entries are comparable to the
        # generated URLs (same representation) -- confirm against the writer
        # of the resume file.
        candidates = list(urls)
        seen = set(downloaded_subs)
        # Order-preserving filter: a plain set difference would hand back the
        # remaining URLs in arbitrary order, making resumed runs
        # non-reproducible.
        urls = []
        for url in candidates:
            if url not in seen:
                seen.add(url)
                urls.append(url)
        print("Already Dowloaded {} sub-reddits yet to download {} sub-reddits".format(len(downloaded_subs), len(urls)))
        # Progress is measured against the full URL list, not the remaining
        # one; dividing by the remaining count raised ZeroDivisionError once
        # everything was downloaded. 100.0 forces true division.
        total = len(candidates)
        percent = 100.0 * len(downloaded_subs) / total if total else 100.0
        print("Completed {}%".format(round(percent, 2)))
    return urls
def run_extraction(self):
    """Download every subreddit URL that is not yet in the resume file.

    Builds the URL list for all subreddits between ``self.st_dt`` and
    ``self.end_dt``, drops the entries already recorded in
    ``self.resume_file`` (printing a progress summary when there are any),
    and passes the rest to ``Extract.url_based_extraction`` with
    ``self.sav_path`` as the base path.

    NOTE(review): a second ``run_extraction`` definition follows later in
    this file; if both belong to the same class the later one replaces this
    one -- confirm which is intended.
    """
    extract_ = Extract()
    base_ = Base()
    list_subreddits_data = base_.get_data_list_subreddits()
    downloaded_subs = base_.check_resume_file(file_path=self.resume_file)
    urls = extract_.get_urls_for_all_subreddits(
        subreddits=list_subreddits_data,
        start_date=self.st_dt,
        end_date=self.end_dt,
    )
    if len(downloaded_subs) > 0:
        candidates = list(urls)
        seen = set(downloaded_subs)
        # Order-preserving filter: a plain set difference would process the
        # remaining URLs in arbitrary order.
        urls = []
        for url in candidates:
            if url not in seen:
                seen.add(url)
                urls.append(url)
        print(
            "Already Dowloaded {} sub-reddits yet to download {} sub-reddits"
            .format(len(downloaded_subs), len(urls)))
        # Measure progress against the full URL list; dividing by the
        # remaining count raised ZeroDivisionError when nothing was left.
        total = len(candidates)
        percent = 100.0 * len(downloaded_subs) / total if total else 100.0
        print("Completed {}%".format(round(percent, 2)))
    extract_.url_based_extraction(links=urls, base_path=self.sav_path)
def run_extraction(self):
    """Extract every subreddit that is not yet listed in the resume file.

    Loads the subreddit list, drops the entries recorded in
    ``self.resume_file`` (printing a progress summary when there are any),
    and calls ``Extract.start_extraction`` once per remaining subreddit for
    the range ``self.st_dt``..``self.end_dt``, saving under ``self.sav_path``.
    The ``start_time`` and ``cost`` values returned by each call are fed into
    the next one.

    NOTE(review): an earlier ``run_extraction`` definition precedes this one
    in the file; if both belong to the same class this one wins -- confirm.
    """
    extract_ = Extract()
    base_ = Base()
    list_subreddits_data = base_.get_data_list_subreddits()
    downloaded_subs = base_.check_resume_file(file_path=self.resume_file)
    if len(downloaded_subs) > 0:
        seen = set(downloaded_subs)
        # Order-preserving filter: a plain set difference would process the
        # remaining subreddits in arbitrary order.
        remaining = []
        for subreddit in list_subreddits_data:
            if subreddit not in seen:
                seen.add(subreddit)
                remaining.append(subreddit)
        print(
            "Already Dowloaded {} sub-reddits yet to download {} sub-reddits"
            .format(len(downloaded_subs), len(remaining)))
        # The original printed a 0..1 fraction with a '%' sign and could
        # divide by zero on an empty subreddit list; scale to a real
        # percentage and guard the empty case.
        total = len(list_subreddits_data)
        percent = 100.0 * len(downloaded_subs) / total if total else 100.0
        print("Completed {}%".format(round(percent, 2)))
        list_subreddits_data = remaining
    start_time = time.time()
    cost = 0
    for subreddit in list_subreddits_data:
        start_time, cost = extract_.start_extraction(
            subreddit=subreddit,
            start_date=self.st_dt,
            end_date=self.end_dt,
            base_path=self.sav_path,
            start_time=start_time,
            total_cost=cost,
        )
        # NOTE(review): the original layout was lost; these prints are assumed
        # to run once per subreddit -- confirm they were not meant to follow
        # the loop.
        print(cost)
        print(start_time)