Exemple #1
0
    def get_subreddits_links_to_build_task(self):
        """Return the subreddit URLs that still have to be downloaded.

        Asks ``Extract.get_urls_for_all_subreddits`` for the URLs of every
        subreddit entry between ``self.st_dt`` and ``self.end_dt``. When
        ``Base.check_resume_file`` (read from ``self.resume_file``) reports a
        non-empty collection, those entries are subtracted from the URL set
        and a progress summary is printed.

        Returns:
            The value produced by ``get_urls_for_all_subreddits`` when the
            resume check is empty; otherwise a de-duplicated list (set
            difference, so ordering is arbitrary) of the URLs that are not in
            the resume collection.
        """
        base_ = Base()
        extract_ = Extract()
        list_subreddits_data = base_.get_data_list_subreddits()
        # presumably the entries already downloaded in an earlier run --
        # verify against Base.check_resume_file
        downloaded_subs = base_.check_resume_file(file_path=self.resume_file)
        urls = extract_.get_urls_for_all_subreddits(
            subreddits=list_subreddits_data,
            start_date=self.st_dt,
            end_date=self.end_dt)
        if len(downloaded_subs) > 0:
            urls = list(set(urls) - set(downloaded_subs))
            done_count = len(downloaded_subs)
            # Progress is measured against downloaded + remaining. Inside
            # this branch done_count >= 1, so the denominator can never be
            # zero even when nothing is left to download.
            total_count = done_count + len(urls)
            print("Already Dowloaded {} sub-reddits yet to download {} sub-reddits".format(done_count, len(urls)))
            print("Completed {:.2f}%".format(100.0 * done_count / total_count))

        return urls
 def run_extraction(self):
     """Extract every subreddit URL that has not been downloaded yet.

     Asks ``Extract.get_urls_for_all_subreddits`` for the URLs of every
     subreddit entry between ``self.st_dt`` and ``self.end_dt``. When
     ``Base.check_resume_file`` (read from ``self.resume_file``) reports a
     non-empty collection, those entries are subtracted from the URL set,
     a progress summary is printed, and only the remaining URLs are passed
     to ``Extract.url_based_extraction``. Otherwise the full URL collection
     is passed. Output location is ``self.sav_path``.
     """
     extract_ = Extract()
     base_ = Base()
     list_subreddits_data = base_.get_data_list_subreddits()
     # presumably the entries already downloaded in an earlier run --
     # verify against Base.check_resume_file
     downloaded_subs = base_.check_resume_file(file_path=self.resume_file)
     # NOTE(review): start_time and cost are not read anywhere in the part
     # of this method shown here; kept in case later code relies on them.
     start_time = time.time()
     cost = 0
     urls = extract_.get_urls_for_all_subreddits(
         subreddits=list_subreddits_data,
         start_date=self.st_dt,
         end_date=self.end_dt)
     if len(downloaded_subs) > 0:
         urls_ = list(set(urls) - set(downloaded_subs))
         done_count = len(downloaded_subs)
         # Progress is measured against downloaded + remaining. Inside this
         # branch done_count >= 1, so the denominator can never be zero
         # even when nothing is left to download.
         total_count = done_count + len(urls_)
         print(
             "Already Dowloaded {} sub-reddits yet to download {} sub-reddits"
             .format(done_count, len(urls_)))
         print("Completed {:.2f}%".format(100.0 * done_count / total_count))
         extract_.url_based_extraction(links=urls_, base_path=self.sav_path)
     else:
         extract_.url_based_extraction(links=urls, base_path=self.sav_path)