def _download_url(self, url, out_dir):
    """Stream ``url`` to a file inside ``out_dir``.

    The target file is named after everything that follows the last "/"
    in ``url``. ``out_dir`` is created through ``Utilities.create_dir``
    before the status code is inspected, so it exists even when the
    download fails.

    Args:
        url: Address of the resource to fetch.
        out_dir: Directory that receives the downloaded file.

    Returns:
        None. A response whose status is not 200 is reported with
        ``print`` and as a logger warning, and no file is written.
    """
    response = requests.get(url, stream=True)
    try:
        Utilities.create_dir(out_dir)

        if response.status_code != 200:
            print("failed for " + url)
            # Logger.warn is a deprecated alias (removed in Python 3.13).
            self.logger.warning("failed for %s", url)
            return

        filename = os.path.join(out_dir, url.rsplit('/', 1)[-1])
        with open(filename, 'wb') as f:
            # Have the raw stream undo any Content-Encoding while it is
            # read, so the decoded payload is what lands on disk.
            response.raw.decode_content = True
            shutil.copyfileobj(response.raw, f)
    finally:
        # With stream=True the connection stays checked out until the
        # response is closed; release it on every exit path.
        response.close()
def __init__(self, userid, password, out_dir, logger=None):
    """Keep the credentials, ensure a logger exists, and start Firefox.

    When ``logger`` is None, a time-stamped ``Ck12CorpusCreater_*``
    directory is created under ``../../../outputdata`` (relative to this
    file) and a logger is obtained from ``setup_log`` for it.

    ``out_dir`` is created and set as the download directory of a fresh
    Firefox profile. The profile also disables pdfjs and lists
    ``application/pdf`` under ``neverAsk.saveToDisk``. The profile is
    stored as ``self.profile`` and the browser as ``self.driver``.

    Args:
        userid: Account id, stored as ``self.userid``.
        password: Account password, stored as ``self.password``.
        out_dir: Directory the browser should download into.
        logger: Optional logger; one is created when omitted.
    """
    self.userid = userid
    self.password = password
    self.logger = logger

    if self.logger is None:
        module_dir = os.path.dirname(__file__)
        stamp = time.strftime('%Y%m%d_%H%M%S')
        log_dir = os.path.join(
            module_dir,
            "../../../outputdata/Ck12CorpusCreater_{}".format(stamp),
        )
        Utilities.create_dir(log_dir)
        self.logger = setup_log(log_dir, "Ck12CorpusCreater")

    Utilities.create_dir(out_dir)

    self.profile = webdriver.FirefoxProfile()
    # Applied in this order; folderList 2 selects a custom location.
    preferences = (
        ('browser.download.folderList', 2),
        ('browser.download.dir', os.path.abspath(out_dir)),
        ('browser.download.manager.showWhenStarting', False),
        ("pdfjs.disabled", True),
        ('browser.helperApps.neverAsk.saveToDisk', "application/pdf"),
        ('plugin.scan.plid.all', False),
        ("plugin.scan.Acrobat", "99.0"),
    )
    for pref_name, pref_value in preferences:
        self.profile.set_preference(pref_name, pref_value)

    self.driver = webdriver.Firefox(self.profile)
def download(url, credentials, logger ):
    """Download one book into a corpus folder derived from its URL.

    Args:
        url: Book URL. Its path with every "/" and "%" removed becomes
            the folder name under ``../../../corpus`` (relative to this
            file).
        credentials: A ``(userid, password)`` pair.
        logger: Passed through to ``Ck12CorpusCreater``.
    """
    (userid, password) = credentials
    # Flatten the URL path into a single directory name.
    folder_name = urlparse(url).path.replace("/","").replace("%","")
    # The creator is used as a context manager for the duration of the download.
    with Ck12CorpusCreater(userid, password, os.path.join(os.path.dirname(__file__), "../../../corpus/{}".format(folder_name)), logger=logger) as ck12CorpusCreater :
        ck12CorpusCreater.download_book(url)


# Script section: create a time-stamped log directory and a logger for this run.
log_dir = os.path.join(os.path.dirname(__file__),"../../../outputdata/Ck12CorpusCreater_{}".format(time.strftime('%Y%m%d_%H%M%S')))
Utilities.create_dir(log_dir)
log = setup_log(log_dir, "Ck12CorpusCreater")
try:
    # Look up the stored credentials for the CK-12 site.
    credentialsCk12 = Utilities.get_credentials("https://www.ck12.org/")
    # Per-book download calls; the ones listed here are disabled (commented out).
    #
    # download("https://www.ck12.org/book/CK-12-Life-Science-Concepts-For-Middle-School",credentialsCk12, log)
    # download("https://www.ck12.org/book/CK-12-Earth-Science-Concepts-For-High-School", credentialsCk12,log)
    # download("https://www.ck12.org/book/CK-12-Earth-Science-Concepts-For-Middle-School", credentialsCk12,log)
    # download("https://www.ck12.org/book/CK-12-Physical-Science-Concepts-For-Middle-School",credentialsCk12,log)
    # download("https://www.ck12.org/book/CK-12-Biology-Concepts",credentialsCk12,log)
    # download("https://www.ck12.org/book/CK-12-Chemistry-Basic",credentialsCk12,log)
    # download("https://www.ck12.org/book/CK-12-Chemistry-Concepts-Intermediate",credentialsCk12,log)
    # download("https://www.ck12.org/book/CK-12-Physics-Concepts---Intermediate",credentialsCk12,log)
    ####download("https://www.ck12.org/book/CK-12-Understanding-Biodiversity",credentialsCk12,log)