Exemple #1
0
 def _download_url(self, url, out_dir):
     """Stream the resource at ``url`` into a file inside ``out_dir``.

     The target file is named after the text following the last ``/`` in
     ``url``. ``out_dir`` is created via ``Utilities.create_dir`` before
     the status code is inspected, so it exists even when the download
     fails.

     Args:
         url: Address of the resource to fetch.
         out_dir: Directory the downloaded file is written to.

     Returns:
         None. On any status other than 200 nothing is written; the
         failure is printed and logged as a warning instead.
     """
     r = requests.get(url, stream=True)
     try:
         Utilities.create_dir(out_dir)
         if r.status_code == 200:
             filename = os.path.join(out_dir, url.rsplit('/', 1)[-1])
             with open(filename, 'wb') as f:
                 # Ask the raw urllib3 stream to undo any Content-Encoding
                 # (gzip/deflate) so the decoded payload lands on disk.
                 r.raw.decode_content = True
                 shutil.copyfileobj(r.raw, f)
         else:
             print("failed for " + url)
             # ``warn`` is a deprecated alias of ``warning`` on standard
             # logging.Logger objects (assumed here) and is gone in 3.13.
             self.logger.warning("failed for " + url)
     finally:
         # With stream=True the connection is only handed back to the pool
         # once the body is fully consumed or the response is closed; close
         # explicitly so the failure path (body never read) does not leak it.
         r.close()
Exemple #2
0
    def __init__(self, userid, password, out_dir, logger=None):
        """Keep the credentials, prepare logging and launch a Firefox driver.

        Args:
            userid: Account identifier, stored on ``self.userid``.
            password: Account password, stored on ``self.password``.
            out_dir: Directory created up front and handed to the Firefox
                profile (as an absolute path) as its download directory.
            logger: Logger to use. When ``None``, a timestamped directory
                under ``outputdata`` is created and a logger is built in it
                with ``setup_log``.
        """
        self.userid = userid
        self.password = password
        self.logger = logger
        if self.logger is None:
            # No logger supplied: build one in a fresh timestamped directory.
            run_stamp = time.strftime('%Y%m%d_%H%M%S')
            log_dir = os.path.join(
                os.path.dirname(__file__),
                "../../../outputdata/Ck12CorpusCreater_{}".format(run_stamp))
            Utilities.create_dir(log_dir)
            self.logger = setup_log(log_dir, "Ck12CorpusCreater")

        Utilities.create_dir(out_dir)

        self.profile = webdriver.FirefoxProfile()
        # Preferences are applied in this exact order.
        profile_preferences = (
            ('browser.download.folderList', 2),  # custom location
            ('browser.download.dir', os.path.abspath(out_dir)),
            ('browser.download.manager.showWhenStarting', False),
            ("pdfjs.disabled", True),
            ('browser.helperApps.neverAsk.saveToDisk', "application/pdf"),
            ('plugin.scan.plid.all', False),
            ("plugin.scan.Acrobat", "99.0"),
        )
        for pref_name, pref_value in profile_preferences:
            self.profile.set_preference(pref_name, pref_value)
        self.driver = webdriver.Firefox(self.profile)
Exemple #3
0


def download(url, credentials, logger):
    """Download the book at ``url`` into a corpus folder derived from the URL.

    Args:
        url: Book address; passed on to ``download_book``.
        credentials: Two-item sequence unpacked as ``(userid, password)``.
        logger: Logger forwarded to ``Ck12CorpusCreater``.
    """
    userid, password = credentials

    # The folder name is the URL path with every "/" and "%" stripped out.
    url_path = urlparse(url).path
    folder_name = url_path.replace("/", "").replace("%", "")
    corpus_dir = os.path.join(os.path.dirname(__file__),
                              "../../../corpus/{}".format(folder_name))

    with Ck12CorpusCreater(userid, password, corpus_dir, logger=logger) as creater:
        creater.download_book(url)





# Module-level logging setup: each run logs into its own directory, named
# with the current local time, under ``outputdata`` relative to this file.
log_dir = os.path.join(
    os.path.dirname(__file__),
    "../../../outputdata/Ck12CorpusCreater_{}".format(
        time.strftime('%Y%m%d_%H%M%S')))
Utilities.create_dir(log_dir)
log = setup_log(log_dir, "Ck12CorpusCreater")

try:

    credentialsCk12 = Utilities.get_credentials("https://www.ck12.org/")
    #
    # download("https://www.ck12.org/book/CK-12-Life-Science-Concepts-For-Middle-School",credentialsCk12, log)
    # download("https://www.ck12.org/book/CK-12-Earth-Science-Concepts-For-High-School", credentialsCk12,log)
    # download("https://www.ck12.org/book/CK-12-Earth-Science-Concepts-For-Middle-School", credentialsCk12,log)
    # download("https://www.ck12.org/book/CK-12-Physical-Science-Concepts-For-Middle-School",credentialsCk12,log)
    # download("https://www.ck12.org/book/CK-12-Biology-Concepts",credentialsCk12,log)
    # download("https://www.ck12.org/book/CK-12-Chemistry-Basic",credentialsCk12,log)
    # download("https://www.ck12.org/book/CK-12-Chemistry-Concepts-Intermediate",credentialsCk12,log)
    # download("https://www.ck12.org/book/CK-12-Physics-Concepts---Intermediate",credentialsCk12,log)
    ####download("https://www.ck12.org/book/CK-12-Understanding-Biodiversity",credentialsCk12,log)