def do_obtain_new_books(cpus, account, notification=lambda x, y: x):
    """Log in to one account, crawl its catalogue and return unknown ebooks.

    :param cpus: unused here; kept so the job signature stays unchanged.
    :param account: mapping describing the account; read through the keys
        exposed by the preferences facade (``prefs.ENABLED`` here, more
        keys inside ``BeamEbooksDownloader.login``).
    :param notification: progress callback invoked as
        ``notification(fraction, message)``.
    :return: list of the crawled entries whose ``'id'`` is not found in
        ``EBookAdder.books_of_this_shop``. Empty when the account is
        disabled or the login failed.
    """
    print("do_obtain_new_books in jobs.py")
    print("Account is: %s" % (account,))

    # This server is an arbitrary_n job, so there is a notifier available.
    # Set the % complete to a small number to avoid the 'unavailable' indicator
    notification(0.01, "Starting up...")

    # Imported under aliases so the module objects are not shadowed by the
    # database / facade instances created from them below.
    from calibre.library import db as open_library_db
    from calibre.utils.config import prefs as calibre_prefs

    calibre_prefs.refresh()
    db = open_library_db(read_only=False)
    print("DB is: %s" % (db,))

    prefs = PrefsFacade(db)
    print("Prefs are: %s" % (prefs,))
    print("Library id is (%s)" % (prefs.get_library_uuid(),))

    reporter = ConsoleReporter()
    downloader = BeamEbooksDownloader(prefs, caller=reporter)
    print("-- LALA -- Downloader is: %s" % (downloader,))

    # Must exist even when the account is disabled or the login fails,
    # otherwise the filtering below raises NameError.
    downloadable_ebooks = []

    if account[prefs.ENABLED]:
        downloader.login(account)
        if not downloader.successful_login:
            notification(1.00, "Failed to log in...")
        else:
            notification(0.05, "Parsing document tree now...")
            downloadable_ebooks = downloader.recursive_descent(
                norms(prefs[prefs.URLBASE]))
            notification(0.50, "Loaded OPDS pages")
            reporter.notify(downloadable_ebooks)

    notification(1.00, "Done...")

    # Keep only the entries the library does not know for this shop yet.
    adder = EBookAdder(prefs, "beam-ebooks")
    adder.load_books()
    known_books = adder.books_of_this_shop
    new_ebooks = [entry for entry in downloadable_ebooks
                  if known_books.get(entry['id']) is None]
    return new_ebooks
class BeamEbooksDownloader():
    """Crawl the OPDS pages of one Beam eBooks account and fetch its ebooks.

    Typical use: ``login(account)``, then ``recursive_descent()`` to collect
    ``downloadable_ebooks``, then ``download_ebooks()``.

    NOTE(review): this file appears to define ``BeamEbooksDownloader`` more
    than once; the last definition wins at import time -- confirm which
    copy is meant to stay.
    """

    # Links to further catalogue pages; extract_link() returns these as a
    # plain href string.
    _PAGE_LINK_PATTERNS = (
        r'/bibliothek\.php\?.*$',
        r'/bibuebersicht\.php5\?.*$',
        r'/pakete\.php5\?.*$',
    )
    # Links to downloads / packages; extract_link() returns these as a
    # (href, mimetype) pair.
    _TYPED_LINK_PATTERNS = (
        r'/download\.php5\?.*$',
        r'/paket\.php5\?paketnr=.*$',
    )
    _DOWNLOAD_PATTERN = r'/download\.php5\?.*$'

    def __init__(self, prefs, version=None, caller=None):
        """Set up crawl state, the HTTP browser and the scratch directory.

        :param prefs: preferences facade; indexed with ``prefs.URLBASE``.
        :param version: version sequence used in the user agent; taken from
            the plugin's ``Downloader.version`` when None.
        :param caller: optional object with a ``notify(message)`` method.
        """
        print("Initializing BeamEbooksDownloader()")
        print(" myself: '%s'" % (self,))

        self.prefs = prefs
        self.urlbase = prefs[prefs.URLBASE]

        if version is None:
            from calibre_plugins.beam_ebooks_downloader import Downloader
            version = Downloader.version

        self.caller = caller
        self.beamid = None
        self.successful_login = False
        self.already_visited_links = []
        self.downloadable_ebooks = []
        # Defaults so save_response() works even before login() is called.
        self.account_id = None
        self.filenumber = 1000

        # TODO How do I access this string from the calibre core?
        USER_AGENT = 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101210 Gentoo Firefox/3.6.13'
        user_agent = 'calibre-beam-ebooks-downloader-plugin/%d.%d.%d' % (
            tuple(version)[:3])
        user_agent = USER_AGENT + ' ' + user_agent
        self.browser = browser(user_agent=user_agent)
        # self.browser.set_debug_http(True)
        # self.browser.set_debug_responses(True)

        # self.tempdirpath = tempfile.mkdtemp(prefix = 'calibre-beam-ebooks-downloader-plugin-')
        self.tempdirpath = (tempfile.gettempdir() + '/' +
                            'calibre-beam-ebooks-downloader-plugin')
        print("Saving stuff into '%s'" % (self.tempdirpath,))

    def save_response(self, response):
        """Dump the status code and body of ``response`` into a numbered file.

        Best effort: a failure while writing is printed, never raised.
        """
        if not os.path.exists(self.tempdirpath):
            os.makedirs(self.tempdirpath)

        try:
            filename = '%s/response-%s-%d.txt' % (
                self.tempdirpath, self.account_id, self.filenumber)
            self.filenumber = self.filenumber + 1
            # 'with' closes the file even when a write fails half way.
            with open(filename, 'w') as f:
                f.write("Response Code: '%s'" % (response.code,))
                f.write("\n\n")
                content = response.get_data()
                f.write("Content: '%s'" % (content,))
                f.write("\n\n")
        except Exception as exc:
            print("Unexpected error: %s" % (type(exc),))

    def login(self, account):
        """Submit the login form and record the ``beamid`` session cookie.

        Resets all crawl state first. On success ``successful_login`` is
        True and ``beamid`` holds the cookie value.

        :param account: mapping read with the ``ACCOUNT_ID``, ``USERNAME``
            and ``OBFUSCATED_PASSWORD`` keys of the preferences facade.
        """
        self.beamid = None
        self.successful_login = False
        self.already_visited_links = []
        self.downloadable_ebooks = []

        self.account_id = account[self.prefs.ACCOUNT_ID]
        self.username = account[self.prefs.USERNAME]
        self.password = self.prefs.decrypt_password(
            account[self.prefs.OBFUSCATED_PASSWORD])

        # Remove all cookies to be extra safe
        self.browser.cookiejar.clear()
        self.filenumber = 1000

        if self.caller is not None:
            self.caller.notify("Logging in")

        url = norms(self.urlbase + "/aldiko/cookisetzen.php")
        print(" URL: '%s'" % (url,))
        print("Browser: '%s'" % (self.browser,))

        response = self.browser.open(url)
        self.save_response(response)
        print("Cookies: '%s'" % (self.browser.cookiejar,))

        if response.code != 200:
            return

        form = self.browser.select_form(nr=0)
        print("Form: '%s'" % (form,))
        # Never write the clear-text password to the log.
        print(" Auth: '%s', '%s'" % (self.username, '********'))
        self.browser.form['user'] = self.username
        self.browser.form['pass'] = self.password
        # Keep the response of the submission itself; saving the earlier
        # response again would hide what the server answered to the login.
        submit_response = self.browser.submit()
        if submit_response is not None:
            response = submit_response
        self.save_response(response)

        for cookie in self.browser.cookiejar:
            if getattr(cookie, 'name', None) != 'beamid':
                continue
            if not hasattr(cookie, 'value'):
                continue
            self.beamid = cookie.value
            # TODO should we verify that the beamid is numeric???
            self.successful_login = True
            if self.caller is not None:
                self.caller.notify("Login Successful")

    def recursive_descent(self, absolute_url=None, further_descend=True):
        """Visit ``absolute_url`` (default: the URL base) once and harvest it.

        :param absolute_url: page to visit; ``self.urlbase`` when None.
        :param further_descend: forwarded to ``visit_url``.
        :return: ``self.downloadable_ebooks`` (the shared, growing list).
        """
        url = norms(self.urlbase if absolute_url is None else absolute_url)

        if url in self.already_visited_links:
            print("Already have been here ('%s')..." % (url,))
            return self.downloadable_ebooks

        # Mark before visiting so that pages linking to each other cannot
        # recurse endlessly.
        self.already_visited_links.append(url)

        harvested_urls = self.prefs[self.prefs.HARVESTED_URLS]
        harvest_state = harvested_urls.get(url)
        changed = False
        if harvest_state is None:
            harvest_state = {}
            self.prefs[self.prefs.HARVESTED_URLS][url] = harvest_state
            changed = True
        if harvest_state.get(self.prefs.HARVEST_STATE) is None:
            harvest_state[self.prefs.HARVEST_STATE] = (
                self.prefs.HARVEST_STATE_REVISIT)
            changed = True
        if harvest_state.get(self.prefs.HARVEST_TITLE) is None:
            harvest_state[self.prefs.HARVEST_TITLE] = ""
            changed = True
        if changed:
            self.prefs.save()

        if self.caller is not None:
            self.caller.notify("Visiting ('%s', '%s')..." % (url, harvest_state))

        # Pass the resolved url: absolute_url may be None.
        self.visit_url(url, further_descend)

        # In any case, return a list of ebooks to download
        return self.downloadable_ebooks

    @staticmethod
    def _split_link(link):
        """Normalize an ``extract_link`` result to a ``(href, mimetype)`` pair."""
        if not link:
            return (None, None)
        if isinstance(link, tuple):
            return link
        return (link, None)

    def visit_url(self, url=None, further_descend=True):
        """Fetch one page, record its ebooks and collect its followable links.

        :param url: page to fetch.
        :param further_descend: when true, every collected link is passed to
            ``recursive_descent``.
        :return: list of normalized links found on the page (empty when the
            response code is not 200).
        """
        print(" URL: '%s'" % (url,))

        response = self.browser.open(url)
        self.save_response(response)

        links_to_visit = []
        if response.code != 200:
            print("Something horrible happened (RC %s)" % (response.code,))
            return links_to_visit

        root = fromstring(response.get_data())
        for entry in root.xpath("//entry"):
            idtags = entry.xpath('id')
            if not idtags:
                # An entry without an id cannot be classified.
                continue
            contents = idtags[0].text_content()

            if contents.startswith(('urn:beam-ebooks:private',
                                    'urn:beam-ebooks:alle')):
                href, _ = self._split_link(self.extract_link(entry))
                if href:
                    href = norms(href)
                    print(" Seems to be a followable link ('%s')" % (href,))
                    links_to_visit.append(href)
            elif contents.startswith('urn:beam-ebooks:titelnr:'):
                href, mimetype = self._split_link(self.extract_link(entry))
                if not href:
                    continue
                href = norms(href)
                if re.search(self._DOWNLOAD_PATTERN, href):
                    print(" Seems to be an ebook ('%s', '%s')" % (
                        mimetype, href))
                    self.downloadable_ebooks.append({
                        'urn': contents,
                        'href': href,
                        'mimetype': mimetype,
                        # Fourth colon-separated field of the urn.
                        'id': contents.split(':')[3],
                    })
                else:
                    print(" Seems to be a followable link ('%s')" % (href,))
                    links_to_visit.append(href)

        # Finally, visit all pages that we encountered
        if further_descend:
            for link in links_to_visit:
                self.recursive_descent(link)

        # In any case, return the links we had to visit...
        return links_to_visit

    def extract_link(self, entry):
        """Return the first interesting link of ``entry``.

        Image links are skipped and relative hrefs are prefixed with the URL
        base.

        :return: the href string for catalogue pages, a ``(href, mimetype)``
            tuple for download / package links, or None when nothing matches.
        """
        for link in entry.xpath("link"):
            href = link.attrib.get('href')
            if not href:
                continue
            mimetype = link.attrib.get('type') or ''
            if mimetype.startswith('image/'):
                continue

            print(" Link: '%s'" % (link,))
            print(" HRef: '%s'" % (href,))
            print(" Type: '%s'" % (mimetype,))

            # Absolute URLs (http or https) are used unchanged.
            if re.match(r'https?://', href) is None:
                if '/aldiko/' in href:
                    href = self.urlbase + href
                else:
                    href = self.urlbase + '/aldiko/' + href
                print(" Extentended HRef: '%s'" % (href,))

            for pattern in self._PAGE_LINK_PATTERNS:
                if re.search(pattern, href):
                    return href
            for pattern in self._TYPED_LINK_PATTERNS:
                if re.search(pattern, href):
                    return (href, mimetype)

        return None

    # Now, mirror all ebooks encountered in the loop above
    def download_ebooks(self):
        """Download the collected ebooks the library lacks, up to the quota.

        The quota is ``prefs[prefs.DOWNLOADS_PER_SESSION]``.
        """
        print("Library id is (%s)" % (self.prefs.get_library_uuid(),))
        db = self.prefs._get_db()
        print("Library database object is (%s)" % (db,))

        caller = self.caller
        self.adder = EBookAdder(self.prefs, "beam-ebooks")
        self.adder.load_books()

        quota = self.prefs[self.prefs.DOWNLOADS_PER_SESSION]
        handled_ebooks = 0
        for entry in self.downloadable_ebooks:
            if handled_ebooks >= quota:
                # Nothing further can be downloaded in this run.
                break
            beamebooks_id = entry['id']
            if self.adder.books_of_this_shop.get(beamebooks_id) is not None:
                continue
            # Book not found, fetch and try to store it into the database
            handled_ebooks = handled_ebooks + 1
            if caller is not None:
                caller.notify("Working on book %d: %s" % (
                    handled_ebooks, beamebooks_id))
            self.download_ebook(entry)

        if caller is not None:
            caller.notify("Handled (%d of %d) books, waiting for next run" % (
                handled_ebooks, len(self.downloadable_ebooks)))

    # Now, mirror one single ebook whose id will be passed into this method
    def download_ebook(self, entry):
        """Fetch one ebook into the scratch directory and hand it to the adder.

        :param entry: dict with the ``'href'``, ``'mimetype'`` and ``'id'``
            keys, as produced by ``visit_url``.
        """
        href = entry['href']
        mimetype = entry['mimetype']
        beamebooks_id = entry['id']

        ext = 'epub' if mimetype == 'application/epub+zip' else 'bin'
        path = self.tempdirpath + "/" + beamebooks_id + "." + ext

        if not os.path.exists(path):
            # The scratch directory is otherwise only created by
            # save_response().
            if not os.path.exists(self.tempdirpath):
                os.makedirs(self.tempdirpath)
            print("Have to download %s, %s, %s" % (
                beamebooks_id, mimetype, href))
            self.browser.retrieve(href, path)

        # Allow calling this method without a prior download_ebooks().
        if getattr(self, 'adder', None) is None:
            self.adder = EBookAdder(self.prefs, "beam-ebooks")
            self.adder.load_books()
        self.adder.add(path, beamebooks_id)
class BeamEbooksDownloader():
    """Crawl the OPDS pages of one Beam eBooks account and fetch its ebooks.

    Typical use: ``login(account)``, then ``recursive_descent()`` to collect
    ``downloadable_ebooks``, then ``download_ebooks()``.

    NOTE(review): an earlier definition of ``BeamEbooksDownloader`` appears
    in this file; being the later one, this definition is the one bound to
    the name after import -- confirm the duplicate is intended.
    """

    # Links to further catalogue pages; extract_link() returns these as a
    # plain href string.
    _PAGE_LINK_PATTERNS = (
        r'/bibliothek\.php\?.*$',
        r'/bibuebersicht\.php5\?.*$',
        r'/pakete\.php5\?.*$',
    )
    # Links to downloads / packages; extract_link() returns these as a
    # (href, mimetype) pair.
    _TYPED_LINK_PATTERNS = (
        r'/download\.php5\?.*$',
        r'/paket\.php5\?paketnr=.*$',
    )
    _DOWNLOAD_PATTERN = r'/download\.php5\?.*$'

    def __init__(self, prefs, version=None, caller=None):
        """Set up crawl state, the HTTP browser and the scratch directory.

        :param prefs: preferences facade; indexed with ``prefs.URLBASE``.
        :param version: version sequence used in the user agent; taken from
            the plugin's ``Downloader.version`` when None.
        :param caller: optional object with a ``notify(message)`` method.
        """
        print("Initializing BeamEbooksDownloader()")
        print(" myself: '%s'" % (self,))

        self.prefs = prefs
        self.urlbase = prefs[prefs.URLBASE]

        if version is None:
            from calibre_plugins.beam_ebooks_downloader import Downloader
            version = Downloader.version

        self.caller = caller
        self.beamid = None
        self.successful_login = False
        self.already_visited_links = []
        self.downloadable_ebooks = []
        # Defaults so save_response() works even before login() is called.
        self.account_id = None
        self.filenumber = 1000

        # TODO How do I access this string from the calibre core?
        USER_AGENT = 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101210 Gentoo Firefox/3.6.13'
        user_agent = 'calibre-beam-ebooks-downloader-plugin/%d.%d.%d' % (
            tuple(version)[:3])
        user_agent = USER_AGENT + ' ' + user_agent
        self.browser = browser(user_agent=user_agent)
        # self.browser.set_debug_http(True)
        # self.browser.set_debug_responses(True)

        # self.tempdirpath = tempfile.mkdtemp(prefix = 'calibre-beam-ebooks-downloader-plugin-')
        self.tempdirpath = (tempfile.gettempdir() + '/' +
                            'calibre-beam-ebooks-downloader-plugin')
        print("Saving stuff into '%s'" % (self.tempdirpath,))

    def save_response(self, response):
        """Dump the status code and body of ``response`` into a numbered file.

        Best effort: a failure while writing is printed, never raised.
        """
        if not os.path.exists(self.tempdirpath):
            os.makedirs(self.tempdirpath)

        try:
            filename = '%s/response-%s-%d.txt' % (
                self.tempdirpath, self.account_id, self.filenumber)
            self.filenumber = self.filenumber + 1
            # 'with' closes the file even when a write fails half way.
            with open(filename, 'w') as f:
                f.write("Response Code: '%s'" % (response.code,))
                f.write("\n\n")
                content = response.get_data()
                f.write("Content: '%s'" % (content,))
                f.write("\n\n")
        except Exception as exc:
            print("Unexpected error: %s" % (type(exc),))

    def login(self, account):
        """Submit the login form and record the ``beamid`` session cookie.

        Resets all crawl state first. On success ``successful_login`` is
        True and ``beamid`` holds the cookie value.

        :param account: mapping read with the ``ACCOUNT_ID``, ``USERNAME``
            and ``OBFUSCATED_PASSWORD`` keys of the preferences facade.
        """
        self.beamid = None
        self.successful_login = False
        self.already_visited_links = []
        self.downloadable_ebooks = []

        self.account_id = account[self.prefs.ACCOUNT_ID]
        self.username = account[self.prefs.USERNAME]
        self.password = self.prefs.decrypt_password(
            account[self.prefs.OBFUSCATED_PASSWORD])

        # Remove all cookies to be extra safe
        self.browser.cookiejar.clear()
        self.filenumber = 1000

        if self.caller is not None:
            self.caller.notify("Logging in")

        url = norms(self.urlbase + "/aldiko/cookisetzen.php")
        print(" URL: '%s'" % (url,))
        print("Browser: '%s'" % (self.browser,))

        response = self.browser.open(url)
        self.save_response(response)
        print("Cookies: '%s'" % (self.browser.cookiejar,))

        if response.code != 200:
            return

        form = self.browser.select_form(nr=0)
        print("Form: '%s'" % (form,))
        # Never write the clear-text password to the log.
        print(" Auth: '%s', '%s'" % (self.username, '********'))
        self.browser.form['user'] = self.username
        self.browser.form['pass'] = self.password
        # Keep the response of the submission itself; saving the earlier
        # response again would hide what the server answered to the login.
        submit_response = self.browser.submit()
        if submit_response is not None:
            response = submit_response
        self.save_response(response)

        for cookie in self.browser.cookiejar:
            if getattr(cookie, 'name', None) != 'beamid':
                continue
            if not hasattr(cookie, 'value'):
                continue
            self.beamid = cookie.value
            # TODO should we verify that the beamid is numeric???
            self.successful_login = True
            if self.caller is not None:
                self.caller.notify("Login Successful")

    def recursive_descent(self, absolute_url=None, further_descend=True):
        """Visit ``absolute_url`` (default: the URL base) once and harvest it.

        :param absolute_url: page to visit; ``self.urlbase`` when None.
        :param further_descend: forwarded to ``visit_url``.
        :return: ``self.downloadable_ebooks`` (the shared, growing list).
        """
        url = norms(self.urlbase if absolute_url is None else absolute_url)

        if url in self.already_visited_links:
            print("Already have been here ('%s')..." % (url,))
            return self.downloadable_ebooks

        # Mark before visiting so that pages linking to each other cannot
        # recurse endlessly.
        self.already_visited_links.append(url)

        harvested_urls = self.prefs[self.prefs.HARVESTED_URLS]
        harvest_state = harvested_urls.get(url)
        changed = False
        if harvest_state is None:
            harvest_state = {}
            self.prefs[self.prefs.HARVESTED_URLS][url] = harvest_state
            changed = True
        if harvest_state.get(self.prefs.HARVEST_STATE) is None:
            harvest_state[self.prefs.HARVEST_STATE] = (
                self.prefs.HARVEST_STATE_REVISIT)
            changed = True
        if harvest_state.get(self.prefs.HARVEST_TITLE) is None:
            harvest_state[self.prefs.HARVEST_TITLE] = ""
            changed = True
        if changed:
            self.prefs.save()

        if self.caller is not None:
            self.caller.notify("Visiting ('%s', '%s')..." % (url, harvest_state))

        # Pass the resolved url: absolute_url may be None.
        self.visit_url(url, further_descend)

        # In any case, return a list of ebooks to download
        return self.downloadable_ebooks

    @staticmethod
    def _split_link(link):
        """Normalize an ``extract_link`` result to a ``(href, mimetype)`` pair."""
        if not link:
            return (None, None)
        if isinstance(link, tuple):
            return link
        return (link, None)

    def visit_url(self, url=None, further_descend=True):
        """Fetch one page, record its ebooks and collect its followable links.

        :param url: page to fetch.
        :param further_descend: when true, every collected link is passed to
            ``recursive_descent``.
        :return: list of normalized links found on the page (empty when the
            response code is not 200).
        """
        print(" URL: '%s'" % (url,))

        response = self.browser.open(url)
        self.save_response(response)

        links_to_visit = []
        if response.code != 200:
            print("Something horrible happened (RC %s)" % (response.code,))
            return links_to_visit

        root = fromstring(response.get_data())
        for entry in root.xpath("//entry"):
            idtags = entry.xpath('id')
            if not idtags:
                # An entry without an id cannot be classified.
                continue
            contents = idtags[0].text_content()

            if contents.startswith(('urn:beam-ebooks:private',
                                    'urn:beam-ebooks:alle')):
                href, _ = self._split_link(self.extract_link(entry))
                if href:
                    href = norms(href)
                    print(" Seems to be a followable link ('%s')" % (href,))
                    links_to_visit.append(href)
            elif contents.startswith('urn:beam-ebooks:titelnr:'):
                href, mimetype = self._split_link(self.extract_link(entry))
                if not href:
                    continue
                href = norms(href)
                if re.search(self._DOWNLOAD_PATTERN, href):
                    print(" Seems to be an ebook ('%s', '%s')" % (
                        mimetype, href))
                    self.downloadable_ebooks.append({
                        'urn': contents,
                        'href': href,
                        'mimetype': mimetype,
                        # Fourth colon-separated field of the urn.
                        'id': contents.split(':')[3],
                    })
                else:
                    print(" Seems to be a followable link ('%s')" % (href,))
                    links_to_visit.append(href)

        # Finally, visit all pages that we encountered
        if further_descend:
            for link in links_to_visit:
                self.recursive_descent(link)

        # In any case, return the links we had to visit...
        return links_to_visit

    def extract_link(self, entry):
        """Return the first interesting link of ``entry``.

        Image links are skipped and relative hrefs are prefixed with the URL
        base.

        :return: the href string for catalogue pages, a ``(href, mimetype)``
            tuple for download / package links, or None when nothing matches.
        """
        for link in entry.xpath("link"):
            href = link.attrib.get('href')
            if not href:
                continue
            mimetype = link.attrib.get('type') or ''
            if mimetype.startswith('image/'):
                continue

            print(" Link: '%s'" % (link,))
            print(" HRef: '%s'" % (href,))
            print(" Type: '%s'" % (mimetype,))

            # Absolute URLs (http or https) are used unchanged.
            if re.match(r'https?://', href) is None:
                if '/aldiko/' in href:
                    href = self.urlbase + href
                else:
                    href = self.urlbase + '/aldiko/' + href
                print(" Extentended HRef: '%s'" % (href,))

            for pattern in self._PAGE_LINK_PATTERNS:
                if re.search(pattern, href):
                    return href
            for pattern in self._TYPED_LINK_PATTERNS:
                if re.search(pattern, href):
                    return (href, mimetype)

        return None

    # Now, mirror all ebooks encountered in the loop above
    def download_ebooks(self):
        """Download the collected ebooks the library lacks, up to the quota.

        The quota is ``prefs[prefs.DOWNLOADS_PER_SESSION]``.
        """
        print("Library id is (%s)" % (self.prefs.get_library_uuid(),))
        db = self.prefs._get_db()
        print("Library database object is (%s)" % (db,))

        caller = self.caller
        self.adder = EBookAdder(self.prefs, "beam-ebooks")
        self.adder.load_books()

        quota = self.prefs[self.prefs.DOWNLOADS_PER_SESSION]
        handled_ebooks = 0
        for entry in self.downloadable_ebooks:
            if handled_ebooks >= quota:
                # Nothing further can be downloaded in this run.
                break
            beamebooks_id = entry['id']
            if self.adder.books_of_this_shop.get(beamebooks_id) is not None:
                continue
            # Book not found, fetch and try to store it into the database
            handled_ebooks = handled_ebooks + 1
            if caller is not None:
                caller.notify("Working on book %d: %s" % (
                    handled_ebooks, beamebooks_id))
            self.download_ebook(entry)

        if caller is not None:
            caller.notify("Handled (%d of %d) books, waiting for next run" % (
                handled_ebooks, len(self.downloadable_ebooks)))

    # Now, mirror one single ebook whose id will be passed into this method
    def download_ebook(self, entry):
        """Fetch one ebook into the scratch directory and hand it to the adder.

        :param entry: dict with the ``'href'``, ``'mimetype'`` and ``'id'``
            keys, as produced by ``visit_url``.
        """
        href = entry['href']
        mimetype = entry['mimetype']
        beamebooks_id = entry['id']

        ext = 'epub' if mimetype == 'application/epub+zip' else 'bin'
        path = self.tempdirpath + "/" + beamebooks_id + "." + ext

        if not os.path.exists(path):
            # The scratch directory is otherwise only created by
            # save_response().
            if not os.path.exists(self.tempdirpath):
                os.makedirs(self.tempdirpath)
            print("Have to download %s, %s, %s" % (
                beamebooks_id, mimetype, href))
            self.browser.retrieve(href, path)

        # Allow calling this method without a prior download_ebooks().
        if getattr(self, 'adder', None) is None:
            self.adder = EBookAdder(self.prefs, "beam-ebooks")
            self.adder.load_books()
        self.adder.add(path, beamebooks_id)