def open_search(url, query, max_results=10, timeout=60):
    '''
    Run *query* against the OPDS catalogue whose OpenSearch description
    document lives at *url*, yielding at most *max_results* SearchResult
    objects parsed from the returned Atom feed.
    '''
    description = Description(url)
    url_template = description.get_best_template()
    if not url_template:
        return
    # Fill the OpenSearch URL template with the query parameters.
    oquery = Query(url_template)
    oquery.searchTerms = query
    oquery.count = max_results
    search_url = oquery.url()

    remaining = max_results
    br = browser()
    with closing(br.open(search_url, timeout=timeout)) as f:
        feed = etree.fromstring(f.read())
        for entry in feed.xpath('//*[local-name() = "entry"]'):
            if remaining <= 0:
                break
            remaining -= 1

            result = SearchResult()
            result.detail_item = ''.join(
                entry.xpath('./*[local-name() = "id"]/text()')).strip()

            for link in entry.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                mimetype = link.get('type')
                if not (rel and href and mimetype):
                    continue
                if 'http://opds-spec.org/thumbnail' in rel:
                    result.cover_url = href
                elif 'http://opds-spec.org/image/thumbnail' in rel:
                    result.cover_url = href
                elif 'http://opds-spec.org/acquisition/buy' in rel:
                    result.detail_item = href
                elif 'http://opds-spec.org/acquisition/sample' in rel:
                    # Samples are deliberately ignored.
                    pass
                elif 'http://opds-spec.org/acquisition' in rel:
                    ext = guess_extension(mimetype)
                    if ext:
                        # guess_extension returns e.g. '.epub'; strip the dot.
                        result.downloads[ext[1:].upper().strip()] = href

            result.formats = ', '.join(result.downloads.keys()).strip()
            result.title = ' '.join(
                entry.xpath('./*[local-name() = "title"]//text()')).strip()
            result.author = ', '.join(
                entry.xpath('./*[local-name() = "author"]//*[local-name() = "name"]//text()')).strip()

            # Optional <price currencycode="..."> element.
            prices = entry.xpath('.//*[local-name() = "price"][1]')
            if prices:
                price_el = prices[0]
                currency_code = price_el.get('currencycode', '')
                amount = ''.join(price_el.xpath('.//text()')).strip()
                result.price = (currency_code + ' ' + amount).strip()

            yield result
def search(self, query, max_results=10, timeout=60):
    '''
    Search the configured OPDS feed for *query*, yielding at most
    *max_results* SearchResult objects from the module-level search().
    '''
    q = Query(get_template(self.browser(), self.config['url']))
    q.searchTerms = query
    q.count = max_results
    count = 0
    for book in search(self.browser(), q.url(), timeout):
        if count >= max_results:
            return
        # Fix: count was never incremented, so the max_results cap above
        # could never trigger and the feed's own count was trusted blindly.
        count += 1
        yield book
def open_search(url, query, max_results=10, timeout=60):
    '''
    Search the Standard Ebooks OPDS catalogue for *query*, yielding up to
    *max_results* SearchResult objects. The feed location is hard-coded;
    the *url* argument is kept only for interface compatibility.
    '''
    url_template = 'https://standardebooks.org/opds/all?query={searchTerms}'
    oquery = Query(url_template)
    oquery.searchTerms = query
    oquery.count = max_results
    search_url = oquery.url()

    remaining = max_results
    br = browser()
    with closing(br.open(search_url, timeout=timeout)) as f:
        feed = safe_xml_fromstring(f.read())
        for entry in feed.xpath('//*[local-name() = "entry"]'):
            if remaining <= 0:
                break
            remaining -= 1

            result = SearchResult()
            result.detail_item = ''.join(
                entry.xpath('./*[local-name() = "id"]/text()')).strip()

            for link in entry.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                mimetype = link.get('type')
                if not (rel and href and mimetype):
                    continue
                # Feed hrefs are site-relative; make them absolute.
                if 'http://opds-spec.org/thumbnail' in rel:
                    result.cover_url = 'https://standardebooks.org' + href
                elif 'http://opds-spec.org/image/thumbnail' in rel:
                    result.cover_url = 'https://standardebooks.org' + href
                elif 'http://opds-spec.org/acquisition' in rel:
                    # Format is taken from the first extension segment of the
                    # file name (so 'x.kepub.epub' registers as KEPUB).
                    ext = href.split('.')[1]
                    if ext:
                        result.downloads[ext.upper().strip()] = \
                            'https://standardebooks.org' + href

            result.formats = ', '.join(result.downloads.keys()).strip()
            result.title = ' '.join(
                entry.xpath('./*[local-name() = "title"]//text()')).strip()
            result.author = ', '.join(
                entry.xpath('./*[local-name() = "author"]//*[local-name() = "name"]//text()')).strip()
            yield result
def search_manybooks(query, max_results=10, timeout=60, open_search_url='http://www.manybooks.net/opds/'):
    '''
    Manybooks uses a very strange opds feed. The opds main feed is
    structured like a stanza feed. The search result entries give very
    little information and requires you to go to a detail link. The
    detail link has the wrong type specified (text/html instead of
    application/atom+xml).
    '''
    description = Description(open_search_url)
    url_template = description.get_best_template()
    if not url_template:
        return
    oquery = Query(url_template)
    oquery.searchTerms = query
    oquery.count = max_results
    search_url = oquery.url()

    remaining = max_results
    br = browser()
    with closing(br.open(search_url, timeout=timeout)) as f:
        raw_data = f.read().decode('utf-8', 'replace')
        feed = etree.fromstring(raw_data)
        for entry in feed.xpath('//*[local-name() = "entry"]'):
            if remaining <= 0:
                break
            remaining -= 1

            result = SearchResult()

            # Entries only carry a text/html detail link; skip anything else.
            detail_links = entry.xpath('./*[local-name() = "link" and @type = "text/html"]')
            if not detail_links:
                continue
            detail_href = detail_links[0].get('href')
            if not detail_href:
                continue
            result.detail_item = 'http://manybooks.net/titles/' + detail_href.split('tid=')[-1] + '.html'

            # These can have HTML inside of them. We are going to get them
            # again later just in case.
            result.title = ''.join(
                entry.xpath('./*[local-name() = "title"]//text()')).strip()
            result.author = ', '.join(
                entry.xpath('./*[local-name() = "author"]//text()')).strip()

            # Follow the detail link to get the rest of the info.
            with closing(br.open(detail_href, timeout=timeout/4)) as df:
                detail_doc = etree.fromstring(df.read())
                detail_entries = detail_doc.xpath('//*[local-name() = "entry"][1]')
                if detail_entries:
                    detail = detail_entries[0]
                    # This is the real title and author info we want. We got
                    # it previously just in case it's not specified here for
                    # some reason.
                    result.title = ''.join(
                        detail.xpath('./*[local-name() = "title"]//text()')).strip()
                    result.author = ', '.join(
                        detail.xpath('./*[local-name() = "author"]//text()')).strip()
                    # Trim a single stray leading/trailing comma.
                    if result.author.startswith(','):
                        result.author = result.author[1:]
                    if result.author.endswith(','):
                        result.author = result.author[:-1]

                    result.cover_url = ''.join(
                        detail.xpath('./*[local-name() = "link" and @rel = "http://opds-spec.org/thumbnail"][1]/@href')).strip()

                    for link in detail.xpath('./*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'):
                        mimetype = link.get('type')
                        href = link.get('href')
                        if mimetype:
                            ext = mimetypes.guess_extension(mimetype)
                            if ext:
                                result.downloads[ext[1:].upper().strip()] = href

            result.price = '$0.00'
            result.drm = SearchResult.DRM_UNLOCKED
            result.formats = 'EPUB, PDB (eReader, PalmDoc, zTXT, Plucker, iSilo), FB2, ZIP, AZW, MOBI, PRC, LIT, PKG, PDF, TXT, RB, RTF, LRF, TCR, JAR'
            yield result
def open_search(url, query, max_results=10, timeout=60):
    '''
    Search the OPDS catalogue described at *url*, yielding up to
    *max_results* DRM-free SearchResult objects. Feed hrefs are
    relative and are made absolute with BASE_URL.
    '''
    description = Description(url)
    url_template = description.get_best_template()
    if not url_template:
        return
    oquery = Query(url_template)
    # set up initial values
    oquery.searchTerms = query
    oquery.count = max_results
    url = oquery.url()
    counter = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        doc = safe_xml_fromstring(f.read())
        for data in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break
            counter -= 1
            s = SearchResult()
            s.detail_item = ''.join(
                data.xpath('./*[local-name() = "id"]/text()')).strip()
            for link in data.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                # Fix: read the raw attribute first. The original prefixed
                # BASE_URL before the guard below, raising TypeError on any
                # <link> element lacking an href.
                href = link.get('href')
                type = link.get('type')
                title = link.get('title')
                ext = None
                if rel and href and type:
                    href = BASE_URL + href
                    if 'http://opds-spec.org/image/thumbnail' == rel:
                        s.cover_url = href
                    elif 'http://opds-spec.org/acquisition/open-access' == rel:
                        # Only the recommended EPUB and the Kindle build are kept.
                        if type == 'application/epub+zip' and title == 'Recommended compatible epub':
                            ext = 'EPUB'
                        elif type == 'application/x-mobipocket-ebook':
                            ext = 'AZW3'
                        if ext:
                            s.downloads[ext] = href
            s.formats = ', '.join(s.downloads.keys()).strip()
            s.title = ' '.join(
                data.xpath('./*[local-name() = "title"]//text()')).strip()
            s.author = ', '.join(
                data.xpath(
                    './*[local-name() = "author"]//*[local-name() = "name"]//text()'
                )).strip()
            s.drm = SearchResult.DRM_UNLOCKED
            s.price = '$0.00'
            yield s
def search(self, query, max_results=10, timeout=60):
    '''
    Manybooks uses a very strange opds feed. The opds main feed is
    structured like a stanza feed. The search result entries give very
    little information and requires you to go to a detail link. The
    detail link has the wrong type specified (text/html instead of
    application/atom+xml).
    '''
    if not hasattr(self, 'open_search_url'):
        return

    description = Description(self.open_search_url)
    url_template = description.get_best_template()
    if not url_template:
        return
    oquery = Query(url_template)
    oquery.searchTerms = query
    oquery.count = max_results
    search_url = oquery.url()

    remaining = max_results
    br = browser()
    with closing(br.open(search_url, timeout=timeout)) as f:
        raw_data = f.read().decode('utf-8', 'replace')
        feed = etree.fromstring(raw_data)
        for entry in feed.xpath('//*[local-name() = "entry"]'):
            if remaining <= 0:
                break
            remaining -= 1

            result = SearchResult()

            # Only entries carrying a text/html detail link are usable.
            detail_links = entry.xpath(
                './*[local-name() = "link" and @type = "text/html"]')
            if not detail_links:
                continue
            detail_href = detail_links[0].get('href')
            if not detail_href:
                continue
            result.detail_item = 'http://manybooks.net/titles/' + detail_href.split('tid=')[-1] + '.html'

            # These can have HTML inside of them. We are going to get them
            # again later just in case.
            result.title = ''.join(
                entry.xpath('./*[local-name() = "title"]//text()')).strip()
            result.author = ', '.join(
                entry.xpath('./*[local-name() = "author"]//text()')).strip()

            # Follow the detail link to get the rest of the info.
            with closing(br.open(detail_href, timeout=timeout / 4)) as df:
                detail_doc = etree.fromstring(df.read())
                detail_entries = detail_doc.xpath('//*[local-name() = "entry"][1]')
                if detail_entries:
                    detail = detail_entries[0]
                    # This is the real title and author info we want. We got
                    # it previously just in case it's not specified here for
                    # some reason.
                    result.title = ''.join(
                        detail.xpath('./*[local-name() = "title"]//text()')).strip()
                    result.author = ', '.join(
                        detail.xpath('./*[local-name() = "author"]//text()')).strip()
                    # Trim a single stray leading/trailing comma.
                    if result.author.startswith(','):
                        result.author = result.author[1:]
                    if result.author.endswith(','):
                        result.author = result.author[:-1]

                    result.cover_url = ''.join(
                        detail.xpath(
                            './*[local-name() = "link" and @rel = "http://opds-spec.org/thumbnail"][1]/@href'
                        )).strip()

                    for link in detail.xpath(
                            './*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'):
                        mimetype = link.get('type')
                        href = link.get('href')
                        if mimetype:
                            ext = mimetypes.guess_extension(mimetype)
                            if ext:
                                result.downloads[ext[1:].upper().strip()] = href

            result.price = '$0.00'
            result.drm = SearchResult.DRM_UNLOCKED
            result.formats = 'EPUB, PDB (eReader, PalmDoc, zTXT, Plucker, iSilo), FB2, ZIP, AZW, MOBI, PRC, LIT, PKG, PDF, TXT, RB, RTF, LRF, TCR, JAR'
            yield result
def search_flibusta(url, query, web_url, max_results=10, timeout=60):
    '''
    Search the Flibusta OPDS catalogue described at *url* for *query*,
    yielding up to *max_results* DRM-free SearchResult objects whose
    links are made absolute with *web_url*.
    '''
    description = Description(url)
    url_template = description.get_best_template()
    if not url_template:
        return
    oquery = Query(url_template)
    oquery.searchTerms = query
    oquery.count = max_results
    search_url = oquery.url()

    remaining = max_results
    br = browser()
    with closing(br.open(search_url, timeout=timeout)) as f:
        feed = etree.fromstring(f.read())
        for entry in feed.xpath('//*[local-name() = "entry"]'):
            if remaining <= 0:
                break
            remaining -= 1

            result = SearchResult()
            result.detail_item = ''.join(
                entry.xpath('./*[local-name() = "id"]/text()')).strip()

            for link in entry.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                mimetype = link.get('type')
                if not (rel and href and mimetype):
                    continue
                full_href = web_url + href
                if 'http://opds-spec.org/thumbnail' in rel:
                    result.cover_url = full_href
                elif 'http://opds-spec.org/image/thumbnail' in rel:
                    result.cover_url = full_href
                elif 'http://opds-spec.org/acquisition/buy' in rel:
                    result.detail_item = full_href
                elif 'http://opds-spec.org/acquisition/sample' in rel:
                    # Samples are deliberately ignored.
                    pass
                elif 'http://opds-spec.org/acquisition/open-access' in rel:
                    # Known feed mimetypes first; otherwise fall back to
                    # guessing the extension (with and without '+zip').
                    if 'application/fb2+zip' in mimetype:
                        result.downloads['FB2'] = full_href
                    elif 'application/txt+zip' in mimetype:
                        result.downloads['TXT'] = full_href
                    elif 'application/html+zip' in mimetype:
                        result.downloads['HTML'] = full_href
                    elif 'application/x-mobipocket-ebook' in mimetype:
                        result.downloads['MOBI'] = full_href
                    else:
                        ext = guess_extension(mimetype)
                        ext2 = guess_extension(mimetype.replace("+zip", ""))
                        if ext:
                            result.downloads[ext[1:].upper().strip()] = full_href
                        elif ext2:
                            result.downloads[ext2[1:].upper().strip()] = full_href

            result.formats = ', '.join(result.downloads.keys()).strip()
            result.title = ' '.join(
                entry.xpath('./*[local-name() = "title"]//text()')).strip()
            result.author = ', '.join(
                entry.xpath('./*[local-name() = "author"]//*[local-name() = "name"]//text()')).strip()
            result.price = '$0.00'
            result.drm = SearchResult.DRM_UNLOCKED
            yield result
def search(self, query, max_results=10, timeout=60):
    # Run an OpenSearch query against the store's OPDS feed and yield
    # SearchResult objects, following rel="next" pagination links until
    # max_results entries have been produced or the feed runs out.
    description = Description(self.open_search_url)
    url_template = description.get_best_template()
    if not url_template:
        return
    oquery = Query(url_template)
    # set up initial values
    oquery.searchTerms = query
    oquery.count = max_results
    url = oquery.url()
    counter = max_results
    br = self.create_browser()
    # Pagination loop: url is set to the next page inside the body, or
    # left as None when the feed has no further pages.
    while url != None and counter > 0:
        with closing(br.open(url, timeout=timeout)) as f:
            s = f.read()
            doc = etree.fromstring(s)
            url = None
            # Find a rel="next" Atom link; relative hrefs are resolved
            # against self.base_url.
            for link in doc.xpath('//*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                type = link.get('type')
                if rel and href and type:
                    if rel == 'next' and type == 'application/atom+xml':
                        if href[0] == "/":
                            href = self.base_url + href
                        url = href
            for data in doc.xpath('//*[local-name() = "entry"]'):
                if counter <= 0:
                    break
                counter -= 1
                # NOTE: the name `s` is reused here, shadowing the raw
                # feed bytes read above.
                s = SearchResult()
                s.detail_item = ''.join(data.xpath('./*[local-name() = "id"]/text()')).strip()
                drm = False
                for link in data.xpath('./*[local-name() = "link"]'):
                    rel = link.get('rel')
                    href = link.get('href')
                    type = link.get('type')
                    if rel and href and type:
                        if 'http://opds-spec.org/thumbnail' in rel:
                            s.cover_url = href
                        elif 'http://opds-spec.org/image/thumbnail' in rel:
                            s.cover_url = href
                        elif 'http://opds-spec.org/acquisition/buy' in rel:
                            s.detail_item = href
                        elif 'http://opds-spec.org/acquisition' in rel:
                            if type:
                                ext = guess_extension(type)
                                # fb2 is not in the mimetypes registry.
                                if type == 'application/fb2+xml':
                                    ext = '.fb2'
                                if ext:
                                    ext = ext[1:].upper().strip()
                                    if href[0] == "/":
                                        href = self.base_url + href
                                    s.downloads[ext] = href
                    # Any <encryption_method> child marks the entry as DRMed.
                    # NOTE(review): nesting reconstructed from collapsed
                    # source — confirm whether this sits inside the
                    # rel/href/type guard in the original file.
                    for enc in link.xpath('./*[local-name() = "encryption_method"]'):
                        drm = True
                s.formats = ', '.join(s.downloads.keys()).strip()
                s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
                s.author = ', '.join(data.xpath('./*[local-name() = "author"]//*[local-name() = "name"]//text()')).strip()
                s.drm = SearchResult.DRM_LOCKED if drm else SearchResult.DRM_UNLOCKED
                # Optional <price currencycode="..."> element.
                price_e = data.xpath('.//*[local-name() = "price"][1]')
                if price_e:
                    price_e = price_e[0]
                    currency_code = price_e.get('currencycode', '')
                    price = ''.join(price_e.xpath('.//text()')).strip()
                    s.price = currency_code + ' ' + price
                    s.price = s.price.strip()
                # Stash the cover URL aside; presumably the caller restores
                # it from cover_bak after further processing — TODO confirm.
                if s.cover_url:
                    s.cover_bak = s.cover_url
                    s.cover_url = None
                yield s