def search(self, query, max_results=10, timeout=60):
    """Search the Foyles ebook store; yields DRM-locked SearchResult objects."""
    url = 'http://ebooks.foyles.co.uk/catalog/search/?query=' + urllib2.quote(query)
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for item in doc.xpath('//div[@class="doc-item"]'):
            if remaining <= 0:
                break
            detail = ''.join(item.xpath('.//p[@class="doc-cover"]/a/@href')).strip()
            if not detail:
                continue
            detail = 'http://ebooks.foyles.co.uk' + detail
            cover = ''.join(item.xpath('.//p[@class="doc-cover"]/a/img/@src'))
            name = ''.join(item.xpath('.//span[@class="title"]/a/text()'))
            writer = ', '.join(item.xpath('.//span[@class="author"]/span[@class="author"]/text()'))
            cost = ''.join(item.xpath('.//span[@itemprop="price"]/text()')).strip()
            fmt = ''.join(item.xpath('.//p[@class="doc-meta-format"]/span[last()]/text()'))
            remaining -= 1
            res = SearchResult()
            res.cover_url = cover
            res.title = name.strip()
            res.author = writer.strip()
            res.price = cost
            res.detail_item = detail
            res.drm = SearchResult.DRM_LOCKED
            res.formats = fmt
            yield res
def _read_version_history_html(self, forum_link):
    # Fetch a MobileRead forum thread and pull the plugin's version history
    # out of the first "Spoiler" block whose heading mentions
    # "version history". Returns the spoiler's inner HTML, or None on any
    # download/parse failure.
    br = browser()
    br.set_handle_gzip(True)
    try:
        raw = br.open_novisit(forum_link).read()
        if not raw:
            return None
    except:
        traceback.print_exc()
        return None
    raw = raw.decode('utf-8', errors='replace')
    root = html.fromstring(raw)
    spoiler_nodes = root.xpath('//div[@class="smallfont" and strong="Spoiler"]')
    for spoiler_node in spoiler_nodes:
        try:
            if spoiler_node.getprevious() is None:
                # This is a spoiler node that has been indented using [INDENT]
                # Need to go up to parent div, then previous node to get header
                heading_node = spoiler_node.getparent().getprevious()
            else:
                # This is a spoiler node after a BR tag from the heading
                heading_node = spoiler_node.getprevious().getprevious()
            if heading_node is None:
                continue
            if heading_node.text_content().lower().find('version history') != -1:
                div_node = spoiler_node.xpath('div')[0]
                text = html.tostring(div_node, method='html', encoding='unicode')
                # Strip attributes from the wrapping <div> tags so only
                # clean markup is returned
                return re.sub(r'<div\s.*?>', '<div>', text)
        except:
            if DEBUG:
                prints('======= MobileRead Parse Error =======')
                traceback.print_exc()
                prints(html.tostring(spoiler_node))
    return None
def get_details(self, search_result, timeout):
    """Fill in price, DRM status and formats for an ebooks.com result.

    Returns True on success; returns None (leaving search_result
    untouched) when the detail_item URL carries no ``?IID=`` book id.
    """
    url = 'http://www.ebooks.com/ebooks/book_display.asp?IID='
    mo = re.search(r'\?IID=(?P<id>\d+)', search_result.detail_item)
    # Bug fix: previously `id` was only assigned inside `if mo:`, so a
    # non-matching detail_item raised UnboundLocalError on the next line.
    if mo is None:
        return None
    book_id = mo.group('id')
    if not book_id:
        return None
    price = _('Not Available')
    br = browser()
    with closing(br.open(url + book_id, timeout=timeout)) as nf:
        pdoc = html.fromstring(nf.read())
        price_l = pdoc.xpath('//span[@class="price"]/text()')
        if price_l:
            price = price_l[0]
        search_result.price = price.strip()
        # Assume DRM-free unless the permissions block reports something 'off'
        search_result.drm = SearchResult.DRM_UNLOCKED
        permissions = ' '.join(pdoc.xpath('//div[@class="permissions-items"]//text()'))
        if 'off' in permissions:
            search_result.drm = SearchResult.DRM_LOCKED
        # First span is a label; the rest are the available formats
        fdata = pdoc.xpath('//div[contains(@class, "more-links") and contains(@class, "more-links-info")]/div//span/text()')
        if len(fdata) > 1:
            search_result.formats = ', '.join(fdata[1:])
    return True
def search(self, query, max_results=10, timeout=60):
    """Search ebook.nl; yields up to max_results SearchResult objects."""
    url = ('http://www.ebook.nl/store/advanced_search_result.php?keywords=' +
           urllib2.quote(query))
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for node in doc.xpath('//div[@id="books"]/div[@itemtype="http://schema.org/Book"]'):
            if remaining <= 0:
                break
            detail = ''.join(node.xpath('./meta[@itemprop="url"]/@content')).strip()
            if not detail:
                continue
            cover = 'http://www.ebook.nl/store/' + ''.join(node.xpath('.//img[@itemprop="image"]/@src'))
            name = ''.join(node.xpath('./span[@itemprop="name"]/a/text()')).strip()
            writer = ''.join(node.xpath('./span[@itemprop="author"]/a/text()')).strip()
            # A lone space means no author was listed
            if writer == ' ':
                writer = ''
            cost = ''.join(node.xpath('.//span[@itemprop="price"]//text()'))
            remaining -= 1
            res = SearchResult()
            res.cover_url = cover
            res.title = name.strip()
            res.author = writer.strip()
            res.price = cost
            res.drm = SearchResult.DRM_UNKNOWN
            res.detail_item = detail
            yield res
def search(self, query, max_results=10, timeout=60):
    # Query the LitRes catalit XML API and yield one SearchResult per
    # fb2-book element, up to max_results.
    search_url = u'http://robot.litres.ru/pages/catalit_browser/?checkpoint=2000-01-02&'\
        'search=%s&limit=0,%s'
    search_url = search_url % (urllib2.quote(query), max_results)
    counter = max_results
    br = browser()
    # Ask for gzip; the response is explicitly decompressed below
    br.addheaders.append(['Accept-Encoding', 'gzip'])
    with closing(br.open(search_url, timeout=timeout)) as r:
        ungzipResponse(r, br)
        raw = xml_to_unicode(r.read(), strip_encoding_pats=True, assume_utf8=True)[0]
        # recover=True tolerates malformed XML from the server;
        # no_network=True prevents lxml from fetching external entities
        parser = etree.XMLParser(recover=True, no_network=True)
        doc = etree.fromstring(raw, parser=parser)
        for data in doc.xpath('//*[local-name() = "fb2-book"]'):
            if counter <= 0:
                break
            counter -= 1
            try:
                sRes = self.create_search_result(data)
            except Exception as e:
                # Skip unparseable entries instead of aborting the search
                prints('ERROR: cannot parse search result #%s: %s' % (max_results - counter + 1, e))
                continue
            yield sRes
def check_links():
    # Worker: drain the shared `items` queue of (url, locations) pairs,
    # fetch each URL and record failures in the closed-over `ans` list.
    # NOTE(review): relies on closure variables from the enclosing scope:
    # items, ans, check_anchors, downloaded_html_ids, get_html_ids,
    # XHTML_MIME, done, progress_callback, external_links — confirm all are
    # defined there.
    br = browser(honor_time=False, verify_ssl_certificates=False)
    while True:
        try:
            full_href, locations = items.get_nowait()
        except Empty:
            # Queue exhausted: this worker is finished
            return
        href, frag = full_href.partition('#')[::2]
        try:
            res = br.open(href, timeout=10)
        except Exception as e:
            # Record the failed link together with where it occurs
            ans.append((locations, e, full_href))
        else:
            if frag and check_anchors:
                ct = res.info().get('Content-Type')
                if ct and ct.split(';')[0].lower() in {'text/html', XHTML_MIME}:
                    # Cache anchor ids per page so each HTML document is
                    # parsed at most once across all fragments pointing at it
                    ids = downloaded_html_ids.get(href)
                    if ids is None:
                        try:
                            ids = downloaded_html_ids[href] = get_html_ids(res.read())
                        except Exception:
                            ids = downloaded_html_ids[href] = frozenset()
                    if frag not in ids:
                        ans.append((locations, ValueError('HTML anchor {} not found on the page'.format(frag)), full_href))
            res.close()
        finally:
            # Always tick progress, even when the fetch failed
            done.append(None)
            progress_callback(len(done), len(external_links))
def search(self, query, max_results=25, timeout=60):
    """Search ebookpoint.pl and yield DRM-free SearchResult objects."""
    encoded = quote_plus(query.decode('utf-8').encode('iso-8859-2'))
    url = ('http://ebookpoint.pl/search?qa=&szukaj=' + encoded +
           '&serwisyall=0&wprzyg=0&wsprzed=1&wyczerp=0&formaty=em-p')
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for entry in doc.xpath('//ul[@class="list"]/li'):
            if remaining <= 0:
                break
            detail = ''.join(entry.xpath('./a/@href'))
            if not detail:
                continue
            fmts = ', '.join(entry.xpath('.//ul[@class="book-type book-type-points"]//span[@class="popup"]/span/text()'))
            cover = ''.join(entry.xpath('.//p[@class="cover"]/img/@data-src'))
            name = ''.join(entry.xpath('.//div[@class="book-info"]/h3/a/text()'))
            writer = ''.join(entry.xpath('.//p[@class="author"]//text()'))
            cost = ''.join(entry.xpath('.//p[@class="price price-incart"]/a/ins/text()|.//p[@class="price price-add"]/a/text()'))
            remaining -= 1
            res = SearchResult()
            res.cover_url = cover
            res.title = name.strip()
            res.author = writer.strip()
            # Polish prices use a decimal comma
            res.price = re.sub(r'\.', ',', cost)
            res.detail_item = detail.strip()
            res.drm = SearchResult.DRM_UNLOCKED
            res.formats = fmts.upper()
            yield res
def search(self, query, max_results=10, timeout=60):
    """Search Google Books and yield SearchResult objects."""
    url = 'http://www.google.com/search?tbm=bks&q=' + urllib.quote_plus(query)
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for entry in doc.xpath('//ol[@id="rso"]/li'):
            if remaining <= 0:
                break
            link = ''.join(entry.xpath('.//h3/a/@href'))
            if not link:
                continue
            name = ''.join(entry.xpath('.//h3/a//text()'))
            people = entry.xpath('.//div[@class="f"]//a//text()')
            # Drop trailing action links ("Preview", "Read", ...) that share
            # the author container
            while people and people[-1].strip().lower() in ('preview', 'read', 'more editions'):
                people = people[:-1]
            if not people:
                continue
            remaining -= 1
            res = SearchResult()
            res.title = name.strip()
            res.author = ', '.join(people).strip()
            res.detail_item = link.strip()
            res.drm = SearchResult.DRM_UNKNOWN
            yield res
def search(self, query, max_results=10, timeout=60):
    """Search the Harlequin ebook store and yield SearchResult objects."""
    url = ('http://ebooks.eharlequin.com/BANGSearch.dll?Type=FullText&FullTextField=All&FullTextCriteria=' +
           urllib2.quote(query))
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for row in doc.xpath('//table[not(.//@class="sidelink")]/tr[.//ul[@id="details"]]'):
            if remaining <= 0:
                break
            rel_url = ''.join(row.xpath('.//ul[@id="details"]/li[@id="title-results"]/a/@href'))
            if not rel_url:
                continue
            name = ''.join(row.xpath('.//ul[@id="details"]/li[@id="title-results"]/a/text()'))
            writer = ''.join(row.xpath('.//ul[@id="details"]/li[@id="author"][1]//a/text()'))
            cost = ''.join(row.xpath('.//div[@class="ourprice"]/font/text()'))
            # The cover image is linked with the same relative href
            cover = ''.join(row.xpath('.//a[@href="%s"]/img/@src' % rel_url))
            remaining -= 1
            res = SearchResult()
            res.cover_url = cover
            res.title = name.strip()
            res.author = writer.strip()
            res.price = cost.strip()
            res.detail_item = 'http://ebooks.eharlequin.com/' + rel_url.strip()
            res.formats = 'EPUB'
            yield res
def search(self, query, max_results=10, timeout=60):
    """Search legimi.com and yield SearchResult objects."""
    url = "http://www.legimi.com/pl/ebooki/?szukaj=" + urllib.quote_plus(query)
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for entry in doc.xpath('//div[@id="listBooks"]/div'):
            if remaining <= 0:
                break
            link = "".join(entry.xpath('.//a[@class="plainLink"]/@href'))
            if not link:
                continue
            cover = "".join(entry.xpath(".//img[1]/@src"))
            name = "".join(entry.xpath('.//span[@class="bookListTitle ellipsis"]/text()'))
            writer = "".join(entry.xpath('.//span[@class="bookListAuthor ellipsis"]/text()'))
            cost = "".join(entry.xpath('.//div[@class="bookListPrice"]/span/text()'))
            remaining -= 1
            res = SearchResult()
            res.cover_url = "http://www.legimi.com/" + cover
            res.title = name.strip()
            res.author = writer.strip()
            res.price = cost
            res.detail_item = "http://www.legimi.com/" + link.strip()
            yield res
def search(self, query, max_results=10, timeout=60):
    """Search bookoteka.pl; all listed ebooks are DRM-free."""
    url = ('http://bookoteka.pl/list?search=' + urllib.quote_plus(query) +
           '&cat=1&hp=1&type=1')
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for entry in doc.xpath('//li[@class="EBOOK"]'):
            if remaining <= 0:
                break
            link = ''.join(entry.xpath('.//a[@class="item_link"]/@href'))
            if not link:
                continue
            cover = ''.join(entry.xpath('.//a[@class="item_link"]/img/@src'))
            name = ''.join(entry.xpath('.//div[@class="shelf_title"]/a/text()'))
            writer = ''.join(entry.xpath('.//div[@class="shelf_authors"][1]/text()'))
            # Polish prices use a decimal comma
            cost = ''.join(entry.xpath('.//span[@class="EBOOK"]/text()')).replace('.', ',')
            fmts = ', '.join(entry.xpath('.//a[@class="fancybox protected"]/text()'))
            remaining -= 1
            res = SearchResult()
            res.cover_url = 'http://bookoteka.pl' + cover
            res.title = name.strip()
            res.author = writer.strip()
            res.price = cost
            res.detail_item = 'http://bookoteka.pl' + link.strip()
            res.drm = SearchResult.DRM_UNLOCKED
            res.formats = fmts.strip()
            yield res
def search(self, query, max_results=10, timeout=60):
    # Scrape an Amazon Kindle search-results page. The query is converted
    # to the %XX-escaped ASCII form Amazon expects before being appended
    # to self.search_url.
    url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+')
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())  # .decode('latin-1', 'replace'))
        data_xpath = '//div[contains(@class, "prod")]'
        format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
        asin_xpath = '@name'
        cover_xpath = './/img[@class="productImage"]/@src'
        title_xpath = './/h3[@class="newaps"]/a//text()'
        author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]/text()'
        price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
        for data in doc.xpath(data_xpath):
            if counter <= 0:
                break
            # Even though we are searching digital-text only Amazon will still
            # put in results for non Kindle books (author pages). So we need
            # to explicitly check if the item is a Kindle book and ignore it
            # if it isn't.
            format_ = ''.join(data.xpath(format_xpath))
            if 'kindle' not in format_.lower():
                continue
            # We must have an asin otherwise we can't easily reference the
            # book later.
            asin = data.xpath(asin_xpath)
            if asin:
                asin = asin[0]
            else:
                continue
            cover_url = ''.join(data.xpath(cover_xpath))
            title = ''.join(data.xpath(title_xpath))
            author = ''.join(data.xpath(author_xpath))
            try:
                # Strip the locale-specific leading article (e.g. "by ")
                # and any trailing parenthetical from the author string
                if self.author_article:
                    author = author.split(self.author_article, 1)[1].split(" (")[0]
            except:
                pass
            price = ''.join(data.xpath(price_xpath))
            counter -= 1
            s = SearchResult()
            s.cover_url = cover_url.strip()
            s.title = title.strip()
            s.author = author.strip()
            s.price = price.strip()
            s.detail_item = asin.strip()
            s.drm = SearchResult.DRM_UNKNOWN
            s.formats = 'Kindle'
            yield s
def open_search(url, query, max_results=10, timeout=60):
    # Generic OPDS OpenSearch: fetch the OpenSearch description document at
    # `url`, fill in its best URL template with the query, and yield one
    # SearchResult per Atom entry in the response feed.
    description = Description(url)
    url_template = description.get_best_template()
    if not url_template:
        return
    oquery = Query(url_template)

    # set up initial values
    oquery.searchTerms = query
    oquery.count = max_results
    url = oquery.url()

    counter = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        doc = etree.fromstring(f.read())
        for data in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break
            counter -= 1
            s = SearchResult()
            s.detail_item = ''.join(data.xpath('./*[local-name() = "id"]/text()')).strip()
            # Dispatch on the OPDS link relation to find cover, buy page
            # and acquisition (download) links
            for link in data.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                type = link.get('type')  # NOTE: shadows the builtin `type`
                if rel and href and type:
                    if 'http://opds-spec.org/thumbnail' in rel:
                        s.cover_url = href
                    elif 'http://opds-spec.org/image/thumbnail' in rel:
                        s.cover_url = href
                    elif 'http://opds-spec.org/acquisition/buy' in rel:
                        s.detail_item = href
                    elif 'http://opds-spec.org/acquisition/sample' in rel:
                        pass
                    elif 'http://opds-spec.org/acquisition' in rel:
                        if type:
                            # Map the MIME type to an upper-case format
                            # name, e.g. 'application/epub+zip' -> 'EPUB'
                            ext = guess_extension(type)
                            if ext:
                                ext = ext[1:].upper().strip()
                                s.downloads[ext] = href
            s.formats = ', '.join(s.downloads.keys()).strip()
            s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
            s.author = ', '.join(data.xpath('./*[local-name() = "author"]//*[local-name() = "name"]//text()')).strip()
            price_e = data.xpath('.//*[local-name() = "price"][1]')
            if price_e:
                price_e = price_e[0]
                currency_code = price_e.get('currencycode', '')
                price = ''.join(price_e.xpath('.//text()')).strip()
                s.price = currency_code + ' ' + price
                s.price = s.price.strip()
            yield s
def read_available_plugins(raise_error=False):
    """Download the bz2-compressed JSON plugin index and return a list of
    DisplayPlugin objects sorted by name, or None if the download fails."""
    import json, bz2
    available = []
    br = browser()
    try:
        raw = br.open_novisit(INDEX_URL).read()
        if not raw:
            return
        raw = json.loads(bz2.decompress(raw))
    except:
        if raise_error:
            raise
        traceback.print_exc()
        return
    for plugin in raw.itervalues():
        try:
            dp = DisplayPlugin(plugin)
            get_installed_plugin_status(dp)
            available.append(dp)
        except:
            # A malformed entry should not abort the whole listing
            if DEBUG:
                prints('======= Plugin Parse Error =======')
                traceback.print_exc()
                import pprint
                pprint.pprint(plugin)
    return sorted(available, key=lambda k: k.name)
def _download_zip(self, plugin_zip_url):
    """Download the plugin zip to a persistent temp file; return its path."""
    from calibre.ptempfile import PersistentTemporaryFile
    ua = '%s %s' % (__appname__, __version__)
    br = browser(user_agent=ua)
    payload = br.open_novisit(plugin_zip_url).read()
    with PersistentTemporaryFile('.zip') as zf:
        zf.write(payload)
    return zf.name
def search(self, query, max_results=10, timeout=60):
    """Search ebookshoppe.com; results with no detectable author are skipped."""
    url = 'http://www.ebookshoppe.com/search.php?search_query=' + quote(query)
    br = browser()
    # The site refuses to serve results without a Referer header
    br.addheaders = [("Referer", "http://www.ebookshoppe.com/")]
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for entry in doc.xpath('//ul[@class="ProductList"]/li'):
            if remaining <= 0:
                break
            link = ''.join(entry.xpath('./div[@class="ProductDetails"]/'
                                       'strong/a/@href')).strip()
            if not link:
                continue
            cover = ''.join(entry.xpath('./div[@class="ProductImage"]/a/img/@src'))
            name = ''.join(entry.xpath('./div[@class="ProductDetails"]/strong/a/text()'))
            cost = ''.join(entry.xpath('./div[@class="ProductPriceRating"]/em/text()'))
            remaining -= 1
            res = SearchResult()
            res.cover_url = cover
            res.title = name.strip()
            res.price = cost
            res.drm = SearchResult.DRM_UNLOCKED
            res.detail_item = link
            # Author and formats need an extra request per result
            self.get_author_and_formats(res, timeout)
            if not res.author:
                continue
            yield res
def _download_zip(self, plugin_zip_url):
    """Download the plugin zip to a persistent temp file; return its path."""
    from calibre.ptempfile import PersistentTemporaryFile
    payload = browser().open_novisit(plugin_zip_url).read()
    with PersistentTemporaryFile('.zip') as zf:
        zf.write(payload)
    return zf.name
def get_download_filename(url, cookie_file=None):
    '''
    Get a local filename for a URL using the content disposition header.

    Returns an empty string if an error occurs.
    '''
    from calibre import browser
    from contextlib import closing

    filename = ''
    br = browser()
    if cookie_file:
        # Reuse saved cookies so authenticated downloads get the right name
        from mechanize import MozillaCookieJar
        cj = MozillaCookieJar()
        cj.load(cookie_file)
        br.set_cookiejar(cj)
    try:
        with closing(br.open(url)) as response:
            filename = get_download_filename_from_response(response)
    except:
        import traceback
        traceback.print_exc()
    return filename
def get_details(self, search_result, timeout):
    """Populate search_result.formats from the product's detail page."""
    br = browser()
    with closing(br.open(search_result.detail_item, timeout=timeout)) as page:
        tree = html.fromstring(page.read())
        fmt_list = tree.xpath('//div[@id="product-bonus"]/div/ul/li/text()')
        search_result.formats = ', '.join(fmt_list).upper()
    return True
def read_available_plugins(raise_error=False):
    # Scrape the MobileRead plugin index forum post and return a list of
    # DisplayPlugin objects sorted by name, or None if the download fails.
    display_plugins = []
    br = browser()
    br.set_handle_gzip(True)
    try:
        raw = br.open_novisit(MR_INDEX_URL).read()
        if not raw:
            return
    except:
        if raise_error:
            raise
        traceback.print_exc()
        return
    raw = raw.decode('utf-8', errors='replace')
    root = html.fromstring(raw)
    list_nodes = root.xpath('//div[@id="post_message_1362767"]/ul/li')
    # Add our deprecated plugins which are nested in a grey span
    list_nodes.extend(root.xpath('//div[@id="post_message_1362767"]/span/ul/li'))
    for list_node in list_nodes:
        try:
            display_plugin = DisplayPlugin(list_node)
            get_installed_plugin_status(display_plugin)
            display_plugins.append(display_plugin)
        except:
            # A malformed entry should not abort the whole listing
            if DEBUG:
                prints('======= MobileRead Parse Error =======')
                traceback.print_exc()
                prints(html.tostring(list_node))
    display_plugins = sorted(display_plugins, key=lambda k: k.name)
    return display_plugins
def _read_zip_attachment_url(self, forum_link):
    # Scan a MobileRead forum post for its first .zip attachment link and
    # return the absolute URL, or None on download/parse failure.
    br = browser()
    br.set_handle_gzip(True)
    try:
        raw = br.open_novisit(forum_link).read()
        if not raw:
            return None
    except:
        traceback.print_exc()
        return None
    raw = raw.decode('utf-8', errors='replace')
    root = html.fromstring(raw)
    attachment_nodes = root.xpath('//fieldset/table/tr/td/a')
    for attachment_node in attachment_nodes:
        try:
            filename = attachment_node.text_content().lower()
            if filename.find('.zip') != -1:
                # Attachment hrefs are relative to the forum root
                full_url = MR_URL + attachment_node.attrib['href']
                return full_url
        except:
            if DEBUG:
                prints('======= MobileRead Parse Error =======')
                traceback.print_exc()
                prints(html.tostring(attachment_node))
    return None
def search(self, query, max_results=10, timeout=60):
    """Search the Mills & Boon UK store and yield SearchResult objects.

    Bug fix: the search URL previously began its query string with a
    doubled '??', which made the first parameter key '?format' instead
    of 'format'.
    """
    base_url = 'https://www.millsandboon.co.uk'
    url = base_url + '/search.aspx?format=ebook&searchText=' + urllib2.quote(query)
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//article[contains(@class, "group")]'):
            if counter <= 0:
                break
            id_ = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/@href')).strip()
            if not id_:
                continue
            cover_url = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/img/@src'))
            title = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/img/@alt')).strip()
            author = ''.join(data.xpath('.//a[@class="author"]/text()'))
            price = ''.join(data.xpath('.//div[@class="type-wrapper"]/ul/li[child::span[text()="eBook"]]/a/text()'))
            format_ = ''.join(data.xpath('.//p[@class="doc-meta-format"]/span[last()]/text()'))
            # The store sells only DRM-protected ebooks
            drm = SearchResult.DRM_LOCKED
            counter -= 1
            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.detail_item = id_
            s.drm = drm
            s.formats = format_
            yield s
def get_details(self, search_result, timeout):
    """Fetch the detail page and fill in format list and DRM status.

    Bug fix: when both EPUB and PDF were available, the original code
    called ``search_result.formats.join(', PDF')`` — ``str.join`` returns
    a new string (which was discarded) and would have interleaved ', PDF'
    between characters anyway. PDF is now appended correctly.
    """
    # get format and DRM status
    from calibre import browser
    from contextlib import closing
    from lxml import html

    br = browser()
    with closing(br.open(search_result.detail_item, timeout=timeout)) as nf:
        idata = html.fromstring(nf.read())
        # Collect available formats from the page's format badges
        formats = []
        if idata.xpath('.//span[@class="format epub"]'):
            formats.append('EPUB')
        if idata.xpath('.//span[@class="format pdf"]'):
            formats.append('PDF')
        search_result.formats = ', '.join(formats)
        # A "nodrm" badge marks DRM-free titles; everything else is locked
        if idata.xpath('.//span[@class="format nodrm-icon"]'):
            search_result.drm = SearchResult.DRM_UNLOCKED
        else:
            search_result.drm = SearchResult.DRM_LOCKED
    return True
def search(self, query, max_results=10, timeout=60):
    """Search the Kobo store and yield SearchResult objects (EPUB)."""
    url = 'http://www.kobobooks.com/search/search.html?q=' + urllib.quote_plus(query)
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for entry in doc.xpath('//ul[contains(@class, "flowview-items")]/li'):
            if remaining <= 0:
                break
            link = ''.join(entry.xpath('./a[contains(@class, "block-link")]/@href'))
            if not link:
                continue
            link = link[1:]  # drop the leading '/'
            cost = ''.join(entry.xpath('.//a[contains(@class, "primary-button")]//text()'))
            # Image src is protocol-relative; prefix with http
            cover = 'http:%s' % ''.join(entry.xpath('.//img[1]/@src'))
            name = ''.join(entry.xpath('.//p[contains(@class, "flowview-item-title")]//text()'))
            remaining -= 1
            res = SearchResult()
            res.cover_url = cover
            res.title = name.strip()
            res.price = cost.strip()
            res.detail_item = 'http://store.kobobooks.com/' + link.strip()
            res.formats = 'EPUB'
            res.drm = SearchResult.DRM_UNKNOWN
            yield res
def search(self, query, max_results=10, timeout=60):
    """Search bubok.pt via its calibre reseller endpoint (all DRM-free)."""
    url = 'http://www.bubok.pt/resellers/calibre_search/' + urllib.quote_plus(query)
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for entry in doc.xpath('//div[contains(@class, "libro")]'):
            if remaining <= 0:
                break
            # The endpoint exposes each field as plain text in its own div
            link = ''.join(entry.xpath('.//div[@class="url"]/text()'))
            name = ''.join(entry.xpath('.//div[@class="titulo"]/text()'))
            writer = ''.join(entry.xpath('.//div[@class="autor"]/text()'))
            cost = ''.join(entry.xpath('.//div[@class="precio"]/text()'))
            fmts = ''.join(entry.xpath('.//div[@class="formatos"]/text()'))
            cover = ''.join(entry.xpath('.//div[@class="portada"]/text()'))
            remaining -= 1
            res = SearchResult()
            res.title = name.strip()
            res.author = writer.strip()
            res.detail_item = link.strip()
            res.price = cost.strip()
            res.drm = SearchResult.DRM_UNLOCKED
            res.formats = fmts.strip()
            res.cover_url = cover.strip()
            yield res
def search(self, query, max_results=20, timeout=60):
    """Search escapemagazine.pl (PDF-only, DRM-free)."""
    url = 'http://www.escapemagazine.pl/wyszukiwarka?query=' + urllib.quote_plus(query)
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for entry in doc.xpath('//div[@class="item item_short"]'):
            if remaining <= 0:
                break
            link = ''.join(entry.xpath('.//h2[@class="title"]/a[1]/@href'))
            if not link:
                continue
            name = ''.join(entry.xpath('.//h2[@class="title"]/a[1]/text()'))
            writer = ''.join(entry.xpath('.//div[@class="author"]/text()'))
            cost = ''.join(entry.xpath('.//span[@class="price_now"]/strong/text()')) + ' zł'
            cover = ''.join(entry.xpath('.//img[@class="cover"]/@src'))
            remaining -= 1
            res = SearchResult()
            res.cover_url = cover
            res.title = name.strip()
            res.author = writer.strip()
            res.price = cost
            res.detail_item = 'http://www.escapemagazine.pl' + link.strip()
            res.drm = SearchResult.DRM_UNLOCKED
            res.formats = 'PDF'
            yield res
def search(self, query, max_results=10, timeout=60):
    """Search beam-shop.de (DRM-free) and yield SearchResult objects."""
    url = 'https://www.beam-shop.de/search?saltFieldLimitation=all&sSearch=' + urllib2.quote(query)
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for box in doc.xpath('//div[contains(@class, "product--box")]'):
            if remaining <= 0:
                break
            detail = ''.join(box.xpath('./div/div[contains(@class, "product--info")]/a/@href')).strip()
            if not detail:
                continue
            # srcset lists several candidate images; take the first one
            cover = ''.join(box.xpath('./div/div[contains(@class, "product--info")]/a//img/@srcset'))
            if cover:
                cover = cover.split(',')[0].strip()
            writer = box.xpath('.//a[@class="product--author"]/text()')[0].strip()
            name = box.xpath('.//a[@class="product--title"]/text()')[0].strip()
            cost = box.xpath('.//div[@class="product--price"]/span/text()')[0].strip()
            remaining -= 1
            res = SearchResult()
            res.cover_url = cover
            res.title = name.strip()
            res.author = writer.strip()
            res.price = cost
            res.drm = SearchResult.DRM_UNLOCKED
            res.detail_item = detail
            # res.formats = None
            yield res
def search(self, query, max_results=10, timeout=60):
    """Search WHSmith's ebook department (DRM-protected ePub)."""
    url = ('http://www.whsmith.co.uk/search?keywordCategoryId=wc_dept_ebooks&results=60'
           '&page=1&keywords=' + urllib2.quote(query))
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for product in doc.xpath('//li[@class="product"]'):
            if remaining <= 0:
                break
            detail = ''.join(product.xpath('./a[@class="product_image_wrap"]/@href'))
            if not detail:
                continue
            detail = 'http://www.whsmith.co.uk' + detail
            cover = ''.join(product.xpath('.//img[@class="product_image"]/@src'))
            name = ''.join(product.xpath('.//h4[@class="product_title"]/text()'))
            writer = ', '.join(product.xpath('.//span[@class="product_second"]/text()'))
            cost = ''.join(product.xpath('.//span[@class="price"]/text()'))
            remaining -= 1
            res = SearchResult()
            res.cover_url = cover
            res.title = name.strip()
            res.author = writer.strip()
            res.price = cost
            res.drm = SearchResult.DRM_LOCKED
            res.detail_item = detail
            res.formats = 'ePub'
            yield res
def __init__(self, prefs, version = None, caller = None):
    # Initialize a BeamEbooksDownloader: configure a mechanize browser with
    # a combined Firefox + plugin user agent and a fixed temp download dir.
    # NOTE: Python 2 module (print statements).
    print "Initializing BeamEbooksDownloader()"
    print " myself: '%s'" % (self)
    self.prefs = prefs
    self.urlbase = prefs[prefs.URLBASE]
    if version is None:
        # Fall back to the plugin's own version tuple
        from calibre_plugins.beam_ebooks_downloader import Downloader
        version = Downloader.version
    self.caller = caller
    self.beamid = None
    self.successful_login = False
    self.already_visited_links = []
    self.downloadable_ebooks = []
    # TODO How do I access this string from the calibre core?
    USER_AGENT = 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101210 Gentoo Firefox/3.6.13'
    # version is expected to be a 3-element tuple matching %d.%d.%d
    user_agent = 'calibre-beam-ebooks-downloader-plugin/%d.%d.%d' % (version)
    user_agent = USER_AGENT + ' ' + user_agent
    self.browser = browser(user_agent=user_agent)
    # self.browser.set_debug_http(True)
    # self.browser.set_debug_responses(True)
    # self.tempdirpath = tempfile.mkdtemp(prefix = 'calibre-beam-ebooks-downloader-plugin-')
    # NOTE(review): fixed, non-random temp directory path — not safe
    # against concurrent users of the same machine; confirm intended.
    self.tempdirpath = tempfile.gettempdir() + '/' + 'calibre-beam-ebooks-downloader-plugin'
    print "Saving stuff into '%s'" % (self.tempdirpath)
def search(self, query, max_results=15, timeout=60):
    # Query the OZON web service. If the query looks like a numeric OZON
    # item ID, try the item-detail endpoint first, then fall back to the
    # regular full-text search.
    search_url = self.shop_url + '/webservice/webservice.asmx/SearchWebService?'\
        'searchText=%s&searchContext=ebook' % urllib2.quote(query)
    search_urls = [ search_url ]
    ## add this as the first try if it looks like an ozon ID
    if re.match("^\d{6,9}$", query):
        ozon_detail = self.shop_url + '/webservices/OzonWebSvc.asmx/ItemDetail?ID=%s' % query
        search_urls.insert(0, ozon_detail)
    # Extracts the whitespace-normalized text of a child element by local name
    xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'
    counter = max_results
    br = browser()
    for url in search_urls:
        with closing(br.open(url, timeout=timeout)) as f:
            raw = xml_to_unicode(f.read(), strip_encoding_pats=True, assume_utf8=True)[0]
            doc = etree.fromstring(raw)
            for data in doc.xpath('//*[local-name()="SearchItems" or local-name()="ItemDetail"]'):
                if counter <= 0:
                    break
                counter -= 1
                s = SearchResult()
                s.detail_item = data.xpath(xp_template.format('ID'))
                s.title = data.xpath(xp_template.format('Name'))
                s.author = data.xpath(xp_template.format('Author'))
                s.price = data.xpath(xp_template.format('Price'))
                s.cover_url = data.xpath(xp_template.format('Picture'))
                s.price = format_price_in_RUR(s.price)
                yield s
def search(self, query, max_results=10, timeout=60):
    '''
    XinXii's open search url is:
    http://www.xinxii.com/catalog-search/query/?keywords={searchTerms}&pw={startPage?}&doc_lang={docLang}&ff={docFormat},{docFormat},{docFormat}

    This url requires the docLang and docFormat. However, the search
    itself sent to XinXii does not require them. They can be ignored. We
    cannot push this into the standard OpenSearchOPDSStore search because
    of the required attributes.

    XinXii doesn't return all info supported by OpenSearchOPDSStore
    search function so this one is modified to remove parts that are not
    used.
    '''
    url = 'https://www.xinxii.com/catalog-search/query/?keywords=' + quote_plus(
        query)
    counter = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        doc = etree.fromstring(f.read())
        for data in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break
            counter -= 1
            s = SearchResult()
            s.detail_item = ''.join(
                data.xpath('./*[local-name() = "id"]/text()')).strip()
            # Pull cover and detail-page URLs from the entry's Atom links
            for link in data.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                type = link.get('type')  # NOTE: shadows the builtin `type`
                if rel and href and type:
                    if rel in ('http://opds-spec.org/thumbnail',
                               'http://opds-spec.org/image/thumbnail'):
                        s.cover_url = href
                    if rel == 'alternate':
                        s.detail_item = href
            # XinXii offers every title in both formats
            s.formats = 'EPUB, PDF'
            s.title = ' '.join(
                data.xpath('./*[local-name() = "title"]//text()')).strip()
            s.author = ', '.join(
                data.xpath(
                    './*[local-name() = "author"]//*[local-name() = "name"]//text()'
                )).strip()
            price_e = data.xpath('.//*[local-name() = "price"][1]')
            if price_e:
                price_e = price_e[0]
                currency_code = price_e.get('currencycode', '')
                price = ''.join(price_e.xpath('.//text()')).strip()
                s.price = currency_code + ' ' + price
                s.price = s.price.strip()
            yield s
def download_builtin_recipe(urn):
    """Fetch the raw source of a builtin recipe identified by its URN."""
    recipe_url = 'http://status.calibre-ebook.com/recipe/' + urn
    return browser().open_novisit(recipe_url).read()
def search_amazon(query, max_results=10, timeout=60, write_html_to=None,
                  base_url=SEARCH_BASE_URL, base_query=SEARCH_BASE_QUERY, field_keywords='field-keywords'):
    # Scrape an Amazon search-results page for Kindle editions, yielding
    # up to max_results SearchResult objects.
    uquery = base_query.copy()
    uquery[field_keywords] = query

    def asbytes(x):
        # urlencode on this code path expects byte strings
        if isinstance(x, type('')):
            x = x.encode('utf-8')
        return x
    uquery = {asbytes(k): asbytes(v) for k, v in uquery.items()}
    url = base_url + '?' + urllib.parse.urlencode(uquery).decode('ascii')
    br = browser(user_agent=get_user_agent())

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
        if write_html_to is not None:
            # Debug aid: dump the raw page for offline inspection
            with open(write_html_to, 'wb') as f:
                f.write(raw)
        doc = html.fromstring(raw)
        try:
            results = doc.xpath('//div[@id="atfResults" and @class]')[0]
        except IndexError:
            return

        # Only one result-page layout is recognized; pick its XPaths
        if 's-result-list-parent-container' in results.get('class', ''):
            data_xpath = "descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-result-item ')]"
            format_xpath = './/a[@title="Kindle Edition"]/@title'
            asin_xpath = '@data-asin'
            cover_xpath = "descendant-or-self::img[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-image ')]/@src"
            title_xpath = "descendant-or-self::h2[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-title ')]//text()"
            author_xpath = './/span[starts-with(text(), "by ")]/following-sibling::span//text()'
            price_xpath = 'descendant::span[contains(@class, "sx-price")]/../@aria-label'
        else:
            # Unknown layout: bail out rather than mis-parse
            return

        for data in doc.xpath(data_xpath):
            if counter <= 0:
                break

            # Even though we are searching digital-text only Amazon will still
            # put in results for non Kindle books (author pages). So we need
            # to explicitly check if the item is a Kindle book and ignore it
            # if it isn't.
            format = ''.join(data.xpath(format_xpath))
            if 'kindle' not in format.lower():
                continue

            # We must have an asin otherwise we can't easily reference the
            # book later.
            asin = data.xpath(asin_xpath)
            if asin:
                asin = asin[0]
            else:
                continue

            cover_url = ''.join(data.xpath(cover_xpath))

            title = ''.join(data.xpath(title_xpath))
            author = ''.join(data.xpath(author_xpath))
            try:
                # Drop the leading "by " and any trailing parenthetical
                author = author.split('by ', 1)[1].split(" (")[0]
            except:
                pass

            price = ''.join(data.xpath(price_xpath))

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url.strip()
            s.title = title.strip()
            s.author = author.strip()
            s.price = price.strip()
            s.detail_item = asin.strip()
            s.formats = 'Kindle'

            yield s
def search_amazon(query, max_results=10, timeout=60, write_html_to=None,
                  base_url=SEARCH_BASE_URL, base_query=SEARCH_BASE_QUERY, field_keywords='k'):
    """Scrape the newer Amazon search layout (data-asin result divs) and
    yield up to max_results Kindle SearchResult objects.

    Bug fix: the result loop never checked the remaining-results counter,
    so max_results was not honored; a break check has been added.
    """
    uquery = base_query.copy()
    uquery[field_keywords] = query

    def asbytes(x):
        # urlencode on this code path expects byte strings
        if isinstance(x, type('')):
            x = x.encode('utf-8')
        return x
    uquery = {asbytes(k): asbytes(v) for k, v in uquery.items()}
    url = base_url + '?' + urllib.urlencode(uquery).decode('ascii')
    br = browser(user_agent=get_user_agent())

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
        if write_html_to is not None:
            # Debug aid: dump the raw page for offline inspection
            with open(write_html_to, 'wb') as f:
                f.write(raw)
        doc = html.fromstring(raw)
        for result in doc.xpath(
                '//div[contains(@class, "s-result-list")]//div[@data-index and @data-asin]'):
            # Bug fix: honor max_results
            if counter <= 0:
                break
            kformat = ''.join(
                result.xpath(
                    './/a[contains(text(), "Kindle Edition")]//text()'))
            # Even though we are searching digital-text only Amazon will still
            # put in results for non Kindle books (author pages). So we need
            # to explicitly check if the item is a Kindle book and ignore it
            # if it isn't.
            if 'kindle' not in kformat.lower():
                continue
            # We must have an asin otherwise we can't easily reference the
            # book later.
            asin = result.get('data-asin')
            if not asin:
                continue
            cover_url = ''.join(result.xpath('.//img/@src'))
            title = etree.tostring(result.xpath('.//h5')[0],
                                   method='text', encoding='unicode')
            # The secondary line looks like "by Some Author | date": take
            # the words between the leading "by" and the "|" separator
            adiv = result.xpath(
                './/div[contains(@class, "a-color-secondary")]')[0]
            aparts = etree.tostring(adiv, method='text', encoding='unicode').split()
            idx = aparts.index('|')
            author = ' '.join(aparts[1:idx])
            price = ''.join(
                result.xpath(
                    './/span[contains(@class, "a-price")]/span[contains(@class, "a-offscreen")]/text()'
                ))
            counter -= 1
            s = SearchResult()
            s.cover_url = cover_url.strip()
            s.title = title.strip()
            s.author = author.strip()
            s.detail_item = asin.strip()
            s.price = price.strip()
            s.formats = 'Kindle'
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Query the Amazon EU storefront and yield up to max_results Kindle books.'''
    # Percent-encode non-ascii characters for Amazon's URL scheme.
    url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+')
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        allText = f.read()
        doc = html.fromstring(allText)  # .decode('latin-1', 'replace'))
        format_xpath2 = ''
        if doc.xpath('//div[@id = "atfResults" and contains(@class, "grid")]'):
            # print('grid form')
            data_xpath = '//div[contains(@class, "prod")]'
            format_xpath = (
                './/ul[contains(@class, "rsltGridList")]'
                '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()')
            asin_xpath = '@name'
            cover_xpath = './/img[contains(@class, "productImage")]/@src'
            title_xpath = './/h3[@class="newaps"]/a//text()'
            author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
            price_xpath = (
                './/ul[contains(@class, "rsltGridList")]'
                '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()')
        elif doc.xpath('//div[@id = "atfResults" and contains(@class, "ilresults")]'):
            # print('ilo form')
            data_xpath = '//li[(@class="ilo")]'
            format_xpath = (
                './/ul[contains(@class, "rsltGridList")]'
                '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()')
            asin_xpath = '@name'
            cover_xpath = './div[@class = "ilf"]/a/img[contains(@class, "ilo")]/@src'
            title_xpath = './/h3[@class="newaps"]/a//text()'
            author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
            # Results can be in a grid (table) or a column
            price_xpath = (
                './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]'
                '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()')
        elif doc.xpath('//div[@id = "atfResults" and contains(@class, "list")]'):
            # print('list form')
            data_xpath = '//li[@class="s-result-item"]'
            format_xpath = './/a[contains(@class, "a-size-small")]/text()'
            format_xpath2 = './/h3[contains(@class, "s-inline")]/text()'
            asin_xpath = '@data-asin'
            cover_xpath = './/img[contains(@class, "cfMarker")]/@src'
            title_xpath = './/h2[contains(@class, "s-access-title")]/text()'
            author_xpath = ('.//div[contains(@class, "a-fixed-left-grid-col")]'
                            '/div/div/span//text()')
            price_xpath = './/span[contains(@class, "s-price")]/text()'
        else:
            # URK -- whats this?
            print('unknown result table form for Amazon EU search')
            # with open("c:/amazon_search_results.html", "w") as out:
            #     out.write(allText)
            return
        for data in doc.xpath(data_xpath):
            if counter <= 0:
                break
            # Even though we are searching digital-text only Amazon will still
            # put in results for non Kindle books (authors pages). So we need
            # to explicitly check if the item is a Kindle book and ignore it
            # if it isn't.
            format_ = ''.join(data.xpath(format_xpath))
            if 'kindle' not in format_.lower():
                if format_xpath2:
                    # The list form sometimes carries the format elsewhere.
                    format_ = ''.join(data.xpath(format_xpath2))
                    if 'kindle' not in format_.lower():
                        # print(etree.tostring(data, pretty_print=True))
                        continue
            # We must have an asin otherwise we can't easily reference the
            # book later.
            asin_matches = data.xpath(asin_xpath)
            if not asin_matches:
                continue
            asin = asin_matches[0]
            cover_url = ''.join(data.xpath(cover_xpath))
            title = ''.join(data.xpath(title_xpath))
            authors = ''.join(data.xpath(author_xpath))
            # Strip a localized leading article and normalize the "and" word.
            authors = re.sub('^' + self.author_article, '', authors)
            authors = re.sub(self.and_word, ' & ', authors)
            # Drop a trailing parenthesized qualifier such as "(2 books)".
            mo = re.match(r'(.*)(\(\d.*)$', authors)
            if mo:
                authors = mo.group(1).strip()
            price = ''.join(data.xpath(price_xpath))
            counter -= 1
            s = SearchResult()
            s.cover_url = cover_url.strip()
            s.title = title.strip()
            s.author = authors.strip()
            s.price = price.strip()
            s.detail_item = asin.strip()
            s.drm = SearchResult.DRM_UNKNOWN
            s.formats = 'Kindle'
            yield s
def search(query, max_results=10, timeout=60, write_raw_to=None):
    '''Search Project Gutenberg's mobile OPDS feed and yield free books.'''
    url = 'http://m.gutenberg.org/ebooks/search.opds/?query=' + quote_plus(query)
    counter = max_results
    br = browser(user_agent='calibre/' + __version__)
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
        if write_raw_to is not None:
            # Debugging aid: dump the raw feed to disk.
            with open(write_raw_to, 'wb') as f:
                f.write(raw)
        doc = etree.fromstring(raw)
        for data in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break
            counter -= 1
            s = SearchResult()
            # We could use the <link rel="alternate" type="text/html" ...> tag
            # from the detail odps page but this is easier.
            entry_id = fix_url(''.join(
                data.xpath('./*[local-name() = "id"]/text()')).strip())
            # The numeric part of the entry id is the Gutenberg book number.
            s.detail_item = url_slash_cleaner(
                '%s/ebooks/%s' % (web_url, re.sub(r'[^\d]', '', entry_id)))
            s.title = ' '.join(
                data.xpath('./*[local-name() = "title"]//text()')).strip()
            s.author = ', '.join(
                data.xpath('./*[local-name() = "content"]//text()')).strip()
            if not s.title or not s.author:
                continue
            # Get the formats and direct download links from the detail feed.
            with closing(br.open(entry_id, timeout=timeout / 4)) as nf:
                ndoc = etree.fromstring(nf.read())
                for link in ndoc.xpath(
                    '//*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'
                ):
                    mt = link.get('type')
                    href = link.get('href')
                    if mt:
                        ext = mimetypes.guess_extension(mt)
                        if ext:
                            ext = ext[1:].upper().strip()
                            s.downloads[ext] = fix_url(href)
            s.formats = ', '.join(s.downloads.keys())
            if not s.formats:
                continue
            # The cover is shipped inline as a base64 data: URI.
            for link in data.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                mt = link.get('type')
                if rel and href and mt:
                    href = fix_url(href)
                    if rel in ('http://opds-spec.org/thumbnail',
                               'http://opds-spec.org/image/thumbnail'):
                        if href.startswith('data:image/png;base64,'):
                            cdata = href.replace('data:image/png;base64,', '')
                            if not isinstance(cdata, bytes):
                                cdata = cdata.encode('ascii')
                            s.cover_data = base64.b64decode(cdata)
            yield s
def create_browser(self):
    # Identify ourselves to the server with a calibre user agent string.
    from calibre import browser
    ua = '%s/%s' % (__appname__, __version__)
    return browser(user_agent=ua)
def search(self, query, max_results=10, timeout=60):
    '''Run an OpenSearch query against this store and yield SearchResults.'''
    if not hasattr(self, 'open_search_url'):
        return
    description = Description(self.open_search_url)
    url_template = description.get_best_template()
    if not url_template:
        return
    oquery = Query(url_template)
    # set up initial values
    oquery.searchTerms = query
    oquery.count = max_results
    url = oquery.url()
    counter = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        doc = etree.fromstring(f.read())
        for data in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break
            counter -= 1
            s = SearchResult()
            s.detail_item = ''.join(
                data.xpath('./*[local-name() = "id"]/text()')).strip()
            for link in data.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                mt = link.get('type')
                if not (rel and href and mt):
                    continue
                if ('http://opds-spec.org/thumbnail' in rel or
                        'http://opds-spec.org/image/thumbnail' in rel):
                    s.cover_url = href
                elif 'http://opds-spec.org/acquisition/buy' in rel:
                    s.detail_item = href
                elif 'http://opds-spec.org/acquisition' in rel:
                    if mt:
                        ext = guess_extension(mt)
                        if ext:
                            ext = ext[1:].upper().strip()
                            s.downloads[ext] = href
            s.formats = ', '.join(s.downloads.keys()).strip()
            s.title = ' '.join(
                data.xpath('./*[local-name() = "title"]//text()')).strip()
            s.author = ', '.join(
                data.xpath(
                    './*[local-name() = "author"]//*[local-name() = "name"]//text()'
                )).strip()
            price_e = data.xpath('.//*[local-name() = "price"][1]')
            if price_e:
                price_e = price_e[0]
                currency_code = price_e.get('currencycode', '')
                price = ''.join(price_e.xpath('.//text()')).strip()
                s.price = (currency_code + ' ' + price).strip()
            yield s
def search_amazon(
    query, max_results=10, timeout=60, write_html_to=None,
    search_url='http://www.amazon.com/s/?url=search-alias%3Ddigital-text&field-keywords='
):
    '''
    Scrape the (legacy) Amazon search results page for Kindle books.

    :param query: the search terms
    :param max_results: maximum number of results to yield
    :param timeout: network timeout in seconds
    :param write_html_to: optional path to dump the raw results page to
    :param search_url: the storefront endpoint to query
    '''
    url = search_url + query.encode('ascii', 'backslashreplace').replace(
        '%', '%25').replace('\\x', '%').replace(' ', '+')
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
    if write_html_to is not None:
        with open(write_html_to, 'wb') as f:
            f.write(raw)
    doc = html.fromstring(raw)
    # Amazon has served several result-page layouts over time; pick the
    # xpath set matching the layout we actually received.
    try:
        results = doc.xpath('//div[@id="atfResults" and @class]')[0]
    except IndexError:
        return
    if 's-result-list-parent-container' in results.get('class', ''):
        data_xpath = "descendant-or-self::li[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-result-item ')]"
        format_xpath = './/a[contains(text(), "Kindle Edition")]//text()'
        asin_xpath = '@data-asin'
        cover_xpath = "descendant-or-self::img[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-image ')]/@src"
        title_xpath = "descendant-or-self::h2[@class and contains(concat(' ', normalize-space(@class), ' '), ' s-access-title ')]//text()"
        author_xpath = './/span[starts-with(text(), "by ")]/following-sibling::span//text()'
        price_xpath = '(.//span[contains(@class, " s-price ")])[last()]//text()'
    elif 'grid' in results.get('class', ''):
        data_xpath = '//div[contains(@class, "prod")]'
        format_xpath = (
            './/ul[contains(@class, "rsltGridList")]'
            '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
        )
        asin_xpath = '@name'
        cover_xpath = './/img[contains(@class, "productImage")]/@src'
        title_xpath = './/h3[@class="newaps"]/a//text()'
        author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
        price_xpath = (
            './/ul[contains(@class, "rsltGridList")]'
            '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
        )
    elif 'ilresults' in results.get('class', ''):
        data_xpath = '//li[(@class="ilo")]'
        format_xpath = (
            './/ul[contains(@class, "rsltGridList")]'
            '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
        )
        asin_xpath = '@name'
        cover_xpath = './div[@class = "ilf"]/a/img[contains(@class, "ilo")]/@src'
        title_xpath = './/h3[@class="newaps"]/a//text()'
        author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
        # Results can be in a grid (table) or a column
        price_xpath = (
            './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]'
            '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
        )
    elif 'list' in results.get('class', ''):
        data_xpath = '//div[contains(@class, "prod")]'
        format_xpath = (
            './/ul[contains(@class, "rsltL")]'
            '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
        )
        asin_xpath = '@name'
        cover_xpath = './/img[contains(@class, "productImage")]/@src'
        title_xpath = './/h3[@class="newaps"]/a//text()'
        author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
        price_xpath = (
            './/ul[contains(@class, "rsltL")]'
            '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
        )
    else:
        # Unknown layout; nothing we can parse.
        return
    for data in doc.xpath(data_xpath):
        if counter <= 0:
            break
        # Even though we are searching digital-text only Amazon will still
        # put in results for non Kindle books (author pages). So we need
        # to explicitly check if the item is a Kindle book and ignore it
        # if it isn't.
        # (renamed from `format` so the builtin is not shadowed)
        format_ = ''.join(data.xpath(format_xpath))
        if 'kindle' not in format_.lower():
            continue
        # We must have an asin otherwise we can't easily reference the
        # book later.
        asin = data.xpath(asin_xpath)
        if asin:
            asin = asin[0]
        else:
            continue
        cover_url = ''.join(data.xpath(cover_xpath))
        title = ''.join(data.xpath(title_xpath))
        author = ''.join(data.xpath(author_xpath))
        try:
            author = author.split('by ', 1)[1].split(" (")[0]
        except IndexError:
            # BUGFIX: was a bare `except:` which would also swallow
            # KeyboardInterrupt etc.; only IndexError can occur here
            # (author text lacking the "by " prefix) — keep author unchanged.
            pass
        price = ''.join(data.xpath(price_xpath))
        counter -= 1
        s = SearchResult()
        s.cover_url = cover_url.strip()
        s.title = title.strip()
        s.author = author.strip()
        s.price = price.strip()
        s.detail_item = asin.strip()
        s.formats = 'Kindle'
        yield s
def load(self, url):
    '''
    For loading up a description object from a url. Normally you'll
    probably just want to pass a URL into the constructor.
    '''
    br = browser()
    with closing(br.open(url, timeout=15)) as f:
        doc = safe_xml_fromstring(f.read())

    # version 1.1 has repeating Url elements.
    self.urls = []
    for element in doc.xpath('//*[local-name() = "Url"]'):
        template = element.get('template')
        type = element.get('type')
        if template and type:
            url = URL()
            url.template = template
            url.type = type
            self.urls.append(url)

    # Stanza catalogs.
    for element in doc.xpath('//*[local-name() = "link"]'):
        if element.get('rel') != 'search':
            continue
        href = element.get('href')
        type = element.get('type')
        if href and type:
            url = URL()
            url.template = href
            url.type = type
            self.urls.append(url)

    # this is version 1.0 specific.
    self.url = ''
    if not self.urls:
        self.url = ''.join(
            doc.xpath('//*[local-name() = "Url"][1]//text()'))
    self.format = ''.join(
        doc.xpath('//*[local-name() = "Format"][1]//text()'))
    self.shortname = ''.join(
        doc.xpath('//*[local-name() = "ShortName"][1]//text()'))
    self.longname = ''.join(
        doc.xpath('//*[local-name() = "LongName"][1]//text()'))
    self.description = ''.join(
        doc.xpath('//*[local-name() = "Description"][1]//text()'))
    self.image = ''.join(
        doc.xpath('//*[local-name() = "Image"][1]//text()'))
    # NOTE: attribute name `sameplesearch` is a historic typo, kept for
    # backward compatibility with any external readers.
    self.sameplesearch = ''.join(
        doc.xpath('//*[local-name() = "SampleSearch"][1]//text()'))
    self.developer = ''.join(
        doc.xpath('//*[local-name() = "Developer"][1]//text()'))
    # BUGFIX: this query previously began with a single slash
    # ('/*[local-name() = "Contact"]'), which only matches a *root* element
    # named Contact and therefore never matched inside a real OpenSearch
    # description document. Use '//*' like every sibling query above.
    self.contact = ''.join(
        doc.xpath('//*[local-name() = "Contact"][1]//text()'))
    self.attribution = ''.join(
        doc.xpath('//*[local-name() = "Attribution"][1]//text()'))
    self.syndicationright = ''.join(
        doc.xpath('//*[local-name() = "SyndicationRight"][1]//text()'))
    tag_text = ' '.join(doc.xpath('//*[local-name() = "Tags"]//text()'))
    if tag_text is not None:  # NOTE(review): always true — ' '.join never returns None
        self.tags = tag_text.split(' ')
    self.adultcontent = doc.xpath(
        'boolean(//*[local-name() = "AdultContent" and contains(., "true")])'
    )
def search(self, query, max_results=10, timeout=60):
    '''Search empik.com for ebooks and yield SearchResults.'''
    url = ('http://www.empik.com/szukaj/produkt?c=ebooki-ebooki&q=' +
           urllib.quote(query) +
           '&qtype=basicForm&start=1&catalogType=pl&searchCategory=3501&format=epub&format=mobi&format=pdf&resultsPP=' +
           str(max_results))
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[@class="productsSet"]/div'):
            if counter <= 0:
                break
            id = ''.join(
                data.xpath('.//a[@class="productBox-450Title"]/@href'))
            if not id:
                continue
            cover_url = ''.join(
                data.xpath(
                    './/div[@class="productBox-450Pic"]/a/img/@data-original'
                ))
            title = ''.join(
                data.xpath('.//a[@class="productBox-450Title"]/text()'))
            # Strip the " (ebook)" suffix from titles.
            title = re.sub(r' \(ebook\)', '', title)
            author = ', '.join(
                data.xpath(
                    './/div[@class="productBox-450Author"]/a/text()'))
            price = ''.join(
                data.xpath('.//span[@class="currentPrice"]/text()'))
            formats = ''.join(
                data.xpath('.//div[@class="productBox-450Type"]/text()'))
            formats = re.sub(r'Ebook *,? *', '', formats)
            formats = re.sub(r'\(.*\)', '', formats)
            # The detail page can list additional formats; merge them in.
            with closing(
                    br.open('http://empik.com' + id.strip(),
                            timeout=timeout / 4)) as nf:
                idata = html.fromstring(nf.read())
                crawled = idata.xpath(
                    './/td[(@class="connectedInfo") or (@class="connectedInfo connectedBordered")]/a/text()'
                )
                formats_more = ','.join([
                    re.sub('ebook, ', '', x) for x in crawled if 'ebook' in x
                ])
                if formats_more:
                    formats += ', ' + formats_more
            # "ADE" in the type box marks Adobe DRM protection.
            drm = data.xpath(
                'boolean(.//div[@class="productBox-450Type" and contains(text(), "ADE")])'
            )
            counter -= 1
            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.detail_item = 'http://empik.com' + id.strip()
            s.formats = formats.upper().strip()
            s.drm = SearchResult.DRM_LOCKED if drm else SearchResult.DRM_UNLOCKED
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search e-knigi.net; queries must be Cyrillic, digits or whitespace.'''
    # check for cyrillic symbols before performing search
    uquery = str(query.strip(), 'utf-8')
    if not re.search('^[а-яА-Я\\d\\s]{2,}$', uquery):
        return
    base_url = 'http://e-knigi.net'
    url = (base_url +
           '/virtuemart?page=shop.browse&search_category=0&search_limiter=anywhere&keyword=' +
           urllib.parse.quote(query))
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        # if the store finds only one product, it opens directly detail view
        for data in doc.xpath('//div[@class="prod_details"]'):
            s = SearchResult()
            s.cover_url = ''.join(
                data.xpath(
                    './/div[@class="vm_main_info clearfix"]/div[@class="lf"]/a/img/@src'
                )).strip()
            s.title = ''.join(
                data.xpath(
                    './/div[@class="vm_main_info clearfix"]/div[@class="lf"]/a/img/@alt'
                )).strip()
            s.author = ''.join(
                data.xpath(
                    './/div[@class="td_bg clearfix"]/div[@class="gk_product_tab"]/div/table/tr[3]/td[2]/text()'
                )).strip()
            s.price = ''.join(
                data.xpath('.//span[@class="productPrice"]/text()')).strip()
            s.detail_item = url
            s.drm = SearchResult.DRM_UNLOCKED
            yield s
            # Single-product detail view: there is exactly one result.
            return
        # search in store results
        for data in doc.xpath('//div[@class="browseProductContainer"]'):
            if counter <= 0:
                break
            id = ''.join(data.xpath('.//a[1]/@href')).strip()
            if not id:
                continue
            title = ''.join(
                data.xpath('.//a[@class="gk_vm_product_image"]/img/@title')
            ).strip()
            author = ''.join(
                data.xpath('.//div[@style="float:left;width:90%"]/b/text()')
            ).strip().replace('Автор: ', '')
            # Keep only results whose title or author mention the query.
            if (title.lower().find(query.lower()) == -1 and
                    author.lower().find(query.lower()) == -1):
                continue
            counter -= 1
            s = SearchResult()
            s.cover_url = ''.join(
                data.xpath('.//a[@class="gk_vm_product_image"]/img/@src')
            ).strip()
            s.title = title
            s.author = author
            s.price = ''.join(
                data.xpath('.//span[@class="productPrice"]/text()')).strip()
            s.detail_item = base_url + id
            s.drm = SearchResult.DRM_UNLOCKED
            yield s
def search(self, query, max_results=20, timeout=60):
    '''
    Search publio.pl, paging through results until max_results is reached
    or there is no "next" link.
    '''
    br = browser()
    counter = max_results
    page = 1
    while counter:
        # BUGFIX: the query string previously contained '§ions=' — mojibake
        # of '&sections=' (the '&sect' was swallowed as the HTML entity
        # &sect;), so the section filters were never sent to the server.
        with closing(
                br.open(
                    'http://www.publio.pl/szukaj,strona' + str(page) +
                    '.html?q=' + urllib.quote(query) +
                    '&sections=EMAGAZINE&sections=MINIBOOK&sections=EBOOK',
                    timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//div[@class="item"]'):
                if counter <= 0:
                    break
                id = ''.join(data.xpath('.//div[@class="img"]/a/@href'))
                if not id:
                    continue
                cover_url = ''.join(
                    data.xpath(
                        './/div[@class="img"]/a/img/@data-original'))
                title = ''.join(
                    data.xpath('.//div[@class="img"]/a/@title'))
                title2 = ''.join(
                    data.xpath('.//div[@class="desc"]/h5//text()'))
                if title2:
                    title = title + '. ' + title2
                # Append the series name, when the last detail row is one.
                if (''.join(
                        data.xpath(
                            './div[@class="desc"]/div[@class="detailShortList"]/div[last()]/span/text()'
                        )).strip() == "Seria:"):
                    series = ''.join(
                        data.xpath(
                            './div[@class="desc"]/div[@class="detailShortList"]/div[last()]/a/@title'
                        ))
                    title = title + ' (seria ' + series + ')'
                author = ', '.join(
                    data.xpath(
                        './div[@class="desc"]/div[@class="detailShortList"]/div[@class="row"][1]/a/@title'
                    ))
                # Discounted price is in <ins>; fall back to the plain text.
                price = ''.join(
                    data.xpath(
                        './/div[@class="priceBox tk-museo-slab"]/ins/text()'
                    ))
                if not price:
                    price = ''.join(
                        data.xpath(
                            './/div[@class="priceBox tk-museo-slab"]/text()'
                        )).strip()
                formats = ', '.join([
                    x.strip() for x in data.xpath(
                        './/div[@class="formats"]/a/text()')
                ])
                counter -= 1
                s = SearchResult()
                s.cover_url = 'http://www.publio.pl' + cover_url
                s.title = title.strip()
                s.author = author
                s.price = price
                s.detail_item = 'http://www.publio.pl' + id.strip()
                s.drm = SearchResult.DRM_LOCKED if 'DRM' in formats else SearchResult.DRM_UNLOCKED
                s.formats = formats.replace(' DRM', '').strip()
                yield s
            if not doc.xpath('boolean(//a[@class="next"])'):
                break
            page += 1
def search_flibusta(url, query, web_url, max_results=10, timeout=60):
    '''Search a Flibusta OPDS catalog and yield free, DRM-less results.'''
    description = Description(url)
    url_template = description.get_best_template()
    if not url_template:
        return
    oquery = Query(url_template)
    # set up initial values
    oquery.searchTerms = query
    oquery.count = max_results
    url = oquery.url()
    counter = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        doc = etree.fromstring(f.read())
        for data in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break
            counter -= 1
            s = SearchResult()
            s.detail_item = ''.join(
                data.xpath('./*[local-name() = "id"]/text()')).strip()
            for link in data.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                mt = link.get('type')
                if not (rel and href and mt):
                    continue
                if ('http://opds-spec.org/thumbnail' in rel or
                        'http://opds-spec.org/image/thumbnail' in rel):
                    s.cover_url = web_url + href
                elif 'http://opds-spec.org/acquisition/buy' in rel:
                    s.detail_item = web_url + href
                elif 'http://opds-spec.org/acquisition/sample' in rel:
                    # Samples are deliberately ignored.
                    pass
                elif 'http://opds-spec.org/acquisition/open-access' in rel:
                    # Known mime types map directly to a format name;
                    # anything else goes through mimetypes guessing, with a
                    # second attempt after stripping a "+zip" suffix.
                    if 'application/fb2+zip' in mt:
                        s.downloads['FB2'] = web_url + href
                    elif 'application/txt+zip' in mt:
                        s.downloads['TXT'] = web_url + href
                    elif 'application/html+zip' in mt:
                        s.downloads['HTML'] = web_url + href
                    elif 'application/x-mobipocket-ebook' in mt:
                        s.downloads['MOBI'] = web_url + href
                    elif mt:
                        ext = guess_extension(mt)
                        ext2 = guess_extension(mt.replace("+zip", ""))
                        if ext:
                            ext = ext[1:].upper().strip()
                            s.downloads[ext] = web_url + href
                        elif ext2:
                            ext2 = ext2[1:].upper().strip()
                            s.downloads[ext2] = web_url + href
            s.formats = ', '.join(s.downloads.keys()).strip()
            s.title = ' '.join(
                data.xpath('./*[local-name() = "title"]//text()')).strip()
            s.author = ', '.join(
                data.xpath(
                    './*[local-name() = "author"]//*[local-name() = "name"]//text()'
                )).strip()
            s.price = '$0.00'
            s.drm = SearchResult.DRM_UNLOCKED
            yield s
def search_manybooks(query, max_results=10, timeout=60,
                     open_search_url='http://www.manybooks.net/opds/'):
    '''
    Manybooks uses a very strange opds feed. The opds main feed is
    structured like a stanza feed. The search result entries give very
    little information and requires you to go to a detail link. The detail
    link has the wrong type specified (text/html instead of
    application/atom+xml).
    '''
    description = Description(open_search_url)
    url_template = description.get_best_template()
    if not url_template:
        return
    oquery = Query(url_template)
    # set up initial values
    oquery.searchTerms = query
    oquery.count = max_results
    url = oquery.url()
    counter = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        raw_data = f.read().decode('utf-8', 'replace')
        doc = etree.fromstring(raw_data)
        for data in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break
            counter -= 1
            s = SearchResult()
            # Despite the text/html type, the detail link is an atom feed.
            detail_links = data.xpath(
                './*[local-name() = "link" and @type = "text/html"]')
            if not detail_links:
                continue
            detail_href = detail_links[0].get('href')
            if not detail_href:
                continue
            s.detail_item = ('http://manybooks.net/titles/' +
                             detail_href.split('tid=')[-1] + '.html')
            # These can have HTML inside of them. We are going to get them
            # again later just in case.
            s.title = ''.join(
                data.xpath('./*[local-name() = "title"]//text()')).strip()
            s.author = ', '.join(
                data.xpath('./*[local-name() = "author"]//text()')).strip()
            # Follow the detail link to get the rest of the info.
            with closing(br.open(detail_href, timeout=timeout / 4)) as df:
                ddoc = etree.fromstring(df.read())
                ddata = ddoc.xpath('//*[local-name() = "entry"][1]')
                if ddata:
                    ddata = ddata[0]
                    # This is the real title and author info we want. We got
                    # it previously just in case it's not specified here for
                    # some reason.
                    s.title = ''.join(
                        ddata.xpath(
                            './*[local-name() = "title"]//text()')).strip()
                    s.author = ', '.join(
                        ddata.xpath(
                            './*[local-name() = "author"]//text()')).strip()
                    if s.author.startswith(','):
                        s.author = s.author[1:]
                    if s.author.endswith(','):
                        s.author = s.author[:-1]
                    s.cover_url = ''.join(
                        ddata.xpath(
                            './*[local-name() = "link" and @rel = "http://opds-spec.org/thumbnail"][1]/@href'
                        )).strip()
                    for link in ddata.xpath(
                        './*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'
                    ):
                        mt = link.get('type')
                        href = link.get('href')
                        if mt:
                            ext = mimetypes.guess_extension(mt)
                            if ext:
                                ext = ext[1:].upper().strip()
                                s.downloads[ext] = href
            s.price = '$0.00'
            s.drm = SearchResult.DRM_UNLOCKED
            s.formats = 'EPUB, PDB (eReader, PalmDoc, zTXT, Plucker, iSilo), FB2, ZIP, AZW, MOBI, PRC, LIT, PKG, PDF, TXT, RB, RTF, LRF, TCR, JAR'
            yield s
def test():
    # Ad-hoc smoke test: resolve download URLs for a known book and print them.
    import pprint
    from calibre import browser
    br = browser()
    urls = get_urls(br, ['consider', 'phlebas', 'banks'])
    pprint.pprint(urls)
def search(query, max_results=10, timeout=60):
    '''Search woblink.com's ajax catalog endpoint and yield SearchResults.'''
    url = ('http://woblink.com/publication/ajax?mode=none&query=' +
           urllib.parse.quote_plus(query.encode('utf-8')))
    # The endpoint pages by 10; request a bigger page when more are wanted.
    if max_results > 10:
        url += '&limit=30' if max_results > 20 else '&limit=20'
    br = browser(user_agent='CalibreCrawler/1.0')
    br.set_handle_gzip(True)
    rq = Request(url, headers={
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'X-Requested-With': 'XMLHttpRequest',
        'Referrer': 'http://woblink.com/ebooki-kategorie',
        'Cache-Control': 'max-age=0',
    }, data=urllib.parse.urlencode({
        'nw_filtry_filtr_zakrescen_formularz[min]': '0',
        'nw_filtry_filtr_zakrescen_formularz[max]': '350',
    }))
    raw = br.open(rq).read()
    # The response is a bare HTML fragment; wrap it so lxml gets a document.
    doc = html.fromstring('<html><body>' + raw.decode('utf-8') + '</body></html>')
    counter = max_results
    for data in doc.xpath(
        '//div[@class="nw_katalog_lista_ksiazka ebook " or @class="nw_katalog_lista_ksiazka ebook promocja"]'
    ):
        if counter <= 0:
            break
        id = ''.join(
            data.xpath(
                './/div[@class="nw_katalog_lista_ksiazka_okladka nw_okladka"]/a[1]/@href'
            ))
        if not id:
            continue
        cover_url = ''.join(
            data.xpath(
                './/div[@class="nw_katalog_lista_ksiazka_okladka nw_okladka"]/a[1]/img/@src'
            ))
        title = ''.join(
            data.xpath(
                './/h3[@class="nw_katalog_lista_ksiazka_detale_tytul"]/a[1]/text()'
            ))
        author = ', '.join(
            data.xpath(
                './/p[@class="nw_katalog_lista_ksiazka_detale_autor"]/a/text()'
            ))
        price = ''.join(
            data.xpath('.//div[@class="nw_opcjezakupu_cena"]/span[2]/text()'))
        formats = ', '.join(
            data.xpath(
                './/p[@class="nw_katalog_lista_ksiazka_detale_format"]/span/text()'
            ))
        counter -= 1
        s = SearchResult()
        s.cover_url = 'http://woblink.com' + cover_url
        s.title = title.strip()
        s.author = author.strip()
        s.price = price + ' zł'
        s.detail_item = id.strip()
        s.formats = formats
        s.drm = SearchResult.DRM_LOCKED if 'DRM' in formats else SearchResult.DRM_UNLOCKED
        yield s
def __init__(self):
    # Each instance gets its own calibre browser for its HTTP traffic.
    from calibre import browser
    self.browser = browser()
def search(self, query, max_results=10, timeout=60):
    '''Search ebook.nl and yield SearchResults with format/DRM detection.'''
    url = ('http://www.ebook.nl/store/advanced_search_result.php?keywords=' +
           urllib2.quote(query))
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath(
            '//table[contains(@class, "productListing")]/tr'
        ):
            if counter <= 0:
                break
            details = data.xpath('./td/div[@class="prodImage"]/a')
            if not details:
                continue
            details = details[0]
            # The product id is the last path component, minus any query.
            id = ''.join(details.xpath('./@href')).strip()
            id = id[id.rfind('/') + 1:]
            i = id.rfind('?')
            if i > 0:
                id = id[:i]
            if not id:
                continue
            cover_url = 'http://www.ebook.nl/store/' + ''.join(
                details.xpath('./img/@src'))
            title = ''.join(details.xpath('./img/@title')).strip()
            author = ''.join(
                data.xpath(
                    './td/div[@class="prodTitle"]/h3/a/text()')).strip()
            price = ''.join(
                data.xpath('./td/div[@class="prodTitle"]/b/text()'))
            # Format and DRM are only given as Dutch text on the row.
            pdf = data.xpath(
                'boolean(./td/div[@class="prodTitle"]/'
                'p[contains(text(), "Bestandsformaat: Pdf")])')
            epub = data.xpath(
                'boolean(./td/div[@class="prodTitle"]/'
                'p[contains(text(), "Bestandsformaat: ePub")])')
            nodrm = data.xpath('boolean(./td/div[@class="prodTitle"]/'
                               'p[contains(text(), "zonder DRM") or'
                               ' contains(text(), "watermerk")])')
            counter -= 1
            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.drm = SearchResult.DRM_UNLOCKED if nodrm else SearchResult.DRM_LOCKED
            s.detail_item = id
            formats = []
            if epub:
                formats.append('ePub')
            if pdf:
                formats.append('PDF')
            s.formats = ','.join(formats)
            yield s
def search(self, query, max_results=10, timeout=60):
    '''
    Search this Amazon storefront and yield up to max_results Kindle books.
    Amazon has several result-page layouts (grid, ilo, list); the matching
    xpath set is chosen per response.
    '''
    url = self.search_url + query.encode(
        'ascii', 'backslashreplace').replace('%', '%25').replace(
            '\\x', '%').replace(' ', '+')
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        if doc.xpath(
                '//div[@id = "atfResults" and contains(@class, "grid")]'):
            data_xpath = '//div[contains(@class, "prod")]'
            format_xpath = (
                './/ul[contains(@class, "rsltGridList")]'
                '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
            )
            asin_xpath = '@name'
            cover_xpath = './/img[@class="productImage"]/@src'
            title_xpath = './/h3[@class="newaps"]/a//text()'
            author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
            price_xpath = (
                './/ul[contains(@class, "rsltGridList")]'
                '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
            )
        elif doc.xpath(
                '//div[@id = "atfResults" and contains(@class, "ilresults")]'
        ):
            data_xpath = '//li[(@class="ilo")]'
            format_xpath = (
                './/ul[contains(@class, "rsltGridList")]'
                '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
            )
            asin_xpath = '@name'
            cover_xpath = './div[@class = "ilf"]/a/img[contains(@class, "ilo")]/@src'
            title_xpath = './/h3[@class="newaps"]/a//text()'
            author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
            # Results can be in a grid (table) or a column
            price_xpath = (
                './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]'
                '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
            )
        elif doc.xpath(
                '//div[@id = "atfResults" and contains(@class, "list")]'):
            data_xpath = '//div[contains(@class, "prod")]'
            format_xpath = (
                './/ul[contains(@class, "rsltL")]'
                '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
            )
            asin_xpath = '@name'
            cover_xpath = './/img[@class="productImage"]/@src'
            title_xpath = './/h3[@class="newaps"]/a//text()'
            author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
            price_xpath = (
                './/ul[contains(@class, "rsltL")]'
                '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
            )
        else:
            # Unknown layout; nothing we can parse.
            return
        for data in doc.xpath(data_xpath):
            if counter <= 0:
                break
            # Even though we are searching digital-text only Amazon will
            # still put in results for non Kindle books (author pages). So
            # we need to explicitly check if the item is a Kindle book and
            # ignore it if it isn't.
            # (renamed from `format` so the builtin is not shadowed)
            format_ = ''.join(data.xpath(format_xpath))
            if 'kindle' not in format_.lower():
                continue
            # We must have an asin otherwise we can't easily reference the
            # book later.
            asin = data.xpath(asin_xpath)
            if asin:
                asin = asin[0]
            else:
                continue
            cover_url = ''.join(data.xpath(cover_xpath))
            title = ''.join(data.xpath(title_xpath))
            author = ''.join(data.xpath(author_xpath))
            try:
                author = author.split('by ', 1)[1].split(" (")[0]
            except IndexError:
                # BUGFIX: was a bare `except:`; only IndexError can occur
                # here (author text lacking the "by " prefix) — keep the
                # author text unchanged in that case.
                pass
            price = ''.join(data.xpath(price_xpath))
            counter -= 1
            s = SearchResult()
            s.cover_url = cover_url.strip()
            s.title = title.strip()
            s.author = author.strip()
            s.price = price.strip()
            s.detail_item = asin.strip()
            s.formats = 'Kindle'
            yield s
def search_kobo(query, max_results=10, timeout=60, write_html_to=None):
    '''
    Search kobobooks.com and yield a SearchResult per book.

    :param write_html_to: optional path to dump the raw results page to.
    '''
    from css_selectors import Select
    url = 'http://www.kobobooks.com/search/search.html?q=' + urllib.quote_plus(
        query)
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
        if write_html_to is not None:
            with open(write_html_to, 'wb') as f:
                f.write(raw)
        doc = html.fromstring(raw)
        select = Select(doc)
        for i, item in enumerate(select('.result-items .item-wrapper.book')):
            if i == max_results:
                break
            for img in select('.item-image img[src]', item):
                cover_url = img.get('src')
                if cover_url.startswith('//'):
                    cover_url = 'http:' + cover_url
                break
            else:
                cover_url = None
            # BUGFIX (3x below): etree.tostring was called with
            # encoding=unicode — the py2-only builtin, a NameError under
            # py3. The string 'unicode' requests the same text
            # serialisation and is what the rest of this file uses.
            for p in select('p.title', item):
                title = etree.tostring(p, method='text',
                                       encoding='unicode').strip()
                for a in select('a[href]', p):
                    url = 'http://store.kobobooks.com' + a.get('href')
                    break
                else:
                    url = None
                break
            else:
                title = None
            authors = []
            for a in select('p.author a.contributor', item):
                authors.append(
                    etree.tostring(a, method='text',
                                   encoding='unicode').strip())
            authors = authors_to_string(authors)
            for p in select('p.price', item):
                price = etree.tostring(p, method='text',
                                       encoding='unicode').strip()
                break
            else:
                price = None
            if title and authors and url:
                s = SearchResult()
                s.cover_url = cover_url
                s.title = title
                s.author = authors
                s.price = price
                s.detail_item = url
                s.formats = 'EPUB'
                s.drm = SearchResult.DRM_UNKNOWN
                yield s
def search(self, query, max_results=10, timeout=60):
    '''Search the Sony Reader Store (AU) and yield SearchResults.'''
    url = self.SEARCH_URL % urllib.quote_plus(query)
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for item in doc.xpath(
            '//div[@id="searchresult-list"]/descendant::div[contains(@class, "doc-item")]'
        ):
            if counter <= 0:
                break
            s = SearchResult()
            s.price = _('Not Available')
            p = ''.join(
                item.xpath(
                    'descendant::p[@class="doc-price"]/descendant::span[@itemprop="price"]/text()'
                )).strip()
            if p:
                s.price = 'AUD ' + p.split('$')[-1]
            # BUGFIX (3x below): etree.tostring was called with
            # encoding=unicode — the py2-only builtin, a NameError under
            # py3. The string 'unicode' requests the same text
            # serialisation and is what the rest of this file uses.
            title = item.xpath('descendant::h3[@class="doc-title"]')
            if not title:
                continue
            title = etree.tostring(title[0], method='text',
                                   encoding='unicode')
            if not title:
                continue
            st = item.xpath('descendant::p[@class="doc-subtitle"]')
            if st:
                st = etree.tostring(st[0], method='text',
                                    encoding='unicode')
                if st and st.strip():
                    title = title.strip() + ': ' + st
            s.title = title.strip()
            aut = item.xpath('descendant::p[@class="doc-author"]')
            if not aut:
                continue
            s.author = etree.tostring(aut[0], method='text',
                                      encoding='unicode').strip()
            if not s.author:
                continue
            du = ''.join(
                item.xpath(
                    'descendant::h3[position() = 1 and @class="doc-title"]/descendant::a[position() = 1 and @href]/@href'
                )).strip()
            if not du:
                continue
            detail_url = 'https://au.readerstore.sony.com' + du
            s.detail_item = detail_url
            counter -= 1
            cover_url = ''.join(
                item.xpath(
                    'descendant::p[@class="doc-cover" and position() = 1]/'
                    'descendant::img[position() = 1 and @src]/@src'))
            if cover_url:
                s.cover_url = url_slash_cleaner(cover_url)
            s.drm = SearchResult.DRM_UNKNOWN
            s.formats = 'Sony'
            yield s
def browser(self):
    '''Return a fresh clone of this plugin's lazily created browser.

    The underlying browser is constructed once (with this plugin's
    user agent, and gzip handling if supported) and cached; callers
    always receive a clone so they cannot mutate the shared instance.
    '''
    cached = self._browser
    if cached is None:
        cached = self._browser = browser(user_agent=self.user_agent)
        if self.supports_gzip_transfer_encoding:
            cached.set_handle_gzip(True)
    return cached.clone_browser()
def search(self, query, max_results=10, timeout=60):
    '''Search the Amazon Kindle store and yield up to ``max_results``
    SearchResult objects.'''
    # Build the query URL: backslash-escape non-ASCII, then percent-encode
    url = self.search_url + query.encode(
        'ascii', 'backslashreplace').replace('%', '%25').replace(
            '\\x', '%').replace(' ', '+')
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as response:
        doc = html.fromstring(response.read())
        # Results can be rendered as a grid (table) or a column, hence
        # the two class names in the format and price selectors.
        format_xpath = (
            './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]'
            '//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
        )
        price_xpath = (
            './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]'
            '//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
        )
        for node in doc.xpath('//div[contains(@class, "prod")]'):
            if remaining <= 0:
                break
            # Even though we search digital-text only, Amazon still mixes
            # in non-Kindle results (author pages); skip those explicitly.
            fmt = ''.join(node.xpath(format_xpath))
            if 'kindle' not in fmt.lower():
                continue
            # Without an ASIN the book cannot easily be referenced later.
            asins = node.xpath('@name')
            if not asins:
                continue
            asin = asins[0]
            cover_url = ''.join(node.xpath('.//img[@class="productImage"]/@src'))
            title = ''.join(node.xpath('.//h3[@class="newaps"]/a//text()'))
            authors = ''.join(node.xpath(
                './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'))
            authors = re.sub('^' + self.author_article, '', authors)
            authors = re.sub(self.and_word, ' & ', authors)
            # Strip a trailing parenthetical starting with a digit —
            # presumably series/edition info; verify against live pages.
            mo = re.match(r'(.*)(\(\d.*)$', authors)
            if mo:
                authors = mo.group(1).strip()
            price = ''.join(node.xpath(price_xpath))
            remaining -= 1
            result = SearchResult()
            result.cover_url = cover_url.strip()
            result.title = title.strip()
            result.author = authors.strip()
            result.price = price.strip()
            result.detail_item = asin.strip()
            result.drm = SearchResult.DRM_UNKNOWN
            result.formats = 'Kindle'
            yield result
def search(self, query, max_results=10, timeout=60):
    '''Search woblink.com and yield up to ``max_results`` SearchResult
    objects.

    Fixes over the previous version (behavior unchanged):
    - the ``counter -= 1; yield s`` tail was duplicated in both DRM
      branches; the DRM flag is now computed once and there is a single
      yield path
    - ``re.sub('\\.', ',', price)`` used a regex (with a non-raw escape)
      for a literal one-character substitution; ``str.replace`` is used
      instead
    - the local no longer shadows the builtin ``id``
    '''
    url = 'http://woblink.com/katalog-e-book?query=' + urllib.quote_plus(
        query.encode('utf-8'))
    # The site pages at 10 results by default; ask for a larger page when
    # more results were requested (30 is the largest limit used here).
    if max_results > 10:
        if max_results > 20:
            url += '&limit=30'
        else:
            url += '&limit=20'
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[@class="nw_katalog_lista_ksiazka"]'):
            if counter <= 0:
                break
            book_id = ''.join(
                data.xpath(
                    './/div[@class="nw_katalog_lista_ksiazka_okladka nw_okladka"]/a[1]/@href'
                ))
            if not book_id:
                continue
            cover_url = ''.join(
                data.xpath(
                    './/div[@class="nw_katalog_lista_ksiazka_okladka nw_okladka"]/a[1]/img/@src'
                ))
            title = ''.join(
                data.xpath(
                    './/h2[@class="nw_katalog_lista_ksiazka_detale_tytul"]/a[1]/text()'
                ))
            author = ', '.join(
                data.xpath(
                    './/h3[@class="nw_katalog_lista_ksiazka_detale_autor"]/a/text()'
                ))
            price = ''.join(
                data.xpath(
                    './/div[@class="nw_katalog_lista_ksiazka_opcjezakupu_cena"]/span/text()'
                ))
            # Polish prices use a decimal comma
            price = price.replace('.', ',')
            formats = ', '.join(
                data.xpath(
                    './/p[@class="nw_katalog_lista_ksiazka_detale_formaty"]/span/text()'
                ))
            counter -= 1
            s = SearchResult()
            s.cover_url = 'http://woblink.com' + cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price + ' zł'
            s.detail_item = book_id.strip()
            s.formats = formats
            # Only 'EPUB DRM' listings are locked; everything else is DRM-free
            if 'EPUB DRM' in formats:
                s.drm = SearchResult.DRM_LOCKED
            else:
                s.drm = SearchResult.DRM_UNLOCKED
            yield s
class ChitankaStore(BasicStoreConfig, StorePlugin):
    '''Store plugin for chitanka.info, a Bulgarian free e-book library.'''

    def open(self, parent=None, detail_item=None, external=False):
        '''Open the store (or a specific book's page when ``detail_item``
        is given), either in the system browser or in the embedded
        store dialog, depending on configuration.'''
        url = 'http://chitanka.info'
        if external or self.config.get('open_external', False):
            if detail_item:
                url = url + detail_item
            open_url(QUrl(url_slash_cleaner(url)))
        else:
            detail_url = None
            if detail_item:
                detail_url = url + detail_item
            d = WebStoreDialog(self.gui, url, parent, detail_url)
            d.setWindowTitle(self.name)
            d.set_tags(self.config.get('tags', ''))
            d.exec_()

    def search(self, query, max_results=10, timeout=60):
        '''Yield up to ``max_results`` SearchResult objects for ``query``.

        Runs in two phases: first a title search, then — if the result
        budget is not exhausted — a crawl of matching author pages.
        '''
        # check for cyrillic symbols before performing search: the site
        # only holds Bulgarian content, so bail out early otherwise
        uquery = unicode(query.strip(), 'utf-8')
        reObj = re.search(u'^[а-яА-Я\\d\\s]{3,}$', uquery)
        if not reObj:
            return
        base_url = 'http://chitanka.info'
        url = base_url + '/search?q=' + urllib2.quote(query)
        counter = max_results
        # search for book title
        br = browser()
        try:
            with closing(br.open(url, timeout=timeout)) as f:
                f = unicode(f.read(), 'utf-8')
                doc = html.fromstring(f)
                for data in doc.xpath('//ul[@class="superlist booklist"]/li'):
                    if counter <= 0:
                        break
                    id = ''.join(
                        data.xpath('.//a[@class="booklink"]/@href')).strip()
                    if not id:
                        continue
                    counter -= 1
                    s = SearchResult()
                    s.cover_url = ''.join(
                        data.xpath(
                            './/a[@class="booklink"]/img/@src')).strip()
                    s.title = ''.join(
                        data.xpath(
                            './/a[@class="booklink"]/i/text()')).strip()
                    s.author = ''.join(
                        data.xpath(
                            './/span[@class="bookauthor"]/a/text()')).strip()
                    s.detail_item = id
                    s.drm = SearchResult.DRM_UNLOCKED
                    # Download links come zipped; strip the .zip suffix to
                    # point at the bare format download
                    s.downloads['FB2'] = base_url + ''.join(
                        data.xpath('.//a[@class="dl dl-fb2"]/@href')).strip(
                        ).replace('.zip', '')
                    s.downloads['EPUB'] = base_url + ''.join(
                        data.xpath('.//a[@class="dl dl-epub"]/@href')).strip(
                        ).replace('.zip', '')
                    s.downloads['TXT'] = base_url + ''.join(
                        data.xpath('.//a[@class="dl dl-txt"]/@href')).strip(
                        ).replace('.zip', '')
                    s.formats = 'FB2, EPUB, TXT, SFB'
                    yield s
        except urllib2.HTTPError, e:  # Python 2 except syntax
            # A 404 simply means no results; anything else is a real error
            if e.code == 404:
                return
            else:
                raise

        # search for author names; ``doc`` is still the title-search page
        for data in doc.xpath('//ul[@class="superlist"][1]/li/dl/dt'):
            author_url = ''.join(
                data.xpath('.//a[contains(@href,"/person/")]/@href'))
            if author_url == '':
                continue
            if counter <= 0:
                break
            br2 = browser()
            with closing(br2.open(base_url + author_url,
                                  timeout=timeout)) as f:
                if counter <= 0:
                    break
                f = unicode(f.read(), 'utf-8')
                doc2 = html.fromstring(f)
                # search for book title on the author's page
                for data in doc2.xpath('//ul[@class="superlist booklist"]/li'):
                    if counter <= 0:
                        break
                    id = ''.join(
                        data.xpath('.//a[@class="booklink"]/@href')).strip()
                    if not id:
                        continue
                    title = ''.join(
                        data.xpath(
                            './/a[@class="booklink"]/i/text()')).strip()
                    author = ''.join(
                        data.xpath(
                            './/span[@class="bookauthor"]/a/text()')).strip()
                    # Keep only books whose title or author mentions the query
                    if title.lower().find(
                            query.lower()) == -1 and author.lower().find(
                            query.lower()) == -1:
                        continue
                    counter -= 1
                    s = SearchResult()
                    s.cover_url = ''.join(
                        data.xpath(
                            './/a[@class="booklink"]/img/@src')).strip()
                    s.title = title
                    s.author = author
                    s.detail_item = id
                    s.drm = SearchResult.DRM_UNLOCKED
                    s.downloads['FB2'] = base_url + ''.join(
                        data.xpath('.//a[@class="dl dl-fb2"]/@href')).strip(
                        ).replace('.zip', '')
                    s.downloads['EPUB'] = base_url + ''.join(
                        data.xpath('.//a[@class="dl dl-epub"]/@href')).strip(
                        ).replace('.zip', '')
                    s.downloads['TXT'] = base_url + ''.join(
                        data.xpath('.//a[@class="dl dl-txt"]/@href')).strip(
                        ).replace('.zip', '')
                    s.formats = 'FB2, EPUB, TXT, SFB'
                    yield s
# NOTE(review): the three defs below are methods of a browser class whose
# ``class`` header lies outside this chunk; ``B`` is presumably an alias for
# the mechanize base Browser class -- confirm against the full file.
def add_password(self, *args, **kwargs):
    # Delegate to the base class, recording the call so that
    # clone_browser() can replay it on the clone.
    B.add_password(self, *args, **kwargs)
    self._clone_actions['add_password'] = ('add_password', args, kwargs)

def add_proxy_password(self, *args, **kwargs):
    # Same record-and-replay pattern as add_password above.
    B.add_proxy_password(self, *args, **kwargs)
    self._clone_actions['add_proxy_password'] = ('add_proxy_password',
                                                 args, kwargs)

def clone_browser(self):
    '''Return a new browser of the same class, copying the SSL context
    and headers and replaying every recorded configuration call.'''
    clone = self.__class__()
    clone.https_handler.ssl_context = self.https_handler.ssl_context
    clone.addheaders = copy.deepcopy(self.addheaders)
    # Replay recorded configuration calls (method name, args, kwargs)
    # on the clone so it ends up configured identically.
    for func, args, kwargs in self._clone_actions.values():
        func = getattr(clone, func)
        func(*args, **kwargs)
    return clone

if __name__ == '__main__':
    # Smoke test: a clone must share handler keys, share the cookie jar
    # object itself, and carry equal headers.
    from calibre import browser
    from pprint import pprint
    orig = browser()
    clone = orig.clone_browser()
    pprint(orig._ua_handlers)
    pprint(clone._ua_handlers)
    assert orig._ua_handlers.keys() == clone._ua_handlers.keys()
    assert orig._ua_handlers['_cookies'].cookiejar is \
        clone._ua_handlers['_cookies'].cookiejar
    assert orig.addheaders == clone.addheaders
def search(self, query, max_results=10, timeout=60):
    '''Search the Sony US ebook store and yield up to ``max_results``
    SearchResult objects.'''
    url = 'http://ebookstore.sony.com/search?keyword=%s' % urllib.quote_plus(
        query)
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as response:
        doc = html.fromstring(response.read())
        for node in doc.xpath(
                '//div[contains(@class, "searchResult")]/'
                'descendant::li[contains(@class, "hreview")]'):
            if remaining <= 0:
                break
            currency = ''.join(node.xpath(
                'descendant::div[@class="pricing"]/p[@class="price money"]/descendant::*[@class="currency"]/@title'
            )).strip()
            amount = ''.join(node.xpath(
                'descendant::div[@class="pricing"]/p[@class="price money"]/descendant::*[@class="amount"]/text()'
            )).strip()
            result = SearchResult()
            # Price is only meaningful when both currency and amount parsed
            result.price = (currency + ' ' + amount) if (
                currency and amount) else _('Not Available')
            title_nodes = node.xpath('descendant::h3[@class="item"]')
            if not title_nodes:
                continue
            title = etree.tostring(title_nodes[0], method='text',
                                   encoding=unicode)
            if not title:
                continue
            result.title = title.strip()
            result.author = ''.join(node.xpath(
                'descendant::li[contains(@class, "author")]/'
                'a[@class="fn"]/text()')).strip()
            if not result.author:
                continue
            detail_url = ''.join(node.xpath(
                'descendant::h3[@class="item"]'
                '/descendant::a[@class="fn" and @href]/@href'))
            if not detail_url:
                continue
            if detail_url.startswith('/'):
                detail_url = 'http:' + detail_url
            result.detail_item = detail_url
            # Only count entries that survived every guard above
            remaining -= 1
            cover_url = ''.join(node.xpath(
                'descendant::li[@class="coverart"]/'
                'descendant::img[@src]/@src'))
            if cover_url:
                if cover_url.startswith('//'):
                    cover_url = 'http:' + cover_url
                elif cover_url.startswith('/'):
                    cover_url = 'http://ebookstore.sony.com' + cover_url
                result.cover_url = url_slash_cleaner(cover_url)
            result.drm = SearchResult.DRM_UNKNOWN
            result.formats = 'Sony'
            yield result
from .pylibgen import Library from contextlib import closing from PyQt5.Qt import QUrl from calibre import browser from calibre.gui2 import open_url from calibre.gui2.store import StorePlugin from calibre.gui2.store.search_result import SearchResult from calibre.gui2.store.web_store_dialog import WebStoreDialog lg = Library() br = browser() class LibGen_Store(StorePlugin): RES_THRESH = 5 url = 'http://gen.lib.rus.ec' def open(self, parent=None, detail_item=None, external=False): detail_url = None if detail_item: detail_url = self.get_cover_page(detail_item) if external or self.config.get('open_external', False):
def search(self, query, max_results=10, timeout=60):
    '''Search ebook.de and yield up to ``max_results`` SearchResult
    objects.'''
    url = ('http://www.ebook.de/de/pathSearch?nav=52122&searchString=' +
           urllib2.quote(query))
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as response:
        doc = html.fromstring(response.read())
        for container in doc.xpath(
                '//div[contains(@class, "articlecontainer")]'):
            if remaining <= 0:
                break
            info = container.xpath('./div[contains(@class, "articleinfobox")]')
            if not info:
                continue
            info = info[0]
            id_ = ''.join(info.xpath('./a/@name')).strip()
            if not id_:
                continue
            title = ''.join(
                info.xpath('./h3[@class="title"]/a/text()')).strip()
            author = ''.join(
                info.xpath('.//div[@class="author"]/text()')).strip()
            # Author strings come as "von <name>" (German for "by")
            if author.startswith('von'):
                author = author[4:]
            has_pdf = info.xpath(
                'boolean(.//span[@class="bindername" and contains(text(), "pdf")]/text())'
            )
            has_epub = info.xpath(
                'boolean(.//span[@class="bindername" and contains(text(), "epub")]/text())'
            )
            has_mobi = info.xpath(
                'boolean(.//span[@class="bindername" and contains(text(), "mobipocket")]/text())'
            )
            cover_url = ''.join(
                container.xpath('.//div[@class="coverimg"]/a/img/@src'))
            # Drop the footnote asterisk the site appends to prices
            price = ''.join(
                container.xpath('.//div[@class="preis"]/text()')).replace(
                    '*', '').strip()
            remaining -= 1
            result = SearchResult()
            result.cover_url = cover_url
            result.title = title.strip()
            result.author = author.strip()
            result.price = price
            result.drm = SearchResult.DRM_UNKNOWN
            result.detail_item = id_
            available = []
            if has_epub:
                available.append('ePub')
            if has_pdf:
                available.append('PDF')
            if has_mobi:
                available.append('MOBI')
            result.formats = ', '.join(available)
            yield result