def open_search(url, query, max_results=10, timeout=60):
    """Generic OPDS open-search: fetch the description document at *url*,
    fill in its best URL template with *query*, and yield SearchResult
    objects parsed from the returned Atom feed.

    :param url:         location of the OpenSearch description document
    :param query:       search terms
    :param max_results: maximum number of results to yield
    :param timeout:     network timeout in seconds
    """
    description = Description(url)
    url_template = description.get_best_template()
    if not url_template:
        return
    oquery = Query(url_template)

    # set up initial values
    oquery.searchTerms = query
    oquery.count = max_results
    url = oquery.url()

    counter = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        doc = etree.fromstring(f.read())
        for data in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break
            counter -= 1

            s = SearchResult()
            s.detail_item = ''.join(data.xpath('./*[local-name() = "id"]/text()')).strip()

            for link in data.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                # Renamed from `type`, which shadowed the builtin.
                mime_type = link.get('type')

                if rel and href and mime_type:
                    # The two thumbnail relations are handled identically.
                    if ('http://opds-spec.org/thumbnail' in rel or
                            'http://opds-spec.org/image/thumbnail' in rel):
                        s.cover_url = href
                    elif 'http://opds-spec.org/acquisition/buy' in rel:
                        s.detail_item = href
                    elif 'http://opds-spec.org/acquisition/sample' in rel:
                        pass
                    elif 'http://opds-spec.org/acquisition' in rel:
                        # The redundant inner `if type:` check was removed —
                        # mime_type is already guaranteed truthy here.
                        ext = guess_extension(mime_type)
                        if ext:
                            ext = ext[1:].upper().strip()
                            s.downloads[ext] = href
            s.formats = ', '.join(s.downloads.keys()).strip()

            s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
            s.author = ', '.join(data.xpath('./*[local-name() = "author"]//*[local-name() = "name"]//text()')).strip()

            price_e = data.xpath('.//*[local-name() = "price"][1]')
            if price_e:
                price_e = price_e[0]
                currency_code = price_e.get('currencycode', '')
                price = ''.join(price_e.xpath('.//text()')).strip()
                s.price = currency_code + ' ' + price
                s.price = s.price.strip()

            yield s
def search(self, query, max_results=10, timeout=60):
    '''
    XinXii's open search url is:
    http://www.xinxii.com/catalog-search/query/?keywords={searchTerms}&pw={startPage?}&doc_lang={docLang}&ff={docFormat},{docFormat},{docFormat}

    This url requires the docLang and docFormat. However, the search
    itself sent to XinXii does not require them. They can be ignored.
    We cannot push this into the standard OpenSearchOPDSStore search
    because of the required attributes.

    XinXii doesn't return all info supported by the OpenSearchOPDSStore
    search function, so this one is modified to remove parts that are
    not used.
    '''
    url = 'http://www.xinxii.com/catalog-search/query/?keywords=' + urllib.quote_plus(query)

    counter = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        doc = etree.fromstring(f.read())
        for data in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break
            counter -= 1

            s = SearchResult()

            s.detail_item = ''.join(data.xpath('./*[local-name() = "id"]/text()')).strip()

            for link in data.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                type = link.get('type')

                if rel and href and type:
                    if rel in ('http://opds-spec.org/thumbnail',
                               'http://opds-spec.org/image/thumbnail'):
                        s.cover_url = href
                    if rel == 'alternate':
                        # The alternate link is the human-readable detail page.
                        s.detail_item = href

            # XinXii always offers these two formats.
            s.formats = 'EPUB, PDF'

            s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
            s.author = ', '.join(data.xpath('./*[local-name() = "author"]//*[local-name() = "name"]//text()')).strip()

            price_e = data.xpath('.//*[local-name() = "price"][1]')
            if price_e:
                price_e = price_e[0]
                currency_code = price_e.get('currencycode', '')
                price = ''.join(price_e.xpath('.//text()')).strip()
                s.price = currency_code + ' ' + price
                s.price = s.price.strip()

            yield s
def search(self, query, max_results=10, timeout=60):
    """Search e-knigi.net; only Cyrillic queries (2+ chars) are accepted."""
    # check for cyrillic symbols before performing search
    uquery = unicode(query.strip(), 'utf-8')
    reObj = re.search(u'^[а-яА-Я\\d\\s]{2,}$', uquery)
    if not reObj:
        return

    base_url = 'http://e-knigi.net'
    url = base_url + '/virtuemart?page=shop.browse&search_category=0&search_limiter=anywhere&keyword=' + urllib2.quote(query)

    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())

        # if the store finds only one product, it opens directly detail view
        for data in doc.xpath('//div[@class="prod_details"]'):
            s = SearchResult()
            s.cover_url = ''.join(data.xpath('.//div[@class="vm_main_info clearfix"]/div[@class="lf"]/a/img/@src')).strip()
            s.title = ''.join(data.xpath('.//div[@class="vm_main_info clearfix"]/div[@class="lf"]/a/img/@alt')).strip()
            s.author = ''.join(data.xpath('.//div[@class="td_bg clearfix"]/div[@class="gk_product_tab"]/div/table/tr[3]/td[2]/text()')).strip()
            s.price = ''.join(data.xpath('.//span[@class="productPrice"]/text()')).strip()
            s.detail_item = url
            s.drm = SearchResult.DRM_UNLOCKED

            yield s
            return

        # search in store results
        for data in doc.xpath('//div[@class="browseProductContainer"]'):
            if counter <= 0:
                break
            id = ''.join(data.xpath('.//a[1]/@href')).strip()
            if not id:
                continue

            title = ''.join(data.xpath('.//a[@class="gk_vm_product_image"]/img/@title')).strip()
            author = ''.join(data.xpath('.//div[@style="float:left;width:90%"]/b/text()')).strip().replace('Автор: ', '')

            # NOTE(review): the store's own matching appears loose, so results
            # are filtered client-side on title/author containing the query.
            if title.lower().find(query.lower()) == -1 and author.lower().find(query.lower()) == -1:
                continue

            counter -= 1

            s = SearchResult()
            s.cover_url = ''.join(data.xpath('.//a[@class="gk_vm_product_image"]/img/@src')).strip()
            s.title = title
            s.author = author
            s.price = ''.join(data.xpath('.//span[@class="productPrice"]/text()')).strip()
            s.detail_item = base_url + id
            s.drm = SearchResult.DRM_UNLOCKED

            yield s
def search(self, query, max_results=10, timeout=60):
    """Search Google Books and yield SearchResult objects (DRM unknown)."""
    url = 'http://www.google.com/search?tbm=bks&q=' + urllib.quote_plus(query)

    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for entry in doc.xpath('//ol[@id="rso"]/li'):
            if remaining <= 0:
                break
            detail_url = ''.join(entry.xpath('.//h3/a/@href'))
            if not detail_url:
                continue
            title = ''.join(entry.xpath('.//h3/a//text()'))
            names = entry.xpath('.//div[@class="f"]//a//text()')
            # Trailing links such as "Preview" are not author names.
            while names and names[-1].strip().lower() in ('preview', 'read', 'more editions'):
                names.pop()
            if not names:
                continue
            remaining -= 1

            result = SearchResult()
            result.title = title.strip()
            result.author = ', '.join(names).strip()
            result.detail_item = detail_url.strip()
            result.drm = SearchResult.DRM_UNKNOWN
            yield result
def search(self, query, max_results=10, timeout=60):
    """Search bewrite.net; results are DRM-free but lack price/cover info."""
    url = ('http://www.bewrite.net/mm5/merchant.mvc?'
           'Search_Code=B&Screen=SRCH&Search=' + urllib2.quote(query))

    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        # Skip the header row of the results table.
        for row in doc.xpath('//div[@id="content"]//table/tr[position() > 1]'):
            if remaining <= 0:
                break
            detail_url = ''.join(row.xpath('.//a/@href'))
            if not detail_url:
                continue
            # Headings look like "<title>by <author>".
            heading = ''.join(row.xpath('./td[2]//text()'))
            title, _, author = heading.partition('by ')
            remaining -= 1

            result = SearchResult()
            result.cover_url = ''
            result.title = title.strip()
            result.author = author.strip()
            result.price = ''
            result.detail_item = detail_url.strip()
            result.drm = SearchResult.DRM_UNLOCKED
            yield result
def search(self, query, max_results=10, timeout=60):
    """Search the Harlequin ebook store (EPUB-only results)."""
    url = 'http://ebooks.eharlequin.com/BANGSearch.dll?Type=FullText&FullTextField=All&FullTextCriteria=' + urllib2.quote(query)

    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//table[not(.//@class="sidelink")]/tr[.//ul[@id="details"]]'):
            if counter <= 0:
                break

            id = ''.join(data.xpath('.//ul[@id="details"]/li[@id="title-results"]/a/@href'))
            if not id:
                continue

            title = ''.join(data.xpath('.//ul[@id="details"]/li[@id="title-results"]/a/text()'))
            author = ''.join(data.xpath('.//ul[@id="details"]/li[@id="author"][1]//a/text()'))
            price = ''.join(data.xpath('.//div[@class="ourprice"]/font/text()'))
            # The cover image sits inside the anchor that shares the detail href.
            cover_url = ''.join(data.xpath('.//a[@href="%s"]/img/@src' % id))

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price.strip()
            s.detail_item = 'http://ebooks.eharlequin.com/' + id.strip()
            s.formats = 'EPUB'

            yield s
def search(self, query, max_results=25, timeout=60):
    """Search ebookpoint.pl for DRM-free ebooks."""
    # The site expects ISO-8859-2 (Latin-2) encoded query text.
    url = 'http://ebookpoint.pl/search?qa=&szukaj=' + quote_plus(
        query.decode('utf-8').encode('iso-8859-2')) + '&serwisyall=0&wprzyg=0&wsprzed=1&wyczerp=0&formaty=em-p'

    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//ul[@class="list"]/li'):
            if counter <= 0:
                break
            id = ''.join(data.xpath('./a/@href'))
            if not id:
                continue

            formats = ', '.join(data.xpath('.//ul[@class="book-type book-type-points"]//span[@class="popup"]/span/text()'))
            cover_url = ''.join(data.xpath('.//p[@class="cover"]/img/@data-src'))
            title = ''.join(data.xpath('.//div[@class="book-info"]/h3/a/text()'))
            author = ''.join(data.xpath('.//p[@class="author"]//text()'))
            # Price appears in one of two places depending on cart state.
            price = ''.join(data.xpath('.//p[@class="price price-incart"]/a/ins/text()|.//p[@class="price price-add"]/a/text()'))

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            # Polish prices use a comma as the decimal separator.
            s.price = re.sub(r'\.', ',', price)
            s.detail_item = id.strip()
            s.drm = SearchResult.DRM_UNLOCKED
            s.formats = formats.upper()

            yield s
def search(self, query, max_results=20, timeout=60):
    """Search publio.pl, following pagination until max_results are
    collected or there is no "next" page link."""
    br = browser()

    counter = max_results
    page = 1
    while counter:
        with closing(
            br.open(
                "http://www.publio.pl/e-booki,strona" + str(page) + ".html?q=" + urllib.quote(query),
                timeout=timeout,
            )
        ) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//div[@class="item"]'):
                if counter <= 0:
                    break
                id = "".join(data.xpath('.//div[@class="img"]/a/@href'))
                if not id:
                    continue
                cover_url = "".join(data.xpath('.//div[@class="img"]/a/img/@data-original'))
                title = "".join(data.xpath('.//div[@class="img"]/a/@title'))
                title2 = "".join(data.xpath('.//div[@class="desc"]/h5//text()'))
                if title2:
                    title = title + ". " + title2
                # Append the series name to the title when the last detail
                # row is labelled "Seria:".
                if (
                    "".join(
                        data.xpath('./div[@class="desc"]/div[@class="detailShortList"]/div[last()]/span/text()')
                    ).strip()
                    == "Seria:"
                ):
                    series = "".join(
                        data.xpath('./div[@class="desc"]/div[@class="detailShortList"]/div[last()]/a/@title')
                    )
                    title = title + " (seria " + series + ")"
                author = ", ".join(
                    data.xpath('./div[@class="desc"]/div[@class="detailShortList"]/div[@class="row"][1]/a/@title')
                )
                # Discounted price is in <ins>; fall back to plain text.
                price = "".join(data.xpath('.//div[@class="priceBox tk-museo-slab"]/ins/text()'))
                if not price:
                    price = "".join(data.xpath('.//div[@class="priceBox tk-museo-slab"]/text()')).strip()
                formats = ", ".join(data.xpath('.//div[@class="formats"]/a/img/@alt'))

                counter -= 1

                s = SearchResult()
                s.cover_url = "http://www.publio.pl" + cover_url
                s.title = title.strip()
                s.author = author
                s.price = price
                s.detail_item = "http://www.publio.pl" + id.strip()
                # A "DRM" suffix in the format list marks locked copies.
                s.drm = SearchResult.DRM_LOCKED if "DRM" in formats else SearchResult.DRM_UNLOCKED
                s.formats = formats.replace(" DRM", "").strip()

                yield s
            if not doc.xpath('boolean(//a[@class="next"])'):
                break
            page += 1
def search(self, query, max_results=10, timeout=60):
    """Search zixo.pl (results are DRM-locked)."""
    url = 'http://zixo.pl/wyszukiwarka/?search=' + urllib.quote(query.encode('utf-8')) + '&product_type=0'

    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[@class="productInline"]'):
            if counter <= 0:
                break
            # Renamed from `id`, which shadowed the builtin.
            detail_href = ''.join(data.xpath('.//a[@class="productThumb"]/@href'))
            if not detail_href:
                continue

            cover_url = ''.join(data.xpath('.//a[@class="productThumb"]/img/@src'))
            title = ''.join(data.xpath('.//a[@class="title"]/text()'))
            author = ','.join(data.xpath('.//div[@class="productDescription"]/span[1]/a/text()'))
            price = ''.join(data.xpath('.//div[@class="priceList"]/span/text()'))
            # Polish prices use a comma as the decimal separator; a plain
            # str.replace is clearer (and cheaper) than re.sub for a literal.
            price = price.replace('.', ',')

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.detail_item = 'http://zixo.pl' + detail_href.strip()
            s.drm = SearchResult.DRM_LOCKED

            yield s
def search(self, query, max_results=10, timeout=60):
    """Search ebook.nl via its advanced-search results page."""
    url = ('http://www.ebook.nl/store/advanced_search_result.php?keywords=' + urllib2.quote(query))

    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[@id="books"]/div[@itemtype="http://schema.org/Book"]'):
            if counter <= 0:
                break
            id = ''.join(data.xpath('./meta[@itemprop="url"]/@content')).strip()
            if not id:
                continue

            cover_url = 'http://www.ebook.nl/store/' + ''.join(data.xpath('.//img[@itemprop="image"]/@src'))
            title = ''.join(data.xpath('./span[@itemprop="name"]/a/text()')).strip()
            # The author text is strip()ed right here, so the old
            # `if author == ' '` check could never fire and was removed.
            author = ''.join(data.xpath('./span[@itemprop="author"]/a/text()')).strip()
            price = ''.join(data.xpath('.//span[@itemprop="price"]//text()'))

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.drm = SearchResult.DRM_UNKNOWN
            s.detail_item = id

            yield s
def search(self, query, max_results=15, timeout=60):
    """Search the OZON web service, also trying a direct item-detail lookup
    first when the query looks like a numeric OZON item ID."""
    search_url = self.shop_url + '/webservice/webservice.asmx/SearchWebService?'\
        'searchText=%s&searchContext=ebook' % urllib2.quote(query)
    search_urls = [search_url]

    # Add the detail lookup as the first try if the query looks like an
    # OZON ID (6-9 digits).  Raw string so '\d' is unambiguous to the
    # regex engine rather than relying on the non-escape passthrough.
    if re.match(r'^\d{6,9}$', query):
        ozon_detail = self.shop_url + '/webservices/OzonWebSvc.asmx/ItemDetail?ID=%s' % query
        search_urls.insert(0, ozon_detail)

    # XPath template extracting the text of a child element by local name.
    xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'
    counter = max_results
    br = browser()
    for url in search_urls:
        with closing(br.open(url, timeout=timeout)) as f:
            raw = xml_to_unicode(f.read(), strip_encoding_pats=True, assume_utf8=True)[0]
            doc = etree.fromstring(raw)
            for data in doc.xpath('//*[local-name()="SearchItems" or local-name()="ItemDetail"]'):
                if counter <= 0:
                    break
                counter -= 1

                s = SearchResult()
                s.detail_item = data.xpath(xp_template.format('ID'))
                s.title = data.xpath(xp_template.format('Name'))
                s.author = data.xpath(xp_template.format('Author'))
                s.price = data.xpath(xp_template.format('Price'))
                s.cover_url = data.xpath(xp_template.format('Picture'))
                s.price = format_price_in_RUR(s.price)
                yield s
def search(self, query, max_results=10, timeout=60):
    """Search ebookshoppe.com; author/format info comes from a follow-up
    detail-page fetch per result."""
    url = 'http://www.ebookshoppe.com/search.php?search_query=' + quote(query)
    br = browser()
    # The site requires a Referer header to serve results.
    br.addheaders = [("Referer", "http://www.ebookshoppe.com/")]

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//ul[@class="ProductList"]/li'):
            if counter <= 0:
                break

            id = ''.join(data.xpath('./div[@class="ProductDetails"]/'
                                    'strong/a/@href')).strip()
            if not id:
                continue

            cover_url = ''.join(data.xpath('./div[@class="ProductImage"]/a/img/@src'))
            title = ''.join(data.xpath('./div[@class="ProductDetails"]/strong/a/text()'))
            price = ''.join(data.xpath('./div[@class="ProductPriceRating"]/em/text()'))
            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.price = price
            s.drm = SearchResult.DRM_UNLOCKED
            s.detail_item = id

            # Fill in author and formats from the detail page; skip results
            # for which no author could be determined.
            self.get_author_and_formats(s, timeout)
            if not s.author:
                continue

            yield s
def search(self, query, max_results=10, timeout=60):
    """Search bubok.pt via its calibre-specific search endpoint."""
    url = 'http://www.bubok.pt/resellers/calibre_search/' + urllib.quote_plus(query)

    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[contains(@class, "libro")]'):
            if counter <= 0:
                break
            id = ''.join(data.xpath('.//div[@class="url"]/text()'))
            # Skip entries without a detail URL: a result that cannot be
            # opened is useless.  This guard matches every other store's
            # search loop and was missing here.
            if not id:
                continue

            title = ''.join(data.xpath('.//div[@class="titulo"]/text()'))
            author = ''.join(data.xpath('.//div[@class="autor"]/text()'))
            price = ''.join(data.xpath('.//div[@class="precio"]/text()'))
            formats = ''.join(data.xpath('.//div[@class="formatos"]/text()'))
            cover = ''.join(data.xpath('.//div[@class="portada"]/text()'))

            counter -= 1

            s = SearchResult()
            s.title = title.strip()
            s.author = author.strip()
            s.detail_item = id.strip()
            s.price = price.strip()
            s.drm = SearchResult.DRM_UNLOCKED
            s.formats = formats.strip()
            s.cover_url = cover.strip()

            yield s
def search(self, query, max_results=10, timeout=60):
    """Search Amazon's Kindle store, skipping non-Kindle (author page) hits."""
    url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+')
    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())

        data_xpath = '//div[contains(@class, "prod")]'
        format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
        asin_xpath = '@name'
        cover_xpath = './/img[@class="productImage"]/@src'
        title_xpath = './/h3[@class="newaps"]/a//text()'
        author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]/text()'
        price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'

        for data in doc.xpath(data_xpath):
            if counter <= 0:
                break

            # Even though we are searching digital-text only Amazon will still
            # put in results for non Kindle books (author pages). So we need
            # to explicitly check if the item is a Kindle book and ignore it
            # if it isn't.
            format_ = ''.join(data.xpath(format_xpath))
            if 'kindle' not in format_.lower():
                continue

            # We must have an asin otherwise we can't easily reference the
            # book later.
            asin = data.xpath(asin_xpath)
            if asin:
                asin = asin[0]
            else:
                continue

            cover_url = ''.join(data.xpath(cover_xpath))
            title = ''.join(data.xpath(title_xpath))
            author = ''.join(data.xpath(author_xpath))
            try:
                # Strip a leading article ("by", "von", ...) and any
                # trailing parenthesised annotation from the author text.
                if self.author_article:
                    author = author.split(self.author_article, 1)[1].split(" (")[0]
            except Exception:
                # Best effort: keep the raw author string.  A bare `except:`
                # would also swallow KeyboardInterrupt/SystemExit, so catch
                # Exception instead.
                pass

            price = ''.join(data.xpath(price_xpath))

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url.strip()
            s.title = title.strip()
            s.author = author.strip()
            s.price = price.strip()
            s.detail_item = asin.strip()
            s.drm = SearchResult.DRM_UNKNOWN
            s.formats = 'Kindle'

            yield s
def search(self, query, max_results=10, timeout=60):
    """Search the Barnes & Noble Nook ebook store."""
    url = 'http://www.barnesandnoble.com/s/%s?keyword=%s&store=ebook' % (
        query.replace(' ', '-'), urllib.quote_plus(query))

    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        items = doc.xpath('//ul[contains(@class, "result-set")]/li[contains(@class, "result")]')
        for item in items:
            if remaining <= 0:
                break
            detail_url = ''.join(item.xpath('.//div[contains(@class, "image-bounding-box")]/a/@href'))
            if not detail_url:
                continue
            remaining -= 1

            result = SearchResult()
            result.cover_url = ''.join(item.xpath('.//img[contains(@class, "product-image")]/@src'))
            result.title = ''.join(item.xpath('.//a[@class="title"]//text()')).strip()
            result.author = ', '.join(item.xpath('.//a[@class="contributor"]//text()')).strip()
            result.price = ''.join(item.xpath('.//div[@class="price-format"]//span[contains(@class, "price")]/text()')).strip()
            result.detail_item = detail_url.strip()
            result.drm = SearchResult.DRM_UNKNOWN
            result.formats = 'Nook'
            yield result
def search(self, query, max_results=10, timeout=60):
    """Search Foyles' ebook catalogue (results are DRM-locked)."""
    url = 'http://ebooks.foyles.co.uk/catalog/search/?query=' + urllib2.quote(query)

    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[@class="doc-item"]'):
            if counter <= 0:
                break
            id_ = ''.join(data.xpath('.//p[@class="doc-cover"]/a/@href')).strip()
            if not id_:
                continue
            id_ = 'http://ebooks.foyles.co.uk' + id_

            cover_url = ''.join(data.xpath('.//p[@class="doc-cover"]/a/img/@src'))
            title = ''.join(data.xpath('.//span[@class="title"]/a/text()'))
            author = ', '.join(data.xpath('.//span[@class="author"]/span[@class="author"]/text()'))
            price = ''.join(data.xpath('.//span[@itemprop="price"]/text()')).strip()
            # The last span of the format row holds the file format name.
            format_ = ''.join(data.xpath('.//p[@class="doc-meta-format"]/span[last()]/text()'))

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.detail_item = id_
            s.drm = SearchResult.DRM_LOCKED
            s.formats = format_

            yield s
def search(self, query, max_results=10, timeout=60):
    """Query the Haodoo scraper view; everything it returns is free and
    DRM-less.  Download links per format are attached to each result."""
    print( "search!")
    q = query.decode('utf-8')
    url = "https://views.scraperwiki.com/run/haodooscraperview/?" + urlencode( {
        "q": q } )
    print( url )

    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        json_doc = f.read()
        if len(json_doc)>0:
            result = json.loads( json_doc )
            for volume in result:
                s = SearchResult()
                s.title = volume['title']
                s.detail_item = volume['url']
                s.price = '$0.00'
                s.drm = SearchResult.DRM_UNLOCKED
                # dict.has_key() is deprecated (and removed in Python 3);
                # use the `in` operator instead.
                if 'type' in volume and len(volume["type"]):
                    for t in volume["type"]:
                        s.downloads[ t['type'] ] = t['link']
                s.formats = ', '.join(s.downloads.keys())
                yield s
        else:
            print( "scrape nothing." )
def search(self, query, max_results=10, timeout=60):
    """Search bookoteka.pl for DRM-free ebooks."""
    base = 'http://bookoteka.pl'
    url = base + '/list?search=' + urllib.quote_plus(query) + '&cat=1&hp=1&type=1'

    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for node in doc.xpath('//li[@class="EBOOK"]'):
            if remaining <= 0:
                break
            detail_href = ''.join(node.xpath('.//a[@class="item_link"]/@href'))
            if not detail_href:
                continue
            remaining -= 1

            result = SearchResult()
            result.cover_url = base + ''.join(node.xpath('.//a[@class="item_link"]/img/@src'))
            result.title = ''.join(node.xpath('.//div[@class="shelf_title"]/a/text()')).strip()
            result.author = ''.join(node.xpath('.//div[@class="shelf_authors"][1]/text()')).strip()
            # Polish prices use a comma as the decimal separator.
            result.price = ''.join(node.xpath('.//span[@class="EBOOK"]/text()')).replace('.', ',')
            result.detail_item = base + detail_href.strip()
            result.drm = SearchResult.DRM_UNLOCKED
            result.formats = ', '.join(node.xpath('.//a[@class="fancybox protected"]/text()')).strip()
            yield result
def search(self, query, max_results=10, timeout=60):
    """Search Mills & Boon for ebooks (results are DRM-locked)."""
    base_url = 'https://www.millsandboon.co.uk'
    # The doubled '??' made everything after it a parameter literally named
    # '?format' instead of a proper query string; use a single '?'.
    url = base_url + '/search.aspx?format=ebook&searchText=' + urllib2.quote(query)

    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//article[contains(@class, "group")]'):
            if counter <= 0:
                break
            id_ = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/@href')).strip()
            if not id_:
                continue

            cover_url = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/img/@src'))
            title = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/img/@alt')).strip()
            author = ''.join(data.xpath('.//a[@class="author"]/text()'))
            price = ''.join(data.xpath('.//div[@class="type-wrapper"]/ul/li[child::span[text()="eBook"]]/a/text()'))
            format_ = ''.join(data.xpath('.//p[@class="doc-meta-format"]/span[last()]/text()'))
            drm = SearchResult.DRM_LOCKED

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.detail_item = id_
            s.drm = drm
            s.formats = format_

            yield s
def search(self, query, max_results=10, timeout=60):
    """Search the Kobo store (EPUB, DRM status unknown)."""
    url = 'http://www.kobobooks.com/search/search.html?q=' + urllib.quote_plus(query)

    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for item in doc.xpath('//ul[contains(@class, "flowview-items")]/li'):
            if remaining <= 0:
                break
            href = ''.join(item.xpath('./a[contains(@class, "block-link")]/@href'))
            if not href:
                continue
            href = href[1:]  # drop the leading slash
            remaining -= 1

            result = SearchResult()
            # Image URLs are protocol-relative on the page.
            result.cover_url = 'http:%s' % ''.join(item.xpath('.//img[1]/@src'))
            result.title = ''.join(item.xpath('.//p[contains(@class, "flowview-item-title")]//text()')).strip()
            result.price = ''.join(item.xpath('.//a[contains(@class, "primary-button")]//text()')).strip()
            result.detail_item = 'http://store.kobobooks.com/' + href.strip()
            result.formats = 'EPUB'
            result.drm = SearchResult.DRM_UNKNOWN
            yield result
def search(self, query, max_results=10, timeout=60):
    """Search legimi.com (Polish ebook store)."""
    url = "http://www.legimi.com/pl/ebooki/?szukaj=" + urllib.quote_plus(query)

    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[@id="listBooks"]/div'):
            if counter <= 0:
                break

            id = "".join(data.xpath('.//a[@class="plainLink"]/@href'))
            if not id:
                continue

            cover_url = "".join(data.xpath(".//img[1]/@src"))
            title = "".join(data.xpath('.//span[@class="bookListTitle ellipsis"]/text()'))
            author = "".join(data.xpath('.//span[@class="bookListAuthor ellipsis"]/text()'))
            price = "".join(data.xpath('.//div[@class="bookListPrice"]/span/text()'))

            counter -= 1

            s = SearchResult()
            # Cover and detail paths are site-relative; prefix the host.
            s.cover_url = "http://www.legimi.com/" + cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.detail_item = "http://www.legimi.com/" + id.strip()

            yield s
def search(self, query, max_results=10, timeout=60):
    """Search beam-shop.de (DRM-free)."""
    url = 'https://www.beam-shop.de/search?saltFieldLimitation=all&sSearch=' + urllib2.quote(query)
    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[contains(@class, "product--box")]'):
            if counter <= 0:
                break

            id_ = ''.join(data.xpath('./div/div[contains(@class, "product--info")]/a/@href')).strip()
            if not id_:
                continue
            cover_url = ''.join(data.xpath('./div/div[contains(@class, "product--info")]/a//img/@srcset'))
            if cover_url:
                # srcset lists several candidate sizes; use the first URL.
                cover_url = cover_url.split(',')[0].strip()
            # NOTE(review): the [0] lookups below raise IndexError when a
            # product box lacks author/title/price nodes — confirm the
            # markup is always complete for every result type.
            author = data.xpath('.//a[@class="product--author"]/text()')[0].strip()
            title = data.xpath('.//a[@class="product--title"]/text()')[0].strip()
            price = data.xpath('.//div[@class="product--price"]/span/text()')[0].strip()

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.drm = SearchResult.DRM_UNLOCKED
            s.detail_item = id_
            # s.formats = None

            yield s
def search(self, query, max_results=10, timeout=60):
    """Search the mobile Project Gutenberg catalogue.

    Everything on Gutenberg is free and DRM-free, so price and DRM are fixed.
    """
    url = ('http://m.gutenberg.org/ebooks/search.mobile/'
           '?default_prefix=all&sort_order=title&query=' + urllib.quote_plus(query))

    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for node in doc.xpath('//ol[@class="results"]/li[@class="booklink"]'):
            if remaining <= 0:
                break
            # Strip the '.mobile' suffix to get the canonical book URL.
            book_url = ''.join(node.xpath('./a/@href')).split('.mobile')[0]
            remaining -= 1

            result = SearchResult()
            result.cover_url = ''
            result.detail_item = book_url.strip()
            result.title = ''.join(node.xpath('.//span[@class="title"]/text()')).strip()
            result.author = ''.join(node.xpath('.//span[@class="subtitle"]/text()')).strip()
            result.price = '$0.00'
            result.drm = SearchResult.DRM_UNLOCKED
            yield result
def search(self, query, max_results=20, timeout=60):
    """Search escapemagazine.pl (PDF-only, DRM-free)."""
    base = 'http://www.escapemagazine.pl'
    url = base + '/wyszukiwarka?query=' + urllib.quote_plus(query)

    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for node in doc.xpath('//div[@class="item item_short"]'):
            if remaining <= 0:
                break
            href = ''.join(node.xpath('.//h2[@class="title"]/a[1]/@href'))
            if not href:
                continue
            remaining -= 1

            result = SearchResult()
            result.cover_url = ''.join(node.xpath('.//img[@class="cover"]/@src'))
            result.title = ''.join(node.xpath('.//h2[@class="title"]/a[1]/text()')).strip()
            result.author = ''.join(node.xpath('.//div[@class="author"]/text()')).strip()
            result.price = ''.join(node.xpath('.//span[@class="price_now"]/strong/text()')) + ' zł'
            result.detail_item = base + href.strip()
            result.drm = SearchResult.DRM_UNLOCKED
            result.formats = 'PDF'
            yield result
def search(self, query, max_results=15, timeout=60):
    """Query the shop's SearchWebService and yield results priced in RUR."""
    search_url = (
        self.shop_url + "/webservice/webservice.asmx/SearchWebService?"
        "searchText=%s&searchContext=ebook" % urllib2.quote(query)
    )
    search_urls = [search_url]

    # XPath template extracting the text of a child element by local name.
    xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'

    counter = max_results
    br = browser()
    for url in search_urls:
        with closing(br.open(url, timeout=timeout)) as f:
            raw = xml_to_unicode(f.read(), strip_encoding_pats=True, assume_utf8=True)[0]
            doc = etree.fromstring(raw)
            for data in doc.xpath('//*[local-name()="SearchItems" or local-name()="ItemDetail"]'):
                if counter <= 0:
                    break
                counter -= 1

                s = SearchResult()
                s.detail_item = data.xpath(xp_template.format("ID"))
                s.title = data.xpath(xp_template.format("Name"))
                s.author = data.xpath(xp_template.format("Author"))
                s.price = data.xpath(xp_template.format("Price"))
                s.cover_url = data.xpath(xp_template.format("Picture"))
                s.price = format_price_in_RUR(s.price)
                yield s
def search(self, query, max_results=10, timeout=60):
    """Search the WHSmith ebook department (DRM-locked ePub)."""
    url = ('http://www.whsmith.co.uk/search?keywordCategoryId=wc_dept_ebooks&results=60'
           '&page=1&keywords=' + urllib2.quote(query))

    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for node in doc.xpath('//li[@class="product"]'):
            if remaining <= 0:
                break
            href = ''.join(node.xpath('./a[@class="product_image_wrap"]/@href'))
            if not href:
                continue
            remaining -= 1

            result = SearchResult()
            result.cover_url = ''.join(node.xpath('.//img[@class="product_image"]/@src'))
            result.title = ''.join(node.xpath('.//h4[@class="product_title"]/text()')).strip()
            result.author = ', '.join(node.xpath('.//span[@class="product_second"]/text()')).strip()
            result.price = ''.join(node.xpath('.//span[@class="price"]/text()'))
            result.drm = SearchResult.DRM_LOCKED
            result.detail_item = 'http://www.whsmith.co.uk' + href
            result.formats = 'ePub'
            yield result
def search(self, query, max_results=10, timeout=60):
    """Search woblink.com.

    A MOBI edition, when present, is yielded first as a separate DRM-free
    result; the remaining formats are yielded afterwards on a second,
    DRM-locked result.
    """
    url = 'http://woblink.com/publication?query=' + urllib.quote_plus(query.encode('utf-8'))
    # The site caps results per page; raise the limit for larger requests.
    if max_results > 10:
        if max_results > 20:
            url += '&limit=30'
        else:
            url += '&limit=20'

    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[@class="book-item"]'):
            if counter <= 0:
                break

            id = ''.join(data.xpath('.//td[@class="w10 va-t"]/a[1]/@href'))
            if not id:
                continue

            cover_url = ''.join(data.xpath('.//td[@class="w10 va-t"]/a[1]/img/@src'))
            title = ''.join(data.xpath('.//h2[@class="title"]/a[1]/text()'))
            author = ', '.join(data.xpath('.//p[@class="author"]/a/text()'))
            price = ''.join(data.xpath('.//div[@class="prices"]/span[1]/span/text()'))
            # Polish prices use a comma as the decimal separator.
            price = re.sub('\.', ',', price)
            # Format names are derived from the format icon file names.
            formats = [form[8:-4].split('_')[0] for form in data.xpath('.//p[3]/img/@src')]

            s = SearchResult()
            s.cover_url = 'http://woblink.com' + cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price + ' zł'
            s.detail_item = id.strip()

            # MOBI should be sent first,
            if 'MOBI' in formats:
                t = copy.copy(s)
                t.title += ' MOBI'
                t.drm = SearchResult.DRM_UNLOCKED
                t.formats = 'MOBI'
                formats.remove('MOBI')
                counter -= 1
                yield t

            # and the remaining formats (if any) next
            if formats:
                if 'epub' in formats:
                    formats.remove('epub')
                    # 'epub' icons denote the store's own WOBLINK container.
                    formats.append('WOBLINK')
                    if 'E Ink' in data.xpath('.//div[@class="prices"]/img/@title'):
                        formats.insert(0, 'EPUB')

                s.drm = SearchResult.DRM_LOCKED
                s.formats = ', '.join(formats).upper()
                counter -= 1
                yield s
def search(query, max_results=10, timeout=60, write_raw_to=None):
    """Search the Project Gutenberg OPDS catalogue.

    Yields SearchResult objects carrying direct download links per format
    and (when present) base64-embedded cover data.  If *write_raw_to* is
    given, the raw OPDS feed is also saved to that path for debugging.
    """
    url = 'http://m.gutenberg.org/ebooks/search.opds/?query=' + quote_plus(query)

    counter = max_results
    br = browser(user_agent='calibre/'+__version__)
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
        if write_raw_to is not None:
            with open(write_raw_to, 'wb') as f:
                f.write(raw)
        doc = etree.fromstring(raw)
        for data in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break

            counter -= 1

            s = SearchResult()

            # We could use the <link rel="alternate" type="text/html" ...> tag from the
            # detail odps page but this is easier.
            id = fix_url(''.join(data.xpath('./*[local-name() = "id"]/text()')).strip())
            # The numeric part of the entry id is the Gutenberg book number.
            s.detail_item = url_slash_cleaner('%s/ebooks/%s' % (web_url, re.sub(r'[^\d]', '', id)))
            s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
            s.author = ', '.join(data.xpath('./*[local-name() = "content"]//text()')).strip()
            if not s.title or not s.author:
                continue

            # Get the formats and direct download links.
            with closing(br.open(id, timeout=timeout/4)) as nf:
                ndoc = etree.fromstring(nf.read())
                for link in ndoc.xpath('//*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'):
                    type = link.get('type')
                    href = link.get('href')
                    if type:
                        ext = mimetypes.guess_extension(type)
                        if ext:
                            ext = ext[1:].upper().strip()
                            s.downloads[ext] = fix_url(href)
            s.formats = ', '.join(s.downloads.keys())
            if not s.formats:
                continue

            for link in data.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                type = link.get('type')

                if rel and href and type:
                    href = fix_url(href)
                    if rel in ('http://opds-spec.org/thumbnail',
                               'http://opds-spec.org/image/thumbnail'):
                        # Covers are embedded as base64 data: URIs.
                        if href.startswith('data:image/png;base64,'):
                            cdata = href.replace('data:image/png;base64,', '')
                            if not isinstance(cdata, bytes):
                                cdata = cdata.encode('ascii')
                            s.cover_data = base64.b64decode(cdata)

            yield s
def search(self, query, max_results=10, timeout=60):
    '''Yield a single placeholder result explaining that this store is closed.'''
    closed = SearchResult()
    closed.title = 'Amazon required that this<br>store be permanently closed.'
    closed.author = ''
    closed.price = ''
    closed.detail_item = ''
    closed.drm = SearchResult.DRM_UNKNOWN
    yield closed
def run(self):
    '''
    Refresh the cached MobileRead book list.

    Downloads the HTML list, converts each entry to a SearchResult and
    stores the serialized list plus its creation timestamp in the
    plugin config. Skipped entirely when the cache is under a week old.
    '''
    url = 'http://www.mobileread.com/forums/ebooks.php?do=getlist&type=html'
    self.update_details.emit(_('Checking last download date.'))
    last_download = self.config.get('last_download', None)
    # Don't update the book list if our cache is less than one week old.
    if last_download and (time.time() - last_download) < 604800:
        return
    self.update_details.emit(_('Downloading book list from MobileRead.'))
    # Download the book list HTML file from MobileRead.
    br = browser()
    raw_data = None
    try:
        with closing(br.open(url, timeout=self.timeout)) as f:
            raw_data = f.read()
    except Exception:
        # FIX: was a bare ``except:`` which also swallowed
        # SystemExit/KeyboardInterrupt. Download failure is best-effort.
        return
    if not raw_data or not self._run:
        return
    self.update_details.emit(_('Processing books.'))
    # Turn books listed in the HTML file into SearchResults's.
    books = []
    try:
        data = html.fromstring(raw_data)
        raw_books = data.xpath('//ul/li')
        self.total_changed.emit(len(raw_books))
        for i, book_data in enumerate(raw_books):
            self.update_details.emit(
                _('%(num)s of %(tot)s books processed.') % dict(
                    num=i, tot=len(raw_books)))
            book = SearchResult()
            book.detail_item = ''.join(book_data.xpath('.//a/@href'))
            book.formats = ''.join(book_data.xpath('.//i/text()'))
            book.formats = book.formats.strip()
            # Entries look like 'Author: Title'.
            text = ''.join(book_data.xpath('.//a/text()'))
            if ':' in text:
                book.author, q, text = text.partition(':')
                book.author = book.author.strip()
            book.title = text.strip()
            books.append(book)
            if not self._run:
                # Aborted mid-run: discard the partial list.
                books = []
                break
            else:
                self.update_progress.emit(i)
    except Exception:
        # FIX: narrowed from a bare ``except:``; parsing stays best-effort.
        pass
    # Save the book list and its create time.
    if books:
        self.config['book_list'] = self.seralize_books(books)
        self.config['last_download'] = time.time()
def search_google(query, max_results=10, timeout=60, write_html_to=None):
    '''Scrape Google Books web search results, yielding SearchResult objects.'''
    url = 'https://www.google.com/search?tbm=bks&q=' + quote_plus(query)
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
    doc = parse_html(raw)
    if write_html_to is not None:
        # Debugging aid: save the parsed page.
        praw = html.tostring(doc, encoding='utf-8')
        open(write_html_to, 'wb').write(praw)
    for result in doc.xpath('//div[@id="rso"]/div'):
        if remaining <= 0:
            break
        heads = result.xpath('descendant::h3')
        if not heads:
            continue
        anchor = heads[0].getparent()
        link = anchor.get('href')
        if not link:
            continue
        title = ''.join(result.xpath('.//h3//text()')).strip()
        author_bits = result.xpath(
            'descendant::a[@class="fl" and @href]//text()')
        # Trailing action links ('Preview', 'Read', ...) are not author names.
        while author_bits and author_bits[-1].strip().lower() in ('preview', 'read', 'more editions'):
            author_bits = author_bits[:-1]
        if not author_bits:
            continue
        remaining -= 1
        s = SearchResult()
        s.title = title.strip()
        s.author = ' & '.join(author_bits).strip()
        s.detail_item = link.strip()
        s.drm = SearchResult.DRM_UNKNOWN
        yield s
def search(self, query, max_results=25, timeout=60):
    '''Search ebookpoint.pl and yield up to max_results SearchResult objects.'''
    url = ('http://ebookpoint.pl/search?qa=&szukaj=' + quote_plus(
        query.decode('utf-8').encode('iso-8859-2')
    ) + '&serwisyall=0&wprzyg=0&wsprzed=1&wyczerp=0&formaty=em-p')
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        page = html.fromstring(f.read())
        for entry in page.xpath('//ul[@class="list"]/li'):
            if remaining <= 0:
                break
            detail = ''.join(entry.xpath('./a/@href'))
            if not detail:
                continue
            formats = ', '.join(entry.xpath(
                './/ul[@class="book-type book-type-points"]//span[@class="popup"]/span/text()'
            ))
            cover = ''.join(entry.xpath('.//p[@class="cover"]/img/@data-src'))
            title = ''.join(entry.xpath('.//div[@class="book-info"]/h3/a/text()'))
            author = ''.join(entry.xpath('.//p[@class="author"]//text()'))
            price = ''.join(entry.xpath(
                './/p[@class="price price-incart"]/a/ins/text()|.//p[@class="price price-add"]/a/text()'
            ))
            remaining -= 1
            s = SearchResult()
            s.cover_url = cover
            s.title = title.strip()
            s.author = author.strip()
            s.price = re.sub(r'\.', ',', price)  # Polish decimal comma
            s.detail_item = detail.strip()
            s.drm = SearchResult.DRM_UNLOCKED
            s.formats = formats.upper()
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search kobobooks.com and yield SearchResult objects (EPUB only).'''
    url = 'http://www.kobobooks.com/search/search.html?q=' + urllib.quote_plus(query)
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        page = html.fromstring(f.read())
        for entry in page.xpath('//ul[contains(@class, "flowview-items")]/li'):
            if remaining <= 0:
                break
            detail = ''.join(entry.xpath('./a[contains(@class, "block-link")]/@href'))
            if not detail:
                continue
            detail = detail[1:]  # drop the leading '/'
            price = ''.join(entry.xpath('.//a[contains(@class, "primary-button")]//text()'))
            # The page serves protocol-relative image URLs.
            cover = 'http:%s' % ''.join(entry.xpath('.//img[1]/@src'))
            title = ''.join(entry.xpath('.//p[contains(@class, "flowview-item-title")]//text()'))
            remaining -= 1
            s = SearchResult()
            s.cover_url = cover
            s.title = title.strip()
            s.price = price.strip()
            s.detail_item = 'http://store.kobobooks.com/' + detail.strip()
            s.formats = 'EPUB'
            s.drm = SearchResult.DRM_UNKNOWN
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search uk.nook.com and yield SearchResult objects.'''
    url = u'http://uk.nook.com/s/%s?s%%5Bdref%%5D=1&s%%5Bkeyword%%5D=%s' % (query.replace(' ', '-'), urllib.quote(query))
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
        page = html.fromstring(raw)
        for entry in page.xpath('//ul[contains(@class, "product_list")]/li'):
            if remaining <= 0:
                break
            detail = ''.join(entry.xpath('.//span[contains(@class, "image")]/a/@href'))
            if not detail:
                continue
            if detail.startswith('/gb'):
                detail = detail[3:]  # strip the country prefix
            detail = 'http://uk.nook.com' + detail.strip()
            cover = ''.join(entry.xpath('.//span[contains(@class, "image")]//img/@data-src'))
            title = ''.join(entry.xpath('.//div[contains(@class, "title")]//text()')).strip()
            if not title:
                continue
            author = ', '.join(entry.xpath('.//div[contains(@class, "contributor")]//a/text()')).strip()
            price = ''.join(entry.xpath('.//div[contains(@class, "action")]//a//text()')).strip()
            # Keep only digits, separators and the pound sign.
            price = re.sub(r'[^\d.,£]', '', price)
            remaining -= 1
            s = SearchResult()
            s.cover_url = cover
            s.title = title.strip()
            s.author = author.strip()
            s.price = price.strip()
            s.detail_item = detail
            s.drm = SearchResult.DRM_UNKNOWN
            s.formats = 'Nook'
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search ebooks.com and yield SearchResult objects with affiliate links.'''
    url = 'http://www.ebooks.com/SearchApp/SearchResults.net?term=' + quote_plus(query)
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        page = html.fromstring(f.read())
        for entry in page.xpath('//div[@id="results"]//li'):
            if remaining <= 0:
                break
            # The numeric book id is embedded in the first link's URL.
            href = ''.join(entry.xpath('.//a[1]/@href'))
            mo = re.search(r'\d+', href)
            if not mo:
                continue
            book_id = mo.group()
            cover = ''.join(entry.xpath('.//div[contains(@class, "img")]//img/@src'))
            title = ''.join(entry.xpath(
                'descendant::span[@class="book-title"]/a/text()')).strip()
            author = ', '.join(entry.xpath(
                'descendant::span[@class="author"]/a/text()')).strip()
            if not title or not author:
                continue
            price = ''.join(entry.xpath(
                './/span[starts-with(text(), "US$") or'
                ' starts-with(text(), "€") or starts-with(text(), "CA$") or'
                ' starts-with(text(), "AU$") or starts-with(text(), "£")]/text()')).strip()
            remaining -= 1
            s = SearchResult()
            s.cover_url = cover
            s.title = title.strip()
            s.author = author.strip()
            s.price = price.strip()
            s.detail_item = '?url=http://www.ebooks.com/cj.asp?IID=' + book_id.strip() + '&cjsku=' + book_id.strip()
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search barnesandnoble.com's ebook store and yield SearchResult objects.'''
    url = 'http://www.barnesandnoble.com/s/%s?keyword=%s&store=ebook&view=list' % (
        query.decode('utf-8').replace(' ', '-'), quote_plus(query))
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
        page = html.fromstring(raw)
        for entry in page.xpath('//ol[contains(@class, "result-set")]/li[contains(@class, "result")]'):
            if remaining <= 0:
                break
            detail = ''.join(entry.xpath('.//div[contains(@class, "image-block")]/a/@href'))
            if not detail:
                continue
            # The cover URL lives in an inline script keyed on the <img> id.
            cover = ''
            cover_id = ''.join(entry.xpath('.//img[contains(@class, "product-image")]/@id'))
            m = re.search(r"%s'.*?srcUrl: '(?P<iurl>.*?)'.*?}" % cover_id, raw)
            if m:
                cover = m.group('iurl')
            title = ''.join(entry.xpath('descendant::p[@class="title"]//span[@class="name"]//text()')).strip()
            if not title:
                continue
            author = ', '.join(entry.xpath('.//ul[contains(@class, "contributors")]//a[contains(@class, "subtle")]//text()')).strip()
            price = ''.join(entry.xpath('.//a[contains(@class, "bn-price")]//text()'))
            remaining -= 1
            s = SearchResult()
            s.cover_url = cover
            s.title = title.strip()
            s.author = author.strip()
            s.price = price.strip()
            s.detail_item = detail.strip()
            s.drm = SearchResult.DRM_UNKNOWN
            s.formats = 'Nook'
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search the calibre Open Books directory and yield DRM-free results.'''
    url = 'https://drmfree.calibre-ebook.com/search/?q=' + urllib.quote_plus(query)
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        page = html.fromstring(f.read())
        for entry in page.xpath('//ul[@id="object_list"]//li'):
            if remaining <= 0:
                break
            detail = ''.join(entry.xpath('.//div[@class="links"]/a[1]/@href')).strip()
            if not detail:
                continue
            cover = ''.join(entry.xpath('.//div[@class="cover"]/img/@src'))
            # Only the text after 'Price:' is the actual amount.
            price = ''.join(entry.xpath('.//div[@class="price"]/text()'))
            price = price.partition('Price:')[2].strip()
            if not price:
                continue
            title = ''.join(entry.xpath('.//div/strong/text()'))
            author = ''.join(entry.xpath('.//div[@class="author"]//text()'))
            author = author.partition('by')[-1]  # drop the 'by' prefix
            remaining -= 1
            s = SearchResult()
            s.cover_url = cover
            s.title = title.strip()
            s.author = author.strip()
            s.price = price.strip()
            s.detail_item = detail.strip()
            s.drm = SearchResult.DRM_UNLOCKED
            yield s
def search(self, query, max_results=25, timeout=60):
    '''Search ebookpoint.pl (legacy search.scgi endpoint) and yield results.'''
    url = ('http://ebookpoint.pl/search.scgi?szukaj=' + urllib.quote_plus(
        query.decode('utf-8').encode('iso-8859-2')) + '&serwisyall=0&x=0&y=0')
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        page = html.fromstring(f.read())
        for entry in page.xpath('//div[@class="book-list"]/ul[2]/li'):
            if remaining <= 0:
                break
            detail = ''.join(entry.xpath('.//a[@class="cover"]/@href'))
            if not detail:
                continue
            formats = ', '.join(entry.xpath('.//div[@class="ikony"]/span/text()'))
            # Skip audiobooks and entries with no listed formats.
            if formats in ['MP3', '']:
                continue
            cover = ''.join(entry.xpath('.//a[@class="cover"]/img/@src'))
            title = ''.join(entry.xpath('.//h3/a/@title'))
            # NOTE(review): the '.' here is a regex wildcard, so this also eats
            # the character after 'eBook' — presumably a literal dot was meant.
            title = re.sub('eBook.', '', title)
            author = ''.join(entry.xpath('.//p[@class="author"]//text()'))
            price = ''.join(entry.xpath('.//p[@class="price"]/ins/text()'))
            remaining -= 1
            s = SearchResult()
            # Request the smaller cover variant.
            s.cover_url = 'http://ebookpoint.pl' + re.sub('72x9', '65x8', cover)
            s.title = title.strip()
            s.author = author.strip()
            s.price = re.sub(r'\.', ',', price)
            s.detail_item = detail.strip()
            s.drm = SearchResult.DRM_UNLOCKED
            s.formats = formats.upper()
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search empik.com's ebook category and yield SearchResult objects.'''
    url = 'https://www.empik.com/ebooki/ebooki,3501,s?sort=scoreDesc&resultsPP={}&q={}'.format(
        max_results, quote(query))
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        page = html.fromstring(f.read())
        for entry in page.xpath('//div[@class="search-content js-search-content"]/div'):
            if remaining <= 0:
                break
            detail = ''.join(entry.xpath('.//div[@class="name"]/a/@href'))
            if not detail:
                continue
            cover = ''.join(entry.xpath('.//a/img[@class="lazy"]/@lazy-img'))
            author = ', '.join(entry.xpath('.//a[@class="smartAuthor"]/text()'))
            title = ''.join(entry.xpath('.//div[@class="name"]/a/@title'))
            price = ''.join(entry.xpath('.//div[@class="price ta-price-tile "]/text()'))
            # NOTE: per-book format detection (an extra request per result)
            # is deliberately disabled to keep searches fast.
            remaining -= 1
            s = SearchResult()
            s.cover_url = cover
            s.title = title.split(' - ')[0]  # drop the trailing ' - author' part
            s.author = author.strip()
            s.price = price.strip()
            s.detail_item = 'https://empik.com' + detail.strip()
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search wolnelektury.pl (free public-domain library) and yield results.'''
    url = 'http://wolnelektury.pl/szukaj?q=' + quote_plus(query.encode('utf-8'))
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        page = html.fromstring(f.read())
        for entry in page.xpath('//li[@class="Book-item"]'):
            if remaining <= 0:
                break
            detail = ''.join(entry.xpath('.//div[@class="title"]/a/@href'))
            if not detail:
                continue
            cover = ''.join(entry.xpath('.//div[@class="cover-area"]//img/@src'))
            title = ''.join(entry.xpath('.//div[@class="title"]/a[1]/text()'))
            author = ', '.join(entry.xpath('.//div[@class="author"]/a/text()'))
            remaining -= 1
            s = SearchResult()
            # Every offered format has a direct download link on the tile.
            for link in entry.xpath('.//div[@class="book-box-formats"]/span/a'):
                fmt = ''.join(link.xpath('./text()'))
                s.downloads[fmt] = 'http://wolnelektury.pl' + link.get('href')
            s.cover_url = 'http://wolnelektury.pl' + cover.strip()
            s.title = title.strip()
            s.author = author
            s.price = '0,00 zł'  # the whole catalogue is free
            s.detail_item = 'http://wolnelektury.pl' + detail
            s.formats = ', '.join(s.downloads.keys())
            s.drm = SearchResult.DRM_UNLOCKED
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Query LibGen mirrors via pylibgen and yield SearchResult objects.'''
    try:
        results = lg.lookup(lg.search(query))
        print('Reached LibGen Mirrors.')
    except Exception as e:
        # Mirrors are frequently unreachable; report and give up quietly.
        print(e)
        print('pylibgen crashed. In most cases this is caused by unreachable LibGen Mirrors, try again in a few minutes.')
        return
    self.num_results = len(results)
    for record in results:
        s = SearchResult()
        s.title = record['title']
        s.author = record['author']
        s.price = '$0.00'
        s.drm = SearchResult.DRM_UNLOCKED
        s.formats = record['extension']
        s.detail_item = record['md5']
        yield s
def search(self, query, max_results=10, timeout=60):
    '''Search weightlessbooks.com and yield DRM-free SearchResult objects.'''
    url = 'http://weightlessbooks.com/?s=' + urllib.quote_plus(query)
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        page = html.fromstring(f.read())
        for entry in page.xpath('//li[@class="product"]'):
            if remaining <= 0:
                break
            detail = ''.join(entry.xpath('.//div[@class="cover"]/a/@href'))
            if not detail:
                continue
            cover = ''.join(entry.xpath('.//div[@class="cover"]/a/img/@src'))
            price = ''.join(entry.xpath('.//div[@class="buy_buttons"]/b[1]/text()'))
            if not price:
                continue
            formats = ', '.join(entry.xpath('.//select[@class="eStore_variation"]//option//text()'))
            formats = formats.upper()
            title = ''.join(entry.xpath('.//h3/a/text()'))
            # The <h3> holds title plus author; removing the title leaves the author.
            author = ''.join(entry.xpath('.//h3//text()')).replace(title, '')
            remaining -= 1
            s = SearchResult()
            s.cover_url = cover
            s.title = title.strip()
            s.author = author.strip()
            s.price = price.strip()
            s.detail_item = detail.strip()
            s.drm = SearchResult.DRM_UNLOCKED
            s.formats = formats
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search cdp.pl's e-book category, following pagination, and yield results.'''
    br = browser()
    page_no = 1
    remaining = max_results
    while remaining:
        with closing(br.open('https://cdp.pl/ksiazki/e-book.html?q=' + urllib.parse.quote_plus(query) + '&p=' + str(page_no), timeout=timeout)) as f:
            page = html.fromstring(f.read())
            for entry in page.xpath('//ul[@class="products"]/li'):
                if remaining <= 0:
                    break
                detail = ''.join(entry.xpath('.//a[@class="product-image"]/@href'))
                if not detail:
                    continue
                cover = ''.join(entry.xpath('.//a[@class="product-image"]/img/@data-src'))
                title = ''.join(entry.xpath('.//h3[1]/a/@title'))
                # Whole and fractional price parts sit in separate nodes.
                price = ''.join(entry.xpath('.//span[@class="custom_price"]/text()')) + ',' + ''.join(entry.xpath('.//span[@class="custom_price"]/sup/text()'))
                author = ''.join(entry.xpath('.//div[@class="authors"]/@title'))
                # The format list is only on the product page.
                formats = ''
                with closing(br.open(detail.strip(), timeout=timeout/4)) as nf:
                    product = html.fromstring(nf.read())
                    formats = product.xpath('//div[@class="second-part-holder"]//div[@class="product-attributes-container"]/ul/li/span/text()')[-1]
                remaining -= 1
                s = SearchResult()
                s.cover_url = cover
                s.title = title.replace(' (ebook)','').strip()
                s.author = author
                s.price = price + ' zł'
                s.detail_item = detail.strip()
                s.drm = SearchResult.DRM_UNLOCKED
                s.formats = formats.upper().strip()
                yield s
            if not page.xpath('//a[@class="next-page"]'):
                break
            page_no += 1
def search(self, query, max_results=10, timeout=60):
    '''Search beam-shop.de and yield SearchResult objects (all DRM-free).'''
    url = 'https://www.beam-shop.de/search?saltFieldLimitation=all&sSearch=' + quote(query)
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        page = html.fromstring(f.read())
        for entry in page.xpath('//div[contains(@class, "product--box")]'):
            if remaining <= 0:
                break
            detail = ''.join(entry.xpath(
                './div/div[contains(@class, "product--info")]/a/@href')).strip()
            if not detail:
                continue
            cover = ''.join(entry.xpath(
                './div/div[contains(@class, "product--info")]/a//img/@srcset'))
            if cover:
                # srcset lists several sizes; take the first candidate URL.
                cover = cover.split(',')[0].strip()
            author = entry.xpath('.//a[@class="product--author"]/text()')[0].strip()
            title = entry.xpath('.//a[@class="product--title"]/text()')[0].strip()
            price = entry.xpath('.//div[@class="product--price"]/span/text()')[0].strip()
            remaining -= 1
            s = SearchResult()
            s.cover_url = cover
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.drm = SearchResult.DRM_UNLOCKED
            s.detail_item = detail
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search rw2010.pl (via its POST search form) and yield SearchResult objects.'''
    url = 'http://www.rw2010.pl/go.live.php/?launch_macro=catalogue-search-rd'
    form = {
        'fkeyword': query,
        'file_type': ''
    }
    br = browser()
    remaining = max_results
    with closing(br.open(url, data=urlencode(form), timeout=timeout)) as f:
        page = html.fromstring(f.read())
        for entry in page.xpath('//div[@class="ProductDetail"]'):
            if remaining <= 0:
                break
            detail = ''.join(entry.xpath('.//div[@class="img"]/a/@href'))
            if not detail:
                continue
            # All interesting metadata lives on the product page.
            with closing(br.open(detail.strip(), timeout=timeout/4)) as nf:
                product = html.fromstring(nf.read())
                cover = ''.join(product.xpath('//div[@class="boxa"]//div[@class="img"]/img/@src'))
                author = ''.join(product.xpath('//div[@class="boxb"]//h3[text()="Autor: "]/span/text()'))
                title = ''.join(product.xpath('//div[@class="boxb"]/h2[1]/text()'))
                title = re.sub(r'\(#.+\)', '', title)  # strip catalogue number
                formats = ''.join(product.xpath('//div[@class="boxb"]//h3[text()="Format pliku: "]/span/text()'))
                price = ''.join(product.xpath('//div[@class="price-box"]/span/text()')) + ',00 zł'
            remaining -= 1
            s = SearchResult()
            s.cover_url = 'http://www.rw2010.pl/' + cover
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.detail_item = re.sub(r'%3D', '=', detail)
            s.drm = SearchResult.DRM_UNLOCKED
            s.formats = formats[0:-2].upper()  # drop the trailing separator
            yield s
def create_search_result(self, data):
    '''Build a SearchResult from a single <fb2-book> XML element.'''
    attr = 'normalize-space(@{0})'
    res = SearchResult()
    res.drm = SearchResult.DRM_UNLOCKED
    res.detail_item = data.xpath(attr.format('hub_id'))
    res.title = data.xpath('string(.//title-info/book-title/text()|.//publish-info/book-name/text())')
    # Author name parts are stored as separate elements; join them with spaces.
    name_parts = data.xpath('.//title-info/author/first-name/text()|'
                            './/title-info/author/middle-name/text()|'
                            './/title-info/author/last-name/text()')
    res.author = u' '.join(map(type(u''), name_parts))
    res.price = data.xpath(attr.format('price'))
    # cover vs cover_preview
    res.cover_url = data.xpath(attr.format('cover_preview'))
    res.price = format_price_in_RUR(res.price)
    file_types = data.xpath('//fb2-book//files/file/@type')
    res.formats = ', '.join(_parse_ebook_formats(' '.join(file_types)))
    return res
def search(self, query, max_results=10, timeout=60):
    '''Search ebook.nl (schema.org annotated results) and yield SearchResult objects.'''
    url = ('http://www.ebook.nl/store/advanced_search_result.php?keywords=' + quote(query))
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        page = html.fromstring(f.read())
        for entry in page.xpath('//div[@id="books"]/div[@itemtype="http://schema.org/Book"]'):
            if remaining <= 0:
                break
            detail = ''.join(entry.xpath('./meta[@itemprop="url"]/@content')).strip()
            if not detail:
                continue
            cover = 'http://www.ebook.nl/store/' + ''.join(
                entry.xpath('.//img[@itemprop="image"]/@src'))
            title = ''.join(entry.xpath('./span[@itemprop="name"]/a/text()')).strip()
            author = ''.join(entry.xpath('./span[@itemprop="author"]/a/text()')).strip()
            if author == ' ':
                author = ''  # placeholder author markup
            price = ''.join(entry.xpath('.//span[@itemprop="price"]//text()'))
            remaining -= 1
            s = SearchResult()
            s.cover_url = cover
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.drm = SearchResult.DRM_UNKNOWN
            s.detail_item = detail
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search legimi.pl (old listBooks layout) and yield SearchResult objects.'''
    url = 'https://www.legimi.pl/ebooki/?szukaj=' + urllib.quote_plus(query)
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        page = html.fromstring(f.read())
        for entry in page.xpath('//div[@id="listBooks"]/div'):
            if remaining <= 0:
                break
            detail = ''.join(entry.xpath('.//a[1]/@href'))
            if not detail:
                continue
            cover = ''.join(entry.xpath(
                './/span[@class="listImage imageDarkLoader"]/img/@src'))
            title = ''.join(entry.xpath(
                './/span[@class="bookListTitle ellipsis"]/text()'))
            author = ''.join(entry.xpath(
                './/span[@class="bookListAuthor ellipsis"]/text()'))
            price = ''.join(entry.xpath(
                './/div[@class="bookListPrice"]/span/text()'))
            remaining -= 1
            s = SearchResult()
            s.cover_url = cover
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.detail_item = 'https://www.legimi.pl/' + detail.strip()
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search ebookshoppe.com and yield SearchResult objects.'''
    url = 'http://www.ebookshoppe.com/search.php?search_query=' + quote(query)
    br = browser()
    # The site rejects requests lacking a Referer header.
    br.addheaders = [("Referer", "http://www.ebookshoppe.com/")]
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        page = html.fromstring(f.read())
        for entry in page.xpath('//ul[@class="ProductList"]/li'):
            if remaining <= 0:
                break
            detail = ''.join(entry.xpath('./div[@class="ProductDetails"]/'
                                         'strong/a/@href')).strip()
            if not detail:
                continue
            cover = ''.join(entry.xpath('./div[@class="ProductImage"]/a/img/@src'))
            title = ''.join(entry.xpath('./div[@class="ProductDetails"]/strong/a/text()'))
            price = ''.join(entry.xpath('./div[@class="ProductPriceRating"]/em/text()'))
            remaining -= 1
            s = SearchResult()
            s.cover_url = cover
            s.title = title.strip()
            s.price = price
            s.drm = SearchResult.DRM_UNLOCKED
            s.detail_item = detail
            # Author and formats require a second request per result.
            self.get_author_and_formats(s, timeout)
            if not s.author:
                continue
            yield s
def _do_search(self, url, max_results, timeout):
    '''
    Fetch *url* and yield up to *max_results* SearchResult objects.

    Only entries with an e-book price are yielded; paper-only listings
    are skipped.
    '''
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        page = f.read().decode('utf-8')
        doc = html.fromstring(page)
        for data in doc.xpath('//ul[contains(@class,"book_list")]/li'):
            if max_results <= 0:
                break
            s = SearchResult()
            s.detail_item = ''.join(
                data.xpath('.//a[@class="th"]/@href')).strip()
            # FIX: the original tested ``if not id`` which referenced the
            # *builtin* id() (always truthy), so the guard never fired.
            # Test the value that was actually extracted.
            if not s.detail_item:
                continue
            s.cover_url = ''.join(
                data.xpath(
                    './/a[@class="th"]/img/@data-original')).strip()
            s.title = ''.join(
                data.xpath(
                    './/div[@class="item-title"]/a/text()')).strip()
            s.author = ', '.join(
                data.xpath('.//div[@class="item-author"]/a/text()')).strip(
                ', ')
            price_list = data.xpath('.//div[@class="item-price"]')
            for price_item in price_list:
                # The e-book price row is labelled 'е-книга:' (Bulgarian).
                if price_item.text.startswith('е-книга:'):
                    s.price = ''.join(price_item.xpath('.//span/text()'))
                    break
            # No price markup at all means the title is free.
            s.price = '0.00 лв.' if not s.price and not price_list else s.price
            if not s.price:
                # no e-book available
                continue
            max_results -= 1
            yield s
def search(query, max_results=15, timeout=60):
    '''Search OZON.ru and yield SearchResult objects with RUR-formatted prices.'''
    url = 'http://www.ozon.ru/?context=search&text=%s&store=1,0&group=div_book' % quote_plus(query)
    remaining = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        raw = xml_to_unicode(f.read(), strip_encoding_pats=True, assume_utf8=True)[0]
    root = parse_html(raw)
    for tile in root.xpath('//*[@class="bShelfTile inline"]'):
        if remaining <= 0:
            break
        remaining -= 1
        s = SearchResult(store_name='OZON.ru')
        s.detail_item = shop_url + tile.xpath('descendant::a[@class="eShelfTile_Link"]/@href')[0]
        s.title = tile.xpath('descendant::span[@class="eShelfTile_ItemNameText"]/@title')[0]
        s.author = tile.xpath('descendant::span[@class="eShelfTile_ItemPerson"]/@title')[0]
        raw_price = ''.join(tile.xpath('descendant::div[contains(@class, "eShelfTile_Price")]/text()'))
        # The cover URL is protocol-relative on the page.
        s.cover_url = 'http:' + tile.xpath('descendant::img/@data-original')[0]
        s.price = format_price_in_RUR(raw_price)
        yield s
def search(self, query, max_results=10, timeout=60):
    '''Search zixo.pl and yield up to max_results SearchResult objects.'''
    url = 'http://zixo.pl/wyszukiwarka/?search=' + urllib.quote(
        query.encode('utf-8')) + '&product_type=0'
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[@class="productInline"]'):
            if counter <= 0:
                break
            id = ''.join(data.xpath('.//a[@class="productThumb"]/@href'))
            if not id:
                continue
            cover_url = ''.join(
                data.xpath('.//a[@class="productThumb"]/img/@src'))
            title = ''.join(data.xpath('.//a[@class="title"]/text()'))
            author = ','.join(
                data.xpath(
                    './/div[@class="productDescription"]/span[1]/a/text()')
            )
            price = ''.join(
                data.xpath('.//div[@class="priceList"]/span/text()'))
            # FIX: use a raw string — '\.' in a plain literal is an invalid
            # escape sequence (DeprecationWarning; an error in newer Pythons).
            price = re.sub(r'\.', ',', price)
            counter -= 1
            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.detail_item = 'http://zixo.pl' + id.strip()
            s.drm = SearchResult.DRM_LOCKED
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search koobe.pl, following pagination, and yield SearchResult objects.'''
    br = browser()
    page_no = 1
    remaining = max_results
    while remaining:
        with closing(br.open('http://www.koobe.pl/s,p,' + str(page_no) + ',szukaj/fraza:' + urllib.quote(query), timeout=timeout)) as f:
            page = html.fromstring(f.read().decode('utf-8'))
            for entry in page.xpath('//div[@class="seach_result"]/div[@class="result"]'):
                if remaining <= 0:
                    break
                detail = ''.join(entry.xpath('.//div[@class="cover"]/a/@href'))
                if not detail:
                    continue
                cover = ''.join(entry.xpath('.//div[@class="cover"]/a/img/@src'))
                price = ''.join(entry.xpath('.//span[@class="current_price"]/text()'))
                title = ''.join(entry.xpath('.//h2[@class="title"]/a/text()'))
                author = ', '.join(entry.xpath('.//h3[@class="book_author"]/a/text()'))
                formats = ', '.join(entry.xpath('.//div[@class="formats"]/div/div/@title'))
                remaining -= 1
                s = SearchResult()
                s.cover_url = 'http://koobe.pl/' + cover
                s.title = title.strip()
                s.author = author.strip()
                s.price = price
                s.detail_item = 'http://koobe.pl' + detail[1:]
                s.formats = formats.upper()
                s.drm = SearchResult.DRM_UNLOCKED
                yield s
            # No 'next' arrow means this was the last page.
            if not page.xpath('//div[@class="site_bottom"]//a[@class="right"]'):
                break
            page_no += 1
def search(self, query, max_results=10, timeout=60):
    '''Search ebooki.allegro.pl, following pagination, and yield results.'''
    br = browser()
    page_no = 1
    remaining = max_results
    while remaining:
        with closing(br.open('http://ebooki.allegro.pl/szukaj?fraza=' + urllib.quote(query) + '&strona=' + str(page_no), timeout=timeout)) as f:
            page = html.fromstring(f.read().decode('utf-8'))
            for entry in page.xpath('//div[@class="listing-list"]/div[@class="listing-list-item"]'):
                if remaining <= 0:
                    break
                detail = ''.join(entry.xpath('.//div[@class="listing-cover-wrapper"]/a/@href'))
                if not detail:
                    continue
                cover = ''.join(entry.xpath('.//div[@class="listing-cover-wrapper"]/a/img/@src'))
                title = ''.join(entry.xpath('.//div[@class="listing-info"]/div[1]/a/text()'))
                author = ', '.join(entry.xpath('.//div[@class="listing-info"]/div[2]/a/text()'))
                price = ''.join(entry.xpath('.//div[@class="book-price"]/text()'))
                formats = ', '.join(entry.xpath('.//div[@class="listing-buy-formats"]//div[@class="devices-wrapper"]/span[@class="device-label"]/span/text()'))
                remaining -= 1
                s = SearchResult()
                s.cover_url = 'http://ebooki.allegro.pl/' + cover
                s.title = title.strip()
                s.author = author.strip()
                s.price = price
                s.detail_item = 'http://ebooki.allegro.pl/' + detail[1:]
                s.formats = formats.upper()
                s.drm = SearchResult.DRM_UNLOCKED
                yield s
            # Stop once there is no 'next page' arrow.
            if not page.xpath('//a[@class="paging-arrow right-paging-arrow"]'):
                break
            page_no += 1
def search(self, query, max_results=20, timeout=60):
    '''Search publio.pl, following pagination, and yield SearchResult objects.'''
    br = browser()
    remaining = max_results
    page_no = 1
    while remaining:
        with closing(br.open('http://www.publio.pl/e-booki,strona' + str(page_no) + '.html?q=' + quote(query), timeout=timeout)) as f:  # noqa
            page = html.fromstring(f.read())
            for entry in page.xpath('//div[@class="products-list"]//div[@class="product-tile"]'):
                if remaining <= 0:
                    break
                detail = ''.join(entry.xpath('.//a[@class="product-tile-cover"]/@href'))
                if not detail:
                    continue
                cover = ''.join(entry.xpath('.//img[@class="product-tile-cover-photo"]/@src'))
                title = ''.join(entry.xpath('.//span[@class="product-tile-title-long"]/text()'))
                author = ', '.join(entry.xpath('.//span[@class="product-tile-author"]/a/text()'))
                price = ''.join(entry.xpath('.//div[@class="product-tile-price-wrapper "]/a/ins/text()'))
                # The cover's alt text ends with ' - ebook <formats>'.
                formats = ''.join(entry.xpath('.//a[@class="product-tile-cover"]/img/@alt')).split(' - ebook ')[1]
                remaining -= 1
                s = SearchResult()
                s.cover_url = 'http://www.publio.pl' + cover
                s.title = title.strip()
                s.author = author
                s.price = price
                s.detail_item = 'http://www.publio.pl' + detail.strip()
                s.formats = formats.upper().strip()
                yield s
            if not page.xpath('boolean(//a[@class="next"])'):
                break
            page_no += 1
def search(self, query, max_results=10, timeout=60):
    '''Search swiatebookow.pl, following pagination, and yield results.'''
    br = browser()
    page_no = 1
    remaining = max_results
    while remaining:
        with closing(br.open('https://www.swiatebookow.pl/ebooki/?q=' + quote(query) + '&page=' + str(page_no), timeout=timeout)) as f:
            page = html.fromstring(f.read().decode('utf-8'))
            for entry in page.xpath('//div[@class="category-item-container"]//div[@class="book-large"]'):
                if remaining <= 0:
                    break
                detail = ''.join(entry.xpath('./a/@href'))
                if not detail:
                    continue
                cover = ''.join(entry.xpath('.//div[@class="cover-xs"]/img/@src'))
                # Whole and fractional price parts are separate nodes.
                price = ''.join(entry.xpath('.//span[@class="item-price"]/text()') + entry.xpath('.//span[@class="sub-price"]/text()'))
                title = ''.join(entry.xpath('.//h3/text()'))
                author = ', '.join(entry.xpath('.//div[@class="details"]/p/a/text()'))
                remaining -= 1
                s = SearchResult()
                s.cover_url = 'https://www.swiatebookow.pl' + cover
                s.title = title.strip()
                s.author = author.strip()
                s.price = price
                s.detail_item = 'https://www.swiatebookow.pl' + detail
                s.drm = SearchResult.DRM_UNLOCKED
                yield s
            if not page.xpath('//div[@class="paging_bootstrap pagination"]//a[@class="next"]'):
                break
            page_no += 1
def search(self, query, max_results=10, timeout=60):
    '''Search legimi.pl (current layout) and yield SearchResult objects.'''
    url = 'https://www.legimi.pl/ebooki/?sort=score&searchphrase=' + quote_plus(query)
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        page = html.fromstring(f.read())
        for entry in page.xpath('//div[@class="book-search row auto-clear"]/div'):
            if remaining <= 0:
                break
            detail = ''.join(entry.xpath('.//div[@class="panel-body"]/a/@href'))
            if not detail:
                continue
            cover = ''.join(entry.xpath('.//div[@class="img-content"]/img/@data-src'))
            title = ''.join(entry.xpath('.//a[@class="book-title clampBookTitle"]/text()'))
            author = ' '.join(entry.xpath('.//div[@class="authors-container clampBookAuthors"]/a/text()'))
            remaining -= 1
            s = SearchResult()
            s.cover_url = cover
            s.title = title.strip()
            s.author = author.strip()
            s.detail_item = 'https://www.legimi.pl' + detail.strip()
            s.drm = SearchResult.DRM_UNLOCKED
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search whsmith.co.uk's ebook department and yield SearchResult objects.'''
    url = ('https://www.whsmith.co.uk/search?keywordCategoryId=wc_dept_ebooks&results=60'
           '&page=1&keywords=' + quote(query))
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        page = html.fromstring(f.read())
        for entry in page.xpath('//li[@class="product"]'):
            if remaining <= 0:
                break
            detail = ''.join(entry.xpath('./a[@class="product_image_wrap"]/@href'))
            if not detail:
                continue
            detail = 'https://www.whsmith.co.uk' + detail
            cover = ''.join(entry.xpath('.//img[@class="product_image"]/@src'))
            title = ''.join(entry.xpath('.//h4[@class="product_title"]/text()'))
            author = ', '.join(entry.xpath('.//span[@class="product_second"]/text()'))
            price = ''.join(entry.xpath('.//span[@class="price"]/text()'))
            remaining -= 1
            s = SearchResult()
            s.cover_url = cover
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.drm = SearchResult.DRM_LOCKED
            s.detail_item = detail
            s.formats = 'ePub'
            yield s
def search(self, query, max_results=20, timeout=60):
    '''Search escapemagazine.pl and yield SearchResult objects (PDF only).'''
    url = 'http://www.escapemagazine.pl/wyszukiwarka?query=' + urllib.quote_plus(query)
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        page = html.fromstring(f.read())
        for entry in page.xpath('//div[@class="item item_short"]'):
            if remaining <= 0:
                break
            detail = ''.join(entry.xpath('.//h2[@class="title"]/a[1]/@href'))
            if not detail:
                continue
            title = ''.join(entry.xpath('.//h2[@class="title"]/a[1]/text()'))
            author = ''.join(entry.xpath('.//div[@class="author"]/text()'))
            price = ''.join(entry.xpath('.//span[@class="price_now"]/strong/text()')) + ' zł'
            cover = ''.join(entry.xpath('.//img[@class="cover"]/@src'))
            remaining -= 1
            s = SearchResult()
            s.cover_url = cover
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.detail_item = 'http://www.escapemagazine.pl' + detail.strip()
            s.drm = SearchResult.DRM_UNLOCKED
            s.formats = 'PDF'
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search bubok.pt's reseller endpoint and yield SearchResult objects.'''
    url = 'http://www.bubok.pt/resellers/calibre_search/' + urllib.quote_plus(query)
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        page = html.fromstring(f.read())
        for entry in page.xpath('//div[contains(@class, "libro")]'):
            if remaining <= 0:
                break
            # The endpoint returns each field in its own plain-text div.
            detail = ''.join(entry.xpath('.//div[@class="url"]/text()'))
            title = ''.join(entry.xpath('.//div[@class="titulo"]/text()'))
            author = ''.join(entry.xpath('.//div[@class="autor"]/text()'))
            price = ''.join(entry.xpath('.//div[@class="precio"]/text()'))
            formats = ''.join(entry.xpath('.//div[@class="formatos"]/text()'))
            cover = ''.join(entry.xpath('.//div[@class="portada"]/text()'))
            remaining -= 1
            s = SearchResult()
            s.title = title.strip()
            s.author = author.strip()
            s.detail_item = detail.strip()
            s.price = price.strip()
            s.drm = SearchResult.DRM_UNLOCKED
            s.formats = formats.strip()
            s.cover_url = cover.strip()
            yield s