def search(self, query, max_results=10, timeout=60):
    '''Search the Waterstones ebook catalog and yield SearchResult objects.'''
    url = ('http://www.waterstones.com/waterstonesweb/simpleSearch.do?'
           'simpleSearchString=ebook+' + urllib2.quote(query))
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for pane in doc.xpath('//div[contains(@class, "results-pane")]'):
            if remaining <= 0:
                break
            detail = ''.join(pane.xpath('./div/div/h2/a/@href')).strip()
            if not detail:
                continue
            cover_url = ''.join(pane.xpath('.//div[@class="image"]/a/img/@src'))
            # Covers are sometimes site-relative; make them absolute.
            if not cover_url.startswith("http"):
                cover_url = 'http://www.waterstones.com' + cover_url
            title = ''.join(pane.xpath('./div/div/h2/a/text()'))
            author = ', '.join(pane.xpath('.//p[@class="byAuthor"]/a/text()'))
            price = ''.join(pane.xpath('.//p[@class="price"]/span[@class="priceRed2"]/text()'))
            # DRM and format flags come from the product-format table cells.
            drm = pane.xpath('boolean(.//td[@headers="productFormat" and contains(., "DRM")])')
            pdf = pane.xpath('boolean(.//td[@headers="productFormat" and contains(., "PDF")])')
            epub = pane.xpath('boolean(.//td[@headers="productFormat" and contains(., "EPUB")])')
            remaining -= 1
            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.drm = SearchResult.DRM_LOCKED if drm else SearchResult.DRM_UNKNOWN
            s.detail_item = detail
            formats = []
            if epub:
                formats.append('ePub')
            if pdf:
                formats.append('PDF')
            s.formats = ', '.join(formats)
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search woblink.com and yield SearchResult objects.

    The store paginates at 10 results by default; a larger page (20 or 30)
    is requested when the caller asks for more.
    '''
    url = ('http://woblink.com/katalog-e-book?query=' +
           urllib.quote_plus(query.encode('utf-8')))
    if max_results > 10:
        if max_results > 20:
            url += '&limit=30'
        else:
            url += '&limit=20'
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[@class="book-item backgroundmix"]'):
            if counter <= 0:
                break
            id = ''.join(data.xpath('.//td[@class="w10 va-t mYHaveItYes"]/a[1]/@href'))
            if not id:
                continue
            cover_url = ''.join(data.xpath('.//td[@class="w10 va-t mYHaveItYes"]/a[1]/img/@src'))
            title = ''.join(data.xpath('.//h2[@class="title"]/a[1]/text()'))
            author = ', '.join(data.xpath('.//td[@class="va-t"]/h3/a/text()'))
            price = ''.join(data.xpath('.//div[@class="prices"]/span[1]/strong/span/text()'))
            # FIX: use str.replace instead of re.sub with the non-raw '\.'
            # pattern (an invalid escape sequence, a SyntaxWarning on modern
            # Python). The replacement (decimal point -> comma) is identical.
            price = price.replace('.', ',')
            # Available formats are encoded in the icon file names.
            formats = [form[8:-4].split('.')[0] for form in data.xpath('.//p[3]/img/@src')]
            s = SearchResult()
            s.cover_url = 'http://woblink.com' + cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price + ' zł'
            s.detail_item = id.strip()
            if 'epub_drm' in formats:
                s.drm = SearchResult.DRM_LOCKED
                s.formats = 'EPUB'
                counter -= 1
                yield s
            elif 'pdf' in formats:
                s.drm = SearchResult.DRM_LOCKED
                s.formats = 'PDF'
                counter -= 1
                yield s
            else:
                s.drm = SearchResult.DRM_UNLOCKED
                # 'MOBI_nieb' is the store's blue MOBI icon; normalize its name.
                if 'MOBI_nieb' in formats:
                    formats.remove('MOBI_nieb')
                    formats.append('MOBI')
                s.formats = ', '.join(formats).upper()
                counter -= 1
                yield s
def search(self, query, max_results=10, timeout=60):
    '''Search woblink.com and yield SearchResult objects.

    Pages of 20 or 30 results are requested when the caller asks for more
    than the default 10.
    '''
    url = ('http://woblink.com/katalog-e-book?query=' +
           urllib.quote_plus(query.encode('utf-8')))
    if max_results > 10:
        if max_results > 20:
            url += '&limit=30'
        else:
            url += '&limit=20'
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[@class="book-item backgroundmix"]'):
            if counter <= 0:
                break
            id = ''.join(data.xpath('.//td[@class="w10 va-t mYHaveItYes"]/a[1]/@href'))
            if not id:
                continue
            cover_url = ''.join(data.xpath('.//td[@class="w10 va-t mYHaveItYes"]/a[1]/img/@src'))
            title = ''.join(data.xpath('.//h2[@class="title"]/a[1]/text()'))
            author = ', '.join(data.xpath('.//td[@class="va-t"]/h3/a/text()'))
            price = ''.join(data.xpath('.//div[@class="prices"]/span[1]/strong/span/text()'))
            # FIX: str.replace instead of re.sub('\.', ...) — the non-raw '\.'
            # is an invalid escape sequence (SyntaxWarning on modern Python);
            # replacing the literal dot with a comma is behaviorally identical.
            price = price.replace('.', ',')
            # Format names are encoded in the icon file names.
            formats = [form[8:-4].split('.')[0] for form in data.xpath('.//p[3]/img/@src')]
            s = SearchResult()
            s.cover_url = 'http://woblink.com' + cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price + ' zł'
            s.detail_item = id.strip()
            if 'epub_drm' in formats:
                s.drm = SearchResult.DRM_LOCKED
                s.formats = 'EPUB'
                counter -= 1
                yield s
            elif 'pdf' in formats:
                s.drm = SearchResult.DRM_LOCKED
                s.formats = 'PDF'
                counter -= 1
                yield s
            else:
                s.drm = SearchResult.DRM_UNLOCKED
                # Normalize the store's blue-MOBI icon name.
                if 'MOBI_nieb' in formats:
                    formats.remove('MOBI_nieb')
                    formats.append('MOBI')
                s.formats = ', '.join(formats).upper()
                counter -= 1
                yield s
def search(self, query, max_results=10, timeout=60):
    '''Yield SearchResult objects from the ebook.nl advanced search page.'''
    url = ('http://www.ebook.nl/store/advanced_search_result.php?keywords=' +
           urllib2.quote(query))
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for row in doc.xpath('//table[contains(@class, "productListing")]/tr'):
            if remaining <= 0:
                break
            anchors = row.xpath('./td/div[@class="prodImage"]/a')
            if not anchors:
                continue
            anchor = anchors[0]
            # The product id is the last path segment of the product link,
            # with any trailing query string stripped.
            id = ''.join(anchor.xpath('./@href')).strip()
            id = id[id.rfind('/') + 1:]
            qpos = id.rfind('?')
            if qpos > 0:
                id = id[:qpos]
            if not id:
                continue
            cover_url = 'http://www.ebook.nl/store/' + ''.join(anchor.xpath('./img/@src'))
            title = ''.join(anchor.xpath('./img/@title')).strip()
            author = ''.join(row.xpath('./td/div[@class="prodTitle"]/h3/a/text()')).strip()
            price = ''.join(row.xpath('./td/div[@class="prodTitle"]/b/text()'))
            pdf = row.xpath('boolean(./td/div[@class="prodTitle"]/'
                            'p[contains(text(), "Bestandsformaat: Pdf")])')
            epub = row.xpath('boolean(./td/div[@class="prodTitle"]/'
                             'p[contains(text(), "Bestandsformaat: ePub")])')
            nodrm = row.xpath('boolean(./td/div[@class="prodTitle"]/'
                              'p[contains(text(), "zonder DRM") or'
                              ' contains(text(), "watermerk")])')
            remaining -= 1
            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.drm = SearchResult.DRM_UNLOCKED if nodrm else SearchResult.DRM_LOCKED
            s.detail_item = id
            formats = []
            if epub:
                formats.append('ePub')
            if pdf:
                formats.append('PDF')
            s.formats = ','.join(formats)
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search ebook.nl (productListing table layout); yields SearchResult objects.'''
    url = ('http://www.ebook.nl/store/advanced_search_result.php?keywords=' +
           urllib2.quote(query))
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//table[contains(@class, "productListing")]/tr'):
            if counter <= 0:
                break
            details = data.xpath('./td/div[@class="prodImage"]/a')
            if not details:
                continue
            details = details[0]
            # Product id = final path component of the link, minus any query string.
            href = ''.join(details.xpath('./@href')).strip()
            id = href[href.rfind('/') + 1:]
            q_at = id.rfind('?')
            if q_at > 0:
                id = id[:q_at]
            if not id:
                continue
            pdf = data.xpath('boolean(./td/div[@class="prodTitle"]/'
                             'p[contains(text(), "Bestandsformaat: Pdf")])')
            epub = data.xpath('boolean(./td/div[@class="prodTitle"]/'
                              'p[contains(text(), "Bestandsformaat: ePub")])')
            nodrm = data.xpath('boolean(./td/div[@class="prodTitle"]/'
                               'p[contains(text(), "zonder DRM") or'
                               ' contains(text(), "watermerk")])')
            counter -= 1
            s = SearchResult()
            s.cover_url = 'http://www.ebook.nl/store/' + ''.join(details.xpath('./img/@src'))
            s.title = ''.join(details.xpath('./img/@title')).strip()
            s.author = ''.join(data.xpath('./td/div[@class="prodTitle"]/h3/a/text()')).strip()
            s.price = ''.join(data.xpath('./td/div[@class="prodTitle"]/b/text()'))
            if nodrm:
                s.drm = SearchResult.DRM_UNLOCKED
            else:
                s.drm = SearchResult.DRM_LOCKED
            s.detail_item = id
            formats = []
            if epub:
                formats.append('ePub')
            if pdf:
                formats.append('PDF')
            s.formats = ','.join(formats)
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search e-knigi.net; yields SearchResult objects.

    The store only indexes Cyrillic text, so queries without Cyrillic
    characters are rejected up front.
    '''
    uquery = type(u'')(query.strip(), 'utf-8')
    if re.search(u'^[а-яА-Я\\d\\s]{2,}$', uquery) is None:
        return
    base_url = 'http://e-knigi.net'
    url = base_url + '/virtuemart?page=shop.browse&search_category=0&search_limiter=anywhere&keyword=' + quote(query)
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        # A single hit redirects straight to the product detail view.
        for data in doc.xpath('//div[@class="prod_details"]'):
            s = SearchResult()
            s.cover_url = ''.join(data.xpath('.//div[@class="vm_main_info clearfix"]/div[@class="lf"]/a/img/@src')).strip()
            s.title = ''.join(data.xpath('.//div[@class="vm_main_info clearfix"]/div[@class="lf"]/a/img/@alt')).strip()
            s.author = ''.join(data.xpath('.//div[@class="td_bg clearfix"]/div[@class="gk_product_tab"]/div/table/tr[3]/td[2]/text()')).strip()
            s.price = ''.join(data.xpath('.//span[@class="productPrice"]/text()')).strip()
            s.detail_item = url
            s.drm = SearchResult.DRM_UNLOCKED
            yield s
            return
        # Otherwise walk the regular result list.
        for data in doc.xpath('//div[@class="browseProductContainer"]'):
            if remaining <= 0:
                break
            id = ''.join(data.xpath('.//a[1]/@href')).strip()
            if not id:
                continue
            title = ''.join(data.xpath('.//a[@class="gk_vm_product_image"]/img/@title')).strip()
            author = ''.join(data.xpath('.//div[@style="float:left;width:90%"]/b/text()')).strip().replace('Автор: ', '')
            # Keep only results that actually mention the query.
            if query.lower() not in title.lower() and query.lower() not in author.lower():
                continue
            remaining -= 1
            s = SearchResult()
            s.cover_url = ''.join(data.xpath('.//a[@class="gk_vm_product_image"]/img/@src')).strip()
            s.title = title
            s.author = author
            s.price = ''.join(data.xpath('.//span[@class="productPrice"]/text()')).strip()
            s.detail_item = base_url + id
            s.drm = SearchResult.DRM_UNLOCKED
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search e-knigi.net; only Cyrillic queries are meaningful there.

    FIX: the Python-2-only builtin ``unicode`` was replaced with the
    ``type(u'')`` alias (as the sibling implementation in this file already
    does), which resolves to ``unicode`` on Python 2 and ``str`` on Python 3.
    '''
    # Reject queries without at least two Cyrillic/digit/space characters.
    uquery = type(u'')(query.strip(), 'utf-8')
    reObj = re.search(u'^[а-яА-Я\\d\\s]{2,}$', uquery)
    if not reObj:
        return
    base_url = 'http://e-knigi.net'
    url = base_url + '/virtuemart?page=shop.browse&search_category=0&search_limiter=anywhere&keyword=' + urllib2.quote(query)
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        # If the store finds only one product it opens the detail view directly.
        for data in doc.xpath('//div[@class="prod_details"]'):
            s = SearchResult()
            s.cover_url = ''.join(data.xpath('.//div[@class="vm_main_info clearfix"]/div[@class="lf"]/a/img/@src')).strip()
            s.title = ''.join(data.xpath('.//div[@class="vm_main_info clearfix"]/div[@class="lf"]/a/img/@alt')).strip()
            s.author = ''.join(data.xpath('.//div[@class="td_bg clearfix"]/div[@class="gk_product_tab"]/div/table/tr[3]/td[2]/text()')).strip()
            s.price = ''.join(data.xpath('.//span[@class="productPrice"]/text()')).strip()
            s.detail_item = url
            s.drm = SearchResult.DRM_UNLOCKED
            yield s
            return
        # Regular search-result list.
        for data in doc.xpath('//div[@class="browseProductContainer"]'):
            if counter <= 0:
                break
            id = ''.join(data.xpath('.//a[1]/@href')).strip()
            if not id:
                continue
            title = ''.join(data.xpath('.//a[@class="gk_vm_product_image"]/img/@title')).strip()
            author = ''.join(data.xpath('.//div[@style="float:left;width:90%"]/b/text()')).strip().replace('Автор: ', '')
            # Drop results that mention the query in neither title nor author.
            if title.lower().find(query.lower()) == -1 and author.lower().find(query.lower()) == -1:
                continue
            counter -= 1
            s = SearchResult()
            s.cover_url = ''.join(data.xpath('.//a[@class="gk_vm_product_image"]/img/@src')).strip()
            s.title = title
            s.author = author
            s.price = ''.join(data.xpath('.//span[@class="productPrice"]/text()')).strip()
            s.detail_item = base_url + id
            s.drm = SearchResult.DRM_UNLOCKED
            yield s
def search(self, query, max_results=10, timeout=60):
    """Search LibGen for books.

    Mirror links are not direct downloads, so they are deliberately not
    exposed through ``s.downloads``.
    """
    debug_print = partial(module_debug_print, 'LibgenStore:search:')
    debug_print('search:query = ', query)
    libgen_results = self.libgen.search(query)
    # Slicing clamps automatically, so no explicit min() with len() is needed.
    for result in libgen_results.results[:max_results]:
        debug_print('result.title = ', result.title)
        # Calibre displays only one download source, so use the first mirror.
        for mirror in result.mirrors[:1]:
            debug_print('result.mirror.url = ', mirror.url)
            s = SearchResult()
            s.store_name = PLUGIN_NAME
            s.cover_url = result.image_url
            s.title = '{} ({}, {}{})'.format(
                result.title, result.language, mirror.size, mirror.unit)
            s.author = result.authors
            s.price = '0.00'
            s.detail_item = result.md5
            s.drm = SearchResult.DRM_UNLOCKED
            s.formats = mirror.format
            s.plugin_author = PLUGIN_AUTHORS
            debug_print('s = ', s)
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Query the bubok.es reseller search endpoint; yield SearchResult objects.'''
    url = 'http://www.bubok.es/resellers/calibre_search/' + urllib.quote_plus(query)
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        # Every field is published as plain text inside its own div.
        for entry in doc.xpath('//div[contains(@class, "libro")]'):
            if remaining <= 0:
                break
            remaining -= 1
            s = SearchResult()
            s.title = ''.join(entry.xpath('.//div[@class="titulo"]/text()')).strip()
            s.author = ''.join(entry.xpath('.//div[@class="autor"]/text()')).strip()
            s.detail_item = ''.join(entry.xpath('.//div[@class="url"]/text()')).strip()
            s.price = ''.join(entry.xpath('.//div[@class="precio"]/text()')).strip()
            s.drm = SearchResult.DRM_UNLOCKED
            s.formats = ''.join(entry.xpath('.//div[@class="formatos"]/text()')).strip()
            s.cover_url = ''.join(entry.xpath('.//div[@class="portada"]/text()')).strip()
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search ebookshoppe.com; author and formats are fetched per result.'''
    url = 'http://www.ebookshoppe.com/search.php?search_query=' + urllib2.quote(query)
    br = browser()
    # The site expects a same-site referer header.
    br.addheaders = [("Referer", "http://www.ebookshoppe.com/")]
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for item in doc.xpath('//ul[@class="ProductList"]/li'):
            if remaining <= 0:
                break
            detail_url = ''.join(item.xpath('./div[@class="ProductDetails"]/'
                                            'strong/a/@href')).strip()
            if not detail_url:
                continue
            remaining -= 1
            s = SearchResult()
            s.cover_url = ''.join(item.xpath('./div[@class="ProductImage"]/a/img/@src'))
            s.title = ''.join(item.xpath('./div[@class="ProductDetails"]/strong/a/text()')).strip()
            s.price = ''.join(item.xpath('./div[@class="ProductPriceRating"]/em/text()'))
            s.drm = SearchResult.DRM_UNLOCKED
            s.detail_item = detail_url
            # Author and formats need a second request to the detail page;
            # entries without an author are dropped.
            self.get_author_and_formats(s, timeout)
            if not s.author:
                continue
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search the Barnes & Noble ebook store; results are Nook format.'''
    url = 'http://www.barnesandnoble.com/s/%s?keyword=%s&store=ebook' % (
        query.replace(' ', '-'), urllib.quote_plus(query))
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for result in doc.xpath('//ul[contains(@class, "result-set")]/li[contains(@class, "result")]'):
            if remaining <= 0:
                break
            detail = ''.join(result.xpath('.//div[contains(@class, "image-bounding-box")]/a/@href'))
            if not detail:
                continue
            remaining -= 1
            s = SearchResult()
            s.cover_url = ''.join(result.xpath('.//img[contains(@class, "product-image")]/@src'))
            s.title = ''.join(result.xpath('.//a[@class="title"]//text()')).strip()
            s.author = ', '.join(result.xpath('.//a[@class="contributor"]//text()')).strip()
            s.price = ''.join(result.xpath('.//div[@class="price-format"]//span[contains(@class, "price")]/text()')).strip()
            s.detail_item = detail.strip()
            s.drm = SearchResult.DRM_UNKNOWN
            s.formats = 'Nook'
            yield s
def search_kobo(query, max_results=10, timeout=60, write_html_to=None):
    # Scrape kobobooks.com search results and yield SearchResult objects.
    # Results missing a title, author or detail URL are silently skipped.
    from css_selectors import Select
    url = 'https://www.kobobooks.com/search/search.html?q=' + quote_plus(query)
    raw = read_url(url, timeout=timeout)
    # Optionally dump the fetched page so the parser can be debugged offline.
    if write_html_to is not None:
        with open(write_html_to, 'w') as f:
            f.write(raw)
    doc = html.fromstring(raw)
    select = Select(doc)
    for i, item in enumerate(select('.result-items .item-wrapper.book')):
        if i == max_results:
            break
        # Cover: first matching <img>; protocol-relative URLs get https.
        # The for/else pattern leaves the variable None when nothing matched.
        for img in select('.item-image img[src]', item):
            cover_url = img.get('src')
            if cover_url.startswith('//'):
                cover_url = 'https:' + cover_url
            break
        else:
            cover_url = None
        # Title text and the detail URL both come from the h2.title element.
        for p in select('h2.title', item):
            title = etree.tostring(p, method='text', encoding='unicode').strip()
            for a in select('a[href]', p):
                url = a.get('href')
                break
            else:
                url = None
            break
        else:
            title = None
        # Append the subtitle, if any, to the title.
        if title:
            for p in select('p.subtitle', item):
                title += ' - ' + etree.tostring(
                    p, method='text', encoding='unicode').strip()
        authors = []
        for a in select('.contributors a.contributor-name', item):
            authors.append(
                etree.tostring(a, method='text', encoding='unicode').strip())
        authors = authors_to_string(authors)
        for p in select('p.price', item):
            price = etree.tostring(p, method='text', encoding='unicode').strip()
            break
        else:
            price = None
        # Only emit complete results; price and cover may legitimately be None.
        if title and authors and url:
            s = SearchResult()
            s.cover_url = cover_url
            s.title = title
            s.author = authors
            s.price = price
            s.detail_item = url
            s.formats = 'EPUB'
            s.drm = SearchResult.DRM_UNKNOWN
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search the BeWrite store; table rows after the header are books.'''
    url = ('http://www.bewrite.net/mm5/merchant.mvc?Search_Code=B&Screen=SRCH&Search='
           + urllib2.quote(query))
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for row in doc.xpath('//div[@id="content"]//table/tr[position() > 1]'):
            if remaining <= 0:
                break
            link = ''.join(row.xpath('.//a/@href'))
            if not link:
                continue
            # The second cell reads "<title>by <author>".
            heading = ''.join(row.xpath('./td[2]//text()'))
            title, _, author = heading.partition('by ')
            remaining -= 1
            s = SearchResult()
            s.cover_url = ''
            s.title = title.strip()
            s.author = author.strip()
            s.price = ''
            s.detail_item = link.strip()
            s.drm = SearchResult.DRM_UNLOCKED
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search ebooks.foyles.co.uk; all its results carry DRM.'''
    url = 'http://ebooks.foyles.co.uk/catalog/search/?query=' + urllib2.quote(query)
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for item in doc.xpath('//div[@class="doc-item"]'):
            if remaining <= 0:
                break
            detail_path = ''.join(item.xpath('.//p[@class="doc-cover"]/a/@href')).strip()
            if not detail_path:
                continue
            remaining -= 1
            s = SearchResult()
            s.detail_item = 'http://ebooks.foyles.co.uk' + detail_path
            s.cover_url = ''.join(item.xpath('.//p[@class="doc-cover"]/a/img/@src'))
            s.title = ''.join(item.xpath('.//span[@class="title"]/a/text()')).strip()
            s.author = ', '.join(item.xpath('.//span[@class="author"]/span[@class="author"]/text()')).strip()
            s.price = ''.join(item.xpath('.//span[@itemprop="price"]/text()')).strip()
            s.drm = SearchResult.DRM_LOCKED
            s.formats = ''.join(item.xpath('.//p[@class="doc-meta-format"]/span[last()]/text()'))
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search Project Gutenberg's mobile catalog; all books are free, DRM-free.'''
    url = ('http://m.gutenberg.org/ebooks/search.mobile/?default_prefix=all'
           '&sort_order=title&query=' + urllib.quote_plus(query))
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for book in doc.xpath('//ol[@class="results"]/li[@class="booklink"]'):
            if remaining <= 0:
                break
            remaining -= 1
            # Strip the ".mobile" suffix to obtain the canonical detail URL.
            detail = ''.join(book.xpath('./a/@href')).split('.mobile')[0]
            s = SearchResult()
            s.cover_url = ''
            s.detail_item = detail.strip()
            s.title = ''.join(book.xpath('.//span[@class="title"]/text()')).strip()
            s.author = ''.join(book.xpath('.//span[@class="subtitle"]/text()')).strip()
            s.price = '$0.00'
            s.drm = SearchResult.DRM_UNLOCKED
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search beam-shop.de; the store sells DRM-free books only.'''
    url = 'https://www.beam-shop.de/search?saltFieldLimitation=all&sSearch=' + urllib2.quote(query)
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for box in doc.xpath('//div[contains(@class, "product--box")]'):
            if remaining <= 0:
                break
            detail = ''.join(box.xpath('./div/div[contains(@class, "product--info")]/a/@href')).strip()
            if not detail:
                continue
            # srcset lists several candidate images; the first URL is used.
            srcset = ''.join(box.xpath('./div/div[contains(@class, "product--info")]/a//img/@srcset'))
            cover = srcset.split(',')[0].strip() if srcset else srcset
            author = box.xpath('.//a[@class="product--author"]/text()')[0].strip()
            title = box.xpath('.//a[@class="product--title"]/text()')[0].strip()
            price = box.xpath('.//div[@class="product--price"]/span/text()')[0].strip()
            remaining -= 1
            s = SearchResult()
            s.cover_url = cover
            s.title = title
            s.author = author
            s.price = price
            s.drm = SearchResult.DRM_UNLOCKED
            s.detail_item = detail
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search zixo.pl; every product in this store is DRM locked.'''
    url = ('http://zixo.pl/wyszukiwarka/?search=' +
           urllib.quote(query.encode('utf-8')) + '&product_type=0')
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[@class="productInline"]'):
            if counter <= 0:
                break
            id = ''.join(data.xpath('.//a[@class="productThumb"]/@href'))
            if not id:
                continue
            cover_url = ''.join(data.xpath('.//a[@class="productThumb"]/img/@src'))
            title = ''.join(data.xpath('.//a[@class="title"]/text()'))
            author = ','.join(data.xpath('.//div[@class="productDescription"]/span[1]/a/text()'))
            price = ''.join(data.xpath('.//div[@class="priceList"]/span/text()'))
            # FIX: str.replace instead of re.sub with the non-raw '\.' pattern
            # (an invalid escape sequence, SyntaxWarning on modern Python).
            # The result — decimal point becomes a comma — is identical.
            price = price.replace('.', ',')
            counter -= 1
            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.detail_item = 'http://zixo.pl' + id.strip()
            s.drm = SearchResult.DRM_LOCKED
            yield s
def search(self, query, max_results=25, timeout=60):
    '''Search ebookpoint.pl for DRM-free ebooks (query sent as ISO-8859-2).'''
    encoded = quote_plus(query.decode('utf-8').encode('iso-8859-2'))
    url = ('http://ebookpoint.pl/search?qa=&szukaj=' + encoded +
           '&serwisyall=0&wprzyg=0&wsprzed=1&wyczerp=0&formaty=em-p')
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for entry in doc.xpath('//ul[@class="list"]/li'):
            if remaining <= 0:
                break
            detail = ''.join(entry.xpath('./a/@href'))
            if not detail:
                continue
            formats = ', '.join(entry.xpath('.//ul[@class="book-type book-type-points"]//span[@class="popup"]/span/text()'))
            price = ''.join(entry.xpath('.//p[@class="price price-incart"]/a/ins/text()|.//p[@class="price price-add"]/a/text()'))
            remaining -= 1
            s = SearchResult()
            s.cover_url = ''.join(entry.xpath('.//p[@class="cover"]/img/@data-src'))
            s.title = ''.join(entry.xpath('.//div[@class="book-info"]/h3/a/text()')).strip()
            s.author = ''.join(entry.xpath('.//p[@class="author"]//text()')).strip()
            # Polish prices use a decimal comma.
            s.price = re.sub(r'\.', ',', price)
            s.detail_item = detail.strip()
            s.drm = SearchResult.DRM_UNLOCKED
            s.formats = formats.upper()
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search Google Books; only entries with a Preview/Read link are kept.'''
    url = 'http://www.google.com/search?tbm=bks&q=' + urllib.quote_plus(query)
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for result in doc.xpath('//ol[@id="rso"]/li'):
            if remaining <= 0:
                break
            link = ''.join(result.xpath('.//h3/a/@href'))
            if not link:
                continue
            title = ''.join(result.xpath('.//h3/a//text()'))
            authors = result.xpath('.//span[@class="f"]//a//text()')
            # The trailing "author" link is actually the Preview/Read marker;
            # entries without it are not viewable books, so skip them.
            if not (authors and authors[-1].strip().lower() in ('preview', 'read')):
                continue
            authors = authors[:-1]
            remaining -= 1
            s = SearchResult()
            s.title = title.strip()
            s.author = ', '.join(authors).strip()
            s.detail_item = link.strip()
            s.drm = SearchResult.DRM_UNKNOWN
            yield s
def search(self, query, max_results=12, timeout=60):
    # Search virtualo.pl; the f=format_id filter restricts results to ebook
    # formats. DRM status is inferred from the "Znak wodny" (watermark) /
    # "Brak" (none) protection labels.
    url = 'http://virtualo.pl/?q=' + urllib.quote(
        query) + '&f=format_id:4,6,3'
    br = browser()
    # Matches the protection labels that mean "no hard DRM".
    no_drm_pattern = re.compile(r'Znak wodny|Brak')
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath(
            '//div[@id="content"]//div[@class="list_box list_box_border"]'
        ):
            if counter <= 0:
                break
            # Link minus any trailing '?q=' query-echo suffix.
            id = ''.join(
                data.xpath('.//div[@class="list_middle_left"]//a/@href')
            ).split(r'?q=')[0]
            if not id:
                continue
            price = ''.join(
                data.xpath(
                    './/span[@class="price"]/text() | .//span[@class="price abbr"]/text()'
                ))
            cover_url = ''.join(
                data.xpath(
                    './/div[@class="list_middle_left"]//a//img/@src'))
            title = ''.join(
                data.xpath(
                    './/div[@class="list_title list_text_left"]/a/text()'))
            author = ', '.join(
                data.xpath(
                    './/div[@class="list_authors list_text_left"]/a/text()'
                ))
            # Format names are taken from the icon file names, e.g. '..._epub.png'.
            formats = [
                form.split('_')[-1].replace('.png', '')
                for form in data.xpath(
                    './/div[@style="width:55%;float:left;text-align:left;height:18px;"]//a/span/img/@src'
                )
            ]
            # Truthy match object => watermark/no protection => DRM-free.
            nodrm = no_drm_pattern.search(''.join(
                data.xpath(
                    './/div[@style="width:45%;float:right;text-align:right;height:18px;"]//span[@class="prompt_preview"]/text()'
                )))
            counter -= 1
            s = SearchResult()
            # Drop any size-variant suffix after '.jpg' from the cover URL.
            s.cover_url = cover_url.split('.jpg')[0] + '.jpg'
            s.title = title.strip()
            s.author = author.strip()
            s.price = price + ' zł'
            # Keep only the site-relative part of the link before any
            # embedded absolute 'http://' remainder.
            s.detail_item = 'http://virtualo.pl' + id.strip().split(
                'http://')[0]
            s.formats = ', '.join(formats).upper()
            s.drm = SearchResult.DRM_UNLOCKED if nodrm else SearchResult.DRM_LOCKED
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search Haodoo through the scraperwiki view; all results are free.

    FIX: ``dict.has_key()`` was removed in Python 3; the ``in`` operator is
    equivalent and works on both Python 2 and 3.
    '''
    print("search!")
    # NOTE(review): .decode assumes a byte-string query (Python 2) — confirm
    # callers under Python 3 where str has no decode().
    q = query.decode('utf-8')
    url = "https://views.scraperwiki.com/run/haodooscraperview/?" + urlencode(
        {"q": q})
    print(url)
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        json_doc = f.read()
        if len(json_doc) > 0:
            result = json.loads(json_doc)
            for volume in result:
                s = SearchResult()
                s.title = volume['title']
                s.detail_item = volume['url']
                s.price = '$0.00'
                s.drm = SearchResult.DRM_UNLOCKED
                # Each 'type' entry maps a format name to its download link.
                if 'type' in volume and len(volume["type"]):
                    for t in volume["type"]:
                        s.downloads[t['type']] = t['link']
                s.formats = ', '.join(s.downloads.keys())
                yield s
        else:
            print("scrape nothing.")
def search(self, query, max_results=10, timeout=60):
    '''Parse ebook.nl search results marked up with schema.org/Book.'''
    url = ('http://www.ebook.nl/store/advanced_search_result.php?keywords=' +
           urllib2.quote(query))
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for book in doc.xpath('//div[@id="books"]/div[@itemtype="http://schema.org/Book"]'):
            if remaining <= 0:
                break
            detail = ''.join(book.xpath('./meta[@itemprop="url"]/@content')).strip()
            if not detail:
                continue
            author = ''.join(book.xpath('./span[@itemprop="author"]/a/text()')).strip()
            # Treat the placeholder single-space author as empty.
            if author == ' ':
                author = ''
            remaining -= 1
            s = SearchResult()
            s.cover_url = 'http://www.ebook.nl/store/' + ''.join(book.xpath('.//img[@itemprop="image"]/@src'))
            s.title = ''.join(book.xpath('./span[@itemprop="name"]/a/text()')).strip()
            s.author = author.strip()
            s.price = ''.join(book.xpath('.//span[@itemprop="price"]//text()'))
            s.drm = SearchResult.DRM_UNKNOWN
            s.detail_item = detail
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search Haodoo via the scraperwiki view; results are free downloads.

    FIX: ``dict.has_key()`` was removed in Python 3; replaced with the
    ``in`` operator, which behaves identically on Python 2 and 3.
    '''
    print("search!")
    # NOTE(review): .decode assumes a byte-string query (Python 2) — verify
    # under Python 3 where str has no decode().
    q = query.decode('utf-8')
    url = "https://views.scraperwiki.com/run/haodooscraperview/?" + urlencode(
        {"q": q})
    print(url)
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        json_doc = f.read()
        if len(json_doc) > 0:
            result = json.loads(json_doc)
            for volume in result:
                s = SearchResult()
                s.title = volume['title']
                s.detail_item = volume['url']
                s.price = '$0.00'
                s.drm = SearchResult.DRM_UNLOCKED
                # Each 'type' entry maps a format name to a download link.
                if 'type' in volume and len(volume["type"]):
                    for t in volume["type"]:
                        s.downloads[t['type']] = t['link']
                s.formats = ', '.join(s.downloads.keys())
                yield s
        else:
            print("scrape nothing.")
def search(self, query, max_results=20, timeout=60):
    '''Search escapemagazine.pl; results are DRM-free PDFs.'''
    url = 'http://www.escapemagazine.pl/wyszukiwarka?query=' + urllib.quote_plus(query)
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for item in doc.xpath('//div[@class="item item_short"]'):
            if remaining <= 0:
                break
            detail = ''.join(item.xpath('.//h2[@class="title"]/a[1]/@href'))
            if not detail:
                continue
            remaining -= 1
            s = SearchResult()
            s.cover_url = ''.join(item.xpath('.//img[@class="cover"]/@src'))
            s.title = ''.join(item.xpath('.//h2[@class="title"]/a[1]/text()')).strip()
            s.author = ''.join(item.xpath('.//div[@class="author"]/text()')).strip()
            s.price = ''.join(item.xpath('.//span[@class="price_now"]/strong/text()')) + ' zł'
            s.detail_item = 'http://www.escapemagazine.pl' + detail.strip()
            s.drm = SearchResult.DRM_UNLOCKED
            s.formats = 'PDF'
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search the BeWrite store catalog and yield SearchResult objects.'''
    url = 'http://www.bewrite.net/mm5/merchant.mvc?Search_Code=B&Screen=SRCH&Search=' + urllib2.quote(query)
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        # tr[position() > 1] skips the header row of the results table.
        for data in doc.xpath('//div[@id="content"]//table/tr[position() > 1]'):
            if counter <= 0:
                break
            href = ''.join(data.xpath('.//a/@href'))
            if not href:
                continue
            # The second cell holds "<title>by <author>"; split on the marker.
            title, _sep, author = ''.join(data.xpath('./td[2]//text()')).partition('by ')
            counter -= 1
            s = SearchResult()
            s.cover_url = ''
            s.title = title.strip()
            s.author = author.strip()
            s.price = ''
            s.detail_item = href.strip()
            s.drm = SearchResult.DRM_UNLOCKED
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Query beam-shop.de (a DRM-free store) and yield SearchResult objects.'''
    url = 'https://www.beam-shop.de/search?saltFieldLimitation=all&sSearch=' + urllib2.quote(query)
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for product in doc.xpath('//div[contains(@class, "product--box")]'):
            if counter <= 0:
                break
            link = ''.join(product.xpath('./div/div[contains(@class, "product--info")]/a/@href')).strip()
            if not link:
                continue
            cover_url = ''.join(product.xpath('./div/div[contains(@class, "product--info")]/a//img/@srcset'))
            if cover_url:
                # srcset lists multiple candidates; take the first image URL.
                cover_url = cover_url.split(',')[0].strip()
            author = product.xpath('.//a[@class="product--author"]/text()')[0].strip()
            title = product.xpath('.//a[@class="product--title"]/text()')[0].strip()
            price = product.xpath('.//div[@class="product--price"]/span/text()')[0].strip()
            counter -= 1
            s = SearchResult()
            s.cover_url = cover_url
            s.title = title
            s.author = author
            s.price = price
            s.drm = SearchResult.DRM_UNLOCKED
            s.detail_item = link
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search Amazon's Kindle store and yield SearchResult objects.

    Non-Kindle rows (e.g. author pages) appear even in digital-text-only
    searches and are filtered out, as are rows without an ASIN.
    '''
    # Percent-encode the query by hand: non-ASCII characters become
    # backslash escapes first, then '\x' is rewritten to '%'.
    url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+')
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        data_xpath = '//div[contains(@class, "prod")]'
        format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
        asin_xpath = '@name'
        cover_xpath = './/img[@class="productImage"]/@src'
        title_xpath = './/h3[@class="newaps"]/a//text()'
        author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]/text()'
        price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'
        for data in doc.xpath(data_xpath):
            if counter <= 0:
                break
            # Skip results that are not actually Kindle books.
            format_ = ''.join(data.xpath(format_xpath))
            if 'kindle' not in format_.lower():
                continue
            # An ASIN is required to reference the book later.
            asin = data.xpath(asin_xpath)
            if asin:
                asin = asin[0]
            else:
                continue
            cover_url = ''.join(data.xpath(cover_xpath))
            title = ''.join(data.xpath(title_xpath))
            author = ''.join(data.xpath(author_xpath))
            try:
                # Best-effort trim of the localized "by "-style prefix.
                if self.author_article:
                    author = author.split(self.author_article, 1)[1].split(" (")[0]
            except Exception:
                # FIX: was a bare 'except:', which also swallows SystemExit
                # and KeyboardInterrupt; keep the raw author on parse failure.
                pass
            price = ''.join(data.xpath(price_xpath))
            counter -= 1
            s = SearchResult()
            s.cover_url = cover_url.strip()
            s.title = title.strip()
            s.author = author.strip()
            s.price = price.strip()
            s.detail_item = asin.strip()
            s.drm = SearchResult.DRM_UNKNOWN
            s.formats = 'Kindle'
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search millsandboon.co.uk ebooks; all results are DRM locked.'''
    base_url = 'https://www.millsandboon.co.uk'
    # FIX: the query string previously began with a doubled '??', which makes
    # the first parameter name '?format' instead of 'format'.
    url = base_url + '/search.aspx?format=ebook&searchText=' + urllib2.quote(query)
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//article[contains(@class, "group")]'):
            if counter <= 0:
                break
            id_ = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/@href')).strip()
            if not id_:
                continue
            cover_url = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/img/@src'))
            title = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/img/@alt')).strip()
            author = ''.join(data.xpath('.//a[@class="author"]/text()'))
            price = ''.join(data.xpath('.//div[@class="type-wrapper"]/ul/li[child::span[text()="eBook"]]/a/text()'))
            format_ = ''.join(data.xpath('.//p[@class="doc-meta-format"]/span[last()]/text()'))
            counter -= 1
            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.detail_item = id_
            s.drm = SearchResult.DRM_LOCKED
            s.formats = format_
            yield s
def search(self, query, max_results=20, timeout=60):
    '''Search publio.pl, following pagination until max_results are collected.'''
    br = browser()
    counter = max_results
    page = 1

    while counter:
        page_url = ('http://www.publio.pl/e-booki,strona' + str(page) +
                    '.html?q=' + urllib.quote(query))
        with closing(br.open(page_url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for node in doc.xpath('//div[@class="item"]'):
                if counter <= 0:
                    break
                id = ''.join(node.xpath('.//div[@class="img"]/a/@href'))
                if not id:
                    continue

                cover_url = ''.join(node.xpath('.//div[@class="img"]/a/img/@data-original'))
                title = ''.join(node.xpath('.//div[@class="img"]/a/@title'))
                subtitle = ''.join(node.xpath('.//div[@class="desc"]/h5//text()'))
                if subtitle:
                    title = title + '. ' + subtitle

                # When the last detail row is labelled "Seria:", append the
                # series name to the title.
                last_label = ''.join(node.xpath('./div[@class="desc"]/div[@class="detailShortList"]/div[last()]/span/text()')).strip()
                if last_label == 'Seria:':
                    series = ''.join(node.xpath('./div[@class="desc"]/div[@class="detailShortList"]/div[last()]/a/@title'))
                    title = title + ' (seria ' + series + ')'

                author = ', '.join(node.xpath('./div[@class="desc"]/div[@class="detailShortList"]/div[@class="row"][1]/a/@title'))
                # Prefer the discounted price (<ins>); otherwise the plain text.
                price = ''.join(node.xpath('.//div[@class="priceBox tk-museo-slab"]/ins/text()'))
                if not price:
                    price = ''.join(node.xpath('.//div[@class="priceBox tk-museo-slab"]/text()')).strip()
                formats = ', '.join(node.xpath('.//div[@class="formats"]/a/img/@alt'))

                counter -= 1

                s = SearchResult()
                s.cover_url = 'http://www.publio.pl' + cover_url
                s.title = title.strip()
                s.author = author
                s.price = price
                s.detail_item = 'http://www.publio.pl' + id.strip()
                if 'DRM' in formats:
                    s.drm = SearchResult.DRM_LOCKED
                else:
                    s.drm = SearchResult.DRM_UNLOCKED
                s.formats = formats.replace(' DRM', '').strip()

                yield s

            # Stop when there is no "next page" link.
            if not doc.xpath('boolean(//a[@class="next"])'):
                break
            page += 1
def search(self, query, max_results=10, timeout=60):
    '''Search bookoteka.pl for DRM-free ebooks.'''
    url = ('http://bookoteka.pl/list?search=' + urllib.quote_plus(query) +
           '&cat=1&hp=1&type=1')
    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for node in doc.xpath('//li[@class="EBOOK"]'):
            if counter <= 0:
                break
            detail_href = ''.join(node.xpath('.//a[@class="item_link"]/@href'))
            if not detail_href:
                continue

            cover_url = ''.join(node.xpath('.//a[@class="item_link"]/img/@src'))
            title = ''.join(node.xpath('.//div[@class="shelf_title"]/a/text()'))
            author = ''.join(node.xpath('.//div[@class="shelf_authors"][1]/text()'))
            # Comma as the decimal separator (Polish convention).
            price = ''.join(node.xpath('.//span[@class="EBOOK"]/text()')).replace('.', ',')
            formats = ', '.join(node.xpath('.//a[@class="fancybox protected"]/text()'))

            counter -= 1

            s = SearchResult()
            s.cover_url = 'http://bookoteka.pl' + cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.detail_item = 'http://bookoteka.pl' + detail_href.strip()
            s.drm = SearchResult.DRM_UNLOCKED
            s.formats = formats.strip()

            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search the WHSmith ebook department.'''
    url = ('https://www.whsmith.co.uk/search?keywordCategoryId=wc_dept_ebooks&results=60'
           '&page=1&keywords=' + quote(query))
    br = browser()

    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for product in doc.xpath('//li[@class="product"]'):
            if remaining <= 0:
                break
            href = ''.join(product.xpath('./a[@class="product_image_wrap"]/@href'))
            if not href:
                continue
            # Product links are site-relative; make them absolute.
            href = 'https://www.whsmith.co.uk' + href

            cover_url = ''.join(product.xpath('.//img[@class="product_image"]/@src'))
            title = ''.join(product.xpath('.//h4[@class="product_title"]/text()'))
            author = ', '.join(product.xpath('.//span[@class="product_second"]/text()'))
            price = ''.join(product.xpath('.//span[@class="price"]/text()'))

            remaining -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.drm = SearchResult.DRM_LOCKED
            s.detail_item = href
            s.formats = 'ePub'

            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search the WHSmith ebook department (plain-HTTP variant).'''
    url = ('http://www.whsmith.co.uk/search?keywordCategoryId=wc_dept_ebooks&results=60'
           '&page=1&keywords=' + urllib2.quote(query))
    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for entry in doc.xpath('//li[@class="product"]'):
            if counter <= 0:
                break
            detail_href = ''.join(entry.xpath('./a[@class="product_image_wrap"]/@href'))
            if not detail_href:
                continue
            # Make the site-relative product link absolute.
            detail_href = 'http://www.whsmith.co.uk' + detail_href

            cover_url = ''.join(entry.xpath('.//img[@class="product_image"]/@src'))
            title = ''.join(entry.xpath('.//h4[@class="product_title"]/text()'))
            author = ', '.join(entry.xpath('.//span[@class="product_second"]/text()'))
            price = ''.join(entry.xpath('.//span[@class="price"]/text()'))

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.drm = SearchResult.DRM_LOCKED
            s.detail_item = detail_href
            s.formats = 'ePub'

            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search Google Books; results carry only title, author and detail link.'''
    url = 'http://www.google.com/search?tbm=bks&q=' + urllib.quote_plus(query)
    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for node in doc.xpath('//ol[@id="rso"]/li'):
            if counter <= 0:
                break
            detail_href = ''.join(node.xpath('.//h3/a/@href'))
            if not detail_href:
                continue

            title = ''.join(node.xpath('.//h3/a//text()'))
            # The author line also carries navigation links ("preview",
            # "read", "more editions"); trim those from the tail.
            authors = node.xpath('.//div[@class="f"]//a//text()')
            while authors and authors[-1].strip().lower() in ('preview', 'read', 'more editions'):
                authors.pop()
            if not authors:
                continue
            author = ', '.join(authors)

            counter -= 1

            s = SearchResult()
            s.title = title.strip()
            s.author = author.strip()
            s.detail_item = detail_href.strip()
            s.drm = SearchResult.DRM_UNKNOWN

            yield s
def search(self, query, max_results=25, timeout=60):
    '''Search ebookpoint.pl (query is re-encoded to ISO-8859-2 for the site).'''
    encoded = quote_plus(query.decode('utf-8').encode('iso-8859-2'))
    url = ('http://ebookpoint.pl/search?qa=&szukaj=' + encoded +
           '&serwisyall=0&wprzyg=0&wsprzed=1&wyczerp=0&formaty=em-p')
    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for node in doc.xpath('//ul[@class="list"]/li'):
            if counter <= 0:
                break
            detail_href = ''.join(node.xpath('./a/@href'))
            if not detail_href:
                continue

            formats = ', '.join(node.xpath('.//ul[@class="book-type book-type-points"]//span[@class="popup"]/span/text()'))
            cover_url = ''.join(node.xpath('.//p[@class="cover"]/img/@data-src'))
            title = ''.join(node.xpath('.//div[@class="book-info"]/h3/a/text()'))
            author = ''.join(node.xpath('.//p[@class="author"]//text()'))
            # Either the "in cart" or the "add to cart" price is present.
            price = ''.join(node.xpath('.//p[@class="price price-incart"]/a/ins/text()|.//p[@class="price price-add"]/a/text()'))

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            # Comma as the decimal separator (Polish convention).
            s.price = re.sub(r'\.', ',', price)
            s.detail_item = detail_href.strip()
            s.drm = SearchResult.DRM_UNLOCKED
            s.formats = formats.upper()

            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search bubok.pt via its calibre-specific reseller endpoint.'''
    url = 'http://www.bubok.pt/resellers/calibre_search/' + urllib.quote_plus(query)
    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[contains(@class, "libro")]'):
            if counter <= 0:
                break
            id = ''.join(data.xpath('.//div[@class="url"]/text()'))
            # ROBUSTNESS FIX: skip malformed entries with no detail URL
            # instead of yielding a result with an empty detail_item;
            # this matches the convention of the sibling store plugins.
            if not id:
                continue

            title = ''.join(data.xpath('.//div[@class="titulo"]/text()'))
            author = ''.join(data.xpath('.//div[@class="autor"]/text()'))
            price = ''.join(data.xpath('.//div[@class="precio"]/text()'))
            formats = ''.join(data.xpath('.//div[@class="formatos"]/text()'))
            cover = ''.join(data.xpath('.//div[@class="portada"]/text()'))

            counter -= 1

            s = SearchResult()
            s.title = title.strip()
            s.author = author.strip()
            s.detail_item = id.strip()
            s.price = price.strip()
            s.drm = SearchResult.DRM_UNLOCKED
            s.formats = formats.strip()
            s.cover_url = cover.strip()

            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search bookoteka.pl (all titles on this store are DRM-free).'''
    url = ('http://bookoteka.pl/list?search=' + urllib.quote_plus(query) +
           '&cat=1&hp=1&type=1')
    br = browser()

    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for entry in doc.xpath('//li[@class="EBOOK"]'):
            if remaining <= 0:
                break
            href = ''.join(entry.xpath('.//a[@class="item_link"]/@href'))
            if not href:
                continue

            cover_url = ''.join(entry.xpath('.//a[@class="item_link"]/img/@src'))
            title = ''.join(entry.xpath('.//div[@class="shelf_title"]/a/text()'))
            author = ''.join(entry.xpath('.//div[@class="shelf_authors"][1]/text()'))
            price = ''.join(entry.xpath('.//span[@class="EBOOK"]/text()'))
            # Use the Polish decimal comma in the displayed price.
            price = price.replace('.', ',')
            formats = ', '.join(entry.xpath('.//a[@class="fancybox protected"]/text()'))

            remaining -= 1

            s = SearchResult()
            s.cover_url = 'http://bookoteka.pl' + cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.detail_item = 'http://bookoteka.pl' + href.strip()
            s.drm = SearchResult.DRM_UNLOCKED
            s.formats = formats.strip()

            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search the Kobo store.'''
    url = 'http://www.kobobooks.com/search/search.html?q=' + urllib.quote_plus(query)
    br = browser()

    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for item in doc.xpath('//ul[contains(@class, "flowview-items")]/li'):
            if remaining <= 0:
                break
            href = ''.join(item.xpath('./a[contains(@class, "block-link")]/@href'))
            if not href:
                continue
            # Drop the leading '/' so the path can be joined to the store root.
            href = href[1:]

            price = ''.join(item.xpath('.//a[contains(@class, "primary-button")]//text()'))
            # Cover URLs are protocol-relative; prefix a scheme.
            cover_url = 'http:%s' % ''.join(item.xpath('.//img[1]/@src'))
            title = ''.join(item.xpath('.//p[contains(@class, "flowview-item-title")]//text()'))

            remaining -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.price = price.strip()
            s.detail_item = 'http://store.kobobooks.com/' + href.strip()
            s.formats = 'EPUB'
            s.drm = SearchResult.DRM_UNKNOWN

            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search ebookshoppe.com; author/formats come from each detail page.'''
    url = 'http://www.ebookshoppe.com/search.php?search_query=' + quote(query)
    br = browser()
    # The site rejects requests that lack a Referer header.
    br.addheaders = [("Referer", "http://www.ebookshoppe.com/")]

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for node in doc.xpath('//ul[@class="ProductList"]/li'):
            if counter <= 0:
                break
            detail_href = ''.join(node.xpath('./div[@class="ProductDetails"]/strong/a/@href')).strip()
            if not detail_href:
                continue

            cover_url = ''.join(node.xpath('./div[@class="ProductImage"]/a/img/@src'))
            title = ''.join(node.xpath('./div[@class="ProductDetails"]/strong/a/text()'))
            price = ''.join(node.xpath('./div[@class="ProductPriceRating"]/em/text()'))

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.price = price
            s.drm = SearchResult.DRM_UNLOCKED
            s.detail_item = detail_href
            # Requires a second request per result; drop items with no author.
            self.get_author_and_formats(s, timeout)
            if not s.author:
                continue

            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search woblink.com via a forked JS-capable browser worker.'''
    url = 'http://woblink.com/katalog-ebooki?query=' + urllib.quote_plus(query.encode('utf-8'))
    # The site pages at 10/20/30 results; request the smallest page size
    # that can satisfy max_results.
    if max_results > 10:
        if max_results > 20:
            url += '&limit=30'
        else:
            url += '&limit=20'

    counter = max_results

    # The catalogue is rendered by JavaScript, so fetch it in a worker
    # process running a JS-capable browser.
    try:
        results = fork_job(js_browser, 'get_results', (url, timeout,),
                           module_is_source_code=True)
    except WorkerError as e:
        raise Exception('Could not get results: %s' % e.orig_tb)

    doc = html.fromstring(strip_encoding_declarations(results['result']))
    for node in doc.xpath('//div[@class="nw_katalog_lista_ksiazka"]'):
        if counter <= 0:
            break
        detail_href = ''.join(node.xpath('.//div[@class="nw_katalog_lista_ksiazka_okladka nw_okladka"]/a[1]/@href'))
        if not detail_href:
            continue

        cover_url = ''.join(node.xpath('.//div[@class="nw_katalog_lista_ksiazka_okladka nw_okladka"]/a[1]/img/@src'))
        title = ''.join(node.xpath('.//h2[@class="nw_katalog_lista_ksiazka_detale_tytul"]/a[1]/text()'))
        author = ', '.join(node.xpath('.//p[@class="nw_katalog_lista_ksiazka_detale_autor"]/a/text()'))
        price = ''.join(node.xpath('.//div[@class="nw_opcjezakupu_cena"]/text()'))
        formats = ', '.join(node.xpath('.//p[@class="nw_katalog_lista_ksiazka_detale_format"]/span/text()'))

        s = SearchResult()
        s.cover_url = 'http://woblink.com' + cover_url
        s.title = title.strip()
        s.author = author.strip()
        s.price = price + ' zł'
        s.detail_item = detail_href.strip()
        s.formats = formats

        if 'DRM' in formats:
            s.drm = SearchResult.DRM_LOCKED
        else:
            s.drm = SearchResult.DRM_UNLOCKED
        counter -= 1
        yield s
def search(self, query, max_results=10, timeout=60):
    '''Search the Sony Reader store.'''
    url = 'http://ebookstore.sony.com/search?keyword=%s' % urllib.quote_plus(query)
    br = browser()

    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for item in doc.xpath('//div[contains(@class, "searchResult")]/'
                              'descendant::li[contains(@class, "hreview")]'):
            if remaining <= 0:
                break

            # Price = currency symbol title attribute + amount text.
            curr = ''.join(item.xpath('descendant::div[@class="pricing"]/descendant::*[@class="currency"]/@title')).strip()
            amt = ''.join(item.xpath('descendant::div[@class="pricing"]/descendant::*[@class="amount"]/text()')).strip()

            s = SearchResult()
            if curr and amt:
                s.price = curr + ' ' + amt
            else:
                s.price = _('Not Available')

            title_nodes = item.xpath('descendant::h3[@class="item"]')
            if not title_nodes:
                continue
            title = etree.tostring(title_nodes[0], method='text', encoding=unicode)
            if not title:
                continue
            s.title = title.strip()

            s.author = ''.join(item.xpath('descendant::li[contains(@class, "author")]/'
                                          'a[@class="fn"]/text()')).strip()
            if not s.author:
                continue

            detail_url = ''.join(item.xpath('descendant::h3[@class="item"]'
                                            '/descendant::a[@class="fn" and @href]/@href'))
            if not detail_url:
                continue
            s.detail_item = detail_url

            remaining -= 1

            cover_url = ''.join(item.xpath('descendant::li[@class="coverart"]/'
                                           'descendant::img[@src]/@src'))
            if cover_url:
                # Normalize protocol-relative and site-relative cover URLs.
                if cover_url.startswith('//'):
                    cover_url = 'http:' + cover_url
                elif cover_url.startswith('/'):
                    cover_url = 'http://ebookstore.sony.com' + cover_url
                s.cover_url = url_slash_cleaner(cover_url)

            s.drm = SearchResult.DRM_UNKNOWN
            s.formats = 'Sony'

            yield s
def search(query, max_results=10, timeout=60):
    '''Search Smashwords; all titles on this store are DRM-free.'''
    url = 'https://www.smashwords.com/books/search?query=' + urllib.parse.quote(query)
    br = browser()
    try:
        # Hide adult content from the results.
        br.set_simple_cookie('adultOff', 'erotica', '.smashwords.com', path='/')
    except AttributeError:
        pass  # old version of mechanize

    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[@id="pageContent"]//div[@class="library-book"]'):
            if remaining <= 0:
                break
            # Re-parse the fragment so absolute xpaths work on it alone.
            data = html.fromstring(html.tostring(data))

            id = None
            id_a = ''.join(data.xpath('//a[contains(@class, "library-title")]/@href'))
            if id_a:
                id = id_a.split('/')[-1]
            if not id:
                continue

            cover_url = ''.join(data.xpath('//img[contains(@class, "book-list-image")]/@src'))
            title = ''.join(data.xpath('.//a[contains(@class, "library-title")]/text()'))
            author = ''.join(data.xpath('.//a[@itemprop="author"]//text()'))

            # The subnote block mixes price with other text; pull out the
            # first token after "Price:".
            price = ''.join(data.xpath('.//div[@class="subnote"]//text()'))
            if 'Price:' in price:
                try:
                    price = price.partition('Price:')[2]
                    price = re.sub(r'\s', ' ', price).strip()
                    price = price.split(' ')[0].strip()
                except Exception:
                    price = 'Unknown'
            if price == 'Free!':
                price = '$0.00'

            remaining -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.detail_item = '/books/view/' + id.strip()
            s.drm = SearchResult.DRM_UNLOCKED

            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search woblink.com publications; MOBI editions are yielded as a
    separate DRM-free result before the remaining (DRM) formats.'''
    url = 'http://woblink.com/publication?query=' + urllib.quote_plus(query.encode('utf-8'))
    # The site pages at 10/20/30 results per request.
    if max_results > 10:
        if max_results > 20:
            url += '&limit=30'
        else:
            url += '&limit=20'
    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for node in doc.xpath('//div[@class="book-item"]'):
            if counter <= 0:
                break
            detail_href = ''.join(node.xpath('.//td[@class="w10 va-t"]/a[1]/@href'))
            if not detail_href:
                continue

            cover_url = ''.join(node.xpath('.//td[@class="w10 va-t"]/a[1]/img/@src'))
            title = ''.join(node.xpath('.//h2[@class="title"]/a[1]/text()'))
            author = ', '.join(node.xpath('.//p[@class="author"]/a/text()'))
            price = ''.join(node.xpath('.//div[@class="prices"]/span[1]/span/text()'))
            price = re.sub(r'\.', ',', price)
            # Format names are derived from icon file names,
            # e.g. '.../format_epub_xx.png' -> 'epub'.
            formats = [form[8:-4].split('_')[0]
                       for form in node.xpath('.//p[3]/img/@src')]

            s = SearchResult()
            s.cover_url = 'http://woblink.com' + cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price + ' zł'
            s.detail_item = detail_href.strip()

            # MOBI should be sent first, as its own DRM-free result.
            if 'MOBI' in formats:
                t = copy.copy(s)
                t.title += ' MOBI'
                t.drm = SearchResult.DRM_UNLOCKED
                t.formats = 'MOBI'
                formats.remove('MOBI')
                counter -= 1
                yield t

            # Then the remaining formats (if any) as a single DRM result.
            if formats:
                if 'epub' in formats:
                    formats.remove('epub')
                    formats.append('WOBLINK')
                    # E-Ink capable editions also expose a plain EPUB.
                    if 'E Ink' in node.xpath('.//div[@class="prices"]/img/@title'):
                        formats.insert(0, 'EPUB')
                s.drm = SearchResult.DRM_LOCKED
                s.formats = ', '.join(formats).upper()
                counter -= 1
                yield s
def search(self, query, max_results=10, timeout=60):
    '''Search the cdp.pl ebook catalogue, following pagination.'''
    br = browser()
    page = 1

    counter = max_results
    while counter:
        page_url = (u'https://cdp.pl/ksiazki/e-book.html?q=' +
                    urllib.quote_plus(query) + '&p=' + str(page))
        with closing(br.open(page_url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for node in doc.xpath('//ul[@class="grid-of-products"]/li'):
                if counter <= 0:
                    break
                id = ''.join(node.xpath('.//a[@class="product-image"]/@href'))
                if not id:
                    continue
                # Filter out non-book products that leak into the listing.
                if 'ksiazki' not in id:
                    continue

                cover_url = ''.join(node.xpath('.//a[@class="product-image"]/img/@data-src'))
                title = ''.join(node.xpath('.//h3[1]/a/@title'))
                # Whole + fractional price parts are separate elements.
                price = (''.join(node.xpath('.//span[@class="custom_price"]/text()')) +
                         ',' +
                         ''.join(node.xpath('.//span[@class="custom_price"]/sup/text()')))

                # Author and format live only on the product detail page.
                author = ''
                formats = ''
                with closing(br.open(id.strip(), timeout=timeout / 4)) as nf:
                    idata = html.fromstring(nf.read())
                    author = ', '.join(idata.xpath('.//ul[@class="film-data"]/li[1]/p/text()'))
                    formats = idata.xpath('//div[@class="product-attributes-container"][2]/ul/li/span/text()')[-1]

                counter -= 1

                s = SearchResult()
                s.cover_url = cover_url
                s.title = title.strip()
                s.author = author
                s.price = price + ' zł'
                s.detail_item = id.strip()
                s.drm = SearchResult.DRM_UNLOCKED
                s.formats = formats.upper().strip()

                yield s

            if not doc.xpath('//span[@class="next-page"]/a'):
                break
            page += 1
def search(self, query, max_results=10, timeout=60):
    '''This store was shut down; yield a single placeholder result saying so.'''
    result = SearchResult()
    # The title is rendered as HTML by the store dialog, hence the <br>.
    result.title = 'Amazon required that this<br>store be permanently closed.'
    result.author = ''
    result.price = ''
    result.detail_item = ''
    result.drm = SearchResult.DRM_UNKNOWN
    yield result
def search(self, query, max_results=10, timeout=60):
    '''Search the woblink.com catalogue (static-HTML variant).'''
    url = 'http://woblink.com/katalog-e-book?query=' + urllib.quote_plus(query.encode('utf-8'))
    # The site pages at 10/20/30 results per request.
    if max_results > 10:
        if max_results > 20:
            url += '&limit=30'
        else:
            url += '&limit=20'
    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for node in doc.xpath('//div[@class="nw_katalog_lista_ksiazka"]'):
            if counter <= 0:
                break
            detail_href = ''.join(node.xpath('.//div[@class="nw_katalog_lista_ksiazka_okladka nw_okladka"]/a[1]/@href'))
            if not detail_href:
                continue

            cover_url = ''.join(node.xpath('.//div[@class="nw_katalog_lista_ksiazka_okladka nw_okladka"]/a[1]/img/@src'))
            title = ''.join(node.xpath('.//h2[@class="nw_katalog_lista_ksiazka_detale_tytul"]/a[1]/text()'))
            author = ', '.join(node.xpath('.//h3[@class="nw_katalog_lista_ksiazka_detale_autor"]/a/text()'))
            price = ''.join(node.xpath('.//div[@class="nw_katalog_lista_ksiazka_opcjezakupu_cena"]/span/text()'))
            # Comma as the decimal separator (Polish convention).
            price = re.sub(r'\.', ',', price)
            formats = ', '.join(node.xpath('.//p[@class="nw_katalog_lista_ksiazka_detale_formaty"]/span/text()'))

            s = SearchResult()
            s.cover_url = 'http://woblink.com' + cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price + ' zł'
            s.detail_item = detail_href.strip()
            s.formats = formats

            if 'EPUB DRM' in formats:
                s.drm = SearchResult.DRM_LOCKED
            else:
                s.drm = SearchResult.DRM_UNLOCKED
            counter -= 1
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search gandalf.com.pl (query re-encoded to ISO-8859-2), with paging.'''
    counter = max_results
    page = 1
    url = ('http://www.gandalf.com.pl/we/' +
           urllib.quote_plus(query.decode('utf-8').encode('iso8859_2')) + '/bdb')
    br = browser()

    while counter:
        # First page has no page suffix; later pages append page-1.
        if page - 1:
            page_url = url + str(page - 1) + '/#s'
        else:
            page_url = url + '/#s'
        with closing(br.open(page_url, timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for node in doc.xpath('//div[@class="box"]'):
                if counter <= 0:
                    break
                id = ''.join(node.xpath('.//div[@class="info"]/h3/a/@href'))
                if not id:
                    continue

                cover_id = ''.join(node.xpath('.//div[@class="info"]/h3/a/@id'))
                title = ''.join(node.xpath('.//div[@class="info"]/h3/a/@title'))
                # The format name is in parentheses inside the first <p>.
                formats = ''.join(node.xpath('.//div[@class="info"]/p[1]/text()'))
                formats = re.findall(r'\((.*?)\)', formats)[0]
                author = ''.join(node.xpath('.//div[@class="info"]/h4/text() | .//div[@class="info"]/h4/span/text()'))
                price = ''.join(node.xpath('.//div[@class="options"]/h3/text()'))
                price = re.sub('PLN', 'zł', price)
                price = re.sub(r'\.', ',', price)
                drm = node.xpath('boolean(.//div[@class="info" and contains(., "Zabezpieczenie: DRM")])')

                counter -= 1

                s = SearchResult()
                # Cover images live on the image host with 'p' -> 'p_'.
                s.cover_url = ('http://imguser.gandalf.com.pl/' +
                               re.sub('p', 'p_', cover_id) + '.jpg')
                s.title = title.strip()
                s.author = author.strip()
                s.price = price
                s.detail_item = id.strip()
                if drm:
                    s.drm = SearchResult.DRM_LOCKED
                else:
                    s.drm = SearchResult.DRM_UNLOCKED
                s.formats = formats.upper().strip()

                yield s

            if not doc.xpath('boolean(//div[@class="wyszukiwanie_podstawowe_header"]//div[@class="box"])'):
                break
            page += 1
def search(self, query, max_results=10, timeout=60):
    '''Search the Amazon.fr Kindle store.'''
    search_url = 'http://www.amazon.fr/s/?url=search-alias%3Ddigital-text&field-keywords='
    # Escape the query: non-ASCII bytes become %XX escapes.
    url = search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+')
    br = browser()

    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        # Amazon Europe responds in UTF-8, so no explicit decode is needed.
        doc = html.fromstring(f.read())

        for node in doc.xpath('//div[contains(@class, "result") and contains(@class, "product")]'):
            if remaining <= 0:
                break
            # A digital-text search can still return non-Kindle items
            # (e.g. author pages); keep only entries marked as Kindle.
            format = ''.join(node.xpath('.//span[@class="format"]/text()'))
            if 'kindle' not in format.lower():
                continue

            # The ASIN is required to reference the book later.
            asin = ''.join(node.xpath("@name"))

            cover_url = ''.join(node.xpath('.//img[@class="productImage"]/@src'))
            title = ''.join(node.xpath('.//a[@class="title"]/text()'))
            price = ''.join(node.xpath('.//div[@class="newPrice"]/span[contains(@class, "price")]/text()'))
            author = unicode(''.join(node.xpath('.//h3[@class="title"]/span[@class="ptBrand"]/text()')))
            # Strip the French "de " (by) prefix from the author line.
            if author.startswith('de '):
                author = author[3:]

            remaining -= 1

            s = SearchResult()
            s.cover_url = cover_url.strip()
            s.title = title.strip()
            s.author = author.strip()
            s.price = price.strip()
            s.detail_item = asin.strip()
            s.formats = 'Kindle'
            s.drm = SearchResult.DRM_UNKNOWN

            yield s
def search(self, query, max_results=100, timeout=180):
    '''Search e-knjiga.si; each book's detail page is scraped for the
    cover, description and free download links.'''
    quoted = urllib2.quote(query)
    url = 'http://www.e-knjiga.si/rezultati_cover.php?query=' + quoted
    print("will search for: " + quoted + ":\n " + url)
    br = browser()

    with closing(br.open(url, timeout=timeout)) as f:
        tree = etree.HTML(f.read())

        # Each result is rendered as a table with class "zebra".
        for book in tree.xpath("//table[@class='zebra']"):
            print(etree.tostring(book, pretty_print=True, method="html"))
            author = book.find('.//tr/[0]/td/[1]').text
            title = book.find('.//tr/[0]/td/[2]/a').text
            details = 'http://www.e-knjiga.si/' + book.find('.//tr/[0]/td/[2]/a').get("href")

            # Fetch the detail page for cover, description and downloads.
            fo = urllib2.urlopen(details)
            det = etree.HTML(fo.read())
            fo.close()

            table = det.find(".//div[@id='center_container']").find('./table')
            cover = 'http://www.e-knjiga.si/' + table.find('.//tr/[1]/td/[1]/div/img').get("src")
            description = table.find(".//tr/[6]/td[@class='knjige_spremna']").text

            links = []
            for anchor in table.find('.//tr/[7]/td/[1]').iter('a'):
                links.append("http://www.e-knjiga.si/" + anchor.get("href"))

            s = SearchResult()
            s.title = title
            s.author = author
            s.price = "0.00eur"
            s.drm = SearchResult.DRM_UNLOCKED
            # NOTE(review): detail_item is set to the description text rather
            # than the detail URL -- looks suspicious, preserved as-is.
            s.detail_item = description
            for link in links:
                ftype = link.split(".")[-1]
                s.downloads[ftype] = link
                s.formats += ftype
            s.cover_url = cover

            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search nexto.pl, following pagination in steps of 10 results.'''
    url = ('http://www.nexto.pl/szukaj.xml?search-clause=' +
           urllib.parse.quote_plus(query) + '&scid=1015')
    br = browser()
    offset = 0

    counter = max_results
    while counter:
        with closing(br.open(url + '&_offset=' + str(offset), timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for node in doc.xpath('//ul[@class="productslist"]/li'):
                if counter <= 0:
                    break
                id = ''.join(node.xpath('.//div[@class="col-2"]/a/@href'))
                if not id:
                    continue

                price = ''.join(node.xpath('.//strong[@class="nprice"]/text()'))

                # Request a smaller cover thumbnail than the listing uses.
                cover_url = ''.join(node.xpath('.//img[@class="cover"]/@src'))
                cover_url = re.sub(r'%2F', '/', cover_url)
                cover_url = re.sub(r'widthMax=120&heightMax=200',
                                   'widthMax=64&heightMax=64', cover_url)

                title = ''.join(node.xpath('.//a[@class="title"]/text()'))
                title = re.sub(r' – ebook', '', title)
                author = ', '.join(node.xpath('.//div[@class="col-7"]//h4//a/text()'))
                formats = ', '.join(node.xpath('.//ul[@class="formats"]/li//b/text()'))
                # Watermarked ("znak wodny") titles are DRM-free.
                DrmFree = re.search(r'znak',
                                    str(node.xpath('.//ul[@class="formats"]/li//b/@title')))

                counter -= 1

                s = SearchResult()
                if cover_url[:4] == 'http':
                    s.cover_url = cover_url
                else:
                    s.cover_url = 'http://www.nexto.pl' + cover_url
                s.title = title.strip()
                s.author = author.strip()
                s.price = price.strip()
                s.detail_item = id.strip()
                s.drm = SearchResult.DRM_UNLOCKED if DrmFree else SearchResult.DRM_LOCKED
                s.formats = formats.upper().strip()

                yield s

            if not doc.xpath('//div[@class="listnavigator"]//a[@class="next"]'):
                break
            offset += 10
def search(self, query, max_results=10, timeout=60):
    '''Search Waterstones for ebooks.'''
    url = ('http://www.waterstones.com/waterstonesweb/simpleSearch.do?simpleSearchString=ebook+' +
           urllib2.quote(query))
    br = browser()

    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for node in doc.xpath('//div[contains(@class, "results-pane")]'):
            if remaining <= 0:
                break
            detail_href = ''.join(node.xpath('./div/div/h2/a/@href')).strip()
            if not detail_href:
                continue

            cover_url = ''.join(node.xpath('.//div[@class="image"]/a/img/@src'))
            # Cover links may be site-relative.
            if not cover_url.startswith("http"):
                cover_url = 'http://www.waterstones.com' + cover_url
            title = ''.join(node.xpath('./div/div/h2/a/text()'))
            author = ', '.join(node.xpath('.//p[@class="byAuthor"]/a/text()'))
            price = ''.join(node.xpath('.//p[@class="price"]/span[@class="priceRed2"]/text()'))
            # DRM status and formats come from the product-format table cells.
            drm = node.xpath('boolean(.//td[@headers="productFormat" and contains(., "DRM")])')
            pdf = node.xpath('boolean(.//td[@headers="productFormat" and contains(., "PDF")])')
            epub = node.xpath('boolean(.//td[@headers="productFormat" and contains(., "EPUB")])')

            remaining -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            if drm:
                s.drm = SearchResult.DRM_LOCKED
            else:
                s.drm = SearchResult.DRM_UNKNOWN
            s.detail_item = detail_href

            available = []
            if epub:
                available.append('ePub')
            if pdf:
                available.append('PDF')
            s.formats = ', '.join(available)

            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search the Barnes & Noble Nook store.'''
    url = 'http://www.barnesandnoble.com/s/%s?keyword=%s&store=ebook&view=list' % (
        query.replace(' ', '-'), urllib.parse.quote_plus(query))
    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
        doc = html.fromstring(raw)
        for node in doc.xpath('//ol[contains(@class, "result-set")]/li[contains(@class, "result")]'):
            if counter <= 0:
                break
            id = ''.join(node.xpath('.//div[contains(@class, "image-block")]/a/@href'))
            if not id:
                continue

            # Cover URLs are injected by inline JS keyed on the image id;
            # recover them with a regex over the raw page source.
            cover_url = ''
            cover_id = ''.join(node.xpath('.//img[contains(@class, "product-image")]/@id'))
            m = re.search(r"%s'.*?srcUrl: '(?P<iurl>.*?)'.*?}" % cover_id, raw)
            if m:
                cover_url = m.group('iurl')

            title = ''.join(node.xpath('descendant::p[@class="title"]//span[@class="name"]//text()')).strip()
            if not title:
                continue
            author = ', '.join(node.xpath('.//ul[contains(@class, "contributors")]//a[contains(@class, "subtle")]//text()')).strip()
            price = ''.join(node.xpath('.//a[contains(@class, "bn-price")]//text()'))

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price.strip()
            s.detail_item = id.strip()
            s.drm = SearchResult.DRM_UNKNOWN
            s.formats = 'Nook'

            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search the Sony Reader store (variant that absolutizes detail URLs).'''
    url = "http://ebookstore.sony.com/search?keyword=%s" % urllib.quote_plus(query)
    br = browser()

    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for item in doc.xpath('//div[contains(@class, "searchResult")]/'
                              'descendant::li[contains(@class, "hreview")]'):
            if remaining <= 0:
                break

            # Price = currency symbol title attribute + amount text.
            curr = "".join(item.xpath('descendant::div[@class="pricing"]/descendant::*[@class="currency"]/@title')).strip()
            amt = "".join(item.xpath('descendant::div[@class="pricing"]/descendant::*[@class="amount"]/text()')).strip()

            s = SearchResult()
            if curr and amt:
                s.price = curr + " " + amt
            else:
                s.price = _("Not Available")

            title_nodes = item.xpath('descendant::h3[@class="item"]')
            if not title_nodes:
                continue
            title = etree.tostring(title_nodes[0], method="text", encoding=unicode)
            if not title:
                continue
            s.title = title.strip()

            s.author = "".join(item.xpath('descendant::li[contains(@class, "author")]/'
                                          'a[@class="fn"]/text()')).strip()
            if not s.author:
                continue

            detail_url = "".join(item.xpath('descendant::h3[@class="item"]'
                                            '/descendant::a[@class="fn" and @href]/@href'))
            if not detail_url:
                continue
            # Detail links may be protocol-relative.
            if detail_url.startswith("/"):
                detail_url = "http:" + detail_url
            s.detail_item = detail_url

            remaining -= 1

            cover_url = "".join(item.xpath('descendant::li[@class="coverart"]/'
                                           "descendant::img[@src]/@src"))
            if cover_url:
                # Normalize protocol-relative and site-relative cover URLs.
                if cover_url.startswith("//"):
                    cover_url = "http:" + cover_url
                elif cover_url.startswith("/"):
                    cover_url = "http://ebookstore.sony.com" + cover_url
                s.cover_url = url_slash_cleaner(cover_url)

            s.drm = SearchResult.DRM_UNKNOWN
            s.formats = "Sony"

            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search chitanka.info; only Cyrillic queries are accepted by the site.'''
    # Check for Cyrillic symbols before performing the search.
    uquery = unicode(query.strip(), 'utf-8')
    if not re.search(u'^[а-яА-Я\\d\\s]{3,}$', uquery):
        return

    base_url = 'http://chitanka.info'
    url = base_url + '/search?q=' + urllib2.quote(query)
    counter = max_results

    # Search for the book title.
    br = browser()
    try:
        with closing(br.open(url, timeout=timeout)) as f:
            f = unicode(f.read(), 'utf-8')
            doc = html.fromstring(f)

            for node in doc.xpath('//ul[@class="superlist booklist"]/li'):
                if counter <= 0:
                    break
                id = ''.join(node.xpath('.//a[@class="booklink"]/@href')).strip()
                if not id:
                    continue

                counter -= 1

                s = SearchResult()
                s.cover_url = ''.join(node.xpath('.//a[@class="booklink"]/img/@src')).strip()
                s.title = ''.join(node.xpath('.//a[@class="booklink"]/i/text()')).strip()
                s.author = ''.join(node.xpath('.//span[@class="bookauthor"]/a/text()')).strip()
                s.detail_item = id
                s.drm = SearchResult.DRM_UNLOCKED
                # Direct download links; drop the .zip suffix to get the
                # bare file.
                s.downloads['FB2'] = base_url + ''.join(
                    node.xpath('.//a[@class="dl dl-fb2"]/@href')).strip().replace('.zip', '')
                s.downloads['EPUB'] = base_url + ''.join(
                    node.xpath('.//a[@class="dl dl-epub"]/@href')).strip().replace('.zip', '')
                s.downloads['TXT'] = base_url + ''.join(
                    node.xpath('.//a[@class="dl dl-txt"]/@href')).strip().replace('.zip', '')
                s.formats = 'FB2, EPUB, TXT, SFB'

                yield s
    except urllib2.HTTPError as e:
        # A 404 simply means no results; anything else is a real error.
        if e.code == 404:
            return
        else:
            raise
def parse_book_details(self, node):
    '''Build a SearchResult from a single book-details DOM node.'''
    result = SearchResult()
    result.title = text(node, './/*[@itemprop="name"]')
    result.author = text(node, './/*', 'bookdetails__authorname')
    result.price = text(node, './/*', 'bookdetails__price')
    result.cover_url = text(node, './/img[@itemprop="image"]', '', '/@src')
    result.formats = text(node, './/*', 'book_info__format', '/span[2]/text()')
    result.drm = text(node, './/*', 'book_info__drm', '/span[2]/text()')
    return result
def search(self, query, max_results=10, timeout=60):
    '''Search Baen Ebooks; price and cover are scraped from detail pages.'''
    url = ('http://www.baenebooks.com/searchadv.aspx?IsSubmit=true&SearchTerm=' +
           urllib2.quote(query))
    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for row in doc.xpath('//table//table//table//table//tr'):
            if counter <= 0:
                break
            id = ''.join(row.xpath('./td[1]/a/@href'))
            # Product links all look like 'p-<num>-...'.
            if not id or not id.startswith('p-'):
                continue

            title = ''.join(row.xpath('./td[1]/a/text()'))

            author = ''
            cover_url = ''
            price = ''
            # Author, price and cover require a detail-page request.
            with closing(br.open('http://www.baenebooks.com/' + id.strip(),
                                 timeout=timeout / 4)) as nf:
                idata = html.fromstring(nf.read())
                author = ''.join(idata.xpath('//span[@class="ProductNameText"]/../b/text()'))
                author = author.split('by ')[-1]
                price = ''.join(idata.xpath('//span[@class="variantprice"]/text()'))
                # Keep only the '$...' portion of the price string.
                a, b, price = price.partition('$')
                price = b + price

                # The cover image id embeds the product number from the URL.
                pnum = ''
                mo = re.search(r'p-(?P<num>\d+)-', id.strip())
                if mo:
                    pnum = mo.group('num')
                if pnum:
                    cover_url = 'http://www.baenebooks.com/' + ''.join(
                        idata.xpath('//img[@id="ProductPic%s"]/@src' % pnum))

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.detail_item = id.strip()
            s.drm = SearchResult.DRM_UNLOCKED
            s.formats = 'RB, MOBI, EPUB, LIT, LRF, RTF, HTML'

            yield s