def search(self, query, max_results=10, timeout=60):
    """Search e-knigi.net for *query* and yield DRM-free SearchResult objects.

    The store sometimes answers with a direct product-detail page instead of a
    result list; both layouts are handled below.
    """
    # check for cyrillic symbols before performing search
    # NOTE(review): assumes `query` is a UTF-8 encoded byte string — passing a
    # py3 `str` here would raise TypeError on decode; confirm what callers pass.
    uquery = type(u'')(query.strip(), 'utf-8')
    reObj = re.search(u'^[а-яА-Я\\d\\s]{2,}$', uquery)
    if not reObj:
        return

    base_url = 'http://e-knigi.net'
    url = base_url + '/virtuemart?page=shop.browse&search_category=0&search_limiter=anywhere&keyword=' + quote(query)

    br = browser()
    counter = max_results  # number of results still allowed to be emitted

    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())

        # if the store finds only one product, it opens directly detail view
        for data in doc.xpath('//div[@class="prod_details"]'):
            s = SearchResult()
            s.cover_url = ''.join(data.xpath('.//div[@class="vm_main_info clearfix"]/div[@class="lf"]/a/img/@src')).strip()
            s.title = ''.join(data.xpath('.//div[@class="vm_main_info clearfix"]/div[@class="lf"]/a/img/@alt')).strip()
            s.author = ''.join(data.xpath('.//div[@class="td_bg clearfix"]/div[@class="gk_product_tab"]/div/table/tr[3]/td[2]/text()')).strip()
            s.price = ''.join(data.xpath('.//span[@class="productPrice"]/text()')).strip()
            s.detail_item = url  # the search URL itself is the detail page here
            s.drm = SearchResult.DRM_UNLOCKED
            yield s
            return  # detail view means exactly one product: stop after it

        # search in store results
        for data in doc.xpath('//div[@class="browseProductContainer"]'):
            if counter <= 0:
                break
            id = ''.join(data.xpath('.//a[1]/@href')).strip()
            if not id:
                continue

            title = ''.join(data.xpath('.//a[@class="gk_vm_product_image"]/img/@title')).strip()
            author = ''.join(data.xpath('.//div[@style="float:left;width:90%"]/b/text()')).strip().replace('Автор: ', '')
            # drop hits that mention the query neither in title nor in author
            if title.lower().find(query.lower()) == -1 and author.lower().find(query.lower()) == -1:
                continue
            counter -= 1

            s = SearchResult()
            s.cover_url = ''.join(data.xpath('.//a[@class="gk_vm_product_image"]/img/@src')).strip()
            s.title = title
            s.author = author
            s.price = ''.join(data.xpath('.//span[@class="productPrice"]/text()')).strip()
            s.detail_item = base_url + id
            s.drm = SearchResult.DRM_UNLOCKED
            yield s
def search(self, query, max_results=10, timeout=60):
    """Search e-knigi.net for *query* and yield DRM-free SearchResult objects.

    Ported off the Python-2-only ``unicode`` builtin and ``urllib2.quote``
    so the function also runs on Python 3 (matching the sibling
    implementation in this file that already uses ``type(u'')``).
    """
    # py2/py3 portability: prefer urllib.parse, fall back to urllib2 on py2
    try:
        from urllib.parse import quote
    except ImportError:
        from urllib2 import quote

    # check for cyrillic symbols before performing search
    # NOTE(review): assumes `query` is a UTF-8 byte string — confirm callers.
    uquery = type(u'')(query.strip(), 'utf-8')
    reObj = re.search(u'^[а-яА-Я\\d\\s]{2,}$', uquery)
    if not reObj:
        return

    base_url = 'http://e-knigi.net'
    url = base_url + '/virtuemart?page=shop.browse&search_category=0&search_limiter=anywhere&keyword=' + quote(query)

    br = browser()
    counter = max_results  # number of results still allowed to be emitted

    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())

        # if the store finds only one product, it opens directly detail view
        for data in doc.xpath('//div[@class="prod_details"]'):
            s = SearchResult()
            s.cover_url = ''.join(data.xpath('.//div[@class="vm_main_info clearfix"]/div[@class="lf"]/a/img/@src')).strip()
            s.title = ''.join(data.xpath('.//div[@class="vm_main_info clearfix"]/div[@class="lf"]/a/img/@alt')).strip()
            s.author = ''.join(data.xpath('.//div[@class="td_bg clearfix"]/div[@class="gk_product_tab"]/div/table/tr[3]/td[2]/text()')).strip()
            s.price = ''.join(data.xpath('.//span[@class="productPrice"]/text()')).strip()
            s.detail_item = url  # the search URL itself is the detail page here
            s.drm = SearchResult.DRM_UNLOCKED
            yield s
            return  # detail view means exactly one product: stop after it

        # search in store results
        for data in doc.xpath('//div[@class="browseProductContainer"]'):
            if counter <= 0:
                break
            id = ''.join(data.xpath('.//a[1]/@href')).strip()
            if not id:
                continue

            title = ''.join(data.xpath('.//a[@class="gk_vm_product_image"]/img/@title')).strip()
            author = ''.join(data.xpath('.//div[@style="float:left;width:90%"]/b/text()')).strip().replace('Автор: ', '')
            # drop hits that mention the query neither in title nor in author
            if title.lower().find(query.lower()) == -1 and author.lower().find(query.lower()) == -1:
                continue
            counter -= 1

            s = SearchResult()
            s.cover_url = ''.join(data.xpath('.//a[@class="gk_vm_product_image"]/img/@src')).strip()
            s.title = title
            s.author = author
            s.price = ''.join(data.xpath('.//span[@class="productPrice"]/text()')).strip()
            s.detail_item = base_url + id
            s.drm = SearchResult.DRM_UNLOCKED
            yield s
def parse_book_page(doc, base_url, counter):
    """Yield SearchResult objects from one parsed results page.

    ``doc`` is an lxml tree of the page, ``base_url`` prefixes the relative
    download links, ``counter`` is the remaining result budget.

    NOTE(review): this is a generator, so the trailing ``return counter``
    only sets StopIteration.value — plain ``for`` iteration never sees the
    updated counter; confirm the caller retrieves it deliberately.
    """
    for data in doc.xpath('//div[@class="booklist"]/div/div'):
        if counter <= 0:
            break
        id = ''.join(
            data.xpath('.//div[@class="media-body"]/a[@class="booklink"]/@href'
                       )).strip()
        if not id:
            continue
        counter -= 1

        s = SearchResult()
        # cover src is protocol-relative on the page; force plain http
        s.cover_url = 'http:' + ''.join(
            data.xpath(
                './/div[@class="media-left"]/a[@class="booklink"]/div/img/@src'
            )).strip()
        s.title = ''.join(
            data.xpath(
                './/div[@class="media-body"]/a[@class="booklink"]/i/text()')
        ).strip()
        # append the alternative headline (e.g. original-language title) if any
        alternative_headline = data.xpath(
            './/div[@class="media-body"]/div[@itemprop="alternativeHeadline"]/text()'
        )
        if len(alternative_headline) > 0:
            s.title = "{} ({})".format(s.title,
                                       ''.join(alternative_headline).strip())
        s.author = ', '.join(
            data.xpath(
                './/div[@class="media-body"]/div[@class="bookauthor"]/span/a/text()'
            )).strip(', ')
        s.detail_item = id
        s.drm = SearchResult.DRM_UNLOCKED
        # direct download links; '.zip' is stripped to get the uncompressed file
        s.downloads['FB2'] = base_url + ''.join(
            data.xpath(
                './/div[@class="media-body"]/div[@class="download-links"]/div/a[contains(@class,"dl-fb2")]/@href'
            )).strip().replace('.zip', '')
        s.downloads['EPUB'] = base_url + ''.join(
            data.xpath(
                './/div[@class="media-body"]/div[@class="download-links"]/div/a[contains(@class,"dl-epub")]/@href'
            )).strip().replace('.zip', '')
        s.downloads['TXT'] = base_url + ''.join(
            data.xpath(
                './/div[@class="media-body"]/div[@class="download-links"]/div/a[contains(@class,"dl-txt")]/@href'
            )).strip().replace('.zip', '')
        s.formats = 'FB2, EPUB, TXT'
        yield s
    return counter
def search(self, query, max_results=10, timeout=60):
    """Query the Haodoo scraper view and yield free SearchResult objects.

    Fix: ``dict.has_key()`` is Python-2-only (removed in Python 3); the
    membership test now uses the ``in`` operator.
    """
    print("search!")
    # NOTE(review): assumes `query` is a UTF-8 byte string — confirm callers.
    q = query.decode('utf-8')
    url = "https://views.scraperwiki.com/run/haodooscraperview/?" + urlencode(
        {"q": q})
    print(url)

    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        json_doc = f.read()
        if len(json_doc) > 0:
            result = json.loads(json_doc)
            for volume in result:
                s = SearchResult()
                s.title = volume['title']
                s.detail_item = volume['url']
                s.price = '$0.00'
                s.drm = SearchResult.DRM_UNLOCKED
                # `in` replaces py2-only has_key(); skip volumes without formats
                if 'type' in volume and len(volume["type"]):
                    for t in volume["type"]:
                        s.downloads[t['type']] = t['link']
                s.formats = ', '.join(s.downloads.keys())
                yield s
        else:
            print("scrape nothing.")
def search(self, query, max_results=15, timeout=60):
    """Call the shop's XML SearchWebService and yield SearchResult items."""
    search_url = (
        self.shop_url + "/webservice/webservice.asmx/SearchWebService?"
        "searchText=%s&searchContext=ebook" % urllib2.quote(query)
    )
    search_urls = [search_url]

    # namespace-agnostic accessor for a child element's trimmed text
    xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'

    remaining = max_results
    br = browser()
    for url in search_urls:
        with closing(br.open(url, timeout=timeout)) as f:
            raw = xml_to_unicode(f.read(), strip_encoding_pats=True,
                                 assume_utf8=True)[0]
            doc = etree.fromstring(raw)
            for node in doc.xpath('//*[local-name()="SearchItems" or local-name()="ItemDetail"]'):
                if remaining <= 0:
                    break
                remaining -= 1

                s = SearchResult()
                s.detail_item = node.xpath(xp_template.format("ID"))
                s.title = node.xpath(xp_template.format("Name"))
                s.author = node.xpath(xp_template.format("Author"))
                s.cover_url = node.xpath(xp_template.format("Picture"))
                # normalise the raw price string into roubles
                s.price = format_price_in_RUR(node.xpath(xp_template.format("Price")))
                yield s
def search(self, query, max_results=25, timeout=60):
    """Search ebookpoint.pl (query re-encoded to ISO-8859-2) and yield results."""
    url = 'http://ebookpoint.pl/search?qa=&szukaj=' + quote_plus(
        query.decode('utf-8').encode('iso-8859-2')) + '&serwisyall=0&wprzyg=0&wsprzed=1&wyczerp=0&formaty=em-p'

    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for entry in doc.xpath('//ul[@class="list"]/li'):
            if remaining <= 0:
                break
            detail = ''.join(entry.xpath('./a/@href'))
            if not detail:
                continue
            remaining -= 1

            s = SearchResult()
            s.cover_url = ''.join(entry.xpath('.//p[@class="cover"]/img/@data-src'))
            s.title = ''.join(entry.xpath('.//div[@class="book-info"]/h3/a/text()')).strip()
            s.author = ''.join(entry.xpath('.//p[@class="author"]//text()')).strip()
            # price shows a dot on the page; Polish convention wants a comma
            raw_price = ''.join(entry.xpath('.//p[@class="price price-incart"]/a/ins/text()|.//p[@class="price price-add"]/a/text()'))
            s.price = re.sub(r'\.', ',', raw_price)
            s.detail_item = detail.strip()
            s.drm = SearchResult.DRM_UNLOCKED
            s.formats = ', '.join(entry.xpath('.//ul[@class="book-type book-type-points"]//span[@class="popup"]/span/text()')).upper()
            yield s
def search(self, query, max_results=10, timeout=60):
    """Query the Bubok.es reseller endpoint and yield DRM-free results."""
    url = 'http://www.bubok.es/resellers/calibre_search/' + urllib.quote_plus(query)

    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for entry in doc.xpath('//div[contains(@class, "libro")]'):
            if remaining <= 0:
                break
            remaining -= 1

            # every field of the reseller feed lives in its own labelled div
            s = SearchResult()
            s.detail_item = ''.join(entry.xpath('.//div[@class="url"]/text()')).strip()
            s.title = ''.join(entry.xpath('.//div[@class="titulo"]/text()')).strip()
            s.author = ''.join(entry.xpath('.//div[@class="autor"]/text()')).strip()
            s.price = ''.join(entry.xpath('.//div[@class="precio"]/text()')).strip()
            s.formats = ''.join(entry.xpath('.//div[@class="formatos"]/text()')).strip()
            s.cover_url = ''.join(entry.xpath('.//div[@class="portada"]/text()')).strip()
            s.drm = SearchResult.DRM_UNLOCKED
            yield s
def _do_search(self, url, max_results, timeout):
    """Fetch *url* and yield e-book SearchResults (paper-only items skipped).

    Fix: the original guarded with ``if not id: continue`` — but ``id`` here
    is the *builtin* function (never assigned), which is always truthy, so
    entries without a detail link were never skipped. The guard now checks
    the actually-extracted ``s.detail_item``.
    """
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        page = f.read().decode('utf-8')
        doc = html.fromstring(page)
        for data in doc.xpath('//ul[contains(@class,"book_list")]/li'):
            if max_results <= 0:
                break
            s = SearchResult()
            s.detail_item = ''.join(data.xpath('.//a[@class="th"]/@href')).strip()
            if not s.detail_item:  # was `if not id:` — tested the builtin, never fired
                continue
            s.cover_url = ''.join(data.xpath('.//a[@class="th"]/img/@data-original')).strip()
            s.title = ''.join(data.xpath('.//div[@class="item-title"]/a/text()')).strip()
            s.author = ', '.join(data.xpath('.//div[@class="item-author"]/a/text()')).strip(', ')
            # pick the e-book price row; other rows are paper editions
            price_list = data.xpath('.//div[@class="item-price"]')
            for price_item in price_list:
                if price_item.text.startswith('е-книга:'):
                    s.price = ''.join(price_item.xpath('.//span/text()'))
                    break
            # free titles have no price rows at all
            s.price = '0.00 лв.' if not s.price and not price_list else s.price
            if not s.price:
                # no e-book available
                continue
            max_results -= 1
            yield s
def search_kobo(query, max_results=10, timeout=60, write_html_to=None):
    """Scrape Kobo's public search page and yield SearchResult objects.

    ``write_html_to`` optionally dumps the fetched HTML to a file for
    debugging. Results missing title, author or URL are silently dropped.
    """
    from css_selectors import Select
    url = 'https://www.kobobooks.com/search/search.html?q=' + quote_plus(query)
    raw = read_url(url, timeout=timeout)
    if write_html_to is not None:
        # NOTE(review): opened in text mode — assumes read_url returns str,
        # not bytes; confirm.
        with open(write_html_to, 'w') as f:
            f.write(raw)
    doc = html.fromstring(raw)
    select = Select(doc)
    for i, item in enumerate(select('.result-items .item-wrapper.book')):
        if i == max_results:
            break
        # for/else: `else` runs only when no matching element broke the loop
        for img in select('.item-image img[src]', item):
            cover_url = img.get('src')
            if cover_url.startswith('//'):  # protocol-relative image link
                cover_url = 'https:' + cover_url
            break
        else:
            cover_url = None

        for p in select('h2.title', item):
            title = etree.tostring(p, method='text', encoding='unicode').strip()
            for a in select('a[href]', p):
                url = a.get('href')
                break
            else:
                url = None
            break
        else:
            title = None

        if title:
            # append subtitle(s) to the main title
            for p in select('p.subtitle', item):
                title += ' - ' + etree.tostring(
                    p, method='text', encoding='unicode').strip()

        authors = []
        for a in select('.contributors a.contributor-name', item):
            authors.append(
                etree.tostring(a, method='text', encoding='unicode').strip())
        authors = authors_to_string(authors)

        for p in select('p.price', item):
            price = etree.tostring(p, method='text', encoding='unicode').strip()
            break
        else:
            price = None

        # only emit complete records
        if title and authors and url:
            s = SearchResult()
            s.cover_url = cover_url
            s.title = title
            s.author = authors
            s.price = price
            s.detail_item = url
            s.formats = 'EPUB'
            s.drm = SearchResult.DRM_UNKNOWN
            yield s
def search(query, max_results=15, timeout=60):
    """Search OZON.ru for books and yield SearchResult objects.

    NOTE(review): relies on a module-level ``shop_url`` for detail links, and
    the bare ``[0]`` indexing below raises IndexError if a tile is missing a
    link/title/author/cover — presumably every tile has them; confirm.
    """
    url = 'http://www.ozon.ru/?context=search&text=%s&store=1,0&group=div_book' % urllib.quote_plus(
        query)

    counter = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        raw = xml_to_unicode(f.read(), strip_encoding_pats=True,
                             assume_utf8=True)[0]
        root = parse_html(raw)
        for tile in root.xpath('//*[@class="bShelfTile inline"]'):
            if counter <= 0:
                break
            counter -= 1

            s = SearchResult(store_name='OZON.ru')
            s.detail_item = shop_url + tile.xpath(
                'descendant::a[@class="eShelfTile_Link"]/@href')[0]
            s.title = tile.xpath(
                'descendant::span[@class="eShelfTile_ItemNameText"]/@title')[0]
            s.author = tile.xpath(
                'descendant::span[@class="eShelfTile_ItemPerson"]/@title')[0]
            s.price = ''.join(
                tile.xpath(
                    'descendant::div[contains(@class, "eShelfTile_Price")]/text()'
                ))
            # cover src is protocol-relative; force plain http
            s.cover_url = 'http:' + tile.xpath(
                'descendant::img/@data-original')[0]
            s.price = format_price_in_RUR(s.price)
            yield s
def search(self, query, max_results=10, timeout=60):
    """Search bewrite.net and yield DRM-free SearchResult objects."""
    url = 'http://www.bewrite.net/mm5/merchant.mvc?Search_Code=B&Screen=SRCH&Search=' + urllib2.quote(query)

    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        # first table row is the header, hence position() > 1
        for row in doc.xpath('//div[@id="content"]//table/tr[position() > 1]'):
            if remaining <= 0:
                break
            link = ''.join(row.xpath('.//a/@href'))
            if not link:
                continue
            # heading looks like "<title>by <author>"
            heading = ''.join(row.xpath('./td[2]//text()'))
            title, q, author = heading.partition('by ')
            remaining -= 1

            s = SearchResult()
            s.cover_url = ''
            s.title = title.strip()
            s.author = author.strip()
            s.price = ''
            s.detail_item = link.strip()
            s.drm = SearchResult.DRM_UNLOCKED
            yield s
def search(self, query, max_results=10, timeout=60):
    """Scrape Google Books web results; only entries with a preview link kept."""
    url = 'http://www.google.com/search?tbm=bks&q=' + urllib.quote_plus(query)

    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for entry in doc.xpath('//ol[@id="rso"]/li'):
            if remaining <= 0:
                break
            link = ''.join(entry.xpath('.//h3/a/@href'))
            if not link:
                continue
            title = ''.join(entry.xpath('.//h3/a//text()'))
            names = entry.xpath('.//span[@class="f"]//a//text()')
            # the trailing anchor is 'Preview'/'Read'; without it the entry
            # is not an actual book result
            if not (names and names[-1].strip().lower() in ('preview', 'read')):
                continue
            names = names[:-1]
            remaining -= 1

            s = SearchResult()
            s.title = title.strip()
            s.author = ', '.join(names).strip()
            s.detail_item = link.strip()
            s.drm = SearchResult.DRM_UNKNOWN
            yield s
def search(self, query, max_results=12, timeout=60):
    """Search virtualo.pl and yield SearchResult objects with DRM detection.

    Filter ``f=format_id:4,6,3`` restricts results to e-book formats.
    """
    url = 'http://virtualo.pl/?q=' + urllib.quote(
        query) + '&f=format_id:4,6,3'

    br = browser()
    # 'Znak wodny' (watermark) or 'Brak' (none) in the preview prompt means no DRM
    no_drm_pattern = re.compile(r'Znak wodny|Brak')

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath(
                '//div[@id="content"]//div[@class="list_box list_box_border"]'
        ):
            if counter <= 0:
                break
            # strip any trailing '?q=' query part from the product link
            id = ''.join(
                data.xpath('.//div[@class="list_middle_left"]//a/@href')
            ).split(r'?q=')[0]
            if not id:
                continue

            price = ''.join(
                data.xpath(
                    './/span[@class="price"]/text() | .//span[@class="price abbr"]/text()'
                ))
            cover_url = ''.join(
                data.xpath(
                    './/div[@class="list_middle_left"]//a//img/@src'))
            title = ''.join(
                data.xpath(
                    './/div[@class="list_title list_text_left"]/a/text()'))
            author = ', '.join(
                data.xpath(
                    './/div[@class="list_authors list_text_left"]/a/text()'
                ))
            # format icons are images named e.g. "..._epub.png"
            formats = [
                form.split('_')[-1].replace('.png', '')
                for form in data.xpath(
                    './/div[@style="width:55%;float:left;text-align:left;height:18px;"]//a/span/img/@src'
                )
            ]
            nodrm = no_drm_pattern.search(''.join(
                data.xpath(
                    './/div[@style="width:45%;float:right;text-align:right;height:18px;"]//span[@class="prompt_preview"]/text()'
                )))
            counter -= 1

            s = SearchResult()
            # drop any resizing suffix after '.jpg' in the cover URL
            s.cover_url = cover_url.split('.jpg')[0] + '.jpg'
            s.title = title.strip()
            s.author = author.strip()
            s.price = price + ' zł'
            # NOTE(review): the trailing split('http://')[0] presumably guards
            # against links that already carry the full host — confirm.
            s.detail_item = 'http://virtualo.pl' + id.strip().split(
                'http://')[0]
            s.formats = ', '.join(formats).upper()
            s.drm = SearchResult.DRM_UNLOCKED if nodrm else SearchResult.DRM_LOCKED
            yield s
def search(self, query, max_results=10, timeout=60):
    """Query the Haodoo scraper view and yield free SearchResult objects.

    Fix: ``dict.has_key()`` is Python-2-only (removed in Python 3); the
    membership test now uses the ``in`` operator.
    """
    print("search!")
    # NOTE(review): assumes `query` is a UTF-8 byte string — confirm callers.
    q = query.decode('utf-8')
    url = "https://views.scraperwiki.com/run/haodooscraperview/?" + urlencode({"q": q})
    print(url)

    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        json_doc = f.read()
        if len(json_doc) > 0:
            result = json.loads(json_doc)
            for volume in result:
                s = SearchResult()
                s.title = volume['title']
                s.detail_item = volume['url']
                s.price = '$0.00'
                s.drm = SearchResult.DRM_UNLOCKED
                # `in` replaces py2-only has_key(); skip volumes without formats
                if 'type' in volume and len(volume["type"]):
                    for t in volume["type"]:
                        s.downloads[t['type']] = t['link']
                s.formats = ', '.join(s.downloads.keys())
                yield s
        else:
            print("scrape nothing.")
def search(self, query, max_results=10, timeout=60):
    """Search zixo.pl and yield (DRM-locked) SearchResult objects."""
    url = 'http://zixo.pl/wyszukiwarka/?search=' + urllib.quote(query.encode('utf-8')) + '&product_type=0'

    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for entry in doc.xpath('//div[@class="productInline"]'):
            if remaining <= 0:
                break
            link = ''.join(entry.xpath('.//a[@class="productThumb"]/@href'))
            if not link:
                continue
            remaining -= 1

            s = SearchResult()
            s.cover_url = ''.join(entry.xpath('.//a[@class="productThumb"]/img/@src'))
            s.title = ''.join(entry.xpath('.//a[@class="title"]/text()')).strip()
            s.author = ','.join(entry.xpath('.//div[@class="productDescription"]/span[1]/a/text()')).strip()
            # Polish price convention: decimal comma instead of dot
            raw_price = ''.join(entry.xpath('.//div[@class="priceList"]/span/text()'))
            s.price = re.sub('\.', ',', raw_price)
            s.detail_item = 'http://zixo.pl' + link.strip()
            s.drm = SearchResult.DRM_LOCKED
            yield s
def search(self, query, max_results=10, timeout=60):
    """Search the Barnes & Noble ebook store and yield Nook results."""
    url = 'http://www.barnesandnoble.com/s/%s?keyword=%s&store=ebook' % (query.replace(' ', '-'), urllib.quote_plus(query))

    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for entry in doc.xpath('//ul[contains(@class, "result-set")]/li[contains(@class, "result")]'):
            if remaining <= 0:
                break
            link = ''.join(entry.xpath('.//div[contains(@class, "image-bounding-box")]/a/@href'))
            if not link:
                continue
            remaining -= 1

            s = SearchResult()
            s.cover_url = ''.join(entry.xpath('.//img[contains(@class, "product-image")]/@src'))
            s.title = ''.join(entry.xpath('.//a[@class="title"]//text()')).strip()
            s.author = ', '.join(entry.xpath('.//a[@class="contributor"]//text()')).strip()
            s.price = ''.join(entry.xpath('.//div[@class="price-format"]//span[contains(@class, "price")]/text()')).strip()
            s.detail_item = link.strip()
            s.drm = SearchResult.DRM_UNKNOWN
            s.formats = 'Nook'
            yield s
def search(self, query, max_results=10, timeout=60):
    """Search beam-shop.de and yield DRM-free SearchResult objects."""
    url = 'https://www.beam-shop.de/search?saltFieldLimitation=all&sSearch=' + urllib2.quote(query)

    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for box in doc.xpath('//div[contains(@class, "product--box")]'):
            if remaining <= 0:
                break
            id_ = ''.join(box.xpath('./div/div[contains(@class, "product--info")]/a/@href')).strip()
            if not id_:
                continue
            # srcset holds several sizes; take the first candidate URL
            cover_url = ''.join(box.xpath('./div/div[contains(@class, "product--info")]/a//img/@srcset'))
            if cover_url:
                cover_url = cover_url.split(',')[0].strip()
            remaining -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = box.xpath('.//a[@class="product--title"]/text()')[0].strip()
            s.author = box.xpath('.//a[@class="product--author"]/text()')[0].strip()
            s.price = box.xpath('.//div[@class="product--price"]/span/text()')[0].strip()
            s.drm = SearchResult.DRM_UNLOCKED
            s.detail_item = id_
            # s.formats = None
            yield s
def search(self, query, max_results=10, timeout=60):
    """Search ebookshoppe.com; author/formats are fetched per detail page."""
    url = 'http://www.ebookshoppe.com/search.php?search_query=' + urllib2.quote(query)
    br = browser()
    # the site refuses requests without a same-site referer
    br.addheaders = [("Referer", "http://www.ebookshoppe.com/")]

    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for entry in doc.xpath('//ul[@class="ProductList"]/li'):
            if remaining <= 0:
                break
            link = ''.join(entry.xpath('./div[@class="ProductDetails"]/'
                                       'strong/a/@href')).strip()
            if not link:
                continue
            remaining -= 1

            s = SearchResult()
            s.cover_url = ''.join(entry.xpath('./div[@class="ProductImage"]/a/img/@src'))
            s.title = ''.join(entry.xpath('./div[@class="ProductDetails"]/strong/a/text()')).strip()
            s.price = ''.join(entry.xpath('./div[@class="ProductPriceRating"]/em/text()'))
            s.drm = SearchResult.DRM_UNLOCKED
            s.detail_item = link
            # enrich from the product page; drop entries with no author found
            self.get_author_and_formats(s, timeout)
            if not s.author:
                continue
            yield s
def search(self, query, max_results=10, timeout=60):
    """Search Project Gutenberg's mobile catalogue; everything is free."""
    url = 'http://m.gutenberg.org/ebooks/search.mobile/?default_prefix=all&sort_order=title&query=' + urllib.quote_plus(query)

    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for entry in doc.xpath('//ol[@class="results"]/li[@class="booklink"]'):
            if remaining <= 0:
                break
            # strip the '.mobile' suffix to get the canonical book URL
            link = ''.join(entry.xpath('./a/@href')).split('.mobile')[0]
            remaining -= 1

            s = SearchResult()
            s.cover_url = ''
            s.detail_item = link.strip()
            s.title = ''.join(entry.xpath('.//span[@class="title"]/text()')).strip()
            s.author = ''.join(entry.xpath('.//span[@class="subtitle"]/text()')).strip()
            s.price = '$0.00'
            s.drm = SearchResult.DRM_UNLOCKED
            yield s
def search(self, query, max_results=10, timeout=60):
    """Searches LibGen for Books.

    Since the mirror links are not direct downloads, it should not
    provide these as `s.downloads`.
    """
    debug_print = partial(module_debug_print, 'LibgenStore:search:')
    debug_print('search:query = ', query)

    hits = self.libgen.search(query)
    for result in hits.results[:min(max_results, len(hits.results))]:
        debug_print('result.title = ', result.title)
        # calibre displays a single entry per result: use the first mirror only
        for mirror in result.mirrors[0:1]:
            debug_print('result.mirror.url = ', mirror.url)

            s = SearchResult()
            s.store_name = PLUGIN_NAME
            s.cover_url = result.image_url
            s.title = '{} ({}, {}{})'.format(
                result.title, result.language, mirror.size, mirror.unit)
            s.author = result.authors
            s.price = '0.00'
            s.detail_item = result.md5
            s.drm = SearchResult.DRM_UNLOCKED
            s.formats = mirror.format
            s.plugin_author = PLUGIN_AUTHORS
            debug_print('s = ', s)
            yield s
def search(self, query, max_results=10, timeout=60):
    """Search bewrite.net and yield DRM-free SearchResult objects."""
    url = 'http://www.bewrite.net/mm5/merchant.mvc?Search_Code=B&Screen=SRCH&Search=' + urllib2.quote(
        query)

    br = browser()
    budget = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        # skip the header row of the results table
        for row in doc.xpath(
                '//div[@id="content"]//table/tr[position() > 1]'):
            if budget <= 0:
                break
            href = ''.join(row.xpath('.//a/@href'))
            if not href:
                continue
            budget -= 1

            # the second cell reads "<title>by <author>"
            heading = ''.join(row.xpath('./td[2]//text()'))
            title, q, author = heading.partition('by ')

            s = SearchResult()
            s.cover_url = ''
            s.title = title.strip()
            s.author = author.strip()
            s.price = ''
            s.detail_item = href.strip()
            s.drm = SearchResult.DRM_UNLOCKED
            yield s
def search(self, query, max_results=10, timeout=60):
    """Search legimi.com (PL) and yield SearchResult objects."""
    url = 'http://www.legimi.com/pl/ebooki/?szukaj=' + urllib.parse.quote_plus(query)

    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for entry in doc.xpath('//div[@id="listBooks"]/div'):
            if remaining <= 0:
                break
            link = ''.join(entry.xpath('.//a[1]/@href'))
            if not link:
                continue
            remaining -= 1

            s = SearchResult()
            s.cover_url = ''.join(entry.xpath('.//span[@class="listImage imageDarkLoader"]/img/@src'))
            s.title = ''.join(entry.xpath('.//span[@class="bookListTitle ellipsis"]/text()')).strip()
            s.author = ''.join(entry.xpath('.//span[@class="bookListAuthor ellipsis"]/text()')).strip()
            s.price = ''.join(entry.xpath('.//div[@class="bookListPrice"]/span/text()'))
            s.detail_item = 'http://www.legimi.com/' + link.strip()
            yield s
def search(self, query, max_results=10, timeout=60):
    """Search bookoteka.pl for e-books and yield DRM-free results."""
    url = 'http://bookoteka.pl/list?search=' + urllib.quote_plus(query) + '&cat=1&hp=1&type=1'

    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for entry in doc.xpath('//li[@class="EBOOK"]'):
            if remaining <= 0:
                break
            link = ''.join(entry.xpath('.//a[@class="item_link"]/@href'))
            if not link:
                continue
            remaining -= 1

            s = SearchResult()
            s.cover_url = 'http://bookoteka.pl' + ''.join(entry.xpath('.//a[@class="item_link"]/img/@src'))
            s.title = ''.join(entry.xpath('.//div[@class="shelf_title"]/a/text()')).strip()
            s.author = ''.join(entry.xpath('.//div[@class="shelf_authors"][1]/text()')).strip()
            # decimal comma per Polish convention
            s.price = ''.join(entry.xpath('.//span[@class="EBOOK"]/text()')).replace('.', ',')
            s.detail_item = 'http://bookoteka.pl' + link.strip()
            s.drm = SearchResult.DRM_UNLOCKED
            s.formats = ', '.join(entry.xpath('.//a[@class="fancybox protected"]/text()')).strip()
            yield s
def search(self, query, max_results=10, timeout=60):
    """Search the Kobo store's flow view and yield EPUB SearchResults."""
    url = 'http://www.kobobooks.com/search/search.html?q=' + urllib.quote_plus(query)

    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for entry in doc.xpath('//ul[contains(@class, "flowview-items")]/li'):
            if remaining <= 0:
                break
            link = ''.join(entry.xpath('./a[contains(@class, "block-link")]/@href'))
            if not link:
                continue
            link = link[1:]  # drop the leading slash
            remaining -= 1

            s = SearchResult()
            s.price = ''.join(entry.xpath('.//a[contains(@class, "primary-button")]//text()')).strip()
            # image src is protocol-relative
            s.cover_url = 'http:%s' % ''.join(entry.xpath('.//img[1]/@src'))
            s.title = ''.join(entry.xpath('.//p[contains(@class, "flowview-item-title")]//text()')).strip()
            s.detail_item = 'http://store.kobobooks.com/' + link.strip()
            s.formats = 'EPUB'
            s.drm = SearchResult.DRM_UNKNOWN
            yield s
def search(self, query, max_results=20, timeout=60):
    """Search escapemagazine.pl and yield DRM-free PDF results."""
    url = 'http://www.escapemagazine.pl/wyszukiwarka?query=' + urllib.quote_plus(query)

    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for entry in doc.xpath('//div[@class="item item_short"]'):
            if remaining <= 0:
                break
            link = ''.join(entry.xpath('.//h2[@class="title"]/a[1]/@href'))
            if not link:
                continue
            remaining -= 1

            s = SearchResult()
            s.cover_url = ''.join(entry.xpath('.//img[@class="cover"]/@src'))
            s.title = ''.join(entry.xpath('.//h2[@class="title"]/a[1]/text()')).strip()
            s.author = ''.join(entry.xpath('.//div[@class="author"]/text()')).strip()
            s.price = ''.join(entry.xpath('.//span[@class="price_now"]/strong/text()')) + ' zł'
            s.detail_item = 'http://www.escapemagazine.pl' + link.strip()
            s.drm = SearchResult.DRM_UNLOCKED
            s.formats = 'PDF'
            yield s
def search(self, query, max_results=10, timeout=60):
    """Search Amazon's Kindle results and yield Kindle-format SearchResults.

    Fix: the bare ``except:`` around the author clean-up swallowed *every*
    exception (including KeyboardInterrupt); only IndexError — raised when
    'by ' is absent from the author string — is expected there.
    """
    # Amazon chokes on non-ASCII; percent-encode via backslashreplace
    url = self.search_url + query.encode(
        'ascii', 'backslashreplace').replace('%', '%25').replace(
        '\\x', '%').replace(' ', '+')
    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read().decode('latin-1', 'replace'))

        data_xpath = '//div[contains(@class, "prod")]'
        format_xpath = './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
        asin_xpath = '@name'
        cover_xpath = './/img[@class="productImage"]/@src'
        title_xpath = './/h3[@class="newaps"]/a//text()'
        author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]//text()'
        price_xpath = './/ul[contains(@class, "rsltL") or contains(@class, "rsltGridList")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'

        for data in doc.xpath(data_xpath):
            if counter <= 0:
                break

            # Even though we are searching digital-text only Amazon will still
            # put in results for non Kindle books (author pages). Se we need
            # to explicitly check if the item is a Kindle book and ignore it
            # if it isn't.
            format = ''.join(data.xpath(format_xpath))
            if 'kindle' not in format.lower():
                continue

            # We must have an asin otherwise we can't easily reference the
            # book later.
            asin = data.xpath(asin_xpath)
            if asin:
                asin = asin[0]
            else:
                continue

            cover_url = ''.join(data.xpath(cover_xpath))
            title = ''.join(data.xpath(title_xpath))
            author = ''.join(data.xpath(author_xpath))
            try:
                # "by Jane Doe (Author)" -> "Jane Doe"
                author = author.split('by ', 1)[1].split(" (")[0]
            except IndexError:  # no 'by ' marker: keep the raw string
                pass
            price = ''.join(data.xpath(price_xpath))
            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url.strip()
            s.title = title.strip()
            s.author = author.strip()
            s.price = price.strip()
            s.detail_item = asin.strip()
            s.formats = 'Kindle'
            yield s
def search(self, query, max_results=20, timeout=60):
    """Search publio.pl, following result pagination, and yield SearchResults.

    Pages are walked via ',stronaN' URLs until the budget runs out or no
    'next' link remains.
    """
    br = browser()
    counter = max_results
    page = 1
    while counter:
        with closing(
            br.open(
                "http://www.publio.pl/e-booki,strona" + str(page) + ".html?q=" + urllib.quote(query),
                timeout=timeout,
            )
        ) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//div[@class="item"]'):
                if counter <= 0:
                    break
                id = "".join(data.xpath('.//div[@class="img"]/a/@href'))
                if not id:
                    continue

                cover_url = "".join(data.xpath('.//div[@class="img"]/a/img/@data-original'))
                title = "".join(data.xpath('.//div[@class="img"]/a/@title'))
                # subtitle, when present, is appended after a period
                title2 = "".join(data.xpath('.//div[@class="desc"]/h5//text()'))
                if title2:
                    title = title + ". " + title2
                # if the last detail row is labelled 'Seria:' append the series name
                if (
                    "".join(
                        data.xpath('./div[@class="desc"]/div[@class="detailShortList"]/div[last()]/span/text()')
                    ).strip()
                    == "Seria:"
                ):
                    series = "".join(
                        data.xpath('./div[@class="desc"]/div[@class="detailShortList"]/div[last()]/a/@title')
                    )
                    title = title + " (seria " + series + ")"

                author = ", ".join(
                    data.xpath('./div[@class="desc"]/div[@class="detailShortList"]/div[@class="row"][1]/a/@title')
                )
                # discounted price sits in <ins>; otherwise take the plain text
                price = "".join(data.xpath('.//div[@class="priceBox tk-museo-slab"]/ins/text()'))
                if not price:
                    price = "".join(data.xpath('.//div[@class="priceBox tk-museo-slab"]/text()')).strip()
                formats = ", ".join(data.xpath('.//div[@class="formats"]/a/img/@alt'))
                counter -= 1

                s = SearchResult()
                s.cover_url = "http://www.publio.pl" + cover_url
                s.title = title.strip()
                s.author = author
                s.price = price
                s.detail_item = "http://www.publio.pl" + id.strip()
                # format icons carry a ' DRM' suffix when the title is protected
                s.drm = SearchResult.DRM_LOCKED if "DRM" in formats else SearchResult.DRM_UNLOCKED
                s.formats = formats.replace(" DRM", "").strip()
                yield s
            if not doc.xpath('boolean(//a[@class="next"])'):
                break
            page += 1
def search(self, query, max_results=10, timeout=60):
    """Search ebook.nl's advanced search and yield SearchResult objects."""
    url = ('http://www.ebook.nl/store/advanced_search_result.php?keywords=' + urllib2.quote(query))

    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        # results are schema.org Book microdata entries
        for book in doc.xpath('//div[@id="books"]/div[@itemtype="http://schema.org/Book"]'):
            if remaining <= 0:
                break
            link = ''.join(book.xpath('./meta[@itemprop="url"]/@content')).strip()
            if not link:
                continue
            remaining -= 1

            author = ''.join(book.xpath('./span[@itemprop="author"]/a/text()')).strip()
            if author == ' ':  # site emits a lone space when author is unknown
                author = ''

            s = SearchResult()
            s.cover_url = 'http://www.ebook.nl/store/' + ''.join(book.xpath('.//img[@itemprop="image"]/@src'))
            s.title = ''.join(book.xpath('./span[@itemprop="name"]/a/text()')).strip()
            s.author = author.strip()
            s.price = ''.join(book.xpath('.//span[@itemprop="price"]//text()'))
            s.drm = SearchResult.DRM_UNKNOWN
            s.detail_item = link
            yield s
def search(self, query, max_results=10, timeout=60):
    """Search Foyles' ebook catalogue and yield (DRM-locked) results."""
    url = 'http://ebooks.foyles.co.uk/catalog/search/?query=' + urllib2.quote(query)

    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for item in doc.xpath('//div[@class="doc-item"]'):
            if remaining <= 0:
                break
            id_ = ''.join(item.xpath('.//p[@class="doc-cover"]/a/@href')).strip()
            if not id_:
                continue
            id_ = 'http://ebooks.foyles.co.uk' + id_
            remaining -= 1

            s = SearchResult()
            s.cover_url = ''.join(item.xpath('.//p[@class="doc-cover"]/a/img/@src'))
            s.title = ''.join(item.xpath('.//span[@class="title"]/a/text()')).strip()
            s.author = ', '.join(item.xpath('.//span[@class="author"]/span[@class="author"]/text()')).strip()
            s.price = ''.join(item.xpath('.//span[@itemprop="price"]/text()')).strip()
            s.detail_item = id_
            s.drm = SearchResult.DRM_LOCKED
            # last span of the meta row names the delivery format
            s.formats = ''.join(item.xpath('.//p[@class="doc-meta-format"]/span[last()]/text()'))
            yield s
def search(self, query, max_results=10, timeout=60):
    """Query the Bubok.pt reseller endpoint and yield DRM-free results."""
    url = 'http://www.bubok.pt/resellers/calibre_search/' + urllib.quote_plus(query)

    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for record in doc.xpath('//div[contains(@class, "libro")]'):
            if remaining <= 0:
                break
            remaining -= 1

            # the reseller feed exposes each field in its own labelled div
            s = SearchResult()
            s.title = ''.join(record.xpath('.//div[@class="titulo"]/text()')).strip()
            s.author = ''.join(record.xpath('.//div[@class="autor"]/text()')).strip()
            s.detail_item = ''.join(record.xpath('.//div[@class="url"]/text()')).strip()
            s.price = ''.join(record.xpath('.//div[@class="precio"]/text()')).strip()
            s.drm = SearchResult.DRM_UNLOCKED
            s.formats = ''.join(record.xpath('.//div[@class="formatos"]/text()')).strip()
            s.cover_url = ''.join(record.xpath('.//div[@class="portada"]/text()')).strip()
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search the WHSmith ebook department.'''
    base = 'http://www.whsmith.co.uk'
    url = (base + '/search?keywordCategoryId=wc_dept_ebooks&results=60'
           '&page=1&keywords=' + urllib2.quote(query))
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for product in doc.xpath('//li[@class="product"]'):
            if remaining <= 0:
                break
            href = ''.join(product.xpath('./a[@class="product_image_wrap"]/@href'))
            if not href:
                continue
            remaining -= 1

            s = SearchResult()
            s.detail_item = base + href
            s.cover_url = ''.join(product.xpath('.//img[@class="product_image"]/@src'))
            s.title = ''.join(product.xpath('.//h4[@class="product_title"]/text()')).strip()
            s.author = ', '.join(product.xpath('.//span[@class="product_second"]/text()')).strip()
            s.price = ''.join(product.xpath('.//span[@class="price"]/text()'))
            s.drm = SearchResult.DRM_LOCKED
            s.formats = 'ePub'
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search the Harlequin ebook store.'''
    url = ('http://ebooks.eharlequin.com/BANGSearch.dll?Type=FullText'
           '&FullTextField=All&FullTextCriteria=' + urllib2.quote(query))
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        # Results are table rows that contain a "details" list.
        for row in doc.xpath('//table[not(.//@class="sidelink")]/tr[.//ul[@id="details"]]'):
            if remaining <= 0:
                break
            href = ''.join(row.xpath('.//ul[@id="details"]/li[@id="title-results"]/a/@href'))
            if not href:
                continue
            remaining -= 1

            s = SearchResult()
            s.title = ''.join(row.xpath('.//ul[@id="details"]/li[@id="title-results"]/a/text()')).strip()
            s.author = ''.join(row.xpath('.//ul[@id="details"]/li[@id="author"][1]//a/text()')).strip()
            s.price = ''.join(row.xpath('.//div[@class="ourprice"]/font/text()')).strip()
            # The cover image is wrapped in the same link as the title.
            s.cover_url = ''.join(row.xpath('.//a[@href="%s"]/img/@src' % href))
            s.detail_item = 'http://ebooks.eharlequin.com/' + href.strip()
            s.formats = 'EPUB'
            yield s
def search_amazon(query, max_results=10, timeout=60, write_html_to=None,
                  base_url=SEARCH_BASE_URL, base_query=SEARCH_BASE_QUERY,
                  field_keywords='k'):
    '''Search the Kindle store and yield up to max_results SearchResult objects.

    write_html_to, when given, is a file path the raw results page is dumped
    to (useful for debugging scraper breakage after Amazon layout changes).
    '''
    uquery = base_query.copy()
    uquery[field_keywords] = query

    def asbytes(x):
        # urlencode wants bytes on Python 2; encode text values as UTF-8.
        if isinstance(x, type('')):
            x = x.encode('utf-8')
        return x
    uquery = {asbytes(k):asbytes(v) for k, v in uquery.items()}
    url = base_url + '?' + urlencode(uquery)
    br = browser(user_agent=get_user_agent())

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
    if write_html_to is not None:
        with open(write_html_to, 'wb') as f:
            f.write(raw)
    doc = html.fromstring(raw)
    for result in doc.xpath('//div[contains(@class, "s-result-list")]//div[@data-index and @data-asin]'):
        # BUG FIX: counter was decremented but never checked, so max_results
        # was ignored and every result on the page was yielded.
        if counter <= 0:
            break
        kformat = ''.join(result.xpath('.//a[contains(text(), "{}")]//text()'.format(KINDLE_EDITION)))
        # Even though we are searching digital-text only Amazon will still
        # put in results for non Kindle books (author pages). So we need
        # to explicitly check if the item is a Kindle book and ignore it
        # if it isn't.
        if 'kindle' not in kformat.lower():
            continue
        asin = result.get('data-asin')
        if not asin:
            continue
        cover_url = ''.join(result.xpath('.//img/@src'))
        title = etree.tostring(result.xpath('.//h2')[0], method='text', encoding='unicode')
        adiv = result.xpath('.//div[contains(@class, "a-color-secondary")]')[0]
        # The byline reads "<BY> Author Name | ..."; take everything after
        # the localized "by" word up to the first '|'.
        aparts = etree.tostring(adiv, method='text', encoding='unicode').split()
        idx = aparts.index(BY)
        author = ' '.join(aparts[idx+1:]).split('|')[0].strip()
        price = ''
        # First non-empty offscreen price span wins.
        for span in result.xpath('.//span[contains(@class, "a-price")]/span[contains(@class, "a-offscreen")]'):
            q = ''.join(span.xpath('./text()'))
            if q:
                price = q
                break
        counter -= 1

        s = SearchResult()
        s.cover_url = cover_url.strip()
        s.title = title.strip()
        s.author = author.strip()
        s.detail_item = asin.strip()
        s.price = price.strip()
        s.formats = 'Kindle'
        yield s
def open_search(url, query, max_results=10, timeout=60):
    '''Run an OpenSearch/OPDS query against the description document at *url*
    and yield SearchResult objects parsed from the returned Atom entries.'''
    description = Description(url)
    url_template = description.get_best_template()
    if not url_template:
        # No usable search template advertised; nothing to do.
        return
    oquery = Query(url_template)

    # set up initial values
    oquery.searchTerms = query
    oquery.count = max_results
    url = oquery.url()

    counter = max_results
    br = browser()
    with closing(br.open(url, timeout=timeout)) as f:
        doc = etree.fromstring(f.read())
        # local-name() is used throughout so namespace prefixes don't matter.
        for data in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break

            counter -= 1

            s = SearchResult()

            # Default detail link is the entry id; a buy link (below) overrides it.
            s.detail_item = ''.join(data.xpath('./*[local-name() = "id"]/text()')).strip()

            # Dispatch on the OPDS link relation of each <link> element.
            for link in data.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                type = link.get('type')

                if rel and href and type:
                    if 'http://opds-spec.org/thumbnail' in rel:
                        s.cover_url = href
                    elif 'http://opds-spec.org/image/thumbnail' in rel:
                        s.cover_url = href
                    elif 'http://opds-spec.org/acquisition/buy' in rel:
                        s.detail_item = href
                    elif 'http://opds-spec.org/acquisition/sample' in rel:
                        # Samples are deliberately ignored.
                        pass
                    elif 'http://opds-spec.org/acquisition' in rel:
                        # Direct acquisition: record a download keyed by the
                        # file extension guessed from the MIME type.
                        if type:
                            ext = guess_extension(type)
                            if ext:
                                ext = ext[1:].upper().strip()
                                s.downloads[ext] = href
            s.formats = ', '.join(s.downloads.keys()).strip()

            s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
            s.author = ', '.join(data.xpath('./*[local-name() = "author"]//*[local-name() = "name"]//text()')).strip()

            # Optional price element: "<currencycode> <amount>".
            price_e = data.xpath('.//*[local-name() = "price"][1]')
            if price_e:
                price_e = price_e[0]
                currency_code = price_e.get('currencycode', '')
                price = ''.join(price_e.xpath('.//text()')).strip()
                s.price = currency_code + ' ' + price
                s.price = s.price.strip()

            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search legimi.com for ebooks.'''
    site = 'http://www.legimi.com/'
    url = site + 'pl/ebooki/?szukaj=' + urllib.quote_plus(query)
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for entry in doc.xpath('//div[@id="listBooks"]/div'):
            if remaining <= 0:
                break
            href = "".join(entry.xpath('.//a[@class="plainLink"]/@href'))
            if not href:
                continue
            remaining -= 1

            s = SearchResult()
            # Cover and detail URLs on the page are site-relative.
            s.cover_url = site + "".join(entry.xpath(".//img[1]/@src"))
            s.title = "".join(entry.xpath('.//span[@class="bookListTitle ellipsis"]/text()')).strip()
            s.author = "".join(entry.xpath('.//span[@class="bookListAuthor ellipsis"]/text()')).strip()
            s.price = "".join(entry.xpath('.//div[@class="bookListPrice"]/span/text()'))
            s.detail_item = site + href.strip()
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Scrape an Amazon search results page into SearchResult objects.'''
    # Hand-rolled %-encoding: non-ASCII characters are backslash-escaped to
    # '\xNN' byte escapes and then rewritten into '%NN' URL escapes.
    url = self.search_url + query.encode('ascii', 'backslashreplace').replace('%', '%25').replace('\\x', '%').replace(' ', '+')
    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())#.decode('latin-1', 'replace'))

        # XPaths for the pieces of each result entry.
        data_xpath = '//div[contains(@class, "prod")]'
        format_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and not(contains(@class, "bld"))]/text()'
        asin_xpath = '@name'
        cover_xpath = './/img[@class="productImage"]/@src'
        title_xpath = './/h3[@class="newaps"]/a//text()'
        author_xpath = './/h3[@class="newaps"]//span[contains(@class, "reg")]/text()'
        price_xpath = './/ul[contains(@class, "rsltL")]//span[contains(@class, "lrg") and contains(@class, "bld")]/text()'

        for data in doc.xpath(data_xpath):
            if counter <= 0:
                break

            # Even though we are searching digital-text only Amazon will still
            # put in results for non Kindle books (author pages). So we need
            # to explicitly check if the item is a Kindle book and ignore it
            # if it isn't.
            format_ = ''.join(data.xpath(format_xpath))
            if 'kindle' not in format_.lower():
                continue

            # We must have an asin otherwise we can't easily reference the
            # book later.
            asin = data.xpath(asin_xpath)
            if asin:
                asin = asin[0]
            else:
                continue

            cover_url = ''.join(data.xpath(cover_xpath))

            title = ''.join(data.xpath(title_xpath))
            author = ''.join(data.xpath(author_xpath))
            try:
                # self.author_article is presumably the localized "by " prefix
                # that gets stripped off here -- TODO confirm against subclasses.
                if self.author_article:
                    author = author.split(self.author_article, 1)[1].split(" (")[0]
            except:
                # Best effort: keep the raw author string on any parse failure.
                pass

            price = ''.join(data.xpath(price_xpath))

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url.strip()
            s.title = title.strip()
            s.author = author.strip()
            s.price = price.strip()
            s.detail_item = asin.strip()
            s.drm = SearchResult.DRM_UNKNOWN
            s.formats = 'Kindle'
            yield s
def search(self, query, max_results=15, timeout=60):
    '''Search the OZON.ru web service.

    When the query looks like a numeric OZON item ID, the item-detail
    endpoint is tried first so an exact match comes back before the
    full-text results.
    '''
    search_url = self.shop_url + '/webservice/webservice.asmx/SearchWebService?'\
        'searchText=%s&searchContext=ebook' % urllib2.quote(query)
    search_urls = [search_url]

    # add this as the first try if it looks like an ozon ID
    # BUG FIX: raw string -- '\d' in a plain literal is an invalid escape
    # (SyntaxWarning on modern Python).
    if re.match(r"^\d{6,9}$", query):
        ozon_detail = self.shop_url + '/webservices/OzonWebSvc.asmx/ItemDetail?ID=%s' % query
        search_urls.insert(0, ozon_detail)

    # Each field is extracted from the XML response with the same pattern.
    xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'
    counter = max_results
    br = browser()

    for url in search_urls:
        with closing(br.open(url, timeout=timeout)) as f:
            raw = xml_to_unicode(f.read(), strip_encoding_pats=True, assume_utf8=True)[0]
            doc = etree.fromstring(raw)
            for data in doc.xpath('//*[local-name()="SearchItems" or local-name()="ItemDetail"]'):
                if counter <= 0:
                    break
                counter -= 1

                s = SearchResult()
                s.detail_item = data.xpath(xp_template.format('ID'))
                s.title = data.xpath(xp_template.format('Name'))
                s.author = data.xpath(xp_template.format('Author'))
                s.price = data.xpath(xp_template.format('Price'))
                s.cover_url = data.xpath(xp_template.format('Picture'))
                s.price = format_price_in_RUR(s.price)
                yield s
def search(self, query, max_results=10, timeout=60):
    '''Search ebookshoppe.com; author and formats come from a follow-up
    request per result via get_author_and_formats().'''
    br = browser()
    # The site rejects requests without a Referer header.
    br.addheaders = [("Referer", "http://www.ebookshoppe.com/")]
    url = 'http://www.ebookshoppe.com/search.php?search_query=' + quote(query)
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for product in doc.xpath('//ul[@class="ProductList"]/li'):
            if remaining <= 0:
                break
            href = ''.join(product.xpath('./div[@class="ProductDetails"]/'
                                         'strong/a/@href')).strip()
            if not href:
                continue
            remaining -= 1

            s = SearchResult()
            s.cover_url = ''.join(product.xpath('./div[@class="ProductImage"]/a/img/@src'))
            s.title = ''.join(product.xpath('./div[@class="ProductDetails"]/strong/a/text()')).strip()
            s.price = ''.join(product.xpath('./div[@class="ProductPriceRating"]/em/text()'))
            s.drm = SearchResult.DRM_UNLOCKED
            s.detail_item = href

            # Fills in s.author and s.formats from the detail page.
            self.get_author_and_formats(s, timeout)
            if not s.author:
                continue
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search the Mills & Boon UK store for ebooks.'''
    base_url = 'https://www.millsandboon.co.uk'
    # BUG FIX: the query string previously began with a doubled '??', which
    # made the first parameter name '?format' instead of 'format'.
    url = base_url + '/search.aspx?format=ebook&searchText=' + urllib2.quote(query)
    br = browser()
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//article[contains(@class, "group")]'):
            if counter <= 0:
                break
            id_ = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/@href')).strip()
            if not id_:
                continue
            cover_url = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/img/@src'))
            # The title is only present as the cover image's alt text.
            title = ''.join(data.xpath('.//div[@class="img-wrapper"]/a/img/@alt')).strip()
            author = ''.join(data.xpath('.//a[@class="author"]/text()'))
            price = ''.join(data.xpath('.//div[@class="type-wrapper"]/ul/li[child::span[text()="eBook"]]/a/text()'))
            format_ = ''.join(data.xpath('.//p[@class="doc-meta-format"]/span[last()]/text()'))
            drm = SearchResult.DRM_LOCKED

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.detail_item = id_
            s.drm = drm
            s.formats = format_
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search the WHSmith ebook department (https endpoint).'''
    root = 'https://www.whsmith.co.uk'
    url = (root + '/search?keywordCategoryId=wc_dept_ebooks&results=60'
           '&page=1&keywords=' + quote(query))
    br = browser()
    left = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        page = html.fromstring(f.read())
        for item in page.xpath('//li[@class="product"]'):
            if left <= 0:
                break
            link = ''.join(item.xpath('./a[@class="product_image_wrap"]/@href'))
            if not link:
                continue
            left -= 1

            s = SearchResult()
            s.detail_item = root + link
            s.cover_url = ''.join(item.xpath('.//img[@class="product_image"]/@src'))
            s.title = ''.join(item.xpath('.//h4[@class="product_title"]/text()')).strip()
            s.author = ', '.join(item.xpath('.//span[@class="product_second"]/text()')).strip()
            s.price = ''.join(item.xpath('.//span[@class="price"]/text()'))
            s.drm = SearchResult.DRM_LOCKED
            s.formats = 'ePub'
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search Google Books.'''
    url = 'http://www.google.com/search?tbm=bks&q=' + urllib.quote_plus(query)
    br = browser()
    remaining = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for hit in doc.xpath('//ol[@id="rso"]/li'):
            if remaining <= 0:
                break
            href = ''.join(hit.xpath('.//h3/a/@href'))
            if not href:
                continue
            title = ''.join(hit.xpath('.//h3/a//text()'))
            names = hit.xpath('.//div[@class="f"]//a//text()')
            # Trailing links in the byline are navigation ("Preview" etc.),
            # not author names; drop them from the end.
            while names and names[-1].strip().lower() in ('preview', 'read', 'more editions'):
                names = names[:-1]
            if not names:
                continue
            remaining -= 1

            s = SearchResult()
            s.title = title.strip()
            s.author = ', '.join(names).strip()
            s.detail_item = href.strip()
            s.drm = SearchResult.DRM_UNKNOWN
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search woblink.com.

    A MOBI edition, when present, is yielded first as a separate DRM-free
    result; all remaining formats are folded into one DRM-locked result.
    '''
    url = 'http://woblink.com/publication?query=' + urllib.quote_plus(query.encode('utf-8'))
    if max_results > 10:
        if max_results > 20:
            url += '&limit=30'
        else:
            url += '&limit=20'
    br = browser()
    counter = max_results

    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[@class="book-item"]'):
            if counter <= 0:
                break

            id = ''.join(data.xpath('.//td[@class="w10 va-t"]/a[1]/@href'))
            if not id:
                continue

            cover_url = ''.join(data.xpath('.//td[@class="w10 va-t"]/a[1]/img/@src'))
            title = ''.join(data.xpath('.//h2[@class="title"]/a[1]/text()'))
            author = ', '.join(data.xpath('.//p[@class="author"]/a/text()'))
            price = ''.join(data.xpath('.//div[@class="prices"]/span[1]/span/text()'))
            # BUG FIX: was re.sub('\.', ',', price) -- a non-raw '\.' escape
            # (SyntaxWarning on modern Python); a plain replace is sufficient
            # for swapping the decimal separator.
            price = price.replace('.', ',')
            # Format names are encoded in the icon file names.
            formats = [form[8:-4].split('_')[0] for form in data.xpath('.//p[3]/img/@src')]

            s = SearchResult()
            s.cover_url = 'http://woblink.com' + cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price + ' zł'
            s.detail_item = id.strip()

            # MOBI should be send first,
            if 'MOBI' in formats:
                t = copy.copy(s)
                t.title += ' MOBI'
                t.drm = SearchResult.DRM_UNLOCKED
                t.formats = 'MOBI'
                formats.remove('MOBI')
                counter -= 1
                yield t

            # and the remaining formats (if any) next
            if formats:
                if 'epub' in formats:
                    formats.remove('epub')
                    formats.append('WOBLINK')
                    if 'E Ink' in data.xpath('.//div[@class="prices"]/img/@title'):
                        formats.insert(0, 'EPUB')
                s.drm = SearchResult.DRM_LOCKED
                s.formats = ', '.join(formats).upper()
                counter -= 1
                yield s
def search(query, max_results=10, timeout=60, write_raw_to=None):
    '''Search Project Gutenberg's mobile OPDS catalog.

    write_raw_to, when given, is a file path the raw OPDS feed is dumped to
    for debugging. Each result requires a second request to the entry's own
    feed to collect the download links.
    '''
    url = 'http://m.gutenberg.org/ebooks/search.opds/?query=' + urllib.quote_plus(query)

    counter = max_results
    br = browser(user_agent='calibre/'+__version__)
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
        if write_raw_to is not None:
            with open(write_raw_to, 'wb') as f:
                f.write(raw)
        doc = etree.fromstring(raw)
        for data in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break

            counter -= 1

            s = SearchResult()

            # We could use the <link rel="alternate" type="text/html" ...> tag from the
            # detail opds page but this is easier.
            id = ''.join(data.xpath('./*[local-name() = "id"]/text()')).strip()
            # Build the website detail URL from the numeric part of the entry id.
            s.detail_item = fix_url(url_slash_cleaner('%s/ebooks/%s' % (web_url, re.sub('[^\d]', '', id))))
            if not s.detail_item:
                continue

            s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
            s.author = ', '.join(data.xpath('./*[local-name() = "content"]//text()')).strip()
            if not s.title or not s.author:
                continue

            # Get the formats and direct download links.
            with closing(br.open(id, timeout=timeout/4)) as nf:
                ndoc = etree.fromstring(nf.read())
                for link in ndoc.xpath('//*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'):
                    type = link.get('type')
                    href = link.get('href')
                    if type:
                        # Key the download by the extension guessed from its MIME type.
                        ext = mimetypes.guess_extension(type)
                        if ext:
                            ext = ext[1:].upper().strip()
                            s.downloads[ext] = fix_url(href)

            s.formats = ', '.join(s.downloads.keys())
            if not s.formats:
                continue

            # Cover thumbnails arrive as inline base64-encoded PNG data URIs.
            for link in data.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                type = link.get('type')

                if rel and href and type:
                    href = fix_url(href)
                    if rel in ('http://opds-spec.org/thumbnail', 'http://opds-spec.org/image/thumbnail'):
                        if href.startswith('data:image/png;base64,'):
                            s.cover_data = base64.b64decode(href.replace('data:image/png;base64,', ''))

            yield s
def search(query, max_results=10, timeout=60):
    '''Search smashwords.com with the erotica filter cookie set.'''
    url = 'https://www.smashwords.com/books/search?query=' + urllib.parse.quote(query)
    br = browser()
    try:
        br.set_simple_cookie('adultOff', 'erotica', '.smashwords.com', path='/')
    except AttributeError:
        pass  # old version of mechanize

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for data in doc.xpath('//div[@id="pageContent"]//div[@class="library-book"]'):
            if counter <= 0:
                break

            # Re-parse the fragment so the absolute '//' xpaths below are
            # scoped to this one result instead of the whole page.
            data = html.fromstring(html.tostring(data))

            id = None
            id_a = ''.join(data.xpath('//a[contains(@class, "library-title")]/@href'))
            if id_a:
                id = id_a.split('/')[-1]
            if not id:
                continue

            cover_url = ''.join(data.xpath('//img[contains(@class, "book-list-image")]/@src'))
            title = ''.join(data.xpath('.//a[contains(@class, "library-title")]/text()'))
            author = ''.join(data.xpath('.//a[@itemprop="author"]//text()'))
            price = ''.join(data.xpath('.//div[@class="subnote"]//text()'))
            if 'Price:' in price:
                try:
                    price = price.partition('Price:')[2]
                    # BUG FIX: raw string for the regex -- '\s' in a plain
                    # literal is an invalid escape (SyntaxWarning on modern
                    # Python).
                    price = re.sub(r'\s', ' ', price).strip()
                    price = price.split(' ')[0].strip()
                except Exception:
                    price = 'Unknown'
            if price == 'Free!':
                price = '$0.00'

            counter -= 1

            s = SearchResult()
            s.cover_url = cover_url
            s.title = title.strip()
            s.author = author.strip()
            s.price = price
            s.detail_item = '/books/view/' + id.strip()
            s.drm = SearchResult.DRM_UNLOCKED
            yield s
def search(self, query, max_results=10, timeout=60):
    '''Search cdp.pl ebooks, following pagination until max_results
    results have been yielded or there is no next page.'''
    br = browser()
    page = 1

    counter = max_results
    while counter:
        with closing(
                br.open(u'https://cdp.pl/ksiazki/e-book.html?q=' +
                        urllib.quote_plus(query) + '&p=' + str(page),
                        timeout=timeout)) as f:
            doc = html.fromstring(f.read())
            for data in doc.xpath('//ul[@class="grid-of-products"]/li'):
                if counter <= 0:
                    break
                id = ''.join(data.xpath('.//a[@class="product-image"]/@href'))
                if not id:
                    continue
                # Skip non-book products mixed into the listing.
                if 'ksiazki' not in id:
                    continue

                cover_url = ''.join(data.xpath('.//a[@class="product-image"]/img/@data-src'))
                title = ''.join(data.xpath('.//h3[1]/a/@title'))
                # Whole złoty and grosze are rendered in separate elements;
                # join them with the Polish decimal comma.
                price = ''.join(data.xpath('.//span[@class="custom_price"]/text()')) + ',' + ''.join(data.xpath('.//span[@class="custom_price"]/sup/text()'))
                author = ''
                formats = ''
                # Author and formats only appear on the product detail page.
                with closing(br.open(id.strip(), timeout=timeout / 4)) as nf:
                    idata = html.fromstring(nf.read())
                    author = ', '.join(idata.xpath('.//ul[@class="film-data"]/li[1]/p/text()'))
                    formats = idata.xpath('//div[@class="product-attributes-container"][2]/ul/li/span/text()')[-1]

                counter -= 1

                s = SearchResult()
                s.cover_url = cover_url
                s.title = title.strip()
                s.author = author
                s.price = price + ' zł'
                s.detail_item = id.strip()
                s.drm = SearchResult.DRM_UNLOCKED
                s.formats = formats.upper().strip()
                yield s
            # Stop when the pager has no "next page" link.
            if not doc.xpath('//span[@class="next-page"]/a'):
                break
        page += 1
def search(self, query, max_results=10, timeout=60):
    '''Search the Sony Reader store.

    Results missing a title, author or detail link are skipped entirely;
    the counter is only decremented for results that are actually yielded.
    '''
    url = 'http://ebookstore.sony.com/search?keyword=%s' % urllib.quote_plus(query)

    br = browser()

    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        doc = html.fromstring(f.read())
        for item in doc.xpath(
                '//div[contains(@class, "searchResult")]/'
                'descendant::li[contains(@class, "hreview")]'):
            if counter <= 0:
                break

            # Currency code and amount live in separate elements.
            curr = ''.join(item.xpath('descendant::div[@class="pricing"]/descendant::*[@class="currency"]/@title')).strip()
            amt = ''.join(item.xpath('descendant::div[@class="pricing"]/descendant::*[@class="amount"]/text()')).strip()

            s = SearchResult()
            s.price = (curr + ' ' + amt) if (curr and amt) else _('Not Available')
            title = item.xpath('descendant::h3[@class="item"]')
            if not title:
                continue
            # Flatten the heading to its text content (py2-era lxml API:
            # encoding=unicode returns a text string).
            title = etree.tostring(title[0], method='text', encoding=unicode)
            if not title:
                continue
            s.title = title.strip()
            s.author = ''.join(item.xpath('descendant::li[contains(@class, "author")]/'
                                          'a[@class="fn"]/text()')).strip()
            if not s.author:
                continue
            detail_url = ''.join(item.xpath('descendant::h3[@class="item"]'
                                            '/descendant::a[@class="fn" and @href]/@href'))
            if not detail_url:
                continue
            s.detail_item = detail_url

            counter -= 1

            # Normalize protocol-relative and site-relative cover URLs.
            cover_url = ''.join(item.xpath('descendant::li[@class="coverart"]/'
                                           'descendant::img[@src]/@src'))
            if cover_url:
                if cover_url.startswith('//'):
                    cover_url = 'http:' + cover_url
                elif cover_url.startswith('/'):
                    cover_url = 'http://ebookstore.sony.com' + cover_url
                s.cover_url = url_slash_cleaner(cover_url)

            s.drm = SearchResult.DRM_UNKNOWN
            s.formats = 'Sony'

            yield s
def search(query, max_results=10, timeout=60, write_raw_to=None):
    '''Search Project Gutenberg's mobile OPDS catalog.

    write_raw_to, when given, is a file path the raw OPDS feed is dumped to
    for debugging. Each result requires a second request to the entry's own
    feed to collect the download links.
    '''
    url = 'http://m.gutenberg.org/ebooks/search.opds/?query=' + quote_plus(query)

    counter = max_results
    br = browser(user_agent='calibre/'+__version__)
    with closing(br.open(url, timeout=timeout)) as f:
        raw = f.read()
        if write_raw_to is not None:
            with open(write_raw_to, 'wb') as f:
                f.write(raw)
        doc = etree.fromstring(raw)
        for data in doc.xpath('//*[local-name() = "entry"]'):
            if counter <= 0:
                break

            counter -= 1

            s = SearchResult()

            # We could use the <link rel="alternate" type="text/html" ...> tag from the
            # detail opds page but this is easier.
            id = fix_url(''.join(data.xpath('./*[local-name() = "id"]/text()')).strip())
            # Build the website detail URL from the numeric part of the entry id.
            s.detail_item = url_slash_cleaner('%s/ebooks/%s' % (web_url, re.sub(r'[^\d]', '', id)))

            s.title = ' '.join(data.xpath('./*[local-name() = "title"]//text()')).strip()
            s.author = ', '.join(data.xpath('./*[local-name() = "content"]//text()')).strip()
            if not s.title or not s.author:
                continue

            # Get the formats and direct download links.
            with closing(br.open(id, timeout=timeout/4)) as nf:
                ndoc = etree.fromstring(nf.read())
                for link in ndoc.xpath('//*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'):
                    type = link.get('type')
                    href = link.get('href')
                    if type:
                        # Key the download by the extension guessed from its MIME type.
                        ext = mimetypes.guess_extension(type)
                        if ext:
                            ext = ext[1:].upper().strip()
                            s.downloads[ext] = fix_url(href)

            s.formats = ', '.join(s.downloads.keys())
            if not s.formats:
                continue

            # Cover thumbnails arrive as inline base64-encoded PNG data URIs.
            for link in data.xpath('./*[local-name() = "link"]'):
                rel = link.get('rel')
                href = link.get('href')
                type = link.get('type')

                if rel and href and type:
                    href = fix_url(href)
                    if rel in ('http://opds-spec.org/thumbnail', 'http://opds-spec.org/image/thumbnail'):
                        if href.startswith('data:image/png;base64,'):
                            cdata = href.replace('data:image/png;base64,', '')
                            if not isinstance(cdata, bytes):
                                cdata = cdata.encode('ascii')
                            s.cover_data = base64.b64decode(cdata)

            yield s
def search(self, query, max_results=10, timeout=60):
    '''This store was shut down; yield a single placeholder result so the
    user sees an explanation instead of an empty result list.'''
    result = SearchResult()
    result.title = 'Amazon required that this<br>store be permanently closed.'
    result.author = ''
    result.price = ''
    result.detail_item = ''
    result.drm = SearchResult.DRM_UNKNOWN
    yield result
def run(self):
    '''Background worker: refresh the cached MobileRead book list.

    Emits Qt signals (update_details, total_changed, update_progress) to
    report progress, and honours the self._run flag for cancellation.
    Failures are swallowed deliberately -- a stale cache is acceptable.
    '''
    url = 'http://www.mobileread.com/forums/ebooks.php?do=getlist&type=html'

    self.update_details.emit(_('Checking last download date.'))
    last_download = self.config.get('last_download', None)
    # Don't update the book list if our cache is less than one week old.
    if last_download and (time.time() - last_download) < 604800:
        return

    self.update_details.emit(_('Downloading book list from MobileRead.'))
    # Download the book list HTML file from MobileRead.
    br = browser()
    raw_data = None
    try:
        with closing(br.open(url, timeout=self.timeout)) as f:
            raw_data = f.read()
    except:
        # Network failure: keep whatever cache we already have.
        return

    if not raw_data or not self._run:
        return

    self.update_details.emit(_('Processing books.'))
    # Turn books listed in the HTML file into SearchResults's.
    books = []
    try:
        data = html.fromstring(raw_data)
        raw_books = data.xpath('//ul/li')
        self.total_changed.emit(len(raw_books))

        for i, book_data in enumerate(raw_books):
            self.update_details.emit(
                    _('%(num)s of %(tot)s books processed.') % dict(
                        num=i, tot=len(raw_books)))

            book = SearchResult()
            book.detail_item = ''.join(book_data.xpath('.//a/@href'))
            book.formats = ''.join(book_data.xpath('.//i/text()'))
            book.formats = book.formats.strip()

            # Links are of the form "Author: Title"; split off the author.
            text = ''.join(book_data.xpath('.//a/text()'))
            if ':' in text:
                book.author, q, text = text.partition(':')
                book.author = book.author.strip()
            book.title = text.strip()
            books.append(book)

            # Cancelled: discard partial results so a half-built list is
            # never written to the cache.
            if not self._run:
                books = []
                break
            else:
                self.update_progress.emit(i)
    except:
        # Best effort: whatever was parsed before the failure is kept.
        pass

    # Save the book list and its create time.
    if books:
        self.config['book_list'] = self.seralize_books(books)
        self.config['last_download'] = time.time()
def search(self, query, max_results=100, timeout=180):
    '''Search e-knjiga.si for free Slovenian ebooks.

    Yields DRM-free SearchResult objects whose download links are gathered
    from each book's detail page.
    '''
    url = 'http://www.e-knjiga.si/rezultati_cover.php?query=' + urllib2.quote(query)
    br = browser()
    # BUG FIX: max_results was previously ignored (the counter line was
    # commented out) and debug print statements were left in.
    counter = max_results
    with closing(br.open(url, timeout=timeout)) as f:
        root = etree.HTML(f.read())

        # Each search result is rendered as its own 'zebra' table.
        for book in root.xpath("//table[@class='zebra']"):
            if counter <= 0:
                break
            author = book.find('.//tr/[0]/td/[1]').text
            title = book.find('.//tr/[0]/td/[2]/a').text
            details = 'http://www.e-knjiga.si/' + book.find('.//tr/[0]/td/[2]/a').get("href")

            # Fetch the detail page for the cover, description and files.
            # BUG FIX: the detail request now carries a timeout and is
            # closed even if parsing raises.
            with closing(urllib2.urlopen(details, timeout=timeout)) as fo:
                det = etree.HTML(fo.read())

            table = det.find(".//div[@id='center_container']").find('./table')
            cover = 'http://www.e-knjiga.si/' + table.find('.//tr/[1]/td/[1]/div/img').get("src")
            description = table.find(".//tr/[6]/td[@class='knjige_spremna']").text

            links = []
            files = table.find('.//tr/[7]/td/[1]')
            for file in files.iter('a'):
                links.append("http://www.e-knjiga.si/" + file.get("href"))

            counter -= 1

            s = SearchResult()
            s.title = title
            s.author = author
            s.price = "0.00eur"
            s.drm = SearchResult.DRM_UNLOCKED
            s.detail_item = description
            for link in links:
                ftype = link.split(".")[-1]
                s.downloads[ftype] = link
                s.formats += ftype
            s.cover_url = cover
            yield s