def parse(self, url):
    info = {'url': url}
    html = pq(urlopen(url).content)

    # cover image url
    cover = html('#original-main-image')
    cover_url = cover.attr('src')
    info['cover_url'] = cover_url
    print cover_url

    # title and author
    title = html('.parseasinTitle')
    info['title'] = title.text()
    print info['title']
    title_and_author = title.parents('.buying').text()
    author = title_and_author.replace(info['title'], '').strip()
    info['author'] = author
    print author

    # price
    list_price = html('#listPriceValue').text()
    actual_price = html('#actualPriceValue').text()
    info['price'] = list_price or actual_price
    print info['price']

    key_mapping = {
        u'出版社': 'publisher',
        u'语种': 'language',
        u'条形码': 'isbn-13',
        u'商品尺寸': 'product dimensions',
        u'商品重量': 'shipping weight',
        u'外文书名': 'original title',
    }
    # basic info, such as publisher, language, num of pages, ISBN, etc.
    for li in html('#SalesRank').parents('ul').children('li'):
        li = pq(li)
        k, v = parse_pair(li.text())
        if k in (u'用户评分', u'亚马逊热销商品排名'):
            continue
        if k == u'平装':
            info['binding'] = 'paperback'
            try:
                info['num of pages'] = int(v.replace(u'页', ''))
            except ValueError:
                info['num of pages'] = v
        elif k == u'精装':
            info['binding'] = 'hardcover'
            try:
                info['num of pages'] = int(v.replace(u'页', ''))
            except ValueError:
                info['num of pages'] = v
        else:
            k = key_mapping.get(k, k).lower()
            info[k] = v
        print u'{} => {}'.format(k, v)
    return info
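The Amazon parsers and the Douban parser below all call a parse_pair helper that is defined elsewhere in the module. A minimal sketch of what it might look like, assuming each metadata line has the form "key: value" with either a half-width or full-width colon; the regex and exact behaviour here are assumptions, not the original implementation:

# -*- coding: utf-8 -*-
import re

# Hypothetical sketch of the parse_pair helper used by the parsers.
# Assumption: a metadata line looks like "<key>: <value>" or "<key>：<value>",
# so we split on the first (half- or full-width) colon and strip both parts.
PAIR_RE = re.compile(u'[:：]', re.UNICODE)

def parse_pair(text):
    parts = PAIR_RE.split(text, 1)
    if len(parts) == 2:
        key, value = parts
    else:
        key, value = parts[0], u''
    return key.strip(), value.strip()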
def parse(self, url):
    info = {'url': url}
    html = pq(urlopen(url).content)

    # cover image url
    mainpic = html('#mainpic')
    cover_url = mainpic('a').attr('href')
    info['cover_url'] = cover_url
    print cover_url

    # title
    title = mainpic('img').attr('alt')
    info['title'] = title
    print title

    key_mapping = {
        u'作者': 'author',
        u'译者': 'translator',
        u'出版社': 'publisher',
        u'原作名': 'original title',
        u'出版年': 'published year',
        u'页数': 'num of pages',
        u'定价': 'price',
        u'装帧': 'paperback',
        u'丛书': 'series',
        u'副标题': 'subhead',
    }
    # author, publisher, pub year, price, isbn, etc.
    desc = html('#info').html()
    for each in re.split(r'<br/?>', desc):
        each = each.strip()
        if each:
            k, v = parse_pair(pq(each).text())
            if k == u'页数':
                try:
                    info['num of pages'] = int(v)
                except ValueError:
                    info['num of pages'] = v
            else:
                k = key_mapping.get(k, k).lower()
                info[k] = v
            print u'{} => {}'.format(k, v)

    # tags
    tags = [pq(a).text() for a in html('#db-tags-section')('a')]
    info['tags'] = tags
    print ' '.join(tags)
    return info
def parse(self, url):
    info = {'url': url}
    html = pq(urlopen(url).content)

    # cover image url
    cover = html('#main-image')
    cover_url = cover.attr('src')
    info['cover_url'] = cover_url
    print cover_url

    # title and author
    title = html('.parseasinTitle')
    info['title'] = title.text()
    print info['title']
    author = title.siblings('span').text()
    info['author'] = author
    print author

    # price
    for label in html('.rentalPriceLabel'):
        label = pq(label)
        if label.text().strip().lower() == 'buy new':
            price = label.siblings('.rentPrice').text()
            info['price'] = price
            print price
            break

    # basic info, such as publisher, language, num of pages, ISBN, etc.
    for li in html('#SalesRank').parents('ul').children('li'):
        li = pq(li)
        k, v = parse_pair(li.text())
        k = k.lower()
        if k in ('shipping weight', 'average customer review', 'amazon best sellers rank'):
            continue
        if k in ('paperback', 'hardcover'):
            info['binding'] = k
            try:
                info['num of pages'] = int(v.replace('pages', ''))
            except ValueError:
                info['num of pages'] = v
        else:
            info[k] = v
        print u'{} => {}'.format(k, v)
    return info
def get_search_result(url):
    r = urlopen(url)
    html = pq(r.content)

    # collect result links from the search page
    hrefs = []
    for link in html('.r')('a'):
        href = link.attrib['href']
        hrefs.append(href)
        logger.info('result %s: %s', link.text, href)

    # dispatch each link to the parser registered for its site,
    # keeping only the first hit per site
    info = {}
    for href in hrefs:
        hostpath = get_url_hostpath(href)
        for site, parser in parsers.iteritems():
            if site not in info and hostpath.find(site) > -1:
                logger.debug('parsing %s', href)
                info[site] = parser.parse(href)
    return info
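get_search_result relies on two names not shown in this section: a module-level parsers dict, assumed to map a site keyword (e.g. 'amazon.cn' or 'book.douban.com') to a parser instance exposing a parse(url) method, and a get_url_hostpath helper. A sketch of the helper, under the assumption that it reduces a URL to host plus path so the substring match ignores the scheme and query string:

from urlparse import urlparse  # urllib.parse.urlparse on Python 3

def get_url_hostpath(url):
    # Assumed behaviour: keep only "host/path" so matching a result URL
    # against a site keyword is not fooled by scheme or query parameters.
    parts = urlparse(url)
    return parts.netloc + parts.path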
def parse(self, url):
    info = {'url': url}
    html = pq(urlopen(url).content)

    # title
    title = html('h1.f14').text()
    info['title'] = title

    # download url
    btn = html('.download_btn_box')
    host = get_url_host(url)
    download_url = join_url_hostpath(host, btn('a').attr('href'))
    info['download_url'] = download_url
    print download_url

    # file size
    file_size = btn('span').text()
    info['file_size'] = file_size
    print file_size

    return info
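For context, the pieces above could be tied together roughly as follows. This is a hypothetical driver, not code from the source: it assumes the page fed to get_search_result is a Google search for the book title (the '.r a' selector used there matches Google's result markup).

from urllib import quote_plus  # urllib.parse.quote_plus on Python 3

def search_book(title):
    # Hypothetical driver: build a search URL from the (possibly non-ASCII)
    # title and let get_search_result() dispatch each matching result link
    # to the parser registered for its site.
    query = quote_plus(title.encode('utf-8'))
    url = 'https://www.google.com/search?q=' + query
    return get_search_result(url)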