def crawlProducts(start_url, limit=999): current_url = start_url products = [] page_count = 1 total_count = 0 #全部商品数目 while True: content = fetchPageWithUrl(current_url) print 'fetch page %s' % page_count if not content: break doc = PyQuery(content) nodeList = PyQuery( doc('div[data-tracelog-exp="wp_widget_offerhand_main_disp"]').eq(0) )( 'ul.offer-list-row > li' ) #PyQuery(doc('div.common-column-150').eq(0))('ul.offer-list-row > li') #common-column-220 if len(nodeList) < 4: print len(nodeList) for num, node in enumerate(nodeList): nodeQ = PyQuery(node) name = nodeQ('div.title > a').attr('title') product_url = process_url(nodeQ('div.title > a').attr('href')) try: MOQ = 0 p_content = fetchPageWithUrl(product_url) p_doc = PyQuery(p_content) MOQ = extractNum( p_doc('tr.amount > td.ladder-1-1 > span.value').text()) if not MOQ or MOQ == 0: MOQ = extractNum( PyQuery( p_doc('tr.amount').remove('td.amount-title'). children('td').eq(0))('span.value').text()) except: """do nothing""" img_url = "http:" + nodeQ('div.image > a > img').attr( 'data-lazy-load-src') price = nodeQ('div.price').text() if nodeQ('div.attributes > span'): tags = '' for tagNode in nodeQ('div.attributes > span'): tagNodeQ = PyQuery(tagNode) text = tagNodeQ.attr('class') tags = tags + ' ' + text else: tags = '' sold = extractNum(nodeQ('div.booked-count').text()) total_count += 1 products.append([ name, product_url, img_url, price, tags, sold, page_count, num + 1, total_count, MOQ ]) next_url = parse_next_url(doc) if not next_url: break current_url = process_url(next_url) page_count += 1 if page_count > limit: break return products
def parseProductPage(product, need_img_urls=False):
    """Visit the product detail page and scrape extra fields: reviews,
    star, total_sales (delivery is JS-rendered and skipped), plus
    color/size, MOQ and — when MOQ == 1 — the first SKU row.

    Mutates `product` in place and returns it.
    """
    if product['product_url']:
        content = fetchContent(product['product_url'], False)
        doc=PyQuery(content)
        #product['delivery'] = doc("div.cost-entries-type > p > em.value").text()  # shipping fee is loaded by JS; could not be scraped
        product['reviews'] = doc('p.satisfaction-number > a > em.value').text()
        product['star'] = doc('p.star-level > i').attr("class")
        product['total_sales'] = doc('p.bargain-number > a > em.value').text()
        if need_img_urls:
            url_list = get_img_urls(content)
            product['img_urls'] = ', '.join(url_list)
        else:
            product['img_urls'] = ''
        product['color'], product['size'] = '', ''
        # The spec table pairs a 'de-feature' label cell with the value cell
        # that directly follows it; u'颜色' means color, u'尺寸' means size.
        for index, td in enumerate(doc('div.obj-content > table > tbody > tr > td')):
            tdQ = PyQuery(td)
            if tdQ.attr('class') =='de-feature' and tdQ.text().strip() == u'颜色':
                product['color'] = PyQuery(doc('div.obj-content > table > tbody > tr > td')[index+1]).text()
            if tdQ.attr('class') =='de-feature' and tdQ.text().strip() == u'尺寸':
                product['size'] = PyQuery(doc('div.obj-content > table > tbody > tr > td')[index+1]).text()
        # Minimum order quantity; the u"≥" prefix is stripped before extraction.
        product['MOQ'] = extractNum(doc('tr.amount > td.ladder-1-1 > span.value').text().replace(u"≥", ""))
        if not product['MOQ'] or product['MOQ'] == 0:
            # fallback: first quantity cell after removing the title cell
            product['MOQ'] = extractNum(PyQuery(doc('tr.amount').remove('td.amount-title').children('td').eq(0))('span.value').text())
        if product['MOQ'] == 1:
            #print product['product_url']
            product['sku_size'] = PyQuery(doc('div.unit-detail-spec-operator').eq(0))('span.text').text()
            product['sku_color'] = PyQuery(doc('table.table-sku > tr').eq(0))('td.name').text()
            product['sku_price'] = PyQuery(doc('table.table-sku > tr').eq(0))('td.price').text()
            product['sku_amount'] = PyQuery(doc('table.table-sku > tr').eq(0))('td.count > span > em.value').text()
            # NOTE(review): product['sku_id'] is never assigned in this
            # function — presumably set by the caller; verify before relying
            # on this print.
            print product['sku_id'], '\t', product['sku_size'], "\t", product['sku_color'], "\t", product['sku_price'], "\t", product['sku_amount']
    return product
def parseSupplierContactPage(m):
    """Enrich merchant dict `m` by scraping its 1688 contact-info page.

    Fills: trade_medal, supply-grade, biz-type, contact, satisfication,
    products_count, active_products_count. Mutates `m` in place.
    """
    # e.g. http://yongjia.1688.com/shop/aofudianqi/page/contactinfo.htm?
    # FIX: the original tested m['url'].find('\?') — that is a literal
    # backslash followed by '?', which never occurs in a URL, so the
    # "already has a query string" branch was unreachable. Test for '?'.
    if '?' not in m['url']:
        if m['url'].endswith("/"):
            m['url'] = m['url'][:-1]
        m['url'] = m['url'] + '?'
    # Build the contact-page url by replacing the query string.
    contact_page_url = re.sub(r"\?.*$", '/page/contactinfo.htm', m['url'])
    content = fetchContent(contact_page_url)
    doc = PyQuery(content)
    #m['satisfication'] = doc('div.detail > div.sat-rate > span.disc > a').text()  # loaded dynamically, unusable
    medal = doc('div.detail > div.trade-medal > span.disc > a.image > img').eq(0).attr('alt')
    m['trade_medal'] = medal if medal else ''
    m['supply-grade'] = len(doc('div.detail > div.supply-grade > span.disc > a.image > img'))
    m['biz-type'] = doc('div.detail > div.biz-type > span').text()
    if not m['biz-type']:
        m['biz-type'] = doc('div.detail > div.smt-biz-type > span').text()
    # Collect phone numbers: keep <dd> values whose <dt> label contains
    # u"话" (as in telephone).
    bList = []
    for item in doc('div.contcat-desc > dl'):
        itemQ = PyQuery(item)
        text = itemQ.children('dt').text()
        #text = re.sub('\s*','', itemQ.children('dt').text())  # could not strip the embedded spaces
        if text.find(u"话") > 0:
            bList.append(itemQ.children('dd').text())
    m['contact'] = ', '.join(bList)
    # Satisfaction comes from a JSON endpoint, e.g.
    # http://rate.1688.com/stat/trade/winport.json?memberId=aofudianqi&sati=1
    # {"data":{"items":[],"sati":{"satisfactionRate":0,"satisfaction":4.6,"remarkCount":428},"dsr":null},"success":true}
    # FIX: the original ran the same re.findall three times; do it once.
    match = re.findall(r'shop/(.*)/page', contact_page_url)
    if match:
        merchantId = match[0]
        stat_url = 'http://rate.1688.com/stat/trade/winport.json?memberId=' + merchantId + '&sati=1'
        content2 = fetchContent(stat_url)
        json_data = json.loads(content2)
        m['satisfication'] = json_data['data']['sati']['satisfaction']
        # Total product count vs. products with recent sales, e.g.
        # http://yiwu.1688.com/shop/ywzxbh03/page/offerlist.htm?tradenumFilter=true
        # NOTE(review): the yiwu.1688.com host is hard-coded regardless of the
        # merchant's actual subdomain — confirm this is intended.
        all_products_url = 'http://yiwu.1688.com/shop/' + merchantId + '/page/offerlist.htm?tradenumFilter=true'
        active_product_url = 'http://yiwu.1688.com/shop/' + merchantId + '/page/offerlist.htm'
        content3 = fetchContent(all_products_url)
        doc3 = PyQuery(content3)
        m['products_count'] = extractNum(doc3('li[class="offer-list-tab-title current"] > a > em').text())
        if m['products_count'] == 0:
            m['products_count'] = doc3('ul[data-sp="paging-a"] > li > em.offer-count').text()
        content4 = fetchContent(active_product_url)
        doc4 = PyQuery(content4)
        m['active_products_count'] = extractNum(doc4('li[class="offer-list-tab-title current"] > a > em').text())
        if m['active_products_count'] == 0:
            m['active_products_count'] = doc4('ul[data-sp="paging-a"] > li > em.offer-count').text()
    else:
        m['satisfication'] = ''
def crawlProducts(start_url, limit=999): current_url = start_url products = [] page_count = 1 total_count = 0 #全部商品数目 while True: content = fetchPageWithUrl(current_url) print 'fetch page %s' %page_count if not content: break doc = PyQuery(content) nodeList = PyQuery(doc('div[data-tracelog-exp="wp_widget_offerhand_main_disp"]').eq(0))('ul.offer-list-row > li')#PyQuery(doc('div.common-column-150').eq(0))('ul.offer-list-row > li') #common-column-220 if len(nodeList) < 4: print len(nodeList) for num, node in enumerate(nodeList): nodeQ = PyQuery(node) name = nodeQ('div.title > a').attr('title') product_url = process_url(nodeQ('div.title > a').attr('href')) try: MOQ = 0 p_content = fetchPageWithUrl(product_url) p_doc = PyQuery(p_content) MOQ = extractNum(p_doc('tr.amount > td.ladder-1-1 > span.value').text()) if not MOQ or MOQ == 0: MOQ = extractNum(PyQuery(p_doc('tr.amount').remove('td.amount-title').children('td').eq(0))('span.value').text()) except: """do nothing""" img_url = "http:" + nodeQ('div.image > a > img').attr('data-lazy-load-src') price = nodeQ('div.price').text() if nodeQ('div.attributes > span'): tags = '' for tagNode in nodeQ('div.attributes > span'): tagNodeQ = PyQuery(tagNode) text = tagNodeQ.attr('class') tags = tags + ' ' + text else: tags = '' sold = extractNum(nodeQ('div.booked-count').text()) total_count += 1 products.append([name, product_url, img_url, price, tags, sold, page_count, num+1, total_count, MOQ]) next_url = parse_next_url(doc) if not next_url: break current_url = process_url(next_url) page_count += 1 if page_count > limit: break return products
def crawlProductsByCategory(cate_list, ws, merchant_info): row = 1 for index, atuple in enumerate(cate_list): try: cate_num = extractNum(atuple[2]) products = crawlProducts(atuple[0]) for product in products: #这六个是店铺信息 ws.write(row, 0, merchant_info[0]) ws.write(row, 1, merchant_info[1]) ws.write(row, 2, merchant_info[2]) ws.write(row, 3, merchant_info[3]) ws.write(row, 4, merchant_info[4]) ws.write(row, 5, merchant_info[5]) #店铺内一级品类 ws.write(row, 6, atuple[1]) #该品类数目 ws.write(row, 7, cate_num) #抓取的商品信息 [name, product_url, img_url, price, tags, sold] ws.write(row, 8, product[0]) ws.write(row, 9, product[1]) ws.write(row, 10, product[2]) ws.write(row, 11, product[3]) ws.write(row, 12, product[4]) ws.write(row, 13, str(product[5])) ws.write(row, 14, str(product[-1])) row += 1 except Exception, e: print e print 'category', (index + 1), "of", len(cate_list), "failed!"
def parsePage(content):
    """Parse one 1688 supplier-list page into Merchant objects, enriching
    each one via its contact page before returning the list."""
    doc = PyQuery(content)
    merchantList = []
    for node in doc('div.mod-company-list > div.item'):
        nodeQ = PyQuery(node)
        m = Merchant()
        # hoisted selectors reused below
        title_links = nodeQ("dl[class='info basic-info'] > dt > a")
        info_rows = nodeQ("dl[class='info basic-info'] > dd")
        rate_values = nodeQ("dl[class='info basic-info'] > dd.probability > span > em.value")
        m['city'] = nodeQ('div.origin').find('div.container > a').text()
        m['name'] = title_links.eq(-1).text()
        m['url'] = "http:" + title_links.eq(-1).attr('href')
        m['main_products'] = PyQuery(info_rows.eq(1)).children("span.value").text()
        m['address'] = PyQuery(info_rows.eq(2)).children("span.value").text()
        m['satisfaction_rate'] = rate_values.eq(0).text()
        m['retention_rates'] = rate_values.eq(1).text()
        m['weekly_sales'] = extractNum(nodeQ('dd > span.num').text())
        # extra fields: years of integrity membership, Alipay support, buyer protection
        m['years'] = nodeQ('dd.service > a.icon-goldcxt > em').text()
        m['isAlipay'] = u'是' if nodeQ('dd.service > a.icon-alipay') else u"否"
        m['isTrust'] = u'是' if nodeQ('dd.service > a.icon-trust') else u"否"
        # visit the supplier page for contact details and counters
        parseSupplierContactPage(m)
        merchantList.append(m)
    return merchantList
def insert_into_product_scores(self, product_info_list):
    """Insert one manually-scored product row into product_scores.

    product_info_list: [name, category_id, price, img_url, merchantId,
                        productId, url]

    FIX: operate on a copy instead of mutating the caller's list (the
    original overwrote index 2 and appended the date in place).
    """
    day = datetime.date.today().strftime("%Y-%m-%d")
    sql = """insert into product_scores (product_name, category_id, price, img_url, merchant_id, product_id, product_url, ct_status, score, calc_date, score_type) values (%s, %s, %s, %s, %s, %s, %s, 4, 0.08, %s, 'HUMAN_SET')"""
    params = list(product_info_list)
    params[2] = float(extractNum(params[2]))  # the scores table stores price as a double
    params.append(day)
    self.db.cursor.execute(sql, params)
def myAssertGet_element(self, p_title):
    """Return the number extracted from the first <p> under the 'sum'
    element whose text contains p_title; None when nothing matches.

    FIX: removed the dead `else: pass` branch and made the no-match
    return value explicit.
    """
    pList = self.get_element('class=>sum').find_elements_by_tag_name('p')
    for element in pList:
        p = element.text
        if p_title in p:
            print(p)
            return utils.extractNum(p)
    return None
def myAssertGet_element(self, p_title):
    """Return the number extracted from the first <p> under the 'sum'
    element whose text contains p_title; None when nothing matches.

    FIX: removed the dead `else: pass` branch and made the no-match
    return value explicit.
    """
    pList = self.get_element('class=>sum').find_elements_by_tag_name('p')
    for element in pList:
        p = element.text
        if p_title in p:
            print(p)
            return utils.extractNum(p)
    return None
def process_dhgate(entance_url, item):
    """Crawl DHgate listing pages starting at entance_url and write up to
    ~100 products to ./output/dhgate-<item>.xls."""
    wb = xlwt.Workbook(encoding="utf-8")
    ws = wb.add_sheet("sheet1")
    for col, header in enumerate(("name", "price", "product_url", "img_url", "reviews", "sold")):
        ws.write(0, col, header)
    current_url = entance_url
    count = 0
    while True:
        if not current_url:
            break
        doc = PyQuery(fetch_page_content(current_url))
        for node in doc("div.listitem"):
            nodeQ = PyQuery(node)
            name = nodeQ('h3.pro-title > a').text()
            product_url = nodeQ('h3.pro-title > a').attr('href')
            img_url = nodeQ('div.photo > a.pic > img').attr('src')
            price = nodeQ('ul.pricewrap > li.price').text()
            reviews = extractNum(nodeQ('span.reviewnum').text())
            # the sold count lives in whichever <span> starts with "Sold"
            orders = None
            for span in nodeQ('span'):
                spanQ = PyQuery(span)
                if spanQ.text().startswith("Sold"):
                    orders = extractNum(spanQ.text())
            count += 1
            for col, value in enumerate((name, price, product_url, img_url, reviews, orders)):
                ws.write(count, col, value)
        current_url = parseDhgateNextPageUrl(doc, current_url)
        if count >= 100:
            break
    wb.save("./output/dhgate-" + item + ".xls")
def process_dhgate(entance_url, item):
    """Crawl DHgate listing pages starting at entance_url and write up to
    ~100 products to ./output/dhgate-<item>.xls."""
    wb = xlwt.Workbook(encoding="utf-8")
    ws = wb.add_sheet("sheet1")
    for col, header in enumerate(("name", "price", "product_url", "img_url", "reviews", "sold")):
        ws.write(0, col, header)
    current_url = entance_url
    count = 0
    while True:
        if not current_url:
            break
        doc = PyQuery(fetch_page_content(current_url))
        for node in doc("div.listitem"):
            nodeQ = PyQuery(node)
            name = nodeQ("h3.pro-title > a").text()
            product_url = nodeQ("h3.pro-title > a").attr("href")
            img_url = nodeQ("div.photo > a.pic > img").attr("src")
            price = nodeQ("ul.pricewrap > li.price").text()
            reviews = extractNum(nodeQ("span.reviewnum").text())
            # the sold count lives in whichever <span> starts with "Sold"
            orders = None
            for span in nodeQ("span"):
                spanQ = PyQuery(span)
                if spanQ.text().startswith("Sold"):
                    orders = extractNum(spanQ.text())
            count += 1
            for col, value in enumerate((name, price, product_url, img_url, reviews, orders)):
                ws.write(count, col, value)
        current_url = parseDhgateNextPageUrl(doc, current_url)
        if count >= 100:
            break
    wb.save("./output/dhgate-" + item + ".xls")
def process_aliexpress(entance_url, item):
    """Crawl AliExpress search-result pages for `item` and write up to
    ~100 products to ./output/aliexpress-<item>.xls.

    FIX: stop when pagination runs out — the original looped on without a
    falsy-url guard (the sibling process_dhgate already has one), so an
    exhausted parseAliexpressNextPageUrl fed None back into
    fetch_page_content.
    """
    current_url = entance_url
    count = 0
    wb = xlwt.Workbook(encoding="utf-8")
    ws = wb.add_sheet("sheet1")
    ws.write(0, 0, "name")
    ws.write(0, 1, "price")
    ws.write(0, 2, "product_url")
    ws.write(0, 3, "img_url")
    ws.write(0, 4, "reviews")
    ws.write(0, 5, "sold")
    while True:
        if not current_url:
            break
        content = fetch_page_content(current_url)
        doc = PyQuery(content)
        # known failing query: http://www.aliexpress.com/wholesale?SearchText=wall+clock&shipCountry=us&SortType=total_tranpro_desc
        nodeList = doc("li.list-item")  # only yields the first few items
        # the rest of the grid is embedded in a lazy-render <script> block
        doc_part = PyQuery(doc('script#lazy-render').text())
        otherNodeList = doc_part("li.list-item")
        nodeList.extend(otherNodeList)
        for node in nodeList:
            nodeQ = PyQuery(node)
            name = nodeQ('div.detail > h3 > a').eq(0).attr('title')
            product_url = nodeQ('div.detail> h3 > a').eq(0).attr('href')
            img_url = nodeQ('div.img > a > img.picCore').attr('src')
            price = nodeQ('span.price').text()
            reviews = extractNum(nodeQ('div.rate-history > a[class="rate-num "]').text())
            orders = extractNum(nodeQ('a[class="order-num-a "] > em').text())
            count += 1
            ws.write(count, 0, name)
            ws.write(count, 1, price)
            ws.write(count, 2, product_url)
            ws.write(count, 3, img_url)
            ws.write(count, 4, reviews)
            ws.write(count, 5, orders)
        current_url = parseAliexpressNextPageUrl(doc, current_url)
        if count >= 100:
            break
    wb.save("./output/aliexpress-" + item + ".xls")
def parseProductDetails(self, product_page_content, product_info): doc = pq(product_page_content) #http://www.zaful.com/spaghetti-strap-solid-color-openwork-romper-p_45934.html if doc('div[class="text_box last_box"] > div.text_tit > strong').text().strip().lower() == 'reviews': if doc('div[class="text_box last_box"] > div.text_tit > p.rating > span').eq(1).text(): product_info['reviews'] = extractNum(doc('div[class="text_box last_box"] > div.text_tit > p.rating > span').eq(1).text()) product_info['likes'] = self.crawler.fetchSocialLikes(product_info['product_url']) imgNodeList = doc('ul.js_scrollableDiv > li.thumbnail_list > a > img') imgList = [] for node in imgNodeList: if pq(node).attr('bigimg'): imgList.append(pq(node).attr('bigimg')) product_info['img_urls'] = ', '.join(imgList)
def process_aliexpress(entance_url, item):
    """Crawl AliExpress search-result pages for `item` and write up to
    ~100 products to ./output/aliexpress-<item>.xls.

    FIX: stop when pagination runs out — the original looped on without a
    falsy-url guard (the sibling process_dhgate already has one), so an
    exhausted parseAliexpressNextPageUrl fed None back into
    fetch_page_content.
    """
    current_url = entance_url
    count = 0
    wb = xlwt.Workbook(encoding="utf-8")
    ws = wb.add_sheet("sheet1")
    ws.write(0, 0, "name")
    ws.write(0, 1, "price")
    ws.write(0, 2, "product_url")
    ws.write(0, 3, "img_url")
    ws.write(0, 4, "reviews")
    ws.write(0, 5, "sold")
    while True:
        if not current_url:
            break
        content = fetch_page_content(current_url)
        doc = PyQuery(content)
        # known failing query: http://www.aliexpress.com/wholesale?SearchText=wall+clock&shipCountry=us&SortType=total_tranpro_desc
        nodeList = doc("li.list-item")  # only yields the first few items
        # the rest of the grid is embedded in a lazy-render <script> block
        doc_part = PyQuery(doc("script#lazy-render").text())
        otherNodeList = doc_part("li.list-item")
        nodeList.extend(otherNodeList)
        for node in nodeList:
            nodeQ = PyQuery(node)
            name = nodeQ("div.detail > h3 > a").eq(0).attr("title")
            product_url = nodeQ("div.detail> h3 > a").eq(0).attr("href")
            img_url = nodeQ("div.img > a > img.picCore").attr("src")
            price = nodeQ("span.price").text()
            reviews = extractNum(nodeQ('div.rate-history > a[class="rate-num "]').text())
            orders = extractNum(nodeQ('a[class="order-num-a "] > em').text())
            count += 1
            ws.write(count, 0, name)
            ws.write(count, 1, price)
            ws.write(count, 2, product_url)
            ws.write(count, 3, img_url)
            ws.write(count, 4, reviews)
            ws.write(count, 5, orders)
        current_url = parseAliexpressNextPageUrl(doc, current_url)
        if count >= 100:
            break
    wb.save("./output/aliexpress-" + item + ".xls")
def parseProductsByCategory(self, category_page_content, category_info):
    """Parse a Bass Pro category listing page into a list of product
    records tagged with category_info."""
    doc = pq(category_page_content)
    productList = []
    for node in doc('div#category-prods > div.product'):
        nodeQ = pq(node)
        info = self.newProduct()
        # the <a> holds extra <span>s; strip them before reading the name
        info['name'] = nodeQ('div.info > p > a').remove('span').text()
        info['product_url'] = 'http://www.basspro.com' + nodeQ('div.info > p > a').attr('href')
        info['sku_id'] = self.extractSkuId(info['product_url'])
        info['price'] = nodeQ('div.info > div.pricing > p > a.price').text()
        info['reviews'] = extractNum(nodeQ('div.info > div.reviews > p.reviews > a').text())
        info['img_url'] = nodeQ('div.thumb > a > img').attr('src')
        info.set_categories(category_info)
        productList.append(info)
    return productList
def parseProductDetails(self, product_page_content, product_info): doc = pq(product_page_content) #http://www.zaful.com/spaghetti-strap-solid-color-openwork-romper-p_45934.html if doc('div[class="text_box last_box"] > div.text_tit > strong').text( ).strip().lower() == 'reviews': if doc('div[class="text_box last_box"] > div.text_tit > p.rating > span' ).eq(1).text(): product_info['reviews'] = extractNum( doc('div[class="text_box last_box"] > div.text_tit > p.rating > span' ).eq(1).text()) product_info['likes'] = self.crawler.fetchSocialLikes( product_info['product_url']) imgNodeList = doc('ul.js_scrollableDiv > li.thumbnail_list > a > img') imgList = [] for node in imgNodeList: if pq(node).attr('bigimg'): imgList.append(pq(node).attr('bigimg')) product_info['img_urls'] = ', '.join(imgList)
def parseProductsByCategory(self, category_page_content, category_info):
    """Parse a category grid page (div#view_1) into product records tagged
    with category_info."""
    doc = PyQuery(category_page_content)
    productList = []
    for box in doc("div#view_1 > a.productbox"):
        boxQ = PyQuery(box)
        info = self.newProduct()
        info['name'] = boxQ('p[class="sb_prod_name emphasis"]').text()
        info['product_url'] = boxQ.attr('href')
        info['img_url'] = boxQ("img.blocklevel").attr('src')
        info['reviews'] = extractNum(boxQ('div.reviewbox').remove('span').text())
        # price was not found in the markup — suspected to be JS-loaded:
        # info['price'] = boxQ('span.is_price_value').remove('span').text()
        info['sku_id'] = self.extractSkuId(info['product_url'])
        info['label_price'] = boxQ('span.wasprice').text().strip()
        info.set_categories(category_info)
        productList.append(info)
    return productList
def parseProductDetails(self, product_page_content, product_info):
    """Scrape name, sku, prices, main image, review count, and the
    breadcrumb category path into product_info (mutated in place)."""
    doc = PyQuery(product_page_content)
    product_info['name'] = doc('h1#div_product_name').text()
    product_info['sku_id'] = doc('span#div_product_itemno').text()
    product_info['price'] = doc('span#div_product_price').text()
    product_info['label_price'] = doc('span#div_retail_price').text()
    product_info['img_url'] = self.merchant.filteruri(doc('img#target_img').attr('src'))
    # review count: find the <b> tag whose text starts with "Customer Reviews"
    product_info['reviews'] = '0'
    bNodeList = doc('b')
    for item in bNodeList:
        text = PyQuery(item).text()
        if text.startswith("Customer Reviews"):
            product_info['reviews'] = extractNum(text)
            break
    # category path from the breadcrumb, skipping a leading "Home" entry.
    # FIX: guard against an empty breadcrumb — the original indexed
    # nodeList[0] unconditionally and raised IndexError on pages without one.
    nodeList = doc('a.nav-location')
    if nodeList and PyQuery(nodeList[0]).text().strip() == 'Home':
        nodeList = nodeList[1:]
    for i, node in enumerate(nodeList):
        product_info['level' + str(i + 1) + '_category'] = PyQuery(node).text().strip()
def parseProductsByCategory(self, category_page_content, category_info):
    """Parse a category grid page (div#view_1) into product records tagged
    with category_info."""
    doc = PyQuery(category_page_content)
    productList = []
    for box in doc("div#view_1 > a.productbox"):
        boxQ = PyQuery(box)
        info = self.newProduct()
        info['name'] = boxQ('p[class="sb_prod_name emphasis"]').text()
        info['product_url'] = boxQ.attr('href')
        info['img_url'] = boxQ("img.blocklevel").attr('src')
        info['reviews'] = extractNum(boxQ('div.reviewbox').remove('span').text())
        # price was not found in the markup — suspected to be JS-loaded:
        # info['price'] = boxQ('span.is_price_value').remove('span').text()
        info['sku_id'] = self.extractSkuId(info['product_url'])
        info['label_price'] = boxQ('span.wasprice').text().strip()
        info.set_categories(category_info)
        productList.append(info)
    return productList
def parseProductDetails(self, product_page_content, product_info): doc = PyQuery(product_page_content) #为js加载, 内容没有成功地获取 #此处的likes为facebook, twitter等的合计 #product_info['likes'] = doc('div.addthis_toolbox > a.addthis_counter > a.addthis_button_expanded').text().strip() product_info['reviews'] = extractNum(doc('a#js_gotoReviwBlock').text()) #调用api来获取facebook的likes数目 try: product_info['likes'] = self.crawler.fetchSocialLikes(product_info['product_url']) except: print 'fetch facebook like failed...' + '@' + 'http://graph.facebook.com/' + product_info['product_url'] product_info['likes'] = '0' #获取图片url product_info['img_urls'] = [] imageNodeList = doc('div.goodImg_list > ul.slider > li > a > img') for imageNode in imageNodeList: image_url = PyQuery(imageNode).attr('bigimg') if not image_url: image_url = PyQuery(imageNode).attr('src') if image_url: product_info['img_urls'].append(image_url) product_info['img_urls'] = ', '.join(product_info['img_urls'])
def parsePage(content):
    """Parse one 1688 supplier-list page into Merchant objects, enriching
    each one via its contact page before returning the list."""
    doc = PyQuery(content)
    merchantList = []
    for node in doc('div.mod-company-list > div.item'):
        nodeQ = PyQuery(node)
        m = Merchant()
        # hoisted selectors reused below
        title_links = nodeQ("dl[class='info basic-info'] > dt > a")
        info_rows = nodeQ("dl[class='info basic-info'] > dd")
        rate_values = nodeQ("dl[class='info basic-info'] > dd.probability > span > em.value")
        m['city'] = nodeQ('div.origin').find('div.container > a').text()
        m['name'] = title_links.eq(-1).text()
        m['url'] = "http:" + title_links.eq(-1).attr('href')
        m['main_products'] = PyQuery(info_rows.eq(1)).children("span.value").text()
        m['address'] = PyQuery(info_rows.eq(2)).children("span.value").text()
        m['satisfaction_rate'] = rate_values.eq(0).text()
        m['retention_rates'] = rate_values.eq(1).text()
        m['weekly_sales'] = extractNum(nodeQ('dd > span.num').text())
        # extra fields: years of integrity membership, Alipay support, buyer protection
        m['years'] = nodeQ('dd.service > a.icon-goldcxt > em').text()
        m['isAlipay'] = u'是' if nodeQ('dd.service > a.icon-alipay') else u"否"
        m['isTrust'] = u'是' if nodeQ('dd.service > a.icon-trust') else u"否"
        # visit the supplier page for contact details and counters
        parseSupplierContactPage(m)
        merchantList.append(m)
    return merchantList
def parseProductsByCategory(self, category_page_content, category_info):
    """Parse a paginated category listing (ul#catePageList) into product
    records tagged with category_info."""
    doc = PyQuery(category_page_content)
    productList = []
    # keep at most 36 items: the last <li> is the next-page button
    productNodeList = doc('ul#catePageList > li')[:36]
    for node in productNodeList:
        nodeQ, productInfo = PyQuery(node), self.newProduct()
        productInfo['name'] = nodeQ('p.all_proNam > a').attr('title')
        productInfo['product_url'] = nodeQ('p.all_proNam > a').attr('href')
        # FIX: the original evaluated the same re.findall twice; run it once.
        sku_match = re.findall(r'p_(\d+)\.html', productInfo['product_url'])
        productInfo['sku_id'] = sku_match[0] if sku_match else ''
        productInfo['img_url'] = nodeQ('p.all_proImg > a.proImg_a > img').attr('data-original')
        productInfo['price'] = nodeQ('div.all_price > span.my_shop_price').attr('orgp')
        # NOTE(review): `this` in the filter lambda relies on pyquery
        # injecting the current element into the callback's scope — confirm.
        productInfo['reviews'] = extractNum(
            nodeQ('div.all_proStart > a').filter(
                lambda i: PyQuery(this).attr('title') == 'Customer Reviews').text())
        productInfo['likes'] = nodeQ('i.addFavorNum').text()
        productInfo.set_categories(category_info)
        productList.append(productInfo)
    return productList
def parseProductDetails(self, product_page_content, product_info):
    """Scrape name, sku, prices, main image, review count, and the
    breadcrumb category path into product_info (mutated in place)."""
    doc = PyQuery(product_page_content)
    product_info['name'] = doc('h1#div_product_name').text()
    product_info['sku_id'] = doc('span#div_product_itemno').text()
    product_info['price'] = doc('span#div_product_price').text()
    product_info['label_price'] = doc('span#div_retail_price').text()
    product_info['img_url'] = self.merchant.filteruri(doc('img#target_img').attr('src'))
    # review count: find the <b> tag whose text starts with "Customer Reviews"
    product_info['reviews'] = '0'
    bNodeList = doc('b')
    for item in bNodeList:
        text = PyQuery(item).text()
        if text.startswith("Customer Reviews"):
            product_info['reviews'] = extractNum(text)
            break
    # category path from the breadcrumb, skipping a leading "Home" entry.
    # FIX: guard against an empty breadcrumb — the original indexed
    # nodeList[0] unconditionally and raised IndexError on pages without one.
    nodeList = doc('a.nav-location')
    if nodeList and PyQuery(nodeList[0]).text().strip() == 'Home':
        nodeList = nodeList[1:]
    for i, node in enumerate(nodeList):
        product_info['level' + str(i + 1) + '_category'] = PyQuery(node).text().strip()
def parseProductsByCategory(self, category_page_content, category_info):
    """Parse a paginated category listing (ul#catePageList) into product
    records tagged with category_info."""
    doc = PyQuery(category_page_content)
    productList = []
    # keep at most 36 items: the last <li> is the next-page button
    productNodeList = doc("ul#catePageList > li")[:36]
    for node in productNodeList:
        nodeQ, productInfo = PyQuery(node), self.newProduct()
        productInfo["name"] = nodeQ("p.all_proNam > a").attr("title")
        productInfo["product_url"] = nodeQ("p.all_proNam > a").attr("href")
        # FIX: the original evaluated the same re.findall twice; run it once.
        sku_match = re.findall(r"p_(\d+)\.html", productInfo["product_url"])
        productInfo["sku_id"] = sku_match[0] if sku_match else ""
        productInfo["img_url"] = nodeQ("p.all_proImg > a.proImg_a > img").attr("data-original")
        productInfo["price"] = nodeQ("div.all_price > span.my_shop_price").attr("orgp")
        # NOTE(review): `this` in the filter lambda relies on pyquery
        # injecting the current element into the callback's scope — confirm.
        productInfo["reviews"] = extractNum(
            nodeQ("div.all_proStart > a").filter(
                lambda i: PyQuery(this).attr("title") == "Customer Reviews").text()
        )
        productInfo["likes"] = nodeQ("i.addFavorNum").text()
        productInfo.set_categories(category_info)
        productList.append(productInfo)
    return productList
def parseSupplierContactPage(m):
    """Enrich merchant dict `m` by scraping its 1688 contact-info page.

    Fills: trade_medal, supply-grade, biz-type, contact, satisfication,
    products_count, active_products_count. Mutates `m` in place.
    """
    # e.g. http://yongjia.1688.com/shop/aofudianqi/page/contactinfo.htm?
    # FIX: the original tested m['url'].find('\?') — that is a literal
    # backslash followed by '?', which never occurs in a URL, so the
    # "already has a query string" branch was unreachable. Test for '?'.
    if '?' not in m['url']:
        if m['url'].endswith("/"):
            m['url'] = m['url'][:-1]
        m['url'] = m['url'] + '?'
    # Build the contact-page url by replacing the query string.
    contact_page_url = re.sub(r"\?.*$", '/page/contactinfo.htm', m['url'])
    content = fetchContent(contact_page_url)
    doc = PyQuery(content)
    #m['satisfication'] = doc('div.detail > div.sat-rate > span.disc > a').text()  # loaded dynamically, unusable
    medal = doc('div.detail > div.trade-medal > span.disc > a.image > img').eq(0).attr('alt')
    m['trade_medal'] = medal if medal else ''
    m['supply-grade'] = len(doc('div.detail > div.supply-grade > span.disc > a.image > img'))
    m['biz-type'] = doc('div.detail > div.biz-type > span').text()
    if not m['biz-type']:
        m['biz-type'] = doc('div.detail > div.smt-biz-type > span').text()
    # Collect phone numbers: keep <dd> values whose <dt> label contains
    # u"话" (as in telephone).
    bList = []
    for item in doc('div.contcat-desc > dl'):
        itemQ = PyQuery(item)
        text = itemQ.children('dt').text()
        #text = re.sub('\s*','', itemQ.children('dt').text())  # could not strip the embedded spaces
        if text.find(u"话") > 0:
            bList.append(itemQ.children('dd').text())
    m['contact'] = ', '.join(bList)
    # Satisfaction comes from a JSON endpoint, e.g.
    # http://rate.1688.com/stat/trade/winport.json?memberId=aofudianqi&sati=1
    # {"data":{"items":[],"sati":{"satisfactionRate":0,"satisfaction":4.6,"remarkCount":428},"dsr":null},"success":true}
    # FIX: the original ran the same re.findall three times; do it once.
    match = re.findall(r'shop/(.*)/page', contact_page_url)
    if match:
        merchantId = match[0]
        stat_url = 'http://rate.1688.com/stat/trade/winport.json?memberId=' + merchantId + '&sati=1'
        content2 = fetchContent(stat_url)
        json_data = json.loads(content2)
        m['satisfication'] = json_data['data']['sati']['satisfaction']
        # Total product count vs. products with recent sales, e.g.
        # http://yiwu.1688.com/shop/ywzxbh03/page/offerlist.htm?tradenumFilter=true
        # NOTE(review): the yiwu.1688.com host is hard-coded regardless of the
        # merchant's actual subdomain — confirm this is intended.
        all_products_url = 'http://yiwu.1688.com/shop/' + merchantId + '/page/offerlist.htm?tradenumFilter=true'
        active_product_url = 'http://yiwu.1688.com/shop/' + merchantId + '/page/offerlist.htm'
        content3 = fetchContent(all_products_url)
        doc3 = PyQuery(content3)
        m['products_count'] = extractNum(doc3('li[class="offer-list-tab-title current"] > a > em').text())
        if m['products_count'] == 0:
            m['products_count'] = doc3('ul[data-sp="paging-a"] > li > em.offer-count').text()
        content4 = fetchContent(active_product_url)
        doc4 = PyQuery(content4)
        m['active_products_count'] = extractNum(doc4('li[class="offer-list-tab-title current"] > a > em').text())
        if m['active_products_count'] == 0:
            m['active_products_count'] = doc4('ul[data-sp="paging-a"] > li > em.offer-count').text()
    else:
        m['satisfication'] = ''
def processPrice(self, price, rate=7.750200):
    """Convert a price string to USD and return it rounded to 2 decimals.

    price -- numeric string; values prefixed with "HKD" are first reduced
             to their numeric part via extractNum.
    rate  -- units of the source currency per USD. Generalized from the
             original hard-coded 7.750200 (the HKD peg) into a defaulted
             parameter, so other currencies can reuse this method.
    """
    if price.startswith("HKD"):
        price = extractNum(price)
    return str(round(float(price) / rate, 2))