def crawlProducts(start_url, limit=999):
    current_url = start_url
    products = []
    page_count = 1
    total_count = 0  #全部商品数目
    while True:
        content = fetchPageWithUrl(current_url)
        print 'fetch page %s' % page_count
        if not content:
            break
        doc = PyQuery(content)
        nodeList = PyQuery(
            doc('div[data-tracelog-exp="wp_widget_offerhand_main_disp"]').eq(0)
        )(
            'ul.offer-list-row > li'
        )  #PyQuery(doc('div.common-column-150').eq(0))('ul.offer-list-row > li') #common-column-220
        if len(nodeList) < 4:
            print len(nodeList)
        for num, node in enumerate(nodeList):
            nodeQ = PyQuery(node)
            name = nodeQ('div.title > a').attr('title')
            product_url = process_url(nodeQ('div.title > a').attr('href'))
            try:
                MOQ = 0
                p_content = fetchPageWithUrl(product_url)
                p_doc = PyQuery(p_content)
                MOQ = extractNum(
                    p_doc('tr.amount > td.ladder-1-1 > span.value').text())
                if not MOQ or MOQ == 0:
                    MOQ = extractNum(
                        PyQuery(
                            p_doc('tr.amount').remove('td.amount-title').
                            children('td').eq(0))('span.value').text())
            except:
                """do nothing"""
            img_url = "http:" + nodeQ('div.image > a > img').attr(
                'data-lazy-load-src')
            price = nodeQ('div.price').text()
            if nodeQ('div.attributes > span'):
                tags = ''
                for tagNode in nodeQ('div.attributes > span'):
                    tagNodeQ = PyQuery(tagNode)
                    text = tagNodeQ.attr('class')
                    tags = tags + ' ' + text
            else:
                tags = ''
            sold = extractNum(nodeQ('div.booked-count').text())
            total_count += 1
            products.append([
                name, product_url, img_url, price, tags, sold, page_count,
                num + 1, total_count, MOQ
            ])
        next_url = parse_next_url(doc)
        if not next_url:
            break
        current_url = process_url(next_url)
        page_count += 1
        if page_count > limit:
            break
    return products
Example #2
0
def parseProductPage(product, need_img_urls=False):
    """Visit the product detail page and scrape four extra fields:
       delivery, reviews, star, total_sales (plus color/size/MOQ and,
       for MOQ==1 items, the first SKU row).
    """
    if product['product_url']:
       content = fetchContent(product['product_url'], False)
       doc=PyQuery(content)
       #product['delivery'] = doc("div.cost-entries-type > p > em.value").text()  -- shipping fee is JS-rendered; cannot be scraped here
       product['reviews'] = doc('p.satisfaction-number > a > em.value').text()
       product['star'] = doc('p.star-level > i').attr("class")
       product['total_sales'] = doc('p.bargain-number > a > em.value').text()
       if need_img_urls:
           url_list = get_img_urls(content)
           product['img_urls'] = ', '.join(url_list)
       else:
           product['img_urls'] = ''
       product['color'], product['size'] = '', ''
       # Spec table: a 'de-feature' cell holds an attribute label and the
       # next cell its value.  NOTE(review): index+1 assumes the value cell
       # always follows the label -- IndexError if the label is the last cell.
       for index, td in enumerate(doc('div.obj-content > table > tbody > tr > td')):
            tdQ = PyQuery(td)
            if tdQ.attr('class') =='de-feature' and tdQ.text().strip() == u'颜色':  # label: "color"
                product['color'] = PyQuery(doc('div.obj-content > table > tbody > tr > td')[index+1]).text()
            if tdQ.attr('class') =='de-feature' and tdQ.text().strip() == u'尺寸':  # label: "size"
                product['size'] = PyQuery(doc('div.obj-content > table > tbody > tr > td')[index+1]).text()
       product['MOQ'] = extractNum(doc('tr.amount > td.ladder-1-1 > span.value').text().replace(u"≥", ""))
       if not product['MOQ'] or product['MOQ'] == 0:
           # Price-ladder layout: first non-title cell carries the MOQ.
           product['MOQ'] = extractNum(PyQuery(doc('tr.amount').remove('td.amount-title').children('td').eq(0))('span.value').text())
       if product['MOQ'] == 1:
           # MOQ of 1: grab the first SKU row (size/color/price/stock).
           #print product['product_url']
           product['sku_size'] = PyQuery(doc('div.unit-detail-spec-operator').eq(0))('span.text').text()
           product['sku_color'] = PyQuery(doc('table.table-sku > tr').eq(0))('td.name').text()
           product['sku_price'] = PyQuery(doc('table.table-sku > tr').eq(0))('td.price').text()
           product['sku_amount'] = PyQuery(doc('table.table-sku > tr').eq(0))('td.count > span > em.value').text()
           # NOTE(review): 'sku_id' is not set in this function -- presumably
           # populated by the caller before this print; verify.
           print product['sku_id'], '\t', product['sku_size'], "\t", product['sku_color'], "\t", product['sku_price'], "\t", product['sku_amount']
    return product
def parseSupplierContactPage(m):
    """Scrape a 1688 supplier's contact page and rating JSON into dict ``m``.

    Fills: trade_medal, supply-grade, biz-type, contact (phone numbers),
    satisfication, products_count and active_products_count.
    Side effect: normalizes m['url'] to end with '?'.
    """
    # Normalize the shop URL so the substitution below has a '?' to anchor
    # on, e.g. http://yongjia.1688.com/shop/aofudianqi/page/contactinfo.htm?
    # BUGFIX: was m['url'].find('\?') -- str.find is not a regex, so it
    # searched for a literal backslash+question-mark that never occurs.
    if m['url'].find('?') < 0:
        if m['url'].endswith("/"):
            m['url'] = m['url'][:-1]
        m['url'] = m['url'] + '?'
    # Build the contact page url from the shop root.
    contact_page_url = re.sub(r"\?.*$", '/page/contactinfo.htm', m['url'])
    content = fetchContent(contact_page_url)
    doc = PyQuery(content)
    #m['satisfication'] is JS-rendered on this page; fetched via JSON below.
    trade_medal = doc('div.detail > div.trade-medal > span.disc > a.image > img').eq(0).attr('alt')
    m['trade_medal'] = trade_medal if trade_medal else ''
    m['supply-grade'] = len(doc('div.detail > div.supply-grade > span.disc > a.image > img'))
    m['biz-type'] = doc('div.detail > div.biz-type > span').text()
    if not m['biz-type']:
        m['biz-type'] = doc('div.detail > div.smt-biz-type > span').text()
    # Collect phone numbers: any <dl> whose <dt> label contains 话
    # (as in 电话 "telephone") contributes its <dd> value.
    bList = []
    for item in doc('div.contcat-desc > dl'):
        itemQ = PyQuery(item)
        text = itemQ.children('dt').text()
        if text.find(u"话") > 0:
            bList.append(itemQ.children('dd').text())
    m['contact'] = ', '.join(bList)
    # Satisfaction comes from a JSON endpoint, e.g.
    # http://rate.1688.com/stat/trade/winport.json?memberId=aofudianqi&sati=1
    # -> {"data":{"sati":{"satisfactionRate":0,"satisfaction":4.6,"remarkCount":428},...},"success":true}
    memberIds = re.findall('shop/(.*)/page', contact_page_url)
    if memberIds:
        merchantId = memberIds[0]
        stat_url = 'http://rate.1688.com/stat/trade/winport.json?memberId=' + merchantId + '&sati=1'
        content2 = fetchContent(stat_url)
        json_data = json.loads(content2)
        m['satisfication'] = json_data['data']['sati']['satisfaction']
        # Total vs "active" (with sales) product counts from the offer list:
        # http://yiwu.1688.com/shop/<id>/page/offerlist.htm?tradenumFilter=true
        all_products_url = 'http://yiwu.1688.com/shop/' + merchantId + '/page/offerlist.htm?tradenumFilter=true'
        active_product_url = 'http://yiwu.1688.com/shop/' + merchantId + '/page/offerlist.htm'
        doc3 = PyQuery(fetchContent(all_products_url))
        m['products_count'] = extractNum(doc3('li[class="offer-list-tab-title current"] > a > em').text())
        if m['products_count'] == 0:
            # BUGFIX: fallback previously stored the raw text; keep the
            # numeric type consistent with the primary selector.
            m['products_count'] = extractNum(doc3('ul[data-sp="paging-a"] > li > em.offer-count').text())
        doc4 = PyQuery(fetchContent(active_product_url))
        m['active_products_count'] = extractNum(doc4('li[class="offer-list-tab-title current"] > a > em').text())
        if m['active_products_count'] == 0:
            m['active_products_count'] = extractNum(doc4('ul[data-sp="paging-a"] > li > em.offer-count').text())
    else:
        m['satisfication'] = ''
def crawlProducts(start_url, limit=999):
    """Crawl a shop's product listing pages starting at start_url.

    Returns rows of [name, product_url, img_url, price, tags, sold,
    page_count, position_on_page, running_total, MOQ].  Follows "next
    page" links until a fetch fails, there is no next page, or `limit`
    pages have been crawled.
    """
    current_url = start_url
    products = []
    page_count = 1
    total_count = 0 # running total of products seen across all pages
    while True:
        content = fetchPageWithUrl(current_url)
        print 'fetch page %s' %page_count
        if not content:
            break
        doc = PyQuery(content)
        # Product cells under the "offerhand" widget; commented selectors
        # are earlier page layouts.
        nodeList = PyQuery(doc('div[data-tracelog-exp="wp_widget_offerhand_main_disp"]').eq(0))('ul.offer-list-row > li')#PyQuery(doc('div.common-column-150').eq(0))('ul.offer-list-row > li') #common-column-220
        # Fewer than 4 cells usually means the layout was not matched.
        if len(nodeList) < 4:
           print len(nodeList)
        for num, node in enumerate(nodeList):
            nodeQ = PyQuery(node)
            name = nodeQ('div.title > a').attr('title')
            product_url = process_url(nodeQ('div.title > a').attr('href'))
            try:
                MOQ = 0
                # Fetch the detail page just to read the minimum order qty.
                p_content = fetchPageWithUrl(product_url)
                p_doc = PyQuery(p_content)
                MOQ = extractNum(p_doc('tr.amount > td.ladder-1-1 > span.value').text())
                if not MOQ or MOQ == 0:
                    # Price-ladder layout: first non-title cell holds the MOQ.
                    MOQ = extractNum(PyQuery(p_doc('tr.amount').remove('td.amount-title').children('td').eq(0))('span.value').text())
            except:
                """do nothing"""
            img_url = "http:" + nodeQ('div.image > a > img').attr('data-lazy-load-src')
            price = nodeQ('div.price').text()
            # Tags are encoded in the class names of the attribute spans.
            if nodeQ('div.attributes > span'):
                tags = ''
                for tagNode in nodeQ('div.attributes > span'):
                    tagNodeQ = PyQuery(tagNode)
                    text = tagNodeQ.attr('class')
                    tags = tags + ' ' + text
            else:
                tags = ''
            sold = extractNum(nodeQ('div.booked-count').text())
            total_count += 1
            products.append([name, product_url, img_url, price, tags, sold, page_count, num+1, total_count, MOQ])
        next_url = parse_next_url(doc)
        if not next_url:
            break
        current_url = process_url(next_url)
        page_count += 1
        if page_count > limit:
            break
    return products
def crawlProductsByCategory(cate_list, ws, merchant_info):
    row = 1
    for index, atuple in enumerate(cate_list):
        try:
            cate_num = extractNum(atuple[2])
            products = crawlProducts(atuple[0])
            for product in products:
                #这六个是店铺信息
                ws.write(row, 0, merchant_info[0])
                ws.write(row, 1, merchant_info[1])
                ws.write(row, 2, merchant_info[2])
                ws.write(row, 3, merchant_info[3])
                ws.write(row, 4, merchant_info[4])
                ws.write(row, 5, merchant_info[5])
                #店铺内一级品类
                ws.write(row, 6, atuple[1])
                #该品类数目
                ws.write(row, 7, cate_num)
                #抓取的商品信息 [name, product_url, img_url, price, tags, sold]
                ws.write(row, 8, product[0])
                ws.write(row, 9, product[1])
                ws.write(row, 10, product[2])
                ws.write(row, 11, product[3])
                ws.write(row, 12, product[4])
                ws.write(row, 13, str(product[5]))
                ws.write(row, 14, str(product[-1]))
                row += 1
        except Exception, e:
            print e
            print 'category', (index + 1), "of", len(cate_list), "failed!"
def crawlProductsByCategory(cate_list, ws, merchant_info):
    """Crawl each category in cate_list via crawlProducts and write one
    worksheet row per product.  A failed category is logged and skipped.
    """
    row = 1
    for index, atuple in enumerate(cate_list):
        try:
            cate_num = extractNum(atuple[2])
            products = crawlProducts(atuple[0])
            for product in products:
                # these six columns are shop/merchant info
                ws.write(row, 0, merchant_info[0])
                ws.write(row, 1, merchant_info[1])
                ws.write(row, 2, merchant_info[2])
                ws.write(row, 3, merchant_info[3])
                ws.write(row, 4, merchant_info[4])
                ws.write(row, 5, merchant_info[5])
                # top-level category within the shop
                ws.write(row, 6, atuple[1])
                # number of products in this category
                ws.write(row, 7, cate_num)
                # scraped product fields [name, product_url, img_url, price, tags, sold]
                ws.write(row, 8, product[0])
                ws.write(row, 9, product[1])
                ws.write(row, 10, product[2])
                ws.write(row, 11, product[3])
                ws.write(row, 12, product[4])
                ws.write(row, 13, str(product[5]))
                ws.write(row, 14, str(product[-1]))
                row += 1
        except Exception, e:
            print e
            print 'category', (index + 1), "of", len(cate_list), "failed!"
Example #7
0
def parsePage(content):
    """Parse a 1688 supplier-search result page into a list of Merchant
    objects, enriching each one via parseSupplierContactPage.
    """
    doc = PyQuery(content)
    merchantNodeList = doc('div.mod-company-list > div.item')
    merchantList = []
    for node in merchantNodeList:
        nodeQ = PyQuery(node)
        m = Merchant()
        m['city'] = nodeQ('div.origin').find('div.container > a').text()
        m['name'] = nodeQ("dl[class='info basic-info'] > dt > a").eq(-1).text()
        m['url'] = "http:" + nodeQ("dl[class='info basic-info'] > dt > a").eq(
            -1).attr('href')
        m['main_products'] = PyQuery(
            nodeQ("dl[class='info basic-info'] > dd").eq(1)).children(
                "span.value").text()
        m['address'] = PyQuery(
            nodeQ("dl[class='info basic-info'] > dd").eq(2)).children(
                "span.value").text()
        m['satisfaction_rate'] = nodeQ(
            "dl[class='info basic-info'] > dd.probability > span > em.value"
        ).eq(0).text()
        m['retention_rates'] = nodeQ(
            "dl[class='info basic-info'] > dd.probability > span > em.value"
        ).eq(1).text()
        m['weekly_sales'] = extractNum(nodeQ('dd > span.num').text())
        # extra fields: years of "integrity" membership, Alipay support,
        # buyer-protection badge (是 = yes, 否 = no)
        m['years'] = nodeQ('dd.service > a.icon-goldcxt > em').text()
        m['isAlipay'] = u'是' if nodeQ('dd.service > a.icon-alipay') else u"否"
        m['isTrust'] = u'是' if nodeQ('dd.service > a.icon-trust') else u"否"
        # visit the merchant's own pages for contact / rating details
        parseSupplierContactPage(m)
        merchantList.append(m)
    return merchantList
 def insert_into_product_scores(self, product_info_list):
     """Insert one manually-scored product row into product_scores.

     product_info_list: [name, category_id, price, img_url, merchantId,
     productId, url].  Price is normalized to float because the scores
     table stores a double; calc_date is today's date.
     """
     day = datetime.date.today().strftime("%Y-%m-%d")
     sql = """insert into product_scores (product_name, category_id, price, img_url, merchant_id, product_id, product_url, ct_status, score, calc_date, score_type) values (%s, %s, %s, %s, %s, %s, %s, 4, 0.08, %s, 'HUMAN_SET')"""
     # BUGFIX: build the parameter list locally instead of mutating the
     # caller's product_info_list in place (it previously overwrote the
     # price element and appended the date to the caller's list).
     params = list(product_info_list)
     params[2] = float(extractNum(params[2]))  # scores table stores a double
     params.append(day)
     self.db.cursor.execute(sql, params)
Example #9
0
 def myAssertGet_element(self, p_title):
     """Return the number extracted from the first <p> inside the 'sum'
     element whose text contains p_title; None when nothing matches.
     """
     summary = self.get_element('class=>sum')
     for paragraph in summary.find_elements_by_tag_name('p'):
         text = paragraph.text
         if p_title in text:
             print(text)
             return utils.extractNum(text)
Example #10
0
 def myAssertGet_element(self, p_title):
     """Scan the <p> children of the 'sum' element; print and return the
     numeric value from the first one mentioning p_title (None if absent).
     """
     for node in self.get_element('class=>sum').find_elements_by_tag_name('p'):
         content = node.text
         if p_title not in content:
             continue
         print(content)
         return utils.extractNum(content)
Example #11
0
def process_dhgate(entance_url, item):
    """Crawl DHgate search results starting at entance_url and save up to
    ~100 products (name, price, urls, reviews, sold) to
    ./output/dhgate-<item>.xls.
    """
    current_url = entance_url
    count = 0  # doubles as the worksheet row index (row 0 is the header)
    wb = xlwt.Workbook(encoding="utf-8")
    ws = wb.add_sheet("sheet1")
    ws.write(0, 0, "name")
    ws.write(0, 1, "price")
    ws.write(0, 2, "product_url")
    ws.write(0, 3, "img_url")
    ws.write(0, 4, "reviews")
    ws.write(0, 5, "sold")
    while True:
        if not current_url:
            break
        content = fetch_page_content(current_url)
        doc = PyQuery(content)
        nodeList = doc("div.listitem")
        for node in nodeList:
            nodeQ = PyQuery(node)
            name = nodeQ('h3.pro-title > a').text()
            product_url = nodeQ('h3.pro-title > a').attr('href')
            img_url = nodeQ('div.photo > a.pic > img').attr('src')
            price = nodeQ('ul.pricewrap > li.price').text()
            reviews = extractNum(nodeQ('span.reviewnum').text())
            # the "Sold N" counter has no stable class; scan all spans
            spanList = nodeQ('span')
            orders = None
            for span in spanList:
                spanQ = PyQuery(span)
                if spanQ.text().startswith("Sold"):
                    orders = extractNum(spanQ.text())
            count += 1
            ws.write(count, 0, name)
            ws.write(count, 1, price)
            ws.write(count, 2, product_url)
            ws.write(count, 3, img_url)
            ws.write(count, 4, reviews)
            ws.write(count, 5, orders)
        current_url = parseDhgateNextPageUrl(doc, current_url)
        if count >= 100:
            break
    wb.save("./output/dhgate-" + item + ".xls")
def process_dhgate(entance_url, item):
    """Scrape DHgate search listings page by page into an .xls workbook,
    stopping after roughly 100 rows or when there is no next page.
    """
    current_url = entance_url
    count = 0  # row index into the sheet; row 0 holds the header
    wb = xlwt.Workbook(encoding="utf-8")
    ws = wb.add_sheet("sheet1")
    ws.write(0, 0, "name")
    ws.write(0, 1, "price")
    ws.write(0, 2, "product_url")
    ws.write(0, 3, "img_url")
    ws.write(0, 4, "reviews")
    ws.write(0, 5, "sold")
    while True:
        if not current_url:
            break
        content = fetch_page_content(current_url)
        doc = PyQuery(content)
        nodeList = doc("div.listitem")
        for node in nodeList:
            nodeQ = PyQuery(node)
            name = nodeQ("h3.pro-title > a").text()
            product_url = nodeQ("h3.pro-title > a").attr("href")
            img_url = nodeQ("div.photo > a.pic > img").attr("src")
            price = nodeQ("ul.pricewrap > li.price").text()
            reviews = extractNum(nodeQ("span.reviewnum").text())
            # sold counter has no dedicated class: look for "Sold ..." spans
            spanList = nodeQ("span")
            orders = None
            for span in spanList:
                spanQ = PyQuery(span)
                if spanQ.text().startswith("Sold"):
                    orders = extractNum(spanQ.text())
            count += 1
            ws.write(count, 0, name)
            ws.write(count, 1, price)
            ws.write(count, 2, product_url)
            ws.write(count, 3, img_url)
            ws.write(count, 4, reviews)
            ws.write(count, 5, orders)
        current_url = parseDhgateNextPageUrl(doc, current_url)
        if count >= 100:
            break
    wb.save("./output/dhgate-" + item + ".xls")
Example #13
0
def process_aliexpress(entance_url, item):
    """Crawl AliExpress search results starting at entance_url and save up
    to ~100 products to ./output/aliexpress-<item>.xls.
    """
    current_url = entance_url
    count = 0  # worksheet row index; row 0 is the header
    wb = xlwt.Workbook(encoding="utf-8")
    ws = wb.add_sheet("sheet1")
    ws.write(0, 0, "name")
    ws.write(0, 1, "price")
    ws.write(0, 2, "product_url")
    ws.write(0, 3, "img_url")
    ws.write(0, 4, "reviews")
    ws.write(0, 5, "sold")
    while True:
        content = fetch_page_content(current_url)
        doc = PyQuery(content)
        #known failure: http://www.aliexpress.com/wholesale?SearchText=wall+clock&shipCountry=us&SortType=total_tranpro_desc
        nodeList = doc("li.list-item")  #only a few items are in the static HTML
        # the rest of the listing is embedded inside the lazy-render script tag
        doc_part = PyQuery(doc('script#lazy-render').text())
        otherNodeList = doc_part("li.list-item")
        nodeList.extend(otherNodeList)
        for node in nodeList:
            nodeQ = PyQuery(node)
            name = nodeQ('div.detail > h3 > a').eq(0).attr('title')
            product_url = nodeQ('div.detail> h3 > a').eq(0).attr('href')
            img_url = nodeQ('div.img > a > img.picCore').attr('src')
            price = nodeQ('span.price').text()
            # note the trailing space in the class attributes below -- it is
            # part of the page markup and required for an exact match
            reviews = extractNum(
                nodeQ('div.rate-history > a[class="rate-num "]').text())
            orders = extractNum(nodeQ('a[class="order-num-a "] > em').text())
            count += 1
            ws.write(count, 0, name)
            ws.write(count, 1, price)
            ws.write(count, 2, product_url)
            ws.write(count, 3, img_url)
            ws.write(count, 4, reviews)
            ws.write(count, 5, orders)
        current_url = parseAliexpressNextPageUrl(doc, current_url)
        if count >= 100:
            break
    wb.save("./output/aliexpress-" + item + ".xls")
Example #14
0
 def parseProductDetails(self, product_page_content, product_info):
     """Fill reviews, likes and img_urls on product_info from a Zaful
     product detail page.
     """
     doc = pq(product_page_content)
     # example page: http://www.zaful.com/spaghetti-strap-solid-color-openwork-romper-p_45934.html
     # the second span of the rating paragraph carries the review count
     if doc('div[class="text_box last_box"] > div.text_tit > strong').text().strip().lower() == 'reviews':
         if doc('div[class="text_box last_box"] > div.text_tit > p.rating > span').eq(1).text():
             product_info['reviews'] = extractNum(doc('div[class="text_box last_box"] > div.text_tit > p.rating > span').eq(1).text())
     product_info['likes'] = self.crawler.fetchSocialLikes(product_info['product_url'])
     # thumbnail gallery: 'bigimg' holds the full-size image url
     imgNodeList = doc('ul.js_scrollableDiv > li.thumbnail_list > a > img')
     imgList = []
     for node in  imgNodeList:
         if pq(node).attr('bigimg'):
             imgList.append(pq(node).attr('bigimg'))
     product_info['img_urls'] = ', '.join(imgList)
def process_aliexpress(entance_url, item):
    """Scrape AliExpress search listings into an .xls workbook, stopping
    after roughly 100 rows.
    """
    current_url = entance_url
    count = 0  # worksheet row index; row 0 is the header
    wb = xlwt.Workbook(encoding="utf-8")
    ws = wb.add_sheet("sheet1")
    ws.write(0, 0, "name")
    ws.write(0, 1, "price")
    ws.write(0, 2, "product_url")
    ws.write(0, 3, "img_url")
    ws.write(0, 4, "reviews")
    ws.write(0, 5, "sold")
    while True:
        content = fetch_page_content(current_url)
        doc = PyQuery(content)
        # known failure: http://www.aliexpress.com/wholesale?SearchText=wall+clock&shipCountry=us&SortType=total_tranpro_desc
        nodeList = doc("li.list-item")  # only a few items in the static HTML
        # remaining items live inside the lazy-render script tag
        doc_part = PyQuery(doc("script#lazy-render").text())
        otherNodeList = doc_part("li.list-item")
        nodeList.extend(otherNodeList)
        for node in nodeList:
            nodeQ = PyQuery(node)
            name = nodeQ("div.detail > h3 > a").eq(0).attr("title")
            product_url = nodeQ("div.detail> h3 > a").eq(0).attr("href")
            img_url = nodeQ("div.img > a > img.picCore").attr("src")
            price = nodeQ("span.price").text()
            # trailing space inside the class selectors matches the markup
            reviews = extractNum(nodeQ('div.rate-history > a[class="rate-num "]').text())
            orders = extractNum(nodeQ('a[class="order-num-a "] > em').text())
            count += 1
            ws.write(count, 0, name)
            ws.write(count, 1, price)
            ws.write(count, 2, product_url)
            ws.write(count, 3, img_url)
            ws.write(count, 4, reviews)
            ws.write(count, 5, orders)
        current_url = parseAliexpressNextPageUrl(doc, current_url)
        if count >= 100:
            break
    wb.save("./output/aliexpress-" + item + ".xls")
Example #16
0
 def parseProductsByCategory(self, category_page_content, category_info):
     """Parse a Bass Pro category page into a list of product records."""
     doc = pq(category_page_content)
     productList = []
     productNodeList = doc('div#category-prods > div.product')
     for node in productNodeList:
         nodeQ = pq(node)
         productInfo = self.newProduct()
         # remove('span') drops decorations nested inside the name link
         productInfo['name'] = nodeQ('div.info > p > a').remove('span').text()
         productInfo['product_url'] = 'http://www.basspro.com' + nodeQ('div.info > p > a').attr('href')
         productInfo['sku_id'] = self.extractSkuId(productInfo['product_url'])
         productInfo['price'] = nodeQ('div.info > div.pricing > p > a.price').text()
         productInfo['reviews'] = extractNum(nodeQ('div.info > div.reviews > p.reviews > a').text())
         productInfo['img_url'] = nodeQ('div.thumb > a > img').attr('src')
         productInfo.set_categories(category_info)
         productList.append(productInfo)
     return productList
Example #17
0
 def parseProductDetails(self, product_page_content, product_info):
     """Fill reviews, likes and img_urls on product_info from a Zaful
     product detail page."""
     doc = pq(product_page_content)
     # example page: http://www.zaful.com/spaghetti-strap-solid-color-openwork-romper-p_45934.html
     # the second span of the rating paragraph carries the review count
     if doc('div[class="text_box last_box"] > div.text_tit > strong').text(
     ).strip().lower() == 'reviews':
         if doc('div[class="text_box last_box"] > div.text_tit > p.rating > span'
                ).eq(1).text():
             product_info['reviews'] = extractNum(
                 doc('div[class="text_box last_box"] > div.text_tit > p.rating > span'
                     ).eq(1).text())
     product_info['likes'] = self.crawler.fetchSocialLikes(
         product_info['product_url'])
     # thumbnail gallery: 'bigimg' carries the full-size image url
     imgNodeList = doc('ul.js_scrollableDiv > li.thumbnail_list > a > img')
     imgList = []
     for node in imgNodeList:
         if pq(node).attr('bigimg'):
             imgList.append(pq(node).attr('bigimg'))
     product_info['img_urls'] = ', '.join(imgList)
Example #18
0
 def parseProductsByCategory(self, category_page_content, category_info):
     """Parse a category listing (grid view #view_1) into product records."""
     doc = PyQuery(category_page_content)
     productNodeList = doc("div#view_1 > a.productbox")
     productList = []
     for productNode in productNodeList:
         productNodeQ = PyQuery(productNode)
         productInfo = self.newProduct()
         productInfo['name'] = productNodeQ('p[class="sb_prod_name emphasis"]').text()
         productInfo['product_url'] = productNodeQ.attr('href')
         productInfo['img_url'] = productNodeQ("img.blocklevel").attr('src')
         productInfo['reviews'] = extractNum(productNodeQ('div.reviewbox').remove('span').text())
         # price not present in the markup -- suspected to be JS-loaded
         #productInfo['price'] = productNodeQ('span.is_price_value').remove('span').text()
         productInfo['sku_id'] = self.extractSkuId(productInfo['product_url'])
         productInfo['label_price'] = productNodeQ('span.wasprice').text().strip()
         productInfo.set_categories(category_info)
         productList.append(productInfo)
     return productList
 def parseProductDetails(self, product_page_content, product_info):
     """Fill name, sku, prices, image, reviews and category path on
     product_info from a product detail page.
     """
     doc = PyQuery(product_page_content)
     product_info['name'] = doc('h1#div_product_name').text()
     product_info['sku_id'] = doc('span#div_product_itemno').text()
     product_info['price'] = doc('span#div_product_price').text()
     product_info['label_price'] = doc('span#div_retail_price').text()
     product_info['img_url'] = self.merchant.filteruri(doc('img#target_img').attr('src'))
     # Review count: scan <b> nodes for "Customer Reviews (N)".
     product_info['reviews'] = '0'
     bNodeList = doc('b')
     for item in bNodeList:
         text = PyQuery(item).text()
         if text.startswith("Customer Reviews"):
             product_info['reviews'] = extractNum(text)
             break
     # Breadcrumb path -> level1_category, level2_category, ...
     nodeList = doc('a.nav-location')
     # BUGFIX: nodeList[0] raised IndexError when the page carried no
     # a.nav-location links; guard before dropping the leading 'Home'.
     if len(nodeList) > 0 and PyQuery(nodeList[0]).text().strip() == 'Home':
         nodeList = nodeList[1:]
     for i, node in enumerate(nodeList):
         product_info['level' + str(i+1) + '_category'] = PyQuery(node).text().strip()
Example #20
0
 def parseProductsByCategory(self, category_page_content, category_info):
     """Parse a category listing (grid view #view_1) into product records."""
     doc = PyQuery(category_page_content)
     productNodeList = doc("div#view_1 > a.productbox")
     productList = []
     for productNode in productNodeList:
         productNodeQ = PyQuery(productNode)
         productInfo = self.newProduct()
         productInfo['name'] = productNodeQ(
             'p[class="sb_prod_name emphasis"]').text()
         productInfo['product_url'] = productNodeQ.attr('href')
         productInfo['img_url'] = productNodeQ("img.blocklevel").attr('src')
         productInfo['reviews'] = extractNum(
             productNodeQ('div.reviewbox').remove('span').text())
         # price not present in the markup -- suspected to be JS-loaded
         #productInfo['price'] = productNodeQ('span.is_price_value').remove('span').text()
         productInfo['sku_id'] = self.extractSkuId(
             productInfo['product_url'])
         productInfo['label_price'] = productNodeQ(
             'span.wasprice').text().strip()
         productInfo.set_categories(category_info)
         productList.append(productInfo)
     return productList
Example #21
0
 def parseProductDetails(self, product_page_content, product_info):
     doc = PyQuery(product_page_content)
     #为js加载, 内容没有成功地获取
     #此处的likes为facebook, twitter等的合计
     #product_info['likes'] = doc('div.addthis_toolbox > a.addthis_counter > a.addthis_button_expanded').text().strip()
     product_info['reviews'] = extractNum(doc('a#js_gotoReviwBlock').text())
     #调用api来获取facebook的likes数目
     try:
         product_info['likes'] = self.crawler.fetchSocialLikes(product_info['product_url'])
     except:
         print 'fetch facebook like failed...' + '@' + 'http://graph.facebook.com/' + product_info['product_url']
         product_info['likes'] = '0'         
     #获取图片url
     product_info['img_urls'] = []
     imageNodeList = doc('div.goodImg_list > ul.slider > li > a > img')
     for imageNode in imageNodeList:
         image_url = PyQuery(imageNode).attr('bigimg')
         if not image_url:
             image_url = PyQuery(imageNode).attr('src')
         if image_url:
             product_info['img_urls'].append(image_url)
     product_info['img_urls'] = ', '.join(product_info['img_urls'])
Example #22
0
def parsePage(content):
    """Parse a 1688 supplier-search result page into Merchant objects,
    enriching each via parseSupplierContactPage.
    """
    doc = PyQuery(content)
    merchantNodeList = doc('div.mod-company-list > div.item')
    merchantList = []
    for node in merchantNodeList:
        nodeQ = PyQuery(node)
        m = Merchant();
        m['city'] = nodeQ('div.origin').find('div.container > a').text()
        m['name'] = nodeQ("dl[class='info basic-info'] > dt > a").eq(-1).text()
        m['url'] = "http:" + nodeQ("dl[class='info basic-info'] > dt > a").eq(-1).attr('href')
        m['main_products'] = PyQuery(nodeQ("dl[class='info basic-info'] > dd").eq(1)).children("span.value").text()
        m['address'] = PyQuery(nodeQ("dl[class='info basic-info'] > dd").eq(2)).children("span.value").text()
        m['satisfaction_rate'] = nodeQ("dl[class='info basic-info'] > dd.probability > span > em.value").eq(0).text()
        m['retention_rates'] = nodeQ("dl[class='info basic-info'] > dd.probability > span > em.value").eq(1).text()
        m['weekly_sales'] = extractNum(nodeQ('dd > span.num').text())
        # extra fields: years of "integrity" membership, Alipay support,
        # buyer-protection badge (是 = yes, 否 = no)
        m['years'] = nodeQ('dd.service > a.icon-goldcxt > em').text()
        m['isAlipay'] = u'是' if nodeQ('dd.service > a.icon-alipay') else u"否"
        m['isTrust'] = u'是' if nodeQ('dd.service > a.icon-trust') else u"否"
        # visit the merchant's own pages for contact / rating details
        parseSupplierContactPage(m)
        merchantList.append(m)
    return merchantList
Example #23
0
 def parseProductsByCategory(self, category_page_content, category_info):
     """Parse a category page (ul#catePageList) into product records."""
     doc = PyQuery(category_page_content)
     productNodeList = doc('ul#catePageList > li')
     productList = []
     productNodeList = productNodeList[:36]  # the last cell is the "next page" button
     for node in productNodeList:
         nodeQ, productInfo = PyQuery(node), self.newProduct()
         productInfo['name'] = nodeQ('p.all_proNam > a').attr('title')
         productInfo['product_url'] = nodeQ('p.all_proNam > a').attr('href')
         # sku id is encoded in the url as ...p_<digits>.html
         productInfo['sku_id'] = re.findall(
             'p_(\d+)\.html', productInfo['product_url'])[0] if re.findall(
                 'p_(\d+)\.html', productInfo['product_url']) else ''
         productInfo['img_url'] = nodeQ(
             'p.all_proImg > a.proImg_a > img').attr('data-original')
         productInfo['price'] = nodeQ(
             'div.all_price > span.my_shop_price').attr('orgp')
         # NOTE(review): `this` inside the filter() lambda relies on pyquery
         # injecting the current node into the callback -- confirm against
         # the installed pyquery version.
         productInfo['reviews'] = extractNum(
             nodeQ('div.all_proStart > a').filter(lambda i: PyQuery(
                 this).attr('title') == 'Customer Reviews').text())
         productInfo['likes'] = nodeQ('i.addFavorNum').text()
         productInfo.set_categories(category_info)
         productList.append(productInfo)
     return productList
Example #24
0
 def parseProductDetails(self, product_page_content, product_info):
     """Fill name, sku, prices, image, reviews and category path on
     product_info from a product detail page."""
     doc = PyQuery(product_page_content)
     product_info['name'] = doc('h1#div_product_name').text()
     product_info['sku_id'] = doc('span#div_product_itemno').text()
     product_info['price'] = doc('span#div_product_price').text()
     product_info['label_price'] = doc('span#div_retail_price').text()
     product_info['img_url'] = self.merchant.filteruri(
         doc('img#target_img').attr('src'))
     # review count: scan <b> nodes for "Customer Reviews (N)"
     product_info['reviews'] = '0'
     bNodeList = doc('b')
     for item in bNodeList:
         text = PyQuery(item).text()
         if text.startswith("Customer Reviews"):
             product_info['reviews'] = extractNum(text)
             break
     # breadcrumb path -> level1_category, level2_category, ...
     # NOTE(review): nodeList[0] raises IndexError when the page has no
     # a.nav-location links -- confirm pages always carry a breadcrumb.
     nodeList = doc('a.nav-location')
     if PyQuery(nodeList[0]).text().strip() == 'Home':
         nodeList = nodeList[1:]
     for i, node in enumerate(nodeList):
         product_info['level' + str(i + 1) +
                      '_category'] = PyQuery(node).text().strip()
Example #25
0
 def parseProductsByCategory(self, category_page_content, category_info):
     """Parse a category page (ul#catePageList) into product records."""
     doc = PyQuery(category_page_content)
     productNodeList = doc("ul#catePageList > li")
     productList = []
     productNodeList = productNodeList[:36]  # the last cell is the "next page" button
     for node in productNodeList:
         nodeQ, productInfo = PyQuery(node), self.newProduct()
         productInfo["name"] = nodeQ("p.all_proNam > a").attr("title")
         productInfo["product_url"] = nodeQ("p.all_proNam > a").attr("href")
         # sku id is encoded in the url as ...p_<digits>.html
         productInfo["sku_id"] = (
             re.findall("p_(\d+)\.html", productInfo["product_url"])[0]
             if re.findall("p_(\d+)\.html", productInfo["product_url"])
             else ""
         )
         productInfo["img_url"] = nodeQ("p.all_proImg > a.proImg_a > img").attr("data-original")
         productInfo["price"] = nodeQ("div.all_price > span.my_shop_price").attr("orgp")
         # NOTE(review): `this` in the filter callback depends on pyquery
         # injecting the current node -- verify with the pyquery version.
         productInfo["reviews"] = extractNum(
             nodeQ("div.all_proStart > a").filter(lambda i: PyQuery(this).attr("title") == "Customer Reviews").text()
         )
         productInfo["likes"] = nodeQ("i.addFavorNum").text()
         productInfo.set_categories(category_info)
         productList.append(productInfo)
     return productList
Example #26
0
def parseSupplierContactPage(m):
    """Fetch and parse a 1688 supplier's contact page, enriching dict ``m``.

    Populates m['trade_medal'], m['supply-grade'], m['biz-type'],
    m['contact'], m['satisfication'], m['products_count'] and
    m['active_products_count'] from the shop's contact page and its
    rating/offer-list endpoints. Mutates ``m`` in place; returns None.
    """
    # e.g. http://yongjia.1688.com/shop/aofudianqi/page/contactinfo.htm?
    # Normalize the shop URL so it ends with '?', which the re.sub below
    # replaces with the contact-page path.
    # BUGFIX: the original tested m['url'].find('\?'), i.e. searched for the
    # literal two-character substring "\?" (an unrecognized string escape),
    # which never occurs in a URL — so '?' was appended even when one was
    # already present.
    if '?' not in m['url']:
        if m['url'].endswith("/"):
            m['url'] = m['url'][:-1]
        m['url'] = m['url'] + '?'
    # Build the contact page URL by replacing the query part.
    contact_page_url = re.sub(r"\?.*$", '/page/contactinfo.htm', m['url'])
    content = fetchContent(contact_page_url)
    doc = PyQuery(content)
    #m['satisfication'] = doc('div.detail > div.sat-rate > span.disc > a').text()  # loaded dynamically, see JSON endpoint below
    medal = doc('div.detail > div.trade-medal > span.disc > a.image > img').eq(
        0).attr('alt')
    m['trade_medal'] = medal if medal else ''
    m['supply-grade'] = len(
        doc('div.detail > div.supply-grade > span.disc > a.image > img'))
    m['biz-type'] = doc('div.detail > div.biz-type > span').text()
    if not m['biz-type']:
        m['biz-type'] = doc('div.detail > div.smt-biz-type > span').text()
    # Collect phone numbers: every <dl> whose <dt> label contains the
    # character "话" (as in 电话, "telephone").
    phone_list = []
    for item in doc('div.contcat-desc > dl'):
        itemQ = PyQuery(item)
        label = itemQ.children('dt').text()
        # NOTE(review): couldn't strip the whitespace inside the <dt> text
        # reliably, so match on the single character instead. `> 0` (not
        # `>= 0`) preserved from the original; labels like u"电 话" never
        # start with this character in practice.
        if label.find(u"话") > 0:
            phone_list.append(itemQ.children('dd').text())
    m['contact'] = ', '.join(phone_list)
    # Satisfaction comes from a JSON endpoint, e.g.
    # http://rate.1688.com/stat/trade/winport.json?memberId=aofudianqi&sati=1
    # {"data":{"items":[],"sati":{"satisfactionRate":0,"satisfaction":4.6,"remarkCount":428},"dsr":null},"success":true}
    member_ids = re.findall(r'shop/(.*)/page', contact_page_url)  # hoisted: original ran this regex three times
    if member_ids:
        merchantId = member_ids[0]
        stat_url = ('http://rate.1688.com/stat/trade/winport.json?memberId=' +
                    merchantId + '&sati=1')
        content2 = fetchContent(stat_url)
        json_data = json.loads(content2)
        m['satisfication'] = json_data['data']['sati']['satisfaction']
        # Total product count vs. actively-selling product count, e.g.
        # 'http://yiwu.1688.com/shop/ywzxbh03/page/offerlist.htm?tradenumFilter=true'
        all_products_url = 'http://yiwu.1688.com/shop/' + merchantId + '/page/offerlist.htm?tradenumFilter=true'
        active_product_url = 'http://yiwu.1688.com/shop/' + merchantId + '/page/offerlist.htm'
        content3 = fetchContent(all_products_url)
        doc3 = PyQuery(content3)
        m['products_count'] = extractNum(
            doc3('li[class="offer-list-tab-title current"] > a > em').text())
        if m['products_count'] == 0:
            # Fallback selector when the tab header is absent.
            m['products_count'] = doc3(
                'ul[data-sp="paging-a"] > li > em.offer-count').text()
        content4 = fetchContent(active_product_url)
        doc4 = PyQuery(content4)
        m['active_products_count'] = extractNum(
            doc4('li[class="offer-list-tab-title current"] > a > em').text())
        if m['active_products_count'] == 0:
            m['active_products_count'] = doc4(
                'ul[data-sp="paging-a"] > li > em.offer-count').text()
    else:
        m['satisfication'] = ''
Example #27
0
 def processPrice(self, price):
     """Convert an "HKD"-prefixed price string to a USD amount string.

     Divides the numeric part by the fixed HKD/USD rate 7.7502 and
     returns it rounded to two decimals as a string. Non-HKD prices
     yield None.
     """
     if not price.startswith("HKD"):
         return None
     hkd_amount = float(extractNum(price))
     return str(round(hkd_amount / 7.750200, 2))
Example #28
0
 def processPrice(self, price):
     """Translate a Hong Kong dollar price ("HKD...") into a USD string.

     Uses the fixed exchange rate 7.7502 HKD per USD and rounds the
     result to two decimal places. Returns None (implicitly) when the
     price does not carry the "HKD" prefix.
     """
     HKD_PER_USD = 7.750200
     if price.startswith("HKD"):
         usd = float(extractNum(price)) / HKD_PER_USD
         return str(round(usd, 2))