Example #1
    def crawl(self):
        wareId = str(self.key)
        url = "http://item.yhd.com/item/%s" % wareId
        html_stream = ProcessData.get_web_data(url)
        tree = etree.HTML(html_stream.text)
        self.crawler_data(tree)
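Every snippet on this page goes through a ProcessData.get_web_data helper whose implementation is not shown. A minimal sketch of what it might look like, assuming it is a thin wrapper around requests that returns the response object (the .text/.encoding usage in the snippets fits that shape, and Example #7 checks for an empty dict on failure):

    # hypothetical sketch -- ProcessData itself is not part of these examples
    import requests

    class ProcessData(object):

        @staticmethod
        def get_web_data(url):
            try:
                response = requests.get(url, timeout=10)
                response.raise_for_status()
                return response  # callers read .text and may set .encoding
            except requests.RequestException:
                return {}  # Example #7 treats an empty dict as "fetch failed"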
Example #2
    def crawl(self):
        key = str(self.key)
        count = 2  # exclusive upper bound: crawl pages 1 .. count-1
        for pages in xrange(1, count):  # start from the first page
            url = self.get_url(key, pages)
            html_stream = ProcessData.get_web_data(url)
            tree = etree.HTML(html_stream.text)
            self.crawler_data(tree)
Example #3
    def crawl(self):
        wareId = self.key
        ecid = self.data['uuid']
        category_data = extract_category(self)
        pages = 1
        has_next = True
        while has_next:
            number = 0  # duplicate counter, reset for every page
            url = self.get_url(wareId, pages)
            html_stream = ProcessData.get_web_data(url)
            try:
                tree = etree.HTML(html_stream.text)
            except Exception:
                print 'error:', url
                break
            xpath = "//div[@id='comments-list']/div[@class='mc']"
            dom = tree.xpath(xpath)
            if dom == []:
                has_next = False  # no comment blocks: past the last page
                continue
            for item in dom:
                datas = self.handle(item)
                comment_data = {
                    'ecid': ecid,                      # commodity table foreign key
                    'source_id': wareId,
                    'source': self.data.get('source'),
                    'comment_id': datas['commentid'],  # review id
                    'score': datas['score'],           # commodity score
                    'pubtime': datas['commenttime'],
                    'buytime': datas['buytime'],
                    'user_id': datas['url'],
                    'useful': datas['useful'],
                    'reply': datas['reply'],
                    'content': datas['comment'],
                    'province': datas['province']
                }
                comment_data.update(category_data)
                model = EcCommentModel(comment_data)
                is_saved = export(model)
                if not is_saved:
                    number += 1  # export() rejected an already-known record
            if number > 10:
                break  # page was mostly duplicates: stop paging
            pages += 1
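Example #3 stops paging once a page yields more than ten records that export refuses to save again. Distilled into a standalone loop with hypothetical fetch_page/save callables, the pattern looks like this:

    # stop-on-duplicates pagination, distilled; fetch_page and save are hypothetical
    def crawl_until_duplicates(fetch_page, save, max_duplicates=10):
        page = 1
        while True:
            items = fetch_page(page)
            if not items:  # empty page: past the end of the listing
                break
            duplicates = sum(1 for item in items if not save(item))
            if duplicates > max_duplicates:
                break  # page is mostly old data, so later pages will be too
            page += 1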
Example #4
    def crawl(self):
        wareId = self.key
        ids = self.data.get('uuid')
        category_data = extract_category(self)

        url = 'http://m.360buy.com/product/guige/%s.html' % str(wareId)
        html_stream = ProcessData.get_web_data(url)
        tree = etree.HTML(html_stream.text)
        xpath = "//table[@class='Ptable']/tr/td/text()"
        dom = tree.xpath(xpath)
        # non-empty cells alternate: spec name first, then spec value
        specifications = {}
        temporary = ''
        i = 0
        for item in dom:
            item = item.strip()
            if item == '':
                continue
            if i % 2 == 0:
                temporary = extract_title(item)
            else:
                specifications[temporary] = extract_text(item)
            i += 1

        introduce = IntroduceCrawler.crawl(wareId, ids)
        ecbrands = introduce.get(u'品牌') or ''    # brand
        ecnames = introduce.get(u'商品名称') or ''  # commodity name
        crawl_data = {
            'id': ids,
            'source': self.data.get('source'),
            'source_id': wareId,
            'summary': specifications,
            'introduce': introduce,
            'name': ecnames,
            'brand': ecbrands
        }
        crawl_data.update(category_data)
        model = EcDetailModel(crawl_data)
        export(model)
Example #5
    def crawl(self):
        goodsNo = str(self.key)
        ids = self.data.get('uuid')
        category_data = extract_category(self)
        url = self.get_detail_url(goodsNo)
        html = ProcessData.get_web_data(url)
        tree = etree.HTML(html.text)
        # specification table: td.bg holds the name, td.bgv the value
        standard = {}
        for x in tree.xpath("//table[@class='parameter']/tbody/tr"):
            m1 = x.xpath("td[@class='bg']")
            m2 = x.xpath("td[@class='bgv']")
            if len(m1) != 0 and len(m2) != 0:
                standard[m1[0].text] = m2[0].text
        rpack = tree.xpath("//div[@class='wap_tab_con']/div[3]")
        ecparkinglist = rpack[0].text
        rafter = tree.xpath("//div[@class='wap_tab_con']/div[4]")
        ecaftersale = rafter[0].text
        ecbrands = standard.get(u'品牌') or ''  # brand
        # local name avoids shadowing the json module
        json_data = ProcessData.get_json_data(self.get_basic_url(goodsNo))
        skulist = json_data['skuList']
        for sku in skulist:
            ecnowprice = sku['skuPrice']
            ecnmaket = sku['skuPriceDesc']
            ecname = sku['skuName']
            adword = sku['promWords']
            skuid = sku['skuID']
            ecimglist = sku['skuSourceImgUrl']
            source_id = goodsNo + '-' + skuid
            crawl_data = {
                'id': ids,
                'source': self.data.get('source'),
                'source_id': source_id,
                'summary': standard,
                'introduce': {},
                'name': ecname,
                'brand': ecbrands
            }
            crawl_data.update(category_data)
            model = EcDetailModel(crawl_data)
            export(model)
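Examples #5 and #8 also call ProcessData.get_json_data; judging from how its result is indexed (skuList, isBbc, onSale), it presumably fetches a URL and returns the decoded JSON body. A sketch under that assumption:

    # hypothetical stand-in for ProcessData.get_json_data
    import requests

    def get_json_data(url):
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.json()  # plain dict, e.g. data['skuList']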
Example #6
    def crawl(self):
        wareId = str(self.key)
        url = "http://item.yhd.com/item/%s" % wareId
        html_stream = ProcessData.get_web_data(url)
        tree = etree.HTML(html_stream.text)
        crawl_data = self.crawler_data(tree)
        product_id = self.parse_productId(tree)
        model = EcDetailModel(crawl_data)
        export(model)

        comment_data = {
            'uuid': model['id'],
            'status': crawl_data['status'],
            # brand/series/version were undefined names in the original snippet;
            # presumably they come from the parsed detail data
            'brand': crawl_data.get('brand'),
            'series': crawl_data.get('series'),
            'version': crawl_data.get('version'),
            'is_Bbc': crawl_data['comment']['is_Bbc'],
            'priorcategory': self.data['priorcategory'],
            'source_id': wareId,
        }
        Scheduler.schedule(CommentCrawler.type, key=product_id, data=comment_data)
Example #7
    def crawl(wareId, ids):
        # Python 2 hack to keep implicit str/unicode conversions from raising
        import sys
        reload(sys)
        sys.setdefaultencoding("utf-8")

        url = 'http://item.jd.com/%s.html' % str(wareId)
        html_stream = ProcessData.get_web_data(url)
        if html_stream == {}:
            return {}
        html_stream.encoding = 'gb2312'
        tree = etree.HTML(html_stream.text)
        xpath = "//div[@id='product-detail-1']/ul[@class='detail-list']/li//text()"
        dom = tree.xpath(xpath)
        introduce = {}
        temporary = ''
        for item in dom:
            item = item.strip()
            if item == '':
                continue
            elif item.find(':') > 0:
                # "name:value" in one text node: split once and store the pair
                item = item.split(':', 1)
                if item[1] == '':
                    temporary = extract_title(item[0])  # value arrives in a later node
                else:
                    introduce[extract_title(item[0])] = extract_text(item[1])
            else:
                # bare value node: attach it to the last pending name
                if temporary != '':
                    introduce[temporary] = extract_text(item)
                    temporary = ''

        # a dict never equals '', so the original '' fallback was unreachable;
        # always return the dict so callers can safely use .get()
        return introduce
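The loop in Example #7 is a small state machine: a "name:" node with an empty value parks the name in temporary until the next bare text node supplies the value. A toy run with plain strings in place of the extract_title/extract_text helpers (whose definitions are not shown) demonstrates the pairing:

    # toy demonstration of the name/value pairing used above;
    # extract_title/extract_text are replaced by the raw strings
    def pair_detail_nodes(nodes):
        introduce, temporary = {}, ''
        for item in (n.strip() for n in nodes):
            if not item:
                continue
            if item.find(':') > 0:
                name, value = item.split(':', 1)
                if value == '':
                    temporary = name  # value arrives in a later node
                else:
                    introduce[name] = value
            elif temporary:
                introduce[temporary] = item
                temporary = ''
        return introduce

    print(pair_detail_nodes([u'品牌:', u'LG', u'型号:GR-M2378NUY']))
    # -> {u'品牌': u'LG', u'型号': u'GR-M2378NUY'} (key order may vary)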
Example #8
    def crawl(self):
        goodsNo = str(self.key)
        category_data = extract_category(self)
        url = self.get_detail_url(goodsNo)
        html = ProcessData.get_web_data(url)
        tree = etree.HTML(html.text)
        xpath = {
            "introduce": "//div[@class='guigecanshu']/text()",
            "summary": "//ul[@id='prd_data']/li[2]/ul/li/span/text()",
        }

        summary = self.parse_summary(tree, xpath["summary"])
        introduce = self.parse_intr(tree, xpath["introduce"])

        version = get_version(summary, introduce)
        series = get_series(summary, introduce)
        brand = get_brand(summary, introduce)

        # local name avoids shadowing the json module
        json_data = ProcessData.get_json_data(self.get_basic_url(goodsNo))
        isBbc = "Y" if json_data["isBbc"] in ("Y", "y") else "N"
        status = 0 if json_data["onSale"] in ("N", "n") else 1

        # note: only the last SKU's name and image list survive this loop
        for sku in json_data['skuList']:
            ecname = sku['skuName']
            ecimglist = sku['skuSourceImgUrl']

        detail_data = {
            'source': self.data.get('source'),
            'source_id': goodsNo,
            'summary': summary,
            'introduce': introduce,
            'name': ecname,
            'images': ecimglist,
            'status': status,
            'brand': brand,
            'version': version,
            'series': series,
            'comment': {
                'is_Bbc': isBbc,
                'skuID': self.data['skuID'],
            },
        }
        detail_data.update(category_data)
        detail_data.update(get_ctime())
        model = EcDetailModel(detail_data)
        export(model)
        comment_data = {
            'uuid': model["id"],
            'brand': brand,
            'version': version,
            'series': series,
            'is_Bbc': isBbc,
            'status': status,
            'priorcategory': self.data['priorcategory'],
            'skuID': self.data['skuID'],
        }
        Scheduler.schedule(CommentCrawler.type, key=goodsNo, data=comment_data)
Example #9
    def get_response(self, key, page):
        url = "http://club.yhd.com/review/%s-%s.html" % (key, page)
        response = ProcessData.get_web_data(url)
        response.encoding = "utf-8"
        return response
Example #10
    def get_response(self, key):
        url = self.get_url(key)
        return ProcessData.get_web_data(url)