Example #1
    def crawl(self):
        json_data = ProcessData.get_json_data(self.get_json_url(self.key))
        is_Bbc = self.get_is_Bbc(json_data)
        status = self.get_status(json_data)
        response = self.get_response(self.key)
        tree = etree.HTML(response.text)
        info = self.get_info(tree)
        crawl_data = {
            "source": self.data["source"],
            "source_id": self.key,
            "status": status,
            "comment": {
                "is_Bbc": is_Bbc,
            },
        }
        crawl_data.update(info)
        crawl_data.update(extract_category(self))
        crawl_data.update(get_ctime())
        model = EcDetailModel(crawl_data)
        export(model)

        comment_data = {
            "uuid": model["id"],
            "status": model["status"],
            "version": model["version"],
            "series": model["series"],
            "brand": model["brand"],
            "is_Bbc": model["comment"]["is_Bbc"],
        }
        Scheduler.schedule(CommentCrawler.type,
                           key=self.key,
                           data=comment_data)
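Note: every example in this listing merges the output of two shared helpers, extract_category() and get_ctime(), into its record before exporting. Their implementations are not shown here; a minimal sketch, assuming each returns a plain dict that is merged into crawl_data (the key names below are placeholders, not the framework's actual fields):

    import time

    def extract_category(crawler):
        # Assumed helper: copy the category fields carried on the scheduled task.
        return {
            "priorcategory": crawler.data.get("priorcategory", []),
            "presentcategory": crawler.data.get("presentcategory", []),
        }

    def get_ctime():
        # Assumed helper: stamp the record with the crawl time (epoch seconds).
        return {"ctime": int(time.time())}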
Example #2
 def crawl(self):
     category_data = extract_category(self)
     page = 1
     page_count = 1
     while page <= page_count:
         json_data = self.get_response(self.key, page)
         if page == 1: page_count = self.get_page_count(json_data)
         for item in json_data["ProductReviewList"]:
             review = item["ReviewDetail"]
             info = self.get_info(review)
             crawl_data = {
                 "eid": self.data["uuid"],
                 "brand": self.data["brand"],
                 "version": self.data["version"],
                 "series": self.data["series"],
                 "source": self.data["source"],
                 "source_id": self.key,
                 "status": self.data["status"],
                 "comment": {
                     "is_Bbc": self.data["is_Bbc"],
                 },
             }
             crawl_data.update(info)
             crawl_data.update(category_data)
             crawl_data.update(get_ctime())
             model = EcCommentModel(crawl_data)
             export(model)
         page += 1
Example #3
    def crawl(self):
        global COOKIE
        category_data = extract_category(self)
        response = self.get_response(self.key)
        if COOKIE != response.headers.get("set-cookie", ""):
            COOKIE = response.headers.get("set-cookie", "")
        tree = etree.HTML(response.text)
        info = self.get_info(tree)

        crawl_data = {
            'source': "amazon",
            'source_id': self.key,
            'status': 1,
        }

        crawl_data.update(info)
        crawl_data.update(category_data)
        crawl_data.update(get_ctime())
        model = EcDetailModel(crawl_data)
        export(model)
        comment_data = {
            "uuid": model["id"],
            "brand": model["brand"],
            "version": model["version"],
            "series": model["series"],
            "is_Bbc": model["comment"]["is_Bbc"],
            'status': model["status"],
        }
        Scheduler.schedule(CommentCrawler.type,
                           key=self.key,
                           data=comment_data)
Example #4
    def crawl(self):
        global COOKIE
        category_data = extract_category(self)
        page = 1  # start from page 1
        pageSize = 5
        while page <= pageSize:
            newurl = self.get_url(self.key, page)
            html_stream = ProcessData.get_web_data(newurl)
            if COOKIE != html_stream.headers.get("set-cookie", ""):
                COOKIE = html_stream.headers.get("set-cookie", "")
            html = etree.HTML(html_stream.content)
            if page == 1:
                pageSize = self.get_PageSize(html)
            items = html.xpath(self.xpath["item"])
            for item in items:
                info = self.get_info(item)
                crawl_data = {
                    "eid": self.data["uuid"],
                    "brand": self.data["brand"],
                    "version": self.data["version"],
                    "series": self.data["series"],
                    "source": self.data["source"],
                    "status": self.data["status"],
                    "source_id": self.key,
                    "comment": {
                        "is_Bbc": self.data["is_Bbc"],
                    }
                }
                crawl_data.update(info)
                crawl_data.update(category_data)
                crawl_data.update(get_ctime())

                model = EcCommentModel(crawl_data)
                export(model)
            page += 1
Example #5
    def crawler_data(self,tree):
        category_data = extract_category(self)      
        info = self.get_info(tree)
        summary = info["summary"]
        introduce = info["introduce"]
        images = info["images"]
        images = self.convert_img(images)
        brand = self.get_brand(summary, introduce, tree)
        version = get_version(summary, introduce)
        series = get_series(summary, introduce)      

        crawl_data = {
            'source': self.data.get('source'),
            'source_id': str(self.key),
            'name': info['name'],
            'images': images,
            'intro_img': info['intro_img'],
            'summary': summary,
            'introduce': introduce,
            'status': info['status'],
            'version': version,
            'brand': brand,
            'series': series,
            'comment': {
                'is_Bbc': info['is_Bbc'],
            },
        }
        crawl_data.update(category_data)
        crawl_data.update(get_ctime())
        return crawl_data
Example #6
    def crawl(self):
        key = self.key
        category_data = extract_category(self)
        count = 3
        page = 1
        while page <= count:
            response = self.get_response(key, page)
            tree = etree.HTML(response.text)
            if page == 1:
                count = self.get_count(tree)
            items = tree.xpath(self.xpath["item"])
            for item in items:
                info = self.get_info(item)
                crawl_data = {
                    'eid': self.data['uuid'],
                    'source_id': self.data['source_id'],
                    'brand': self.data['brand'],
                    'series': self.data['series'],
                    'version': self.data['version'],
                    'source': self.data['source'],
                    'status': self.data["status"],
                    'comment': {
                        'is_Bbc': self.data['is_Bbc'],
                    }
                }
                crawl_data.update(info)
                crawl_data.update(category_data)
                crawl_data.update(get_ctime())
                model = EcCommentModel(crawl_data)
                export(model)

            page += 1
Example #7
    def crawl(self):
        # fid = '1662'
        # priorcategory = ["家居家装","清洁用品","衣物清洁"]
        # presentcategory = ['1','2','3']
        fid = self.key
        category_data = extract_category(self)

        count = 3  # initial page count; refined after the first page is fetched
        pages = 1  # start from page 1

        while pages <= count:
            url = self.get_url(fid,pages)
            try:
                jsons = ProcessData.get_json_data(url)
                if pages == 1: count = int(math.ceil(int(jsons['wareCount']) / 100.0))
                lists = jsons['wareInfo']
            except Exception,e:
                self.logger.error(url)
                self.logger.error(e)
                print 'error ',url
                return
            if lists == []:
                return {}
            for i in range(len(lists)):
                ids = uuid.uuid1()  # cassandra primary key
                wareId = lists[i]['wareId']

                try:
                    f = lambda x: int(x[:-1]) / 100.00
                    ecsumscores = float(f(lists[i]['good']))  # overall product score, e.g. '96%' -> 0.96
                except:
                    ecsumscores = 0

                crawl_data = {
                    # 'id': uuid.uuid1(),
                    'source_id': wareId,
                    'source': self.data.get('source'),
                    'summary': {},
                    'title': lists[i]['wname'],
                    'adword': lists[i]['adword'],
                    'price': float(lists[i]['jdPrice']),
                    'original_price': float(lists[i]['martPrice']),
                    'score': ecsumscores
                }
                crawl_data.update(category_data)
                data = {
                    # 'uuid': ids,
                    'priorcategory': self.data['priorcategory'],
                    'presentcategory': self.data['priorcategory']
#                    'presentcategory': self.data['presentcategory']
                }

                model = EcBasicModel(crawl_data)
                export(model)
                data["uuid"] = model["id"]
                Scheduler.schedule(DetailCrawler.type, key=wareId, data=data)
                Scheduler.schedule(CommentCrawler.type, key=wareId, data=data)


            pages += 1
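The JD list crawler above folds the percentage string in lists[i]['good'] (for example '96%') into a 0-1 score with an inline lambda. A hedged sketch of the same conversion as a standalone helper, with the fallback to 0 mirroring the try/except in the example (the function name is an assumption, not part of the original code):

    def parse_good_percent(good):
        # Convert a percentage string such as '96%' into a 0-1 float score.
        # Returns 0 on malformed input, matching the try/except in the example.
        try:
            return int(good[:-1]) / 100.00
        except (ValueError, TypeError):
            return 0

    # parse_good_percent('96%') -> 0.96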
Example #8
    def crawler_data(self,tree):
        category_data = extract_category(self)

        XPATH = self.search_list_xpath
        if len(tree.xpath(XPATH('list'))) == 0:
            XPATH = self.product_list_xpath
        dom = tree.xpath(XPATH('list'))
        for item in dom:
            crawl_data = {}
            craw = [
                'title','adword',
                'price','original_price',
                'source_id','score',
            ]

            for value in craw: 
                crawl_data[value] = self.mackining(item.xpath(XPATH(value)))
            crawl_data['price'] = float(crawl_data['price'])
            try:
                f = lambda x: int(x[:-1])/100.00
                crawl_data['score'] = float(f(crawl_data['score']))
            except:
                crawl_data['score'] = 0
            crawl_data.update(category_data)
            crawl_data['source'] = 'yhd'
            model = EcBasicModel(crawl_data)
            export(model)
            data = {
                'priorcategory': self.data['priorcategory'],
                'presentcategory': self.data['priorcategory']           
            }            
            data["uuid"] = model["id"]
            Scheduler.schedule(DetailCrawler.type, key=str(self.key), data=data)
Example #9
    def crawl(self):
        # wareId = '1229271'
        # priorcategory = ["家居家装","清洁用品","衣物清洁"]
        # presentcategory = ['1','2','3']
        # ecid = '124'
        wareId = self.key
        ecid =  self.data['uuid']
        category_data = extract_category(self)
        pages = 1
        count = True
        while count: 
            number = 0  # counts comments that have already been saved (dedupe)
            url = self.get_url(wareId,pages)
            html_stream = ProcessData.get_web_data(url)
            try:
                tree = etree.HTML(html_stream.text)
            except:
                print 'error: ', url
                break
            xpath = "//div[@id='comments-list']/div[@class='mc']"
            dom = tree.xpath(xpath)
            if dom == []:
                count = False
                continue
            for item in dom:
                datas = self.handle(item)
                comment_data={
                    # 'uuid': uuid.uuid1(),         #primary key
                    'ecid': ecid,        #commodity table foreign key
                    'source_id': wareId,
                    'source': self.data.get('source'),
                    'comment_id': datas['commentid'],  #review id
                    'score': datas['score'],         #commodity score
                    'pubtime': datas['commenttime'],
                    'buytime': datas['buytime'],
                    'user_id': datas['url'],
                    # 'usernickName': groups[i]['usernickName'],
                    'useful': datas['useful'],
                    'reply': datas['reply'],
                    'content': datas['comment'],
                    'province': datas['province']

                }
                comment_data.update(category_data)
                model = EcCommentModel(comment_data)
                is_saved = export(model)
                if not is_saved:
                    number += 1
            if number > 10:
                break
            pages += 1
Example #10
    def crawl(self):
        catId = str(self.key)

        category_data = extract_category(self)
        totalpage = self.get_page(catId)
        if totalpage == 0:
            return {}
        for i in range(1, totalpage + 1):
            url = self.get_url(catId, i)
            jsons = ProcessData.get_json_data(url)
            try:
                goodsList = jsons['goodsList']
            except Exception, e:
                self.logger.error(url)
                self.logger.error(e)
                print "get goodsList fail"
                continue

            for j in range(len(goodsList)):
                goods = goodsList[j]
                goodsNo = goods['goodsNo']
                goodsName = goods['goodsName']
                skuID = goods['skuID']

                goods_find = self.has_goods(goodsNo)
                if not goods_find:
                    data = {
                        'priorcategory': self.data['priorcategory'],
                        'skuID': skuID,
                    }
                    Scheduler.schedule(DetailCrawler.type,
                                       key=goodsNo,
                                       data=data)
                    continue
                adword = self.extract_adword(goods['ad'])
                crawl_data = {
                    'id': goods_find['uuid'],
                    'source_id': goodsNo,
                    'source': self.data.get('source'),
                    'title': goods['goodsName'],
                    'adword': adword,
                    'status': goods_find['status'],
                    'price': float(goods['lowestSalePrice']),
                    'brand': goods_find['brand'],
                    'version': goods_find['version'],
                    'series': goods_find['series'],
                    'comment': {
                        'is_Bbc': goods_find['isBbc'],
                        'skuId': goods_find['skuID'],
                    },
                }
                crawl_data.update(category_data)
                crawl_data.update(get_ctime())
                model = EcBasicModel(crawl_data)
                export(model)
Example #11
    def crawl(self):
        global COOKIE
        keyid = self.key
        category_data = extract_category(self)
        priorcategory = self.data["priorcategory"]
        count = 3
        page = 1  # start from page 1
        while page <= count:
            url = self.get_url(keyid, page)
            html_stream = ProcessData.get_web_data(url)
            if COOKIE != html_stream.headers.get("set-cookie", ""):
                COOKIE = html_stream.headers.get("set-cookie", "")
            html = etree.HTML(html_stream.content)
            if page == 1:
                count = self.getPageSize(html)
            items = html.xpath(self.xpath["item"])
            if not len(items):
                if html.xpath("//input[@id='captchacharacters']"):
                    time.sleep(random.randint(1, 3))
                    continue
                else:
                    self.remove_task(keyid)

            for item in items:
                source_id = self.get_source_id(item)
                task_data = self.has_goods(source_id)
                if not task_data:
                    data = {
                        'priorcategory': priorcategory,
                    }
                    Scheduler.schedule(DetailCrawler.type,
                                       key=source_id,
                                       data=data)
                else:
                    info = self.get_info(item)
                    crawl_data = {
                        'id': task_data["uuid"],
                        'source_id': source_id,
                        'source': "amazon",
                        'brand': task_data["brand"],
                        'version': task_data["version"],
                        'series': task_data["series"],
                        'status': task_data["status"],
                        "comment": {
                            "is_Bbc": task_data["is_Bbc"],
                        }
                    }
                    crawl_data.update(info)
                    crawl_data.update(category_data)
                    crawl_data.update(get_ctime())
                    model = EcBasicModel(crawl_data)
                    export(model)
            page += 1
Example #12
    def crawl(self):
        # wareId = '1229271'
        # wareId = '1391817787'
        # priorcategory = ["家居家装","清洁用品","衣物清洁"]
        # presentcategory = ['1','2','3']
        # ids = uuid.uuid1()


        wareId = self.key
        ids =  self.data.get('uuid')
        category_data = extract_category(self)

        url = 'http://m.360buy.com/product/guige/%s.html'%(str(wareId))
        html_stream = ProcessData.get_web_data(url)
        tree = etree.HTML(html_stream.text)
        xpath = "//table[@class='Ptable']/tr/td/text()"
        dom = tree.xpath(xpath)
        specifications = {}
        temporary = ''
        i = 0
        for item in dom:
            item = item.strip()
            if item == '':
                continue
            if i % 2 == 0:
                specifications[item] = ''
                temporary = extract_title(item)
            else:
                specifications[temporary] = extract_text(item)

            i += 1

        data = {
            'ecnorms':specifications
        }
        # specifications = json.dumps(specifications, ensure_ascii=False)
        introduce = IntroduceCrawler.crawl(wareId,ids)
        ecbrands = introduce[u'品牌'] if introduce.get(u'品牌') else ''
        ecnames = introduce[u'商品名称'] if introduce.get(u'商品名称') else ''
        crawl_data = {
            'id': ids,
            'source': self.data.get('source'),
            'source_id': wareId,
            'summary': specifications,
            'introduce': introduce,
            'name': ecnames,
            'brand': ecbrands
        }
        crawl_data.update(category_data)
        model = EcDetailModel(crawl_data)
        export(model)
Example #13
 def crawl(self):
     skulist = []
     goodsNo = str(self.key)
     ids = self.data.get('uuid')
     category_data = extract_category(self)
     url = self.get_detail_url(goodsNo)
     html = ProcessData.get_web_data(url)
     tree = etree.HTML(html.text)
     r = tree.xpath(
         "//div[@class='wap_tab_con']/div[2]/table[@class='parameter']/tbody/tr"
     )
     i = len(r)
     standard = {}
     r1 = tree.xpath("//table[@class='parameter']/tbody/tr")
     for x in r1:
         m1 = x.xpath("td[@class='bg']")
         m2 = x.xpath("td[@class='bgv']")
         if len(m1) != 0 and len(m2) != 0:
             standard[m1[0].text] = m2[0].text
     rpack = tree.xpath("//div[@class='wap_tab_con']/div[3]")
     ecparkinglist = rpack[0].text
     rafter = tree.xpath("//div[@class='wap_tab_con']/div[4]")
     ecaftersale = rafter[0].text
     ecbrands = standard[u'品牌'] if standard.get(u'品牌') else ''
     json = ProcessData.get_json_data(self.get_basic_url(goodsNo))
     skulist = json['skuList']
     for sku in skulist:
         ecnowprice = sku['skuPrice']
         ecnmaket = sku['skuPriceDesc']
         ecname = sku['skuName']
         adword = sku['promWords']
         skuid = sku['skuID']
         ecimglist = sku['skuSourceImgUrl']
         source_id = goodsNo + '-' + skuid
         crawl_data = {
             'id': ids,
             'source': self.data.get('source'),
             'source_id': source_id,
             'summary': standard,
             'introduce': {},
             'name': ecname,
             'brand': ecbrands
         }
         crawl_data.update(category_data)
         model = EcDetailModel(crawl_data)
         export(model)
Example #14
    def crawl(self): 
        key = str(self.key)

        category_data = extract_category(self)
        page = 1  # start from page 1
        while True:
            items = self.get_init_list(key, page)
            if not items: 
                break
            self.save_list(items, category_data=category_data)

            more_items = self.get_more_list(key, page)
            self.save_list(more_items, category_data=category_data)

            page += 1
Example #15
    def crawlHtml(self, html):

        ids = self.data['uuid']
        source = "amazon"
        source_id = self.key
        category_data = extract_category(self)
        summary = {}
        ecbrands = ""
        ecnames = ""
        introduce = {}
        # parse the productDetailsTable rows
        prodDetails = html.xpath(
            "//table[@id='productDetailsTable']//tr/td[@class='bucket']/div[@class='content']/ul/li"
        )

        for proditem in prodDetails:

            k = proditem.xpath("b/text()")[0].strip()[:-1]

            if k == "用户评分":  # "customer rating"
                summary[k] = proditem.xpath(
                    "span[@class='crAvgStars']/span/a/span/span/text()"
                )[0].strip()[2:-1]
            elif k == "亚马逊热销商品排名":  # skip the "Amazon best sellers rank" row
                pass
            else:
                summary[k] = proditem.xpath("text()")[0].strip()

        crawl_data = {
            'id': ids,
            'source': source,
            'source_id': source_id,
            'summary': summary,
            'introduce': introduce,
            'name': ecnames,
            'brand': ecbrands
        }
        crawl_data.update(category_data)
        # print crawl_data
        model = EcDetailModel(crawl_data)
        export(model)
Example #16
 def crawl(self):
     catId = str(self.key)
     category_data = extract_category(self)
     totalpage = self.get_page(catId)
     if totalpage == 0:
         return {}
     for i in range(1, totalpage + 1):
         url = self.get_url(catId, i)
         jsons = ProcessData.get_json_data(url)
         try:
             goodsList = jsons['goodsList']
         except Exception, e:
             self.logger.error(url)
             self.logger.error(e)
             print "get goodsList fail"
             continue
         for j in range(len(goodsList)):
             goods = goodsList[j]
             goodsName = goods['goodsName']
             goodsNo = goods['goodsNo']
             skuID = goods['skuID']
             # print goodsNo
             # print skuID
             crawl_data = {
                 # 'id': uuid.uuid1(),
                 'source_id': goodsNo,
                 'source': self.data.get('source'),
                 'title': goods['goodsName'],
                 'adword': goods['ad'],
                 'price': float(goods['lowestSalePrice']),
                 'original_price': float(goods['highestSalePrice']),
                 #'score': ecsumscores
             }
             crawl_data.update(category_data)
             model = EcBasicModel(crawl_data)
             export(model)
             data = {
                 'priorcategory': self.data['priorcategory'],
                 'presentcategory': self.data['priorcategory']
             }
             data["uuid"] = model["id"]
             Scheduler.schedule(DetailCrawler.type, key=goodsNo, data=data)
             Scheduler.schedule(CommentCrawler.type, key=goodsNo, data=data)
Example #17
    def crawler_data(self,tree):
        ids =  self.data.get('uuid')
        category_data = extract_category(self)
        introduce = tree.xpath(self.ware_xpath('introduce'))
        specifications = tree.xpath(self.ware_xpath('specifications'))
        introd = {}
        ecnorms = {}
        for item in introduce:
            item = item.strip()
            if item == '': continue
            item = item.split(u':',1)
            try:
                introd[item[0]] = item[1]
            except:
                pass
        for item in specifications:
            label = item.xpath(self.ware_xpath('label'))
            names = []
            values = []
            for i in label:
                i = i.strip()
                if i.strip() == '':  continue
                names.append(i)
            dd = item.xpath(self.ware_xpath('item'))
            for i in dd:
                i = i.strip()
                if i.strip() == '':  continue        
                values.append(i)
            ecnorms.update(zip(names, values))  # pair each label with its value

        crawl_data = {
            'id': ids,
            'source': self.data.get('source'),
            'source_id': str(self.key),
            'summary': ecnorms,
            'introduce': introd,
            'version': ecnorms.get(u'型号',''),
            'brand': ecnorms.get(u'商品品牌','')
        }
        crawl_data.update(category_data)
        model = EcDetailModel(crawl_data)
        export(model)
Example #18
    def crawlHtml(self, html):

        ids = self.data['uuid']
        source = "amazon"
        source_id = self.key
        category_data = extract_category(self)
        summary = {}
        ecbrands = ""
        ecnames = ""
        introduce = {}
        # parse the productDetailsTable rows
        prodDetails = html.xpath(
            "//table[@id='productDetailsTable']//tr/td[@class='bucket']/div[@class='content']/ul/li")

        for proditem in prodDetails:

            k = proditem.xpath("b/text()")[0].strip()[:-1]

            if k == "用户评分":  # "customer rating"
                summary[k] = proditem.xpath(
                    "span[@class='crAvgStars']/span/a/span/span/text()")[0].strip()[2:-1]
            elif k == "亚马逊热销商品排名":  # skip the "Amazon best sellers rank" row
                pass
            else:
                summary[k] = proditem.xpath("text()")[0].strip()

        crawl_data = {
            'id': ids,
            'source': source,
            'source_id': source_id,
            'summary': summary,
            'introduce': introduce,
            'name': ecnames,
            'brand': ecbrands
        }
        crawl_data.update(category_data)
        # print crawl_data
        model = EcDetailModel(crawl_data)
        export(model)
Example #19
    def crawl(self):
        ecid = self.data['uuid']
        goodsNo = str(self.key)
        category_data = extract_category(self)
        totalpage = int(self.get_page(goodsNo))
        if totalpage == 0:
            return
        for i in range(totalpage + 1):
            url = self.get_url(goodsNo, i)
            json = ProcessData.get_json_data(url)
            appraise = json['appraiseArray']

            for item in appraise:
                commentid = item['id']
                summary = item['summary']
                score = item['appraiseGrade']
                userorderid = item['appraiseName']
                commenttime = ProcessData.str_datetime(item['appraiseTime'])
                comment_data = {
                    'eid': ecid,  #commodity table foreign key
                    'source_id': goodsNo,
                    'source': self.data.get('source'),
                    'comment_id': item['id'],  #review id
                    'score': item['appraiseGrade'],  #commodity score
                    'pubtime': ProcessData.str_datetime(item['appraiseTime']),
                    'user_name': item['appraiseName'],
                    'content': item['summary'],
                    'brand': self.data['brand'],
                    'version': self.data['version'],
                    'series': self.data['series'],
                    'comment': {
                        'is_Bbc': self.data['is_Bbc'],
                        'skuID': self.data['skuID'],
                    }
                }
                comment_data.update(category_data)
                comment_data.update(get_ctime())
                model = EcCommentModel(comment_data)
                export(model)
Example #20
 def crawl(self):
     ecid = self.data['uuid']
     goodsNo = str(self.key)
     category_data = extract_category(self)
     totalpage = int(self.get_page(goodsNo))
     if totalpage == 0:
         print "get_page fail"
         return {}
     for i in range(totalpage):
         url = self.get_url(goodsNo, i)
         json = ProcessData.get_json_data(url)
         try:
             appraise = json['appraiseArray']
         except Exception, e:
             self.logger.error(url)
             self.logger.error(e)
             print "get appraise fail"
             continue
         for item in appraise:
             commentid = item['id']
             summary = item['summary']
             score = item['appraiseGrade']
             userorderid = item['appraiseName']
             commenttime = ProcessData.str_datetime(item['appraiseTime'])
             # print commentid
             # print summary.encode('utf-8')
             comment_data = {
                 'ecid': ecid,  #commodity table foreign key
                 'source_id': goodsNo,
                 'source': self.data.get('source'),
                 'comment_id': item['id'],  #review id
                 'score': item['appraiseGrade'],  #commodity score
                 'pubtime': ProcessData.str_datetime(item['appraiseTime']),
                 'user_id': item['appraiseName'],
                 'content': item['summary']
             }
             comment_data.update(category_data)
             model = EcCommentModel(comment_data)
             export(model)
Example #21
 def crawl(self):
     CatID = self.key
     category_data = extract_category(self)
     page = 1
     page_count = 1
     while page <= page_count:
         jsons = self.get_response(CatID, page)
         if page == 1: page_count = self.get_page_count(jsons)
         for goods in jsons['ProductListItems']:
             source_id = goods["Code"]
             task_data = self.has_goods(source_id)
             if task_data:
                 crawl_data = {
                     "id": task_data["uuid"],
                     "title": goods["Title"],
                     "price": goods["Price"]["CurrentPrice"],
                     "source_id": source_id,
                     "source": self.data["source"],
                     "status": task_data["status"],
                     "brand": task_data["brand"],
                     "version": task_data["version"],
                     "series": task_data["series"],
                     "comment": {
                         "is_Bbc": task_data["isBbc"],
                     },
                 }
                 crawl_data.update(category_data)
                 crawl_data.update(get_ctime())
                 model = EcBasicModel(crawl_data)
                 export(model)
             else:
                 detail_data = {
                     "priorcategory": self.data["priorcategory"],
                 }
                 Scheduler.schedule(DetailCrawler.type,
                                    key=source_id,
                                    data=detail_data)
         page += 1
Example #22
 def crawl(self):
     category_data = extract_category(self)
     page_size = self.get_page_size(self.key)
     page = 1
     while page <= page_size:
         json_data = ProcessData.get_json_data(self.get_url(self.key, page))
         reviews = json_data.get("commodityReviews", [])
         if not reviews:
             return
         for review in reviews:
             crawl_data = {
                 "comment_id": self.get_comment_id(review),
                 "content": review["content"],
                 "tags": self.get_tags(review),
                 "show_pic": self.get_show_pic(review),
                 "pubtime": self.get_pubtime(review),
                 "score": float(review["qualityStar"]),
                 "useful": int(review["usefulCnt"]),
                 "reply": 1 if review.get("replyInfo", {}) else 0,
                 "user_name": review.get("userInfo",
                                         {}).get("nickName", ""),
                 "eid": self.data["uuid"],
                 "brand": self.data["brand"],
                 "version": self.data["version"],
                 "series": self.data["series"],
                 "source": self.data["source"],
                 "source_id": self.key,
                 "status": self.data["status"],
                 "comment": {
                     "is_Bbc": self.data["is_Bbc"],
                 },
             }
             crawl_data.update(category_data)
             crawl_data.update(get_ctime())
             model = EcCommentModel(crawl_data)
             export(model)
         page += 1
Example #23
    def crawl(self):

        # record id
        ids = self.data['uuid']
        # ids="1dcfa11e-7acf-11e4-b0cc-00e06668ddd1"
        # source_id=""
        # product page url
        url = self.key

        print "url:" + url

        source = "amazon"

        category_data = extract_category(self)

        # fetch the page for this url
        html_stream = ProcessData.get_web_data(url)

        # parse the product page html
        html = etree.HTML(html_stream.text)

        # locate the product detail block
        prodDetails = html.xpath("//div[@id='prodDetails']")

        if len(prodDetails) == 0:
            # fall back to the generic template that only carries the basic fields
            detailed = getDetailedGoods(
                type=self.type,
                key=self.key,
                data=self.data
            ).crawlHtml(html)
        else:
            # product style node
            style = prodDetails[0].xpath("div[@class='disclaim']/strong")

            # extract the detailed product attributes
            goodinfo = prodDetails[0].xpath(
                "div[@class='wrapper CNlocale']//table/tbody/tr")

            # product fields
            summary = {}
            ecbrands = ""
            ecnames = ""
            introduce = {}

            for info in goodinfo:
                if info.xpath("td[@class='label']") != []:
                    if info.xpath("td[@class='label']")[0].text == "用户评分":
                        summary[info.xpath("td[@class='label']")[0].text] = info.xpath("td[@class='value']")[
                            0].xpath("//div[@id='averageCustomerReviewRating']")[0].text.strip()[2:-1]
                    elif info.xpath("td[@class='label']")[0].text.strip() == "品牌":
                        ecbrands = info.xpath(
                            "td[@class='value']")[0].text.strip()
                    else:
                        summary[info.xpath("td[@class='label']")[0].text] = info.xpath(
                            "td[@class='value']")[0].text.strip()

            # store the record into cassandra
            crawl_data = {
                'id': ids,
                'source': source,
                'source_id': url,
                'summary': summary,
                'introduce': introduce,
                'name': ecnames,
                'brand': ecbrands
            }

            crawl_data.update(category_data)
            # print crawl_data
            model = EcDetailModel(crawl_data)
            export(model)
Example #24
    def crawl(self):
        # the crawl key (category node)
        # keyid="/%E7%AC%94%E8%AE%B0%E6%9C%AC%E7%94%B5%E8%84%91/b?ie=UTF8&node=106200071"
        keyid = self.key
        source = "amazon"
        score = 0  # rating
        # original category info
        category_data = extract_category(self)
        # priorcategory
        priorcategory = self.data["priorcategory"]
        presentcategory = self.data["presentcategory"]

        count = getPageSize(self.get_url(keyid, 1))  # total number of result pages
        page = 1  # start from page 1

        content = "//div[@id='mainResults']/div"

        while page <= count:
            # build the url for this page
            url = self.get_url(keyid, page)

            # fetch the page
            html_stream = ProcessData.get_web_data(url)

            # parse the result list html
            html = etree.HTML(html_stream.text)

            # each entry is one product block in the result list
            itempath = html.xpath(content)

            if itempath:
                for item in itempath:
                    title = item.xpath("h3[@class='newaps']/a")
                    # price node
                    pric = item.xpath(
                        "ul[@class='rsltGridList grey']/li[@class='newp']/div")
                    if not pric:
                        pric = item.xpath("ul/li[@class='newp']/div")

                    # product rating
                    socreitmem = item.xpath(
                        "ul[@class='rsltGridList grey']/li[@class='rvw']/span/span/a")

                    if socreitmem != []:
                        scoreinfo = socreitmem[0].get('alt')
                        if scoreinfo != None:
                            score = float(scoreinfo[2:-1])

                    for t in title:
                        # product title and link
                        original_price = u"¥0.00"

                        if not pric:
                            price = u"¥0.00"
                        else:
                            try:
                                price = pric[0].xpath("a/span")[0].text
                            except:
                                print url
                                print "bad price node"
                                price = u"¥0.00"

                        if pric and pric[0].xpath("a/del") != []:
                            # the item shows an original (pre-discount) price
                            original_price = pric[0].xpath("a/del")[0].text
                        else:
                            # no original price shown, fall back to the current price
                            original_price = price

                        # task data handed to the follow-up crawlers
                        data = {
                            'priorcategory': priorcategory,
                            'presentcategory': presentcategory
                        }

                        if price != None and price.strip() != '' and pric != [] and pric[0] != '':

                            # validate that the price string parses before storing
                            try:
                                float(price.strip()[1:].replace(",", ""))
                            except:
                                self.logger.error("bad price: " + price)
                                self.logger.error("bad original price: " + original_price)

                            crawl_data = {
                                # 'id': uuid.uuid1(),
                                'source_id': t.get("href"),
                                'source': source,
                                'summary': {},
                                'title': t.xpath("span")[0].text,
                                'adword': '',
                                'price': float(price.strip()[1:].replace(",", "")),
                                'original_price': float(original_price.strip()[1:].replace(",", "")),
                                'score': 0
                            }

                            crawl_data.update(category_data)
                            # persist the listing record to cassandra
                            model = EcBasicModel(crawl_data)
                            export(model)
                            data["uuid"] = model["id"]

                            Scheduler.schedule(
                                DetailCrawler.type, key=t.get("href"), data=data)
                            Scheduler.schedule(
                                CommentCrawler.type, key=t.get("href"), data=data)
                    # print repr(json.dumps(crawl_data))
            page += 1
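The Amazon list crawlers above parse display prices such as u"¥1,234.00" by stripping the currency sign and commas inline. A hedged sketch of that conversion as a reusable helper (the name and the 0.0 fallback are assumptions, not part of the original crawler):

    def parse_display_price(price_text):
        # Convert a display price such as u"¥1,234.00" into a float.
        # Mirrors float(price.strip()[1:].replace(",", "")) used above, but
        # returns 0.0 instead of raising on empty or malformed input.
        if not price_text:
            return 0.0
        cleaned = price_text.strip()[1:].replace(",", "")
        try:
            return float(cleaned)
        except ValueError:
            return 0.0

    # parse_display_price(u"¥1,234.00") -> 1234.0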
Example #25
    def crawl(self):

        # product id carried on the task
        goodid = self.data['uuid']
        # goodid="7ebd0a6a-7b5c-11e4-85d7-00e06668ddd1"
        source = "amazon"

        url = self.key
        source_id = url
        category_data = extract_category(self)

        count = getCommSize(self.get_url(url, 1))  # total number of review pages
        page = 1  # start from page 1

        while page <= count:
            newurl = self.get_url(url, page)
            print newurl
            # fetch the productReviews page
            html_stream = ProcessData.get_web_data(newurl)

            # parse the review list html
            html = etree.HTML(html_stream.text)
            # review blocks
            comment = html.xpath("//table[@id='productReviews']//tr/td/div")

            for comitem in comment:

                # review text
                item = comitem.xpath("div[@class='reviewText']//text()")

                # rating
                scoreitem = comitem.xpath(
                    "div[@style='margin-bottom:0.5em;']/span/span/span")
                # publish time
                pubtimeitem = comitem.xpath(
                    "div[@style='margin-bottom:0.5em;']/span[@style='vertical-align:middle;']/nobr"
                )

                # reviewer's profile link
                user_iditem = comitem.xpath(
                    "div[@style='margin-bottom:0.5em;']/div/div[@style='float:left;']/a"
                )

                # helpful votes
                usefulitem = comitem.xpath(
                    "div[@style='margin-bottom:0.5em;']")

                oninfo = ""
                for i in item:
                    oninfo += i

                # helpful votes come as "x/y"; keep the helpful count
                if usefulitem != None and usefulitem != []:
                    tmpuseful = usefulitem[0].text.strip()
                else:
                    tmpuseful = "0"

                if tmpuseful == "":
                    tmpuseful = "0"
                elif tmpuseful != "0":
                    tmpuseful = tmpuseful[0:tmpuseful.index("/")]

                # publish date
                pubtim = datetime.strptime("1971-01-1", '%Y-%m-%d')
                if pubtimeitem != None and pubtimeitem != []:
                    pubtim = datetime.strptime(
                        pubtimeitem[0].text.replace("年", "-").replace(
                            "月", "-").replace("日", ""), '%Y-%m-%d')

                # the date string has been converted to a datetime above

                sorce = "0.0"

                if scoreitem != None and scoreitem != []:
                    sorce = scoreitem[0].text[2:-1].strip()

            #  print user_iditem
                userid = ''
                if user_iditem != None and user_iditem != []:
                    userid = str(user_iditem[0].get("href"))

                comment_data = {
                    "ecid": goodid,
                    "source_id": source_id,
                    "source": source,
                    "comment_id": "",
                    "pubtime": pubtim,
                    "buytime": pubtim,
                    "score": float(sorce),
                    "user_id": userid,
                    "useful": int(tmpuseful),
                    'reply': 0,
                    "content": oninfo.strip()
                }
                # merge in the original and current category fields
                comment_data.update(category_data)

                model = EcCommentModel(comment_data)
                export(model)
            page += 1
Example #26
    def crawl(self):

        # record id
        ids = self.data['uuid']
        # ids="1dcfa11e-7acf-11e4-b0cc-00e06668ddd1"
        # source_id=""
        # product page url
        url = self.key

        print "url:" + url

        source = "amazon"

        category_data = extract_category(self)

        # fetch the page for this url
        html_stream = ProcessData.get_web_data(url)

        # parse the product page html
        html = etree.HTML(html_stream.text)

        # locate the product detail block
        prodDetails = html.xpath("//div[@id='prodDetails']")

        if len(prodDetails) == 0:
            # fall back to the generic template that only carries the basic fields
            detailed = getDetailedGoods(type=self.type,
                                        key=self.key,
                                        data=self.data).crawlHtml(html)
        else:
            # product style node
            style = prodDetails[0].xpath("div[@class='disclaim']/strong")

            # extract the detailed product attributes
            goodinfo = prodDetails[0].xpath(
                "div[@class='wrapper CNlocale']//table/tbody/tr")

            # product fields
            summary = {}
            ecbrands = ""
            ecnames = ""
            introduce = {}

            for info in goodinfo:
                if info.xpath("td[@class='label']") != []:
                    if info.xpath("td[@class='label']")[0].text == "用户评分":
                        summary[info.xpath(
                            "td[@class='label']")[0].text] = info.xpath(
                                "td[@class='value']")[0].xpath(
                                    "//div[@id='averageCustomerReviewRating']"
                                )[0].text.strip()[2:-1]
                    elif info.xpath(
                            "td[@class='label']")[0].text.strip() == "品牌":
                        ecbrands = info.xpath(
                            "td[@class='value']")[0].text.strip()
                    else:
                        summary[info.xpath("td[@class='label']")
                                [0].text] = info.xpath(
                                    "td[@class='value']")[0].text.strip()

            # store the record into cassandra
            crawl_data = {
                'id': ids,
                'source': source,
                'source_id': url,
                'summary': summary,
                'introduce': introduce,
                'name': ecnames,
                'brand': ecbrands
            }

            crawl_data.update(category_data)
            # print crawl_data
            model = EcDetailModel(crawl_data)
            export(model)
Example #27
    def crawl(self):
        # the crawl key (category node)
        # keyid="/%E7%AC%94%E8%AE%B0%E6%9C%AC%E7%94%B5%E8%84%91/b?ie=UTF8&node=106200071"
        keyid = self.key
        source = "amazon"
        score = 0  # rating
        # original category info
        category_data = extract_category(self)
        # priorcategory
        priorcategory = self.data["priorcategory"]
        presentcategory = self.data["presentcategory"]

        count = getPageSize(self.get_url(keyid, 1))  # total number of result pages
        page = 1  # start from page 1

        content = "//div[@id='mainResults']/div"

        while page <= count:
            # build the url for this page
            url = self.get_url(keyid, page)

            # fetch the page
            html_stream = ProcessData.get_web_data(url)

            # parse the result list html
            html = etree.HTML(html_stream.text)

            # each entry is one product block in the result list
            itempath = html.xpath(content)

            if itempath:
                for item in itempath:
                    title = item.xpath("h3[@class='newaps']/a")
                    # price node
                    pric = item.xpath(
                        "ul[@class='rsltGridList grey']/li[@class='newp']/div")
                    if not pric:
                        pric = item.xpath("ul/li[@class='newp']/div")

                    # product rating
                    socreitmem = item.xpath(
                        "ul[@class='rsltGridList grey']/li[@class='rvw']/span/span/a"
                    )

                    if socreitmem != []:
                        scoreinfo = socreitmem[0].get('alt')
                        if scoreinfo != None:
                            score = float(scoreinfo[2:-1])

                    for t in title:
                        # product title and link
                        original_price = u"¥0.00"

                        if not pric:
                            price = u"¥0.00"
                        else:
                            try:
                                price = pric[0].xpath("a/span")[0].text
                            except:
                                print url
                                print "bad price node"
                                price = u"¥0.00"

                        if pric and pric[0].xpath("a/del") != []:
                            # the item shows an original (pre-discount) price
                            original_price = pric[0].xpath("a/del")[0].text
                        else:
                            # no original price shown, fall back to the current price
                            original_price = price

                        # task data handed to the follow-up crawlers
                        data = {
                            'priorcategory': priorcategory,
                            'presentcategory': presentcategory
                        }

                        if price != None and price.strip() != '' and pric != [] and pric[0] != '':

                            # validate that the price string parses before storing
                            try:
                                float(price.strip()[1:].replace(",", ""))
                            except:
                                self.logger.error("bad price: " + price)
                                self.logger.error("bad original price: " + original_price)

                            crawl_data = {
                                # 'id': uuid.uuid1(),
                                'source_id': t.get("href"),
                                'source': source,
                                'summary': {},
                                'title': t.xpath("span")[0].text,
                                'adword': '',
                                'price': float(price.strip()[1:].replace(",", "")),
                                'original_price': float(original_price.strip()[1:].replace(",", "")),
                                'score': 0
                            }

                            crawl_data.update(category_data)
                            # persist the listing record to cassandra
                            model = EcBasicModel(crawl_data)
                            export(model)
                            data["uuid"] = model["id"]

                            Scheduler.schedule(DetailCrawler.type,
                                               key=t.get("href"),
                                               data=data)
                            Scheduler.schedule(CommentCrawler.type,
                                               key=t.get("href"),
                                               data=data)
                    # print repr(json.dumps(crawl_data))
            page += 1
Example #28
    def crawl(self):

        # product id carried on the task
        goodid = self.data['uuid']
       # goodid="7ebd0a6a-7b5c-11e4-85d7-00e06668ddd1"
        source = "amazon"

        url = self.key
        source_id = url
        category_data = extract_category(self)

        count = getCommSize(self.get_url(url, 1))  # total number of review pages
        page = 1  # start from page 1

        while page <= count:
            newurl = self.get_url(url, page)
            print newurl
            # fetch the productReviews page
            html_stream = ProcessData.get_web_data(newurl)

            # parse the review list html
            html = etree.HTML(html_stream.text)
            # review blocks
            comment = html.xpath("//table[@id='productReviews']//tr/td/div")

            for comitem in comment:

                # review text
                item = comitem.xpath("div[@class='reviewText']//text()")

                # rating
                scoreitem = comitem.xpath(
                    "div[@style='margin-bottom:0.5em;']/span/span/span")
                # publish time
                pubtimeitem = comitem.xpath(
                    "div[@style='margin-bottom:0.5em;']/span[@style='vertical-align:middle;']/nobr")

                # reviewer's profile link
                user_iditem = comitem.xpath(
                    "div[@style='margin-bottom:0.5em;']/div/div[@style='float:left;']/a")

                # helpful votes
                usefulitem = comitem.xpath(
                    "div[@style='margin-bottom:0.5em;']")

                oninfo = ""
                for i in item:
                    oninfo += i

                # helpful votes come as "x/y"; keep the helpful count
                if usefulitem != None and usefulitem != []:
                    tmpuseful = usefulitem[0].text.strip()
                else:
                    tmpuseful = "0"

                if tmpuseful == "":
                    tmpuseful = "0"
                elif tmpuseful != "0":
                    tmpuseful = tmpuseful[0:tmpuseful.index("/")]

                # publish date
                pubtim = datetime.strptime("1971-01-1", '%Y-%m-%d')
                if pubtimeitem != None and pubtimeitem != []:
                    pubtim = datetime.strptime(pubtimeitem[0].text.replace(
                        "年", "-").replace("月", "-").replace("日", ""), '%Y-%m-%d')

                # the date string has been converted to a datetime above

                sorce = "0.0"

                if scoreitem != None and scoreitem != []:
                    sorce = scoreitem[0].text[2:-1].strip()

              #  print user_iditem
                userid = ''
                if user_iditem != None and user_iditem != []:
                    userid = str(user_iditem[0].get("href"))

                comment_data = {
                    "ecid": goodid,
                    "source_id": source_id,
                    "source": source,
                    "comment_id": "",
                    "pubtime": pubtim,
                    "buytime": pubtim,
                    "score": float(sorce),
                    "user_id": userid,
                    "useful": int(tmpuseful),
                    'reply': 0,
                    "content": oninfo.strip()
                }
                # merge in the original and current category fields
                comment_data.update(category_data)

                model = EcCommentModel(comment_data)
                export(model)
            page += 1
Example #29
    def crawl(self):
        skulist = []
        goodsNo = str(self.key)
        category_data = extract_category(self)
        url = self.get_detail_url(goodsNo)
        html = ProcessData.get_web_data(url)
        tree = etree.HTML(html.text)
        xpath = {
            "introduce": "//div[@class='guigecanshu']/text()",
            "summary": "//ul[@id='prd_data']/li[2]/ul/li/span/text()",
            # "number": "//span[@class='fr ccc']/text()"
        }

        summary = self.parse_summary(tree, xpath["summary"])
        introduce = self.parse_intr(tree, xpath["introduce"])
        # number =  self.parse_number(tree, xpath["number"])

        version = get_version(summary, introduce)
        series = get_series(summary, introduce)
        brand = get_brand(summary, introduce)

        json = ProcessData.get_json_data(self.get_basic_url(goodsNo))
        isBbc_str = json["isBbc"]
        isBbc = "Y" if isBbc_str == "Y" or isBbc_str == "y" else "N"
        status_str = json["onSale"]
        status = 0 if status_str == "N" or status_str == "n" else 1

        skulist = json['skuList']
        for sku in skulist:
            ecname = sku['skuName']
            ecimglist = sku['skuSourceImgUrl']

        detail_data = {
            'source': self.data.get('source'),
            'source_id': goodsNo,
            'summary': summary,
            'introduce': introduce,
            'name': ecname,
            'images': ecimglist,
            'status': status,
            'brand': brand,
            'version': version,
            'series': series,
            'comment': {
                'is_Bbc': isBbc,
                'skuID': self.data['skuID'],
            },
        }
        detail_data.update(category_data)
        detail_data.update(get_ctime())
        model = EcDetailModel(detail_data)
        export(model)
        comment_data = {
            'uuid': model["id"],
            'brand': brand,
            'version': version,
            'series': series,
            'is_Bbc': isBbc,
            'status': status,
            'priorcategory': self.data['priorcategory'],
            'skuID': self.data['skuID'],
        }
        Scheduler.schedule(CommentCrawler.type, key=goodsNo, data=comment_data)
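Several of the detail crawlers above (Examples #1, #3 and #29) end by scheduling a CommentCrawler with the fields that the comment examples later read back from self.data. A hedged sketch of that hand-off as a helper, using only names that appear in the listings; the helper itself and the choice of model["source_id"] as the task key are assumptions (the examples pass self.key, which carries the same value):

    def schedule_comment_task(model, extra=None):
        # Sketch only: gather the fields the comment crawlers read back from
        # self.data and enqueue a CommentCrawler task for the exported record.
        comment_data = {
            "uuid": model["id"],
            "status": model["status"],
            "brand": model["brand"],
            "version": model["version"],
            "series": model["series"],
            "is_Bbc": model["comment"]["is_Bbc"],
        }
        if extra:
            comment_data.update(extra)
        Scheduler.schedule(CommentCrawler.type,
                           key=model["source_id"],
                           data=comment_data)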