Example #1
    def crawler_data(self,tree):
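        """Build one detail record from the parsed page tree: basic info,
        normalized image URLs, brand/version/series parsed from the spec
        text, plus the category and crawl-time fields."""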
        category_data = extract_category(self)      
        info = self.get_info(tree)
        summary = info["summary"]
        introduce = info["introduce"]
        images = info["images"]
        images = self.convert_img(images)
        brand = self.get_brand(summary, introduce, tree)
        version = get_version(summary, introduce)
        series = get_series(summary, introduce)      

        crawl_data = {
            'source': self.data.get('source'),
            'source_id': str(self.key),
            'name': info['name'],
            'images': images,
            'intro_img': info['intro_img'],
            'summary': summary,
            'introduce': introduce,
            'status': info['status'],
            'version': version,
            'brand': brand,
            'series': series,
            'comment': {
                'is_Bbc': info['is_Bbc'],
            },
        }
        crawl_data.update(category_data)
        crawl_data.update(get_ctime())
        return crawl_data
Example #2
    def crawl(self):
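        """Walk the paginated comment listing for this product and export an
        EcCommentModel for every comment item found."""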
        key = self.key
        category_data = extract_category(self)
        count = 3
        page = 1
        while page <= count:
            response = self.get_response(key, page)
            tree = etree.HTML(response.text)
            if page == 1:
                count = self.get_count(tree)
            items = tree.xpath(self.xpath["item"])
            for item in items:
                info = self.get_info(item)
                crawl_data = {
                    'eid': self.data['uuid'],
                    'source_id': self.data['source_id'],
                    'brand': self.data['brand'],
                    'series': self.data['series'],
                    'version': self.data['version'],
                    'source': self.data['source'],
                    'status': self.data["status"],
                    'comment': {
                        'is_Bbc': self.data['is_Bbc'],
                    }
                }
                crawl_data.update(info)
                crawl_data.update(category_data)
                crawl_data.update(get_ctime())
                model = EcCommentModel(crawl_data)
                export(model)

            page += 1
Example #3
    def crawl(self):
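        """Crawl the detail page and its status/is_Bbc JSON, export an
        EcDetailModel, then schedule the CommentCrawler with the fields it
        needs."""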
        json_data = ProcessData.get_json_data(self.get_json_url(self.key))
        is_Bbc = self.get_is_Bbc(json_data)
        status = self.get_status(json_data)
        response = self.get_response(self.key)
        tree = etree.HTML(response.text)
        info = self.get_info(tree)
        crawl_data = {
            "source": self.data["source"],
            "source_id": self.key,
            "status": status,
            "comment": {
                "is_Bbc": is_Bbc,
            },
        }
        crawl_data.update(info)
        crawl_data.update(extract_category(self))
        crawl_data.update(get_ctime())
        model = EcDetailModel(crawl_data)
        export(model)

        comment_data = {
            "uuid": model["id"],
            "status": model["status"],
            "version": model["version"],
            "series": model["series"],
            "brand": model["brand"],
            "is_Bbc": model["comment"]["is_Bbc"],
        }
        Scheduler.schedule(CommentCrawler.type,
                           key=self.key,
                           data=comment_data)
Example #4
 def crawl(self):
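     """Page through the JSON review listing and export an EcCommentModel
     for every review detail."""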
     category_data = extract_category(self)
     page = 1
     page_count = 1
     while page <= page_count:
         json_data = self.get_response(self.key, page)
         if page == 1:
             page_count = self.get_page_count(json_data)
         for item in json_data["ProductReviewList"]:
             review = item["ReviewDetail"]
             info = self.get_info(review)
             crawl_data = {
                 "eid": self.data["uuid"],
                 "brand": self.data["brand"],
                 "version": self.data["version"],
                 "series": self.data["series"],
                 "source": self.data["source"],
                 "source_id": self.key,
                 "status": self.data["status"],
                 "comment": {
                     "is_Bbc": self.data["is_Bbc"],
                 },
             }
             crawl_data.update(info)
             crawl_data.update(category_data)
             crawl_data.update(get_ctime())
             model = EcCommentModel(crawl_data)
             export(model)
         page += 1
Example #5
    def crawl(self):
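        """Page through the HTML comment listing (keeping the global COOKIE
        in sync with the set-cookie header) and export an EcCommentModel per
        item."""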
        global COOKIE
        category_data = extract_category(self)
        page = 1  # start from the first page
        pageSize = 5
        while page <= pageSize:
            newurl = self.get_url(self.key, page)
            html_stream = ProcessData.get_web_data(newurl)
            if COOKIE != html_stream.headers.get("set-cookie", ""):
                COOKIE = html_stream.headers.get("set-cookie", "")
            html = etree.HTML(html_stream.content)
            if page == 1:
                pageSize = self.get_PageSize(html)
            items = html.xpath(self.xpath["item"])
            for item in items:
                info = self.get_info(item)
                crawl_data = {
                    "eid": self.data["uuid"],
                    "brand": self.data["brand"],
                    "version": self.data["version"],
                    "series": self.data["series"],
                    "source": self.data["source"],
                    "status": self.data["status"],
                    "source_id": self.key,
                    "comment": {
                        "is_Bbc": self.data["is_Bbc"],
                    }
                }
                crawl_data.update(info)
                crawl_data.update(category_data)
                crawl_data.update(get_ctime())

                model = EcCommentModel(crawl_data)
                export(model)
            page += 1
Example #6
    def crawl(self):
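        """Crawl an Amazon detail page (keeping the global COOKIE up to
        date), export an EcDetailModel, then schedule the CommentCrawler."""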
        global COOKIE
        category_data = extract_category(self)
        response = self.get_response(self.key)
        if COOKIE != response.headers.get("set-cookie", ""):
            COOKIE = response.headers.get("set-cookie", "")
        tree = etree.HTML(response.text)
        info = self.get_info(tree)

        crawl_data = {
            'source': "amazon",
            'source_id': self.key,
            'status': 1,
        }

        crawl_data.update(info)
        crawl_data.update(category_data)
        crawl_data.update(get_ctime())
        model = EcDetailModel(crawl_data)
        export(model)
        comment_data = {
            "uuid": model["id"],
            "brand": model["brand"],
            "version": model["version"],
            "series": model["series"],
            "is_Bbc": model["comment"]["is_Bbc"],
            'status': model["status"],
        }
        Scheduler.schedule(CommentCrawler.type,
                           key=self.key,
                           data=comment_data)
Example #7
    def crawl(self):
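        """Walk the category listing pages: goods not yet tracked are
        scheduled for the DetailCrawler, known goods are exported as
        EcBasicModel records."""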
        catId = str(self.key)

        category_data = extract_category(self)
        totalpage = self.get_page(catId)
        if totalpage == 0:
            return {}
        for i in range(1, totalpage + 1):
            url = self.get_url(catId, i)
            jsons = ProcessData.get_json_data(url)
            try:
                goodsList = jsons['goodsList']
            except Exception as e:
                self.logger.error(url)
                self.logger.error(e)
                print "get goodsList fail"
                # nothing to parse on this page, move on to the next one
                continue

            for goods in goodsList:
                goodsNo = goods['goodsNo']
                goodsName = goods['goodsName']
                skuID = goods['skuID']

                goods_find = self.has_goods(goodsNo)
                if not goods_find:
                    data = {
                        'priorcategory': self.data['priorcategory'],
                        'skuID': skuID,
                    }
                    Scheduler.schedule(DetailCrawler.type,
                                       key=goodsNo,
                                       data=data)
                    continue
                adword = self.extract_adword(goods['ad'])
                crawl_data = {
                    'id': goods_find['uuid'],
                    'source_id': goodsNo,
                    'source': self.data.get('source'),
                    'title': goods['goodsName'],
                    'adword': adword,
                    'status': goods_find['status'],
                    'price': float(goods['lowestSalePrice']),
                    'brand': goods_find['brand'],
                    'version': goods_find['version'],
                    'series': goods_find['series'],
                    'comment': {
                        'is_Bbc': goods_find['isBbc'],
                        'skuId': goods_find['skuID'],
                    },
                }
                crawl_data.update(category_data)
                crawl_data.update(get_ctime())
                model = EcBasicModel(crawl_data)
                export(model)
Example #8
    def crawl(self):
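        """Page through an Amazon listing, retrying a page when a captcha
        form comes back; new goods are scheduled for the DetailCrawler,
        known ones are exported as EcBasicModel records."""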
        global COOKIE
        keyid = self.key
        category_data = extract_category(self)
        priorcategory = self.data["priorcategory"]
        count = 3
        page = 1  # start from the first page
        while page <= count:
            url = self.get_url(keyid, page)
            html_stream = ProcessData.get_web_data(url)
            if COOKIE != html_stream.headers.get("set-cookie", ""):
                COOKIE = html_stream.headers.get("set-cookie", "")
            html = etree.HTML(html_stream.content)
            if page == 1:
                count = self.getPageSize(html)
            items = html.xpath(self.xpath["item"])
            if not items:
                if html.xpath("//input[@id='captchacharacters']"):
                    # captcha page: wait briefly and retry the same page
                    time.sleep(random.randint(1, 3))
                    continue
                else:
                    self.remove_task(keyid)

            for item in items:
                source_id = self.get_source_id(item)
                task_data = self.has_goods(source_id)
                if not task_data:
                    data = {
                        'priorcategory': priorcategory,
                    }
                    Scheduler.schedule(DetailCrawler.type,
                                       key=source_id,
                                       data=data)
                else:
                    info = self.get_info(item)
                    crawl_data = {
                        'id': task_data["uuid"],
                        'source_id': source_id,
                        'source': "amazon",
                        'brand': task_data["brand"],
                        'version': task_data["version"],
                        'series': task_data["series"],
                        'status': task_data["status"],
                        "comment": {
                            "is_Bbc": task_data["is_Bbc"],
                        }
                    }
                    crawl_data.update(info)
                    crawl_data.update(category_data)
                    crawl_data.update(get_ctime())
                    model = EcBasicModel(crawl_data)
                    export(model)
            page += 1
Example #9
 def get_crawl_data(self, item, **args):
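     """Merge the parsed item with the task data (id, status, brand, series,
     version, is_Bbc), the source/source_id and category fields, and the
     crawl time, and return the combined record."""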
     crawl_data = self.parse_data(item)
     task_data = args['task_data']
     crawl_data['id'] = task_data['uuid']
     crawl_data['status'] = task_data['status']
     crawl_data['brand'] = task_data['brand']
     crawl_data['series'] = task_data['series']
     crawl_data['version'] = task_data['version']
     crawl_data['comment'] = {
         'is_Bbc': task_data['is_Bbc'],
     }
     crawl_data['source_id'] = args['source_id']
     crawl_data.update(args['category_data'])
     crawl_data['source'] = self.data['source']
     crawl_data.update(get_ctime())
     return crawl_data
Example #10
 def crawl(self):
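     """Page through the JSON goods listing: new items are scheduled for the
     DetailCrawler, items already tracked are exported as EcBasicModel
     records."""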
     category_data = extract_category(self)
     page_size = 0
     page = 0
     while page <= page_size:
         url = self.get_url(self.key, page)
         json_data = ProcessData.get_json_data(url)
         if page == 0:
             page_size = self.get_page_size(json_data)
         for goods in json_data["goods"]:
             source_id = goods["partnumber"]
             task_data = self.has_goods(source_id)
             if not task_data:
                 data = {
                     "priorcategory": self.data["priorcategory"],
                     "status": 1 if int(goods["saleStatus"]) == 0 else 0,
                 }
                 Scheduler.schedule(DetailCrawler.type,
                                    key=source_id,
                                    data=data)
             else:
                 crawl_data = {
                     "id": task_data["uuid"],
                     "source": self.data["source"],
                     "source_id": source_id,
                     "title": goods["catentdesc"],
                     "adword":
                     extract_adword(goods.get("auxdescription", "")),
                     "price": float(goods["price"]),
                     'status': task_data['status'],
                     'brand': task_data['brand'],
                     'version': task_data['version'],
                     'series': task_data['series'],
                     'comment': {
                         'is_Bbc': task_data['is_Bbc'],
                     },
                 }
                 crawl_data.update(category_data)
                 crawl_data.update(get_ctime())
                 model = EcBasicModel(crawl_data)
                 export(model)
         page += 1
Example #11
    def crawl(self):
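        """Fetch every appraisal page for this product and export an
        EcCommentModel per appraisal."""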
        ecid = self.data['uuid']
        goodsNo = str(self.key)
        category_data = extract_category(self)
        totalpage = int(self.get_page(goodsNo))
        if totalpage == 0:
            return
        for i in range(totalpage + 1):
            url = self.get_url(goodsNo, i)
            json = ProcessData.get_json_data(url)
            appraise = json['appraiseArray']

            for item in appraise:
                comment_id = item['id']
                content = item['summary']
                score = item['appraiseGrade']
                user_name = item['appraiseName']
                pubtime = ProcessData.str_datetime(item['appraiseTime'])
                comment_data = {
                    'eid': ecid,  # commodity table foreign key
                    'source_id': goodsNo,
                    'source': self.data.get('source'),
                    'comment_id': comment_id,  # review id
                    'score': score,  # commodity score
                    'pubtime': pubtime,
                    'user_name': user_name,
                    'content': content,
                    'brand': self.data['brand'],
                    'version': self.data['version'],
                    'series': self.data['series'],
                    'comment': {
                        'is_Bbc': self.data['is_Bbc'],
                        'skuID': self.data['skuID'],
                    }
                }
                comment_data.update(category_data)
                comment_data.update(get_ctime())
                model = EcCommentModel(comment_data)
                export(model)
Example #12
 def crawl(self):
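     """Page through the category listing: known goods are exported as
     EcBasicModel records, unknown goods are scheduled for the
     DetailCrawler."""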
     CatID = self.key
     category_data = extract_category(self)
     page = 1
     page_count = 1
     while page <= page_count:
         jsons = self.get_response(CatID, page)
         if page == 1:
             page_count = self.get_page_count(jsons)
         for goods in jsons['ProductListItems']:
             source_id = goods["Code"]
             task_data = self.has_goods(source_id)
             if task_data:
                 crawl_data = {
                     "id": task_data["uuid"],
                     "title": goods["Title"],
                     "price": goods["Price"]["CurrentPrice"],
                     "source_id": source_id,
                     "source": self.data["source"],
                     "status": task_data["status"],
                     "brand": task_data["brand"],
                     "version": task_data["version"],
                     "series": task_data["series"],
                     "comment": {
                         "is_Bbc": task_data["isBbc"],
                     },
                 }
                 crawl_data.update(category_data)
                 crawl_data.update(get_ctime())
                 model = EcBasicModel(crawl_data)
                 export(model)
             else:
                 detail_data = {
                     "priorcategory": self.data["priorcategory"],
                 }
                 Scheduler.schedule(DetailCrawler.type,
                                    key=source_id,
                                    data=detail_data)
         page += 1
Example #13
 def crawl(self):
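     """Page through the review JSON for this product and export an
     EcCommentModel per review; stop early when a page returns no
     reviews."""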
     category_data = extract_category(self)
     page_size = self.get_page_size(self.key)
     page = 1
     while page <= page_size:
         json_data = ProcessData.get_json_data(self.get_url(self.key, page))
         reviews = json_data.get("commodityReviews", [])
         if not reviews:
             return
         for review in reviews:
             crawl_data = {
                 "comment_id": self.get_comment_id(review),
                 "content": review["content"],
                 "tags": self.get_tags(review),
                 "show_pic": self.get_show_pic(review),
                 "pubtime": self.get_pubtime(review),
                 "score": float(review["qualityStar"]),
                 "useful": int(review["usefulCnt"]),
                 "reply": 1 if review.get("replyInfo", {}) else 0,
                 "user_name": review.get("userInfo",
                                         {}).get("nickName", ""),
                 "eid": self.data["uuid"],
                 "brand": self.data["brand"],
                 "version": self.data["version"],
                 "series": self.data["series"],
                 "source": self.data["source"],
                 "source_id": self.key,
                 "status": self.data["status"],
                 "comment": {
                     "is_Bbc": self.data["is_Bbc"],
                 },
             }
             crawl_data.update(category_data)
             crawl_data.update(get_ctime())
             model = EcCommentModel(crawl_data)
             export(model)
         page += 1
Example #14
    def crawl(self):
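        """Parse the goods detail page and its basic-info JSON, export an
        EcDetailModel, then schedule the CommentCrawler for its reviews."""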
        skulist = []
        goodsNo = str(self.key)
        category_data = extract_category(self)
        url = self.get_detail_url(goodsNo)
        html = ProcessData.get_web_data(url)
        tree = etree.HTML(html.text)
        xpath = {
            "introduce": "//div[@class='guigecanshu']/text()",
            "summary": "//ul[@id='prd_data']/li[2]/ul/li/span/text()",
            # "number": "//span[@class='fr ccc']/text()"
        }

        summary = self.parse_summary(tree, xpath["summary"])
        introduce = self.parse_intr(tree, xpath["introduce"])
        # number =  self.parse_number(tree, xpath["number"])

        version = get_version(summary, introduce)
        series = get_series(summary, introduce)
        brand = get_brand(summary, introduce)

        basic_json = ProcessData.get_json_data(self.get_basic_url(goodsNo))
        isBbc_str = basic_json["isBbc"]
        isBbc = "Y" if isBbc_str in ("Y", "y") else "N"
        status_str = basic_json["onSale"]
        status = 0 if status_str in ("N", "n") else 1

        # only the last SKU's name and image list are kept for the record
        skulist = basic_json['skuList']
        for sku in skulist:
            ecname = sku['skuName']
            ecimglist = sku['skuSourceImgUrl']

        detail_data = {
            'source': self.data.get('source'),
            'source_id': goodsNo,
            'summary': summary,
            'introduce': introduce,
            'name': ecname,
            'images': ecimglist,
            'status': status,
            'brand': brand,
            'version': version,
            'series': series,
            'comment': {
                'is_Bbc': isBbc,
                'skuID': self.data['skuID'],
            },
        }
        detail_data.update(category_data)
        detail_data.update(get_ctime())
        model = EcDetailModel(detail_data)
        export(model)
        comment_data = {
            'uuid': model["id"],
            'brand': brand,
            'version': version,
            'series': series,
            'is_Bbc': isBbc,
            'status': status,
            'priorcategory': self.data['priorcategory'],
            'skuID': self.data['skuID'],
        }
        Scheduler.schedule(CommentCrawler.type, key=goodsNo, data=comment_data)