Example #1
0
    def crawler_data(self, tree):
        """Build the standardized crawl-result dict for one product page.

        Pulls parsed fields out of ``get_info``, normalizes the image list,
        derives brand/version/series from the summary and introduce text,
        then merges in category data and the crawl timestamp.
        """
        category_data = extract_category(self)
        page_info = self.get_info(tree)
        summary = page_info["summary"]
        introduce = page_info["introduce"]
        image_urls = self.convert_img(page_info["images"])
        brand_name = self.get_brand(summary, introduce, tree)
        version_name = get_version(summary, introduce)
        series_name = get_series(summary, introduce)

        crawl_data = {
            'source': self.data.get('source'),
            'source_id': str(self.key),
            'name': page_info['name'],
            'images': image_urls,
            'intro_img': page_info['intro_img'],
            'summary': summary,
            'introduce': introduce,
            'status': page_info['status'],
            'version': version_name,
            'brand': brand_name,
            'series': series_name,
            'comment': {
                'is_Bbc': page_info['is_Bbc'],
            },
        }
        crawl_data.update(category_data)
        crawl_data.update(get_ctime())
        return crawl_data
Example #2
0
 def get_info(self, tree):
     """Extract the core product fields from a parsed HTML tree.

     Returns a dict of the scraped name/images/summary/intro_img plus
     brand/version/series/address values derived from the summary text.
     """
     product_name = self.get_name(tree)
     image_list = self.get_images(tree)
     summary_text = self.get_summary(tree)
     intro_image = self.get_intro_img(tree)
     # brand/version/series/address are derived from the summary alone;
     # an empty dict stands in for the (absent) introduce block
     return {
         "name": product_name,
         "images": image_list,
         "summary": summary_text,
         "brand": get_brand(summary_text, {}),
         "version": get_version(summary_text, {}),
         "series": get_series(summary_text, {}),
         "address": get_address(summary_text, {}),
         "intro_img": intro_image,
     }
Example #3
0
 def get_info(self, tree):
     """Extract core product fields, preferring the page's own brand.

     Falls back to deriving the brand from the summary text when the
     page does not expose one directly; also records the B2C flag.
     """
     product_name = self.get_name(tree)
     image_list = self.get_images(tree)
     summary_text = self.get_summary(tree)
     page_brand = self.get_brand(tree)
     bbc_flag = self.get_is_Bbc(tree)
     # fall back to the summary-derived brand when the page gave none
     if not page_brand:
         page_brand = get_brand(summary_text, {})
     return {
         "name": product_name,
         "images": image_list,
         "summary": summary_text,
         "brand": page_brand,
         "version": get_version(summary_text, {}),
         "series": get_series(summary_text, {}),
         "address": get_address(summary_text, {}),
         "comment": {
             "is_Bbc": bbc_flag,
         },
     }
Example #4
0
    def crawl(self):
        """Crawl one product detail page, export it, and schedule comments.

        Fetches the detail HTML for ``self.key``, parses the summary and
        introduce text, combines it with the product's basic JSON endpoint
        (sale status, B2C flag, SKU list), exports an ``EcDetailModel``,
        and schedules a follow-up ``CommentCrawler`` job.
        """
        goodsNo = str(self.key)
        category_data = extract_category(self)
        url = self.get_detail_url(goodsNo)
        html = ProcessData.get_web_data(url)
        tree = etree.HTML(html.text)
        xpath = {
            "introduce": "//div[@class='guigecanshu']/text()",
            "summary": "//ul[@id='prd_data']/li[2]/ul/li/span/text()",
        }

        summary = self.parse_summary(tree, xpath["summary"])
        introduce = self.parse_intr(tree, xpath["introduce"])

        version = get_version(summary, introduce)
        series = get_series(summary, introduce)
        brand = get_brand(summary, introduce)

        # renamed from 'json' so the local doesn't shadow the json module
        basic_info = ProcessData.get_json_data(self.get_basic_url(goodsNo))
        isBbc = "Y" if basic_info["isBbc"] in ("Y", "y") else "N"
        status = 0 if basic_info["onSale"] in ("N", "n") else 1

        # Only the last SKU's name/images are kept (original behavior).
        # Guard against an empty skuList, which previously raised NameError.
        skulist = basic_info['skuList']
        ecname = None
        ecimglist = None
        if skulist:
            last_sku = skulist[-1]
            ecname = last_sku['skuName']
            ecimglist = last_sku['skuSourceImgUrl']

        detail_data = {
            'source': self.data.get('source'),
            'source_id': goodsNo,
            'summary': summary,
            'introduce': introduce,
            'name': ecname,
            'images': ecimglist,
            'status': status,
            'brand': brand,
            'version': version,
            'series': series,
            'comment': {
                'is_Bbc': isBbc,
                'skuID': self.data['skuID'],
            },
        }
        detail_data.update(category_data)
        detail_data.update(get_ctime())
        model = EcDetailModel(detail_data)
        export(model)
        comment_data = {
            'uuid': model["id"],
            'brand': brand,
            'version': version,
            'series': series,
            'is_Bbc': isBbc,
            'status': status,
            'priorcategory': self.data['priorcategory'],
            'skuID': self.data['skuID'],
        }
        Scheduler.schedule(CommentCrawler.type, key=goodsNo, data=comment_data)