def crawler_data(self, tree):
    """Build the crawl_data record for one item page.

    Pulls the basic fields from ``self.get_info(tree)``, derives
    brand/version/series from the summary and introduce texts, and merges
    in the category fields and creation timestamp.

    :param tree: parsed HTML tree of the detail page
    :return: dict ready to be persisted/exported by the caller
    """
    category_fields = extract_category(self)
    info = self.get_info(tree)

    summary_text = info["summary"]
    introduce_text = info["introduce"]
    # Normalize the raw image list before storing it.
    normalized_images = self.convert_img(info["images"])

    item_brand = self.get_brand(summary_text, introduce_text, tree)
    item_version = get_version(summary_text, introduce_text)
    item_series = get_series(summary_text, introduce_text)

    record = {
        'source': self.data.get('source'),
        'source_id': str(self.key),
        'name': info['name'],
        'images': normalized_images,
        'intro_img': info['intro_img'],
        'summary': summary_text,
        'introduce': introduce_text,
        'status': info['status'],
        'version': item_version,
        'brand': item_brand,
        'series': item_series,
        'comment': {
            'is_Bbc': info['is_Bbc'],
        },
    }
    record.update(category_fields)
    record.update(get_ctime())
    return record
def get_info(self, tree):
    """Extract the basic item fields from a parsed detail-page tree.

    Brand/version/series/address are derived from the summary text only
    (an empty dict is passed where an "introduce" mapping would go).

    :param tree: parsed HTML tree of the detail page
    :return: dict with name, images, summary, brand, version, series,
             address and intro_img
    """
    item_name = self.get_name(tree)
    item_images = self.get_images(tree)
    item_summary = self.get_summary(tree)
    item_intro_img = self.get_intro_img(tree)

    return {
        "name": item_name,
        "images": item_images,
        "summary": item_summary,
        "brand": get_brand(item_summary, {}),
        "version": get_version(item_summary, {}),
        "series": get_series(item_summary, {}),
        "address": get_address(item_summary, {}),
        "intro_img": item_intro_img,
    }
def get_info(self, tree):
    """Extract the basic item fields from a parsed detail-page tree.

    Prefers the brand scraped directly from the page; falls back to
    deriving it from the summary text when the scrape comes back empty.

    :param tree: parsed HTML tree of the detail page
    :return: dict with name, images, summary, brand, version, series,
             address and a nested comment.is_Bbc flag
    """
    item_name = self.get_name(tree)
    item_images = self.get_images(tree)
    item_summary = self.get_summary(tree)
    scraped_brand = self.get_brand(tree)
    bbc_flag = self.get_is_Bbc(tree)

    return {
        "name": item_name,
        "images": item_images,
        "summary": item_summary,
        # `x or y` is equivalent to `x if x else y` here.
        "brand": scraped_brand or get_brand(item_summary, {}),
        "version": get_version(item_summary, {}),
        "series": get_series(item_summary, {}),
        "address": get_address(item_summary, {}),
        "comment": {
            "is_Bbc": bbc_flag,
        },
    }
def crawl(self):
    """Crawl one goods detail page and export a record per SKU.

    Fetches the HTML detail page for ``self.key``, parses summary and
    introduce texts via XPath, fetches the JSON basic-info endpoint for
    on-sale status / BBC flag / SKU list, then exports an
    ``EcDetailModel`` for each SKU and schedules a comment crawl.

    Side effects: network I/O, ``export(...)`` per SKU and
    ``Scheduler.schedule(...)`` calls.
    """
    goodsNo = str(self.key)
    category_data = extract_category(self)

    url = self.get_detail_url(goodsNo)
    html = ProcessData.get_web_data(url)
    tree = etree.HTML(html.text)
    xpath = {
        "introduce": "//div[@class='guigecanshu']/text()",
        "summary": "//ul[@id='prd_data']/li[2]/ul/li/span/text()",
    }
    summary = self.parse_summary(tree, xpath["summary"])
    introduce = self.parse_intr(tree, xpath["introduce"])

    version = get_version(summary, introduce)
    series = get_series(summary, introduce)
    brand = get_brand(summary, introduce)

    # Renamed from `json` to avoid shadowing the stdlib module name.
    basic_info = ProcessData.get_json_data(self.get_basic_url(goodsNo))
    # Normalize the upstream flags to the values this pipeline stores.
    isBbc = "Y" if basic_info["isBbc"] in ("Y", "y") else "N"
    status = 0 if basic_info["onSale"] in ("N", "n") else 1

    for sku in basic_info['skuList']:
        detail_data = {
            'source': self.data.get('source'),
            'source_id': goodsNo,
            'summary': summary,
            'introduce': introduce,
            'name': sku['skuName'],
            'images': sku['skuSourceImgUrl'],
            'status': status,
            'brand': brand,
            'version': version,
            'series': series,
            'comment': {
                'is_Bbc': isBbc,
                'skuID': self.data['skuID'],
            },
        }
        detail_data.update(category_data)
        detail_data.update(get_ctime())

        model = EcDetailModel(detail_data)
        export(model)

        comment_data = {
            'uuid': model["id"],
            'brand': brand,
            'version': version,
            'series': series,
            'is_Bbc': isBbc,
            'status': status,
            'priorcategory': self.data['priorcategory'],
            'skuID': self.data['skuID'],
        }
        # NOTE(review): this schedules once per SKU but always with the same
        # goodsNo key — confirm Scheduler dedupes by key, or whether this was
        # meant to run once after the loop.
        Scheduler.schedule(CommentCrawler.type, key=goodsNo, data=comment_data)