def crawler_data(self, tree):
    """Build and return the full crawl record for one commodity detail page.

    Extracts name/summary/introduce/images from the parsed tree, derives
    brand/version/series from the summary text, then merges in category
    fields and the crawl timestamp.
    """
    category_fields = extract_category(self)
    info = self.get_info(tree)
    summary = info["summary"]
    introduce = info["introduce"]
    record = {
        'source': self.data.get('source'),
        'source_id': str(self.key),
        'name': info['name'],
        'images': self.convert_img(info["images"]),
        'intro_img': info['intro_img'],
        'summary': summary,
        'introduce': introduce,
        'status': info['status'],
        'brand': self.get_brand(summary, introduce, tree),
        'version': get_version(summary, introduce),
        'series': get_series(summary, introduce),
        'comment': {
            'is_Bbc': info['is_Bbc'],
        },
    }
    record.update(category_fields)
    record.update(get_ctime())
    return record
def crawl(self):
    """Fetch every comment page for this item and export one model per comment.

    The real page total is read from page 1; the initial value of 3 only
    guarantees the loop starts.
    """
    key = self.key
    category_fields = extract_category(self)
    page, total_pages = 1, 3
    while page <= total_pages:
        tree = etree.HTML(self.get_response(key, page).text)
        if page == 1:
            total_pages = self.get_count(tree)
        for node in tree.xpath(self.xpath["item"]):
            record = {
                'eid': self.data['uuid'],
                'source_id': self.data['source_id'],
                'brand': self.data['brand'],
                'series': self.data['series'],
                'version': self.data['version'],
                'source': self.data['source'],
                'status': self.data["status"],
                'comment': {
                    'is_Bbc': self.data['is_Bbc'],
                }
            }
            record.update(self.get_info(node))
            record.update(category_fields)
            record.update(get_ctime())
            export(EcCommentModel(record))
        page += 1
def crawl(self):
    """Crawl one commodity detail page, export it, then schedule its comment crawl."""
    json_data = ProcessData.get_json_data(self.get_json_url(self.key))
    is_bbc = self.get_is_Bbc(json_data)
    sale_status = self.get_status(json_data)
    tree = etree.HTML(self.get_response(self.key).text)
    record = {
        "source": self.data["source"],
        "source_id": self.key,
        "status": sale_status,
        "comment": {
            "is_Bbc": is_bbc,
        },
    }
    record.update(self.get_info(tree))
    record.update(extract_category(self))
    record.update(get_ctime())
    model = EcDetailModel(record)
    export(model)
    # Hand the persisted detail fields to the comment crawler.
    comment_payload = {
        "uuid": model["id"],
        "status": model["status"],
        "version": model["version"],
        "series": model["series"],
        "brand": model["brand"],
        "is_Bbc": model["comment"]["is_Bbc"],
    }
    Scheduler.schedule(CommentCrawler.type, key=self.key, data=comment_payload)
def crawl(self):
    """Page through the product review list and export every review."""
    category_fields = extract_category(self)
    page = 1
    total = 1  # corrected from the first response
    while page <= total:
        payload = self.get_response(self.key, page)
        if page == 1:
            total = self.get_page_count(payload)
        for entry in payload["ProductReviewList"]:
            detail = entry["ReviewDetail"]
            record = {
                "eid": self.data["uuid"],
                "brand": self.data["brand"],
                "version": self.data["version"],
                "series": self.data["series"],
                "source": self.data["source"],
                "source_id": self.key,
                "status": self.data["status"],
                "comment": {
                    "is_Bbc": self.data["is_Bbc"],
                },
            }
            record.update(self.get_info(detail))
            record.update(category_fields)
            record.update(get_ctime())
            export(EcCommentModel(record))
        page += 1
def crawl(self):
    """Crawl all comment pages for one item, tracking the site session cookie."""
    global COOKIE
    category_fields = extract_category(self)
    page = 1  # pages are 1-based
    last_page = 5  # corrected once page 1 is parsed
    while page <= last_page:
        stream = ProcessData.get_web_data(self.get_url(self.key, page))
        # Refresh the shared session cookie whenever the server rotates it.
        fresh_cookie = stream.headers.get("set-cookie", "")
        if COOKIE != fresh_cookie:
            COOKIE = fresh_cookie
        doc = etree.HTML(stream.content)
        if page == 1:
            last_page = self.get_PageSize(doc)
        for node in doc.xpath(self.xpath["item"]):
            record = {
                "eid": self.data["uuid"],
                "brand": self.data["brand"],
                "version": self.data["version"],
                "series": self.data["series"],
                "source": self.data["source"],
                "status": self.data["status"],
                "source_id": self.key,
                "comment": {
                    "is_Bbc": self.data["is_Bbc"],
                }
            }
            record.update(self.get_info(node))
            record.update(category_fields)
            record.update(get_ctime())
            export(EcCommentModel(record))
        page += 1
def crawl(self):
    """Crawl an Amazon detail page, export it, and queue the comment crawl."""
    global COOKIE
    category_fields = extract_category(self)
    response = self.get_response(self.key)
    # Keep the module-level cookie in sync with the latest server value.
    cookie = response.headers.get("set-cookie", "")
    if COOKIE != cookie:
        COOKIE = cookie
    record = {
        'source': "amazon",
        'source_id': self.key,
        'status': 1,
    }
    record.update(self.get_info(etree.HTML(response.text)))
    record.update(category_fields)
    record.update(get_ctime())
    model = EcDetailModel(record)
    export(model)
    comment_payload = {
        "uuid": model["id"],
        "brand": model["brand"],
        "version": model["version"],
        "series": model["series"],
        "is_Bbc": model["comment"]["is_Bbc"],
        'status': model["status"],
    }
    Scheduler.schedule(CommentCrawler.type, key=self.key, data=comment_payload)
def crawl(self): catId = str(self.key) category_data = extract_category(self) totalpage = self.get_page(catId) if totalpage == 0: return {} for i in range(1, totalpage + 1): url = self.get_url(catId, i) jsons = ProcessData.get_json_data(url) try: goodsList = jsons['goodsList'] except Exception, e: self.logger.error(url) self.logger.error(e) print "get goodsList fail" for j in range(len(goodsList)): goods = goodsList[j] goodsNo = goods['goodsNo'] goodsName = goods['goodsName'] skuID = goods['skuID'] goods_find = self.has_goods(goodsNo) if not goods_find: data = { 'priorcategory': self.data['priorcategory'], 'skuID': skuID, } Scheduler.schedule(DetailCrawler.type, key=goodsNo, data=data) continue adword = self.extract_adword(goods['ad']) crawl_data = { 'id': goods_find['uuid'], 'source_id': goodsNo, 'source': self.data.get('source'), 'title': goods['goodsName'], 'adword': adword, 'status': goods_find['status'], 'price': float(goods['lowestSalePrice']), 'brand': goods_find['brand'], 'version': goods_find['version'], 'series': goods_find['series'], 'comment': { 'is_Bbc': goods_find['isBbc'], 'skuId': goods_find['skuID'], }, } crawl_data.update(category_data) crawl_data.update(get_ctime()) model = EcBasicModel(crawl_data) export(model)
def crawl(self):
    """Page through an Amazon listing; export goods already tracked and
    schedule a DetailCrawler task for goods seen for the first time.
    """
    global COOKIE
    keyid = self.key
    category_data = extract_category(self)
    priorcategory = self.data["priorcategory"]
    # `count` is a placeholder page total until page 1 reveals the real one.
    count = 3
    page = 1  # pages are 1-based
    while page <= count:
        url = self.get_url(keyid, page)
        html_stream = ProcessData.get_web_data(url)
        # Track the rotating session cookie in the module-level COOKIE.
        if COOKIE != html_stream.headers.get("set-cookie", ""):
            COOKIE = html_stream.headers.get("set-cookie", "")
        html = etree.HTML(html_stream.content)
        if page == 1:
            count = self.getPageSize(html)
        items = html.xpath(self.xpath["item"])
        if not len(items):
            # An empty page plus a captcha input means we were throttled:
            # back off briefly and retry the same page number.
            if html.xpath("//input[@id='captchacharacters']"):
                time.sleep(random.randint(1, 3))
                continue
            else:
                # Genuinely empty page: drop this listing task.
                self.remove_task(keyid)
        for item in items:
            source_id = self.get_source_id(item)
            task_data = self.has_goods(source_id)
            if not task_data:
                # New commodity: hand it to the detail crawler.
                data = {
                    'priorcategory': priorcategory,
                }
                Scheduler.schedule(DetailCrawler.type, key=source_id, data=data)
            else:
                info = self.get_info(item)
                crawl_data = {
                    'id': task_data["uuid"],
                    'source_id': source_id,
                    'source': "amazon",
                    'brand': task_data["brand"],
                    'version': task_data["version"],
                    'series': task_data["series"],
                    'status': task_data["status"],
                    "comment": {
                        "is_Bbc": task_data["is_Bbc"],
                    }
                }
                crawl_data.update(info)
                crawl_data.update(category_data)
                crawl_data.update(get_ctime())
                model = EcBasicModel(crawl_data)
                export(model)
        page += 1
def get_crawl_data(self, item, **args):
    """Merge parsed item fields with task metadata into one crawl record.

    Expects `args` to carry 'task_data', 'source_id' and 'category_data'.
    """
    task_data = args['task_data']
    record = self.parse_data(item)
    record.update({
        'id': task_data['uuid'],
        'status': task_data['status'],
        'brand': task_data['brand'],
        'series': task_data['series'],
        'version': task_data['version'],
        'comment': {
            'is_Bbc': task_data['is_Bbc'],
        },
        'source_id': args['source_id'],
    })
    # Category fields are folded in before 'source' is pinned, matching the
    # original precedence of the updates.
    record.update(args['category_data'])
    record['source'] = self.data['source']
    record.update(get_ctime())
    return record
def crawl(self):
    """Page through a category's goods list (0-based pages); export goods
    already tracked, schedule detail crawls for new ones.
    """
    # BUGFIX: category_data was referenced below but never defined,
    # raising NameError on the first already-tracked item.
    category_data = extract_category(self)
    page_size = 0
    page = 0
    while page <= page_size:
        url = self.get_url(self.key, page)
        json_data = ProcessData.get_json_data(url)
        if page == 0:
            page_size = self.get_page_size(json_data)
        for goods in json_data["goods"]:
            source_id = goods["partnumber"]
            # BUGFIX: look up the goods itself, not the category key, in
            # line with the sibling list crawlers in this file.
            task_data = self.has_goods(source_id)
            if not task_data:
                data = {
                    "priorcategory": self.data["priorcategory"],
                    # status: 1 when saleStatus == 0, else 0 (site-specific mapping).
                    "status": 1 if int(goods["saleStatus"]) == 0 else 0,
                }
                Scheduler.schedule(DetailCrawler.type, key=source_id, data=data)
            else:
                crawl_data = {
                    "id": task_data["uuid"],
                    "source": self.data["source"],
                    "source_id": source_id,
                    "title": goods["catentdesc"],
                    "adword": extract_adword(goods.get("auxdescription", "")),
                    "price": float(goods["price"]),
                    'status': task_data['status'],
                    'brand': task_data['brand'],
                    'version': task_data['version'],
                    'series': task_data['series'],
                    'comment': {
                        'is_Bbc': task_data['is_Bbc'],
                    },
                }
                crawl_data.update(category_data)
                crawl_data.update(get_ctime())
                model = EcBasicModel(crawl_data)
                export(model)
        page += 1
def crawl(self):
    """Export every appraisal (comment) recorded for one commodity."""
    ecid = self.data['uuid']
    goodsNo = str(self.key)
    category_data = extract_category(self)
    totalpage = int(self.get_page(goodsNo))
    if totalpage == 0:
        return
    # NOTE(review): pages run 0..totalpage inclusive here, unlike the
    # 1-based loops elsewhere in this file — presumably this API is
    # 0-indexed; confirm before changing.
    for i in range(totalpage + 1):
        url = self.get_url(goodsNo, i)
        # Renamed from `json` to stop shadowing the stdlib module name;
        # also dropped five locals that were computed but never used.
        json_data = ProcessData.get_json_data(url)
        for item in json_data['appraiseArray']:
            comment_data = {
                'eid': ecid,  # commodity table foreign key
                'source_id': goodsNo,
                'source': self.data.get('source'),
                'comment_id': item['id'],  # review id
                'score': item['appraiseGrade'],  # commodity score
                'pubtime': ProcessData.str_datetime(item['appraiseTime']),
                'user_name': item['appraiseName'],
                'content': item['summary'],
                'brand': self.data['brand'],
                'version': self.data['version'],
                'series': self.data['series'],
                'comment': {
                    'is_Bbc': self.data['is_Bbc'],
                    'skuID': self.data['skuID'],
                }
            }
            comment_data.update(category_data)
            comment_data.update(get_ctime())
            model = EcCommentModel(comment_data)
            export(model)
def crawl(self):
    """Walk the category's product list; export tracked goods and schedule
    detail crawls for the rest.
    """
    cat_id = self.key
    category_fields = extract_category(self)
    page, page_count = 1, 1
    while page <= page_count:
        payload = self.get_response(cat_id, page)
        if page == 1:
            page_count = self.get_page_count(payload)
        for goods in payload['ProductListItems']:
            source_id = goods["Code"]
            task_data = self.has_goods(source_id)
            if not task_data:
                # Unknown commodity: queue its detail crawl and move on.
                Scheduler.schedule(
                    DetailCrawler.type,
                    key=source_id,
                    data={"priorcategory": self.data["priorcategory"]},
                )
                continue
            record = {
                "id": task_data["uuid"],
                "title": goods["Title"],
                "price": goods["Price"]["CurrentPrice"],
                "source_id": source_id,
                "source": self.data["source"],
                "status": task_data["status"],
                "brand": task_data["brand"],
                "version": task_data["version"],
                "series": task_data["series"],
                "comment": {
                    "is_Bbc": task_data["isBbc"],
                },
            }
            record.update(category_fields)
            record.update(get_ctime())
            export(EcBasicModel(record))
        page += 1
def crawl(self):
    """Export every commodity review page by page; stops as soon as a page
    comes back with no reviews.
    """
    category_fields = extract_category(self)
    last_page = self.get_page_size(self.key)
    page = 1
    while page <= last_page:
        json_data = ProcessData.get_json_data(self.get_url(self.key, page))
        reviews = json_data.get("commodityReviews", [])
        if not reviews:
            return
        for review in reviews:
            user_info = review.get("userInfo", {})
            record = {
                "comment_id": self.get_comment_id(review),
                "content": review["content"],
                "tags": self.get_tags(review),
                "show_pic": self.get_show_pic(review),
                "pubtime": self.get_pubtime(review),
                "score": float(review["qualityStar"]),
                "useful": int(review["usefulCnt"]),
                "reply": 1 if review.get("replyInfo", {}) else 0,
                "user_name": user_info.get("nickName", ""),
                "eid": self.data["uuid"],
                "brand": self.data["brand"],
                "version": self.data["version"],
                "series": self.data["series"],
                "source": self.data["source"],
                "source_id": self.key,
                "status": self.data["status"],
                "comment": {
                    "is_Bbc": self.data["is_Bbc"],
                },
            }
            record.update(category_fields)
            record.update(get_ctime())
            export(EcCommentModel(record))
        page += 1
def crawl(self):
    """Crawl one commodity's detail HTML plus its basic-info JSON, export a
    detail record per SKU, then schedule the comment crawl for the goods.
    """
    goodsNo = str(self.key)
    category_data = extract_category(self)
    # Summary/introduce are scraped from the detail HTML page.
    html = ProcessData.get_web_data(self.get_detail_url(goodsNo))
    tree = etree.HTML(html.text)
    xpath = {
        "introduce": "//div[@class='guigecanshu']/text()",
        "summary": "//ul[@id='prd_data']/li[2]/ul/li/span/text()",
    }
    summary = self.parse_summary(tree, xpath["summary"])
    introduce = self.parse_intr(tree, xpath["introduce"])
    version = get_version(summary, introduce)
    series = get_series(summary, introduce)
    brand = get_brand(summary, introduce)
    # Sale/Bbc flags and the SKU list come from the basic-info JSON endpoint.
    basic = ProcessData.get_json_data(self.get_basic_url(goodsNo))
    isBbc = "Y" if basic["isBbc"] in ("Y", "y") else "N"
    status = 0 if basic["onSale"] in ("N", "n") else 1
    for sku in basic['skuList']:
        detail_data = {
            'source': self.data.get('source'),
            'source_id': goodsNo,
            'summary': summary,
            'introduce': introduce,
            'name': sku['skuName'],
            'images': sku['skuSourceImgUrl'],
            'status': status,
            'brand': brand,
            'version': version,
            'series': series,
            'comment': {
                'is_Bbc': isBbc,
                'skuID': self.data['skuID'],
            },
        }
        detail_data.update(category_data)
        detail_data.update(get_ctime())
        model = EcDetailModel(detail_data)
        export(model)
    # NOTE(review): scheduled once per goodsNo (after the SKU loop) since the
    # schedule key is the same for every SKU; `model` is the last exported
    # SKU's record and is unbound if skuList is empty — confirm upstream
    # guarantees a non-empty skuList.
    comment_data = {
        'uuid': model["id"],
        'brand': brand,
        'version': version,
        'series': series,
        'is_Bbc': isBbc,
        'status': status,
        'priorcategory': self.data['priorcategory'],
        'skuID': self.data['skuID'],
    }
    Scheduler.schedule(CommentCrawler.type, key=goodsNo, data=comment_data)