def get_page(self, goodsNo):
    json = ProcessData.get_json_data(self.get_url(goodsNo, 1))
    try:
        totalpage = int(json['totalPage'])
    except Exception as e:
        self.logger.error(e)
        print "totalPage fail!"
        return 0
    return totalpage
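# ---------------------------------------------------------------------------
# The ProcessData helper is used by every crawler in this section but is not
# defined here. A minimal sketch of what the call sites imply follows: the
# class name and the get_json_data / get_web_data / str_datetime signatures
# come from the calls in this section, while the bodies (requests usage, the
# timeout, the empty-dict fallback, the timestamp format) are assumptions,
# not the project's actual code.
import datetime
import requests

class ProcessData(object):

    @staticmethod
    def get_json_data(url, parameter=None):
        # Fetch `url` (optionally with query parameters) and decode JSON.
        # Call sites compare the result against {}, so failures return {}.
        try:
            return requests.get(url, params=parameter, timeout=30).json()
        except Exception:
            return {}

    @staticmethod
    def get_web_data(url):
        # Plain HTTP GET; callers read the .text attribute of the result.
        return requests.get(url, timeout=30)

    @staticmethod
    def str_datetime(text):
        # Parse a timestamp such as '2015-06-01 12:30:00' (format assumed).
        return datetime.datetime.strptime(text, '%Y-%m-%d %H:%M:%S')
# ---------------------------------------------------------------------------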
def crawl(self):
    fid = self.key
    category_data = extract_category(self)
    count = 3   # placeholder page count until the first response arrives
    pages = 1   # start from the first page
    while pages <= count:
        url = self.get_url(fid, pages)
        try:
            jsons = ProcessData.get_json_data(url)
            if pages == 1:
                # derive the real page count from the total ware count;
                # float division, otherwise ceil() is a no-op in Python 2
                count = int(math.ceil(int(jsons['wareCount']) / 100.0))
            lists = jsons['wareInfo']
        except Exception as e:
            self.logger.error(url)
            self.logger.error(e)
            print 'error ', url
            return
        if lists == []:
            return {}
        for i in range(len(lists)):
            wareId = lists[i]['wareId']
            try:
                # convert a percentage string such as '95%' to a 0-1 score
                f = lambda x: int(x[:-1]) / 100.00
                ecsumscores = float(f(lists[i]['good']))  # overall product score
            except:
                ecsumscores = 0
            crawl_data = {
                'source_id': wareId,
                'source': self.data.get('source'),
                'summary': {},
                'title': lists[i]['wname'],
                'adword': lists[i]['adword'],
                'price': float(lists[i]['jdPrice']),
                'original_price': float(lists[i]['martPrice']),
                'score': ecsumscores
            }
            crawl_data.update(category_data)
            data = {
                'priorcategory': self.data['priorcategory'],
                'presentcategory': self.data['priorcategory']
            }
            model = EcBasicModel(crawl_data)
            export(model)
            data["uuid"] = model["id"]
            Scheduler.schedule(DetailCrawler.type, key=wareId, data=data)
            Scheduler.schedule(CommentCrawler.type, key=wareId, data=data)
        pages += 1
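# extract_category() is referenced throughout this section but not defined
# in it. Judging from the call sites, it turns the crawler's stored category
# path into fields merged into each exported record; a minimal sketch under
# that assumption (the 'priorcategory' key is the only field the call sites
# guarantee, anything more is a guess):
def extract_category(crawler):
    return {'priorcategory': crawler.data.get('priorcategory', [])}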
def crawl(self): url = "http://mobile.gome.com.cn/mobile/product/allCategorys.jsp" jsons = ProcessData.get_json_data(url) if jsons == {}: return {} category1 = jsons['firstLevelCategories'] for first_item in category1: name1 = first_item['goodsTypeName'] #1 lev name try: category2 = first_item['goodsTypeList'] except: pass for second_item in category2: name2 = second_item['goodsTypeName'] try: category3 = second_item['goodsTypeList'] except: pass for third_item in category3: try: third_id = third_item['goodsTypeId'] name3 = third_item['goodsTypeLongName'] except: pass priorcategory = [] priorcategory.append(name1) priorcategory.append(name2) priorcategory.append(name3) data = {'priorcategory': priorcategory} # if name3 != u"冰箱" and name3 != u"空调": # continue Scheduler.schedule(ListCrawler.type, key=third_id, data=data, interval=86400)
def crawl(self):
    json_data = ProcessData.get_json_data(self.get_json_url(self.key))
    is_Bbc = self.get_is_Bbc(json_data)
    status = self.get_status(json_data)
    response = self.get_response(self.key)
    tree = etree.HTML(response.text)
    info = self.get_info(tree)
    crawl_data = {
        "source": self.data["source"],
        "source_id": self.key,
        "status": status,
        "comment": {
            "is_Bbc": is_Bbc,
        },
    }
    crawl_data.update(info)
    crawl_data.update(extract_category(self))
    crawl_data.update(get_ctime())
    model = EcDetailModel(crawl_data)
    export(model)
    comment_data = {
        "uuid": model["id"],
        "status": model["status"],
        "version": model["version"],
        "series": model["series"],
        "brand": model["brand"],
        "is_Bbc": model["comment"]["is_Bbc"],
    }
    Scheduler.schedule(CommentCrawler.type, key=self.key, data=comment_data)
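# get_ctime() is merged into the crawl_data dicts of several crawlers but is
# not defined in this section. It presumably stamps each record with the
# crawl time; the key name 'ctime' and the epoch-seconds format here are
# assumptions, not the project's actual definition.
import time

def get_ctime():
    return {'ctime': int(time.time())}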
def get_data(self, CatID, pages):
    url = 'http://www.ows.newegg.com.cn/cat/%s' % (str(CatID))
    list_urls = {
        'page': str(pages),
        'pagesize': 20,
        'sort': 10
    }
    return ProcessData.get_json_data(url, parameter=list_urls)
def get_page(self, goodsNo):
    json = ProcessData.get_json_data(self.get_url(goodsNo, 1))
    try:
        totalnum = int(json['appraiseNumArray'][0])
    except Exception as e:
        self.logger.error(e)
        print "totalPage fail!"
        return 0
    # the original dropped the value on success; callers treat the result
    # as a page count, so return it
    return totalnum
def get_is_Bbc(self, key):
    url = "http://www.suning.com/emall/psl_10052_10051_000000000"\
          "%s_9135_11082" % key
    data = ProcessData.get_json_data(url)
    isCShop = data["shopList"][0]["isCShop"]
    if int(isCShop) == 1:
        return "N"
    return "Y"
def get_page(self, catId):
    json = ProcessData.get_json_data(self.get_url(catId, 1))
    try:
        totalpage = json['totalPage']
    except Exception as e:
        self.logger.error(e)
        print "totalPage fail!"
        return 0
    return totalpage
def get_page_size(self, key):
    url = "http://review.suning.com/mobile/getReviewCnt/general-000000000"\
          "%s------.htm" % key
    json_data = ProcessData.get_json_data(url)
    count = int(json_data["reviewCounts"][0].get("totalCount", "0"))
    if count % 10 == 0:
        return count / 10
    else:
        return count / 10 + 1
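# For reference, the branch in get_page_size() is integer ceil-division over
# 10 reviews per page; with Python 2 integer semantics and count >= 0 it is
# equivalent to the one-liner:
#     page_size = (count + 9) // 10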
def crawl(self): catId = str(self.key) category_data = extract_category(self) totalpage = self.get_page(catId) if totalpage == 0: return {} for i in range(1, totalpage + 1): url = self.get_url(catId, i) jsons = ProcessData.get_json_data(url) try: goodsList = jsons['goodsList'] except Exception, e: self.logger.error(url) self.logger.error(e) print "get goodsList fail" for j in range(len(goodsList)): goods = goodsList[j] goodsNo = goods['goodsNo'] goodsName = goods['goodsName'] skuID = goods['skuID'] goods_find = self.has_goods(goodsNo) if not goods_find: data = { 'priorcategory': self.data['priorcategory'], 'skuID': skuID, } Scheduler.schedule(DetailCrawler.type, key=goodsNo, data=data) continue adword = self.extract_adword(goods['ad']) crawl_data = { 'id': goods_find['uuid'], 'source_id': goodsNo, 'source': self.data.get('source'), 'title': goods['goodsName'], 'adword': adword, 'status': goods_find['status'], 'price': float(goods['lowestSalePrice']), 'brand': goods_find['brand'], 'version': goods_find['version'], 'series': goods_find['series'], 'comment': { 'is_Bbc': goods_find['isBbc'], 'skuId': goods_find['skuID'], }, } crawl_data.update(category_data) crawl_data.update(get_ctime()) model = EcBasicModel(crawl_data) export(model)
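# has_goods() (used above and in the Suning list crawler below) is not
# defined in this section. Its result is treated as either a previously
# exported record (uuid/status/brand/version/series/isBbc/skuID) or a falsy
# value, so it is presumably a lookup keyed by goods number. This body,
# including the hypothetical self.store.find_one API, is an assumption.
def has_goods(self, goods_no):
    return self.store.find_one({'source_id': goods_no})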
def crawl(self): url = "http://interface.m.yhd.com/ \ mcategory/servlet/CentralMobileFacadeJsonServlet/ \ getNavCategoryWithKeywordByRootCategoryId? \ rootCategoryId=0&categoryNavId=0&provinceId=1" try: jsons = ProcessData.get_json_data(url.replace(' ','')) data = jsons['data'] except Exception,e: self.logger.error(url) self.logger.error(e) print 'error ',url
def crawl(self): url = "http://interface.m.yhd.com/ \ mcategory/servlet/CentralMobileFacadeJsonServlet/ \ getNavCategoryWithKeywordByRootCategoryId? \ rootCategoryId=0&categoryNavId=0&provinceId=1" try: jsons = ProcessData.get_json_data(url.replace(' ','')) data = jsons['data'] except Exception,e: self.logger.error(url) self.logger.error(e) print 'error ',url
def crawl(self): url = 'http://www.ows.newegg.com.cn/category.egg' jsons = ProcessData.get_json_data(url) for item1 in jsons: CatName1 = item1['CatName'].replace(" ", "") for item2 in item1['SubCategories']: CatName2 = item2['CatName'].replace(" ", "") for item3 in item2['SubCategories']: priorcategory = [] priorcategory.extend([ CatName1, CatName2, item3['CatName'].replace(" ", "") ]) self.handle(item3['CatID'], priorcategory)
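# handle() is called by the Newegg category crawlers but not defined in this
# section. By analogy with the Gome and Suning category crawlers, it most
# likely schedules the list crawler for the leaf category id; this body is
# an assumption, not the project's actual implementation.
def handle(self, cat_id, priorcategory):
    data = {'priorcategory': priorcategory}
    Scheduler.schedule(ListCrawler.type, key=cat_id, data=data, interval=86400)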
def crawl(self):
    goodsNo = str(self.key)
    ids = self.data.get('uuid')
    category_data = extract_category(self)
    url = self.get_detail_url(goodsNo)
    html = ProcessData.get_web_data(url)
    tree = etree.HTML(html.text)
    # collect the spec table into a {label: value} dict
    standard = {}
    r1 = tree.xpath("//table[@class='parameter']/tbody/tr")
    for x in r1:
        m1 = x.xpath("td[@class='bg']")
        m2 = x.xpath("td[@class='bgv']")
        if len(m1) != 0 and len(m2) != 0:
            standard[m1[0].text] = m2[0].text
    # packing-list and after-sale blurbs (extracted but not exported yet)
    rpack = tree.xpath("//div[@class='wap_tab_con']/div[3]")
    ecparkinglist = rpack[0].text
    rafter = tree.xpath("//div[@class='wap_tab_con']/div[4]")
    ecaftersale = rafter[0].text
    # u'品牌' is the 'brand' spec label
    ecbrands = standard[u'品牌'] if standard.get(u'品牌') else ''
    json = ProcessData.get_json_data(self.get_basic_url(goodsNo))
    skulist = json['skuList']
    for sku in skulist:
        ecname = sku['skuName']
        skuid = sku['skuID']
        source_id = goodsNo + '-' + skuid
        crawl_data = {
            'id': ids,
            'source': self.data.get('source'),
            'source_id': source_id,
            'summary': standard,
            'introduce': {},
            'name': ecname,
            'brand': ecbrands
        }
        crawl_data.update(category_data)
        model = EcDetailModel(crawl_data)
        export(model)
def crawl(self):
    url = 'http://www.ows.newegg.com.cn/category.egg'
    try:
        jsons = ProcessData.get_json_data(url)
    except:
        print 'error ', url
        return
    for item1 in jsons:
        CatName1 = item1['CatName']
        for item2 in item1['SubCategories']:
            CatName2 = item2['CatName']
            for item3 in item2['SubCategories']:
                priorcategory = []
                priorcategory.extend([CatName1, CatName2, item3['CatName']])
                self.handle(item3['CatID'], priorcategory)
def crawl(self): start_urls = "http://gw.m.360buy.com/client.action?functionId=catelogy&body=" sencond_urls = { 'catelogyId': '0', 'isDescription': 'true', 'isIcon': 'true', 'level': '0' } url = start_urls + quote(str(sencond_urls)) try: jsons = ProcessData.get_json_data(url) lists = jsons['catelogyList'] except Exception,e: self.logger.error(url) self.logger.error(e) print 'error ',url return
def crawl(self): catId = str(self.key) category_data = extract_category(self) totalpage = self.get_page(catId) if totalpage == 0: return {} for i in range(1, totalpage + 1): url = self.get_url(catId, i) jsons = ProcessData.get_json_data(url) try: goodsList = jsons['goodsList'] except Exception, e: self.logger.error(url) self.logger.error(e) print "get goodsList fail" for j in range(len(goodsList)): goods = goodsList[j] goodsName = goods['goodsName'] goodsNo = goods['goodsNo'] skuID = goods['skuID'] # print goodsNo # print skuID crawl_data = { # 'id': uuid.uuid1(), 'source_id': goodsNo, 'source': self.data.get('source'), 'title': goods['goodsName'], 'adword': goods['ad'], 'price': float(goods['lowestSalePrice']), 'original_price': float(goods['highestSalePrice']), #'score': ecsumscores } crawl_data.update(category_data) model = EcBasicModel(crawl_data) export(model) data = { 'priorcategory': self.data['priorcategory'], 'presentcategory': self.data['priorcategory'] } data["uuid"] = model["id"] Scheduler.schedule(DetailCrawler.type, key=goodsNo, data=data) Scheduler.schedule(CommentCrawler.type, key=goodsNo, data=data)
def crawl(self): page_size = 0 page = 0 while page <= page_size: url = self.get_url(self.key, page) json_data = ProcessData.get_json_data(url) if page == 0: page_size = self.get_page_size(json_data) for goods in json_data["goods"]: source_id = goods["partnumber"] task_data = self.has_goods(self.key) if not task_data: data = { "priorcategory": self.data["priorcategory"], "status": 1 if int(goods["saleStatus"]) == 0 else 0, } Scheduler.schedule(DetailCrawler.type, key=source_id, data=data) else: crawl_data = { "id": task_data["uuid"], "source": self.data["source"], "source_id": source_id, "title": goods["catentdesc"], "adword": extract_adword(goods.get("auxdescription", "")), "price": float(goods["price"]), 'status': task_data['status'], 'brand': task_data['brand'], 'version': task_data['version'], 'series': task_data['series'], 'comment': { 'is_Bbc': task_data['is_Bbc'], }, } crawl_data.update(category_data) crawl_data.update(get_ctime()) model = EcBasicModel(crawl_data) export(model) page += 1
def crawl(self): url = "http://mobile.gome.com.cn/mobile/product/allCategorys.jsp" jsons = ProcessData.get_json_data(url) if jsons == {}: return {} category1 = jsons['firstLevelCategories'] for first_item in category1: name1 = first_item['goodsTypeName'] #1 lev name try: category2 = first_item['goodsTypeList'] except: pass for second_item in category2: name2 = second_item['goodsTypeName'] #print name try: category3 = second_item['goodsTypeList'] except: pass for third_item in category3: try: third_id = third_item['goodsTypeId'] name3 = third_item['goodsTypeLongName'] except: pass # print third_id # print name3.encode('utf-8') priorcategory = [] priorcategory.append(name1) priorcategory.append(name2) priorcategory.append(name3) #presentcategory = priorcategory data = { 'priorcategory': priorcategory #'presentcategory':presentcategory } Scheduler.schedule(ListCrawler.type, key=third_id, data=data)
def crawl(self):
    ecid = self.data['uuid']
    goodsNo = str(self.key)
    category_data = extract_category(self)
    totalpage = int(self.get_page(goodsNo))
    if totalpage == 0:
        return
    for i in range(totalpage + 1):
        url = self.get_url(goodsNo, i)
        json = ProcessData.get_json_data(url)
        appraise = json['appraiseArray']
        for item in appraise:
            comment_data = {
                'eid': ecid,  # commodity table foreign key
                'source_id': goodsNo,
                'source': self.data.get('source'),
                'comment_id': item['id'],  # review id
                'score': item['appraiseGrade'],  # commodity score
                'pubtime': ProcessData.str_datetime(item['appraiseTime']),
                'user_name': item['appraiseName'],
                'content': item['summary'],
                'brand': self.data['brand'],
                'version': self.data['version'],
                'series': self.data['series'],
                'comment': {
                    'is_Bbc': self.data['is_Bbc'],
                    'skuID': self.data['skuID'],
                }
            }
            comment_data.update(category_data)
            comment_data.update(get_ctime())
            model = EcCommentModel(comment_data)
            export(model)
def crawl(self):
    ecid = self.data['uuid']
    goodsNo = str(self.key)
    category_data = extract_category(self)
    totalpage = int(self.get_page(goodsNo))
    if totalpage == 0:
        print "get_page fail"
        return {}
    for i in range(totalpage):
        url = self.get_url(goodsNo, i)
        json = ProcessData.get_json_data(url)
        try:
            appraise = json['appraiseArray']
        except Exception as e:
            self.logger.error(url)
            self.logger.error(e)
            print "get appraise fail"
            continue  # appraise is unbound here; skip this page
        for item in appraise:
            comment_data = {
                'ecid': ecid,  # commodity table foreign key
                'source_id': goodsNo,
                'source': self.data.get('source'),
                'comment_id': item['id'],  # review id
                'score': item['appraiseGrade'],  # commodity score
                'pubtime': ProcessData.str_datetime(item['appraiseTime']),
                'user_id': item['appraiseName'],
                'content': item['summary']
            }
            comment_data.update(category_data)
            model = EcCommentModel(comment_data)
            export(model)
def crawl(self):
    response = ProcessData.get_json_data(self.url)
    first_objs = response["shopKindInfo"]
    for first_obj in first_objs:
        for second_obj in first_obj["kindList2"]:
            for third_obj in second_obj["kindList3"]:
                key = third_obj["ci"]
                first_cat = first_obj["kindName"].replace(" ", "")
                second_cat = second_obj["kindName"].replace(" ", "")
                third_cat = third_obj["kindName"].replace(" ", "")
                data = {
                    "priorcategory": [
                        first_cat,
                        second_cat,
                        third_cat,
                    ]
                }
                # if second_cat != u"冰箱" and second_cat != u"空调":
                #     continue
                Scheduler.schedule(ListCrawler.type, key=key,
                                   data=data, interval=86400)
def crawl(self):
    fid = self.key
    categorys = self.data['priorcategory']
    start_urls = "http://gw.m.360buy.com/client.action?functionId=catelogy&body="
    third_urls = {
        'catelogyId': str(fid),
        'isDescription': 'false',
        'isIcon': 'false',
        'level': '2'
    }
    url = start_urls + quote(str(third_urls))
    try:
        jsons = ProcessData.get_json_data(url)
        lists = jsons['catelogyList']
    except Exception as e:
        self.logger.error(url)
        self.logger.error(e)
        print 'error ', url
        return
def crawl(self):
    category_data = extract_category(self)
    page_size = self.get_page_size(self.key)
    page = 1
    while page <= page_size:
        json_data = ProcessData.get_json_data(self.get_url(self.key, page))
        reviews = json_data.get("commodityReviews", [])
        if not reviews:
            return
        for review in reviews:
            crawl_data = {
                "comment_id": self.get_comment_id(review),
                "content": review["content"],
                "tags": self.get_tags(review),
                "show_pic": self.get_show_pic(review),
                "pubtime": self.get_pubtime(review),
                "score": float(review["qualityStar"]),
                "useful": int(review["usefulCnt"]),
                "reply": 1 if review.get("replyInfo", {}) else 0,
                "user_name": review.get("userInfo", {}).get("nickName", ""),
                "eid": self.data["uuid"],
                "brand": self.data["brand"],
                "version": self.data["version"],
                "series": self.data["series"],
                "source": self.data["source"],
                "source_id": self.key,
                "status": self.data["status"],
                "comment": {
                    "is_Bbc": self.data["is_Bbc"],
                },
            }
            crawl_data.update(category_data)
            crawl_data.update(get_ctime())
            model = EcCommentModel(crawl_data)
            export(model)
        page += 1
def crawl(self):
    fid = self.key
    categorys = self.data['priorcategory']
    start_urls = "http://gw.m.360buy.com/client.action?functionId=catelogy&body="
    second_urls = {
        'catelogyId': str(fid),
        'isDescription': 'true',
        'isIcon': 'true',
        'level': '1'
    }
    url = start_urls + quote(str(second_urls))
    try:
        jsons = ProcessData.get_json_data(url)
        lists = jsons['catelogyList']
    except:
        print 'error ', url
        return
    if lists == []:
        return {}
    for i in range(len(lists)):
        cid = lists[i]['cid']
        priorcategory = []
        priorcategory.extend(categorys)
        priorcategory.append(extract_title(lists[i]['name']))
        data = {
            'priorcategory': priorcategory,
        }
        Scheduler.schedule(ThirdCrawler.type, key=cid, data=data)
def get_more_list(self, key, page):
    url = self.get_url(key, page, 1)
    json_data = ProcessData.get_json_data(url)
    tree = etree.HTML(json_data['value'])
    dom = tree.xpath(self.search_list_xpath('list'))
    return dom
def get_response(self, CatID, pages):
    url = 'http://www.ows.newegg.com.cn/cat/%s' % (str(CatID))
    list_urls = {'page': str(pages), 'pagesize': 20, 'sort': 10}
    return ProcessData.get_json_data(url, parameter=list_urls)
def crawl(self):
    goodsNo = str(self.key)
    category_data = extract_category(self)
    url = self.get_detail_url(goodsNo)
    html = ProcessData.get_web_data(url)
    tree = etree.HTML(html.text)
    xpath = {
        "introduce": "//div[@class='guigecanshu']/text()",
        "summary": "//ul[@id='prd_data']/li[2]/ul/li/span/text()",
    }
    summary = self.parse_summary(tree, xpath["summary"])
    introduce = self.parse_intr(tree, xpath["introduce"])
    version = get_version(summary, introduce)
    series = get_series(summary, introduce)
    brand = get_brand(summary, introduce)
    json = ProcessData.get_json_data(self.get_basic_url(goodsNo))
    isBbc = "Y" if json["isBbc"] in ("Y", "y") else "N"
    status = 0 if json["onSale"] in ("N", "n") else 1
    skulist = json['skuList']
    for sku in skulist:
        ecname = sku['skuName']
        ecimglist = sku['skuSourceImgUrl']
        detail_data = {
            'source': self.data.get('source'),
            'source_id': goodsNo,
            'summary': summary,
            'introduce': introduce,
            'name': ecname,
            'images': ecimglist,
            'status': status,
            'brand': brand,
            'version': version,
            'series': series,
            'comment': {
                'is_Bbc': isBbc,
                'skuID': self.data['skuID'],
            },
        }
        detail_data.update(category_data)
        detail_data.update(get_ctime())
        model = EcDetailModel(detail_data)
        export(model)
        comment_data = {
            'uuid': model["id"],
            'brand': brand,
            'version': version,
            'series': series,
            'is_Bbc': isBbc,
            'status': status,
            'priorcategory': self.data['priorcategory'],
            'skuID': self.data['skuID'],
        }
        Scheduler.schedule(CommentCrawler.type, key=goodsNo, data=comment_data)