def crawl(self):
    # Fetch the detail JSON first, then the HTML page, and merge both.
    json_data = ProcessData.get_json_data(self.get_json_url(self.key))
    is_Bbc = self.get_is_Bbc(json_data)
    status = self.get_status(json_data)
    response = self.get_response(self.key)
    tree = etree.HTML(response.text)
    info = self.get_info(tree)
    crawl_data = {
        "source": self.data["source"],
        "source_id": self.key,
        "status": status,
        "comment": {
            "is_Bbc": is_Bbc,
        },
    }
    crawl_data.update(info)
    crawl_data.update(extract_category(self))
    crawl_data.update(get_ctime())
    model = EcDetailModel(crawl_data)
    export(model)
    # Hand the persisted detail record off to the comment crawler.
    comment_data = {
        "uuid": model["id"],
        "status": model["status"],
        "version": model["version"],
        "series": model["series"],
        "brand": model["brand"],
        "is_Bbc": model["comment"]["is_Bbc"],
    }
    Scheduler.schedule(CommentCrawler.type, key=self.key, data=comment_data)
def crawl(self):
    category_data = extract_category(self)
    page = 1
    page_count = 1
    while page <= page_count:
        json_data = self.get_response(self.key, page)
        if page == 1:
            page_count = self.get_page_count(json_data)
        for item in json_data["ProductReviewList"]:
            review = item["ReviewDetail"]
            info = self.get_info(review)
            crawl_data = {
                "eid": self.data["uuid"],
                "brand": self.data["brand"],
                "version": self.data["version"],
                "series": self.data["series"],
                "source": self.data["source"],
                "source_id": self.key,
                "status": self.data["status"],
                "comment": {
                    "is_Bbc": self.data["is_Bbc"],
                },
            }
            crawl_data.update(info)
            crawl_data.update(category_data)
            crawl_data.update(get_ctime())
            model = EcCommentModel(crawl_data)
            export(model)
        page += 1
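# The paging idiom shared by the list and comment crawlers in this module,
# as a minimal standalone sketch: the true page count is only known after
# the first response, so it starts at 1 and is overwritten on page 1.
# fetch_page, parse_page_count and handle are hypothetical stand-ins.
def paged_crawl(fetch_page, parse_page_count, handle):
    page = 1
    page_count = 1
    while page <= page_count:
        payload = fetch_page(page)
        if page == 1:
            page_count = parse_page_count(payload)
        handle(payload)
        page += 1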
def crawl(self):
    global COOKIE
    category_data = extract_category(self)
    response = self.get_response(self.key)
    # Keep the module-level cookie in sync with the latest response.
    if COOKIE != response.headers.get("set-cookie", ""):
        COOKIE = response.headers.get("set-cookie", "")
    tree = etree.HTML(response.text)
    info = self.get_info(tree)
    crawl_data = {
        'source': "amazon",
        'source_id': self.key,
        'status': 1,
    }
    crawl_data.update(info)
    crawl_data.update(category_data)
    crawl_data.update(get_ctime())
    model = EcDetailModel(crawl_data)
    export(model)
    comment_data = {
        "uuid": model["id"],
        "brand": model["brand"],
        "version": model["version"],
        "series": model["series"],
        "is_Bbc": model["comment"]["is_Bbc"],
        "status": model["status"],
    }
    Scheduler.schedule(CommentCrawler.type, key=self.key, data=comment_data)
def crawl(self):
    global COOKIE
    category_data = extract_category(self)
    page = 1        # start from the first page
    pageSize = 5    # initial page count; replaced by the real value on page 1
    while page <= pageSize:
        newurl = self.get_url(self.key, page)
        html_stream = ProcessData.get_web_data(newurl)
        # Keep the module-level cookie in sync with the latest response.
        if COOKIE != html_stream.headers.get("set-cookie", ""):
            COOKIE = html_stream.headers.get("set-cookie", "")
        html = etree.HTML(html_stream.content)
        if page == 1:
            pageSize = self.get_PageSize(html)
        items = html.xpath(self.xpath["item"])
        for item in items:
            info = self.get_info(item)
            crawl_data = {
                "eid": self.data["uuid"],
                "brand": self.data["brand"],
                "version": self.data["version"],
                "series": self.data["series"],
                "source": self.data["source"],
                "status": self.data["status"],
                "source_id": self.key,
                "comment": {
                    "is_Bbc": self.data["is_Bbc"],
                }
            }
            crawl_data.update(info)
            crawl_data.update(category_data)
            crawl_data.update(get_ctime())
            model = EcCommentModel(crawl_data)
            export(model)
        page += 1
def crawler_data(self, tree):
    category_data = extract_category(self)
    info = self.get_info(tree)
    summary = info["summary"]
    introduce = info["introduce"]
    images = info["images"]
    images = self.convert_img(images)
    brand = self.get_brand(summary, introduce, tree)
    version = get_version(summary, introduce)
    series = get_series(summary, introduce)
    crawl_data = {
        'source': self.data.get('source'),
        'source_id': str(self.key),
        'name': info['name'],
        'images': images,
        'intro_img': info['intro_img'],
        'summary': summary,
        'introduce': introduce,
        'status': info['status'],
        'version': version,
        'brand': brand,
        'series': series,
        'comment': {
            'is_Bbc': info['is_Bbc'],
        },
    }
    crawl_data.update(category_data)
    crawl_data.update(get_ctime())
    return crawl_data
def crawl(self):
    key = self.key
    category_data = extract_category(self)
    count = 3   # initial page count; replaced by the real value on page 1
    page = 1
    while page <= count:
        response = self.get_response(key, page)
        tree = etree.HTML(response.text)
        if page == 1:
            count = self.get_count(tree)
        items = tree.xpath(self.xpath["item"])
        for item in items:
            info = self.get_info(item)
            crawl_data = {
                'eid': self.data['uuid'],
                'source_id': self.data['source_id'],
                'brand': self.data['brand'],
                'series': self.data['series'],
                'version': self.data['version'],
                'source': self.data['source'],
                'status': self.data["status"],
                'comment': {
                    'is_Bbc': self.data['is_Bbc'],
                }
            }
            crawl_data.update(info)
            crawl_data.update(category_data)
            crawl_data.update(get_ctime())
            model = EcCommentModel(crawl_data)
            export(model)
        page += 1
def crawl(self):
    # fid = '1662'
    # priorcategory = ["家居家装","清洁用品","衣物清洁"]
    # presentcategory = ['1','2','3']
    fid = self.key
    category_data = extract_category(self)
    count = 3   # initial page count; replaced by the real value on page 1
    pages = 1   # start from the first page
    while pages <= count:
        url = self.get_url(fid, pages)
        try:
            jsons = ProcessData.get_json_data(url)
            if pages == 1:
                # Use float division so ceil rounds partial pages up
                # (integer division made the original ceil a no-op).
                count = int(math.ceil(int(jsons['wareCount']) / 100.0))
            lists = jsons['wareInfo']
        except Exception, e:
            self.logger.error(url)
            self.logger.error(e)
            print 'error ', url
            return
        if lists == []:
            return {}
        for i in range(len(lists)):
            ids = uuid.uuid1()  # cassandra primary key
            wareId = lists[i]['wareId']
            try:
                # 'good' looks like '95%'; convert it to a 0-1 score.
                f = lambda x: int(x[:-1]) / 100.00
                ecsumscores = float(f(lists[i]['good']))    # overall product score
            except:
                ecsumscores = 0
            crawl_data = {
                # 'id': uuid.uuid1(),
                'source_id': wareId,
                'source': self.data.get('source'),
                'summary': {},
                'title': lists[i]['wname'],
                'adword': lists[i]['adword'],
                'price': float(lists[i]['jdPrice']),
                'original_price': float(lists[i]['martPrice']),
                'score': ecsumscores
            }
            crawl_data.update(category_data)
            data = {
                # 'uuid': ids,
                'priorcategory': self.data['priorcategory'],
                'presentcategory': self.data['priorcategory']
                # 'presentcategory': self.data['presentcategory']
            }
            model = EcBasicModel(crawl_data)
            export(model)
            data["uuid"] = model["id"]
            Scheduler.schedule(DetailCrawler.type, key=wareId, data=data)
            Scheduler.schedule(CommentCrawler.type, key=wareId, data=data)
        pages += 1
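# A minimal standalone sketch of the score conversion above: JD reports
# 'good' as a percentage string such as '95%', which is mapped to a
# 0-1 float score.
f = lambda x: int(x[:-1]) / 100.00
assert f('95%') == 0.95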
def crawler_data(self, tree):
    category_data = extract_category(self)
    XPATH = self.search_list_xpath
    # Fall back to the product-list layout when the search layout matches nothing.
    if len(tree.xpath(XPATH('list'))) == 0:
        XPATH = self.product_list_xpath
    dom = tree.xpath(XPATH('list'))
    for item in dom:
        crawl_data = {}
        craw = [
            'title', 'adword',
            'price', 'original_price',
            'source_id', 'score',
        ]
        for value in craw:
            crawl_data[value] = self.mackining(item.xpath(XPATH(value)))
        crawl_data['price'] = float(crawl_data['price'])
        try:
            # Score arrives as a percentage string such as '95%'.
            f = lambda x: int(x[:-1]) / 100.00
            crawl_data['score'] = float(f(crawl_data['score']))
        except:
            crawl_data['score'] = 0
        crawl_data.update(category_data)
        crawl_data['source'] = 'yhd'
        model = EcBasicModel(crawl_data)
        export(model)
        data = {
            'priorcategory': self.data['priorcategory'],
            'presentcategory': self.data['priorcategory']
        }
        data["uuid"] = model["id"]
        Scheduler.schedule(DetailCrawler.type, key=str(self.key), data=data)
def crawl(self):
    # wareId = '1229271'
    # priorcategory = ["家居家装","清洁用品","衣物清洁"]
    # presentcategory = ['1','2','3']
    # ecid = '124'
    wareId = self.key
    ecid = self.data['uuid']
    category_data = extract_category(self)
    pages = 1
    count = True
    while count:
        number = 0  # per-page duplicate counter
        url = self.get_url(wareId, pages)
        html_stream = ProcessData.get_web_data(url)
        try:
            tree = etree.HTML(html_stream.text)
        except:
            print 'error: ', url
            break
        xpath = "//div[@id='comments-list']/div[@class='mc']"
        dom = tree.xpath(xpath)
        if dom == []:
            count = False
            continue
        for item in dom:
            datas = self.handle(item)
            comment_data = {
                # 'uuid': uuid.uuid1(),  # primary key
                'ecid': ecid,   # commodity table foreign key
                'source_id': wareId,
                'source': self.data.get('source'),
                'comment_id': datas['commentid'],   # review id
                'score': datas['score'],            # commodity score
                'pubtime': datas['commenttime'],
                'buytime': datas['buytime'],
                'user_id': datas['url'],
                # 'usernickName': groups[i]['usernickName'],
                'useful': datas['useful'],
                'reply': datas['reply'],
                'content': datas['comment'],
                'province': datas['province']
            }
            comment_data.update(category_data)
            model = EcCommentModel(comment_data)
            is_saved = export(model)
            if not is_saved:
                number += 1
        # Stop paging once a page yields more than 10 already-saved
        # comments: we have caught up with earlier crawls.
        if number > 10:
            break
        pages += 1
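# The stop-on-duplicates idiom above, as a minimal standalone sketch.
# It assumes export() returns True only when a new record was saved;
# fetch_page and build_model are hypothetical stand-ins.
def crawl_until_seen(fetch_page, build_model, threshold=10):
    pages = 1
    while True:
        items = fetch_page(pages)
        if not items:
            break
        duplicates = 0
        for item in items:
            if not export(build_model(item)):
                duplicates += 1
        if duplicates > threshold:
            break   # mostly already-saved comments: we have caught up
        pages += 1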
def crawl(self):
    catId = str(self.key)
    category_data = extract_category(self)
    totalpage = self.get_page(catId)
    if totalpage == 0:
        return {}
    for i in range(1, totalpage + 1):
        url = self.get_url(catId, i)
        jsons = ProcessData.get_json_data(url)
        try:
            goodsList = jsons['goodsList']
        except Exception, e:
            self.logger.error(url)
            self.logger.error(e)
            print "get goodsList fail"
            continue    # goodsList is undefined here, so skip this page
        for j in range(len(goodsList)):
            goods = goodsList[j]
            goodsNo = goods['goodsNo']
            goodsName = goods['goodsName']
            skuID = goods['skuID']
            goods_find = self.has_goods(goodsNo)
            if not goods_find:
                # Unknown goods: schedule a detail crawl instead of exporting.
                data = {
                    'priorcategory': self.data['priorcategory'],
                    'skuID': skuID,
                }
                Scheduler.schedule(DetailCrawler.type, key=goodsNo, data=data)
                continue
            adword = self.extract_adword(goods['ad'])
            crawl_data = {
                'id': goods_find['uuid'],
                'source_id': goodsNo,
                'source': self.data.get('source'),
                'title': goods['goodsName'],
                'adword': adword,
                'status': goods_find['status'],
                'price': float(goods['lowestSalePrice']),
                'brand': goods_find['brand'],
                'version': goods_find['version'],
                'series': goods_find['series'],
                'comment': {
                    'is_Bbc': goods_find['isBbc'],
                    'skuId': goods_find['skuID'],
                },
            }
            crawl_data.update(category_data)
            crawl_data.update(get_ctime())
            model = EcBasicModel(crawl_data)
            export(model)
def crawl(self):
    global COOKIE
    keyid = self.key
    category_data = extract_category(self)
    priorcategory = self.data["priorcategory"]
    count = 3   # initial page count; replaced by the real value on page 1
    page = 1    # start from the first page
    while page <= count:
        url = self.get_url(keyid, page)
        html_stream = ProcessData.get_web_data(url)
        if COOKIE != html_stream.headers.get("set-cookie", ""):
            COOKIE = html_stream.headers.get("set-cookie", "")
        html = etree.HTML(html_stream.content)
        if page == 1:
            count = self.getPageSize(html)
        items = html.xpath(self.xpath["item"])
        if not len(items):
            # An empty grid plus a captcha input means we were throttled:
            # back off briefly and retry the same page.
            if html.xpath("//input[@id='captchacharacters']"):
                time.sleep(random.randint(1, 3))
                continue
            else:
                self.remove_task(keyid)
        for item in items:
            source_id = self.get_source_id(item)
            task_data = self.has_goods(source_id)
            if not task_data:
                data = {
                    'priorcategory': priorcategory,
                }
                Scheduler.schedule(DetailCrawler.type, key=source_id, data=data)
            else:
                info = self.get_info(item)
                crawl_data = {
                    'id': task_data["uuid"],
                    'source_id': source_id,
                    'source': "amazon",
                    'brand': task_data["brand"],
                    'version': task_data["version"],
                    'series': task_data["series"],
                    'status': task_data["status"],
                    "comment": {
                        "is_Bbc": task_data["is_Bbc"],
                    }
                }
                crawl_data.update(info)
                crawl_data.update(category_data)
                crawl_data.update(get_ctime())
                model = EcBasicModel(crawl_data)
                export(model)
        page += 1
def crawl(self):
    # wareId = '1229271'
    # wareId = '1391817787'
    # priorcategory = ["家居家装","清洁用品","衣物清洁"]
    # presentcategory = ['1','2','3']
    # ids = uuid.uuid1()
    wareId = self.key
    ids = self.data.get('uuid')
    category_data = extract_category(self)
    url = 'http://m.360buy.com/product/guige/%s.html' % (str(wareId))
    html_stream = ProcessData.get_web_data(url)
    tree = etree.HTML(html_stream.text)
    xpath = "//table[@class='Ptable']/tr/td/text()"
    dom = tree.xpath(xpath)
    # The spec table flattens to alternating label/value text nodes:
    # even indexes are labels, odd indexes are the matching values.
    specifications = {}
    temporary = ''
    i = 0
    for item in dom:
        item = item.strip()
        if item == '':
            continue
        if i % 2 == 0:
            # Record the label immediately (with an empty value) and keep
            # the normalized title for the value node that follows.
            specifications[item] = ''
            temporary = extract_title(item)
        else:
            specifications[temporary] = extract_text(item)
        i += 1
    data = {
        'ecnorms': specifications
    }
    # specifications = json.dumps(specifications, ensure_ascii=False)
    introduce = IntroduceCrawler.crawl(wareId, ids)
    ecbrands = introduce[u'品牌'] if introduce.get(u'品牌') else ''
    # ecnames = introduce[u'商品名称'].replace('\'',' ') if introduce.get(u'商品名称') else ''
    ecnames = introduce[u'商品名称'] if introduce.get(u'商品名称') else ''
    crawl_data = {
        'id': ids,
        'source': self.data.get('source'),
        'source_id': wareId,
        'summary': specifications,
        'introduce': introduce,
        'name': ecnames,
        'brand': ecbrands
    }
    crawl_data.update(category_data)
    model = EcDetailModel(crawl_data)
    export(model)
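# A minimal standalone sketch of the even/odd pairing above, assuming the
# spec table flattens to [label, value, label, value, ...] once empty text
# nodes are stripped (sample values are hypothetical):
texts = [u'品牌', u'LG', u'型号', u'WD-T12410D']
specs = dict(zip(texts[0::2], texts[1::2]))
assert specs[u'品牌'] == u'LG'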
def crawl(self):
    skulist = []
    goodsNo = str(self.key)
    ids = self.data.get('uuid')
    category_data = extract_category(self)
    url = self.get_detail_url(goodsNo)
    html = ProcessData.get_web_data(url)
    tree = etree.HTML(html.text)
    r = tree.xpath(
        "//div[@class='wap_tab_con']/div[2]/table[@class='parameter']/tbody/tr"
    )
    i = len(r)
    # Specification rows: td.bg holds the name, td.bgv the value.
    standard = {}
    r1 = tree.xpath("//table[@class='parameter']/tbody/tr")
    for x in r1:
        m1 = x.xpath("td[@class='bg']")
        m2 = x.xpath("td[@class='bgv']")
        if len(m1) != 0 and len(m2) != 0:
            standard[m1[0].text] = m2[0].text
    rpack = tree.xpath("//div[@class='wap_tab_con']/div[3]")
    ecparkinglist = rpack[0].text   # packing list text
    rafter = tree.xpath("//div[@class='wap_tab_con']/div[4]")
    ecaftersale = rafter[0].text    # after-sale service text
    ecbrands = standard[u'品牌'] if standard.get(u'品牌') else ''
    json = ProcessData.get_json_data(self.get_basic_url(goodsNo))
    skulist = json['skuList']
    for sku in skulist:
        ecnowprice = sku['skuPrice']
        ecnmaket = sku['skuPriceDesc']
        ecname = sku['skuName']
        adword = sku['promWords']
        skuid = sku['skuID']
        ecimglist = sku['skuSourceImgUrl']
        source_id = goodsNo + '-' + skuid
        crawl_data = {
            'id': ids,
            'source': self.data.get('source'),
            'source_id': source_id,
            'summary': standard,
            'introduce': {},
            'name': ecname,
            'brand': ecbrands
        }
        crawl_data.update(category_data)
        model = EcDetailModel(crawl_data)
        export(model)
def crawl(self):
    key = str(self.key)
    category_data = extract_category(self)
    page = 1    # start from the first page
    while True:
        items = self.get_init_list(key, page)
        if not items:
            break
        self.save_list(items, category_data=category_data)
        more_items = self.get_more_list(key, page)
        self.save_list(more_items, category_data=category_data)
        page += 1
def crawlHtml(self, html):
    ids = self.data['uuid']
    source = "amazon"
    source_id = self.key
    category_data = extract_category(self)
    summary = {}
    ecbrands = ""
    ecnames = ""
    introduce = {}
    # Walk the rows of productDetailsTable.
    prodDetails = html.xpath(
        "//table[@id='productDetailsTable']//tr/td[@class='bucket']/div[@class='content']/ul/li"
    )
    for proditem in prodDetails:
        k = proditem.xpath("b/text()")[0].strip()[:-1]
        if k == "用户评分":
            # Average star rating: strip the leading '平均' and trailing '星'.
            summary[k] = proditem.xpath(
                "span[@class='crAvgStars']/span/a/span/span/text()"
            )[0].strip()[2:-1]
        elif k == "亚马逊热销商品排名":
            # Sales-rank rows are skipped.
            pass
        else:
            summary[k] = proditem.xpath("text()")[0].strip()
    crawl_data = {
        'id': ids,
        'source': source,
        'source_id': source_id,
        'summary': summary,
        'introduce': introduce,
        'name': ecnames,
        'brand': ecbrands
    }
    crawl_data.update(category_data)
    model = EcDetailModel(crawl_data)
    export(model)
def crawl(self):
    catId = str(self.key)
    category_data = extract_category(self)
    totalpage = self.get_page(catId)
    if totalpage == 0:
        return {}
    for i in range(1, totalpage + 1):
        url = self.get_url(catId, i)
        jsons = ProcessData.get_json_data(url)
        try:
            goodsList = jsons['goodsList']
        except Exception, e:
            self.logger.error(url)
            self.logger.error(e)
            print "get goodsList fail"
            continue    # goodsList is undefined here, so skip this page
        for j in range(len(goodsList)):
            goods = goodsList[j]
            goodsName = goods['goodsName']
            goodsNo = goods['goodsNo']
            skuID = goods['skuID']
            crawl_data = {
                # 'id': uuid.uuid1(),
                'source_id': goodsNo,
                'source': self.data.get('source'),
                'title': goods['goodsName'],
                'adword': goods['ad'],
                'price': float(goods['lowestSalePrice']),
                'original_price': float(goods['highestSalePrice']),
                # 'score': ecsumscores
            }
            crawl_data.update(category_data)
            model = EcBasicModel(crawl_data)
            export(model)
            data = {
                'priorcategory': self.data['priorcategory'],
                'presentcategory': self.data['priorcategory']
            }
            data["uuid"] = model["id"]
            Scheduler.schedule(DetailCrawler.type, key=goodsNo, data=data)
            Scheduler.schedule(CommentCrawler.type, key=goodsNo, data=data)
def crawler_data(self, tree):
    ids = self.data.get('uuid')
    category_data = extract_category(self)
    introduce = tree.xpath(self.ware_xpath('introduce'))
    specifications = tree.xpath(self.ware_xpath('specifications'))
    introd = {}
    ecnorms = {}
    # Introduce lines look like u'label:value'; split on the first colon.
    for item in introduce:
        item = item.strip()
        if item == '':
            continue
        item = item.split(u':', 1)
        try:
            introd[item[0]] = item[1]
        except:
            pass
    for item in specifications:
        label = item.xpath(self.ware_xpath('label'))
        names = []
        values = []
        for i in label:
            i = i.strip()
            if i == '':
                continue
            names.append(i)
        dd = item.xpath(self.ware_xpath('item'))
        for i in dd:
            i = i.strip()
            if i == '':
                continue
            values.append(i)
        # Pair each label with its value.
        ecnorms.update(map(lambda x, y: [x, y], names, values))
    crawl_data = {
        'id': ids,
        'source': self.data.get('source'),
        'source_id': str(self.key),
        'summary': ecnorms,
        'introduce': introd,
        'version': ecnorms.get(u'型号', ''),
        'brand': ecnorms.get(u'商品品牌', '')
    }
    crawl_data.update(category_data)
    model = EcDetailModel(crawl_data)
    export(model)
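# An equivalent, arguably clearer pairing for the label/value lists above
# (sample values are hypothetical). One Python 2 edge case to be aware of:
# map(lambda x, y: [x, y], names, values) pads the shorter list with None,
# while zip() truncates to the shorter list.
names = [u'型号', u'商品品牌']
values = [u'WD-T12410D', u'LG']
ecnorms = dict(zip(names, values))
assert ecnorms[u'商品品牌'] == u'LG'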
def crawl(self):
    ecid = self.data['uuid']
    goodsNo = str(self.key)
    category_data = extract_category(self)
    totalpage = int(self.get_page(goodsNo))
    if totalpage == 0:
        return
    for i in range(totalpage + 1):
        url = self.get_url(goodsNo, i)
        json = ProcessData.get_json_data(url)
        appraise = json['appraiseArray']
        for item in appraise:
            comment_data = {
                'eid': ecid,    # commodity table foreign key
                'source_id': goodsNo,
                'source': self.data.get('source'),
                'comment_id': item['id'],           # review id
                'score': item['appraiseGrade'],     # commodity score
                'pubtime': ProcessData.str_datetime(item['appraiseTime']),
                'user_name': item['appraiseName'],
                'content': item['summary'],
                'brand': self.data['brand'],
                'version': self.data['version'],
                'series': self.data['series'],
                'comment': {
                    'is_Bbc': self.data['is_Bbc'],
                    'skuID': self.data['skuID'],
                }
            }
            comment_data.update(category_data)
            comment_data.update(get_ctime())
            model = EcCommentModel(comment_data)
            export(model)
def crawl(self):
    ecid = self.data['uuid']
    goodsNo = str(self.key)
    category_data = extract_category(self)
    totalpage = int(self.get_page(goodsNo))
    if totalpage == 0:
        print "get_page fail"
        return {}
    for i in range(totalpage):
        url = self.get_url(goodsNo, i)
        json = ProcessData.get_json_data(url)
        try:
            appraise = json['appraiseArray']
        except Exception, e:
            self.logger.error(url)
            self.logger.error(e)
            print "get appraise fail"
            continue    # appraise is undefined here, so skip this page
        for item in appraise:
            comment_data = {
                'ecid': ecid,   # commodity table foreign key
                'source_id': goodsNo,
                'source': self.data.get('source'),
                'comment_id': item['id'],           # review id
                'score': item['appraiseGrade'],     # commodity score
                'pubtime': ProcessData.str_datetime(item['appraiseTime']),
                'user_id': item['appraiseName'],
                'content': item['summary']
            }
            comment_data.update(category_data)
            model = EcCommentModel(comment_data)
            export(model)
def crawl(self):
    CatID = self.key
    category_data = extract_category(self)
    page = 1
    page_count = 1
    while page <= page_count:
        jsons = self.get_response(CatID, page)
        if page == 1:
            page_count = self.get_page_count(jsons)
        for goods in jsons['ProductListItems']:
            source_id = goods["Code"]
            task_data = self.has_goods(source_id)
            if task_data:
                crawl_data = {
                    "id": task_data["uuid"],
                    "title": goods["Title"],
                    "price": goods["Price"]["CurrentPrice"],
                    "source_id": source_id,
                    "source": self.data["source"],
                    "status": task_data["status"],
                    "brand": task_data["brand"],
                    "version": task_data["version"],
                    "series": task_data["series"],
                    "comment": {
                        "is_Bbc": task_data["isBbc"],
                    },
                }
                crawl_data.update(category_data)
                crawl_data.update(get_ctime())
                model = EcBasicModel(crawl_data)
                export(model)
            else:
                # Unknown goods: schedule a detail crawl first.
                detail_data = {
                    "priorcategory": self.data["priorcategory"],
                }
                Scheduler.schedule(DetailCrawler.type, key=source_id, data=detail_data)
        page += 1
def crawl(self):
    category_data = extract_category(self)
    page_size = self.get_page_size(self.key)
    page = 1
    while page <= page_size:
        json_data = ProcessData.get_json_data(self.get_url(self.key, page))
        reviews = json_data.get("commodityReviews", [])
        if not reviews:
            return
        for review in reviews:
            crawl_data = {
                "comment_id": self.get_comment_id(review),
                "content": review["content"],
                "tags": self.get_tags(review),
                "show_pic": self.get_show_pic(review),
                "pubtime": self.get_pubtime(review),
                "score": float(review["qualityStar"]),
                "useful": int(review["usefulCnt"]),
                "reply": 1 if review.get("replyInfo", {}) else 0,
                "user_name": review.get("userInfo", {}).get("nickName", ""),
                "eid": self.data["uuid"],
                "brand": self.data["brand"],
                "version": self.data["version"],
                "series": self.data["series"],
                "source": self.data["source"],
                "source_id": self.key,
                "status": self.data["status"],
                "comment": {
                    "is_Bbc": self.data["is_Bbc"],
                },
            }
            crawl_data.update(category_data)
            crawl_data.update(get_ctime())
            model = EcCommentModel(crawl_data)
            export(model)
        page += 1
def crawl(self):
    ids = self.data['uuid']     # record id
    # ids = "1dcfa11e-7acf-11e4-b0cc-00e06668ddd1"
    # source_id = ""
    url = self.key              # the key is the product URL
    print "url:" + url
    source = "amazon"
    category_data = extract_category(self)
    html_stream = ProcessData.get_web_data(url)
    html = etree.HTML(html_stream.text)
    # Product detail block; absent on template-style pages.
    prodDetails = html.xpath("//div[@id='prodDetails']")
    if len(prodDetails) == 0:
        # Fall back to the template-page parser, which carries the same
        # basic information.
        detailed = getDetailedGoods(
            type=self.type, key=self.key, data=self.data
        ).crawlHtml(html)
    else:
        # Product style element.
        style = prodDetails[0].xpath("div[@class='disclaim']/strong")
        # One row per specification entry.
        goodinfo = prodDetails[0].xpath(
            "div[@class='wrapper CNlocale']//table/tbody/tr")
        summary = {}
        ecbrands = ""
        ecnames = ""
        introduce = {}
        for info in goodinfo:
            label = info.xpath("td[@class='label']")
            if label != []:
                value = info.xpath("td[@class='value']")[0]
                if label[0].text == "用户评分":
                    # Average star rating: strip the leading '平均' and trailing '星'.
                    summary[label[0].text] = value.xpath(
                        "//div[@id='averageCustomerReviewRating']"
                    )[0].text.strip()[2:-1]
                elif label[0].text.strip() == "品牌":
                    ecbrands = value.text.strip()
                else:
                    summary[label[0].text] = value.text.strip()
        # Persist to cassandra.
        crawl_data = {
            'id': ids,
            'source': source,
            'source_id': url,
            'summary': summary,
            'introduce': introduce,
            'name': ecnames,
            'brand': ecbrands
        }
        crawl_data.update(category_data)
        model = EcDetailModel(crawl_data)
        export(model)
def crawl(self):
    # The key is the category node path, e.g.
    # "/%E7%AC%94%E8%AE%B0%E6%9C%AC%E7%94%B5%E8%84%91/b?ie=UTF8&node=106200071"
    keyid = self.key
    source = "amazon"
    score = 0   # default score
    category_data = extract_category(self)
    priorcategory = self.data["priorcategory"]
    presentcategory = self.data["presentcategory"]
    count = getPageSize(self.get_url(keyid, 1))     # total page count
    page = 1    # start from the first page
    content = "//div[@id='mainResults']/div"
    while page <= count:
        url = self.get_url(keyid, page)
        html_stream = ProcessData.get_web_data(url)
        html = etree.HTML(html_stream.text)
        # One div per product tile in the result grid.
        itempath = html.xpath(content)
        if itempath != None and itempath != []:
            for item in itempath:
                # The title anchor carries both the title text and the URL.
                title = item.xpath("h3[@class='newaps']/a")
                # Price cell; fall back to the unthemed list layout.
                pric = item.xpath(
                    "ul[@class='rsltGridList grey']/li[@class='newp']/div")
                if pric == None:
                    pric = item.xpath("ul/li[@class='newp']/div")
                # Star rating, read from the alt text, e.g. u'平均4.5 星'.
                socreitmem = item.xpath(
                    "ul[@class='rsltGridList grey']/li[@class='rvw']/span/span/a")
                if socreitmem != []:
                    scoreinfo = socreitmem[0].get('alt')
                    if scoreinfo != None:
                        score = float(scoreinfo[2:-1])
                for t in title:
                    original_price = u"¥0.00"
                    if pric == None or pric == []:
                        price = u"¥0.00"
                    else:
                        try:
                            price = pric[0].xpath("a/span")[0].text
                        except:
                            print url
                            print "price parse error:", pric
                    if pric != None and pric != [] and pric[0].xpath("a/del") != []:
                        # A del element holds the pre-discount price.
                        original_price = pric[0].xpath("a/del")[0].text
                    else:
                        # No original price: fall back to the current price.
                        original_price = price
                    # Task payload handed to the scheduler (stored in mongodb).
                    data = {
                        'priorcategory': priorcategory,
                        'presentcategory': presentcategory
                    }
                    if price != None and price.strip() != '' and pric != [] and pric[0] != '':
                        try:
                            float(price.strip()[1:].replace(",", ""))
                        except:
                            self.logger.error("bad price: " + price)
                            self.logger.error("bad original_price: " + original_price)
                        # Persist to cassandra.
                        crawl_data = {
                            # 'id': uuid.uuid1(),
                            'source_id': t.get("href"),
                            'source': source,
                            'summary': {},
                            'title': t.xpath("span")[0].text,
                            'adword': '',
                            'price': float(price.strip()[1:].replace(",", "")),
                            'original_price': float(original_price.strip()[1:].replace(",", "")),
                            'score': 0
                        }
                        crawl_data.update(category_data)
                        model = EcBasicModel(crawl_data)
                        export(model)
                        data["uuid"] = model["id"]
                        Scheduler.schedule(
                            DetailCrawler.type, key=t.get("href"), data=data)
                        Scheduler.schedule(
                            CommentCrawler.type, key=t.get("href"), data=data)
        page += 1
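# A minimal standalone sketch of the price parsing above: strip whitespace,
# drop the leading currency symbol, remove thousands separators, convert.
price = u"¥1,299.00"
assert float(price.strip()[1:].replace(",", "")) == 1299.0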
def crawl(self):
    goodid = self.data['uuid']  # product record id
    # goodid = "7ebd0a6a-7b5c-11e4-85d7-00e06668ddd1"
    source = "amazon"
    url = self.key
    source_id = url
    category_data = extract_category(self)
    count = getCommSize(self.get_url(url, 1))   # total page count
    page = 1    # start from the first page
    while page <= count:
        newurl = self.get_url(url, page)
        print newurl
        html_stream = ProcessData.get_web_data(newurl)
        html = etree.HTML(html_stream.text)
        # One div per review inside the productReviews table.
        comment = html.xpath("//table[@id='productReviews']//tr/td/div")
        for comitem in comment:
            # Review text.
            item = comitem.xpath("div[@class='reviewText']//text()")
            # Star rating.
            scoreitem = comitem.xpath(
                "div[@style='margin-bottom:0.5em;']/span/span/span")
            # Publish time.
            pubtimeitem = comitem.xpath(
                "div[@style='margin-bottom:0.5em;']/span[@style='vertical-align:middle;']/nobr")
            # Reviewer profile link.
            user_iditem = comitem.xpath(
                "div[@style='margin-bottom:0.5em;']/div/div[@style='float:left;']/a")
            # Helpfulness votes, e.g. u'3/4'.
            usefulitem = comitem.xpath(
                "div[@style='margin-bottom:0.5em;']")
            oninfo = ""
            for i in item:
                oninfo += i
            if usefulitem != None and usefulitem != []:
                tmpuseful = usefulitem[0].text.strip()
            else:
                tmpuseful = "0"
            if tmpuseful == "":
                tmpuseful = "0"
            elif tmpuseful != "0":
                tmpuseful = tmpuseful[0:tmpuseful.index("/")]
            # Convert the date string into a datetime; default to 1971-01-01.
            pubtim = datetime.strptime("1971-01-1", '%Y-%m-%d')
            if pubtimeitem != None and pubtimeitem != []:
                pubtim = datetime.strptime(
                    pubtimeitem[0].text.replace("年", "-").replace(
                        "月", "-").replace("日", ""), '%Y-%m-%d')
            sorce = "0.0"
            if scoreitem != None and scoreitem != []:
                sorce = scoreitem[0].text[2:-1].strip()
            userid = ''
            if user_iditem != None and user_iditem != []:
                userid = str(user_iditem[0].get("href"))
            comment_data = {
                "ecid": goodid,
                "source_id": source_id,
                "source": source,
                "comment_id": "",
                "pubtime": pubtim,
                "buytime": pubtim,
                "score": float(sorce),
                "user_id": userid,
                "useful": int(tmpuseful),
                'reply': 0,
                "content": oninfo.strip()
            }
            # Attach the prior/present category data before exporting.
            comment_data.update(category_data)
            model = EcCommentModel(comment_data)
            export(model)
        page += 1
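# A minimal standalone sketch of the review-date normalization above:
# amazon.cn renders dates like u'2014年12月1日'; replacing the CJK markers
# yields a dash-separated string that strptime can parse.
from datetime import datetime
raw = u"2014年12月1日"
pubtime = datetime.strptime(
    raw.replace(u"年", "-").replace(u"月", "-").replace(u"日", ""),
    '%Y-%m-%d')
assert pubtime == datetime(2014, 12, 1)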
def crawl(self):
    skulist = []
    goodsNo = str(self.key)
    category_data = extract_category(self)
    url = self.get_detail_url(goodsNo)
    html = ProcessData.get_web_data(url)
    tree = etree.HTML(html.text)
    xpath = {
        "introduce": "//div[@class='guigecanshu']/text()",
        "summary": "//ul[@id='prd_data']/li[2]/ul/li/span/text()",
        # "number": "//span[@class='fr ccc']/text()"
    }
    summary = self.parse_summary(tree, xpath["summary"])
    introduce = self.parse_intr(tree, xpath["introduce"])
    # number = self.parse_number(tree, xpath["number"])
    version = get_version(summary, introduce)
    series = get_series(summary, introduce)
    brand = get_brand(summary, introduce)
    json = ProcessData.get_json_data(self.get_basic_url(goodsNo))
    # Normalize the case-varying Y/N flags from the basic-info endpoint.
    isBbc = "Y" if json["isBbc"] in ("Y", "y") else "N"
    status = 0 if json["onSale"] in ("N", "n") else 1
    skulist = json['skuList']
    for sku in skulist:
        ecname = sku['skuName']
        ecimglist = sku['skuSourceImgUrl']
        detail_data = {
            'source': self.data.get('source'),
            'source_id': goodsNo,
            'summary': summary,
            'introduce': introduce,
            'name': ecname,
            'images': ecimglist,
            'status': status,
            'brand': brand,
            'version': version,
            'series': series,
            'comment': {
                'is_Bbc': isBbc,
                'skuID': self.data['skuID'],
            },
        }
        detail_data.update(category_data)
        detail_data.update(get_ctime())
        model = EcDetailModel(detail_data)
        export(model)
    # Hand the persisted detail record off to the comment crawler.
    comment_data = {
        'uuid': model["id"],
        'brand': brand,
        'version': version,
        'series': series,
        'is_Bbc': isBbc,
        'status': status,
        'priorcategory': self.data['priorcategory'],
        'skuID': self.data['skuID'],
    }
    Scheduler.schedule(CommentCrawler.type, key=goodsNo, data=comment_data)
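# A minimal standalone sketch of the Y/N flag normalization above;
# normalize_flag is a hypothetical helper, not part of this module.
def normalize_flag(value, default="N"):
    # Collapse case-varying Y/N strings from the basic-info endpoint.
    return "Y" if str(value).upper() == "Y" else default

assert normalize_flag("y") == "Y"
assert normalize_flag("N") == "N"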