def crawl(self):
    wareId = str(self.key)
    url = "http://item.yhd.com/item/%s" % wareId
    html_stream = ProcessData.get_web_data(url)
    tree = etree.HTML(html_stream.text)
    self.crawler_data(tree)
def crawl(self):
    key = str(self.key)
    count = 2  # exclusive upper bound for page numbers; with 2, only page 1 is crawled
    for pages in xrange(1, count):  # start from page 1
        url = self.get_url(key, pages)
        html_stream = ProcessData.get_web_data(url)
        tree = etree.HTML(html_stream.text)
        self.crawler_data(tree)
def crawl(self):
    wareId = self.key
    ecid = self.data['uuid']
    category_data = extract_category(self)
    pages = 1
    count = True
    while count:
        number = 0  # counts comments that were already saved (de-duplication)
        url = self.get_url(wareId, pages)
        html_stream = ProcessData.get_web_data(url)
        try:
            tree = etree.HTML(html_stream.text)
        except:
            print 'error: ', url
            break
        xpath = "//div[@id='comments-list']/div[@class='mc']"
        dom = tree.xpath(xpath)
        if dom == []:
            count = False  # no more comment pages
            continue
        for item in dom:
            datas = self.handle(item)
            comment_data = {
                'ecid': ecid,                      # commodity table foreign key
                'source_id': wareId,
                'source': self.data.get('source'),
                'comment_id': datas['commentid'],  # review id
                'score': datas['score'],           # commodity score
                'pubtime': datas['commenttime'],
                'buytime': datas['buytime'],
                'user_id': datas['url'],
                'useful': datas['useful'],
                'reply': datas['reply'],
                'content': datas['comment'],
                'province': datas['province']
            }
            comment_data.update(category_data)
            model = EcCommentModel(comment_data)
            is_saved = export(model)
            if not is_saved:
                number += 1
            if number > 10:  # too many duplicates on this page, stop collecting it
                break
        pages += 1
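# Illustrative only: a minimal, self-contained sketch of the paging/termination
# pattern used in the crawl() above, with the network and database calls
# replaced by stand-ins (fetch_page and save are hypothetical helpers, not part
# of this project). Paging stops when a page yields no comment nodes; a page
# that produces more than 10 failed (duplicate) saves stops early.
def fetch_page(page):
    # stand-in for get_web_data() plus the comments-list xpath
    return [] if page > 3 else ['comment'] * 5

def save(item):
    # stand-in for export(EcCommentModel(...)); returns False on duplicates
    return True

pages = 1
while True:
    duplicates = 0
    items = fetch_page(pages)
    if not items:
        break
    for item in items:
        if not save(item):
            duplicates += 1
        if duplicates > 10:
            break
    pages += 1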
def crawl(self):
    wareId = self.key
    ids = self.data.get('uuid')
    category_data = extract_category(self)
    url = 'http://m.360buy.com/product/guige/%s.html' % (str(wareId))
    html_stream = ProcessData.get_web_data(url)
    tree = etree.HTML(html_stream.text)
    xpath = "//table[@class='Ptable']/tr/td/text()"
    dom = tree.xpath(xpath)
    specifications = {}
    temporary = ''
    i = 0
    for item in dom:
        item = item.strip()
        if item == '':
            continue
        if i % 2 == 0:
            temporary = extract_title(item)                 # even cells are attribute names
        else:
            specifications[temporary] = extract_text(item)  # odd cells are their values
        i += 1
    data = {
        'ecnorms': specifications
    }
    introduce = IntroduceCrawler.crawl(wareId, ids)
    ecbrands = introduce[u'品牌'] if introduce.get(u'品牌') else ''
    ecnames = introduce[u'商品名称'] if introduce.get(u'商品名称') else ''
    crawl_data = {
        'id': ids,
        'source': self.data.get('source'),
        'source_id': wareId,
        'summary': specifications,
        'introduce': introduce,
        'name': ecnames,
        'brand': ecbrands
    }
    crawl_data.update(category_data)
    model = EcDetailModel(crawl_data)
    export(model)
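# Illustrative only: how the Ptable text nodes above pair up into the
# specifications dict. Even-indexed cells are attribute names, odd-indexed
# cells are their values; the project helpers extract_title/extract_text are
# replaced here by plain strings, and the sample cells are hypothetical.
cells = [u'品牌', u'ExampleBrand', u'型号', u'X-100']
specifications = {}
name = ''
for i, cell in enumerate(cells):
    if i % 2 == 0:
        name = cell                    # name cell
    else:
        specifications[name] = cell    # value cell
# specifications == {u'品牌': u'ExampleBrand', u'型号': u'X-100'}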
def crawl(self):
    goodsNo = str(self.key)
    ids = self.data.get('uuid')
    category_data = extract_category(self)
    url = self.get_detail_url(goodsNo)
    html = ProcessData.get_web_data(url)
    tree = etree.HTML(html.text)
    # specification table: td.bg holds the attribute name, td.bgv its value
    standard = {}
    rows = tree.xpath("//table[@class='parameter']/tbody/tr")
    for row in rows:
        m1 = row.xpath("td[@class='bg']")
        m2 = row.xpath("td[@class='bgv']")
        if len(m1) != 0 and len(m2) != 0:
            standard[m1[0].text] = m2[0].text
    rpack = tree.xpath("//div[@class='wap_tab_con']/div[3]")
    ecparkinglist = rpack[0].text   # packing list
    rafter = tree.xpath("//div[@class='wap_tab_con']/div[4]")
    ecaftersale = rafter[0].text    # after-sale service info
    ecbrands = standard[u'品牌'] if standard.get(u'品牌') else ''
    basic_json = ProcessData.get_json_data(self.get_basic_url(goodsNo))
    skulist = basic_json['skuList']
    for sku in skulist:
        ecnowprice = sku['skuPrice']
        ecnmaket = sku['skuPriceDesc']
        ecname = sku['skuName']
        adword = sku['promWords']
        skuid = sku['skuID']
        ecimglist = sku['skuSourceImgUrl']
        source_id = goodsNo + '-' + skuid
        crawl_data = {
            'id': ids,
            'source': self.data.get('source'),
            'source_id': source_id,
            'summary': standard,
            'introduce': {},
            'name': ecname,
            'brand': ecbrands
        }
        crawl_data.update(category_data)
        model = EcDetailModel(crawl_data)
        export(model)
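# Illustrative only: a self-contained sketch of the td.bg / td.bgv pairing used
# above, run against hypothetical sample markup instead of a page fetched
# through ProcessData.get_web_data.
from lxml import etree

sample = u"""
<table class="parameter"><tbody>
  <tr><td class="bg">品牌</td><td class="bgv">ExampleBrand</td></tr>
  <tr><td class="bg">型号</td><td class="bgv">X-100</td></tr>
</tbody></table>
"""
sample_tree = etree.HTML(sample)
standard = {}
for row in sample_tree.xpath("//table[@class='parameter']/tbody/tr"):
    names = row.xpath("td[@class='bg']")
    values = row.xpath("td[@class='bgv']")
    if names and values:
        standard[names[0].text] = values[0].text
# standard == {u'品牌': u'ExampleBrand', u'型号': u'X-100'}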
def crawl(self):
    wareId = str(self.key)
    url = "http://item.yhd.com/item/%s" % wareId
    html_stream = ProcessData.get_web_data(url)
    tree = etree.HTML(html_stream.text)
    crawl_data = self.crawler_data(tree)
    product_id = self.parse_productId(tree)
    model = EcDetailModel(crawl_data)
    export(model)
    comment_data = {
        'uuid': model['id'],
        'status': crawl_data['status'],
        'brand': crawl_data['brand'],
        'series': crawl_data['series'],
        'version': crawl_data['version'],
        'is_Bbc': crawl_data['comment']['is_Bbc'],
        'priorcategory': self.data['priorcategory'],
        'source_id': wareId,
    }
    Scheduler.schedule(CommentCrawler.type, key=product_id, data=comment_data)
def crawl(wareId, ids):
    import sys
    reload(sys)
    sys.setdefaultencoding("utf-8")
    url = 'http://item.jd.com/%s.html' % (str(wareId))
    html_stream = ProcessData.get_web_data(url)
    if html_stream == {}:
        return {}
    html_stream.encoding = 'gb2312'
    tree = etree.HTML(html_stream.text)
    xpath = "//div[@id='product-detail-1']/ul[@class='detail-list']/li//text()"
    dom = tree.xpath(xpath)
    introduce = {}
    temporary = ''
    for item in dom:
        item = item.strip()
        if item == '':
            continue
        elif item.find(':') > 0:
            item = item.split(':', 1)
            if item[1] == '':
                temporary = extract_title(item[0])  # value arrives in a later text node
            else:
                introduce[extract_title(item[0])] = extract_text(item[1])
        else:
            if temporary != '':
                introduce[temporary] = extract_text(item)
                temporary = ''
            else:
                continue
    return introduce
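# Illustrative only: how the detail-list text nodes above are folded into the
# introduce dict. A node such as "品牌:ExampleBrand" is split on its first
# colon; a node ending in a bare "商品名称:" stores the attribute name and picks
# up its value from the next text node. The project helpers
# extract_title/extract_text are omitted and the sample nodes are hypothetical.
nodes = [u'品牌:ExampleBrand', u'商品名称:', u'Example Widget 500g']
introduce = {}
pending = ''
for node in nodes:
    node = node.strip()
    if not node:
        continue
    if node.find(':') > 0:
        name, value = node.split(':', 1)
        if value == '':
            pending = name              # value arrives in the next node
        else:
            introduce[name] = value
    elif pending:
        introduce[pending] = node
        pending = ''
# introduce == {u'品牌': u'ExampleBrand', u'商品名称': u'Example Widget 500g'}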
def crawl(self):
    goodsNo = str(self.key)
    category_data = extract_category(self)
    url = self.get_detail_url(goodsNo)
    html = ProcessData.get_web_data(url)
    tree = etree.HTML(html.text)
    xpath = {
        "introduce": "//div[@class='guigecanshu']/text()",
        "summary": "//ul[@id='prd_data']/li[2]/ul/li/span/text()",
    }
    summary = self.parse_summary(tree, xpath["summary"])
    introduce = self.parse_intr(tree, xpath["introduce"])
    version = get_version(summary, introduce)
    series = get_series(summary, introduce)
    brand = get_brand(summary, introduce)
    basic_json = ProcessData.get_json_data(self.get_basic_url(goodsNo))
    isBbc = "Y" if basic_json["isBbc"] in ("Y", "y") else "N"
    status = 0 if basic_json["onSale"] in ("N", "n") else 1
    skulist = basic_json['skuList']
    for sku in skulist:
        ecname = sku['skuName']
        ecimglist = sku['skuSourceImgUrl']
        detail_data = {
            'source': self.data.get('source'),
            'source_id': goodsNo,
            'summary': summary,
            'introduce': introduce,
            'name': ecname,
            'images': ecimglist,
            'status': status,
            'brand': brand,
            'version': version,
            'series': series,
            'comment': {
                'is_Bbc': isBbc,
                'skuID': self.data['skuID'],
            },
        }
        detail_data.update(category_data)
        detail_data.update(get_ctime())
        model = EcDetailModel(detail_data)
        export(model)
    comment_data = {
        'uuid': model["id"],
        'brand': brand,
        'version': version,
        'series': series,
        'is_Bbc': isBbc,
        'status': status,
        'priorcategory': self.data['priorcategory'],
        'skuID': self.data['skuID'],
    }
    Scheduler.schedule(CommentCrawler.type, key=goodsNo, data=comment_data)
def get_response(self, key, page):
    url = "http://club.yhd.com/review/%s-%s.html" % (key, str(page))
    response = ProcessData.get_web_data(url)
    response.encoding = "utf-8"
    return response
def get_response(self, key):
    url = self.get_url(key)
    return ProcessData.get_web_data(url)