def parse_intr(self, tree, xpath):
    """Build a label -> value dict from the text nodes selected by *xpath*.

    Each selected text is stripped; blank texts are ignored.  A text that
    contains ':' after its first character is split on the first ':':

    * ``label:value`` is stored as ``extract_title(label)`` ->
      ``extract_text(value)``;
    * ``label:`` (nothing after the colon) is remembered as a pending
      label, and the next colon-less text becomes its value.

    A colon-less text with no pending label is skipped.

    :param tree: object exposing ``xpath(expr)`` that yields strings.
    :param xpath: expression handed to ``tree.xpath``.
    :return: the collected dict (empty when nothing matched).
    """
    introduce = {}
    pending_title = ''
    for raw in tree.xpath(xpath):
        text = raw.strip()
        if text == '':
            continue
        label, colon, value = text.partition(':')
        # A colon in the very first position is not treated as a separator,
        # so both ``colon`` and ``label`` must be non-empty.
        if colon and label:
            if value == '':
                pending_title = extract_title(label)
            else:
                introduce[extract_title(label)] = extract_text(value)
        elif pending_title != '':
            introduce[pending_title] = extract_text(text)
            pending_title = ''
    # The result is always a dict; the former ``return ''`` fallback compared
    # the dict against a string and could never be taken.
    return introduce
def crawl(self):
    """Fetch the specification page for ``self.key`` and export a detail record.

    Steps, in order:

    1. read the product id from ``self.key`` and the record id from
       ``self.data['uuid']``, and obtain category fields via
       ``extract_category(self)``;
    2. download the specification page and pair up the ``Ptable`` cell
       texts as label / value;
    3. obtain the introduction dict from ``IntroduceCrawler.crawl``;
    4. merge everything into one dict, wrap it in ``EcDetailModel`` and
       pass it to ``export``.
    """
    wareId = self.key
    ids = self.data.get('uuid')
    category_data = extract_category(self)

    url = 'http://m.360buy.com/product/guige/%s.html' % (str(wareId))
    html_stream = ProcessData.get_web_data(url)
    # NOTE(review): the introduction crawler in this file checks the result of
    # get_web_data against {} before using it; this path does not, so a failed
    # fetch presumably surfaces here as an AttributeError on ``.text``.
    # Confirm which behaviour the scheduler expects before adding a guard.
    tree = etree.HTML(html_stream.text)
    xpath = "//table[@class='Ptable']/tr/td/text()"

    specifications = {}
    current_title = ''
    position = 0
    for cell in tree.xpath(xpath):
        text = cell.strip()
        if text == '':
            continue
        if position % 2 == 0:
            # Even non-blank cells are labels.  The placeholder is stored
            # under the normalised title so that the value written on the
            # next cell replaces it instead of leaving a second, empty entry
            # under the raw label.
            current_title = extract_title(text)
            specifications[current_title] = ''
        else:
            specifications[current_title] = extract_text(text)
        position += 1

    introduce = IntroduceCrawler.crawl(wareId, ids)
    # Missing or empty brand / name entries fall back to ''.
    ecbrands = introduce.get(u'品牌') or ''
    ecnames = introduce.get(u'商品名称') or ''

    crawl_data = {
        'id': ids,
        'source': self.data.get('source'),
        'source_id': wareId,
        'summary': specifications,
        'introduce': introduce,
        'name': ecnames,
        'brand': ecbrands
    }
    crawl_data.update(category_data)
    model = EcDetailModel(crawl_data)
    export(model)
def crawl(wareId, ids):
    """Download the item page for *wareId* and parse its detail list.

    The ``<li>`` texts under ``div#product-detail-1 ul.detail-list`` are
    turned into a label -> value dict:

    * ``label:value`` is stored as ``extract_title(label)`` ->
      ``extract_text(value)``;
    * ``label:`` (nothing after the colon) is remembered as a pending
      label, and the next colon-less text becomes its value;
    * blank texts, and colon-less texts with no pending label, are skipped.

    :param wareId: product id interpolated into the item URL.
    :param ids: not used by this function; kept for the callers' signature.
    :return: the parsed dict, or ``{}`` when ``get_web_data`` returned ``{}``.
    """
    # Python 2 only: switch the process-wide default encoding to UTF-8.
    import sys
    reload(sys)
    sys.setdefaultencoding("utf-8")

    url = 'http://item.jd.com/%s.html' % (str(wareId))
    html_stream = ProcessData.get_web_data(url)
    if html_stream == {}:
        return {}
    # Set the response encoding before ``.text`` is read.
    html_stream.encoding = 'gb2312'
    tree = etree.HTML(html_stream.text)
    xpath = "//div[@id='product-detail-1']/ul[@class='detail-list']/li//text()"

    introduce = {}
    pending_title = ''
    for raw in tree.xpath(xpath):
        text = raw.strip()
        if text == '':
            continue
        label, colon, value = text.partition(':')
        # A colon in the very first position is not treated as a separator,
        # so both ``colon`` and ``label`` must be non-empty.
        if colon and label:
            if value == '':
                pending_title = extract_title(label)
            else:
                introduce[extract_title(label)] = extract_text(value)
        elif pending_title != '':
            introduce[pending_title] = extract_text(text)
            pending_title = ''
    # The result is always a dict; the former ``return ''`` fallback compared
    # the dict against a string and could never be taken.
    return introduce
def parse_summary(self, tree, xpath):
    """Pair up the texts selected by *xpath* as label / value entries.

    Texts are stripped and blank ones ignored (they do not count towards
    the pairing).  Of the remaining texts, the 1st, 3rd, 5th ... are labels
    and the 2nd, 4th, 6th ... are the value of the label just before them.
    Labels go through ``extract_title`` and values through ``extract_text``.
    A trailing label with no value maps to ``''``.

    :param tree: object exposing ``xpath(expr)`` that yields strings.
    :param xpath: expression handed to ``tree.xpath``.
    :return: dict of normalised label -> value.
    """
    specifications = {}
    current_title = ''
    position = 0
    for cell in tree.xpath(xpath):
        text = cell.strip()
        if text == '':
            continue
        if position % 2 == 0:
            # Store the placeholder under the normalised title so that the
            # value written on the next cell replaces it instead of leaving
            # a second, empty entry under the raw label.
            current_title = extract_title(text)
            specifications[current_title] = ''
        else:
            specifications[current_title] = extract_text(text)
        position += 1
    return specifications