def parse_detail(self, response): print 'parse_detail now' p = PyQuery(response.body) content = PyQuery(p('div.details')) s = '' for a in content('dl').items(): b = PyQuery(a) s = s + b('dt').text() + b('dd').text() + '\n' item = GsxtItem() item['zhizhao'] = s item['cname'] = response.meta['company_name'] yield item pass
def parse_getcontent(self, response): # 解析搜索界面,得到详情页地址,跳转 print 'getcontent now' try: yield self.return_begin(response).next() self.countsucc = self.countsucc + 1 # print response.body p = PyQuery(response.body) meta = deepcopy(response.meta) if p('div.main-layout a.search_list_item'): '该搜索名能够得到公司信息' print '成功搜索到公司' # 跳转到该公司对应的页面获取详情 # for i in p('div.main-layout a.search_list_item').items(): # href = self.detail_url + i.attr('href') # a = PyQuery(i) # meta.update({'company_name': ''.join(a('h1').text().split())}) # yield Request( # method='GET', # meta=meta, # url=href, # callback=self.parse_detail, # dont_filter=True, # ) # 保存公司简略信息 for content in p('div.main-layout a.search_list_item').items(): item = GsxtItem() a = PyQuery(content) self.logger.info( str(self.countsucc) + ', ' + ''.join(a('h1').text().split())) # item['cname'] = ''.join(a('h1').text().split()) # item['status'] = a('div.wrap-corpStatus span').text() # item['ccode'] = a('div.div-map2 span').text() # item['lawuser'] = a('div.div-user2 span').text() # item['etime'] = a('div.div-info-circle2 span').text() # yield item else: print '没有符合的公司' except Exception as e: self.logger.exception(e) print 'error parse_getcontent'
def save_content(self, response): meta = response.meta if meta['info'] == 'basic': if 'company_detail_basic.html' in response.body: self.logger.info( 'get content basic, name:{} list_name:{}'.format( meta['name'], meta['list_name'])) else: self.logger.info('not get basic, name:{} list_name:{}'.format( meta['name'], meta['list_name'])) print response.body yield self.man_retry(response) return if meta['info'] == 'JCXX': if '营业执照信息' in response.body: self.logger.info( 'get content JCXX, name:{} list_name:{}'.format( meta['name'], meta['list_name'])) else: self.logger.info('not get JCXX, name:{} list_name:{}'.format( meta['name'], meta['list_name'])) print response.body yield self.man_retry(response) return if meta['info'] == 'XZXK': if '行政许可信息' in response.body: self.logger.info( 'get content XZXK, name:{} list_name:{}'.format( meta['name'], meta['list_name'])) else: self.logger.info('not get XZXK, name:{} list_name:{}'.format( meta['name'], meta['list_name'])) print response.body yield self.man_retry(response) return result = GsxtItem.get_result_from_response(response) result['content'] = '%s %s\n%s' % (meta['name'], meta['list_name'], response.body) yield result
def parse_getcontent(self, response): #解析搜索界面,得到详情页地址,跳转 print 'getcontent now' try: p = PyQuery(response.body) meta = deepcopy(response.meta) if p('div.main-layout a.search_list_item'): '该搜索名能够得到公司信息' print '成功搜索到公司' #跳转到该公司对应的页面获取详情 for i in p('div.main-layout a.search_list_item').items(): href = self.detail_url + i.attr('href') a = PyQuery(i) meta.update( {'company_name': ''.join(a('h1').text().split())}) yield Request( method='GET', meta=meta, url=href, callback=self.parse_detail, dont_filter=True, ) #保存公司简略信息 for content in p('div.main-layout a.search_list_item').items(): item = GsxtItem() a = PyQuery(content) item['cname'] = ''.join(a('h1').text().split()) item['status'] = a('div.wrap-corpStatus span').text() item['ccode'] = a('div.div-map2 span').text() item['lawuser'] = a('div.div-user2 span').text() item['etime'] = a('div.div-info-circle2 span').text() yield item else: print '没有符合的公司' except Exception as e: print 'error parse_getcontent'
def parse(self, response):
    """Yield a single GsxtItem whose 'text' field holds the first text
    node extracted from the response."""
    first_text = response.xpath("//text()").extract_first()
    item = GsxtItem()
    item['text'] = first_text
    yield item