def parse_list(self, response):
    """Parse one result-list page and yield scraped NameItems plus a page-turn request.

    ``response.meta['sit']`` selects between in-province (``sit_list[0]``)
    and out-of-province (``sit_list[1]``) extraction rule sets.

    BUG FIX: the column-position guards originally tested the truthiness of
    ``cpos.index(...)``, which wrongly produced 0 when the header occupied
    index 0; presence (``in``) alone is the correct test.
    """
    print('parse_list:', response.url)
    item_contains = []
    url = response.url
    sit = response.meta['sit']
    # Table column headers; used to locate the 1-based positions of the
    # "详情" (detail), "企业名称" (company name) and "证书编号" (cert no.) columns.
    cpos = response.xpath(self.extract_dict['inner']['cpos']).extract()
    p1 = cpos.index(u'详情') + 1 if u'详情' in cpos else 0
    p2 = cpos.index(u'企业名称') + 1 if u'企业名称' in cpos else 0
    p3 = cpos.index(u'证书编号') + 1 if u'证书编号' in cpos else 0  # currently unused downstream
    if sit == sit_list[0]:
        inner_nodes = response.xpath(self.extract_dict['inner']['nodes'])
        inner = self.extract_dict['inner']
        # NOTE(review): this mutates the shared extract_dict; re-formatting an
        # already-expanded xpath is a no-op only while no '{}' remains — confirm.
        inner['cname'] = inner['cname'].format(p2)
        print("inner['cname']:", inner['cname'])
        for node in inner_nodes:
            item = NameItem()
            try:
                item['compass_name'] = self.handle_cname(
                    node.xpath(inner['cname']).extract_first(), 'inner')
            except Exception:
                # Row without a usable company name: skip it.
                continue
            if p1:
                item['detail_link'] = self.handle_cdetail_link(
                    node.xpath(inner['detail_link']).extract_first(),
                    'inner', url)
            else:
                item['detail_link'] = 'None'
            item['out_province'] = inner['out_province'][1] if isinstance(
                inner['out_province'], list) else 'None'
            item_contains.append(item)
    if sit == sit_list[1]:
        print(u'解析外省....')
        outer_nodes = response.xpath(self.extract_dict['outer']['nodes'])
        outer = self.extract_dict['outer']
        outer['cname'] = outer['cname'].format(p2)
        for node in outer_nodes:
            item = NameItem()
            try:
                item['compass_name'] = self.handle_cname(
                    node.xpath(outer['cname']).extract_first(), 'outer')
            except Exception:
                continue
            if p1:
                item['detail_link'] = self.handle_cdetail_link(
                    node.xpath(outer['detail_link']).extract_first(),
                    'outer', url)
            if isinstance(outer['out_province'], list) and len(outer['out_province']) > 1:
                item['out_province'] = outer['out_province'][1]
            else:
                item['out_province'] = self.handle_out_province(
                    node.xpath(outer['out_province']).extract_first())
            item_contains.append(item)
    yield {'item_contains': item_contains}
    yield self.turn_page(response)
def parse_list(self, response):
    """Parse in-/out-of-province list pages selected by ``meta['sit']``.

    BUG FIX: the error log was opened with the invalid mode ``'wa'``, which
    raises ``ValueError`` inside the except handler and masks the original
    error; append mode ``'a'`` is what was intended.
    """
    item_contains = []
    url = response.url
    sit = response.meta['sit']
    try:
        if sit == sit_list[0]:
            inner_nodes = response.xpath(
                self.extract_dict['inner']['nodes'])
            inner = self.extract_dict['inner']
            print("inner_nodes:", len(inner_nodes))
            for node in inner_nodes:
                item = NameItem()
                item['compass_name'] = self.handle_cname(
                    node.xpath(inner['cname']).extract_first(), 'inner')
                item['detail_link'] = self.handle_cdetail_link(
                    node.xpath(inner['detail_link']).extract_first(),
                    'inner', url)
                # De-duplicate on the detail link fingerprint.
                if self.redis_tools.check_finger(item['detail_link']):
                    print('{}已经爬取过'.format(item['detail_link']))
                    continue
                item['out_province'] = inner[
                    'out_province'][1] if isinstance(
                        inner['out_province'], list) else 'None'
                item_contains.append(item)
        if sit == sit_list[1]:
            print(u'解析外省....')
            outer_nodes = response.xpath(
                self.extract_dict['outer']['nodes'])
            outer = self.extract_dict['outer']
            print("outer_nodes:", len(outer_nodes))
            for node in outer_nodes:
                item = NameItem()
                print(node.xpath(outer['cname']).extract_first())
                item['compass_name'] = self.handle_cname(
                    node.xpath(outer['cname']).extract_first(), 'outer')
                item['detail_link'] = self.handle_cdetail_link(
                    node.xpath(outer['detail_link']).extract_first(),
                    'outer', url)
                if self.redis_tools.check_finger(item['detail_link']):
                    print('{}已经爬取过'.format(item['detail_link']))
                    continue
                if isinstance(outer['out_province'], list) and len(outer['out_province']) > 1:
                    item['out_province'] = outer['out_province'][1]
                else:
                    item['out_province'] = self.handle_out_province(
                        node.xpath(outer['out_province']).extract_first())
                item_contains.append(item)
    except Exception as e:
        print(response.text)
        # Append the failure to the log, then stop hard (original behavior:
        # exit(0) kills the crawler process).
        with open(self.log_file, 'a') as fp:
            fp.write(str(e))
        exit(0)
    yield {'item_contains': item_contains}
    yield self.turn_page(response)
def parse_list(self, response):
    """Parse inner and outer company tables on one page, then form-POST for the next.

    BUG FIX: the error log was opened with the invalid mode ``'wa'``
    (raises ``ValueError``); append mode ``'a'`` is what was intended.
    """
    item_contains = []
    node1 = response.xpath(self.inner_extract_dict['nodes'])
    node2 = response.xpath(self.outer_extract_dict['nodes'])
    try:
        for node in node1:
            inner_item = NameItem()
            inner_item['compass_name'] = self.handle_cname(
                node.xpath(self.inner_extract_dict['cname']).extract_first())
            inner_item['detail_link'] = self.handle_cdetail_link(
                node.xpath(self.inner_extract_dict['detail_link']).extract_first())
            inner_item['out_province'] = 'liaolin'
            # De-duplicate on the detail link fingerprint.
            if not self.redis_tools.check_finger(inner_item['detail_link']):
                item_contains.append(inner_item)
            else:
                print('{}已经爬取过'.format(inner_item['detail_link']))
        for node in node2:
            outer_item = NameItem()
            outer_item['compass_name'] = self.handle_cname(
                node.xpath(self.outer_extract_dict['cname']).extract_first())
            outer_item['detail_link'] = self.handle_cdetail_link(
                node.xpath(self.outer_extract_dict['detail_link']).extract_first())
            outer_item['out_province'] = self.handle_out_province(
                node.xpath(self.outer_extract_dict['out_province']).extract_first())
            if not self.redis_tools.check_finger(outer_item['detail_link']):
                item_contains.append(outer_item)
            else:
                print(u'{}已经爬取过'.format(outer_item['detail_link']))
    except Exception as e:
        with open(self.log_file, 'a') as fp:
            fp.write(str(e))
    yield {'item_contains': item_contains}
    # 翻页 — the "next" anchor carries aspNetDisabled on the last page.
    meta = response.meta
    cur_page_num = meta['cur_page_num']
    next_page_flag = response.xpath(
        '//a[@id="Linkbutton3" and contains(@class, "aspNetDisabled")]').extract()
    if next_page_flag:
        print(u'不能继续翻页了,当前最大页码:')
        return
    print(u'翻页....')
    next_page = int(cur_page_num) + 1
    meta['cur_page_num'] = str(next_page)
    headers = self.get_header(response.url, flag='2')
    formdata = self.get_form_data(response)
    yield scrapy.FormRequest(response.url, formdata=formdata,
                             callback=self.parse_list, meta=meta,
                             headers=headers)
def parse_list(self, response):
    """Extract company items from a JSON-wrapped HTML list page.

    The endpoint returns JSON whose ``resultdata`` field carries an HTML
    fragment; rows are pulled from it with the 'inner' extraction rules.
    """
    page_html = etree.HTML(json.loads(response.text)['resultdata'])
    rules = self.extract_dict['inner']
    collected = []
    for row in page_html.xpath(rules['nodes']):
        entry = NameItem()
        entry['compass_name'] = self.handle_cname(row.xpath(rules['cname'])[0])
        entry['detail_link'] = 'None'
        entry['out_province'] = 'waisheng'
        # Skip companies already fingerprinted in redis.
        if self.redis_tools.check_finger(entry['compass_name']):
            print(u'{}已经爬取过'.format(entry['compass_name']))
            continue
        collected.append(entry)
    yield {'item_contains': collected}
    # Pagination: compare the reported total page count with our position.
    pages_total = page_html.xpath('//label[@id="zongyeshu"]/text()')[0]
    meta = response.meta
    if int(pages_total) > int(meta['cur_page']):
        print(u'当前页码:{}'.format(meta['cur_page']))
        yield self.turn_page(response)
    else:
        print(u'不能在翻页了, 当前最大页码:{}'.format(meta['cur_page']))
        return
def parse_list(self, response):
    """Parse the list page, skipping already-seen companies, then try to page on."""
    rules = self.extract_dict['inner']
    collected = []
    for row in response.xpath(rules['nodes']):
        entry = NameItem()
        entry['compass_name'] = self.handle_cname(row.xpath(rules['cname']).extract_first())
        entry['detail_link'] = self.handle_cdetail_link(row.xpath(rules['detail_link']).extract_first())
        entry['out_province'] = 'waisheng'
        # Skip companies already fingerprinted in redis.
        if self.redis_tools.check_finger(entry['compass_name']):
            print(u'{}已经爬取过'.format(entry['compass_name']))
            continue
        collected.append(entry)
    yield {'item_contains': collected}
    meta = response.meta
    # The flag node is present only on the last page (presumably a disabled
    # "next" control) — its absence means another page exists. TODO confirm.
    if not response.xpath(rules['next_page_flag']):
        print(u'当前页码:{}'.format(meta['cur_page']))
        yield self.turn_page(response)
    else:
        print(u'不能在翻页了, 当前最大页码:{}'.format(meta['cur_page']))
        return
def parse_list(self, response):
    """Parse one list page using the extraction rule set picked by ``meta['mark']``."""
    meta = response.meta
    sit, mark = meta['sit'], meta['mark']
    rules = self.extract_dict[mark]
    page_url = response.url
    rows = response.xpath(rules['nodes'])
    print('nodes:', len(rows))
    collected = []
    for row in rows:
        entry = NameItem()
        entry['compass_name'] = self.handle_cname(
            row.xpath(rules['cname']).extract_first(), 'inner')
        # Some sites expose no detail page; the rule is then empty/falsy.
        if rules['detail_link']:
            entry['detail_link'] = self.handle_cdetail_link(
                row.xpath(rules['detail_link']).extract_first(), 'inner', page_url)
        else:
            entry['detail_link'] = 'None'
        if isinstance(rules['out_province'], list):
            entry['out_province'] = rules['out_province'][1]
        else:
            entry['out_province'] = 'None'
        if self.redis_tools.check_finger(entry['compass_name']):
            print(u'{}已经抓取过了'.format(entry['compass_name']))
        else:
            collected.append(entry)
    yield {'item_contains': collected}
    yield self.turn_page(response)
def parse_list(self, response):
    """Parse the JSON company list and page until the computed last page.

    BUG FIXES:
    - the cache guard tested ``'total'`` (a key that is never stored)
      instead of ``'total_page_num'``, so the page count was recomputed on
      every page;
    - ``/`` was used for the ceiling division, which yields a float on
      Python 3 — ``//`` keeps the page count an exact integer.
    """
    json_resp = json.loads(response.text)
    item_contains = []
    for unit in json_resp['data']:
        cname, cid, _id, bid, province = unit['corpName'], unit[
            'corpCode'], unit['id'], unit['bid'], unit['areacode']
        detail_link = 'http://218.13.12.85/cxpt/website/enterpriseInfo.jsp?entID={}&eid={}&bid={}'.format(
            cid, _id, bid)
        out_province = self.handle_out_province(province)
        # Skip companies already fingerprinted in redis.
        if self.redis_tools.check_finger(cname):
            print(u'{}已经爬取过'.format(cname))
            continue
        item = NameItem({
            'compass_name': cname,
            'detail_link': detail_link,
            'out_province': out_province
        })
        item_contains.append(item)
    yield {'item_contains': item_contains}
    if 'total_page_num' not in response.meta:
        # Ceiling division: 10 results per page.
        response.meta['total_page_num'] = (int(json_resp['total']) + 9) // 10
    if int(response.meta['pageIndex']) < int(
            response.meta['total_page_num']):
        yield self.turn_page(response)
    else:
        print('不能继续翻页了, 当前最大页码:{}'.format(response.meta['pageIndex']))
        return
def parse_list(self, response):
    """Parse the list page selected by ``meta['mark']`` and keep paging.

    BUG FIX: the error log was opened with the invalid mode ``'wa'``
    (raises ``ValueError`` inside the handler, masking the real error);
    append mode ``'a'`` is what was intended.
    """
    item_contains = []
    url = response.url
    meta = response.meta
    sit, mark = meta['sit'], meta['mark']
    ext_dict = self.extract_dict[mark]
    nodes = response.xpath(ext_dict['nodes'])
    try:
        for node in nodes:
            item = NameItem()
            item['compass_name'] = self.handle_cname(
                node.xpath(ext_dict['cname']).extract_first(), 'inner')
            item['detail_link'] = self.handle_cdetail_link(
                node.xpath(ext_dict['detail_link']).extract_first(),
                'inner', url)
            item['out_province'] = ext_dict['out_province'][
                1] if isinstance(ext_dict['out_province'],
                                 list) else 'None'
            item_contains.append(item)
    except Exception as e:
        # NOTE(review): assumes meta['cur_page_num'] is a str — confirm,
        # otherwise the concatenation itself raises.
        with open(self.log_file, 'a') as fp:
            fp.write(str(e) + meta['cur_page_num'])
    yield {'item_contains': item_contains}
    yield self.turn_page(response)
def parse_list(self, response):
    """Parse the HTML table embedded in the JSON response and page onward.

    Improvement: dropped the unused ``total_rows`` local
    (``nPageRowsCount`` was read but never used).
    """
    json_resp = json.loads(response.text)
    total_page = json_resp['nPageCount']
    cur_page_num = json_resp['nPageIndex']
    html_str = json_resp['tb']
    item_contains = []
    html = etree.HTML(html_str)
    nodes = html.xpath(self.extract_dict['inner']['nodes'])
    for node in nodes:
        item = NameItem()
        item['compass_name'] = node.xpath(
            self.extract_dict['inner']['cname'])[0]
        item['detail_link'] = self.handle_cdetail_link(
            node.xpath(self.extract_dict['inner']['detail_link'])[0])
        # out_province rule appears to be a [xpath, literal] pair; index 1
        # is the fixed province label — TODO confirm against extract_dict.
        item['out_province'] = self.extract_dict['inner']['out_province'][
            1]
        item_contains.append(item)
    yield {'item_contains': item_contains}
    if int(cur_page_num) < int(total_page):
        yield self.turn_page(response)
    else:
        print(u'不能再翻页了,当前页码:', cur_page_num)
        return
def parse_list(self, response):
    """Parse the JSON company list; detail links are built from company ids.

    BUG FIX: the page count used ``/`` (float division on Python 3); the
    ceiling division now uses ``//`` so the count stays an exact integer.
    """
    json_data = json.loads(response.text)
    per_page_rows = 15
    # Ceiling division: total row count ('datax') -> number of pages.
    total_page_num = (json_data['datax'] + per_page_rows - 1) // per_page_rows
    item_contains = []
    for unit in json_data['data']:
        cname, compass_id, out_province = unit['ci_name'], unit[
            'id'], unit['ci_reg_addr']
        detail_link = 'http://218.95.173.11:8092/selectact/query.jspx?resid=IDIXWP2KBO&rowid={}&rows=10'.format(
            compass_id)
        item = NameItem({
            'compass_name': cname,
            'detail_link': detail_link,
            'out_province': out_province
        })
        item_contains.append(item)
    yield {'item_contains': item_contains}
    if int(response.meta['cur_page_num']) < int(total_page_num):
        self.cnt += 1
        print('即将翻%d页' % self.cnt)
        yield self.turn_page(response)
    else:
        print('不能继续翻页了, 当前页码:', response.meta['cur_page_num'])
def parse_list(self, response):
    """Dispatch to in-province or out-of-province extraction based on ``meta['sit']``."""
    collected = []
    situation = response.meta['sit']
    if situation == sit_list[0]:
        rules = self.extract_dict['inner']
        for row in response.xpath(rules['nodes']):
            entry = NameItem()
            entry['compass_name'] = self.handle_cname(
                row.xpath(rules['cname']).extract_first(), 'inner')
            entry['detail_link'] = self.handle_cdetail_link(
                row.xpath(rules['detail_link']).extract_first(), 'inner')
            # Skip rows whose detail link is already fingerprinted.
            if self.redis_tools.check_finger(entry['detail_link']):
                print('{}已经爬取过'.format(entry['detail_link']))
                continue
            if isinstance(rules['out_province'], list):
                entry['out_province'] = rules['out_province'][1]
            else:
                entry['out_province'] = 'None'
            collected.append(entry)
    if situation == sit_list[1]:
        print(u'解析外省....')
        rules = self.extract_dict['outer']
        rows = response.xpath(rules['nodes'])
        print("outer_nodes:", len(rows))
        for row in rows:
            entry = NameItem()
            print(row.xpath(rules['cname']).extract_first())
            entry['compass_name'] = self.handle_cname(
                row.xpath(rules['cname']).extract_first(), 'outer')
            entry['detail_link'] = self.handle_cdetail_link(
                row.xpath(rules['detail_link']).extract_first(), 'outer')
            if self.redis_tools.check_finger(entry['detail_link']):
                print(u'{}已经爬取过'.format(entry['detail_link']))
                continue
            if isinstance(rules['out_province'], list) and len(rules['out_province']) > 1:
                entry['out_province'] = rules['out_province'][1]
            else:
                entry['out_province'] = self.handle_out_province(
                    row.xpath(rules['out_province']).extract_first())
            collected.append(entry)
    yield {'item_contains': collected}
    yield self.turn_page(response)
def parse_list(self, response):
    """Parse the JSON company list; items carry no detail links on this site.

    CONSISTENCY: use ``response.text`` like the sibling spiders in this
    project; Scrapy's ``body_as_unicode()`` is a deprecated alias for the
    same decoded body.
    """
    meta = response.meta
    sit = meta['sit']
    # Province tag depends on whether we are parsing the local list.
    out_province = 'beijing' if sit_list[0] == sit else 'waisheng'
    json_data = json.loads(response.text)['data']
    item_contains = []
    for unit in json_data:
        item = NameItem({
            'compass_name': unit['enterpriseName'],
            'detail_link': 'None',
            'out_province': out_province
        })
        item_contains.append(item)
    yield {'item_contains': item_contains}
    yield self.turn_page(response)
def parse_list1(self, response):
    """Parse the inner list page, de-duplicating on the detail link.

    BUG FIX: the "already crawled" log message contained a typo
    ('已经爬取郭' -> '已经爬取过').
    """
    ext_rules = self.extract_dict['inner']
    nodes = response.xpath(ext_rules['nodes'])
    item_contains = []
    for node in nodes:
        item = NameItem()
        item['compass_name'] = self.handle_cname(
            node.xpath(ext_rules['cname']).extract_first())
        item['detail_link'] = self.handle_cdetail_link(
            node.xpath(ext_rules['detail_link']).extract_first())
        item['out_province'] = 'waisheng'
        # Fingerprint on the detail link; the log prints the company name.
        if self.redis_tools.check_finger(item['detail_link']):
            print(u'{}已经爬取过'.format(item['compass_name']))
            continue
        item_contains.append(item)
    yield {'item_contains': item_contains}
    yield self.turn_page(response)
def parse_list2(self, response):
    """Parse the JSON rows list and advance while pages remain.

    BUG FIX: the page count used ``/`` (float division on Python 3); the
    ceiling division now uses ``//`` so the count stays an exact integer.
    """
    json_data = json.loads(response.body_as_unicode())
    item_contains = []
    for row in json_data['rows']:
        item = NameItem()
        item['compass_name'] = row['cxaa05']
        item['detail_link'] = row['link']
        item['out_province'] = 'waisheng'
        item_contains.append(item)
    yield {'item_contains': item_contains}
    meta = response.meta
    # Ceiling division: 15 rows per page.
    total_page = (json_data['total'] + 14) // 15
    cur_page = meta['cur_page']
    if int(cur_page) >= int(total_page):
        print(u'不能继续翻页了,当前最大页码为:', cur_page)
        return
    yield self.turn_page1(response)
def parse_list(self, response):
    """Collect company names (this site exposes no detail pages) and page forward."""
    meta = response.meta
    rule_key, situation = meta['rule'], meta['sit']
    # Province tag depends on whether this is the local list.
    province_tag = 'chongqing' if sit_list[0] == situation else 'waisheng'
    rules = self.extract_dict[rule_key]
    collected = []
    for row in response.xpath(rules['nodes']):
        entry = NameItem()
        entry['compass_name'] = self.handle_cname(row.xpath(rules['cname']).extract_first())
        entry['detail_link'] = 'None'
        entry['out_province'] = province_tag
        # Skip companies already fingerprinted in redis.
        if self.redis_tools.check_finger(entry['compass_name']):
            print(u'{}已经抓取过'.format(entry['compass_name']))
            continue
        collected.append(entry)
    yield {'item_contains': collected}
    yield self.turn_page(response)
def parse_list(self, response):
    """Parse the JSON-wrapped HTML table and schedule the next page request."""
    situation = response.meta['sit']
    payload = json.loads(response.text)
    table = etree.HTML(payload['tb'])
    collected = []
    for row in table.xpath(self.extract_dict['nodes']):
        entry = NameItem()
        entry['compass_name'] = self.handle_cname(
            row.xpath(self.extract_dict['cname'])[0])
        entry['detail_link'] = self.handle_cdetail_link(
            row.xpath(self.extract_dict['detail_link'])[0])
        if situation == sit_list[0]:
            entry['out_province'] = 'jilin'
        else:
            entry['out_province'] = row.xpath(self.extract_dict['out_province'])[0]
        # Only keep rows whose detail link has not been fingerprinted yet.
        if self.redis_tools.check_finger(entry['detail_link']):
            print('{}已经爬取过'.format(entry['detail_link']))
        else:
            collected.append(entry)
    yield {'item_contains': collected}
    # 翻页 — pagination is driven by counters embedded in the JSON payload.
    total_page = int(payload['nPageCount'])
    cur_page = int(payload['nPageIndex'])
    if int(total_page) > int(cur_page):
        print('翻页....')
        next_page = cur_page + 1
        mpara = 'SnCorpData' if situation == sit_list[0] else 'SwCorpData'
        next_link = 'http://cx.jljsw.gov.cn/handle/NewHandler.ashx?method={}&nPageIndex={}&nPageSize=20'.format(
            mpara, next_page)
        response.meta['cur_page'] = next_page
        yield scrapy.Request(next_link, callback=self.parse_list,
                             meta=response.meta)
    else:
        print('不能继续翻页了,当前页码:', cur_page)