# Module-level imports these spider methods rely on (crawler116 is the
# shared item class defined elsewhere in the project).
import json
import re
import time
from urllib.parse import unquote

import scrapy
from bs4 import BeautifulSoup


def parse_detail(self, response):
    item = crawler116()
    content = response.body
    soup = BeautifulSoup(content, "lxml")
    # Extract the page data according to its fixed layout: labels and
    # values alternate, so values live at odd td indices.
    div = soup.find('div', {'class': 'display_con'})
    tdnodes = div.find_all('td')
    item['data_source'] = '4'
    item['entity_name'] = tdnodes[1].text.strip()
    item['entity_type'] = '2'
    item['case_no'] = tdnodes[19].text.strip()
    item['punish_type1'] = tdnodes[13].text.strip()
    item['punish_type2'] = tdnodes[15].text.strip()
    item['punish_reason'] = tdnodes[25].text.strip()
    item['law_item'] = tdnodes[27].text.strip()
    item['credit_no'] = tdnodes[3].text.strip()
    item['org_code'] = tdnodes[5].text.strip()
    item['reg_no'] = tdnodes[7].text.strip()
    item['tax_no'] = tdnodes[9].text.strip()
    item['identity_card'] = tdnodes[21].text.strip()
    item['legal_man'] = tdnodes[11].text.strip()
    item['punish_result'] = tdnodes[29].text.strip()
    item['punish_date'] = tdnodes[17].text.strip()
    item['punish_agent'] = tdnodes[31].text.strip()
    item['current_status'] = tdnodes[33].text.strip()
    item['area_code'] = tdnodes[39].text.strip()
    item['offical_updtime'] = tdnodes[37].text.strip()
    item['note'] = tdnodes[41].text.strip()
    item['create_date'] = time.strftime('%Y-%m-%d', time.localtime())
    item['source_url'] = response.url
    item['source_page'] = content
    item['spider_name'] = self.name
    yield item
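# The fixed td indices above break silently if the portal reorders its
# rows. A hedged alternative (a sketch, not part of the original spiders):
# pair label cells with value cells and look fields up by label text. The
# helper name and any label strings used with it are our assumptions.
def _td_pairs(tdnodes):
    """Map label cells (even indices) to their value cells (odd indices)."""
    pairs = {}
    for label, value in zip(tdnodes[0::2], tdnodes[1::2]):
        pairs[label.text.strip()] = value.text.strip()
    return pairs
    # usage (hypothetical label): _td_pairs(tdnodes).get('行政相对人名称', '')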
def parsePerson(self, response):
    # Only parse when the anti-crawler pattern did not match (IP not blocked).
    if not self.patt.search(response.text):
        soup = BeautifulSoup(response.text, 'lxml')
        data = soup.find('ul', class_=re.compile(r'new-tab1')).find_all('li')
        item = crawler116()
        item['entity_name'] = data[5].find_all('p')[-1].get_text(strip=True)
        item['identity_card'] = data[6].find_all('p')[-1].get_text(strip=True)
        # item['case_name'] = data[1].find_all('p')[-1].get_text(strip=True)
        item['punish_type1'] = data[2].find_all('p')[-1].get_text(strip=True)
        item['case_no'] = data[0].find_all('p')[-1].get_text(strip=True)
        item['punish_agent'] = data[9].find_all('p')[-1].get_text(strip=True)
        item['law_item'] = data[4].find_all('p')[-1].get_text(strip=True)
        item['punish_result'] = data[7].find_all('p')[-1].get_text(strip=True)
        item['punish_reason'] = data[3].find_all('p')[-1].get_text(strip=True)
        item['notice_id'] = response.url.split('id=')[-1]
        item['punish_date'] = data[8].find_all('p')[-1].get_text(strip=True)
        item['source_url'] = response.url
        item['spider_name'] = self.name
        item['source_page'] = response.text
        # Fields this source does not provide are left blank.
        for field in ('reg_no', 'tax_no', 'legal_man', 'credit_no',
                      'org_code', 'current_status', 'area_code',
                      'offical_updtime', 'note', 'create_date',
                      'update_date', 'punish_type2', 'entity_type',
                      'data_source'):
            item[field] = ''
        yield item
def parseDetail(self, response):
    if not self.patt.search(response.text):
        soup = BeautifulSoup(response.text, 'lxml')
        trs = soup.find('div', id='content').find_all('tr')
        for row in trs[1:]:
            item = crawler116()
            tds = row.find_all('td')
            item['case_no'] = tds[0].get_text(strip=True)
            item['entity_name'] = tds[2].get_text(strip=True)
            item['punish_agent'] = tds[3].get_text(strip=True)
            # Keep the per-row detail link as the source URL.
            item['source_url'] = ('http://www.creditxizang.gov.cn'
                                  + tds[1].a['href'])
            item['spider_name'] = self.name
            item['source_page'] = response.text
            # Fields this list page does not carry are left blank.
            for field in ('notice_id', 'data_source', 'current_status',
                          'area_code', 'offical_updtime', 'note',
                          'create_date', 'update_date', 'punish_type2',
                          'entity_type', 'credit_no', 'org_code', 'reg_no',
                          'tax_no', 'identity_card', 'legal_man',
                          'punish_result', 'punish_date', 'punish_type1',
                          'punish_reason', 'law_item'):
                item[field] = ''
            yield item
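# Several of these spiders blank out the fields their source does not
# carry. A small shared helper would keep those field lists in one place;
# this is a sketch under the assumption that crawler116 behaves like a
# standard scrapy.Item (the helper name is ours, not the project's).
def _blank_fields(item, fields):
    """Set every named field on the item to the empty string."""
    for field in fields:
        item[field] = ''
    return item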
def parse(self, response):
    soup = BeautifulSoup(response.text, 'lxml')
    trs = soup.table.find_all('tr')
    item = crawler116()
    item['case_no'] = trs[0].find_all('td')[-1].get_text(strip=True)
    item['credit_no'] = trs[1].find_all('td')[-1].get_text(strip=True)
    item['punish_type1'] = trs[3].find_all('td')[-1].get_text(strip=True)
    item['punish_reason'] = trs[4].find_all('td')[-1].get_text(strip=True)
    # Strip embedded spaces, tabs, and line breaks from the legal basis.
    item['law_item'] = re.sub(r'[ \t\r\n]', '',
                              trs[5].find_all('td')[-1].get_text(strip=True))
    item['entity_name'] = trs[6].find_all('td')[-1].get_text(strip=True)
    item['legal_man'] = trs[11].find_all('td')[-1].get_text(strip=True)
    item['punish_date'] = trs[12].find_all('td')[-1].get_text(strip=True)
    item['punish_agent'] = trs[13].find_all('td')[-1].get_text(strip=True)
    item['source_url'] = response.url
    item['source_page'] = response.text
    item['spider_name'] = self.name
    yield item
def parseJson(self, response):
    # self.patt matches the block page, e.g. re.compile(r'\d+\.\d+\.\d+\.\d+')
    if not self.patt.search(response.text):
        # The endpoint prefixes its JSON with a <script> block; drop it
        # before parsing.
        r = json.loads(response.text.split('</script>')[-1])
        rows = r['msg']['rows']
        for row in rows:
            item = crawler116()
            item['spider_name'] = self.name
            item['punish_reason'] = row['fullname']
            item['notice_id'] = row['id']
            formdata = {'id': row['id'], 'dataType': '0',
                        'appType': 'APP001'}
            url = 'https://www.cdcredit.gov.cn/homePage/findDoublePubDetail.do'
            yield scrapy.FormRequest(url, formdata=formdata,
                                     meta={'item': item}, dont_filter=True,
                                     callback=self.parsePageDetail)
    else:
        print('=====================ip blocked========================')
        print(self.patt.search(response.text).group(1))
def parseJson(self, response):
    if not self.patt.search(response.text):
        r = json.loads(response.text.split('</script>')[-1])
        lists = r['mess']
        for data in lists:
            item = crawler116()
            item['entity_name'] = data['CF_XDR_MC']
            item['punish_agent'] = data['CF_XZJG']
            item['notice_id'] = data['VAL_ID']
            item['punish_date'] = data['CF_JDRQ']
            item['spider_name'] = self.name
            post_data = {'valId': data['VAL_ID'], 'type': '2'}
            yield scrapy.FormRequest(self.url_detail, formdata=post_data,
                                     meta={'item': item}, dont_filter=True,
                                     callback=self.parsePageDetail)
    else:
        print('=====================ip blocked========================')
        print(self.patt.search(response.text).group(1))
def parseJson(self, response):
    if not self.patt.search(response.text):
        r = json.loads(response.text.split('</script>')[-1])
        data = r['data']
        for each in data:
            item = crawler116()
            item['punish_date'] = each['CF_SXQ']
            item['entity_name'] = each['CF_XDR_MC']
            item['case_no'] = each['CF_WSH']
            item['source_url'] = ('http://xyhn.hainan.gov.cn/JRBWeb/website/'
                                  'SysXymlResourcesEditController.do'
                                  '?reqCode=showxzxkxx&key=%27{}%27&stype=2'
                                  .format(each['uuid']))
            item['punish_agent'] = each['CF_XZJG']
            item['org_code'] = each['QYBM_ID']
            item['notice_id'] = each['uuid']
            item['spider_name'] = self.name
            yield scrapy.Request(item['source_url'], meta={'item': item},
                                 dont_filter=True,
                                 callback=self.parsePageDetail)
    else:
        print('=====================ip blocked========================')
        print(self.patt.search(response.text).group(1))
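# The parseJson variants above share the same guard: a regex (self.patt)
# that matches the block page, and JSON that may arrive behind a <script>
# prefix. A sketch of that shared guard as a method; the helper name and
# the return-None-on-block behaviour are our assumptions, not the
# original code's.
def _load_guarded_json(self, response):
    """Return parsed JSON, or None when the block-page pattern matches."""
    if self.patt.search(response.text):
        print('=====================ip blocked========================')
        print(self.patt.search(response.text).group(1))
        return None
    # Some endpoints prepend a <script> block to the JSON body.
    return json.loads(response.text.split('</script>')[-1])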
def parse_detail(self, response):
    r = json.loads(response.text)
    for data in r['result']['data']['list']:
        item = crawler116()
        item['law_item'] = data.get('cfYj')
        item['punish_result'] = data.get('cfJg')
        item['punish_reason'] = data.get('cfSy')
        item['case_no'] = data.get('cfWsh')
        item['punish_agent'] = data.get('cfXzjg')
        item['punish_type1'] = data.get('cfJg')
        item['punish_date'] = data.get('publishDate')
        # The legal person and credit code travel in the request URL's
        # query string.
        item['legal_man'] = unquote(
            response.url.split('legalPerson=')[-1].split('&')[0])
        item['credit_no'] = unquote(
            response.url.split('tyshxydm=')[-1].split('&')[0])
        item['source_url'] = response.url
        item['spider_name'] = self.name
        item['source_page'] = response.text
        # Fields this source does not provide are left blank.
        for field in ('entity_name', 'org_code', 'reg_no', 'tax_no',
                      'identity_card', 'current_status', 'area_code',
                      'offical_updtime', 'note', 'create_date',
                      'update_date', 'punish_type2', 'entity_type',
                      'data_source', 'notice_id'):
            item[field] = ''
        yield item
def parseJson(self, response):
    if not self.patt.search(response.text):
        r = json.loads(response.text.split('</script>')[-1])
        data = r['list']
        for each in data:
            item = crawler116()
            item['credit_no'] = each['creditCode']
            item['reg_no'] = ''
            item['identity_card'] = each['idCard']
            item['current_status'] = ''
            item['area_code'] = each['areaCode']
            item['offical_updtime'] = each['uploadTime']
            item['note'] = ''
            item['create_date'] = time.strftime('%Y-%m-%d', time.localtime())
            item['update_date'] = ''
            item['punish_type2'] = each['punishTypeTwo']
            item['entity_type'] = each['organizationCode']
            item['data_source'] = response.text
            item['spider_name'] = self.name
            item['tax_no'] = each['taxCode']
            item['punish_date'] = each['punishDate']
            item['entity_name'] = each['personName']
            item['case_no'] = each['punishNumber']
            item['punish_reason'] = each['punishReason']
            item['source_url'] = ('http://www.creditjx.gov.cn/datareporting/'
                                  'doublePublicity/punishDetail/{}'
                                  .format(each['id']))
            item['punish_agent'] = each['punishState']
            item['org_code'] = each['orgId']
            item['notice_id'] = each['id']
            item['punish_type1'] = each['punishTypeOne']
            item['legal_man'] = each['legalName']
            item['punish_result'] = each['punishResult']
            item['law_item'] = each['punishBy']
            item['source_page'] = response.text
            yield item
    else:
        print('=====================ip blocked========================')
        print(self.patt.search(response.text).group(1))
def parseJson(self, response):
    if not self.patt.search(response.text):
        soup = BeautifulSoup(response.text, 'lxml')
        trs = soup.table.find_all('tr')
        for each in trs[1:]:
            item = crawler116()
            tds = each.find_all('td')
            item['spider_name'] = self.name
            item['case_no'] = tds[0].a.attrs['title']
            item['entity_name'] = tds[2].a.attrs['title']
            item['punish_agent'] = tds[3].a.attrs['title']
            item['source_url'] = ('http://www.creditsc.gov.cn'
                                  + tds[1].a.attrs['href'])
            item['notice_id'] = tds[1].a.attrs['href'].split('?')[-1]
            yield scrapy.Request(item['source_url'], meta={'item': item},
                                 dont_filter=True,
                                 callback=self.parsePageDetail)
    else:
        print('=====================ip blocked========================')
        print(self.patt.search(response.text).group(1))
def parse_detail(self, response):
    item = crawler116()
    # response.body is bytes; the str regexes below need the decoded text.
    content_all = response.text
    values = []
    content1 = re.findall(
        '<div class="data">(.*?)<div class="pageFragment_bg_down"></div>',
        content_all, re.S)[0]
    for each in re.findall('<td class="value">(.*?)</td>', content1, re.S):
        values.append(each.replace('\t', '').replace(' ', '')
                          .replace('\r\n', ''))
    # The case number is split across a nested table; collect its cells
    # in the order the page renders them.
    no_field1 = re.findall('<td class="value table-inner">(.*?)</table>',
                           content1, re.S)[0]
    no_field = re.findall('<tr class="value">(.*?)</tr>', no_field1, re.S)[0]
    no1 = re.findall('<td class="left">(.*?)</td>', no_field, re.S)[0] \
        .replace('\t', '').replace(' ', '').replace('\r\n', '')
    values.append(no1)
    no2 = re.findall('<td>(.*?)</td>', no_field, re.S)
    no3 = re.findall('<td class="right">(.*?)</td>', no_field, re.S)[0] \
        .replace('\t', '').replace(' ', '').replace('\r\n', '')
    values.append(no3)
    for each in no2:
        values.append(each.replace('\t', '').replace(' ', '')
                          .replace('\r\n', ''))
    no4 = re.findall('<td >(.*?)</td>', no_field, re.S)[0]
    values.append(no4)
    item['case_no'] = values[0]
    item['punish_type1'] = values[2]
    item['punish_type2'] = values[3]
    item['punish_reason'] = values[4]
    item['law_item'] = values[5]
    item['entity_name'] = values[7]
    item['credit_no'] = values[15]
    item['org_code'] = values[17]
    item['reg_no'] = values[18]
    item['tax_no'] = values[19]
    item['identity_card'] = values[16]
    item['legal_man'] = values[8]
    item['punish_result'] = values[6]
    item['punish_date'] = values[9]
    item['punish_agent'] = values[10]
    item['current_status'] = values[12]
    item['area_code'] = values[11]
    item['offical_updtime'] = values[13]
    item['note'] = values[14]
    item['create_date'] = time.strftime('%Y-%m-%d', time.localtime())
    item['entity_type'] = '2'
    item['data_source'] = '6'
    item['source_url'] = response.url
    item['source_page'] = content_all
    item['spider_name'] = self.name
    yield item
def parse_detail(self, response):
    item = crawler116()
    # response.body is bytes; use the decoded text for the str regexes.
    content = response.text
    content_detail = []
    content_field = re.findall('class="table_normal1">(.*?)</table>',
                               content, re.S)[0]
    for each in re.findall('<td(.*?)</td>', content_field, re.S):
        content_detail.append(
            each.replace('>', '').replace('\t', '').replace('\0', '')
                .replace(' ', '').replace('\r\n', '').replace('¥', ''))
    item['credit_no'] = content_detail[6]
    item['case_no'] = content_detail[0]
    item['punish_type1'] = content_detail[2]
    item['punish_reason'] = content_detail[3]
    item['law_item'] = content_detail[4]
    item['entity_name'] = content_detail[5]
    item['org_code'] = content_detail[7]
    item['reg_no'] = content_detail[8]
    item['tax_no'] = content_detail[9]
    item['identity_card'] = content_detail[10]
    item['legal_man'] = content_detail[11]
    item['punish_result'] = content_detail[12]
    item['punish_date'] = content_detail[13]
    item['punish_agent'] = content_detail[15]
    item['current_status'] = content_detail[16]
    item['area_code'] = content_detail[17]
    item['offical_updtime'] = content_detail[18]
    item['note'] = content_detail[19]
    item['create_date'] = time.strftime('%Y-%m-%d', time.localtime())
    item['update_date'] = ''
    item['punish_type2'] = ''
    item['entity_type'] = '0'
    item['data_source'] = '1'
    item['source_url'] = response.url
    item['source_page'] = content
    item['spider_name'] = self.name
    yield item