# Common imports assumed by the callbacks below.
import json
import re
import time

import scrapy
from bs4 import BeautifulSoup


def parse_detail(self, response):
    # Heilongjiang detail page: fields are located by positional index,
    # which is fragile if the page layout changes.
    item = crawler114_out()
    content = response.body
    soup = BeautifulSoup(content, "lxml")
    tag_p = soup.find_all('p', attrs={"class": False})
    tag_span = soup.find_all('span', attrs={"lang": "EN-US"})
    case_no = tag_span[3].get_text(strip=True)
    # get_text() already returns str; encoding to bytes before calling
    # replace() with str arguments would raise a TypeError on Python 3.
    ent_name = tag_p[8].get_text(strip=True).replace(':', '')
    reason = tag_p[9].get_text(strip=True).replace(' ', '')
    item['case_no'] = case_no
    item['release_org'] = response.meta['pun_org']
    item['release_date'] = response.meta['pun_date']
    item['entity_name'] = ent_name
    item['release_reason'] = reason
    item['data_id'] = 'hlj'
    item['data_source'] = 'crawler114_3_out'
    item['create_date'] = time.strftime('%Y-%m-%d', time.localtime())
    item['source_url'] = response.url
    item['source_page'] = content
    item['spider_name'] = self.name
    yield item
def parse_detail(self, response):
    # Jilin detail endpoint returns JSON rather than HTML.
    item = crawler114_out()
    data = json.loads(response.body.decode('utf-8'))
    item['create_date'] = time.strftime('%Y-%m-%d', time.localtime())
    item['case_no'] = data['NOTNO']
    ent_name = data['ENTNAME']
    item['ent_name'] = ent_name
    item['release_org'] = data['DECORGNAME']
    release_date = data['ABNTIMEString']
    item['release_date'] = release_date
    item['reg_no'] = data['REGNO']
    pun_reason1 = data['FACTANDRULE']
    pun_reason2 = data['BASISINFO']
    # Compose the release reason: "Upon investigation, your organization,
    # because of <facts>, violated <legal basis>; it is hereby listed in
    # the directory of abnormal business operations."
    item['release_reason'] = (u'经查,你单位因' + pun_reason1 + u',违反了'
                              + pun_reason2 + u'的规定,现决定将其列入经营异常名录。')
    item['data_source'] = self.name
    # NOTE: the built-in hash() is salted per interpreter process on
    # Python 3, so this data_id is not stable across crawl runs (see the
    # hashlib sketch below).
    hashcode = hash(ent_name + release_date)
    item['data_id'] = 'jl' + '-' + str(hashcode)
    item['source_url'] = response.url
    item['spider_name'] = self.name
    item['source_page'] = response.text
    yield item
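# The built-in hash() used above is randomized per interpreter process
# (PYTHONHASHSEED), so ids like 'jl-<hashcode>' change between runs and
# cannot deduplicate records across crawls. A minimal sketch of a stable
# alternative; make_data_id is a hypothetical helper, not part of the
# original spiders.
import hashlib


def make_data_id(prefix, *parts):
    """Derive a deterministic data_id from the record's key fields."""
    digest = hashlib.md5(''.join(parts).encode('utf-8')).hexdigest()
    return prefix + '-' + digest


# e.g. item['data_id'] = make_data_id('jl', ent_name, release_date)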
def parse_detail(self, response):
    # Yunnan detail page: the first <p> holds the case number, the second
    # the reason; fall back to None when the page has fewer paragraphs.
    item = crawler114_out()
    content = response.body
    soup = BeautifulSoup(content, "lxml")
    tag_p = soup.find_all('p')
    if len(tag_p) >= 2:
        case_no = tag_p[0].get_text(strip=True)
        release_reason = tag_p[1].get_text(strip=True)
    else:
        case_no = None
        release_reason = None
    item['case_no'] = case_no
    item['release_org'] = response.meta['release_org']
    item['release_date'] = response.meta['release_date']
    item['entity_name'] = response.meta['ent_name']
    item['release_reason'] = release_reason
    item['data_source'] = 'crawler114_6_out'
    item['create_date'] = time.strftime('%Y-%m-%d', time.localtime())
    item['source_url'] = response.url
    item['source_page'] = content
    item['data_id'] = 'yunnan'
    item['spider_name'] = self.name
    yield item
def parse(self, response): li_list = response.xpath('//table[@class="noticelist-t"]/tr') for li in li_list: item = crawler114_out() title = li.xpath('./td[1]/a/text()').extract_first() href = li.xpath('./td[1]/a/@href').extract_first() href = 'http://gx.gsxt.gov.cn' + href item['release_org'] = li.xpath('./td[2]/text()').extract_first() release_date = li.xpath('./td[3]/text()').extract_first() item['release_date'] = release_date item['ent_name'] = title.replace(u'关于将', '').replace( u'移出经营异常名录的公告', '').replace(u'移出经营异常名录的', '').replace(u'移出经营异常名录', '') hashcode = hash(title + release_date) item['data_id'] = 'gx' + '-' + str(hashcode) yield scrapy.Request(href, callback=self.parse_detail, meta={'item': item}) # 翻页 url = response.url cur_page = url.split('pageNos=')[-1] total_pages = response.xpath( '//div[@class="pages"]/span[2]/text()').extract_first() total_pages = total_pages.replace(u'\xa0', u' ') total_pages = re.findall('.*?(\d+).*?', total_pages)[0] if int(cur_page) < int(total_pages): next_page = int(cur_page) + 1 next_href = 'http://gx.gsxt.gov.cn/xxgg/xxggAction!queryGgxx.dhtml?vchr_bmdm=¬itype=12¬iceTitle=%E8%AF%B7%E8%BE%93%E5%85%A5%E9%9C%80%E8%A6%81%E6%9F%A5%E8%AF%A2%E4%BF%A1%E6%81%AF&pageNos=' + str( next_page) yield scrapy.Request(next_href, callback=self.parse)
def parse(self, response):
    # Qinghai list page: skip the last row (the pager row).
    li_list = response.xpath('//table/tr')[:-1]
    for li in li_list:
        item = crawler114_out()
        title = li.xpath('./td[2]/a/text()').extract_first()
        # Strip the announcement boilerplate, then remove regular spaces,
        # newlines and non-breaking spaces from the name.
        ent_name = title.replace(u'关于', '').replace(u'企业移出经营异常名录公告', '')
        ent_name = (ent_name.replace(' ', '')
                            .replace('\r\n', '')
                            .replace(u'\xa0', ''))
        item['ent_name'] = ent_name
        release_org = li.xpath('./td[@id="A5"]/text()').extract_first()
        item['release_org'] = release_org.replace(' ', '').replace('\r\n', '')
        release_date = li.xpath(
            './td[@class="td4"]/span/text()').extract_first()
        item['release_date'] = release_date
        # Fall back to hashing the name alone when the date is missing.
        if release_date:
            hashcode = hash(ent_name + release_date)
        else:
            hashcode = hash(ent_name)
        item['data_source'] = self.name
        item['data_id'] = 'qh' + '-' + str(hashcode)
        detail_url = ('http://qh.gsxt.gov.cn/'
                      + li.xpath('.//a[@id="A3"]/@href').extract_first())
        yield scrapy.Request(detail_url, callback=self.parse_detail,
                             meta={'item': item})

    # Pagination: total page count from the pager, current page from the URL.
    ret = response.xpath(
        './/div[@class="newfy"]/ul/li[2]/text()').extract_first()
    total_pages = re.findall(r'(\d+)', ret)[0]
    cur_page = response.url.split('pageNo=')[-1].split('&')[0]
    if int(cur_page) < int(total_pages):
        next_page = int(cur_page) + 1
        next_url = ('http://qh.gsxt.gov.cn/ycmlNoticeInfo.jspx?mark=02&pageNo='
                    + str(next_page) + '&order=0&title=&area=')
        yield scrapy.Request(next_url, callback=self.parse)
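# The two parse() methods above repeat the same pagination pattern: read
# the current page from a query parameter, read the total page count from
# the pager text, and request page + 1 until the last page. A minimal
# sketch of a shared helper under that assumption; next_page_request and
# its signature are hypothetical, not part of the original spiders.
def next_page_request(url, page_param, pager_text, url_template, callback):
    """Return a Request for the next page, or None when on the last page."""
    cur_page = int(url.split(page_param + '=')[-1].split('&')[0])
    total_pages = int(re.findall(r'(\d+)', pager_text)[0])
    if cur_page >= total_pages:
        return None
    return scrapy.Request(url_template + str(cur_page + 1), callback=callback)


# e.g. at the end of the Qinghai parse():
#     req = next_page_request(response.url, 'pageNo', ret,
#                             'http://qh.gsxt.gov.cn/ycmlNoticeInfo.jspx'
#                             '?mark=02&order=0&title=&area=&pageNo=',
#                             self.parse)
#     if req:
#         yield req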
def parseDetail(self, response):
    # Anhui list page: iterate the table rows; skip rows without enough
    # cells (e.g. the header row), which would otherwise raise IndexError.
    soup = BeautifulSoup(response.text, 'lxml')
    for each in soup.find_all('tr'):
        tds = each.find_all('td')
        if len(tds) < 4:
            continue
        item = crawler114_out()
        item['release_org'] = tds[2].get_text(strip=True) or ''
        item['release_date'] = tds[3].get_text(strip=True)
        item['spider_name'] = item['data_source'] = self.name
        if tds[1].a:
            item['data_id'] = tds[1].a.attrs['href'].split('id=')[-1]
            item['entity_name'] = (tds[1].get_text(strip=True)
                                   .replace(u'关于', '')
                                   .replace(u'列入经营异常名录公告', ''))
            item['source_url'] = 'http://ah.gsxt.gov.cn' + tds[1].a.attrs['href']
            yield scrapy.Request(item['source_url'], meta={'item': item},
                                 dont_filter=True,
                                 callback=self.parsePageDetail)
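# parsePageDetail is referenced above but not shown. A minimal sketch of
# the counterpart that completes the item handed over via response.meta;
# the field assignments here are illustrative assumptions, not the
# original logic.
def parsePageDetail(self, response):
    item = response.meta['item']
    item['source_page'] = response.body
    item['create_date'] = time.strftime('%Y-%m-%d', time.localtime())
    yield item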
def parse_detail(self, response):
    # Henan detail page: the reason is the second <p> inside div.Section1.
    content = response.body
    soup = BeautifulSoup(content, "lxml")
    item = crawler114_out()
    release_reason = soup.find_all(
        class_='Section1')[0].find_all('p')[1].get_text()
    item['release_reason'] = release_reason
    item['entity_name'] = response.meta['ent_name']
    item['release_org'] = response.meta['release_org']
    item['release_date'] = response.meta['release_date']
    item['source_url'] = response.url
    item['source_page'] = content
    item['create_date'] = time.strftime('%Y-%m-%d', time.localtime())
    item['data_id'] = 'henan'
    item['data_source'] = 'crawler114_5_out'
    item['spider_name'] = self.name
    yield item
def parse_detail(self, response):
    # Shanxi detail page: the case number and reason sit at fixed positions
    # in the document tree; guard the absolute XPaths against layout changes.
    item = crawler114_out()
    content = response.body
    try:
        case_no = response.xpath(
            '/html/body/form/div[1]/div[4]/div/div/div/div[2]'
            '/div/div/p[2]/text()')[0].extract()
    except IndexError:
        case_no = None
    try:
        pun_reason = response.xpath(
            '/html/body/form/div[1]/div[4]/div/div/div/div[2]'
            '/div/div/p[4]/text()')[0].extract()
    except IndexError:
        pun_reason = None
    item['case_no'] = case_no
    item['entity_name'] = response.meta['ent_name']
    item['release_date'] = response.meta['release_date']
    item['release_reason'] = pun_reason
    item['release_org'] = response.meta['release_org']
    item['data_source'] = 'crawler114_8_out'
    item['create_date'] = time.strftime('%Y-%m-%d', time.localtime())
    item['source_url'] = response.url
    item['source_page'] = content
    item['data_id'] = 'shanxi'
    item['spider_name'] = self.name
    yield item
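# All of the callbacks above populate the same crawler114_out item. A
# minimal sketch of the Item definition they assume, reconstructed from
# the fields actually assigned; the real definition may differ.
class crawler114_out(scrapy.Item):
    case_no = scrapy.Field()
    entity_name = scrapy.Field()
    ent_name = scrapy.Field()   # some of the spiders use this name instead
    reg_no = scrapy.Field()
    release_org = scrapy.Field()
    release_date = scrapy.Field()
    release_reason = scrapy.Field()
    data_id = scrapy.Field()
    data_source = scrapy.Field()
    create_date = scrapy.Field()
    source_url = scrapy.Field()
    source_page = scrapy.Field()
    spider_name = scrapy.Field()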