Example #1
    def parse_detail(self, response):
        item = crawler116()
        content = response.body
        soup = BeautifulSoup(content, "lxml")

        # The detail table alternates label/value cells, so the values sit at the odd td indices below
        div = soup.find('div', {'class': 'display_con'})

        tdnodes = div.find_all('td')

        data_source = '4'
        entity_name = tdnodes[1].text.strip()
        entity_type = '2'
        case_no = tdnodes[19].text.strip()
        punish_type1 = tdnodes[13].text.strip()
        punish_type2 = tdnodes[15].text.strip()
        punish_reason = tdnodes[25].text.strip()
        law_item = tdnodes[27].text.strip()
        credit_no = tdnodes[3].text.strip()
        org_code = tdnodes[5].text.strip()
        reg_no = tdnodes[7].text.strip()
        tax_no = tdnodes[9].text.strip()
        identity_card = tdnodes[21].text.strip()
        legal_man = tdnodes[11].text.strip()
        punish_result = tdnodes[29].text.strip()
        punish_date = tdnodes[17].text.strip()
        punish_agent = tdnodes[31].text.strip()
        current_status = tdnodes[33].text.strip()
        area_code = tdnodes[39].text.strip()
        offical_updtime = tdnodes[37].text.strip()
        note = tdnodes[41].text.strip()
        create_date = time.strftime('%Y-%m-%d', time.localtime())

        item['data_source'] = data_source
        item['entity_name'] = entity_name
        item['entity_type'] = entity_type
        item['case_no'] = case_no
        item['punish_type1'] = punish_type1
        item['punish_type2'] = punish_type2
        item['punish_reason'] = punish_reason
        item['law_item'] = law_item
        item['credit_no'] = credit_no
        item['org_code'] = org_code
        item['reg_no'] = reg_no
        item['tax_no'] = tax_no
        item['identity_card'] = identity_card
        item['legal_man'] = legal_man
        item['punish_result'] = punish_result
        item['punish_date'] = punish_date
        item['punish_agent'] = punish_agent
        item['current_status'] = current_status
        item['area_code'] = area_code
        item['offical_updtime'] = offical_updtime
        item['note'] = note
        item['create_date'] = create_date
        item['source_url'] = response.url
        item['source_page'] = content
        item['spider_name'] = self.name

        yield item
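
All of the callbacks in these examples populate a `crawler116` item whose definition is not included in the snippets. A minimal sketch of the Item class they appear to assume, listing only the fields actually used above (the field names come from the examples; the class body itself is an assumption):

    import scrapy

    class crawler116(scrapy.Item):
        # Assumed Item definition, reconstructed from the fields the spiders set.
        data_source = scrapy.Field()     # numeric code identifying the source site
        entity_name = scrapy.Field()     # name of the punished party
        entity_type = scrapy.Field()
        case_no = scrapy.Field()         # penalty decision document number
        punish_type1 = scrapy.Field()
        punish_type2 = scrapy.Field()
        punish_reason = scrapy.Field()
        law_item = scrapy.Field()        # legal basis cited for the penalty
        punish_result = scrapy.Field()
        punish_date = scrapy.Field()
        punish_agent = scrapy.Field()    # authority that issued the penalty
        credit_no = scrapy.Field()       # unified social credit code
        org_code = scrapy.Field()
        reg_no = scrapy.Field()
        tax_no = scrapy.Field()
        identity_card = scrapy.Field()
        legal_man = scrapy.Field()       # legal representative
        current_status = scrapy.Field()
        area_code = scrapy.Field()
        offical_updtime = scrapy.Field()
        note = scrapy.Field()
        create_date = scrapy.Field()
        update_date = scrapy.Field()
        notice_id = scrapy.Field()
        source_url = scrapy.Field()
        source_page = scrapy.Field()
        spider_name = scrapy.Field()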
Example #2
    def parsePerson(self, response):
        if not self.patt.search(response.text):
            soup = BeautifulSoup(response.text, 'lxml')
            data = soup.find('ul',
                             class_=re.compile(r'new-tab1')).find_all('li')

            item = crawler116()
            item['entity_name'] = data[5].find_all('p')[-1].get_text(
                strip=True)
            item['identity_card'] = data[6].find_all('p')[-1].get_text(
                strip=True)
            # item['case_name'] = data[1].find_all('p')[-1].get_text(strip=True)
            item['punish_type1'] = data[2].find_all('p')[-1].get_text(
                strip=True)
            item['case_no'] = data[0].find_all('p')[-1].get_text(strip=True)

            item['punish_agent'] = data[9].find_all('p')[-1].get_text(
                strip=True)
            item['law_item'] = data[4].find_all('p')[-1].get_text(strip=True)
            item['punish_result'] = data[7].find_all('p')[-1].get_text(
                strip=True)
            item['punish_reason'] = data[3].find_all('p')[-1].get_text(
                strip=True)
            item['notice_id'] = response.url.split('id=')[-1]
            item['punish_date'] = data[8].find_all('p')[-1].get_text(
                strip=True)
            item['source_url'] = response.url
            item['spider_name'] = self.name
            item['source_page'] = response.text
            item['reg_no'] = ''
            item['tax_no'] = ''
            item['legal_man'] = ''
            item['credit_no'] = ''
            item['org_code'] = ''
            item['current_status'] = ''
            item['area_code'] = ''
            item['offical_updtime'] = ''
            item['note'] = ''
            item['create_date'] = ''
            item['update_date'] = ''
            item['punish_type2'] = ''
            item['entity_type'] = ''
            item['data_source'] = ''
            yield item
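
Several of the callbacks gate their parsing on `not self.patt.search(response.text)`. The pattern is never defined in the snippets; the commented-out regex in later examples and the `.group(1)` calls in the blocked branch suggest it is a compiled regex with one capture group that matches the IP address the site echoes back on its block page. A minimal sketch under that assumption (the spider class and pattern are hypothetical):

    import re

    import scrapy

    class DoublePublicitySpider(scrapy.Spider):
        name = 'double_publicity_example'   # hypothetical spider name
        # Assumed block-page detector: a match means the response is the
        # "ip blocked" page, and group(1) holds the client IP it reports.
        patt = re.compile(r'(\d+\.\d+\.\d+\.\d+)')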
Example #3
 def parseDetail(self, response):
     if not self.patt.search(response.text):
         soup = BeautifulSoup(response.text, 'lxml')
         trs = soup.find('div', id='content').find_all('tr')
         for data in trs[1:]:
             item = crawler116()
             tds = data.find_all('td')
             item['entity_name'] = tds[2].get_text(strip=True)
             # Absolute URL of the detail page linked from the row's second column.
             item['source_url'] = 'http://www.creditxizang.gov.cn' + tds[1].a['href']
             item['case_no'] = tds[0].get_text(strip=True)
             item['punish_agent'] = tds[3].get_text(strip=True)
             item['spider_name'] = self.name
             item['source_page'] = response.text
             item['notice_id'] = ''
             item['data_source'] = ''
             item['current_status'] = ''
             item['area_code'] = ''
             item['offical_updtime'] = ''
             item['note'] = ''
             item['create_date'] = ''
             item['update_date'] = ''
             item['punish_type2'] = ''
             item['entity_type'] = ''
             item['credit_no'] = ''
             item['org_code'] = ''
             item['reg_no'] = ''
             item['tax_no'] = ''
             item['identity_card'] = ''
             item['legal_man'] = ''
             item['punish_result'] = ''
             item['punish_date'] = ''
             item['punish_type1'] = ''
             item['punish_reason'] = ''
             item['law_item'] = ''
             yield item
Example #4
    def parse(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        trs = soup.table.find_all('tr')

        item = crawler116()
        item['case_no'] = trs[0].find_all('td')[-1].get_text(strip=True)
        item['credit_no'] = trs[1].find_all('td')[-1].get_text(strip=True)
        item['punish_type1'] = trs[3].find_all('td')[-1].get_text(strip=True)
        item['punish_reason'] = trs[4].find_all('td')[-1].get_text(strip=True)
        # Strip the whitespace embedded in the legal-basis cell.
        item['law_item'] = (trs[5].find_all('td')[-1].get_text(strip=True)
                            .replace(' ', '').replace('\t', '')
                            .replace('\n', '').replace('\r', ''))
        item['entity_name'] = trs[6].find_all('td')[-1].get_text(strip=True)
        item['legal_man'] = trs[11].find_all('td')[-1].get_text(strip=True)
        item['punish_date'] = trs[12].find_all('td')[-1].get_text(strip=True)
        item['punish_agent'] = trs[13].find_all('td')[-1].get_text(strip=True)
        item['source_url'] = response.url
        item['source_page'] = response.text
        item['spider_name'] = self.name
        yield item
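
Example #4 (and Examples #11 and #12 below) scrub tabs, spaces and line breaks out of extracted cells with long chains of `.replace(...)`. A one-pass alternative that collapses all whitespace with a regex would do the same job; this helper is only a sketch, not something the original code defines:

    import re

    def strip_whitespace(text):
        # Remove every whitespace character (spaces, tabs, CR/LF) from a cell value.
        return re.sub(r'\s+', '', text)

    # e.g. item['law_item'] = strip_whitespace(trs[5].find_all('td')[-1].get_text())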
Example #5
    def parseJson(self, response):
        # patt=re.compile(r'\d+\.\d+\.\d+\.\d+')
        if not self.patt.search(response.text):
            r = json.loads(response.text.split('</script>')[-1])
            rows = r['msg']['rows']
            for data in rows:
                item = crawler116()
                item['spider_name'] = self.name
                item['punish_reason'] = data['fullname']
                item['notice_id'] = data['id']

                post_data = {'id': data['id'], 'dataType': '0', 'appType': 'APP001'}
                url = 'https://www.cdcredit.gov.cn/homePage/findDoublePubDetail.do'
                yield scrapy.FormRequest(url,
                                         formdata=post_data,
                                         meta={'item': item},
                                         dont_filter=True,
                                         callback=self.parsePageDetail)
        else:
            print('=====================ip blocked========================')
            print(self.patt.search(response.text).group(1))
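
Examples #5, #6, #7 and #10 only start the item here and hand it to a `parsePageDetail` callback through `meta={'item': item}`; that callback is not shown in any of the snippets. Presumably it recovers the item from `response.meta` and fills in the remaining fields from the detail page, roughly like this sketch (the per-site field extraction is omitted):

    def parsePageDetail(self, response):
        # Continue the item started in parseJson.
        item = response.meta['item']
        item['source_url'] = response.url
        item['source_page'] = response.text
        # ... extract the remaining detail-page fields here ...
        yield item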
Example #6
    def parseJson(self, response):
        # patt=re.compile(r'\d+\.\d+\.\d+\.\d+')
        if not self.patt.search(response.text):
            r = json.loads(response.text.split('</script>')[-1])
            lists = r['mess']
            for data in lists:
                item = crawler116()
                item['entity_name'] = data['CF_XDR_MC']
                item['punish_agent'] = data['CF_XZJG']
                item['notice_id'] = data['VAL_ID']
                item['punish_date'] = data['CF_JDRQ']
                item['spider_name'] = self.name
                post_data = {'valId': data['VAL_ID'], 'type': '2'}

                yield scrapy.FormRequest(self.url_detail,
                                         formdata=post_data,
                                         meta={'item': item},
                                         dont_filter=True,
                                         callback=self.parsePageDetail)
        else:
            print('=====================ip blocked========================')
            print(self.patt.search(response.text).group(1))
Example #7
 def parseJson(self, response):
     # patt=re.compile(r'\d+\.\d+\.\d+\.\d+')
     if not self.patt.search(response.text):
         r = json.loads(response.text.split('</script>')[-1])
         data = r['data']
         for each in data:
             item = crawler116()
             item['punish_date'] = each['CF_SXQ']
             item['entity_name'] = each['CF_XDR_MC']
             item['case_no'] = each['CF_WSH']
             item['source_url'] = 'http://xyhn.hainan.gov.cn/JRBWeb/website/SysXymlResourcesEditController.do?reqCode=showxzxkxx&key=%27{}%27&stype=2'.format(each['uuid'])
             item['punish_agent'] = each['CF_XZJG']
             item['org_code'] = each['QYBM_ID']
             item['notice_id'] = each['uuid']
             item['spider_name'] = self.name
             yield scrapy.Request(item['source_url'],
                                  meta={'item': item},
                                  dont_filter=True,
                                  callback=self.parsePageDetail)
     else:
         print('=====================ip blocked========================')
         print(self.patt.search(response.text).group(1))
Example #8
 def parse_detail(self, response):
     r = json.loads(response.text)
     for data in r['result']['data']['list']:
         item = crawler116()
         item['law_item'] = data.get('cfYj')
         item['punish_result'] = data.get('cfJg')
         item['punish_reason'] = data.get('cfSy')
         item['case_no'] = data.get('cfWsh')
         item['punish_agent'] = data.get('cfXzjg')
         item['punish_type1'] = data.get('cfJg')
         item['punish_date'] = data.get('publishDate')
         item['legal_man'] = unquote(
             response.url.split('legalPerson=')[-1].split('&')[0])
         item['credit_no'] = unquote(
             response.url.split('tyshxydm=')[-1].split('&')[0])
         item['source_url'] = response.url
         item['spider_name'] = self.name
         item['source_page'] = response.text
         item['entity_name'] = ''
         item['org_code'] = ''
         item['reg_no'] = ''
         item['tax_no'] = ''
         item['identity_card'] = ''
         item['current_status'] = ''
         item['area_code'] = ''
         item['offical_updtime'] = ''
         item['note'] = ''
         item['create_date'] = ''
         item['update_date'] = ''
         item['punish_type2'] = ''
         item['entity_type'] = ''
         item['data_source'] = ''
         item['notice_id'] = ''
         yield item
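
Example #8 recovers `legalPerson` and `tyshxydm` by splitting the request URL and running the pieces through `unquote`, which assumes `from urllib.parse import unquote` at module level. Parsing the query string directly is a sturdier way to do the same thing; this helper is a sketch, not part of the original spider:

    from urllib.parse import parse_qs, urlsplit

    def query_param(url, name, default=''):
        # Return the first (already URL-decoded) value of ``name`` from the query string.
        return parse_qs(urlsplit(url).query).get(name, [default])[0]

    # Equivalent to the split/unquote chains in parse_detail:
    # item['legal_man'] = query_param(response.url, 'legalPerson')
    # item['credit_no'] = query_param(response.url, 'tyshxydm')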
Example #9
 def parseJson(self, response):
     if not self.patt.search(response.text):
         r = json.loads(response.text.split('</script>')[-1])
         data = r['list']
         for each in data:
             item = crawler116()
             item['credit_no'] = each['creditCode']
             item['reg_no'] = ''
             item['identity_card'] = each['idCard']
             item['current_status'] = ''
             item['area_code'] = each['areaCode']
             item['offical_updtime'] = each['uploadTime']
             item['note'] = ''
             item['create_date'] = time.strftime('%Y-%m-%d', time.localtime())
             item['update_date'] = ''
             item['punish_type2'] = each['punishTypeTwo']
             item['entity_type'] = each['organizationCode']
             item['data_source'] = response.text
             item['spider_name'] = self.name
             item['tax_no'] = each['taxCode']
             item['punish_date'] = each['punishDate']
             item['entity_name'] = each['personName']
             item['case_no'] = each['punishNumber']
             item['punish_reason'] = each['punishReason']
             item['source_url'] = 'http://www.creditjx.gov.cn/datareporting/doublePublicity/punishDetail/{}'.format(each['id'])
             item['punish_agent'] = each['punishState']
             item['org_code'] = each['orgId']
             item['notice_id'] = each['id']
             item['punish_type1'] = each['punishTypeOne']
             item['legal_man'] = each['legalName']
             item['punish_result'] = each['punishResult']
             item['law_item'] = each['punishBy']
             item['source_page'] = response.text
             yield item
     else:
         print('=====================ip blocked========================')
         print(self.patt.search(response.text).group(1))
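
Example #9 copies every JSON key to its item field by hand. The same mapping can be written declaratively, which makes it easier to see which keys feed which fields; the dictionary below only restates the assignments already present in the example (the constant and computed fields would still be set separately):

    FIELD_MAP = {
        'credit_no': 'creditCode',
        'identity_card': 'idCard',
        'area_code': 'areaCode',
        'offical_updtime': 'uploadTime',
        'punish_type2': 'punishTypeTwo',
        'entity_type': 'organizationCode',
        'tax_no': 'taxCode',
        'punish_date': 'punishDate',
        'entity_name': 'personName',
        'case_no': 'punishNumber',
        'punish_reason': 'punishReason',
        'punish_agent': 'punishState',
        'org_code': 'orgId',
        'notice_id': 'id',
        'punish_type1': 'punishTypeOne',
        'legal_man': 'legalName',
        'punish_result': 'punishResult',
        'law_item': 'punishBy',
    }

    for field, key in FIELD_MAP.items():
        item[field] = each.get(key, '')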
Example #10
    def parseJson(self, response):
        # patt=re.compile(r'\d+\.\d+\.\d+\.\d+')
        if not self.patt.search(response.text):
            soup = BeautifulSoup(response.text, 'lxml')
            trs = soup.table.find_all('tr')
            for each in trs[1:]:
                item = crawler116()
                item['spider_name'] = self.name
                item['case_no'] = each.find_all('td')[0].a.attrs['title']
                item['entity_name'] = each.find_all('td')[2].a.attrs['title']
                item['punish_agent'] = each.find_all('td')[3].a.attrs['title']
                href = each.find_all('td')[1].a.attrs['href']
                item['source_url'] = 'http://www.creditsc.gov.cn' + href
                item['notice_id'] = href.split('?')[-1]

                yield scrapy.Request(item['source_url'],
                                     meta={'item': item},
                                     dont_filter=True,
                                     callback=self.parsePageDetail)
        else:
            print('=====================ip blocked========================')
            print(self.patt.search(response.text).group(1))
Example #11
    def parse_detail(self, response):
        item = crawler116()
        # Use the decoded page text so the str regexes below work under Python 3.
        content_all = response.text
        values = []
        content1 = re.findall('<div class="data">(.*?)<div class="pageFragment_bg_down"></div>', content_all, re.S)[0]
        content = re.findall('<td class="value">(.*?)</td>', content1, re.S)
        for each in content:
            values.append(each.replace('\t', '').replace(' ', '').replace('\r\n', ''))
        # The nested inner table contributes a few more cells, collected in the same way.
        no_field1 = re.findall('<td class="value table-inner">(.*?)</table>', content1, re.S)[0]
        no_field = re.findall('<tr class="value">(.*?)</tr>', no_field1, re.S)[0]
        no1 = re.findall('<td class="left">(.*?)</td>', no_field, re.S)[0].replace('\t', '').replace(' ', '').replace('\r\n', '')
        values.append(no1)
        no2 = re.findall('<td>(.*?)</td>', no_field, re.S)
        no3 = re.findall('<td class="right">(.*?)</td>', no_field, re.S)[0].replace('\t', '').replace(' ', '').replace('\r\n', '')
        values.append(no3)
        for each in no2:
            values.append(each.replace('\t', '').replace(' ', '').replace('\r\n', ''))
        no4 = re.findall('<td >(.*?)</td>', no_field, re.S)[0]
        values.append(no4)

        create_date = time.strftime('%Y-%m-%d', time.localtime())

        # Map the collected cell values onto the item fields by position.
        case_no = values[0]
        punish_type1 = values[2]
        punish_type2 = values[3]
        punish_reason = values[4]
        law_item = values[5]
        entity_name = values[7]
        credit_no = values[15]
        org_code = values[17]
        reg_no = values[18]
        tax_no = values[19]
        identity_card = values[16]
        legal_man = values[8]
        punish_result = values[6]
        punish_date = values[9]
        punish_agent = values[10]
        current_status = values[12]
        area_code = values[11]
        offical_updtime = values[13]
        note = values[14]
        update_date = ''
        entity_type = '2'
        data_source = '6'

        item['case_no'] = case_no
        item['punish_type1'] = punish_type1
        item['punish_type2'] = punish_type2
        item['punish_reason'] = punish_reason
        item['law_item'] = law_item
        item['entity_name'] = entity_name
        item['credit_no'] = credit_no
        item['org_code'] = org_code
        item['reg_no'] = reg_no
        item['tax_no'] = tax_no
        item['identity_card'] = identity_card
        item['legal_man'] = legal_man
        item['punish_result'] = punish_result
        item['punish_date'] = punish_date
        item['punish_agent'] = punish_agent
        item['current_status'] = current_status
        item['area_code'] = area_code
        item['offical_updtime'] = offical_updtime
        item['note'] = note
        item['create_date'] = create_date
        # item['update_date'] = update_date
        item['entity_type'] = entity_type
        item['data_source'] = data_source
        item['source_url'] = response.url
        item['source_page'] = content_all
        item['spider_name'] = self.name


        yield item
Example #12
    def parse_detail(self, response):
        item = crawler116()
        # Use the decoded page text so the str regexes below work under Python 3.
        content = response.text
        content_detail = []
        content_field = re.findall('class="table_normal1">(.*?)</table>', content, re.S)[0]
        content1 = re.findall('<td(.*?)</td>', content_field, re.S)
        for each in content1:
            # Drop the leftover '>' from the td tag, plus whitespace, NULs and the HTML yen entity.
            each = (each.replace('>', '').replace('\t', '').replace('\0', '')
                    .replace(' ', '').replace('\r\n', '').replace('&yen;', ''))
            content_detail.append(each)

        create_date = time.strftime('%Y-%m-%d', time.localtime())
        credit_no = content_detail[6]
        case_no = content_detail[0]
        punish_type1 = content_detail[2]
        punish_reason = content_detail[3]
        law_item = content_detail[4]
        entity_name = content_detail[5]
        org_code = content_detail[7]
        reg_no = content_detail[8]
        tax_no = content_detail[9]
        identity_card = content_detail[10]
        legal_man = content_detail[11]
        punish_result = content_detail[12]
        punish_date = content_detail[13]
        punish_agent = content_detail[15]
        current_status = content_detail[16]
        area_code = content_detail[17]
        offical_updtime = content_detail[18]
        note = content_detail[19]
        update_date = ''
        punish_type2 = ''
        entity_type = '0'
        data_source = '1'

        item['credit_no'] = credit_no
        item['case_no'] = case_no
        item['punish_type1'] = punish_type1
        item['punish_reason'] = punish_reason
        item['law_item'] = law_item
        item['entity_name'] = entity_name
        item['org_code'] = org_code
        item['reg_no'] = reg_no
        item['tax_no'] = tax_no
        item['identity_card'] = identity_card
        item['legal_man'] = legal_man
        item['punish_result'] = punish_result
        item['punish_date'] = punish_date
        item['punish_agent'] = punish_agent
        item['current_status'] = current_status
        item['area_code'] = area_code
        item['offical_updtime'] = offical_updtime
        item['note'] = note
        item['create_date'] = create_date
        item['update_date'] = update_date
        item['punish_type2'] = punish_type2
        item['entity_type'] = entity_type
        item['data_source'] = data_source
        item['source_url'] = response.url
        item['source_page'] = content
        item['spider_name'] = self.name

        yield item
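
Taken together, the snippets rely on a module header along these lines; the import path for the shared item class is hypothetical:

    import json
    import re
    import time
    from urllib.parse import unquote

    import scrapy
    from bs4 import BeautifulSoup

    from ..items import crawler116  # hypothetical location of the shared Item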