Example #1
0
 def parse_detail(self, response):
     """
     Parse a JSON detail response and yield one assembled item.

     The HTML fragment stored under ``contentViewsMap -> rightData1`` of the
     JSON body is parsed with ``etree.HTML``. Every ``<tr>`` after the first
     is read as alternating title/value ``<td>`` cells; the resulting dict
     is passed through ``map_field`` and ``self.convert_time`` and stored as
     the item's ``_parsed_data``.

     :param response: response exposing ``url`` and ``text`` (a JSON string)
     :return: generator yielding the assembled item; nothing is yielded when
         the HTML fragment is missing or when any step raises (the error is
         logged as a warning instead of being propagated)
     """
     try:
         self.logger1.info('start to parse {}'.format(response.url))
         data = json.loads(response.text)
         # Fall back to an empty dict / string (also for JSON ``null``) so a
         # missing "contentViewsMap" no longer ends in ``"".get(...)``
         # raising AttributeError.
         html = (data.get("contentViewsMap") or {}).get("rightData1") or ""
         if not html:
             self.logger1.warning(
                 'no detail html found in {}'.format(response.url))
             return
         tree = etree.HTML(html)
         title_list = []
         value_list = []
         # The first <tr> is skipped; in the remaining rows even-positioned
         # cells are titles and odd-positioned cells are their values.
         for tr in tree.xpath(".//tr")[1:]:
             cells = [''.join(td.xpath("string()")).strip()
                      for td in tr.xpath(".//td")]
             title_list += cells[::2]
             value_list += cells[1::2]
         data_dict = dict(zip(title_list, value_list))
         item = self.result_item_assembler(response)
         item['_id'] = calc_str_md5(response.url)
         item['bbd_html'] = ''
         item['_parsed_data'] = self.convert_time(map_field(data_dict))
         yield item
         self.logger1.info('{} save successfully'.format(response.url))
     except Exception as e:
         # Best-effort parsing: log the failure and yield nothing.
         self.logger1.warning("Exception on save detail page {} {} {}".format(
             response.url, traceback.format_exc(), e))
Example #2
0
 def parse_detail(self, response):
     """
     Parse an HTML detail page built from ``<th>``/``<td>`` rows.

     Each ``<tr>`` of the detail table contributes one entry: the ``<th>``
     text (colons removed) is the title and the ``<td>`` text the value,
     both passed through ``clean_all_space``. The dict then goes through
     ``map_field`` and ``self.convert_time``. For spiders whose name
     contains ``xzcf`` a ``license_status`` entry is renamed to
     ``punish_status``.

     :param response: response exposing ``url`` and ``xpath``
     :return: generator yielding the assembled item, or a new request for
         the same URL when the table has no rows; nothing is yielded when
         a step raises (the error is logged as a warning)
     """
     try:
         self.logger1.info('start to parse {}'.format(response.url))
         tr_list = response.xpath("//div[@class='fl ml20 mb10 mt10 f_yh']//table//tr")
         if not tr_list:
             self.logger1.info('retry {}'.format(response.url))
             # NOTE(review): this re-requests the identical URL without a
             # retry counter or dont_filter -- confirm the scheduler's
             # duplicate filter does not drop it and that it cannot loop.
             yield Request(response.url, callback=self.parse_detail, errback=self.error_parse)
             return

         data_dict = {}
         for tr in tr_list:
             raw_title = ''.join(tr.xpath(".//th").xpath("string(.)").extract())
             # Remove both the ASCII colon and the full-width colon (U+FF1A);
             # the previous code applied the same ASCII replace twice, which
             # left full-width colons in the titles.
             title = clean_all_space(
                 raw_title.replace(":", "").replace(u"\uff1a", ""))
             value = clean_all_space(''.join(tr.xpath(".//td").xpath("string(.)").extract()))
             data_dict[title] = value

         item = self.result_item_assembler(response)
         item['_id'] = calc_str_md5(response.url)
         item['bbd_html'] = ''
         res_dict = self.convert_time(map_field(data_dict))
         if "xzcf" in self.name and "license_status" in res_dict:
             res_dict["punish_status"] = res_dict.pop("license_status")
         item['_parsed_data'] = res_dict
         yield item
         self.logger1.info('{} save successfully'.format(response.url))
     except Exception as e:
         # Best-effort parsing: log the failure and yield nothing.
         self.logger1.warning("Exception on save detail page {} {} {}".format(
             response.url, traceback.format_exc(), e))
Example #3
0
 def parse_detail(self, response):
     """
     Build one item from the rows of the ``table_normal1`` table.

     For every ``<tr>`` the stripped ``<th>`` text becomes the key and the
     stripped ``<td>`` text the value. The dict is passed through
     ``map_field`` and ``self.convert_time``; when the spider name contains
     ``xzcf`` a ``license_status`` entry is moved to ``punish_status``.

     :param response: response exposing ``url`` and ``xpath``
     :return: generator yielding the assembled item; nothing is yielded
         when a step raises (the error is logged as a warning)
     """
     try:
         self.logger1.info('start to parse {}'.format(response.url))
         fields = {}
         for row in response.xpath("//table[@class='table_normal1']//tr"):
             header_parts = row.xpath(".//th").xpath("string()").extract()
             cell_parts = row.xpath(".//td").xpath("string()").extract()
             fields[''.join(header_parts).strip()] = ''.join(cell_parts).strip()

         record = self.result_item_assembler(response)
         record['_id'] = calc_str_md5(response.url)
         record['bbd_html'] = ''

         parsed = self.convert_time(map_field(fields))
         if "xzcf" in self.name:
             if "license_status" in parsed:
                 # Punishment spiders store the status under another key.
                 parsed["punish_status"] = parsed.pop("license_status", "")
         record['_parsed_data'] = parsed

         yield record
         self.logger1.info('{} save successfully'.format(response.url))
     except Exception as e:
         failure = "Exception on save detail page {} {} {}".format(
             response.url, traceback.format_exc(), e)
         self.logger1.warning(failure)
Example #4
0
 def parse_detail(self, response):
     """
     Parse a detail page whose fields are ``<li>`` entries of ``title:value``.

     For every ``<li>`` under the ``warp`` container the text before the
     first colon is the title, and the stripped text of its ``<span>``
     descendants is the value. The dict is passed through ``map_field`` and
     ``self.convert_time`` and stored as the item's ``_parsed_data``.

     :param response: response exposing ``url`` and ``xpath``
     :return: generator yielding the assembled item; nothing is yielded
         when a step raises (the error is logged as a warning)
     """
     try:
         self.logger1.info('start to parse {}'.format(response.url))
         li_list = response.xpath("//div[@class='warp']//ul//li")
         data_dict = {}
         for li in li_list:
             li_text = ''.join(li.xpath("string()").extract())
             # Normalise the full-width colon (U+FF1A) to ASCII before
             # splitting; the previous replace(":", ":") was a no-op, so
             # such labels were never separated from their value.
             data_str = clean_all_space(li_text.replace(u"\uff1a", ":"))
             title = data_str.split(":", 1)[0]
             value = ''.join(
                 li.xpath(".//span").xpath("string()").extract()).strip()
             data_dict[title] = value
         item = self.result_item_assembler(response)
         item['_id'] = calc_str_md5(response.url)
         item['bbd_html'] = ''
         item['_parsed_data'] = self.convert_time(map_field(data_dict))
         yield item
         self.logger1.info('{} save successfully'.format(response.url))
     except Exception as e:
         # Best-effort parsing: log the failure and yield nothing.
         self.logger1.warning(
             "Exception on save detail page {} {} {}".format(
                 response.url, traceback.format_exc(), e))
Example #5
0
 def handle_result(self, response, result_dict):
     """
     Assemble the item for ``response`` from an already extracted dict.

     :param response: response the data was extracted from
     :param result_dict: extracted fields, passed through ``map_field``
     :return: the item with ``_parsed_data``, ``_id`` (``calc_str_md5`` of
         the URL) and empty ``bbd_html`` / ``bbd_params``
     """
     record = self.result_item_assembler(response)
     record['_parsed_data'] = map_field(result_dict)
     record['_id'] = calc_str_md5(response.url)
     # These two fields are always stored as empty strings.
     for blank_field in ('bbd_html', 'bbd_params'):
         record[blank_field] = ''
     return record
Example #6
0
 def parse_detail(self, response):
     """
     Parse a detail table whose rows alternate title and value cells.

     URLs containing ``error`` are ignored. All rows after the first of the
     ``display_con`` table are read; cells at even positions are titles and
     cells at odd positions are values, each passed through
     ``clean_all_space``. The dict goes through ``map_field``; when
     ``self.bbd_table`` contains ``xzcf`` the ``license_status`` entry
     (``''`` if absent) is moved to ``punish_status`` before
     ``self.convert_time`` is applied.

     :param response: response exposing ``url`` and ``xpath``
     :return: generator yielding the assembled item, or a new request for
         the same URL when no rows were found; nothing is yielded when a
         step raises (the error is logged as a warning)
     """
     try:
         if "error" in response.url:
             return
         self.logger1.info('start to parse {}'.format(response.url))
         rows = response.xpath("//div[@class='display_con']//table//tr[position()>1]")
         if not rows:
             self.logger1.info('retry {}'.format(response.url))
             yield Request(response.url, callback=self.parse_detail, errback=self.error_parse)
             return

         titles = []
         values = []
         for row in rows:
             cells = row.xpath(".//td")
             titles.extend(
                 clean_all_space(''.join(cell.xpath("string(.)").extract()))
                 for cell in cells[::2])
             values.extend(
                 clean_all_space(''.join(cell.xpath("string(.)").extract()))
                 for cell in cells[1::2])
         fields = dict(zip(titles, values))

         record = self.result_item_assembler(response)
         record['_id'] = calc_str_md5(response.url)
         record['bbd_html'] = ''

         mapped = map_field(fields)
         if "xzcf" in self.bbd_table:
             # Punishment tables store the status under another key.
             mapped["punish_status"] = mapped.pop("license_status", "")
         record['_parsed_data'] = self.convert_time(mapped)

         yield record
         self.logger1.info('{} save successfully'.format(response.url))
     except Exception as e:
         failure = "Exception on save detail page {} {} {}".format(
             response.url, traceback.format_exc(), e)
         self.logger1.warning(failure)
Example #7
0
    def parse_detail(self, response):
        """
        Parse the ``infor`` detail table and yield one assembled item.

        Each ``<tr>`` contributes one entry: the text of its ``name`` cell
        (colons removed) is the title and the text of its last cell is the
        value. Repeated punishment-category titles are stored under
        numbered keys so the second one does not overwrite the first. The
        current-status entry is moved to the key named by
        ``self.data_status``. After ``map_field``, every value whose key
        contains ``date`` is normalised by ``_normalize_date_text``.

        :param response: response exposing ``url`` and ``xpath``
        :return: generator yielding the assembled item, or a new request
            for the same URL when the table has no rows; nothing is yielded
            when a step raises (the error is logged as a warning)
        """
        try:
            self.logger1.info('start to parse {}'.format(response.url))
            tr_list = response.xpath("//table[@class='infor']//tr")
            if not tr_list:
                self.logger1.info("retry {}".format(response.url))
                # NOTE(review): this re-requests the identical URL without
                # a retry counter or dont_filter -- confirm the scheduler's
                # duplicate filter does not drop it and that it cannot loop.
                yield Request(response.url,
                              callback=self.parse_detail,
                              errback=self.error_parse)
                return

            data_dict = {}
            for tr in tr_list:
                # Remove both the ASCII colon and the full-width colon
                # (U+FF1A); the previous code applied the same ASCII
                # replace twice.
                title = ''.join(
                    tr.xpath(".//td[@class='name']/text()").extract()
                ).strip().replace(':', '').replace(u'\uff1a', '').strip()
                value = ''.join(
                    tr.xpath(".//td[last()]").xpath(
                        "string()").extract()).strip()
                if title in ['处罚类别', '处罚类型']:
                    # Number the category rows so a second one is kept.
                    if '处罚类别1' in data_dict:
                        title = '处罚类别2'
                    else:
                        title = '处罚类别1'
                data_dict[title] = value
            data_dict[self.data_status] = data_dict.pop('当前状态', '')

            result = {}
            result.update(map_field(data_dict))
            # Iterate over a snapshot of the keys while values are replaced.
            for key in list(result):
                if 'date' in key:
                    result[key] = self._normalize_date_text(result[key])
            item = self.result_item_assembler(response)
            item['_id'] = calc_str_md5(response.url)
            item['bbd_html'] = ''
            item['_parsed_data'] = result
            yield item
            self.logger1.info('{} save successfully'.format(response.url))

        except Exception as e:
            # Best-effort parsing: log the failure and yield nothing.
            self.logger1.warning(
                "Exception on save detail page {} {} {}".format(
                    response.url, traceback.format_exc(), e))

    @staticmethod
    def _normalize_date_text(value):
        """
        Normalise a scraped date string to ``Y-M-D HH:MM:SS`` text.

        :param value: raw date text
        :return: ``''`` for empty or whitespace-only input; a compact
            all-digit value split as 4/2/rest digits with ``00:00:00``
            appended; a value with a time part keeps that time; any other
            value has its digit groups joined with ``-`` and ``00:00:00``
            appended
        """
        text = value.strip()
        if not text:
            return ''
        if text.isdigit():
            # Slice the stripped text; slicing the raw value shifted the
            # digits whenever it carried leading whitespace.
            return '{}-{}-{} 00:00:00'.format(text[:4], text[4:6], text[6:])
        if ':' in text:
            parts = text.split(None, 1)
            if len(parts) < 2:
                # A time marker without a separate date part cannot be
                # split reliably; keep the text instead of raising.
                return text
            return '-'.join(re.findall(r'(\d+)', parts[0])) + ' {}'.format(
                parts[1])
        return '-'.join(re.findall(r'(\d+)', text)) + ' 00:00:00'
Example #8
0
 def parse_detail(self, response):
     """
     Parse a JSON detail response (a list whose first element is the record).

     When the URL contains ``xkid=`` the record is mapped to the license
     fields, otherwise to the punishment fields. Date fields have ``/``
     replaced by ``-`` and ``00:00:00`` appended; a missing date stays an
     empty string. When the record is missing or empty the same URL is
     requested again, up to a ``count`` of 10 carried in ``response.meta``,
     after which the page is discarded.

     :param response: response exposing ``url``, ``body`` and ``meta``
     :return: generator yielding the assembled item or a retry request;
         nothing is yielded when a step raises (the error is logged as a
         warning)
     """
     try:
         self.logger1.info('start to parse {}'.format(response.url))
         records = json.loads(response.body.decode())
         # An empty list used to raise IndexError here, which skipped the
         # bounded retry below; treat it like an empty record instead.
         detail = records[0] if records else None
         if not detail:
             # A missing counter means this is the first attempt.
             count = response.meta.get('count', 0) + 1
             if count < 10:
                 self.logger1.info(
                     'The page has no content,try!  {}'.format(
                         response.url))
                 # dont_filter: the URL is identical to the one already
                 # requested, so the duplicate filter would otherwise drop
                 # the retry; the counter keeps it bounded.
                 yield Request(response.url,
                               callback=self.parse_detail,
                               errback=self.error_parse,
                               priority=6,
                               meta={'count': count},
                               dont_filter=True)
             else:
                 self.logger1.info(
                     'The page has no content,discard!  {}'.format(
                         response.url))
             return

         def slash_date(field):
             # Missing or null dates stay '' instead of ' 00:00:00'.
             raw = detail.get(field) or ''
             return raw.replace('/', '-') + ' 00:00:00' if raw else ''

         result = {}
         if 'xkid=' in response.url:
             result['license_code'] = detail.get('xkwsh', '')
             result['case_name'] = detail.get('xmmc', '')
             result['approval_category'] = detail.get('splb', '')
             result['license_content'] = detail.get('xknr', '')
             result['company_name'] = detail.get('xzxdr', '')
             result['license_start_date'] = slash_date('xkjdrq')
             result['license_end_date'] = slash_date('xkjzrq')
             result['license_org'] = detail.get('xkjg', '')
         else:
             result['punish_code'] = detail.get('cfwsh', '')
             result['case_name'] = detail.get('cfmc', '')
             result['punish_category_one'] = detail.get('cflb', '')
             result['punish_type'] = detail.get('cfsy', '')
             result['punish_basis'] = detail.get('cfyj', '')
             result['company_name'] = detail.get('xzxdr', '')
             result['public_date'] = slash_date('cfjdrq')
             result['punish_org'] = detail.get('cfjguan', '')
         item = self.result_item_assembler(response)
         item['_id'] = calc_str_md5(response.url)
         item['bbd_html'] = ''
         item['_parsed_data'] = result
         yield item
         self.logger1.info('{} save successfully'.format(response.url))
     except Exception as e:
         # Best-effort parsing: log the failure and yield nothing.
         self.logger1.warning(
             "Exception on save detail page {} {} {}".format(
                 response.url, traceback.format_exc(), e))
Example #9
0
 def handle_result(self, response, result_dict, info_id):
     """
     Assemble the item for one record identified by ``info_id``.

     :param response: response the record was extracted from
     :param result_dict: parsed fields, stored unchanged as ``_parsed_data``
     :param info_id: record identifier; appended to the URL as ``InfoID``
         and hashed with ``calc_str_md5`` to form ``_id``
     :return: the assembled item
     """
     query = '?CategoryNum=014013&InfoID={}'.format(info_id)
     record = self.result_item_assembler(response)
     record['_parsed_data'] = result_dict
     record['bbd_html'] = ''
     record['bbd_url'] = response.url + query
     record['_id'] = calc_str_md5(info_id)
     return record
Example #10
0
 def handle_result(self, response, result_dict):
     """
     Build a ``ParsedItem`` of type ``credit_bj`` from an extracted dict.

     :param response: response the data was extracted from
     :param result_dict: extracted fields, passed through ``map_field``
     :return: the populated item, including a ``rowkey`` generated from
         ``do_time`` and ``bbd_type``
     """
     parsed_item = ParsedItem()
     self.common_item_assembler(response, parsed_item)

     parsed_item['_parsed_data'] = map_field(result_dict)
     parsed_item['_id'] = calc_str_md5(response.url)
     parsed_item['bbd_html'] = ''
     parsed_item['bbd_type'] = "credit_bj"

     # The row key is generated before ``bbd_params`` is set, as before.
     row_key = gen_rowkey(parsed_item, keys=('do_time', 'bbd_type'))
     parsed_item['rowkey'] = row_key
     parsed_item['bbd_params'] = ''
     return parsed_item
Example #11
0
    def handle_result(self, response, result_dict):
        """
        Assemble the item for ``response`` from an extracted dict.

        The dict is passed through ``map_field``; when the spider name
        contains ``xzcf`` a ``license_status`` entry is moved to
        ``punish_status``.

        :param response: response the data was extracted from
        :param result_dict: extracted fields
        :return: the item with ``_parsed_data``, ``_id`` and empty
            ``bbd_html`` / ``bbd_params``
        """
        mapped = map_field(result_dict)
        if 'xzcf' in self.name:
            if 'license_status' in mapped:
                status = mapped.pop('license_status', '')
                mapped['punish_status'] = status

        record = self.result_item_assembler(response)
        record['_parsed_data'] = mapped
        record['_id'] = calc_str_md5(response.url)
        for blank_field in ('bbd_html', 'bbd_params'):
            record[blank_field] = ''
        return record
Example #12
0
    def parse_detail(self, response):
        """
        Parse the ``table table-hover`` detail table and yield one item.

        Each ``<tr>`` contributes one entry: stripped ``<th>`` text as the
        title and stripped ``<td>`` text as the value. Repeated
        punishment-category titles are stored under numbered keys so the
        second one does not overwrite the first. After ``map_field``, every
        value whose key contains ``date`` is normalised by
        ``_normalize_date_text``. When the result has ``punish_basis`` but
        no ``punish_code``, ``license_code`` is moved to ``punish_code``.
        When the table has no rows the same URL is requested again, up to
        a ``count`` of 10 carried in ``response.meta``.

        :param response: response exposing ``url``, ``xpath`` and ``meta``
        :return: generator yielding the assembled item or a retry request;
            nothing is yielded when a step raises (the error is logged as
            a warning)
        """
        try:
            self.logger1.info('start to parse {}'.format(response.url))
            tr_list = response.xpath("//table[@class='table table-hover']//tr")
            if not tr_list:
                # A missing counter means this is the first attempt; the
                # previous meta['count'] lookup raised KeyError instead.
                count = response.meta.get('count', 0) + 1
                if count < 10:
                    self.logger1.info('The page has no content,try!  {}'.format(response.url))
                    # dont_filter: the URL is identical to the one already
                    # requested, so the duplicate filter would otherwise
                    # drop the retry; the counter keeps it bounded.
                    yield Request(response.url, callback=self.parse_detail, errback=self.error_parse,
                                  priority=6, meta={'count': count}, dont_filter=True)
                else:
                    self.logger1.info('The page has no content,discard!  {}'.format(response.url))
                return

            data_dict = {}
            for tr in tr_list:
                title = ''.join(tr.xpath(".//th").xpath("string()").extract()).strip()
                value = ''.join(tr.xpath(".//td").xpath("string()").extract()).strip()
                if title in ['处罚类别', '处罚类型']:
                    # Number the category rows so a second one is kept.
                    if '处罚类别1' in data_dict:
                        title = '处罚类别2'
                    else:
                        title = '处罚类别1'
                data_dict[title] = value

            result = {}
            result.update(map_field(data_dict))
            # Iterate over a snapshot of the keys while values are replaced.
            for key in list(result):
                if 'date' in key:
                    result[key] = self._normalize_date_text(result[key])
            if 'punish_basis' in result and 'punish_code' not in result:
                result['punish_code'] = result.pop('license_code', '')
            item = self.result_item_assembler(response)
            item['_id'] = calc_str_md5(response.url)
            item['bbd_html'] = ''
            item['_parsed_data'] = result
            yield item
            self.logger1.info('{} save successfully'.format(response.url))
        except Exception as e:
            # Best-effort parsing: log the failure and yield nothing.
            self.logger1.warning("Exception on save detail page {} {} {}".format(
                response.url, traceback.format_exc(), e))

    @staticmethod
    def _normalize_date_text(value):
        """
        Normalise a scraped date string to ``Y-M-D HH:MM:SS`` text.

        :param value: raw date text
        :return: ``''`` for empty or whitespace-only input; a compact
            all-digit value split as 4/2/rest digits with ``00:00:00``
            appended; a value with a time part keeps that time; any other
            value has its digit groups joined with ``-`` and ``00:00:00``
            appended
        """
        text = value.strip()
        if not text:
            return ''
        if text.isdigit():
            # Slice the stripped text; slicing the raw value shifted the
            # digits whenever it carried leading whitespace.
            return '{}-{}-{} 00:00:00'.format(text[:4], text[4:6], text[6:])
        if ':' in text:
            parts = text.split(None, 1)
            if len(parts) < 2:
                # A time marker without a separate date part cannot be
                # split reliably; keep the text instead of raising.
                return text
            return '-'.join(re.findall(r'(\d+)', parts[0])) + ' {}'.format(parts[1])
        return '-'.join(re.findall(r'(\d+)', text)) + ' 00:00:00'
Example #13
0
    def handle_result(self, response, result_dict):
        """
        Build a ``ParsedItem`` from an extracted dict.

        The dict is passed through ``map_field``; when the spider name
        contains ``xzcf`` a ``license_status`` entry is moved to
        ``punish_status``. ``bbd_type`` is the last ``__``-separated part
        of the spider name with its final seven characters removed.

        :param response: response the data was extracted from
        :param result_dict: extracted fields
        :return: the populated item, including a ``rowkey`` generated from
            ``do_time`` and ``bbd_type``
        """
        mapped = map_field(result_dict)
        if 'xzcf' in self.name:
            if 'license_status' in mapped:
                status = mapped.pop('license_status', '')
                mapped['punish_status'] = status

        parsed_item = ParsedItem()
        self.common_item_assembler(response, parsed_item)
        parsed_item['_parsed_data'] = mapped
        parsed_item['_id'] = calc_str_md5(response.url)
        for blank_field in ('bbd_html', 'bbd_params'):
            parsed_item[blank_field] = ''
        name_tail = self.name.split('__')[-1]
        parsed_item['bbd_type'] = name_tail[:-7]
        row_key = gen_rowkey(parsed_item, keys=('do_time', 'bbd_type'))
        parsed_item['rowkey'] = row_key
        return parsed_item
Example #14
0
 def parse_detail(self, response):
     """
     Parse the ``xzcf_bg`` detail table and yield one assembled item.

     The second row of the table is removed and handed to
     ``self.deal_special``; for the remaining rows the second cells supply
     the keys and the third cells the values. The ``listf2`` cell text is
     stored as ``case_name`` and the ``xzcf_jds`` cell text under
     ``punish_content`` or ``license_content``, depending on the text of
     the 270-wide cell. The dict is then passed through ``map_field``.

     :param response: response exposing ``url`` and ``xpath``
     :return: generator yielding the assembled item; nothing is yielded
         when a step raises (the error is logged as a warning)
     """
     try:
         start_msg = 'start to parse detail content {}'.format(response.url)
         self.logger1.info(start_msg)

         name_parts = response.xpath('.//td[@class="listf2"]//text()').extract()
         cmp_name = ''.join(name_parts).strip()
         punish_rst = ''.join(
             response.xpath('//td[@class="xzcf_jds"]//text()').extract())
         cont_type = ''.join(
             response.xpath('//td[@width="270"]//text()').extract())

         rows = response.xpath('//table[@class="xzcf_bg"]//tr')
         # The second row has its own layout and is parsed separately.
         special_row = rows.pop(1)

         labels = []
         for cell in rows.xpath('.//td[2]'):
             labels.append(''.join(cell.xpath('string(.)').extract()).strip(u':'))
         contents = []
         for cell in rows.xpath('.//td[3]'):
             contents.append(''.join(cell.xpath('string(.)').extract()).strip())

         special_fields = self.deal_special(special_row)
         detail = dict(zip(labels, contents))
         detail.update(special_fields)

         content_key = 'punish_content' if u'行政处罚' in cont_type else 'license_content'
         detail['case_name'] = cmp_name
         detail[content_key] = punish_rst

         parsed = map_field(detail)
         record = self.result_item_assembler(response)
         record['_id'] = calc_str_md5(response.url)
         record['bbd_html'] = ''
         record['_parsed_data'] = parsed
         yield record
         self.logger1.info('{} save successfully'.format(response.url))
     except Exception as e:
         failure = "Exception on save detail page {} {} {}".format(
             response.url, traceback.format_exc(), e)
         self.logger1.warning(failure)