def handle_result(self, response, result_dict):
    """Build the ``ParsedItem`` for one result, tagged ``credit_bj``.

    :param response: response the result came from; its ``url`` is hashed
        with ``calc_str_md5`` to form the item ``_id``.
    :param result_dict: raw field mapping, passed through ``map_field``
        before being stored under ``_parsed_data``.
    :return: the populated ``ParsedItem``.
    """
    item = ParsedItem()
    self.common_item_assembler(response, item)

    parsed_fields = map_field(result_dict)
    url_digest = calc_str_md5(response.url)
    for field, value in (
        ('_parsed_data', parsed_fields),
        ('_id', url_digest),
        ('bbd_html', ''),
        ('bbd_type', "credit_bj"),
    ):
        item[field] = value

    # The rowkey is derived from the item itself ('do_time', 'bbd_type'),
    # so it is generated only once those fields are in place.
    item['rowkey'] = gen_rowkey(item, keys=('do_time', 'bbd_type'))
    item['bbd_params'] = ''
    return item
def handle_result(self, response, result_dict):
    """Build the ``ParsedItem`` for one result, typed from the spider name.

    :param response: response the result came from; its ``url`` is hashed
        with ``calc_str_md5`` to form the item ``_id``.
    :param result_dict: raw field mapping, passed through ``map_field``.
    :return: the populated ``ParsedItem``.
    """
    mapped = map_field(result_dict)
    if 'xzcf' in self.name:
        # For spiders whose name contains 'xzcf' the mapped
        # 'license_status' value is stored as 'punish_status' instead.
        if 'license_status' in mapped:
            mapped['punish_status'] = mapped.pop('license_status', '')

    item = ParsedItem()
    self.common_item_assembler(response, item)

    # bbd_type is the last '__'-separated part of the spider name minus its
    # final 7 characters.
    # NOTE(review): presumably this strips a fixed 7-character suffix - confirm.
    last_segment = self.name.split('__')[-1]
    for field, value in (
        ('_parsed_data', mapped),
        ('_id', calc_str_md5(response.url)),
        ('bbd_html', ''),
        ('bbd_params', ''),
        ('bbd_type', last_segment[:-7]),
    ):
        item[field] = value

    # Generated last because it reads 'do_time' and 'bbd_type' from the item.
    item['rowkey'] = gen_rowkey(item, keys=('do_time', 'bbd_type'))
    return item
def parse(self, source, *args, **kwargs):
    """Parse a JSON licence record stored in ``source['bbd_html']``.

    :param source: mapping describing one crawled record. ``bbd_html`` is
        popped from it (the caller's mapping is mutated) and decoded as
        JSON; ``bbd_url`` is read for logging and for choosing which
        identifier field to keep.
    :param args: unused.
    :param kwargs: unused.
    :return: the assembled result dict, or ``None`` when parsing fails
        (the failure is logged with its traceback).
    """
    # Initialised before the try block so the error handler can always
    # report a URL, even when the failure happens before it is read or the
    # record carries no 'bbd_url' at all.
    detail_url = ''
    try:
        detail_html = source.pop('bbd_html', '')
        detail_url = source.get('bbd_url', '')
        self.logger.info('开始解析:{} {}'.format(self.parser_info,
                                             detail_url))
        json_data = json.loads(detail_html)
        get_func = self.get_value(json_data)
        res_dict = {
            "company_name": get_func("xkXdr"),
            "credit_code": get_func("xkXdrShxym"),
            "case_name": get_func("xkXmmc"),
            "license_code": get_func("xkWsh"),
            "approval_category": get_func("xkSplb"),
            "license_content": get_func("xkNr"),
            "license_org": get_func("xkXzjg"),
            "license_start_date": get_func("xkSxq"),
            "license_end_date": get_func("xkJzq"),
            "license_status": get_func("xkZt"),
            "administrative_code": get_func("dfbm"),
            "data_source": get_func("depName"),
            "pubdate": get_func("publishDate"),
            "id_number": get_func("xkXdrSfz"),
        }
        # Only one identifier is kept, chosen by the URL.
        # NOTE(review): "PDetial" is matched exactly as spelled; presumably
        # it marks person-detail pages - confirm against the site URLs.
        if "PDetial" in detail_url:
            res_dict.pop("credit_code", "")
        else:
            res_dict.pop("id_number", "")
        # Values from the source record and then self.base_dict take
        # precedence over the parsed fields.
        res_dict.update(source)
        res_dict.update(self.base_dict)
        res_dict["_id"] = "{}".format(uuid.uuid4())
        res_dict["rowkey"] = gen_rowkey(res_dict)
        res_dict["bbd_html"] = ""
        self.logger.info("save {} to mongo".format(
            res_dict["company_name"]))
        return res_dict
    except Exception:
        # Use the URL captured above instead of source["bbd_url"]: indexing
        # here would raise KeyError for records without a URL and hide the
        # original parse error.
        msg = "{} parse error url {}! msg:{}".format(
            self.parser_info, detail_url, traceback.format_exc())
        self.logger.error(msg)
        return None
def parse(self, source, *args, **kwargs):
    """Parse a JSON punishment record stored in ``source['bbd_html']``.

    The decoded JSON must contain ``data`` and ``data.dataContentJson``;
    most fields are read from the latter, ``regno`` and the fallback
    credit code from the former.

    :param source: mapping describing one crawled record. ``bbd_html`` is
        popped from it (the caller's mapping is mutated) and decoded as
        JSON; ``bbd_url`` is read for logging.
    :param args: unused.
    :param kwargs: unused.
    :return: the assembled result dict, or ``None`` when parsing fails
        (the failure is logged with its traceback).
    """
    # Initialised before the try block so the error handler can always
    # report a URL, even for records that carry no 'bbd_url'.
    detail_url = ''
    try:
        detail_html = source.pop('bbd_html', '')
        detail_url = source.get('bbd_url', '')
        self.logger.info('开始解析:{} {}'.format(self.parser_info,
                                             detail_url))
        json_data = json.loads(detail_html)
        data_content = json_data["data"]["dataContentJson"]
        data_func = self.get_value(json_data["data"])
        detail_func = self.get_value(data_content)
        # "空" is the placeholder the payload uses for a missing id_number;
        # in that case fall back to the outer "uscCode".
        id_number = detail_func("id_number")
        credit_code = id_number if "空" != id_number else data_func("uscCode")
        res_dict = {
            "company_name": detail_func("org_name"),
            "credit_code": credit_code,
            "regno": data_func("regCode"),
            "case_name": detail_func("punish_name"),
            "punish_code": detail_func("decide_docno"),
            "punish_category": detail_func("punish_type1"),
            "punish_type": detail_func("reason"),
            "punish_content": detail_func("punish_ret"),
            "punish_basis": detail_func("gist"),
            "punish_org": detail_func("organization"),
            "punish_date": detail_func("dt_penalty"),
            "punish_status": detail_func("cur_status"),
        }
        # Values from the source record and then self.base_dict take
        # precedence over the parsed fields.
        res_dict.update(source)
        res_dict.update(self.base_dict)
        res_dict["_id"] = "{}".format(uuid.uuid4())
        res_dict["rowkey"] = gen_rowkey(res_dict)
        res_dict["bbd_html"] = ""
        self.logger.info("save {} to mongo".format(
            res_dict["company_name"]))
        return res_dict
    except Exception:
        # Use the URL captured above instead of source["bbd_url"]: indexing
        # here would raise KeyError for records without a URL and hide the
        # original parse error.
        msg = "{} parse error url {}! msg:{}".format(
            self.parser_info, detail_url, traceback.format_exc())
        self.logger.error(msg)
        return None
def parse(self, source, *args, **kwargs):
    """Parse a JSON punishment record stored in ``source['bbd_html']``.

    :param source: mapping describing one crawled record. ``bbd_html`` is
        popped from it (the caller's mapping is mutated) and decoded as
        JSON; ``bbd_url`` is read for logging.
    :param args: unused.
    :param kwargs: unused.
    :return: the assembled result dict, or ``None`` when parsing fails
        (the failure is logged with its traceback).
    """
    # Initialised before the try block so the error handler can always
    # report a URL, even for records that carry no 'bbd_url'.
    detail_url = ''
    try:
        detail_html = source.pop('bbd_html', '')
        detail_url = source.get('bbd_url', '')
        self.logger.info('开始解析:{} {}'.format(self.parser_info,
                                             detail_url))
        json_data = json.loads(detail_html)
        get_func = self.get_value(json_data)
        res_dict = {
            "punish_code": get_func("cfWsh"),
            "case_name": get_func("cfAjmc"),
            "punish_category_one": get_func("cfCflb"),
            "punish_type": get_func("cfSy"),
            "punish_basis": get_func("cfYj"),
            "company_name": get_func("cfXdrMc"),
            "credit_code": get_func("cfXdrShxym"),
            "punish_content": get_func("cfJg"),
            "punish_date": get_func("cfSxq"),
            "punish_org": get_func("cfXzjg"),
            "punish_status": get_func("cfZt"),
            "administrative_code": get_func("dfbm"),
            "data_source": get_func("depName"),
            "pubdate": get_func("publishDate"),
        }
        # Values from the source record and then self.base_dict take
        # precedence over the parsed fields.
        res_dict.update(source)
        res_dict.update(self.base_dict)
        res_dict["_id"] = "{}".format(uuid.uuid4())
        res_dict["rowkey"] = gen_rowkey(res_dict)
        res_dict["bbd_html"] = ""
        self.logger.info("save {} to mongo".format(
            res_dict["company_name"]))
        return res_dict
    except Exception:
        # Use the URL captured above instead of source["bbd_url"]: indexing
        # here would raise KeyError for records without a URL and hide the
        # original parse error.
        msg = "{} parse error url {}! msg:{}".format(
            self.parser_info, detail_url, traceback.format_exc())
        self.logger.error(msg)
        return None
def parse_detail(self, response):
    """Parse a detail page and yield one ``ParsedItem`` built from its table.

    The first ``td`` of every table row is used as the field title and the
    last ``td`` as its value; the resulting mapping goes through
    ``map_field`` before being stored under ``_parsed_data``.

    :param response: detail-page response; ``response.meta['key']`` is
        used as the prefix of the item ``_id``.
    :return: generator yielding at most one ``ParsedItem``. Errors are
        logged via ``self.logger1`` and nothing is yielded.
    """
    try:
        key = response.meta["key"]
        titles_tds = response.xpath("//table//tr//td[1]")
        values_tds = response.xpath("//table//tr//td[last()]")
        # NOTE(review): clean_all_space receives the extracted list here,
        # while the values are joined into a string first - confirm the
        # helper accepts a list.
        titles = [
            clean_all_space(td.xpath("string(.)").extract())
            for td in titles_tds
        ]
        values = [
            "".join(td.xpath("string(.)").extract()).strip()
            for td in values_tds
        ]
        if len(titles) != len(values):
            raise Exception(
                "the length of titles and values are not equal, url {}".
                format(response.url))
        res_dict = map_field(dict(zip(titles, values)))
        # For spiders whose name contains 'xzcf' the mapped
        # 'license_status' value is stored as 'punish_status' instead.
        if "xzcf" in self.name and "license_status" in res_dict:
            res_dict["punish_status"] = res_dict.pop("license_status", "")
        item = ParsedItem()
        self.common_item_assembler(response, item)
        item["_id"] = "{}_{}".format(key, uuid.uuid4())
        item["bbd_html"] = ""
        item["_parsed_data"] = res_dict
        item["rowkey"] = gen_rowkey(item, keys=('do_time', 'bbd_type'))
        yield item
        self.logger1.info("one data {} save to mongodb".format(key))
    except Exception:
        # Deliberately not a bare ``except``: this block wraps a ``yield``,
        # so GeneratorExit (raised when the consumer closes the generator)
        # as well as KeyboardInterrupt/SystemExit must propagate rather
        # than be logged as parse errors.
        err_msg = traceback.format_exc()
        self.logger1.error(
            "Exception on detail {url}, error:{err_msg}".format(
                url=response.url, err_msg=err_msg))
def parse(self, source, *args, **kwargs):
    """Parse an HTML table stored in ``source['bbd_html']`` into a dict.

    Header cells (``th``) of every table row after the first become the
    field titles and the data cells (``td``) the values; the pairs are
    passed through ``map_field``.

    :param source: mapping describing one crawled record. ``bbd_html`` is
        popped from it and cleaned with ``clean_html``; ``bbd_url`` is
        read for logging.
    :param args: unused.
    :param kwargs: unused.
    :return: the assembled result dict, or ``None`` when parsing fails
        (the failure is logged as a warning with its traceback).
    """
    try:
        cleaned_html = clean_html(source.pop('bbd_html', ''))
        page_url = source.get('bbd_url', '')
        self.logger.info('开始解析:{} {}'.format(self.parser_info,
                                             page_url))
        page = Selector(text=cleaned_html)

        # NOTE(review): both alternatives of the pattern below look
        # identical here; presumably one was a full-width colon - confirm.
        titles = []
        header_cells = page.xpath('//table//tr[position()>1]//th')
        for header_text in header_cells.xpath('string(.)').extract():
            without_colon = re.sub(r':|:', r'', header_text.strip())
            titles.append(clean_all_space(without_colon))

        values = []
        data_cells = page.xpath('//table//tr[position()>1]//td')
        for cell_text in data_cells.xpath('string(.)').extract():
            values.append(cell_text.strip())

        res_dict = map_field(dict(zip(titles, values)))
        res_dict["bbd_seed"] = ""
        res_dict["_id"] = str(uuid.uuid4())
        res_dict["bbd_html"] = ""
        # Source values, then self.base_dict, override the fields set above.
        for overrides in (source, self.base_dict):
            res_dict.update(overrides)
        res_dict["rowkey"] = gen_rowkey(res_dict)
        return res_dict
    except Exception:
        self.logger.warning('{} parse error! msg:{}'.format(
            self.parser_info, traceback.format_exc()))
def parse_detail(self, response):
    """Parse a detail page and yield one ``ParsedItem`` tagged ``credit_jx``.

    Each row of the detail table contributes one title/value pair taken
    from its ``th`` and ``td`` cells; the mapping is passed through
    ``map_field`` and ``self.convert_time`` before being stored.

    :param response: detail-page response.
    :return: generator yielding at most one ``ParsedItem``. Errors are
        logged as warnings via ``self.logger1`` and nothing is yielded.
    """
    try:
        self.logger1.info('start to parse {}'.format(response.url))
        rows = response.xpath(
            "//div[@class='fl ml20 mb10 mt10 f_yh']//table//tr")

        data_dict = {}
        for row in rows:
            header_text = ''.join(
                row.xpath(".//th").xpath("string(.)").extract())
            # NOTE(review): the two replace() arguments look identical
            # here; presumably one was a full-width colon - confirm.
            header_text = header_text.replace(":", "").replace(":", "")
            title = clean_all_space(header_text)
            cell_text = ''.join(
                row.xpath(".//td").xpath("string(.)").extract())
            value = clean_all_space(cell_text)
            data_dict[title] = value

        item = ParsedItem()
        self.common_item_assembler(response, item)
        item["_id"] = "{}_{}".format(response.url, uuid.uuid4())
        item['bbd_html'] = ''
        item['bbd_type'] = "credit_jx"
        item['rowkey'] = gen_rowkey(item, keys=('do_time', 'bbd_type'))

        res_dict = self.convert_time(map_field(data_dict))
        # For spiders whose name contains 'xzcf' the mapped
        # 'license_status' value is stored as 'punish_status' instead.
        if "xzcf" in self.name and "license_status" in res_dict.keys():
            res_dict["punish_status"] = res_dict.pop("license_status", "")
        item['_parsed_data'] = res_dict

        yield item
        self.logger1.info('{} save successfully'.format(response.url))
    except Exception as e:
        self.logger1.warning(
            "Exception on save detail page {} {} {}".format(
                response.url, traceback.format_exc(), e))
def parse(self, source, *args, **kwargs):
    """Parse a JSON list of licence records stored in ``source['bbd_html']``.

    One result dict is produced for every entry of the decoded JSON's
    ``results`` list.

    :param source: mapping describing one crawled page. ``bbd_html`` is
        popped from it (the caller's mapping is mutated) and decoded as
        JSON; ``bbd_url`` is read for logging.
    :param args: unused.
    :param kwargs: unused.
    :return: list of result dicts, or ``None`` when parsing fails (the
        failure is logged with its traceback).
    """
    # Initialised before the try block so the error handler can always
    # report a URL, even for records that carry no 'bbd_url'.
    detail_url = ''
    try:
        detail_html = source.pop('bbd_html', '')
        detail_url = source.get('bbd_url', '')
        self.logger.info('开始解析:{} {}'.format(self.parser_info,
                                             detail_url))
        json_data = json.loads(detail_html)
        res_list = []
        for data in json_data["results"]:
            license_start_date = self.date_convert(
                data.get("DETERMINEDATE", None))
            license_end_date = self.date_convert(
                data.get("TERMINALDATE", None))
            # ``data.get(key) or ""`` normalises both missing keys and
            # falsy values (e.g. null in the JSON) to an empty string.
            res_dict = {
                "company_name": data.get("LEGALPERSON") or "",
                "license_org": data.get("ORGNAME") or "",
                "license_code": data.get("NO") or "",
                "case_name": data.get("PROJECTNAME") or "",
                "approval_category": data.get("AUDITTYPE") or "",
                "license_content": data.get("NOTE") or "",
                "credit_code": data.get("CREDITCODE") or "",
                "organization_code": data.get("ORGNO") or "",
                "regno": data.get("ICREGCODE") or "",
                "tax_code": data.get("TAXCODE") or "",
                "id_number": data.get("REPRESENTATIVEID") or "",
                "frname": data.get("REPRESENTATIVE") or "",
                "license_start_date": license_start_date,
                "license_end_date": license_end_date,
                "remark": data.get("REMARK") or "",
            }
            # Values from the source record and then self.base_dict take
            # precedence over the parsed fields.
            res_dict.update(source)
            res_dict.update(self.base_dict)
            res_dict["_id"] = "{}".format(uuid.uuid4())
            res_dict["rowkey"] = gen_rowkey(res_dict)
            res_dict["bbd_html"] = ""
            # Each record gets its own URL built from its "ID", replacing
            # the page URL copied in from ``source``.
            res_dict["bbd_url"] = self.real_url_format.format(data["ID"])
            res_list.append(res_dict)
        return res_list
    except Exception:
        # Use the URL captured above instead of source["bbd_url"]: indexing
        # here would raise KeyError for records without a URL and hide the
        # original parse error.
        msg = "{} parse error url {}! msg:{}".format(
            self.parser_info, detail_url, traceback.format_exc())
        self.logger.error(msg)
        return None
def parse(self, source, *args, **kwargs):
    """Parse a JSON punishment record stored in ``source['bbd_html']``.

    Fields are read from the ``results`` entry of the decoded JSON via
    ``self.get_value``.

    :param source: mapping describing one crawled record. ``bbd_html`` is
        popped from it (the caller's mapping is mutated) and decoded as
        JSON; ``bbd_url`` is read for logging.
    :param args: unused.
    :param kwargs: unused.
    :return: the assembled result dict, or ``None`` when parsing fails
        (the failure is logged with its traceback).
    """
    # Initialised before the try block so the error handler can always
    # report a URL, even for records that carry no 'bbd_url'.
    detail_url = ''
    try:
        detail_html = source.pop('bbd_html', '')
        detail_url = source.get('bbd_url', '')
        self.logger.info('开始解析:{} {}'.format(self.parser_info,
                                             detail_url))
        json_data = json.loads(detail_html)["results"]
        res_dict = {
            "company_name": self.get_value(json_data, "LEGALPERSON"),
            "punish_org": self.get_value(json_data, "ORGNAME"),
            "case_name": self.get_value(json_data, "PUNISHNAME"),
            "punish_code": self.get_value(json_data, "NO"),
            "punish_category_one": self.get_value(json_data, "AUDITTYPE"),
            "punish_type": self.get_value(json_data, "REASON"),
            "punish_basis": self.get_value(json_data, "ACCORDING"),
            "credit_code": self.get_value(json_data, "CREDITCODE"),
            "organization_code": self.get_value(json_data, "ORGNO"),
            "regno": self.get_value(json_data, "ICREGCODE"),
            "tax_code": self.get_value(json_data, "TAXCODE"),
            "id_number": self.get_value(json_data, "REPRESENTATIVEID"),
            "frname": self.get_value(json_data, "REPRESENTATIVE"),
            "punish_content": self.get_value(json_data, "NOTE"),
            "punish_date": self.date_convert(
                self.get_value(json_data, "PUNISHDATE")),
            "remark": self.get_value(json_data, "REMARK"),
        }
        # Values from the source record and then self.base_dict take
        # precedence over the parsed fields.
        res_dict.update(source)
        res_dict.update(self.base_dict)
        res_dict["_id"] = "{}".format(uuid.uuid4())
        res_dict["rowkey"] = gen_rowkey(res_dict)
        res_dict["bbd_html"] = ""
        self.logger.info("save {} to mongo".format(
            res_dict["company_name"]))
        return res_dict
    except Exception:
        # Use the URL captured above instead of source["bbd_url"]: indexing
        # here would raise KeyError for records without a URL and hide the
        # original parse error.
        msg = "{} parse error url {}! msg:{}".format(
            self.parser_info, detail_url, traceback.format_exc())
        self.logger.error(msg)
        return None