def list_parser(self, processor_type, current_processor, response, passvalue):
    """Parse a list page, map its rows to storage columns and route follow-ups.

    Args:
        processor_type: Key of the processor being run; used for logging and
            handed to ``_route_next_page``.
        current_processor: Configuration mapping of this processor. Keys read
            here: ``table_selector``, ``tr_select``, ``ignore_detail_urls``,
            ``next_response`` and ``detail_point``.
        response: Downloaded page. A falsy value aborts the parse.
        passvalue: Values carried over from the previous step.

    Returns:
        The first element returned by ``_get_column_value_mapping``, or
        ``None`` when ``response`` is falsy.
    """
    if not response:
        logger.error('processor_type:{}, passvalue:{}'.format(processor_type, passvalue))
        return

    # Locate the data table, falling back to the default selector.
    selector = current_processor.get('table_selector') or 'table#list'
    records = self._get_data_table(response, selector, current_processor)
    list_values = self._get_list_values(records, current_processor)

    # When ``tr_select`` is configured, a single row whose length differs
    # from it discards the whole result set.
    if current_processor.get('tr_select'):
        mismatches = [
            len(row) != current_processor.get('tr_select') for row in list_values
        ]
        if any(mismatches):
            list_values = []

    # Column/value pairs to store, plus the per-row parameters for the next hop.
    list_infos, next_param_list = self._get_column_value_mapping(
        current_processor, list_values, passvalue)

    # Detail drill-down URLs (a single empty URL when they are ignored).
    if current_processor.get('ignore_detail_urls'):
        detail_urls = ['']
    else:
        detail_urls = self._get_detail_urls(records)

    # Either forward the current response to the next level, or follow the
    # detail URLs without it.
    if current_processor.get('next_response'):
        self._route_many_points_many_urls(
            current_processor.get('detail_point'), [''], passvalue,
            next_param_list, response)
    else:
        self._route_many_points_many_urls(
            current_processor.get('detail_point'), detail_urls, passvalue,
            next_param_list)

    # Queue the next page of the list.
    self._route_next_page(response, processor_type, passvalue)
    return list_infos
def get_query_reason(self):
    """Return the two-digit query-reason code for the current organisation.

    The organisation code is read from the ``orgcode`` environment variable
    and looked up in ``RISK_MNIT_QUERY_RH_LIST``; the stored ``QUERY_REASON``
    text is then translated to the code used by the query form.

    Returns:
        ``'01'``..``'04'`` for a recognised reason, or ``''`` when no row is
        found, the reason is not recognised, or the lookup fails.
    """
    reason_codes = {
        '贷前调查': '01',
        '贷中操作': '02',
        '贷后管理': '03',
        '关联查询': '04',
    }
    try:
        orgcode = os.getenv('orgcode')
        # NOTE(review): the statement is assembled with str.format; confirm
        # that ``orgcode`` can never carry untrusted text.
        sql = "select QUERY_REASON from {0}.RISK_MNIT_QUERY_RH_LIST where MID_SIGN_CODE='{1}'".format(
            base_config.tabschema, orgcode)
        result = ibm_db.exec_immediate(self.conn, sql)
        orgdetail = ibm_db.fetch_assoc(result)
        # A bool result (rather than a row mapping) means no row was fetched.
        if isinstance(orgdetail, bool):
            return ''
        return reason_codes.get(orgdetail.get('QUERY_REASON'), '')
    except Exception as e:
        # Best effort: a failed lookup must not stop the crawl. The exception
        # needs a %s placeholder, otherwise logging cannot format the record.
        logger.error("查询原因获取失败error: %s", e)
        return ''
def get_processer(self, processor_type, *args):
    """Run the handler configured for ``processor_type`` and queue follow-ups.

    The processor configuration names a ``function``; it is looked up first
    on ``self.download`` and then on ``self.output``. The first match is
    called as ``handler(processor_type, *args)`` and its result is put on
    ``self.queue`` once for every configured ``next_processor``.

    Args:
        processor_type: Key into ``processor_config.processor``.
        *args: Extra positional arguments forwarded to the handler.

    Returns:
        None. Unknown processors or handlers are logged and ignored.
    """
    current_processor = processor_config.processor.get(processor_type)
    if not current_processor:
        # ``args`` is formatted as one tuple: unpacking it into str.format
        # raised IndexError for an empty tuple and hid all but the first item.
        logger.error(
            "i can't found the processor, processor_type:{}, args:{}".format(
                processor_type, args))
        return

    function_name = current_processor.get('function')
    handler = None
    # A missing or non-string name cannot be resolved with getattr.
    if isinstance(function_name, str):
        for obj in (self.download, self.output):
            handler = getattr(obj, function_name, None)
            if handler is not None:
                break

    if handler is None:
        logger.error(
            "i can't found the function, processor_type:{}, args:{}".format(
                processor_type, args))
        return

    # Run the handler outside of any attribute-lookup guard so that errors
    # raised inside it are not mistaken for "handler not found".
    result = handler(processor_type, *args)

    # ``next_processor`` may be a single name or a list of names.
    next_processor = current_processor.get('next_processor')
    if not next_processor:
        return
    if isinstance(next_processor, str):
        self.queue.put([next_processor, result])
    elif isinstance(next_processor, list):
        for name in next_processor:
            if name:
                self.queue.put([name, result])
def _route_one_point_many_urls(self, point, urls, passvalue, other_passvalue_list, rsp):
    """Route every URL in ``urls`` to a single processing point.

    For the URL at position ``i`` a fresh copy of ``passvalue`` is made and,
    when ``other_passvalue_list[i]`` is truthy, its entries are merged into
    that copy before ``_route_one_point_one_url`` is called.

    Args:
        point: Target point handed to ``_route_one_point_one_url``.
        urls: Iterable of URLs to route.
        passvalue: Base mapping of carried-over values; never mutated.
        other_passvalue_list: Optional per-URL mappings, indexed like ``urls``.
        rsp: Response forwarded unchanged to ``_route_one_point_one_url``.

    Returns:
        None. Any failure (for example ``other_passvalue_list`` being shorter
        than ``urls``) stops the loop and is logged, not raised.
    """
    try:
        for index, url in enumerate(urls):
            new_passvalue = dict(passvalue)
            if other_passvalue_list and other_passvalue_list[index]:
                new_passvalue.update(other_passvalue_list[index])
            self._route_one_point_one_url(point, url, new_passvalue, rsp)
    except Exception as e:
        # Deliberately best effort. The placeholder is required for logging to
        # format the record, and the traceback shows failures that are not
        # index errors.
        logger.exception("超出数组范围! %s", e)
def write_log_with_zhengxin(self):
    """Record one login to the credit-reference query page in the log table.

    Reads the organisation code from the ``orgcode`` environment variable,
    checks ``RH_CUST_QUEUE`` to see whether that customer was crawled before,
    loads the requesting user's details from ``RH.RISK_MNIT_QUERY_RH_LIST``
    and inserts one row into ``RH.RH_LOGIN_ZHENGXIN_WEB_LOG``. The login
    password is stored as an MD5 hex digest.

    Returns:
        None. Failures are logged and swallowed.
    """
    try:
        orgcode = os.getenv('orgcode')

        # Has this customer already been crawled? '0' = a queue row exists,
        # '1' = none found.
        # NOTE(review): all SQL here is assembled with str.format; confirm the
        # interpolated values can never contain quotes or untrusted text.
        query = "select distinct ORG_CODE,MIDSIGNCODE,CUS_NO from {0}.RH_CUST_QUEUE where MIDSIGNCODE='{1}';".format(
            base_config.tabschema, orgcode)
        org_info = ibm_db.exec_immediate(self.conn, query)
        status = '0' if ibm_db.fetch_assoc(org_info) else '1'

        # Details of the user that requested the query:
        #   ENT_NAME          enterprise name
        #   USER_NAME         user name
        #   USER_CODE         user code
        #   USER_ORG_NAME     name of the organisation the user belongs to
        #   USER_BRANCH_NAME  name of the branch the user belongs to
        sql = "select ENT_NAME,USER_NAME,USER_CODE,USER_ORG_NAME,USER_BRANCH_NAME,QUERY_REASON from RH.RISK_MNIT_QUERY_RH_LIST where MID_SIGN_CODE='{0}'".format(
            orgcode)
        result = ibm_db.exec_immediate(self.conn, sql)
        orgdetail = ibm_db.fetch_assoc(result)

        # A bool result (rather than a row mapping) means no row was fetched.
        if isinstance(orgdetail, bool):
            logger.warning("企业信息为空")
            return

        current_data = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
        password_md5 = hashlib.md5(
            base_config.login_post_data.get('password').encode('utf8')).hexdigest()
        # The value order below must match the column list of the insert.
        insertsql = "insert into RH.RH_LOGIN_ZHENGXIN_WEB_LOG" \
                    " (USER_CODE,CUST_NAME,ORGCODE,USERID,PASSWORD,STATUS,UPLOADTIME,CLIENT_TYPE,ORG_NAME,SEARCH_TYPE,USER_NAME,USER_BRANCH_NAME,CUST_CODE) " \
                    "values ('{0}','{1}','{2}','{3}','{4}','{5}','{6}','{7}','{8}','{9}','{10}','{11}','{12}')".format(
                        orgdetail.get('USER_CODE', ''),
                        orgdetail.get('ENT_NAME', ''),
                        base_config.login_post_data.get('orgCode'),
                        base_config.login_post_data.get('userid'),
                        password_md5,
                        os.getenv('credit_status'),
                        current_data,
                        status,
                        orgdetail.get('USER_ORG_NAME', ''),
                        orgdetail.get('QUERY_REASON', ''),
                        orgdetail.get('USER_NAME', ''),
                        orgdetail.get('USER_BRANCH_NAME', ''),
                        orgcode)
        ibm_db.exec_immediate(self.conn, insertsql)
        ibm_db.commit(self.conn)
    except Exception as e:
        # Best effort: logging the login must never break the crawl. The
        # exception needs a %s placeholder so logging can format the record.
        logger.error("添加登陆记录时出错error: %s", e)
def save_dict_into_mysql(self, processor_type, data):
    """Insert one record, given as a dict, into a MySQL table.

    Args:
        processor_type: Name of the running processor; used for logging only.
        data: Mapping of column name to value plus a ``'table'`` entry naming
            the target table. Non-dict input is ignored. The mapping is not
            modified, so the same dict can be saved more than once.

    Returns:
        None. A ``pymysql.Error`` raised by the insert is logged, not raised.
    """
    if not isinstance(data, dict):
        return

    # Work on a copy: popping 'table' from the caller's dict made a second
    # save of the same dict fail with KeyError.
    row = dict(data)
    table = row.pop('table', None)
    if not table:
        logger.error('processor_type:[{}], no table given for record: {}'.format(
            processor_type, data))
        return

    values = tuple(row.values())
    # Values go through driver placeholders; table and column names are
    # interpolated and are presumably taken from trusted configuration.
    sql = "INSERT INTO {}({}) VALUES({})".format(
        table, ','.join(row.keys()), ','.join(['%s'] * len(row)))

    # debug
    logger.info('processor_type:[{}], sql:[{} {}]'.format(
        processor_type, sql, values))
    try:
        self.cursor.execute(sql, values)
    except pymysql.Error as e:
        logger.error('Oops, i got the fault: {}, {} {}'.format(e, sql, values))
    # Commit is kept unconditional, as before, so the transaction is always
    # closed even when the insert failed.
    self.conn.commit()
def double_parser(self, processor_type, current_processor, response, passvalue):
    """Parse a table whose content has to be split by ``_double_parser``.

    Args:
        processor_type: Key of the processor being run; used for logging.
        current_processor: Configuration mapping of this processor; the
            optional ``table_selector`` key overrides the default selector.
        response: Downloaded page. A falsy value aborts the parse.
        passvalue: Values carried over from the previous step.

    Returns:
        The first element returned by ``_get_column_value_mapping``, or
        ``None`` when ``response`` is falsy.
    """
    if not response:
        logger.error('processor_type:{}, passvalue:{}'.format(processor_type, passvalue))
        return

    # Locate the data table, falling back to the default selector.
    selector = current_processor.get('table_selector') or 'table#list'
    records = self._get_data_table(response, selector, current_processor)

    # Split the table content, then map it to storage columns. The second
    # element of the mapping result is not needed here.
    split_values = self._double_parser(records, current_processor)
    mapping = self._get_column_value_mapping(current_processor, split_values, passvalue)
    list_infos, _ = mapping
    return list_infos
def _get_data_table(response, table_selector, current_processor):
    """Return the elements of the page that match ``table_selector``.

    Args:
        response: Either an object exposing ``.content`` or the markup itself.
            A falsy value yields ``None``.
        table_selector: CSS selector passed to ``soup.select``.
        current_processor: Configuration mapping; a truthy ``table_number``
            keeps only the match at that index.

    Returns:
        The list of matches; a one-element list when ``table_number`` is
        set; ``None`` when ``response`` is falsy or the index is out of range.
    """
    if not response:
        return

    # Prefer the raw body of a response object; fall back to treating the
    # argument itself as markup.
    try:
        document = BeautifulSoup(response.content, 'html.parser', from_encoding=page_encoding)
    except AttributeError:
        document = BeautifulSoup(response, 'html.parser', from_encoding=page_encoding)

    matches = document.select(table_selector)
    wanted = current_processor.get('table_number')
    # NOTE(review): a ``table_number`` of 0 is falsy and therefore keeps all
    # matches - confirm whether selecting index 0 was ever intended.
    if not wanted:
        return matches

    try:
        return [(matches[wanted])]
    except IndexError as err:
        logger.error('{}'.format(err.args))
        return
def update_spider_done(self, orgcode):
    """Record in the database that the crawl of one enterprise has finished.

    Steps:
      1. Set ``STATUS='2'`` on the enterprise's row in ``RH_SPIDER_QUEUE_LIST``
         for the quarter given by the ``last_quarter_end`` environment variable.
      2. Query ``RH_ERROR_SPIDER`` for an error recorded today.
      3. Insert one ``RH_CUST_QUEUE`` row for every customer found in
         ``DESK_SXGL0431_D`` for this organisation code.

    Args:
        orgcode: Organisation code of the enterprise that was crawled.

    Returns:
        None. Failures are logged and swallowed.
    """
    try:
        search_date = os.getenv('last_quarter_end')
        schema = base_config.tabschema.upper()
        # NOTE(review): all SQL here is assembled with str.format; confirm the
        # interpolated values can never contain quotes or untrusted text.

        # 1. Mark the enterprise as finished in the crawl queue.
        update_spider_queue_table_sql = "update {0}.RH_SPIDER_QUEUE_LIST set STATUS='2' where ORGCODE='{1}' and SEARCHDATE='{2}';".format(
            schema, orgcode, search_date)
        ibm_db.exec_immediate(self.conn, update_spider_queue_table_sql)
        ibm_db.commit(self.conn)

        # 2. Look for an error recorded today for this enterprise.
        # NOTE(review): the fetched row is not used - a customer record is
        # added whether or not the crawl hit an error. The query is kept so
        # that a failure in it still aborts the update as before.
        error_spider_record_sql = "select ORGCODE from {0}.RH_ERROR_SPIDER where SEARCHDATE='{1}' and ORGCODE='{2}' and date(UPLOADTIME)='{3}';".format(
            schema, search_date, orgcode, time.strftime('%Y-%m-%d'))
        error_spider_record = ibm_db.exec_immediate(
            self.conn, error_spider_record_sql)
        ibm_db.fetch_assoc(error_spider_record)

        # 3. Add every matching customer to RH_CUST_QUEUE.
        sql = "select distinct CUSTNAME,CUSTID from {0}.DESK_SXGL0431_D where LNCARDNO='{1}';".format(
            base_config.hongduntabschema.upper(), orgcode)
        result_query = ibm_db.exec_immediate(self.conn, sql)
        row = ibm_db.fetch_assoc(result_query)
        while row:
            insert_sql = "insert into {0}.RH_CUST_QUEUE(ENT_NAME,CUS_NO,ORG_CODE,MIDSIGNCODE,SEARCHDATE,FINISH_DATE) values ('{1}','{2}','{3}','{4}','{5}','{6}')".format(
                base_config.tabschema, row.get('CUSTNAME'), row.get('CUSTID'),
                orgcode, os.getenv('midsigncode'), search_date,
                time.strftime("%Y-%m-%d %H:%M:%S"))
            ibm_db.exec_immediate(self.conn, insert_sql)
            ibm_db.commit(self.conn)
            row = ibm_db.fetch_assoc(result_query)
    except Exception as e:
        # Best effort: bookkeeping must not break the crawl. The exception
        # needs a %s placeholder so logging can format the record.
        logger.error("爬取完成的企业进行入库记录error: %s", e)
def detail_parser(self, processor_type, current_processor, response, passvalue):
    """Parse a detail page and forward the same response to the detail points.

    Args:
        processor_type: Key of the processor being run; used for logging.
        current_processor: Configuration mapping of this processor. Keys read
            here: ``table_selector`` and ``detail_point``.
        response: Downloaded page. A falsy value aborts the parse.
        passvalue: Values carried over from the previous step.

    Returns:
        The first element returned by ``_get_column_value_mapping``, or
        ``None`` when ``response`` is falsy.
    """
    if not response:
        logger.error('processor_type:{}, passvalue:{}'.format(processor_type, passvalue))
        return

    # Locate the data cells, falling back to the default selector.
    selector = current_processor.get('table_selector')
    if not selector:
        selector = 'table#list > tbody > tr > td'
    records = self._get_data_table(response, selector, current_processor)

    # A detail page yields one record, so it is wrapped in a list before
    # being mapped to storage columns.
    single_record = [self._get_detail_values(records)]
    list_infos, next_param_list = self._get_column_value_mapping(
        current_processor, single_record, passvalue)

    # Drill down: hand the current response on to the configured points.
    target_points = current_processor.get('detail_point')
    self._route_many_points_many_urls(
        target_points, [''], passvalue, next_param_list, response)
    return list_infos
def re_add_post__parser(self, processor_type, current_processor, response, passvalue):
    """Extract POST form fields from the page text with regular expressions.

    For every loan id found in the page (text of the form ``('<digits>')``),
    one parameter dict is built and routed to the configured detail points.

    Args:
        processor_type: Key of the processor being run; used for logging.
        current_processor: Configuration mapping of this processor. Keys read
            here: ``re_type`` (selects how the contract code is located),
            ``label_type``, ``loandb_type`` and ``detail_point``.
        response: Downloaded page exposing ``.text``. A falsy value aborts.
        passvalue: Values carried over from the previous step.

    Returns:
        ``''`` after routing, or ``None`` when ``response`` is falsy.
    """
    if not response:
        logger.error('processor_type:{}, passvalue:{}'.format(processor_type, passvalue))
        return

    import re

    content = response.text

    # Loan ids: capture the complete digit run. Indexing a single character
    # of the match kept only the first digit of a multi-digit id.
    loanids = re.findall(r"\('(\d+)'\)", content)

    found = re.search(r'financecode=(\d+)', content)
    financecode = found.group(1) if found else ''

    # The loan card code is the text between the first two '+' of the match.
    found = re.search(r'\+.*\d+.?\+', content)
    loancardcode = found.group().split('+')[1].strip() if found else ''

    if current_processor.get("re_type"):
        found = re.search(r'contractcode=([0-9A-Z]+)', content)
        contractcode = found.group(1) if found else ''
    else:
        # The contract code is the first quoted string of a ``constr = ...;``
        # statement; it is sent GBK-encoded (bytes).
        found = re.search(r'constr .*= .*;', content)
        quoted = found.group().split("'") if found else []
        contractcode = quoted[1].encode('gbk') if len(quoted) > 1 else ''

    # loandb=1 guarantee given, loandb=2 guarantee received;
    # dzy=1 guarantee, 2 mortgage, 3 pledge.
    dzy = current_processor.get('label_type')
    loandb = current_processor.get('loandb_type')

    # Drill down once per loan id.
    for loanid in loanids:
        next_param_list = [{
            'loanid': loanid,
            "financecode": financecode,
            "loancardcode": loancardcode,
            "contractcode": contractcode,
            'dzy': dzy,
            'loandb': loandb,
        }]
        self._route_many_points_many_urls(
            current_processor.get('detail_point'), [''], passvalue, next_param_list)
    return ''
redis_conn = RedisOp(**base_config.redis_config) #从redis的队列中获取数据,首先获取优先级高的,其次是低优先级 while 1: try: task_orgcode = redis_conn.run_redis_fun('brpop', [ 'high_level_spider_orgcode_queue', 'low_level_spider_orgcode_queue' ], 0) except Exception as e: print(e) else: if task_orgcode: orgcode = task_orgcode[1].decode() if orgcode is None: print("orgCode:", orgcode) logger.error("待查询中征码为空,跳过本次查询!", orgcode) else: os.environ['orgcode'] = orgcode # os.environ['last_quarter_start'] = '2016-07-01' # os.environ['last_quarter_end']= '2016-09-30' rule_orgcode_filter(orgcode) # for i in ['71657512-5','17768539-1']: # import time # import os # os.environ['orgcode'] = '71657512-5' # starttime = time.time() # start_spider('55805200-2') # print('runtime=',time.time()-starttime) # time.sleep(3) ''' 江西赣锋锂业股份有限公司 71657512-5 3605030000000514 2016-12-01
# print("orgcode>>>>>>>>>>>>>>>",orgcode) if orgcode: #判断这个企业是否正在爬取 spider_doing = redis_conn.run_redis_fun( 'sismember', 'rh_spider_done_orgcode', orgcode) if spider_doing: print("正在爬取。。。") redis_conn.run_redis_fun( 'lpush', 'request_url_failure_' + time.strftime( '%Y%m%d', time.localtime(time.time())), json.dumps(task)) redis_conn.run_redis_fun( 'expire', 'request_url_failure_' + time.strftime( '%Y%m%d', time.localtime(time.time())), 604800) else: processor_type, *args = task.get( 'processor_type'), task.get('url'), task.get( 'payload') q = Queue() spider = SpiderByQueue(q) spider.get_processer(processor_type, *args) spider.run() except Exception as e: logger.error("重跑出错error", e) ''' 江西赣锋锂业股份有限公司 71657512-5 3605030000000514 2016-12-01 中交第二航务工程局有限公司 17768539-1 4201010000043669 2016-12-01 贵州盛鑫矿业集团投资有限公司 56500196-7 5202010000176539 2016-12-01 临海市春风灯饰有限公司 70473721-1 3309060000195576 2016-12-01 '''