def __shuffle(self, data):
    """Stamp a record with its ID, audit fields, status flags and source.

    The record is modified in place and also returned.

    :param data: record dict; ``URL_``, ``ENTITY_NAME_``, ``PROVINCE_NAME_``
        and ``CITY_NAME_`` are read unconditionally.
    :return: the same ``data`` object after enrichment.
    """
    data["ID_"] = req_for_serial_number(code="WD_TY")

    # Creation time and operator.
    now = time.localtime()
    data["CREATE_TIME_"] = time.strftime("%Y-%m-%d %H:%M:%S", now)
    data["CREATE_BY_ID_"] = CREATE_ID
    data["CREATE_BY_NAME_"] = CREATE_NAME

    # Initial workflow flags.
    data.update({
        "M_STATUS_": "N",
        "DELETE_STATUS_": "N",
        "DATA_STATUS_": "UNCHECK",
        "PUBLISH_STATUS_": "N",
        "HOT_": "0",
    })

    # Source site: scheme + host of the record URL, when it can be extracted.
    site_matches = re.findall(r"(https?://.*?)/", data["URL_"])
    if site_matches:
        data["SOURCE_"] = site_matches[0]
    data["SOURCE_NAME_"] = data["ENTITY_NAME_"]

    # When province and city carry the same name and code, rewrite the city
    # code as its first three characters followed by "100".
    same_name = data["PROVINCE_NAME_"] == data["CITY_NAME_"]
    if same_name and data["PROVINCE_CODE_"] == data["CITY_CODE_"]:
        city_code = data["CITY_CODE_"]
        data["CITY_CODE_"] = city_code[:3] + "100"
    return data
def generic_shuffle(self, data, re_data, field="CONTENT_"):
    """Download a record's attachments and record the stored file names.

    For every slot ``N`` in 1..9 that has an ``FJ{N}_NAME_`` key and a
    non-empty ``FJ{N}_URL_``, the attachment is fetched, written under
    ``src_dir`` and its stored name recorded as ``FILE_NAME_{N}_``.

    :param data: raw record dict; it is not modified.
    :param re_data: ignored - the result always starts as a deep copy of
        ``data``. Kept for interface compatibility.
    :param field: unused; kept for interface compatibility.
    :return: deep copy of ``data`` plus any ``FILE_NAME_{N}_`` entries.
    """
    re_data = deepcopy(data)

    # Suffixes removed from the attachment's display name before the detected
    # extension is appended. Longer variants come first so that ".xlsx" and
    # ".docx" are removed whole instead of leaving a stray "x" behind.
    known_suffixes = ('.xlsx', '.xls', '.docx', '.doc', '.zip', '.pdf', '.PDF')
    serial_offset = 3932

    # File storage
    for index in range(1, 10):
        name_key = f"FJ{index}_NAME_"
        url = data.get(f'FJ{index}_URL_')
        if name_key not in data or not url:
            continue

        # Prefer the type derived from the URL, fall back to the display name.
        extension = find_type(url) or find_type(data.get(name_key))
        if not extension:
            # NOTE(review): an attachment of unknown type ends processing of
            # all remaining slots (original behaviour, kept as is) - confirm
            # this is intended rather than skipping just this slot.
            return re_data

        try:
            response = req_for_something(url=url)
        except Exception:
            self.logger.exception('文件获取出错')
            continue
        if not response:
            continue

        try:
            # TODO: decide whether a failed save should abort or be skipped.
            serial_number = req_for_serial_number(code="GOV_ZX_GDS")
            base_name = data.get(name_key)
            for suffix in known_suffixes:
                base_name = base_name.replace(suffix, '')
            # One name is used for both the file on disk and the recorded
            # field, so the two can no longer disagree.
            stored_name = (str(int(serial_number[5:13]) - serial_offset)
                           + '-' + base_name + extension)
            re_data[f'FILE_NAME_{index}_'] = stored_name
            with open(src_dir + stored_name, 'wb+') as fp:
                fp.write(response.content)
            print('保存文件成功', ' ', stored_name)
        except Exception as e:
            self.logger.exception(
                f"2.1--err: PDF"
                f" 原始数据 collection = {self.m_client.mongo_collection};"
                f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                f"error: {e}.")
        finally:
            response.close()
    return re_data
def __shuffle(self, data):
    """Enrich a record with model outputs, then apply the shared rules.

    Sets ``ID_`` and ``conten_type``, asks three HTTP model services for a
    title classification (``type``), a text summary (``summary``) and a
    location (``location``), and finally delegates to the parent class's
    ``generic_shuffle``.

    :param data: record dict, modified in place.
    :return: whatever the parent ``generic_shuffle`` returns for ``data``.
    """
    classify_url = 'http://172.22.69.39:8099/ZHclassify'
    summary_url = 'http://172.22.69.39:8101/ZHsummary'
    location_url = 'http://172.22.69.39:8100/ZHlocation'

    data["ID_"] = req_for_serial_number(code="JRCP_LCCP_INFO")
    content = ''
    # Attachment type: from the URL when detectable, otherwise from the name.
    data['conten_type'] = (find_type(data.get('FJ1_URL_'))
                           or find_type(data.get('FJ1_NAME_')))

    # Text classification model.
    # NOTE(review): when this request fails "type" is left unset, unlike
    # "summary"/"location" below which fall back to '' - confirm intended.
    try:
        response = requests.post(
            classify_url, data={'title': data.get('TITLE_')}).json()
    except Exception as e:
        self.logger.exception(f"err: 请求模型 {classify_url} 错误. {e}")
    else:
        data["type"] = response["type"] if response else '发展规划_其他'

    if data.get('CONTENT_'):
        # Short bodies are supplemented with the text of the first attachment.
        if len(data.get('CONTENT_')) < 500:
            data['accessory'] = (
                str(transform_data(data.get('FJ1_URL_'), data))
                if data.get('FJ1_URL_') else '')
        content = data.get('CONTENT_').replace('|', '')
        accessory = data.get('accessory')
        # The accessory text is absent for long bodies; check for it
        # explicitly instead of relying on a bare ``except`` around the
        # string concatenation.
        if data.get('FJ1_URL_') and isinstance(accessory, str):
            content += accessory

    if content:
        # Text summary model.
        try:
            response = requests.post(
                summary_url, data={'text': content[:500]}).json()
        except Exception as e:
            # The log names the endpoint that actually failed.
            self.logger.exception(f"err: 请求模型 {summary_url} 错误. {e}")
            data["summary"] = ''
        else:
            data["summary"] = response.get("summary") if response else ''

        # Place-name (with confidence) model.
        try:
            response = requests.post(
                location_url, data={'text': content[:500]}).json()
        except Exception as e:
            self.logger.exception(f"err: 请求模型 {location_url} 错误. {e}")
            data["location"] = ''
        else:
            data["location"] = response.get("address") if response else ''

    re_data = super(BranchOrganize, self).generic_shuffle(
        data=data, re_data=data, field="ENTITY_NAME_")
    return re_data
def __shuffle(self, data):
    """Assign an ID and normalise the record's bank name and code.

    Every entry of ``self.bank_list`` whose ``ALIAS_`` contains the record's
    ``BANK_NAME_`` contributes its ``NAME_`` and ``CODE_``; multiple matches
    are joined with ``|``. The record is modified in place and returned.

    :param data: record dict.
    :return: the same ``data`` object.
    """
    data["ID_"] = req_for_serial_number(code="CRM_MARKET_ACT")

    matched_banks = [
        bank for bank in self.bank_list
        if data.get('BANK_NAME_') in bank['ALIAS_']
    ]
    names = [bank["NAME_"] for bank in matched_banks]
    codes = [bank["CODE_"] for bank in matched_banks]

    if names:
        data["BANK_NAME_"] = "|".join(names)
    if codes:
        data["BANK_CODE_"] = "|".join(codes)
    return data
def generic_shuffle(self, data, field="CONTENT_"):
    """Clean one news record and wrap it for the ``CRM_NEWS`` table.

    Works on a deep copy of ``data``: assigns an ID, reduces the author to
    the editor's name when the byline has the "编辑:name" form, strips
    script residue and ``|`` separators from the body text, runs entity
    recognition to find a bank name, and normalises that name against
    ``self.bank_list``.

    :param data: raw record dict; must contain ``NEWS_DESC_TEXT_``.
    :param field: unused; kept for interface compatibility.
    :return: ``[{"TABLE_NAME_": 'CRM_NEWS', "DATA_": cleaned_record}]``.
    """
    re_data = copy.deepcopy(data)
    re_data["ID_"] = req_for_serial_number(code="CRM_NEWS")

    # Author: only rewritten when an editor name can actually be extracted;
    # a byline that merely mentions "编辑" is left as it is instead of
    # raising IndexError.
    author = data.get("NEWS_AUTHOR_")
    if author and "编辑" in author:
        editor = re.search(r"编辑[::](\w+)", author)
        if editor:
            re_data["NEWS_AUTHOR_"] = editor.group(1)

    # Body text
    re_data["NEWS_DESC_TEXT_"] = re.sub(
        r"(var.*?;\|)(?![a-zA-Z])", "",
        data["NEWS_DESC_TEXT_"]).replace("|", "")

    # Model call -- entity recognition
    try:
        res = req_for_ner(text=re_data["NEWS_DESC_TEXT_"])
    except Exception as e:
        self.logger.exception(
            f"2.2--err: 请求模型 req_for_ner 错误."
            f" 原始数据 collection = {self.m_client.mongo_collection};"
            f" ENTITY_CODE_ = {self.entity_code};"
            f" 原始数据 _id = {data.get('_id')};"
            f" error: {e}.")
    else:
        organ = res.get("Organ") if res else None
        if organ:
            bank_name = organ.get("entity")
            if bank_name and '银行' in bank_name:
                re_data["BANK_NAME_"] = bank_name

    # Normalise the bank name against the known aliases. Skipped when no
    # bank name is available so a missing value is never tested against
    # the alias collection.
    current_name = re_data.get('BANK_NAME_')
    if current_name is not None:
        matched = [bank for bank in self.bank_list
                   if current_name in bank['ALIAS_']]
        if matched:
            re_data["BANK_NAME_"] = "|".join(b["NAME_"] for b in matched)
            re_data["BANK_CODE_"] = "|".join(b["CODE_"] for b in matched)

    return [{"TABLE_NAME_": 'CRM_NEWS', "DATA_": re_data}]
def __shuffle(self, data):
    """Stamp a record with audit fields and inline its image as base64.

    The record is modified in place: ID, creation info, status flags,
    ``PERIOD_CODE_`` and source are set; the image referenced by ``IMG`` is
    downloaded and stored base64-encoded in ``IMG_``; the raw ``IMG`` and
    ``DATETIME_`` keys are removed.

    :param data: record dict; ``DATETIME_``, ``URL_`` and ``ENTITY_NAME_``
        are read unconditionally, ``IMG`` is optional.
    :return: the same ``data`` object.
    """
    data["ID_"] = req_for_serial_number(code="CRM_JJK")

    # Creation time and operator.
    data["CREATE_TIME_"] = time.strftime("%Y-%m-%d %H:%M:%S",
                                         time.localtime())
    data["CREATE_BY_ID_"] = CREATE_ID
    data["CREATE_BY_NAME_"] = CREATE_NAME
    data["M_STATUS_"] = "N"
    data["DELETE_STATUS_"] = "N"
    data["DATA_STATUS_"] = "UNCHECK"
    data["PUBLISH_STATUS_"] = "N"
    data["HOT_"] = "0"
    data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace("-", "")

    source = re.findall(r"(https?://.*?)/", data["URL_"])
    if source:
        data["SOURCE_"] = source[0]
    data["SOURCE_NAME_"] = data["ENTITY_NAME_"]

    # Image handling
    image_url = data.get("IMG")
    if image_url:
        try:
            response = req_for_something(url=image_url)
        except Exception as e:
            self.logger.exception(
                f"2.1--err: IMG"
                f" 原始数据 collection = {self.m_client.mongo_collection};"
                f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                f"error: {e}.")
        else:
            try:
                if response:
                    encoded = base64.b64encode(response.content)
                    data["IMG_"] = encoded.decode("utf-8")
            finally:
                # Release the connection for falsy responses as well.
                close = getattr(response, "close", None)
                if callable(close):
                    close()
    else:
        data["IMG_"] = ""

    # "IMG" may be absent (the branch above allows for that), so remove it
    # without raising KeyError.
    data.pop("IMG", None)
    del data["DATETIME_"]
    return data
def generic_shuffle(self, data):
    """Build the cleaned Weibo account record from a raw crawl record.

    :param data: raw record dict. ``DEALTIME_`` (epoch seconds),
        ``ENTITY_NAME_``, ``ENTITY_CODE_``, ``MAIN_URL_``, ``WEIBO_CODE_``,
        ``FOCUS_``, ``FANS_``, ``COMPANY_``, ``VIRIFIED_`` and ``BIREF_``
        are read unconditionally. ``COMPANY_`` is rewritten in place when it
        looks like a date.
    :return: ``[{"TABLE_NAME_": self.p_client.table_name, "DATA_": record}]``
        where ``record`` has also been passed through the parent class's
        ``generic_shuffle``.
    """
    # Short or colloquial bank names mapped to the canonical full name.
    canonical_names = {
        "建信": "中国建设银行",
        "建行": "中国建设银行",
        "建设银行": "中国建设银行",
        "农行": "中国农业银行",
        "农业银行": "中国农业银行",
        "工行": "中国工商银行",
        "工商银行": "中国工商银行",
        "民生银行": "中国民生银行",
        "光大银行": "中国光大银行",
        "交行": "交通银行",
        "招行": "招商银行",
        "中行": "中国银行",
        "中银": "中国银行",
        "邮储银行": "中国邮政储蓄银行",
        "邮政储蓄银行": "中国邮政储蓄银行",
        "南海农商银行": "广东南海农村商业银行股份有限公司",
        "顺德农村商业银行": "广东顺德农村商业银行股份有限公司",
    }

    re_data = dict()
    period_time = time.strftime("%Y%m%d",
                                time.localtime(int(data["DEALTIME_"])))
    re_data["ID_"] = req_for_serial_number(code="WEIBO_BASIC_INFO")

    # Bank name: the first entry of name_dict whose two-character prefix
    # occurs in the account name wins.
    for key, value in self.name_dict.items():
        if key[:2] in data["ENTITY_NAME_"]:
            re_data["BANK_NAME_"] = key
            re_data["BANK_CODE_"] = value
            break
    if "BANK_NAME_" in re_data:
        re_data["BANK_NAME_"] = canonical_names.get(re_data["BANK_NAME_"],
                                                    re_data["BANK_NAME_"])

    re_data["PERIOD_CODE_"] = period_time

    # Data source URL (scheme + host). Falls back to "" when the URL has no
    # path separator after the host instead of raising IndexError.
    source = re.findall(r"(https?://.*?)/", data["MAIN_URL_"])
    re_data["SOURCE_"] = source[0] if source else ""
    # Data source site name
    re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]
    re_data["SOURCE_TYPE_"] = ""
    re_data["HOT_"] = "0"
    re_data["WEIBO_CODE_"] = data["WEIBO_CODE_"]
    re_data["WEIBO_NAME_"] = data["ENTITY_NAME_"]
    re_data["FOCUS_"] = data["FOCUS_"]
    re_data["FANS_"] = data["FANS_"]

    # Repair a bad COMPANY_ value: when it starts with a digits-digits-digits
    # pattern, replace it with a name derived from the account name. The
    # input record is updated too, as before.
    if re.match(r"\d+-\d+-\d+", data["COMPANY_"]):
        data["COMPANY_"] = data["ENTITY_NAME_"] + "股份有限公司"
    re_data["COMPANY_"] = data["COMPANY_"]
    # Source keys are spelled "VIRIFIED_" / "BIREF_" in the raw record.
    re_data["VIRIFIED_"] = data["VIRIFIED_"]
    re_data["BRIEF_"] = data["BIREF_"]
    re_data["VERSION_"] = "0"
    # Verified flag: official bank accounts are all treated as verified.
    re_data["VERIFIED_"] = "Y"
    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    if re_data["ENTITY_NAME_"] == "华夏银行":
        re_data["ENTITY_NAME_"] = "华夏银行微博"
    re_data["URL_"] = data["MAIN_URL_"]

    re_data = super(WeiboBasicInfoScript, self).generic_shuffle(
        data=data, re_data=re_data, field="ENTITY_NAME_")
    return [{"TABLE_NAME_": self.p_client.table_name, "DATA_": re_data}]
def generic_shuffle(self, data):
    """Clean one Mapbar point-of-interest record.

    Builds the facility record (coordinates, district/city/province, type
    codes, contact and transit fields). When the raw ``TYPE_`` contains
    "道路", a second record for the main-route table is produced from the
    location fields gathered so far.

    :param data: raw record dict; ``NAME_``, ``ADDRESS_``, ``URL_``,
        ``TYPE_``, ``BTYPE_``, ``PHONE_``, ``BUS_``, ``BUSSTOP_`` and ``_id``
        are read.
    :return: list of ``{"TABLE_NAME_": ..., "DATA_": ...}`` dicts - the
        facility record first, followed by the road record if one was built.
    :raises Exception: when ``BTYPE_`` is not in ``self.type1_dict``
        (the record is not meant to be cleaned).
    """
    # Sub-categories that are merged into one target category.
    merged_types = {
        "户外运动俱乐部": ("俱乐部", "JLB"),
        "赛马场及马术俱乐部": ("俱乐部", "JLB"),
        "室内运动健身俱乐部": ("俱乐部", "JLB"),
        "连锁店": ("便利店", "BLD"),
        "便利店": ("便利店", "BLD"),
        "电子商城": ("家电数码", "JDSM"),
        "电器商城": ("家电数码", "JDSM"),
        "诊所/卫生所": ("门诊/卫生所", "MZWSS"),
        "门诊/急诊部": ("门诊/卫生所", "MZWSS"),
    }

    re_data = dict()
    re_data["ID_"] = req_for_serial_number(code="MAPBAR")
    re_data["NAME_"] = data["NAME_"]
    re_data["ADDRESS_"] = data["ADDRESS_"].replace("|", "").replace(
        "地址:", "")
    re_data["HOT_"] = 0

    # Data source URL; "" when scheme + host cannot be extracted.
    source = re.findall(r"(https?://.*?)/", data["URL_"])
    re_data["SOURCE_"] = source[0] if source else ""
    # Data source site name
    re_data["SOURCE_NAME_"] = "图吧"
    re_data["SOURCE_TYPE_"] = "图吧"

    # Longitude / latitude. Both default to "" so the lookup below never
    # hits a missing key when the geocoding call raises.
    re_data["LNG_"] = ""
    re_data["LAT_"] = ""
    try:
        if re_data["ADDRESS_"]:
            location_result = get_lat_lng(address=re_data["ADDRESS_"])
            if location_result["status"] == 0:
                location = location_result["result"]["location"]
                re_data["LNG_"] = str(location["lng"])
                re_data["LAT_"] = str(location["lat"])
            else:
                self.logger.warning(f"_id: {data['_id']} 获取经纬度失败")
    except Exception as e:
        re_data["LNG_"] = ""
        re_data["LAT_"] = ""
        self.logger.exception(f"_id: {data['_id']} 获取经纬度失败, error: {e}")

    # District / city / province from the coordinates.
    if re_data["LAT_"]:
        try:
            area_result = get_area(",".join(
                [str(re_data["LAT_"]), str(re_data["LNG_"])]))
        except Exception as e:
            self.logger.exception(f"_id: {data['_id']} 获取地址失败, error: {e}")
        else:
            try:
                re_data["AREA_NAME_"] = area_result["result"][
                    "addressComponent"]["district"]
            except KeyError:
                re_data["AREA_NAME_"] = ""
            try:
                re_data["AREA_CODE_"] = area_result["result"][
                    "addressComponent"]["adcode"]
            except KeyError:
                re_data["AREA_CODE_"] = ""
            else:
                # City and province codes are prefixes of the district code.
                re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
                re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
                for city in self.city_list:
                    if city["CODE_"] == re_data["CITY_CODE_"]:
                        re_data["CITY_NAME_"] = city["NAME_"]
                        break
                for prov in self.province_list:
                    if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                        re_data["PROVINCE_NAME_"] = prov["NAME_"]
                        break

    # Fallback: take the city from the first two characters of a city name
    # appearing in TYPE_, then the province from the city code prefix.
    if not re_data.get("CITY_NAME_", ""):
        for city in self.city_list:
            if city["NAME_"][:2] in data["TYPE_"]:
                re_data["CITY_CODE_"] = city["CODE_"]
                re_data["CITY_NAME_"] = city["NAME_"]
                break
        if re_data.get("CITY_NAME_", ""):
            for prov in self.province_list:
                if prov["CODE_"][:2] == re_data["CITY_CODE_"][:2]:
                    re_data["PROVINCE_CODE_"] = prov["CODE_"]
                    re_data["PROVINCE_NAME_"] = prov["NAME_"]
                    break

    # CHA_BRANCH_MAIN_ROUTE: main-road record, a snapshot of the fields
    # gathered so far with ADDRESS_ renamed to ADDR_.
    has_road = False
    road_shuffle_data = None
    if "道路" in data["TYPE_"]:
        road_data = dict(re_data)
        road_data["ID_"] = req_for_serial_number(code="WD_GD")
        road_data["ADDR_"] = road_data.pop("ADDRESS_")
        road_shuffle_data = super(MapbarScript, self).generic_shuffle(
            data=data, re_data=road_data, field=None)
        has_road = True

    # CHA_BRANCH_FACILITY: facility record.
    re_data["TYPE1_"] = data["BTYPE_"]
    try:
        re_data["TYPE1_CODE_"] = self.type1_dict[re_data["TYPE1_"]]
    except KeyError:
        raise Exception("暂不需要清洗的数据")

    # Sub-category cleaning (some categories are merged).
    source_type2 = data["TYPE_"][2:]
    if source_type2 in merged_types:
        re_data["TYPE2_"], re_data["TYPE2_CODE_"] = merged_types[source_type2]
    else:
        re_data["TYPE2_"] = source_type2
        re_data["TYPE2_CODE_"] = self.type2_dict.get(re_data["TYPE2_"])

    re_data["SOURCE_TYPE1_"] = data["BTYPE_"]
    re_data["SOURCE_TYPE1_CODE_"] = self.type1_dict.get(
        re_data["SOURCE_TYPE1_"])
    re_data["SOURCE_TYPE2_"] = source_type2
    re_data["SOURCE_TYPE2_CODE_"] = self.source_type2_dict.get(
        re_data["SOURCE_TYPE2_"])
    re_data["PHONE_"] = data["PHONE_"].replace("无,", "")
    re_data["BUS_"] = data["BUS_"]
    re_data["BUSSTOP_"] = data["BUSSTOP_"]

    shuffle_data = super(MapbarScript, self).generic_shuffle(
        data=data, re_data=re_data, field=None)

    return_list = [{
        "TABLE_NAME_": TABLE_NAME("CHA_BRANCH_FACILITY"),
        "DATA_": shuffle_data
    }]
    if has_road:
        return_list.append({
            "TABLE_NAME_": TABLE_NAME("CHA_BRANCH_MAIN_ROUTE"),
            "DATA_": road_shuffle_data
        })
    return return_list
def generic_shuffle(self, data):
    """Clean one bus-stop record and complete its location fields.

    The stop is first geocoded from "<city><stop name>公交车站"; nearby bus
    stops are then paged through to find one whose name contains the stop
    name, and its coordinates replace the first estimate. The final
    coordinates are used to fill province / city / district fields.

    :param data: raw record dict; ``DATETIME_``, ``URL_``, ``ENTITY_NAME_``,
        ``ENTITY_CODE_``, ``NAME_``, ``DESCRIBE_``, ``AROUND_STATIONS_`` and
        ``AROUND_ROUTE_`` are read unconditionally.
    :return: ``[{"TABLE_NAME_": self.p_client.table_name, "DATA_": record}]``
        where ``record`` has also been passed through the parent class's
        ``generic_shuffle``.
    """
    re_data = dict()
    re_data["ID_"] = req_for_serial_number(code="WD_JT_GJ")
    # Time dimension
    re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace("-", "")
    # Tags
    if "TAGS_" in data:
        re_data["TAGS_"] = ""
    # Source URL; "" when scheme + host cannot be extracted.
    source = re.findall(r"(https?://.*?)/", data["URL_"])
    re_data["SOURCE_"] = source[0] if source else ""
    # Data source name
    re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]
    # Source category
    re_data["SOURCE_TYPE_"] = data["ENTITY_CODE_"][3:8]

    # First estimate: geocode "<text after the last '-'><stop name>公交车站".
    entity_name = data["ENTITY_NAME_"]
    temp_location = (entity_name[entity_name.rfind("-") + 1:]
                     + data["NAME_"] + "公交车站")
    try:
        lat_result = get_lat_lng(address=temp_location)
        re_data["LAT_"] = lat_result["result"]["location"]["lat"]
        re_data["LNG_"] = lat_result["result"]["location"]["lng"]
    except KeyError:
        re_data["LAT_"] = None
        re_data["LNG_"] = None
    except Exception as e:
        re_data["LAT_"] = None
        re_data["LNG_"] = None
        self.logger.info("获取经纬度失败错误信息为{}".format(e))

    if re_data.get("LAT_"):
        # Refine: page through nearby bus stops until one matches by name.
        try:
            lat_origin = ",".join(
                [str(re_data["LAT_"]), str(re_data["LNG_"])])
            page = 0
            found = False
            while True:
                nearby_page = get_periphery(classify="公交车站",
                                            tag="交通设施",
                                            lat_lng=lat_origin,
                                            radius=3000,
                                            page_num=page)
                for nearby in nearby_page["results"]:
                    if data["NAME_"] in nearby["name"]:
                        found = True
                        lat = str(nearby["location"]["lat"])
                        lng = str(nearby["location"]["lng"])
                        re_data["LAT_"] = lat
                        re_data["LNG_"] = lng
                        break
                if found:
                    break
                page += 1
                # A page with other than 20 results ends the paging.
                if len(nearby_page["results"]) != 20:
                    break
        except Exception as e:
            self.logger.info(f"获取精确经纬度失败, ERROR: {e}")

        # Complete the address from the best coordinates available: the
        # refined ones when a stop matched, otherwise the first estimate.
        try:
            area_result = get_area(",".join(
                [str(re_data["LAT_"]), str(re_data["LNG_"])]))
        except Exception as e:
            self.logger.info(f"获取地址失败, ERROR: {e}")
        else:
            try:
                component = area_result["result"]["addressComponent"]
                re_data["PROVINCE_NAME_"] = component["province"]
                re_data["CITY_NAME_"] = component["city"]
                re_data["AREA_NAME_"] = component["district"]
                re_data["AREA_CODE_"] = component["adcode"]
                re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
                re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            except KeyError:
                # Best effort: keep whatever fields were filled so far.
                pass

    # Stop description
    re_data["DESCRIBE_"] = data["DESCRIBE_"]
    # Surrounding stops
    re_data["AROUND_STATIONS_"] = self.handle_special_text(
        data["AROUND_STATIONS_"]).replace("|", ",")
    # Routes passing the stop
    re_data["AROUND_ROUTE_"] = self.handle_special_text(
        data["AROUND_ROUTE_"]).replace("|", ",")
    if re_data["AROUND_ROUTE_"]:
        re_data["AROUND_ROUTE_"] = re_data["AROUND_ROUTE_"].replace(
            "公交线路", "")
    # Stop name
    re_data["NAME_"] = data["NAME_"]

    re_data = super(Branchjtgj, self).generic_shuffle(data=data,
                                                      re_data=re_data,
                                                      field=None)
    return [{"TABLE_NAME_": self.p_client.table_name, "DATA_": re_data}]
def generic_shuffle(self, data):
    """Expand one crawled page into one record per shopping district.

    ``CONTENT_HTML_`` is parsed for ``<dl class="list">`` blocks; each
    block's ``<dt>`` link text is the area name and each ``<li>`` link text
    a shopping-district name. Every district is geocoded and, when
    coordinates are found, completed with address information.

    :param data: raw record dict; ``CONTENT_HTML_``, ``DATETIME_``, ``URL_``,
        ``ENTITY_NAME_``, ``ENTITY_CODE_`` and ``CITY_`` are read.
    :return: list of ``{"TABLE_NAME_": self.p_client.table_name,
        "DATA_": record}`` dicts, one per shopping district.
    """
    re_data_list = list()

    # Map each area name to the shopping districts listed under it.
    soup = BeautifulSoup(data["CONTENT_HTML_"], "html.parser")
    dt_dict = dict()
    for item in soup.find_all('dl', {"class": "list"}):
        area = item.dt.a.string
        dt_dict[area] = [li.a.string for li in item.find_all('li')]

    for area_name, shopping_list in dt_dict.items():
        for shopping_name in shopping_list:
            re_data = dict()
            # Time dimension
            re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace("-", "")
            # Tags
            if "TAGS_" in data:
                re_data["TAGS_"] = ""
            # Source URL; "" when scheme + host cannot be extracted.
            source = re.findall(r"(https?://.*?)/", data["URL_"])
            re_data["SOURCE_"] = source[0] if source else ""
            # Data source name
            re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]
            # Source category
            re_data["SOURCE_TYPE_"] = data["ENTITY_CODE_"][3:8]
            # ID: every district gets its own serial number.
            re_data["ID_"] = req_for_serial_number(code="WD_SS_SQ")

            # Coordinates of the district.
            try:
                lat_result = get_lat_lng(address=data["CITY_"] + "市" +
                                         area_name + shopping_name)
                re_data["LAT_"] = lat_result["result"]["location"]["lat"]
                re_data["LNG_"] = lat_result["result"]["location"]["lng"]
            except KeyError:
                re_data["LAT_"] = None
                re_data["LNG_"] = None
            except Exception as e:
                re_data["LAT_"] = None
                re_data["LNG_"] = None
                self.logger.info("获取经纬度失败错误信息为{}".format(e))

            # Province / city / district from the coordinates.
            if re_data["LAT_"]:
                try:
                    area_result = get_area(",".join(
                        [str(re_data["LAT_"]), str(re_data["LNG_"])]))
                except Exception as e:
                    self.logger.info(f"获取地址失败, ERROR: {e}")
                else:
                    try:
                        result = area_result["result"]
                        re_data["ADDR_"] = result["formatted_address"]
                        component = result["addressComponent"]
                        re_data["PROVINCE_NAME_"] = component["province"]
                        re_data["CITY_NAME_"] = component["city"]
                        re_data["AREA_NAME_"] = component["district"]
                        re_data["AREA_CODE_"] = component["adcode"]
                        re_data["CITY_CODE_"] = (
                            re_data["AREA_CODE_"][:4] + "00")
                        re_data["PROVINCE_CODE_"] = (
                            re_data["AREA_CODE_"][:2] + "00")
                    except KeyError:
                        # Incomplete response: fall back to what the page
                        # itself says.
                        re_data["ADDR_"] = shopping_name
                        re_data["PROVINCE_NAME_"] = None
                        re_data["CITY_NAME_"] = data["CITY_"] + "市"
                        re_data["AREA_NAME_"] = None
                        re_data["AREA_CODE_"] = None
                        re_data["CITY_CODE_"] = None
                        re_data["PROVINCE_CODE_"] = None

            re_data["NAME_"] = shopping_name
            re_data = super(Branchsssq, self).generic_shuffle(
                data=data, re_data=re_data, field=None)
            re_data_list.append({
                "TABLE_NAME_": self.p_client.table_name,
                "DATA_": re_data
            })
    return re_data_list
def generic_shuffle(self, data):
    """Clean one subway-station record and complete its location fields.

    The station is geocoded from "<first line name><station name>地铁站",
    refined against nearby subway stations with exactly the same name, and
    completed with province / city / district fields. When ``SUBWAY_NAME_``
    lists several lines separated by ",", one output record per line is
    produced, each with its own serial number.

    :param data: raw record dict; ``DATETIME_``, ``URL_``, ``ENTITY_NAME_``,
        ``ENTITY_CODE_``, ``SUBWAY_NAME_`` and ``STATION_NAME_`` are read
        unconditionally.
    :return: list of ``{"TABLE_NAME_": self.p_client.table_name,
        "DATA_": record}`` dicts.
    """
    re_data = dict()
    # ID
    re_data["ID_"] = req_for_serial_number(code="WD_JT_DT")
    # Time dimension
    re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace("-", "")
    # Tags
    if "TAGS_" in data:
        re_data["TAGS_"] = ""
    # Source URL; "" when scheme + host cannot be extracted.
    source = re.findall(r"(https?://.*?)/", data["URL_"])
    re_data["SOURCE_"] = source[0] if source else ""
    # Data source name
    re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]
    # Source category
    re_data["SOURCE_TYPE_"] = data["ENTITY_CODE_"][3:8]

    # First estimate of the coordinates. The query uses the text before the
    # first "|" of the line name; partition keeps the whole name when there
    # is no "|" (slicing with find() would drop its last character).
    first_line = data["SUBWAY_NAME_"].partition("|")[0]
    temp_location = first_line + data["STATION_NAME_"] + "地铁站"
    try:
        lat_result = get_lat_lng(address=temp_location)
        re_data["LAT_"] = lat_result["result"]["location"]["lat"]
        re_data["LNG_"] = lat_result["result"]["location"]["lng"]
    except KeyError:
        re_data["LAT_"] = None
        re_data["LNG_"] = None
    except Exception as e:
        re_data["LAT_"] = None
        re_data["LNG_"] = None
        self.logger.info("获取经纬度失败错误信息为{}".format(e))

    if re_data["LAT_"]:
        # Refine: page through nearby subway stations until one has exactly
        # the station's name.
        try:
            lat_origin = ",".join(
                [str(re_data["LAT_"]), str(re_data["LNG_"])])
            page = 0
            found = False
            while True:
                nearby_page = get_periphery(classify="地铁站",
                                            tag="交通设施",
                                            lat_lng=lat_origin,
                                            radius=3000,
                                            page_num=page)
                for nearby in nearby_page["results"]:
                    if nearby["name"] == data["STATION_NAME_"]:
                        found = True
                        lat = str(nearby["location"]["lat"])
                        lng = str(nearby["location"]["lng"])
                        re_data["LAT_"] = lat
                        re_data["LNG_"] = lng
                        break
                if found:
                    break
                page += 1
                # A page with other than 20 results ends the paging.
                if len(nearby_page["results"]) != 20:
                    break
        except Exception as e:
            self.logger.info(f"获取精确经纬度失败, ERROR: {e}")

        # Complete the address from the best coordinates available: the
        # refined ones when a station matched, otherwise the first estimate.
        try:
            area_result = get_area(",".join(
                [str(re_data["LAT_"]), str(re_data["LNG_"])]))
        except Exception as e:
            self.logger.info(f"获取地址失败, ERROR: {e}")
        else:
            try:
                component = area_result["result"]["addressComponent"]
                re_data["PROVINCE_NAME_"] = component["province"]
                re_data["CITY_NAME_"] = component["city"]
                re_data["AREA_NAME_"] = component["district"]
                re_data["AREA_CODE_"] = component["adcode"]
                re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
                re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            except KeyError:
                # Best effort: keep whatever fields were filled so far.
                pass

    # Station name
    re_data["STATION_NAME_"] = data["STATION_NAME_"]

    # Routes passing the station ("地铁<N>号线"), comma-separated; "" if none.
    subway_name = data["SUBWAY_NAME_"].replace("|", "-")
    re_data["AROUND_ROUTE_"] = ",".join(
        re.findall(r"地铁\d+号线", subway_name))

    # Subway line name: one output record per listed line.
    if "," in subway_name:
        re_data_list = list()
        for subway in subway_name.split(","):
            # Each split-out line needs its own serial number.
            re_data["ID_"] = req_for_serial_number(code="WD_JT_DT")
            re_data["SUBWAY_NAME_"] = subway + "-" + re_data["STATION_NAME_"]
            re_data = super(Branchjtdt, self).generic_shuffle(
                data=data, re_data=re_data, field=None)
            # Deep copy so later iterations cannot alter this entry.
            re_data_list.append(deepcopy({
                "TABLE_NAME_": self.p_client.table_name,
                "DATA_": re_data
            }))
        return re_data_list

    re_data["SUBWAY_NAME_"] = subway_name + "-" + re_data["STATION_NAME_"]
    re_data = super(Branchjtdt, self).generic_shuffle(data=data,
                                                      re_data=re_data,
                                                      field=None)
    return [{"TABLE_NAME_": self.p_client.table_name, "DATA_": re_data}]
def generic_shuffle(self, data, re_data, field=None):
    """Apply the cleaning rules shared by all scripts.

    Ensures the record has an ID, uploads the attached report file for
    ``YJBG_`` entities, copies identifying fields from the raw record and
    fills audit fields and default status flags.

    :param data: raw record dict; ``ENTITY_CODE_`` is always read.
    :param re_data: cleaned record dict, updated in place.
    :param field: name of the field to match banks from. Currently unused:
        per the original note, bank matching is handled centrally in the
        package ``__init__`` module.
    :return: ``re_data`` after the shared rules were applied.
    :raises Exception: when the report file request returns a falsy
        response ("文件请求失败") or the upload fails ("上传文件出错").
    """
    if "ID_" not in re_data:
        re_data["ID_"] = req_for_serial_number(code=data["ENTITY_CODE_"][:8])

    # File upload
    if "YJBG_" in data["ENTITY_CODE_"] and data["FILE_URL_"]:
        re_postfix = re.findall(r"\.([pd][do][fc]x?$)", data["FILE_URL_"])
        # These two entities always deliver PDFs, even without a suffix.
        if re_postfix or data.get('ENTITY_CODE_') in [
                'XYK_YJBG_GFYH', 'XYK_YJBG_JTYH'
        ]:
            postfix = re_postfix[0] if re_postfix else 'pdf'
            if "FILE_NAME_" in data:
                file_name = data["FILE_NAME_"]
            else:
                # NOTE(review): the lazy match starts at the first "/" of
                # the URL, so the captured name includes host and path -
                # confirm whether only the last path segment was intended.
                re_file_name = re.findall(rf"/(.*?)\.{postfix}",
                                          data["FILE_URL_"], re.IGNORECASE)
                if re_file_name:
                    file_name = re_file_name[0]
                else:
                    file_name = str(uuid.uuid1())
            try:
                response = req_for_something(url=data["FILE_URL_"])
            except Exception as e:
                self.logger.exception(
                    f"2.1--err: PDF"
                    f" 原始数据 collection = {self.m_client.mongo_collection};"
                    f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                    f" 原始数据 _id = {data['_id']};"
                    f"error: {e}.")
            else:
                print('附件请求成功')
                if response:
                    try:
                        p_response = req_for_file_save(
                            id=re_data["ID_"],
                            type_code="CHA_YJBG",
                            file_name=file_name,
                            postfix=postfix,
                            file=response.content)
                        try:
                            upload_result = p_response.content.decode("utf-8")
                            if "error" in upload_result:
                                self.logger.info(
                                    f"2.3--err:文件上传错误."
                                    f" 原始数据collection={self.m_client.mongo_collection};"
                                    f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                                    f" 原始数据 _id = {data['_id']};"
                                    f" error: {upload_result}.")
                                raise Exception("上传文件出错")
                            self.logger.info(f"2.3--success: 文件上传成功."
                                             f"{upload_result}")
                        finally:
                            # Closed on the error path as well.
                            p_response.close()
                    except Exception as e:
                        self.logger.exception(
                            f"2.1--err: PDF"
                            f" 原始数据 collection = {self.m_client.mongo_collection};"
                            f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                            f" 原始数据 _id = {data['_id']};"
                            f"error: {e}.")
                        raise Exception("上传文件出错") from e
                    finally:
                        response.close()
                else:
                    # No exception is active here, so log at error level
                    # rather than with logger.exception.
                    self.logger.error(
                        f"2.1--err: PDF"
                        f" 原始数据 collection = {self.m_client.mongo_collection};"
                        f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                        f" 原始数据 _id = {data['_id']};"
                        f"error: PDF 请求失败.")
                    raise Exception("文件请求失败")

    # Identifying fields copied from the raw record when not already set.
    if "ENTITY_CODE_" not in re_data:
        re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    if "ENTITY_NAME_" not in re_data:
        re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    if "URL_" not in re_data and "URL_" in data:
        re_data["URL_"] = data["URL_"]

    # Creation time and operator.
    re_data["CREATE_TIME_"] = time.strftime("%Y-%m-%d %H:%M:%S",
                                            time.localtime())
    re_data["CREATE_BY_ID_"] = CREATE_ID
    re_data["CREATE_BY_NAME_"] = CREATE_NAME

    # Crawl time: DATETIME_ when present, otherwise derived from DEALTIME_.
    if "DATETIME_" in data:
        re_data["SPIDER_TIME_"] = data["DATETIME_"]
    elif "DEALTIME_" in data:
        re_data["SPIDER_TIME_"] = arrow.get(
            data["DEALTIME_"]).format("YYYY-MM-DD")

    # Defaults for fields the calling script did not set.
    if "PERIOD_CODE_" not in re_data:
        re_data["PERIOD_CODE_"] = re_data.get("PUBLISH_TIME_", "")
    re_data.setdefault("M_STATUS_", "N")
    re_data.setdefault("DELETE_STATUS_", "N")
    re_data.setdefault("DATA_STATUS_", "UNCHECK")
    re_data.setdefault("VERSION_", "0")
    re_data.setdefault("DATA_VERSION_", "0")
    # Records whose entity code contains "MICROBLOG" get no default
    # publish status.
    if ("MICROBLOG" not in re_data["ENTITY_CODE_"]
            and "PUBLISH_STATUS_" not in re_data):
        re_data["PUBLISH_STATUS_"] = "N"
    return re_data
def generic_shuffle(self, data):
    """
    Clean one credit-card record, mining most attributes from ``CONTENT_``.

    ``CONTENT_`` is searched for ``label:|value|`` cells. Each extracted
    value is de-duplicated / normalised and stored in the result. The parent
    ``generic_shuffle`` is then called with ``field=None`` to add the common
    bookkeeping fields.

    :param data: raw record, type: dict (``PRO_NAME_`` is truncated in place
        at the first parenthesis)
    :return: a one-element list ``[{"TABLE_NAME_": ..., "DATA_": re_data}]``
    """
    # NOTE(review): the source this was reconstructed from contained dead
    # duplicate branches (`elif "(" ...` after `if "(" ...`, `[::]`,
    # `x == "预借现金额度:" or x == "预借现金额度:"`). They only make sense if one
    # variant of each pair was the full-width character, so both widths are
    # accepted below. Confirm against the crawled data.

    # Short bank names found in the source data -> official full names.
    full_bank_names = {
        "建信": "中国建设银行",
        "建行": "中国建设银行",
        "建设银行": "中国建设银行",
        "农行": "中国农业银行",
        "农业银行": "中国农业银行",
        "工行": "中国工商银行",
        "工商银行": "中国工商银行",
        "民生银行": "中国民生银行",
        "光大银行": "中国光大银行",
        "交行": "交通银行",
        "招行": "招商银行",
        "中行": "中国银行",
        "中银": "中国银行",
        "邮储银行": "中国邮政储蓄银行",
    }

    def is_label(text, name):
        # True when an empty cell made the regex capture the next label.
        return text in (name + ":", name + "：")

    def first_group(pattern, text):
        # Collapse a repeated value to its first occurrence when it matches.
        matched = re.match(pattern, text)
        return matched.group(1) if matched else text

    re_data = dict()
    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["URL_"] = data["URL_"]
    re_data["ID_"] = req_for_serial_number(code="JRCP_XYK")
    # Time dimension.
    re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace("-", "")
    source = re.findall(r"(https?://.*?)/", data["URL_"])
    if source:  # a URL without a path yields no match
        re_data["SOURCE_"] = source[0]
    re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"]
    re_data["SOURCE_TYPE_"] = "WAK"

    # Bank: the first two characters of a known name must occur in PRO_NAME_.
    for key, value in self.name_dict.items():
        if key[:2] in data["PRO_NAME_"]:
            re_data["BANK_NAME_"] = key
            re_data["BANK_CODE_"] = value
            break
    if "BANK_NAME_" in re_data:
        re_data["BANK_NAME_"] = full_bank_names.get(re_data["BANK_NAME_"],
                                                    re_data["BANK_NAME_"])

    # Card name: drop everything from the first parenthesis on.
    if "PRO_NAME_" in data:
        if "(" in data["PRO_NAME_"]:
            data["PRO_NAME_"] = data["PRO_NAME_"][:data["PRO_NAME_"].find("(")]
        elif "（" in data["PRO_NAME_"]:
            data["PRO_NAME_"] = data["PRO_NAME_"][:data["PRO_NAME_"].find("（")]
        re_data["PRO_NAME_"] = data["PRO_NAME_"]

    # Card currency and its code.
    if "CURRENCY_TYPE_" in data:
        re_data["CURRENCY_TYPE_"] = data["CURRENCY_TYPE_"]
        if data["CURRENCY_TYPE_"] == "人民币":
            re_data["CURRENCY_TYPE_CODE_"] = "RMB"
        if re.match(r"人民币/.*?", data["CURRENCY_TYPE_"]):
            re_data["CURRENCY_TYPE_CODE_"] = "DBZ"
        if data["CURRENCY_TYPE_"] == "美元":
            re_data["CURRENCY_TYPE_CODE_"] = "DBZ"

    # Card organisation / settlement channel and its code.
    if "BRAND_" in data:
        re_data["BRAND_"] = data["BRAND_"]
        for brand_key in self.brand_dict:
            if brand_key in data["BRAND_"]:
                re_data["BRAND_CODE_"] = self.brand_dict[brand_key]
                break

    # Card level and its code (compared on the first two characters).
    if "LEVEL_" in data:
        re_data["LEVEL_"] = data["LEVEL_"]
        for level_key in self.level_dict:
            if level_key[:2] in data["LEVEL_"][:2]:
                re_data["LEVEL_CODE_"] = self.level_dict[level_key]
                break

    if "CONSUME_LIMIT_" in data:
        re_data["CONSUME_LIMIT_"] = data["CONSUME_LIMIT_"]

    # ---- Fields mined from the large text -------------------------------
    content = data["CONTENT_"]

    def cell_after(label, colon="[:：]"):
        # Value of the first "label:|value|" cell, or None when absent.
        found = re.findall(rf".*?{label}{colon}\|(.*?)\|", content)
        return found[0] if found else None

    # Interest-free period.
    grace = cell_after("免息期")
    if grace is not None:
        # Repair swapped data such as "到20天50天".
        swapped = re.compile(r"到(\d+)天(\d+)天")
        if re.match(swapped, grace):
            grace = swapped.sub(r"\1天到\2天", grace)
        if is_label(grace, "消费验证方式"):
            grace = ""
        if is_label(grace, "预借现金额度"):
            grace = ""
        grace = first_group(r"(最长\d+天)最长\d+天", grace)
        doubled = re.match(r"(\d+天)到(\d+天)(\d+天)到\d+天", grace)
        if doubled:
            if doubled.group(1) == doubled.group(2):
                grace = doubled.group(1) + "到" + doubled.group(3)
            else:
                grace = doubled.group(1) + "到" + doubled.group(2)
        quad = re.match(r"(\d+天)\d+天(\d+天)\d+天", grace)
        if quad:
            grace = quad.group(1) + "到" + quad.group(2)
        pair = re.match(r"(\d+天)(\d+天)", grace)
        if pair:
            grace = pair.group(1) + "到" + pair.group(2)
        upto = re.match(r"至(\d+天)(\d+天)", grace)
        if upto:
            grace = upto.group(1) + "到" + upto.group(2)
        re_data["GRACE_PERIODS_"] = grace
    else:
        re_data["GRACE_PERIODS_"] = data["GRACE_PERIODS_"]

    # Annual-fee waiver policy (left unset when the cell is absent).
    free_policy = cell_after("免年费政策")
    if free_policy is not None:
        for repeated in (r"(免\d+年年费){2,9}", r"(终身免年费){2,9}"):
            free_policy = first_group(repeated, free_policy)
        re_data["FREE_POLICY_"] = free_policy

    # Primary-card annual fee: first run of digits in the cell.
    fee = cell_after("主卡年费")
    if fee is not None:
        digits = re.findall(r".*?(\d+).*?", fee)
        re_data["FEE_"] = digits[0] if digits else ""
    else:
        re_data["FEE_"] = "0"

    # Cash-advance limit.
    pre_borrow = cell_after("预借现金额度")
    if pre_borrow is not None:
        if is_label(pre_borrow, "免息期"):
            pre_borrow = ""
        if is_label(pre_borrow, "免年费政策"):
            pre_borrow = ""
        # Remove repeated data, keeping the first clause.
        for repeated in (
                r"(信用额度的\d+%)信用额度的\d+%",
                r"(信用额度的\d+-\d+%)信用额度的\d+%",
                r"(普卡信用额度的\d+%)白金卡信用额度的\d+%金卡信用额度的\d+%",
                r"(普卡信用额度的\d+%)金卡信用额度的\d+%",
                r"(白金卡信用额度的\d+%)金卡信用额度的\d+%",
        ):
            pre_borrow = first_group(repeated, pre_borrow)
        re_data["PRE_BORROW_"] = pre_borrow
    else:
        re_data["PRE_BORROW_"] = ""

    # Consumption verification method (constant).
    re_data["VALID_CONSUME_"] = "密码+签名 签名"

    # Billing date, e.g. "账单日21号账单日21号账单日21号" -> "账单日21号".
    bill_date = cell_after("账单日")
    if bill_date is not None:
        re_data["BILL_DATE_"] = first_group(r"(账单日\d+号){2,9}", bill_date)
    else:
        re_data["BILL_DATE_"] = ""

    # Points rule: insert separators between the concatenated clauses.
    points = cell_after("积分方式")
    if points is not None:
        if re_data.get("BANK_CODE_") == "CMB":
            points = points.replace("元", "元 ")
        else:
            points = points.replace("分", "分 ")
            points = points.replace("倍", "倍 ")
            points = points.replace("积分 的2倍", "积分的2倍")
        re_data["POINTS_"] = points
    else:
        re_data["POINTS_"] = ""

    # Points validity: put spaces between the groups.
    valid_points = cell_after("积分有效期")
    if valid_points is not None:
        for grouped in (r"(白金卡\d+年)(金卡\d+年)(普卡\d+年)",
                        r"(\d+年到\d+年)(\d+年)(永久有效)"):
            if re.match(grouped, valid_points):
                valid_points = re.sub(grouped, r"\1 \2 \3", valid_points)
        re_data["VALID_DATE_POINTS_"] = valid_points
    else:
        re_data["VALID_DATE_POINTS_"] = ""

    # Revolving-credit daily interest. The parentheses are literal text, so
    # they are matched with character classes; a bare "(日息)" would be a
    # second capture group and make findall return tuples.
    daily_interest = cell_after(r"循环信用利息[(（]日息[)）]", colon="[:：]?")
    if daily_interest is not None:
        if is_label(daily_interest, "消费短信通知费"):
            daily_interest = ""
        re_data["DAILY_INTEREST_"] = daily_interest
    else:
        re_data["DAILY_INTEREST_"] = ""

    # Minimum repayment.
    min_repay = cell_after("最低还款", colon="[:：]?")
    if min_repay is not None:
        min_repay = first_group(r"(最低应还所欠金额的\d+%)最低应还所欠金额的\d+%",
                                min_repay)
        if is_label(min_repay, "账单日"):
            min_repay = ""
        re_data["MIN_REPAY_"] = min_repay
    else:
        re_data["MIN_REPAY_"] = ""

    # Card features / value-added services: "|" separators become <br/>.
    if "SPECIAL_" in data and len(data["SPECIAL_"]) > 0:
        re_data["SPECIAL_"] = data["SPECIAL_"].replace("|", "<br/>")
    if "VAS_" in data and len(data["VAS_"]) > 0:
        re_data["VAS_"] = data["VAS_"].replace("|", "<br/>")

    # Card image, stored base64-encoded. URLs wrongly prefixed with
    # "https:" in front of "http://..." are repaired first.
    if "IMAGES_" in data:
        broken = re.match(r"https:(http://.*)", data["IMAGES_"])
        image_url = broken.group(1) if broken else data["IMAGES_"]
        response = req_for_something(url=image_url)
        if response:
            try:
                re_data["IMAGE_"] = base64.b64encode(
                    response.content).decode("utf-8")
            finally:
                response.close()

    re_data = super(BranchXyk, self).generic_shuffle(data=data,
                                                     re_data=re_data,
                                                     field=None)
    re_data["PUBLISH_TIME_"] = re_data["SPIDER_TIME_"]
    return [{"TABLE_NAME_": self.script_name, "DATA_": re_data}]
def generic_shuffle(self, data):
    """
    Clean one WeChat article record.

    Records with an empty ``TITLE_`` are skipped (``None`` is returned).
    Otherwise the article fields are built and the account is looked up in
    ``self.excel_dict`` for its name and location. ``CONTENT_`` is replaced
    in ``data`` by a sanitised ``HTML_`` (no scripts, links neutralised)
    before the parent ``generic_shuffle`` is called.

    :param data: raw record, type: dict (modified in place: ``CONTENT_`` is
        removed and ``HTML_`` is added)
    :return: ``[{"TABLE_NAME_": ..., "DATA_": cleaned}]`` or ``None``
    """
    title = data["TITLE_"]
    if not title:
        return

    cleaned = {
        "ID_": req_for_serial_number(code="WECHAT"),
        "PERIOD_CODE_": data["PERIOD_CODE_"].replace("-", ""),
        "SOURCE_TYPE_": "WECHAT",
        "HOT_": "0",
        "PUBLISH_TIME_": data["PERIOD_CODE_"],
        "TITLE_": title,
        "TITLE_CODE_": base64.b64encode(title.encode("utf-8")).decode("utf-8"),
        "WECHAT_ID_": data["WECHAT_"].strip(),
    }

    # Account metadata comes from the first entry with a matching code.
    account = next((entry for entry in self.excel_dict
                    if cleaned["WECHAT_ID_"] == entry["WECHAT_CODE_"]), None)
    if account is not None:
        cleaned.update({
            "WECHAT_NAME_": account["WECHAT_NAME_"],
            "PROVINCE_NAME_": account["PROVINCE_NAME_"],
            # Codes may arrive as floats ("440000.0"); keep the integer part.
            "PROVINCE_CODE_": str(account["PROVINCE_CODE_"]).split(".")[0],
            "CITY_NAME_": account["CITY_NAME_"],
            "CITY_CODE_": str(account["CITY_CODE_"]).split(".")[0],
            "LAT_": str(account["LAT_"]),
            "LNG_": str(account["LNG_"]),
        })

    # Fixed defaults; type and activity fields are back-filled elsewhere.
    cleaned.update({
        "IMPORTANCE_": "N",
        "READS_": "0",
        "COMMENTS_": "0",
        "PUBLISH_STATUS_": "N",
        "SENSITIVE_": "N",
        "VERSION_": "0",
        "RECOMMEND_": "0",
    })

    # Sanitise the article body: one line, no scripts, inert links.
    page = re.sub(r"[\n\t\r]+", "", data["CONTENT_"])
    page = re.sub(r"<script.*?</script>", "", page)
    page = re.sub(r"href=\".*?\"", "href=\"javascript:void(0);\"", page)
    del data["CONTENT_"]
    data["HTML_"] = page

    cleaned = super(WechatScript, self).generic_shuffle(data=data,
                                                        re_data=cleaned,
                                                        field="ENTITY_NAME_")
    if cleaned.get('_id'):
        cleaned.pop('_id')
    return [{"TABLE_NAME_": self.p_client.table_name, "DATA_": cleaned}]
def generic_shuffle(self, data):
    """
    Clean one Weibo post and its comments.

    Produces one row for ``CHA_BRANCH_WEIBO_INFO`` (the post) followed by
    one row per entry of ``INFO_COMMENTS_`` for
    ``CHA_BRANCH_WEIBO_COMMENT``. Model calls (summary, censor, hotness,
    sentiment) are best-effort: failures are logged and defaults are used.

    :param data: raw record, type: dict
    :return: list of ``{"TABLE_NAME_": ..., "DATA_": ...}`` dicts
    """
    # NOTE(review): nesting reconstructed from a flattened source; confirm
    # in particular that the comment censor call is not guarded by the
    # "comment is non-empty" check.
    re_data = list()

    # ---- CHA_BRANCH_WEIBO_INFO ------------------------------------------
    info_data = dict()
    serial_number = req_for_serial_number(code="WEIBO_INFO")
    info_data["ID_"] = serial_number
    print(serial_number)
    info_data["ENTITY_CODE_"] = data["BANK_CODE_"]
    info_data["URL_"] = data["CONTENT_URL_"]
    info_data["PERIOD_CODE_"] = data["PUBLISH_TIME_"].replace("-", "")
    # Source site (scheme + host); absent when the URL has no path.
    source = re.findall(r"(https?://.*?)/", data["CONTENT_URL_"])
    if source:
        info_data["SOURCE_"] = source[0]
    info_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]
    info_data["SOURCE_TYPE_"] = "WEIBO"
    # Counters default to 0 when empty.
    info_data["LIKES_"] = data["PRAISES_"] or 0
    info_data["COMMENTS_"] = data["REPLIES_"] or 0
    info_data["RELAYS_"] = data["RELAYS_"] or 0
    info_data["IMPORTANCE_"] = "N"
    info_data["PUBLISH_TIME_"] = data["PUBLISH_TIME_"]
    info_data["CONTENT_"] = data["CONTENT_"]

    # Images are stored base64-encoded as IMAGE_1, IMAGE_2, ... by position.
    # Numbering by position (not list.index) keeps duplicate URLs from
    # overwriting each other.
    for position, each_image in enumerate(data.get("CONTENT_IMAGES_") or [],
                                          start=1):
        response = req_for_something(url=each_image)
        if response:
            try:
                info_data[f"IMAGE_{position}"] = base64.b64encode(
                    response.content).decode("utf-8")
            finally:
                response.close()

    info_data["PUBLISH_STATUS_"] = "N"
    if "OWN_" in data:
        # "转载" means reposted, i.e. not original.
        info_data["OWN_"] = "N" if data["OWN_"] == "转载" else "Y"
    for each in self.weibo_list:
        if each["WEIBO_NAME_"] == data["ENTITY_NAME_"]:
            info_data["WEIBO_CODE_"] = each["WEIBO_CODE_"]
            info_data["WEIBO_NAME_"] = each["WEIBO_NAME_"]
            break

    # Model: summary.
    try:
        brief = req_for_ts(info_data["CONTENT_"])
        if brief:
            info_data["BRIEF_"] = brief["summary"]
    except Exception as e:
        self.logger.info(f"调用模型req_for_ts失败,原因为{e}")
        info_data["BRIEF_"] = ""
    # Model: sensitivity.
    try:
        censor = req_for_censor(info_data["CONTENT_"])
        if censor:
            if censor["censor"] == "N":
                info_data["SENSITIVE_"] = "N"
            else:
                info_data["SENSITIVE_"] = "Y"
                info_data["SENSITIVE_WORD_"] = censor["words"]
    except Exception as e:
        self.logger.info(f"调用模型censor失败,错误为{e}")
        info_data["SENSITIVE_"] = "N"

    info_data["VERSION_"] = "0"
    info_data = super(WeiboScript, self).generic_shuffle(data=data,
                                                         re_data=info_data,
                                                         field="ENTITY_NAME_")

    # Fixed bank name/code for accounts the generic matching gets wrong.
    bank_overrides = {
        "上海浦东发展银行微博": ("浦发银行", "SPDB"),
        "南海农商银行微博": ("广东南海农村商业银行股份有限公司", "NRC"),
        "顺德农商银行微博": ("广东顺德农村商业银行股份有限公司", "sdebank"),
    }
    override = bank_overrides.get(info_data["ENTITY_NAME_"])
    if override:
        info_data["BANK_NAME_"], info_data["BANK_CODE_"] = override

    comment = data["INFO_COMMENTS_"]
    verifieds = sum(1 for c in comment if c.get("VERIFIED_", ""))

    # Model: hotness.
    try:
        hot = req_for_weibo_hot(publish_time=info_data["PUBLISH_TIME_"],
                                relays=info_data["RELAYS_"],
                                replies=len(comment),
                                praises=info_data["LIKES_"],
                                verifieds=verifieds)
        if hot:
            info_data["HOT_"] = hot["level"]
    except Exception as e:
        self.logger.info(f"调用模型weibo_hot失败,错误为{e}")

    re_data.append({"TABLE_NAME_": TABLE_NAME("CHA_BRANCH_WEIBO_INFO"),
                    "DATA_": info_data})

    # ---- CHA_BRANCH_WEIBO_COMMENT ---------------------------------------
    # Model sentiment label -> stored emotion code. "NAGETIVE" is the
    # spelling already stored downstream, so it is kept as is.
    emotions = {"中性": "NORMAL", "积极": "POSITIVE", "敏感": "NAGETIVE"}
    if comment:
        comment_count = 0
        for each in comment:
            # A fresh dict per comment, otherwise rows would share data.
            comment_data = dict()
            # HBase row key.
            comment_data["ID_"] = req_for_serial_number(code="WEIBO_COMMENT")
            comment_data["INFO_ID_"] = info_data["ID_"]
            comment_data["COMMENT_"] = each["COMMENT_"]
            comment_data["REPLIER_TIME_"] = each["REPLIER_TIME_"]
            comment_data["REPLIER_HEAD_"] = each["REPLIER_HEAD_"]
            comment_data["REPLIER_PRAISES_"] = each["REPLIER_PRAISES_"]
            comment_data["REPLIER_"] = each["REPLIER_"]
            comment_data["REPLIER_REPLIES_"] = each["REPLIER_REPLIES_"]

            # Sentiment analysis, only for non-empty comments. Anything the
            # model does not classify falls back to NORMAL.
            if each.get("COMMENT_"):
                try:
                    sentiment = req_for_comment(each["COMMENT_"])
                    if sentiment:
                        comment_data["EMOTION_"] = emotions.get(
                            sentiment["sentiment"], "NORMAL")
                    else:
                        comment_data["EMOTION_"] = "NORMAL"
                except Exception as e:
                    self.logger.info(f"调用模型req_for_comment失败,错误为{e}")
                    comment_data["EMOTION_"] = "NORMAL"

            # Sensitivity.
            try:
                censor = req_for_censor(each["COMMENT_"])
                if censor:
                    if censor["censor"] == "N":
                        comment_data["SENSITIVE_"] = "N"
                    else:
                        comment_data["SENSITIVE_"] = "Y"
                        comment_data["SENSITIVE_WORD_"] = censor["words"]
                else:
                    comment_data["SENSITIVE_"] = "N"
            except Exception as e:
                # The message used to name req_for_comment (copy-paste).
                self.logger.info(f"调用模型censor失败,错误为{e}")
                comment_data["SENSITIVE_"] = "N"

            comment_data["VERSION_"] = "0"
            comment_data["CREATE_BY_ID_"] = "P0131857"
            comment_data["CREATE_BY_NAME_"] = "钟楷文"
            re_data.append({
                "TABLE_NAME_": TABLE_NAME("CHA_BRANCH_WEIBO_COMMENT"),
                "DATA_": comment_data
            })
            comment_count += 1
        # Comment statistics, logged to ease debugging.
        self.logger.info(f'清洗的URL为{info_data["URL_"]}')
        self.logger.info(f'清洗的评论数为{info_data["COMMENTS_"]}')
        self.logger.info(f'插入到comment表的数量为{comment_count}')
    return re_data
def generic_shuffle(self, data, re_data, field="CONTENT_"):
    """
    Parent-class generic cleaning rules.

    Steps, in order: match banks from ``field`` (skipped when ``field`` is
    falsy or ``BANK_NAME_`` is already set), allocate ``ID_`` when missing,
    store the record's HTML / PDF attachment(s) in the file service, then
    fill the common bookkeeping fields. File-storage failures are logged
    and do not abort the cleaning.

    :param data: raw record, type: dict
    :param re_data: cleaned record being built (updated in place), type: dict
    :param field: name of the field to match banks in, e.g. "CONTENT_" or
        "PRO_NAME_"; ``None`` disables bank matching
    :return: the cleaned record, type: dict
    """
    # NOTE(review): nesting reconstructed from a flattened source; confirm
    # against the original layout.
    if field and "BANK_NAME_" not in re_data:
        self._fill_bank_from_field(data, re_data, field)

    if "ID_" not in re_data:
        re_data["ID_"] = req_for_serial_number(code=data["ENTITY_CODE_"][:7])

    # File storage. Exactly one source is used, in this priority order; a
    # present-but-empty value still wins the priority and stores nothing.
    tc = self._storage_type_code(data)
    if "HTML_" in data:
        if data["HTML_"]:
            if "HTML_NAME_" in data:
                html_name = data["HTML_NAME_"]
            elif "PDF_NAME_" in data:
                html_name = data["PDF_NAME_"]
            else:
                html_name = str(uuid.uuid1())
            self._upload_file(data, re_data, tc, "HTML", html_name,
                              data["HTML_"], raise_on_error=True)
    elif "PDF_" in data:
        if data["PDF_"]:
            if "HTML_NAME_" in data:
                pdf_name = data["HTML_NAME_"]
            elif "PDF_NAME_" in data:
                pdf_name = data["PDF_NAME_"]
            else:
                pdf_name = self._pdf_name_from_url(data["PDF_"])
            self._fetch_and_upload_pdf(data, re_data, tc, data["PDF_"],
                                       pdf_name)
    elif "PDF_1_" in data:
        if data["PDF_1_"]:
            for i in range(10):
                url_key = f"PDF_{i}_"
                if url_key not in data:
                    if i == 0:
                        # Numbering starts at 1 for records selected by
                        # "PDF_1_"; a missing PDF_0_ must not end the loop.
                        continue
                    break
                try:
                    if f"PDF_{i}_NAME_" in data:
                        pdf_name = data[f"PDF_{i}_NAME_"]
                    else:
                        pdf_name = self._pdf_name_from_url(data[url_key])
                    self._fetch_and_upload_pdf(data, re_data, tc,
                                               data[url_key], pdf_name)
                except KeyError:
                    break
    elif "PDF_URL_" in data:
        if data["PDF_URL_"]:
            if "PDF_NAME_" in data:
                pdf_name = data["PDF_NAME_"]
            else:
                pdf_name = self._pdf_name_from_url(data["PDF_URL_"])
            self._fetch_and_upload_pdf(data, re_data, tc, data["PDF_URL_"],
                                       pdf_name)

    if "ENTITY_CODE_" not in re_data:
        re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    if "ENTITY_NAME_" not in re_data:
        re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    if "URL_" not in re_data and "URL_" in data:
        re_data["URL_"] = data["URL_"]

    # Creation time and operator.
    re_data["CREATE_TIME_"] = time.strftime("%Y-%m-%d %H:%M:%S",
                                            time.localtime())
    re_data["CREATE_BY_ID_"] = CREATE_ID
    re_data["CREATE_BY_NAME_"] = CREATE_NAME

    # Crawl time: DATETIME_ wins, DEALTIME_ is the fallback.
    if "DATETIME_" in data:
        re_data["SPIDER_TIME_"] = data["DATETIME_"]
    elif "DEALTIME_" in data:
        re_data["SPIDER_TIME_"] = arrow.get(
            data["DEALTIME_"]).format("YYYY-MM-DD")

    # Status defaults, applied only when the caller did not set them.
    for key, default in (("M_STATUS_", "N"), ("DELETE_STATUS_", "N"),
                         ("DATA_STATUS_", "UNCHECK")):
        if key not in re_data:
            re_data[key] = default
    if ("MICROBLOG" not in re_data["ENTITY_CODE_"]
            and "PUBLISH_STATUS_" not in re_data):
        re_data["PUBLISH_STATUS_"] = "N"
    return re_data


def _fill_bank_from_field(self, data, re_data, field):
    """Set BANK_NAME_/BANK_CODE_ in ``re_data`` from the text of ``field``."""
    if "ZX" in data.get("ENTITY_CODE_", "")[:2]:
        # News records: ask the NER model for the organisation, then look it
        # up among the known bank aliases.
        if field not in data:
            return
        try:
            result = req_for_ner(data[field])
        except Exception as e:
            self.logger.exception(
                f"2.2--err: 请求模型 req_for_ner 错误."
                f" 原始数据 collection = {self.m_client.mongo_collection};"
                f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                f" 原始数据 _id = {data['_id']};"
                f" error: {e}.")
            return
        if not result or "Organ" not in result:
            return
        organ = result["Organ"].get("entity", "")
        if not organ:
            return
        for each in self.bank_list:
            if organ in each["ALIAS_"]:
                re_data["BANK_NAME_"] = each["NAME_"]
                re_data["BANK_CODE_"] = each["CODE_"]
                break
    else:
        # Other records: every bank whose name occurs in the field text,
        # joined with "|".
        text = data.get(field, "")
        matched = [each for each in self.bank_list if each["NAME_"] in text]
        if matched:
            re_data["BANK_NAME_"] = "|".join(each["NAME_"] for each in matched)
            re_data["BANK_CODE_"] = "|".join(each["CODE_"] for each in matched)


def _storage_type_code(self, data):
    """Return the file-service category for ``data``, or None if unknown."""
    # NOTE(review): the flattened source does not show whether the
    # BANK_CODE_ check is an alternative to a missing ENTITY_CODE_ or the
    # last arm of the ENTITY_CODE_ chain; it is used as a fallback for both.
    if "ENTITY_CODE_" in data:
        entity_code = data["ENTITY_CODE_"]
        if entity_code[:2] == "ZX":
            return "NEWS"
        if "WECHAT" in entity_code:
            return "WECHAT"
        if "JRCP_BX" in entity_code:
            return "INSURANCE"
        if "JRCP_LCCP" in entity_code:
            return "LCCP"
    if "BANK_CODE_" in data and "MICROBLOG" in data["BANK_CODE_"]:
        return "WEIBOBASIC"
    return None


def _pdf_name_from_url(self, url):
    """Derive a file name from a PDF URL, falling back to a UUID."""
    if ".PDF" in url or ".pdf" in url:
        found = re.findall(r"/(.*?).pdf", url, re.IGNORECASE)
        if found:
            return found[0]
    return str(uuid.uuid1())


def _log_file_error(self, data, error, with_traceback=True):
    """Emit the "2.1" file-handling log entry for ``error``."""
    message = (f"2.1--err: PDF"
               f" 原始数据 collection = {self.m_client.mongo_collection};"
               f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
               f" 原始数据 _id = {data['_id']};"
               f"error: {error}.")
    if with_traceback:
        # Only meaningful while an exception is being handled.
        self.logger.exception(message)
    else:
        self.logger.error(message)


def _upload_file(self, data, re_data, tc, suffix, file_name, payload,
                 raise_on_error=False):
    """Store ``payload`` as CHA_<tc>_<suffix>; failures are only logged."""
    try:
        if tc is None:
            raise ValueError("no storage type code for this record")
        if isinstance(payload, str):
            payload = payload.encode("utf-8")
        upload = req_for_file_save(id=re_data["ID_"],
                                   type_code=f"CHA_{tc}_{suffix}",
                                   file_name=file_name,
                                   postfix=suffix.lower(),
                                   file=payload)
        try:
            body = upload.content.decode("utf-8")
            if "error" in body:
                self.logger.info(
                    f"2.3--err:文件上传错误."
                    f" 原始数据collection={self.m_client.mongo_collection};"
                    f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                    f" 原始数据 _id = {data['_id']};"
                    f" error: {body}.")
                if raise_on_error:
                    raise Exception(f"附件上传错误{body}")
        finally:
            # Closed on every path, including the error one.
            upload.close()
    except Exception as e:
        self._log_file_error(data, e)


def _fetch_and_upload_pdf(self, data, re_data, tc, url, pdf_name):
    """Download ``url`` and store it as a PDF; failures are only logged."""
    try:
        response = req_for_something(url=url)
    except Exception as e:
        self._log_file_error(data, e)
        return
    if not response:
        self._log_file_error(data, "PDF 请求失败", with_traceback=False)
        return
    try:
        payload = response.content
    except Exception as e:
        self._log_file_error(data, e)
    else:
        self._upload_file(data, re_data, tc, "PDF", pdf_name, payload)
    finally:
        response.close()
def __shuffle(self, data):
    """
    Clean one wealth-management product record.

    Records whose ``ENTITY_NAME_`` contains "中国理财网" are copied nearly
    verbatim into the ``CRMLCCP`` table. Every other record is normalised
    field by field for ``CHA_BRANCH_FINANCIAL_PRODUCT``.

    :param data: raw record, type: dict
    :return: ``{"TABLE_NAME_": ..., "DATA_": re_data}``
    """
    # NOTE(review): nesting reconstructed from a flattened source; confirm
    # against the original layout.
    re_data = dict()
    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["URL_"] = data["URL_"]
    if "中国理财网" in data["ENTITY_NAME_"]:
        return self._shuffle_chinawealth(data, re_data)
    return self._shuffle_bank_site(data, re_data)


@staticmethod
def _expand_amount_units(text):
    """Replace Chinese magnitude units in an amount by zeros ("5万" -> "50000")."""
    text = text.replace(" ", "")
    # Longer units first so that "千万" is not consumed as "千" + "万".
    for unit, zeros in (("亿", "00000000"), ("千万", "0000000"),
                        ("百万", "000000"), ("十万", "00000"), ("万", "0000"),
                        ("千", "000"), ("百", "00"), ("元", "")):
        text = text.replace(unit, zeros)
    return text


@staticmethod
def _fill_start_funds_code(re_data, amount):
    """Set START_FUNDS_CODE_ from ``amount``; reset START_FUNDS_ to 0 if unparsable."""
    try:
        value = float(amount)
    except (TypeError, ValueError):
        re_data["START_FUNDS_"] = 0
        return
    if value <= 10000:
        re_data["START_FUNDS_CODE_"] = "S0_1"
    elif value <= 50000:
        re_data["START_FUNDS_CODE_"] = "S1_5"
    elif value <= 100000:
        # Inclusive upper bound: exactly 100000 used to get no code in one
        # of the two copies of this ladder.
        re_data["START_FUNDS_CODE_"] = "S5_10"
    elif value > 100000:
        re_data["START_FUNDS_CODE_"] = "S10_"


def _fill_bank_from_entity_name(self, data, re_data):
    """Set BANK_NAME_/BANK_CODE_ from every bank named in ENTITY_NAME_."""
    entity_name = data.get("ENTITY_NAME_", "")
    matched = [each for each in self.bank_list if each["NAME_"] in entity_name]
    if matched:
        re_data["BANK_NAME_"] = "|".join(each["NAME_"] for each in matched)
        re_data["BANK_CODE_"] = "|".join(each["CODE_"] for each in matched)


def _shuffle_chinawealth(self, data, re_data):
    """Build the CRMLCCP row for a record crawled from 中国理财网."""
    re_data["ID_"] = req_for_serial_number(code="JRCP_LCCP_INFO")
    for key in ("PRO_NAME_", "PRO_ORG_", "REGIST_CODE_", "PRO_STATUS_",
                "OPT_MODE_", "YIELD_TYPE_", "CURRENCY_TYPE_", "START_FUNDS_"):
        re_data[key] = data[key]
    self._fill_start_funds_code(re_data, data["START_FUNDS_"])

    # Issuer category codes used by the site -> display names.
    org = {
        '01': '国有银行',
        '02': '股份制银行',
        '03': '城商行',
        '04': '外资银行',
        '05': '农村合作金融机构',
        '06': '其他',
        '07': '其他',
        '08': '其他',
        '09': '其他',
        '00': '其他',
        '10': '理财子公司'
    }
    re_data["SOURCE_RISK_LEVEL_"] = data["SOURCE_RISK_LEVEL_"]
    re_data['ORG_TYPE_'] = org.get(data.get('ORG_TYPE_'))
    for key in ("RAISE_START_", "RAISE_END_", "PRO_START_", "PRO_END_",
                "YIELD_LOW_", "YIELD_HIGH_", "REAL_DAYS_", "INVEST_TYPE_",
                "DATE_TYPE_", "YIELD_", "RAISE_TYPE_", "INVEST_PROPERTIES_",
                "BUS_START_", "BUS_END_", "START_VALUE_", "PRO_VALUE_",
                "TOTAL_VALUE_", "RECENT_YIELD_", "PRO_TYPE_", "SALE_AREA_"):
        re_data[key] = data[key]
    # Optional location fields; each one is guarded by its own key.
    for key in ("PROVINCE_NAME_", "PROVINCE_CODE_", "CITY_NAME_",
                "CITY_CODE_"):
        if key in data:
            re_data[key] = data[key]

    self._fill_bank_from_entity_name(data, re_data)

    re_data = super(BranchFinProduct, self).generic_shuffle(data=data,
                                                            re_data=re_data,
                                                            field=None)
    # Empty values are shown as "--".
    for key in ("YIELD_LOW_", "YIELD_HIGH_", "START_FUNDS_"):
        if not data[key]:
            re_data[key] = '--'
    return {"TABLE_NAME_": TABLE_NAME("CRMLCCP"), "DATA_": re_data}


def _shuffle_bank_site(self, data, re_data):
    """Build the CHA_BRANCH_FINANCIAL_PRODUCT row for a bank-site record."""
    source = re.findall(r"(https?://.*?)/", data["URL_"])
    if source:  # a URL without a path yields no match
        re_data["SOURCE_"] = source[0]
    re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"]
    re_data["ID_"] = req_for_serial_number(code="JRCP_LCCP")
    re_data["SOURCE_TYPE_"] = ""
    re_data["PRO_NAME_"] = data["PRO_NAME_"]
    # Issuer is the part of ENTITY_NAME_ before the first "-"; the whole
    # name when there is no "-".
    re_data["PRO_ORG_"] = data["ENTITY_NAME_"].partition("-")[0]
    if "PRO_CODE_" in data:
        re_data["PRO_CODE_"] = data["PRO_CODE_"]

    # Registration code: taken from the record, else searched in the PDF.
    if "REGIST_CODE_" in data:
        re_data["REGIST_CODE_"] = data["REGIST_CODE_"]
    elif "PDF_" in data:
        try:
            text = parse(data["PDF_"])
            registration_code = re.findall(r"C\d{13}", text)
            if registration_code:
                re_data["REGIST_CODE_"] = registration_code[0]
        except Exception as e:
            self.logger.exception(
                f"2.1--err: PDF."
                f" 原始数据 collection = {self.m_client.mongo_collection};"
                f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                f" 原始数据 _id = {data['_id']};"
                f" error: {e}.")

    # Sale status is PRE / ON / STOP; everything crawled here is on sale.
    re_data["PRO_STATUS_"] = "ON"
    for key in ("OPT_MODE_", "YIELD_TYPE_", "CURRENCY_TYPE_"):
        if key in data:
            re_data[key] = data[key]

    # Minimum purchase amount.
    if "START_FUNDS_" in data:
        re_data["START_FUNDS_"] = self._expand_amount_units(
            data["START_FUNDS_"])
        self._fill_start_funds_code(re_data, re_data["START_FUNDS_"])

    if "RISK_LEVEL_CODE_" in data:
        re_data["RISK_LEVEL_"] = self.risk_dict[data["RISK_LEVEL_CODE_"]]
        re_data["RISK_LEVEL_CODE_"] = data["RISK_LEVEL_CODE_"]
    if "RISK_LEVEL_" in data:
        re_data["SOURCE_RISK_LEVEL_"] = data["RISK_LEVEL_"]
    elif "SOURCE_RISK_LEVEL_" in data:
        re_data["SOURCE_RISK_LEVEL_"] = data["SOURCE_RISK_LEVEL_"]

    # Raising period and product period.
    for key in ("RAISE_START_", "RAISE_END_", "PRO_START_", "PRO_END_"):
        if key in data:
            re_data[key] = data[key]
    # Expected lowest / highest yield, without the percent sign.
    for key in ("YIELD_LOW_", "YIELD_HIGH_"):
        if key in data:
            re_data[key] = data[key].replace("%", "")

    # Term in days: years count as 365 days, months as 30.
    if "REAL_DAYS_" in data:
        real_days = data["REAL_DAYS_"].replace(" ", "")
        if "年" in real_days:
            try:
                re_data["REAL_DAYS_"] = int(real_days.replace("年", "")) * 365
            except ValueError:
                re_data["REAL_DAYS_"] = 0
        elif "月" in real_days:
            try:
                re_data["REAL_DAYS_"] = int(real_days.replace("月", "")) * 30
            except ValueError:
                re_data["REAL_DAYS_"] = 0
        else:
            re_data["REAL_DAYS_"] = real_days.replace("天", "")
    elif "PRO_START_" in data and "PRO_END_" in data:
        # No explicit term: derive it from the product period.
        t_start = arrow.get(data["PRO_START_"], "YYYY-MM-DD")
        t_end = arrow.get(data["PRO_END_"], "YYYY-MM-DD")
        re_data["REAL_DAYS_"] = (t_end - t_start).days

    for key in ("INVEST_TYPE_", "PRO_TYPE_", "SALE_AREA_"):
        if key in data:
            re_data[key] = data[key]
    # Redeemable unless the text says "不" (not).
    if "REDEEM_" in data:
        re_data["REDEEM_"] = "N" if "不" in data["REDEEM_"] else "Y"
    if "INCREASE_" in data:
        re_data["INCREASE_"] = self._expand_amount_units(data["INCREASE_"])

    re_data["RECOMMEND_"] = "N"
    re_data["GOOD_SALE_"] = "N"
    re_data["NEW_SALE_"] = "N"
    re_data["SALE_SOURCE_"] = "NET"

    self._fill_bank_from_entity_name(data, re_data)

    # Empty or missing values are shown as "--". These keys are optional
    # above, so they are read with .get() here as well.
    for key in ("YIELD_LOW_", "YIELD_HIGH_", "START_FUNDS_"):
        if not data.get(key):
            re_data[key] = '--'

    re_data = super(BranchFinProduct, self).generic_shuffle(data=data,
                                                            re_data=re_data,
                                                            field=None)
    re_data["PUBLISH_TIME_"] = re_data["SPIDER_TIME_"]
    return {
        "TABLE_NAME_": TABLE_NAME("CHA_BRANCH_FINANCIAL_PRODUCT"),
        "DATA_": re_data
    }
def generic_shuffle(self, data):
    """
    Clean one raw hospital record into the row that gets stored.

    :param data: raw crawled record (dict). ``DATETIME_``, ``URL_``,
        ``ENTITY_NAME_`` and ``ENTITY_CODE_`` are read unconditionally.
    :return: one-element list ``[{"TABLE_NAME_": ..., "DATA_": re_data}]``
    """
    re_data = {"ID_": req_for_serial_number(code="WD_SS_YY")}

    # Time dimension: date part of DATETIME_ with the dashes removed.
    re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace("-", "")

    # Tags are blanked whenever the raw record carries the key.
    if "TAGS_" in data:
        re_data["TAGS_"] = ""

    # Source: scheme + host captured from the record URL.
    re_data["SOURCE_"] = re.findall(r"(https?://.*?)/", data["URL_"])[0]

    # Source name: the part of the entity name before the first "-".
    re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]

    # Source category: characters 3..7 of the entity code.
    re_data["SOURCE_TYPE_"] = data["ENTITY_CODE_"][3:8]

    # Latitude / longitude. ADDR_ is geocoded first; a KeyError there
    # (missing key in the record or in the response) triggers one retry
    # with CITY_NAME_ + NAME_. Any other failure is logged.
    try:
        location = get_lat_lng(address=data["ADDR_"])["result"]["location"]
        re_data["LAT_"] = location["lat"]
        re_data["LNG_"] = location["lng"]
    except KeyError:
        try:
            fallback_address = data["CITY_NAME_"] + data["NAME_"]
            location = get_lat_lng(address=fallback_address)["result"]["location"]
            re_data["LAT_"] = location["lat"]
            re_data["LNG_"] = location["lng"]
        except KeyError:
            re_data["LAT_"] = re_data["LNG_"] = None
        except Exception as e:
            re_data["LAT_"] = re_data["LNG_"] = None
            self.logger.info("获取经纬度失败错误为{}".format(e))
    except Exception as e:
        re_data["LAT_"] = re_data["LNG_"] = None
        self.logger.info("获取经纬度失败错误为{}".format(e))

    # Province / city / district are filled in from the coordinates.
    if re_data["LNG_"]:
        coordinates = ",".join(map(str, (re_data["LAT_"], re_data["LNG_"])))
        try:
            area_result = get_area(coordinates)
        except Exception as e:
            self.logger.info("获取地址信息失败错误为{}".format(e))
        else:
            try:
                component = area_result["result"]["addressComponent"]
                re_data["PROVINCE_NAME_"] = component["province"]
                re_data["CITY_NAME_"] = component["city"]
                re_data["AREA_NAME_"] = component["district"]
                re_data["AREA_CODE_"] = component["adcode"]
                # City / province codes are prefixes of the district code.
                re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
                re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            except KeyError:
                # Fields assigned before the missing key are kept.
                pass

    # Fields copied verbatim when present: equipment, hospital grade,
    # specialities, phone, hospital id, name, address, beds, hospital
    # type, website, outpatient volume.
    passthrough_keys = (
        "DEVICE_", "GRADE_", "SPECIAL_", "TEL_", "HOSPITAL_ID_", "NAME_",
        "ADDR_", "BEDS_", "TYPE_", "WEBSITE_", "VOLNUM_",
    )
    for key in passthrough_keys:
        if key in data:
            re_data[key] = data[key]

    re_data = super(Branchssyy, self).generic_shuffle(
        data=data, re_data=re_data, field=None)
    return [{"TABLE_NAME_": self.p_client.table_name, "DATA_": re_data}]
def generic_shuffle(self, data):
    """
    Clean one raw school record into the row that gets stored.

    :param data: raw crawled record (dict). ``DATETIME_``, ``URL_``,
        ``ENTITY_NAME_``, ``ENTITY_CODE_`` are read unconditionally.
    :return: one-element list ``[{"TABLE_NAME_": ..., "DATA_": re_data}]``
    """
    re_data = dict()
    re_data["ID_"] = req_for_serial_number(code="WD_SS_XX")

    # Time dimension: date part of DATETIME_ with the dashes removed.
    re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace("-", "")

    # Tags are blanked whenever the raw record carries the key.
    if "TAGS_" in data:
        re_data["TAGS_"] = ""

    # Source: scheme + host captured from the record URL.
    source = re.findall(r"(https?://.*?)/", data["URL_"])
    re_data["SOURCE_"] = source[0]
    # Source name: the part of the entity name before the first "-".
    re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]
    # Source category: characters 3..7 of the entity code.
    re_data["SOURCE_TYPE_"] = data["ENTITY_CODE_"][3:8]

    # Latitude / longitude. Both keys always end up in re_data (None on
    # any failure); previously a non-KeyError failure left them unset and
    # the check below raised KeyError.
    re_data["LAT_"] = None
    re_data["LNG_"] = None
    try:
        location = get_lat_lng(address=data["ADDR_"])["result"]["location"]
        lat, lng = location["lat"], location["lng"]
    except KeyError:
        pass
    except Exception as e:
        self.logger.info("获取经纬度失败信息为{}".format(e))
    else:
        re_data["LAT_"], re_data["LNG_"] = lat, lng

    # Province / city / district are filled in from the coordinates.
    if re_data["LAT_"]:
        try:
            area_result = get_area(",".join(
                [str(re_data["LAT_"]), str(re_data["LNG_"])]))
        except Exception as e:
            self.logger.info(f"获取地址失败, ERROR: {e}")
        else:
            try:
                component = area_result["result"]["addressComponent"]
                re_data["PROVINCE_NAME_"] = component["province"]
                re_data["CITY_NAME_"] = component["city"]
                re_data["AREA_NAME_"] = component["district"]
                re_data["AREA_CODE_"] = component["adcode"]
                # City / province codes are prefixes of the district code.
                re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
                re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            except KeyError:
                # Fields assigned before the missing key are kept.
                pass

    # School name.
    if "NAME_" in data:
        re_data["NAME_"] = data["NAME_"]
    # Level (city key school, district key school, national key school).
    if "LEVEL_" in data:
        re_data["LEVEL_"] = data["LEVEL_"]
    # Image: downloaded and stored as base64 text.
    if data.get("IMAGES_"):
        response = req_for_something(url=data["IMAGES_"])
        if response:
            try:
                encoded = base64.b64encode(response.content)
                re_data["IMAGES_"] = encoded.decode("utf-8")
            finally:
                # Release the connection once the body has been read.
                response.close()
    # School type.
    if "SCHOOL_TYPE_" in data:
        re_data["SCHOOL_TYPE_"] = data["SCHOOL_TYPE_"]
    # School nature.
    if "SCHOOL_NATURE_" in data:
        re_data["SCHOOL_NATURE_"] = data["SCHOOL_NATURE_"]

    # Phone: several numbers are crawled glued together; insert spaces.
    if "TEL_" in data:
        tel = data["TEL_"]
        # Three-number layouts are only applied when they cover the whole
        # value. As prefix matches they were unreachable: the two-number
        # landline rule below matched first and left the last two numbers
        # glued together.
        exact_rules = (
            (r"(\d{3,4}-\d{8})(\d{11})(\d{11})", r"\1 \2 \3"),
            (r"(\d{3,4}-\d{8})(\d{8})(\d{8})", r"\1 \2 \3"),
        )
        # Two-number layouts, tried in order as prefix matches. The
        # landline + 11-digit rule is not listed: the landline + 8-digit
        # rule matches every such value and yields the same text.
        prefix_rules = (
            (r"(\d{3,4}-\d{8})(\d{3,4}-\d{8})", r"\1 \2"),
            (r"(\d{3,4}-\d{8})(\d{8})", r"\1 \2"),
            (r"(\d{8})(\d{11})", r"\1 \2"),
            (r"(\d{8})(\d{8})", r"\1 \2"),
            (r"(\d{3,4}-\d{7})(\d{3,4}-\d{7})", r"\1 \2"),
            (r"(\d{3,4}-\d{7})(\d{7})", r"\1 \2"),
        )
        phone_number = tel
        for pattern, replacement in exact_rules:
            if re.fullmatch(pattern, tel):
                phone_number = re.sub(pattern, replacement, tel)
                break
        else:
            for pattern, replacement in prefix_rules:
                if re.match(pattern, tel):
                    phone_number = re.sub(pattern, replacement, tel)
                    break
        re_data["TEL_"] = phone_number

    # Address.
    if "ADDR_" in data:
        re_data["ADDR_"] = data["ADDR_"]

    re_data = super(Branchssxx, self).generic_shuffle(
        data=data, re_data=re_data, field=None)
    return [{"TABLE_NAME_": self.p_client.table_name, "DATA_": re_data}]
def generic_shuffle(self, data, field="CONTENT_"):
    """
    Clean one raw fund record; the target table depends on ENTITY_CODE_.

    * ``JRCP_JJ_TTJJ_FJZ_ALL`` / ``JRCP_JJ_TTJJ_JZ_ALL``: basic fund
      information, returned for ``CRMFUND_BASIC``.
    * codes containing ``GW_ALL``: agency (distributed) fund, enriched
      from ``CRMFUND_BASIC`` / ``CRMFUND_DATA`` and returned for
      ``CRMFUND_AGENCY``.
    * ``JRCP_JJ_TTJJ_FJZ`` / ``JRCP_JJ_TTJJ_JZ``: net-value history row,
      returned for ``CRMFUND_DATA``; matching agency rows are updated
      as a side effect.

    :param data: raw crawled record (dict)
    :param field: unused here; kept for interface compatibility
    :return: one-element list of ``{"TABLE_NAME_": ..., "DATA_": ...}``;
        ``None`` when ENTITY_CODE_ matches none of the branches
    """
    re_data = dict()
    if "TAGS_" in data:
        re_data["TAGS_"] = ""
    re_data["PRO_NAME_"] = data["PRO_NAME_"]
    re_data["PRO_CODE_"] = data["PRO_CODE_"]

    # ---- Basic information -> CRMFUND_BASIC ----
    if data["ENTITY_CODE_"] in ["JRCP_JJ_TTJJ_FJZ_ALL", "JRCP_JJ_TTJJ_JZ_ALL"]:
        data_dict = dict(TABLE_NAME_=TABLE_NAME("CRMFUND_BASIC"))
        source = re.findall(r"(https?://.*?)[/?]", data["URL_"])
        re_data["SOURCE_"] = source[0]
        re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"]
        re_data["SOURCE_TYPE_"] = data["ENTITY_CODE_"][8:12]
        basic_field_list = ["COM_NAME_", "FUND_TYPE_", "RISK_LEVEL_", "RELEASE_DATE_", "BUILD_DATE_", "BUILD_SCAL_",
                            "ASSET_SCAL_", "SHARE_SCAL_", "MANAGER_", "TRUSTEE_", "HANDLER_", "DIVIDEND_",
                            "MANAGE_FEE_RATE_", "HOST_FEE_RATE_", "SALE_FEE_RATE_", "MAX_SUB_RATE_",
                            "MAX_APPLY_RATE_", "MAX_REDEEM_RATE_", "BENCHMARK_", "BID_", "CLOSE_", "DIM_"]
        for basic_field in basic_field_list:
            if basic_field == "FUND_TYPE_":
                fund_type = data.get("FUND_TYPE_", "其他")
                re_data["FUND_TYPE_"] = fund_type
                try:
                    re_data["FUND_TYPE_CODE_"] = self.ft_dict[data["FUND_TYPE_"]]
                except KeyError:
                    # No exact match: fall back to any known type whose
                    # first two characters occur in the raw type.
                    for ft in self.ft_dict.keys():
                        if ft[:2] in fund_type:
                            re_data["FUND_TYPE_CODE_"] = self.ft_dict[ft]
                    if "FUND_TYPE_CODE_" not in re_data:
                        re_data["FUND_TYPE_CODE_"] = "QT"
            elif basic_field == "RISK_LEVEL_":
                risk_level_ = data.get("RISK_LEVEL_", "未知")
                # Keep only the last "|"-separated segment.
                risk_level_ = re.split(r'[|]', risk_level_.strip())[-1] if risk_level_ else "未知"
                re_data["RISK_LEVEL_"] = self.rl_name_dict[risk_level_]
                re_data["RISK_LEVEL_CODE_"] = self.rl_dict.get(re_data["RISK_LEVEL_"], "")
            elif basic_field == "MAX_REDEEM_RATE_":
                max_redeem_rate_ = data.get("MAX_REDEEM_RATE_", "")
                re_data["MAX_REDEEM_RATE_"] = (
                    re.split(r'[|]', max_redeem_rate_)[-1].replace("%", "")
                    if max_redeem_rate_ else "")
            elif basic_field == "BENCHMARK_":
                re_data[basic_field] = data.get(basic_field, "")
            elif basic_field == "BUILD_DATE_" or basic_field == "RELEASE_DATE_":
                # "YYYY年MM月DD日" -> "YYYY-MM-DD"; left unset when the
                # text does not contain such a date.
                basic_date = re.findall(r"(\d{4}年\d{2}月\d{1,2})日", data[basic_field])
                if basic_date:
                    re_data[basic_field] = re.sub(r"[\u4e00-\u9fa5]", "-", basic_date[0])
            elif basic_field == "HANDLER_":
                re_data[basic_field] = data.get(basic_field, "").replace('|', '')
            else:
                re_data[basic_field] = data.get(basic_field, "").replace("%", "")

        # Extra total-assets field to make statistics easier.
        # re.findall returns (amount, as-of date) tuples because the
        # pattern has two groups; only the amount is stored. '0' is used
        # whenever no amount can be extracted.
        asset_total = re.findall(r"(.*?亿元)(截止至:\d+年\d+月\d+日)", re_data["ASSET_SCAL_"] or "")
        re_data["ASSET_TOTAL_"] = asset_total[0][0] if asset_total else '0'

        # Basic fund information is CHECK by default.
        re_data["DATA_STATUS_"] = "CHECK"
        re_data["DATA_VERSION_"] = "0"
        re_data = super(BranchFund, self).generic_shuffle(data=data, re_data=re_data, field="TRUSTEE_")
        data_dict["DATA_"] = re_data
        return [data_dict]

    # ---- Agency fund -> CRMFUND_AGENCY ----
    elif "GW_ALL" in data["ENTITY_CODE_"]:
        agency_dict = dict(TABLE_NAME_=TABLE_NAME("CRMFUND_AGENCY"))
        re_data["ID_"] = req_for_serial_number(code="JRCP_JJ_AGENT")
        source = re.findall(r"(https?://.*?)[/?]", data["URL_"])
        re_data["SOURCE_"] = source[0]
        re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"]
        re_data["PUBLISH_TIME_"] = data["DATETIME_"]
        re_data["SOURCE_TYPE_"] = ""
        # Agency funds currently do not need the HOT_ field.
        re_data["RECOMMEND_"] = "N"
        re_data["GOOD_SALE_"] = "N"
        re_data["NEW_SALE_"] = "N"
        re_data["PUBLISH_STATUS_"] = "Y"
        re_data["DATA_STATUS_"] = "CHECK"
        re_data["VERSION_"] = "0"
        re_data["DATA_VERSION_"] = "0"

        pro_code_ = data.get("PRO_CODE_")
        pro_name = data.get("PRO_NAME_")
        cur = self.connection.cursor()
        try:
            # Look the fund up in the basic-information table: by code
            # when available, otherwise by name prefix. Any failure
            # (including "no row") downgrades the record to unpublished.
            if pro_code_:
                try:
                    re_data["PRO_CODE_"] = pro_code_
                    # "BUILD_DATE_" and "COM_NAME_" are separate columns;
                    # a missing comma used to fuse them into one name.
                    detail_list = ["RISK_LEVEL_", "RISK_LEVEL_CODE_", "FUND_TYPE_", "FUND_TYPE_CODE_",
                                   "BUILD_DATE_", "COM_NAME_", "RELEASE_DATE_", "CLOSE_"]
                    # Crawled values are bound as parameters (qmark style).
                    cur.execute(f"SELECT {','.join(detail_list)} "
                                f"FROM CRMFUND_BASIC WHERE PRO_CODE_=? "
                                f"ORDER BY CREATE_TIME_ DESC LIMIT 1",
                                [str(pro_code_)])
                    for index, item in enumerate(cur.fetchone()):
                        re_data[detail_list[index]] = item
                except Exception:
                    re_data["PUBLISH_STATUS_"] = "N"
                    re_data["DATA_STATUS_"] = "UNCHECK"
            elif pro_name:
                try:
                    pro_name = pro_name if not data.get("PRO_LIKE_NAME_") else data.get("PRO_LIKE_NAME_")
                    detail_list = ["PRO_CODE_", "RISK_LEVEL_", "RISK_LEVEL_CODE_", "FUND_TYPE_", "FUND_TYPE_CODE_",
                                   "COM_NAME_", "RELEASE_DATE_", "CLOSE_"]
                    cur.execute(f"SELECT {','.join(detail_list)} "
                                f"FROM CRMFUND_BASIC WHERE PRO_NAME_ LIKE ? "
                                f"ORDER BY CREATE_TIME_ DESC LIMIT 1",
                                [f"{pro_name}%"])
                    for index, item in enumerate(cur.fetchone()):
                        re_data[detail_list[index]] = item
                except Exception:
                    re_data["PUBLISH_STATUS_"] = "N"
                    re_data["DATA_STATUS_"] = "UNCHECK"

            # Latest purchase status / net value from the history table.
            if re_data.get("PRO_CODE_"):
                try:
                    cur.execute("SELECT BUY_STATUS_, NEW_NAV_, NEW_SYR_ "
                                "FROM CRMFUND_DATA "
                                "WHERE PRO_CODE_= ? "
                                "ORDER BY TIME_ DESC LIMIT 1",
                                [str(re_data["PRO_CODE_"])])
                    re_data["BUY_STATUS_"], re_data["NEW_NAV_"], re_data["NEW_SYR_"] = cur.fetchone()
                    if re_data["BUY_STATUS_"] and re_data["BUY_STATUS_"] in self.new_bs_dict.keys():
                        re_data["BUY_STATUS_CODE_"] = self.new_bs_dict[re_data["BUY_STATUS_"]]
                except Exception:
                    re_data["PUBLISH_STATUS_"] = "N"
                    re_data["DATA_STATUS_"] = "UNCHECK"
        finally:
            # Close the cursor on every path, not only after the
            # history-table query.
            cur.close()

        # Risk level from the raw record when the lookup gave none.
        if not re_data.get("RISK_LEVEL_"):
            if "RISK_LEVEL_" not in data:
                risk_level_ = "未知"
            else:
                risk_level_ = data["RISK_LEVEL_"]
            risk_level_ = re.split(r'[|]', risk_level_.strip())[-1] if risk_level_ else "未知"
            re_data["RISK_LEVEL_"] = self.rl_name_dict[risk_level_]
            re_data["RISK_LEVEL_CODE_"] = self.rl_dict.get(re_data["RISK_LEVEL_"], "")

        # Neither fund type nor release date known: do not publish.
        if not (re_data.get("FUND_TYPE_") or re_data.get("RELEASE_DATE_")):
            re_data["PUBLISH_STATUS_"] = "N"
            re_data["DATA_STATUS_"] = "UNCHECK"
        re_data = super(BranchFund, self).generic_shuffle(data=data, re_data=re_data, field="ENTITY_NAME_")
        agency_dict["DATA_"] = re_data
        return [agency_dict]

    # ---- Net-value history -> CRMFUND_DATA ----
    elif data["ENTITY_CODE_"] in ["JRCP_JJ_TTJJ_FJZ", "JRCP_JJ_TTJJ_JZ"]:
        re_data["ID_"] = req_for_serial_number(code=data["ENTITY_CODE_"][:7])
        re_data["SERVICE_CHARGE_"] = data["SERVICE_CHARGE_"]
        re_data["RATING_AGENCIES_"] = data["RATING_AGENCIES_"].replace('jjpj', '')
        nom_field_list = ["TIME_", "NEW_NAV_", "NEW_ANV_", "OLD_TIME_", "OLD_NAV_", "OLD_ANV_", "DAY_GROWTH_",
                          "DAY_GROWTH_RATE_", "ONE_MONTH_RATE_", "THREE_MONTH_RATE_", "SIX_MONTH_RATE_",
                          "ONE_YEAR_RATE_", "THREE_YEAR_RATE_", "BUILD_RATE_", "NEW_TOI_", "NEW_SYR_", "OLD_TOI_",
                          "OLD_SYR_", "FYR_", "TYR_", "MARKET_PRICE_", "DISCOUNT_RATE_",
                          "VERSION_", "BUY_STATUS_", "REDEEM_STATUS_"]
        for nom_field in nom_field_list:
            if nom_field == "VERSION_":
                re_data[nom_field] = "0"
            elif nom_field == "BUY_STATUS_":
                re_data["BUY_STATUS_"] = data.get("BUY_STATUS_", "")
                re_data["BUY_STATUS_CODE_"] = self.new_bs_dict.get(re_data["BUY_STATUS_"], "")
            elif nom_field == "REDEEM_STATUS_":
                re_data["REDEEM_STATUS_"] = data.get("REDEEM_STATUS_")
                re_data["REDEEM_STATUS_CODE_"] = self.rs_dict.get(re_data["REDEEM_STATUS_"], "")
            else:
                # Strip percent signs and the "--" placeholder.
                re_data[nom_field] = data.get(nom_field, "").replace("%", "").replace("--", "")
        re_data["CREATE_BY_ID_"] = CREATE_ID
        re_data["CREATE_BY_NAME_"] = CREATE_NAME

        if data["ENTITY_CODE_"] in ["JRCP_JJ_TTJJ_FJZ", ]:
            re_data["APY_FOURTEEN_"] = data.get("APY_FOURTEEN_")
            re_data["APY_TWENTY_EIGHT_"] = data.get("APY_TWENTY_EIGHT_")
            re_data["NEW_TOI_"] = data.get("NEW_TOI_")
            re_data["NEW_SYR_"] = data.get("NEW_SYR_")
            try:
                # 35-day yield comes from a separate crawled document;
                # the value is rounded to a whole number of hundredths.
                apy_doc = dict(self.db_spider_data.JRCP_JJ.find_one(
                    {'PRO_CODE_': data['PRO_CODE_'], 'TIME_': data['TIME_'],
                     'ENTITY_CODE_': 'JRCP_JJ_TTJJ_35NH'}))
                re_data["APY_THIRTY_FIVE_"] = round(float(apy_doc.get('APY_THIRTY_FIVE_')) * 100) / 100.0
            except Exception:
                # Best effort: missing document or unparsable value.
                re_data["APY_THIRTY_FIVE_"] = ''
        elif data["ENTITY_CODE_"] in ["JRCP_JJ_TTJJ_JZ", ]:
            re_data["NEW_WORTH_"] = data.get("NEW_WORTH_")

        # T-1 values: the most recent earlier row for the same fund.
        cur = self.connection.cursor()
        try:
            cur.execute("SELECT NEW_NAV_,NEW_ANV_,NEW_TOI_,NEW_SYR_ FROM CRMFUND_DATA "
                        "where PRO_CODE_=? and TIME_<? order by TIME_ desc limit 1",
                        [str(re_data['PRO_CODE_']), str(re_data['TIME_'])])
            t_1data = cur.fetchone()
        finally:
            cur.close()
        if t_1data:
            self.logger.info(f"====T-1日数据===={t_1data}")
            re_data['OLD_NAV_'] = t_1data[0]
            re_data['OLD_ANV_'] = t_1data[1]
            re_data['OLD_TOI_'] = t_1data[2]
            re_data['OLD_SYR_'] = t_1data[3]

        # Push the new values into the agency rows of the same fund.
        # The client's table name is switched for the update and always
        # restored afterwards.
        self.p_client.table_name = TABLE_NAME('CRMFUND_AGENCY')
        try:
            # NOTE(review): where_condition is still string-built from a
            # crawled value; the client API shown here takes no bind
            # parameters.
            agences = self.p_client.search_all_from_phoenix(connection=self.connection, dict_status=True,
                                                            where_condition=f"PRO_CODE_='{re_data['PRO_CODE_']}'")
            if agences:
                while True:
                    try:
                        agence_data = agences.__next__()
                        self.logger.info(f"====更新代销基金数据===={agence_data}")
                        agence_data['NEW_NAV_'] = re_data['NEW_NAV_']
                        agence_data['NEW_SYR_'] = re_data['NEW_SYR_']
                        agence_data['BUY_STATUS_'] = re_data['BUY_STATUS_']
                        agence_data['BUY_STATUS_CODE_'] = re_data['BUY_STATUS_CODE_']
                    except Exception:
                        # StopIteration (exhausted) or any read error
                        # ends the update loop.
                        break
                    try:
                        self.p_client.upsert_to_phoenix_by_one(connection=self.connection, data=agence_data)
                    except jaydebeapi.DatabaseError:
                        continue
        finally:
            self.p_client.table_name = TABLE_NAME('CRMFUND_DATA')
        return [{"TABLE_NAME_": TABLE_NAME("CRMFUND_DATA"), "DATA_": re_data}]
def generic_shuffle(self, data):
    """
    Clean one raw housing-price record.

    When the property name already exists in the base table (checked
    with ``self.if_exists``), only a DATA-table row is returned.
    Otherwise a new base-table row is built as well.

    :param data: raw crawled record (dict). ``URL_``, ``DATETIME_``,
        ``ENTITY_CODE_``, ``ENTITY_NAME_``, ``NAME_`` and ``PRICE_`` are
        read unconditionally; ``ADDR_`` is required for a new base row.
    :return: list of ``{"TABLE_NAME_": ..., "DATA_": ...}`` dicts: one
        entry (data row) or two entries (data row, base row)
    """
    re_data = dict()
    # Common fields. ID_ is the id of the history (DATA) row.
    re_data["ID_"] = req_for_serial_number(code="WD_JZ_FJ_DATA")
    re_data["URL_"] = data["URL_"]
    # Time dimension: date part of DATETIME_ with the dashes removed.
    re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace("-", "")
    # Entity code and name.
    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    # Creation time and operator.
    re_data["CREATE_TIME_"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    re_data["CREATE_BY_ID_"] = CREATE_ID
    re_data["CREATE_BY_NAME_"] = CREATE_NAME
    # Crawl time.
    if "DATETIME_" in data:
        re_data["SPIDER_TIME_"] = data["DATETIME_"]
    elif "DEALTIME_" in data:
        re_data["SPIDER_TIME_"] = arrow.get(data["DEALTIME_"]).format("YYYY-MM-DD")
    # Status defaults.
    re_data["DATA_STATUS_"] = "UNCHECK"
    re_data["PUBLISH_STATUS_"] = "N"
    # Name.
    re_data["NAME_"] = data["NAME_"].replace("|", "")
    # Type: residential (ZZ), office building (XZL), shop (SP).
    if "LISP" in data["ENTITY_CODE_"]:
        re_data["TYPE_"] = "SP"
    elif "LIXQ" in data["ENTITY_CODE_"] or "LJXQ" in data["ENTITY_CODE_"]:
        re_data["TYPE_"] = "ZZ"
    elif "LIXZL" in data["ENTITY_CODE_"]:
        re_data["TYPE_"] = "XZL"

    def fill_data_row(row):
        """Add the fields that only the DATA-table row carries."""
        if "TITLE_" in data:
            row["TITLE_"] = data["TITLE_"].replace("|", "")
        if "PUBLISH_TIME_" in data:
            row["PUBLISH_TIME_"] = data["PUBLISH_TIME_"]
        else:
            row["PUBLISH_TIME_"] = data["DATETIME_"][:10]
        # First run of digits / dots in the price text; 0 when none.
        price = re.findall(r"[\d.]+", data["PRICE_"])
        row["PRICE_"] = price[0] if price else 0
        row["USE_TYPE_"] = "RENT" if "租赁" in data["ENTITY_NAME_"] else "SALE"

    # Check whether the name already exists in the base table.
    verify_name = value_replace(re_data["NAME_"])
    house_id = self.if_exists(name=verify_name, city_name="厦门市")

    # Base row exists: only insert into the DATA table.
    if house_id:
        re_data["P_ID_"] = house_id
        fill_data_row(re_data)
        return [{"TABLE_NAME_": self.data_table_name, "DATA_": re_data}]

    # Id of the new base-table row.
    base_id = req_for_serial_number(code="WD_JZ_FJ_BASE")

    # DATA-table row.
    data_dict = dict(re_data)
    data_dict["P_ID_"] = base_id
    fill_data_row(data_dict)

    # Base-table row.
    basic_dict = dict(re_data)
    basic_dict["ID_"] = base_id
    basic_dict["URL_"] = data["URL_"]
    basic_dict["PROVINCE_CODE_"] = "3500"
    basic_dict["PROVINCE_NAME_"] = "福建省"
    basic_dict["CITY_CODE_"] = "350200"
    basic_dict["CITY_NAME_"] = "厦门市"
    basic_dict["SALE_PRICE_"] = 0
    basic_dict["RENT_PRICE_"] = 0
    if "YEAR_" in data:
        year = re.findall(r"\d+", data["YEAR_"])
        if year:
            basic_dict["YEAR_"] = year[0]

    # Address analysis: province (+ city) + name is sent to the text
    # location model.
    try:
        if basic_dict["PROVINCE_NAME_"] == basic_dict["CITY_NAME_"]:
            basic_dict["ADDR_"] = basic_dict["PROVINCE_NAME_"] + basic_dict["NAME_"]
        else:
            basic_dict["ADDR_"] = (basic_dict["PROVINCE_NAME_"]
                                   + basic_dict["CITY_NAME_"]
                                   + basic_dict["NAME_"])
        res = req_for_textLoc(text=basic_dict["ADDR_"])
    except Exception as e:
        self.logger.exception(
            f"2.2--err: 请求模型 req_for_textLoc 错误."
            f" 原始数据 collection = {self.m_client.mongo_collection};"
            f" ENTITY_CODE_ = {self.entity_code};"
            f" 原始数据 _id = {data['_id']};"
            f" error: {e}.")
    else:
        if "error" not in res:
            if not (res["tagsId"] == "None" or res["tagsId"] is None):
                basic_dict["TAGS_"] = res["tagsId"]
            if res["flag"] == 1:
                basic_dict["ADDR_"] = res["full"]
            else:
                basic_dict["ADDR_"] = data["ADDR_"]
            # Coordinates for the resolved address.
            try:
                location = get_lat_lng(address=basic_dict["ADDR_"])["result"]["location"]
                basic_dict["LAT_"] = location["lat"]
                basic_dict["LNG_"] = location["lng"]
            except KeyError:
                basic_dict["LAT_"] = None
                basic_dict["LNG_"] = None
            except Exception as e:
                self.logger.info(f"获取经纬度失败, ERROR: {e}")
                basic_dict["LAT_"] = None
                basic_dict["LNG_"] = None
            # District name / code from the coordinates.
            if basic_dict["LAT_"]:
                try:
                    area_result = get_area(",".join([
                        str(basic_dict["LAT_"]),
                        str(basic_dict["LNG_"])
                    ]))
                except Exception as e:
                    self.logger.info(f"获取地址失败, ERROR: {e}")
                else:
                    try:
                        component = area_result["result"]["addressComponent"]
                        basic_dict["AREA_NAME_"] = component["district"]
                        basic_dict["AREA_CODE_"] = component["adcode"]
                    except KeyError:
                        pass
                    try:
                        basic_dict["ADDR_"] = area_result["result"]["formatted_address"]
                    except KeyError:
                        pass

    basic_dict["M_STATUS_"] = "N"
    basic_dict["DELETE_STATUS_"] = "N"
    # Source URL and site name. These belong on the base row: writing
    # them to re_data at this point (as before) had no effect because
    # both output dicts were already copied from it.
    source = re.findall(r"(https?://.*?)/", data["URL_"])
    basic_dict["SOURCE_"] = source[0]
    basic_dict["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]
    basic_dict["SOURCE_TYPE_"] = "链家"
    # NOTE(review): this unconditionally replaces the address resolved
    # above with the raw ADDR_; kept as in the original, confirm intended.
    basic_dict["ADDR_"] = data["ADDR_"]
    return [{
        "TABLE_NAME_": self.data_table_name,
        "DATA_": data_dict
    }, {
        "TABLE_NAME_": self.base_table_name,
        "DATA_": basic_dict
    }]
def generic_shuffle(self, data, field="PRO_NAME_"):
    """
    Clean one raw insurance-product record (or a list of them).

    :param data: raw crawled record (dict), or a list of such records.
        For a dict, ``URL_``, ``ENTITY_NAME_`` and ``PRO_NAME_`` are read
        unconditionally.
    :param field: unused here; kept for interface compatibility
    :return: list of ``{"TABLE_NAME_": ..., "DATA_": re_data}`` dicts,
        one per input record
    """
    # A list of records is cleaned item by item. Each recursive call
    # already returns a list of wrapped rows, so the results are
    # concatenated instead of being wrapped a second time.
    if isinstance(data, list):
        re_data_list = []
        for item in data:
            re_data_list.extend(self.generic_shuffle(item))
        return re_data_list

    re_data = dict()
    serial_number = req_for_serial_number(code="JRCP_BX")
    # NOTE(review): the "TEST" suffix looks like a debugging leftover;
    # kept because stored ids depend on it. Confirm before removing.
    re_data["ID_"] = serial_number + "TEST"
    source = re.findall(r"(https?://.*?)/", data["URL_"])
    re_data["SOURCE_"] = source[0]
    re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"]
    re_data["VERSION_"] = "0"
    re_data["DATA_VERSION_"] = "0"
    re_data["SOURCE_TYPE_"] = ""
    re_data["HOT_"] = data["HOT_"] if "HOT_" in data else "0"
    re_data["PRO_NAME_"] = data["PRO_NAME_"]

    # Insurance company: normalise to a known company when the raw name
    # and a known name contain one another, or the raw name occurs in a
    # known alias. The last matching company wins.
    if "COM_NAME_" in data:
        for each in self.company_list:
            if each["NAME_"]:
                if data["COM_NAME_"] in each["NAME_"] or each["NAME_"] in data["COM_NAME_"]:
                    re_data["COM_NAME_"] = each["NAME_"]
                    re_data["COM_NAME_CODE_"] = each["CODE_"]
                elif each["ALIAS_"] and data["COM_NAME_"] in each["ALIAS_"]:
                    re_data["COM_NAME_"] = each["NAME_"]
                    re_data["COM_NAME_CODE_"] = each["CODE_"]
        if "COM_NAME_" not in re_data:
            re_data["COM_NAME_"] = data["COM_NAME_"]

    # Fields copied verbatim when present (filled in manually later
    # otherwise): insured amount, premium, product features, brief, age
    # limit, insurance period, purchase limit, policy form and its code,
    # target group, original category, insurance type and its code.
    for key in ("ENSURE_PRICE_", "ENSURE_FEE_", "SPECAIL_", "BRIEF_", "AGE_",
                "ENSURE_DATE_", "BUY_LIMIT_", "ENSURE_MODE_",
                "ENSURE_MODE_CODE_", "SUIT_", "ENSURE_SOURCE_TYPE_",
                "ENSURE_TYPE_", "ENSURE_TYPE_CODE_"):
        if key in data:
            re_data[key] = data[key]

    # Recommended / best-selling / newest flags.
    re_data["RECOMMEND_"] = "N"
    re_data["GOOD_SALE_"] = "N"
    re_data["NEW_SALE_"] = "N"

    # Coverage content, purchase notice, product introduction.
    for key in ("ENSURE_CONTENT_", "NOTICE_", "PRO_DETAIL_"):
        if key in data:
            re_data[key] = data[key]

    # Payment method: taken from the record, otherwise guessed from the
    # product name.
    if "ENSURE_PAY_" in data:
        re_data["ENSURE_PAY_"] = data["ENSURE_PAY_"].strip().replace("交", "缴")
        if re_data["ENSURE_PAY_"] not in self.pay_type:
            re_data["ENSURE_PAY_"] = "其他"
        re_data["ENSURE_PAY_CODE_"] = self.pay_type[re_data["ENSURE_PAY_"]]
    else:
        if re.findall(r"期[缴交]", data["PRO_NAME_"]):
            re_data["ENSURE_PAY_"] = "期缴"
            re_data["ENSURE_PAY_CODE_"] = "QJ"
        elif re.findall(r"趸[缴交]", data["PRO_NAME_"]):
            re_data["ENSURE_PAY_"] = "趸缴"
            re_data["ENSURE_PAY_CODE_"] = "DJ"

    # Upload the local PDF. The file handle is now closed after the
    # upload; any failure is logged and the record continues.
    if "LOCAL_PDF_PATH_" in data:
        try:
            with open(data["LOCAL_PDF_PATH_"], "rb") as pdf_file:
                p_response = req_for_file_save(id=re_data["ID_"], type_code="CHA_INSURANCE_PDF",
                                               file_name=data["LOCAL_PDF_NAME_"], postfix="pdf",
                                               file=pdf_file)
                p_response.close()
        except Exception as e:
            self.logger.warning(f"_id: {data['_id']},文件上传失败, ERROR: {e}")

    # Download the Word document and upload it.
    if "WORD_" in data:
        try:
            response = req_for_something(url=data["WORD_"])
        except Exception as e:
            self.logger.warning(f"_id: {data['_id']},获取PDF失败, ERROR: {e}")
        else:
            if response:
                try:
                    p_response = req_for_file_save(id=re_data["ID_"], type_code="CHA_INSURANCE_WORD",
                                                   file_name=data["PDF_NAME_"].replace(".doc", ""),
                                                   postfix="doc", file=response.content)
                    self.logger.info(f"{p_response.content.decode('utf-8')}")
                    p_response.close()
                except Exception as e:
                    self.logger.warning(f"_id: {data['_id']},文件上传失败, ERROR: {e}")
                finally:
                    response.close()
            else:
                self.logger.warning(f'id: {data["_id"]},获取PDF失败')

    # The raw HTML is removed from the record before the common cleaning.
    if "HTML_" in data:
        del data["HTML_"]
    re_data = super(BranchInsurance, self).generic_shuffle(data=data, re_data=re_data, field="ENTITY_NAME_")
    re_data["PUBLISH_TIME_"] = re_data["SPIDER_TIME_"]
    return [{"TABLE_NAME_": self.script_name, "DATA_": re_data}]