def shuffle_for_area(self, re_data):
    """Geocode a branch record and attach its area fields and unique code.

    The record is modified in place and also returned.

    Args:
        re_data: dict holding at least ``ADDR_`` and ``BANK_CODE_``.

    Returns:
        The same dict, with ``STATUS_``, ``LNG_``, ``LAT_`` and ``CODE_``
        set.  When the address lookup reports status 0, the district, area,
        city and province fields are filled from the reverse lookup as well.

    Raises:
        KeyError: if ``ADDR_`` / ``BANK_CODE_`` are missing, or a lookup
            response lacks the keys that are read below.
    """
    # NOTE(review): the original indentation was lost; STATUS_ is assumed to
    # be set unconditionally after dropping the stray "STATUS_1" key.
    re_data.pop("STATUS_1", None)
    re_data["STATUS_"] = "1"

    location_result = get_lat_lng(re_data["ADDR_"])
    if location_result["status"] == 0:
        location = location_result["result"]["location"]
        re_data["LNG_"] = str(location["lng"])
        re_data["LAT_"] = str(location["lat"])

        address_result = get_area(
            lat_lng=re_data["LAT_"] + "," + re_data["LNG_"])
        component = address_result["result"]["addressComponent"]
        adcode = component["adcode"]
        # TODO: decide whether formatted_address should be used instead.
        re_data["DISTRICT_NAME_"] = component["district"]
        re_data["DISTRICT_CODE_"] = adcode
        re_data["AREA_CODE_"] = adcode
        re_data["CITY_"] = component["city"]
        re_data["CITY_CODE_"] = adcode[:4] + "00"
        re_data["PROVINCE_NAME_"] = component["province"]
        re_data["PROVINCE_CODE_"] = adcode[:2] + "00"
        # TODO: decide whether the formatted location should overwrite
        # LNG_ / LAT_.
    else:
        re_data["LNG_"] = ""
        re_data["LAT_"] = ""

    # Outlet CODE_: bank code + area code + md5 of the address.
    addr_hash = hashlib.md5(re_data["ADDR_"].encode("utf-8")).hexdigest()
    # AREA_CODE_ is only written on a successful lookup; fall back to an
    # empty segment instead of raising KeyError when geocoding failed and the
    # record carried no area code of its own.
    re_data["CODE_"] = "_".join(
        [re_data["BANK_CODE_"], re_data.get("AREA_CODE_", ""), addr_hash])
    return re_data
def data_shuffle(data, province_list, city_list, area_list):
    """Build the normalized CMBC self-service outlet record for one item.

    Args:
        data: scraped item; ``ENTITY_NAME_``, ``DATETIME_``, ``ADDR_``,
            ``NAME_``, ``ENTITY_CODE_`` and ``URL_`` are read directly,
            the other keys are optional.
        province_list: dicts with ``CODE_`` and ``NAME_``.
        city_list: dicts with ``CODE_`` and ``NAME_``.
        area_list: unused here; kept for a uniform signature.

    Returns:
        A new dict with the cleaned fields.  Area, city and province fields
        are only present when the lookups via ``get_lat_lng`` / ``get_area``
        (defined elsewhere) return the expected keys.
    """
    re_data = dict()

    # Derive the province code from the most specific code available.
    # The original stored this value in prov_n, so prov_c stayed None, the
    # lookup below never matched, and the numeric code ended up prefixed to
    # the address.
    if data.get("AREA_CODE_"):
        prov_c = data["AREA_CODE_"][:2] + "00"
    elif data.get("CITY_CODE_"):
        prov_c = data["CITY_CODE_"][:2] + "00"
    else:
        prov_c = ""

    prov_n = ""
    for pro in province_list:
        if pro["CODE_"] == prov_c:
            prov_n = pro["NAME_"]
            break

    # "C"
    re_data["BANK_CODE_"] = "CMBC"
    re_data["BANK_NAME_"] = data["ENTITY_NAME_"][:-5]
    re_data["SPIDER_TIME_"] = data["DATETIME_"]
    # "F"
    # Prefix the province name (empty when unknown) before the lookup.
    re_data["ADDR_"] = prov_n + data["ADDR_"]

    result = get_lat_lng(address=re_data["ADDR_"])
    try:
        re_data["LAT_"] = str(result["result"]["location"]["lat"])
        re_data["LNG_"] = str(result["result"]["location"]["lng"])
    except KeyError:
        re_data["LAT_"] = ""
        re_data["LNG_"] = ""
    else:
        dis_result = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
        try:
            re_data["AREA_NAME_"] = (
                dis_result["result"]["addressComponent"]["district"])
        except KeyError:
            re_data["AREA_NAME_"] = ""
        try:
            re_data["AREA_CODE_"] = (
                dis_result["result"]["addressComponent"]["adcode"])
        except KeyError:
            re_data["AREA_CODE_"] = ""
        else:
            # City / province codes are derived from the area code prefix.
            re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
            re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            for city in city_list:
                if city["CODE_"] == re_data["CITY_CODE_"]:
                    re_data["CITY_NAME_"] = city["NAME_"]
                    break
            for prov in province_list:
                if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = prov["NAME_"]
                    break

    re_data["UNIT_CODE_"] = "CMBC" + "_" + re_data.get("CITY_CODE_", "")
    re_data["NAME_"] = data["NAME_"]
    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["URL_"] = data["URL_"]
    if "TEL_" in data:
        re_data["TEL_"] = data["TEL_"]
    if "BUSINESS_HOURS_" in data:
        # The scraped value is replaced by a fixed round-the-clock range.
        re_data["BUSINESS_HOURS_"] = "0:00-24:00"
    if "SOURCE_TYPE_NAME_" in data:
        re_data["SOURCE_TYPE_NAME_"] = data["SOURCE_TYPE_NAME_"]
    re_data["TYPE_NAME_"] = "自助银行"
    re_data["TYPE_"] = "ZZ"
    return re_data
def data_shuffle(data, province_list, city_list, area_list):
    """Build the normalized PAB branch record for one scraped item.

    Matches ``data["CITY_NAME_"]`` against the city, area and province
    lists, rewrites the address so it starts with the full province and city
    names, then fills coordinates and area fields from ``get_lat_lng`` /
    ``get_area`` (both defined elsewhere).

    Args:
        data: scraped item; ``CITY_NAME_``, ``ADDR_``, ``ENTITY_NAME_``,
            ``DATETIME_``, ``NAME_``, ``ENTITY_CODE_`` and ``URL_`` are read
            directly.
        province_list, city_list, area_list: dicts with ``NAME_`` and
            ``CODE_``.

    Returns:
        A new dict with the cleaned fields.

    NOTE(review): the original indentation was lost; nesting below is a
    reconstruction.  If no province/city is matched, ``prov_n`` / ``city_n``
    stay ``None`` and the address cleaning raises ``TypeError``.
    """
    re_data = dict()
    prov_n = None
    prov_c = None
    city_n = None
    city_c = None
    area_n = None
    area_c = None
    addr_ = None
    # City-level cleaning: exact name match first.
    for city in city_list:
        if city["NAME_"] == data["CITY_NAME_"]:
            city_n = city["NAME_"]
            city_c = city["CODE_"]
            prov_c = city["CODE_"][:2] + "00"
            break
    # Second try: city name without its last character.
    if not city_n:
        for city in city_list:
            if city["NAME_"][:-1] == data["CITY_NAME_"]:
                city_n = city["NAME_"]
                city_c = city["CODE_"]
                prov_c = city["CODE_"][:2] + "00"
    # Third try: CITY_NAME_ is actually an area name; derive the city code
    # from the area code and look the city up by code.
    if not city_n:
        for area in area_list:
            if area["NAME_"][:-1] == data["CITY_NAME_"]:
                area_n = area["NAME_"]
                area_c = area["CODE_"]
                city_c = area["CODE_"][:-2] + "00"
                prov_c = area["CODE_"][:2] + "00"
                break
        for city in city_list:
            if city["CODE_"] == city_c:
                city_n = city["NAME_"]
    # Province-level cleaning: when the first two characters of a province
    # name equal CITY_NAME_, the province doubles as the city.
    for prov in province_list:
        if prov["NAME_"][:2] == data["CITY_NAME_"]:
            prov_n = prov["NAME_"]
            prov_c = prov["CODE_"]
            city_n = prov["NAME_"]
            city_c = prov["CODE_"]
            break
        elif prov["CODE_"] == prov_c:
            prov_n = prov["NAME_"]
            break
    # District/county-level cleaning: full area name found in the address.
    if not area_n:
        for area in area_list:
            if city_n:
                if city_n == prov_n:
                    # City equals province: compare on the 2-digit prefix.
                    if area["CODE_"][:2] == city_c[:2]:
                        if area["NAME_"] in data["ADDR_"]:
                            area_n = area["NAME_"]
                            area_c = area["CODE_"]
                elif area["CODE_"][:4] == city_c[:4]:
                    if area["NAME_"] in data["ADDR_"]:
                        area_n = area["NAME_"]
                        area_c = area["CODE_"]
    # Fallback: area name without its last character, skipped when the text
    # right after the hit contains "道", "路" or "街".
    if not area_n:
        for area in area_list:
            if len(area["NAME_"]) < 3:
                continue
            elif area["CODE_"][:4] == city_c[:4]:
                if (area["NAME_"][:-1] in data["ADDR_"]) and (
                        area["NAME_"][:-1] != city_n[:-1]):
                    check_index = data["ADDR_"].find(area["NAME_"][:-1])
                    if ("道" not in data["ADDR_"][check_index:check_index+len(area["NAME_"][:-1])+2]) \
                            and ("路" not in data["ADDR_"][check_index:check_index+len(area["NAME_"][:-1])+2]) \
                            and ("街" not in data["ADDR_"][check_index:check_index+len(area["NAME_"][:-1])+2]):
                        area_n = area["NAME_"]
                        area_c = area["CODE_"]
    # Special cases: hard-coded overrides keyed on the address text.
    if "个旧市" in data["ADDR_"]:
        area_n = "个旧市"
        area_c = "532501"
        city_n = "红河哈尼族彝族自治州"
        city_c = "532500"
        prov_n = "云南省"
        prov_c = "5300"
    elif "辛集市" in data["ADDR_"]:
        area_n = "辛集市"
        area_c = "139002"
        city_n = "石家庄市"
        city_c = "130100"
        prov_n = "河北省"
        prov_c = "1300"
    # Address cleaning, step 1: make the address start with the full
    # province name, expanding progressively shorter abbreviations.
    if prov_n in data["ADDR_"]:
        addr_ = data["ADDR_"]
    elif prov_n[:-1] in data["ADDR_"][:len(prov_n)]:
        addr_ = data["ADDR_"][:len(prov_n)].replace(
            prov_n[:-1], prov_n) + data["ADDR_"][len(prov_n):]
    elif prov_n[:4] in data["ADDR_"][:len(prov_n)]:
        addr_ = data["ADDR_"][:len(prov_n)].replace(
            prov_n[:4], prov_n) + data["ADDR_"][len(prov_n):]
    elif prov_n[:3] in data["ADDR_"][:len(prov_n)]:
        addr_ = data["ADDR_"][:len(prov_n)].replace(
            prov_n[:3], prov_n) + data["ADDR_"][len(prov_n):]
    elif prov_n[:2] in data["ADDR_"][:len(prov_n)]:
        addr_ = data["ADDR_"][:len(prov_n)].replace(
            prov_n[:2], prov_n) + data["ADDR_"][len(prov_n):]
    else:
        addr_ = prov_n + data["ADDR_"]
    # Step 2: same for the city name; inserted after the province if absent.
    if city_n in addr_[:len(prov_n) + len(city_n)]:
        addr_ = addr_
    elif city_n[:-1] in addr_[:len(prov_n) + len(city_n)]:
        addr_ = addr_[:len(prov_n) + len(city_n)].replace(
            city_n[:-1], city_n) + addr_[len(prov_n) + len(city_n):]
    elif city_n[:4] in addr_[:len(prov_n) + len(city_n)]:
        addr_ = addr_[:len(prov_n) + len(city_n)].replace(
            city_n[:4], city_n) + addr_[len(prov_n) + len(city_n):]
    elif city_n[:3] in addr_[:len(prov_n) + len(city_n)]:
        addr_ = addr_[:len(prov_n) + len(city_n)].replace(
            city_n[:3], city_n) + addr_[len(prov_n) + len(city_n):]
    elif city_n[:2] in addr_[:len(prov_n) + len(city_n)]:
        addr_ = addr_[:len(prov_n) + len(city_n)].replace(
            city_n[:2], city_n) + addr_[len(prov_n) + len(city_n):]
    else:
        addr_ = addr_[:len(prov_n)] + city_n + addr_[len(prov_n):]
    # # Add branch code (disabled)
    # branch_code = None
    # for i in range(1, 10000):
    #     branch_code = "PAB" + "_" + city_c + "_" + "00000"
    #     branch_code = branch_code[:len(branch_code) - len(str(i))] + "{}".format(i)
    #     if branch_code in branch_code_list:
    #         continue
    #     else:
    #         branch_code_list.append(branch_code)
    #         break
    # "C"
    re_data["BANK_CODE_"] = "PAB"
    re_data["BANK_NAME_"] = data["ENTITY_NAME_"][:-3]
    re_data["SPIDER_TIME_"] = data["DATETIME_"]
    # re_data["AREA_CODE_"] = area_c
    # re_data["AREA_NAME_"] = area_n
    # re_data["UNIT_CODE_"] = "PAB" + "_" + city_c
    # "F"
    re_data["ADDR_"] = addr_
    # re_data["CITY_CODE_"] = city_c
    # re_data["CITY_NAME_"] = city_n
    # re_data["LAT_"] = data["LAT_"]
    # re_data["LNG_"] = data["LNG_"]
    re_data["NAME_"] = data["NAME_"]
    # re_data["PROVINCE_CODE_"] = prov_c
    # re_data["PROVINCE_NAME_"] = prov_n
    # Coordinates and area fields come from the lookups, not from the
    # list-based matching above (area_n / area_c are not written out).
    result = get_lat_lng(address=re_data["ADDR_"])
    try:
        re_data["LAT_"] = str(result["result"]["location"]["lat"])
        re_data["LNG_"] = str(result["result"]["location"]["lng"])
    except KeyError:
        re_data["LAT_"] = ""
        re_data["LNG_"] = ""
    else:
        dis_result = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
        try:
            re_data["AREA_NAME_"] = dis_result["result"]["addressComponent"][
                "district"]
        except KeyError:
            re_data["AREA_NAME_"] = ""
        try:
            re_data["AREA_CODE_"] = dis_result["result"]["addressComponent"][
                "adcode"]
        except KeyError:
            re_data["AREA_CODE_"] = ""
        else:
            # City / province codes are derived from the area code prefix.
            re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
            re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            for city in city_list:
                if city["CODE_"] == re_data["CITY_CODE_"]:
                    re_data["CITY_NAME_"] = city["NAME_"]
                    break
            for prov in province_list:
                if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = prov["NAME_"]
                    break
    re_data["UNIT_CODE_"] = "PAB" + "_" + re_data.get("CITY_CODE_", "")
    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["URL_"] = data["URL_"]
    if "TEL_" in data:
        re_data["TEL_"] = data["TEL_"]
    if "BUSINESS_HOURS_" in data:
        re_data["BUSINESS_HOURS_"] = data["BUSINESS_HOURS_"]
    if "SOURCE_TYPE_NAME_" in data:
        re_data["SOURCE_TYPE_NAME_"] = data["SOURCE_TYPE_NAME_"]
    re_data["TYPE_NAME_"] = "支行"
    re_data["TYPE_"] = "ZH"
    return re_data
def data_shuffle(data, province_list, city_list, area_list):
    """Build the normalized CBHB branch record for one scraped item.

    Province and city are first guessed from ``data["CITY_NAME_"]``; when
    the lookups via ``get_lat_lng`` / ``get_area`` (defined elsewhere)
    succeed, the city and province fields are overwritten from the returned
    area code.

    Args:
        data: scraped item with ``CITY_NAME_``, ``ENTITY_NAME_``,
            ``DATETIME_``, ``ADDR_``, ``NAME_``, ``ENTITY_CODE_``, ``URL_``.
        province_list: dicts with ``NAME_`` and ``CODE_``.
        city_list: dicts with ``NAME_``, ``CODE_`` and ``PARENT_``.
        area_list: unused here; kept for a uniform signature.

    Returns:
        A new dict with the cleaned fields.
    """
    source_city = data["CITY_NAME_"]
    record = {}

    # NOTE(review): the original indentation was lost.  The city scan is
    # assumed to be the else-branch of the per-municipality check, so it
    # runs once for every municipality name that is not part of CITY_NAME_.
    for municipality in ("北京市", "天津市", "上海市", "重庆市"):
        if municipality not in source_city:
            city_hit = next(
                (c for c in city_list if c["NAME_"] in source_city), None)
            if city_hit is not None:
                record["CITY_NAME_"] = city_hit["NAME_"]
                record["CITY_CODE_"] = city_hit["CODE_"]
                record["PROVINCE_CODE_"] = city_hit["PARENT_"]
            continue
        prov_hit = next(
            (p for p in province_list if p["NAME_"] == municipality), None)
        if prov_hit is not None:
            # A municipality is its own province and city.
            record["PROVINCE_NAME_"] = prov_hit["NAME_"]
            record["PROVINCE_CODE_"] = prov_hit["CODE_"]
            record["CITY_NAME_"] = prov_hit["NAME_"]
            record["CITY_CODE_"] = prov_hit["CODE_"][:3] + "100"

    if record.get("PROVINCE_CODE_"):
        named = next(
            (p for p in province_list
             if p["CODE_"] == record["PROVINCE_CODE_"]), None)
        if named is not None:
            record["PROVINCE_NAME_"] = named["NAME_"]

    # "C"
    record["BANK_CODE_"] = "CBHB"
    record["BANK_NAME_"] = data["ENTITY_NAME_"][:-3]
    record["SPIDER_TIME_"] = data["DATETIME_"]
    # "F"
    record["ADDR_"] = data["ADDR_"]
    record["NAME_"] = data["NAME_"]

    geocoded = get_lat_lng(address=record["ADDR_"])
    try:
        record["LAT_"] = str(geocoded["result"]["location"]["lat"])
        record["LNG_"] = str(geocoded["result"]["location"]["lng"])
    except KeyError:
        record["LAT_"] = ""
        record["LNG_"] = ""
    else:
        reverse = get_area(",".join((record["LAT_"], record["LNG_"])))
        try:
            record["AREA_NAME_"] = (
                reverse["result"]["addressComponent"]["district"])
        except KeyError:
            record["AREA_NAME_"] = ""
        try:
            record["AREA_CODE_"] = (
                reverse["result"]["addressComponent"]["adcode"])
        except KeyError:
            record["AREA_CODE_"] = ""
        else:
            # City / province codes are derived from the area code prefix.
            area_code = record["AREA_CODE_"]
            record["CITY_CODE_"] = area_code[:4] + "00"
            record["PROVINCE_CODE_"] = area_code[:2] + "00"
            city_hit = next(
                (c for c in city_list
                 if c["CODE_"] == record["CITY_CODE_"]), None)
            if city_hit is not None:
                record["CITY_NAME_"] = city_hit["NAME_"]
            prov_hit = next(
                (p for p in province_list
                 if p["CODE_"] == record["PROVINCE_CODE_"]), None)
            if prov_hit is not None:
                record["PROVINCE_NAME_"] = prov_hit["NAME_"]

    record["UNIT_CODE_"] = "CBHB" + "_" + record.get("CITY_CODE_", "")
    for key in ("ENTITY_CODE_", "ENTITY_NAME_", "URL_"):
        record[key] = data[key]
    if "TEL_" in data:
        record["TEL_"] = data["TEL_"]
    # NOTE(review): assumed unconditional (no guard is visible before it).
    record["BUSINESS_HOURS_"] = "0:00-24:00"
    if "SOURCE_TYPE_NAME_" in data:
        record["SOURCE_TYPE_NAME_"] = data["SOURCE_TYPE_NAME_"]
    record["TYPE_NAME_"] = "支行"
    record["TYPE_"] = "ZH"
    return record
def data_shuffle(data, province_list, city_list, area_list):
    """Build the normalized CMB self-service outlet record for one item.

    Matches the first two characters of ``data["CITY_NAME_"]`` against the
    area, city and province lists, rewrites the address so it starts with
    the full province and city names, then fills coordinates and area fields
    from ``get_lat_lng`` / ``get_area`` (both defined elsewhere).

    Args:
        data: scraped item; ``CITY_NAME_``, ``ADDR_``, ``ENTITY_NAME_``,
            ``DATETIME_``, ``NAME_``, ``ENTITY_CODE_`` and ``URL_`` are read
            directly.
        province_list, city_list, area_list: dicts with ``NAME_`` and
            ``CODE_``.

    Returns:
        A new dict with the cleaned fields.

    NOTE(review): the original indentation was lost; nesting below is a
    reconstruction.  If nothing matches, ``prov_c`` / ``prov_n`` /
    ``city_n`` stay ``None`` and the slicing / ``in`` tests below raise
    ``TypeError``.
    """
    re_data = dict()
    prov_n = None
    prov_c = None
    city_n = None
    city_c = None
    area_n = None
    area_c = None
    addr_ = None
    # Province/city cleaning: try an area whose name shares the first two
    # characters with CITY_NAME_ and derive city/province codes from it.
    for area in area_list:
        if area["NAME_"][:2] == data["CITY_NAME_"][:2]:
            area_n = area["NAME_"]
            area_c = area["CODE_"]
            city_c = area["CODE_"][:-2] + "00"
            prov_c = area["CODE_"][:2] + "00"
            break
    # No break here: the last matching city in the list wins.
    for city in city_list:
        if city["NAME_"][:2] == data["CITY_NAME_"][:2]:
            city_n = city["NAME_"]
            city_c = city["CODE_"]
            prov_c = city["CODE_"][:2] + "00"
        elif city_c == city["CODE_"]:
            city_n = city["NAME_"]
    # A province sharing the two-character prefix doubles as the city.
    for prov in province_list:
        if prov["NAME_"][:2] == data["CITY_NAME_"][:2]:
            prov_n = prov["NAME_"]
            prov_c = prov["CODE_"]
            city_n = prov["NAME_"]
            city_c = prov["CODE_"]
        elif prov_c == prov["CODE_"]:
            prov_n = prov["NAME_"]
    # District/county cleaning: any area of the province whose full name
    # occurs in the address (last match wins; the result is not written out).
    for area in area_list:
        if area["CODE_"][:2] == prov_c[:2]:
            if area["NAME_"] in data["ADDR_"]:
                area_n = area["NAME_"]
                area_c = area["CODE_"]
    # Address cleaning, step 1: make the address start with the full
    # province name, expanding progressively shorter abbreviations.
    if prov_n in data["ADDR_"]:
        addr_ = data["ADDR_"]
    elif prov_n[:-1] in data["ADDR_"][:len(prov_n)]:
        addr_ = data["ADDR_"][:len(prov_n)].replace(
            prov_n[:-1], prov_n) + data["ADDR_"][len(prov_n):]
    elif prov_n[:4] in data["ADDR_"][:len(prov_n)]:
        addr_ = data["ADDR_"][:len(prov_n)].replace(
            prov_n[:4], prov_n) + data["ADDR_"][len(prov_n):]
    elif prov_n[:3] in data["ADDR_"][:len(prov_n)]:
        addr_ = data["ADDR_"][:len(prov_n)].replace(
            prov_n[:3], prov_n) + data["ADDR_"][len(prov_n):]
    elif prov_n[:2] in data["ADDR_"][:len(prov_n)]:
        addr_ = data["ADDR_"][:len(prov_n)].replace(
            prov_n[:2], prov_n) + data["ADDR_"][len(prov_n):]
    else:
        addr_ = prov_n + data["ADDR_"]
    # Step 2: same for the city name; inserted after the province if absent.
    if city_n in addr_[:len(prov_n) + len(city_n)]:
        addr_ = addr_
    elif city_n[:-1] in addr_[:len(prov_n) + len(city_n)]:
        addr_ = addr_[:len(prov_n) + len(city_n)].replace(
            city_n[:-1], city_n) + addr_[len(prov_n) + len(city_n):]
    elif city_n[:4] in addr_[:len(prov_n) + len(city_n)]:
        addr_ = addr_[:len(prov_n) + len(city_n)].replace(
            city_n[:4], city_n) + addr_[len(prov_n) + len(city_n):]
    elif city_n[:3] in addr_[:len(prov_n) + len(city_n)]:
        addr_ = addr_[:len(prov_n) + len(city_n)].replace(
            city_n[:3], city_n) + addr_[len(prov_n) + len(city_n):]
    elif city_n[:2] in addr_[:len(prov_n) + len(city_n)]:
        addr_ = addr_[:len(prov_n) + len(city_n)].replace(
            city_n[:2], city_n) + addr_[len(prov_n) + len(city_n):]
    else:
        addr_ = addr_[:len(prov_n)] + city_n + addr_[len(prov_n):]
    # # Add branch code (disabled)
    # branch_code = None
    # for i in range(1, 10000):
    #     branch_code = "CMB" + "_" + city_c + "_" + "00000"
    #     branch_code = branch_code[:len(branch_code) - len(str(i))] + "{}".format(i)
    #     if branch_code in branch_code_list:
    #         continue
    #     else:
    #         branch_code_list.append(branch_code)
    #         break
    # "C"
    re_data["BANK_CODE_"] = "CMB"
    re_data["BANK_NAME_"] = data["ENTITY_NAME_"][:-6]
    re_data["SPIDER_TIME_"] = data["DATETIME_"]
    # re_data["AREA_CODE_"] = area_c
    # re_data["AREA_NAME_"] = area_n
    # re_data["UNIT_CODE_"] = "CMB" + "_" + city_c
    # "F"
    re_data["ADDR_"] = addr_
    # re_data["CITY_CODE_"] = city_c
    # re_data["CITY_NAME_"] = city_n
    # re_data["LAT_"] = data["LAT_"]
    # re_data["LNG_"] = data["LNG_"]
    re_data["NAME_"] = data["NAME_"]
    # re_data["PROVINCE_CODE_"] = prov_c
    # re_data["PROVINCE_NAME_"] = prov_n
    # Coordinates and area fields come from the lookups, not from the
    # list-based matching above.
    result = get_lat_lng(address=re_data["ADDR_"])
    try:
        re_data["LAT_"] = str(result["result"]["location"]["lat"])
        re_data["LNG_"] = str(result["result"]["location"]["lng"])
    except KeyError:
        re_data["LAT_"] = ""
        re_data["LNG_"] = ""
    else:
        dis_result = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
        try:
            re_data["AREA_NAME_"] = dis_result["result"]["addressComponent"][
                "district"]
        except KeyError:
            re_data["AREA_NAME_"] = ""
        try:
            re_data["AREA_CODE_"] = dis_result["result"]["addressComponent"][
                "adcode"]
        except KeyError:
            re_data["AREA_CODE_"] = ""
        else:
            # City / province codes are derived from the area code prefix.
            re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
            re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            for city in city_list:
                if city["CODE_"] == re_data["CITY_CODE_"]:
                    re_data["CITY_NAME_"] = city["NAME_"]
                    break
            for prov in province_list:
                if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = prov["NAME_"]
                    break
    re_data["UNIT_CODE_"] = "CMB" + "_" + re_data.get("CITY_CODE_", "")
    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["URL_"] = data["URL_"]
    if "TEL_" in data:
        re_data["TEL_"] = data["TEL_"]
    if "BUSINESS_HOURS_" in data:
        re_data["BUSINESS_HOURS_"] = data["BUSINESS_HOURS_"]
    if "SOURCE_TYPE_NAME_" in data:
        re_data["SOURCE_TYPE_NAME_"] = data["SOURCE_TYPE_NAME_"]
    re_data["TYPE_NAME_"] = "自助银行"
    re_data["TYPE_"] = "ZZ"
    return re_data
def _expand_abbreviation(text, start, stop, name, cuts):
    """Expand the first abbreviated form of ``name`` found near the head of ``text``.

    Each ``name[:cut]`` is tried in the order given by ``cuts``.  The first
    one that occurs in ``text[:stop]`` is replaced by the full ``name``
    inside ``text[start:stop]``; the rest of ``text`` is left untouched.

    Returns:
        The rewritten text, or ``None`` when no abbreviation occurs.
    """
    for cut in cuts:
        short = name[:cut]
        if short in text[:stop]:
            return (text[:start]
                    + text[start:stop].replace(short, name)
                    + text[stop:])
    return None


def data_shuffle(data, province_list, city_list, area_list):
    """Build the normalized ABC outlet record for one scraped item.

    Cleans province / city / district names in ``data`` (in place), rewrites
    the address so it starts with the full province, city and district
    names, then fills coordinates and area fields from ``get_lat_lng`` /
    ``get_area`` (both defined elsewhere).

    Args:
        data: scraped item.  ``PROVINCE_NAME_``, ``PROVINCE_CODE_``,
            ``CITY_NAME_``, ``CITY_CODE_``, ``AREA_NAME_``, ``AREA_CODE_``,
            ``ADDR_``, ``ENTITY_NAME_``, ``NAME_``, ``ENTITY_CODE_``,
            ``DATETIME_`` and ``URL_`` are read directly; several of them
            are overwritten during cleaning.
        province_list: dicts with ``NAME_`` and ``CODE_``.
        city_list: list of dicts with ``NAME_`` and ``CODE_``; placeholder
            entries named "县" are removed from it in place.
        area_list: dicts with ``NAME_`` and ``CODE_``.

    Returns:
        A new dict with the cleaned fields.

    NOTE(review): the original indentation was lost; the nesting of several
    statements (marked below) is a reconstruction and should be confirmed.
    """
    # Remove placeholder "县" entries in place so the caller's list stays
    # filtered.  The original called list.remove() while iterating the same
    # list, which skips the element following every removal.
    city_list[:] = [city for city in city_list if city["NAME_"] != "县"]

    re_data = dict()
    addr_ = None

    # Tibet: region codes differ from the database codes; clean separately.
    if "西藏" in data["PROVINCE_NAME_"]:
        data["CITY_NAME_"] = data["CITY_NAME_"].replace("西藏自治区", "")
        if "西藏" in data["ADDR_"]:
            data["ADDR_"] = data["ADDR_"].replace("西藏", "西藏自治区")
        else:
            data["ADDR_"] = "西藏自治区" + data["ADDR_"]
        for city in city_list:
            if city["CODE_"][:2] == "54":
                if data["CITY_NAME_"][:2] == city["NAME_"][:2]:
                    data["ADDR_"] = data["ADDR_"].replace(
                        data["CITY_NAME_"], city["NAME_"])
                    data["CITY_NAME_"] = city["NAME_"]
                    data["CITY_CODE_"] = city["CODE_"]
        # NOTE(review): assumed to run once after the city scan.
        data["ADDR_"] = data["ADDR_"].replace(
            data["CITY_NAME_"][:-1] + "地区", data["CITY_NAME_"])
        if data["CITY_NAME_"][:-1] not in data["ADDR_"]:
            # 5 == len("西藏自治区"): insert the city right after it.
            data["ADDR_"] = (data["ADDR_"][:5] + data["CITY_NAME_"]
                             + data["ADDR_"][5:])

    # Qinghai: region codes differ from the database codes; clean separately.
    if "青海" in data["PROVINCE_NAME_"]:
        data["PROVINCE_NAME_"] = "青海省"
        data["CITY_NAME_"] = data["CITY_NAME_"].replace("青海", "")
        if "青海省" not in data["ADDR_"]:
            data["ADDR_"] = "青海省" + data["ADDR_"]
        for city in city_list:
            if city["CODE_"][:2] == "63":
                if city["NAME_"][:2] == data["CITY_NAME_"][:2]:
                    data["CITY_NAME_"] = city["NAME_"]
                    data["CITY_CODE_"] = city["CODE_"]
        if data["CITY_NAME_"][:-1] not in data["ADDR_"]:
            # 3 == len("青海省"): insert the city right after it.
            data["ADDR_"] = (data["ADDR_"][:3] + data["CITY_NAME_"]
                             + data["ADDR_"][3:])

    # Xinjiang: region codes differ from the database codes; clean separately.
    if "新疆" in data["PROVINCE_NAME_"]:
        data["PROVINCE_NAME_"] = "新疆维吾尔自治区"
        data["CITY_NAME_"] = data["CITY_NAME_"].replace("新疆维吾尔自治区", "")
        data["CITY_NAME_"] = data["CITY_NAME_"].replace("新疆", "")
        if ("新疆维吾尔自治区" not in data["ADDR_"]) and (
                "新疆" not in data["ADDR_"]):
            data["ADDR_"] = "新疆维吾尔自治区" + data["ADDR_"]
        elif ("新疆" in data["ADDR_"]) and (
                "新疆维吾尔自治区" not in data["ADDR_"]):
            data["ADDR_"] = "新疆维吾尔自治区" + data["ADDR_"][2:]
        for city in city_list:
            if city["CODE_"][:2] == "65":
                if city["NAME_"][:2] == data["CITY_NAME_"][:2]:
                    data["CITY_NAME_"] = city["NAME_"]
                    data["CITY_CODE_"] = city["CODE_"]
        # Hami has a single district (Yizhou); every outlet belongs to it.
        if data["CITY_NAME_"] == "哈密市":
            data["AREA_NAME_"] = "伊州区"
            data["AREA_CODE_"] = "650502"
        for area in area_list:
            if area["CODE_"][:2] == "65":
                if area["NAME_"][:2] in data["AREA_NAME_"]:
                    data["AREA_NAME_"] = area["NAME_"]
                    data["AREA_CODE_"] = area["CODE_"]

    # Inner Mongolia, Guangxi, Ningxia (and Xinjiang): unify the fields.
    if (("内蒙古" in data["ADDR_"]) or ("广西" in data["ADDR_"])
            or ("新疆" in data["ADDR_"]) or ("宁夏" in data["ADDR_"])):
        if data["PROVINCE_NAME_"] not in data["ADDR_"]:
            data["ADDR_"] = data["ADDR_"].replace("内蒙古", "内蒙古自治区")
            data["ADDR_"] = data["ADDR_"].replace("广西", "广西壮族自治区")
            data["ADDR_"] = data["ADDR_"].replace("宁夏", "宁夏回族自治区")
        if data["PROVINCE_NAME_"] in data["CITY_NAME_"]:
            data["CITY_NAME_"] = data["CITY_NAME_"].replace(
                data["PROVINCE_NAME_"], "")
        if data["CITY_NAME_"][:-1] not in data["ADDR_"]:
            data["ADDR_"] = (
                data["ADDR_"][:len(data["PROVINCE_NAME_"])]
                + data["CITY_NAME_"]
                + data["ADDR_"][len(data["PROVINCE_NAME_"]):])
        # Collapse "<city><first two chars of city>[地][区][市]" to the city.
        data["ADDR_"] = re.sub(
            r"{}{}地?区?市?".format(data["CITY_NAME_"],
                                 data["CITY_NAME_"][:2]),
            data["CITY_NAME_"], data["ADDR_"])
        if "区区" in data["ADDR_"]:
            data["ADDR_"] = data["ADDR_"].replace("区区", "区")

    # Jilin province / Jilin city cleaning.
    if "吉林" in data["PROVINCE_NAME_"]:
        if "吉林市" not in data["CITY_NAME_"]:
            data["CITY_NAME_"] = data["CITY_NAME_"].replace("吉林省", "")
            data["CITY_NAME_"] = data["CITY_NAME_"].replace("吉林", "")
            # NOTE(review): nesting of this assignment is reconstructed.
            data["CITY_CODE_"] = "220200"

    # Province-level name cleaning.
    for prov in province_list:
        if prov["CODE_"][:2] == data["PROVINCE_CODE_"][:2]:
            data["PROVINCE_CODE_"] = prov["CODE_"]
            data["PROVINCE_NAME_"] = prov["NAME_"]
            break

    # City-level cleaning: strip province prefixes from the city name,
    # except for the four municipalities.
    if data["PROVINCE_NAME_"][:2] in data["CITY_NAME_"]:
        if data["CITY_NAME_"] not in ('北京市', '天津市', '上海市', '重庆市'):
            data["CITY_NAME_"] = data["CITY_NAME_"].replace(
                data["PROVINCE_NAME_"], "")
            data["CITY_NAME_"] = data["CITY_NAME_"].replace(
                data["PROVINCE_NAME_"][:-1], "")
            data["CITY_NAME_"] = data["CITY_NAME_"].replace(
                data["PROVINCE_NAME_"][:3], "")
            data["CITY_NAME_"] = data["CITY_NAME_"].replace(
                data["PROVINCE_NAME_"][:2], "")
    for city in city_list:
        if city["NAME_"] == "市辖区":
            continue
        elif city["CODE_"][:2] == data["PROVINCE_CODE_"][:2]:
            if city["CODE_"] == data["CITY_CODE_"]:
                data["CITY_CODE_"] = city["CODE_"]
                data["CITY_NAME_"] = city["NAME_"]
                break
            elif (city["NAME_"][:2] == data["CITY_NAME_"][:2]) and (
                    city["CODE_"] != data["CITY_CODE_"]):
                data["CITY_CODE_"] = city["CODE_"]
                data["CITY_NAME_"] = city["NAME_"]
                break
            elif (city["NAME_"] in data["ADDR_"][
                    :len(data["PROVINCE_NAME_"]) + len(city["NAME_"])]) and (
                    not data["CITY_NAME_"]):
                data["CITY_CODE_"] = city["CODE_"]
                data["CITY_NAME_"] = city["NAME_"]
                break

    # District/county-level cleaning: strip province and city prefixes.
    if data["PROVINCE_NAME_"][:2] in data["AREA_NAME_"]:
        data["AREA_NAME_"] = data["AREA_NAME_"].replace(
            data["PROVINCE_NAME_"], "")
        data["AREA_NAME_"] = data["AREA_NAME_"].replace(
            data["PROVINCE_NAME_"][:-1], "")
        data["AREA_NAME_"] = data["AREA_NAME_"].replace(
            data["PROVINCE_NAME_"][:4], "")
        data["AREA_NAME_"] = data["AREA_NAME_"].replace(
            data["PROVINCE_NAME_"][:3], "")
        data["AREA_NAME_"] = data["AREA_NAME_"].replace(
            data["PROVINCE_NAME_"][:2], "")
    # NOTE(review): the city-prefix stripping is assumed to be unconditional.
    data["AREA_NAME_"] = data["AREA_NAME_"].replace(data["CITY_NAME_"], "")
    data["AREA_NAME_"] = data["AREA_NAME_"].replace(
        data["CITY_NAME_"][:-1], "")
    data["AREA_NAME_"] = data["AREA_NAME_"].replace(
        data["CITY_NAME_"][:3], "")
    data["AREA_NAME_"] = (data["AREA_NAME_"][:2].replace("地区", "")
                          + data["AREA_NAME_"][2:])
    for area in area_list:
        if area["CODE_"][:2] == data["PROVINCE_CODE_"][:2]:
            if area["CODE_"] == data["AREA_CODE_"]:
                data["AREA_NAME_"] = area["NAME_"]
                data["AREA_CODE_"] = area["CODE_"]
                break
            elif (area["NAME_"] == data["AREA_NAME_"]) and (
                    area["CODE_"] != data["AREA_CODE_"]):
                data["AREA_NAME_"] = area["NAME_"]
                data["AREA_CODE_"] = area["CODE_"]
                break
            elif (area["NAME_"] in data["ADDR_"][
                    :len(data["PROVINCE_NAME_"]) + len(data["CITY_NAME_"])
                    + len(area["NAME_"])]) and (not data["AREA_NAME_"]):
                # The original assigned CITY_* from the stale loop variable
                # of the city scan (copy-paste of the city branch); the
                # district found in the address belongs in AREA_*.
                data["AREA_NAME_"] = area["NAME_"]
                data["AREA_CODE_"] = area["CODE_"]
                break

    # Address cleaning.
    prov_name = data["PROVINCE_NAME_"]
    city_name = data["CITY_NAME_"]
    address = data["ADDR_"]
    prov_len = len(prov_name)
    city_len = len(city_name)
    has_prov = prov_name in address
    has_city = city_name in address

    if has_prov and has_city:
        # Address already holds both province and city.
        addr_ = address
    elif has_prov:
        # Province present, city missing: expand an abbreviated city right
        # after the province, otherwise insert the city there.
        addr_ = _expand_abbreviation(
            address, prov_len, prov_len + city_len, city_name, (-1, 3, 2))
        if addr_ is None:
            addr_ = address[:prov_len] + city_name + address[prov_len:]
    elif has_city:
        # City present, province missing.
        if prov_name[:-1] in address[:prov_len]:
            if city_name == "吉林市" and ("吉林省" not in address):
                # "吉林" at the head is the city here, not the province.
                addr_ = prov_name + address
            else:
                addr_ = (address[:prov_len].replace(
                    prov_name[:-1], prov_name) + address[prov_len:])
        else:
            addr_ = _expand_abbreviation(
                address, 0, prov_len, prov_name, (3, 2))
            if addr_ is None:
                addr_ = prov_name + address
    else:
        # Neither province nor city present.
        expanded = _expand_abbreviation(
            address, 0, city_len, city_name, (-1, 3, 2))
        if expanded is None:
            addr_ = prov_name + city_name + address
        else:
            addr_ = prov_name + expanded

    area_name = data["AREA_NAME_"]
    area_len = len(area_name)
    if area_name in addr_:
        # Address already holds the district.
        pass
    elif data["CITY_CODE_"] == data["PROVINCE_CODE_"]:
        # Municipality: the district directly follows the province.  The
        # original [:4] branch sliced its head at len(province) only and
        # lost characters; all abbreviations now share the same window.
        expanded = _expand_abbreviation(
            addr_, 0, prov_len + area_len, area_name, (-1, 4, 3, 2))
        if expanded is None:
            addr_ = addr_[:prov_len] + area_name + addr_[prov_len:]
        else:
            addr_ = expanded
    elif area_name in ("城区", "郊区"):
        addr_ = addr_.replace(area_name, "")
    else:
        # Ordinary city: the district follows province + city.  The
        # original took the tail at len(province) + len(district), which
        # repeated part of the address; head and tail now share one offset.
        expanded = _expand_abbreviation(
            addr_, 0, prov_len + city_len + area_len, area_name,
            (-1, 4, 3, 2))
        if expanded is not None:
            addr_ = expanded
        elif area_len > 3:
            addr_ = (addr_[:prov_len + city_len] + area_name
                     + addr_[prov_len + city_len:])

    # Remaining data has no district level in the database.
    if not addr_:
        if prov_name not in data["ADDR_"][:prov_len]:
            data["ADDR_"] = prov_name + data["ADDR_"]
        if city_name not in data["ADDR_"][:prov_len + city_len]:
            data["ADDR_"] = (data["ADDR_"][:prov_len] + city_name
                             + data["ADDR_"][prov_len:])
        addr_ = data["ADDR_"]
    if "直辖" in data["CITY_NAME_"]:
        addr_ = data["ADDR_"]

    # "C"
    re_data["BANK_CODE_"] = "ABC"
    re_data["BANK_NAME_"] = data["ENTITY_NAME_"][:-3]
    # "F"
    re_data["ADDR_"] = addr_
    re_data["NAME_"] = data["NAME_"]

    result = get_lat_lng(address=re_data["ADDR_"])
    try:
        re_data["LAT_"] = str(result["result"]["location"]["lat"])
        re_data["LNG_"] = str(result["result"]["location"]["lng"])
    except KeyError:
        re_data["LAT_"] = ""
        re_data["LNG_"] = ""
    else:
        dis_result = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
        try:
            re_data["AREA_NAME_"] = (
                dis_result["result"]["addressComponent"]["district"])
        except KeyError:
            re_data["AREA_NAME_"] = ""
        try:
            re_data["AREA_CODE_"] = (
                dis_result["result"]["addressComponent"]["adcode"])
        except KeyError:
            re_data["AREA_CODE_"] = ""
        else:
            # City / province codes are derived from the area code prefix.
            re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
            re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            for city in city_list:
                if city["CODE_"] == re_data["CITY_CODE_"]:
                    re_data["CITY_NAME_"] = city["NAME_"]
                    break
            for prov in province_list:
                if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = prov["NAME_"]
                    break

    re_data["UNIT_CODE_"] = "ABC" + "_" + re_data.get("CITY_CODE_", "")
    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["SPIDER_TIME_"] = data["DATETIME_"]
    re_data["URL_"] = data["URL_"]
    re_data["TEL_"] = data.get("TEL_", "")
    if "SOURCE_TYPE_NAME_" in data:
        re_data["SOURCE_TYPE_NAME_"] = data["SOURCE_TYPE_NAME_"]
    # .get keeps this safe when the source type is absent; such records are
    # classified as self-service outlets.
    if re_data.get("SOURCE_TYPE_NAME_") == "营业网点":
        re_data["TYPE_NAME_"] = "支行"
        re_data["TYPE_"] = "ZH"
    else:
        re_data["TYPE_NAME_"] = "自助银行"
        re_data["TYPE_"] = "ZZ"
    if "BUSINESS_HOURS_" in data:
        re_data["BUSINESS_HOURS_"] = data["BUSINESS_HOURS_"]
    return re_data
def data_shuffle(data, province_list, city_list, area_list):
    """Build the cleaned output record for one scraped CGB (广发银行) outlet.

    Province and city are first guessed from ``data["CITY_NAME_"]``.  When the
    address can be resolved through ``get_lat_lng`` / ``get_area`` the codes
    derived from the returned ``adcode`` overwrite that first guess.

    :param data: raw scraped record.  ``CITY_NAME_``, ``DATETIME_``,
        ``ADDR_``, ``NAME_``, ``ENTITY_CODE_``, ``ENTITY_NAME_`` and ``URL_``
        are required; ``TEL_``, ``BUSINESS_HOURS_`` and ``SOURCE_TYPE_NAME_``
        are copied when present.
    :param province_list: dicts providing ``NAME_`` and ``CODE_``.
    :param city_list: dicts providing ``NAME_``, ``CODE_`` and ``PARENT_``.
    :param area_list: not used by this function; kept so the signature stays
        the same as the other ``data_shuffle`` variants.
    :return: dict holding the cleaned record.
    :raises KeyError: if a required key is missing from ``data``.
    """
    re_data = dict()

    # First guess of province / city from the scraped city name.
    # NOTE(review): the city_list lookup is treated as the fallback that only
    # runs when no municipality name matches -- confirm this is the intent.
    municipality = next(
        (name for name in ("北京市", "天津市", "上海市", "重庆市")
         if name in data["CITY_NAME_"]),
        None,
    )
    if municipality is not None:
        for pro in province_list:
            if pro["NAME_"] == municipality:
                re_data["PROVINCE_NAME_"] = pro["NAME_"]
                re_data["PROVINCE_CODE_"] = pro["CODE_"]
                # A municipality is also used as its own city; the city code
                # is built from the first three characters of its code.
                re_data["CITY_NAME_"] = pro["NAME_"]
                re_data["CITY_CODE_"] = pro["CODE_"][:3] + "100"
                break
    else:
        for city in city_list:
            if city["NAME_"] in data["CITY_NAME_"]:
                re_data["CITY_NAME_"] = city["NAME_"]
                re_data["CITY_CODE_"] = city["CODE_"]
                re_data["PROVINCE_CODE_"] = city["PARENT_"]
                break
        if re_data.get("PROVINCE_CODE_"):
            for pro in province_list:
                if pro["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = pro["NAME_"]
                    break

    # "C" column family
    re_data["BANK_CODE_"] = "CGB"
    re_data["BANK_NAME_"] = "广发银行"
    re_data["SPIDER_TIME_"] = data["DATETIME_"]

    # "F" column family
    # Scraped addresses look like "地址:宝山区牡丹江路1211号"; drop the label.
    re_data["ADDR_"] = data["ADDR_"].replace("地址:", "")
    re_data["NAME_"] = data["NAME_"]

    # Resolve coordinates; a missing key in the response means "not found".
    result = get_lat_lng(address=re_data["ADDR_"])
    try:
        location = result["result"]["location"]
        re_data["LAT_"] = str(location["lat"])
        re_data["LNG_"] = str(location["lng"])
    except KeyError:
        re_data["LAT_"] = ""
        re_data["LNG_"] = ""
    else:
        dis_result = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
        try:
            re_data["AREA_NAME_"] = dis_result["result"]["addressComponent"][
                "district"]
        except KeyError:
            re_data["AREA_NAME_"] = ""
        try:
            re_data["AREA_CODE_"] = dis_result["result"]["addressComponent"][
                "adcode"]
        except KeyError:
            re_data["AREA_CODE_"] = ""
        else:
            # City / province codes are prefixes of the area code padded
            # with "00"; they replace the name-based guess made above.
            re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
            re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            for city in city_list:
                if city["CODE_"] == re_data["CITY_CODE_"]:
                    re_data["CITY_NAME_"] = city["NAME_"]
                    break
            for prov in province_list:
                if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = prov["NAME_"]
                    break

    re_data["UNIT_CODE_"] = "CGB" + "_" + re_data.get("CITY_CODE_", "")
    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["URL_"] = data["URL_"]
    if "TEL_" in data:
        # Scraped value looks like "电话:02168037370"; drop the label.
        re_data["TEL_"] = data["TEL_"].replace("电话:", "")
    if "BUSINESS_HOURS_" in data:
        re_data["BUSINESS_HOURS_"] = data["BUSINESS_HOURS_"]
    if "SOURCE_TYPE_NAME_" in data:
        re_data["SOURCE_TYPE_NAME_"] = data["SOURCE_TYPE_NAME_"]
    # Every record is currently classified as a self-service outlet.
    re_data["TYPE_NAME_"] = "自助银行"
    re_data["TYPE_"] = "ZZ"
    return re_data
def data_shuffle(data, province_list, city_list, area_list):
    """Build the cleaned output record for one scraped SPDB (浦发银行) outlet.

    The province and city are matched by the first two characters of their
    names, the address is rewritten so that it carries the full province and
    city names, and the result is then resolved through ``get_lat_lng`` /
    ``get_area`` to fill coordinates and administrative codes.

    :param data: raw scraped record.  ``PROVINCE_NAME_``, ``CITY_NAME_``,
        ``ADDR_``, ``DATETIME_``, ``NAME_``, ``ENTITY_CODE_``,
        ``ENTITY_NAME_`` and ``URL_`` are required; ``TEL_`` and
        ``SOURCE_TYPE_NAME_`` are copied when present.
    :param province_list: dicts providing ``NAME_`` and ``CODE_``.
    :param city_list: dicts providing ``NAME_`` and ``CODE_``.
    :param area_list: not used; an earlier area-name extraction produced
        values that were never read and has been removed.
    :return: dict holding the cleaned record.
    :raises KeyError: if a required key is missing from ``data`` or if no
        entry of ``province_list`` matches ``data["PROVINCE_NAME_"]``.
    """
    re_data = dict()

    # Province: match on the first two characters of the province name.
    for prov in province_list:
        if prov["NAME_"][:2] in data["PROVINCE_NAME_"]:
            re_data["PROVINCE_NAME_"] = prov["NAME_"]
            re_data["PROVINCE_CODE_"] = prov["CODE_"]
            break

    # City: restricted to cities whose code shares the province prefix.
    re_data["CITY_NAME_"] = ''
    for city in city_list:
        if (city["CODE_"][:2] == re_data["PROVINCE_CODE_"][:2]
                and city["NAME_"][:2] in data["CITY_NAME_"]):
            re_data["CITY_NAME_"] = city["NAME_"]
            re_data["CITY_CODE_"] = city["CODE_"]
            break

    # Address clean-up, step 1: make sure the full province name is present.
    prov_n = re_data["PROVINCE_NAME_"]
    city_n = re_data["CITY_NAME_"]
    raw_addr = data["ADDR_"]
    if prov_n in raw_addr:
        addr_ = raw_addr
    else:
        head, tail = raw_addr[:len(prov_n)], raw_addr[len(prov_n):]
        # Try progressively shorter abbreviations of the province name at
        # the start of the address and expand the first one that is found.
        for short in (prov_n[:-1], prov_n[:4], prov_n[:3], prov_n[:2]):
            if short in head:
                addr_ = head.replace(short, prov_n) + tail
                break
        else:
            addr_ = prov_n + raw_addr

    # Step 2: same treatment for the city name, looked for within the first
    # len(province) + len(city) characters of the address.
    cut = len(prov_n) + len(city_n)
    head, tail = addr_[:cut], addr_[cut:]
    if city_n not in head:
        for short in (city_n[:-1], city_n[:4], city_n[:3], city_n[:2]):
            if short in head:
                addr_ = head.replace(short, city_n) + tail
                break
        else:
            # No trace of the city: insert it right after the province.
            addr_ = addr_[:len(prov_n)] + city_n + addr_[len(prov_n):]

    re_data["BANK_CODE_"] = "SPDB"
    re_data["BANK_NAME_"] = "浦发银行"
    re_data["SPIDER_TIME_"] = data["DATETIME_"]
    # "F" column family
    re_data["ADDR_"] = addr_

    # Resolve coordinates; a missing key in the response means "not found".
    result = get_lat_lng(address=re_data["ADDR_"])
    try:
        location = result["result"]["location"]
        re_data["LAT_"] = str(location["lat"])
        re_data["LNG_"] = str(location["lng"])
    except KeyError:
        re_data["LAT_"] = ""
        re_data["LNG_"] = ""
    else:
        dis_result = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
        try:
            re_data["AREA_NAME_"] = dis_result["result"]["addressComponent"][
                "district"]
        except KeyError:
            re_data["AREA_NAME_"] = ""
        try:
            re_data["AREA_CODE_"] = dis_result["result"]["addressComponent"][
                "adcode"]
        except KeyError:
            re_data["AREA_CODE_"] = ""
        else:
            # Codes derived from the area code replace the name-based match.
            re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
            re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            for city in city_list:
                if city["CODE_"] == re_data["CITY_CODE_"]:
                    re_data["CITY_NAME_"] = city["NAME_"]
                    break
            for prov in province_list:
                if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = prov["NAME_"]
                    break

    # When the source lists the same name as province and city, reuse the
    # province name as the city name.
    if data["PROVINCE_NAME_"] == data["CITY_NAME_"]:
        re_data["CITY_NAME_"] = re_data["PROVINCE_NAME_"]

    re_data["UNIT_CODE_"] = "SPDB" + "_" + re_data.get("CITY_CODE_", "")
    re_data["NAME_"] = data["NAME_"]
    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["URL_"] = data["URL_"]
    if "TEL_" in data:
        re_data["TEL_"] = data["TEL_"]
    if "SOURCE_TYPE_NAME_" in data:
        re_data["SOURCE_TYPE_NAME_"] = data["SOURCE_TYPE_NAME_"]
    re_data["TYPE_NAME_"] = "支行"
    re_data["TYPE_"] = "ZH"
    return re_data
def data_shuffle(data, province_list, city_list, area_list):
    """Build the cleaned output record for one scraped outlet (bank "BCM").

    The city and province are matched by exact name, the address is rewritten
    to carry the full province and city names, and the result is resolved
    through ``get_lat_lng`` / ``get_area`` to fill coordinates and codes.

    :param data: raw scraped record.  ``CITY_NAME_``, ``AREA_NAME_``,
        ``ADDR_``, ``ENTITY_NAME_``, ``DATETIME_``, ``NAME_``,
        ``ENTITY_CODE_`` and ``URL_`` are read; ``TEL_``,
        ``BUSINESS_HOURS_`` and ``SOURCE_TYPE_NAME_`` are copied if present.
    :param province_list: dicts providing ``NAME_`` and ``CODE_``.
    :param city_list: dicts providing ``NAME_``, ``CODE_`` and ``PARENT_``.
    :param area_list: dicts providing ``NAME_``, ``CODE_`` and ``PARENT_``.
    :return: dict holding the cleaned record.
    """
    province_code = province_name = ""
    city_code = city_name = ""
    area_code = area_name = ""

    # Exact-name city match; the last matching entry wins.
    for entry in city_list:
        if entry["NAME_"] == data["CITY_NAME_"]:
            city_name = entry["NAME_"]
            city_code = entry["CODE_"]
            province_code = entry["PARENT_"]
    for entry in province_list:
        if entry["CODE_"] == province_code:
            province_name = entry["NAME_"]

    # No city matched: the scraped "city" may actually be a province-level
    # name, in which case it is used for both province and city.
    if not city_code:
        for entry in province_list:
            if entry["NAME_"] == data["CITY_NAME_"]:
                province_name = entry["NAME_"]
                province_code = entry["CODE_"]
                city_name = entry["NAME_"]
                city_code = province_code[:2] + "0100"

    # Area match.  NOTE(review): area_name / area_code are not written to the
    # output below; the loop is kept so behaviour stays unchanged.
    for entry in area_list:
        if city_code:
            if (entry["PARENT_"] == city_code
                    and entry["NAME_"] == data["AREA_NAME_"]):
                area_name = entry["NAME_"]
                area_code = entry["CODE_"]
        elif entry["NAME_"] == data["AREA_NAME_"]:
            area_name = entry["NAME_"]
            area_code = entry["CODE_"]
            city_code = entry["PARENT_"]

    # Address clean-up, step 1: make sure the full province name is present.
    raw_addr = data["ADDR_"]
    if province_name in raw_addr:
        addr_ = raw_addr
    else:
        head = raw_addr[:len(province_name)]
        tail = raw_addr[len(province_name):]
        # Expand the first abbreviation found at the start of the address.
        for short in (province_name[:-1], province_name[:4],
                      province_name[:3], province_name[:2]):
            if short in head:
                addr_ = head.replace(short, province_name) + tail
                break
        else:
            addr_ = province_name + raw_addr

    # Step 2: same treatment for the city name.
    cut = len(province_name) + len(city_name)
    head, tail = addr_[:cut], addr_[cut:]
    if city_name not in head:
        for short in (city_name[:-1], city_name[:4],
                      city_name[:3], city_name[:2]):
            if short in head:
                addr_ = head.replace(short, city_name) + tail
                break
        else:
            addr_ = (addr_[:len(province_name)] + city_name
                     + addr_[len(province_name):])

    re_data = {
        # "C" column family
        "BANK_CODE_": "BCM",
        "BANK_NAME_": data["ENTITY_NAME_"][:-3],
        "SPIDER_TIME_": data["DATETIME_"],
        # "F" column family
        "ADDR_": addr_,
        "NAME_": data["NAME_"],
    }

    # Resolve coordinates; a missing key in the response means "not found".
    geo = get_lat_lng(address=re_data["ADDR_"])
    try:
        point = geo["result"]["location"]
        re_data["LAT_"] = str(point["lat"])
        re_data["LNG_"] = str(point["lng"])
    except KeyError:
        re_data["LAT_"] = ""
        re_data["LNG_"] = ""
    else:
        reverse = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
        try:
            re_data["AREA_NAME_"] = reverse["result"]["addressComponent"]["district"]
        except KeyError:
            re_data["AREA_NAME_"] = ""
        try:
            re_data["AREA_CODE_"] = reverse["result"]["addressComponent"]["adcode"]
        except KeyError:
            re_data["AREA_CODE_"] = ""
        else:
            # City / province codes are prefixes of the area code + "00".
            re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
            re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            for entry in city_list:
                if entry["CODE_"] == re_data["CITY_CODE_"]:
                    re_data["CITY_NAME_"] = entry["NAME_"]
                    break
            for entry in province_list:
                if entry["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = entry["NAME_"]
                    break

    re_data["UNIT_CODE_"] = "BCM" + "_" + re_data.get("CITY_CODE_", "")
    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["URL_"] = data["URL_"]
    # Optional fields are copied through only when the scraper supplied them.
    for optional in ("TEL_", "BUSINESS_HOURS_", "SOURCE_TYPE_NAME_"):
        if optional in data:
            re_data[optional] = data[optional]
    re_data["TYPE_NAME_"] = "支行"
    re_data["TYPE_"] = "ZH"
    return re_data
def data_shuffle(data, province_list, city_list, area_list):
    """Build the cleaned output record for one scraped outlet (bank "NRCB").

    The province is forced to '广东' (this is also written back into
    ``data["PROVINCE_NAME_"]``), the city is matched by the first two
    characters of its name, and the address is resolved through
    ``get_lat_lng`` / ``get_area`` to fill coordinates and codes.

    :param data: raw scraped record; mutated (``PROVINCE_NAME_`` is set).
        ``CITY_NAME_``, ``AREA_NAME_``, ``ENTITY_NAME_``, ``DATETIME_``,
        ``ADDR_``, ``NAME_``, ``ENTITY_CODE_`` and ``URL_`` are read;
        ``TEL_`` and ``SOURCE_TYPE_NAME_`` are copied when present.
    :param province_list: dicts providing ``NAME_`` and ``CODE_``.
    :param city_list: dicts providing ``NAME_`` and ``CODE_``.
    :param area_list: dicts providing ``NAME_`` and ``CODE_``.
    :return: dict holding the cleaned record.
    """
    re_data = dict()

    # Province: every record of this source is assigned the fixed province.
    fixed_province = '广东'
    data["PROVINCE_NAME_"] = fixed_province
    for entry in province_list:
        if entry["NAME_"][:2] in fixed_province:
            re_data["PROVINCE_NAME_"] = entry["NAME_"]
            re_data["PROVINCE_CODE_"] = entry["CODE_"]
            break

    # City: restricted to cities whose code shares the province prefix.
    re_data["CITY_NAME_"] = ''
    for entry in city_list:
        if (entry["CODE_"][:2] == re_data["PROVINCE_CODE_"][:2]
                and entry["NAME_"][:2] in data["CITY_NAME_"]):
            re_data["CITY_NAME_"] = entry["NAME_"]
            re_data["CITY_CODE_"] = entry["CODE_"]
            break

    # Area match.  NOTE(review): area_n / area_c are not written to the
    # output below; the loop is kept so behaviour stays unchanged.
    for entry in area_list:
        if entry["CODE_"][:2] != re_data["PROVINCE_CODE_"][:2]:
            continue
        if (entry["NAME_"] == data["AREA_NAME_"]
                or entry["NAME_"][:-1] == data["AREA_NAME_"][:-1]):
            area_n = entry["NAME_"]
            area_c = entry["CODE_"]

    # "C" column family
    re_data["BANK_CODE_"] = "NRCB"
    re_data["BANK_NAME_"] = data["ENTITY_NAME_"][:-3]
    re_data["SPIDER_TIME_"] = data["DATETIME_"]
    # "F" column family
    re_data["ADDR_"] = data["ADDR_"]
    re_data["NAME_"] = data["NAME_"]

    # Resolve coordinates; a missing key in the response means "not found".
    geo = get_lat_lng(address=re_data["ADDR_"])
    try:
        point = geo["result"]["location"]
        re_data["LAT_"] = str(point["lat"])
        re_data["LNG_"] = str(point["lng"])
    except KeyError:
        re_data["LAT_"] = ""
        re_data["LNG_"] = ""
    else:
        reverse = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
        try:
            re_data["AREA_NAME_"] = reverse["result"]["addressComponent"]["district"]
        except KeyError:
            re_data["AREA_NAME_"] = ""
        try:
            re_data["AREA_CODE_"] = reverse["result"]["addressComponent"]["adcode"]
        except KeyError:
            re_data["AREA_CODE_"] = ""
        else:
            # City / province codes are prefixes of the area code + "00".
            re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
            re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            for entry in city_list:
                if entry["CODE_"] == re_data["CITY_CODE_"]:
                    re_data["CITY_NAME_"] = entry["NAME_"]
                    break
            for entry in province_list:
                if entry["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = entry["NAME_"]
                    break

    re_data["UNIT_CODE_"] = "NRCB" + "_" + re_data.get("CITY_CODE_", "")
    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["URL_"] = data["URL_"]
    if "TEL_" in data:
        re_data["TEL_"] = data["TEL_"]
    # Business hours are a fixed value for this source.
    re_data["BUSINESS_HOURS_"] = "0:00-24:00"
    if "SOURCE_TYPE_NAME_" in data:
        re_data["SOURCE_TYPE_NAME_"] = data["SOURCE_TYPE_NAME_"]
    re_data["TYPE_NAME_"] = "支行"
    re_data["TYPE_"] = "ZH"
    return re_data
def data_shuffle(data, province_list, city_list, area_list):
    """Build the cleaned output record for one scraped outlet (bank "CEB").

    Coordinates are looked up from the outlet *name* (not its address) via
    ``get_lat_lng``; area, city and province are then derived from the
    ``adcode`` returned by ``get_area``.

    :param data: raw scraped record.  ``ENTITY_NAME_``, ``DATETIME_``,
        ``NAME_``, ``ENTITY_CODE_``, ``DEALTIME_``, ``URL_`` and ``TEL_``
        are all required.
    :param province_list: dicts providing ``NAME_`` and ``CODE_``.
    :param city_list: dicts providing ``NAME_`` and ``CODE_``.
    :param area_list: not used by this function.
    :return: dict holding the cleaned record.
    :raises KeyError: if a required key is missing from ``data``.
    """
    re_data = {
        # "C" column family
        "BANK_CODE_": "CEB",
        "BANK_NAME_": data["ENTITY_NAME_"][:-2],
        "CREATE_TIME_": data["DATETIME_"],
        # "F" column family
        # NOTE(review): ADDR_ is always emitted empty because the address
        # clean-up for this source is disabled -- confirm this is intended.
        "ADDR_": "",
        "NAME_": data["NAME_"],
    }

    # Resolve coordinates; a missing key in the response means "not found".
    geo = get_lat_lng(address=re_data["NAME_"])
    try:
        point = geo["result"]["location"]
        re_data["LAT_"] = str(point["lat"])
        re_data["LNG_"] = str(point["lng"])
    except KeyError:
        re_data["LAT_"] = ""
        re_data["LNG_"] = ""
    else:
        reverse = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
        try:
            re_data["AREA_NAME_"] = reverse["result"]["addressComponent"][
                "district"]
        except KeyError:
            re_data["AREA_NAME_"] = ""
        try:
            re_data["AREA_CODE_"] = reverse["result"]["addressComponent"][
                "adcode"]
        except KeyError:
            re_data["AREA_CODE_"] = ""
        else:
            # City / province codes are prefixes of the area code + "00".
            re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
            re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            for entry in city_list:
                if entry["CODE_"] == re_data["CITY_CODE_"]:
                    re_data["CITY_NAME_"] = entry["NAME_"]
                    break
            for entry in province_list:
                if entry["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = entry["NAME_"]
                    break

    re_data["UNIT_CODE_"] = "CEB" + "_" + re_data.get("CITY_CODE_", "")
    # Fields copied verbatim from the scraped record.
    for key in ("ENTITY_CODE_", "DEALTIME_", "URL_", "TEL_"):
        re_data[key] = data[key]
    re_data["BUSINESS_HOURS_"] = ""
    return re_data
def generic_shuffle(self, data):
    """Clean one scraped hospital record and wrap it for insertion.

    Coordinates are looked up from ``data["ADDR_"]`` first and, if that
    lookup ends in a ``KeyError``, from ``CITY_NAME_ + NAME_``.  When a
    longitude is available the province / city / area fields are filled from
    ``get_area``.  Known optional fields are then copied through and the
    record is passed to the parent class' ``generic_shuffle``.

    :param data: raw scraped record (dict).
    :return: one-element list ``[{"TABLE_NAME_": ..., "DATA_": ...}]``.
    """
    re_data = dict()
    re_data["ID_"] = req_for_serial_number(code="WD_SS_YY")
    # Time dimension: first ten characters of DATETIME_ without dashes.
    re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace("-", "")
    # Tags are reset to an empty string when the source carries the field.
    if "TAGS_" in data:
        re_data["TAGS_"] = ""
    # SOURCE_: scheme + host part of the page URL.
    re_data["SOURCE_"] = re.findall(r"(https?://.*?)/", data["URL_"])[0]
    # Source name: text before the first "-" of the entity name.
    re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]
    # Source category: characters 3..7 of the entity code.
    re_data["SOURCE_TYPE_"] = data["ENTITY_CODE_"][3:8]

    # Coordinates.  Each attempt builds its address lazily so that a missing
    # key in ``data`` is handled like a failed lookup (KeyError).
    attempts = (
        lambda: data["ADDR_"],
        lambda: data["CITY_NAME_"]+data["NAME_"],
    )
    for build_address in attempts:
        try:
            location = get_lat_lng(address=build_address())["result"]["location"]
            re_data["LAT_"] = location["lat"]
            re_data["LNG_"] = location["lng"]
        except KeyError:
            # Fall through to the next candidate address, if any.
            continue
        except Exception as e:
            re_data["LAT_"] = None
            re_data["LNG_"] = None
            self.logger.info("获取经纬度失败错误为{}".format(e))
            break
        else:
            break
    else:
        # Every candidate ended in a KeyError.
        re_data["LAT_"] = None
        re_data["LNG_"] = None

    # Province / city / area from the coordinates.
    if re_data["LNG_"]:
        try:
            area_result = get_area(",".join([str(re_data["LAT_"]), str(re_data["LNG_"])]))
        except Exception as e:
            self.logger.info("获取地址信息失败错误为{}".format(e))
        else:
            try:
                component = area_result["result"]["addressComponent"]
                re_data["PROVINCE_NAME_"] = component["province"]
                re_data["CITY_NAME_"] = component["city"]
                re_data["AREA_NAME_"] = component["district"]
                re_data["AREA_CODE_"] = component["adcode"]
                re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
                re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            except KeyError:
                # Best effort: keep whatever was filled before the gap.
                pass

    # Optional fields copied verbatim when present: equipment, hospital
    # grade, specialities, phone, hospital id, name, address, beds, hospital
    # type, website, outpatient volume.
    passthrough = ("DEVICE_", "GRADE_", "SPECIAL_", "TEL_", "HOSPITAL_ID_",
                   "NAME_", "ADDR_", "BEDS_", "TYPE_", "WEBSITE_", "VOLNUM_")
    for key in passthrough:
        if key in data:
            re_data[key] = data[key]

    re_data = super(Branchssyy, self).generic_shuffle(data=data, re_data=re_data, field=None)
    return [{"TABLE_NAME_": self.p_client.table_name, "DATA_": re_data}]
def data_shuffle(data, province_list, city_list, area_list):
    """Expand one scraped city page (bank "CZB") into cleaned outlet records.

    The page at ``data["URL_"]`` is fetched with ``req_for_something``; its
    body is stripped of everything except word characters, ``|`` and ``,``
    and then split into ``|``-separated records of ``,``-separated fields.
    Each record yields one output dict whose address is normalised and then
    resolved through ``get_lat_lng`` / ``get_area``.

    :param data: raw scraped record.  ``CITY_NAME_``, ``URL_``,
        ``ENTITY_NAME_``, ``DATETIME_`` and ``ENTITY_CODE_`` are required;
        ``SOURCE_TYPE_NAME_`` is copied when present.
    :param province_list: dicts providing ``NAME_`` and ``CODE_``.
    :param city_list: dicts providing ``NAME_``, ``CODE_`` and ``PARENT_``.
        Entries named "县" are removed from this list **in place**.
    :param area_list: not used by this function.
    :return: list of cleaned record dicts, one per parsed outlet.
    """
    data_list = list()

    # Drop the placeholder "县" entries.  The list is rebuilt in one go;
    # calling remove() while iterating skips the entry after each removed
    # one.  Slice assignment keeps the change visible to the caller.
    city_list[:] = [city for city in city_list if city["NAME_"] != "县"]

    prov_c = ""
    prov_n = ""
    city_c = ""
    city_n = ""

    # Province / city: municipalities are hard-coded, everything else is
    # matched against city_list by the city name without its last character.
    municipalities = (
        ("北京", "北京市", "1100", "110100"),
        ("天津", "天津市", "1200", "120100"),
        ("上海", "上海市", "3100", "310100"),
        ("重庆", "重庆市", "5000", "500100"),
    )
    for keyword, full_name, province_code, city_code in municipalities:
        if keyword in data["CITY_NAME_"]:
            prov_n = city_n = full_name
            prov_c = province_code
            city_c = city_code
            break
    else:
        for city in city_list:
            if city["NAME_"][:-1] in data["CITY_NAME_"]:
                city_n = city["NAME_"]
                city_c = city["CODE_"]
                prov_c = city["PARENT_"]
                break
        if prov_c:
            for prov in province_list:
                if prov["CODE_"] == prov_c:
                    prov_n = prov["NAME_"]
                    break

    response = req_for_something(data["URL_"])
    payload = re.sub(r"[^\w|,]+", "", response.content.decode("utf-8"))
    for each in payload.split("|"):
        message = each.split(",")
        # Fields 2..6 are read below; skip fragments that are too short to
        # be an outlet record instead of failing with IndexError.
        if len(message) < 7:
            continue
        name = message[2]
        addr_ = message[3]
        tel = message[4]
        business_time = message[5] + message[6]

        # Address clean-up, step 1: ensure the full province name is there.
        if prov_n not in addr_:
            head, tail = addr_[:len(prov_n)], addr_[len(prov_n):]
            # Expand the first abbreviation found at the start.
            for short in (prov_n[:-1], prov_n[:4], prov_n[:3], prov_n[:2]):
                if short in head:
                    addr_ = head.replace(short, prov_n) + tail
                    break
            else:
                addr_ = prov_n + addr_

        # Step 2: same treatment for the city name.
        cut = len(prov_n) + len(city_n)
        head, tail = addr_[:cut], addr_[cut:]
        if city_n not in head:
            for short in (city_n[:-1], city_n[:4], city_n[:3], city_n[:2]):
                if short in head:
                    addr_ = head.replace(short, city_n) + tail
                    break
            else:
                addr_ = addr_[:len(prov_n)] + city_n + addr_[len(prov_n):]

        re_data = dict()
        # "C" column family
        re_data["BANK_CODE_"] = "CZB"
        re_data["BANK_NAME_"] = data["ENTITY_NAME_"][:-3]
        re_data["SPIDER_TIME_"] = data["DATETIME_"]
        # "F" column family
        re_data["ADDR_"] = addr_
        re_data["PROVINCE_NAME_"] = prov_n
        re_data["PROVINCE_CODE_"] = prov_c
        re_data["CITY_CODE_"] = city_c
        re_data["CITY_NAME_"] = city_n
        re_data["NAME_"] = name

        # Resolve coordinates; a missing key means "not found".
        result = get_lat_lng(address=re_data["ADDR_"])
        try:
            location = result["result"]["location"]
            re_data["LAT_"] = str(location["lat"])
            re_data["LNG_"] = str(location["lng"])
        except KeyError:
            re_data["LAT_"] = ""
            re_data["LNG_"] = ""
        else:
            dis_result = get_area(
                ",".join([re_data["LAT_"], re_data["LNG_"]]))
            try:
                re_data["AREA_NAME_"] = dis_result["result"][
                    "addressComponent"]["district"]
            except KeyError:
                re_data["AREA_NAME_"] = ""
            try:
                re_data["AREA_CODE_"] = dis_result["result"][
                    "addressComponent"]["adcode"]
            except KeyError:
                re_data["AREA_CODE_"] = ""
            else:
                # Codes derived from the area code replace the name match.
                re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
                re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
                for city in city_list:
                    if city["CODE_"] == re_data["CITY_CODE_"]:
                        re_data["CITY_NAME_"] = city["NAME_"]
                        break
                for prov in province_list:
                    if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                        re_data["PROVINCE_NAME_"] = prov["NAME_"]
                        break

        re_data["UNIT_CODE_"] = "CZB" + "_" + re_data.get("CITY_CODE_", "")
        re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
        re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        re_data["URL_"] = data["URL_"]
        re_data["TEL_"] = tel
        re_data["BUSINESS_HOURS_"] = business_time
        if "SOURCE_TYPE_NAME_" in data:
            re_data["SOURCE_TYPE_NAME_"] = data["SOURCE_TYPE_NAME_"]
        re_data["TYPE_NAME_"] = "支行"
        re_data["TYPE_"] = "ZH"
        data_list.append(re_data)
    return data_list
def main(self):
    """Clean MongoDB documents one by one and upsert them through Phoenix.

    For every document returned by ``self.m_client.search_from_mongodb`` the
    phone and update-time fields are normalised, the address is geocoded
    with ``get_lat_lng``, and the result of ``self.__check_lat`` is written
    with ``self.p_client.upsert_to_phoenix_by_one``.  A document that fails
    any step is logged and skipped.  The cursor is always closed, including
    when an unexpected exception escapes the loop.
    """
    # The target table is not created here.
    # NOTE(review): presumably it already exists -- confirm before running
    # against a fresh environment.
    mongo_data_list = self.m_client.search_from_mongodb(
        collection=self.collection,
        field_name="DEALTIME_",
        field_value={"$gt": "1555136656.0579224"},
        data_id="5cb65fac9bb3df61a09c6625")
    count = 0
    try:
        while True:
            # Fetch the next document.
            try:
                data = next(mongo_data_list)
            except StopIteration:
                break
            except pymongo.errors.ServerSelectionTimeoutError:
                # One retry after a short pause.  The retry can also hit the
                # end of the cursor, which must end the loop, not crash it.
                time.sleep(3)
                try:
                    data = next(mongo_data_list)
                except StopIteration:
                    break

            # Field clean-up.
            try:
                data["PHONE_"] = data["PHONE_"].replace("无,", "")
                u_time_list = re.findall(r"(\d{4}年\d{1,2}月\d{1,2})日",
                                         data["UPDATETIME_"])
                if u_time_list:
                    # "2019年4月3" -> "2019-04-03"
                    u_ = u_time_list[0].replace("年", "-").replace("月", "-")
                    year, month, day = u_.split("-")
                    data["UPDATETIME_"] = "-".join(
                        [year, month.zfill(2), day.zfill(2)])
            except Exception as e:
                self.logger.exception(
                    f"数据清洗出错, _id: {data['_id']}, error {e}")
                continue

            # Coordinates.  Documents without an address are skipped; a
            # non-zero status is only logged and the document is still
            # written.
            try:
                if not data["ADDRESS_"]:
                    continue
                # Keep everything after the first "|"-separated segment.
                data["ADDRESS_"] = "".join(data["ADDRESS_"].split("|")[1:])
                location_result = get_lat_lng(address=data["ADDRESS_"])
                if location_result["status"] == 0:
                    data["LNG_"] = str(
                        location_result["result"]["location"]["lng"])
                    data["LAT_"] = str(
                        location_result["result"]["location"]["lat"])
                else:
                    self.logger.warning(f"_id: {data['_id']} 获取经纬度失败")
            except Exception as e:
                self.logger.exception(
                    f"_id: {data['_id']} 获取经纬度失败, error: {e}")
                continue

            # Upsert a single row.
            try:
                re_data = self.__check_lat(data=data)
                self.p_client.upsert_to_phoenix_by_one(
                    connection=self.connection, data=re_data)
                count += 1
                if count % 100 == 0:
                    self.logger.info(
                        f"HBase 插入成功, _id: {data['_id']}, 成功条数 {count}")
            except Exception as e:
                self.logger.exception(
                    f"HBase 插入失败, _id: {data['_id']}, error: {e}")
                continue
    finally:
        # Close the MongoDB cursor.
        mongo_data_list.close()
    self.logger.info(
        f"collection: {self.m_client.mongo_collection} 的数据清洗完毕, 成功条数共计: {count} 条"
    )
def generic_shuffle(self, data):
    """Clean one scraped Mapbar (图吧) POI record.

    The address is geocoded with ``get_lat_lng`` and the administrative
    fields are filled from ``get_area``.  If no city could be derived that
    way, the city is guessed from ``data["TYPE_"]``.  Records whose
    ``TYPE_`` contains "道路" additionally produce a main-route row.

    :param data: raw scraped record.  ``NAME_``, ``ADDRESS_``, ``URL_``,
        ``TYPE_``, ``BTYPE_``, ``PHONE_``, ``BUS_``, ``BUSSTOP_`` and
        ``_id`` are read.
    :return: list of ``{"TABLE_NAME_": ..., "DATA_": ...}`` dicts: always a
        "CHA_BRANCH_FACILITY" row, plus a "CHA_BRANCH_MAIN_ROUTE" row for
        road records.
    :raises Exception: with message "暂不需要清洗的数据" when ``BTYPE_`` is
        not a key of ``self.type1_dict``.
    :raises IndexError: if ``data["URL_"]`` has no "/" after the host part.
    """
    re_data = dict()
    re_data["ID_"] = req_for_serial_number(code="MAPBAR")
    re_data["NAME_"] = data["NAME_"]
    # Strip separators and the leading label from the scraped address.
    re_data["ADDRESS_"] = data["ADDRESS_"].replace("|", "").replace(
        "地址:", "")
    re_data["HOT_"] = 0
    # Source URL: scheme + host part of the page URL.
    source = re.findall(r"(https?://.*?)/", data["URL_"])
    re_data["SOURCE_"] = source[0]
    # Source site name / category.
    re_data["SOURCE_NAME_"] = "图吧"
    re_data["SOURCE_TYPE_"] = "图吧"

    # Coordinates.  Both keys start empty so the check further down never
    # hits a missing key when the lookup below raises.
    re_data["LNG_"] = ""
    re_data["LAT_"] = ""
    try:
        if re_data["ADDRESS_"]:
            location_result = get_lat_lng(address=re_data["ADDRESS_"])
            if location_result["status"] == 0:
                location = location_result["result"]["location"]
                # Read both values before storing so that a half-complete
                # response never leaves only one coordinate set.
                lng, lat = str(location["lng"]), str(location["lat"])
                re_data["LNG_"] = lng
                re_data["LAT_"] = lat
            else:
                self.logger.warning(f"_id: {data['_id']} 获取经纬度失败")
    except Exception as e:
        self.logger.exception(f"_id: {data['_id']} 获取经纬度失败, error: {e}")

    if re_data["LAT_"]:
        try:
            area_result = get_area(",".join(
                [str(re_data["LAT_"]), str(re_data["LNG_"])]))
        except Exception as e:
            self.logger.exception(f"_id: {data['_id']} 获取地址失败, error: {e}")
        else:
            try:
                re_data["AREA_NAME_"] = area_result["result"][
                    "addressComponent"]["district"]
            except KeyError:
                re_data["AREA_NAME_"] = ""
            try:
                re_data["AREA_CODE_"] = area_result["result"][
                    "addressComponent"]["adcode"]
            except KeyError:
                re_data["AREA_CODE_"] = ""
            else:
                # City / province codes are prefixes of the area code + "00".
                re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
                re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
                for city in self.city_list:
                    if city["CODE_"] == re_data["CITY_CODE_"]:
                        re_data["CITY_NAME_"] = city["NAME_"]
                        break
                for prov in self.province_list:
                    if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                        re_data["PROVINCE_NAME_"] = prov["NAME_"]
                        break

    # Fallback: guess the city from TYPE_.
    # NOTE(review): TYPE_ presumably starts with a two-character city
    # prefix, since TYPE_[2:] is used as the sub-category below -- confirm.
    if not re_data.get("CITY_NAME_", ""):
        for city in self.city_list:
            if city["NAME_"][:2] in data["TYPE_"]:
                re_data["CITY_CODE_"] = city["CODE_"]
                re_data["CITY_NAME_"] = city["NAME_"]
                break
        if re_data.get("CITY_NAME_", ""):
            for prov in self.province_list:
                if prov["CODE_"][:2] == re_data["CITY_CODE_"][:2]:
                    re_data["PROVINCE_CODE_"] = prov["CODE_"]
                    re_data["PROVINCE_NAME_"] = prov["NAME_"]
                    break

    # CHA_BRANCH_MAIN_ROUTE (main roads): built from a snapshot of the
    # fields collected so far, with ADDRESS_ renamed to ADDR_.
    road_shuffle_data = None
    if "道路" in data["TYPE_"]:
        road_data = dict(re_data)
        road_data["ID_"] = req_for_serial_number(code="WD_GD")
        road_data["ADDR_"] = road_data.pop("ADDRESS_")
        road_shuffle_data = super(MapbarScript, self).generic_shuffle(
            data=data, re_data=road_data, field=None)

    # CHA_BRANCH_FACILITY (Mapbar POI)
    re_data["TYPE1_"] = data["BTYPE_"]
    try:
        re_data["TYPE1_CODE_"] = self.type1_dict[re_data["TYPE1_"]]
    except KeyError:
        # Categories without a code are not cleaned yet.
        raise Exception("暂不需要清洗的数据")

    # Sub-category clean-up: several source categories are merged.
    merged_types = {
        "户外运动俱乐部": ("俱乐部", "JLB"),
        "赛马场及马术俱乐部": ("俱乐部", "JLB"),
        "室内运动健身俱乐部": ("俱乐部", "JLB"),
        "连锁店": ("便利店", "BLD"),
        "便利店": ("便利店", "BLD"),
        "电子商城": ("家电数码", "JDSM"),
        "电器商城": ("家电数码", "JDSM"),
        "诊所/卫生所": ("门诊/卫生所", "MZWSS"),
        "门诊/急诊部": ("门诊/卫生所", "MZWSS"),
    }
    source_type2 = data["TYPE_"][2:]
    if source_type2 in merged_types:
        re_data["TYPE2_"], re_data["TYPE2_CODE_"] = merged_types[source_type2]
    else:
        re_data["TYPE2_"] = source_type2
        re_data["TYPE2_CODE_"] = self.type2_dict.get(re_data["TYPE2_"])

    re_data["SOURCE_TYPE1_"] = data["BTYPE_"]
    re_data["SOURCE_TYPE1_CODE_"] = self.type1_dict.get(
        re_data["SOURCE_TYPE1_"])
    re_data["SOURCE_TYPE2_"] = source_type2
    re_data["SOURCE_TYPE2_CODE_"] = self.source_type2_dict.get(
        re_data["SOURCE_TYPE2_"])
    re_data["PHONE_"] = data["PHONE_"].replace("无,", "")
    re_data["BUS_"] = data["BUS_"]
    re_data["BUSSTOP_"] = data["BUSSTOP_"]

    shuffle_data = super(MapbarScript, self).generic_shuffle(
        data=data, re_data=re_data, field=None)

    return_list = [{
        "TABLE_NAME_": TABLE_NAME("CHA_BRANCH_FACILITY"),
        "DATA_": shuffle_data
    }]
    if road_shuffle_data is not None:
        return_list.append({
            "TABLE_NAME_": TABLE_NAME("CHA_BRANCH_MAIN_ROUTE"),
            "DATA_": road_shuffle_data
        })
    return return_list
def generic_shuffle(self, data):
    """
    Split one crawled page into one cleaned record per shopping district.

    ``data["CONTENT_HTML_"]`` is parsed for ``<dl class="list">`` groups.
    The link text of each group's ``<dt>`` is used as the area name and the
    link text of every ``<li>`` as a shopping-district name. Each district
    is geocoded with ``get_lat_lng`` / ``get_area`` and then handed to the
    parent class's ``generic_shuffle``.

    :param data: raw record; reads CONTENT_HTML_, DATETIME_, URL_,
        ENTITY_NAME_, ENTITY_CODE_, CITY_ and (optionally) TAGS_
    :return: list of ``{"TABLE_NAME_": ..., "DATA_": ...}`` dicts, one per
        shopping district found in the page
    """
    # area name -> list of shopping-district names listed under it
    soup = BeautifulSoup(data["CONTENT_HTML_"], "html.parser")
    districts_by_area = {}
    for group in soup.find_all('dl', {"class": "list"}):
        area_label = group.dt.a.string
        districts_by_area[area_label] = [
            entry.a.string for entry in group.find_all('li')
        ]

    records = []
    for area_name, shopping_list in districts_by_area.items():
        for shopping_name in shopping_list:
            # Time dimension: YYYYMMDD taken from the crawl timestamp.
            record = {
                "PERIOD_CODE_": data["DATETIME_"][:10].replace("-", ""),
            }
            # Tags are reset to empty when the raw record carries the field.
            if "TAGS_" in data:
                record["TAGS_"] = ""
            # Source: scheme + host of the crawled URL.
            record["SOURCE_"] = re.findall(r"(https?://.*?)/", data["URL_"])[0]
            # Source name: text before the first "-" of the entity name.
            record["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]
            # Source category: fixed slice of the entity code.
            record["SOURCE_TYPE_"] = data["ENTITY_CODE_"][3:8]
            # Every district gets its own serial number as ID.
            record["ID_"] = req_for_serial_number(code="WD_SS_SQ")

            # Geocode "<city>市<area><district>"; any failure leaves both
            # coordinates as None.
            try:
                geocoded = get_lat_lng(
                    address=data["CITY_"] + "市" + area_name + shopping_name)
                location = geocoded["result"]["location"]
                latitude, longitude = location["lat"], location["lng"]
            except KeyError:
                latitude = longitude = None
            except Exception as e:
                latitude = longitude = None
                self.logger.info("获取经纬度失败错误信息为{}".format(e))
            record["LAT_"] = latitude
            record["LNG_"] = longitude

            # Reverse-geocode to fill in province / city / area fields.
            if record["LAT_"]:
                try:
                    area_result = get_area(
                        ",".join([str(latitude), str(longitude)]))
                except Exception as e:
                    self.logger.info(f"获取地址失败, ERROR: {e}")
                else:
                    try:
                        result = area_result["result"]
                        formatted = result["formatted_address"]
                        component = result["addressComponent"]
                        province = component["province"]
                        city = component["city"]
                        district = component["district"]
                        adcode = component["adcode"]
                    except KeyError:
                        # Incomplete answer: fall back to what the page
                        # itself tells us and blank out the rest.
                        record.update({
                            "ADDR_": shopping_name,
                            "PROVINCE_NAME_": None,
                            "CITY_NAME_": data["CITY_"] + "市",
                            "AREA_NAME_": None,
                            "AREA_CODE_": None,
                            "CITY_CODE_": None,
                            "PROVINCE_CODE_": None,
                        })
                    else:
                        record["ADDR_"] = formatted
                        record["PROVINCE_NAME_"] = province
                        record["CITY_NAME_"] = city
                        record["AREA_NAME_"] = district
                        record["AREA_CODE_"] = adcode
                        # City / province codes are derived from the adcode.
                        record["CITY_CODE_"] = adcode[:4] + "00"
                        record["PROVINCE_CODE_"] = adcode[:2] + "00"

            record["NAME_"] = shopping_name
            record = super(Branchsssq, self).generic_shuffle(
                data=data, re_data=record, field=None)
            records.append({
                "TABLE_NAME_": self.p_client.table_name,
                "DATA_": record,
            })
    return records
def generic_shuffle(self, data):
    """
    Clean one crawled school record.

    Builds the output record from the raw fields, geocodes ``ADDR_`` with
    ``get_lat_lng`` / ``get_area``, normalises the telephone field and then
    hands the result to the parent class's ``generic_shuffle``.

    :param data: raw record; reads DATETIME_, URL_, ENTITY_NAME_,
        ENTITY_CODE_, ADDR_ and the optional TAGS_, NAME_, LEVEL_, IMAGES_,
        SCHOOL_TYPE_, SCHOOL_NATURE_, TEL_ fields
    :return: single-element list ``[{"TABLE_NAME_": ..., "DATA_": ...}]``
    """
    re_data = dict()
    re_data["ID_"] = req_for_serial_number(code="WD_SS_XX")
    # Time dimension: YYYYMMDD taken from the crawl timestamp.
    re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace("-", "")
    # Tags are reset to empty when the raw record carries the field.
    if "TAGS_" in data:
        re_data["TAGS_"] = ""
    # Source: scheme + host of the crawled URL.
    source = re.findall(r"(https?://.*?)/", data["URL_"])
    re_data["SOURCE_"] = source[0]
    # Source name: text before the first "-" of the entity name.
    re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]
    # Source category: fixed slice of the entity code.
    re_data["SOURCE_TYPE_"] = data["ENTITY_CODE_"][3:8]

    # Geocode the address. Every failure path must leave LAT_/LNG_ defined,
    # otherwise the lookup just below raises KeyError.
    try:
        lat_result = get_lat_lng(address=data["ADDR_"])
        re_data["LAT_"] = lat_result["result"]["location"]["lat"]
        re_data["LNG_"] = lat_result["result"]["location"]["lng"]
    except KeyError:
        re_data["LAT_"] = None
        re_data["LNG_"] = None
    except Exception as e:
        re_data["LAT_"] = None
        re_data["LNG_"] = None
        self.logger.info("获取经纬度失败信息为{}".format(e))

    # Reverse-geocode to fill in province / city / area fields.
    if re_data["LAT_"]:
        try:
            area_result = get_area(",".join(
                [str(re_data["LAT_"]), str(re_data["LNG_"])]))
        except Exception as e:
            self.logger.info(f"获取地址失败, ERROR: {e}")
        else:
            try:
                component = area_result["result"]["addressComponent"]
                re_data["PROVINCE_NAME_"] = component["province"]
                re_data["CITY_NAME_"] = component["city"]
                re_data["AREA_NAME_"] = component["district"]
                re_data["AREA_CODE_"] = component["adcode"]
                # City / province codes are derived from the adcode.
                re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
                re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            except KeyError:
                # Best effort: keep whatever was filled in before the gap.
                pass

    # School name
    if "NAME_" in data:
        re_data["NAME_"] = data["NAME_"]
    # Level attribute (city / district / national key school)
    if "LEVEL_" in data:
        re_data["LEVEL_"] = data["LEVEL_"]
    # Image: downloaded and stored base64-encoded.
    if "IMAGES_" in data:
        if data["IMAGES_"]:
            response = req_for_something(url=data["IMAGES_"])
            if response:
                encoded = base64.b64encode(response.content)
                re_data["IMAGES_"] = encoded.decode("utf-8")
    # School type
    if "SCHOOL_TYPE_" in data:
        re_data["SCHOOL_TYPE_"] = data["SCHOOL_TYPE_"]
    # School nature
    if "SCHOOL_NATURE_" in data:
        re_data["SCHOOL_NATURE_"] = data["SCHOOL_NATURE_"]

    # Telephone: several numbers may be glued together without a separator;
    # insert a space between them.
    if "TEL_" in data:
        tel = data["TEL_"]
        # Three-number layouts. They are tested first and against the whole
        # string, because the two-number patterns below are prefix matches
        # and would otherwise always win, leaving the third number glued on.
        triple_patterns = (
            re.compile(r"(\d{3,4}-\d{8})(\d{11})(\d{11})"),
            re.compile(r"(\d{3,4}-\d{8})(\d{8})(\d{8})"),
        )
        # Two-number layouts, in their original priority order.
        pair_patterns = (
            re.compile(r"(\d{3,4}-\d{8})(\d{3,4}-\d{8})"),
            re.compile(r"(\d{3,4}-\d{8})(\d{8})"),
            re.compile(r"(\d{3,4}-\d{8})(\d{11})"),
            re.compile(r"(\d{8})(\d{11})"),
            re.compile(r"(\d{8})(\d{8})"),
            re.compile(r"(\d{3,4}-\d{7})(\d{3,4}-\d{7})"),
            re.compile(r"(\d{3,4}-\d{7})(\d{7})"),
        )
        phone_number = tel
        for pattern in triple_patterns:
            if pattern.fullmatch(tel):
                phone_number = pattern.sub(r"\1 \2 \3", tel)
                break
        else:
            for pattern in pair_patterns:
                if pattern.match(tel):
                    phone_number = pattern.sub(r"\1 \2", tel)
                    break
        re_data["TEL_"] = phone_number

    # Address
    if "ADDR_" in data:
        re_data["ADDR_"] = data["ADDR_"]

    re_data = super(Branchssxx, self).generic_shuffle(data=data,
                                                      re_data=re_data,
                                                      field=None)
    return [{"TABLE_NAME_": self.p_client.table_name, "DATA_": re_data}]
def data_shuffle(data, province_list, city_list, area_list):
    """
    Build the cleaned branch record (BANK_CODE_ "BOC") from a raw record.

    Province and city are first guessed from ``data["AREA_NAME_"]`` using the
    lookup lists, the address is rewritten to start with the full province
    and city names, and the result is geocoded with ``get_lat_lng`` /
    ``get_area``. Codes returned by the geocoder take precedence; the
    guessed province / city are used only where the geocoder gave nothing.

    Side effects: ``data["ADDR_"]`` / ``data["AREA_NAME_"]`` may be rewritten
    and placeholder entries named "县" are removed from ``city_list``.

    :param data: raw record; reads ADDR_, AREA_NAME_, ENTITY_NAME_,
        ENTITY_CODE_, NAME_, DATETIME_, URL_ and the optional TEL_,
        BUSINESS_HOURS_, SOURCE_TYPE_NAME_ fields
    :param province_list: dicts with NAME_ and CODE_
    :param city_list: dicts with NAME_ and CODE_ (modified in place)
    :param area_list: dicts with NAME_, CODE_ and PARENT_
    :return: the cleaned record as a dict
    """

    def _expand_short_name(addr, full_name, offset):
        """Replace an abbreviated ``full_name`` near ``offset`` by the full
        name, or insert the full name at ``offset`` if no abbreviation is
        found in the first ``offset + len(full_name)`` characters."""
        end = offset + len(full_name)
        head, tail = addr[:end], addr[end:]
        for short in (full_name[:-1], full_name[:4],
                      full_name[:3], full_name[:2]):
            if short in head:
                return head.replace(short, full_name) + tail
        return addr[:offset] + full_name + addr[offset:]

    # Drop placeholder "县" entries. Done as one in-place rebuild: calling
    # remove() while iterating the same list skips the element that follows
    # each removed one.
    city_list[:] = [city for city in city_list if city["NAME_"] != "县"]

    re_data = dict()
    prov_n = ""
    prov_c = ""
    city_n = ""
    city_c = ""
    # area_n / area_c are computed but not written to the output at present
    # (the geocoder supplies the AREA_* fields).
    area_n = ""
    area_c = ""

    # Use the full official names of the five autonomous regions.
    if ("内蒙古" in data["ADDR_"][:3] or "广西" in data["ADDR_"][:2]
            or "新疆" in data["ADDR_"][:2] or "宁夏" in data["ADDR_"][:2]
            or "西藏" in data["ADDR_"][:2]):
        if "自治区" not in data["ADDR_"]:
            data["ADDR_"] = data["ADDR_"].replace("内蒙古", "内蒙古自治区")
            data["ADDR_"] = data["ADDR_"].replace("广西", "广西壮族自治区")
            data["ADDR_"] = data["ADDR_"].replace("新疆", "新疆维吾尔自治区")
            data["ADDR_"] = data["ADDR_"].replace("宁夏", "宁夏回族自治区")
            data["ADDR_"] = data["ADDR_"].replace("西藏", "西藏自治区")
    elif "京山县" in data["AREA_NAME_"]:
        data["AREA_NAME_"] = data["AREA_NAME_"].replace("荆州", "荆门")

    # City: first city whose name occurs in the area description.
    for city in city_list:
        if city["NAME_"] in data["AREA_NAME_"]:
            city_n = city["NAME_"]
            city_c = city["CODE_"]
            prov_c = city["CODE_"][:2] + "00"
            break

    # Area: restricted to the matched city when there is one; otherwise any
    # area name found in the description also fixes city / province codes.
    for area in area_list:
        if city_c:
            if area["PARENT_"] == city_c:
                if area["NAME_"] in data["AREA_NAME_"]:
                    area_n = area["NAME_"]
                    area_c = area["CODE_"]
                    break
        else:
            # Two-character names ending in "区" are skipped here.
            if (area["NAME_"][-1] == "区") and (len(area["NAME_"]) == 2):
                continue
            if area["NAME_"] in data["AREA_NAME_"]:
                area_n = area["NAME_"]
                area_c = area["CODE_"]
                city_c = area["CODE_"][:-2] + "00"
                prov_c = area["CODE_"][:2] + "00"
                break

    # Province: by code when known, else by (possibly shortened) name.
    for prov in province_list:
        if prov_c:
            if prov["CODE_"] == prov_c:
                prov_n = prov["NAME_"]
                prov_c = prov["CODE_"]
                break
        else:
            if (prov["NAME_"] in data["AREA_NAME_"]
                    or prov["NAME_"][:-1] in data["AREA_NAME_"]):
                prov_n = prov["NAME_"]
                prov_c = prov["CODE_"]
                break

    if data["AREA_NAME_"] == "洋浦经济开发区":
        # Hard-coded special case.
        prov_n = "海南省"
        prov_c = "4600"
        city_n = "儋州市"
        city_c = "460400"
        area_n = "洋浦经济开发区"
        area_c = ""
    elif ("北京" in data["AREA_NAME_"][:3] or "天津" in data["AREA_NAME_"][:3]
          or "上海" in data["AREA_NAME_"][:3]
          or "重庆" in data["AREA_NAME_"][:3]):
        # Municipalities: the city is the province itself.
        city_n = prov_n
        city_c = prov_c
        if not area_c:
            for area in area_list:
                if area["PARENT_"] == city_c:
                    if area["NAME_"][:2] in data["AREA_NAME_"][
                            -len(area["NAME_"]):]:
                        area_n = area["NAME_"]
                        area_c = area["CODE_"]
                        break
    if not area_c:
        for area in area_list:
            if area["PARENT_"] == city_c:
                if area["NAME_"] in data["ADDR_"]:
                    area_n = area["NAME_"]
                    area_c = area["CODE_"]

    # Address cleaning: make the address start with the full province name
    # followed by the full city name.
    if prov_n in data["ADDR_"]:
        addr_ = data["ADDR_"]
    else:
        addr_ = _expand_short_name(data["ADDR_"], prov_n, 0)
    if city_n not in addr_[:len(prov_n) + len(city_n)]:
        addr_ = _expand_short_name(addr_, city_n, len(prov_n))
    # Strip a bracketed postcode and empty brackets.
    addr_ = re.sub(r"[((][0-9]{5,6}[))]|[((][))]", "", addr_)

    re_data["BANK_CODE_"] = "BOC"
    re_data["BANK_NAME_"] = data["ENTITY_NAME_"][:-3]
    re_data["ADDR_"] = addr_

    # Geocode a shortened form of long addresses: cut after the first "号",
    # or at 50 characters when there is none.
    address = re_data["ADDR_"].replace("#", "号")
    if len(address) > 50:
        address_re = re.findall(r".*?号", address)
        if address_re:
            address = address_re[0]
        else:
            address = address[:50]
    result = get_lat_lng(address=address)
    if result["status"] == 2:
        # Retry once without the last five characters.
        result = get_lat_lng(address=address[:-5])
    try:
        re_data["LAT_"] = str(result["result"]["location"]["lat"])
        re_data["LNG_"] = str(result["result"]["location"]["lng"])
    except KeyError:
        re_data["LAT_"] = ""
        re_data["LNG_"] = ""
    else:
        dis_result = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
        try:
            re_data["AREA_NAME_"] = dis_result["result"]["addressComponent"][
                "district"]
        except KeyError:
            re_data["AREA_NAME_"] = ""
        try:
            re_data["AREA_CODE_"] = dis_result["result"]["addressComponent"][
                "adcode"]
        except KeyError:
            re_data["AREA_CODE_"] = ""
        else:
            # City / province codes are derived from the adcode.
            re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
            re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            for city in city_list:
                if city["CODE_"] == re_data["CITY_CODE_"]:
                    re_data["CITY_NAME_"] = city["NAME_"]
                    break
            for prov in province_list:
                if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = prov["NAME_"]
                    break

    # Fall back to the values guessed from AREA_NAME_ where the geocoder
    # supplied nothing.
    if not re_data.get("PROVINCE_CODE_"):
        re_data["PROVINCE_CODE_"] = prov_c
    if not re_data.get("PROVINCE_NAME_"):
        re_data["PROVINCE_NAME_"] = prov_n
    if not re_data.get("CITY_CODE_"):
        re_data["CITY_CODE_"] = city_c
    if not re_data.get("CITY_NAME_"):
        re_data["CITY_NAME_"] = city_n

    re_data["UNIT_CODE_"] = "BOC" + "_" + re_data.get("CITY_CODE_", "")
    re_data["NAME_"] = data["NAME_"]
    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["SPIDER_TIME_"] = data["DATETIME_"]
    re_data["URL_"] = data["URL_"]
    if "TEL_" in data:
        re_data["TEL_"] = data["TEL_"]
    if "BUSINESS_HOURS_" in data:
        re_data["BUSINESS_HOURS_"] = data["BUSINESS_HOURS_"]
    if "SOURCE_TYPE_NAME_" in data:
        re_data["SOURCE_TYPE_NAME_"] = data["SOURCE_TYPE_NAME_"]
    re_data["TYPE_NAME_"] = "支行"
    re_data["TYPE_"] = "ZH"
    return re_data
def data_shuffle(data, province_list, city_list, area_list):
    """
    Build the cleaned branch record (BANK_CODE_ "SPDB") from a raw record.

    The raw coordinates are reverse-geocoded with ``get_area`` to find the
    province / city; when the geocoder returns no district the names are
    searched for in ``ADDR_`` / ``NAME_`` instead. The address is then
    rewritten to start with the full province and city names and geocoded
    again with ``get_lat_lng`` / ``get_area`` to fill the output record.

    Side effect: location fields (AREA_*, CITY_*, PROVINCE_*, ADDR_) are
    written back into ``data``.

    :param data: raw record; reads LAT_, LNG_ (strings), ADDR_, NAME_,
        ENTITY_NAME_, ENTITY_CODE_, DATETIME_, URL_ and the optional
        SOURCE_TYPE_NAME_
    :param province_list: dicts with NAME_ and CODE_
    :param city_list: dicts with NAME_ and CODE_
    :param area_list: dicts with NAME_, CODE_ and PARENT_
    :return: the cleaned record as a dict
    """

    def _variants(name):
        """Full name, name without its last character, first two chars."""
        return (name, name[:-1], name[:2])

    def _expand_short_name(addr, full_name, offset):
        """Replace an abbreviated ``full_name`` near ``offset`` by the full
        name, or insert the full name at ``offset`` if no abbreviation is
        found in the first ``offset + len(full_name)`` characters."""
        end = offset + len(full_name)
        head, tail = addr[:end], addr[end:]
        for short in (full_name[:-1], full_name[:4],
                      full_name[:3], full_name[:2]):
            if short in head:
                return head.replace(short, full_name) + tail
        return addr[:offset] + full_name + addr[offset:]

    re_data = dict()

    # Step 1: reverse-geocode the raw coordinates.
    dis_result = get_area(",".join([data["LAT_"], data["LNG_"]]))
    try:
        data["AREA_NAME_"] = dis_result["result"]["addressComponent"][
            "district"]
    except KeyError:
        data["AREA_NAME_"] = ""
    try:
        data["AREA_CODE_"] = dis_result["result"]["addressComponent"]["adcode"]
    except KeyError:
        data["AREA_CODE_"] = ""
    else:
        # City / province codes are derived from the adcode.
        data["CITY_CODE_"] = data["AREA_CODE_"][:4] + "00"
        data["PROVINCE_CODE_"] = data["AREA_CODE_"][:2] + "00"
        for city in city_list:
            if city["CODE_"] == data["CITY_CODE_"]:
                data["CITY_NAME_"] = city["NAME_"]
                break
        for prov in province_list:
            if prov["CODE_"] == data["PROVINCE_CODE_"]:
                data["PROVINCE_NAME_"] = prov["NAME_"]
                break

    # Step 2: no district from the geocoder -> look for province / city /
    # area names (full or shortened) in the address and the branch name.
    if not data["AREA_NAME_"]:
        for prov in province_list:
            if any(v in data["ADDR_"] or v in data["NAME_"]
                   for v in _variants(prov["NAME_"])):
                data["PROVINCE_NAME_"] = prov["NAME_"]
                data["PROVINCE_CODE_"] = prov["CODE_"]
                break
        for city in city_list:
            if any(v in data["ADDR_"] or v in data["NAME_"]
                   for v in _variants(city["NAME_"])):
                data["CITY_NAME_"] = city["NAME_"]
                data["CITY_CODE_"] = city["CODE_"]
                break
        # City found but province not: derive the province from the city
        # code. Either key may be absent at this point, hence .get().
        if data.get("CITY_CODE_") and not data.get("PROVINCE_CODE_"):
            data["PROVINCE_CODE_"] = data["CITY_CODE_"][:2] + "00"
            for prov in province_list:
                if prov["CODE_"] == data["PROVINCE_CODE_"]:
                    data["PROVINCE_NAME_"] = prov["NAME_"]
        if data.get("CITY_CODE_"):
            for area in area_list:
                if area["PARENT_"] == data["CITY_CODE_"]:
                    if any(v in data["ADDR_"]
                           for v in _variants(area["NAME_"])):
                        data["AREA_NAME_"] = area["NAME_"]
                        data["AREA_CODE_"] = area["CODE_"]
                        break

    # Municipalities: the city is the province itself.
    if data.get("PROVINCE_NAME_", "no value") in ["北京市", "天津市", "上海市",
                                                  "重庆市"]:
        data["CITY_NAME_"] = data["PROVINCE_NAME_"]
        data["CITY_CODE_"] = data["PROVINCE_CODE_"]

    prov_n = data.get("PROVINCE_NAME_", "")
    city_n = data.get("CITY_NAME_", "")

    # Step 3: address cleaning. Fix two abbreviated region names, then make
    # the address start with the full province name followed by the full
    # city name.
    if ("广西自治区" in data["ADDR_"]) or ("新疆自治区" in data["ADDR_"]):
        data["ADDR_"] = data["ADDR_"].replace("广西自治区", "广西壮族自治区")
        data["ADDR_"] = data["ADDR_"].replace("新疆自治区", "新疆维吾尔自治区")
    if prov_n in data["ADDR_"]:
        addr_ = data["ADDR_"]
    else:
        addr_ = _expand_short_name(data["ADDR_"], prov_n, 0)
    if city_n not in addr_[:len(prov_n) + len(city_n)]:
        addr_ = _expand_short_name(addr_, city_n, len(prov_n))

    # Step 4: build the output record.
    re_data["BANK_CODE_"] = "SPDB"
    re_data["BANK_NAME_"] = data["ENTITY_NAME_"][:-3]
    re_data["SPIDER_TIME_"] = data["DATETIME_"]
    re_data["ADDR_"] = addr_

    # Geocode the cleaned address and reverse-geocode the result.
    result = get_lat_lng(address=re_data["ADDR_"])
    try:
        re_data["LAT_"] = str(result["result"]["location"]["lat"])
        re_data["LNG_"] = str(result["result"]["location"]["lng"])
    except KeyError:
        re_data["LAT_"] = ""
        re_data["LNG_"] = ""
    else:
        dis_result = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
        try:
            re_data["AREA_NAME_"] = dis_result["result"]["addressComponent"][
                "district"]
        except KeyError:
            re_data["AREA_NAME_"] = ""
        try:
            re_data["AREA_CODE_"] = dis_result["result"]["addressComponent"][
                "adcode"]
        except KeyError:
            re_data["AREA_CODE_"] = ""
        else:
            re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
            re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            for city in city_list:
                if city["CODE_"] == re_data["CITY_CODE_"]:
                    re_data["CITY_NAME_"] = city["NAME_"]
                    break
            for prov in province_list:
                if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = prov["NAME_"]
                    break

    re_data["UNIT_CODE_"] = "SPDB" + "_" + re_data.get("CITY_CODE_", "")
    re_data["NAME_"] = data["NAME_"]
    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["URL_"] = data["URL_"]
    if "SOURCE_TYPE_NAME_" in data:
        re_data["SOURCE_TYPE_NAME_"] = data["SOURCE_TYPE_NAME_"]
    re_data["TYPE_NAME_"] = "支行"
    re_data["TYPE_"] = "ZH"
    return re_data
def generic_shuffle(self, data):
    """
    Clean one crawled subway-station record.

    The station is geocoded from "<line name><station name>地铁站", the
    coordinates are refined with a nearby-station search (``get_periphery``)
    and reverse-geocoded with ``get_area``. When ``SUBWAY_NAME_`` lists
    several lines separated by "," one output record is produced per line.

    :param data: raw record; reads DATETIME_, URL_, ENTITY_NAME_,
        ENTITY_CODE_, SUBWAY_NAME_, STATION_NAME_ and the optional TAGS_
    :return: list of ``{"TABLE_NAME_": ..., "DATA_": ...}`` dicts
    """
    re_data = dict()
    re_data["ID_"] = req_for_serial_number(code="WD_JT_DT")
    # Time dimension: YYYYMMDD taken from the crawl timestamp.
    re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace("-", "")
    # Tags are reset to empty when the raw record carries the field.
    if "TAGS_" in data:
        re_data["TAGS_"] = ""
    # Source: scheme + host of the crawled URL.
    source = re.findall(r"(https?://.*?)/", data["URL_"])
    re_data["SOURCE_"] = source[0]
    # Source name: text before the first "-" of the entity name.
    re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]
    # Source category: fixed slice of the entity code.
    re_data["SOURCE_TYPE_"] = data["ENTITY_CODE_"][3:8]

    # Geocoding query: text before the first "|" of the subway name plus
    # the station name. split() keeps the whole name when there is no "|"
    # (slicing with find() would drop the last character in that case).
    line_name = data["SUBWAY_NAME_"].split("|", 1)[0]
    temp_location = line_name + data["STATION_NAME_"] + "地铁站"
    try:
        lat_result = get_lat_lng(address=temp_location)
        re_data["LAT_"] = lat_result["result"]["location"]["lat"]
        re_data["LNG_"] = lat_result["result"]["location"]["lng"]
    except KeyError:
        re_data["LAT_"] = None
        re_data["LNG_"] = None
    except Exception as e:
        re_data["LAT_"] = None
        re_data["LNG_"] = None
        self.logger.info("获取经纬度失败错误信息为{}".format(e))

    if re_data["LAT_"]:
        # Refine: page through subway stations within 3000 of the first
        # guess and take the coordinates of the one whose name matches.
        try:
            lat_origin = ",".join(
                [str(re_data["LAT_"]), str(re_data["LNG_"])])
            page = 0
            found = False
            while True:
                s3 = get_periphery(classify="地铁站",
                                   tag="交通设施",
                                   lat_lng=lat_origin,
                                   radius=3000,
                                   page_num=page)
                for nearby in s3["results"]:
                    if nearby["name"] == data["STATION_NAME_"]:
                        lat = str(nearby["location"]["lat"])
                        lng = str(nearby["location"]["lng"])
                        re_data["LAT_"] = lat
                        re_data["LNG_"] = lng
                        found = True
                        break
                if found:
                    break
                page += 1
                # A page with other than 20 results is treated as the last.
                if len(s3["results"]) != 20:
                    break
        except Exception as e:
            self.logger.info(f"获取精确经纬度失败, ERROR: {e}")

        # Reverse-geocode whichever coordinates we ended up with (refined
        # when the search succeeded, the first guess otherwise).
        try:
            area_result = get_area(",".join(
                [str(re_data["LAT_"]), str(re_data["LNG_"])]))
        except Exception as e:
            self.logger.info(f"获取地址失败, ERROR: {e}")
        else:
            try:
                re_data["PROVINCE_NAME_"] = area_result["result"][
                    "addressComponent"]["province"]
                re_data["CITY_NAME_"] = area_result["result"][
                    "addressComponent"]["city"]
                re_data["AREA_NAME_"] = area_result["result"][
                    "addressComponent"]["district"]
                re_data["AREA_CODE_"] = area_result["result"][
                    "addressComponent"]["adcode"]
                # City / province codes are derived from the adcode.
                re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
                re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            except KeyError:
                # Best effort: keep whatever was filled in before the gap.
                pass

    # Station name (the key was already read above, so it is present).
    re_data["STATION_NAME_"] = data["STATION_NAME_"]

    # Lines passing through the station ("地铁N号线"), comma-joined; empty
    # string when none is found.
    SUBWAY_NAME_ = data["SUBWAY_NAME_"].replace("|", "-")
    re_data["AROUND_ROUTE_"] = ",".join(
        re.findall(r"地铁\d+号线", SUBWAY_NAME_))

    # Subway name: one output record per comma-separated entry.
    if "," in SUBWAY_NAME_:
        re_data_list = list()
        for subway in SUBWAY_NAME_.split(","):
            # Each split-out record needs its own serial number.
            re_data["ID_"] = req_for_serial_number(code="WD_JT_DT")
            re_data["SUBWAY_NAME_"] = subway + "-" + re_data["STATION_NAME_"]
            # NOTE(review): the dict returned by the parent is reused as
            # the input of the next iteration - confirm that is intended.
            re_data = super(Branchjtdt, self).generic_shuffle(data=data,
                                                              re_data=re_data,
                                                              field=None)
            # Deep copy so later iterations cannot alter this entry.
            re_data_list.append(deepcopy({
                "TABLE_NAME_": self.p_client.table_name,
                "DATA_": re_data
            }))
        return re_data_list

    re_data["SUBWAY_NAME_"] = SUBWAY_NAME_ + "-" + re_data["STATION_NAME_"]
    re_data = super(Branchjtdt, self).generic_shuffle(data=data,
                                                      re_data=re_data,
                                                      field=None)
    return [{"TABLE_NAME_": self.p_client.table_name, "DATA_": re_data}]
def generic_shuffle(self, data, field="CONTENT_"):
    """
    Clean one crawled news article.

    Normalises the publish date to ``YYYY-MM-DD``, copies / defaults the
    article fields, strips links from the HTML, and enriches the record by
    calling the model services (summary, sentiment, censor, hotness, text
    location, credit-card relevance) before handing it to the parent
    class's ``generic_shuffle``.

    Side effects on ``data``: PUBLISH_TIME_ is rewritten, CONTENT_HTML_ is
    added, HTML_ is deleted and IMAGE_ is replaced by its base64 content.

    :param data: raw record; reads PUBLISH_TIME_, CONTENT_, HTML_, TITLE_,
        URL_, ENTITY_NAME_, ENTITY_CODE_, _id and several optional fields
    :param field: accepted for interface compatibility; the body always
        works on "CONTENT_"
    :return: ``None`` when the record has no PUBLISH_TIME_, otherwise a
        single-element list ``[{"TABLE_NAME_": ..., "DATA_": ...}]``
    """

    def _log_model_error(model_name, error):
        """Log a failed model request (call from inside ``except``)."""
        self.logger.exception(
            f"2.2--err: 请求模型 {model_name} 错误."
            f" 原始数据 collection = {self.m_client.mongo_collection};"
            f" ENTITY_CODE_ = {self.entity_code};"
            f" 原始数据 _id = {data['_id']};"
            f" error: {error}.")

    re_data = dict()
    if "PUBLISH_TIME_" not in data:
        return None

    # ---- Publish date -------------------------------------------------
    # Extract the date itself rather than assuming the field starts with it
    # and ends right after it: a leading label, or a one-digit day followed
    # by a time ("2019-1-5 10:00"), would otherwise leak into the result.
    dashed = re.search(r"\d{4}-\d{1,2}-\d{1,2}", data["PUBLISH_TIME_"])
    chinese = re.search(r"\d{4}年\d{1,2}月\d{1,2}日", data["PUBLISH_TIME_"])
    if dashed:
        data["PUBLISH_TIME_"] = dashed.group()
    elif chinese:
        data["PUBLISH_TIME_"] = (chinese.group().replace("年", "-")
                                 .replace("月", "-").replace("日", ""))
    elif (("年" in data["PUBLISH_TIME_"]) and ("月" in data["PUBLISH_TIME_"])
          and ("二" in data["PUBLISH_TIME_"])):
        # Date written with Chinese numerals: map it character by character.
        data["PUBLISH_TIME_"] = "".join(
            self.number_dict[ch] for ch in data["PUBLISH_TIME_"][:10])
    else:
        # Last resort: look for a date between "|" separators in the text.
        find_time = re.findall(r"\|(\w{4}[-年]\w{1,2}[-月]\w{1,2})日?\W?\|",
                               data["CONTENT_"])
        if find_time:
            if "二" in find_time[0]:
                data["PUBLISH_TIME_"] = "".join(
                    self.number_dict[ch] for ch in find_time[0])
            else:
                data["PUBLISH_TIME_"] = (find_time[0].replace("年", "-")
                                         .replace("月", "-")
                                         .replace("日", ""))
        else:
            data["PUBLISH_TIME_"] = ""

    if data["PUBLISH_TIME_"]:
        # Zero-pad month / day and cut every part to its expected width.
        parts = data["PUBLISH_TIME_"].split("-")
        parts[0] = parts[0][:4]
        for idx in (1, 2):
            if len(parts[idx]) == 1:
                parts[idx] = "0" + parts[idx]
            elif len(parts[idx]) > 2:
                parts[idx] = parts[idx][:2]
        data["PUBLISH_TIME_"] = "-".join(parts)
    re_data["PERIOD_CODE_"] = data["PUBLISH_TIME_"].replace("-", "")

    # ---- Plain fields -------------------------------------------------
    # Tags are reset to empty when the raw record carries the field.
    if "TAGS_" in data:
        re_data["TAGS_"] = ""
    # Source: scheme + host of the crawled URL.
    source = re.findall(r"(https?://.*?)/", data["URL_"])
    re_data["SOURCE_"] = source[0]
    # Source site name: text before the first "-" of the entity name.
    re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]
    # Source code: entity code up to its last "_".
    s_index = data["ENTITY_CODE_"].rfind("_")
    re_data["SOURCE_CODE_"] = data["ENTITY_CODE_"][:s_index]
    # Source category: fixed slice of the entity code.
    re_data["SOURCE_TYPE_"] = data["ENTITY_CODE_"][3:7]
    re_data["PUBLISH_TIME_"] = data["PUBLISH_TIME_"]
    re_data["TITLE_"] = data["TITLE_"]
    # Author: keep only the editor's name when the field has the
    # "编辑:<name>" form; otherwise keep the raw value (previously an
    # unmatched form raised IndexError).
    if "AUTHOR_" in data:
        author = data["AUTHOR_"]
        if "编辑" in author:
            editors = re.findall(r"编辑[::](\w+)", author)
            if editors:
                author = editors[0]
        re_data["AUTHOR_"] = author
    re_data["IMPORTANCE_"] = "N"
    # Counters default to 0; note the raw read counter is READ_.
    re_data["READS_"] = data["READ_"] if "READ_" in data else 0
    re_data["LIKES_"] = data["LIKES_"] if "LIKES_" in data else 0
    if "COMMENTS_" in data:
        re_data["COMMENTS_"] = data["COMMENTS_"]
    elif "COMMENT_" in data:
        re_data["COMMENTS_"] = data["COMMENT_"]
    else:
        re_data["COMMENTS_"] = 0
    if "JOINS_" in data:
        re_data["JOINS_"] = data["JOINS_"]
    elif "JOIN_" in data:
        re_data["JOINS_"] = data["JOIN_"]
    else:
        re_data["JOINS_"] = 0

    # ---- Content ------------------------------------------------------
    # Remove "var ...;|" fragments from the text.
    re_data["CONTENT_"] = re.sub(r"(var.*?;\|)(?![a-zA-Z])", "",
                                 data["CONTENT_"])
    # HTML: keep a copy on data and neutralise every link target.
    re_data['CONTENT_HTML_'] = data["HTML_"]
    data["CONTENT_HTML_"] = data["HTML_"]
    re_data["CONTENT_HTML_"] = re.sub(r"href=\".*?\"",
                                      "href=\"javaScript:void(0);\"",
                                      re_data["CONTENT_HTML_"])
    # Pages carrying these markers get one div and the first script removed.
    if ('28857' in re_data['CONTENT_HTML_']
            or '您的IP' in re_data['CONTENT_HTML_']):
        try:
            soup = BeautifulSoup(re_data['CONTENT_HTML_'])
            soup.find('div', attrs={'class': 'online-desc-con'}).decompose()
            soup.find_all('script')[0].decompose()
            re_data['CONTENT_HTML_'] = soup.prettify()
        except Exception as e:
            self.logger.exception('IP检测内容清除出错')
    # TODO (from the original author): deleting data["HTML_"] here is wrong.
    del data["HTML_"]
    re_data["CONTENT_"] = re_data["CONTENT_"].replace("|", "")
    re_data["TITLE_"] = re_data["TITLE_"].replace("|", "")
    # Marketing-activity flag
    re_data["ACT_"] = "N"
    # Version
    re_data["VERSION_"] = "0"

    # Image: replaced in data by its base64 content. Best effort, but a
    # failure is now logged instead of being silently dropped.
    if "IMAGE_" in data:
        try:
            response = req_for_something(url=data["IMAGE_"])
            if response:
                encoded = base64.b64encode(response.content)
                data["IMAGE_"] = encoded.decode("utf-8")
                response.close()
        except Exception as e:
            self.logger.info(f"获取图片失败, ERROR: {e}")

    # ---- Model services -----------------------------------------------
    # Summary
    try:
        brief = req_for_ts(re_data["CONTENT_"][0:1000])
    except Exception as e:
        _log_model_error("req_for_ts", e)
    else:
        if brief:
            re_data["BRIEF_"] = brief["summary"]
        else:
            re_data["BRIEF_"] = '暂无摘要'

    # Sentiment of the title
    try:
        sentiment = req_for_senti(re_data["TITLE_"])
    except Exception as e:
        _log_model_error("req_for_senti", e)
    else:
        if sentiment:
            if sentiment["sentiment"] == "中性":
                re_data["EMOTION_"] = "NORMAL"
            elif sentiment["sentiment"] == "正面":
                re_data["EMOTION_"] = "POSITIVE"
            elif sentiment["sentiment"] == "敏感":
                # NOTE(review): stored value is spelled "NAGETIVE"; left
                # unchanged because consumers may depend on it.
                re_data["EMOTION_"] = "NAGETIVE"

    # Sensitive-content check
    try:
        censor = req_for_censor(re_data["CONTENT_"])
    except Exception as e:
        _log_model_error("req_for_censor", e)
    else:
        if censor:
            if censor["censor"] == "N":
                re_data["SENSITIVE_"] = "N"
            else:
                re_data["SENSITIVE_"] = "Y"
                re_data["SENSITIVE_WORD_"] = censor["words"]

    # Hotness
    try:
        hot = req_for_news_hot(title=re_data["TITLE_"],
                               content=re_data["CONTENT_"][0:1000])
    except Exception as e:
        _log_model_error("req_for_news_hot", e)
    else:
        if hot:
            re_data["HOT_"] = hot["level"]

    # Location mentioned in the text -> coordinates -> region fields
    try:
        res = req_for_textLoc(text=re_data["CONTENT_"])
    except Exception as e:
        _log_model_error("req_for_textLoc", e)
    else:
        if "error" not in res:
            if not (res["tagsId"] == "None" or res["tagsId"] is None):
                re_data["TAGS_"] = res["tagsId"]
            if res["flag"] == 1:
                address = res["full"]
            else:
                address = res["addr"]
            try:
                lat_result = get_lat_lng(address=address)
                re_data["LAT_"] = lat_result["result"]["location"]["lat"]
                re_data["LNG_"] = lat_result["result"]["location"]["lng"]
            except KeyError:
                re_data["LAT_"] = None
                re_data["LNG_"] = None
            except Exception as e:
                self.logger.info(f"获取经纬度失败, ERROR: {e}")
                re_data["LAT_"] = None
                re_data["LNG_"] = None
            if re_data["LAT_"]:
                try:
                    area_result = get_area(",".join(
                        [str(re_data["LAT_"]), str(re_data["LNG_"])]))
                except Exception as e:
                    self.logger.info(f"获取地址失败, ERROR: {e}")
                else:
                    try:
                        re_data["AREA_NAME_"] = area_result["result"][
                            "addressComponent"]["district"]
                    except KeyError:
                        re_data["AREA_NAME_"] = ""
                    try:
                        re_data["AREA_CODE_"] = area_result["result"][
                            "addressComponent"]["adcode"]
                    except KeyError:
                        re_data["AREA_CODE_"] = ""
                    else:
                        # City / province codes come from the adcode.
                        re_data["CITY_CODE_"] = (
                            re_data["AREA_CODE_"][:4] + "00")
                        re_data["PROVINCE_CODE_"] = (
                            re_data["AREA_CODE_"][:2] + "00")
                        for city in self.city_list:
                            if city["CODE_"] == re_data["CITY_CODE_"]:
                                re_data["CITY_NAME_"] = city["NAME_"]
                                break
                        for prov in self.province_list:
                            if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                                re_data["PROVINCE_NAME_"] = prov["NAME_"]
                                break

    # Credit-card relevance
    try:
        res = req_for_credit_relative(text=re_data["CONTENT_"])
    except Exception as e:
        _log_model_error("req_for_credit_relative", e)
    else:
        if res["creditrelative"]:
            re_data["MODULE_TYPE_"] = "CREDITCARD"

    # ---- Bank name / code, parent cleaning, publish flags --------------
    if "BANK_NAME_" in data:
        re_data["BANK_NAME_"] = data["BANK_NAME_"]
    if "BANK_CODE_" in data:
        re_data["BANK_CODE_"] = data["BANK_CODE_"]
    re_data = super(BranchNews, self).generic_shuffle(data=data,
                                                      re_data=re_data,
                                                      field="CONTENT_")
    re_data['DATA_STATUS_'] = 'CHECK'
    # Published only when a publish date could be determined.
    if not re_data.get("PUBLISH_TIME_"):
        re_data["PUBLISH_STATUS_"] = "N"
    else:
        re_data["PUBLISH_STATUS_"] = "Y"
    return [{"TABLE_NAME_": self.p_client.table_name, "DATA_": re_data}]
def data_shuffle(data, province_list, city_list, area_list):
    """Clean one raw Ping An Bank (PAB) self-service bank record.

    The province and city are resolved from ``data["CITY_NAME_"]``, the
    address is completed with their full names, and the completed address is
    geocoded with ``get_lat_lng`` / ``get_area`` to fill in coordinates and
    the district, city and province fields.

    :param data: raw record; reads ``ADDR_``, ``CITY_NAME_``, ``NAME_``,
        ``ENTITY_NAME_``, ``ENTITY_CODE_``, ``DATETIME_``, ``URL_`` and the
        optional ``SOURCE_TYPE_NAME_``.
    :param province_list: dicts with ``CODE_`` and ``NAME_``.
    :param city_list: dicts with ``CODE_``, ``NAME_`` and ``PARENT_``.
        The list is no longer modified by this function.
    :param area_list: not used by this function.
    :return: the cleaned record as a dict.
    :raises KeyError: if a required key is missing from ``data``.
    """
    re_data = dict()

    # Entries literally named "县" must be ignored: stripped of its last
    # character the name is the empty string, which is contained in every
    # CITY_NAME_ and would match any record.  The original removed them from
    # the list while iterating over it, which skips the element following
    # each removal (so adjacent entries survived) and mutated the caller's
    # list; a filtered copy avoids both problems.
    city_list = [city for city in city_list if city["NAME_"] != "县"]

    prov_n = ""
    city_n = ""
    addr_ = data["ADDR_"]
    raw_city = data["CITY_NAME_"]

    # Province / city information.  For the four municipalities the province
    # and the city share the same name.
    for municipality in ("北京", "天津", "上海", "重庆"):
        if municipality in raw_city:
            prov_n = city_n = municipality + "市"
            break
    else:
        prov_c = ""
        for city in city_list:
            if city["NAME_"][:-1] in raw_city:
                city_n = city["NAME_"]
                prov_c = city["PARENT_"]
                break
        if prov_c:
            for prov in province_list:
                if prov["CODE_"] == prov_c:
                    prov_n = prov["NAME_"]
                    break

    # Address cleaning: make the address start with the full province name.
    # An abbreviated name found at the head of the address is expanded,
    # otherwise the full name is prepended.
    if prov_n not in addr_:
        head, tail = addr_[:len(prov_n)], addr_[len(prov_n):]
        for abbr in (prov_n[:-1], prov_n[:4], prov_n[:3], prov_n[:2]):
            if abbr in head:
                addr_ = head.replace(abbr, prov_n) + tail
                break
        else:
            addr_ = prov_n + addr_

    # Same for the city name, which is expected right after the province.
    window = len(prov_n) + len(city_n)
    head, tail = addr_[:window], addr_[window:]
    if city_n not in head:
        for abbr in (city_n[:-1], city_n[:4], city_n[:3], city_n[:2]):
            if abbr in head:
                addr_ = head.replace(abbr, city_n) + tail
                break
        else:
            addr_ = addr_[:len(prov_n)] + city_n + addr_[len(prov_n):]

    re_data["BANK_CODE_"] = "PAB"
    re_data["BANK_NAME_"] = data["ENTITY_NAME_"][:-4]
    re_data["SPIDER_TIME_"] = data["DATETIME_"]
    re_data["ADDR_"] = addr_
    re_data["NAME_"] = data["NAME_"]

    # Geocode the address, then reverse-geocode the coordinates to obtain the
    # district; city and province codes are derived from the district code.
    result = get_lat_lng(address=re_data["ADDR_"])
    try:
        re_data["LAT_"] = str(result["result"]["location"]["lat"])
        re_data["LNG_"] = str(result["result"]["location"]["lng"])
    except KeyError:
        re_data["LAT_"] = ""
        re_data["LNG_"] = ""
    else:
        dis_result = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
        try:
            re_data["AREA_NAME_"] = dis_result["result"]["addressComponent"][
                "district"]
        except KeyError:
            re_data["AREA_NAME_"] = ""
        try:
            re_data["AREA_CODE_"] = dis_result["result"]["addressComponent"][
                "adcode"]
        except KeyError:
            re_data["AREA_CODE_"] = ""
        else:
            re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
            re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            for city in city_list:
                if city["CODE_"] == re_data["CITY_CODE_"]:
                    re_data["CITY_NAME_"] = city["NAME_"]
                    break
            for prov in province_list:
                if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = prov["NAME_"]
                    break

    re_data["UNIT_CODE_"] = "PAB" + "_" + re_data.get("CITY_CODE_", "")
    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["URL_"] = data["URL_"]
    if "SOURCE_TYPE_NAME_" in data:
        re_data["SOURCE_TYPE_NAME_"] = data["SOURCE_TYPE_NAME_"]
    re_data["TYPE_NAME_"] = "自助银行"
    re_data["TYPE_"] = "ZZ"
    return re_data
def data_shuffle(data, province_list, city_list, area_list):
    """Clean one raw branch record of a bank listed in the name/code mapping.

    Province and city are matched from ``data["PROVINCE_NAME_"]`` and
    ``data["CITY_NAME_"]``, the address is completed with their full names
    and then geocoded with ``get_lat_lng`` / ``get_area``.

    A record whose province or city cannot be matched is still processed:
    the unmatched name is treated as an empty string during address
    completion, so geocoding can still fill the fields in.  (The original
    raised ``KeyError`` in that case.)

    :param data: raw record; reads ``BANK_NAME_``, ``PROVINCE_NAME_``,
        ``CITY_NAME_``, ``ADDR_``, ``NAME_``, ``DATETIME_``,
        ``ENTITY_CODE_``, ``ENTITY_NAME_``, ``URL_`` and the optional
        ``TEL_`` and ``SOURCE_TYPE_NAME_``.
    :param province_list: dicts with ``CODE_`` and ``NAME_``.
    :param city_list: dicts with ``CODE_`` and ``NAME_``.
    :param area_list: not used by this function.
    :return: the cleaned record, or ``None`` when ``BANK_NAME_`` is not one
        of the banks in the mapping.
    :raises KeyError: if a required key is missing from ``data``.
    """
    bank_dict = {'邮储银行': 'PSBC', '光大银行': 'CEB', '农商银行': 'RCB'}
    bank_name = data.get('BANK_NAME_')
    if bank_name not in bank_dict:
        return None

    re_data = dict()

    # Province: the first two characters of the official name must occur in
    # the raw province name.
    for prov in province_list:
        if prov["NAME_"][:2] in data["PROVINCE_NAME_"]:
            re_data["PROVINCE_NAME_"] = prov["NAME_"]
            re_data["PROVINCE_CODE_"] = prov["CODE_"]
            break

    # City: restricted to cities whose code shares the province prefix.
    prov_prefix = re_data.get("PROVINCE_CODE_", "")[:2]
    for city in city_list:
        if (city["CODE_"][:2] == prov_prefix
                and city["NAME_"][:2] in data["CITY_NAME_"]):
            re_data["CITY_NAME_"] = city["NAME_"]
            re_data["CITY_CODE_"] = city["CODE_"]
            break

    # The original also scanned area_list here, but its result was never
    # used (the district comes from reverse geocoding below), so the scan
    # has been removed.

    # Address cleaning: make the address start with the full province name.
    prov_n = re_data.get("PROVINCE_NAME_", "")
    city_n = re_data.get("CITY_NAME_", "")
    addr_ = data["ADDR_"]
    if prov_n not in addr_:
        head, tail = addr_[:len(prov_n)], addr_[len(prov_n):]
        for abbr in (prov_n[:-1], prov_n[:4], prov_n[:3], prov_n[:2]):
            if abbr in head:
                addr_ = head.replace(abbr, prov_n) + tail
                break
        else:
            addr_ = prov_n + addr_

    # Same for the city name, which is expected right after the province.
    window = len(prov_n) + len(city_n)
    head, tail = addr_[:window], addr_[window:]
    if city_n not in head:
        for abbr in (city_n[:-1], city_n[:4], city_n[:3], city_n[:2]):
            if abbr in head:
                addr_ = head.replace(abbr, city_n) + tail
                break
        else:
            addr_ = addr_[:len(prov_n)] + city_n + addr_[len(prov_n):]

    re_data["BANK_CODE_"] = bank_dict[bank_name]
    re_data["BANK_NAME_"] = bank_name
    re_data["SPIDER_TIME_"] = data["DATETIME_"]
    re_data["ADDR_"] = addr_

    # Geocode the address, then reverse-geocode the coordinates to obtain the
    # district; city and province codes are derived from the district code.
    result = get_lat_lng(address=re_data["ADDR_"])
    try:
        re_data["LAT_"] = str(result["result"]["location"]["lat"])
        re_data["LNG_"] = str(result["result"]["location"]["lng"])
    except KeyError:
        re_data["LAT_"] = ""
        re_data["LNG_"] = ""
    else:
        dis_result = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
        try:
            re_data["AREA_NAME_"] = dis_result["result"]["addressComponent"][
                "district"]
        except KeyError:
            re_data["AREA_NAME_"] = ""
        try:
            re_data["AREA_CODE_"] = dis_result["result"]["addressComponent"][
                "adcode"]
        except KeyError:
            re_data["AREA_CODE_"] = ""
        else:
            re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
            re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            for city in city_list:
                if city["CODE_"] == re_data["CITY_CODE_"]:
                    re_data["CITY_NAME_"] = city["NAME_"]
                    break
            for prov in province_list:
                if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = prov["NAME_"]
                    break

    # When the raw province and city are the same, the city takes the
    # province's name.
    if (data["PROVINCE_NAME_"] == data["CITY_NAME_"]
            and "PROVINCE_NAME_" in re_data):
        re_data["CITY_NAME_"] = re_data["PROVINCE_NAME_"]

    re_data["UNIT_CODE_"] = re_data["BANK_CODE_"] + "_" + re_data.get(
        "CITY_CODE_", "")
    re_data["NAME_"] = data["NAME_"]
    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["URL_"] = data["URL_"]
    if "TEL_" in data:
        re_data["TEL_"] = data["TEL_"]
    if "SOURCE_TYPE_NAME_" in data:
        re_data["SOURCE_TYPE_NAME_"] = data["SOURCE_TYPE_NAME_"]
    re_data["TYPE_NAME_"] = "支行"
    re_data["TYPE_"] = "ZH"
    return re_data
def data_shuffle(data, province_list, city_list, area_list):
    """Clean one raw Bank of China (BOC) self-service bank record.

    The address is built by concatenating the raw province, city and address
    fields and is then geocoded with ``get_lat_lng`` / ``get_area`` to fill
    in coordinates and the district, city and province fields.

    :param data: raw record; reads ``PROVINCE_NAME_``, ``CITY_NAME_``,
        ``ADDR_``, ``NAME_``, ``ENTITY_NAME_``, ``ENTITY_CODE_``,
        ``DATETIME_``, ``URL_`` and the optional ``TEL_``,
        ``BUSINESS_HOURS_`` and ``SOURCE_TYPE_NAME_``.
    :param province_list: dicts with ``CODE_`` and ``NAME_``.
    :param city_list: dicts with ``CODE_`` and ``NAME_``.  The list is no
        longer modified by this function.
    :param area_list: not used by this function.
    :return: the cleaned record as a dict.
    :raises KeyError: if a required key is missing from ``data``.
    """
    # Entries literally named "县" are ignored when resolving the city name.
    # The original removed them from the list while iterating over it, which
    # skips the element following each removal (so adjacent entries
    # survived) and mutated the caller's list; a filtered copy avoids both.
    city_list = [city for city in city_list if city["NAME_"] != "县"]

    re_data = dict()
    re_data["BANK_CODE_"] = "BOC"
    re_data["BANK_NAME_"] = data["ENTITY_NAME_"][:-7]
    re_data["ADDR_"] = (
        data["PROVINCE_NAME_"] + data["CITY_NAME_"] + data['ADDR_'])
    # The stored name is the full address followed by the raw name.
    re_data["NAME_"] = re_data["ADDR_"] + data["NAME_"]

    # Geocode the address, then reverse-geocode the coordinates to obtain the
    # district; city and province codes are derived from the district code.
    result = get_lat_lng(address=re_data["ADDR_"])
    try:
        re_data["LAT_"] = str(result["result"]["location"]["lat"])
        re_data["LNG_"] = str(result["result"]["location"]["lng"])
    except KeyError:
        re_data["LAT_"] = ""
        re_data["LNG_"] = ""
    else:
        dis_result = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
        try:
            re_data["AREA_NAME_"] = dis_result["result"]["addressComponent"][
                "district"]
        except KeyError:
            re_data["AREA_NAME_"] = ""
        try:
            re_data["AREA_CODE_"] = dis_result["result"]["addressComponent"][
                "adcode"]
        except KeyError:
            re_data["AREA_CODE_"] = ""
        else:
            re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
            re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            for city in city_list:
                if city["CODE_"] == re_data["CITY_CODE_"]:
                    re_data["CITY_NAME_"] = city["NAME_"]
                    break
            for prov in province_list:
                if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = prov["NAME_"]
                    break

    re_data["UNIT_CODE_"] = "BOC" + "_" + re_data.get("CITY_CODE_", "")
    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["SPIDER_TIME_"] = data["DATETIME_"]
    re_data["URL_"] = data["URL_"]
    for optional_key in ("TEL_", "BUSINESS_HOURS_", "SOURCE_TYPE_NAME_"):
        if optional_key in data:
            re_data[optional_key] = data[optional_key]
    re_data["TYPE_NAME_"] = "自助银行"
    re_data["TYPE_"] = "ZZ"
    return re_data
def generic_shuffle(self, data):
    """Turn one raw housing listing into rows for the data and base tables.

    When ``self.if_exists`` already knows the house, a single row for
    ``self.data_table_name`` is returned.  Otherwise a new base id is
    requested and two rows are returned: the listing row and a new row for
    ``self.base_table_name`` (province and city are hard-coded to Fujian /
    Xiamen).

    :param data: raw record; reads ``URL_``, ``DATETIME_``, ``ENTITY_CODE_``,
        ``ENTITY_NAME_``, ``NAME_``, ``PRICE_``, ``ADDR_`` and the optional
        ``DEALTIME_``, ``TITLE_``, ``PUBLISH_TIME_`` and ``YEAR_``; ``_id``
        is read only when logging a model failure.
    :return: list of ``{"TABLE_NAME_": ..., "DATA_": ...}`` dicts.
    """

    def fill_listing_fields(row):
        """Copy title, publish time, price and use type of the listing into *row*."""
        if "TITLE_" in data:
            row["TITLE_"] = data["TITLE_"].replace("|", "")
        if "PUBLISH_TIME_" in data:
            row["PUBLISH_TIME_"] = data["PUBLISH_TIME_"]
        else:
            row["PUBLISH_TIME_"] = data["DATETIME_"][:10]
        amounts = re.findall(r"[\d.]+", data["PRICE_"])
        row["PRICE_"] = amounts[0] if amounts else 0
        row["USE_TYPE_"] = "RENT" if "租赁" in data["ENTITY_NAME_"] else "SALE"

    # Fields shared by every row produced from this record.
    re_data = {
        "ID_": req_for_serial_number(code="WD_JZ_FJ_DATA"),
        "URL_": data["URL_"],
        # Time dimension: the crawl date as YYYYMMDD.
        "PERIOD_CODE_": data["DATETIME_"][:10].replace("-", ""),
        "ENTITY_CODE_": data["ENTITY_CODE_"],
        "ENTITY_NAME_": data["ENTITY_NAME_"],
        # Creation time and operator.
        "CREATE_TIME_": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
        "CREATE_BY_ID_": CREATE_ID,
        "CREATE_BY_NAME_": CREATE_NAME,
    }

    # Crawl time.
    if "DATETIME_" in data:
        re_data["SPIDER_TIME_"] = data["DATETIME_"]
    elif "DEALTIME_" in data:
        re_data["SPIDER_TIME_"] = arrow.get(
            data["DEALTIME_"]).format("YYYY-MM-DD")

    # Status defaults.
    re_data.setdefault("DATA_STATUS_", "UNCHECK")
    re_data.setdefault("PUBLISH_STATUS_", "N")

    re_data["NAME_"] = data["NAME_"].replace("|", "")

    # Type: residential (ZZ), office building (XZL), shop (SP).
    entity_code = data["ENTITY_CODE_"]
    if "LISP" in entity_code:
        re_data["TYPE_"] = "SP"
    elif "LIXQ" in entity_code or "LJXQ" in entity_code:
        re_data["TYPE_"] = "ZZ"
    elif "LIXZL" in entity_code:
        re_data["TYPE_"] = "XZL"

    # Check whether the house is already present in the base table.
    house_id = self.if_exists(name=value_replace(re_data["NAME_"]),
                              city_name="厦门市")
    if house_id:
        # Known house: only a data-table row is produced.
        re_data["P_ID_"] = house_id
        fill_listing_fields(re_data)
        return [{"TABLE_NAME_": self.data_table_name, "DATA_": re_data}]

    # Unknown house: allocate a base id and build both rows.
    base_id = req_for_serial_number(code="WD_JZ_FJ_BASE")

    data_dict = dict(re_data)
    data_dict["P_ID_"] = base_id
    fill_listing_fields(data_dict)

    basic_dict = dict(re_data)
    basic_dict.update({
        "ID_": base_id,
        "URL_": data["URL_"],
        "PROVINCE_CODE_": "3500",
        "PROVINCE_NAME_": "福建省",
        "CITY_CODE_": "350200",
        "CITY_NAME_": "厦门市",
        "SALE_PRICE_": 0,
        "RENT_PRICE_": 0,
    })
    if "YEAR_" in data:
        digits = re.findall(r"\d+", data["YEAR_"])
        if digits:
            basic_dict["YEAR_"] = digits[0]

    # Address analysis.
    try:
        region = basic_dict["PROVINCE_NAME_"]
        if basic_dict["PROVINCE_NAME_"] != basic_dict["CITY_NAME_"]:
            region += basic_dict["CITY_NAME_"]
        basic_dict["ADDR_"] = region + basic_dict["NAME_"]
        res = req_for_textLoc(text=basic_dict["ADDR_"])
    except Exception as e:
        self.logger.exception(
            f"2.2--err: 请求模型 req_for_textLoc 错误."
            f" 原始数据 collection = {self.m_client.mongo_collection};"
            f" ENTITY_CODE_ = {self.entity_code};"
            f" 原始数据 _id = {data['_id']};"
            f" error: {e}.")
    else:
        if "error" not in res:
            if res["tagsId"] != "None" and res["tagsId"] is not None:
                basic_dict["TAGS_"] = res["tagsId"]
            basic_dict["ADDR_"] = (
                res["full"] if res["flag"] == 1 else data["ADDR_"])

            # Geocode the address.
            try:
                lat_result = get_lat_lng(address=basic_dict["ADDR_"])
                location = lat_result["result"]["location"]
                basic_dict["LAT_"] = location["lat"]
                basic_dict["LNG_"] = location["lng"]
            except KeyError:
                basic_dict["LAT_"] = None
                basic_dict["LNG_"] = None
            except Exception as e:
                self.logger.info(f"获取经纬度失败, ERROR: {e}")
                basic_dict["LAT_"] = None
                basic_dict["LNG_"] = None

            # Reverse-geocode to obtain the district and formatted address.
            if basic_dict["LAT_"]:
                coords = ",".join(
                    [str(basic_dict["LAT_"]), str(basic_dict["LNG_"])])
                try:
                    area_result = get_area(coords)
                except Exception as e:
                    self.logger.info(f"获取地址失败, ERROR: {e}")
                else:
                    try:
                        basic_dict["AREA_NAME_"] = area_result["result"][
                            "addressComponent"]["district"]
                        basic_dict["AREA_CODE_"] = area_result["result"][
                            "addressComponent"]["adcode"]
                    except KeyError:
                        pass
                    try:
                        basic_dict["ADDR_"] = area_result["result"][
                            "formatted_address"]
                    except KeyError:
                        pass

    basic_dict["M_STATUS_"] = "N"
    basic_dict["DELETE_STATUS_"] = "N"

    # NOTE(review): the source URL and source name are written to re_data
    # after both rows were copied from it, so they reach neither returned
    # row -- confirm whether they were meant to go into basic_dict.
    re_data["SOURCE_"] = re.findall(r"(https?://.*?)/", data["URL_"])[0]
    re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]

    basic_dict["SOURCE_TYPE_"] = "链家"
    # NOTE(review): this overwrites whatever address the analysis above
    # produced with the raw address -- confirm that this is intended.
    basic_dict["ADDR_"] = data["ADDR_"]

    return [
        {"TABLE_NAME_": self.data_table_name, "DATA_": data_dict},
        {"TABLE_NAME_": self.base_table_name, "DATA_": basic_dict},
    ]
def generic_shuffle(self, data):
    """Clean one raw bus-station record.

    The station is geocoded from "<text after the last '-' of ENTITY_NAME_>
    + NAME_ + 公交车站".  If that succeeds, nearby bus stations are paged
    through with ``get_periphery`` to replace the coordinates with those of
    a station whose name contains ``NAME_``, and the coordinates are
    reverse-geocoded to fill in province, city and district.  The generic
    rules of the parent class are applied last.

    :param data: raw record; reads ``DATETIME_``, ``URL_``,
        ``ENTITY_NAME_``, ``ENTITY_CODE_``, ``NAME_``, ``DESCRIBE_``,
        ``AROUND_STATIONS_`` and ``AROUND_ROUTE_``, and checks for ``TAGS_``.
    :return: a one-element list
        ``[{"TABLE_NAME_": self.p_client.table_name, "DATA_": re_data}]``.
    """
    re_data = {"ID_": req_for_serial_number(code="WD_JT_GJ")}
    # Time dimension: the crawl date as YYYYMMDD.
    re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace("-", "")
    # Tags: reset to empty when the raw record carries any.
    if "TAGS_" in data:
        re_data["TAGS_"] = ""
    # Source URL (scheme + host), source name and source category.
    re_data["SOURCE_"] = re.findall(r"(https?://.*?)/", data["URL_"])[0]
    re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]
    re_data["SOURCE_TYPE_"] = data["ENTITY_CODE_"][3:8]

    # Rough coordinates from the station name.
    query = data["ENTITY_NAME_"].rpartition("-")[2] + data["NAME_"] + "公交车站"
    try:
        location = get_lat_lng(address=query)["result"]["location"]
        re_data["LAT_"] = location["lat"]
        re_data["LNG_"] = location["lng"]
    except KeyError:
        re_data["LAT_"] = None
        re_data["LNG_"] = None
    except Exception as e:
        re_data["LAT_"] = None
        re_data["LNG_"] = None
        self.logger.info("获取经纬度失败错误信息为{}".format(e))

    if re_data.get("LAT_"):
        # Look for the exact station among nearby bus stations, page by
        # page; the search stops at the first hit or at the first page that
        # does not hold exactly 20 results.
        try:
            origin = ",".join([str(re_data["LAT_"]), str(re_data["LNG_"])])
            page = 0
            while True:
                nearby_page = get_periphery(classify="公交车站",
                                            tag="交通设施",
                                            lat_lng=origin,
                                            radius=3000,
                                            page_num=page)
                hit = None
                for candidate in nearby_page["results"]:
                    if data["NAME_"] in candidate["name"]:
                        hit = candidate
                        break
                if hit is not None:
                    exact_lat = str(hit["location"]["lat"])
                    exact_lng = str(hit["location"]["lng"])
                    re_data["LAT_"] = exact_lat
                    re_data["LNG_"] = exact_lng
                    break
                page += 1
                if len(nearby_page["results"]) != 20:
                    break
        except Exception as e:
            self.logger.info(f"获取精确经纬度失败, ERROR: {e}")

        # Complete the address from whichever coordinates are now stored:
        # the exact ones when a station was found, the rough ones otherwise.
        try:
            area_result = get_area(
                ",".join([str(re_data["LAT_"]), str(re_data["LNG_"])]))
        except Exception as e:
            self.logger.info(f"获取地址失败, ERROR: {e}")
        else:
            try:
                re_data["PROVINCE_NAME_"] = area_result["result"][
                    "addressComponent"]["province"]
                re_data["CITY_NAME_"] = area_result["result"][
                    "addressComponent"]["city"]
                re_data["AREA_NAME_"] = area_result["result"][
                    "addressComponent"]["district"]
                re_data["AREA_CODE_"] = area_result["result"][
                    "addressComponent"]["adcode"]
                re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
                re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            except KeyError:
                pass

    # Station description.
    re_data["DESCRIBE_"] = data["DESCRIBE_"]
    # Neighbouring stations.
    re_data["AROUND_STATIONS_"] = self.handle_special_text(
        data["AROUND_STATIONS_"]).replace("|", ",")
    # Routes passing through the station.
    routes = self.handle_special_text(data["AROUND_ROUTE_"]).replace("|", ",")
    if routes:
        routes = routes.replace("公交线路", "")
    re_data["AROUND_ROUTE_"] = routes
    # Station name.
    re_data["NAME_"] = data["NAME_"]

    re_data = super(Branchjtgj, self).generic_shuffle(data=data,
                                                      re_data=re_data,
                                                      field=None)
    return [{"TABLE_NAME_": self.p_client.table_name, "DATA_": re_data}]
def data_shuffle(data, province_list, city_list, area_list):
    """Clean one raw Ping An Bank (PAB) branch record.

    The city is matched from ``data["CITY_NAME_"]`` and the province is the
    city's parent.  A district is guessed from the address text, the address
    is completed with the full province and city names, and the result is
    geocoded with ``get_lat_lng`` / ``get_area``; a successful geocoding
    overrides the guessed district.

    Unmatched cities or provinces are treated as empty strings instead of
    raising, and a missing ``CITY_NAME_`` is treated as empty.

    :param data: raw record; reads ``CITY_NAME_``, ``ADDR_``, ``NAME_``,
        ``DATETIME_``, ``ENTITY_CODE_``, ``ENTITY_NAME_``, ``URL_`` and the
        optional ``TEL_`` and ``SOURCE_TYPE_NAME_``.
    :param province_list: dicts with ``CODE_`` and ``NAME_``.
    :param city_list: dicts with ``CODE_``, ``NAME_`` and ``PARENT_``.
    :param area_list: dicts with ``CODE_`` and ``NAME_``.
    :return: the cleaned record as a dict.
    :raises KeyError: if a required key is missing from ``data``.
    """
    import re

    re_data = dict()

    ##############
    # Province / city / district and coordinates first
    ##############
    branch_name = data.get('CITY_NAME_') or ''

    # City: the first two characters of the official name must occur in the
    # raw city name.
    parent = None
    for city in city_list:
        if city["NAME_"][:2] in branch_name:
            re_data["CITY_NAME_"] = city["NAME_"]
            re_data["CITY_CODE_"] = city["CODE_"]
            parent = city['PARENT_']
            break

    # Province: the entry whose code equals the matched city's parent code.
    # A plain scan replaces the jsonpath/index lookup, which relied on a
    # bare ``except`` to cope with an unbound ``parent`` and would pick the
    # wrong entry if any province lacked ``CODE_``.
    re_data["PROVINCE_NAME_"] = ''
    re_data["PROVINCE_CODE_"] = ''
    if parent is not None:
        for province in province_list:
            if province.get("CODE_") == parent:
                re_data["PROVINCE_NAME_"] = province.get("NAME_", '')
                re_data["PROVINCE_CODE_"] = province["CODE_"]
                break

    # District guess: the text between the first "市" and the last of
    # "区" / "镇" / "县" in the raw address.
    raw_addr = data.get('ADDR_')
    found = (re.search(r'市(.*[区镇县])', raw_addr)
             if isinstance(raw_addr, str) else None)
    area_name = found.group(1) if found else ''

    area_n = ''
    area_c = ''
    if area_name:
        prov_prefix = re_data["PROVINCE_CODE_"][:2]
        # No early exit: as before, the last matching area wins.
        for area in area_list:
            if area["CODE_"][:2] != prov_prefix:
                continue
            if (area["NAME_"] == area_name
                    or area["NAME_"][:-1] == area_name[:-1]):
                area_n = area["NAME_"]
                area_c = area["CODE_"]

    # Address cleaning: make the address start with the full province name.
    prov_n = re_data["PROVINCE_NAME_"]
    city_n = re_data.get("CITY_NAME_", "")
    addr_ = data["ADDR_"]
    if prov_n not in addr_:
        head, tail = addr_[:len(prov_n)], addr_[len(prov_n):]
        for abbr in (prov_n[:-1], prov_n[:4], prov_n[:3], prov_n[:2]):
            if abbr in head:
                addr_ = head.replace(abbr, prov_n) + tail
                break
        else:
            addr_ = prov_n + addr_

    # Complete the city name, which is expected right after the province.
    window = len(prov_n) + len(city_n)
    head, tail = addr_[:window], addr_[window:]
    if city_n not in head:
        for abbr in (city_n[:-1], city_n[:4], city_n[:3], city_n[:2]):
            if abbr in head:
                addr_ = head.replace(abbr, city_n) + tail
                break
        else:
            addr_ = addr_[:len(prov_n)] + city_n + addr_[len(prov_n):]

    re_data["BANK_CODE_"] = "PAB"
    re_data["BANK_NAME_"] = "平安银行"
    re_data["SPIDER_TIME_"] = data["DATETIME_"]
    re_data["AREA_CODE_"] = area_c
    re_data["AREA_NAME_"] = area_n
    re_data["ADDR_"] = addr_

    # Geocode the address, then reverse-geocode the coordinates to obtain the
    # district; city and province codes are derived from the district code.
    result = get_lat_lng(address=re_data["ADDR_"])
    try:
        re_data["LAT_"] = str(result["result"]["location"]["lat"])
        re_data["LNG_"] = str(result["result"]["location"]["lng"])
    except KeyError:
        re_data["LAT_"] = ""
        re_data["LNG_"] = ""
    else:
        dis_result = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
        try:
            re_data["AREA_NAME_"] = dis_result["result"]["addressComponent"][
                "district"]
        except KeyError:
            re_data["AREA_NAME_"] = ""
        try:
            re_data["AREA_CODE_"] = dis_result["result"]["addressComponent"][
                "adcode"]
        except KeyError:
            re_data["AREA_CODE_"] = ""
        else:
            re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
            re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            for city in city_list:
                if city["CODE_"] == re_data["CITY_CODE_"]:
                    re_data["CITY_NAME_"] = city["NAME_"]
                    break
            for prov in province_list:
                if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = prov["NAME_"]
                    break

    ##############
    # Remaining fields
    ##############
    # Bank abbreviation joined with CITY_CODE_.
    re_data["UNIT_CODE_"] = "PAB" + "_" + re_data.get("CITY_CODE_", "")
    re_data["NAME_"] = data["NAME_"]
    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["URL_"] = data["URL_"]
    if "TEL_" in data:
        re_data["TEL_"] = data["TEL_"]
    if "SOURCE_TYPE_NAME_" in data:
        re_data["SOURCE_TYPE_NAME_"] = data["SOURCE_TYPE_NAME_"]
    re_data["TYPE_NAME_"] = "支行"
    re_data["TYPE_"] = "ZH"
    return re_data
def data_shuffle(data, province_list, city_list, area_list):
    """Clean one raw BHB branch record.

    The raw ``ADDR_`` field holds both address and phone number; they are
    split apart and written back to ``data["ADDR_"]`` and ``data["TEL_"]``
    (the input dict is modified, as before).  The parsed address is geocoded
    with ``get_lat_lng`` / ``get_area`` to fill in coordinates and the
    district, city and province fields.

    An unmatched province or city no longer raises; the corresponding keys
    are simply left for geocoding to fill in.

    :param data: raw record; reads ``PROVINCE_NAME_``, ``CITY_NAME_``,
        ``ADDR_``, ``NAME_``, ``DATETIME_``, ``ENTITY_CODE_``,
        ``ENTITY_NAME_``, ``URL_`` and the optional ``SOURCE_TYPE_NAME_``.
    :param province_list: dicts with ``CODE_`` and ``NAME_``.
    :param city_list: dicts with ``CODE_`` and ``NAME_``.
    :param area_list: not used by this function.
    :return: the cleaned record as a dict.
    :raises KeyError: if a required key is missing from ``data``.
    """
    import re

    re_data = dict()

    # Province: the first two characters of the official name must occur in
    # the raw province name.
    for prov in province_list:
        if prov["NAME_"][:2] in data["PROVINCE_NAME_"]:
            re_data["PROVINCE_NAME_"] = prov["NAME_"]
            re_data["PROVINCE_CODE_"] = prov["CODE_"]
            break

    # City: restricted to cities whose code shares the province prefix.
    prov_prefix = re_data.get("PROVINCE_CODE_", "")[:2]
    for city in city_list:
        if (city["CODE_"][:2] == prov_prefix
                and city["NAME_"][:2] in data["CITY_NAME_"]):
            re_data["CITY_NAME_"] = city["NAME_"]
            re_data["CITY_CODE_"] = city["CODE_"]
            break

    # Split the combined field, e.g.
    # '地址:石家庄市裕华区谈固东街150号,电话:0311-85081812'
    raw = data["ADDR_"]
    addr_found = re.findall(r'地址:([\w\S]+),', raw)
    data["ADDR_"] = addr_found[0] if addr_found else ''
    tel_found = re.findall(r'电话:([\w\d\-]*)', raw)
    data["TEL_"] = tel_found[0] if tel_found else ''

    # NOTE: the original went on to build a province/city-completed address
    # here but never used it (ADDR_ below has always been the parsed raw
    # address), and that dead computation crashed with TypeError whenever
    # the province or city was unmatched.  It has been removed; the stored
    # address is unchanged.

    re_data["BANK_CODE_"] = "BHB"
    re_data["BANK_NAME_"] = data["ENTITY_NAME_"][:-3]
    re_data["SPIDER_TIME_"] = data["DATETIME_"]
    re_data["ADDR_"] = data["ADDR_"]
    re_data["NAME_"] = data["NAME_"]

    # Geocode the address, then reverse-geocode the coordinates to obtain the
    # district; city and province codes are derived from the district code.
    result = get_lat_lng(address=re_data["ADDR_"])
    try:
        re_data["LAT_"] = str(result["result"]["location"]["lat"])
        re_data["LNG_"] = str(result["result"]["location"]["lng"])
    except KeyError:
        re_data["LAT_"] = ""
        re_data["LNG_"] = ""
    else:
        dis_result = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
        try:
            re_data["AREA_NAME_"] = dis_result["result"]["addressComponent"][
                "district"]
        except KeyError:
            re_data["AREA_NAME_"] = ""
        try:
            re_data["AREA_CODE_"] = dis_result["result"]["addressComponent"][
                "adcode"]
        except KeyError:
            re_data["AREA_CODE_"] = ""
        else:
            re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
            re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            for city in city_list:
                if city["CODE_"] == re_data["CITY_CODE_"]:
                    re_data["CITY_NAME_"] = city["NAME_"]
                    break
            for prov in province_list:
                if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                    re_data["PROVINCE_NAME_"] = prov["NAME_"]
                    break

    re_data["UNIT_CODE_"] = "BHB" + "_" + re_data.get("CITY_CODE_", "")
    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["URL_"] = data["URL_"]
    # TEL_ is always present at this point because it was written above.
    re_data["TEL_"] = data["TEL_"]
    re_data["BUSINESS_HOURS_"] = "0:00-24:00"
    if "SOURCE_TYPE_NAME_" in data:
        re_data["SOURCE_TYPE_NAME_"] = data["SOURCE_TYPE_NAME_"]
    re_data["TYPE_NAME_"] = "支行"
    re_data["TYPE_"] = "ZH"
    return re_data