Esempio n. 1
0
    def __shuffle(self, data):
        """Stamp a WD_TY record with its id, audit fields and source info.

        The record is modified in place and also returned.

        :param data: record dict; ``URL_``, ``ENTITY_NAME_``,
            ``PROVINCE_NAME_`` and ``CITY_NAME_`` are always read, and
            ``PROVINCE_CODE_`` / ``CITY_CODE_`` are read when the two names
            are equal.
        :return: the same ``data`` dict.
        """
        data["ID_"] = req_for_serial_number(code="WD_TY")

        # Creation time, operator and the default status flags.
        data.update({
            "CREATE_TIME_": time.strftime("%Y-%m-%d %H:%M:%S",
                                          time.localtime()),
            "CREATE_BY_ID_": CREATE_ID,
            "CREATE_BY_NAME_": CREATE_NAME,
            "M_STATUS_": "N",
            "DELETE_STATUS_": "N",
            "DATA_STATUS_": "UNCHECK",
            "PUBLISH_STATUS_": "N",
            "HOT_": "0",
        })

        # SOURCE_ is the scheme + host part of URL_; it is only written when
        # the URL has a "/" after the host for the pattern to anchor on.
        origin = re.search(r"(https?://.*?)/", data["URL_"])
        if origin is not None:
            data["SOURCE_"] = origin.group(1)
        data["SOURCE_NAME_"] = data["ENTITY_NAME_"]

        # When province and city carry the same name and the same code, the
        # city code is rebuilt from its first three characters plus "100".
        names_match = data["PROVINCE_NAME_"] == data["CITY_NAME_"]
        if names_match and data["PROVINCE_CODE_"] == data["CITY_CODE_"]:
            data["CITY_CODE_"] = data["CITY_CODE_"][:3] + "100"

        return data
Esempio n. 2
0
    def generic_shuffle(self, data, re_data, field="CONTENT_"):
        """Download the record's attachments and record their stored names.

        Attachment slots 1-9 are examined. A slot is processed when the key
        ``FJ<n>_NAME_`` exists and ``FJ<n>_URL_`` is truthy. The file is
        fetched, written under ``src_dir`` and its stored name is saved as
        ``FILE_NAME_<n>_`` in the returned dict.

        :param data: raw record dict (not modified).
        :param re_data: ignored; the result always starts as a deep copy of
            ``data`` (kept for interface compatibility).
        :param field: unused here (kept for interface compatibility).
        :return: deep copy of ``data`` plus the ``FILE_NAME_<n>_`` entries.
        """
        re_data = deepcopy(data)

        # Extensions removed from the attachment name before the detected
        # type is appended. The longer ".xlsx"/".docx" come before
        # ".xls"/".doc" so that e.g. "a.docx" becomes "a" and not "ax".
        known_suffixes = ('.xlsx', '.xls', '.docx', '.doc', '.zip', '.pdf',
                          '.PDF')
        # Constant subtracted from the numeric part of the serial number.
        number = 3932

        # File storage
        for idx in range(1, 10):
            url = data.get(f'FJ{idx}_URL_')
            if f"FJ{idx}_NAME_" not in data or not url:
                continue

            # Prefer the type detected from the URL, else from the name.
            file_type = find_type(url) or find_type(
                data.get(f"FJ{idx}_NAME_"))
            if not file_type:
                # Unknown type: stop here, remaining slots are not processed.
                return re_data

            try:
                response = req_for_something(url=url)
            except Exception:
                self.logger.exception('文件获取出错')
                continue
            if not response:
                continue

            try:
                # TODO: decide whether an upload error should abort or skip.
                serial_number = req_for_serial_number(code="GOV_ZX_GDS")

                stem = data.get(f"FJ{idx}_NAME_")
                for suffix in known_suffixes:
                    stem = stem.replace(suffix, '')

                # One name is used for both the stored file and the record,
                # so the two can no longer drift apart.
                stored_name = (str(int(serial_number[5:13]) - number) + '-' +
                               stem + file_type)
                re_data[f'FILE_NAME_{idx}_'] = stored_name

                with open(src_dir + stored_name, 'wb+') as fp:
                    fp.write(response.content)
                print('保存文件成功', '  ', re_data[f'FILE_NAME_{idx}_'])
            except Exception as e:
                self.logger.exception(
                    f"2.1--err: PDF"
                    f" 原始数据 collection = {self.m_client.mongo_collection};"
                    f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                    f"error: {e}.")
            finally:
                response.close()

        return re_data
Esempio n. 3
0
    def __shuffle(self, data):
        """Enrich a JRCP_LCCP_INFO record with model outputs, then clean it.

        Adds ``ID_`` and ``conten_type``, asks the classification service for
        ``type`` and, when the record has content, asks the summary and
        location services for ``summary`` and ``location``. The record is
        finally passed to the parent ``generic_shuffle``.

        :param data: record dict; modified in place.
        :return: whatever the parent ``generic_shuffle`` returns.
        """
        data["ID_"] = req_for_serial_number(code="JRCP_LCCP_INFO")
        content = ''
        # Prefer the type detected from the attachment URL, else its name.
        data['conten_type'] = (find_type(data.get('FJ1_URL_'))
                               or find_type(data.get('FJ1_NAME_')))

        # Text classification model
        try:
            response = requests.post('http://172.22.69.39:8099/ZHclassify', data={'title': data.get('TITLE_')}).json()
        except Exception as e:
            # NOTE(review): "type" stays unset when this request fails,
            # unlike "summary"/"location" below - confirm this is intended.
            self.logger.exception(f"err: 请求模型 http://172.22.69.39:8099/ZHclassify 错误. {e}")
        else:
            data["type"] = response["type"] if response else '发展规划_其他'

        if data.get('CONTENT_'):
            # Short content is supplemented with the transformed attachment.
            if len(data.get('CONTENT_')) < 500:
                data['accessory'] = str(transform_data(data.get('FJ1_URL_'), data)) if data.get('FJ1_URL_') else ''

            content = data.get('CONTENT_').replace('|', '')
            # The attachment text is appended only when there is an
            # attachment URL and "accessory" actually holds a string.
            accessory = data.get('accessory') if data.get('FJ1_URL_') else None
            if isinstance(accessory, str):
                content += accessory

            if content:
                # (endpoint, key written to data, key read from the response)
                # for the summary model and the place-name model.
                model_calls = (
                    ('http://172.22.69.39:8101/ZHsummary', 'summary', 'summary'),
                    ('http://172.22.69.39:8100/ZHlocation', 'location', 'address'),
                )
                for endpoint, target_key, response_key in model_calls:
                    try:
                        response = requests.post(endpoint, data={'text': content[:500]}).json()
                    except Exception as e:
                        # Log the endpoint that actually failed.
                        self.logger.exception(f"err: 请求模型 {endpoint} 错误. {e}")
                        data[target_key] = ''
                    else:
                        data[target_key] = response.get(response_key) if response else ''

        re_data = super(BranchOrganize, self).generic_shuffle(data=data, re_data=data, field="ENTITY_NAME_")
        return re_data
Esempio n. 4
0
    def __shuffle(self, data):
        """Assign an id and normalise the bank name/code of a record.

        Every entry of ``self.bank_list`` whose ``ALIAS_`` contains the
        record's ``BANK_NAME_`` contributes its ``NAME_`` and ``CODE_``; the
        collected values are joined with "|". Without any match the record's
        bank fields are left untouched.

        :param data: record dict; modified in place.
        :return: the same ``data`` dict.
        """
        data["ID_"] = req_for_serial_number(code="CRM_MARKET_ACT")

        wanted = data.get('BANK_NAME_')
        matched = [bank for bank in self.bank_list if wanted in bank['ALIAS_']]
        names = []
        codes = []
        for bank in matched:
            names.append(bank["NAME_"])
            codes.append(bank["CODE_"])

        if names:
            data["BANK_NAME_"] = "|".join(names)
        if codes:
            data["BANK_CODE_"] = "|".join(codes)

        return data
Esempio n. 5
0
    def generic_shuffle(self, data, field="CONTENT_"):
        """Clean one CRM_NEWS record.

        Works on a deep copy of ``data``: assigns ``ID_``, extracts the
        editor name from ``NEWS_AUTHOR_``, strips script residue and "|"
        from ``NEWS_DESC_TEXT_`` and, when the NER service reports an
        organisation containing "银行", fills ``BANK_NAME_``/``BANK_CODE_``.

        :param data: raw record dict; must contain ``NEWS_DESC_TEXT_``.
        :param field: unused here (kept for interface compatibility).
        :return: a one-element list ``[{"TABLE_NAME_": 'CRM_NEWS',
            "DATA_": cleaned_record}]``.
        """
        re_data = copy.deepcopy(data)
        re_data["ID_"] = req_for_serial_number(code="CRM_NEWS")

        # Author: keep only the editor name. When the marker is present but
        # the pattern does not match, the original author text is kept
        # instead of failing with an IndexError.
        # NOTE(review): the character class repeats ":"; one of the two was
        # possibly meant to be a full-width colon - confirm against the data.
        if "NEWS_AUTHOR_" in data and "编辑" in data["NEWS_AUTHOR_"]:
            editors = re.findall(r"编辑[::](\w+)", data["NEWS_AUTHOR_"])
            if editors:
                re_data["NEWS_AUTHOR_"] = editors[0]

        # Content
        re_data["NEWS_DESC_TEXT_"] = re.sub(r"(var.*?;\|)(?![a-zA-Z])", "", data["NEWS_DESC_TEXT_"]).replace("|", "")

        # Model call -- entity recognition
        try:
            res = req_for_ner(text=re_data["NEWS_DESC_TEXT_"])
        except Exception as e:
            # The message names the helper that is actually called here.
            self.logger.exception(f"2.2--err: 请求模型 req_for_ner 错误."
                                  f" 原始数据 collection = {self.m_client.mongo_collection};"
                                  f" ENTITY_CODE_ = {self.entity_code};"
                                  f" 原始数据 _id = {data['_id']};"
                                  f" error: {e}.")
        else:
            if res.get("Organ"):
                bank_name = res.get("Organ").get("entity")
                if bank_name and '银行' in bank_name:
                    re_data["BANK_NAME_"] = bank_name
                    # Replace the raw entity by the canonical name(s) and
                    # code(s) of every bank whose alias contains it.
                    matched = [bank for bank in self.bank_list
                               if bank_name in bank['ALIAS_']]
                    bank_list = [bank["NAME_"] for bank in matched]
                    bank_code_list = [bank["CODE_"] for bank in matched]
                    if bank_list:
                        re_data["BANK_NAME_"] = "|".join(bank_list)
                    if bank_code_list:
                        re_data["BANK_CODE_"] = "|".join(bank_code_list)

        return [{"TABLE_NAME_": 'CRM_NEWS', "DATA_": re_data}]
Esempio n. 6
0
    def __shuffle(self, data):
        """Stamp a CRM_JJK record and inline its image as base64.

        Adds id, audit fields, ``PERIOD_CODE_`` and source info. When the
        record has an image URL in ``IMG`` the image is downloaded and stored
        base64-encoded in ``IMG_``; without a URL ``IMG_`` is set to "".
        The helper keys ``IMG`` and ``DATETIME_`` are removed at the end.

        :param data: record dict; must contain ``DATETIME_``, ``URL_`` and
            ``ENTITY_NAME_``; modified in place.
        :return: the same ``data`` dict.
        """
        data["ID_"] = req_for_serial_number(code="CRM_JJK")

        # Creation time and operator
        data["CREATE_TIME_"] = time.strftime("%Y-%m-%d %H:%M:%S",
                                             time.localtime())
        data["CREATE_BY_ID_"] = CREATE_ID
        data["CREATE_BY_NAME_"] = CREATE_NAME
        data["M_STATUS_"] = "N"
        data["DELETE_STATUS_"] = "N"
        data["DATA_STATUS_"] = "UNCHECK"
        data["PUBLISH_STATUS_"] = "N"
        data["HOT_"] = "0"
        data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace("-", "")
        source = re.findall(r"(https?://.*?)/", data["URL_"])
        if source:
            data["SOURCE_"] = source[0]
        data["SOURCE_NAME_"] = data["ENTITY_NAME_"]

        # Image handling
        image_url = data.get("IMG")
        if image_url:
            try:
                response = req_for_something(url=image_url)
            except Exception as e:
                self.logger.exception(f"2.1--err: IMG"
                                      f" 原始数据 collection = {self.m_client.mongo_collection};"
                                      f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                                      f"error: {e}.")
            else:
                # Always release the response, even if encoding fails.
                try:
                    if response:
                        encoded = base64.b64encode(response.content)
                        data["IMG_"] = encoded.decode("utf-8")
                finally:
                    if response is not None:
                        response.close()
        else:
            data["IMG_"] = ""

        # "IMG" may legitimately be absent (handled above), so remove it
        # without raising a KeyError.
        data.pop("IMG", None)
        del data["DATETIME_"]
        return data
Esempio n. 7
0
    def generic_shuffle(self, data):
        """Build the cleaned WEIBO_BASIC_INFO record from a raw record.

        :param data: raw record dict; must contain ``DEALTIME_``,
            ``ENTITY_NAME_``, ``ENTITY_CODE_``, ``MAIN_URL_``,
            ``WEIBO_CODE_``, ``FOCUS_``, ``FANS_``, ``COMPANY_``,
            ``VIRIFIED_`` and ``BIREF_``. ``COMPANY_`` may be rewritten in
            place.
        :return: a one-element list ``[{"TABLE_NAME_": ..., "DATA_": ...}]``
            holding the result of the parent ``generic_shuffle``.
        """
        re_data = dict()
        period_time = time.strftime("%Y%m%d",
                                    time.localtime(int(data["DEALTIME_"])))

        re_data["ID_"] = req_for_serial_number(code="WEIBO_BASIC_INFO")

        # BANK_NAME_: first entry of name_dict whose first two characters
        # occur in the entity name.
        for key, value in self.name_dict.items():
            if key[:2] in data["ENTITY_NAME_"]:
                re_data["BANK_NAME_"] = key
                re_data["BANK_CODE_"] = value
                break

        # Short names / aliases mapped to the full bank name.
        full_names = {
            "建信": "中国建设银行",
            "建行": "中国建设银行",
            "建设银行": "中国建设银行",
            "农行": "中国农业银行",
            "农业银行": "中国农业银行",
            "工行": "中国工商银行",
            "工商银行": "中国工商银行",
            "民生银行": "中国民生银行",
            "光大银行": "中国光大银行",
            "交行": "交通银行",
            "招行": "招商银行",
            "中行": "中国银行",
            "中银": "中国银行",
            "邮储银行": "中国邮政储蓄银行",
            "邮政储蓄银行": "中国邮政储蓄银行",
            "南海农商银行": "广东南海农村商业银行股份有限公司",
            "顺德农村商业银行": "广东顺德农村商业银行股份有限公司",
        }
        if "BANK_NAME_" in re_data:
            re_data["BANK_NAME_"] = full_names.get(re_data["BANK_NAME_"],
                                                   re_data["BANK_NAME_"])

        re_data["PERIOD_CODE_"] = period_time
        # Data source URL: only set when MAIN_URL_ has a "/" after the host,
        # as in the sibling shuffle methods, instead of raising IndexError.
        source = re.findall(r"(https?://.*?)/", data["MAIN_URL_"])
        if source:
            re_data["SOURCE_"] = source[0]
        # Data source site name
        re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]

        re_data["SOURCE_TYPE_"] = ""
        re_data["HOT_"] = "0"
        re_data["WEIBO_CODE_"] = data["WEIBO_CODE_"]
        re_data["WEIBO_NAME_"] = data["ENTITY_NAME_"]
        re_data["FOCUS_"] = data["FOCUS_"]
        re_data["FANS_"] = data["FANS_"]
        # A COMPANY_ that starts like "d-d-d" (digits separated by dashes) is
        # treated as bad data and replaced by a name derived from the entity.
        if re.match(r"\d+-\d+-\d+", data["COMPANY_"]):
            data["COMPANY_"] = data["ENTITY_NAME_"] + "股份有限公司"
        re_data["COMPANY_"] = data["COMPANY_"]
        re_data["VIRIFIED_"] = data["VIRIFIED_"]
        re_data["BRIEF_"] = data["BIREF_"]
        re_data["VERSION_"] = "0"
        # Verified flag: every account handled here is marked as verified.
        re_data["VERIFIED_"] = "Y"
        re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
        re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        if re_data["ENTITY_NAME_"] == "华夏银行":
            re_data["ENTITY_NAME_"] = "华夏银行微博"
        re_data["URL_"] = data["MAIN_URL_"]
        re_data = super(WeiboBasicInfoScript,
                        self).generic_shuffle(data=data,
                                              re_data=re_data,
                                              field="ENTITY_NAME_")
        return [{"TABLE_NAME_": self.p_client.table_name, "DATA_": re_data}]
Esempio n. 8
0
    def generic_shuffle(self, data):
        """Clean one MAPBAR record into facility (and optional road) rows.

        The address is geocoded and reverse-geocoded to fill coordinates and
        area/city/province fields, with a fallback that matches city names
        against ``TYPE_``. Records whose ``TYPE_`` contains "道路" also
        produce a CHA_BRANCH_MAIN_ROUTE row.

        :param data: raw record dict; must contain ``NAME_``, ``ADDRESS_``,
            ``URL_``, ``TYPE_``, ``BTYPE_``, ``PHONE_``, ``BUS_``,
            ``BUSSTOP_`` and ``_id``.
        :return: list of ``{"TABLE_NAME_": ..., "DATA_": ...}`` dicts; the
            facility row first, then the road row when there is one.
        :raises Exception: when ``BTYPE_`` is not a key of ``type1_dict``.
        """
        re_data = dict()
        re_data["ID_"] = req_for_serial_number(code="MAPBAR")
        re_data["NAME_"] = data["NAME_"]
        re_data["ADDRESS_"] = data["ADDRESS_"].replace("|", "").replace(
            "地址:", "")
        re_data["HOT_"] = 0
        # Data source URL: only set when URL_ has a "/" after the host, as in
        # the sibling shuffle methods, instead of raising IndexError.
        source = re.findall(r"(https?://.*?)/", data["URL_"])
        if source:
            re_data["SOURCE_"] = source[0]
        # Data source site name
        re_data["SOURCE_NAME_"] = "图吧"
        re_data["SOURCE_TYPE_"] = "图吧"

        # Longitude / latitude. Both default to "" so that a failing lookup
        # cannot leave "LAT_" undefined for the check further down.
        re_data["LNG_"] = ""
        re_data["LAT_"] = ""
        try:
            if re_data["ADDRESS_"]:
                location_result = get_lat_lng(address=re_data["ADDRESS_"])
                if location_result["status"] == 0:
                    location = location_result["result"]["location"]
                    lng = str(location["lng"])
                    lat = str(location["lat"])
                    re_data["LNG_"] = lng
                    re_data["LAT_"] = lat
                else:
                    self.logger.warning(f"_id: {data['_id']} 获取经纬度失败")
        except Exception as e:
            self.logger.exception(f"_id: {data['_id']} 获取经纬度失败, error: {e}")

        if re_data["LAT_"]:
            try:
                area_result = get_area(",".join(
                    [str(re_data["LAT_"]),
                     str(re_data["LNG_"])]))
            except Exception as e:
                self.logger.exception(f"_id: {data['_id']} 获取地址失败, error: {e}")
            else:
                try:
                    re_data["AREA_NAME_"] = area_result["result"][
                        "addressComponent"]["district"]
                except KeyError:
                    re_data["AREA_NAME_"] = ""
                try:
                    re_data["AREA_CODE_"] = area_result["result"][
                        "addressComponent"]["adcode"]
                except KeyError:
                    re_data["AREA_CODE_"] = ""
                else:
                    # City / province codes are prefixes of the area code.
                    re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
                    re_data[
                        "PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
                    for city in self.city_list:
                        if city["CODE_"] == re_data["CITY_CODE_"]:
                            re_data["CITY_NAME_"] = city["NAME_"]
                            break
                    for prov in self.province_list:
                        if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                            re_data["PROVINCE_NAME_"] = prov["NAME_"]
                            break

        # Fallback: take the first city whose two-character name prefix
        # occurs in TYPE_, then the province sharing its code prefix.
        if not re_data.get("CITY_NAME_", ""):
            for city in self.city_list:
                if city["NAME_"][:2] in data["TYPE_"]:
                    re_data["CITY_CODE_"] = city["CODE_"]
                    re_data["CITY_NAME_"] = city["NAME_"]
                    break
            if re_data.get("CITY_NAME_", ""):
                for prov in self.province_list:
                    if prov["CODE_"][:2] == re_data["CITY_CODE_"][:2]:
                        re_data["PROVINCE_CODE_"] = prov["CODE_"]
                        re_data["PROVINCE_NAME_"] = prov["NAME_"]
                        break

        # CHA_BRANCH_MAIN_ROUTE (main roads): a copy of the fields gathered
        # so far, with its own id and ADDRESS_ renamed to ADDR_.
        is_road = "道路" in data["TYPE_"]
        road_shuffle_data = None
        if is_road:
            road_data = dict(re_data)
            road_data["ID_"] = req_for_serial_number(code="WD_GD")
            road_data["ADDR_"] = road_data.pop("ADDRESS_")
            road_shuffle_data = super(MapbarScript,
                                      self).generic_shuffle(data=data,
                                                            re_data=road_data,
                                                            field=None)

        # CHA_BRANCH_FACILITY
        re_data["TYPE1_"] = data["BTYPE_"]
        try:
            re_data["TYPE1_CODE_"] = self.type1_dict[re_data["TYPE1_"]]
        except KeyError:
            raise Exception("暂不需要清洗的数据")

        # Sub-category cleaning: some source sub-categories are merged.
        merged_subtypes = {
            "户外运动俱乐部": ("俱乐部", "JLB"),
            "赛马场及马术俱乐部": ("俱乐部", "JLB"),
            "室内运动健身俱乐部": ("俱乐部", "JLB"),
            "连锁店": ("便利店", "BLD"),
            "便利店": ("便利店", "BLD"),
            "电子商城": ("家电数码", "JDSM"),
            "电器商城": ("家电数码", "JDSM"),
            "诊所/卫生所": ("门诊/卫生所", "MZWSS"),
            "门诊/急诊部": ("门诊/卫生所", "MZWSS"),
        }
        source_subtype = data["TYPE_"][2:]
        if source_subtype in merged_subtypes:
            re_data["TYPE2_"], re_data["TYPE2_CODE_"] = merged_subtypes[
                source_subtype]
        else:
            re_data["TYPE2_"] = source_subtype
            re_data["TYPE2_CODE_"] = self.type2_dict.get(re_data["TYPE2_"])
        re_data["SOURCE_TYPE1_"] = data["BTYPE_"]
        re_data["SOURCE_TYPE1_CODE_"] = self.type1_dict.get(
            re_data["SOURCE_TYPE1_"])
        re_data["SOURCE_TYPE2_"] = data["TYPE_"][2:]
        re_data["SOURCE_TYPE2_CODE_"] = self.source_type2_dict.get(
            re_data["SOURCE_TYPE2_"])
        re_data["PHONE_"] = data["PHONE_"].replace("无,", "")
        re_data["BUS_"] = data["BUS_"]
        re_data["BUSSTOP_"] = data["BUSSTOP_"]

        shuffle_data = super(MapbarScript,
                             self).generic_shuffle(data=data,
                                                   re_data=re_data,
                                                   field=None)

        return_list = [{
            "TABLE_NAME_": TABLE_NAME("CHA_BRANCH_FACILITY"),
            "DATA_": shuffle_data
        }]
        if is_road:
            return_list.append({
                "TABLE_NAME_": TABLE_NAME("CHA_BRANCH_MAIN_ROUTE"),
                "DATA_": road_shuffle_data
            })
        return return_list
Esempio n. 9
0
    def generic_shuffle(self, data):
        """Clean one bus-stop (WD_JT_GJ) record.

        Put the cleaning rules here; do not inherit if the generic rules are
        not needed.

        The stop is geocoded from "<city><name>公交车站", the coordinates are
        refined with a nearby-stop search, and the final coordinates are
        reverse-geocoded into province/city/area fields.

        :param data: raw record dict; must contain ``DATETIME_``, ``URL_``,
            ``ENTITY_NAME_``, ``ENTITY_CODE_``, ``NAME_``, ``DESCRIBE_``,
            ``AROUND_STATIONS_`` and ``AROUND_ROUTE_``.
        :return: a one-element list ``[{"TABLE_NAME_": ..., "DATA_": ...}]``
            holding the result of the parent ``generic_shuffle``.
        """
        re_data = dict()
        re_data["ID_"] = req_for_serial_number(code="WD_JT_GJ")

        # Time dimension
        re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace("-", "")
        # Tags
        if "TAGS_" in data:
            re_data["TAGS_"] = ""
        # SOURCE: only set when URL_ has a "/" after the host, as in the
        # sibling shuffle methods, instead of raising IndexError.
        source = re.findall(r"(https?://.*?)/", data["URL_"])
        if source:
            re_data["SOURCE_"] = source[0]
        # Data source name
        re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]
        # Source category
        re_data["SOURCE_TYPE_"] = data["ENTITY_CODE_"][3:8]

        # Text after the last "-" of the entity name + stop name + suffix.
        entity_name = data["ENTITY_NAME_"]
        temp_location = (entity_name[entity_name.rfind("-") + 1:] +
                         data["NAME_"] + "公交车站")
        try:
            lat_result = get_lat_lng(address=temp_location)
            re_data["LAT_"] = lat_result["result"]["location"]["lat"]
            re_data["LNG_"] = lat_result["result"]["location"]["lng"]
        except KeyError:
            re_data["LAT_"] = None
            re_data["LNG_"] = None
        except Exception as e:
            re_data["LAT_"] = None
            re_data["LNG_"] = None
            self.logger.info("获取经纬度失败错误信息为{}".format(e))

        if re_data.get("LAT_"):
            # Search the stops around the rough coordinates for one whose
            # name contains this stop's name and adopt its coordinates.
            lat_handle = ""
            try:
                lat_origin = ",".join(
                    [str(re_data["LAT_"]),
                     str(re_data["LNG_"])])
                page = 0
                found = False
                while not found:
                    s3 = get_periphery(classify="公交车站",
                                       tag="交通设施",
                                       lat_lng=lat_origin,
                                       radius=3000,
                                       page_num=page)
                    for nearby in s3["results"]:
                        if data["NAME_"] in nearby["name"]:
                            found = True
                            lat = str(nearby["location"]["lat"])
                            lng = str(nearby["location"]["lng"])
                            re_data["LAT_"] = lat
                            re_data["LNG_"] = lng
                            lat_handle = lat + "," + lng
                            break
                    if found:
                        break
                    page += 1
                    # A page with other than 20 results ends the paging.
                    if len(s3["results"]) != 20:
                        break
            except Exception as e:
                self.logger.info(f"获取精确经纬度失败, ERROR: {e}")

            # Reverse-geocode the refined coordinates when available, else
            # the rough ones (one shared code path for both cases).
            area_query = lat_handle or ",".join(
                [str(re_data["LAT_"]),
                 str(re_data["LNG_"])])
            try:
                area_result = get_area(area_query)
            except Exception as e:
                self.logger.info(f"获取地址失败, ERROR: {e}")
            else:
                try:
                    re_data["PROVINCE_NAME_"] = area_result["result"][
                        "addressComponent"]["province"]
                    re_data["CITY_NAME_"] = area_result["result"][
                        "addressComponent"]["city"]
                    re_data["AREA_NAME_"] = area_result["result"][
                        "addressComponent"]["district"]
                    re_data["AREA_CODE_"] = area_result["result"][
                        "addressComponent"]["adcode"]
                    re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
                    re_data[
                        "PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
                except KeyError:
                    # Fields filled before the missing key are kept.
                    pass

        # Stop description
        re_data["DESCRIBE_"] = data["DESCRIBE_"]
        # Nearby stops
        re_data["AROUND_STATIONS_"] = self.handle_special_text(
            data["AROUND_STATIONS_"]).replace("|", ",")
        # Routes passing through
        re_data["AROUND_ROUTE_"] = self.handle_special_text(
            data["AROUND_ROUTE_"]).replace("|", ",")
        if re_data["AROUND_ROUTE_"]:
            re_data["AROUND_ROUTE_"] = re_data["AROUND_ROUTE_"].replace(
                "公交线路", "")
        # Stop name
        re_data["NAME_"] = data["NAME_"]
        re_data = super(Branchjtgj, self).generic_shuffle(data=data,
                                                          re_data=re_data,
                                                          field=None)

        return [{"TABLE_NAME_": self.p_client.table_name, "DATA_": re_data}]
Esempio n. 10
0
    def generic_shuffle(self, data):
        """Expand one crawled page into one record per shopping district.

        Put the cleaning rules here; do not inherit if the generic rules are
        not needed.

        ``CONTENT_HTML_`` is parsed for ``<dl class="list">`` elements; the
        ``<dt>`` link text is the area name and each ``<li>`` link text a
        shopping-district name. Every district is geocoded and
        reverse-geocoded, then passed to the parent ``generic_shuffle``.

        :param data: raw record dict; must contain ``CONTENT_HTML_``,
            ``DATETIME_``, ``URL_``, ``ENTITY_NAME_``, ``ENTITY_CODE_`` and
            ``CITY_``.
        :return: list of ``{"TABLE_NAME_": ..., "DATA_": ...}`` dicts, one
            per shopping district.
        """
        re_data_list = list()

        # Area name -> list of shopping-district names.
        soup = BeautifulSoup(data["CONTENT_HTML_"], "html.parser")
        dt_dict = dict()
        for item in soup.find_all('dl', {"class": "list"}):
            dt_dict[item.dt.a.string] = [
                li.a.string for li in item.find_all('li')
            ]

        for area_name, shopping_list in dt_dict.items():
            # Geocode every district and fill province/city/area data.
            for shopping_name in shopping_list:
                re_data = dict()

                # Time dimension
                re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace(
                    "-", "")
                # Tags
                if "TAGS_" in data:
                    re_data["TAGS_"] = ""
                # SOURCE: only set when URL_ has a "/" after the host, as in
                # the sibling shuffle methods, instead of raising IndexError.
                source = re.findall(r"(https?://.*?)/", data["URL_"])
                if source:
                    re_data["SOURCE_"] = source[0]
                # Data source name
                re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]
                # Source category
                re_data["SOURCE_TYPE_"] = data["ENTITY_CODE_"][3:8]
                # ID
                re_data["ID_"] = req_for_serial_number(code="WD_SS_SQ")

                try:
                    lat_result = get_lat_lng(address=data["CITY_"] + "市" +
                                             area_name + shopping_name)
                    re_data["LAT_"] = lat_result["result"]["location"]["lat"]
                    re_data["LNG_"] = lat_result["result"]["location"]["lng"]
                except KeyError:
                    re_data["LAT_"] = None
                    re_data["LNG_"] = None
                except Exception as e:
                    re_data["LAT_"] = None
                    re_data["LNG_"] = None
                    self.logger.info("获取经纬度失败错误信息为{}".format(e))

                if re_data["LAT_"]:
                    try:
                        area_result = get_area(",".join(
                            [str(re_data["LAT_"]),
                             str(re_data["LNG_"])]))
                    except Exception as e:
                        self.logger.info(f"获取地址失败, ERROR: {e}")
                    else:
                        try:
                            re_data["ADDR_"] = area_result["result"][
                                "formatted_address"]
                            re_data["PROVINCE_NAME_"] = area_result["result"][
                                "addressComponent"]["province"]
                            re_data["CITY_NAME_"] = area_result["result"][
                                "addressComponent"]["city"]
                            re_data["AREA_NAME_"] = area_result["result"][
                                "addressComponent"]["district"]
                            re_data["AREA_CODE_"] = area_result["result"][
                                "addressComponent"]["adcode"]
                            re_data["CITY_CODE_"] = re_data[
                                "AREA_CODE_"][:4] + "00"
                            re_data["PROVINCE_CODE_"] = re_data[
                                "AREA_CODE_"][:2] + "00"
                        except KeyError:
                            # Incomplete answer: fall back to the district
                            # name and the crawled city, blank the rest.
                            re_data.update({
                                "ADDR_": shopping_name,
                                "PROVINCE_NAME_": None,
                                "CITY_NAME_": data["CITY_"] + "市",
                                "AREA_NAME_": None,
                                "AREA_CODE_": None,
                                "CITY_CODE_": None,
                                "PROVINCE_CODE_": None,
                            })

                re_data["NAME_"] = shopping_name
                re_data = super(Branchsssq,
                                self).generic_shuffle(data=data,
                                                      re_data=re_data,
                                                      field=None)
                re_data_list.append({
                    "TABLE_NAME_": self.p_client.table_name,
                    "DATA_": re_data
                })
        return re_data_list
Esempio n. 11
0
    def generic_shuffle(self, data):
        """
        Clean one raw subway-station record into rows for the target table.

        The record is geocoded from "<subway line><station>地铁站", the
        coordinates are refined by looking the station up among nearby
        subway stations, and the province/city/district fields are filled
        from a reverse lookup of the final coordinates.  All three lookups
        are best-effort: a failure is logged and the fields are left unset.

        :param data: raw record (dict).  Reads DATETIME_, URL_,
                     ENTITY_NAME_, ENTITY_CODE_, SUBWAY_NAME_ and
                     STATION_NAME_; TAGS_ is optional.
        :return: list of {"TABLE_NAME_": ..., "DATA_": ...} dicts, one per
                 comma-separated subway line found in SUBWAY_NAME_.
        """
        re_data = dict()
        # ID
        re_data["ID_"] = req_for_serial_number(code="WD_JT_DT")
        # Time dimension: date part of DATETIME_ with the dashes removed.
        re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace("-", "")
        # Tags are reset to empty when the raw record carries any.
        if "TAGS_" in data:
            re_data["TAGS_"] = ""
        # SOURCE: scheme + host of the record URL.  The pattern needs a "/"
        # after the host, so a bare "http://host" yields no match.
        source = re.findall(r"(https?://.*?)/", data["URL_"])
        if source:
            re_data["SOURCE_"] = source[0]
        # Source name: ENTITY_NAME_ up to the first "-".
        re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]
        # Source type: characters 3..7 of ENTITY_CODE_.
        re_data["SOURCE_TYPE_"] = data["ENTITY_CODE_"][3:8]

        subway_name = data["SUBWAY_NAME_"]
        station_name = data["STATION_NAME_"]
        # Text before the first "|" is used as the line name.  split() keeps
        # the whole string when there is no "|"; slicing with find() would
        # silently drop the last character in that case (find() == -1).
        temp_location = subway_name.split("|", 1)[0] + station_name + "地铁站"

        # Coarse coordinates from the textual address.
        re_data["LAT_"] = None
        re_data["LNG_"] = None
        try:
            location = get_lat_lng(address=temp_location)["result"]["location"]
            # Tuple assignment: either both coordinates are set or neither.
            re_data["LAT_"], re_data["LNG_"] = location["lat"], location["lng"]
        except KeyError:
            # Response without a location: keep both coordinates as None.
            pass
        except Exception as e:
            self.logger.info("获取经纬度失败错误信息为{}".format(e))

        if re_data["LAT_"]:
            # Refine the coordinates: page through subway stations within
            # 3000 of the coarse point until one has the exact station name.
            try:
                lat_origin = ",".join(
                    [str(re_data["LAT_"]),
                     str(re_data["LNG_"])])
                page = 0
                while True:
                    s3 = get_periphery(classify="地铁站",
                                       tag="交通设施",
                                       lat_lng=lat_origin,
                                       radius=3000,
                                       page_num=page)
                    found = False
                    for nearby in s3["results"]:
                        if nearby["name"] == station_name:
                            # Read both values before storing either one so
                            # a malformed entry cannot leave a mixed pair.
                            lat = str(nearby["location"]["lat"])
                            lng = str(nearby["location"]["lng"])
                            re_data["LAT_"] = lat
                            re_data["LNG_"] = lng
                            found = True
                            break
                    # A page with anything other than 20 results is treated
                    # as the last page.
                    if found or len(s3["results"]) != 20:
                        break
                    page += 1
            except Exception as e:
                self.logger.info(f"获取精确经纬度失败, ERROR: {e}")

            # Fill the address fields from whichever coordinates we ended up
            # with (refined when the station was found, coarse otherwise).
            try:
                area_result = get_area(",".join(
                    [str(re_data["LAT_"]),
                     str(re_data["LNG_"])]))
            except Exception as e:
                self.logger.info(f"获取地址失败, ERROR: {e}")
            else:
                try:
                    component = area_result["result"]["addressComponent"]
                    re_data["PROVINCE_NAME_"] = component["province"]
                    re_data["CITY_NAME_"] = component["city"]
                    re_data["AREA_NAME_"] = component["district"]
                    re_data["AREA_CODE_"] = component["adcode"]
                    re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
                    re_data["PROVINCE_CODE_"] = re_data[
                        "AREA_CODE_"][:2] + "00"
                except KeyError:
                    # Incomplete response: keep whatever was filled so far.
                    pass

        # Station name
        re_data["STATION_NAME_"] = station_name
        # Routes passing through ("地铁N号线"), comma separated; "" if none.
        normalized_subway = subway_name.replace("|", "-")
        re_data["AROUND_ROUTE_"] = ",".join(
            re.findall(r"地铁\d+号线", normalized_subway))

        # Subway name: one output row per comma-separated line.
        subway_lines = normalized_subway.split(",")
        split_record = len(subway_lines) > 1
        re_data_list = list()
        for subway in subway_lines:
            if split_record:
                # Every row of a split record gets its own serial number.
                re_data["ID_"] = req_for_serial_number(code="WD_JT_DT")
            re_data["SUBWAY_NAME_"] = subway + "-" + re_data["STATION_NAME_"]
            re_data = super(Branchjtdt,
                            self).generic_shuffle(data=data,
                                                  re_data=re_data,
                                                  field=None)
            # re_data is reused on the next iteration, so store a snapshot.
            re_data_list.append({
                "TABLE_NAME_": self.p_client.table_name,
                "DATA_": deepcopy(re_data)
            })
        return re_data_list
Esempio n. 12
0
    def generic_shuffle(self, data, re_data, field=None):
        """
        Generic cleaning rules applied on top of a script-specific result.

        Assigns an ID when missing, uploads the attachment referenced by
        FILE_URL_ (if any), copies identifying fields from the raw record
        and fills bookkeeping defaults that the caller has not set.

        :param data: raw record being cleaned, type: dict
        :param re_data: cleaned record built so far, type: dict; it is
                        updated in place and also returned
        :param field: kept for interface compatibility; not read here
        :return: the cleaned record, type: dict
        :raises Exception: when the attachment request returns a falsy
                           response or the upload fails
        """
        # NOTE: bank matching is not done here; per the original note it is
        # handled centrally in the package's __init__ module.

        if "ID_" not in re_data:
            re_data["ID_"] = req_for_serial_number(
                code=data["ENTITY_CODE_"][:8])

        def origin():
            """Describe the raw record for log messages (built lazily)."""
            return (f" 原始数据 collection = {self.m_client.mongo_collection};"
                    f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                    f" 原始数据 _id = {data['_id']};")

        # File upload.  Records without FILE_URL_ simply have no attachment.
        file_url = data.get("FILE_URL_")
        if file_url:
            re_postfix = re.findall(r"\.([pd][do][fc]x?$)", file_url)
            # These two entities serve files without a recognisable
            # extension; they are treated as pdf.
            if re_postfix or data.get('ENTITY_CODE_') in [
                    'XYK_YJBG_GFYH', 'XYK_YJBG_JTYH'
            ]:
                postfix = re_postfix[0] if re_postfix else 'pdf'
                if "FILE_NAME_" in data:
                    file_name = data["FILE_NAME_"]
                else:
                    # NOTE(review): this captures from the first "/" of the
                    # URL, so the name may still contain path separators.
                    re_file_name = re.findall(rf"/(.*?)\.{postfix}",
                                              file_url, re.IGNORECASE)
                    if re_file_name:
                        file_name = re_file_name[0]
                    else:
                        file_name = str(uuid.uuid1())
                try:
                    response = req_for_something(url=file_url)
                except Exception as e:
                    # A failed download is logged and the record continues
                    # without an attachment.
                    self.logger.exception(
                        f"2.1--err: PDF{origin()}error: {e}.")
                else:
                    print('附件请求成功')
                    if not response:
                        # Not inside an except handler, so there is no
                        # traceback to attach: log as a plain error.
                        self.logger.error(
                            f"2.1--err: PDF{origin()}error: PDF 请求失败.")
                        raise Exception("文件请求失败")
                    try:
                        p_response = req_for_file_save(
                            id=re_data["ID_"],
                            type_code="CHA_YJBG",
                            file_name=file_name,
                            postfix=postfix,
                            file=response.content)
                        try:
                            upload_result = p_response.content.decode("utf-8")
                            if "error" in upload_result:
                                self.logger.info(
                                    f"2.3--err:文件上传错误."
                                    f" 原始数据collection={self.m_client.mongo_collection};"
                                    f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                                    f" 原始数据 _id = {data['_id']};"
                                    f" error: {upload_result}."
                                )
                                raise Exception("上传文件出错")
                            self.logger.info(
                                f"2.3--success: 文件上传成功."
                                f"{upload_result}")
                        finally:
                            # Close the upload response on the error path too.
                            p_response.close()
                    except Exception as e:
                        self.logger.exception(
                            f"2.1--err: PDF{origin()}error: {e}.")
                        raise Exception("上传文件出错") from e
                    finally:
                        response.close()

        # Identifying fields are copied from the raw record unless the
        # caller already set them.
        if "ENTITY_CODE_" not in re_data:
            re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
        if "ENTITY_NAME_" not in re_data:
            re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        if "URL_" not in re_data and "URL_" in data:
            re_data["URL_"] = data["URL_"]

        # Creation time and operator
        re_data["CREATE_TIME_"] = time.strftime("%Y-%m-%d %H:%M:%S",
                                                time.localtime())
        re_data["CREATE_BY_ID_"] = CREATE_ID
        re_data["CREATE_BY_NAME_"] = CREATE_NAME

        # Crawl time: DATETIME_ as-is, else the date part of DEALTIME_.
        if "DATETIME_" in data:
            re_data["SPIDER_TIME_"] = data["DATETIME_"]
        elif "DEALTIME_" in data:
            re_data["SPIDER_TIME_"] = arrow.get(
                data["DEALTIME_"]).format("YYYY-MM-DD")

        # Bookkeeping defaults; values already present are kept.
        re_data.setdefault("PERIOD_CODE_", re_data.get("PUBLISH_TIME_", ""))
        re_data.setdefault("M_STATUS_", "N")
        re_data.setdefault("DELETE_STATUS_", "N")
        re_data.setdefault("DATA_STATUS_", "UNCHECK")
        re_data.setdefault("VERSION_", "0")
        re_data.setdefault("DATA_VERSION_", "0")
        # Entities whose code contains "MICROBLOG" get no default here.
        if "MICROBLOG" not in re_data["ENTITY_CODE_"]:
            re_data.setdefault("PUBLISH_STATUS_", "N")

        return re_data
Esempio n. 13
0
    def generic_shuffle(self, data):
        """
        Clean one raw credit-card record; most fields are mined from the
        "|"-delimited text in CONTENT_.

        :param data: raw record (dict).  Reads ENTITY_CODE_, ENTITY_NAME_,
                     URL_, DATETIME_ and CONTENT_; PRO_NAME_,
                     CURRENCY_TYPE_, BRAND_, LEVEL_, CONSUME_LIMIT_,
                     GRACE_PERIODS_, SPECIAL_, VAS_ and IMAGES_ are used
                     when present.  PRO_NAME_ is truncated in place at its
                     first parenthesis.
        :return: single-element list [{"TABLE_NAME_": ..., "DATA_": ...}]
        """
        re_data = dict()
        re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
        re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        re_data["URL_"] = data["URL_"]

        re_data["ID_"] = req_for_serial_number(code="JRCP_XYK")
        # Time dimension: date part of DATETIME_ with the dashes removed.
        re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace("-", "")

        # Scheme + host of the record URL; absent when the URL has no path.
        source = re.findall(r"(https?://.*?)/", data["URL_"])
        if source:
            re_data["SOURCE_"] = source[0]
        re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"]
        re_data["SOURCE_TYPE_"] = "WAK"

        # Bank: the first name_dict key whose first two characters occur in
        # the product name wins.
        pro_name = data.get("PRO_NAME_")
        if pro_name is not None:
            for key, value in self.name_dict.items():
                if key[:2] in pro_name:
                    re_data["BANK_NAME_"] = key
                    re_data["BANK_CODE_"] = value
                    break
        # Short bank names are expanded to the official name.
        bank_aliases = {
            "建信": "中国建设银行",
            "建行": "中国建设银行",
            "建设银行": "中国建设银行",
            "农行": "中国农业银行",
            "农业银行": "中国农业银行",
            "工行": "中国工商银行",
            "工商银行": "中国工商银行",
            "民生银行": "中国民生银行",
            "光大银行": "中国光大银行",
            "交行": "交通银行",
            "招行": "招商银行",
            "中行": "中国银行",
            "中银": "中国银行",
            "邮储银行": "中国邮政储蓄银行",
        }
        if "BANK_NAME_" in re_data:
            re_data["BANK_NAME_"] = bank_aliases.get(re_data["BANK_NAME_"],
                                                     re_data["BANK_NAME_"])

        # Card name: drop a trailing parenthesised remark.  The ASCII "("
        # is tried first; the full-width "(" is presumably what the
        # original duplicated branch was meant to handle.
        if "PRO_NAME_" in data:
            for paren in ("(", "("):
                if paren in data["PRO_NAME_"]:
                    data["PRO_NAME_"] = data["PRO_NAME_"][:data["PRO_NAME_"].
                                                          find(paren)]
                    break
            re_data["PRO_NAME_"] = data["PRO_NAME_"]

        # Card currency and its code.
        if "CURRENCY_TYPE_" in data:
            re_data["CURRENCY_TYPE_"] = data["CURRENCY_TYPE_"]
        currency = data.get("CURRENCY_TYPE_") or ""
        if currency == "人民币":
            re_data["CURRENCY_TYPE_CODE_"] = "RMB"
        elif currency.startswith("人民币/") or currency == "美元":
            re_data["CURRENCY_TYPE_CODE_"] = "DBZ"

        # Card organisation | settlement channel, and its code.
        if "BRAND_" in data:
            re_data["BRAND_"] = data["BRAND_"]
        brand = data.get("BRAND_") or ""
        for brand_key in self.brand_dict:
            if brand_key in brand:
                re_data["BRAND_CODE_"] = self.brand_dict[brand_key]
                break

        # Card level and its code (matched on the first two characters).
        if "LEVEL_" in data:
            re_data["LEVEL_"] = data["LEVEL_"]
        level_prefix = (data.get("LEVEL_") or "")[:2]
        for level_key in self.level_dict:
            if level_key[:2] in level_prefix:
                re_data["LEVEL_CODE_"] = self.level_dict[level_key]
                break

        # Cash withdrawal limit
        if "CONSUME_LIMIT_" in data:
            re_data["CONSUME_LIMIT_"] = data["CONSUME_LIMIT_"]

        # ---- fields mined from the big text start here ----
        content = data["CONTENT_"]

        def labelled_value(label, colon="[::]"):
            """Return the cell after "<label><colon>|" in CONTENT_, or None.

            *label* is a regex fragment.  Both the ASCII and the full-width
            colon are accepted.
            """
            found = re.search(label + colon + r"\|(.*?)\|", content)
            return found.group(1) if found else None

        def is_stray_label(value, *labels):
            """True when *value* is just another field's label.

            That happens when the wanted cell is empty and the next label
            gets captured instead.
            """
            return any(value == label + colon for label in labels
                       for colon in (":", ":"))

        def first_group(pattern, value):
            """Collapse *value* to group 1 when *pattern* matches its start."""
            hit = re.match(pattern, value)
            return hit.group(1) if hit else value

        # Interest-free period
        grace = labelled_value("免息期")
        if grace is None:
            re_data["GRACE_PERIODS_"] = data.get("GRACE_PERIODS_", "")
        else:
            # Malformed "到20天50天" -> "20天到50天".
            swapped = re.compile(r"到(\d+)天(\d+)天")
            if swapped.match(grace):
                grace = swapped.sub(r"\1天到\2天", grace)
            if is_stray_label(grace, "消费验证方式", "预借现金额度"):
                grace = ""
            # The rules below are applied in order; each removes one known
            # kind of duplicated text.
            grace = first_group(r"(最长\d+天)最长\d+天", grace)
            hit = re.match(r"(\d+天)到(\d+天)(\d+天)到\d+天", grace)
            if hit:
                low, high, repeat = hit.groups()
                grace = low + "到" + (repeat if low == high else high)
            hit = re.match(r"(\d+天)\d+天(\d+天)\d+天", grace)
            if hit:
                grace = hit.group(1) + "到" + hit.group(2)
            hit = re.match(r"(\d+天)(\d+天)", grace)
            if hit:
                grace = hit.group(1) + "到" + hit.group(2)
            hit = re.match(r"至(\d+天)(\d+天)", grace)
            if hit:
                grace = hit.group(1) + "到" + hit.group(2)
            re_data["GRACE_PERIODS_"] = grace

        # Annual-fee waiver policy (left unset when not found).
        free_policy = labelled_value("免年费政策")
        if free_policy is not None:
            # Collapse repeated phrases to a single occurrence.
            for repeated in (r"(免\d+年年费){2,9}", r"(终身免年费){2,9}"):
                free_policy = first_group(repeated, free_policy)
            re_data["FREE_POLICY_"] = free_policy

        # Main-card annual fee: first number in the cell.
        fee = labelled_value("主卡年费")
        if fee is None:
            re_data["FEE_"] = "0"
        else:
            amount = re.search(r"\d+", fee)
            re_data["FEE_"] = amount.group() if amount else ""

        # Cash-advance limit
        pre_borrow = labelled_value("预借现金额度")
        if pre_borrow is None:
            re_data["PRE_BORROW_"] = ""
        else:
            if is_stray_label(pre_borrow, "免息期", "免年费政策"):
                pre_borrow = ""
            # Keep only the first of several concatenated statements.
            for repeated in (
                    r"(信用额度的\d+%)信用额度的\d+%",
                    r"(信用额度的\d+-\d+%)信用额度的\d+%",
                    r"(普卡信用额度的\d+%)白金卡信用额度的\d+%金卡信用额度的\d+%",
                    r"(普卡信用额度的\d+%)金卡信用额度的\d+%",
                    r"(白金卡信用额度的\d+%)金卡信用额度的\d+%",
            ):
                pre_borrow = first_group(repeated, pre_borrow)
            re_data["PRE_BORROW_"] = pre_borrow

        # Purchase verification method (fixed value)
        re_data["VALID_CONSUME_"] = "密码+签名 签名"

        # Statement date; "账单日21号账单日21号" -> "账单日21号".
        bill_date = labelled_value("账单日")
        if bill_date is None:
            re_data["BILL_DATE_"] = ""
        else:
            re_data["BILL_DATE_"] = first_group(r"(账单日\d+号){2,9}",
                                                bill_date)

        # Points scheme: insert spaces between the concatenated sentences.
        points = labelled_value("积分方式")
        if points is None:
            re_data["POINTS_"] = ""
        elif re_data.get("BANK_CODE_") == "CMB":
            re_data["POINTS_"] = points.replace("元", "元 ")
        else:
            points = points.replace("分", "分 ")
            points = points.replace("倍", "倍 ")
            # Undo the split introduced inside "积分的2倍".
            re_data["POINTS_"] = points.replace("积分 的2倍", "积分的2倍")

        # Points validity: put a space between the grouped periods.
        valid_points = labelled_value("积分有效期")
        if valid_points is None:
            re_data["VALID_DATE_POINTS_"] = ""
        else:
            for grouped in (r"(白金卡\d+年)(金卡\d+年)(普卡\d+年)",
                            r"(\d+年到\d+年)(\d+年)(永久有效)"):
                if re.match(grouped, valid_points):
                    valid_points = re.sub(grouped, r"\1 \2 \3", valid_points)
            re_data["VALID_DATE_POINTS_"] = valid_points

        # Revolving credit interest (daily).  The parentheses belong to the
        # label text, so they are matched literally (ASCII or full-width);
        # left unescaped they would form a capture group and the stored
        # value would be a tuple.
        daily_interest = labelled_value("循环信用利息[((]日息[))]",
                                        colon="[::]?")
        if daily_interest is None:
            re_data["DAILY_INTEREST_"] = ""
        else:
            if is_stray_label(daily_interest, "消费短信通知费"):
                daily_interest = ""
            re_data["DAILY_INTEREST_"] = daily_interest

        # Minimum repayment
        min_repay = labelled_value("最低还款", colon="[::]?")
        if min_repay is None:
            re_data["MIN_REPAY_"] = ""
        else:
            min_repay = first_group(
                r"(最低应还所欠金额的\d+%)最低应还所欠金额的\d+%", min_repay)
            if is_stray_label(min_repay, "账单日"):
                min_repay = ""
            re_data["MIN_REPAY_"] = min_repay

        # Card features / value-added services: "|" becomes an HTML break.
        if data.get("SPECIAL_"):
            re_data["SPECIAL_"] = data["SPECIAL_"].replace("|", "<br/>")
        if data.get("VAS_"):
            re_data["VAS_"] = data["VAS_"].replace("|", "<br/>")

        # Card image, stored base64-encoded.
        if "IMAGES_" in data:
            # Repair URLs wrongly prefixed as "https:http://...".
            broken = re.match(r"https:(http://.*)", data["IMAGES_"])
            image_url = broken.group(1) if broken else data["IMAGES_"]
            response = req_for_something(url=image_url)
            if response:
                try:
                    re_data["IMAGE_"] = base64.b64encode(
                        response.content).decode("utf-8")
                finally:
                    response.close()

        re_data = super(BranchXyk, self).generic_shuffle(data=data,
                                                         re_data=re_data,
                                                         field=None)
        # The crawl time doubles as the publish time for this table.
        re_data["PUBLISH_TIME_"] = re_data["SPIDER_TIME_"]
        return [{"TABLE_NAME_": self.script_name, "DATA_": re_data}]
Esempio n. 14
0
    def generic_shuffle(self, data):
        """
        Clean one raw WeChat article record.

        Records with a falsy TITLE_ are skipped (None is returned).  For
        the rest, fixed defaults are filled in, the account's name and
        location are copied from the matching entry of self.excel_dict,
        and the article HTML is sanitised and moved from data["CONTENT_"]
        to data["HTML_"] before the generic rules run.

        :param data: raw record (dict); modified in place (CONTENT_ is
                     removed, HTML_ is added)
        :return: [{"TABLE_NAME_": ..., "DATA_": ...}] or None
        """
        re_data = dict()
        if not data["TITLE_"]:
            return

        re_data["ID_"] = req_for_serial_number(code="WECHAT")
        re_data["PERIOD_CODE_"] = data["PERIOD_CODE_"].replace("-", "")
        re_data["SOURCE_TYPE_"] = "WECHAT"
        re_data["HOT_"] = "0"
        re_data["PUBLISH_TIME_"] = data["PERIOD_CODE_"]

        # Title plus its base64 form.
        re_data["TITLE_"] = data["TITLE_"]
        encoded_title = base64.b64encode(re_data["TITLE_"].encode("utf-8"))
        re_data["TITLE_CODE_"] = encoded_title.decode("utf-8")

        # Account metadata comes from the first excel_dict entry whose
        # WECHAT_CODE_ equals the stripped account id.
        re_data["WECHAT_ID_"] = data["WECHAT_"].strip()
        account = next(
            (entry for entry in self.excel_dict
             if re_data["WECHAT_ID_"] == entry["WECHAT_CODE_"]),
            None,
        )
        if account is not None:
            re_data["WECHAT_NAME_"] = account["WECHAT_NAME_"]
            re_data["PROVINCE_NAME_"] = account["PROVINCE_NAME_"]
            # Codes are stringified and cut at the first "." (if any).
            re_data["PROVINCE_CODE_"] = str(
                account["PROVINCE_CODE_"]).partition(".")[0]
            re_data["CITY_NAME_"] = account["CITY_NAME_"]
            re_data["CITY_CODE_"] = str(
                account["CITY_CODE_"]).partition(".")[0]
            re_data["LAT_"] = str(account["LAT_"])
            re_data["LNG_"] = str(account["LNG_"])

        # Fixed defaults.
        re_data["IMPORTANCE_"] = "N"
        re_data["READS_"] = "0"
        re_data["COMMENTS_"] = "0"
        re_data["PUBLISH_STATUS_"] = "N"
        re_data["SENSITIVE_"] = "N"
        re_data["VERSION_"] = "0"
        re_data["RECOMMEND_"] = "0"

        # Sanitise the article body: drop line breaks/tabs, remove script
        # elements and neutralise every link target.
        cleaned = re.sub(r"[\n\t\r]+", "", data["CONTENT_"])
        cleaned = re.sub(r"<script.*?</script>", "", cleaned)
        cleaned = re.sub(r"href=\".*?\"", "href=\"javascript:void(0);\"",
                         cleaned)

        # The raw record carries the sanitised body under HTML_ from here on.
        del data["CONTENT_"]
        data["HTML_"] = cleaned

        re_data = super(WechatScript,
                        self).generic_shuffle(data=data,
                                              re_data=re_data,
                                              field="ENTITY_NAME_")
        # A truthy '_id' must not be carried into the output row.
        if re_data.get('_id'):
            del re_data['_id']
        return [{
            "TABLE_NAME_": self.p_client.table_name,
            "DATA_": re_data
        }]
Esempio n. 15
0
    def generic_shuffle(self, data):
        """
        Clean one raw Weibo post into rows for the info and comment tables.

        The post itself becomes one CHA_BRANCH_WEIBO_INFO row; every entry of
        ``data["INFO_COMMENTS_"]`` becomes one CHA_BRANCH_WEIBO_COMMENT row that
        points back to the post through ``INFO_ID_``.

        :param data: raw post record, type: dict. Must contain BANK_CODE_,
                     CONTENT_URL_, PUBLISH_TIME_, ENTITY_NAME_, PRAISES_,
                     REPLIES_, RELAYS_, CONTENT_ and INFO_COMMENTS_;
                     CONTENT_IMAGES_ and OWN_ are optional.
        :return: list of ``{"TABLE_NAME_": ..., "DATA_": ...}`` dicts, the info
                 row first, followed by one row per comment.
        """
        re_data = list()
        # CHA_BRANCH_WEIBO_INFO
        info_data = dict()
        info_data["ID_"] = req_for_serial_number(code="WEIBO_INFO")

        info_data["ENTITY_CODE_"] = data["BANK_CODE_"]

        info_data["URL_"] = data["CONTENT_URL_"]

        info_data["PERIOD_CODE_"] = data["PUBLISH_TIME_"].replace("-", "")
        # Data source: scheme + host of the post URL. The pattern needs a "/"
        # after the host, so a bare "https://host" yields no match; SOURCE_ is
        # then left unset instead of raising IndexError.
        source = re.findall(r"(https?://.*?)/", data["CONTENT_URL_"])
        if source:
            info_data["SOURCE_"] = source[0]
        # Data source: site name (text before the first "-")
        info_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]

        info_data["SOURCE_TYPE_"] = "WEIBO"

        # Missing/empty counters are stored as 0.
        info_data["LIKES_"] = data["PRAISES_"] or 0
        info_data["COMMENTS_"] = data["REPLIES_"] or 0
        info_data["RELAYS_"] = data["RELAYS_"] or 0
        info_data["IMPORTANCE_"] = "N"
        info_data["PUBLISH_TIME_"] = data["PUBLISH_TIME_"]
        info_data["CONTENT_"] = data["CONTENT_"]

        # Images are stored base64-encoded under IMAGE_1, IMAGE_2, ...
        # enumerate() numbers by position, so a repeated URL gets its own
        # slot instead of overwriting the first occurrence.
        for position, image_url in enumerate(
                data.get("CONTENT_IMAGES_") or [], start=1):
            response = req_for_something(url=image_url)
            if response:
                try:
                    encoded = base64.b64encode(response.content)
                    info_data[f"IMAGE_{position}"] = encoded.decode("utf-8")
                finally:
                    response.close()

        # TYPE_ / TYPE_CODE_ are back-filled elsewhere.
        info_data["PUBLISH_STATUS_"] = "N"
        if "OWN_" in data:
            # "转载" marks a repost, i.e. not the account's own content.
            info_data["OWN_"] = "N" if data["OWN_"] == "转载" else "Y"

        for each in self.weibo_list:
            if each["WEIBO_NAME_"] == data["ENTITY_NAME_"]:
                info_data["WEIBO_CODE_"] = each["WEIBO_CODE_"]
                info_data["WEIBO_NAME_"] = each["WEIBO_NAME_"]
                break

        # Model calls are best-effort: a failure is logged and a default used.
        # Summary
        try:
            brief = req_for_ts(info_data["CONTENT_"])
            if brief:
                info_data["BRIEF_"] = brief["summary"]
        except Exception as e:
            self.logger.info(f"调用模型req_for_ts失败,原因为{e}")
            info_data["BRIEF_"] = ""
        # Sensitive-content check
        try:
            censor = req_for_censor(info_data["CONTENT_"])
            if censor:
                if censor["censor"] == "N":
                    info_data["SENSITIVE_"] = "N"
                else:
                    info_data["SENSITIVE_"] = "Y"
                    info_data["SENSITIVE_WORD_"] = censor["words"]
        except Exception as e:
            self.logger.info(f"调用模型censor失败,错误为{e}")
            info_data["SENSITIVE_"] = "N"

        info_data["VERSION_"] = "0"
        info_data = super(WeiboScript, self).generic_shuffle(
            data=data, re_data=info_data, field="ENTITY_NAME_")

        # Fixed BANK_NAME_ / BANK_CODE_ for accounts whose entity name needs a
        # hard-coded bank mapping.
        bank_overrides = {
            "上海浦东发展银行微博": ("浦发银行", "SPDB"),
            "南海农商银行微博": ("广东南海农村商业银行股份有限公司", "NRC"),
            "顺德农商银行微博": ("广东顺德农村商业银行股份有限公司", "sdebank"),
        }
        override = bank_overrides.get(info_data["ENTITY_NAME_"])
        if override:
            info_data["BANK_NAME_"], info_data["BANK_CODE_"] = override

        comment = data["INFO_COMMENTS_"]
        verifieds = sum(1 for c in comment if c.get("VERIFIED_", ""))

        # Weibo heat level
        try:
            hot = req_for_weibo_hot(publish_time=info_data["PUBLISH_TIME_"],
                                    relays=info_data["RELAYS_"],
                                    replies=len(comment),
                                    praises=info_data["LIKES_"],
                                    verifieds=verifieds)
            if hot:
                info_data["HOT_"] = hot["level"]
        except Exception as e:
            self.logger.info(f"调用模型weibo_hot失败,错误为{e}")

        re_data.append({"TABLE_NAME_": TABLE_NAME("CHA_BRANCH_WEIBO_INFO"),
                        "DATA_": info_data})

        # Model sentiment label -> stored EMOTION_ code.
        # NOTE(review): "NAGETIVE" looks like a misspelling of "NEGATIVE"; it
        # is kept as-is because stored data / consumers may rely on this
        # exact value - confirm before changing.
        emotion_codes = {
            "中性": "NORMAL",
            "积极": "POSITIVE",
            "敏感": "NAGETIVE",
        }

        if len(comment) > 0:
            comment_count = 0
            for each in comment:
                # CHA_BRANCH_WEIBO_COMMENT
                # A fresh dict per comment; reusing one would duplicate rows.
                comment_data = dict()
                # HBase row_key
                comment_data["ID_"] = req_for_serial_number(
                    code="WEIBO_COMMENT")
                comment_data["INFO_ID_"] = info_data["ID_"]
                comment_data["COMMENT_"] = each["COMMENT_"]
                comment_data["REPLIER_TIME_"] = each["REPLIER_TIME_"]
                comment_data["REPLIER_HEAD_"] = each["REPLIER_HEAD_"]
                comment_data["REPLIER_PRAISES_"] = each["REPLIER_PRAISES_"]
                comment_data["REPLIER_"] = each["REPLIER_"]
                comment_data["REPLIER_REPLIES_"] = each["REPLIER_REPLIES_"]

                # The models are only called for non-empty comment text.
                if each.get("COMMENT_"):
                    # Sentiment analysis
                    try:
                        sentiment = req_for_comment(each["COMMENT_"])
                        if sentiment:
                            emotion = emotion_codes.get(
                                sentiment["sentiment"])
                            # Unknown labels leave EMOTION_ unset.
                            if emotion is not None:
                                comment_data["EMOTION_"] = emotion
                        else:
                            comment_data["EMOTION_"] = "NORMAL"
                    except Exception as e:
                        self.logger.info(f"调用模型req_for_comment失败,错误为{e}")
                        comment_data["EMOTION_"] = "NORMAL"

                    # Sensitive-content check
                    try:
                        censor = req_for_censor(each["COMMENT_"])
                        if censor:
                            if censor["censor"] == "N":
                                comment_data["SENSITIVE_"] = "N"
                            else:
                                comment_data["SENSITIVE_"] = "Y"
                                comment_data["SENSITIVE_WORD_"] = censor[
                                    "words"]
                        else:
                            comment_data["SENSITIVE_"] = "N"
                    except Exception as e:
                        # Names the call that actually failed (the message
                        # previously said req_for_comment).
                        self.logger.info(f"调用模型req_for_censor失败,错误为{e}")
                        comment_data["SENSITIVE_"] = "N"

                comment_data["VERSION_"] = "0"
                comment_data["CREATE_BY_ID_"] = "P0131857"
                comment_data["CREATE_BY_NAME_"] = "钟楷文"
                re_data.append({
                    "TABLE_NAME_": TABLE_NAME("CHA_BRANCH_WEIBO_COMMENT"),
                    "DATA_": comment_data
                })
                comment_count += 1
            # Comment statistics, logged to ease debugging.
            self.logger.info(f'清洗的URL为{info_data["URL_"]}')
            self.logger.info(f'清洗的评论数为{info_data["COMMENTS_"]}')
            self.logger.info(f'插入到comment表的数量为{comment_count}')
        return re_data
Esempio n. 16
0
    def generic_shuffle(self, data, re_data, field="CONTENT_"):
        """
        Apply the cleaning rules shared by all scripts (parent-class hook).

        In order: match banks mentioned in ``data[field]``, make sure the
        record has an ``ID_``, upload the attached HTML/PDF to file storage,
        then fill the common bookkeeping fields (entity, URL, creator,
        crawl time, status flags).

        :param data: raw record to clean, type: dict
        :param re_data: cleaned record built so far; it is updated in place,
                        type: dict
        :param field: name of the field to scan for bank names,
                      type: str: "CONTENT_" or "PRO_NAME_" or ...
                            NoneType: None skips bank matching
        :return: the cleaned record (``re_data``), type: dict
        """
        # Bank matching is skipped when no field is given or when the caller
        # has already resolved BANK_NAME_.
        if field and "BANK_NAME_" not in re_data:
            self._match_bank(data, re_data, field)

        if "ID_" not in re_data:
            re_data["ID_"] = req_for_serial_number(
                code=data["ENTITY_CODE_"][:7])

        # FDFS storage: the type code is derived from the entity / bank code.
        # It stays None when no rule matches; the upload helpers then log an
        # error and skip the upload instead of hitting an unbound variable.
        tc = None
        if "ENTITY_CODE_" in data:
            entity_code = data["ENTITY_CODE_"]
            if entity_code[:2] == "ZX":
                tc = "NEWS"
            elif "WECHAT" in entity_code:
                tc = "WECHAT"
            elif "JRCP_BX" in entity_code:
                tc = "INSURANCE"
            elif "JRCP_LCCP" in entity_code:
                tc = "LCCP"
        elif "BANK_CODE_" in data:
            if "MICROBLOG" in data["BANK_CODE_"]:
                tc = "WEIBOBASIC"

        # Only the first attachment key present is handled, even if its value
        # turns out to be empty.
        if "HTML_" in data:
            if data["HTML_"]:
                if "HTML_NAME_" in data:
                    html_name = data["HTML_NAME_"]
                elif "PDF_NAME_" in data:
                    html_name = data["PDF_NAME_"]
                else:
                    html_name = str(uuid.uuid1())
                self._save_html(data, re_data, tc, html_name)
        elif "PDF_" in data:
            if data["PDF_"]:
                if "HTML_NAME_" in data:
                    pdf_name = data["HTML_NAME_"]
                elif "PDF_NAME_" in data:
                    pdf_name = data["PDF_NAME_"]
                else:
                    pdf_name = self._pdf_name_from_url(data["PDF_"])
                self._fetch_and_save_pdf(data, re_data, tc, data["PDF_"],
                                         pdf_name)
        elif "PDF_1_" in data:
            if data["PDF_1_"]:
                for i in range(10):
                    url_key = f"PDF_{i}_"
                    if url_key not in data:
                        if i == 0:
                            # Numbering normally starts at PDF_1_; a missing
                            # PDF_0_ must not end the scan before it begins.
                            continue
                        # First gap after index 0 marks the end of the list.
                        break
                    pdf_url = data[url_key]
                    if not pdf_url:
                        continue
                    name_key = f"PDF_{i}_NAME_"
                    if name_key in data:
                        pdf_name = data[name_key]
                    else:
                        pdf_name = self._pdf_name_from_url(pdf_url)
                    self._fetch_and_save_pdf(data, re_data, tc, pdf_url,
                                             pdf_name)
        elif "PDF_URL_" in data:
            if data["PDF_URL_"]:
                if "PDF_NAME_" in data:
                    pdf_name = data["PDF_NAME_"]
                else:
                    pdf_name = self._pdf_name_from_url(data["PDF_URL_"])
                self._fetch_and_save_pdf(data, re_data, tc, data["PDF_URL_"],
                                         pdf_name)

        if "ENTITY_CODE_" not in re_data:
            re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
        if "ENTITY_NAME_" not in re_data:
            re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        if "URL_" not in re_data and "URL_" in data:
            re_data["URL_"] = data["URL_"]

        # Creation time and operator
        re_data["CREATE_TIME_"] = time.strftime("%Y-%m-%d %H:%M:%S",
                                                time.localtime())
        re_data["CREATE_BY_ID_"] = CREATE_ID
        re_data["CREATE_BY_NAME_"] = CREATE_NAME

        # Crawl time: DATETIME_ is copied as-is; otherwise DEALTIME_ is
        # reduced to its date part.
        if "DATETIME_" in data:
            re_data["SPIDER_TIME_"] = data["DATETIME_"]
        elif "DEALTIME_" in data:
            re_data["SPIDER_TIME_"] = arrow.get(
                data["DEALTIME_"]).format("YYYY-MM-DD")

        # Status flags are only defaulted, never overwritten.
        if "M_STATUS_" not in re_data:
            re_data["M_STATUS_"] = "N"
        if "DELETE_STATUS_" not in re_data:
            re_data["DELETE_STATUS_"] = "N"
        if "DATA_STATUS_" not in re_data:
            re_data["DATA_STATUS_"] = "UNCHECK"
        if "MICROBLOG" not in re_data[
                "ENTITY_CODE_"] and "PUBLISH_STATUS_" not in re_data:
            re_data["PUBLISH_STATUS_"] = "N"

        return re_data

    def _match_bank(self, data, re_data, field):
        """Fill BANK_NAME_ / BANK_CODE_ in ``re_data`` from ``data[field]``."""
        if data.get("ENTITY_CODE_", "")[:2] == "ZX":
            # News records: ask the NER model for an organisation and map it
            # to the first bank whose aliases contain it.
            if field not in data:
                return
            try:
                result = req_for_ner(data[field])
            except Exception as e:
                self.logger.exception(
                    f"2.2--err: 请求模型 req_for_ner 错误."
                    f" 原始数据 collection = {self.m_client.mongo_collection};"
                    f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                    f" 原始数据 _id = {data.get('_id')};"
                    f" error: {e}.")
                return
            if result and "Organ" in result and result["Organ"].get(
                    "entity", ""):
                organ = result["Organ"]["entity"]
                for each in self.bank_list:
                    if organ in each["ALIAS_"]:
                        re_data["BANK_NAME_"] = each["NAME_"]
                        re_data["BANK_CODE_"] = each["CODE_"]
                        break
        else:
            # Other records: every bank whose name occurs in the field is
            # kept, joined with "|".
            text = data.get(field, "")
            matched = [each for each in self.bank_list
                       if each["NAME_"] in text]
            if matched:
                re_data["BANK_NAME_"] = "|".join(
                    each["NAME_"] for each in matched)
                re_data["BANK_CODE_"] = "|".join(
                    each["CODE_"] for each in matched)

    @staticmethod
    def _pdf_name_from_url(url):
        """Derive a PDF file name from ``url``, falling back to a UUID."""
        if ".PDF" in url or ".pdf" in url:
            # NOTE(review): the pattern captures from the FIRST "/" in the
            # URL and its dot is unescaped, so the result is usually more than
            # the base name. Kept unchanged because it determines the stored
            # file names - confirm the intended behaviour before tightening.
            file_name = re.findall(r"/(.*?).pdf", url, re.IGNORECASE)
            if file_name:
                return file_name[0]
        return str(uuid.uuid1())

    def _file_error_message(self, data, error):
        """Build the "2.1" log line used for fetch/upload failures."""
        return (f"2.1--err: PDF"
                f" 原始数据 collection = {self.m_client.mongo_collection};"
                f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                f" 原始数据 _id = {data.get('_id')};"
                f"error: {error}.")

    def _upload_error_message(self, data, error):
        """Build the "2.3" log line used when storage reports an error."""
        return (f"2.3--err:文件上传错误."
                f" 原始数据collection={self.m_client.mongo_collection};"
                f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                f" 原始数据 _id = {data.get('_id')};"
                f" error: {error}.")

    def _save_html(self, data, re_data, tc, html_name):
        """Upload ``data["HTML_"]`` to file storage; failures are logged."""
        if tc is None:
            self.logger.error(self._file_error_message(
                data, "no file type code for this ENTITY_CODE_/BANK_CODE_"))
            return
        try:
            response_file = req_for_file_save(
                id=re_data["ID_"],
                type_code=f"CHA_{tc}_HTML",
                file_name=html_name,
                postfix="html",
                file=data["HTML_"].encode("utf-8"))
            try:
                body = response_file.content.decode("utf-8")
                if "error" in body:
                    self.logger.info(self._upload_error_message(data, body))
                    raise Exception(f"附件上传错误{body}")
            finally:
                # Always release the connection, including on the error path.
                response_file.close()
        except Exception as e:
            self.logger.exception(self._file_error_message(data, e))

    def _fetch_and_save_pdf(self, data, re_data, tc, url, pdf_name):
        """Download the PDF at ``url`` and upload it; failures are logged."""
        if tc is None:
            # Checked before downloading so no request is wasted.
            self.logger.error(self._file_error_message(
                data, "no file type code for this ENTITY_CODE_/BANK_CODE_"))
            return
        try:
            response = req_for_something(url=url)
        except Exception as e:
            self.logger.exception(self._file_error_message(data, e))
            return
        if not response:
            # Not inside an exception handler, so error() rather than
            # exception(), which would append a meaningless empty traceback.
            self.logger.error(self._file_error_message(data, "PDF 请求失败"))
            return
        try:
            # todo: decide whether an upload failure should abort or be skipped
            p_response = req_for_file_save(
                id=re_data["ID_"],
                type_code=f"CHA_{tc}_PDF",
                file_name=pdf_name,
                postfix="pdf",
                file=response.content)
            try:
                body = p_response.content.decode("utf-8")
                if "error" in body:
                    self.logger.info(self._upload_error_message(data, body))
            finally:
                p_response.close()
        except Exception as e:
            self.logger.exception(self._file_error_message(data, e))
        finally:
            response.close()
Esempio n. 17
0
    def __shuffle(self, data):
        re_data = dict()
        re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
        re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        re_data["URL_"] = data["URL_"]

        if "中国理财网" in data["ENTITY_NAME_"]:
            serial_number = req_for_serial_number(code="JRCP_LCCP_INFO")
            re_data["ID_"] = serial_number
            re_data["PRO_NAME_"] = data["PRO_NAME_"]
            re_data["PRO_ORG_"] = data["PRO_ORG_"]
            re_data["REGIST_CODE_"] = data["REGIST_CODE_"]
            re_data["PRO_STATUS_"] = data["PRO_STATUS_"]
            re_data["OPT_MODE_"] = data["OPT_MODE_"]

            re_data["YIELD_TYPE_"] = data["YIELD_TYPE_"]
            # re_data["YIELD_TYPE_CODE_"] = data[""]
            re_data["CURRENCY_TYPE_"] = data["CURRENCY_TYPE_"]
            # re_data["CURRENCY_TYPE_CODE_"] = data[""]
            re_data["START_FUNDS_"] = data["START_FUNDS_"]
            try:
                if float(data["START_FUNDS_"]) <= 10000:
                    re_data["START_FUNDS_CODE_"] = "S0_1"
                elif 10000 < float(data["START_FUNDS_"]) <= 50000:
                    re_data["START_FUNDS_CODE_"] = "S1_5"
                elif 50000 < float(data["START_FUNDS_"]) < 100000:
                    re_data["START_FUNDS_CODE_"] = "S5_10"
                elif 100000 < float(data["START_FUNDS_"]):
                    re_data["START_FUNDS_CODE_"] = "S10_"
            except Exception:
                re_data["START_FUNDS_"] = 0

            org = {
                '01': '国有银行',
                '02': '股份制银行',
                '03': '城商行',
                '04': '外资银行',
                '05': '农村合作金融机构',
                '06': '其他',
                '07': '其他',
                '08': '其他',
                '09': '其他',
                '00': '其他',
                '10': '理财子公司'
            }

            re_data["SOURCE_RISK_LEVEL_"] = data["SOURCE_RISK_LEVEL_"]
            re_data['ORG_TYPE_'] = org.get(data.get('ORG_TYPE_'))
            re_data["RAISE_START_"] = data["RAISE_START_"]
            re_data["RAISE_END_"] = data["RAISE_END_"]
            re_data["PRO_START_"] = data["PRO_START_"]
            re_data["PRO_END_"] = data["PRO_END_"]
            re_data["YIELD_LOW_"] = data["YIELD_LOW_"]
            re_data["YIELD_HIGH_"] = data["YIELD_HIGH_"]
            re_data["REAL_DAYS_"] = data["REAL_DAYS_"]
            re_data["INVEST_TYPE_"] = data["INVEST_TYPE_"]
            re_data["DATE_TYPE_"] = data["DATE_TYPE_"]
            re_data["YIELD_"] = data["YIELD_"]
            re_data["RAISE_TYPE_"] = data["RAISE_TYPE_"]
            re_data["INVEST_PROPERTIES_"] = data["INVEST_PROPERTIES_"]
            re_data["BUS_START_"] = data["BUS_START_"]
            re_data["BUS_END_"] = data["BUS_END_"]
            re_data["START_VALUE_"] = data["START_VALUE_"]
            re_data["PRO_VALUE_"] = data["PRO_VALUE_"]
            re_data["TOTAL_VALUE_"] = data["TOTAL_VALUE_"]
            re_data["RECENT_YIELD_"] = data["RECENT_YIELD_"]

            re_data["PRO_TYPE_"] = data["PRO_TYPE_"]
            re_data["SALE_AREA_"] = data["SALE_AREA_"]
            if "PROVINCE_NAME_" in data:
                re_data["PROVINCE_NAME_"] = data["PROVINCE_NAME_"]
            if "PROVINCE_NAME_" in data:
                re_data["PROVINCE_CODE_"] = data["PROVINCE_CODE_"]
            if "CITY_NAME_" in data:
                re_data["CITY_NAME_"] = data["CITY_NAME_"]
            if "CITY_CODE_" in data:
                re_data["CITY_CODE_"] = data["CITY_CODE_"]

            # re_data["REDEEM_"] = data[""]
            # re_data["INCREASE_"] = data[""]
            # re_data["INVEST_RANGE_"] = data[""]
            bank_list = list()
            bank_code_list = list()
            for each in self.bank_list:
                if each["NAME_"] in data.get("ENTITY_NAME_", ""):
                    bank_list.append(each["NAME_"])
                    bank_code_list.append(each["CODE_"])
            if bank_list:
                re_data["BANK_NAME_"] = "|".join(bank_list)
            if bank_code_list:
                re_data["BANK_CODE_"] = "|".join(bank_code_list)

            # del re_data["CREATE_TIME_"]
            # del re_data["SPIDER_TIME_"]
            # del re_data["M_STATUS_"]
            # del re_data["DELETE_STATUS_"]
            # del re_data["DATA_STATUS_"]
            # del re_data["PUBLISH_STATUS_"]

            re_data = super(BranchFinProduct,
                            self).generic_shuffle(data=data,
                                                  re_data=re_data,
                                                  field=None)

            if not data["YIELD_LOW_"]:
                re_data['YIELD_LOW_'] = '--'

            if not data["YIELD_HIGH_"]:
                re_data['YIELD_HIGH_'] = '--'

            if not data["START_FUNDS_"]:
                re_data['START_FUNDS_'] = '--'
            return {"TABLE_NAME_": TABLE_NAME("CRMLCCP"), "DATA_": re_data}
        else:
            source = re.findall(r"(https?://.*?)/", data["URL_"])
            re_data["SOURCE_"] = source[0]
            re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"]
            serial_number = req_for_serial_number(code="JRCP_LCCP")
            re_data["ID_"] = serial_number
            re_data["SOURCE_TYPE_"] = ""
            # if "PRO_NAME_" not in data:
            #     return
            re_data["PRO_NAME_"] = data["PRO_NAME_"]
            f_index = data["ENTITY_NAME_"].find("-")
            re_data["PRO_ORG_"] = data["ENTITY_NAME_"][:f_index]
            if "PRO_CODE_" in data:
                re_data["PRO_CODE_"] = data["PRO_CODE_"]
            # 登记编码
            if "REGIST_CODE_" in data:
                re_data["REGIST_CODE_"] = data["REGIST_CODE_"]
            else:
                if "PDF_" in data:
                    try:
                        text = parse(data["PDF_"])
                        registration_code = re.findall(r"C\d{13}", text)
                        if registration_code:
                            re_data["REGIST_CODE_"] = registration_code[0]
                    except Exception as e:
                        self.logger.exception(
                            f"2.1--err: PDF."
                            f" 原始数据 collection = {self.m_client.mongo_collection};"
                            f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                            f" 原始数据 _id = {data['_id']};"
                            f" error: {e}.")
            # 预售(PRE)、在售(ON)、停售(STOP)
            # 全部为 在售
            re_data["PRO_STATUS_"] = "ON"
            if "OPT_MODE_" in data:
                re_data["OPT_MODE_"] = data["OPT_MODE_"]

            if "YIELD_TYPE_" in data:
                re_data["YIELD_TYPE_"] = data["YIELD_TYPE_"]
                # re_data["YIELD_TYPE_CODE_"] = data[""]
            if "CURRENCY_TYPE_" in data:
                re_data["CURRENCY_TYPE_"] = data["CURRENCY_TYPE_"]
            # re_data["CURRENCY_TYPE_CODE_"] = data[""]
            # 起购金额
            if "START_FUNDS_" in data:
                start_funds = data["START_FUNDS_"].replace(" ", "")
                start_funds = start_funds.replace("亿", "00000000")
                start_funds = start_funds.replace("千万", "0000000")
                start_funds = start_funds.replace("百万", "000000")
                start_funds = start_funds.replace("十万", "00000")
                start_funds = start_funds.replace("万", "0000")
                start_funds = start_funds.replace("千", "000")
                start_funds = start_funds.replace("百", "00")
                start_funds = start_funds.replace("元", "")

                re_data["START_FUNDS_"] = start_funds

                try:
                    if float(re_data["START_FUNDS_"]) <= 10000:
                        re_data["START_FUNDS_CODE_"] = "S0_1"
                    elif 10000 < float(re_data["START_FUNDS_"]) <= 50000:
                        re_data["START_FUNDS_CODE_"] = "S1_5"
                    elif 50000 < float(re_data["START_FUNDS_"]) <= 100000:
                        re_data["START_FUNDS_CODE_"] = "S5_10"
                    elif 100000 < float(re_data["START_FUNDS_"]):
                        re_data["START_FUNDS_CODE_"] = "S10_"
                except Exception as e:
                    re_data["START_FUNDS_"] = 0

            if "RISK_LEVEL_CODE_" in data:
                re_data["RISK_LEVEL_"] = self.risk_dict[
                    data["RISK_LEVEL_CODE_"]]
                re_data["RISK_LEVEL_CODE_"] = data["RISK_LEVEL_CODE_"]

            if "RISK_LEVEL_" in data:
                re_data["SOURCE_RISK_LEVEL_"] = data["RISK_LEVEL_"]
            elif "SOURCE_RISK_LEVEL_" in data:
                re_data["SOURCE_RISK_LEVEL_"] = data["SOURCE_RISK_LEVEL_"]
            # # 募集起始日期
            if "RAISE_START_" in data:
                re_data["RAISE_START_"] = data["RAISE_START_"]
            # # 募集结束日期
            if "RAISE_END_" in data:
                re_data["RAISE_END_"] = data["RAISE_END_"]
            # # 产品起始日期
            if "PRO_START_" in data:
                re_data["PRO_START_"] = data["PRO_START_"]
            # # 产品结束日期
            if "PRO_END_" in data:
                re_data["PRO_END_"] = data["PRO_END_"]
            # 预期最低收益率
            if "YIELD_LOW_" in data:
                re_data["YIELD_LOW_"] = data["YIELD_LOW_"].replace("%", "")
            # 预期最高收益率
            if "YIELD_HIGH_" in data:
                re_data["YIELD_HIGH_"] = data["YIELD_HIGH_"].replace("%", "")
            # 实际天数
            if "REAL_DAYS_" in data:
                data["REAL_DAYS_"] = data["REAL_DAYS_"].replace(" ", "")
                if "年" in data["REAL_DAYS_"]:
                    re_data["REAL_DAYS_"] = data["REAL_DAYS_"].replace("年", "")
                    try:
                        re_data["REAL_DAYS_"] = int(
                            re_data["REAL_DAYS_"]) * 365
                    except Exception:
                        re_data["REAL_DAYS_"] = 0
                elif "月" in data:
                    re_data["REAL_DAYS_"] = data["REAL_DAYS_"].replace("月", "")
                    try:
                        re_data["REAL_DAYS_"] = int(re_data["REAL_DAYS_"]) * 30
                    except Exception:
                        re_data["REAL_DAYS_"] = 0
                else:
                    re_data["REAL_DAYS_"] = data["REAL_DAYS_"].replace("天", "")
            else:
                if "PRO_START_" in data and "PRO_END_" in data:
                    t_start = arrow.get(data["PRO_START_"], "YYY-MM-DD")
                    t_end = arrow.get(data["PRO_END_"], "YYYY-MM-DD")
                    real_days = t_end - t_start
                    data["REAL_DAYS_"] = real_days.days

            if "INVEST_TYPE_" in data:
                re_data["INVEST_TYPE_"] = data["INVEST_TYPE_"]

            # # 投资者类型
            if "PRO_TYPE_" in data:
                re_data["PRO_TYPE_"] = data["PRO_TYPE_"]
            if "SALE_AREA_" in data:
                re_data["SALE_AREA_"] = data["SALE_AREA_"]
            # # 可否赎回
            if "REDEEM_" in data:
                if "不" in data["REDEEM_"]:
                    re_data["REDEEM_"] = "N"
                else:
                    re_data['REDEEM_'] = "Y"
            if "INCREASE_" in data:
                increase = data["INCREASE_"].replace(" ", "")
                increase = increase.replace("亿", "00000000")
                increase = increase.replace("千万", "0000000")
                increase = increase.replace("百万", "000000")
                increase = increase.replace("十万", "00000")
                increase = increase.replace("万", "0000")
                increase = increase.replace("千", "000")
                increase = increase.replace("百", "00")
                increase = increase.replace("元", "")
                re_data["INCREASE_"] = increase
                # re_data["INVEST_RANGE_"] = data["INVEST_RANGE_"]
            re_data["RECOMMEND_"] = "N"
            re_data["GOOD_SALE_"] = "N"
            re_data["NEW_SALE_"] = "N"
            re_data["SALE_SOURCE_"] = "NET"

            bank_list = list()
            bank_code_list = list()
            for each in self.bank_list:
                if each["NAME_"] in data.get("ENTITY_NAME_", ""):
                    bank_list.append(each["NAME_"])
                    bank_code_list.append(each["CODE_"])
            if bank_list:
                re_data["BANK_NAME_"] = "|".join(bank_list)
            if bank_code_list:
                re_data["BANK_CODE_"] = "|".join(bank_code_list)
            if not data["YIELD_LOW_"]:
                re_data['YIELD_LOW_'] = '--'

            if not data["YIELD_HIGH_"]:
                re_data['YIELD_HIGH_'] = '--'

            if not data["START_FUNDS_"]:
                re_data['START_FUNDS_'] = '--'

            re_data = super(BranchFinProduct,
                            self).generic_shuffle(data=data,
                                                  re_data=re_data,
                                                  field=None)
            re_data["PUBLISH_TIME_"] = re_data["SPIDER_TIME_"]
            return {
                "TABLE_NAME_": TABLE_NAME("CHA_BRANCH_FINANCIAL_PRODUCT"),
                "DATA_": re_data
            }
Esempio n. 18
0
    def generic_shuffle(self, data):
        """
        Clean one raw hospital record into a row for the target table.

        Fills the id / period / source columns, geocodes the address with
        ``get_lat_lng`` and ``get_area``, copies the optional descriptive
        columns that are present in ``data`` and then hands the row to the
        parent class' ``generic_shuffle``.

        :param data: raw record dict. ``DATETIME_``, ``URL_``, ``ENTITY_NAME_``
            and ``ENTITY_CODE_`` are read unconditionally (``KeyError`` when
            missing); every other key is optional.
        :return: ``[{"TABLE_NAME_": self.p_client.table_name, "DATA_": re_data}]``
        """
        re_data = dict()
        re_data["ID_"] = req_for_serial_number(code="WD_SS_YY")
        # Period dimension: first 10 characters of DATETIME_ without dashes.
        re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace("-", "")
        # Tags are emitted empty whenever the raw record carries the key.
        if "TAGS_" in data:
            re_data["TAGS_"] = ""
        # Source site: scheme + host of the record URL.  The pattern needs a
        # "/" after the host, so guard against an empty match instead of
        # raising IndexError on source[0].
        source = re.findall(r"(https?://.*?)/", data["URL_"])
        if source:
            re_data["SOURCE_"] = source[0]
        # Source name: the part of the entity name before the first "-".
        re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]
        # Source category: characters 3..7 of the entity code.
        re_data["SOURCE_TYPE_"] = data["ENTITY_CODE_"][3:8]

        # Geocode.  The raw address is tried first, then city name + hospital
        # name.  Candidates are built lazily so a key missing from ``data`` is
        # treated exactly like a key missing from the geocoder response: both
        # raise KeyError and move on to the next candidate.  Any other error
        # is logged and ends the lookup without trying the fallback.
        re_data["LAT_"] = None
        re_data["LNG_"] = None
        address_builders = (
            lambda: data["ADDR_"],
            lambda: data["CITY_NAME_"] + data["NAME_"],
        )
        for build_address in address_builders:
            try:
                lat_result = get_lat_lng(address=build_address())
                location = lat_result["result"]["location"]
                lat, lng = location["lat"], location["lng"]
            except KeyError:
                continue
            except Exception as e:
                self.logger.info("获取经纬度失败错误为{}".format(e))
                break
            else:
                re_data["LAT_"] = lat
                re_data["LNG_"] = lng
                break

        # Reverse-geocode to fill province / city / district columns.
        if re_data["LNG_"]:
            try:
                area_result = get_area(",".join([str(re_data["LAT_"]), str(re_data["LNG_"])]))
            except Exception as e:
                self.logger.info("获取地址信息失败错误为{}".format(e))
            else:
                try:
                    component = area_result["result"]["addressComponent"]
                    re_data["PROVINCE_NAME_"] = component["province"]
                    re_data["CITY_NAME_"] = component["city"]
                    re_data["AREA_NAME_"] = component["district"]
                    re_data["AREA_CODE_"] = component["adcode"]
                    # City / province codes are prefixes of the district code
                    # padded with "00".
                    re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
                    re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
                except KeyError:
                    # Best effort: keep whatever columns were filled before
                    # the missing key.
                    pass

        # Optional descriptive columns, copied verbatim when present:
        # device, grade, speciality, phone, hospital id, name, address,
        # beds, ownership type, website, outpatient volume.
        optional_fields = (
            "DEVICE_", "GRADE_", "SPECIAL_", "TEL_", "HOSPITAL_ID_", "NAME_",
            "ADDR_", "BEDS_", "TYPE_", "WEBSITE_", "VOLNUM_",
        )
        for optional_field in optional_fields:
            if optional_field in data:
                re_data[optional_field] = data[optional_field]

        re_data = super(Branchssyy, self).generic_shuffle(data=data, re_data=re_data, field=None)
        return [{"TABLE_NAME_": self.p_client.table_name, "DATA_": re_data}]
Esempio n. 19
0
    def generic_shuffle(self, data):
        """
        Clean one raw school record into a row for the target table.

        Fills the id / period / source columns, geocodes ``ADDR_`` with
        ``get_lat_lng`` and ``get_area``, downloads the image (stored as a
        base64 string), splits run-together phone numbers and then hands the
        row to the parent class' ``generic_shuffle``.

        :param data: raw record dict. ``DATETIME_``, ``URL_``, ``ENTITY_NAME_``
            and ``ENTITY_CODE_`` are read unconditionally (``KeyError`` when
            missing); every other key is optional.
        :return: ``[{"TABLE_NAME_": self.p_client.table_name, "DATA_": re_data}]``
        """
        re_data = dict()
        re_data["ID_"] = req_for_serial_number(code="WD_SS_XX")
        # Period dimension: first 10 characters of DATETIME_ without dashes.
        re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace("-", "")
        # Tags are emitted empty whenever the raw record carries the key.
        if "TAGS_" in data:
            re_data["TAGS_"] = ""
        # Source site: scheme + host of the record URL.  The pattern needs a
        # "/" after the host, so guard against an empty match instead of
        # raising IndexError on source[0].
        source = re.findall(r"(https?://.*?)/", data["URL_"])
        if source:
            re_data["SOURCE_"] = source[0]
        # Source name: the part of the entity name before the first "-".
        re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]
        # Source category: characters 3..7 of the entity code.
        re_data["SOURCE_TYPE_"] = data["ENTITY_CODE_"][3:8]

        # Geocode the address.  Both failure paths leave LAT_/LNG_ as None so
        # the check below never hits a missing key (the generic handler used
        # to log only, which made the following re_data["LAT_"] raise KeyError).
        try:
            lat_result = get_lat_lng(address=data["ADDR_"])
            location = lat_result["result"]["location"]
            lat, lng = location["lat"], location["lng"]
        except KeyError:
            lat = lng = None
        except Exception as e:
            self.logger.info("获取经纬度失败信息为{}".format(e))
            lat = lng = None
        re_data["LAT_"] = lat
        re_data["LNG_"] = lng

        # Reverse-geocode to fill province / city / district columns.
        if re_data["LAT_"]:
            try:
                area_result = get_area(",".join(
                    [str(re_data["LAT_"]),
                     str(re_data["LNG_"])]))
            except Exception as e:
                self.logger.info(f"获取地址失败, ERROR: {e}")
            else:
                try:
                    component = area_result["result"]["addressComponent"]
                    re_data["PROVINCE_NAME_"] = component["province"]
                    re_data["CITY_NAME_"] = component["city"]
                    re_data["AREA_NAME_"] = component["district"]
                    re_data["AREA_CODE_"] = component["adcode"]
                    # City / province codes are prefixes of the district code
                    # padded with "00".
                    re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
                    re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
                except KeyError:
                    # Best effort: keep whatever columns were filled before
                    # the missing key.
                    pass

        # School name
        if "NAME_" in data:
            re_data["NAME_"] = data["NAME_"]
        # Attribute (city key school, district key school, national key school)
        if "LEVEL_" in data:
            re_data["LEVEL_"] = data["LEVEL_"]
        # Image: fetched from its URL and stored base64-encoded.
        if data.get("IMAGES_"):
            response = req_for_something(url=data["IMAGES_"])
            if response:
                re_data["IMAGES_"] = base64.b64encode(response.content).decode("utf-8")
        # School type
        if "SCHOOL_TYPE_" in data:
            re_data["SCHOOL_TYPE_"] = data["SCHOOL_TYPE_"]
        # School nature
        if "SCHOOL_NATURE_" in data:
            re_data["SCHOOL_NATURE_"] = data["SCHOOL_NATURE_"]
        # Phone: several numbers may be glued together without a separator;
        # insert two spaces between them.
        if "TEL_" in data:
            # re.match only anchors at the start, so a shorter pattern also
            # matches every string a longer pattern with the same prefix
            # would match.  Rules are therefore ordered most-specific first;
            # with the shorter "area-8 + 8" rule ahead of them, the
            # "area-8 + 11", "area-8 + 8 + 8" and "area-8 + 11 + 11" rules
            # could never fire.
            tel_rules = (
                (r"(\d{3,4}-\d{8})(\d{3,4}-\d{8})", r"\1  \2"),
                (r"(\d{3,4}-\d{8})(\d{11})(\d{11})", r"\1  \2  \3"),
                (r"(\d{3,4}-\d{8})(\d{8})(\d{8})", r"\1  \2  \3"),
                (r"(\d{3,4}-\d{8})(\d{11})", r"\1  \2"),
                (r"(\d{3,4}-\d{8})(\d{8})", r"\1  \2"),
                (r"(\d{8})(\d{11})", r"\1  \2"),
                (r"(\d{8})(\d{8})", r"\1  \2"),
                (r"(\d{3,4}-\d{7})(\d{3,4}-\d{7})", r"\1  \2"),
                (r"(\d{3,4}-\d{7})(\d{7})", r"\1  \2"),
            )
            tel = data["TEL_"]
            phone_number = tel
            for tel_pattern, tel_replacement in tel_rules:
                if re.match(tel_pattern, tel):
                    phone_number = re.sub(tel_pattern, tel_replacement, tel)
                    break
            re_data["TEL_"] = phone_number
        # Address
        if "ADDR_" in data:
            re_data["ADDR_"] = data["ADDR_"]
        re_data = super(Branchssxx, self).generic_shuffle(data=data,
                                                          re_data=re_data,
                                                          field=None)
        return [{"TABLE_NAME_": self.p_client.table_name, "DATA_": re_data}]
Esempio n. 20
0
    def generic_shuffle(self, data, field="CONTENT_"):
        """
        Clean one raw fund record and route it to its target table.

        The branch is chosen from ``data["ENTITY_CODE_"]``:

        * ``JRCP_JJ_TTJJ_FJZ_ALL`` / ``JRCP_JJ_TTJJ_JZ_ALL`` -> basic-info row
          (``CRMFUND_BASIC``)
        * any code containing ``GW_ALL`` -> agency-sale row (``CRMFUND_AGENCY``)
        * ``JRCP_JJ_TTJJ_FJZ`` / ``JRCP_JJ_TTJJ_JZ`` -> net-value history row
          (``CRMFUND_DATA``); this branch also pushes the latest values into
          the matching agency rows.

        :param data: raw record dict; ``PRO_NAME_``, ``PRO_CODE_`` and
            ``ENTITY_CODE_`` are read unconditionally.
        :param field: not used by this override; kept for signature
            compatibility.
        :return: one-element list ``[{"TABLE_NAME_": ..., "DATA_": ...}]``, or
            ``None`` when ``ENTITY_CODE_`` matches none of the branches.
        """
        re_data = dict()

        # Tags are emitted empty whenever the raw record carries the key.
        if "TAGS_" in data:
            re_data["TAGS_"] = ""

        re_data["PRO_NAME_"] = data["PRO_NAME_"]
        re_data["PRO_CODE_"] = data["PRO_CODE_"]

        entity_code = data["ENTITY_CODE_"]
        if entity_code in ["JRCP_JJ_TTJJ_FJZ_ALL", "JRCP_JJ_TTJJ_JZ_ALL"]:
            return self._shuffle_fund_basic(data, re_data)
        if "GW_ALL" in entity_code:
            return self._shuffle_fund_agency(data, re_data)
        if entity_code in ["JRCP_JJ_TTJJ_FJZ", "JRCP_JJ_TTJJ_JZ"]:
            return self._shuffle_fund_history(data, re_data)
        return None

    def _shuffle_fund_basic(self, data, re_data):
        """Build the CRMFUND_BASIC row from a fund basic-info record."""
        data_dict = dict(TABLE_NAME_=TABLE_NAME("CRMFUND_BASIC"))
        # Source site: scheme + host of the record URL; guarded so a URL
        # without "/" or "?" after the host does not raise IndexError.
        source = re.findall(r"(https?://.*?)[/?]", data["URL_"])
        if source:
            re_data["SOURCE_"] = source[0]
        re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"]
        re_data["SOURCE_TYPE_"] = data["ENTITY_CODE_"][8:12]

        basic_field_list = ["COM_NAME_", "FUND_TYPE_", "RISK_LEVEL_", "RELEASE_DATE_", "BUILD_DATE_",
                            "BUILD_SCAL_", "ASSET_SCAL_", "SHARE_SCAL_", "MANAGER_", "TRUSTEE_", "HANDLER_",
                            "DIVIDEND_", "MANAGE_FEE_RATE_", "HOST_FEE_RATE_", "SALE_FEE_RATE_", "MAX_SUB_RATE_",
                            "MAX_APPLY_RATE_", "MAX_REDEEM_RATE_", "BENCHMARK_", "BID_", "CLOSE_", "DIM_"]
        for basic_field in basic_field_list:
            if basic_field == "FUND_TYPE_":
                fund_type = data.get("FUND_TYPE_", "其他")
                re_data["FUND_TYPE_"] = fund_type
                try:
                    re_data["FUND_TYPE_CODE_"] = self.ft_dict[data["FUND_TYPE_"]]
                except KeyError:
                    # No exact match: fall back to any known type whose first
                    # two characters occur in the raw value (last hit wins),
                    # and to "QT" when nothing matches.
                    for ft in self.ft_dict.keys():
                        if ft[:2] in fund_type:
                            re_data["FUND_TYPE_CODE_"] = self.ft_dict[ft]
                    if "FUND_TYPE_CODE_" not in re_data:
                        re_data["FUND_TYPE_CODE_"] = "QT"
            elif basic_field == "RISK_LEVEL_":
                risk_level_ = data.get("RISK_LEVEL_", "未知")
                # Keep only the segment after the last "|".
                risk_level_ = re.split(r'[|]', risk_level_.strip())[-1] if risk_level_ else "未知"
                re_data["RISK_LEVEL_"] = self.rl_name_dict[risk_level_]
                re_data["RISK_LEVEL_CODE_"] = self.rl_dict.get(re_data["RISK_LEVEL_"], "")
            elif basic_field == "MAX_REDEEM_RATE_":
                raw_rate = data.get("MAX_REDEEM_RATE_", "")
                re_data["MAX_REDEEM_RATE_"] = (
                    re.split(r'[|]', raw_rate)[-1].replace("%", "") if raw_rate else "")
            elif basic_field == "BENCHMARK_":
                re_data[basic_field] = data.get(basic_field, "")
            elif basic_field == "BUILD_DATE_" or basic_field == "RELEASE_DATE_":
                # Dates arrive as "YYYY年MM月DD日"; turn them into "YYYY-MM-DD".
                # .get keeps a record without the date from raising KeyError,
                # matching how every other field in this loop is read.
                basic_date = re.findall(r"(\d{4}年\d{2}月\d{1,2})日", data.get(basic_field, ""))
                if basic_date:
                    re_data[basic_field] = re.sub(r"[\u4e00-\u9fa5]", "-", basic_date[0])
            elif basic_field == "HANDLER_":
                re_data[basic_field] = data.get(basic_field, "").replace('|', '')
            else:
                re_data[basic_field] = data.get(basic_field, "").replace("%", "")

        # Extra total-asset column to make aggregation easier.
        if re_data["ASSET_SCAL_"]:
            # NOTE(review): the pattern has two capture groups, so findall
            # yields tuples and ASSET_TOTAL_ becomes a 2-tuple on a match.
            # The parentheses were possibly meant as literals - confirm the
            # raw format before changing this.
            asset_total = re.findall(r"(.*?亿元)(截止至:\d+年\d+月\d+日)", re_data["ASSET_SCAL_"])
            if len(asset_total) > 0:
                re_data["ASSET_TOTAL_"] = asset_total[0]
            else:
                re_data["ASSET_TOTAL_"] = '0'
        # Basic fund info is always stored as CHECK.
        re_data["DATA_STATUS_"] = "CHECK"
        re_data["DATA_VERSION_"] = "0"

        re_data = super(BranchFund, self).generic_shuffle(data=data, re_data=re_data, field="TRUSTEE_")
        data_dict["DATA_"] = re_data
        return [data_dict]

    def _shuffle_fund_agency(self, data, re_data):
        """Build the CRMFUND_AGENCY row, enriched from CRMFUND_BASIC / CRMFUND_DATA."""
        agency_dict = dict(TABLE_NAME_=TABLE_NAME("CRMFUND_AGENCY"))

        re_data["ID_"] = req_for_serial_number(code="JRCP_JJ_AGENT")
        # Source site: scheme + host of the record URL; guarded so a URL
        # without "/" or "?" after the host does not raise IndexError.
        source = re.findall(r"(https?://.*?)[/?]", data["URL_"])
        if source:
            re_data["SOURCE_"] = source[0]
        re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"]
        re_data["PUBLISH_TIME_"] = data["DATETIME_"]
        re_data["SOURCE_TYPE_"] = ""

        re_data["RECOMMEND_"] = "N"
        re_data["GOOD_SALE_"] = "N"
        re_data["NEW_SALE_"] = "N"
        re_data["PUBLISH_STATUS_"] = "Y"
        re_data["DATA_STATUS_"] = "CHECK"
        re_data["VERSION_"] = "0"
        re_data["DATA_VERSION_"] = "0"

        def mark_unchecked():
            # A failed or empty lookup keeps the row out of publication.
            re_data["PUBLISH_STATUS_"] = "N"
            re_data["DATA_STATUS_"] = "UNCHECK"

        pro_code_ = data.get("PRO_CODE_")
        pro_name = data.get("PRO_NAME_")

        # Pick the lookup key: product code when available, otherwise a
        # prefix match on the product name.
        detail_list = None
        where_clause = None
        if pro_code_:
            re_data["PRO_CODE_"] = pro_code_
            # The comma after "BUILD_DATE_" used to be missing, which fused
            # two names into the non-existent column "BUILD_DATE_COM_NAME_"
            # and made this lookup fail every time.
            detail_list = ["RISK_LEVEL_", "RISK_LEVEL_CODE_", "FUND_TYPE_", "FUND_TYPE_CODE_", "BUILD_DATE_",
                           "COM_NAME_", "RELEASE_DATE_", "CLOSE_"]
            where_clause = f"PRO_CODE_='{str(data['PRO_CODE_'])}'"
        elif pro_name:
            pro_name = pro_name if not data.get("PRO_LIKE_NAME_") else data.get("PRO_LIKE_NAME_")
            detail_list = ["PRO_CODE_", "RISK_LEVEL_", "RISK_LEVEL_CODE_", "FUND_TYPE_", "FUND_TYPE_CODE_",
                           "COM_NAME_", "RELEASE_DATE_", "CLOSE_"]
            where_clause = f"PRO_NAME_ LIKE '{pro_name}%'"

        # NOTE(review): the statements below interpolate crawled values into
        # SQL text.  Switching to bound parameters would be safer, but needs
        # to be validated against the driver behind self.connection.
        cur = self.connection.cursor()
        try:
            # Latest basic-info row for this fund.
            if detail_list:
                try:
                    cur.execute(f"SELECT {','.join(detail_list)} "
                                f"FROM CRMFUND_BASIC WHERE {where_clause} "
                                f"ORDER BY CREATE_TIME_ DESC LIMIT 1")
                    basic_row = cur.fetchone()
                    if basic_row is None:
                        mark_unchecked()
                    else:
                        re_data.update(zip(detail_list, basic_row))
                except Exception as e:
                    self.logger.info(f"CRMFUND_BASIC lookup failed, ERROR: {e}")
                    mark_unchecked()
            # Latest net-value row for this fund.
            if re_data.get("PRO_CODE_"):
                try:
                    cur.execute(f"SELECT BUY_STATUS_, NEW_NAV_, NEW_SYR_ "
                                f"FROM CRMFUND_DATA "
                                f"WHERE PRO_CODE_= '{str(re_data['PRO_CODE_'])}' "
                                f"ORDER BY TIME_ DESC LIMIT 1")
                    nav_row = cur.fetchone()
                    if nav_row is None:
                        mark_unchecked()
                    else:
                        re_data["BUY_STATUS_"], re_data["NEW_NAV_"], re_data["NEW_SYR_"] = nav_row
                        buy_status = re_data["BUY_STATUS_"]
                        if buy_status and buy_status in self.new_bs_dict:
                            re_data["BUY_STATUS_CODE_"] = self.new_bs_dict[buy_status]
                except Exception as e:
                    self.logger.info(f"CRMFUND_DATA lookup failed, ERROR: {e}")
                    mark_unchecked()
        finally:
            # Close on every path; previously the cursor leaked whenever no
            # product code was available.
            cur.close()

        # Fall back to the risk level carried by the raw record.
        if not re_data.get("RISK_LEVEL_"):
            if "RISK_LEVEL_" not in data:
                risk_level_ = "未知"
            else:
                risk_level_ = data["RISK_LEVEL_"]
            risk_level_ = re.split(r'[|]', risk_level_.strip())[-1] if risk_level_ else "未知"
            re_data["RISK_LEVEL_"] = self.rl_name_dict[risk_level_]
            re_data["RISK_LEVEL_CODE_"] = self.rl_dict.get(re_data["RISK_LEVEL_"], "")
        if not (re_data.get("FUND_TYPE_") or re_data.get("RELEASE_DATE_")):
            mark_unchecked()
        re_data = super(BranchFund, self).generic_shuffle(data=data, re_data=re_data, field="ENTITY_NAME_")
        agency_dict["DATA_"] = re_data
        return [agency_dict]

    def _shuffle_fund_history(self, data, re_data):
        """Build the CRMFUND_DATA row and refresh the matching CRMFUND_AGENCY rows."""
        re_data["ID_"] = req_for_serial_number(code=data["ENTITY_CODE_"][:7])
        re_data["SERVICE_CHARGE_"] = data["SERVICE_CHARGE_"]
        re_data["RATING_AGENCIES_"] = data["RATING_AGENCIES_"].replace('jjpj', '')
        nom_field_list = ["TIME_", "NEW_NAV_", "NEW_ANV_", "OLD_TIME_", "OLD_NAV_", "OLD_ANV_", "DAY_GROWTH_",
                          "DAY_GROWTH_RATE_", "ONE_MONTH_RATE_", "THREE_MONTH_RATE_", "SIX_MONTH_RATE_",
                          "ONE_YEAR_RATE_", "THREE_YEAR_RATE_", "BUILD_RATE_", "NEW_TOI_", "NEW_SYR_", "OLD_TOI_",
                          "OLD_SYR_", "FYR_", "TYR_", "MARKET_PRICE_", "DISCOUNT_RATE_", "VERSION_",
                          "BUY_STATUS_", "REDEEM_STATUS_"]
        for nom_field in nom_field_list:
            if nom_field == "VERSION_":
                re_data[nom_field] = "0"
            elif nom_field == "BUY_STATUS_":
                re_data["BUY_STATUS_"] = data.get("BUY_STATUS_", "")
                re_data["BUY_STATUS_CODE_"] = self.new_bs_dict.get(re_data["BUY_STATUS_"], "")
            elif nom_field == "REDEEM_STATUS_":
                re_data["REDEEM_STATUS_"] = data.get("REDEEM_STATUS_")
                re_data["REDEEM_STATUS_CODE_"] = self.rs_dict.get(re_data["REDEEM_STATUS_"], "")
            else:
                # Strip percent signs and "--" placeholders.
                re_data[nom_field] = data.get(nom_field, "").replace("%", "").replace("--", "")
        re_data["CREATE_BY_ID_"] = CREATE_ID
        re_data["CREATE_BY_NAME_"] = CREATE_NAME
        if data["ENTITY_CODE_"] in ["JRCP_JJ_TTJJ_FJZ", ]:
            re_data["APY_FOURTEEN_"] = data.get("APY_FOURTEEN_")
            re_data["APY_TWENTY_EIGHT_"] = data.get("APY_TWENTY_EIGHT_")
            # These two overwrite the cleaned values set by the loop above
            # with the raw ones.
            re_data["NEW_TOI_"] = data.get("NEW_TOI_")
            re_data["NEW_SYR_"] = data.get("NEW_SYR_")
            try:
                apy_doc = self.db_spider_data.JRCP_JJ.find_one({
                    'PRO_CODE_': data['PRO_CODE_'],
                    'TIME_': data['TIME_'],
                    'ENTITY_CODE_': 'JRCP_JJ_TTJJ_35NH',
                })
                re_data["APY_THIRTY_FIVE_"] = round(
                    float(dict(apy_doc).get('APY_THIRTY_FIVE_')) * 100) / 100.0
            except Exception:
                # No matching document or a non-numeric value.
                re_data["APY_THIRTY_FIVE_"] = ''
        elif data["ENTITY_CODE_"] in ["JRCP_JJ_TTJJ_JZ", ]:
            re_data["NEW_WORTH_"] = data.get("NEW_WORTH_")

        # Previous (T-1) values: the latest stored row strictly before TIME_.
        # NOTE(review): crawled values are interpolated into the SQL text;
        # see the note in _shuffle_fund_agency.
        cur = self.connection.cursor()
        try:
            cur.execute(f"SELECT NEW_NAV_,NEW_ANV_,NEW_TOI_,NEW_SYR_ FROM  CRMFUND_DATA "
                        f"where PRO_CODE_='{re_data['PRO_CODE_']}' and TIME_<'{re_data['TIME_']}' "
                        f"order by TIME_ desc limit 1")
            t_1data = cur.fetchone()
        finally:
            # This cursor was never closed before.
            cur.close()
        if t_1data:
            self.logger.info(f"====T-1日数据===={t_1data}")
            re_data['OLD_NAV_'] = t_1data[0]
            re_data['OLD_ANV_'] = t_1data[1]
            re_data['OLD_TOI_'] = t_1data[2]
            re_data['OLD_SYR_'] = t_1data[3]

        # Push the fresh values into every agency row of the same fund.  The
        # client is pointed at the agency table for the duration and always
        # pointed back at the data table, even if the update raises.
        self.p_client.table_name = TABLE_NAME('CRMFUND_AGENCY')
        try:
            agences = self.p_client.search_all_from_phoenix(connection=self.connection, dict_status=True,
                                                            where_condition=f"PRO_CODE_='{re_data['PRO_CODE_']}'")
            if agences:
                while True:
                    try:
                        agence_data = next(agences)
                        self.logger.info(f"====更新代销基金数据===={agence_data}")
                        agence_data['NEW_NAV_'] = re_data['NEW_NAV_']
                        agence_data['NEW_SYR_'] = re_data['NEW_SYR_']
                        agence_data['BUY_STATUS_'] = re_data['BUY_STATUS_']
                        agence_data['BUY_STATUS_CODE_'] = re_data['BUY_STATUS_CODE_']
                    except StopIteration:
                        break
                    except Exception as e:
                        # Best effort: stop updating agency rows, keep the
                        # history row.
                        self.logger.info(f"CRMFUND_AGENCY refresh stopped, ERROR: {e}")
                        break
                    try:
                        self.p_client.upsert_to_phoenix_by_one(connection=self.connection, data=agence_data)
                    except jaydebeapi.DatabaseError:
                        continue
        finally:
            self.p_client.table_name = TABLE_NAME('CRMFUND_DATA')
        return [{"TABLE_NAME_": TABLE_NAME("CRMFUND_DATA"), "DATA_": re_data}]
Esempio n. 21
0
    def generic_shuffle(self, data):
        """
        Clean one raw house-listing record.

        Always produces a row for ``self.data_table_name``.  When
        ``self.if_exists`` does not find the house name, a second row for
        ``self.base_table_name`` is produced as well, with province / city
        hard-coded to Fujian / Xiamen and the address enriched through
        ``req_for_textLoc``, ``get_lat_lng`` and ``get_area``.

        :param data: raw record dict; ``URL_``, ``DATETIME_``, ``ENTITY_CODE_``,
            ``ENTITY_NAME_``, ``NAME_`` and ``PRICE_`` are read unconditionally,
            ``ADDR_`` as well when a base row is built.
        :return: list with the data-table row, followed by the base-table row
            when the house is new.
        """
        re_data = dict()
        # Common columns.  ID_ here is the id of the history (data) row.
        re_data["ID_"] = req_for_serial_number(code="WD_JZ_FJ_DATA")
        re_data["URL_"] = data["URL_"]
        # Period dimension: first 10 characters of DATETIME_ without dashes.
        re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace("-", "")
        # Entity code and name
        re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
        re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        # Creation time and operator
        re_data["CREATE_TIME_"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        re_data["CREATE_BY_ID_"] = CREATE_ID
        re_data["CREATE_BY_NAME_"] = CREATE_NAME
        # Crawl time.  DATETIME_ is known to be present because PERIOD_CODE_
        # above already indexed it, so the former DEALTIME_ fallback could
        # never run and has been dropped.
        re_data["SPIDER_TIME_"] = data["DATETIME_"]
        # Status
        re_data["DATA_STATUS_"] = "UNCHECK"
        re_data["PUBLISH_STATUS_"] = "N"
        # Name
        re_data["NAME_"] = data["NAME_"].replace("|", "")
        # Type: residential (ZZ), office building (XZL), shop (SP)
        if "LISP" in data["ENTITY_CODE_"]:
            re_data["TYPE_"] = "SP"
        elif "LIXQ" in data["ENTITY_CODE_"] or "LJXQ" in data["ENTITY_CODE_"]:
            re_data["TYPE_"] = "ZZ"
        elif "LIXZL" in data["ENTITY_CODE_"]:
            re_data["TYPE_"] = "XZL"

        # Check whether the name already exists in the base table.
        verify_name = value_replace(re_data["NAME_"])
        house_id = self.if_exists(name=verify_name, city_name="厦门市")

        # A new house needs a fresh base-table id to link both rows.
        base_id = None
        if not house_id:
            base_id = req_for_serial_number(code="WD_JZ_FJ_BASE")

        # DATA table row: the common columns plus the listing columns.
        data_dict = dict(re_data)
        data_dict["P_ID_"] = house_id if house_id else base_id
        if "TITLE_" in data:
            data_dict["TITLE_"] = data["TITLE_"].replace("|", "")
        if "PUBLISH_TIME_" in data:
            data_dict["PUBLISH_TIME_"] = data["PUBLISH_TIME_"]
        else:
            data_dict["PUBLISH_TIME_"] = data["DATETIME_"][:10]
        # First numeric run of the raw price, or 0 when there is none.
        price = re.findall(r"[\d.]+", data["PRICE_"])
        data_dict["PRICE_"] = price[0] if price else 0
        data_dict["USE_TYPE_"] = "RENT" if "租赁" in data["ENTITY_NAME_"] else "SALE"
        data_row = {"TABLE_NAME_": self.data_table_name, "DATA_": data_dict}

        # House already in the base table: only the DATA row is written.
        if house_id:
            return [data_row]

        # Base table row
        basic_dict = dict(re_data)
        basic_dict["ID_"] = base_id
        basic_dict["URL_"] = data["URL_"]
        basic_dict["PROVINCE_CODE_"] = "3500"
        basic_dict["PROVINCE_NAME_"] = "福建省"
        basic_dict["CITY_CODE_"] = "350200"
        basic_dict["CITY_NAME_"] = "厦门市"
        basic_dict["SALE_PRICE_"] = 0
        basic_dict["RENT_PRICE_"] = 0
        if "YEAR_" in data:
            year = re.findall(r"\d+", data["YEAR_"])
            if year:
                basic_dict["YEAR_"] = year[0]

        # Address analysis
        try:
            if basic_dict["PROVINCE_NAME_"] == basic_dict["CITY_NAME_"]:
                basic_dict["ADDR_"] = basic_dict["PROVINCE_NAME_"] + basic_dict["NAME_"]
            else:
                basic_dict["ADDR_"] = (basic_dict["PROVINCE_NAME_"]
                                       + basic_dict["CITY_NAME_"]
                                       + basic_dict["NAME_"])
            res = req_for_textLoc(text=basic_dict["ADDR_"])
        except Exception as e:
            # data.get keeps a record without "_id" from raising inside the
            # handler and hiding the original error.
            self.logger.exception(
                f"2.2--err: 请求模型 req_for_textLoc 错误."
                f" 原始数据 collection = {self.m_client.mongo_collection};"
                f" ENTITY_CODE_ = {self.entity_code};"
                f" 原始数据 _id = {data.get('_id')};"
                f" error: {e}.")
        else:
            if "error" not in res:
                tags_id = res["tagsId"]
                if tags_id is not None and tags_id != "None":
                    basic_dict["TAGS_"] = tags_id
                if res["flag"] == 1:
                    basic_dict["ADDR_"] = res["full"]
                else:
                    basic_dict["ADDR_"] = data["ADDR_"]
                # Geocode the resolved address; both failure paths leave
                # LAT_/LNG_ as None.
                try:
                    lat_result = get_lat_lng(address=basic_dict["ADDR_"])
                    location = lat_result["result"]["location"]
                    lat, lng = location["lat"], location["lng"]
                except KeyError:
                    lat = lng = None
                except Exception as e:
                    self.logger.info(f"获取经纬度失败, ERROR: {e}")
                    lat = lng = None
                basic_dict["LAT_"] = lat
                basic_dict["LNG_"] = lng
                if basic_dict["LAT_"]:
                    try:
                        area_result = get_area(",".join([
                            str(basic_dict["LAT_"]),
                            str(basic_dict["LNG_"])
                        ]))
                    except Exception as e:
                        self.logger.info(f"获取地址失败, ERROR: {e}")
                    else:
                        try:
                            component = area_result["result"]["addressComponent"]
                            basic_dict["AREA_NAME_"] = component["district"]
                            basic_dict["AREA_CODE_"] = component["adcode"]
                        except KeyError:
                            # Best effort: district columns stay unset.
                            pass
                        try:
                            basic_dict["ADDR_"] = area_result["result"]["formatted_address"]
                        except KeyError:
                            pass

        basic_dict["M_STATUS_"] = "N"
        basic_dict["DELETE_STATUS_"] = "N"
        # Source site (scheme + host of the URL) and site name.  These used
        # to be stored on re_data after both output dicts had been copied
        # from it, so they never reached any row; they now go on the base
        # row next to SOURCE_TYPE_.
        source = re.findall(r"(https?://.*?)/", data["URL_"])
        if source:
            basic_dict["SOURCE_"] = source[0]
        basic_dict["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]
        basic_dict["SOURCE_TYPE_"] = "链家"
        # NOTE(review): this unconditionally replaces whatever address the
        # analysis above resolved with the raw one - confirm that is intended.
        basic_dict["ADDR_"] = data["ADDR_"]

        return [data_row, {
            "TABLE_NAME_": self.base_table_name,
            "DATA_": basic_dict
        }]
Esempio n. 22
0
    def generic_shuffle(self, data, field="PRO_NAME_"):
        """
        Clean one raw insurance-product record into the output row format.

        Put the cleaning rules here; a subclass that does not need the generic
        cleaning rules should not inherit this method.

        :param data: a raw record (mapping) or a list of such records. A single
            record must provide "URL_", "ENTITY_NAME_" and "PRO_NAME_"; "_id"
            is read when a failure is logged. All other fields are optional
            and are copied only when present.
        :param field: not read by this method; the parent ``generic_shuffle``
            is always called with ``field="ENTITY_NAME_"``.
        :return: for a single record, a one-element list
            ``[{"TABLE_NAME_": self.script_name, "DATA_": re_data}]``; for a
            list input, one wrapper dict per item whose "DATA_" is the result
            of cleaning that item.
        """
        # A list of records is cleaned item by item.
        # NOTE(review): each recursive call already returns a wrapped
        # one-element list, so "DATA_" below holds that list rather than the
        # bare row. Kept as-is; confirm callers expect this nesting.
        if isinstance(data, list):
            return [
                {"TABLE_NAME_": self.script_name, "DATA_": self.generic_shuffle(item)}
                for item in data
            ]

        re_data = dict()
        # NOTE(review): the "TEST" suffix looks like a leftover marker - confirm.
        re_data["ID_"] = req_for_serial_number(code="JRCP_BX") + "TEST"

        # Source site: scheme + host taken from the record URL. The pattern
        # needs a "/" after the host, so a bare "https://host" yields no match;
        # skip the field in that case instead of raising IndexError.
        source = re.findall(r"(https?://.*?)/", data["URL_"])
        if source:
            re_data["SOURCE_"] = source[0]
        else:
            self.logger.warning(
                f"_id: {data.get('_id')}, cannot extract SOURCE_ from URL_: {data['URL_']}")
        re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"]
        re_data["VERSION_"] = "0"
        re_data["DATA_VERSION_"] = "0"
        # todo: source type is not filled in yet
        re_data["SOURCE_TYPE_"] = ""

        re_data["HOT_"] = data.get("HOT_", "0")
        re_data["PRO_NAME_"] = data["PRO_NAME_"]

        # Insurance company: normalise to a known company when the raw name
        # contains / is contained in its NAME_, or is contained in its ALIAS_.
        # There is no early exit, so the last matching company wins.
        # NOTE(review): an empty COM_NAME_ is a substring of every NAME_ and
        # therefore matches every company - confirm upstream never sends "".
        if "COM_NAME_" in data:
            raw_company = data["COM_NAME_"]
            for each in self.company_list:
                name = each["NAME_"]
                if not name:
                    continue
                # ALIAS_ is only consulted when the NAME_ comparison fails.
                if (raw_company in name or name in raw_company
                        or (each["ALIAS_"] and raw_company in each["ALIAS_"])):
                    re_data["COM_NAME_"] = name
                    re_data["COM_NAME_CODE_"] = each["CODE_"]
            # No known company matched: keep the raw name.
            re_data.setdefault("COM_NAME_", raw_company)

        # Optional (back-filled) fields copied verbatim when present:
        # insured amount, premium, product features, brief, insurable age,
        # insurance period, purchase limit, policy form and its code,
        # suitable population, original category, insurance type and its code.
        for key in ("ENSURE_PRICE_", "ENSURE_FEE_", "SPECAIL_", "BRIEF_",
                    "AGE_", "ENSURE_DATE_", "BUY_LIMIT_", "ENSURE_MODE_",
                    "ENSURE_MODE_CODE_", "SUIT_", "ENSURE_SOURCE_TYPE_",
                    "ENSURE_TYPE_", "ENSURE_TYPE_CODE_"):
            if key in data:
                re_data[key] = data[key]

        # Recommended / best-selling / newest flags all default to "N".
        re_data["RECOMMEND_"] = "N"
        re_data["GOOD_SALE_"] = "N"
        re_data["NEW_SALE_"] = "N"

        # Coverage content, purchase notice and product detail (back-filled).
        for key in ("ENSURE_CONTENT_", "NOTICE_", "PRO_DETAIL_"):
            if key in data:
                re_data[key] = data[key]

        # Payment mode: normalise the wording, map unknown values to "其他",
        # then look up the code in self.pay_type.
        if "ENSURE_PAY_" in data:
            pay_mode = data["ENSURE_PAY_"].strip().replace("交", "缴")
            if pay_mode not in self.pay_type:
                pay_mode = "其他"
            re_data["ENSURE_PAY_"] = pay_mode
            re_data["ENSURE_PAY_CODE_"] = self.pay_type[pay_mode]
        # No payment mode given: try once more to derive it from the product name.
        elif re.search(r"期[缴交]", data["PRO_NAME_"]):
            re_data["ENSURE_PAY_"] = "期缴"
            re_data["ENSURE_PAY_CODE_"] = "QJ"
        elif re.search(r"趸[缴交]", data["PRO_NAME_"]):
            re_data["ENSURE_PAY_"] = "趸缴"
            re_data["ENSURE_PAY_CODE_"] = "DJ"

        # Upload the local PDF to file storage (best effort: failures are
        # logged and the record is still produced).
        if "LOCAL_PDF_PATH_" in data:
            try:
                # The context manager closes the handle even when the upload fails.
                with open(data["LOCAL_PDF_PATH_"], "rb") as pdf_file:
                    p_response = req_for_file_save(id=re_data["ID_"], type_code="CHA_INSURANCE_PDF",
                                                   file_name=data["LOCAL_PDF_NAME_"], postfix="pdf",
                                                   file=pdf_file)
                    p_response.close()
            except Exception as e:
                self.logger.warning(f"_id: {data['_id']},文件上传失败, ERROR: {e}")

        # Download the document referenced by WORD_ and upload it (best effort).
        if "WORD_" in data:
            try:
                response = req_for_something(url=data["WORD_"])
            except Exception as e:
                self.logger.warning(f"_id: {data['_id']},获取PDF失败, ERROR: {e}")
            else:
                if response:
                    try:
                        p_response = req_for_file_save(id=re_data["ID_"], type_code="CHA_INSURANCE_WORD",
                                                       file_name=data["PDF_NAME_"].replace(".doc", ""), postfix="doc",
                                                       file=response.content)
                        self.logger.info(f"{p_response.content.decode('utf-8')}")
                        p_response.close()
                    except Exception as e:
                        self.logger.warning(f"_id: {data['_id']},文件上传失败, ERROR: {e}")
                    finally:
                        response.close()
                else:
                    self.logger.warning(f'id: {data["_id"]},获取PDF失败')

        # Drop the raw HTML before handing the record to the parent cleaner.
        data.pop("HTML_", None)
        re_data = super(BranchInsurance, self).generic_shuffle(data=data, re_data=re_data, field="ENTITY_NAME_")
        re_data["PUBLISH_TIME_"] = re_data["SPIDER_TIME_"]
        return [{"TABLE_NAME_": self.script_name, "DATA_": re_data}]