Exemple #1
0
def data_shuffle(data):
    """Resolve the record's PDF/HTML link and derive its risk-level code.

    * ``PDF_`` containing ``.html``: the page is fetched with
      ``req_for_something``, stored as UTF-8 text under ``HTML_`` and
      ``PDF_`` is removed.
    * ``PDF_`` without ``http`` in it: the key is removed.
    * ``RISK_LEVEL_`` (or, only when that key is absent,
      ``SOURCE_RISK_LEVEL_``) is mapped to ``RISK_LEVEL_CODE_``; labels that
      are not in the table leave the record untouched.

    :param data: record dict, modified in place.
    :return: the same ``data`` dict.
    """
    # Ordered (label, code) pairs; compared with == exactly like the
    # original if/elif ladder.
    risk_codes = (
        ("低风险", "R1"),
        ("中低风险", "R2"),
        ("较低风险", "R2"),
        ("中等风险", "R3"),
        ("中高风险", "R4"),
        ("高风险", "R5"),
    )

    if "PDF_" in data:
        link = data["PDF_"]
        if ".html" in link:
            page = req_for_something(url=link)
            data["HTML_"] = page.content.decode("UTF-8")
            del data["PDF_"]
        elif "http" not in link:
            del data["PDF_"]

    # RISK_LEVEL_ takes precedence: when it is present, SOURCE_RISK_LEVEL_
    # is never consulted, even if RISK_LEVEL_ holds an unknown label.
    if "RISK_LEVEL_" in data:
        level_key = "RISK_LEVEL_"
    elif "SOURCE_RISK_LEVEL_" in data:
        level_key = "SOURCE_RISK_LEVEL_"
    else:
        level_key = None

    if level_key is not None:
        level = data[level_key]
        for label, code in risk_codes:
            if level == label:
                data["RISK_LEVEL_CODE_"] = code
                break
    return data
Exemple #2
0
def data_shuffle(data):
    """Clean the raise date, yield range, minimum amount, risk code and PDF link.

    :param data: record dict, modified in place.
    :return: the same ``data`` dict.
    """
    # Keep only digits and dashes in the raise start date.
    if "RAISE_START_" in data:
        data["RAISE_START_"] = re.sub(r"[^\d-]", "", data["RAISE_START_"])

    # A dash-separated yield is split: first part stays in YIELD_HIGH_,
    # second part goes to YIELD_LOW_.
    # NOTE(review): for a "low-high" range this stores the bounds swapped --
    # confirm the source format before relying on these names.
    if "YIELD_HIGH_" in data and "-" in data["YIELD_HIGH_"]:
        parts = data["YIELD_HIGH_"].split("-")
        data["YIELD_HIGH_"], data["YIELD_LOW_"] = parts[0], parts[1]

    # Minimum purchase amount: "W" is expanded to four zeros.
    if "START_FUNDS_" in data:
        data["START_FUNDS_"] = data["START_FUNDS_"].replace("W", "0000")

    # Risk level: first keyword contained in the source label wins.
    if "SOURCE_RISK_LEVEL_" in data:
        keyword_codes = (
            ("很低", "R1"),
            ("较低", "R2"),
            ("中低", "R3"),
            ("中高", "R4"),
            ("较高", "R4"),
            ("很高", "R5"),
        )
        for keyword, code in keyword_codes:
            if keyword in data["SOURCE_RISK_LEVEL_"]:
                data["RISK_LEVEL_CODE_"] = code
                break

    # PDF: the stored link is a landing page; pull the real file name out of
    # its ``pdf_filename = "...";`` assignment when one is found.
    if "PDF_" in data and data["PDF_"]:
        page = req_for_something(url=data["PDF_"])
        page_text = page.content.decode("gbk")
        matches = re.findall(r"pdf_filename = \"(.*)\";", page_text)
        if matches:
            data["PDF_"] = matches[0]
    return data
Exemple #3
0
    def generic_shuffle(self, data, re_data, field="CONTENT_"):
        """Download the record's attachments and record their stored file names.

        For each index 1..9, when ``FJ{n}_NAME_`` is present and ``FJ{n}_URL_``
        is truthy, the attachment is fetched with ``req_for_something``,
        written under ``src_dir`` and its stored name is saved in the result
        as ``FILE_NAME_{n}_``.

        :param data: source record; it is only read here.
        :param re_data: accepted for interface compatibility. Its incoming
            value is not read: the result is a deep copy of ``data``.
        :param field: not used in this method.
        :return: deep copy of ``data`` plus any ``FILE_NAME_{n}_`` entries.
        """
        # NOTE(review): the incoming re_data is discarded here, so anything the
        # caller accumulated in it is lost -- confirm this is intended.
        re_data = deepcopy(data)

        # Extensions stripped from the attachment's display name. Longer
        # suffixes come before their prefixes ('.xlsx' before '.xls', '.docx'
        # before '.doc') so that 'a.docx' becomes 'a' rather than 'ax'.
        known_suffixes = ('.xlsx', '.xls', '.docx', '.doc', '.zip', '.pdf',
                          '.PDF')

        # File storage
        for idx in range(1, 10):
            name_key = f"FJ{idx}_NAME_"
            url_key = f"FJ{idx}_URL_"
            if name_key not in data or not data.get(url_key):
                continue

            # Prefer the type derived from the URL, fall back to the name.
            file_type = find_type(data.get(url_key)) or find_type(
                data.get(name_key))
            if not file_type:
                # NOTE(review): this stops at the first attachment of unknown
                # type and skips the remaining ones -- kept as in the original.
                return re_data

            try:
                response = req_for_something(url=data[url_key])
            except Exception:
                self.logger.exception('文件获取出错')
                continue
            if not response:
                continue

            try:
                # todo: decide whether an upload error should abort or skip
                number = 3932
                serial_number = req_for_serial_number(code="GOV_ZX_GDS")

                base_name = data.get(name_key)
                for suffix in known_suffixes:
                    base_name = base_name.replace(suffix, '')

                # One name for both the file on disk and the stored record, so
                # the two can no longer disagree.
                stored_name = str(
                    int(serial_number[5:13]) - number
                ) + '-' + base_name + file_type
                re_data[f'FILE_NAME_{idx}_'] = stored_name

                with open(src_dir + stored_name, 'wb+') as fp:
                    fp.write(response.content)
                print('保存文件成功', '  ', stored_name)
            except Exception as e:
                self.logger.exception(
                    f"2.1--err: PDF"
                    f" 原始数据 collection = {self.m_client.mongo_collection};"
                    f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                    f"error: {e}.")
            finally:
                response.close()

        return re_data
def data_shuffle(data):
    """Replace ``PDF_{i}_`` links that point at .htm pages with the PDF they embed.

    Runs only when ``PDF_1_`` is present. For every index 0..9 whose link
    contains ``.HTM`` or ``.htm``, the page is fetched and the first
    ``/<name>.pdf`` found in it is appended to the link's directory prefix.
    Any error for a single index (missing key, failed request, no prefix
    match, decode failure) is swallowed and that index is skipped.

    :param data: record dict, modified in place.
    :return: the same ``data`` dict.
    """
    if "PDF_1_" not in data.keys():
        return data

    for index in range(10):
        key = f"PDF_{index}_"
        try:
            link = data[key]
            if ".HTM" not in link and ".htm" not in link:
                continue
            page = req_for_something(url=link)
            if not page:
                continue
            # Greedy match: everything up to and including the last "/".
            prefix = re.findall(r"https?://.*/", link)[0]
            pdf_paths = re.findall(r"/\w+\.pdf", page.content.decode("utf-8"))
            if pdf_paths:
                # Drop the prefix's trailing "/" since the path starts with one.
                data[key] = prefix[:-1] + pdf_paths[0]
        except Exception:
            # Best effort: a bad entry must not stop the remaining ones.
            continue

    # data["IMAGES_"] = data["PRO_DETAIL_"]
    # del data["PRO_DETAIL_"]
    return data
Exemple #5
0
    def __shuffle(self, data):
        """Fill in bookkeeping fields of a record and inline its image as base64.

        Adds ``ID_`` (from ``req_for_serial_number``), creation metadata,
        status flags, ``PERIOD_CODE_``, ``SOURCE_`` / ``SOURCE_NAME_`` and
        ``IMG_``, then removes the raw ``IMG`` and ``DATETIME_`` keys.

        :param data: record dict; ``DATETIME_``, ``URL_`` and ``ENTITY_NAME_``
            are read unconditionally. Modified in place.
        :return: the same ``data`` dict.
        :raises KeyError: when one of the unconditionally read keys is missing.
        """
        data["ID_"] = req_for_serial_number(code="CRM_JJK")

        # Creation time and operator
        data["CREATE_TIME_"] = time.strftime("%Y-%m-%d %H:%M:%S",
                                             time.localtime())
        data["CREATE_BY_ID_"] = CREATE_ID
        data["CREATE_BY_NAME_"] = CREATE_NAME
        data["M_STATUS_"] = "N"
        data["DELETE_STATUS_"] = "N"
        data["DATA_STATUS_"] = "UNCHECK"
        data["PUBLISH_STATUS_"] = "N"
        data["HOT_"] = "0"
        # First ten characters of DATETIME_ with the dashes removed.
        data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace("-", "")
        # Scheme + host of the record URL; SOURCE_ stays unset without a match.
        source = re.findall(r"(https?://.*?)/", data["URL_"])
        if source:
            data["SOURCE_"] = source[0]
        data["SOURCE_NAME_"] = data["ENTITY_NAME_"]

        # Image handling
        if data.get("IMG"):
            try:
                response = req_for_something(url=data["IMG"])
            except Exception as e:
                self.logger.exception(f"2.1--err: IMG"
                                      f" 原始数据 collection = {self.m_client.mongo_collection};"
                                      f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                                      f"error: {e}.")
            else:
                # IMG_ is only written for a truthy response; a failed or
                # falsy download leaves it as it was, as before.
                try:
                    if response:
                        encoded = base64.b64encode(response.content)
                        data["IMG_"] = encoded.decode("utf-8")
                finally:
                    # The helper's return type is not visible here; guard
                    # against None so cleanup cannot raise AttributeError.
                    if response is not None:
                        response.close()
        else:
            data["IMG_"] = ""

        # The IMG key may be absent (that is what the else-branch above
        # handles), so remove it without raising.
        data.pop("IMG", None)
        del data["DATETIME_"]
        return data
def data_shuffle(data):
    """Inline an HTML prospectus, extract its registration code, map the risk level.

    When ``PDF_`` ends with ``html`` or ``HTML`` the page is fetched with
    ``req_for_something``, decoded as GBK into ``HTML_``, ``PDF_NAME_`` is
    copied to ``HTML_NAME_``, a registration code is searched for in the page
    and ``PDF_`` is removed. Independently, ``RISK_LEVEL_`` (or, only when
    that key is absent, ``SOURCE_RISK_LEVEL_``) is mapped to
    ``RISK_LEVEL_CODE_``.

    :param data: record dict, modified in place.
    :return: the same ``data`` dict.
    :raises KeyError: if an HTML link is present but ``PDF_NAME_`` is missing.
    """
    risk_codes = (
        ("低风险", "R1"),
        ("中低风险", "R2"),
        ("较低风险", "R2"),
        ("中等风险", "R3"),
        ("中高风险", "R4"),
        ("高风险", "R5"),
    )

    # The original tested ``[:-4] == "HTML"`` (everything *except* the last
    # four characters), so upper-case links were never recognised. Compare
    # the suffix instead, and tolerate a missing or empty PDF_.
    pdf_link = data.get("PDF_") or ""
    if pdf_link.endswith(("html", "HTML")):
        response = req_for_something(url=pdf_link)
        data["HTML_"] = response.content.decode("gbk")
        data["HTML_NAME_"] = data["PDF_NAME_"]

        # Preferred form: "C" followed by exactly 13 digits.
        regist_code = re.findall(r"C\d{13}", data["HTML_"])
        if regist_code:
            data["REGIST_CODE_"] = regist_code[0]
        else:
            # Fallback: a code split by a stray "C" ("C123C456"); drop every
            # "C" and put a single one back in front.
            regist_code = re.findall(r"C\d+C\d+", data["HTML_"])
            if regist_code:
                data["REGIST_CODE_"] = "C" + regist_code[0].replace("C", "")
        del data["PDF_"]

    # RISK_LEVEL_ takes precedence: when it is present, SOURCE_RISK_LEVEL_
    # is never consulted, even if RISK_LEVEL_ holds an unknown label.
    if "RISK_LEVEL_" in data:
        level_key = "RISK_LEVEL_"
    elif "SOURCE_RISK_LEVEL_" in data:
        level_key = "SOURCE_RISK_LEVEL_"
    else:
        level_key = None

    if level_key is not None:
        level = data[level_key]
        for label, code in risk_codes:
            if level == label:
                data["RISK_LEVEL_CODE_"] = code
                break
    return data
Exemple #7
0
def data_shuffle(data):
    """Expand the spreadsheet referenced by ``data["excel"]`` into per-row records.

    The workbook is downloaded, its first sheet is read with row index 2 as
    the header row, and every row from index 3 onwards becomes a copy of
    ``data`` extended with ``COM_NAME_``, ``PRO_NAME_`` and
    ``ENSURE_SOURCE_TYPE_``. A row with an empty company cell inherits the
    company of the closest earlier row that had one.

    NOTE(review): the visible code builds ``data_list`` but never returns it
    (the function returns ``None``); the snippet may be truncated -- confirm.

    :param data: record dict; nothing happens unless ``excel`` is truthy.
    """
    if data.get("excel"):
        data_list = []
        reply = req_for_something(url=data["excel"])
        book = read_excel(reply.content)
        first_sheet = book.sheet_by_name(book.sheet_names()[0])
        last_company = ""
        headers = first_sheet.row_values(2)
        for row_index in range(3, first_sheet.nrows):
            record = dict(data)
            row = dict(zip(headers, first_sheet.row_values(row_index)))
            # Carry the last seen company name forward over empty cells.
            if row["保险公司"]:
                last_company = row["保险公司"]
            record["COM_NAME_"] = last_company
            record["PRO_NAME_"] = row["保险产品名称"]
            record["ENSURE_SOURCE_TYPE_"] = row["产品类型"]
            data_list.append(record)
Exemple #8
0
    def generic_shuffle(self, data, field="CONTENT_"):
        """Build the cleaned news record (``re_data``) from a raw crawled record.

        Normalises ``PUBLISH_TIME_``, copies or derives the source, author and
        counter fields, neutralises links in the HTML, calls the summary,
        sentiment, censor, hotness, location and credit-card-relevance
        services, and finally delegates to the parent ``generic_shuffle``.

        :param data: raw record. ``URL_``, ``ENTITY_NAME_``, ``ENTITY_CODE_``,
            ``TITLE_``, ``CONTENT_`` and ``HTML_`` are read unconditionally.
            The dict is modified: ``PUBLISH_TIME_`` is rewritten,
            ``CONTENT_HTML_`` is added, ``HTML_`` is deleted and ``IMAGE_``
            may be replaced by base64 text.
        :param field: not used in this method body; the parent is always
            called with ``field="CONTENT_"``.
        :return: ``None`` when ``PUBLISH_TIME_`` is missing, otherwise a
            one-element list ``[{"TABLE_NAME_": ..., "DATA_": re_data}]``.
        """
        # different shuffle rule
        re_data = dict()

        if "PUBLISH_TIME_" not in data:
            return None
        # Time dimension
        # Already in dashed numeric form: keep as is.
        if re.findall(r"\d{4}-\d{1,2}-\d{1,2}", data["PUBLISH_TIME_"]):
            pass
        # Numeric date written with the year/month/day characters: turn it
        # into the dashed form.
        elif re.findall(r"\d{4}年\d{1,2}月\d{1,2}日", data["PUBLISH_TIME_"]):
            data["PUBLISH_TIME_"] = data["PUBLISH_TIME_"].replace("年", "-")
            data["PUBLISH_TIME_"] = data["PUBLISH_TIME_"].replace("月", "-")
            data["PUBLISH_TIME_"] = data["PUBLISH_TIME_"].replace("日", "")

        else:
            # Date written with Chinese numerals: translate the first ten
            # characters one by one through self.number_dict.
            # NOTE(review): a character missing from number_dict raises
            # KeyError here -- confirm the dict covers every expected one.
            if ("年" in data["PUBLISH_TIME_"]) and ("月" in data["PUBLISH_TIME_"]) and ("二" in data["PUBLISH_TIME_"]):
                format_list = list()
                for i in data["PUBLISH_TIME_"][:10]:
                    format_list.append(self.number_dict[i])
                    data["PUBLISH_TIME_"] = "".join(format_list)

            # No other cases so far
            # elif
            else:
                # Fall back to a date found between "|" separators in the
                # article text.
                find_time = re.findall(r"\|(\w{4}[-年]\w{1,2}[-月]\w{1,2})日?\W?\|", data["CONTENT_"])
                if find_time:
                    if "二" in find_time[0]:
                        format_list = list()
                        for i in find_time[0]:
                            format_list.append(self.number_dict[i])
                            data["PUBLISH_TIME_"] = "".join(format_list)
                    else:
                        data["PUBLISH_TIME_"] = find_time[0]
                        data["PUBLISH_TIME_"] = data["PUBLISH_TIME_"].replace("年", "-")
                        data["PUBLISH_TIME_"] = data["PUBLISH_TIME_"].replace("月", "-")
                        data["PUBLISH_TIME_"] = data["PUBLISH_TIME_"].replace("日", "")
                else:
                    data["PUBLISH_TIME_"] = ""

        # Trim the year to four characters and bring month and day to exactly
        # two characters each (zero-pad or truncate).
        # NOTE: a value with fewer than three "-" separated parts raises
        # IndexError below.
        if data["PUBLISH_TIME_"]:
            shuffle_list = data["PUBLISH_TIME_"].split("-")
            shuffle_list[0] = shuffle_list[0][:4]
            if len(shuffle_list[1]) == 2:
                pass
            elif len(shuffle_list[1]) == 1:
                shuffle_list[1] = "0" + shuffle_list[1]
            elif len(shuffle_list[1]) > 2:
                shuffle_list[1] = shuffle_list[1][:2]

            if len(shuffle_list[2]) == 2:
                pass
            elif len(shuffle_list[2]) == 1:
                shuffle_list[2] = "0" + shuffle_list[2]
            elif len(shuffle_list[2]) > 2:
                shuffle_list[2] = shuffle_list[2][:2]

            data["PUBLISH_TIME_"] = "-".join(shuffle_list)

        re_data["PERIOD_CODE_"] = data["PUBLISH_TIME_"].replace("-", "")

        # re_data["REMARK_"] = ""

        # Tags
        if "TAGS_" in data:
            re_data["TAGS_"] = ""

        # Data source URL
        # NOTE: source[0] raises IndexError when URL_ has no scheme://host/ part.
        source = re.findall(r"(https?://.*?)/", data["URL_"])
        re_data["SOURCE_"] = source[0]
        # Data source: website name
        re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]

        # Data source code: ENTITY_CODE_ up to (not including) its last "_"
        s_index = data["ENTITY_CODE_"].rfind("_")
        re_data["SOURCE_CODE_"] = data["ENTITY_CODE_"][:s_index]

        # News source category
        re_data["SOURCE_TYPE_"] = data["ENTITY_CODE_"][3:7]

        re_data["PUBLISH_TIME_"] = data["PUBLISH_TIME_"]
        re_data["TITLE_"] = data["TITLE_"]

        # Author
        # NOTE: the [0] below raises IndexError when the editor marker is
        # present but the pattern does not match.
        if "AUTHOR_" in data:
            if "编辑" in data["AUTHOR_"]:
                re_data["AUTHOR_"] = re.findall(r"编辑[::](\w+)", data["AUTHOR_"])[0]
            else:
                re_data["AUTHOR_"] = data["AUTHOR_"]

        re_data["IMPORTANCE_"] = "N"
        # Read count
        if "READ_" in data:
            re_data["READS_"] = data["READ_"]
        else:
            re_data["READS_"] = 0
        # Like count
        if "LIKES_" in data:
            re_data["LIKES_"] = data["LIKES_"]
        else:
            re_data["LIKES_"] = 0
        # Comment count
        if "COMMENTS_" in data:
            re_data["COMMENTS_"] = data["COMMENTS_"]
        elif "COMMENT_" in data:
            re_data["COMMENTS_"] = data["COMMENT_"]
        else:
            re_data["COMMENTS_"] = 0
        # Participation count
        if "JOINS_" in data:
            re_data["JOINS_"] = data["JOINS_"]
        elif "JOIN_" in data:
            re_data["JOINS_"] = data["JOIN_"]
        else:
            re_data["JOINS_"] = 0

        # Content: drop "var ...;|" fragments that are not followed by a letter
        re_data["CONTENT_"] = re.sub(r"(var.*?;\|)(?![a-zA-Z])", "", data["CONTENT_"])

        # HTML markup: every href target is replaced by a no-op javascript link
        re_data['CONTENT_HTML_'] = data["HTML_"]
        data["CONTENT_HTML_"] = data["HTML_"]
        re_data["CONTENT_HTML_"] = re.sub(r"href=\".*?\"", "href=\"javaScript:void(0);\"", re_data["CONTENT_HTML_"])

        # Pages containing these markers get the 'online-desc-con' div and the
        # first <script> removed; failures are logged and the HTML is kept.
        if '28857' in re_data['CONTENT_HTML_'] or '您的IP' in re_data['CONTENT_HTML_']:
            try:
                soup = BeautifulSoup(re_data['CONTENT_HTML_'])
                soup.find('div', attrs={'class': 'online-desc-con'}).decompose()
                soup.find_all('script')[0].decompose()
                re_data['CONTENT_HTML_'] = soup.prettify()
            except Exception as e:
                self.logger.exception(f'IP检测内容清除出错')

        # TODO del data["HTML_] is wrong
        del data["HTML_"]
        re_data["CONTENT_"] = re_data["CONTENT_"].replace("|", "")
        re_data["TITLE_"] = re_data["TITLE_"].replace("|", "")
        # Whether this is a marketing activity
        re_data["ACT_"] = "N"

        # Version
        re_data["VERSION_"] = "0"

        # Replace the image URL in data by the base64 text of the downloaded
        # image; any error is silently ignored and the URL is kept.
        if "IMAGE_" in data:
            try:
                response = req_for_something(url=data["IMAGE_"])
                if response:
                    t = base64.b64encode(response.content)
                    data["IMAGE_"] = t.decode("utf-8")
                    response.close()
            except Exception:
                pass

        # Call the models
        # Summary (first 1000 characters of the content)
        try:
            brief = req_for_ts(re_data["CONTENT_"][0:1000])
        except Exception as e:
            self.logger.exception(f"2.2--err: 请求模型 req_for_ts 错误."
                                  f" 原始数据 collection = {self.m_client.mongo_collection};"
                                  f" ENTITY_CODE_ = {self.entity_code};"
                                  f" 原始数据 _id = {data['_id']};"
                                  f" error: {e}.")
        else:
            if brief:
                re_data["BRIEF_"] = brief["summary"]
            else:
                re_data["BRIEF_"] = '暂无摘要'
        # Sentiment analysis (on the title)
        try:
            sentiment = req_for_senti(re_data["TITLE_"])
        except Exception as e:
            self.logger.exception(f"2.2--err: 请求模型 req_for_senti 错误."
                                  f" 原始数据 collection = {self.m_client.mongo_collection};"
                                  f" ENTITY_CODE_ = {self.entity_code};"
                                  f" 原始数据 _id = {data['_id']};"
                                  f" error: {e}.")
        else:
            # Any other sentiment label leaves EMOTION_ unset.
            if sentiment:
                if sentiment["sentiment"] == "中性":
                    re_data["EMOTION_"] = "NORMAL"
                if sentiment["sentiment"] == "正面":
                    re_data["EMOTION_"] = "POSITIVE"
                if sentiment["sentiment"] == "敏感":
                    re_data["EMOTION_"] = "NAGETIVE"

        # Whether the content is sensitive
        try:
            censor = req_for_censor(re_data["CONTENT_"])
        except Exception as e:
            self.logger.exception(f"2.2--err: 请求模型 req_for_censor 错误."
                                  f" 原始数据 collection = {self.m_client.mongo_collection};"
                                  f" ENTITY_CODE_ = {self.entity_code};"
                                  f" 原始数据 _id = {data['_id']};"
                                  f" error: {e}.")
        else:
            if censor:
                if censor["censor"] == "N":
                    re_data["SENSITIVE_"] = "N"
                else:
                    re_data["SENSITIVE_"] = "Y"
                    re_data["SENSITIVE_WORD_"] = censor["words"]
        # Hotness
        try:
            hot = req_for_news_hot(title=re_data["TITLE_"], content=re_data["CONTENT_"][0:1000])
        except Exception as e:
            self.logger.exception(f"2.2--err: 请求模型 req_for_news_hot 错误."
                                  f" 原始数据 collection = {self.m_client.mongo_collection};"
                                  f" ENTITY_CODE_ = {self.entity_code};"
                                  f" 原始数据 _id = {data['_id']};"
                                  f" error: {e}.")
        else:
            if hot:
                re_data["HOT_"] = hot["level"]

        # Address analysis
        try:
            res = req_for_textLoc(text=re_data["CONTENT_"])
        except Exception as e:
            self.logger.exception(f"2.2--err: 请求模型 req_for_textLoc 错误."
                                  f" 原始数据 collection = {self.m_client.mongo_collection};"
                                  f" ENTITY_CODE_ = {self.entity_code};"
                                  f" 原始数据 _id = {data['_id']};"
                                  f" error: {e}.")
        else:
            if "error" not in res:
                # tagsId may come back as None or as the string "None".
                if res["tagsId"] == "None" or res["tagsId"] is None:
                    pass
                else:
                    re_data["TAGS_"] = res["tagsId"]
                if res["flag"] == 1:
                    address = res["full"]
                else:
                    address = res["addr"]
                # Geocode the address; on any failure both coordinates are None.
                try:
                    lat_result = get_lat_lng(address=address)
                    re_data["LAT_"] = lat_result["result"]["location"]["lat"]
                    re_data["LNG_"] = lat_result["result"]["location"]["lng"]
                except KeyError:
                    re_data["LAT_"] = None
                    re_data["LNG_"] = None
                except Exception as e:
                    self.logger.info(f"获取经纬度失败, ERROR: {e}")
                    re_data["LAT_"] = None
                    re_data["LNG_"] = None
                if re_data["LAT_"]:
                    # Reverse-geocode "lat,lng" into district / city / province.
                    try:
                        area_result = get_area(",".join([str(re_data["LAT_"]), str(re_data["LNG_"])]))
                    except Exception as e:
                        self.logger.info(f"获取地址失败, ERROR: {e}")
                    else:
                        try:
                            re_data["AREA_NAME_"] = area_result["result"]["addressComponent"]["district"]
                        except KeyError:
                            re_data["AREA_NAME_"] = ""
                        try:
                            re_data["AREA_CODE_"] = area_result["result"]["addressComponent"]["adcode"]
                        except KeyError:
                            re_data["AREA_CODE_"] = ""
                        else:
                            # City code: first four characters of the area code
                            # + "00"; province code: first two + "00". Names are
                            # looked up in self.city_list / self.province_list.
                            re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
                            re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
                            for city in self.city_list:
                                if city["CODE_"] == re_data["CITY_CODE_"]:
                                    re_data["CITY_NAME_"] = city["NAME_"]
                                    break
                            for prov in self.province_list:
                                if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                                    re_data["PROVINCE_NAME_"] = prov["NAME_"]
                                    break

        # Credit-card relevance
        try:
            res = req_for_credit_relative(text=re_data["CONTENT_"])
        except Exception as e:
            self.logger.exception(f"2.2--err: 请求模型 req_for_credit_relative 错误."
                                  f" 原始数据 collection = {self.m_client.mongo_collection};"
                                  f" ENTITY_CODE_ = {self.entity_code};"
                                  f" 原始数据 _id = {data['_id']};"
                                  f" error: {e}.")
        else:
            # NOTE(review): unlike the other model calls, res is indexed
            # without a truthiness check -- confirm it is never None here.
            if res["creditrelative"]:
                re_data["MODULE_TYPE_"] = "CREDITCARD"

        # Bank name and code
        if "BANK_NAME_" in data:
            re_data["BANK_NAME_"] = data["BANK_NAME_"]
        if "BANK_CODE_" in data:
            re_data["BANK_CODE_"] = data["BANK_CODE_"]

        # Hand over to the parent implementation; its return value replaces re_data.
        re_data = super(BranchNews, self).generic_shuffle(data=data, re_data=re_data, field="CONTENT_")

        # Treasury items are published directly
        re_data['DATA_STATUS_'] = 'CHECK'
        # Whether published: only records with a publish time are marked "Y"
        if not re_data.get("PUBLISH_TIME_"):
            re_data["PUBLISH_STATUS_"] = "N"
        else:
            re_data["PUBLISH_STATUS_"] = "Y"

        return [{"TABLE_NAME_": self.p_client.table_name, "DATA_": re_data}]
Exemple #9
0
    def generic_shuffle(self, data):
        """
        清洗规则写这里,如不需要通用清洗规则则不继承, 从大文本中筛选数据
        :param data:
        :param field:
        :return:
        """
        re_data = dict()
        re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
        re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        re_data["URL_"] = data["URL_"]

        serial_number = req_for_serial_number(code="JRCP_XYK")
        re_data["ID_"] = serial_number
        # 时间维度
        re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace("-", "")

        source = re.findall(r"(https?://.*?)/", data["URL_"])
        re_data["SOURCE_"] = source[0]
        re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"]
        re_data["SOURCE_TYPE_"] = "WAK"

        # 对特殊微信 BANK_NAME 做处理
        for key, value in self.name_dict.items():
            if key[:2] in data["PRO_NAME_"]:
                re_data["BANK_NAME_"] = key
                re_data["BANK_CODE_"] = value
                break
        if "BANK_NAME_" in re_data:
            if re_data["BANK_NAME_"] == "建信":
                re_data["BANK_NAME_"] = "中国建设银行"
            if re_data["BANK_NAME_"] == "建行":
                re_data["BANK_NAME_"] = "中国建设银行"
            if re_data["BANK_NAME_"] == "建设银行":
                re_data["BANK_NAME_"] = "中国建设银行"
            if re_data["BANK_NAME_"] == "农行":
                re_data["BANK_NAME_"] = "中国农业银行"
            if re_data["BANK_NAME_"] == "农业银行":
                re_data["BANK_NAME_"] = "中国农业银行"
            if re_data["BANK_NAME_"] == "工行":
                re_data["BANK_NAME_"] = "中国工商银行"
            if re_data["BANK_NAME_"] == "工商银行":
                re_data["BANK_NAME_"] = "中国工商银行"
            if re_data["BANK_NAME_"] == "民生银行":
                re_data["BANK_NAME_"] = "中国民生银行"
            if re_data["BANK_NAME_"] == "光大银行":
                re_data["BANK_NAME_"] = "中国光大银行"
            if re_data["BANK_NAME_"] == "交行":
                re_data["BANK_NAME_"] = "交通银行"
            if re_data["BANK_NAME_"] == "招行":
                re_data["BANK_NAME_"] = "招商银行"
            if re_data["BANK_NAME_"] == "农行":
                re_data["BANK_NAME_"] = "中国农业银行"
            if re_data["BANK_NAME_"] == "中行":
                re_data["BANK_NAME_"] = "中国银行"
            if re_data["BANK_NAME_"] == "中银":
                re_data["BANK_NAME_"] = "中国银行"
            if re_data["BANK_NAME_"] == "邮储银行":
                re_data["BANK_NAME_"] = "中国邮政储蓄银行"

        # 信用卡名称
        if "PRO_NAME_" in data:
            if "(" in data["PRO_NAME_"]:
                data["PRO_NAME_"] = data["PRO_NAME_"][:data["PRO_NAME_"].
                                                      find("(")]
            elif "(" in data["PRO_NAME_"]:
                data["PRO_NAME_"] = data["PRO_NAME_"][:data["PRO_NAME_"].
                                                      find("(")]
            re_data["PRO_NAME_"] = data["PRO_NAME_"]
        # 卡币种
        if "CURRENCY_TYPE_" in data:
            re_data["CURRENCY_TYPE_"] = data["CURRENCY_TYPE_"]
        # 卡币种类型
        if data["CURRENCY_TYPE_"] == "人民币":
            re_data["CURRENCY_TYPE_CODE_"] = "RMB"
        if re.match(r"人民币/.*?", data["CURRENCY_TYPE_"]):
            re_data["CURRENCY_TYPE_CODE_"] = "DBZ"
        if data["CURRENCY_TYPE_"] == "美元":
            re_data["CURRENCY_TYPE_CODE_"] = "DBZ"
        # 卡组织|结算渠道
        if "BRAND_" in data:
            re_data["BRAND_"] = data["BRAND_"]

        # 卡组织CODE
        for brand_key in self.brand_dict:
            if brand_key in data["BRAND_"]:
                re_data["BRAND_CODE_"] = self.brand_dict[brand_key]
                break

        # 卡等级
        if "LEVEL_" in data:
            re_data["LEVEL_"] = data["LEVEL_"]
        # 卡等级CODE
        for level_key in self.level_dict:
            if level_key[:2] in data["LEVEL_"][:2]:
                re_data["LEVEL_CODE_"] = self.level_dict[level_key]
                break
        # 取现额度
        if "CONSUME_LIMIT_" in data:
            re_data["CONSUME_LIMIT_"] = data["CONSUME_LIMIT_"]

        # 这里开始从大文本清洗
        # 免息期
        GRACE_PERIODS_ = re.findall(r".*?免息期[::]\|(.*?)\|", data["CONTENT_"])
        if len(GRACE_PERIODS_) > 0:
            GRACE_PERIODS_ = GRACE_PERIODS_[0]
            # 处理到20天50天的错误数据
            pattern = re.compile(r"到(\d+)天(\d+)天")
            if re.match(pattern, GRACE_PERIODS_):
                GRACE_PERIODS_ = pattern.sub(r"\1天到\2天", GRACE_PERIODS_)

            if GRACE_PERIODS_ == "消费验证方式:":
                GRACE_PERIODS_ = ""

            if GRACE_PERIODS_ == "预借现金额度:" or GRACE_PERIODS_ == "预借现金额度:":
                GRACE_PERIODS_ = ""
            if re.match(r"最长\d+天最长\d+天", GRACE_PERIODS_):
                a = re.match(r"(最长\d+天)最长\d+天", GRACE_PERIODS_)
                GRACE_PERIODS_ = a.group(1)

            if re.match(r"\d+天到\d+天\d+天到\d+天", GRACE_PERIODS_):
                a = re.match(r"(\d+天)到(\d+天)(\d+天)到\d+天", GRACE_PERIODS_)
                if a.group(1) == a.group(2):
                    GRACE_PERIODS_ = a.group(1) + "到" + a.group(3)
                else:
                    GRACE_PERIODS_ = a.group(1) + "到" + a.group(2)

            if re.match(r"\d+天\d+天\d+天\d+天", GRACE_PERIODS_):
                a = re.match(r"(\d+天)\d+天(\d+天)\d+天", GRACE_PERIODS_)
                GRACE_PERIODS_ = a.group(1) + "到" + a.group(2)

            if re.match(r"\d+天\d+天", GRACE_PERIODS_):
                a = re.match(r"(\d+天)(\d+天)", GRACE_PERIODS_)
                GRACE_PERIODS_ = a.group(1) + "到" + a.group(2)

            if re.match(r"至\d+天\d+天", GRACE_PERIODS_):
                a = re.match(r"至(\d+天)(\d+天)", GRACE_PERIODS_)
                GRACE_PERIODS_ = a.group(1) + "到" + a.group(2)

            re_data["GRACE_PERIODS_"] = GRACE_PERIODS_
        else:
            re_data["GRACE_PERIODS_"] = data["GRACE_PERIODS_"]

        # 免年费政策
        FREE_POLICY_ = re.findall(r".*?免年费政策[::]\|(.*?)\|", data["CONTENT_"])
        if len(FREE_POLICY_) > 0:
            FREE_POLICY_ = FREE_POLICY_[0]
            # 删除重复数据
            pattern = re.compile(r"(免\d+年年费){2,9}")
            if re.match(pattern, FREE_POLICY_):
                a = re.match(pattern, FREE_POLICY_)
                FREE_POLICY_ = a.group(1)
            pattern = re.compile(r"(终身免年费){2,9}")
            if re.match(pattern, FREE_POLICY_):
                a = re.match(pattern, FREE_POLICY_)
                FREE_POLICY_ = a.group(1)
            re_data["FREE_POLICY_"] = FREE_POLICY_

        # 主卡年费
        FEE_ = re.findall(r".*?主卡年费[::]\|(.*?)\|", data["CONTENT_"])
        if len(FEE_) > 0:
            FEE_ = FEE_[0]
            tempfee = re.findall(r".*?(\d+).*?", FEE_)
            if len(tempfee) > 0:
                re_data["FEE_"] = tempfee[0]
            else:
                re_data["FEE_"] = ""
        else:
            re_data["FEE_"] = "0"

        # 预借现金额度
        PRE_BORROW_ = re.findall(r".*?预借现金额度[::]\|(.*?)\|", data["CONTENT_"])
        if len(PRE_BORROW_) > 0:
            PRE_BORROW_ = PRE_BORROW_[0]
            if PRE_BORROW_ == "免息期:":
                PRE_BORROW_ = ""
            if PRE_BORROW_ == "免年费政策:":
                PRE_BORROW_ = ""
            # 去除重复的数据
            pattern = re.compile(r"(信用额度的\d+%)信用额度的\d+%")
            if re.match(pattern, PRE_BORROW_):
                a = re.match(pattern, PRE_BORROW_)
                PRE_BORROW_ = a.group(1)

            pattern = re.compile(r"(信用额度的\d+-\d+%)信用额度的\d+%")
            if re.match(pattern, PRE_BORROW_):
                a = re.match(pattern, PRE_BORROW_)
                PRE_BORROW_ = a.group(1)

            pattern = re.compile(r"(普卡信用额度的\d+%)白金卡信用额度的\d+%金卡信用额度的\d+%")
            if re.match(pattern, PRE_BORROW_):
                a = re.match(pattern, PRE_BORROW_)
                PRE_BORROW_ = a.group(1)

            pattern = re.compile(r"(普卡信用额度的\d+%)金卡信用额度的\d+%")
            if re.match(pattern, PRE_BORROW_):
                a = re.match(pattern, PRE_BORROW_)
                PRE_BORROW_ = a.group(1)

            pattern = re.compile(r"(白金卡信用额度的\d+%)金卡信用额度的\d+%")
            if re.match(pattern, PRE_BORROW_):
                a = re.match(pattern, PRE_BORROW_)
                PRE_BORROW_ = a.group(1)

            re_data["PRE_BORROW_"] = PRE_BORROW_
        else:
            re_data["PRE_BORROW_"] = ""

        # 消费验证方式
        re_data["VALID_CONSUME_"] = "密码+签名 签名"

        # 账单日
        BILL_DATE_ = re.findall(r".*?账单日[::]\|(.*?)\|", data["CONTENT_"])
        if len(BILL_DATE_) > 0:
            BILL_DATE_ = BILL_DATE_[0]
            # 处理重复的账单日数据 比如:账单日21号账单日21号账单日21号
            pattern = re.compile(r"(账单日\d+号){2,9}")
            if re.match(pattern, BILL_DATE_):
                a = re.match(pattern, BILL_DATE_)
                BILL_DATE_ = a.group(1)
            re_data["BILL_DATE_"] = BILL_DATE_
        else:
            re_data["BILL_DATE_"] = ""

        # 积分方式
        POINTS_ = re.findall(r".*?积分方式[::]\|(.*?)\|", data["CONTENT_"])
        if len(POINTS_) > 0:
            POINTS_ = POINTS_[0]
            if re_data.get("BANK_CODE_") and re_data["BANK_CODE_"] == "CMB":
                POINTS_ = POINTS_.replace("元", "元 ")
            else:
                POINTS_ = POINTS_.replace("分", "分 ")
                POINTS_ = POINTS_.replace("倍", "倍 ")
                POINTS_ = POINTS_.replace("积分 的2倍", "积分的2倍")
            re_data["POINTS_"] = POINTS_
        else:
            re_data["POINTS_"] = ""

        # 积分有效期
        VALID_DATE_POINTS_ = re.findall(r".*?积分有效期[::]\|(.*?)\|",
                                        data["CONTENT_"])
        if len(VALID_DATE_POINTS_) > 0:
            VALID_DATE_POINTS_ = VALID_DATE_POINTS_[0]
            # 给几组有效期之间加上空格
            pattern = re.compile(r"(白金卡\d+年)(金卡\d+年)(普卡\d+年)")
            if re.match(pattern, VALID_DATE_POINTS_):
                VALID_DATE_POINTS_ = re.sub(pattern, r"\1 \2 \3",
                                            VALID_DATE_POINTS_)

            pattern = re.compile(r"(\d+年到\d+年)(\d+年)(永久有效)")
            if re.match(pattern, VALID_DATE_POINTS_):
                VALID_DATE_POINTS_ = re.sub(pattern, r"\1 \2 \3",
                                            VALID_DATE_POINTS_)

            re_data["VALID_DATE_POINTS_"] = VALID_DATE_POINTS_
        else:
            re_data["VALID_DATE_POINTS_"] = ""

        # 循环信用利息
        DAILY_INTEREST_ = re.findall(r".*?循环信用利息(日息)[::]?\|(.*?)\|",
                                     data["CONTENT_"])
        if len(DAILY_INTEREST_) > 0:
            DAILY_INTEREST_ = DAILY_INTEREST_[0]
            if DAILY_INTEREST_ == "消费短信通知费:":
                DAILY_INTEREST_ = ""
            re_data["DAILY_INTEREST_"] = DAILY_INTEREST_
        else:
            re_data["DAILY_INTEREST_"] = ""

        # 最低还款
        MIN_REPAY_ = re.findall(r".*?最低还款[::]?\|(.*?)\|", data["CONTENT_"])
        if len(MIN_REPAY_) > 0:
            MIN_REPAY_ = MIN_REPAY_[0]
            if re.match(r"最低应还所欠金额的\d+%最低应还所欠金额的\d+%", MIN_REPAY_):
                a = re.match(r"(最低应还所欠金额的\d+%)最低应还所欠金额的\d+%", MIN_REPAY_)
                MIN_REPAY_ = a.group(1)
            if MIN_REPAY_ == "账单日:":
                MIN_REPAY_ = ""
            re_data["MIN_REPAY_"] = MIN_REPAY_
        else:
            re_data["MIN_REPAY_"] = ""

        # 卡片特色
        if "SPECIAL_" in data and len(data["SPECIAL_"]) > 0:
            re_data["SPECIAL_"] = data["SPECIAL_"].replace("|", "<br/>")

        # 增值服务
        if "VAS_" in data and len(data["VAS_"]) > 0:
            re_data["VAS_"] = data["VAS_"].replace("|", "<br/>")

        # 信用卡图片
        # 处理错误的信用卡图片URL
        if "IMAGES_" in data:
            pattern = re.compile(r"https:(http://.*)")
            if re.match(pattern, data["IMAGES_"]):
                a = re.match(pattern, data["IMAGES_"])
                image_url = a.group(1)
            else:
                image_url = data["IMAGES_"]
            response = req_for_something(url=image_url)
            if response:
                t = base64.b64encode(response.content)
                re_data["IMAGE_"] = t.decode("utf-8")

        re_data = super(BranchXyk, self).generic_shuffle(data=data,
                                                         re_data=re_data,
                                                         field=None)
        # print(re_data)
        re_data["PUBLISH_TIME_"] = re_data["SPIDER_TIME_"]
        return [{"TABLE_NAME_": self.script_name, "DATA_": re_data}]
Exemple #10
0
    def generic_shuffle(self, data, re_data, field=None):
        """
        Apply the cleaning steps shared by all scripts to ``re_data``.

        Steps, in order:

        1. assign ``ID_`` from ``req_for_serial_number`` when it is missing;
        2. when ``data["FILE_URL_"]`` is non-empty and either ends in
           pdf/doc/docx (as matched by the suffix regex) or the entity code is
           one of the two whitelisted codes, download the file and hand it to
           ``req_for_file_save``;
        3. copy ``ENTITY_CODE_`` / ``ENTITY_NAME_`` / ``URL_`` from ``data``
           when ``re_data`` lacks them;
        4. stamp creation time/author and the crawl time;
        5. fill the default status/version flags that are still missing.

        Bank matching is not done here; per the original note it is handled
        centrally in the package ``__init__`` module.

        :param data: raw record to clean; ``FILE_URL_`` is always read.
                     type: dict
        :param re_data: partially cleaned record, updated in place.
                        type: dict
        :param field: name of the field to clean ("CONTENT_", "PRO_NAME_",
                      ...) or None; currently not used by this method.
        :return: the cleaned record (the same ``re_data`` object). type: dict
        :raises Exception: when the download returns a falsy response, or when
                           saving the file fails or reports an error.
        """
        if "ID_" not in re_data:
            re_data["ID_"] = req_for_serial_number(
                code=data["ENTITY_CODE_"][:8])

        # Attachment upload
        file_url = data["FILE_URL_"]
        if file_url:
            re_postfix = re.findall(r"\.([pd][do][fc]x?$)", file_url)
            if re_postfix or data.get('ENTITY_CODE_') in [
                    'XYK_YJBG_GFYH', 'XYK_YJBG_JTYH'
            ]:
                # Whitelisted entities serve files without a usable suffix;
                # those are stored as pdf.
                postfix = re_postfix[0] if re_postfix else 'pdf'
                if "FILE_NAME_" in data:
                    file_name = data["FILE_NAME_"]
                else:
                    re_file_name = re.findall(rf"/(.*?)\.{postfix}", file_url,
                                              re.IGNORECASE)
                    if re_file_name:
                        file_name = re_file_name[0]
                    else:
                        file_name = str(uuid.uuid1())
                try:
                    response = req_for_something(url=file_url)
                except Exception as e:
                    # A failed download is logged and the record is still
                    # cleaned without an attachment.
                    self.logger.exception(
                        f"2.1--err: PDF"
                        f" 原始数据 collection = {self.m_client.mongo_collection};"
                        f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                        f" 原始数据 _id = {data['_id']};"
                        f"error: {e}.")
                else:
                    print('附件请求成功')
                    if response:
                        try:
                            p_response = req_for_file_save(
                                id=re_data["ID_"],
                                type_code="CHA_YJBG",
                                file_name=file_name,
                                postfix=postfix,
                                file=response.content)
                            # Read the reply once and always release the
                            # upload response, including on the error path.
                            try:
                                upload_result = p_response.content.decode(
                                    "utf-8")
                            finally:
                                p_response.close()
                            if "error" in upload_result:
                                self.logger.info(
                                    f"2.3--err:文件上传错误."
                                    f" 原始数据collection={self.m_client.mongo_collection};"
                                    f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                                    f" 原始数据 _id = {data['_id']};"
                                    f" error: {upload_result}."
                                )
                                raise Exception("上传文件出错")
                            self.logger.info(
                                f"2.3--success: 文件上传成功."
                                f"{upload_result}")
                        except Exception as e:
                            self.logger.exception(
                                f"2.1--err: PDF"
                                f" 原始数据 collection = {self.m_client.mongo_collection};"
                                f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                                f" 原始数据 _id = {data['_id']};"
                                f"error: {e}.")
                            raise Exception("上传文件出错") from e
                        finally:
                            response.close()
                    else:
                        # No exception is active here, so log with error()
                        # rather than exception().
                        self.logger.error(
                            f"2.1--err: PDF"
                            f" 原始数据 collection = {self.m_client.mongo_collection};"
                            f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                            f" 原始数据 _id = {data['_id']};"
                            f"error: PDF 请求失败.")
                        raise Exception("文件请求失败")

        if "ENTITY_CODE_" not in re_data:
            re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
        if "ENTITY_NAME_" not in re_data:
            re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        if "URL_" not in re_data and "URL_" in data:
            re_data["URL_"] = data["URL_"]

        # Creation time and operator
        time_array = time.localtime()
        re_data["CREATE_TIME_"] = time.strftime("%Y-%m-%d %H:%M:%S",
                                                time_array)
        re_data["CREATE_BY_ID_"] = CREATE_ID
        re_data["CREATE_BY_NAME_"] = CREATE_NAME

        # Crawl time: DATETIME_ wins; otherwise DEALTIME_ formatted as a date
        if "DATETIME_" in data:
            re_data["SPIDER_TIME_"] = data["DATETIME_"]
        elif "DEALTIME_" in data:
            re_data["SPIDER_TIME_"] = arrow.get(
                data["DEALTIME_"]).format("YYYY-MM-DD")

        # Defaults applied only where the caller has not set a value
        if "PERIOD_CODE_" not in re_data:
            re_data["PERIOD_CODE_"] = re_data.get("PUBLISH_TIME_", "")
        for flag, default in (
                ("M_STATUS_", "N"),
                ("DELETE_STATUS_", "N"),
                ("DATA_STATUS_", "UNCHECK"),
                ("VERSION_", "0"),
                ("DATA_VERSION_", "0"),
        ):
            if flag not in re_data:
                re_data[flag] = default
        if "MICROBLOG" not in re_data[
                "ENTITY_CODE_"] and "PUBLISH_STATUS_" not in re_data:
            re_data["PUBLISH_STATUS_"] = "N"

        return re_data
Exemple #11
0
    def generic_shuffle(self, data):
        """
        Clean one crawled school record and wrap it for storage.

        Builds a fresh ``re_data`` dict from ``data``: serial ID, period code,
        source fields, coordinates and region (via ``get_lat_lng`` /
        ``get_area``), the optional descriptive fields, a base64 copy of the
        image and a phone string with run-together numbers separated by two
        spaces. The parent class' ``generic_shuffle`` is applied last.

        :param data: raw record; ``DATETIME_``, ``URL_``, ``ENTITY_NAME_`` and
                     ``ENTITY_CODE_`` are read unconditionally. type: dict
        :return: a one-element list ``[{"TABLE_NAME_": ..., "DATA_": re_data}]``
        """
        re_data = dict()
        re_data["ID_"] = req_for_serial_number(code="WD_SS_XX")
        # Period dimension: first 10 characters of DATETIME_ without dashes
        re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace("-", "")
        # Tags: the incoming value is discarded and replaced by an empty string
        if "TAGS_" in data:
            re_data["TAGS_"] = ""
        # SOURCE: scheme and host part of the page URL
        source = re.findall(r"(https?://.*?)/", data["URL_"])
        re_data["SOURCE_"] = source[0]
        # Source name
        re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]
        # Source category
        re_data["SOURCE_TYPE_"] = data["ENTITY_CODE_"][3:8]

        # Latitude / longitude. Both keys are always present afterwards, so
        # the check below cannot raise when the lookup fails with something
        # other than KeyError (previously LAT_ was left unset in that case).
        re_data["LAT_"] = None
        re_data["LNG_"] = None
        try:
            location = get_lat_lng(
                address=data["ADDR_"])["result"]["location"]
            latitude = location["lat"]
            longitude = location["lng"]
        except KeyError:
            pass
        except Exception as e:
            self.logger.info("获取经纬度失败信息为{}".format(e))
        else:
            re_data["LAT_"] = latitude
            re_data["LNG_"] = longitude

        # Province / city / district, looked up from the coordinates
        if re_data["LAT_"]:
            try:
                area_result = get_area(",".join(
                    [str(re_data["LAT_"]),
                     str(re_data["LNG_"])]))
            except Exception as e:
                self.logger.info(f"获取地址失败, ERROR: {e}")
            else:
                try:
                    component = area_result["result"]["addressComponent"]
                    re_data["PROVINCE_NAME_"] = component["province"]
                    re_data["CITY_NAME_"] = component["city"]
                    re_data["AREA_NAME_"] = component["district"]
                    re_data["AREA_CODE_"] = component["adcode"]
                    re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
                    re_data[
                        "PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
                except KeyError:
                    # Keep whatever part of the region was filled in.
                    pass

        # School name
        if "NAME_" in data:
            re_data["NAME_"] = data["NAME_"]
        # Level (city key / district key / national key school)
        if "LEVEL_" in data:
            re_data["LEVEL_"] = data["LEVEL_"]
        # Image, stored as base64 text
        if data.get("IMAGES_"):
            response = req_for_something(url=data["IMAGES_"])
            if response:
                re_data["IMAGES_"] = base64.b64encode(
                    response.content).decode("utf-8")
        # School type
        if "SCHOOL_TYPE_" in data:
            re_data["SCHOOL_TYPE_"] = data["SCHOOL_TYPE_"]
        # School nature
        if "SCHOOL_NATURE_" in data:
            re_data["SCHOOL_NATURE_"] = data["SCHOOL_NATURE_"]

        # Phone: split numbers that were concatenated without a separator.
        # The first matching rule wins, so three-number rules must come before
        # the two-number rules sharing their prefix; in the old order the
        # landline+8 rule shadowed landline+11, landline+8+8 and
        # landline+11+11.
        if "TEL_" in data:
            phone_number = data["TEL_"]
            split_rules = (
                (r"(\d{3,4}-\d{8})(\d{3,4}-\d{8})", r"\1  \2"),
                (r"(\d{3,4}-\d{8})(\d{11})(\d{11})", r"\1  \2  \3"),
                (r"(\d{3,4}-\d{8})(\d{8})(\d{8})", r"\1  \2  \3"),
                (r"(\d{3,4}-\d{8})(\d{11})", r"\1  \2"),
                (r"(\d{3,4}-\d{8})(\d{8})", r"\1  \2"),
                (r"(\d{8})(\d{11})", r"\1  \2"),
                (r"(\d{8})(\d{8})", r"\1  \2"),
                (r"(\d{3,4}-\d{7})(\d{3,4}-\d{7})", r"\1  \2"),
                (r"(\d{3,4}-\d{7})(\d{7})", r"\1  \2"),
            )
            for rule, spaced in split_rules:
                if re.match(rule, phone_number):
                    phone_number = re.sub(rule, spaced, phone_number)
                    break
            re_data["TEL_"] = phone_number
        # Address
        if "ADDR_" in data:
            re_data["ADDR_"] = data["ADDR_"]

        re_data = super(Branchssxx, self).generic_shuffle(data=data,
                                                          re_data=re_data,
                                                          field=None)
        return [{"TABLE_NAME_": self.p_client.table_name, "DATA_": re_data}]
Exemple #12
0
def _extract_page_body(html):
    """
    Return the text that follows the inline stylesheet of a content page.

    :param html: HTML fragment of one field
    :return: the captured text with all whitespace removed, or None when the
             stylesheet marker is not found in the fragment's text
    """
    text = BeautifulSoup(html, "html.parser").getText()
    matched = re.match(
        r"[\s\S]*内容页\*/[\s\S]*\.link-hover{color:#0066cc; border-bottom:1px dashed #ccc;}([\s\S]*)",
        text)
    if matched is None:
        return None
    # \s+ also covers newlines, so a separate newline pass is not needed.
    return re.sub(r"\s+", "", matched.group(1))


def _join_repay_items(html):
    """Join the ``.string`` of every ``div.tt2_1`` in *html* with ``|``."""
    soup = BeautifulSoup(html, "html.parser")
    return "|".join(
        [item.string for item in soup.find_all('div', {"class": "tt2_1"})])


def data_shuffle(data):
    """
    Clean one crawled credit-card record.

    Plain fields are copied as they are, the image is stored as base64 text
    and the HTML fields are reduced to text.

    :param data: raw record; every key read below without an ``in`` check
                 must be present. type: dict
    :return: the cleaned record. type: dict
    """
    re_data = dict()
    # Entity, URL, annual fee, cash-advance amount, highest credit line,
    # card level, card organisation
    for key in ("ENTITY_CODE_", "ENTITY_NAME_", "URL_", "FEE_",
                "CASHING_AMOUNT_", "MOST_AMOUNT_", "CARD_LEVEL_",
                "CARD_ORG_"):
        re_data[key] = data[key]

    # Card image
    if "IMG_" in data:
        response = req_for_something(url=data["IMG_"])
        if response:
            re_data["IMG_"] = base64.b64encode(
                response.content).decode("utf-8")

    # Card name, benefits (text), card attribute, credit line,
    # interest-free period
    for key in ("CARD_NAME_", "POWER_WRITING_", "CARD_ATTR_",
                "CREDIT_AMOUNT_", "INTEREST_FREE_"):
        re_data[key] = data[key]

    # Detailed introduction: the text after the "卡片介绍" heading, with
    # newlines removed
    intro_text = BeautifulSoup(data["INTRO_"], "html.parser").getText()
    intro_match = re.match(r"[\s\S]*卡片介绍([\s\S]*)", intro_text)
    if intro_match:
        re_data["INTRO_"] = re.sub('[\n]+', '', intro_match.group(1))

    # Card introduction
    soup = BeautifulSoup(data["CARD_INTRO_"], "html.parser")
    re_data["CARD_INTRO_"] = soup.find('div', {"class": "adp"}).text

    # Other / branch / online repayment channels. All three are joined after
    # collecting the items, so a field without any item yields "" (the
    # online-repayment value used to be unbound in that case).
    for key in ("OTHER_REPAY_", "OFFLINE_REPAY_", "NET_REPAY_"):
        re_data[key] = _join_repay_items(data[key])

    # Activation
    re_data["ACTIVATE_"] = data["ACTIVATE_"]

    # Points-to-mileage, redemption method, points query, accumulation rule,
    # points validity, prepayment rule, fee deduction method
    for key in ("SCORE_MILEAGE_", "SCORE_METHOD_", "SCORE_SEARCH_",
                "SCORE_ACCU_", "SCORE_VALID_", "PREPAYMENT_",
                "CHARE_DEDUCT_"):
        cleaned = _extract_page_body(data[key])
        if cleaned is not None:
            re_data[key] = cleaned

    # Instalment periods and rates.
    # NOTE(review): the parsed values are only printed, never stored in
    # re_data - confirm whether NUMBER_RATE_ should be populated here.
    from scrapy.selector import Selector
    import requests
    response = requests.get(
        data['URL_'],
        headers={
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
        })
    html = Selector(text=response.content.decode('gb2312'))
    trs = html.xpath(
        '//div[@id="fwq1"]//table[@class="MsoNormalTable"]//tr[position()>5 and position()<last()-1]'
    )
    for offset, tr in enumerate(trs):
        # The selected rows start at the 6th <tr> of the table.
        page = offset + 6
        xpath_ = f'//div[@id="fwq1"]//table[@class="MsoNormalTable"]//tr[{page}]'
        try:
            periods_1 = tr.xpath(xpath_ +
                                 '/td[1]/p/span[1]/text()').extract()[0]
            rate_1 = tr.xpath(xpath_ +
                              '//td[1]/p/span[2]/text()').extract()[-1]

            periods_2 = tr.xpath(xpath_ +
                                 '/td[2]/p/span[1]/text()').extract()[0]
            rate_2 = tr.xpath(xpath_ + '/td[2]/p/span[2]/text()').extract()[-1]
        except Exception:
            # Best effort: a row without the expected cells prints blanks.
            periods_1, rate_1, periods_2, rate_2 = '', '', '', ''
        print(periods_1, rate_1, periods_2, rate_2)
    return re_data
def data_shuffle(data):
    """
    Resolve the product PDF link and derive the risk level code, in place.

    When ``PDF_`` is present, the page it points to is fetched. If that page
    links to a "说明书" page, the PDF path is taken from the linked page;
    otherwise it is taken from the fetched page itself. ``PDF_`` is
    overwritten only when a ``.pdf`` path is found.

    ``RISK_LEVEL_CODE_`` is set from ``RISK_LEVEL_`` when that key exists,
    else from ``SOURCE_RISK_LEVEL_``; unknown level texts leave it unset.

    :param data: record to clean, modified in place. type: dict
    :return: the same ``data`` object
    """
    if "PDF_" in data:
        response = req_for_something(url=data["PDF_"])
        # Pages are decoded as UTF-8 first, falling back to GBK.
        try:
            page_text = response.content.decode("utf-8")
        except UnicodeDecodeError:
            page_text = response.content.decode("gbk")

        manual_links = HTML(page_text).xpath(
            "//a[contains(text(),\"说明书\")]/@href")
        if manual_links:
            manual_url = "http://ewealth.abchina.com/fs" + manual_links[0][1:]
            response2 = req_for_something(url=manual_url)
            try:
                searched_text = response2.content.decode("utf-8")
            except UnicodeDecodeError:
                searched_text = response2.content.decode("gbk")
        else:
            searched_text = page_text

        pdf_paths = re.findall(r"/\w+\.pdf", searched_text)
        if pdf_paths:
            data["PDF_"] = ("http://ewealth.abchina.com/fs/intro_list" +
                            pdf_paths[0])

    # Level text -> code; compared with == in this order, first hit wins.
    level_codes = (
        ("低", "R1"),
        ("中低", "R2"),
        ("较低", "R2"),
        ("中等", "R3"),
        ("中高", "R4"),
        ("高", "R5"),
    )
    # Only the first key that is present is consulted, even if its value
    # is not a known level.
    for level_key in ("RISK_LEVEL_", "SOURCE_RISK_LEVEL_"):
        if level_key not in data:
            continue
        for level_text, level_code in level_codes:
            if data[level_key] == level_text:
                data["RISK_LEVEL_CODE_"] = level_code
                break
        break
    return data
Exemple #14
0
def _expand_region_name(addr, full_name, window):
    """
    Replace an abbreviated region name inside ``addr[:window]`` by *full_name*.

    The abbreviations tried, in order, are the name without its last
    character and its first 4, 3 and 2 characters.

    :return: the repaired address, or None when no abbreviation is found
    """
    head, tail = addr[:window], addr[window:]
    for short_name in (full_name[:-1], full_name[:4], full_name[:3],
                       full_name[:2]):
        if short_name in head:
            return head.replace(short_name, full_name) + tail
    return None


def data_shuffle(data, province_list, city_list, area_list):
    """
    Turn one crawled branch-list page into cleaned branch records.

    The province/city is derived from ``data["CITY_NAME_"]``, the page at
    ``data["URL_"]`` is fetched and split into ``|``-separated rows of
    ``,``-separated fields, and one record is built per row. Each address is
    prefixed with the full province and city names, then passed to
    ``get_lat_lng`` / ``get_area`` to fill coordinates and district; when an
    ``adcode`` is returned it overrides the city/province derived from the
    page.

    :param data: raw record with ``CITY_NAME_``, ``URL_``, ``ENTITY_NAME_``,
                 ``ENTITY_CODE_`` and ``DATETIME_``. type: dict
    :param province_list: dicts with ``CODE_`` and ``NAME_``
    :param city_list: dicts with ``CODE_``, ``NAME_`` and ``PARENT_``; entries
                      named "县" are removed from this list in place
    :param area_list: not used by the current implementation
    :return: list of cleaned record dicts, one per usable row
    """
    # Drop the placeholder "县" entries: their NAME_[:-1] is "" and would
    # match every city name. Filtering in one pass (still in place, as
    # before) avoids skipping entries, which remove() inside the loop did.
    city_list[:] = [city for city in city_list if city["NAME_"] != "县"]

    prov_c = ""
    prov_n = ""
    city_c = ""
    city_n = ""

    # Province / city: the four municipalities are hard-coded, everything
    # else is looked up in city_list and province_list.
    municipalities = (
        ("北京", "北京市", "1100", "110100"),
        ("天津", "天津市", "1200", "120100"),
        ("上海", "上海市", "3100", "310100"),
        ("重庆", "重庆市", "5000", "500100"),
    )
    for keyword, full_name, province_code, city_code in municipalities:
        if keyword in data["CITY_NAME_"]:
            prov_n = full_name
            prov_c = province_code
            city_n = full_name
            city_c = city_code
            break
    else:
        for city in city_list:
            if city["NAME_"][:-1] in data["CITY_NAME_"]:
                city_n = city["NAME_"]
                city_c = city["CODE_"]
                prov_c = city["PARENT_"]
                break
        if prov_c:
            for prov in province_list:
                if prov["CODE_"] == prov_c:
                    prov_n = prov["NAME_"]
                    break

    response = req_for_something(data["URL_"])
    # Keep only word characters and the two separators.
    payload = re.sub(r"[^\w|,]+", "", response.content.decode("utf-8"))

    data_list = list()
    for row in payload.split("|"):
        message = row.split(",")
        # Fields 2..6 are read below; shorter rows (including the empty
        # fragments around separators) carry no usable branch.
        if len(message) < 7:
            continue
        name = message[2]
        addr_ = message[3]
        tel = message[4]
        business_time = message[5] + message[6]

        # Address: make sure it starts with the full province name ...
        if prov_n not in addr_:
            repaired = _expand_region_name(addr_, prov_n, len(prov_n))
            addr_ = repaired if repaired is not None else prov_n + addr_

        # ... followed by the full city name.
        city_window = len(prov_n) + len(city_n)
        if city_n not in addr_[:city_window]:
            repaired = _expand_region_name(addr_, city_n, city_window)
            if repaired is not None:
                addr_ = repaired
            else:
                addr_ = addr_[:len(prov_n)] + city_n + addr_[len(prov_n):]

        re_data = dict()
        # "C"
        re_data["BANK_CODE_"] = "CZB"
        re_data["BANK_NAME_"] = data["ENTITY_NAME_"][:-3]
        re_data["SPIDER_TIME_"] = data["DATETIME_"]

        # "F"
        re_data["ADDR_"] = addr_
        re_data["PROVINCE_NAME_"] = prov_n
        re_data["PROVINCE_CODE_"] = prov_c
        re_data["CITY_CODE_"] = city_c
        re_data["CITY_NAME_"] = city_n
        re_data["NAME_"] = name

        result = get_lat_lng(address=re_data["ADDR_"])
        try:
            re_data["LAT_"] = str(result["result"]["location"]["lat"])
            re_data["LNG_"] = str(result["result"]["location"]["lng"])
        except KeyError:
            re_data["LAT_"] = ""
            re_data["LNG_"] = ""
        else:
            dis_result = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
            try:
                re_data["AREA_NAME_"] = dis_result["result"][
                    "addressComponent"]["district"]
            except KeyError:
                re_data["AREA_NAME_"] = ""
            try:
                re_data["AREA_CODE_"] = dis_result["result"][
                    "addressComponent"]["adcode"]
            except KeyError:
                re_data["AREA_CODE_"] = ""
            else:
                # The returned adcode overrides the page-derived codes;
                # names are refreshed from the lookup lists when found.
                re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
                re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
                for city in city_list:
                    if city["CODE_"] == re_data["CITY_CODE_"]:
                        re_data["CITY_NAME_"] = city["NAME_"]
                        break
                for prov in province_list:
                    if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                        re_data["PROVINCE_NAME_"] = prov["NAME_"]
                        break
        re_data["UNIT_CODE_"] = "CZB" + "_" + re_data.get("CITY_CODE_", "")

        re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
        re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        re_data["URL_"] = data["URL_"]
        re_data["TEL_"] = tel
        re_data["BUSINESS_HOURS_"] = business_time
        if "SOURCE_TYPE_NAME_" in data:
            re_data["SOURCE_TYPE_NAME_"] = data["SOURCE_TYPE_NAME_"]
        re_data["TYPE_NAME_"] = "支行"
        re_data["TYPE_"] = "ZH"

        data_list.append(re_data)

    return data_list
Exemple #15
0
    print(sheet.merged_cells)
    print(len(sheet.merged_cells))
    # # 获取整行和整列的值(数组)
    # rows1 = sheet.row_values(3)  # 获取第四行内容
    # rows2 = sheet.row_values(4)  # 获取第四行内容
    # rows3 = sheet.row_values(5)  # 获取第四行内容
    # cols = sheet.col_values(2)  # 获取第三列内容
    # print(rows1)
    # print(rows2)
    # print(rows3)

    # # 获取单元格内容
    # print
    # sheet2.cell(1, 0).value.encode('utf-8')
    # print
    # sheet2.cell_value(1, 0).encode('utf-8')
    # print
    # sheet2.row(1)[0].value.encode('utf-8')
    #
    # # 获取单元格内容的数据类型
    # print
    # sheet2.cell(1, 0).ctype


if __name__ == '__main__':
    # Manual run: download one sample .xls file and hand its raw bytes to
    # read_excel.
    sample_url = "http://www.hxb.com.cn/images/grjr/zjyw/dlxsl/2018/10/12/12165251E4AD1CEEB164E48700BC924FC58F1BED.xls"
    response = req_for_something(url=sample_url)
    read_excel(response.content)


Exemple #16
0
    def generic_shuffle(self, data, field="PRO_NAME_"):
        """
        Cleaning rules for one crawled insurance-product record.

        When ``data`` is a list, every item is cleaned recursively and wrapped
        as ``{"TABLE_NAME_": self.script_name, "DATA_": <result for the item>}``.

        Otherwise a fresh ``re_data`` dict is built from ``data`` (source URL,
        company matching against ``self.company_list``, optional product
        fields, payment type), attachments referenced by ``LOCAL_PDF_PATH_`` /
        ``WORD_`` are uploaded through ``req_for_file_save``, and the parent
        class' ``generic_shuffle`` is called with ``field="ENTITY_NAME_"`` to
        finish the record.

        Side effect: ``"HTML_"`` is removed from ``data`` before the parent
        call.

        :param data: raw record (dict) or a list of raw records
        :param field: kept for interface compatibility; this method does not
            read it
        :return: list of ``{"TABLE_NAME_": ..., "DATA_": ...}`` dicts
        :raises IndexError: when ``data["URL_"]`` contains no
            ``http(s)://host/`` prefix
        :raises KeyError: when a required key (``URL_``, ``ENTITY_NAME_``,
            ``PRO_NAME_``, ...) is missing
        """
        if isinstance(data, list):
            # NOTE(review): for a dict item the recursive call returns a
            # one-element list, so "DATA_" holds that list rather than the
            # cleaned dict itself -- confirm the consumer expects this nesting.
            return [{"TABLE_NAME_": self.script_name, "DATA_": self.generic_shuffle(item)}
                    for item in data]

        re_data = dict()
        serial_number = req_for_serial_number(code="JRCP_BX")
        re_data["ID_"] = serial_number + "TEST"
        # Scheme + host part of the page URL.
        source = re.findall(r"(https?://.*?)/", data["URL_"])
        re_data["SOURCE_"] = source[0]
        re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"]
        re_data["VERSION_"] = "0"
        re_data["DATA_VERSION_"] = "0"
        # todo
        re_data["SOURCE_TYPE_"] = ""

        re_data["HOT_"] = data["HOT_"] if "HOT_" in data else "0"
        re_data["PRO_NAME_"] = data["PRO_NAME_"]

        # Insurance company: match the raw name against the known companies by
        # substring (either direction) or by alias.  There is no break, so the
        # last matching entry of self.company_list wins.
        if "COM_NAME_" in data:
            raw_company = data["COM_NAME_"]
            for each in self.company_list:
                company_name = each["NAME_"]
                if not company_name:
                    continue
                if (raw_company in company_name or company_name in raw_company
                        or (each["ALIAS_"] and raw_company in each["ALIAS_"])):
                    re_data["COM_NAME_"] = company_name
                    re_data["COM_NAME_CODE_"] = each["CODE_"]
            if "COM_NAME_" not in re_data:
                re_data["COM_NAME_"] = raw_company

        # Optional fields copied verbatim when the crawler supplied them.  The
        # tuple order is the insertion order into re_data.
        for key in ("ENSURE_PRICE_", "ENSURE_FEE_", "SPECAIL_", "BRIEF_", "AGE_",
                    "ENSURE_DATE_", "BUY_LIMIT_", "ENSURE_MODE_", "ENSURE_MODE_CODE_",
                    "SUIT_", "ENSURE_SOURCE_TYPE_", "ENSURE_TYPE_", "ENSURE_TYPE_CODE_"):
            if key in data:
                re_data[key] = data[key]

        # Recommended / best-selling / newest flags.
        re_data["RECOMMEND_"] = "N"
        re_data["GOOD_SALE_"] = "N"
        re_data["NEW_SALE_"] = "N"

        for key in ("ENSURE_CONTENT_", "NOTICE_", "PRO_DETAIL_"):
            if key in data:
                re_data[key] = data[key]

        if "ENSURE_PAY_" in data:
            pay = data["ENSURE_PAY_"].strip().replace("交", "缴")
            if pay not in self.pay_type:
                pay = "其他"
            re_data["ENSURE_PAY_"] = pay
            re_data["ENSURE_PAY_CODE_"] = self.pay_type[pay]
        # No payment type given: try once more to read it from the product name.
        elif re.search(r"期[缴交]", data["PRO_NAME_"]):
            re_data["ENSURE_PAY_"] = "期缴"
            re_data["ENSURE_PAY_CODE_"] = "QJ"
        elif re.search(r"趸[缴交]", data["PRO_NAME_"]):
            re_data["ENSURE_PAY_"] = "趸缴"
            re_data["ENSURE_PAY_CODE_"] = "DJ"

        # File-service upload of a locally stored PDF.
        if "LOCAL_PDF_PATH_" in data:
            try:
                pdf_file_name = data["LOCAL_PDF_NAME_"]
                # The context manager closes the local file even when the
                # upload raises; the original left the handle open.
                with open(data["LOCAL_PDF_PATH_"], "rb") as pdf_file:
                    p_response = req_for_file_save(id=re_data["ID_"], type_code="CHA_INSURANCE_PDF",
                                                   file_name=pdf_file_name, postfix="pdf",
                                                   file=pdf_file)
                p_response.close()
            except Exception as e:
                self.logger.warning(f"_id: {data['_id']},文件上传失败, ERROR: {e}")

        # Download the Word document and upload it to the file service.
        if "WORD_" in data:
            try:
                response = req_for_something(url=data["WORD_"])
            except Exception as e:
                self.logger.warning(f"_id: {data['_id']},获取PDF失败, ERROR: {e}")
            else:
                if response:
                    try:
                        # NOTE(review): the Word file name is taken from
                        # "PDF_NAME_"; a missing key is caught below and only
                        # logged as an upload failure -- confirm this is intended.
                        p_response = req_for_file_save(id=re_data["ID_"], type_code="CHA_INSURANCE_WORD",
                                                       file_name=data["PDF_NAME_"].replace(".doc", ""), postfix="doc",
                                                       file=response.content)
                        self.logger.info(f"{p_response.content.decode('utf-8')}")
                        p_response.close()
                    except Exception as e:
                        self.logger.warning(f"_id: {data['_id']},文件上传失败, ERROR: {e}")
                    finally:
                        response.close()
                else:
                    self.logger.warning(f'id: {data["_id"]},获取PDF失败')

        # Dropped before delegating to the parent cleaning step (presumably so
        # the parent does not process the HTML -- verify against the parent).
        if "HTML_" in data:
            del data["HTML_"]
        re_data = super(BranchInsurance, self).generic_shuffle(data=data, re_data=re_data, field="ENTITY_NAME_")
        re_data["PUBLISH_TIME_"] = re_data["SPIDER_TIME_"]
        return [{"TABLE_NAME_": self.script_name, "DATA_": re_data}]
Exemple #17
0
    def generic_shuffle(self, data):
        """
        Clean one crawled Weibo post together with its comments.

        Builds one record for ``TABLE_NAME("CHA_BRANCH_WEIBO_INFO")`` from the
        post itself (counters, images as base64, summary / censor / hotness
        model results, parent-class cleaning) followed by one record for
        ``TABLE_NAME("CHA_BRANCH_WEIBO_COMMENT")`` per entry of
        ``data["INFO_COMMENTS_"]``.

        Failures of the model helpers (``req_for_ts``, ``req_for_censor``,
        ``req_for_comment``, ``req_for_weibo_hot``) are logged and replaced by
        defaults; they never abort the cleaning.

        :param data: raw post record (dict)
        :return: list of ``{"TABLE_NAME_": ..., "DATA_": ...}`` dicts, post first
        :raises IndexError: when ``data["CONTENT_URL_"]`` contains no
            ``http(s)://host/`` prefix
        :raises KeyError: when a required key of ``data`` is missing
        """
        re_data = list()

        # ---- CHA_BRANCH_WEIBO_INFO ----
        info_data = dict()
        serial_number = req_for_serial_number(code="WEIBO_INFO")
        info_data["ID_"] = serial_number
        print(serial_number)

        info_data["ENTITY_CODE_"] = data["BANK_CODE_"]
        info_data["URL_"] = data["CONTENT_URL_"]
        info_data["PERIOD_CODE_"] = data["PUBLISH_TIME_"].replace("-", "")
        # Data source: scheme + host of the post URL.
        source = re.findall(r"(https?://.*?)/", data["CONTENT_URL_"])
        info_data["SOURCE_"] = source[0]
        # Data source: site name (part of the entity name before the first "-").
        info_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]
        info_data["SOURCE_TYPE_"] = "WEIBO"

        # Counters: any falsy value (None, "", 0) becomes 0.
        info_data["LIKES_"] = data["PRAISES_"] or 0
        info_data["COMMENTS_"] = data["REPLIES_"] or 0
        info_data["RELAYS_"] = data["RELAYS_"] or 0
        info_data["IMPORTANCE_"] = "N"
        info_data["PUBLISH_TIME_"] = data["PUBLISH_TIME_"]
        info_data["CONTENT_"] = data["CONTENT_"]

        # Images are stored base64-encoded under IMAGE_1, IMAGE_2, ...  The
        # position comes from enumerate so that a URL occurring twice keeps
        # both slots (list.index() always returned the first occurrence).
        for position, each_image in enumerate(data.get("CONTENT_IMAGES_") or (), start=1):
            response = req_for_something(url=each_image)
            if response:
                try:
                    encoded = base64.b64encode(response.content)
                    info_data[f"IMAGE_{position}"] = encoded.decode("utf-8")
                finally:
                    response.close()

        info_data["PUBLISH_STATUS_"] = "N"
        if "OWN_" in data:
            # "转载" marks a repost, i.e. not the account's own content.
            info_data["OWN_"] = "N" if data["OWN_"] == "转载" else "Y"

        for each in self.weibo_list:
            if each["WEIBO_NAME_"] == data["ENTITY_NAME_"]:
                info_data["WEIBO_CODE_"] = each["WEIBO_CODE_"]
                info_data["WEIBO_NAME_"] = each["WEIBO_NAME_"]
                break

        # Model: summary.
        try:
            brief = req_for_ts(info_data["CONTENT_"])
            if brief:
                info_data["BRIEF_"] = brief["summary"]
        except Exception as e:
            self.logger.info(f"调用模型req_for_ts失败,原因为{e}")
            info_data["BRIEF_"] = ""

        # Model: sensitive-content check.
        try:
            censor = req_for_censor(info_data["CONTENT_"])
            if censor:
                if censor["censor"] == "N":
                    info_data["SENSITIVE_"] = "N"
                else:
                    info_data["SENSITIVE_"] = "Y"
                    info_data["SENSITIVE_WORD_"] = censor["words"]
        except Exception as e:
            self.logger.info(f"调用模型censor失败,错误为{e}")
            info_data["SENSITIVE_"] = "N"

        info_data["VERSION_"] = "0"
        info_data = super(WeiboScript, self).generic_shuffle(data=data, re_data=info_data, field="ENTITY_NAME_")

        # Fixed BANK_NAME_ / BANK_CODE_ for accounts that need an explicit
        # override after the parent cleaning.
        bank_overrides = {
            "上海浦东发展银行微博": ("浦发银行", "SPDB"),
            "南海农商银行微博": ("广东南海农村商业银行股份有限公司", "NRC"),
            "顺德农商银行微博": ("广东顺德农村商业银行股份有限公司", "sdebank"),
        }
        override = bank_overrides.get(info_data["ENTITY_NAME_"])
        if override is not None:
            info_data["BANK_NAME_"], info_data["BANK_CODE_"] = override

        comment = data["INFO_COMMENTS_"]
        verifieds = sum(1 for c in comment if c.get("VERIFIED_", ""))

        # Model: post hotness.
        try:
            hot = req_for_weibo_hot(publish_time=info_data["PUBLISH_TIME_"], relays=info_data["RELAYS_"],
                                    replies=len(comment), praises=info_data["LIKES_"], verifieds=verifieds)
            if hot:
                info_data["HOT_"] = hot["level"]
        except Exception as e:
            self.logger.info(f"调用模型weibo_hot失败,错误为{e}")

        re_data.append({"TABLE_NAME_": TABLE_NAME("CHA_BRANCH_WEIBO_INFO"), "DATA_": info_data})

        # ---- CHA_BRANCH_WEIBO_COMMENT ----
        if len(comment) > 0:
            # NOTE(review): "NAGETIVE" is kept exactly as stored by the
            # original code; confirm with consumers before correcting it.
            emotion_by_sentiment = {"中性": "NORMAL", "积极": "POSITIVE", "敏感": "NAGETIVE"}
            copied_keys = ("COMMENT_", "REPLIER_TIME_", "REPLIER_HEAD_",
                           "REPLIER_PRAISES_", "REPLIER_", "REPLIER_REPLIES_")
            comment_count = 0
            for each in comment:
                # A fresh dict per comment, otherwise records would share data.
                comment_data = dict()
                # HBase row key.
                serial_number = req_for_serial_number(code="WEIBO_COMMENT")
                comment_data["ID_"] = serial_number
                comment_data["INFO_ID_"] = info_data["ID_"]
                for key in copied_keys:
                    comment_data[key] = each[key]

                if each.get("COMMENT_") and len(each["COMMENT_"]) > 0:
                    # Model: sentiment analysis.
                    try:
                        sentiment = req_for_comment(each["COMMENT_"])
                        if sentiment:
                            emotion = emotion_by_sentiment.get(sentiment["sentiment"])
                            if emotion is not None:
                                comment_data["EMOTION_"] = emotion
                        else:
                            comment_data["EMOTION_"] = "NORMAL"
                    except Exception as e:
                        self.logger.info(f"调用模型req_for_comment失败,错误为{e}")
                        comment_data["EMOTION_"] = "NORMAL"

                    # Model: sensitive-content check.
                    try:
                        censor = req_for_censor(each["COMMENT_"])
                        if censor:
                            if censor["censor"] == "N":
                                comment_data["SENSITIVE_"] = "N"
                            else:
                                comment_data["SENSITIVE_"] = "Y"
                                comment_data["SENSITIVE_WORD_"] = censor["words"]
                        else:
                            comment_data["SENSITIVE_"] = "N"
                    except Exception as e:
                        # The message names the censor model (the original
                        # wrongly reported req_for_comment here).
                        self.logger.info(f"调用模型censor失败,错误为{e}")
                        comment_data["SENSITIVE_"] = "N"

                comment_data["VERSION_"] = "0"
                comment_data["CREATE_BY_ID_"] = "P0131857"
                comment_data["CREATE_BY_NAME_"] = "钟楷文"
                re_data.append({"TABLE_NAME_": TABLE_NAME("CHA_BRANCH_WEIBO_COMMENT"), "DATA_": comment_data})
                comment_count += 1
            # Comment statistics, logged to ease debugging.
            self.logger.info(f'清洗的URL为{info_data["URL_"]}')
            self.logger.info(f'清洗的评论数为{info_data["COMMENTS_"]}')
            self.logger.info(f'插入到comment表的数量为{comment_count}')
        return re_data
Exemple #18
0
    def generic_shuffle(self, data, re_data, field="CONTENT_"):
        """
        Generic cleaning rules of the parent class.

        Steps, in order:

        1. Bank matching on ``data[field]`` (skipped when ``field`` is falsy or
           ``re_data`` already has ``BANK_NAME_``).
        2. ``ID_`` is requested from ``req_for_serial_number`` when missing.
        3. One attachment source is uploaded through ``req_for_file_save``; the
           first of ``HTML_``, ``PDF_``, ``PDF_1_`` (numbered series) and
           ``PDF_URL_`` that is present as a key is used.  Download / upload
           failures are logged and never abort the cleaning.
        4. Bookkeeping fields (entity, URL, creation info, crawl time, status
           flags) are filled in.

        :param data: raw record to clean; type: dict
        :param re_data: cleaned record being built; type: dict (mutated in
            place and returned)
        :param field: name of the field of ``data`` to match banks in; type:
            str ("CONTENT_", "PRO_NAME_", ...) or None to skip bank matching
        :return: the cleaned record; type: dict
        :raises KeyError: when ``ENTITY_CODE_`` / ``ENTITY_NAME_`` are missing
            from both ``data`` and ``re_data``, or when ``data`` has no
            ``_id`` while an error is being logged
        """
        # Banks involved are handled in __init__.py according to the original
        # note; here they are only matched when not already present.
        if field and "BANK_NAME_" not in re_data:
            self._match_bank(data, re_data, field)

        if "ID_" not in re_data:
            serial_number = req_for_serial_number(
                code=data["ENTITY_CODE_"][:7])
            re_data["ID_"] = serial_number

        # File-service storage.  tc is None when no rule matches; the upload
        # helpers then log an explicit error instead of hitting an unbound
        # local variable.
        tc = self._attachment_type_code(data)

        if "HTML_" in data:
            if data["HTML_"]:
                if "HTML_NAME_" in data:
                    html_name = data["HTML_NAME_"]
                elif "PDF_NAME_" in data:
                    html_name = data["PDF_NAME_"]
                else:
                    html_name = str(uuid.uuid1())
                self._store_html(data, re_data, html_name, tc)
        elif "PDF_" in data:
            if data["PDF_"]:
                if "HTML_NAME_" in data:
                    pdf_name = data["HTML_NAME_"]
                elif "PDF_NAME_" in data:
                    pdf_name = data["PDF_NAME_"]
                else:
                    pdf_name = self._pdf_name_from_url(data["PDF_"])
                self._fetch_and_store_pdf(data, re_data, data["PDF_"], pdf_name, tc)
        elif "PDF_1_" in data:
            if data["PDF_1_"]:
                # Numbered attachments PDF_<i>_ for i in 0..9.  The series is
                # announced by "PDF_1_", so a missing index 0 is skipped; the
                # original broke out of the loop there and uploaded nothing.
                for i in range(10):
                    url_key = f"PDF_{i}_"
                    if url_key not in data:
                        if i == 0:
                            continue
                        break
                    name_key = f"PDF_{i}_NAME_"
                    if name_key in data:
                        pdf_name = data[name_key]
                    else:
                        pdf_name = self._pdf_name_from_url(data[url_key])
                    self._fetch_and_store_pdf(data, re_data, data[url_key], pdf_name, tc)
        elif "PDF_URL_" in data:
            if data["PDF_URL_"]:
                if "PDF_NAME_" in data:
                    pdf_name = data["PDF_NAME_"]
                else:
                    pdf_name = self._pdf_name_from_url(data["PDF_URL_"])
                self._fetch_and_store_pdf(data, re_data, data["PDF_URL_"], pdf_name, tc)

        if "ENTITY_CODE_" not in re_data:
            re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
        if "ENTITY_NAME_" not in re_data:
            re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        if "URL_" not in re_data and "URL_" in data:
            re_data["URL_"] = data["URL_"]

        # Creation time and operator.
        re_data["CREATE_TIME_"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        re_data["CREATE_BY_ID_"] = CREATE_ID
        re_data["CREATE_BY_NAME_"] = CREATE_NAME

        # Crawl time: DATETIME_ is used as is, otherwise DEALTIME_ is
        # formatted as a date.
        if "DATETIME_" in data:
            re_data["SPIDER_TIME_"] = data["DATETIME_"]
        elif "DEALTIME_" in data:
            re_data["SPIDER_TIME_"] = arrow.get(data["DEALTIME_"]).format("YYYY-MM-DD")

        # Status flags keep any value already chosen by the caller.
        re_data.setdefault("M_STATUS_", "N")
        re_data.setdefault("DELETE_STATUS_", "N")
        re_data.setdefault("DATA_STATUS_", "UNCHECK")
        if "MICROBLOG" not in re_data[
                "ENTITY_CODE_"] and "PUBLISH_STATUS_" not in re_data:
            re_data["PUBLISH_STATUS_"] = "N"

        return re_data

    def _match_bank(self, data, re_data, field):
        """Fill BANK_NAME_ / BANK_CODE_ of re_data from data[field]."""
        if "ZX" in data.get("ENTITY_CODE_", "")[:2]:
            # "ZX" entities: ask the NER model for an organisation and map it
            # to the first bank whose aliases contain it.
            if field not in data:
                return
            try:
                result = req_for_ner(data[field])
            except Exception as e:
                self.logger.exception(
                    f"2.2--err: 请求模型 req_for_ner 错误."
                    f" 原始数据 collection = {self.m_client.mongo_collection};"
                    f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                    f" 原始数据 _id = {data['_id']};"
                    f" error: {e}.")
                return
            if result and "Organ" in result and result["Organ"].get("entity", ""):
                organ = result["Organ"]["entity"]
                for each in self.bank_list:
                    if organ in each["ALIAS_"]:
                        re_data["BANK_NAME_"] = each["NAME_"]
                        re_data["BANK_CODE_"] = each["CODE_"]
                        break
            return

        # Other entities: every bank whose name occurs in the field is kept,
        # joined with "|".
        text = data.get(field, "")
        matched = [each for each in self.bank_list if each["NAME_"] in text]
        if matched:
            re_data["BANK_NAME_"] = "|".join(each["NAME_"] for each in matched)
            re_data["BANK_CODE_"] = "|".join(each["CODE_"] for each in matched)

    def _attachment_type_code(self, data):
        """Return the type-code infix for file-service uploads, or None when no rule matches."""
        if "ENTITY_CODE_" in data:
            entity_code = data["ENTITY_CODE_"]
            if entity_code[:2] == "ZX":
                return "NEWS"
            if "WECHAT" in entity_code:
                return "WECHAT"
            if "JRCP_BX" in entity_code:
                return "INSURANCE"
            if "JRCP_LCCP" in entity_code:
                return "LCCP"
        elif "BANK_CODE_" in data:
            if "MICROBLOG" in data["BANK_CODE_"]:
                return "WEIBOBASIC"
        return None

    def _pdf_name_from_url(self, url):
        """Derive an upload file name from a PDF URL, falling back to a UUID."""
        if ".PDF" in url or ".pdf" in url:
            # NOTE(review): the pattern captures everything after the first
            # "/" of the URL (not only the base name) and its "." is
            # unescaped; kept unchanged so stored names stay the same.
            file_name = re.findall(r"/(.*?).pdf", url, re.IGNORECASE)
            if file_name:
                return file_name[0]
        return str(uuid.uuid1())

    def _log_attachment_error(self, data, error):
        """Log the "2.1" attachment error line for this record."""
        self.logger.exception(
            f"2.1--err: PDF"
            f" 原始数据 collection = {self.m_client.mongo_collection};"
            f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
            f" 原始数据 _id = {data['_id']};"
            f"error: {error}.")

    def _log_upload_rejected(self, data, body):
        """Log the "2.3" line for an upload response whose body contains "error"."""
        self.logger.info(
            f"2.3--err:文件上传错误."
            f" 原始数据collection={self.m_client.mongo_collection};"
            f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
            f" 原始数据 _id = {data['_id']};"
            f" error: {body}."
        )

    def _store_html(self, data, re_data, html_name, tc):
        """Upload data["HTML_"] to the file service; failures are only logged."""
        try:
            if tc is None:
                raise ValueError("no attachment type code for this record")
            response_file = req_for_file_save(
                id=re_data["ID_"],
                type_code=f"CHA_{tc}_HTML",
                file_name=html_name,
                postfix="html",
                file=data["HTML_"].encode("utf-8"))
            try:
                body = response_file.content.decode("utf-8")
                if "error" in body:
                    self._log_upload_rejected(data, body)
                    raise Exception(f"附件上传错误{body}")
            finally:
                # Closed on the error path as well (the original skipped it).
                response_file.close()
        except Exception as e:
            self._log_attachment_error(data, e)

    def _fetch_and_store_pdf(self, data, re_data, url, pdf_name, tc):
        """Download url and upload the content as a PDF; failures are only logged."""
        try:
            response = req_for_something(url=url)
        except Exception as e:
            self._log_attachment_error(data, e)
            return
        if not response:
            self._log_attachment_error(data, "PDF 请求失败")
            return
        try:
            if tc is None:
                raise ValueError("no attachment type code for this record")
            # todo: decide whether an upload error should stop or be skipped
            p_response = req_for_file_save(
                id=re_data["ID_"],
                type_code=f"CHA_{tc}_PDF",
                file_name=pdf_name,
                postfix="pdf",
                file=response.content)
            try:
                body = p_response.content.decode("utf-8")
                if "error" in body:
                    self._log_upload_rejected(data, body)
            finally:
                p_response.close()
        except Exception as e:
            self._log_attachment_error(data, e)
        finally:
            response.close()