def data_shuffle(data):
    """Normalize the PDF_ link and derive RISK_LEVEL_CODE_ from a Chinese
    risk-level label.

    Mutates ``data`` in place and returns it.
    """
    risk_codes = {
        "低风险": "R1",
        "中低风险": "R2",
        "较低风险": "R2",
        "中等风险": "R3",
        "中高风险": "R4",
        "高风险": "R5",
    }
    if "PDF_" in data:
        link = data["PDF_"]
        if ".html" in link:
            # the "PDF" is really an HTML page — fetch and store the markup
            data["HTML_"] = req_for_something(url=link).content.decode("UTF-8")
            del data["PDF_"]
        elif "http" not in link:
            # not a usable absolute URL — drop it
            del data["PDF_"]
    # RISK_LEVEL_ takes precedence; SOURCE_RISK_LEVEL_ is only a fallback
    if "RISK_LEVEL_" in data:
        label = data["RISK_LEVEL_"]
    elif "SOURCE_RISK_LEVEL_" in data:
        label = data["SOURCE_RISK_LEVEL_"]
    else:
        label = None
    code = risk_codes.get(label)
    if code is not None:
        data["RISK_LEVEL_CODE_"] = code
    return data
def data_shuffle(data):
    """Clean wealth-product fields: raise-start date, yield range, minimum
    purchase amount, risk-level code and the real PDF url.

    Mutates ``data`` in place and returns it.
    """
    if "RAISE_START_" in data:
        # keep only digits and dashes in the raise-start date
        data["RAISE_START_"] = re.sub(r"[^\d-]", "", data["RAISE_START_"])
    if "YIELD_HIGH_" in data and "-" in data["YIELD_HIGH_"]:
        # "a-b" ranges: first part is the high yield, second the low one
        yield_parts = data["YIELD_HIGH_"].split("-")
        data["YIELD_HIGH_"] = yield_parts[0]
        data["YIELD_LOW_"] = yield_parts[1]
    # minimum purchase amount: "W" (万, ten-thousand) → four zeros
    if "START_FUNDS_" in data:
        data["START_FUNDS_"] = data["START_FUNDS_"].replace("W", "0000")
    # risk level: first matching keyword wins (order matters)
    if "SOURCE_RISK_LEVEL_" in data:
        for keyword, code in (("很低", "R1"), ("较低", "R2"), ("中低", "R3"),
                              ("中高", "R4"), ("较高", "R4"), ("很高", "R5")):
            if keyword in data["SOURCE_RISK_LEVEL_"]:
                data["RISK_LEVEL_CODE_"] = code
                break
    # PDF: the linked page embeds the real filename in a JS variable
    if data.get("PDF_"):
        response = req_for_something(url=data["PDF_"])
        pdf_url = re.findall(r"pdf_filename = \"(.*)\";",
                             response.content.decode("gbk"))
        if pdf_url:
            data["PDF_"] = pdf_url[0]
    return data
def generic_shuffle(self, data, re_data, field="CONTENT_"):
    """Download the FJ1..FJ9 attachments of ``data`` and record their stored
    file names.

    :param data: raw record; may contain ``FJ{n}_NAME_`` / ``FJ{n}_URL_`` pairs.
    :param re_data: ignored — the result is rebuilt from a deep copy of
        ``data``.  NOTE(review): confirm callers rely on this clobbering.
    :param field: unused here; kept for interface compatibility.
    :return: deep copy of ``data`` with ``FILE_NAME_{n}_`` entries added.
    """

    def _strip_ext(name):
        # Strip known document extensions from the display name.
        # BUG FIX: '.docx' must be removed before '.doc' — the original
        # on-disk file_name branch replaced '.doc' first, leaving a stray
        # 'x' and diverging from the recorded FILE_NAME_{n}_ value.
        for ext in ('.xlsx', '.xls', '.docx', '.doc', '.zip', '.pdf', '.PDF'):
            name = name.replace(ext, '')
        return name

    re_data = deepcopy(data)
    # attachment slots FJ1_ .. FJ9_
    for idx in range(1, 10):
        name_key = f"FJ{idx}_NAME_"
        url_key = f"FJ{idx}_URL_"
        if name_key in data and data.get(url_key):
            file_type = (find_type(data.get(url_key))
                         or find_type(data.get(name_key)))
            if not file_type:
                # NOTE(review): bails out entirely on the first attachment
                # with an unknown type, skipping later ones — confirm intended.
                return re_data
            try:
                response = req_for_something(url=data[url_key])
            except Exception:
                self.logger.exception('文件获取出错')
            else:
                if response:
                    try:
                        # todo 文件上传出错是否继续还是跳过
                        number = 3932
                        serial_number = req_for_serial_number(
                            code="GOV_ZX_GDS")
                        # one shared name for both the disk file and the
                        # recorded FILE_NAME_{n}_ (previously built twice
                        # with inconsistent extension stripping)
                        base_name = (str(int(serial_number[5:13]) - number)
                                     + '-'
                                     + _strip_ext(data.get(name_key))
                                     + file_type)
                        re_data[f'FILE_NAME_{idx}_'] = base_name
                        with open(src_dir + base_name, 'wb+') as fp:
                            fp.write(response.content)
                        print('保存文件成功', ' ', re_data[f'FILE_NAME_{idx}_'])
                    except Exception as e:
                        self.logger.exception(
                            f"2.1--err: PDF"
                            f" 原始数据 collection = {self.m_client.mongo_collection};"
                            f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                            f"error: {e}.")
                    finally:
                        response.close()
    return re_data
def data_shuffle(data):
    """For PDF_{i}_ links that actually point at an HTML page, fetch the page
    and replace the link with the embedded ``.pdf`` path.

    Keys ``PDF_0_`` .. ``PDF_9_`` are considered; missing keys are skipped
    explicitly instead of being driven through KeyError as the original did.
    Any network/parse failure leaves the original link untouched (best effort).
    Mutates ``data`` in place and returns it.
    """
    if "PDF_1_" in data:
        for i in range(10):
            link = data.get(f"PDF_{i}_")
            # only rewrite string links that look like HTML pages
            if not isinstance(link, str) or (
                    ".HTM" not in link and ".htm" not in link):
                continue
            try:
                response = req_for_something(url=link)
                if response:
                    # site prefix (scheme://host/.../) of the HTML page
                    base = re.findall(r"https?://.*/", link)[0]
                    pdf_paths = re.findall(r"/\w+\.pdf",
                                           response.content.decode("utf-8"))
                    if pdf_paths:
                        data[f"PDF_{i}_"] = base[:-1] + pdf_paths[0]
            except Exception:
                # best effort: keep the original link on any failure
                continue
    return data
def __shuffle(self, data):
    """Stamp bookkeeping fields onto a raw CRM_JJK record and inline its image.

    Assigns a fresh serial number, creation metadata, default workflow flags
    and PERIOD_CODE_, derives SOURCE_ from URL_, and replaces the raw "IMG"
    url with a base64-encoded "IMG_" payload.  Mutates and returns ``data``.
    """
    serial_number = req_for_serial_number(code="CRM_JJK")
    data["ID_"] = serial_number
    # creation time and operator
    time_array = time.localtime()
    create_time = time.strftime("%Y-%m-%d %H:%M:%S", time_array)
    data["CREATE_TIME_"] = create_time
    data["CREATE_BY_ID_"] = CREATE_ID
    data["CREATE_BY_NAME_"] = CREATE_NAME
    # default workflow flags for a newly imported record
    data["M_STATUS_"] = "N"
    data["DELETE_STATUS_"] = "N"
    data["DATA_STATUS_"] = "UNCHECK"
    data["PUBLISH_STATUS_"] = "N"
    data["HOT_"] = "0"
    # yyyyMMdd period code from the crawl date
    data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace("-", "")
    # scheme://host prefix of the page url
    source = re.findall(r"(https?://.*?)/", data["URL_"])
    if source:
        data["SOURCE_"] = source[0]
    data["SOURCE_NAME_"] = data["ENTITY_NAME_"]
    # download the image and store it base64-encoded under IMG_
    if "IMG" in data and data["IMG"]:
        try:
            response = req_for_something(url=data["IMG"])
        except Exception as e:
            self.logger.exception(f"2.1--err: IMG"
                                  f" 原始数据 collection = {self.m_client.mongo_collection};"
                                  f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                                  f"error: {e}.")
        else:
            if response:
                content = response.content
                encode_data = base64.b64encode(content)
                data["IMG_"] = encode_data.decode("utf-8")
                response.close()
            else:
                data["IMG_"] = ""
        # NOTE(review): the raw url key is dropped even when the request
        # raised and no IMG_ was produced — confirm this is intended
        del data["IMG"]
    del data["DATETIME_"]
    return data
def data_shuffle(data):
    """Fetch HTML "PDF" links, extract the product registration code, and map
    the Chinese risk-level label to RISK_LEVEL_CODE_.

    Requires ``data["PDF_"]`` to be present.  Mutates and returns ``data``.
    """
    # BUG FIX: the original tested data["PDF_"][:-4] == "HTML" (everything
    # but the last four chars) instead of the suffix, so uppercase .HTML
    # links were effectively never detected.
    if data["PDF_"].endswith(("html", "HTML")):
        response = req_for_something(url=data["PDF_"])
        data["HTML_"] = response.content.decode("gbk")
        data["HTML_NAME_"] = data["PDF_NAME_"]
        # registration code: either C + 13 digits, or two C-prefixed digit
        # runs collapsed back into a single C-code
        regist_code = re.findall(r"C\d{13}", data["HTML_"])
        if regist_code:
            data["REGIST_CODE_"] = regist_code[0]
        else:
            regist_code = re.findall(r"C\d+C\d+", data["HTML_"])
            if regist_code:
                data["REGIST_CODE_"] = "".join(
                    ["C", regist_code[0].replace("C", "")])
        del data["PDF_"]
    risk_codes = {
        "低风险": "R1",
        "中低风险": "R2",
        "较低风险": "R2",
        "中等风险": "R3",
        "中高风险": "R4",
        "高风险": "R5",
    }
    # RISK_LEVEL_ takes precedence over SOURCE_RISK_LEVEL_
    if "RISK_LEVEL_" in data:
        code = risk_codes.get(data["RISK_LEVEL_"])
        if code:
            data["RISK_LEVEL_CODE_"] = code
    elif "SOURCE_RISK_LEVEL_" in data:
        code = risk_codes.get(data["SOURCE_RISK_LEVEL_"])
        if code:
            data["RISK_LEVEL_CODE_"] = code
    return data
def data_shuffle(data):
    """Expand an insurance-product Excel attachment into one record per row.

    Downloads ``data["excel"]``, opens the first sheet, treats row index 2 as
    the header row, and from row 3 on builds one dict per data row — each a
    copy of ``data`` plus COM_NAME_ / PRO_NAME_ / ENSURE_SOURCE_TYPE_ taken
    from the sheet.  A blank "保险公司" cell inherits the company of the
    previous row (merged-cell style).

    NOTE(review): ``data_list`` is built but never returned — the function
    falls off the end and yields None.  Confirm whether a trailing
    ``return data_list`` was lost.
    """
    if data.get("excel"):
        data_list = []
        response = req_for_something(url=data["excel"])
        work_book = read_excel(response.content)
        # first sheet only
        sheet_name = work_book.sheet_names()[0]
        sheet = work_book.sheet_by_name(sheet_name)
        # carries the company name across merged/blank cells
        com_name_ = ""
        # header row
        row_list = sheet.row_values(2)
        for n in range(3, sheet.nrows):
            data_item = {}
            for k, v in data.items():
                data_item[k] = v
            rows1 = sheet.row_values(n)
            sheet_dict = dict(zip(row_list, rows1))
            if sheet_dict["保险公司"]:
                com_name_ = sheet_dict["保险公司"]
            else:
                sheet_dict["保险公司"] = com_name_
            data_item["COM_NAME_"] = sheet_dict["保险公司"]
            data_item["PRO_NAME_"] = sheet_dict["保险产品名称"]
            data_item["ENSURE_SOURCE_TYPE_"] = sheet_dict["产品类型"]
            data_list.append(data_item)
def generic_shuffle(self, data, field="CONTENT_"):
    """Clean one raw news record into the ``re_data`` payload for persistence.

    Normalizes PUBLISH_TIME_ into ``YYYY-MM-DD``, derives source fields from
    URL_/ENTITY_*, sanitizes CONTENT_/CONTENT_HTML_, inlines the image as
    base64, calls the NLP models (summary, sentiment, censor, hotness,
    location, credit-card relevance), then applies the shared base-class
    rules.

    :param data: raw crawled record (mutated in place).
    :param field: kept for the base-class interface; forwarded to ``super``.
    :return: ``[{"TABLE_NAME_": ..., "DATA_": re_data}]`` or ``None`` when
        PUBLISH_TIME_ is missing.
    """
    # different shuffle rule
    re_data = dict()
    if "PUBLISH_TIME_" not in data:
        return None
    # --- publish-time normalization ------------------------------------
    if re.findall(r"\d{4}-\d{1,2}-\d{1,2}", data["PUBLISH_TIME_"]):
        pass  # already dash-separated
    elif re.findall(r"\d{4}年\d{1,2}月\d{1,2}日", data["PUBLISH_TIME_"]):
        data["PUBLISH_TIME_"] = data["PUBLISH_TIME_"].replace("年", "-")
        data["PUBLISH_TIME_"] = data["PUBLISH_TIME_"].replace("月", "-")
        data["PUBLISH_TIME_"] = data["PUBLISH_TIME_"].replace("日", "")
    else:
        if ("年" in data["PUBLISH_TIME_"]) and ("月" in data["PUBLISH_TIME_"]) and ("二" in data["PUBLISH_TIME_"]):
            # Chinese-numeral dates (e.g. 二〇二〇年…) — map char by char
            # via self.number_dict
            format_list = list()
            for i in data["PUBLISH_TIME_"][:10]:
                format_list.append(self.number_dict[i])
            data["PUBLISH_TIME_"] = "".join(format_list)
            # no other cases handled yet
            # elif
        else:
            # fall back to a |…| delimited date inside the content body
            find_time = re.findall(r"\|(\w{4}[-年]\w{1,2}[-月]\w{1,2})日?\W?\|", data["CONTENT_"])
            if find_time:
                if "二" in find_time[0]:
                    format_list = list()
                    for i in find_time[0]:
                        format_list.append(self.number_dict[i])
                    data["PUBLISH_TIME_"] = "".join(format_list)
                else:
                    data["PUBLISH_TIME_"] = find_time[0]
                data["PUBLISH_TIME_"] = data["PUBLISH_TIME_"].replace("年", "-")
                data["PUBLISH_TIME_"] = data["PUBLISH_TIME_"].replace("月", "-")
                data["PUBLISH_TIME_"] = data["PUBLISH_TIME_"].replace("日", "")
            else:
                data["PUBLISH_TIME_"] = ""
    # pad month/day to two digits, truncate over-long parts
    if data["PUBLISH_TIME_"]:
        shuffle_list = data["PUBLISH_TIME_"].split("-")
        shuffle_list[0] = shuffle_list[0][:4]
        if len(shuffle_list[1]) == 2:
            pass
        elif len(shuffle_list[1]) == 1:
            shuffle_list[1] = "0" + shuffle_list[1]
        elif len(shuffle_list[1]) > 2:
            shuffle_list[1] = shuffle_list[1][:2]
        if len(shuffle_list[2]) == 2:
            pass
        elif len(shuffle_list[2]) == 1:
            shuffle_list[2] = "0" + shuffle_list[2]
        elif len(shuffle_list[2]) > 2:
            shuffle_list[2] = shuffle_list[2][:2]
        data["PUBLISH_TIME_"] = "-".join(shuffle_list)
    re_data["PERIOD_CODE_"] = data["PUBLISH_TIME_"].replace("-", "")
    # re_data["REMARK_"] = ""
    # tags placeholder
    if "TAGS_" in data:
        re_data["TAGS_"] = ""
    # data source: scheme://host prefix of the page URL
    source = re.findall(r"(https?://.*?)/", data["URL_"])
    re_data["SOURCE_"] = source[0]
    # data source: site name
    re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]
    # data source code: entity code up to the last underscore
    s_index = data["ENTITY_CODE_"].rfind("_")
    re_data["SOURCE_CODE_"] = data["ENTITY_CODE_"][:s_index]
    # news source category
    re_data["SOURCE_TYPE_"] = data["ENTITY_CODE_"][3:7]
    re_data["PUBLISH_TIME_"] = data["PUBLISH_TIME_"]
    re_data["TITLE_"] = data["TITLE_"]
    # author: strip a leading "编辑:" (editor) label if present
    if "AUTHOR_" in data:
        if "编辑" in data["AUTHOR_"]:
            re_data["AUTHOR_"] = re.findall(r"编辑[::](\w+)", data["AUTHOR_"])[0]
        else:
            re_data["AUTHOR_"] = data["AUTHOR_"]
    re_data["IMPORTANCE_"] = "N"
    # read count
    if "READ_" in data:
        re_data["READS_"] = data["READ_"]
    else:
        re_data["READS_"] = 0
    # like count
    if "LIKES_" in data:
        re_data["LIKES_"] = data["LIKES_"]
    else:
        re_data["LIKES_"] = 0
    # comment count
    if "COMMENTS_" in data:
        re_data["COMMENTS_"] = data["COMMENTS_"]
    elif "COMMENT_" in data:
        re_data["COMMENTS_"] = data["COMMENT_"]
    else:
        re_data["COMMENTS_"] = 0
    # participation count
    if "JOINS_" in data:
        re_data["JOINS_"] = data["JOINS_"]
    elif "JOIN_" in data:
        re_data["JOINS_"] = data["JOIN_"]
    else:
        re_data["JOINS_"] = 0
    # content: strip leaked "var …;|" JS fragments
    re_data["CONTENT_"] = re.sub(r"(var.*?;\|)(?![a-zA-Z])", "", data["CONTENT_"])
    # HTML body: neutralize all links
    re_data['CONTENT_HTML_'] = data["HTML_"]
    data["CONTENT_HTML_"] = data["HTML_"]
    re_data["CONTENT_HTML_"] = re.sub(r"href=\".*?\"", "href=\"javaScript:void(0);\"", re_data["CONTENT_HTML_"])
    # strip anti-scraping "your IP" interstitial markup if present
    if '28857' in re_data['CONTENT_HTML_'] or '您的IP' in re_data['CONTENT_HTML_']:
        try:
            soup = BeautifulSoup(re_data['CONTENT_HTML_'])
            soup.find('div', attrs={'class': 'online-desc-con'}).decompose()
            soup.find_all('script')[0].decompose()
            re_data['CONTENT_HTML_'] = soup.prettify()
        except Exception as e:
            self.logger.exception(f'IP检测内容清除出错')
    # TODO del data["HTML_] is wrong
    del data["HTML_"]
    re_data["CONTENT_"] = re_data["CONTENT_"].replace("|", "")
    re_data["TITLE_"] = re_data["TITLE_"].replace("|", "")
    # marketing-activity flag
    re_data["ACT_"] = "N"
    # version
    re_data["VERSION_"] = "0"
    # inline the article image as base64 (best effort)
    if "IMAGE_" in data:
        try:
            response = req_for_something(url=data["IMAGE_"])
            if response:
                t = base64.b64encode(response.content)
                data["IMAGE_"] = t.decode("utf-8")
                response.close()
        except Exception:
            pass
    # --- call the NLP models -------------------------------------------
    # summary
    try:
        brief = req_for_ts(re_data["CONTENT_"][0:1000])
    except Exception as e:
        self.logger.exception(f"2.2--err: 请求模型 req_for_ts 错误."
                              f" 原始数据 collection = {self.m_client.mongo_collection};"
                              f" ENTITY_CODE_ = {self.entity_code};"
                              f" 原始数据 _id = {data['_id']};"
                              f" error: {e}.")
    else:
        if brief:
            re_data["BRIEF_"] = brief["summary"]
        else:
            re_data["BRIEF_"] = '暂无摘要'
    # sentiment analysis
    try:
        sentiment = req_for_senti(re_data["TITLE_"])
    except Exception as e:
        self.logger.exception(f"2.2--err: 请求模型 req_for_senti 错误."
                              f" 原始数据 collection = {self.m_client.mongo_collection};"
                              f" ENTITY_CODE_ = {self.entity_code};"
                              f" 原始数据 _id = {data['_id']};"
                              f" error: {e}.")
    else:
        if sentiment:
            if sentiment["sentiment"] == "中性":
                re_data["EMOTION_"] = "NORMAL"
            if sentiment["sentiment"] == "正面":
                re_data["EMOTION_"] = "POSITIVE"
            if sentiment["sentiment"] == "敏感":
                re_data["EMOTION_"] = "NAGETIVE"
    # sensitivity check
    try:
        censor = req_for_censor(re_data["CONTENT_"])
    except Exception as e:
        self.logger.exception(f"2.2--err: 请求模型 req_for_censor 错误."
                              f" 原始数据 collection = {self.m_client.mongo_collection};"
                              f" ENTITY_CODE_ = {self.entity_code};"
                              f" 原始数据 _id = {data['_id']};"
                              f" error: {e}.")
    else:
        if censor:
            if censor["censor"] == "N":
                re_data["SENSITIVE_"] = "N"
            else:
                re_data["SENSITIVE_"] = "Y"
                re_data["SENSITIVE_WORD_"] = censor["words"]
    # hotness
    try:
        hot = req_for_news_hot(title=re_data["TITLE_"], content=re_data["CONTENT_"][0:1000])
    except Exception as e:
        self.logger.exception(f"2.2--err: 请求模型 req_for_news_hot 错误."
                              f" 原始数据 collection = {self.m_client.mongo_collection};"
                              f" ENTITY_CODE_ = {self.entity_code};"
                              f" 原始数据 _id = {data['_id']};"
                              f" error: {e}.")
    else:
        if hot:
            re_data["HOT_"] = hot["level"]
    # address analysis: resolve a location from the content, then geocode
    try:
        res = req_for_textLoc(text=re_data["CONTENT_"])
    except Exception as e:
        self.logger.exception(f"2.2--err: 请求模型 req_for_textLoc 错误."
                              f" 原始数据 collection = {self.m_client.mongo_collection};"
                              f" ENTITY_CODE_ = {self.entity_code};"
                              f" 原始数据 _id = {data['_id']};"
                              f" error: {e}.")
    else:
        if "error" not in res:
            if res["tagsId"] == "None" or res["tagsId"] is None:
                pass
            else:
                re_data["TAGS_"] = res["tagsId"]
            # flag == 1 means the model returned a full address
            if res["flag"] == 1:
                address = res["full"]
            else:
                address = res["addr"]
            try:
                lat_result = get_lat_lng(address=address)
                re_data["LAT_"] = lat_result["result"]["location"]["lat"]
                re_data["LNG_"] = lat_result["result"]["location"]["lng"]
            except KeyError:
                re_data["LAT_"] = None
                re_data["LNG_"] = None
            except Exception as e:
                self.logger.info(f"获取经纬度失败, ERROR: {e}")
                re_data["LAT_"] = None
                re_data["LNG_"] = None
            # reverse-geocode to district/city/province codes
            if re_data["LAT_"]:
                try:
                    area_result = get_area(",".join([str(re_data["LAT_"]), str(re_data["LNG_"])]))
                except Exception as e:
                    self.logger.info(f"获取地址失败, ERROR: {e}")
                else:
                    try:
                        re_data["AREA_NAME_"] = area_result["result"]["addressComponent"]["district"]
                    except KeyError:
                        re_data["AREA_NAME_"] = ""
                    try:
                        re_data["AREA_CODE_"] = area_result["result"]["addressComponent"]["adcode"]
                    except KeyError:
                        re_data["AREA_CODE_"] = ""
                    else:
                        re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
                        re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
                        for city in self.city_list:
                            if city["CODE_"] == re_data["CITY_CODE_"]:
                                re_data["CITY_NAME_"] = city["NAME_"]
                                break
                        for prov in self.province_list:
                            if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                                re_data["PROVINCE_NAME_"] = prov["NAME_"]
                                break
    # credit-card relevance
    try:
        res = req_for_credit_relative(text=re_data["CONTENT_"])
    except Exception as e:
        self.logger.exception(f"2.2--err: 请求模型 req_for_credit_relative 错误."
                              f" 原始数据 collection = {self.m_client.mongo_collection};"
                              f" ENTITY_CODE_ = {self.entity_code};"
                              f" 原始数据 _id = {data['_id']};"
                              f" error: {e}.")
    else:
        if res["creditrelative"]:
            re_data["MODULE_TYPE_"] = "CREDITCARD"
    # bank name / code
    if "BANK_NAME_" in data:
        re_data["BANK_NAME_"] = data["BANK_NAME_"]
    if "BANK_CODE_" in data:
        re_data["BANK_CODE_"] = data["BANK_CODE_"]
    re_data = super(BranchNews, self).generic_shuffle(data=data, re_data=re_data, field="CONTENT_")
    # published directly by the treasury channel
    re_data['DATA_STATUS_'] = 'CHECK'
    # publish flag
    if not re_data.get("PUBLISH_TIME_"):
        re_data["PUBLISH_STATUS_"] = "N"
    else:
        re_data["PUBLISH_STATUS_"] = "Y"
    return [{"TABLE_NAME_": self.p_client.table_name, "DATA_": re_data}]
def generic_shuffle(self, data):
    """Clean one credit-card product record, mostly by scraping labelled
    fields out of the pipe-delimited CONTENT_ text.

    Normalizes bank names, card name/currency/brand/level, then extracts and
    de-duplicates grace period, fee policy, cash-advance limit, bill date,
    points rules, interest and minimum-repay fields, inlines the card image
    as base64, and applies the shared base-class rules.

    :param data: raw crawled record (mutated in place).
    :return: ``[{"TABLE_NAME_": self.script_name, "DATA_": re_data}]``
    """
    re_data = dict()
    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["URL_"] = data["URL_"]
    serial_number = req_for_serial_number(code="JRCP_XYK")
    re_data["ID_"] = serial_number
    # time dimension: yyyyMMdd from the crawl date
    re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace("-", "")
    source = re.findall(r"(https?://.*?)/", data["URL_"])
    re_data["SOURCE_"] = source[0]
    re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"]
    re_data["SOURCE_TYPE_"] = "WAK"
    # special handling of WeChat-style BANK_NAME: match the first two chars
    # of each known alias against the product name
    for key, value in self.name_dict.items():
        if key[:2] in data["PRO_NAME_"]:
            re_data["BANK_NAME_"] = key
            re_data["BANK_CODE_"] = value
            break
    # expand bank abbreviations to full official names
    if "BANK_NAME_" in re_data:
        if re_data["BANK_NAME_"] == "建信":
            re_data["BANK_NAME_"] = "中国建设银行"
        if re_data["BANK_NAME_"] == "建行":
            re_data["BANK_NAME_"] = "中国建设银行"
        if re_data["BANK_NAME_"] == "建设银行":
            re_data["BANK_NAME_"] = "中国建设银行"
        if re_data["BANK_NAME_"] == "农行":
            re_data["BANK_NAME_"] = "中国农业银行"
        if re_data["BANK_NAME_"] == "农业银行":
            re_data["BANK_NAME_"] = "中国农业银行"
        if re_data["BANK_NAME_"] == "工行":
            re_data["BANK_NAME_"] = "中国工商银行"
        if re_data["BANK_NAME_"] == "工商银行":
            re_data["BANK_NAME_"] = "中国工商银行"
        if re_data["BANK_NAME_"] == "民生银行":
            re_data["BANK_NAME_"] = "中国民生银行"
        if re_data["BANK_NAME_"] == "光大银行":
            re_data["BANK_NAME_"] = "中国光大银行"
        if re_data["BANK_NAME_"] == "交行":
            re_data["BANK_NAME_"] = "交通银行"
        if re_data["BANK_NAME_"] == "招行":
            re_data["BANK_NAME_"] = "招商银行"
        if re_data["BANK_NAME_"] == "农行":
            re_data["BANK_NAME_"] = "中国农业银行"
        if re_data["BANK_NAME_"] == "中行":
            re_data["BANK_NAME_"] = "中国银行"
        if re_data["BANK_NAME_"] == "中银":
            re_data["BANK_NAME_"] = "中国银行"
        if re_data["BANK_NAME_"] == "邮储银行":
            re_data["BANK_NAME_"] = "中国邮政储蓄银行"
    # card name: cut everything after an opening parenthesis
    # (half-width first, then full-width)
    if "PRO_NAME_" in data:
        if "(" in data["PRO_NAME_"]:
            data["PRO_NAME_"] = data["PRO_NAME_"][:data["PRO_NAME_"].
                                                  find("(")]
        elif "(" in data["PRO_NAME_"]:
            data["PRO_NAME_"] = data["PRO_NAME_"][:data["PRO_NAME_"].
                                                  find("(")]
    re_data["PRO_NAME_"] = data["PRO_NAME_"]
    # card currency
    if "CURRENCY_TYPE_" in data:
        re_data["CURRENCY_TYPE_"] = data["CURRENCY_TYPE_"]
        # currency type code: RMB, or DBZ for dual/foreign currency
        if data["CURRENCY_TYPE_"] == "人民币":
            re_data["CURRENCY_TYPE_CODE_"] = "RMB"
        if re.match(r"人民币/.*?", data["CURRENCY_TYPE_"]):
            re_data["CURRENCY_TYPE_CODE_"] = "DBZ"
        if data["CURRENCY_TYPE_"] == "美元":
            re_data["CURRENCY_TYPE_CODE_"] = "DBZ"
    # card organisation / settlement channel
    if "BRAND_" in data:
        re_data["BRAND_"] = data["BRAND_"]
        # card organisation CODE
        for brand_key in self.brand_dict:
            if brand_key in data["BRAND_"]:
                re_data["BRAND_CODE_"] = self.brand_dict[brand_key]
                break
    # card level
    if "LEVEL_" in data:
        re_data["LEVEL_"] = data["LEVEL_"]
        # card level CODE
        for level_key in self.level_dict:
            if level_key[:2] in data["LEVEL_"][:2]:
                re_data["LEVEL_CODE_"] = self.level_dict[level_key]
                break
    # cash-withdrawal limit
    if "CONSUME_LIMIT_" in data:
        re_data["CONSUME_LIMIT_"] = data["CONSUME_LIMIT_"]
    # --- from here on, fields are scraped out of the big CONTENT_ text ---
    # interest-free (grace) period
    GRACE_PERIODS_ = re.findall(r".*?免息期[::]\|(.*?)\|", data["CONTENT_"])
    if len(GRACE_PERIODS_) > 0:
        GRACE_PERIODS_ = GRACE_PERIODS_[0]
        # fix garbled "到20天50天" data
        pattern = re.compile(r"到(\d+)天(\d+)天")
        if re.match(pattern, GRACE_PERIODS_):
            GRACE_PERIODS_ = pattern.sub(r"\1天到\2天", GRACE_PERIODS_)
        # a neighbouring label leaked into the value — blank it out
        if GRACE_PERIODS_ == "消费验证方式:":
            GRACE_PERIODS_ = ""
        if GRACE_PERIODS_ == "预借现金额度:" or GRACE_PERIODS_ == "预借现金额度:":
            GRACE_PERIODS_ = ""
        # de-duplicate repeated "N天" fragments
        if re.match(r"最长\d+天最长\d+天", GRACE_PERIODS_):
            a = re.match(r"(最长\d+天)最长\d+天", GRACE_PERIODS_)
            GRACE_PERIODS_ = a.group(1)
        if re.match(r"\d+天到\d+天\d+天到\d+天", GRACE_PERIODS_):
            a = re.match(r"(\d+天)到(\d+天)(\d+天)到\d+天", GRACE_PERIODS_)
            if a.group(1) == a.group(2):
                GRACE_PERIODS_ = a.group(1) + "到" + a.group(3)
            else:
                GRACE_PERIODS_ = a.group(1) + "到" + a.group(2)
        if re.match(r"\d+天\d+天\d+天\d+天", GRACE_PERIODS_):
            a = re.match(r"(\d+天)\d+天(\d+天)\d+天", GRACE_PERIODS_)
            GRACE_PERIODS_ = a.group(1) + "到" + a.group(2)
        if re.match(r"\d+天\d+天", GRACE_PERIODS_):
            a = re.match(r"(\d+天)(\d+天)", GRACE_PERIODS_)
            GRACE_PERIODS_ = a.group(1) + "到" + a.group(2)
        if re.match(r"至\d+天\d+天", GRACE_PERIODS_):
            a = re.match(r"至(\d+天)(\d+天)", GRACE_PERIODS_)
            GRACE_PERIODS_ = a.group(1) + "到" + a.group(2)
        re_data["GRACE_PERIODS_"] = GRACE_PERIODS_
    else:
        re_data["GRACE_PERIODS_"] = data["GRACE_PERIODS_"]
    # annual-fee waiver policy
    FREE_POLICY_ = re.findall(r".*?免年费政策[::]\|(.*?)\|", data["CONTENT_"])
    if len(FREE_POLICY_) > 0:
        FREE_POLICY_ = FREE_POLICY_[0]
        # remove duplicated fragments
        pattern = re.compile(r"(免\d+年年费){2,9}")
        if re.match(pattern, FREE_POLICY_):
            a = re.match(pattern, FREE_POLICY_)
            FREE_POLICY_ = a.group(1)
        pattern = re.compile(r"(终身免年费){2,9}")
        if re.match(pattern, FREE_POLICY_):
            a = re.match(pattern, FREE_POLICY_)
            FREE_POLICY_ = a.group(1)
        re_data["FREE_POLICY_"] = FREE_POLICY_
    # primary-card annual fee: first number found, else "" / default "0"
    FEE_ = re.findall(r".*?主卡年费[::]\|(.*?)\|", data["CONTENT_"])
    if len(FEE_) > 0:
        FEE_ = FEE_[0]
        tempfee = re.findall(r".*?(\d+).*?", FEE_)
        if len(tempfee) > 0:
            re_data["FEE_"] = tempfee[0]
        else:
            re_data["FEE_"] = ""
    else:
        re_data["FEE_"] = "0"
    # cash-advance limit
    PRE_BORROW_ = re.findall(r".*?预借现金额度[::]\|(.*?)\|", data["CONTENT_"])
    if len(PRE_BORROW_) > 0:
        PRE_BORROW_ = PRE_BORROW_[0]
        # a neighbouring label leaked into the value — blank it out
        if PRE_BORROW_ == "免息期:":
            PRE_BORROW_ = ""
        if PRE_BORROW_ == "免年费政策:":
            PRE_BORROW_ = ""
        # remove duplicated fragments
        pattern = re.compile(r"(信用额度的\d+%)信用额度的\d+%")
        if re.match(pattern, PRE_BORROW_):
            a = re.match(pattern, PRE_BORROW_)
            PRE_BORROW_ = a.group(1)
        pattern = re.compile(r"(信用额度的\d+-\d+%)信用额度的\d+%")
        if re.match(pattern, PRE_BORROW_):
            a = re.match(pattern, PRE_BORROW_)
            PRE_BORROW_ = a.group(1)
        pattern = re.compile(r"(普卡信用额度的\d+%)白金卡信用额度的\d+%金卡信用额度的\d+%")
        if re.match(pattern, PRE_BORROW_):
            a = re.match(pattern, PRE_BORROW_)
            PRE_BORROW_ = a.group(1)
        pattern = re.compile(r"(普卡信用额度的\d+%)金卡信用额度的\d+%")
        if re.match(pattern, PRE_BORROW_):
            a = re.match(pattern, PRE_BORROW_)
            PRE_BORROW_ = a.group(1)
        pattern = re.compile(r"(白金卡信用额度的\d+%)金卡信用额度的\d+%")
        if re.match(pattern, PRE_BORROW_):
            a = re.match(pattern, PRE_BORROW_)
            PRE_BORROW_ = a.group(1)
        re_data["PRE_BORROW_"] = PRE_BORROW_
    else:
        re_data["PRE_BORROW_"] = ""
    # consumption verification method (fixed value)
    re_data["VALID_CONSUME_"] = "密码+签名 签名"
    # bill date
    BILL_DATE_ = re.findall(r".*?账单日[::]\|(.*?)\|", data["CONTENT_"])
    if len(BILL_DATE_) > 0:
        BILL_DATE_ = BILL_DATE_[0]
        # de-duplicate e.g. "账单日21号账单日21号账单日21号"
        pattern = re.compile(r"(账单日\d+号){2,9}")
        if re.match(pattern, BILL_DATE_):
            a = re.match(pattern, BILL_DATE_)
            BILL_DATE_ = a.group(1)
        re_data["BILL_DATE_"] = BILL_DATE_
    else:
        re_data["BILL_DATE_"] = ""
    # points accrual description
    POINTS_ = re.findall(r".*?积分方式[::]\|(.*?)\|", data["CONTENT_"])
    if len(POINTS_) > 0:
        POINTS_ = POINTS_[0]
        # CMB phrases by yuan; others by 分/倍 — insert separating spaces
        if re_data.get("BANK_CODE_") and re_data["BANK_CODE_"] == "CMB":
            POINTS_ = POINTS_.replace("元", "元 ")
        else:
            POINTS_ = POINTS_.replace("分", "分 ")
            POINTS_ = POINTS_.replace("倍", "倍 ")
            POINTS_ = POINTS_.replace("积分 的2倍", "积分的2倍")
        re_data["POINTS_"] = POINTS_
    else:
        re_data["POINTS_"] = ""
    # points validity period
    VALID_DATE_POINTS_ = re.findall(r".*?积分有效期[::]\|(.*?)\|", data["CONTENT_"])
    if len(VALID_DATE_POINTS_) > 0:
        VALID_DATE_POINTS_ = VALID_DATE_POINTS_[0]
        # add spaces between the per-card-level validity periods
        pattern = re.compile(r"(白金卡\d+年)(金卡\d+年)(普卡\d+年)")
        if re.match(pattern, VALID_DATE_POINTS_):
            VALID_DATE_POINTS_ = re.sub(pattern, r"\1 \2 \3", VALID_DATE_POINTS_)
        pattern = re.compile(r"(\d+年到\d+年)(\d+年)(永久有效)")
        if re.match(pattern, VALID_DATE_POINTS_):
            VALID_DATE_POINTS_ = re.sub(pattern, r"\1 \2 \3", VALID_DATE_POINTS_)
        re_data["VALID_DATE_POINTS_"] = VALID_DATE_POINTS_
    else:
        re_data["VALID_DATE_POINTS_"] = ""
    # revolving-credit daily interest
    # NOTE(review): "(日息)" is a capturing group, so findall returns
    # 2-tuples here and the ASCII parens may not match full-width ones in
    # the source text — verify this branch ever produces a string value.
    DAILY_INTEREST_ = re.findall(r".*?循环信用利息(日息)[::]?\|(.*?)\|", data["CONTENT_"])
    if len(DAILY_INTEREST_) > 0:
        DAILY_INTEREST_ = DAILY_INTEREST_[0]
        if DAILY_INTEREST_ == "消费短信通知费:":
            DAILY_INTEREST_ = ""
        re_data["DAILY_INTEREST_"] = DAILY_INTEREST_
    else:
        re_data["DAILY_INTEREST_"] = ""
    # minimum repayment
    MIN_REPAY_ = re.findall(r".*?最低还款[::]?\|(.*?)\|", data["CONTENT_"])
    if len(MIN_REPAY_) > 0:
        MIN_REPAY_ = MIN_REPAY_[0]
        if re.match(r"最低应还所欠金额的\d+%最低应还所欠金额的\d+%", MIN_REPAY_):
            a = re.match(r"(最低应还所欠金额的\d+%)最低应还所欠金额的\d+%", MIN_REPAY_)
            MIN_REPAY_ = a.group(1)
        if MIN_REPAY_ == "账单日:":
            MIN_REPAY_ = ""
        re_data["MIN_REPAY_"] = MIN_REPAY_
    else:
        re_data["MIN_REPAY_"] = ""
    # card features
    if "SPECIAL_" in data and len(data["SPECIAL_"]) > 0:
        re_data["SPECIAL_"] = data["SPECIAL_"].replace("|", "<br/>")
    # value-added services
    if "VAS_" in data and len(data["VAS_"]) > 0:
        re_data["VAS_"] = data["VAS_"].replace("|", "<br/>")
    # card image
    # repair broken "https:http://…" image URLs before downloading
    if "IMAGES_" in data:
        pattern = re.compile(r"https:(http://.*)")
        if re.match(pattern, data["IMAGES_"]):
            a = re.match(pattern, data["IMAGES_"])
            image_url = a.group(1)
        else:
            image_url = data["IMAGES_"]
        response = req_for_something(url=image_url)
        if response:
            t = base64.b64encode(response.content)
            re_data["IMAGE_"] = t.decode("utf-8")
    re_data = super(BranchXyk, self).generic_shuffle(data=data, re_data=re_data, field=None)
    # print(re_data)
    re_data["PUBLISH_TIME_"] = re_data["SPIDER_TIME_"]
    return [{"TABLE_NAME_": self.script_name, "DATA_": re_data}]
def generic_shuffle(self, data, re_data, field=None):
    """Shared cleanup rules applied after a branch-specific shuffle.

    Assigns a serial ID when missing, uploads the attachment for YJBG
    (research-report) entities, copies entity/url fields, stamps creation
    metadata and crawl time, and fills in default status flags.

    :param data: raw record to clean. type: dict
    :param re_data: partially-cleaned record to complete. type: dict
    :param field: name of the field to clean, e.g. "CONTENT_" or
        "PRO_NAME_"; ``None`` means no field cleanup. type: str | None
    :return: the completed ``re_data``. type: dict
    :raises Exception: when the attachment download or upload fails
        ("上传文件出错" / "文件请求失败").
    """
    # bank matching is handled centrally in __init_____.py
    # if field:
    #     if "BANK_NAME_" not in re_data:
    #         for bank in self.bank_list:
    #             if data["ENTITY_NAME_"][:-4] in bank["ALIAS_"]:
    #                 re_data["BACK_CODE_"] = bank["CODE_"]  # bank code
    #                 re_data["BACK_NAME_"] = bank["NAME_"]  # bank name
    #                 break
    if "ID_" not in re_data:
        # serial number keyed on the first 8 chars of the entity code
        serial_number = req_for_serial_number(
            code=data["ENTITY_CODE_"][:8])
        re_data["ID_"] = serial_number
    # file upload (research reports only)
    if "YJBG_" in data["ENTITY_CODE_"]:
        tc = "YJBG"
        if data["FILE_URL_"]:
            # matches pdf/doc/docx file extensions at the end of the url
            re_postfix = re.findall(r"\.([pd][do][fc]x?$)", data["FILE_URL_"])
            if re_postfix or data.get('ENTITY_CODE_') in [
                    'XYK_YJBG_GFYH', 'XYK_YJBG_JTYH'
            ]:
                # those two entities serve PDFs without an extension
                postfix = re_postfix[0] if re_postfix else 'pdf'
                if "FILE_NAME_" in data:
                    file_name = data["FILE_NAME_"]
                else:
                    # derive the name from the url, else a fresh uuid
                    re_file_name = re.findall(rf"/(.*?)\.{postfix}",
                                              data["FILE_URL_"],
                                              re.IGNORECASE)
                    if re_file_name:
                        file_name = re_file_name[0]
                    else:
                        file_name = str(uuid.uuid1())
                try:
                    response = req_for_something(url=data["FILE_URL_"])
                except Exception as e:
                    self.logger.exception(
                        f"2.1--err: PDF"
                        f" 原始数据 collection = {self.m_client.mongo_collection};"
                        f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                        f" 原始数据 _id = {data['_id']};"
                        f"error: {e}.")
                else:
                    print('附件请求成功')
                    if response:
                        try:
                            # todo 文件上传出错是否继续还是跳过
                            # p_response = req_for_file_save(id=re_data["ID_"], type_code=f"CHA_{tc}_{postfix.upper()}",
                            p_response = req_for_file_save(
                                id=re_data["ID_"],
                                type_code=f"CHA_YJBG",
                                file_name=file_name,
                                postfix=postfix,
                                file=response.content)
                            if "error" in p_response.content.decode("utf-8"):
                                self.logger.info(
                                    f"2.3--err:文件上传错误."
                                    f" 原始数据collection={self.m_client.mongo_collection};"
                                    f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                                    f" 原始数据 _id = {data['_id']};"
                                    f" error: {p_response.content.decode('utf-8')}."
                                )
                                raise Exception("上传文件出错")
                            else:
                                self.logger.info(
                                    f"2.3--success: 文件上传成功."
                                    f"{p_response.content.decode('utf-8')}")
                                p_response.close()
                        except Exception as e:
                            self.logger.exception(
                                f"2.1--err: PDF"
                                f" 原始数据 collection = {self.m_client.mongo_collection};"
                                f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                                f" 原始数据 _id = {data['_id']};"
                                f"error: {e}.")
                            raise Exception("上传文件出错")
                        finally:
                            response.close()
                    else:
                        self.logger.exception(
                            f"2.1--err: PDF"
                            f" 原始数据 collection = {self.m_client.mongo_collection};"
                            f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                            f"error: PDF 请求失败.")
                        raise Exception("文件请求失败")
    # pass-through entity/url fields when the branch didn't set them
    if "ENTITY_CODE_" not in re_data:
        re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    if "ENTITY_NAME_" not in re_data:
        re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    if "URL_" not in re_data:
        if "URL_" in data:
            re_data["URL_"] = data["URL_"]
    # creation time and operator
    time_array = time.localtime()
    create_time = time.strftime("%Y-%m-%d %H:%M:%S", time_array)
    re_data["CREATE_TIME_"] = create_time
    re_data["CREATE_BY_ID_"] = CREATE_ID
    re_data["CREATE_BY_NAME_"] = CREATE_NAME
    # crawl time: prefer DATETIME_, fall back to DEALTIME_
    if "DATETIME_" in data:
        re_data["SPIDER_TIME_"] = data["DATETIME_"]
    elif ("DATETIME_" not in data) and ("DEALTIME_" in data):
        d_time = arrow.get(data["DEALTIME_"])
        date_time = d_time.format("YYYY-MM-DD")
        re_data["SPIDER_TIME_"] = date_time
    # default status flags, only when the branch didn't set them
    if "PERIOD_CODE_" not in re_data:
        re_data["PERIOD_CODE_"] = re_data.get("PUBLISH_TIME_", "")
    if "M_STATUS_" not in re_data:
        re_data["M_STATUS_"] = "N"
    if "DELETE_STATUS_" not in re_data:
        re_data["DELETE_STATUS_"] = "N"
    if "DATA_STATUS_" not in re_data:
        re_data["DATA_STATUS_"] = "UNCHECK"
    if "VERSION_" not in re_data:
        re_data["VERSION_"] = "0"
    if "DATA_VERSION_" not in re_data:
        re_data["DATA_VERSION_"] = "0"
    # microblog entities manage their own publish status
    if "MICROBLOG" not in re_data[
            "ENTITY_CODE_"] and "PUBLISH_STATUS_" not in re_data:
        re_data["PUBLISH_STATUS_"] = "N"
    return re_data
def generic_shuffle(self, data):
    """Clean one primary/middle-school record (WD_SS_XX) for persistence.

    Builds ``re_data`` with a fresh serial id, period code, source fields,
    geo data resolved from ADDR_, the base64-encoded image, and a normalised
    phone number, then applies the shared base-class rules.

    :param data: raw crawled record.
    :return: ``[{"TABLE_NAME_": self.p_client.table_name, "DATA_": re_data}]``
    """
    re_data = dict()
    serial_number = req_for_serial_number(code="WD_SS_XX")
    re_data["ID_"] = serial_number
    # time dimension: yyyyMMdd from the crawl date
    re_data["PERIOD_CODE_"] = data["DATETIME_"][:10].replace("-", "")
    # tags placeholder
    if "TAGS_" in data:
        re_data["TAGS_"] = ""
    # SOURCE: scheme://host prefix of the page url
    source = re.findall(r"(https?://.*?)/", data["URL_"])
    re_data["SOURCE_"] = source[0]
    # source site name
    re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]
    # source category
    re_data["SOURCE_TYPE_"] = data["ENTITY_CODE_"][3:8]
    # geocode the address, then back-fill province/city/district data
    try:
        lat_result = get_lat_lng(address=data["ADDR_"])
        re_data["LAT_"] = lat_result["result"]["location"]["lat"]
        re_data["LNG_"] = lat_result["result"]["location"]["lng"]
    except KeyError:
        re_data["LAT_"] = None
        re_data["LNG_"] = None
    except Exception as e:
        self.logger.info("获取经纬度失败信息为{}".format(e))
        # BUG FIX: LAT_/LNG_ were left unset on this path, so the
        # `if re_data["LAT_"]` check below raised KeyError.
        re_data["LAT_"] = None
        re_data["LNG_"] = None
    if re_data["LAT_"]:
        try:
            area_result = get_area(",".join(
                [str(re_data["LAT_"]), str(re_data["LNG_"])]))
        except Exception as e:
            self.logger.info(f"获取地址失败, ERROR: {e}")
        else:
            try:
                re_data["PROVINCE_NAME_"] = area_result["result"][
                    "addressComponent"]["province"]
                re_data["CITY_NAME_"] = area_result["result"][
                    "addressComponent"]["city"]
                re_data["AREA_NAME_"] = area_result["result"][
                    "addressComponent"]["district"]
                re_data["AREA_CODE_"] = area_result["result"][
                    "addressComponent"]["adcode"]
                re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
                re_data[
                    "PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
            except KeyError:
                pass
    # school name
    if "NAME_" in data:
        re_data["NAME_"] = data["NAME_"]
    # attribute (city key / district key / national key school)
    if "LEVEL_" in data:
        re_data["LEVEL_"] = data["LEVEL_"]
    # image → base64
    if "IMAGES_" in data:
        if data["IMAGES_"]:
            response = req_for_something(url=data["IMAGES_"])
            if response:
                t = base64.b64encode(response.content)
                re_data["IMAGES_"] = t.decode("utf-8")
    # school type
    if "SCHOOL_TYPE_" in data:
        re_data["SCHOOL_TYPE_"] = data["SCHOOL_TYPE_"]
    # school nature
    if "SCHOOL_NATURE_" in data:
        re_data["SCHOOL_NATURE_"] = data["SCHOOL_NATURE_"]
    # phone number: insert spaces between concatenated numbers.
    # Rules are tried in the original order — first match wins.
    if "TEL_" in data:
        tel_rules = (
            (r"(\d{3,4}-\d{8})(\d{3,4}-\d{8})", r"\1 \2"),
            (r"(\d{3,4}-\d{8})(\d{8})", r"\1 \2"),
            (r"(\d{3,4}-\d{8})(\d{11})", r"\1 \2"),
            (r"(\d{3,4}-\d{8})(\d{8})(\d{8})", r"\1 \2 \3"),
            (r"(\d{8})(\d{11})", r"\1 \2"),
            (r"(\d{8})(\d{8})", r"\1 \2"),
            (r"(\d{3,4}-\d{7})(\d{3,4}-\d{7})", r"\1 \2"),
            (r"(\d{3,4}-\d{8})(\d{11})(\d{11})", r"\1 \2 \3"),
            (r"(\d{3,4}-\d{7})(\d{7})", r"\1 \2"),
        )
        for pat, repl in tel_rules:
            if re.match(pat, data["TEL_"]):
                phone_number = re.sub(pat, repl, data["TEL_"])
                break
        else:
            phone_number = data["TEL_"]
        re_data["TEL_"] = phone_number
    # address
    if "ADDR_" in data:
        re_data["ADDR_"] = data["ADDR_"]
    re_data = super(Branchssxx, self).generic_shuffle(data=data,
                                                      re_data=re_data,
                                                      field=None)
    return [{"TABLE_NAME_": self.p_client.table_name, "DATA_": re_data}]
def data_shuffle(data):
    """Clean one raw credit-card record.

    Copies the plain fields across, embeds the card image as base64,
    strips site boilerplate from the HTML description fields and joins the
    repayment-channel lists.  The instalment table scraped at the end is
    only printed, exactly as in the original script.

    :param data: raw crawled record (dict).
    :return: cleaned record (dict).
    """
    # Site boilerplate that precedes the useful text in several fields.
    marker = re.compile(
        r"[\s\S]*内容页\*/[\s\S]*\.link-hover{color:#0066cc; border-bottom:1px dashed #ccc;}([\s\S]*)"
    )

    def _text_after(raw_html, pattern, drop_spaces=True):
        # Strip tags, keep text after the marker, squeeze whitespace.
        # Returns None when the marker is absent (caller leaves field unset).
        text = BeautifulSoup(raw_html, "html.parser").getText()
        matched = re.match(pattern, text)
        if not matched:
            return None
        cleaned = re.sub(r"[\n]+", "", matched.group(1))
        if drop_spaces:
            cleaned = re.sub(r"\s+", "", cleaned)
        return cleaned

    def _join_channels(raw_html):
        # Join the text of every div.tt2_1 with "|".
        soup = BeautifulSoup(raw_html, "html.parser")
        items = soup.find_all('div', {"class": "tt2_1"})
        return "|".join(item.string for item in items)

    re_data = dict()
    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["URL_"] = data["URL_"]
    # Annual fee.
    re_data["FEE_"] = data["FEE_"]
    # Cash-advance limit.
    re_data["CASHING_AMOUNT_"] = data["CASHING_AMOUNT_"]
    # Maximum credit line.
    re_data["MOST_AMOUNT_"] = data["MOST_AMOUNT_"]
    # Card level.
    re_data["CARD_LEVEL_"] = data["CARD_LEVEL_"]
    # Card organisation.
    re_data["CARD_ORG_"] = data["CARD_ORG_"]
    # Card image: download and embed as base64.
    if "IMG_" in data:
        response = req_for_something(url=data["IMG_"])
        if response:
            re_data["IMG_"] = base64.b64encode(response.content).decode("utf-8")
    # Card name.
    re_data["CARD_NAME_"] = data["CARD_NAME_"]
    # Benefits (textual description).
    re_data["POWER_WRITING_"] = data["POWER_WRITING_"]
    # Card attribute.
    re_data["CARD_ATTR_"] = data["CARD_ATTR_"]
    # Credit line.
    re_data["CREDIT_AMOUNT_"] = data["CREDIT_AMOUNT_"]
    # Interest-free period.
    re_data["INTEREST_FREE_"] = data["INTEREST_FREE_"]
    # Detailed introduction: text after "卡片介绍"; only newlines removed
    # here (spaces are kept, unlike the marker fields below).
    intro = _text_after(data["INTRO_"],
                        re.compile(r"[\s\S]*卡片介绍([\s\S]*)"),
                        drop_spaces=False)
    if intro is not None:
        re_data["INTRO_"] = intro
    # Card introduction block.
    soup = BeautifulSoup(data["CARD_INTRO_"], "html.parser")
    re_data["CARD_INTRO_"] = soup.find('div', {"class": "adp"}).text
    # Repayment channels: other / offline (branch) / online.
    re_data["OTHER_REPAY_"] = _join_channels(data["OTHER_REPAY_"])
    re_data["OFFLINE_REPAY_"] = _join_channels(data["OFFLINE_REPAY_"])
    re_data["NET_REPAY_"] = _join_channels(data["NET_REPAY_"])
    # Activation instructions.
    re_data["ACTIVATE_"] = data["ACTIVATE_"]
    # Fields that share the same boilerplate marker: points-to-mileage,
    # redemption method, points query, accrual rules, validity,
    # prepayment rules, fee deduction method.
    for key in ("SCORE_MILEAGE_", "SCORE_METHOD_", "SCORE_SEARCH_",
                "SCORE_ACCU_", "SCORE_VALID_", "PREPAYMENT_",
                "CHARE_DEDUCT_"):
        cleaned = _text_after(data[key], marker)
        if cleaned is not None:
            re_data[key] = cleaned
    # Instalment periods & rates (NUMBER_RATE_): scraped and printed only,
    # never stored -- kept as in the original.
    from scrapy.selector import Selector
    import requests
    response = requests.get(
        data['URL_'],
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
        })
    html = Selector(text=response.content.decode('gb2312'))
    trs = html.xpath(
        '//div[@id="fwq1"]//table[@class="MsoNormalTable"]//tr[position()>5 and position()<last()-1]'
    )
    # enumerate replaces the original O(n^2) ``trs.index(tr)`` lookup and
    # is also correct when two rows compare equal.
    for idx, tr in enumerate(trs):
        page = idx + 6
        xpath_ = f'//div[@id="fwq1"]//table[@class="MsoNormalTable"]//tr[{page}]'
        try:
            periods_1 = tr.xpath(xpath_ + '/td[1]/p/span[1]/text()').extract()[0]
            rate_1 = tr.xpath(xpath_ + '//td[1]/p/span[2]/text()').extract()[-1]
            periods_2 = tr.xpath(xpath_ + '/td[2]/p/span[1]/text()').extract()[0]
            rate_2 = tr.xpath(xpath_ + '/td[2]/p/span[2]/text()').extract()[-1]
        except Exception:
            # Narrowed from a bare ``except:`` so Ctrl-C still interrupts.
            periods_1, rate_1, periods_2, rate_2 = '', '', '', ''
        print(periods_1, rate_1, periods_2, rate_2)
    return re_data
def data_shuffle(data):
    """Resolve the product-specification PDF URL and map risk levels.

    When the record carries a ``PDF_`` landing page, the real ``.pdf``
    link is scraped from it, following the "说明书" (specification) anchor
    when present.  The Chinese risk label is then mapped to a code R1-R5;
    ``SOURCE_RISK_LEVEL_`` is consulted only when ``RISK_LEVEL_`` is
    absent, and unknown labels leave the record untouched -- both as in
    the original elif chains.

    :param data: raw record (dict); modified in place and returned.
    :return: the same dict.
    """
    # Chinese label -> risk code (shared by both level fields).
    risk_map = {
        "低": "R1",
        "中低": "R2",
        "较低": "R2",
        "中等": "R3",
        "中高": "R4",
        "高": "R5",
    }

    def _decode(content):
        # Pages are served in either UTF-8 or GBK.
        try:
            return content.decode("utf-8")
        except UnicodeDecodeError:
            return content.decode("gbk")

    if "PDF_" in data:
        response = req_for_something(url=data["PDF_"])
        pdf_content = _decode(response.content)
        html = HTML(pdf_content)
        url = html.xpath("//a[contains(text(),\"说明书\")]/@href")
        if url:
            # Follow the specification anchor (drop its leading ".").
            url = "http://ewealth.abchina.com/fs" + url[0][1:]
            response2 = req_for_something(url=url)
            pdf_url1 = re.findall(r"/\w+\.pdf", _decode(response2.content))
            if pdf_url1:
                data["PDF_"] = "http://ewealth.abchina.com/fs/intro_list" + pdf_url1[0]
        else:
            # No anchor: look for a .pdf path on the landing page itself.
            url = re.findall(r"/\w+\.pdf", _decode(response.content))
            if url:
                data["PDF_"] = "http://ewealth.abchina.com/fs/intro_list" + url[0]
    if "RISK_LEVEL_" in data:
        code = risk_map.get(data["RISK_LEVEL_"])
        if code:
            data["RISK_LEVEL_CODE_"] = code
    elif "SOURCE_RISK_LEVEL_" in data:
        code = risk_map.get(data["SOURCE_RISK_LEVEL_"])
        if code:
            data["RISK_LEVEL_CODE_"] = code
    return data
def data_shuffle(data, province_list, city_list, area_list):
    """Expand one CZB branch-list page into per-branch records.

    The page body is a "|"-separated list of ","-separated branch rows.
    Province/city are resolved from the page's ``CITY_NAME_`` first, each
    branch address is normalised to start with "<province><city>", and the
    coordinates plus district are backfilled via geocoding.

    :param data: raw crawled page record (dict).
    :param province_list: dicts with NAME_/CODE_ per province.
    :param city_list: dicts with NAME_/CODE_/PARENT_ per city; the
        placeholder "县" entry is removed in place.
    :param area_list: unused here (kept for interface compatibility).
    :return: list of cleaned branch dicts.
    """
    data_list = list()
    # Drop the placeholder "县" entry.  The original removed items while
    # iterating the same list, which silently skips the element following
    # each removal; rebuilding via slice assignment fixes that while still
    # mutating the caller's list object.
    city_list[:] = [city for city in city_list if city["NAME_"] != "县"]
    prov_c = ""
    prov_n = ""
    city_c = ""
    city_n = ""
    # Province / city resolution; the four municipalities are special-cased.
    if "北京" in data["CITY_NAME_"]:
        prov_n = "北京市"
        prov_c = "1100"
        city_n = "北京市"
        city_c = "110100"
    elif "天津" in data["CITY_NAME_"]:
        prov_n = "天津市"
        prov_c = "1200"
        city_n = "天津市"
        city_c = "120100"
    elif "上海" in data["CITY_NAME_"]:
        prov_n = "上海市"
        prov_c = "3100"
        city_n = "上海市"
        city_c = "310100"
    elif "重庆" in data["CITY_NAME_"]:
        prov_n = "重庆市"
        prov_c = "5000"
        city_n = "重庆市"
        city_c = "500100"
    else:
        # Match by the city name minus its trailing character ("市"/"县").
        for city in city_list:
            if city["NAME_"][:-1] in data["CITY_NAME_"]:
                city_n = city["NAME_"]
                city_c = city["CODE_"]
                prov_c = city["PARENT_"]
                break
        if prov_c:
            for prov in province_list:
                if prov["CODE_"] == prov_c:
                    prov_n = prov["NAME_"]
                    break
    response = req_for_something(data["URL_"])
    # Keep only word chars, "|" and "," -- the page is a delimited list.
    a = re.sub(r"[^\w|,]+", "", response.content.decode("utf-8"))
    for each in a.split("|"):
        re_data = dict()
        message = each.split(",")
        if len(message) == 1:
            continue
        name = message[2]
        addr_ = message[3]
        tel = message[4]
        business_time = message[5] + message[6]
        # Address cleaning: ensure it starts with the full province name,
        # expanding common truncations of the name prefix.
        if prov_n in addr_:
            pass
        elif prov_n[:-1] in addr_[:len(prov_n)]:
            addr_ = addr_[:len(prov_n)].replace(prov_n[:-1], prov_n) + addr_[len(prov_n):]
        elif prov_n[:4] in addr_[:len(prov_n)]:
            addr_ = addr_[:len(prov_n)].replace(prov_n[:4], prov_n) + addr_[len(prov_n):]
        elif prov_n[:3] in addr_[:len(prov_n)]:
            addr_ = addr_[:len(prov_n)].replace(prov_n[:3], prov_n) + addr_[len(prov_n):]
        elif prov_n[:2] in addr_[:len(prov_n)]:
            addr_ = addr_[:len(prov_n)].replace(prov_n[:2], prov_n) + addr_[len(prov_n):]
        else:
            addr_ = prov_n + addr_
        # ...then the full city name right after the province prefix.
        if city_n in addr_[:len(prov_n) + len(city_n)]:
            addr_ = addr_
        elif city_n[:-1] in addr_[:len(prov_n) + len(city_n)]:
            addr_ = addr_[:len(prov_n) + len(city_n)].replace(
                city_n[:-1], city_n) + addr_[len(prov_n) + len(city_n):]
        elif city_n[:4] in addr_[:len(prov_n) + len(city_n)]:
            addr_ = addr_[:len(prov_n) + len(city_n)].replace(
                city_n[:4], city_n) + addr_[len(prov_n) + len(city_n):]
        elif city_n[:3] in addr_[:len(prov_n) + len(city_n)]:
            addr_ = addr_[:len(prov_n) + len(city_n)].replace(
                city_n[:3], city_n) + addr_[len(prov_n) + len(city_n):]
        elif city_n[:2] in addr_[:len(prov_n) + len(city_n)]:
            addr_ = addr_[:len(prov_n) + len(city_n)].replace(
                city_n[:2], city_n) + addr_[len(prov_n) + len(city_n):]
        else:
            addr_ = addr_[:len(prov_n)] + city_n + addr_[len(prov_n):]
        re_data["BANK_CODE_"] = "CZB"
        re_data["BANK_NAME_"] = data["ENTITY_NAME_"][:-3]
        re_data["SPIDER_TIME_"] = data["DATETIME_"]
        re_data["ADDR_"] = addr_
        re_data["PROVINCE_NAME_"] = prov_n
        re_data["PROVINCE_CODE_"] = prov_c
        re_data["CITY_CODE_"] = city_c
        re_data["CITY_NAME_"] = city_n
        re_data["NAME_"] = name
        # Geocode; on success reverse-geocode for district / adcode backfill.
        result = get_lat_lng(address=re_data["ADDR_"])
        try:
            re_data["LAT_"] = str(result["result"]["location"]["lat"])
            re_data["LNG_"] = str(result["result"]["location"]["lng"])
        except KeyError:
            re_data["LAT_"] = ""
            re_data["LNG_"] = ""
        else:
            dis_result = get_area(",".join([re_data["LAT_"], re_data["LNG_"]]))
            try:
                re_data["AREA_NAME_"] = dis_result["result"][
                    "addressComponent"]["district"]
            except KeyError:
                re_data["AREA_NAME_"] = ""
            try:
                re_data["AREA_CODE_"] = dis_result["result"][
                    "addressComponent"]["adcode"]
            except KeyError:
                re_data["AREA_CODE_"] = ""
            else:
                # Prefer codes derived from the geocoder's 6-digit adcode.
                re_data["CITY_CODE_"] = re_data["AREA_CODE_"][:4] + "00"
                re_data["PROVINCE_CODE_"] = re_data["AREA_CODE_"][:2] + "00"
                for city in city_list:
                    if city["CODE_"] == re_data["CITY_CODE_"]:
                        re_data["CITY_NAME_"] = city["NAME_"]
                        break
                for prov in province_list:
                    if prov["CODE_"] == re_data["PROVINCE_CODE_"]:
                        re_data["PROVINCE_NAME_"] = prov["NAME_"]
                        break
        re_data["UNIT_CODE_"] = "CZB" + "_" + re_data.get("CITY_CODE_", "")
        re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
        re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        re_data["URL_"] = data["URL_"]
        re_data["TEL_"] = tel
        re_data["BUSINESS_HOURS_"] = business_time
        if "SOURCE_TYPE_NAME_" in data:
            re_data["SOURCE_TYPE_NAME_"] = data["SOURCE_TYPE_NAME_"]
        re_data["TYPE_NAME_"] = "支行"
        re_data["TYPE_"] = "ZH"
        data_list.append(re_data)
    return data_list
# NOTE(review): ``sheet`` is not defined at module scope in this chunk --
# these debug prints presumably belong inside ``read_excel``; confirm.
print(sheet.merged_cells)
print(len(sheet.merged_cells))
# # Get whole rows / columns as arrays:
# rows1 = sheet.row_values(3)  # contents of row 4
# rows2 = sheet.row_values(4)  # contents of row 5
# rows3 = sheet.row_values(5)  # contents of row 6
# cols = sheet.col_values(2)  # contents of column 3
# print(rows1)
# print(rows2)
# print(rows3)
# # Get a single cell's contents (Python 2 style prints):
# print
# sheet2.cell(1, 0).value.encode('utf-8')
# print
# sheet2.cell_value(1, 0).encode('utf-8')
# print
# sheet2.row(1)[0].value.encode('utf-8')
# # Get the data type of a cell:
# print
# sheet2.cell(1, 0).ctype


if __name__ == '__main__':
    # Download a sample .xls statement and feed it to read_excel.
    response = req_for_something(url="http://www.hxb.com.cn/images/grjr/zjyw/dlxsl/2018/10/12/12165251E4AD1CEEB164E48700BC924FC58F1BED.xls")
    read_excel(response.content)
def generic_shuffle(self, data, field="PRO_NAME_"):
    """Clean one insurance-product record (or a list of them).

    Cleaning rules go here; subclasses that need no generic rule simply
    do not inherit.

    :param data: raw record dict, or a list of such dicts.
    :param field: field name forwarded to downstream matching.
    :return: list of ``{"TABLE_NAME_": ..., "DATA_": ...}`` entries.
    """
    # If data is a list, clean every element recursively.
    # NOTE(review): the recursive call itself already returns a *list* of
    # {"TABLE_NAME_", "DATA_"} wrappers, which is wrapped again as DATA_
    # here -- this double wrapping looks unintended; confirm with the
    # consumer before changing.
    if isinstance(data, list):
        re_data_list = []
        for item in data:
            re_data_list.append({"TABLE_NAME_": self.script_name, "DATA_": self.generic_shuffle(item)})
        return re_data_list
    re_data = dict()
    serial_number = req_for_serial_number(code="JRCP_BX")
    re_data["ID_"] = serial_number + "TEST"
    # Source site = scheme + host of the record URL.
    source = re.findall(r"(https?://.*?)/", data["URL_"])
    re_data["SOURCE_"] = source[0]
    re_data["SOURCE_NAME_"] = data["ENTITY_NAME_"]
    re_data["VERSION_"] = "0"
    re_data["DATA_VERSION_"] = "0"
    # todo: source type not yet classified.
    re_data["SOURCE_TYPE_"] = ""
    # Model field: hotness defaults to "0".
    re_data["HOT_"] = data["HOT_"] if "HOT_" in data else "0"
    re_data["PRO_NAME_"] = data["PRO_NAME_"]
    # Insurance company: match by name / alias against the company list;
    # fall back to the raw value when nothing matched.
    if "COM_NAME_" in data:
        for each in self.company_list:
            if each["NAME_"]:
                if data["COM_NAME_"] in each["NAME_"] or each["NAME_"] in data["COM_NAME_"]:
                    re_data["COM_NAME_"] = each["NAME_"]
                    re_data["COM_NAME_CODE_"] = each["CODE_"]
                elif each["ALIAS_"] and data["COM_NAME_"] in each["ALIAS_"]:
                    re_data["COM_NAME_"] = each["NAME_"]
                    re_data["COM_NAME_CODE_"] = each["CODE_"]
        if "COM_NAME_" not in re_data:
            re_data["COM_NAME_"] = data["COM_NAME_"]
    # Insured amount (manual backfill).
    if "ENSURE_PRICE_" in data:
        re_data["ENSURE_PRICE_"] = data["ENSURE_PRICE_"]
    # Premium (manual backfill).
    if "ENSURE_FEE_" in data:
        re_data["ENSURE_FEE_"] = data["ENSURE_FEE_"]
    # Product features (manual backfill); key spelled "SPECAIL_" upstream.
    if "SPECAIL_" in data:
        re_data["SPECAIL_"] = data["SPECAIL_"]
    # Product brief (manual backfill).
    if "BRIEF_" in data:
        re_data["BRIEF_"] = data["BRIEF_"]
    # Insurable age (manual backfill).
    if "AGE_" in data:
        re_data["AGE_"] = data["AGE_"]
    # Insurance period (manual backfill).
    if "ENSURE_DATE_" in data:
        re_data["ENSURE_DATE_"] = data["ENSURE_DATE_"]
    # Number of purchasable policies (manual backfill).
    if "BUY_LIMIT_" in data:
        re_data["BUY_LIMIT_"] = data["BUY_LIMIT_"]
    # Policy form (manual backfill).
    if "ENSURE_MODE_" in data:
        re_data["ENSURE_MODE_"] = data["ENSURE_MODE_"]
    # Policy form code (manual backfill).
    if "ENSURE_MODE_CODE_" in data:
        re_data["ENSURE_MODE_CODE_"] = data["ENSURE_MODE_CODE_"]
    # Target audience (manual backfill).
    if "SUIT_" in data:
        re_data["SUIT_"] = data["SUIT_"]
    # Original insurance category (manual backfill).
    if "ENSURE_SOURCE_TYPE_" in data:
        re_data["ENSURE_SOURCE_TYPE_"] = data["ENSURE_SOURCE_TYPE_"]
    # Insurance type (manual backfill).
    if "ENSURE_TYPE_" in data:
        re_data["ENSURE_TYPE_"] = data["ENSURE_TYPE_"]
    # Insurance type code (manual backfill).
    if "ENSURE_TYPE_CODE_" in data:
        re_data["ENSURE_TYPE_CODE_"] = data["ENSURE_TYPE_CODE_"]
    # Recommended / best-seller / newest flags default to "N".
    re_data["RECOMMEND_"] = "N"
    re_data["GOOD_SALE_"] = "N"
    re_data["NEW_SALE_"] = "N"
    # Coverage content (manual backfill).
    if "ENSURE_CONTENT_" in data:
        re_data["ENSURE_CONTENT_"] = data["ENSURE_CONTENT_"]
    # Purchase notice (manual backfill).
    if "NOTICE_" in data:
        re_data["NOTICE_"] = data["NOTICE_"]
    # Product detail (manual backfill).
    if "PRO_DETAIL_" in data:
        re_data["PRO_DETAIL_"] = data["PRO_DETAIL_"]
    # Payment method: normalise "交" to "缴"; unknown values map to "其他".
    if "ENSURE_PAY_" in data.keys():
        re_data["ENSURE_PAY_"] = data["ENSURE_PAY_"].strip().replace("交", "缴")
        if re_data["ENSURE_PAY_"] not in self.pay_type:
            re_data["ENSURE_PAY_"] = "其他"
        re_data["ENSURE_PAY_CODE_"] = self.pay_type[re_data["ENSURE_PAY_"]]
    # Without a payment method, try recovering it from the product name.
    else:
        if re.findall(r"期[缴交]", data["PRO_NAME_"]):
            re_data["ENSURE_PAY_"] = "期缴"
            re_data["ENSURE_PAY_CODE_"] = "QJ"
        elif re.findall(r"趸[缴交]", data["PRO_NAME_"]):
            re_data["ENSURE_PAY_"] = "趸缴"
            re_data["ENSURE_PAY_CODE_"] = "DJ"
    # FDFS upload of a locally stored PDF, if any.
    if "LOCAL_PDF_PATH_" in data:
        try:
            p_response = req_for_file_save(id=re_data["ID_"],
                                           type_code=f"CHA_INSURANCE_PDF",
                                           file_name=data["LOCAL_PDF_NAME_"],
                                           postfix="pdf",
                                           file=open(data["LOCAL_PDF_PATH_"], "rb"))
            p_response.close()
        except Exception as e:
            self.logger.warning(f"_id: {data['_id']},文件上传失败, ERROR: {e}")
    # Word attachment: download and archive as .doc.
    if "WORD_" in data:
        try:
            response = req_for_something(url=data["WORD_"])
        except Exception as e:
            self.logger.warning(f"_id: {data['_id']},获取PDF失败, ERROR: {e}")
        else:
            if response:
                try:
                    p_response = req_for_file_save(id=re_data["ID_"],
                                                   type_code=f"CHA_INSURANCE_WORD",
                                                   file_name=data["PDF_NAME_"].replace(".doc", ""),
                                                   postfix="doc",
                                                   file=response.content)
                    self.logger.info(f"{p_response.content.decode('utf-8')}")
                    p_response.close()
                except Exception as e:
                    self.logger.warning(f"_id: {data['_id']},文件上传失败, ERROR: {e}")
                finally:
                    response.close()
            else:
                self.logger.warning(f'id: {data["_id"]},获取PDF失败')
    # Raw HTML is not persisted for insurance products.
    if "HTML_" in data:
        del data["HTML_"]
    re_data = super(BranchInsurance, self).generic_shuffle(data=data, re_data=re_data, field="ENTITY_NAME_")
    re_data["PUBLISH_TIME_"] = re_data["SPIDER_TIME_"]
    return [{"TABLE_NAME_": self.script_name, "DATA_": re_data}]
def generic_shuffle(self, data):
    """Clean one Weibo post into an info record plus its comment records.

    Builds one CHA_BRANCH_WEIBO_INFO row (engagement counters, embedded
    base64 images, model-derived brief / sensitivity / hotness) and one
    CHA_BRANCH_WEIBO_COMMENT row per comment (sentiment + sensitivity).

    :param data: raw crawled Weibo record (dict).
    :return: list of ``{"TABLE_NAME_": ..., "DATA_": ...}`` entries.
    """
    re_data = list()
    # CHA_BRANCH_WEIBO_INFO
    info_data = dict()
    serial_number = req_for_serial_number(code="WEIBO_INFO")
    info_data["ID_"] = serial_number
    print(serial_number)
    info_data["ENTITY_CODE_"] = data["BANK_CODE_"]
    info_data["URL_"] = data["CONTENT_URL_"]
    info_data["PERIOD_CODE_"] = data["PUBLISH_TIME_"].replace("-", "")
    # Source site URL (scheme + host).
    source = re.findall(r"(https?://.*?)/", data["CONTENT_URL_"])
    info_data["SOURCE_"] = source[0]
    # Source site name.
    info_data["SOURCE_NAME_"] = data["ENTITY_NAME_"].split("-")[0]
    info_data["SOURCE_TYPE_"] = "WEIBO"
    # Engagement counters default to 0 when missing / falsy.
    info_data["LIKES_"] = data["PRAISES_"]
    if not info_data["LIKES_"]:
        info_data["LIKES_"] = 0
    info_data["COMMENTS_"] = data["REPLIES_"]
    if not info_data["COMMENTS_"]:
        info_data["COMMENTS_"] = 0
    info_data["RELAYS_"] = data["RELAYS_"]
    if not info_data["RELAYS_"]:
        info_data["RELAYS_"] = 0
    info_data["IMPORTANCE_"] = "N"
    info_data["PUBLISH_TIME_"] = data["PUBLISH_TIME_"]
    info_data["CONTENT_"] = data["CONTENT_"]
    # Download content images, embed as base64 under IMAGE_1, IMAGE_2, ...
    # enumerate replaces the original ``.index(each_image)`` which mapped
    # duplicate URLs to the same IMAGE_n key (and was O(n^2)).
    if data.get("CONTENT_IMAGES_") and len(data["CONTENT_IMAGES_"]) > 0:
        for image_no, each_image in enumerate(data["CONTENT_IMAGES_"], start=1):
            response = req_for_something(url=each_image)
            if response:
                t = base64.b64encode(response.content)
                info_data[f"IMAGE_{image_no}"] = t.decode("utf-8")
                response.close()
    # Manual backfill fields (left unset here):
    # info_data["TYPE_"] = data[""]
    # info_data["TYPE_CODE_"] = data[""]
    info_data["PUBLISH_STATUS_"] = "N"
    # OWN_: "N" for reposts ("转载"), "Y" for original posts.
    if "OWN_" in data:
        if data["OWN_"] == "转载":
            info_data["OWN_"] = "N"
        else:
            info_data["OWN_"] = "Y"
    # Look up the account code/name in the configured Weibo list.
    for each in self.weibo_list:
        if each["WEIBO_NAME_"] == data["ENTITY_NAME_"]:
            info_data["WEIBO_CODE_"] = each["WEIBO_CODE_"]
            info_data["WEIBO_NAME_"] = each["WEIBO_NAME_"]
            break
    # Model: text summarisation for the brief.
    try:
        brief = req_for_ts(info_data["CONTENT_"])
        if brief:
            info_data["BRIEF_"] = brief["summary"]
    except Exception as e:
        self.logger.info(f"调用模型req_for_ts失败,原因为{e}")
        info_data["BRIEF_"] = ""
    # Model: content sensitivity check.
    try:
        censor = req_for_censor(info_data["CONTENT_"])
        if censor:
            if censor["censor"] == "N":
                info_data["SENSITIVE_"] = "N"
            else:
                info_data["SENSITIVE_"] = "Y"
                info_data["SENSITIVE_WORD_"] = censor["words"]
    except Exception as e:
        self.logger.info(f"调用模型censor失败,错误为{e}")
        info_data["SENSITIVE_"] = "N"
    info_data["VERSION_"] = "0"
    info_data = super(WeiboScript, self).generic_shuffle(data=data,
                                                         re_data=info_data,
                                                         field="ENTITY_NAME_")
    # Fix up SPDB / rural commercial bank names and codes.
    if info_data["ENTITY_NAME_"] == "上海浦东发展银行微博":
        info_data["BANK_NAME_"] = "浦发银行"
        info_data["BANK_CODE_"] = "SPDB"
    if info_data["ENTITY_NAME_"] == "南海农商银行微博":
        info_data["BANK_NAME_"] = "广东南海农村商业银行股份有限公司"
        info_data["BANK_CODE_"] = "NRC"
    if info_data["ENTITY_NAME_"] == "顺德农商银行微博":
        info_data["BANK_NAME_"] = "广东顺德农村商业银行股份有限公司"
        info_data["BANK_CODE_"] = "sdebank"
    comment = data["INFO_COMMENTS_"]
    # Count verified commenters for the hotness model.
    verifieds = 0
    for c in comment:
        if c.get("VERIFIED_", ""):
            verifieds += 1
    # Model: Weibo hotness level.
    try:
        hot = req_for_weibo_hot(publish_time=info_data["PUBLISH_TIME_"],
                                relays=info_data["RELAYS_"],
                                replies=len(comment),
                                praises=info_data["LIKES_"],
                                verifieds=verifieds)
        if hot:
            info_data["HOT_"] = hot["level"]
    except Exception as e:
        self.logger.info(f"调用模型weibo_hot失败,错误为{e}")
    re_data.append({"TABLE_NAME_": TABLE_NAME("CHA_BRANCH_WEIBO_INFO"),
                    "DATA_": info_data})
    if len(comment) > 0:
        comment_count = 0
        for each in comment:
            # CHA_BRANCH_WEIBO_COMMENT -- comment_data must be re-created
            # per iteration, otherwise rows would be duplicated.
            comment_data = dict()
            # HBase row_key.
            serial_number = req_for_serial_number(code="WEIBO_COMMENT")
            comment_data["ID_"] = serial_number
            comment_data["INFO_ID_"] = info_data["ID_"]
            comment_data["COMMENT_"] = each["COMMENT_"]
            comment_data["REPLIER_TIME_"] = each["REPLIER_TIME_"]
            comment_data["REPLIER_HEAD_"] = each["REPLIER_HEAD_"]
            comment_data["REPLIER_PRAISES_"] = each["REPLIER_PRAISES_"]
            comment_data["REPLIER_"] = each["REPLIER_"]
            comment_data["REPLIER_REPLIES_"] = each["REPLIER_REPLIES_"]
            # Model: per-comment sentiment.  The original used three
            # separate ``if``s, so the final ``else`` overwrote "POSITIVE"
            # with "NORMAL"; this elif chain keeps each label.  "NAGETIVE"
            # is kept verbatim -- downstream may depend on the spelling.
            if each.get("COMMENT_") and len(each["COMMENT_"]) > 0:
                try:
                    sentiment = req_for_comment(each["COMMENT_"])
                    if sentiment:
                        if sentiment["sentiment"] == "中性":
                            comment_data["EMOTION_"] = "NORMAL"
                        elif sentiment["sentiment"] == "积极":
                            comment_data["EMOTION_"] = "POSITIVE"
                        elif sentiment["sentiment"] == "敏感":
                            comment_data["EMOTION_"] = "NAGETIVE"
                        else:
                            comment_data["EMOTION_"] = "NORMAL"
                except Exception as e:
                    self.logger.info(f"调用模型req_for_comment失败,错误为{e}")
                    comment_data["EMOTION_"] = "NORMAL"
                # Model: per-comment sensitivity.
                try:
                    censor = req_for_censor(each["COMMENT_"])
                    if censor:
                        if censor["censor"] == "N":
                            comment_data["SENSITIVE_"] = "N"
                        else:
                            comment_data["SENSITIVE_"] = "Y"
                            comment_data["SENSITIVE_WORD_"] = censor["words"]
                    else:
                        comment_data["SENSITIVE_"] = "N"
                except Exception as e:
                    self.logger.info(f"调用模型req_for_comment失败,错误为{e}")
                    comment_data["SENSITIVE_"] = "N"
            comment_data["VERSION_"] = "0"
            comment_data["CREATE_BY_ID_"] = "P0131857"
            comment_data["CREATE_BY_NAME_"] = "钟楷文"
            re_data.append({"TABLE_NAME_": TABLE_NAME("CHA_BRANCH_WEIBO_COMMENT"),
                            "DATA_": comment_data})
            comment_count += 1
        # Comment-related logging for debugging.
        self.logger.info(f'清洗的URL为{info_data["URL_"]}')
        self.logger.info(f'清洗的评论数为{info_data["COMMENTS_"]}')
        self.logger.info(f'插入到comment表的数量为{comment_count}')
    # print(re_data)
    return re_data
def generic_shuffle(self, data, re_data, field="CONTENT_"):
    """Generic parent cleaning rules; currently bank matching from a field,
    attachment archiving (HTML / PDF variants) and bookkeeping fields.

    :param data: raw record, type: dict
    :param re_data: partially cleaned record, type: dict
    :param field: field to match banks in, type: str ("CONTENT_" or
        "PRO_NAME_" or ...); NoneType: None means no matching needed
    :return: cleaned record, type: dict
    """
    if not field:
        pass
        # Bank resolution is handled centrally in __init_____.py.
    else:
        if "BANK_NAME_" not in re_data:
            # "ZX" entities: ask the NER model for the organisation.
            if "ZX" in data.get("ENTITY_CODE_", "")[:2]:
                if field in data:
                    try:
                        result = req_for_ner(data[field])
                    except Exception as e:
                        self.logger.exception(
                            f"2.2--err: 请求模型 req_for_ner 错误."
                            f" 原始数据 collection = {self.m_client.mongo_collection};"
                            f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                            f" 原始数据 _id = {data['_id']};"
                            f" error: {e}.")
                    else:
                        if result:
                            if "Organ" in result:
                                if result["Organ"].get("entity", ""):
                                    organ = result["Organ"]["entity"]
                                    for each in self.bank_list:
                                        if organ in each["ALIAS_"]:
                                            re_data["BANK_NAME_"] = each[
                                                "NAME_"]
                                            re_data["BANK_CODE_"] = each[
                                                "CODE_"]
                                            break
            else:
                # Otherwise: substring-match every known bank name,
                # joining all hits with "|".
                bank_list = list()
                bank_code_list = list()
                for each in self.bank_list:
                    if each["NAME_"] in data.get(field, ""):
                        bank_list.append(each["NAME_"])
                        bank_code_list.append(each["CODE_"])
                if bank_list:
                    re_data["BANK_NAME_"] = "|".join(bank_list)
                if bank_code_list:
                    re_data["BANK_CODE_"] = "|".join(bank_code_list)
    # Address info
    # # todo organisation
    # # data["UNIT_CODE_"] = ""
    # # data["UNIT_NAME_"] = ""
    if "ID_" not in re_data:
        # Serial-number prefix derives from the entity code.
        serial_number = req_for_serial_number(
            code=data["ENTITY_CODE_"][:7])
        re_data["ID_"] = serial_number
    # FDFS storage: pick the attachment type code.
    # NOTE(review): ``tc`` stays unbound when ENTITY_CODE_/BANK_CODE_
    # matches none of these branches but an HTML_/PDF_ field is present,
    # which would raise NameError below -- confirm whether that can occur.
    if "ENTITY_CODE_" in data:
        if data["ENTITY_CODE_"][:2] == "ZX":
            tc = "NEWS"
        elif "WECHAT" in data["ENTITY_CODE_"]:
            tc = "WECHAT"
        elif "JRCP_BX" in data["ENTITY_CODE_"]:
            tc = "INSURANCE"
        elif "JRCP_LCCP" in data["ENTITY_CODE_"]:
            tc = "LCCP"
    elif "BANK_CODE_" in data:
        if "MICROBLOG" in data["BANK_CODE_"]:
            tc = "WEIBOBASIC"
    # Attachment archiving: exactly one of HTML_ / PDF_ / PDF_1_ /
    # PDF_URL_ is handled, in that priority order.
    if "HTML_" in data:
        if data["HTML_"]:
            if "HTML_NAME_" in data:
                html_name = data["HTML_NAME_"]
            elif "PDF_NAME_" in data:
                html_name = data["PDF_NAME_"]
            else:
                html_name = str(uuid.uuid1())
            try:
                response_file = req_for_file_save(
                    id=re_data["ID_"],
                    type_code=f"CHA_{tc}_HTML",
                    file_name=html_name,
                    postfix="html",
                    file=data["HTML_"].encode("utf-8"))
                if "error" in response_file.content.decode("utf-8"):
                    self.logger.info(
                        f"2.3--err:文件上传错误."
                        f" 原始数据collection={self.m_client.mongo_collection};"
                        f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                        f" 原始数据 _id = {data['_id']};"
                        f" error: {response_file.content.decode('utf-8')}."
                    )
                    raise Exception(
                        f"附件上传错误{response_file.content.decode('utf-8')}")
                response_file.close()
            except Exception as e:
                self.logger.exception(
                    f"2.1--err: PDF"
                    f" 原始数据 collection = {self.m_client.mongo_collection};"
                    f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                    f" 原始数据 _id = {data['_id']};"
                    f"error: {e}.")
    elif "PDF_" in data:
        if data["PDF_"]:
            # Pick a file name: explicit names first, then the URL stem.
            if "HTML_NAME_" in data:
                pdf_name = data["HTML_NAME_"]
            elif "PDF_NAME_" in data:
                pdf_name = data["PDF_NAME_"]
            else:
                if ".PDF" in data["PDF_"] or ".pdf" in data["PDF_"]:
                    file_name = re.findall(r"/(.*?).pdf", data["PDF_"],
                                           re.IGNORECASE)
                    if file_name:
                        pdf_name = file_name[0]
                    else:
                        pdf_name = str(uuid.uuid1())
                else:
                    pdf_name = str(uuid.uuid1())
            try:
                response = req_for_something(url=data["PDF_"])
            except Exception as e:
                self.logger.exception(
                    f"2.1--err: PDF"
                    f" 原始数据 collection = {self.m_client.mongo_collection};"
                    f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                    f" 原始数据 _id = {data['_id']};"
                    f"error: {e}.")
            else:
                if response:
                    try:
                        # todo: decide whether an upload failure should
                        # abort or continue.
                        p_response = req_for_file_save(
                            id=re_data["ID_"],
                            type_code=f"CHA_{tc}_PDF",
                            file_name=pdf_name,
                            postfix="pdf",
                            file=response.content)
                        if "error" in p_response.content.decode("utf-8"):
                            self.logger.info(
                                f"2.3--err:文件上传错误."
                                f" 原始数据collection={self.m_client.mongo_collection};"
                                f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                                f" 原始数据 _id = {data['_id']};"
                                f" error: {p_response.content.decode('utf-8')}."
                            )
                        p_response.close()
                    except Exception as e:
                        self.logger.exception(
                            f"2.1--err: PDF"
                            f" 原始数据 collection = {self.m_client.mongo_collection};"
                            f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                            f" 原始数据 _id = {data['_id']};"
                            f"error: {e}.")
                    finally:
                        response.close()
                else:
                    self.logger.exception(
                        f"2.1--err: PDF"
                        f" 原始数据 collection = {self.m_client.mongo_collection};"
                        f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                        f" 原始数据 _id = {data['_id']};"
                        f"error: PDF 请求失败.")
    elif "PDF_1_" in data:
        if data["PDF_1_"]:
            # Numbered attachments PDF_0_..PDF_9_; stop at the first
            # missing index (KeyError -> break).
            for i in range(10):
                try:
                    if f"PDF_{i}_NAME_" in data:
                        pdf_name = data[f"PDF_{i}_NAME_"]
                    else:
                        if ".PDF" in data[f"PDF_{i}_"] or ".pdf" in data[
                                f"PDF_{i}_"]:
                            file_name = re.findall(r"/(.*?).pdf",
                                                   data[f"PDF_{i}_"],
                                                   re.IGNORECASE)
                            if file_name:
                                pdf_name = file_name[0]
                            else:
                                pdf_name = str(uuid.uuid1())
                        else:
                            pdf_name = str(uuid.uuid1())
                    try:
                        response = req_for_something(url=data[f"PDF_{i}_"])
                    except Exception as e:
                        self.logger.exception(
                            f"2.1--err: PDF"
                            f" 原始数据 collection = {self.m_client.mongo_collection};"
                            f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                            f" 原始数据 _id = {data['_id']};"
                            f"error: {e}.")
                    else:
                        if response:
                            try:
                                p_response = req_for_file_save(
                                    id=re_data["ID_"],
                                    type_code=f"CHA_{tc}_PDF",
                                    file_name=pdf_name,
                                    postfix="pdf",
                                    file=response.content)
                                if "error" in p_response.content.decode(
                                        "utf-8"):
                                    self.logger.info(
                                        f"2.3--err:文件上传错误."
                                        f" 原始数据collection={self.m_client.mongo_collection};"
                                        f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                                        f" 原始数据 _id = {data['_id']};"
                                        f" error: {p_response.content.decode('utf-8')}."
                                    )
                                p_response.close()
                            except Exception as e:
                                self.logger.exception(
                                    f"2.1--err: PDF"
                                    f" 原始数据 collection = {self.m_client.mongo_collection};"
                                    f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                                    f" 原始数据 _id = {data['_id']};"
                                    f"error: {e}.")
                            finally:
                                response.close()
                        else:
                            self.logger.exception(
                                f"2.1--err: PDF"
                                f" 原始数据 collection = {self.m_client.mongo_collection};"
                                f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                                f" 原始数据 _id = {data['_id']};"
                                f"error: PDF 请求失败.")
                except KeyError:
                    break
    elif "PDF_URL_" in data:
        if data["PDF_URL_"]:
            if "PDF_NAME_" in data:
                pdf_name = data["PDF_NAME_"]
            else:
                if ".PDF" in data["PDF_URL_"] or ".pdf" in data["PDF_URL_"]:
                    file_name = re.findall(r"/(.*?).pdf", data["PDF_URL_"],
                                           re.IGNORECASE)
                    if file_name:
                        pdf_name = file_name[0]
                    else:
                        pdf_name = str(uuid.uuid1())
                else:
                    pdf_name = str(uuid.uuid1())
            try:
                response = req_for_something(url=data["PDF_URL_"])
            except Exception as e:
                self.logger.exception(
                    f"2.1--err: PDF"
                    f" 原始数据 collection = {self.m_client.mongo_collection};"
                    f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                    f" 原始数据 _id = {data['_id']};"
                    f"error: {e}.")
            else:
                if response:
                    try:
                        f_response = req_for_file_save(
                            id=re_data["ID_"],
                            type_code=f"CHA_{tc}_PDF",
                            file_name=pdf_name,
                            postfix="pdf",
                            file=response.content)
                        if "error" in f_response.content.decode("utf-8"):
                            self.logger.info(
                                f"2.3--err:文件上传错误."
                                f" 原始数据collection={self.m_client.mongo_collection};"
                                f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                                f" 原始数据 _id = {data['_id']};"
                                f" error: {f_response.content.decode('utf-8')}."
                            )
                        f_response.close()
                    except Exception as e:
                        self.logger.exception(
                            f"2.1--err: PDF"
                            f" 原始数据 collection = {self.m_client.mongo_collection};"
                            f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                            f" 原始数据 _id = {data['_id']};"
                            f"error: {e}.")
                    finally:
                        response.close()
                else:
                    self.logger.exception(
                        f"2.1--err: PDF"
                        f" 原始数据 collection = {self.m_client.mongo_collection};"
                        f" ENTITY_CODE_ = {data.get('ENTITY_CODE_', 'None')};"
                        f" 原始数据 _id = {data['_id']};"
                        f"error: PDF 请求失败.")
    # Copy identifying fields only when a subclass did not set them.
    if "ENTITY_CODE_" not in re_data:
        re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    if "ENTITY_NAME_" not in re_data:
        re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    if "URL_" not in re_data:
        if "URL_" in data:
            re_data["URL_"] = data["URL_"]
    # Creation time and operator.
    time_array = time.localtime()
    create_time = time.strftime("%Y-%m-%d %H:%M:%S", time_array)
    re_data["CREATE_TIME_"] = create_time
    re_data["CREATE_BY_ID_"] = CREATE_ID
    re_data["CREATE_BY_NAME_"] = CREATE_NAME
    # Crawl time: prefer DATETIME_, else derive a date from DEALTIME_.
    if "DATETIME_" in data:
        re_data["SPIDER_TIME_"] = data["DATETIME_"]
    elif ("DATETIME_" not in data) and ("DEALTIME_" in data):
        d_time = arrow.get(data["DEALTIME_"])
        date_time = d_time.format("YYYY-MM-DD")
        re_data["SPIDER_TIME_"] = date_time
    # Status flags default to unchecked / not deleted / not published.
    if "M_STATUS_" not in re_data:
        re_data["M_STATUS_"] = "N"
    if "DELETE_STATUS_" not in re_data:
        re_data["DELETE_STATUS_"] = "N"
    if "DATA_STATUS_" not in re_data:
        re_data["DATA_STATUS_"] = "UNCHECK"
    if "MICROBLOG" not in re_data[
            "ENTITY_CODE_"] and "PUBLISH_STATUS_" not in re_data:
        re_data["PUBLISH_STATUS_"] = "N"
    return re_data