def stream_y_fund_info(fund_ids=None): sql = "\ SELECT TB_MAIN.version, TB_MAIN.fund_id, fund_name, fund_full_name, fund_status, liquidation_cause, end_date \ FROM y_fund_info TB_MAIN \ JOIN (SELECT fund_id, MAX(version) as latest_ver FROM y_fund_info WHERE is_used = 1 GROUP BY fund_id) as TB_LATEST \ ON TB_MAIN.version = TB_LATEST.latest_ver AND TB_MAIN.fund_id = TB_LATEST.fund_id " if fund_ids is not None: fids = SQL.values4sql(fund_ids) sql += "WHERE TB_MAIN.fund_id IN {fids}".format(fids=fids) inp = MysqlNativeInput(engine_c, sql) ac = transform.AddConst({"source_id": "000001"}) dd = transform.SortDropduplicate(sort_by=["version", "fund_id"], ascending=[False, True], subset=["fund_id"]) km = transform.MapSelectKeys({ "fund_id": None, "fund_name": None, "fund_full_name": None, "fund_name_en": None, "source_id": None, "fund_status": None, "liquidation_cause": None, "end_date": None }) stream = Stream(inp, ( ac, dd, km, )) return stream
def stream_x_fund_info_010004(fund_ids=None): def clean_fund_status(limit_date, limit_time): if limit_date is not None: if limit_date <= dt.date.today(): return "终止" else: return "运行中" if limit_time is not None: if limit_time == "无期限": return "运行中" return None sql = "\ SELECT TB_MAIN.version, im.matched_id, TB_MAIN.fund_id, fund_name_amac, reg_code_amac, foundation_date_amac, fund_time_limit_amac \ FROM x_fund_info_securities TB_MAIN \ JOIN (SELECT fund_id, MAX(version) latest_ver FROM x_fund_info_securities WHERE is_used = 1 GROUP BY fund_id) as TB_LATEST \ ON TB_MAIN.version = TB_LATEST.latest_ver AND TB_MAIN.fund_id = TB_LATEST.fund_id \ JOIN base.id_match im ON im.source_id = TB_MAIN.fund_id AND id_type = 1 AND source = '010004' AND im.is_used = 1 " if fund_ids is not None: fids = SQL.values4sql(fund_ids) sql += "WHERE im.matched_id IN {fids}".format(fids=fids) inp = MysqlNativeInput(engine_c, sql) vm = transform.ValueMap( OrderedDict([("fund_name_amac", lambda x: x.strip()), ("reg_code_amac", lambda x: x.strip()), ("limit_date", (lambda x: dt.datetime.strptime(x, "%Y-%m-%d").date(), "fund_time_limit_amac")), ("fund_time_limit_amac", lambda x: x if x == "无期限" else None), ("fund_status", (lambda x, y: clean_fund_status(x, y), "limit_date", "fund_time_limit_amac"))])) ac = transform.AddConst({"source_id": "010004"}) cln = transform.CleanWrongToNone({"reg_code_amac": ""}, repls=None) dd = transform.SortDropduplicate(sort_by=["version", "matched_id"], ascending=[False, True], subset="matched_id", keep="first") km = transform.MapSelectKeys({ # "version": None, "matched_id": "fund_id", "source_id": None, "fund_name_amac": "fund_full_name", "reg_code_amac": "reg_code", "foundation_date_amac": "foundation_date", "fund_time_limit_amac": "limit_time", "limit_date": "limit_date", "fund_status": "fund_status" }) stream = Stream(inp, (vm, cln, ac, dd, km), name="x_fund_info_security") return stream
def stream_d_fund_info_020001(fund_ids=None): sql = "\ SELECT TB_MAIN.version, im.matched_id, TB_MAIN.fund_id, TB_MAIN.source_id, fund_name, fund_full_name, reg_code, fund_status, locked_time_limit, open_date, foundation_date \ FROM d_fund_info TB_MAIN \ JOIN (SELECT fund_id, MAX(version) latest_ver, source_id FROM d_fund_info WHERE source_id = '020001' and is_used = 1 GROUP BY fund_id, source_id) as TB_LATEST \ ON TB_MAIN.version = TB_LATEST.latest_ver AND TB_MAIN.fund_id = TB_LATEST.fund_id AND TB_MAIN.source_id = TB_LATEST.source_id \ JOIN base.id_match im ON im.source_id = TB_MAIN.fund_id AND id_type = 1 AND source = '020001' AND im.is_used = 1 " if fund_ids is not None: fids = SQL.values4sql(fund_ids) sql += "WHERE im.matched_id IN {fids}".format(fids=fids) # JOIN base.id_match im ON im.source_id = TB_MAIN.fund_id AND id_type = 1 AND source = '010004' AND im.is_used = 1" inp1 = MysqlNativeInput(engine_c, sql) # inp2 = MysqlNativeInput(engine_b, "SELECT matched_id, source_id FROM id_match WHERE id_type=1 AND source='020001' AND is_used = 1") vm = transform.ValueMap({ "fund_name": lambda x: x.strip(), "fund_full_name": lambda x: x.strip(), "fund_status": { "正常": "运行中", "终止": "终止" }, }) cln1 = transform.CleanWrongToNone({ "locked_time_limit": "-|--", "open_date": "-|--" }) cln2 = transform.CleanRight({"reg_code": "\w\w\d{4}"}) # jn = transform.Join(inp2, left_on="fund_id", right_on="source_id") dd = transform.SortDropduplicate(sort_by=["version", "matched_id"], ascending=[False, True], subset="matched_id", keep="first") km = transform.MapSelectKeys({ # "version": None, "matched_id": "fund_id", "source_id_x": "source_id", "fund_name": None, "fund_full_name": None, "reg_code": None, "fund_status": None, "open_date": None, "foundation_date": None }) stream = Stream(inp1, (vm, cln1, cln2, dd, km), name="d_fund_info") return stream
def stream_d_fund_info_020002(fund_ids=None): sql = "\ SELECT TB_MAIN.version, im.matched_id, TB_MAIN.fund_id, TB_MAIN.source_id, fund_name, fund_full_name, fund_status, locked_time_limit, \ open_date, foundation_date \ FROM d_fund_info TB_MAIN \ JOIN (SELECT fund_id, MAX(version) latest_ver, source_id FROM d_fund_info WHERE source_id = '020002' and is_used = 1 GROUP BY fund_id, source_id) as TB_LATEST \ ON TB_MAIN.version = TB_LATEST.latest_ver AND TB_MAIN.fund_id = TB_LATEST.fund_id AND TB_MAIN.source_id = TB_LATEST.source_id \ JOIN base.id_match im ON im.source_id = TB_MAIN.fund_id AND id_type = 1 AND source = '020002' AND im.is_used = 1 " if fund_ids is not None: fids = SQL.values4sql(fund_ids) sql += "WHERE im.matched_id IN {fids}".format(fids=fids) inp = MysqlNativeInput(engine_c, sql) cln1 = transform.CleanWrongToNone({"open_date": "-|--|---"}) cln2 = transform.CleanRight({ "locked_time_limit": "(\d*)个月|无封闭期", }) vm = transform.ValueMap({ "fund_status": { "运行中": "运行中", "已清算": "终止", "封闭运行": "运行中", "开放运行": "运行中", "募集中": "募集发行" } }) dd = transform.SortDropduplicate(sort_by=["version", "matched_id"], ascending=[False, True], subset="matched_id", keep="first") km = transform.MapSelectKeys({ # "version": None, "matched_id": "fund_id", "fund_name": None, "fund_full_name": None, "source_id_x": "source_id", "fund_status": None, "locked_time_limit": None, "open_date": None, "foundation_date": None }) stream = Stream(inp, (cln1, cln2, vm, dd, km)) return stream
def stream_x_fund_info_010005(fund_ids=None): sql = "\ SELECT TB_MAIN.version, im.matched_id, TB_MAIN.fund_id, fund_name_amac, reg_code_amac, foundation_date_amac, issuing_scale_amac, number_clients_amac \ FROM x_fund_info_futures TB_MAIN \ JOIN (SELECT fund_id, MAX(version) latest_ver FROM x_fund_info_futures WHERE is_used = 1 GROUP BY fund_id) as TB_LATEST \ ON TB_MAIN.version = TB_LATEST.latest_ver AND TB_MAIN.fund_id = TB_LATEST.fund_id \ JOIN base.id_match im ON im.source_id = TB_MAIN.fund_id AND id_type = 1 AND source = '010005' AND im.is_used = 1 " if fund_ids is not None: fids = SQL.values4sql(fund_ids) sql += "WHERE im.matched_id IN {fids}".format(fids=fids) inp = MysqlNativeInput(engine_c, sql) vm = transform.ValueMap( OrderedDict([ ("fund_name_amac", lambda x: x.strip()), ("reg_code_amac", lambda x: x.strip()), ("number_clients_amac", lambda x: int(x)), ])) cln = transform.CleanWrongToNone({"reg_code_amac": ""}, repls=None) ac = transform.AddConst({"source_id": "010005"}) # jn = transform.Join(inp2, left_on="fund_id", right_on="source_id") dd = transform.SortDropduplicate(sort_by=["version", "matched_id"], ascending=[False, True], subset="matched_id", keep="first") km = transform.MapSelectKeys({ "matched_id": "fund_id", "source_id": None, "fund_name_amac": "fund_full_name", "reg_code_amac": "reg_code", "foundation_date_amac": "foundation_date", "issuing_scale_amac": "issuing_scale", "number_clients_amac": "number_clients", }) stream = Stream(inp, (vm, cln, ac, dd, km), name="x_fund_info_future") return stream
def stream_d_fund_info_020003(cls, fund_ids=None): sql = "\ SELECT TB_MAIN.version, im.matched_id, TB_MAIN.fund_id, TB_MAIN.source_id, fund_name, fund_full_name, fund_status, locked_time_limit, \ open_date, foundation_date \ FROM d_fund_info TB_MAIN \ JOIN (SELECT fund_id, MAX(version) latest_ver, source_id FROM d_fund_info WHERE source_id = '020003' AND is_used = 1 GROUP BY fund_id, source_id) AS TB_LATEST \ ON TB_MAIN.version = TB_LATEST.latest_ver AND TB_MAIN.fund_id = TB_LATEST.fund_id AND TB_MAIN.source_id = TB_LATEST.source_id \ JOIN base.id_match im ON im.source_id = TB_MAIN.fund_id AND id_type = 1 AND source = '020003' AND im.is_used = 1 " if fund_ids is not None: fids = SQL.values4sql(fund_ids) sql += "WHERE im.matched_id IN {fids}".format(fids=fids) inp = MysqlNativeInput(engine_c, sql) vm = transform.ValueMap({ "fund_status": { "正在运行": "运行中", "已清盘": "终止", "封闭运行": "运行中" }, "open_date": lambda x: sub_wrong_to_none(x) if type(x) is str else x }) dd = transform.SortDropduplicate(sort_by=["version", "matched_id"], ascending=[False, True], subset="matched_id", keep="first") km = transform.MapSelectKeys({ "matched_id": "fund_id", "source_id": "source_id", "fund_status": None, "open_date": None, "foundation_date": None, "fund_name": "fund_name" }) stream = Stream(inp, transform=[vm, dd, km]) return stream
def stream_010001(cls): sql = "SELECT xoi.version, im.matched_id, org_name_en, org_full_name, org_category, found_date, org_code, " \ "reg_code, reg_time, manage_type, fund_scale, property, reg_capital, real_capital, " \ "real_capital_proportion, legal_person, is_qualified, qualifying_way, address, reg_address, org_web, " \ "employee_scale, final_report_time, currency, is_member, member_type, initiation_time, " \ "law_firm_name, lawyer_name " \ "FROM crawl_private.x_org_info xoi " \ "JOIN (SELECT org_id, MAX(version) latest_ver FROM crawl_private.x_org_info GROUP BY org_id) tb_latest " \ "ON xoi.org_id = tb_latest.org_id AND xoi.version = tb_latest.latest_ver " \ "JOIN (SELECT matched_id, source_id FROM base.id_match " \ "WHERE id_type = 2 AND source = '010001' AND is_used = 1) im ON xoi.org_id = im.source_id " inp = MysqlInput(ENGINE_RD, sql) vm = transform.ValueMap(OrderedDict([ ("org_full_name", lambda x: x.strip()), (OrgInfo.org_category.name, "私募基金管理公司"), ("sourec_id", "010001"), ("org_code", lambda x: x.strip()), ("reg_code", lambda x: x.strip()), ("real_capital_proportion", lambda x: x.strip()), ("legal_person", lambda x: x.strip()), ("qualifying_way", lambda x: x.strip()), ("is_qualified", (lambda x: "是" if x in ("通过考试", "资格认定") else "否", "qualifying_way")), ("address", lambda x: x.strip()), ("reg_address", lambda x: x.strip()), ("org_web", lambda x: re.sub("http://|https://|\s", "", x)), ("employee_scale", lambda x: int(x)) ])) vm2 = transform.ValueMap(OrderedDict([ ("org_web", lambda x: x[:-1] if x[-1] == "/" else x), ])) cw = transform.CleanWrongToNone({ "org_full_name": "\s", "org_name_en": "\s", "manage_type": "\s", "org_web": "\s" }) # 如果一个ID出现多次, 保留最新版本数据 sdd = transform.SortDropduplicate(sort_by=["version"], ascending=[False], subset=["matched_id"]) sk = transform.MapSelectKeys({ "matched_id": OrgInfo.org_id.name, "source_id": None, "org_name_en": OrgInfo.org_name_en.name, "org_full_name": OrgInfo.org_full_name.name, "org_category": OrgInfo.org_category.name, "found_date": OrgInfo.found_date.name, "org_code": OrgInfo.org_code.name, "reg_code": OrgInfo.reg_code.name, "reg_time": OrgInfo.reg_time.name, "manage_type": OrgInfo.manage_type.name, "fund_scale": OrgInfo.asset_mgt_scale.name, "property": OrgInfo.property.name, "reg_capital": OrgInfo.reg_capital.name, "real_capital": OrgInfo.real_capital.name, "real_capital_proportion": OrgInfo.real_capital_proportion.name, "legal_person": OrgInfo.legal_person.name, "is_qualified": OrgInfo.is_qualified.name, "qualifying_way": OrgInfo.qualifying_way.name, "address": OrgInfo.address.name, "reg_address": OrgInfo.reg_address.name, "org_web": OrgInfo.org_web.name, "employee_scale": OrgInfo.employee_scale.name, "final_report_time": OrgInfo.final_report_time.name, "currency": OrgInfo.currency.name, "is_member": OrgInfo.is_member.name, "member_type": OrgInfo.member_type.name, "initiation_time": OrgInfo.initiation_time.name, "law_firm_name": OrgInfo.law_firm_name.name, "lawyer_name": OrgInfo.lawyer_name.name, }) s = Stream(inp, transform=[vm, vm2, cw, sdd, sk, ]) return s
def stream_x_fund_info_010002(fund_ids=None): sql = "\ SELECT TB_MAIN.version, im.matched_id, TB_MAIN.fund_id, fund_name_amac, reg_code_amac, reg_time_amac, fund_time_limit_amac, issuing_scale_amac, number_clients_amac \ FROM x_fund_info_fundaccount TB_MAIN \ JOIN (SELECT fund_id, MAX(version) latest_ver FROM x_fund_info_fundaccount WHERE is_used = 1 GROUP BY fund_id) as TB_LATEST \ ON TB_MAIN.version = TB_LATEST.latest_ver AND TB_MAIN.fund_id = TB_LATEST.fund_id \ JOIN base.id_match im ON im.source_id = TB_MAIN.fund_id AND id_type = 1 AND source = '010002' AND im.is_used = 1 " if fund_ids is not None: fids = SQL.values4sql(fund_ids) sql += "WHERE im.matched_id IN {fids}".format(fids=fids) inp = MysqlNativeInput(engine_c, sql) def clean_fund_status(reg_time, limit_time): from dateutil.relativedelta import relativedelta now = dt.date.today() if reg_time is not None and limit_time is not None and limit_time > 0: if limit_time > 1200: # 有些无限期的limit_time会被标记为9999月 return "运行中" limit_date = reg_time + relativedelta(months=int(limit_time)) if limit_date <= now: return "终止" else: return "运行中" return None vm = transform.ValueMap({ "fund_name_amac": lambda x: x.strip(), "reg_code_amac": lambda x: x.strip(), "fund_time_limit_amac": lambda x: int(x), "issuing_scale_amac": lambda x: x * 1e4 if x is not None else None, "number_clients_amac": lambda x: int(x), }) vm2 = transform.ValueMap({ "fund_status": (lambda x, y: clean_fund_status(x, y), "reg_time_amac", "fund_time_limit_amac") }) cln = transform.CleanWrongToNone({"reg_code_amac": ""}, repls=None) # jn = transform.Join( # inp2, left_on="fund_id", right_on="source_id" # ) ac = transform.AddConst({"source_id": "010002"}) dd = transform.SortDropduplicate(sort_by=["version", "matched_id"], ascending=[False, True], subset="matched_id", keep="first") km = transform.MapSelectKeys({ # "version": None, "matched_id": "fund_id", "source_id": None, "reg_code_amac": "reg_code", "reg_time_amac": "reg_time", "fund_status": "fund_status", "fund_time_limit_amac": "limit_time", "issuing_scale_amac": "issuing_scale", "number_clients_amac": "number_clients", "fund_name_amac": "fund_full_name", }) stream = Stream(inp, (vm, vm2, cln, dd, ac, km), name="x_fund_info_account") return stream
def stream_x_fund_info_010003(fund_ids=None): def clean_currency(x): patt = "(?P<currency>人民币|美元|多币种|欧元|港元|澳元|其他|其它|日元|英镑)?(?P<type>现钞|现汇)?" cpatt = re.compile(patt) sre = cpatt.search(x) if sre is not None: return sre.groupdict() return {} sql = "\ SELECT TB_MAIN.version, im.matched_id, TB_MAIN.fund_id, fund_name_amac, reg_code_amac, reg_time_amac, foundation_date_amac, currency_name_amac, \ fund_status_amac, final_report_time_amac \ FROM x_fund_info_private TB_MAIN \ JOIN (SELECT fund_id, MAX(version) latest_ver FROM x_fund_info_private WHERE is_used = 1 GROUP BY fund_id) as TB_LATEST \ ON TB_MAIN.version = TB_LATEST.latest_ver AND TB_MAIN.fund_id = TB_LATEST.fund_id \ JOIN base.id_match im ON im.source_id = TB_MAIN.fund_id AND id_type = 1 AND source = '010003' AND im.is_used = 1 " if fund_ids is not None: fids = SQL.values4sql(fund_ids) sql += "WHERE im.matched_id IN {fids}".format(fids=fids) inp = MysqlNativeInput(engine_c, sql) vm = transform.ValueMap( OrderedDict([ ("fund_name_amac", lambda x: x.strip()), ("reg_code_amac", lambda x: x.strip()), ("is_abnormal_liquidation", (lambda x: { "延期清算": 1, "提前清算": 1, "投顾协议已终止": 1, "正常清算": 0 }.get(x), "fund_status_amac")), ("liquidation_cause", (lambda x: { "延期清算": "延期清算", "提前清算": "提前清算", "投顾协议已终止": "投顾协议已终止" }.get(x), "fund_status_amac")), ("fund_status_amac", { "正常清算": "终止", "正在运作": "运行中", "延期清算": "终止", "提前清算": "终止", "投顾协议已终止": "终止" }), ("currency_type", (lambda x: clean_currency(x).get("type"), "currency_name_amac")), ("currency_name_amac", lambda x: clean_currency(x).get("currency")), ])) cln = transform.CleanWrongToNone({"reg_code_amac": ""}, repls=None) # jn = transform.Join( # inp2, left_on="fund_id", right_on="source_id" # ) dd = transform.SortDropduplicate(sort_by=["version", "matched_id"], ascending=[False, True], subset="matched_id", keep="first") ac = transform.AddConst({"source_id": "010003"}) km = transform.MapSelectKeys({ # "version": None, "matched_id": "fund_id", "source_id": None, "fund_name_amac": "fund_full_name", "reg_code_amac": "reg_code", "reg_time_amac": "reg_time", "foundation_date_amac": "foundation_date", "currency_name_amac": "currency", "currency_type": None, "fund_status_amac": "fund_status", "is_abnormal_liquidation": None, "liquidation_cause": None }) stream = Stream(inp, (vm, cln, dd, ac, km), name="x_fund_info_private") return stream