def stream_consultant_010004(cls, d_xoi=None): """ 清洗010004源(证券公司)发行产品的投顾公司配对信息; Args: d_xoi: Returns: """ sql_maxver = "SELECT im.matched_id, xfi.fund_issue_org_amac, manage_type_amac, fi.fund_name, fi.foundation_date, fi.end_date, fi.fund_status " \ "FROM x_fund_info_securities xfi " \ "JOIN (SELECT fund_id, MAX(version) latest_ver FROM x_fund_info_securities GROUP BY fund_id) tb_latest " \ "ON xfi.version = tb_latest.latest_ver AND xfi.fund_id = tb_latest.fund_id " \ "JOIN base.id_match im ON im.source_id = xfi.fund_id AND im.id_type = 1 AND im.source = '010004' AND im.is_used = 1 " \ "LEFT JOIN base.fund_info fi ON im.matched_id = fi.fund_id AND im.id_type = 1 AND im.source = '010004'" \ "WHERE fund_name_amac NOT LIKE '%%信托计划'" inp = MysqlInput(ENGINE_C, sql_maxver) inp2 = MysqlInput(ENGINE_B, "SELECT org_id, org_name FROM base.org_info") if d_xoi is None: d_xoi = cls._d_oi_consultant() vm = transform.ValueMap( OrderedDict([ ("fund_issue_org_amac", lambda x: x.strip()), ("org_id", (lambda x: d_xoi.get(x), "fund_issue_org_amac")), (FundOrgMapping.org_type.name, (lambda x: {"主动管理": "投资顾问"}.get(x), "manage_type_amac")), (FundOrgMapping.is_current.name, (lambda x: cls._clean_iscurrent(x), "fund_status")) ]) ) jn = transform.Join(inp2, how="left", on="org_id") dn = transform.Dropna(subset=["matched_id", "org_id", FundOrgMapping.org_type.name]) vm2 = transform.ValueMap({ FundOrgMapping.org_type_code.name: (lambda x: 1 if x == "投资顾问" else None, FundOrgMapping.org_type.name), }) dd = transform.DropDuplicate(subset=["matched_id", "org_id", FundOrgMapping.org_type.name]) sk = transform.MapSelectKeys(cls.FIELDS) s = Stream(inp, transform=[vm, jn, dn, vm2, dd, sk]) # 先去空, 避免空值产生, 导致整形列变成浮点型 return s
def stream_2(cls): """ 清洗 d_org_info; """ sql = "select INBBM,IGSDM,GPDM from usrZQZB" inp = pd.read_sql(sql, engine2) sk = transform.MapSelectKeys({ "INBBM": "INBBM", 'IGSDM': 'IGSDM', 'GPDM': 'ZQDM' }) s = Stream(inp, transform=[sk]) return s
def stream_d_fund_info_020001(): def clean_regcode(x): patt = "\D\w\d{4}" sre = re.search(patt, x) if sre is not None: return sre.group() return None def clean_locktime(x): patt = "\D\w\d{4}" sre = re.search(patt, x) if sre is not None: return sre.group() return None inp = MysqlInput(engine_c, "\ SELECT TB_MAIN.version, TB_MAIN.fund_id, TB_MAIN.source_id, fund_name, fund_full_name, reg_code, fund_status, locked_time_limit \ open_date, \ FROM d_fund_info TB_MAIN \ JOIN (SELECT fund_id, MAX(version) latest_ver, source_id FROM d_fund_info WHERE source_id = '020001' and is_used = 1 GROUP BY fund_id, source_id) as TB_LATEST \ ON TB_MAIN.version = TB_LATEST.latest_ver AND TB_MAIN.fund_id = TB_LATEST.fund_id AND TB_MAIN.source_id = TB_LATEST.source_id") vm = transform.ValueMap( { "fund_name": lambda x: remove_blank(x), "fund_full_name": lambda x: remove_blank(x), "reg_code": lambda x: lambda x: clean_regcode(x), "locked_time_limit": lambda x: transform.CleanWrong("-|--"), "fund_status": {"正常": "运行中", "终止": "终止"} } ) km = transform.MapSelectKeys( { # "version": None, "fund_id": "source_id", "fund_name": None, "fund_full_name": None, "reg_code": None, "fund_status": None, "issuing_scale_amac": "issuing_scale", "number_clients_amac": "number_clients", } ) stream = Stream(inp.frame, ()) return stream
def stream_010001(): session = dbsession(bind=engine_r) stmt = session.query(DBondInfo).join( BondInfo, DBondInfo.bond_id == BondInfo.bond_id).filter( DBondInfo.source_id == "010001").with_entities( DBondInfo.bond_id, DBondInfo.source_id, DBondInfo.bond_full_name, DBondInfo.interest_type, DBondInfo.interest_freq) inp = MysqlInput(session.bind, stmt) def clean_interest_freq(type_, freq, fname): if fname.find("贴现") >= 0: return "贴现" if type_ in {"零息", "贴现"}: return "贴现" elif type_ == "一次还本息": return "到期一次还本付息" else: d_freq = { "到期一次还本付息": "到期一次还本付息", # 除去type_为"零息"的情况, 其余freq为"到期一次还本付息"映射为"到期一次还本付息" "按年付息": "年付息", "按半年付息": "半年付息", "按季度付息": "季度付息", "按月付息": "月付息", } return d_freq.get(freq) vm = transform.ValueMap({ BondInfo.interest_freq.name: (lambda type_, freq, fname: clean_interest_freq(type_, freq, fname), DBondInfo.interest_type.name, DBondInfo.interest_freq.name, DBondInfo.bond_full_name.name) }) dn = transform.Dropna(subset=[BondInfo.interest_freq.name]) km = transform.MapSelectKeys({ DBondInfo.bond_id.name: BondInfo.bond_id.name, BondInfo.interest_freq.name: None, }) s = Stream(inp, (vm, dn, km)) return s
def stream_000001(cls): sql = "SELECT fund_id FROM base.fund_info" inp = MysqlNativeInput(engine_base, sql) vm = transform.ValueMap({ "region": (lambda x: region.get(x), "fund_id"), }) km = transform.MapSelectKeys( { "fund_id": "fund_id", "region": "region" } ) stream = Stream(inp, transform=[vm, km]) return stream
def stream_000001(cls): sql = "SELECT fund_id FROM base.fund_info" inp = MysqlNativeInput(engine_base, sql) vm = transform.ValueMap({ "init_nav": (lambda x: init_nav.get(x), "fund_id"), }) vm2 = transform.ValueMap( {"init_nav": lambda x: 1 if np.isnan(x) else x}) km = transform.MapSelectKeys({ "fund_id": "fund_id", "init_nav": "init_nav" }) stream = Stream(inp, transform=[vm, vm2, km]) return stream
def stream_020003(cls): """ 清洗 fund_custodian; """ sql = "SELECT idh.matched_id, ff.fund_custodian, fi.fund_name FROM ( \ SELECT matched_id,source_id FROM base_public.id_match \ where id_type = 1 AND is_used = 1 AND data_source = '020003') as idh \ JOIN \ (SELECT * FROM ( \ SELECT MAX(version) as mm, fund_id as id FROM crawl_public.d_fund_info GROUP BY fund_id) as idm \ JOIN crawl_public.d_fund_info as df \ ON idm.id = df.fund_id AND idm.mm = df.version \ WHERE df.data_source = '020003') AS ff \ ON ff.id = idh.matched_id \ JOIN base_public.fund_info as fi \ ON fi.fund_id = idh.matched_id \ where idh.matched_id <> '777777'" inp = MysqlInput(ENGINE_RD, sql) vm = transform.ValueMap({ "org_id": (lambda x: cls.org_full_name.get(x), "fund_custodian"), }) vm2 = transform.ValueMap({ "type_code": (lambda x: cls.type_code.get(x), "org_id"), }) vm3 = transform.ValueMap({ "type_name": (lambda x: cls.type_name.get(x), "type_code"), }) dr = transform.Dropna(axis=0, how="any") sk = transform.MapSelectKeys({ "matched_id": "fund_id", "org_id": "org_id", "fund_custodian": "org_name", "fund_name": "fund_name", "type_code": "type_code", "type_name": "type_name" }) s = Stream(inp, transform=[vm, vm2, vm3, dr, sk]) return s
def dfundnv2src(fund_id=None, source_id=None, update_time=None): def cal_added_by_nv_and_adjnv(added_nv, nv, adj_nv, sid): if sid != "020003": if added_nv != 0: return added_nv else: return None else: if nv == adj_nv: return nv else: return None session = dbsession(bind=engine_r) stmt = session.query(DFundNv).join( IdMatch, and_(IdMatch.source_id == DFundNv.fund_id, IdMatch.source == DFundNv.source_id, IdMatch.id_type == 1, IdMatch.is_used == 1) ).filter( DFundNv.is_used == 1, DFundNv.source_id == source_id, DFundNv.statistic_date > MIN_DATE, DFundNv.nav < 2000 ).with_entities( IdMatch.matched_id, DFundNv.fund_id, DFundNv.statistic_date, DFundNv.source_id, DFundNv.nav, DFundNv.added_nav, DFundNv.adjusted_nav ) if update_time is not None: stmt = stmt.filter(DFundNv.update_time >= update_time) if fund_id is not None: stmt = stmt.filter(DFundNv.fund_id == fund_id) inp = MysqlInput(session.bind, stmt) vm = transform.ValueMap({ # DFundNv.added_nav.name: lambda x: None if x == 0 else x DFundNv.added_nav.name: ( lambda added_nv, nv, adj_nv, sid: cal_added_by_nv_and_adjnv(added_nv, nv, adj_nv, sid), DFundNv.added_nav.name, DFundNv.nav.name, DFundNv.adjusted_nav.name, DFundNv.source_id.name ) }) km = transform.MapSelectKeys({ IdMatch.matched_id.name: FundNvDataSource.fund_id.name, DFundNv.statistic_date.name: FundNvDataSource.statistic_date.name, DFundNv.source_id.name: FundNvDataSource.source_id.name, DFundNv.nav.name: FundNvDataSource.nav.name, DFundNv.added_nav.name: FundNvDataSource.added_nav.name, DFundNv.adjusted_nav.name: FundNvDataSource.adjusted_nav.name, }) s = Stream(inp, (vm, km)) return s
def stream_main(cls, fund_ids=None): c = cls.conflu1(fund_ids) fn = transform.FillnaByColumn({ "foundation_date": "reg_time", "fund_name": "fund_full_name" }) vm = transform.ValueMap({ "fund_status": lambda x: "运行中" if type(x) is not str else x, "is_umbrella_fund": (lambda x: 1 if type(x) is str and ('伞型' or '伞形') in x else 0, "fund_full_name"), "limit_time": lambda x: "无限期" if type(x) is float and x > 900 else x, "currency": lambda x: "人民币" if type(x) is not str else x, }) sk = transform.MapSelectKeys({ "source_id": "source_id", "limit_date": "limit_date", "issuing_scale": "issuing_scale", "fund_name": "fund_name", "foundation_date": "foundation_date", "fund_full_name": "fund_full_name", "open_date": "open_date", "number_clients": "number_clients", "currency": "currency", "reg_code": "reg_code", "limit_time": "limit_time", "end_date": "end_date", "currency_type": "currency_type", "is_abnormal_liquidation": "is_abnormal_liquidation", "fund_id": "fund_id", "reg_time": "reg_time", "fund_status": "fund_status", "locked_time_limit": "locked_time_limit", "liquidation_cause": "liquidation_cause", "is_umbrella_fund": "is_umbrella_fund" }) s = Stream(c, transform=([fn, vm, sk])) return s
def stream_010001(cls): """ 清洗 x_org_executive_info; """ sql = "SELECT im.matched_id, `name`, qualifying_way FROM crawl_private.x_org_executive_info tb_main " \ "JOIN (SELECT person_id, MAX(version) latest_ver FROM crawl_private.x_org_executive_info GROUP BY " \ "person_id) tb_latest " \ "ON tb_main.version = tb_latest.latest_ver AND tb_main.person_id = tb_latest.person_id " \ "JOIN base.id_match im ON im.source_id = tb_main.person_id " \ "AND im.id_type = 3 AND im.source = '010001' AND im.is_used = 1 " inp = MysqlInput(ENGINE_RD, sql) vm = transform.ValueMap({ "name": (lambda x: re.sub("(.*)", "", x), "name"), "qualifying_way": lambda x: { "通过考试": "通过考试", "资格认定": "资格认定" }.get(x) if type(x) is str else x, "person_name_py": (lambda x: "".join( [x[0] for x in py(x, style=py_style.FIRST_LETTER)]).upper(), "name"), }) vm2 = transform.ValueMap({ "is_fund_qualification": (lambda x: int(bool(x)), "qualifying_way") }) sk = transform.MapSelectKeys({ "matched_id": "person_id", 'qualifying_way': 'fund_qualification_way', 'name': 'person_name', "person_name_py": "person_name_py", "is_fund_qualification": "is_fund_qualification" }) s = Stream(inp, transform=[vm, vm2, sk]) return s
def conflu(cls): s01, s11 = cls.stream_000001(), cls.stream_010001() p = { 0: { "org_full_name": ("source_id", "010001"), }, 1: { "org_full_name": ("source_id", "000001"), }, } c = Confluence(s01, s11, on=[OrgInfo.org_id.name], prio_l1=p) dk = transform.DropKeys(["source_id"]) s = Stream(c, transform=[dk]) return Confluence(s)
def fund_info_subsidiary(cls): """ fund_info_subsidiary 映射; """ sql = " \ SELECT idm.matched_id, fis.risk_income_character,fis.investment_restriction, fis.investment_target, \ fis.income_distribution FROM ( \ SELECT DISTINCT matched_id FROM base.id_match WHERE id_type = 1 and is_used =1 \ ) as idm \ JOIN \ (SELECT fund_id, investment_restriction, investment_target, risk_income_character, \ income_distribution FROM base.fund_info_subsidiary) as fis \ ON fis.fund_id = idm.matched_id" inp = MysqlInput(ENGINE_RD, sql) vm = transform.ValueMap({ 'income_distribution': lambda x: cls.sub_wrong_to_none(x) if type(x) is str else x, 'investment_restriction': lambda x: cls.sub_wrong_to_none(x) if type(x) is str else x, 'investment_target': lambda x: cls.sub_wrong_to_none(x) if type(x) is str else x, 'risk_income_character': lambda x: cls.sub_wrong_to_none(x) if type(x) is str else x, }) sk = transform.MapSelectKeys({ "matched_id": "fund_id", "income_distribution": "income_distribution", 'investment_restriction': 'investment_restriction', 'investment_target': 'investment_target', 'risk_income_character': 'risk_income_character' }) s = Stream(inp, transform=[vm, sk]) return s
def stream_x_fund_info_010005(fund_ids=None): sql = "\ SELECT TB_MAIN.version, im.matched_id, TB_MAIN.fund_id, fund_name_amac, reg_code_amac, foundation_date_amac, issuing_scale_amac, number_clients_amac \ FROM x_fund_info_futures TB_MAIN \ JOIN (SELECT fund_id, MAX(version) latest_ver FROM x_fund_info_futures WHERE is_used = 1 GROUP BY fund_id) as TB_LATEST \ ON TB_MAIN.version = TB_LATEST.latest_ver AND TB_MAIN.fund_id = TB_LATEST.fund_id \ JOIN base.id_match im ON im.source_id = TB_MAIN.fund_id AND id_type = 1 AND source = '010005' AND im.is_used = 1 " if fund_ids is not None: fids = SQL.values4sql(fund_ids) sql += "WHERE im.matched_id IN {fids}".format(fids=fids) inp = MysqlNativeInput(engine_c, sql) vm = transform.ValueMap( OrderedDict([ ("fund_name_amac", lambda x: x.strip()), ("reg_code_amac", lambda x: x.strip()), ("number_clients_amac", lambda x: int(x)), ])) cln = transform.CleanWrongToNone({"reg_code_amac": ""}, repls=None) ac = transform.AddConst({"source_id": "010005"}) # jn = transform.Join(inp2, left_on="fund_id", right_on="source_id") dd = transform.SortDropduplicate(sort_by=["version", "matched_id"], ascending=[False, True], subset="matched_id", keep="first") km = transform.MapSelectKeys({ "matched_id": "fund_id", "source_id": None, "fund_name_amac": "fund_full_name", "reg_code_amac": "reg_code", "foundation_date_amac": "foundation_date", "issuing_scale_amac": "issuing_scale", "number_clients_amac": "number_clients", }) stream = Stream(inp, (vm, cln, ac, dd, km), name="x_fund_info_future") return stream
def y_org_description(cls): """ 清洗 y_org_description; """ sql = "SELECT org_id,profile,team,investment_idea,prize,major_shareholder,shareholder_structure FROM crawl_private.`y_org_description`" inp = MysqlInput(ENGINE_RD, sql) vm = transform.ValueMap({ "investment_idea": lambda x: sub_wrong_to_none(x), "profile": lambda x: sub_wrong_to_none(x), "team": lambda x: sub_wrong_to_none(x), "prize": lambda x: sub_wrong_to_none(x), "major_shareholder": lambda x: sub_wrong_to_none(x), "shareholder_structure": lambda x: sub_wrong_to_none(x), }) sk = transform.MapSelectKeys({ 'org_id': 'org_id', 'investment_idea': 'investment_idea', 'profile': 'profile', 'team': 'team', 'prize': 'prize', 'major_shareholder': 'major_shareholder', 'shareholder_structure': 'shareholder_structure' }) s = Stream(inp, transform=[vm, sk]) return s
def stream_010001(): session = dbsession(bind=engine_r) stmt = session.query(DBondInfo).join( BondInfo, DBondInfo.bond_id == BondInfo.bond_id).filter( and_(DBondInfo.source_id == "010001", DBondInfo.is_used == 1)).with_entities( DBondInfo.bond_id, DBondInfo.source_id, DBondInfo.bond_full_name, DBondInfo.interest_type) inp = MysqlInput(session.bind, stmt) def clean_interest_type(type_, fname): if fname.find("贴现") >= 0: return "零息" else: d_type = { "零息": "零息", "固定利息": "固定利率", "浮动利息": "浮动利率", } res = d_type.get(type_) if res is None: if fname.find("短期融资券") >= 0: return "固定利率" return res vm = transform.ValueMap({ BondInfo.interest_type.name: (lambda type_, fname: clean_interest_type(type_, fname), DBondInfo.interest_type.name, DBondInfo.bond_full_name.name) }) dn = transform.Dropna(subset=[BondInfo.interest_type.name]) km = transform.MapSelectKeys({ DBondInfo.bond_id.name: BondInfo.bond_id.name, BondInfo.interest_type.name: None }) s = Stream(inp, (vm, dn, km)) return s
def stream_020003(cls): """ 清洗公募数米 fund_info; """ sql = "select * FROM (SELECT fund_id,fund_name,fund_full_name,data_source, \ foundation_date,fund_status, purchase_status,redemption_status, \ aip_status, recommendation_start,recommendation_end,init_raise \ FROM crawl_public.d_fund_info \ WHERE data_source = '020003' ORDER BY version DESC ) AS T \ GROUP BY T.fund_id" inp = MysqlInput(ENGINE_RD, sql) vm = transform.ValueMap({ "fund_status": lambda x: sub_wrong_to_none(x), "purchase_status": lambda x: cls.PURCHASE_STATUS_020003.get(x), "init_raise": lambda x: sub_wrong_to_none(x) }) vm2 = transform.ValueMap( {"init_raise": lambda x: float(re.sub("亿", "", x))}) sk = transform.MapSelectKeys({ "data_source": "source_id", 'fund_id': 'fund_id', 'fund_name': 'fund_name', 'fund_full_name': 'fund_full_name', 'foundation_date': 'foundation_date', 'fund_status': 'fund_status', 'purchase_status': 'purchase_status', 'redemption_status': 'redemption_status', 'aip_status': 'aip_status', 'recommendation_start': 'recommendation_start', 'recommendation_end': 'recommendation_end', 'init_raise': 'init_raise', }) s = Stream(inp, transform=[vm, vm2, sk]) return s
def stream_manager_010005(cls, d_xoi=None): """ 清洗010005源(期货公司)发行产品的基金管理人配对信息; Args: d_xoi: Returns: """ sql_maxver = "SELECT im.matched_id, xfi.fund_issue_org_amac, fi.fund_name, fi.foundation_date, fi.end_date, fi.fund_status " \ "FROM x_fund_info_futures xfi " \ "JOIN (SELECT fund_id, MAX(version) latest_ver FROM x_fund_info_futures GROUP BY fund_id) tb_latest " \ "ON xfi.version = tb_latest.latest_ver AND xfi.fund_id = tb_latest.fund_id " \ "JOIN base.id_match im ON im.source_id = xfi.fund_id AND im.id_type = 1 AND im.source = '010005' AND im.is_used = 1 " \ "LEFT JOIN base.fund_info fi " \ "ON im.matched_id = fi.fund_id AND im.id_type = 1 AND im.source = '010005' " \ "WHERE fund_name_amac NOT LIKE '%%信托计划'" inp = MysqlInput(ENGINE_C, sql_maxver) inp2 = MysqlInput(ENGINE_B, "SELECT org_id, org_name FROM base.org_info") if d_xoi is None: d_xoi = cls._d_oi_manager() vm = transform.ValueMap(OrderedDict([ ("fund_issue_org_amac", lambda x: x.strip()), ("org_id", (lambda x: d_xoi.get(x), "fund_issue_org_amac")), (FundOrgMapping.org_type.name, "基金管理人"), # 该来源的fund_issue_org_amac都是基金管理人; (FundOrgMapping.org_type_code.name, 2), (FundOrgMapping.is_current.name, (lambda x: cls._clean_iscurrent(x), "fund_status")) ])) jn = transform.Join(inp2, how="left", on="org_id") sk = transform.MapSelectKeys(cls.FIELDS) dn = transform.Dropna( subset=[FundOrgMapping.fund_id.name, FundOrgMapping.org_id.name, FundOrgMapping.org_type_code.name]) s = Stream(inp, transform=(vm, jn, sk, dn,)) return s
def end_date(cls): """ 清洗 fund_manager_mapping; """ sql = "SELECT idm.matched_id, fmg.end_date FROM (SELECT DISTINCT matched_id FROM base.id_match) as idm \ JOIN base.fund_manager_mapping AS fmg ON idm.matched_id = fmg.fund_id \ WHERE end_date> '{time}' GROUP BY fund_id".format( time=cls.datetime) inp = MysqlInput(ENGINE_RD, sql) sk = transform.MapSelectKeys({ "end_date": "end_date", 'matched_id': 'fund_id' }) s = Stream(inp, transform=[sk]) return s
def stream_020001(cls): """ 清洗d_fund_se(020001) """ sql = "SELECT * FROM \ (SELECT * from ( \ SELECT fund_id,fund_name,source_id,stock_id,stock_name,statistic_date,stock_ratio,stock_sum, \ stock_change,variable_rate,change_num from crawl_private.d_fund_security \ where source_id = '020001' ORDER BY version DESC) AS T \ GROUP BY T.fund_id,T.stock_id,T.statistic_date) as dfs \ LEFT JOIN \ (SELECT matched_id,source_id as source_id2 from base.id_match where id_type=1 and source = '020001') as i \ ON dfs.fund_id = i.source_id2 \ WHERE i.matched_id is not NULL" inp = MysqlInput(ENGINE_RD, sql) vm = transform.ValueMap({ 'security_category': '股票', 'source': '第三方', 'source_id': '020001' }) sk = transform.MapSelectKeys({ 'statistic_date': 'statistic_date', 'matched_id': 'fund_id', 'fund_name': 'fund_name', 'source_id': 'source_id', 'security_category': 'security_category', 'stock_id': 'id', 'stock_name': 'name', 'stock_sum': 'sum', 'stock_ratio': 'ratio', 'value': 'value', 'stock_change': 'change', 'variable_rate': 'change_rate', 'source': 'source' }) s = Stream(inp, transform=[vm, sk]) return s
def stream_010101(cls): sql = "SELECT im.matched_id as person_id, pi.person_name, xpc.status, xpc.org_name as org_name_ " \ "FROM crawl_private.x_person_certificate_010101 xpc " \ "JOIN base.id_match im ON xpc.person_id = im.source_id " \ "LEFT JOIN base.person_info pi ON im.matched_id = pi.person_id " \ "WHERE im.source = '010101' AND im.id_type = 3 AND im.is_used = 1 " \ "AND xpc.org_name not like '空'" def org_name_dict(): import pandas as pd sql_oi = "SELECT org_full_name, org_id, org_name FROM base.org_info" res = pd.read_sql(sql_oi, ENGINE) v = tuple([(oid, oname) for oid, oname in zip(res["org_id"], res["org_name"])]) res = dict(zip(res["org_full_name"], v)) return res inp = MysqlInput(ENGINE, sql) d = org_name_dict() vm = transform.ValueMap({ "org_id": (lambda x: d.get(x)[0], "org_name_"), "org_name": (lambda x: d.get(x)[1], "org_name_"), "is_current": (lambda x: {"正常": "1", "离职": "0"}.get(x), "status"), "duty": None }) sk = transform.MapSelectKeys({ "person_id": "person_id", "person_name": "person_name", "org_id": "org_id", "org_name": "org_name", "is_current": "is_current", "duty": "duty" }) dn = transform.Dropna( subset=["person_id", "org_id"] ) s = Stream(inp, [vm, sk, dn]) return s
def stream_020001(cls): """ 清洗 d_org_shareholder; """ sql = " \ SELECT idm.matched_id, doi.data_source, doi.statistic_date, doi.shareholder_name, \ doi.shareholder_num, doi.capital_stock, doi.stock_held, doi.proportion_held \ FROM (SELECT matched_id,source_id from base_public.id_match \ WHERE data_source = '020001' AND is_used = 1 and id_type = 2) as idm \ JOIN crawl_public.d_org_shareholder AS doi \ ON doi.org_id = idm.source_id" inp = MysqlInput(ENGINE_RD, sql) vm = transform.ValueMap({ "capital_stock": lambda x: cls.sub_wrong_to_none(x) if type(x) is str else x, "stock_held": lambda x: cls.sub_wrong_to_none(x) if type(x) is str else x, "proportion_held": lambda x: cls.sub_wrong_to_none(x) if type(x) is str else x, }) vm2 = transform.ValueMap({ "capital_stock": lambda x: float(x) if type(x) is str else x, "stock_held": lambda x: float(x) if type(x) is str else x, "proportion_held": lambda x: round(float(x)/100, 6) if type(x) is str else x, }) sk = transform.MapSelectKeys({ "matched_id": "org_id", 'data_source': 'data_source', 'statistic_date': 'statistic_date', 'capital_stock': 'capital_stock', 'stock_held': 'stock_held', 'proportion_held': 'proportion_held', 'shareholder_num': 'shareholder_num', 'shareholder_name': 'shareholder_name' }) s = Stream(inp, transform=[vm, vm2, sk]) return s
def stream_manager(cls): """ 清洗 fund_manager_mapping; """ sql = 'SELECT fi.fund_id, GROUP_CONCAT(fmm.person_name SEPARATOR ",") as person_name \ FROM base_public.fund_info fi \ JOIN base_public.fund_manager_mapping fmm ON fi.fund_id = fmm.fund_id \ WHERE fmm.is_current = 1 \ GROUP BY fund_id' inp = MysqlInput(ENGINE_RD, sql) sk = transform.MapSelectKeys({ "person_name": "fund_manager", 'fund_id': 'fund_id' }) s = Stream(inp, transform=[sk]) return s
def stream_manager(cls): """ 清洗 fund_manager_mapping; """ sql = 'SELECT fi.matched_id, GROUP_CONCAT(fmm.person_name SEPARATOR ",") as person_name \ FROM (SELECT DISTINCT matched_id FROM base.id_match WHERE id_type = 1 AND is_used = 1) fi \ JOIN base.fund_manager_mapping fmm ON fi.matched_id = fmm.fund_id \ WHERE fmm.is_current is NULL or fmm.is_current > 0 \ GROUP BY fund_id ' inp = MysqlInput(ENGINE_RD, sql) sk = transform.MapSelectKeys({ "person_name": "fund_member", 'matched_id': 'fund_id' }) s = Stream(inp, transform=[sk]) return s
def stream_d_fund_info_020003(cls, fund_ids=None): sql = "\ SELECT TB_MAIN.version, im.matched_id, TB_MAIN.fund_id, TB_MAIN.source_id, fund_name, fund_full_name, fund_status, locked_time_limit, \ open_date, foundation_date \ FROM d_fund_info TB_MAIN \ JOIN (SELECT fund_id, MAX(version) latest_ver, source_id FROM d_fund_info WHERE source_id = '020003' AND is_used = 1 GROUP BY fund_id, source_id) AS TB_LATEST \ ON TB_MAIN.version = TB_LATEST.latest_ver AND TB_MAIN.fund_id = TB_LATEST.fund_id AND TB_MAIN.source_id = TB_LATEST.source_id \ JOIN base.id_match im ON im.source_id = TB_MAIN.fund_id AND id_type = 1 AND source = '020003' AND im.is_used = 1 " if fund_ids is not None: fids = SQL.values4sql(fund_ids) sql += "WHERE im.matched_id IN {fids}".format(fids=fids) inp = MysqlNativeInput(engine_c, sql) vm = transform.ValueMap({ "fund_status": { "正在运行": "运行中", "已清盘": "终止", "封闭运行": "运行中" }, "open_date": lambda x: sub_wrong_to_none(x) if type(x) is str else x }) dd = transform.SortDropduplicate(sort_by=["version", "matched_id"], ascending=[False, True], subset="matched_id", keep="first") km = transform.MapSelectKeys({ "matched_id": "fund_id", "source_id": "source_id", "fund_status": None, "open_date": None, "foundation_date": None, "fund_name": "fund_name" }) stream = Stream(inp, transform=[vm, dd, km]) return stream
def stream_x_fund_info_security(): def clean_date(x): try: return dt.datetime.strptime(x, "%Y-%m-%d").date() except: return None inp = MysqlInput(engine_c, "\ SELECT TB_MAIN.version, TB_MAIN.fund_id, fund_name_amac, reg_code_amac, foundation_date_amac, fund_time_limit_amac \ FROM x_fund_info_securities TB_MAIN \ JOIN (SELECT fund_id, MAX(version) latest_ver FROM x_fund_info_securities WHERE is_used = 1 GROUP BY fund_id) as TB_LATEST \ ON TB_MAIN.version = TB_LATEST.latest_ver AND TB_MAIN.fund_id = TB_LATEST.fund_id") vm = transform.ValueMap( OrderedDict( [ ("fund_name_amac", lambda x: remove_blank(x)), ("reg_code_amac", lambda x: remove_blank(x)), ("limit_date", (lambda x: clean_date(x), "fund_time_limit_amac")), ("fund_time_limit_amac", lambda x: x if x == "无期限" else None) ] ) ) km = transform.MapSelectKeys( { # "version": None, "fund_id": "source_id", "fund_name_amac": "fund_full_name", "reg_code_amac": "reg_code", "foundation_date_amac": "foundation_date", "fund_time_limit_amac": "limit_time", "limit_date": "limit_date", "fund_status_amac": "fund_status", } ) stream = Stream(inp.frame, (vm, km)) return stream
def stream_is_reg(cls): sql = "SELECT idm.matched_id, idd.matched_id AS is_reg FROM \ (SELECT DISTINCT matched_id FROM base.id_match WHERE id_type = 1 AND is_used = 1) AS idm \ LEFT JOIN \ (SELECT DISTINCT matched_id FROM base.id_match WHERE \ source in ('010002','010003','010004','010005') and is_used =1 and id_type=1 ) AS idd \ ON idm.matched_id = idd.matched_id" inp = MysqlNativeInput(ENGINE_RD, sql) vm = transform.ValueMap( {"is_reg": lambda x: 1 if type(x) is str else 0}) km = transform.MapSelectKeys({ "matched_id": "fund_id", "is_reg": "is_reg" }) stream = Stream(inp, transform=[vm, km]) return stream
def stream_y(cls): sql = "SELECT org_id,org_name,person_id,person_name,duty,duty_detail," \ "tenure_date,is_current FROM crawl_private.y_org_person_mapping" inp = MysqlInput(ENGINE, sql) sk = transform.MapSelectKeys( { "person_id": "person_id", "person_name": "person_name", "org_id": "org_id", "org_name": "org_name", "duty": "duty", "duty_detail": "duty_detail", "is_current": "is_current", 'tenure_date': "tenure_date" } ) s = Stream(inp, transform=[sk]) return s
def stream_custodian(cls): """ 清洗 fund_custodian; """ sql = " \ SELECT idm.matched_id, fom.org_name FROM (SELECT DISTINCT matched_id FROM base_public.id_match \ WHERE id_type = 1 AND is_used = 1) AS idm \ JOIN base_public.fund_org_mapping \ AS fom ON idm.matched_id = fom.fund_id WHERE fom.type_code = 2" inp = MysqlInput(ENGINE_RD, sql) sk = transform.MapSelectKeys({ "org_name": "fund_custodian", 'matched_id': 'fund_id', }) s = Stream(inp, transform=[sk]) return s
def stream_type(cls): """ 清洗 fund_type; """ sql = " \ SELECT * FROM (SELECT DISTINCT matched_id FROM base_public.id_match WHERE id_type = 1 AND is_used = 1) AS idm \ JOIN (SELECT fund_id,type_name FROM base_public.fund_type_mapping WHERE typestandard_code = '02') \ AS ftm ON idm.matched_id = ftm.fund_id \ " inp = MysqlInput(ENGINE_RD, sql) sk = transform.MapSelectKeys({ "type_name": "fund_type", 'fund_id': 'fund_id', }) s = Stream(inp, transform=[sk]) return s
def stream_custodian(cls): sql = "select ff.fund_id,GROUP_CONCAT(o.org_name SEPARATOR ',') as org_name from \ ((SELECT DISTINCT matched_id as fund_id from base.id_match where id_type=1 and is_used=1) as ff \ JOIN \ (SELECT fund_id,org_id from base.fund_org_mapping WHERE org_type_code = 5) as fom \ ON ff.fund_id = fom.fund_id \ LEFT JOIN \ (SELECT org_id,org_name from base.org_info) as o \ on fom.org_id = o.org_id) \ GROUP BY ff.fund_id" inp = MysqlNativeInput(ENGINE_RD, sql) km = transform.MapSelectKeys({ "fund_id": "fund_id", "org_name": "fund_custodian" }) stream = Stream(inp, transform=[km]) return stream