def stream_020001(cls, fund_id):
    """Build the stream that cleans stock rows of d_fund_position (source 020001).

    Reads rows with ``type = '股票'`` and ``data_source = '020001'`` from
    ``crawl_public.d_fund_position`` joined with
    ``base_public.fund_asset_scale`` on fund id and statistic date, cleans
    the numeric columns and renames the selected columns for output.

    Args:
        fund_id: Value handed to ``SQL.values4sql`` to restrict the query to
            specific funds, or ``None`` to read every fund.

    Returns:
        The ``Stream`` wired with the MySQL input and the transforms.
    """
    sql = (
        "SELECT dp.fund_id, dp.fund_name, dp.data_source, dp.statistic_date, dp.subject_id, dp.subject_name, "
        "dp.quantity, dp.scale, dp.proportion, fa.total_asset FROM "
        "crawl_public.d_fund_position as dp "
        "JOIN base_public.fund_asset_scale AS fa "
        "ON dp.fund_id = fa.fund_id and dp.statistic_date=fa.statistic_date "
        "WHERE dp.type= '股票' AND dp.data_source= '020001'"
    )
    if fund_id is not None:
        fids = SQL.values4sql(fund_id)
        # The leading space keeps the extra predicate from being glued onto
        # the preceding '020001' literal.
        sql += " AND dp.fund_id IN {fids}".format(fids=fids)

    inp = MysqlInput(ENGINE_RD, sql)

    # Pass 1: replace known-bad markers with None; parse textual total_asset.
    vm = transform.ValueMap({
        "quantity": lambda x: sub_wrong_to_none(x),
        "scale": lambda x: sub_wrong_to_none(x),
        "proportion": lambda x: sub_wrong_to_none(x),
        "total_asset": lambda x: float(x) if isinstance(x, str) else x,
    })

    # Pass 2: normalise amount strings through the class helper.
    vm1 = transform.ValueMap({
        "quantity": lambda x: cls.clean_amount(x),
        "scale": lambda x: cls.clean_amount(x),
    })

    # Pass 3: cast to float; a textual proportion is divided by 100.
    vm2 = transform.ValueMap({
        "quantity": lambda x: float(x),
        "scale": lambda x: float(x),
        "proportion": lambda x: float(x) / 100 if isinstance(x, str) else x,
    })

    # "quantity" is cleaned above but is not part of the selected output.
    sk = transform.MapSelectKeys({
        "fund_id": "fund_id",
        "fund_name": "fund_name",
        "data_source": "data_source",
        "statistic_date": "statistic_date",
        "subject_id": "subject_id",
        "subject_name": "subject_name",
        "scale": "scale",
        "proportion": "proportion_net",
        "total_asset": "asset_scale",
    })

    s = Stream(inp, transform=[vm, vm1, vm2, sk])
    return s
def completion_020001(cls):
    """Build the stream that cleans d_fund_holder rows of source 020001.

    Only funds already present in ``base_public.fund_holder`` are read.
    ``share_held`` is cleaned and rounded to 6 decimals when it is a float;
    ``holder_type`` is translated through ``cls.hold_type_dict``.

    Returns:
        The configured ``Stream``.
    """
    query = (
        "SELECT fund_id, statistic_date, holder_type, share_held "
        "FROM crawl_public.d_fund_holder "
        "WHERE data_source = '020001' "
        "AND fund_id IN (SELECT DISTINCT fund_id FROM base_public.fund_holder)"
    )
    source = MysqlInput(ENGINE_RD, query)

    def scrub_share(value):
        return sub_wrong_to_none(value)

    def translate_holder(value):
        return cls.hold_type_dict.get(value)

    def round_share(value):
        if type(value) is float:
            return round(value, 6)
        return value

    clean_step = transform.ValueMap({
        "share_held": scrub_share,
        "holder_type": translate_holder,
    })
    round_step = transform.ValueMap({"share_held": round_share})
    select_step = transform.MapSelectKeys({
        "fund_id": "fund_id",
        "share_held": "share_held",
        "holder_type": "holder_type",
        "statistic_date": "statistic_date",
    })

    return Stream(source, transform=[clean_step, round_step, select_step])
def stream_010005():
    """Build the stream that turns 010005 issuing scale into asset-scale rows.

    Reads the latest version of every fund in
    ``crawl_private.x_fund_info_futures``, maps it to the matched fund id and
    uses the foundation date as the statistic date. Rows missing fund id,
    foundation date or scale are dropped; a scale of 0 counts as missing.

    Returns:
        The configured ``base.Stream``.
    """
    template = (
        "SELECT im.matched_id as fund_id, fi.fund_name, fi.foundation_date, issuing_scale_amac "
        "FROM crawl_private.{tb} xfi "
        "JOIN (SELECT fund_id, MAX(version) latest_ver FROM crawl_private.{tb} GROUP BY fund_id) tb_latest "
        "ON tb_latest.latest_ver = xfi.version AND tb_latest.fund_id = xfi.fund_id "
        "JOIN base.id_match im ON im.source_id = xfi.fund_id AND im.source = '010005' AND im.id_type = 1 AND im.is_used = 1 "
        "LEFT JOIN base.fund_info fi ON fi.fund_id = im.matched_id"
    )
    sql_fi = template.format(tb="x_fund_info_futures")
    source = MysqlInput(ENGINE_C, sql_fi)

    def to_float(value):
        return float(value)

    def zero_to_none(value):
        # A zero scale is treated as "no data" so Dropna removes the row.
        if value == 0:
            return None
        return value

    cast_step = transform.ValueMap({"issuing_scale_amac": to_float})
    blank_step = transform.ValueMap({"issuing_scale_amac": zero_to_none})
    drop_step = transform.Dropna(
        subset=["fund_id", "foundation_date", "issuing_scale_amac"]
    )
    select_step = transform.MapSelectKeys({
        "fund_id": FundAssetScale.fund_id.name,
        "fund_name": FundAssetScale.fund_name.name,
        "issuing_scale_amac": FundAssetScale.asset_scale.name,
        "foundation_date": FundAssetScale.statistic_date.name,
    })

    return base.Stream(
        source, transform=(cast_step, blank_step, drop_step, select_step)
    )
def conflu_place(cls):
    """Build the confluence that derives region, prov, city and area.

    Every non-null ``address`` in ``TEST_TABLE`` is passed through
    ``cls._clean_place``; the four leading items of its result are stored as
    region, prov, city and area respectively.

    Returns:
        A ``Confluence`` wrapping the single configured stream.
    """
    query = "SELECT org_id, address FROM {tb_test} WHERE address IS NOT NULL".format(
        tb_test=TEST_TABLE
    )
    source = MysqlInput(ENGINE_RD, query)

    # Parse the address once into a temporary column, then fan it out.
    parse_step = transform.ValueMap({
        "__tmp": (lambda addr: cls._clean_place(addr), "address"),
    })
    split_step = transform.ValueMap({
        OrgInfo.region.name: (lambda parts: parts[0], "__tmp"),
        OrgInfo.prov.name: (lambda parts: parts[1], "__tmp"),
        OrgInfo.city.name: (lambda parts: parts[2], "__tmp"),
        OrgInfo.area.name: (lambda parts: parts[3], "__tmp"),
    })
    select_step = transform.MapSelectKeys({
        OrgInfo.org_id.name: None,
        OrgInfo.region.name: None,
        OrgInfo.prov.name: None,
        OrgInfo.city.name: None,
        OrgInfo.area.name: None,
    })

    place_stream = Stream(source, transform=[parse_step, split_step, select_step])
    return Confluence(place_stream)
def stream_602_4(cls, c_cleaned):
    """Build the stream that assigns the default 602 type (60201).

    Funds without any typestandard 602 row in
    ``base.fund_type_mapping_import`` are read; those whose id is absent from
    ``c_cleaned.dataframe["fund_id"]`` are tagged 60201 / "非结构化", the
    others get empty codes and are removed by the final Dropna.

    Args:
        c_cleaned: Object exposing a ``dataframe`` with a ``fund_id`` column.

    Returns:
        The configured ``base.Stream``.
    """
    query = (
        "SELECT fi.fund_id, fi.fund_name "
        "FROM base.fund_info fi "
        "WHERE fi.fund_id NOT IN (SELECT DISTINCT fund_id FROM base.fund_type_mapping_import WHERE typestandard_code = 602) "
    )
    source = MysqlInput(ENGINE, query)

    # Evaluated eagerly (not lazily): the id set is fixed at build time.
    fids = set(c_cleaned.dataframe["fund_id"])

    def classify(fund_id):
        if fund_id in fids:
            return (None, None, None, None)
        return (60201, "非结构化", None, None)

    tag_step = transform.ValueMap({"__tmp": (classify, "fund_id")})
    expand_step = transform.ValueMap({
        FundTypeMappingImport.typestandard_code.name: 602,
        FundTypeMappingImport.typestandard_name.name: "按结构类型分类",
        FundTypeMappingImport.type_code.name: (lambda t: t[0], "__tmp"),
        FundTypeMappingImport.type_name.name: (lambda t: t[1], "__tmp"),
        FundTypeMappingImport.stype_code.name: (lambda t: t[2], "__tmp"),
        FundTypeMappingImport.stype_name.name: (lambda t: t[3], "__tmp"),
        FundTypeMappingImport.classified_by.name: CLASSIFIED_BY,
    })
    select_step = transform.MapSelectKeys(cls.FIXED_FIELDS)
    drop_step = transform.Dropna(
        subset=[
            FundTypeMappingImport.type_code.name,
            FundTypeMappingImport.stype_code.name,
        ],
        how="all",
    )

    return base.Stream(source, [tag_step, expand_step, select_step, drop_step])
def stream_020003(cls):
    """Build the stream that back-fills quantity for fund_position_stock.

    Joins ``base_public.fund_position_stock`` with
    ``crawl_public.d_fund_position`` on fund id, statistic date and subject id
    and reads the non-null crawled ``quantity`` (the query filters the
    crawled rows on ``data_source = '020001'``).

    Returns:
        The configured ``Stream``.
    """
    query = (
        " SELECT fps.fund_id,fps.statistic_date,fps.subject_id,dfp.quantity, fps.data_source "
        "FROM base_public.fund_position_stock AS fps "
        "JOIN crawl_public.d_fund_position AS dfp "
        "ON fps.fund_id = dfp.fund_id AND fps.statistic_date = dfp.statistic_date "
        "AND fps.subject_id = dfp.subject_id "
        "WHERE dfp.data_source = '020001' AND dfp.quantity is not NULL"
    )
    source = MysqlInput(ENGINE_RD, query)

    def scrub(value):
        return cls.sub_wrong_to_none(value)

    def to_float(value):
        return float(value)

    scrub_step = transform.ValueMap({"quantity": scrub})
    cast_step = transform.ValueMap({"quantity": to_float})
    select_step = transform.MapSelectKeys({
        "fund_id": "fund_id",
        "data_source": "data_source",
        "statistic_date": "statistic_date",
        "subject_id": "subject_id",
        "quantity": "quantity",
    })

    return Stream(source, transform=[scrub_step, cast_step, select_step])
def stream_020008(cls):
    """Build the stream that cleans fee/amount fields of source 020008.

    Reads the latest version of each fund in ``crawl_private.d_fund_info``
    (``source_id = '020008'``) joined to its matched id, cleans the purchase
    amounts and fee columns, and derives ``min_append_amount_remark`` from
    the two cleaned amount columns.

    Returns:
        The configured ``Stream``.
    """
    sql = (
        "SELECT idm.matched_id, dfi.open_date, dfi.locked_time_limit, dfi.min_purchase_amount, "
        "dfi.min_append_amount,dfi.fee_subscription,dfi.fee_redeem, dfi.fee_manage, dfi.duration, "
        "dfi.fee_pay, dfi.source_id, dfi.precautious_line, dfi.stop_loss_line "
        "FROM "
        "(SELECT matched_id, source_id FROM base.id_match where id_type=1 and is_used=1 AND source='020008' GROUP BY matched_id) as idm "
        "JOIN "
        "(SELECT MAX(version) maxversion, fund_id FROM crawl_private.d_fund_info WHERE source_id = '020008' GROUP BY fund_id) as ma "
        "ON idm.source_id = ma.fund_id "
        "JOIN crawl_private.d_fund_info as dfi "
        "on dfi.version = ma.maxversion and dfi.fund_id = ma.fund_id"
    )
    inp = MysqlInput(ENGINE_RD, sql)

    def join_remarks(purchase, append):
        # The original test `type(x) or type(y) is str` was always truthy
        # (a class object is truthy), so the None branch could never run.
        # Build a remark only when at least one amount is textual.
        if not (isinstance(purchase, str) or isinstance(append, str)):
            return None
        return ",".join(
            [part for part in (purchase, append) if part not in [None]]
        )

    # Note: a non-string min_purchase_amount becomes None, whereas the other
    # columns keep their non-string value unchanged.
    vm = transform.ValueMap({
        "min_purchase_amount": lambda x: cls.clean_01(x) if isinstance(x, str) else None,
        "min_append_amount": lambda x: cls.clean_01(x) if isinstance(x, str) else x,
        "fee_subscription": lambda x: cls.sub_wrong_to_none(x) if isinstance(x, str) else x,
        "fee_redeem": lambda x: cls.sub_wrong_to_none(x) if isinstance(x, str) else x,
        "fee_manage": lambda x: cls.sub_wrong_to_none(x) if isinstance(x, str) else x,
        "fee_pay": lambda x: cls.sub_wrong_to_none(x) if isinstance(x, str) else x,
    })
    vm2 = transform.ValueMap({
        "min_append_amount_remark": (
            join_remarks, "min_purchase_amount", "min_append_amount"
        ),
    })
    sk = transform.MapSelectKeys({
        "matched_id": "fund_id",
        "min_purchase_amount": "min_purchase_amount",
        "min_append_amount": "min_append_amount",
        "min_append_amount_remark": "min_append_amount_remark",
        "fee_subscription": "fee_subscription",
        "fee_redeem": "fee_redeem",
        "duration": "duration",
        "fee_pay": "fee_pay",
        "fee_manage": "fee_manage",
        "source_id": "source_id",
        "precautious_line": "precautious_line",
        "stop_loss_line": "stop_loss_line",
    })

    s = Stream(inp, transform=[vm, vm2, sk])
    return s
def stream_020001(cls):
    """Build the stream that cleans public fund_info rows of source 020001.

    Takes the latest version of each fund in ``crawl_public.d_fund_info``
    restricted to ids matched in ``base_public.id_match``, translates the
    status columns through the class lookup tables and parses ``init_raise``
    (the "亿" unit suffix is stripped before the float cast).

    Returns:
        The configured ``Stream``.
    """
    query = (
        "SELECT idm.fund_id , dfi2.fund_name ,dfi2.fund_full_name, dfi2.data_source, "
        "dfi2.foundation_date, dfi2.fund_status, dfi2.purchase_status, dfi2.redemption_status, "
        "dfi2.aip_status, dfi2.recommendation_start, dfi2.recommendation_end, dfi2.init_raise "
        "FROM (SELECT matched_id as fund_id from base_public.id_match "
        "WHERE id_type =1 AND is_used =1 AND data_source = '020001') AS idm "
        "JOIN "
        "(SELECT mdfi.fund_id , dfi.fund_name ,dfi.fund_full_name, dfi.data_source, "
        "dfi.foundation_date, dfi.fund_status, dfi.purchase_status, dfi.redemption_status, "
        "dfi.aip_status, dfi.recommendation_start, dfi.recommendation_end, dfi.init_raise "
        "FROM "
        "(SELECT MAX(version) as mm,fund_id FROM crawl_public.d_fund_info GROUP BY fund_id) AS mdfi "
        "JOIN "
        "crawl_public.d_fund_info AS dfi "
        "ON mdfi.fund_id = dfi.fund_id "
        "and mdfi.mm = dfi.version "
        "WHERE dfi.data_source = '020001') as dfi2 "
        "ON idm.fund_id = dfi2.fund_id"
    )
    source = MysqlInput(ENGINE_RD, query)

    def scrub(value):
        return sub_wrong_to_none(value)

    def parse_raise(value):
        return float(re.sub("亿", "", value))

    first_pass = transform.ValueMap({
        "fund_status": scrub,
        "purchase_status": lambda v: cls.PURCHASE_STATUS_020001.get(v),
        "redemption_status": lambda v: cls.REDEMPTION_STATUS_020001.get(v),
        "init_raise": scrub,
    })
    second_pass = transform.ValueMap({
        "fund_status": lambda v: cls.FUND_STATUS_020001.get(v),
        "init_raise": parse_raise,
    })

    # Every column keeps its name except data_source, exported as source_id.
    select_step = transform.MapSelectKeys({
        "data_source": "source_id",
        "fund_id": "fund_id",
        "fund_name": "fund_name",
        "fund_full_name": "fund_full_name",
        "foundation_date": "foundation_date",
        "fund_status": "fund_status",
        "purchase_status": "purchase_status",
        "redemption_status": "redemption_status",
        "aip_status": "aip_status",
        "recommendation_start": "recommendation_start",
        "recommendation_end": "recommendation_end",
        "init_raise": "init_raise",
    })

    return Stream(source, transform=[first_pass, second_pass, select_step])
def stream_020001_sws(cls):
    """Build the stream that cleans d_org_portfolio_industry rows.

    Joins ``crawl_public.d_org_portfolio_industry`` to the matched org id,
    rescales ``asset_scale``, converts ``proportion`` from percent text to a
    fraction, maps ``type`` through ``cls.sws`` and derives
    ``scale = asset_scale * proportion``.

    Returns:
        The configured ``Stream``.
    """
    query = (
        " SELECT idm.matched_id, doi.data_source, doi.statistic_date, doi.type, "
        "doi.proportion, doi.asset_scale FROM (SELECT matched_id,source_id from base_public.id_match "
        "WHERE data_source = '020002' AND is_used = 1 and id_type = 2) as idm "
        "JOIN crawl_public.d_org_portfolio_industry AS doi "
        "ON doi.org_id = idm.source_id"
    )
    source = MysqlInput(ENGINE_RD, query)

    def scrub_text(value):
        if type(value) is str:
            return cls.sub_wrong_to_none(value)
        return value

    def round6(value):
        if type(value) is float:
            return round(value, 6)
        return value

    # 10e7 equals 1e8: textual asset_scale is divided by 100,000,000.
    clean_step = transform.ValueMap({
        "asset_scale": lambda v: float(v) / 10e7 if type(v) is str else v,
        "type": scrub_text,
        "proportion": scrub_text,
    })
    convert_step = transform.ValueMap({
        "proportion": lambda v: round(float(v) / 100, 6) if type(v) is str else v,
        "style": 1,
        "type": lambda v: cls.sws.get(v) if type(v) is str else v,
    })
    derive_step = transform.ValueMap({
        "scale": (lambda total, share: total * share, "asset_scale", "proportion"),
    })
    round_step = transform.ValueMap({
        "asset_scale": round6,
        "scale": round6,
    })
    select_step = transform.MapSelectKeys({
        "matched_id": "org_id",
        "proportion": "proportion",
        "statistic_date": "statistic_date",
        "asset_scale": "asset_scale",
        "scale": "scale",
        "style": "style",
        "type": "type",
    })

    return Stream(
        source,
        transform=[clean_step, convert_step, derive_step, round_step, select_step],
    )
def stream_manager_010003(cls, d_xoi=None):
    """Build the stream that pairs 010003 funds with their manager org.

    Args:
        d_xoi: Mapping from issuing-org name to org id; when ``None`` it is
            obtained from ``cls._d_oi_manager()``.

    Returns:
        The configured ``Stream``.
    """
    sql_maxver = (
        "SELECT im.matched_id, xfi.fund_issue_org_amac, manage_type_amac, type_name_amac, fi.fund_name, fi.foundation_date, fi.end_date, fi.fund_status "
        "FROM x_fund_info_private xfi "
        "JOIN (SELECT fund_id, MAX(version) latest_ver FROM x_fund_info_private GROUP BY fund_id) tb_latest "
        "ON xfi.version = tb_latest.latest_ver AND xfi.fund_id = tb_latest.fund_id "
        "JOIN base.id_match im ON im.source_id = xfi.fund_id AND im.id_type = 1 AND im.source = '010003' AND im.is_used = 1 "
        "LEFT JOIN base.fund_info fi "
        "ON im.matched_id = fi.fund_id AND im.id_type = 1 AND im.source = '010003' "
        "WHERE fund_name_amac NOT LIKE '%%信托计划'"
    )
    fund_input = MysqlInput(ENGINE_C, sql_maxver)
    org_input = MysqlInput(ENGINE_B, "SELECT org_id, org_name FROM base.org_info")

    # Equivalent names: 私募证券投资基金 = 证券投资基金;
    # 其他私募投资基金 = 其他投资基金.
    securities_types = {"私募证券投资基金", "证券投资基金"}
    all_types = {
        "私募证券投资基金", "证券投资基金", "股权投资基金",
        "创业投资基金", "其他私募投资基金", "其他投资基金",
    }
    types_by_management = {
        "顾问管理": securities_types,
        "受托管理": all_types,
        "自我管理": all_types,
    }

    def clean_org_type(mng_tp, tp):
        """Return "基金管理人" for accepted (management, fund type) pairs, else None."""
        accepted = types_by_management.get(mng_tp)
        if accepted is not None and tp in accepted:
            return "基金管理人"
        return None

    if d_xoi is None:
        d_xoi = cls._d_oi_manager()

    map_step = transform.ValueMap(
        OrderedDict([
            ("fund_issue_org_amac", lambda name: name.strip()),
            ("org_id", (lambda name: d_xoi.get(name), "fund_issue_org_amac")),
            (
                FundOrgMapping.org_type.name,
                (
                    lambda mng_tp, tp: clean_org_type(mng_tp, tp),
                    "manage_type_amac",
                    "type_name_amac",
                ),
            ),
            (
                FundOrgMapping.is_current.name,
                (lambda status: cls._clean_iscurrent(status), "fund_status"),
            ),
        ])
    )
    join_step = transform.Join(org_input, how="left", on="org_id")
    drop_step = transform.Dropna(
        subset=["matched_id", "org_id", FundOrgMapping.org_type.name]
    )
    code_step = transform.ValueMap({
        FundOrgMapping.org_type_code.name: (
            lambda org_type: 2 if org_type == "基金管理人" else None,
            FundOrgMapping.org_type.name,
        ),
    })
    select_step = transform.MapSelectKeys(cls.FIELDS)

    return Stream(
        fund_input,
        transform=(map_step, join_step, drop_step, code_step, select_step),
    )
def stream_020001(cls):
    """Build the stream that cleans public fund_info rows of source 020001.

    Reads the latest version of every fund in ``crawl_public.d_fund_info``
    with ``data_source = '020001'``, translates the status columns through
    the class lookup tables and parses ``init_raise`` (the "亿" unit suffix
    is stripped before the float cast).

    Returns:
        The configured ``Stream``.
    """
    # The previous query sorted a derived table by version and then used a
    # bare GROUP BY to keep "the first" row per fund. MySQL does not
    # guarantee which row such a GROUP BY returns, so the latest version is
    # now selected explicitly with a MAX(version) join.
    # NOTE(review): assumes (fund_id, version) is unique within the data
    # source; otherwise a fund can yield more than one row - confirm.
    sql = (
        "SELECT dfi.fund_id, dfi.fund_name, dfi.fund_full_name, dfi.data_source, "
        "dfi.foundation_date, dfi.fund_status, dfi.purchase_status, dfi.redemption_status, "
        "dfi.aip_status, dfi.recommendation_start, dfi.recommendation_end, dfi.init_raise "
        "FROM crawl_public.d_fund_info dfi "
        "JOIN (SELECT fund_id, MAX(version) AS latest_ver FROM crawl_public.d_fund_info "
        "WHERE data_source = '020001' GROUP BY fund_id) tb_latest "
        "ON dfi.fund_id = tb_latest.fund_id AND dfi.version = tb_latest.latest_ver "
        "WHERE dfi.data_source = '020001'"
    )
    inp = MysqlInput(ENGINE_RD, sql)

    # Pass 1: blank out known-bad markers and translate the simple statuses.
    vm = transform.ValueMap({
        "fund_status": lambda x: sub_wrong_to_none(x),
        "purchase_status": lambda x: cls.PURCHASE_STATUS_020001.get(x),
        "redemption_status": lambda x: cls.REDEMPTION_STATUS_020001.get(x),
        "init_raise": lambda x: sub_wrong_to_none(x),
    })
    # Pass 2: translate fund_status and parse the raise amount.
    vm2 = transform.ValueMap({
        "fund_status": lambda x: cls.FUND_STATUS_020001.get(x),
        "init_raise": lambda x: float(re.sub("亿", "", x)),
    })
    # Every column keeps its name except data_source, exported as source_id.
    sk = transform.MapSelectKeys({
        "data_source": "source_id",
        "fund_id": "fund_id",
        "fund_name": "fund_name",
        "fund_full_name": "fund_full_name",
        "foundation_date": "foundation_date",
        "fund_status": "fund_status",
        "purchase_status": "purchase_status",
        "redemption_status": "redemption_status",
        "aip_status": "aip_status",
        "recommendation_start": "recommendation_start",
        "recommendation_end": "recommendation_end",
        "init_raise": "init_raise",
    })

    s = Stream(inp, transform=[vm, vm2, sk])
    return s
def stream_020001(cls):
    """Build the stream that cleans fund_custodian rows of source 020001.

    The custodian name is resolved to an org id through
    ``cls.org_full_name`` extended with a few manual entries; ``type_code``
    and ``type_name`` are then looked up in ``cls.type_code`` and
    ``cls.type_name``. Rows with any missing value are dropped.

    Returns:
        The configured ``Stream``.
    """
    # Add further custodian-name -> org-id pairs here when needed.
    manual_pairs = {'中国银行(香港)有限公司': '02000001'}
    name_to_org = dict(cls.org_full_name, **manual_pairs)

    query = (
        "SELECT idh.matched_id, ff.fund_custodian, fi.fund_name FROM ( "
        "SELECT matched_id,source_id FROM base_public.id_match "
        "where id_type = 1 AND is_used = 1 AND data_source = '020001') as idh "
        "JOIN "
        "(SELECT * FROM ( "
        "SELECT MAX(version) as mm, fund_id as id FROM crawl_public.d_fund_info GROUP BY fund_id) as idm "
        "JOIN crawl_public.d_fund_info as df "
        "ON idm.id = df.fund_id AND idm.mm = df.version "
        "WHERE df.data_source = '020001') AS ff "
        "ON ff.id = idh.matched_id "
        "JOIN base_public.fund_info as fi "
        "ON fi.fund_id = idh.matched_id "
        "where idh.matched_id <> '777777'"
    )
    source = MysqlInput(ENGINE_RD, query)

    # Three chained lookups: custodian -> org_id -> type_code -> type_name.
    org_step = transform.ValueMap({
        "org_id": (lambda name: name_to_org.get(name), "fund_custodian"),
    })
    code_step = transform.ValueMap({
        "type_code": (lambda org: cls.type_code.get(org), "org_id"),
    })
    name_step = transform.ValueMap({
        "type_name": (lambda code: cls.type_name.get(code), "type_code"),
    })
    drop_step = transform.Dropna(axis=0, how="any")
    select_step = transform.MapSelectKeys({
        "matched_id": "fund_id",
        "org_id": "org_id",
        "fund_custodian": "org_name",
        "fund_name": "fund_name",
        "type_code": "type_code",
        "type_name": "type_name",
    })

    return Stream(
        source,
        transform=[org_step, code_step, name_step, drop_step, select_step],
    )
def stream_020002_ts401(cls):
    """Build the stream that maps 020002 type names through TS_MAPPER_401.

    Type and sub-type names made only of "-" are blanked, the pair is looked
    up in ``cls.TS_MAPPER_401`` and the resulting 4-item entry is spread over
    the type / stype code and name columns. Rows with neither a type code nor
    a stype code are dropped.

    Returns:
        The configured ``base.Stream``.
    """
    source_id = "020002"
    table = "crawl_private.d_fund_info"
    query = cls.SQL_X01.format(tb=table, sid=source_id)
    source = MysqlInput(ENGINE, query)

    def blank_if_dashes(text):
        if text.replace("-", "") == "":
            return ""
        return text

    def lookup(tn, stn):
        return cls.TS_MAPPER_401.get((tn, stn), [None] * 4)

    blank_step = transform.ValueMap({
        "type_name": blank_if_dashes,
        "stype_name": blank_if_dashes,
    })

    # Mapper entry layout as consumed below:
    # [type_code, stype_code, type_name, stype_name].
    map_step = transform.ValueMap(
        OrderedDict([
            (FundTypeSource.source_id.name, source_id),
            (FundTypeSource.typestandard_code.name, "101"),
            (FundTypeSource.typestandard_name.name, "按投资策略分类"),
            ("__cached", (lookup, "type_name", "stype_name")),
            (FundTypeSource.type_code.name, (lambda hit: hit[0], "__cached")),
            (FundTypeSource.type_name.name, (lambda hit: hit[2], "__cached")),
            (FundTypeSource.stype_code.name, (lambda hit: hit[1], "__cached")),
            (FundTypeSource.stype_name.name, (lambda hit: hit[3], "__cached")),
        ])
    )
    select_step = transform.MapSelectKeys({
        "matched_id": FundTypeSource.fund_id.name,
        "fund_name": FundTypeSource.fund_name.name,
        FundTypeSource.source_id.name: None,
        FundTypeSource.typestandard_code.name: None,
        FundTypeSource.typestandard_name.name: None,
        FundTypeSource.type_code.name: None,
        FundTypeSource.type_name.name: None,
        FundTypeSource.stype_code.name: None,
        FundTypeSource.stype_name.name: None,
    })
    drop_step = transform.Dropna(
        subset=[FundTypeSource.type_code.name, FundTypeSource.stype_code.name],
        how="all",
    )

    return base.Stream(source, [blank_step, map_step, select_step, drop_step])
def stream_consultant_010004(cls, d_xoi=None):
    """Build the stream pairing 010004 (securities company) funds with their
    investment-consultant org.

    Args:
        d_xoi: Mapping from issuing-org name to org id; when ``None`` it is
            obtained from ``cls._d_oi_consultant()``.

    Returns:
        The configured ``Stream``.
    """
    sql_maxver = (
        "SELECT im.matched_id, xfi.fund_issue_org_amac, manage_type_amac, fi.fund_name, fi.foundation_date, fi.end_date, fi.fund_status "
        "FROM x_fund_info_securities xfi "
        "JOIN (SELECT fund_id, MAX(version) latest_ver FROM x_fund_info_securities GROUP BY fund_id) tb_latest "
        "ON xfi.version = tb_latest.latest_ver AND xfi.fund_id = tb_latest.fund_id "
        "JOIN base.id_match im ON im.source_id = xfi.fund_id AND im.id_type = 1 AND im.source = '010004' AND im.is_used = 1 "
        # Trailing space added: the literal used to run straight into WHERE.
        "LEFT JOIN base.fund_info fi ON im.matched_id = fi.fund_id AND im.id_type = 1 AND im.source = '010004' "
        "WHERE fund_name_amac NOT LIKE '%%信托计划'"
    )
    inp = MysqlInput(ENGINE_C, sql_maxver)
    inp2 = MysqlInput(ENGINE_B, "SELECT org_id, org_name FROM base.org_info")

    if d_xoi is None:
        d_xoi = cls._d_oi_consultant()

    vm = transform.ValueMap(
        OrderedDict([
            ("fund_issue_org_amac", lambda x: x.strip()),
            ("org_id", (lambda x: d_xoi.get(x), "fund_issue_org_amac")),
            # Only "主动管理" maps to an org type; anything else yields None.
            (
                FundOrgMapping.org_type.name,
                (lambda x: {"主动管理": "投资顾问"}.get(x), "manage_type_amac"),
            ),
            (
                FundOrgMapping.is_current.name,
                (lambda x: cls._clean_iscurrent(x), "fund_status"),
            ),
        ])
    )
    jn = transform.Join(inp2, how="left", on="org_id")
    dn = transform.Dropna(
        subset=["matched_id", "org_id", FundOrgMapping.org_type.name]
    )
    vm2 = transform.ValueMap({
        FundOrgMapping.org_type_code.name: (
            lambda x: 1 if x == "投资顾问" else None,
            FundOrgMapping.org_type.name,
        ),
    })
    dd = transform.DropDuplicate(
        subset=["matched_id", "org_id", FundOrgMapping.org_type.name]
    )
    sk = transform.MapSelectKeys(cls.FIELDS)

    # Nulls are dropped before the type code is assigned so that no empty
    # value turns the integer column into a float column.
    s = Stream(inp, transform=[vm, jn, dn, vm2, dd, sk])
    return s
def stream_020001(cls, fund_ids):
    """Build the stream that cleans bond positions of source 020001.

    Reads bond rows (``type = '债券'``) of ``crawl_public.d_fund_position``
    for the given matched fund ids, skipping funds that also have rows from
    source 020002, and converts ``proportion`` from percent text to a
    fraction.

    Args:
        fund_ids: Matched fund ids, rendered into the IN clause by
            ``sf.SQL.values4sql``.

    Returns:
        The configured ``base.Stream``.
    """
    sql = (
        "SELECT im.matched_id, fi.fund_name, im.data_source, dfp.statistic_date, dfp.subject_id, dfp.subject_name, "
        "dfp.scale, dfp.proportion, fas.total_asset "
        "FROM base_public.id_match im "
        "JOIN crawl_public.d_fund_position dfp "
        "ON im.source_id = dfp.fund_id AND im.data_source = dfp.data_source "
        "JOIN base_public.fund_info fi ON im.matched_id = fi.fund_id "
        "left JOIN base_public.fund_asset_scale fas ON dfp.fund_id = fas.fund_id AND dfp.statistic_date = fas.statistic_date "
        "WHERE type = '债券' AND im.matched_id IN {fids} AND im.id_type = 1 AND im.is_used = 1 "
        "AND dfp.fund_id NOT IN ("
        # Trailing space added: ")" used to run straight into the next AND.
        "SELECT DISTINCT fund_id FROM crawl_public.d_fund_position WHERE data_source = '020002') "
        "AND im.data_source = '020001'"
    ).format(fids=sf.SQL.values4sql(fund_ids))
    inp = MysqlInput(cls.engine, sql)

    vm = transform.ValueMap({
        # "12.5%" -> 0.125
        "proportion": lambda x: float(x.replace("%", "")) / 100,
        "scale": lambda x: cls._clean_amount(x),
    })
    sk = transform.MapSelectKeys({
        "matched_id": "fund_id",
        "fund_name": None,
        "data_source": None,
        "statistic_date": None,
        "subject_id": None,
        "subject_name": None,
        "scale": None,
        "proportion": "proportion_net",
        "total_asset": "asset_scale",
    })
    return base.Stream(inp, transform=[vm, sk])
def conflu_is_member(cls):
    """Build the confluence that derives is_member and member_type.

    An org that is not currently registered is never a member; its member
    type becomes "注销备案资格" when the org id starts with "P" and "未备案"
    otherwise. A registered non-member gets "尚未取得会员资格"; in every other
    case the stored member type is kept.

    Returns:
        A ``Confluence`` wrapping the single configured stream.
    """
    query = "SELECT org_id, is_reg_now, is_member, member_type FROM {tb_test} ".format(
        tb_test=TEST_TABLE
    )
    source = MysqlInput(ENGINE_RD, query)

    def clean_membertype(is_reg, is_member, member_type, org_id):
        if is_reg == "否":
            return "注销备案资格" if org_id[0] == "P" else "未备案"
        if is_reg == "是" and is_member == "否":
            return "尚未取得会员资格"
        return member_type

    def clean_is_member(is_reg, is_member):
        if is_reg == "否":
            return "否"
        return is_member

    map_step = transform.ValueMap({
        OrgInfo.is_member.name: (clean_is_member, "is_reg_now", "is_member"),
        OrgInfo.member_type.name: (
            clean_membertype, "is_reg_now", "is_member", "member_type", "org_id"
        ),
    })
    select_step = transform.MapSelectKeys({
        "org_id": OrgInfo.org_id.name,
        OrgInfo.is_member.name: None,
        OrgInfo.member_type.name: None,
    })

    member_stream = Stream(source, [map_step, select_step])
    return Confluence(member_stream)
def conflu_is_reg_now(cls):
    """Build the confluence that derives is_reg_now.

    Every private-fund management company in ``TEST_TABLE`` is right-joined
    against crawled ``x_org_info`` rows whose version is at most 8 days old;
    an org without such a row is flagged "否", otherwise "是".

    Returns:
        A ``Confluence`` wrapping the single configured stream.
    """
    # Version threshold: now minus 8 days, formatted as YYYYmmddHH.
    oldest_version = (dt.datetime.now() - dt.timedelta(8)).strftime("%Y%m%d%H")
    template = (
        "SELECT xoi.org_id as org_id_, oi.org_id "
        "FROM crawl_private.x_org_info xoi "
        "JOIN (SELECT DISTINCT org_id, version FROM crawl_private.x_org_info WHERE version >= {ver} ) tmp "
        "ON xoi.org_id = tmp.org_id AND xoi.version = tmp.version "
        "JOIN (SELECT matched_id, source_id FROM base.id_match WHERE id_type = 2 AND source = '010001' AND is_used = 1) im "
        "ON xoi.org_id = im.source_id "
        "RIGHT JOIN (SELECT org_id FROM {tb_test} WHERE org_category = '私募基金管理公司') oi "
        "ON im.matched_id = oi.org_id "
    )
    query = template.format(tb_test=TEST_TABLE, ver=oldest_version)
    source = MysqlInput(ENGINE_RD, query)

    def registered_flag(crawled_org_id):
        if crawled_org_id is None:
            return "否"
        return "是"

    flag_step = transform.ValueMap({
        OrgInfo.is_reg_now.name: (registered_flag, "org_id_"),
    })
    select_step = transform.MapSelectKeys({
        "org_id": OrgInfo.org_id.name,
        OrgInfo.is_reg_now.name: None,
    })

    reg_stream = Stream(source, [flag_step, select_step])
    return Confluence(reg_stream)
def stream18_60409(cls):
    """Build the stream that tags funds as 60409 ("单账户") under standard 604.

    Selected funds have no typestandard 604 row yet and own both a source
    type in (30410, 30411) and the source type 40407.

    Returns:
        The configured ``base.Stream``.
    """
    query = (
        "SELECT fi.fund_id, fi.fund_name "
        "FROM fund_type_source fts "
        "JOIN fund_info fi ON fts.fund_id = fi.fund_id WHERE fi.fund_id NOT IN ("
        "SELECT DISTINCT fund_id FROM base.fund_type_mapping_import WHERE typestandard_code = 604) "
        "AND fi.fund_id IN ("
        "SELECT DISTINCT fund_id FROM fund_type_source WHERE "
        "type_code IN (30410, 30411)) "
        "AND fi.fund_id IN ("
        "SELECT DISTINCT fund_id FROM fund_type_source WHERE "
        "type_code = 40407)"
    )
    source = MysqlInput(ENGINE, query)

    # Every selected row receives the same constant classification.
    constants = {
        FundTypeMappingImport.typestandard_code.name: 604,
        FundTypeMappingImport.typestandard_name.name: "按发行主体分类",
        FundTypeMappingImport.type_code.name: 60409,
        FundTypeMappingImport.type_name.name: "单账户",
        FundTypeMappingImport.classified_by.name: CLASSIFIED_BY,
    }
    tag_step = transform.ValueMap(constants)
    select_step = transform.MapSelectKeys(cls.FIXED_FIELDS)

    return base.Stream(source, [tag_step, select_step])
def stream_y_person_info(cls):
    """Build the stream that cleans y_person_info.

    Adds ``person_name_py``: the upper-cased first characters of the items
    returned by ``py`` (called with ``py_style.FIRST_LETTER``) for the
    person name.

    Returns:
        The configured ``Stream``.
    """
    query = (
        "SELECT person_id, person_name, gender, background, education,"
        "graduate_school, investment_years FROM crawl_private.y_person_info"
    )
    source = MysqlInput(ENGINE_RD, query)

    def initials(name):
        letters = [item[0] for item in py(name, style=py_style.FIRST_LETTER)]
        return "".join(letters).upper()

    py_step = transform.ValueMap({
        "person_name_py": (initials, "person_name"),
    })
    # "education" is queried but is not part of the selected output.
    select_step = transform.MapSelectKeys({
        "person_id": "person_id",
        "person_name": "person_name",
        "person_name_py": "person_name_py",
        "gender": "gender",
        "background": "background",
        "graduate_school": "graduate_school",
        "investment_years": "investment_years",
    })

    return Stream(source, transform=[py_step, select_step])
def stream_020001_op():
    """Build the stream of person names from d_org_person (source 020001).

    Reads the latest version of each person, maps it to the matched person
    id and adds the upper-cased first-letter abbreviation of the name.

    Returns:
        The configured ``base.Stream``.
    """
    query = (
        "SELECT im.matched_id, person_name FROM crawl_private.d_org_person tb_main "
        "JOIN (SELECT person_id, MAX(version) latest_ver FROM crawl_private.d_org_person GROUP BY person_id) tb_latest "
        "ON tb_main.version = tb_latest.latest_ver AND tb_main.person_id = tb_latest.person_id "
        "JOIN base.id_match im ON im.source_id = tb_main.person_id "
        "AND im.id_type = 3 AND im.source = '020001' AND im.is_used = 1 "
    )
    source = MysqlInput(ENGINE, query)

    def initials(name):
        letters = [item[0] for item in py(name, style=py_style.FIRST_LETTER)]
        return "".join(letters).upper()

    py_step = transform.ValueMap(
        OrderedDict([
            (PersonInfo.person_name_py.name, (initials, "person_name")),
        ])
    )
    select_step = transform.MapSelectKeys({
        "matched_id": PersonInfo.person_id.name,
        "person_name": PersonInfo.person_name.name,
        PersonInfo.person_name_py.name: None,
    })

    return base.Stream(source, transform=(py_step, select_step))
def stream_consultant_000001(cls):
    """Build the stream of consultant pairings from y_fund_org_mapping.

    Reads rows with ``org_type_code = 1`` and ``is_used = 1``, labels them
    with org type "投资顾问" and drops rows lacking fund id, org id or org
    type code.

    Returns:
        The configured ``Stream``.
    """
    query = (
        "SELECT fund_id, fund_name, oi.org_id, oi.org_name, org_type_code, start_date, end_date, is_current "
        "FROM crawl_private.y_fund_org_mapping yfom "
        "JOIN base.org_info oi ON yfom.org_id = oi.org_id "
        "WHERE org_type_code = 1 AND is_used = 1"
    )
    source = MysqlInput(ENGINE_C, query)

    label_step = transform.ValueMap({"org_type": "投资顾问"})
    select_step = transform.MapSelectKeys({
        "fund_id": None,
        "fund_name": None,
        "org_id": None,
        "org_name": None,
        "org_type": None,
        "org_type_code": None,
        "start_date": None,
        "end_date": None,
        "is_current": None,
    })
    required = [
        FundOrgMapping.fund_id.name,
        FundOrgMapping.org_id.name,
        FundOrgMapping.org_type_code.name,
    ]
    drop_step = transform.Dropna(subset=required)

    return Stream(source, transform=[label_step, select_step, drop_step])
def stream_x_org_info():
    """Build the stream of org asset-scale rows from x_org_info.

    Reads the latest used version of each org, stamps today's date as the
    data time and drops rows in which asset scale, fund count and statistic
    date are all missing.

    Returns:
        The configured ``base.Stream``.
    """
    query = (
        "SELECT xoi.org_id, xoi.final_report_time, fund_num, fund_scale FROM crawl_private.x_org_info xoi "
        "JOIN (SELECT org_id, MAX(version) latest_ver FROM crawl_private.x_org_info WHERE is_used = 1 GROUP BY org_id) tb "
        "ON xoi.org_id = tb.org_id AND xoi.version = tb.latest_ver"
    )
    source = MysqlInput(ENGINE, query)

    # The date is captured once, when the stream is built.
    stamp_step = transform.ValueMap({
        OrgAssetScale.data_time.name: dt.date.today(),
    })
    select_step = transform.MapSelectKeys({
        "org_id": OrgAssetScale.org_id.name,
        "final_report_time": OrgAssetScale.statistic_date.name,
        "fund_num": OrgAssetScale.funds_num.name,
        "fund_scale": OrgAssetScale.asset_scale.name,
        OrgAssetScale.data_time.name: None,
    })
    drop_step = transform.Dropna(
        subset=[
            OrgAssetScale.asset_scale.name,
            OrgAssetScale.funds_num.name,
            OrgAssetScale.statistic_date.name,
        ],
        how="all",
    )

    return base.Stream(source, transform=(stamp_step, select_step, drop_step))
def stream_010001(cls):
    """Build the stream that cleans x_org_info rows of source 010001.

    Reads ``legal_person_resume`` and ``special_tips`` from the latest
    version of every org whose id is a used ``source_id`` in
    ``base.id_match`` (``source = '010001'``, ``id_type = 2``) and blanks out
    known-bad markers in both columns.

    Returns:
        The configured ``Stream``.
    """
    # The previous query sorted a derived table by version and relied on a
    # bare GROUP BY to keep "the first" row per org. MySQL does not
    # guarantee which row such a GROUP BY returns, so the latest version is
    # now selected explicitly with a MAX(version) join.
    # NOTE(review): assumes (org_id, version) is unique in x_org_info;
    # otherwise an org can yield more than one row - confirm.
    sql = (
        "SELECT xoi.org_id, xoi.legal_person_resume, xoi.special_tips "
        "FROM crawl_private.`x_org_info` xoi "
        "JOIN (SELECT org_id, MAX(version) latest_ver FROM crawl_private.`x_org_info` GROUP BY org_id) tb_latest "
        "ON xoi.org_id = tb_latest.org_id AND xoi.version = tb_latest.latest_ver "
        "WHERE xoi.org_id IN (SELECT source_id FROM base.id_match "
        "WHERE source='010001' and id_type=2 and is_used=1)"
    )
    inp = MysqlInput(ENGINE_RD, sql)

    vm = transform.ValueMap({
        "legal_person_resume": lambda x: sub_wrong_to_none(x),
        "special_tips": lambda x: sub_wrong_to_none(x),
    })
    sk = transform.MapSelectKeys({
        "org_id": "org_id",
        "special_tips": "special_tips",
        "legal_person_resume": "legal_person_resume",
    })

    s = Stream(inp, transform=[vm, sk])
    return s
def stream_020005(cls):
    """Build the stream that cleans open_date and locked_time_limit.

    Reads the latest version of each fund in ``crawl_private.d_fund_info``
    (the query filters on source '020001') joined to its matched id and
    blanks out known-bad markers in the two textual columns.

    Returns:
        The configured ``Stream``.
    """
    query = (
        " SELECT idm.matched_id, dfi.open_date, dfi.locked_time_limit, dfi.min_purchase_amount, "
        "dfi.min_append_amount,dfi.fee_subscription,dfi.fee_redeem, dfi.fee_manage,dfi.duration, "
        "dfi.fee_pay, dfi.source_id "
        "FROM "
        "(SELECT matched_id, source_id FROM base.id_match where id_type=1 and is_used=1 AND source='020001' GROUP BY matched_id) as idm "
        "JOIN "
        "(SELECT MAX(version) maxversion, fund_id FROM crawl_private.d_fund_info WHERE source_id = '020001' GROUP BY fund_id) as ma "
        "ON idm.source_id = ma.fund_id "
        "JOIN crawl_private.d_fund_info as dfi "
        "on dfi.version = ma.maxversion and dfi.fund_id = ma.fund_id"
    )
    source = MysqlInput(ENGINE_RD, query)

    def scrub_text(value):
        # Only strings are cleaned; other values pass through untouched.
        if type(value) is str:
            return cls.sub_wrong_to_none(value)
        return value

    scrub_step = transform.ValueMap({
        "open_date": scrub_text,
        "locked_time_limit": scrub_text,
    })
    select_step = transform.MapSelectKeys({
        "matched_id": "fund_id",
        "locked_time_limit": "locked_time_limit",
        "open_date": "open_date",
        "source_id": "source_id",
    })

    return Stream(source, transform=[scrub_step, select_step])
def stream_x_org_info():
    """Build the stream of org time-series rows from x_org_info.

    Reads the latest used version of each org, parses ``employee_scale``
    (thousands separators removed) into an int and stamps today's date as
    the data time.

    Returns:
        The configured ``base.Stream``.
    """
    query = (
        "SELECT xoi.org_id, xoi.final_report_time, real_capital, reg_capital, employee_scale FROM crawl_private.x_org_info xoi "
        "JOIN (SELECT org_id, MAX(version) latest_ver FROM crawl_private.x_org_info WHERE is_used = 1 GROUP BY org_id) tb "
        "ON xoi.org_id = tb.org_id AND xoi.version = tb.latest_ver"
    )
    source = MysqlInput(ENGINE, query)

    def parse_headcount(text):
        return int(text.replace(",", ""))

    # The date is captured once, when the stream is built.
    map_step = transform.ValueMap({
        "employee_scale": parse_headcount,
        OrgTimeseries.data_time.name: dt.date.today(),
    })
    select_step = transform.MapSelectKeys({
        "org_id": OrgTimeseries.org_id.name,
        "final_report_time": OrgTimeseries.statistic_date.name,
        "real_capital": OrgTimeseries.real_capital.name,
        "reg_capital": OrgTimeseries.reg_capital.name,
        "employee_scale": OrgTimeseries.employee_scale.name,
        OrgTimeseries.data_time.name: None,
    })

    return base.Stream(source, transform=(map_step, select_step))
def stream(cls):
    """Build the stream that flags core members.

    A person from ``base.person_info`` is selected when
    ``base.org_person_mapping`` holds at least one row for them with one of
    the listed duties; each selected person gets ``is_core_member = "是"``.

    Returns:
        The configured ``Stream``.
    """
    core_duties = (
        "基金经理",
        "投资总监",
        "投资经理",
        "投研总监",
        "风控总监",
        "投资部经理",
        "投资决策委员会主席",
        "总经理",
        "投资部主管",
    )
    duty_list = ", ".join(["'" + duty + "'" for duty in core_duties])
    query = (
        "SELECT idm.person_id FROM( "
        "SELECT person_id FROM base.person_info) as idm "
        "JOIN "
        "base.org_person_mapping as op "
        "on op.person_id = idm.person_id "
        "WHERE op.duty_detail IN ( " + duty_list + " "
        ")GROUP BY idm.person_id"
    )
    source = MysqlInput(ENGINE_RD, query)

    flag_step = transform.ValueMap({"is_core_member": "是"})
    select_step = transform.MapSelectKeys({
        "person_id": "person_id",
        "is_core_member": "is_core_member",
    })

    return Stream(source, transform=[flag_step, select_step])
def stream_020002(cls):
    """Build the stream that cleans d_org_asset_scale rows of source 020002.

    Joins the crawled rows to the matched org id and blanks out known-bad
    markers in a textual ``total_asset``.

    Returns:
        The configured ``Stream``.
    """
    query = (
        " select idm.matched_id, doa.org_name, doa.data_source, doa.statistic_date, doa.total_asset, doa.funds_num "
        "FROM crawl_public.d_org_asset_scale as doa "
        "JOIN base_public.id_match as idm "
        "ON doa.org_id = idm.source_id "
        "WHERE idm.id_type = 2 AND idm.is_used = 1 AND doa.data_source = '020002' "
        "AND idm.data_source = '020002'"
    )
    source = MysqlInput(ENGINE_RD, query)

    def scrub_text(value):
        if type(value) is str:
            return cls.sub_wrong_to_none(value)
        return value

    scrub_step = transform.ValueMap({"total_asset": scrub_text})

    # "org_name" is queried but is not part of the selected output.
    select_step = transform.MapSelectKeys({
        "matched_id": "org_id",
        "data_source": "data_source",
        "statistic_date": "statistic_date",
        "total_asset": "total_asset",
        "funds_num": "funds_num",
    })

    return Stream(source, transform=[scrub_step, select_step])
def stream_020003(cls):
    """Build the stream that cleans d_person_info rows of source 020003.

    Reads the latest version of each person, maps it to the matched person
    id, translates a textual ``background`` through ``cls.BACKGROUND`` and
    adds the upper-cased first-letter abbreviation of the name.

    Returns:
        The configured ``Stream``.
    """
    query = (
        "SELECT im.matched_id, person_name, background FROM crawl_private.d_person_info tb_main "
        "JOIN (SELECT person_id, MAX(version) latest_ver FROM crawl_private.d_person_info GROUP BY person_id) tb_latest "
        "ON tb_main.version = tb_latest.latest_ver AND tb_main.person_id = tb_latest.person_id "
        "JOIN base.id_match im ON im.source_id = tb_main.person_id "
        "AND im.id_type = 3 AND im.source = '020003' AND im.is_used = 1 "
    )
    source = MysqlInput(ENGINE_RD, query)

    def translate_background(value):
        if type(value) is str:
            return cls.BACKGROUND.get(value)
        return value

    def initials(name):
        letters = [item[0] for item in py(name, style=py_style.FIRST_LETTER)]
        return "".join(letters).upper()

    map_step = transform.ValueMap({
        "background": translate_background,
        "person_name_py": (initials, "person_name"),
    })
    select_step = transform.MapSelectKeys({
        "matched_id": "person_id",
        "background": "background",
        "person_name": "person_name",
        "person_name_py": "person_name_py",
    })

    return Stream(source, transform=[map_step, select_step])
def stream_04xxxx_type2(cls):
    """Build the stream that assigns fund managers to 04xxx-source funds.

    Funds matched from a source like '040%' that have no manager row
    (``org_type_code = 2``) in ``base.fund_org_mapping`` are paired with the
    org registered for their source in ``data_test.source_info_org``.

    Returns:
        The configured ``Stream``.
    """
    query = (
        "SELECT idm.matched_id,oi.org_id,oi.org_full_name "
        "FROM (SELECT DISTINCT matched_id,source FROM base.id_match "
        "WHERE id_type = 1 AND is_used = 1 AND source LIKE '040%%' "
        "AND matched_id NOT IN (SELECT fund_id FROM base.fund_org_mapping WHERE org_type_code=2)) idm "
        "JOIN data_test.source_info_org sig ON idm.source = sig.source_id "
        "JOIN base.org_info oi ON sig.org_id = oi.org_id"
    )
    source = MysqlInput(ENGINE_RD, query)

    # Every row is labelled as a fund manager (code 2).
    label_step = transform.ValueMap({
        "org_type": "基金管理人",
        "org_type_code": 2,
    })
    select_step = transform.MapSelectKeys({
        "matched_id": "fund_id",
        "org_full_name": "org_name",
        "org_type": "org_type",
        "org_type_code": "org_type_code",
        "org_id": "org_id",
    })

    return Stream(source, transform=[label_step, select_step])
def stream_020003(cls, fund_ids):
    """Build the stream that supplies bond quantity from source 020003.

    Reads crawled bond rows for the given matched fund ids that line up with
    an existing row in ``base_test.fund_position_bond_test_20180515`` (same
    fund, statistic date and subject), cleans ``quantity`` and drops rows
    whose quantity is missing.

    Args:
        fund_ids: Matched fund ids, rendered into the IN clause by
            ``sf.SQL.values4sql``.

    Returns:
        The configured ``base.Stream``.
    """
    template = (
        "SELECT im.matched_id, dfp.statistic_date, dfp.subject_id, fpb.data_source, dfp.quantity "
        "FROM base_public.id_match im "
        "JOIN crawl_public.d_fund_position dfp "
        "ON im.source_id = dfp.fund_id AND im.data_source = dfp.data_source "
        "JOIN base_test.fund_position_bond_test_20180515 fpb "
        "ON im.matched_id = fpb.fund_id AND fpb.statistic_date = dfp.statistic_date AND fpb.subject_id = dfp.subject_id "
        "WHERE type = '债券' AND im.matched_id IN {fids} AND im.id_type = 1 AND im.is_used = 1 "
        "AND im.data_source = '020003'"
    )
    query = template.format(fids=sf.SQL.values4sql(fund_ids))
    source = MysqlInput(cls.engine, query)

    def clean_quantity(value):
        return cls._clean_amount(value)

    clean_step = transform.ValueMap({"quantity": clean_quantity})
    # "data_source" is queried but is not part of the selected output.
    select_step = transform.MapSelectKeys({
        "matched_id": "fund_id",
        "statistic_date": None,
        "subject_id": None,
        "quantity": None,
    })
    drop_step = transform.Dropna(subset=["quantity"])

    return base.Stream(source, transform=[clean_step, select_step, drop_step])