def stream_020002_investment_years(cls): """ 清洗 d_person_info; """ sql = "select idm.matched_id, dd.investment_years FROM \ (SELECT matched_id,source,source_id FROM base.id_match \ where id_type = 3 and is_used = 1 AND source = '020002') as idm \ JOIN \ (SELECT * FROM \ (select MAX(version) as mm,person_id as pi,source_id as si \ FROM crawl_private.d_person_info GROUP BY person_id) as ma \ JOIN crawl_private.d_person_info as dpd \ ON dpd.person_id = ma.pi and ma.mm = dpd.version) dd \ ON dd.pi = idm.source_id \ AND dd.si = idm.source" inp = MysqlInput(ENGINE_RD, sql) vm = transform.ValueMap({ "investment_years": lambda x: cls.sub_wrong_to_none(x) if type(x) is str else x, }) sk = transform.MapSelectKeys({ "matched_id": "person_id", 'investment_years': 'investment_years' }) dp = transform.Dropna(how="any", axis=0) s = Stream(inp, transform=[vm, sk, dp]) return s
def stream_020003_resume(cls): sql = "select idm.matched_id, dd.resume FROM \ (SELECT matched_id,source,source_id FROM base.id_match \ where id_type = 3 and is_used = 1 AND source = '020003') as idm \ JOIN \ (SELECT * FROM \ (select MAX(version) as mm,person_id as pi,source_id as si \ from crawl_private.d_person_description GROUP BY person_id) ma \ JOIN crawl_private.d_person_description as dpd \ ON dpd.person_id = ma.pi and ma.mm = dpd.version) dd \ ON dd.pi = idm.source_id \ AND dd.si = idm.source" inp = MysqlInput(ENGINE_RD, sql) vm = transform.ValueMap({ "investment_years": (lambda x: cls.clean_01(x) if type(x) is str else x, 'resume') }) sk = transform.MapSelectKeys({ "matched_id": "person_id", 'investment_years': 'investment_years' }) dp = transform.Dropna(how='any', axis=0) s = Stream(inp, transform=[vm, sk, dp]) return s
def stream_020003_graduate_school(cls): """ 清洗 d_person_info; """ sql = "select idm.matched_id, dd.graduate_school FROM \ (SELECT matched_id,source,source_id FROM base.id_match \ where id_type = 3 and is_used = 1 AND source = '020003') as idm \ JOIN \ (SELECT * FROM \ (select MAX(version) as mm,person_id as pi,source_id as si \ FROM crawl_private.d_person_info GROUP BY person_id) as ma \ JOIN crawl_private.d_person_info as dpd \ ON dpd.person_id = ma.pi and ma.mm = dpd.version) dd \ ON dd.pi = idm.source_id \ AND dd.si = idm.source" inp = MysqlInput(ENGINE_RD, sql) sk = transform.MapSelectKeys({ "matched_id": "person_id", 'graduate_school': 'graduate_school' }) dp = transform.Dropna(how='any', axis=0) s = Stream(inp, transform=[sk, dp]) return s
def stream_602_4(cls, c_cleaned): sql = "SELECT fi.fund_id, fi.fund_name " \ "FROM base.fund_info fi " \ "WHERE fi.fund_id NOT IN (SELECT DISTINCT fund_id FROM base.fund_type_mapping_import WHERE typestandard_code = 602) " inp = MysqlInput(ENGINE, sql) # 非惰性求值 fids = set(c_cleaned.dataframe["fund_id"]) vm0 = transform.ValueMap({ "__tmp": (lambda x: (60201, "非结构化", None, None) if x not in fids else (None, None, None, None), "fund_id") }) vm1 = transform.ValueMap({ FundTypeMappingImport.typestandard_code.name: 602, FundTypeMappingImport.typestandard_name.name: "按结构类型分类", FundTypeMappingImport.type_code.name: (lambda x: x[0], "__tmp"), FundTypeMappingImport.type_name.name: (lambda x: x[1], "__tmp"), FundTypeMappingImport.stype_code.name: (lambda x: x[2], "__tmp"), FundTypeMappingImport.stype_name.name: (lambda x: x[3], "__tmp"), FundTypeMappingImport.classified_by.name: CLASSIFIED_BY }) sk = transform.MapSelectKeys(cls.FIXED_FIELDS) dn = transform.Dropna(subset=[FundTypeMappingImport.type_code.name, FundTypeMappingImport.stype_code.name], how="all") s = base.Stream(inp, [vm0, vm1, sk, dn]) return s
def stream_000001(): session = dbsession(bind=engine_r) stmt = session.query(YBondInfo).join( BondInfo, YBondInfo.bond_id == BondInfo.bond_id).filter( and_(YBondInfo.is_used == 1, YBondInfo.is_used == 1)).with_entities( YBondInfo.bond_id, YBondInfo.interest_type) inp = MysqlInput(session.bind, stmt) dn = transform.Dropna(subset=[YBondInfo.interest_type.name]) km = transform.MapSelectKeys({ YBondInfo.bond_id.name: BondInfo.bond_id.name, YBondInfo.source_id.name: None, YBondInfo.interest_type.name: BondInfo.interest_type.name, }) s = Stream(inp, ( dn, km, )) return s
def stream_010005_ts103(cls): source_id = "010005" tb = "crawl_private.x_fund_info_futures" tmpsql = cls.SQL_103.format(sid=source_id, tb=tb) inp = MysqlInput(ENGINE, tmpsql) vm = transform.ValueMap(OrderedDict( [ [FundTypeSource.type_code.name, (lambda x: cls.TS_MAPPER_103.get(x, [None, None])[0], "fund_type_target_amac")], [FundTypeSource.type_name.name, (lambda x: cls.TS_MAPPER_103.get(x, [None, None])[1], "fund_type_target_amac")], [FundTypeSource.typestandard_code.name, "103"], [FundTypeSource.typestandard_name.name, "按投资标的分类"], [FundTypeSource.source_id.name, source_id] ] )) sk = transform.MapSelectKeys( { "matched_id": FundTypeSource.fund_id.name, "fund_name": FundTypeSource.fund_name.name, FundTypeSource.source_id.name: None, FundTypeSource.typestandard_name.name: None, FundTypeSource.typestandard_code.name: None, FundTypeSource.type_code.name: None, FundTypeSource.type_name.name: None, } ) dn = transform.Dropna(subset=[FundTypeSource.type_code.name]) s = base.Stream(inp, transform=(vm, sk, dn)) return s
def stream_020001_ts304(cls): source_id = "020001" tb = "crawl_private.d_fund_info" tmpsql = cls.SQL_X04.format(tb=tb, sid=source_id) inp = MysqlInput(ENGINE, tmpsql) vm = transform.ValueMap( OrderedDict([ [FundTypeSource.source_id.name, source_id], [FundTypeSource.typestandard_code.name, "104"], [FundTypeSource.typestandard_name.name, "按发行主体分类"], [ "__cached", (lambda x: cls.TS_MAPPER_304.get(x, [None, None]), "fund_type_issuance") ], [FundTypeSource.type_code.name, (lambda x: x[0], "__cached")], [FundTypeSource.type_name.name, (lambda x: x[1], "__cached")], ])) sk = transform.MapSelectKeys({ "matched_id": FundTypeSource.fund_id.name, "fund_name": FundTypeSource.fund_name.name, FundTypeSource.source_id.name: None, FundTypeSource.typestandard_code.name: None, FundTypeSource.typestandard_name.name: None, FundTypeSource.type_code.name: None, FundTypeSource.type_name.name: None, }) dn = transform.Dropna(subset=[FundTypeSource.type_code.name]) s = base.Stream(inp, [vm, sk, dn]) return s
def stream_000001(cls): """ 清洗stock_info_010001 CSRC分类; """ sql = "SELECT stock_id,type_csrc FROM crawl_finance.stock_info_010001" inp = MysqlInput(ENGINE_RD, sql) vm = transform.ValueMap({ "type_csrc": lambda x: re.sub("\s", "", x.split(",")[1]), "category_code": (lambda x: cls.TYPE_CODE.get(x), "type_csrc"), "category_name": (lambda x: cls.TYPE_NAME.get(x), "type_csrc"), "type_code": (lambda x: cls.STYPE_CODE.get(x), "type_csrc"), "stock_id": lambda x: str(cls.add_stock_suffix(x)) + ".SH" }) sk = transform.MapSelectKeys({ "stock_id": "stock_id", "category_name": "category_name", "category_code": "category_code", "type_code": "type_code", "type_csrc": "type_name" }) dn = transform.Dropna(subset=None, axis=0, how='any') s = Stream(inp, transform=[vm, sk, dn]) return s
def stream_020003(cls, fund_ids): sql = "SELECT im.matched_id, dfp.statistic_date, dfp.subject_id, fpb.data_source, dfp.quantity " \ "FROM base_public.id_match im " \ "JOIN crawl_public.d_fund_position dfp " \ "ON im.source_id = dfp.fund_id AND im.data_source = dfp.data_source " \ "JOIN base_test.fund_position_bond_test_20180515 fpb " \ "ON im.matched_id = fpb.fund_id AND fpb.statistic_date = dfp.statistic_date AND fpb.subject_id = dfp.subject_id " \ "WHERE type = '债券' AND im.matched_id IN {fids} AND im.id_type = 1 AND im.is_used = 1 " \ "AND im.data_source = '020003'".format(fids=sf.SQL.values4sql(fund_ids)) inp = MysqlInput(cls.engine, sql) vm = transform.ValueMap({ "quantity": lambda x: cls._clean_amount(x), }) sk = transform.MapSelectKeys({ "matched_id": "fund_id", "statistic_date": None, "subject_id": None, "quantity": None }) dn = transform.Dropna(subset=["quantity"]) return base.Stream(inp, transform=[vm, sk, dn])
def stream_010005(): sql_fi = "SELECT im.matched_id as fund_id, fi.fund_name, fi.foundation_date, issuing_scale_amac " \ "FROM crawl_private.{tb} xfi " \ "JOIN (SELECT fund_id, MAX(version) latest_ver FROM crawl_private.{tb} GROUP BY fund_id) tb_latest " \ "ON tb_latest.latest_ver = xfi.version AND tb_latest.fund_id = xfi.fund_id " \ "JOIN base.id_match im ON im.source_id = xfi.fund_id AND im.source = '010005' AND im.id_type = 1 AND im.is_used = 1 " \ "LEFT JOIN base.fund_info fi ON fi.fund_id = im.matched_id".format(tb="x_fund_info_futures") inp = MysqlInput(ENGINE_C, sql_fi) vm = transform.ValueMap({ "issuing_scale_amac": lambda x: float(x), }) vm2 = transform.ValueMap({ "issuing_scale_amac": lambda x: None if x == 0 else x, }) dn = transform.Dropna( subset=["fund_id", "foundation_date", "issuing_scale_amac"]) sk = transform.MapSelectKeys({ "fund_id": FundAssetScale.fund_id.name, "fund_name": FundAssetScale.fund_name.name, "issuing_scale_amac": FundAssetScale.asset_scale.name, "foundation_date": FundAssetScale.statistic_date.name, }) s = base.Stream(inp, transform=(vm, vm2, dn, sk)) return s
def stream_x_org_info(): sql = "SELECT xoi.org_id, xoi.final_report_time, fund_num, fund_scale FROM crawl_private.x_org_info xoi " \ "JOIN (SELECT org_id, MAX(version) latest_ver FROM crawl_private.x_org_info WHERE is_used = 1 GROUP BY org_id) tb " \ "ON xoi.org_id = tb.org_id AND xoi.version = tb.latest_ver" inp = MysqlInput(ENGINE, sql) vm = transform.ValueMap({OrgAssetScale.data_time.name: dt.date.today()}) sk = transform.MapSelectKeys({ "org_id": OrgAssetScale.org_id.name, "final_report_time": OrgAssetScale.statistic_date.name, "fund_num": OrgAssetScale.funds_num.name, "fund_scale": OrgAssetScale.asset_scale.name, OrgAssetScale.data_time.name: None }) dn = transform.Dropna(subset=[ OrgAssetScale.asset_scale.name, OrgAssetScale.funds_num.name, OrgAssetScale.statistic_date.name ], how="all") s = base.Stream(inp, transform=(vm, sk, dn)) return s
def stream_010003_ts104(cls): source_id = "010003" tb = "crawl_private.x_fund_info_private" sql = "SELECT im.matched_id, xfi.type_name_amac, fi.fund_name " \ "FROM {tb} xfi " \ "JOIN (SELECT fund_id, MAX(version) latest_ver FROM {tb} GROUP BY fund_id) t " \ "ON xfi.fund_id = t.fund_id AND xfi.version = t.latest_ver " \ "JOIN (SELECT matched_id, source_id FROM base.id_match WHERE source = '{sid}' AND id_type = 1 AND is_used = 1) im " \ "ON xfi.fund_id = im.source_id " \ "LEFT JOIN fund_info fi ON im.matched_id = fi.fund_id".format( tb=tb, sid=source_id ) inp = MysqlInput(ENGINE, sql) vm = transform.ValueMap( { FundTypeSource.source_id.name: source_id, FundTypeSource.typestandard_code.name: "104", FundTypeSource.typestandard_name.name: "按发行主体分类", FundTypeSource.type_code.name: (lambda x: cls.TS_MAPPER_104.get(x, [None, None])[0], "type_name_amac"), FundTypeSource.type_name.name: (lambda x: cls.TS_MAPPER_104.get(x, [None, None])[1], "type_name_amac") } ) dn = transform.Dropna(subset=[FundTypeSource.fund_id.name, FundTypeSource.type_code.name]) s = base.Stream(inp, [vm, cls.SK, dn]) return s
def stream_consultant_000001(cls): sql = "SELECT fund_id, fund_name, oi.org_id, oi.org_name, org_type_code, start_date, end_date, is_current " \ "FROM crawl_private.y_fund_org_mapping yfom " \ "JOIN base.org_info oi ON yfom.org_id = oi.org_id " \ "WHERE org_type_code = 1 AND is_used = 1" inp = MysqlInput(ENGINE_C, sql) vm = transform.ValueMap({ "org_type": "投资顾问" }) sk = transform.MapSelectKeys({ "fund_id": None, "fund_name": None, "org_id": None, "org_name": None, "org_type": None, "org_type_code": None, "start_date": None, "end_date": None, "is_current": None, }) dn = transform.Dropna( subset=[FundOrgMapping.fund_id.name, FundOrgMapping.org_id.name, FundOrgMapping.org_type_code.name] ) s = Stream(inp, transform=[vm, sk, dn]) return s
def stream_securities(cls): """ 清洗 x_info_securities(010004); """ sql = " \ SELECT id.matched_id, ac.orientation_amac FROM ( \ SELECT A.matched_id, A.source_id FROM \ (SELECT matched_id, source_id, entry_time FROM base.id_match \ WHERE source = '010004' and is_used = 1 and id_type = 1 ORDER BY entry_time DESC) \ AS A GROUP BY A.matched_id) AS id \ JOIN \ (SELECT * FROM (SELECT MAX(version) mm ,fund_id as maid FROM crawl_private.x_fund_info_securities \ GROUP BY fund_id) AS p \ JOIN crawl_private.x_fund_info_securities as ff \ on ff.fund_id = p.maid and ff.version = p.mm) AS ac \ ON ac.maid = id.source_id" inp = MysqlInput(ENGINE_RD, sql) vm = transform.ValueMap({ 'orientation_amac': lambda x: cls.sub_wrong_to_none(x) if type(x) is str else x }) dr = transform.Dropna(axis=0, how="any") sk = transform.MapSelectKeys({ "matched_id": "fund_id", "orientation_amac": "investment_range" }) s = Stream(inp, transform=[vm, dr, sk]) return s
def stream_manager_010003(cls, d_xoi=None): sql_maxver = "SELECT im.matched_id, xfi.fund_issue_org_amac, manage_type_amac, type_name_amac, fi.fund_name, fi.foundation_date, fi.end_date, fi.fund_status " \ "FROM x_fund_info_private xfi " \ "JOIN (SELECT fund_id, MAX(version) latest_ver FROM x_fund_info_private GROUP BY fund_id) tb_latest " \ "ON xfi.version = tb_latest.latest_ver AND xfi.fund_id = tb_latest.fund_id " \ "JOIN base.id_match im ON im.source_id = xfi.fund_id AND im.id_type = 1 AND im.source = '010003' AND im.is_used = 1 " \ "LEFT JOIN base.fund_info fi " \ "ON im.matched_id = fi.fund_id AND im.id_type = 1 AND im.source = '010003' " \ "WHERE fund_name_amac NOT LIKE '%%信托计划'" inp = MysqlInput(ENGINE_C, sql_maxver) inp2 = MysqlInput(ENGINE_B, "SELECT org_id, org_name FROM base.org_info") def clean_org_type(mng_tp, tp): # 私募证券投资基金 = 证券投资基金; 其他私募投资基金 = 其他投资基金 if mng_tp == "顾问管理": if tp in {"私募证券投资基金", "证券投资基金"}: return "基金管理人" elif mng_tp == "受托管理": if tp in {"私募证券投资基金", "证券投资基金", "创业投资基金", "其他私募投资基金", "其他投资基金", "股权投资基金"}: return "基金管理人" elif mng_tp == "自我管理": if tp in {"私募证券投资基金", "证券投资基金", "股权投资基金", "创业投资基金", "其他私募投资基金", "其他投资基金"}: return "基金管理人" if d_xoi is None: d_xoi = cls._d_oi_manager() vm = transform.ValueMap( OrderedDict([ ("fund_issue_org_amac", lambda x: x.strip()), ("org_id", (lambda x: d_xoi.get(x), "fund_issue_org_amac")), (FundOrgMapping.org_type.name, (lambda mng_tp, tp: clean_org_type(mng_tp, tp), "manage_type_amac", "type_name_amac")), (FundOrgMapping.is_current.name, (lambda x: cls._clean_iscurrent(x), "fund_status")) ]) ) jn = transform.Join(inp2, how="left", on="org_id") dn = transform.Dropna(subset=["matched_id", "org_id", FundOrgMapping.org_type.name]) vm2 = transform.ValueMap({ FundOrgMapping.org_type_code.name: (lambda x: 2 if x == "基金管理人" else None, FundOrgMapping.org_type.name), }) sk = transform.MapSelectKeys(cls.FIELDS) s = Stream(inp, transform=(vm, jn, dn, vm2, sk)) return s
def stream_010101(cls): sql = "SELECT im.matched_id as person_id, pi.person_name, xpc.status, xpc.org_name as org_name_ " \ "FROM crawl_private.x_person_certificate_010101 xpc " \ "JOIN base.id_match im ON xpc.person_id = im.source_id " \ "LEFT JOIN base.person_info pi ON im.matched_id = pi.person_id " \ "WHERE im.source = '010101' AND im.id_type = 3 AND im.is_used = 1" def org_name_dict(): import pandas as pd sql_oi = "SELECT org_full_name, org_id, org_name FROM base.org_info" res = pd.read_sql(sql_oi, ENGINE) v = tuple([(oid, oname) for oid, oname in zip(res["org_id"], res["org_name"])]) res = dict(zip(res["org_full_name"], v)) return res inp = MysqlInput(ENGINE, sql) d = org_name_dict() vm = transform.ValueMap({ OrgPersonMapping.org_id.name: (lambda x: d.get(x)[0], "org_name_"), OrgPersonMapping.org_name.name: (lambda x: d.get(x)[1], "org_name_"), OrgPersonMapping.is_current.name: (lambda x: { "正常": 1, "离职": 0 }.get(x), "status"), OrgPersonMapping.duty.name: None }) sk = transform.MapSelectKeys({ "person_id": OrgPersonMapping.person_id.name, "person_name": OrgPersonMapping.person_name.name, OrgPersonMapping.org_id.name: None, "org_name": OrgPersonMapping.org_name.name, OrgPersonMapping.is_current.name: None, OrgPersonMapping.duty.name: None }) dn = transform.Dropna(subset=[ OrgPersonMapping.person_id.name, OrgPersonMapping.org_id.name ]) s = base.Stream(inp, [vm, sk, dn]) return s
def stream_020001(): session = dbsession(bind=engine_r) stmt = session.query(DBondInfo).join( BondInfo, DBondInfo.bond_id == BondInfo.bond_id).filter( DBondInfo.source_id == "020001").with_entities( DBondInfo.bond_id, DBondInfo.source_id, DBondInfo.bond_full_name, DBondInfo.interest_type, DBondInfo.interest_freq) inp = MysqlInput(session.bind, stmt) def clean_interest_freq(type_, freq, fname): if fname.find("贴现") >= 0: return "贴现" if type_ in {"贴现", "零息"}: return "贴现" else: d_type_freq = { "固定": { "年付": "年付息", "半年付": "半年付息", "季付": "季度付息", "月付": "月付息", "贴现": "贴现" }, "浮动": { "年付": "年付息", "半年付": "半年付息", "季付": "季度付息", "月付": "月付息", } } return d_type_freq.get(type_).get(freq) vm = transform.ValueMap({ DBondInfo.interest_freq.name: (lambda type_, freq, fname: clean_interest_freq(type_, freq, fname), DBondInfo.interest_type.name, DBondInfo.interest_freq.name, DBondInfo.bond_full_name.name) }) dn = transform.Dropna(subset=[BondInfo.interest_freq.name]) km = transform.MapSelectKeys({ DBondInfo.bond_id.name: BondInfo.bond_id.name, BondInfo.interest_freq.name: None, }) s = Stream(inp, (vm, dn, km)) return s
def stream_020002_ts401(cls): source_id = "020002" tb = "crawl_private.d_fund_info" tmpsql = cls.SQL_X01.format(tb=tb, sid=source_id) inp = MysqlInput(ENGINE, tmpsql) vm0 = transform.ValueMap({ "type_name": lambda x: "" if x.replace("-", "") == "" else x, "stype_name": lambda x: "" if x.replace("-", "") == "" else x, }) vm = transform.ValueMap( OrderedDict([ [FundTypeSource.source_id.name, source_id], [FundTypeSource.typestandard_code.name, "101"], [FundTypeSource.typestandard_name.name, "按投资策略分类"], [ "__cached", (lambda tn, stn: cls.TS_MAPPER_401.get( (tn, stn), [None] * 4), "type_name", "stype_name") ], [FundTypeSource.type_code.name, (lambda x: x[0], "__cached")], [FundTypeSource.type_name.name, (lambda x: x[2], "__cached")], [FundTypeSource.stype_code.name, (lambda x: x[1], "__cached")], [FundTypeSource.stype_name.name, (lambda x: x[3], "__cached")], ])) sk = transform.MapSelectKeys({ "matched_id": FundTypeSource.fund_id.name, "fund_name": FundTypeSource.fund_name.name, FundTypeSource.source_id.name: None, FundTypeSource.typestandard_code.name: None, FundTypeSource.typestandard_name.name: None, FundTypeSource.type_code.name: None, FundTypeSource.type_name.name: None, FundTypeSource.stype_code.name: None, FundTypeSource.stype_name.name: None, }) dn = transform.Dropna(subset=[ FundTypeSource.type_code.name, FundTypeSource.stype_code.name ], how="all") s = base.Stream(inp, [vm0, vm, sk, dn]) return s
def stream_020001(cls): """ 清洗 fund_custodian; """ dict_01 = {'中国银行(香港)有限公司': '02000001'} #往这里可以添加需要的匹配关系 dict_020001 = dict(cls.org_full_name, **dict_01) sql = "SELECT idh.matched_id, ff.fund_custodian, fi.fund_name FROM ( \ SELECT matched_id,source_id FROM base_public.id_match \ where id_type = 1 AND is_used = 1 AND data_source = '020001') as idh \ JOIN \ (SELECT * FROM ( \ SELECT MAX(version) as mm, fund_id as id FROM crawl_public.d_fund_info GROUP BY fund_id) as idm \ JOIN crawl_public.d_fund_info as df \ ON idm.id = df.fund_id AND idm.mm = df.version \ WHERE df.data_source = '020001') AS ff \ ON ff.id = idh.matched_id \ JOIN base_public.fund_info as fi \ ON fi.fund_id = idh.matched_id \ where idh.matched_id <> '777777'" inp = MysqlInput(ENGINE_RD, sql) vm = transform.ValueMap({ "org_id": (lambda x: dict_020001.get(x), "fund_custodian"), }) vm2 = transform.ValueMap({ "type_code": (lambda x: cls.type_code.get(x), "org_id"), }) vm3 = transform.ValueMap({ "type_name": (lambda x: cls.type_name.get(x), "type_code"), }) dr = transform.Dropna(axis=0, how="any") sk = transform.MapSelectKeys({ "matched_id": "fund_id", "org_id": "org_id", "fund_custodian": "org_name", "fund_name": "fund_name", "type_code": "type_code", "type_name": "type_name" }) s = Stream(inp, transform=[vm, vm2, vm3, dr, sk]) return s
def stream_consultant_010004(cls, d_xoi=None): """ 清洗010004源(证券公司)发行产品的投顾公司配对信息; Args: d_xoi: Returns: """ sql_maxver = "SELECT im.matched_id, xfi.fund_issue_org_amac, manage_type_amac, fi.fund_name, fi.foundation_date, fi.end_date, fi.fund_status " \ "FROM x_fund_info_securities xfi " \ "JOIN (SELECT fund_id, MAX(version) latest_ver FROM x_fund_info_securities GROUP BY fund_id) tb_latest " \ "ON xfi.version = tb_latest.latest_ver AND xfi.fund_id = tb_latest.fund_id " \ "JOIN base.id_match im ON im.source_id = xfi.fund_id AND im.id_type = 1 AND im.source = '010004' AND im.is_used = 1 " \ "LEFT JOIN base.fund_info fi ON im.matched_id = fi.fund_id AND im.id_type = 1 AND im.source = '010004'" \ "WHERE fund_name_amac NOT LIKE '%%信托计划'" inp = MysqlInput(ENGINE_C, sql_maxver) inp2 = MysqlInput(ENGINE_B, "SELECT org_id, org_name FROM base.org_info") if d_xoi is None: d_xoi = cls._d_oi_consultant() vm = transform.ValueMap( OrderedDict([ ("fund_issue_org_amac", lambda x: x.strip()), ("org_id", (lambda x: d_xoi.get(x), "fund_issue_org_amac")), (FundOrgMapping.org_type.name, (lambda x: {"主动管理": "投资顾问"}.get(x), "manage_type_amac")), (FundOrgMapping.is_current.name, (lambda x: cls._clean_iscurrent(x), "fund_status")) ]) ) jn = transform.Join(inp2, how="left", on="org_id") dn = transform.Dropna(subset=["matched_id", "org_id", FundOrgMapping.org_type.name]) vm2 = transform.ValueMap({ FundOrgMapping.org_type_code.name: (lambda x: 1 if x == "投资顾问" else None, FundOrgMapping.org_type.name), }) dd = transform.DropDuplicate(subset=["matched_id", "org_id", FundOrgMapping.org_type.name]) sk = transform.MapSelectKeys(cls.FIELDS) s = Stream(inp, transform=[vm, jn, dn, vm2, dd, sk]) # 先去空, 避免空值产生, 导致整形列变成浮点型 return s
def stream_010001(): session = dbsession(bind=engine_r) stmt = session.query(DBondInfo).join( BondInfo, DBondInfo.bond_id == BondInfo.bond_id).filter( DBondInfo.source_id == "010001").with_entities( DBondInfo.bond_id, DBondInfo.source_id, DBondInfo.bond_full_name, DBondInfo.interest_type, DBondInfo.interest_freq) inp = MysqlInput(session.bind, stmt) def clean_interest_freq(type_, freq, fname): if fname.find("贴现") >= 0: return "贴现" if type_ in {"零息", "贴现"}: return "贴现" elif type_ == "一次还本息": return "到期一次还本付息" else: d_freq = { "到期一次还本付息": "到期一次还本付息", # 除去type_为"零息"的情况, 其余freq为"到期一次还本付息"映射为"到期一次还本付息" "按年付息": "年付息", "按半年付息": "半年付息", "按季度付息": "季度付息", "按月付息": "月付息", } return d_freq.get(freq) vm = transform.ValueMap({ BondInfo.interest_freq.name: (lambda type_, freq, fname: clean_interest_freq(type_, freq, fname), DBondInfo.interest_type.name, DBondInfo.interest_freq.name, DBondInfo.bond_full_name.name) }) dn = transform.Dropna(subset=[BondInfo.interest_freq.name]) km = transform.MapSelectKeys({ DBondInfo.bond_id.name: BondInfo.bond_id.name, BondInfo.interest_freq.name: None, }) s = Stream(inp, (vm, dn, km)) return s
def stream_602_2(cls): sql = "SELECT fi.fund_id, fi.fund_name, fi.fund_full_name " \ "FROM base.fund_info fi " \ "WHERE fi.fund_id NOT IN (SELECT DISTINCT fund_id FROM base.fund_type_mapping_import WHERE typestandard_code = 602) " inp = MysqlInput(ENGINE, sql) def clean_tc(fund_full_name): patt = "优先级|(?<!\w)A级|劣后级|进取级|B级|普通级|次级|风险级|中间级|管理级|结构化|分级" # A级前面不为数字和字母 matched = set(re.findall(patt, fund_full_name)) if ("优先级" in matched) or ("A级" in matched): return 60202, "结构化", 6020202, "优先级" elif ("劣后级" in matched) or ("进取级" in matched)\ or ("B级" in matched) or ("普通级" in matched)\ or ("次级" in matched) or ("风险级" in matched): return 60202, "结构化", 6020203, "进取级/劣后级" elif "中间级" in matched: return 60202, "结构化", 6020204, "中间级(夹层)" elif "管理级" in matched: return 60202, "结构化", 6020205, "管理型份额" elif ("结构化" in matched) or ("分级" in matched): return 60202, "结构化", None, None return None, None, None, None vm0 = transform.ValueMap({ "__tmp": (lambda x: clean_tc(x), "fund_full_name") }) vm = transform.ValueMap({ FundTypeMappingImport.typestandard_code.name: 602, FundTypeMappingImport.typestandard_name.name: "按结构类型分类", FundTypeMappingImport.type_code.name: (lambda x: x[0], "__tmp"), FundTypeMappingImport.type_name.name: (lambda x: x[1], "__tmp"), FundTypeMappingImport.stype_code.name: (lambda x: x[2], "__tmp"), FundTypeMappingImport.stype_name.name: (lambda x: x[3], "__tmp"), FundTypeMappingImport.classified_by.name: CLASSIFIED_BY }) sk = transform.MapSelectKeys(cls.FIXED_FIELDS) dn = transform.Dropna(subset=[FundTypeMappingImport.type_code.name, FundTypeMappingImport.stype_code.name], how="all") s = base.Stream(inp, [vm0, vm, sk, dn]) return s
def stream_010001(): session = dbsession(bind=engine_r) stmt = session.query(DBondInfo).join( BondInfo, DBondInfo.bond_id == BondInfo.bond_id).filter( and_(DBondInfo.source_id == "010001", DBondInfo.is_used == 1)).with_entities( DBondInfo.bond_id, DBondInfo.source_id, DBondInfo.bond_full_name, DBondInfo.interest_type) inp = MysqlInput(session.bind, stmt) def clean_interest_type(type_, fname): if fname.find("贴现") >= 0: return "零息" else: d_type = { "零息": "零息", "固定利息": "固定利率", "浮动利息": "浮动利率", } res = d_type.get(type_) if res is None: if fname.find("短期融资券") >= 0: return "固定利率" return res vm = transform.ValueMap({ BondInfo.interest_type.name: (lambda type_, fname: clean_interest_type(type_, fname), DBondInfo.interest_type.name, DBondInfo.bond_full_name.name) }) dn = transform.Dropna(subset=[BondInfo.interest_type.name]) km = transform.MapSelectKeys({ DBondInfo.bond_id.name: BondInfo.bond_id.name, BondInfo.interest_type.name: None }) s = Stream(inp, (vm, dn, km)) return s
def stream_manager_010005(cls, d_xoi=None): """ 清洗010005源(期货公司)发行产品的基金管理人配对信息; Args: d_xoi: Returns: """ sql_maxver = "SELECT im.matched_id, xfi.fund_issue_org_amac, fi.fund_name, fi.foundation_date, fi.end_date, fi.fund_status " \ "FROM x_fund_info_futures xfi " \ "JOIN (SELECT fund_id, MAX(version) latest_ver FROM x_fund_info_futures GROUP BY fund_id) tb_latest " \ "ON xfi.version = tb_latest.latest_ver AND xfi.fund_id = tb_latest.fund_id " \ "JOIN base.id_match im ON im.source_id = xfi.fund_id AND im.id_type = 1 AND im.source = '010005' AND im.is_used = 1 " \ "LEFT JOIN base.fund_info fi " \ "ON im.matched_id = fi.fund_id AND im.id_type = 1 AND im.source = '010005' " \ "WHERE fund_name_amac NOT LIKE '%%信托计划'" inp = MysqlInput(ENGINE_C, sql_maxver) inp2 = MysqlInput(ENGINE_B, "SELECT org_id, org_name FROM base.org_info") if d_xoi is None: d_xoi = cls._d_oi_manager() vm = transform.ValueMap(OrderedDict([ ("fund_issue_org_amac", lambda x: x.strip()), ("org_id", (lambda x: d_xoi.get(x), "fund_issue_org_amac")), (FundOrgMapping.org_type.name, "基金管理人"), # 该来源的fund_issue_org_amac都是基金管理人; (FundOrgMapping.org_type_code.name, 2), (FundOrgMapping.is_current.name, (lambda x: cls._clean_iscurrent(x), "fund_status")) ])) jn = transform.Join(inp2, how="left", on="org_id") sk = transform.MapSelectKeys(cls.FIELDS) dn = transform.Dropna( subset=[FundOrgMapping.fund_id.name, FundOrgMapping.org_id.name, FundOrgMapping.org_type_code.name]) s = Stream(inp, transform=(vm, jn, sk, dn,)) return s
def stream_xof_manager(s): vm = transform.ValueMap( OrderedDict([ (FundOrgMapping.org_type_code.name, (lambda x: {"受托管理": 2, "自我管理": 2}.get(x), "org_type")), (FundOrgMapping.org_type.name, (lambda x: {"受托管理": "基金管理人", "自我管理": "基金管理人"}.get(x), "org_type")) ]) ) dn = transform.Dropna() sk = transform.MapSelectKeys( { "fund_id": FundOrgMapping.fund_id.name, "org_id": FundOrgMapping.org_id.name, FundOrgMapping.org_type.name: None, FundOrgMapping.org_type_code.name: None, "fund_name": FundOrgMapping.fund_name.name, "org_name": FundOrgMapping.org_name.name } ) s = Stream(s, transform=(vm, dn, sk)) return s
def stream_010002_ts102(cls): source_id = "010002" tb = "crawl_private.x_fund_info_fundaccount" tmpsql = cls.SQL_102.format(tb=tb, sid=source_id) inp = MysqlInput(ENGINE, tmpsql) vm = transform.ValueMap( { FundTypeSource.source_id.name: source_id, FundTypeSource.typestandard_code.name: "102", FundTypeSource.typestandard_name.name: "按结构类型分类", FundTypeSource.type_code.name: (lambda x: cls.TS_MAPPER_102.get(x, [None, None])[0], "fund_type_allocation_amac"), FundTypeSource.type_name.name: (lambda x: cls.TS_MAPPER_102.get(x, [None, None])[1], "fund_type_allocation_amac"), } ) dn = transform.Dropna(subset=[FundTypeSource.fund_id.name, FundTypeSource.type_code.name]) s = base.Stream(inp, [vm, cls.SK, dn]) return s
def stream_xof_consultant(s): vm = transform.ValueMap( OrderedDict([ (FundOrgMapping.org_type_code.name, (lambda x: {"顾问管理": 1, "自我管理": 1}.get(x), "org_type")), (FundOrgMapping.org_type.name, (lambda x: {"顾问管理": "投资顾问", "自我管理": "投资顾问",}.get(x), "org_type")) ]) ) dn = transform.Dropna() sk = transform.MapSelectKeys( { "fund_id": FundOrgMapping.fund_id.name, "org_id": FundOrgMapping.org_id.name, FundOrgMapping.org_type.name: None, FundOrgMapping.org_type_code.name: None, "fund_name": FundOrgMapping.fund_name.name, "org_name": FundOrgMapping.org_name.name } ) s = Stream(s, transform=(vm, dn, sk)) return s
def stream_020003(): session = dbsession(bind=ENGINE) query_oh = session.query(IdMatch).join( DFundInfo, and_(IdMatch.id_type == 1, IdMatch.source_id == DFundInfo.fund_id, IdMatch.data_source == DFundInfo.data_source, IdMatch.data_source == "020003") ).outerjoin( FundInfo, and_(IdMatch.id_type == 1, IdMatch.matched_id == FundInfo.fund_id) ).filter( IdMatch.is_used == 1 ).with_entities( FundInfo.fund_id, FundInfo.fund_name, IdMatch.data_source, DFundInfo.fund_type ) inp = MysqlInput(session.bind, query_oh) vm = transform.ValueMap( OrderedDict([ [FundTypeMappingSource.type_code.name, (lambda ds, tp: etl.EnumMap.DFundInfo.type_code4.get((ds, tp)), DFundInfo.data_source.name, DFundInfo.fund_type.name)], [FundTypeMappingSource.type_name.name, (lambda tc: etl.Enum.TypeMapping.type_name.get(tc), FundTypeMappingSource.type_code.name)], [FundTypeMappingSource.typestandard_code.name, (lambda tc: tc[:-2], FundTypeMappingSource.type_code.name)], [FundTypeMappingSource.typestandard_name.name, (lambda tsc: etl.Enum.TypeMapping.typestandard_name.get(tsc), FundTypeMappingSource.typestandard_code.name)] ]) ) dn = transform.Dropna(subset=[FundInfo.fund_id.name, FundTypeMappingSource.typestandard_code.name]) sk = transform.MapSelectKeys( { FundInfo.fund_id.name: None, FundInfo.fund_name.name: None, IdMatch.data_source.name: FundTypeMappingSource.data_source.name, FundTypeMappingSource.typestandard_code.name: None, FundTypeMappingSource.typestandard_name.name: None, FundTypeMappingSource.type_code.name: None, FundTypeMappingSource.type_name.name: None, } ) s = base.Stream(inp, transform=(vm, dn, sk)) return s
def stream_020003(): session = dbsession(bind=engine_r) stmt = session.query(DBondInfo).join( BondInfo, DBondInfo.bond_id == BondInfo.bond_id).filter( DBondInfo.source_id == "020003").with_entities( DBondInfo.bond_id, DBondInfo.source_id, DBondInfo.bond_full_name, DBondInfo.interest_type, DBondInfo.interest_freq) inp = MysqlInput(session.bind, stmt) def clean_interest_freq(freq, fname): if fname.find("贴现") >= 0: return "贴现" d_freq = { "到期一次还本付息": "到期一次还本付息", "贴现": "贴现", "按年付息": "年付息", "半年付息": "半年付息", "按季付息": "季度付息", } return d_freq.get(freq) vm = transform.ValueMap({ DBondInfo.interest_freq.name: (lambda freq, fname: clean_interest_freq(freq, fname), DBondInfo.interest_freq.name, DBondInfo.bond_full_name.name) }) dn = transform.Dropna(subset=[BondInfo.interest_freq.name]) km = transform.MapSelectKeys({ DBondInfo.bond_id.name: BondInfo.bond_id.name, BondInfo.interest_freq.name: None, }) s = Stream(inp, (vm, dn, km)) return s
def stream_602_3(cls): sql = "SELECT fi.fund_id, fi.fund_name, fts.type_code FROM fund_info fi " \ "JOIN fund_type_source fts ON fi.fund_id = fts.fund_id " \ "WHERE fi.fund_id NOT IN (" \ "SELECT DISTINCT fund_id FROM base.fund_type_mapping_import WHERE typestandard_code = 602)" \ "AND fts.type_code IN (10201, 10202, 30201, 30202)" inp = MysqlInput(ENGINE, sql) structured, unstructured = (60202, "结构化", None, None), (60201, "非结构化", None, None) d = { 10201: structured, 10202: unstructured, 30201: structured, 30202: unstructured } vm0 = transform.ValueMap({ "__tmp": (lambda x: d.get(x, (None, None, None, None)), "type_code") }) vm = transform.ValueMap({ FundTypeMappingImport.typestandard_code.name: 602, FundTypeMappingImport.typestandard_name.name: "按结构类型分类", FundTypeMappingImport.type_code.name: (lambda x: x[0], "__tmp"), FundTypeMappingImport.type_name.name: (lambda x: x[1], "__tmp"), FundTypeMappingImport.stype_code.name: (lambda x: x[2], "__tmp"), FundTypeMappingImport.stype_name.name: (lambda x: x[3], "__tmp"), FundTypeMappingImport.classified_by.name: CLASSIFIED_BY }) sk = transform.MapSelectKeys(cls.FIXED_FIELDS) dn = transform.Dropna(subset=[FundTypeMappingImport.type_code.name, FundTypeMappingImport.stype_code.name], how="all") s = base.Stream(inp, [vm0, vm, sk, dn]) return s