def conflu_manager(cls):
    """Merge all manager streams and de-duplicate the result on fund_id.

    Returns:
        A Confluence wrapping one Stream: the merged manager streams with a
        ``DropDuplicate`` transform on ``FundOrgMapping.fund_id.name``.
    """
    shared_input = cls._d_oi_manager()

    # The streams must keep the order required by the business rules:
    # the higher the priority, the earlier the position.
    merged = Confluence(
        cls.stream_manager_000001(),
        cls.stream_manager_010002_(shared_input),
        cls.stream_manager_010005_(shared_input),
        cls.stream_manager_010004_(shared_input),
        cls.stream_manager_010003_(shared_input),
        cls.stream_manager_010002(shared_input),
        cls.stream_manager_010005(shared_input),
        cls.stream_manager_010004(shared_input),
        cls.stream_manager_010003(shared_input),
    )

    drop_dupes = transform.DropDuplicate(subset=[FundOrgMapping.fund_id.name])
    return Confluence(Stream(merged, transform=[drop_dupes]))
def confluence(cls):
    """Combine the investment-years, graduate-school, resume and gf streams.

    Returns:
        A Confluence of the eight streams, built with ``on=["person_id"]``.
    """
    return Confluence(
        cls.stream_020001_investment_years(),
        cls.stream_020002_investment_years(),
        cls.stream_020003_graduate_school(),
        cls.stream_020003_investment_years(),
        cls.stream_020001_resume(),
        cls.stream_020002_resume(),
        cls.stream_020003_resume(),
        cls.stream_gf(),
        on=["person_id"],
    )
def conflu_place(cls):
    """Build a single-stream Confluence with region, prov, city and area.

    Reads ``org_id`` and ``address`` from ``TEST_TABLE`` (rows with a non-NULL
    address only), passes each address through ``cls._clean_place`` and
    spreads elements 0..3 of that result over the four location columns.

    Returns:
        A Confluence wrapping the resulting Stream.
    """
    query = "SELECT org_id, address FROM {tb_test} WHERE address IS NOT NULL".format(tb_test=TEST_TABLE)
    source = MysqlInput(ENGINE_RD, query)

    # Step 1: keep the cleaned address in a scratch column.
    parse_address = transform.ValueMap({
        "__tmp": (lambda address: cls._clean_place(address), "address"),
    })

    # Step 2: fan the scratch value out by position.
    split_parts = transform.ValueMap({
        OrgInfo.region.name: (lambda parts: parts[0], "__tmp"),
        OrgInfo.prov.name: (lambda parts: parts[1], "__tmp"),
        OrgInfo.city.name: (lambda parts: parts[2], "__tmp"),
        OrgInfo.area.name: (lambda parts: parts[3], "__tmp"),
    })

    # Step 3: keep only the key and the four location columns.
    keep_columns = transform.MapSelectKeys({
        OrgInfo.org_id.name: None,
        OrgInfo.region.name: None,
        OrgInfo.prov.name: None,
        OrgInfo.city.name: None,
        OrgInfo.area.name: None,
    })

    pipeline = Stream(source, transform=[parse_address, split_parts, keep_columns])
    return Confluence(pipeline)
def conflu_is_reg_now(cls):
    """Build a single-stream Confluence carrying the is_reg_now flag.

    The query RIGHT JOINs onto the orgs of ``TEST_TABLE`` whose category is
    '私募基金管理公司', so ``org_id_`` is NULL for orgs without a matching
    ``x_org_info`` row whose version is at least the ``%Y%m%d%H`` stamp of
    eight days ago. A ``None`` in ``org_id_`` maps to "否", anything else
    to "是".

    Returns:
        A Confluence wrapping the resulting Stream.
    """
    cutoff_version = (dt.datetime.now() - dt.timedelta(8)).strftime("%Y%m%d%H")
    query_template = (
        "SELECT xoi.org_id as org_id_, oi.org_id "
        "FROM crawl_private.x_org_info xoi "
        "JOIN (SELECT DISTINCT org_id, version FROM crawl_private.x_org_info WHERE version >= {ver} ) tmp "
        "ON xoi.org_id = tmp.org_id AND xoi.version = tmp.version "
        "JOIN (SELECT matched_id, source_id FROM base.id_match WHERE id_type = 2 AND source = '010001' AND is_used = 1) im "
        "ON xoi.org_id = im.source_id "
        "RIGHT JOIN (SELECT org_id FROM {tb_test} WHERE org_category = '私募基金管理公司') oi "
        "ON im.matched_id = oi.org_id "
    )
    query = query_template.format(tb_test=TEST_TABLE, ver=cutoff_version)
    source = MysqlInput(ENGINE_RD, query)

    flag_registered = transform.ValueMap({
        OrgInfo.is_reg_now.name: (lambda matched_id: "否" if matched_id is None else "是", "org_id_"),
    })
    keep_columns = transform.MapSelectKeys({
        "org_id": OrgInfo.org_id.name,
        OrgInfo.is_reg_now.name: None,
    })

    return Confluence(Stream(source, [flag_registered, keep_columns]))
def conflu_is_member(cls):
    """Build a single-stream Confluence with is_member and member_type.

    Both columns are recomputed from ``is_reg_now``:

    * ``is_member`` becomes "否" whenever ``is_reg_now`` is "否".
    * ``member_type`` is derived by ``resolve_member_type`` below.

    Returns:
        A Confluence wrapping the resulting Stream.
    """
    query = "SELECT org_id, is_reg_now, is_member, member_type FROM {tb_test} ".format(tb_test=TEST_TABLE)
    source = MysqlInput(ENGINE_RD, query)

    def resolve_member_type(is_reg, is_member, member_type, org_id):
        # Not registered: the label depends on whether org_id starts with "P".
        if is_reg == "否":
            return "注销备案资格" if org_id[0] == "P" else "未备案"
        # Registered but not a member.
        if is_reg == "是" and is_member == "否":
            return "尚未取得会员资格"
        # Every other combination keeps the stored value.
        return member_type

    recompute = transform.ValueMap({
        OrgInfo.is_member.name: (
            lambda is_reg, is_member: "否" if is_reg == "否" else is_member,
            "is_reg_now", "is_member",
        ),
        OrgInfo.member_type.name: (
            lambda is_reg, is_member, member_type, org_id: resolve_member_type(
                is_reg, is_member, member_type, org_id
            ),
            "is_reg_now", "is_member", "member_type", "org_id",
        ),
    })
    keep_columns = transform.MapSelectKeys({
        "org_id": OrgInfo.org_id.name,
        OrgInfo.is_member.name: None,
        OrgInfo.member_type.name: None,
    })

    return Confluence(Stream(source, [recompute, keep_columns]))
def conflu1(cls, fund_ids=None):
    """Combine the y/x/d fund_info streams with a per-field source priority.

    Args:
        fund_ids: Forwarded unchanged to every stream builder.

    Returns:
        A Confluence built with ``on=["fund_id"]`` and the priority table
        below as ``prio_l1``.
    """
    def src(code):
        # Each priority entry is a (column, value) pair on source_id.
        return ("source_id", code)

    y_info = cls.stream_y_fund_info(fund_ids)
    x_010002 = cls.stream_x_fund_info_010002(fund_ids)
    x_010003 = cls.stream_x_fund_info_010003(fund_ids)
    x_010004 = cls.stream_x_fund_info_010004(fund_ids)
    x_010005 = cls.stream_x_fund_info_010005(fund_ids)
    d_020001 = cls.stream_d_fund_info_020001(fund_ids)
    d_020002 = cls.stream_d_fund_info_020002(fund_ids)
    d_020003 = cls.stream_d_fund_info_020003(fund_ids)
    d_020005 = cls.stream_d_fund_info_020005(fund_ids)
    # NOTE: cls.stream_d_fund_info_020008 is currently disabled.

    # Priority level -> field -> source to take that field from.
    priority = {
        0: {
            "fund_name": src("000001"),
            "fund_full_name": src("010002"),
            "reg_time": src("010003"),
            "reg_code": src("010002"),
        },
        1: {
            "reg_code": src("010003"),
            "fund_full_name": src("010003"),
        },
        2: {
            "fund_full_name": src("010004"),
        },
        3: {
            "fund_full_name": src("010005"),
        },
    }

    return Confluence(
        y_info, x_010002, x_010003, x_010004, x_010005,
        d_020001, d_020002, d_020003, d_020005,
        on=["fund_id"], prio_l1=priority,
    )
def conflu_master_strategy(cls):
    """Build a single-stream Confluence carrying master_strategy per org.

    The outer query left-joins from the org_info table so that every primary
    key gets updated, which handles changes in the source tables. The
    subquery groups by org and strategy code and orders by fund count
    descending; ``DropDuplicate`` on ``org_id`` then reduces each org to a
    single row (presumably the first, i.e. the most frequent strategy;
    confirm against ``DropDuplicate``).

    Returns:
        A Confluence wrapping the resulting Stream.
    """
    query_template = (
        "SELECT t1.org_id, t2.stype_name FROM {tb_test} t1 "
        "LEFT JOIN (SELECT oi.org_id, ftm.stype_name "
        "FROM {tb_test} oi "
        "JOIN base.fund_org_mapping fom ON oi.org_id = fom.org_id "
        "JOIN (SELECT fund_id, stype_code, stype_name FROM fund_type_mapping "
        "WHERE typestandard_code = 601 AND stype_code <> 6010901 AND flag = 1) ftm ON fom.fund_id = ftm.fund_id "
        "WHERE fom.org_type_code = 1 "
        "GROUP BY oi.org_id, ftm.stype_code "
        "ORDER BY org_id ASC, COUNT(fom.fund_id) DESC) t2 ON t1.org_id = t2.org_id "
    )
    source = MysqlInput(ENGINE_RD, query_template.format(tb_test=TEST_TABLE))

    one_row_per_org = transform.DropDuplicate(subset=["org_id"])
    rename_columns = transform.MapSelectKeys({
        "org_id": OrgInfo.org_id.name,
        "stype_name": OrgInfo.master_strategy.name,
    })

    return Confluence(Stream(source, [one_row_per_org, rename_columns]))
def confluence(cls):
    """Combine ``fund_security_data0507`` with the 020001 stream.

    Returns:
        A Confluence of both streams, built with the five-column ``on`` list
        below.
    """
    join_keys = [
        'fund_id', 'statistic_date', 'id', 'source_id', 'security_category'
    ]
    return Confluence(
        cls.fund_security_data0507(),
        cls.stream_020001(),
        on=join_keys,
    )
def confluence(cls):
    """Combine the 020001, 020002 and 020003 streams.

    Returns:
        A Confluence built with ``on=["org_id", "fund_id"]``.
    """
    return Confluence(
        cls.stream_020001(),
        cls.stream_020002(),
        cls.stream_020003(),
        on=["org_id", "fund_id"],
    )
def main():
    """Merge five bond-info streams and write the result to the BondInfo table.

    The streams are combined with ``on=[BondInfo.bond_id.name]`` and the
    merged dataframe is handed to ``io.to_sql`` with ``engine_w``.
    """
    sources = (
        stream_000001(),
        stream_010001(),
        stream_020001(),
        stream_020002(),
        stream_020003(),
    )
    merged = Confluence(*sources, on=[BondInfo.bond_id.name])
    io.to_sql(BondInfo.__tablename__, engine_w, merged.dataframe)
def confluence(cls):
    """Combine the 010xxx, 020001, 020002 and 020003 streams.

    Returns:
        A Confluence built with ``on=["fund_id", "statistic_date"]``.
    """
    return Confluence(
        cls.stream_010xxx(),
        cls.stream_020001(),
        cls.stream_020002(),
        cls.stream_020003(),
        on=["fund_id", "statistic_date"],
    )
def confluence(cls):
    """Combine the fundaccount, securities, futures and private streams.

    Returns:
        A Confluence built with ``on=["fund_id"]``.
    """
    return Confluence(
        cls.stream_fundaccount(),
        cls.stream_securities(),
        cls.stream_futures(),
        cls.stream_private(),
        on=["fund_id"],
    )
def confluence(cls):
    """Combine the 020001-020003 streams with a priority for purchase_status.

    ``purchase_status`` lists source 020002 at priority level 0 and source
    020001 at level 1.

    Returns:
        A Confluence built with ``on=["fund_id"]`` and that table as
        ``prio_l1``.
    """
    purchase_status_priority = {
        level: {"purchase_status": ("source_id", source_code)}
        for level, source_code in enumerate(("020002", "020001"))
    }
    return Confluence(
        cls.stream_020001(),
        cls.stream_020002(),
        cls.stream_020003(),
        on=["fund_id"],
        prio_l1=purchase_status_priority,
    )
def conflu(cls):
    """Merge the 000001 and 010001 org streams and drop the source_id column.

    ``org_full_name`` lists source 010001 at priority level 0 and source
    000001 at level 1.

    Returns:
        A Confluence wrapping one Stream: the merged data with a
        ``DropKeys(["source_id"])`` transform.
    """
    stream_000001 = cls.stream_000001()
    stream_010001 = cls.stream_010001()

    name_priority = {
        0: {"org_full_name": ("source_id", "010001")},
        1: {"org_full_name": ("source_id", "000001")},
    }
    merged = Confluence(
        stream_000001, stream_010001,
        on=[OrgInfo.org_id.name], prio_l1=name_priority,
    )

    drop_source = transform.DropKeys(["source_id"])
    return Confluence(Stream(merged, transform=[drop_source]))
def confluence(cls):
    """Combine ``y_org_description`` with the 020001-020003 and 010001 streams.

    Returns:
        A Confluence built with ``on=["org_id"]``.
    """
    return Confluence(
        cls.y_org_description(),
        cls.stream_020001(),
        cls.stream_020002(),
        cls.stream_020003(),
        cls.stream_010001(),
        on=["org_id"],
    )
def confluence(cls):
    """Combine the y_person_info stream with the crawled person streams.

    Returns:
        A Confluence of the six streams, built with ``on=["person_id"]``.
    """
    return Confluence(
        cls.stream_y_person_info(),
        cls.stream_010001(),
        cls.stream_020001(),
        cls.stream_020002(),
        cls.stream_020001_op(),
        cls.stream_020003(),
        on=["person_id"],
    )
def confluence(cls):
    """Combine the 030001/020001/020002 streams with the three resume streams.

    Returns:
        A Confluence of the six streams, built with ``on=["person_id"]``.
    """
    return Confluence(
        cls.stream_030001(),
        cls.stream_020001(),
        cls.stream_020002(),
        cls.stream_resume_020001(),
        cls.stream_resume_020002(),
        cls.stream_resume_020003(),
        on=["person_id"],
    )
def confluence(cls):
    """Combine six 0200xx streams using a per-field source priority table.

    Returns:
        A Confluence built with ``on=["fund_id"]`` and the table below as
        ``prio_l1``.
    """
    def src(code):
        # Each priority entry is a (column, value) pair on source_id.
        return ("source_id", code)

    # Priority level -> field -> source to take that field from.
    priority = {
        0: {
            "locked_time_limit": src("020001"),
            "min_purchase_amount": src("020001"),
            "min_append_amount": src("020001"),
            "min_append_amount_remark": src("020001"),
            "fee_subscription": src("020001"),
            "fee_redeem": src("020001"),
        },
        1: {
            "locked_time_limit": src("020002"),
            "min_purchase_amount": src("020002"),
            "min_append_amount": src("020008"),
            "min_append_amount_remark": src("020002"),
            "fee_subscription": src("020002"),
            "fee_redeem": src("020002"),
        },
        2: {
            "locked_time_limit": src("020005"),
            "min_purchase_amount": src("020008"),
            "min_append_amount": src("020003"),
            "min_append_amount_remark": src("020008"),
            "fee_subscription": src("020008"),
            "fee_redeem": src("020008"),
        },
        3: {
            "min_purchase_amount": src("020003"),
            # NOTE(review): 020002 already appears for this field at level 1;
            # confirm whether another source was intended here.
            "locked_time_limit": src("020002"),
            "min_append_amount": src("020004"),
            "min_append_amount_remark": src("020003"),
            "fee_subscription": src("020003"),
            "fee_redeem": src("020003"),
        },
        4: {
            "min_purchase_amount": src("020004"),
            "min_append_amount_remark": src("020004"),
            "fee_subscription": src("020004"),
            "fee_redeem": src("020004"),
        },
    }

    return Confluence(
        cls.stream_020001(),
        cls.stream_020002(),
        cls.stream_020008(),
        cls.stream_020003(),
        cls.stream_020005(),
        cls.stream_020004(),
        on=["fund_id"],
        prio_l1=priority,
    )
def conflu_1(cls):
    """Merge the 010001, 020001 (op), 020002 (op) and y streams.

    Returns:
        base.Confluence: built with ``on=["person_id", "org_id", "duty"]``.
    """
    return Confluence(
        cls.stream_010001(),
        cls.stream_op_020001(),
        cls.stream_op_020002(),
        cls.stream_y(),
        on=["person_id", "org_id", "duty"],
    )
def conflu_2(cls):
    """Merge the 010101 source stream with the result of ``conflu_1``.

    Returns:
        The merged Confluence's ``dataframe`` with rows lacking a ``duty``
        value dropped. This is a dataframe, not a Confluence.
    """
    first_confluence = cls.conflu_1()
    stream_010101 = cls.stream_010101()
    merged = Confluence(stream_010101, first_confluence, on=["org_id", "person_id"])
    return merged.dataframe.dropna(subset=['duty'])
def conflu_fund_num(cls):
    """Build a single-stream Confluence carrying fund_num and fund_total_num.

    Two queries count, per org, the funds with ``org_type_code = 1``: one
    restricted to funds whose status is '运行中' (``fund_num``), one over all
    funds (``fund_total_num``). The results are outer-joined on ``org_id``
    and NaN counts are replaced by 0.

    Returns:
        A Confluence wrapping the resulting Stream.
    """
    # NOTE(review): the original comment says the outer query left-joins from
    # the org_info table so that every primary key gets updated, but both
    # queries use a plain JOIN, so orgs without any fund yield no row.
    # Confirm which behavior is intended before changing it.
    sql_operating = (
        "SELECT t1.org_id, t2.fund_num FROM {tb_test} t1 "
        "JOIN (SELECT oi.org_id, COUNT(fom.fund_id) as fund_num FROM {tb_test} oi "
        "JOIN base.fund_org_mapping fom ON oi.org_id = fom.org_id "
        "JOIN base.fund_info fi ON fom.fund_id = fi.fund_id "
        # The trailing space keeps the literal from fusing with GROUP BY.
        "WHERE fom.org_type_code = 1 AND fi.fund_status = '运行中' "
        "GROUP BY fom.org_id) t2 "
        "ON t1.org_id = t2.org_id "
    ).format(tb_test=TEST_TABLE)

    sql_total = (
        "SELECT t1.org_id, t2.fund_total_num FROM {tb_test} t1 "
        "JOIN (SELECT oi.org_id, COUNT(fom.fund_id) as fund_total_num FROM {tb_test} oi "
        "JOIN base.fund_org_mapping fom ON oi.org_id = fom.org_id "
        "JOIN base.fund_info fi ON fom.fund_id = fi.fund_id "
        "WHERE fom.org_type_code = 1 "
        "GROUP BY fom.org_id) t2 "
        "ON t1.org_id = t2.org_id"
    ).format(tb_test=TEST_TABLE)

    inp = MysqlInput(ENGINE_RD, sql_operating)
    inp_total = MysqlInput(ENGINE_RD, sql_total)

    jn = transform.Join(inp_total, how="outer", on="org_id")

    # After the outer join a count missing on one side is NaN; store 0.
    vm = transform.ValueMap({
        "fund_num": lambda x: 0 if np.isnan(x) else x,
        "fund_total_num": lambda x: 0 if np.isnan(x) else x,
    })

    sk = transform.MapSelectKeys({
        "org_id": OrgInfo.org_id.name,
        "fund_num": OrgInfo.fund_num.name,
        "fund_total_num": OrgInfo.fund_total_num.name
    })

    s = Stream(inp, [jn, vm, sk])
    return Confluence(s)
def main():
    """Merge four bond-info streams with a field priority and store the result.

    The streams are combined with ``on=BondInfo.bond_id.name``. The merged
    dataframe, minus the ``DBondInfo.source_id`` column, is written to the
    BondInfo table through ``engine_w``.
    """
    stream_010001_ = stream_010001()
    stream_020001_ = stream_020001()
    stream_020002_ = stream_020002()
    stream_020003_ = stream_020003()

    source_col = DBondInfo.source_id.name

    def src(code):
        # Each priority entry is a (column, value) pair on the source column.
        return (source_col, code)

    # 122577.SH, par_value (reference kept from the original author; its
    # meaning is not evident from this block).
    # Priority level -> field -> source to take that field from.
    key_priority = {
        0: {
            BondInfo.issue_price.name: src("020002"),
            BondInfo.issue_amount.name: src("010001"),
            BondInfo.coupon_rate.name: src("020001"),
            BondInfo.maturity_date.name: src("010001"),
            BondInfo.value_date.name: src("020003"),
        },
        1: {
            BondInfo.issue_price.name: src("020001"),
            BondInfo.issue_amount.name: src("020001"),
            BondInfo.coupon_rate.name: src("020002"),
            BondInfo.maturity_date.name: src("020002"),
        },
        2: {
            BondInfo.issue_price.name: src("010001"),
            BondInfo.issue_amount.name: src("020002"),
            BondInfo.coupon_rate.name: src("010001"),
            BondInfo.maturity_date.name: src("020003"),
        },
        3: {
            BondInfo.maturity_date.name: src("020001"),
        },
    }

    merged = Confluence(
        stream_010001_, stream_020001_, stream_020002_, stream_020003_,
        on=BondInfo.bond_id.name, prio_l1=key_priority,
    )
    io.to_sql(
        BondInfo.__tablename__,
        engine_w,
        merged.dataframe.drop(DBondInfo.source_id.name, axis=1),
    )
def conflu_pinyin(cls):
    """Build a single-stream Confluence carrying org_name_py per org.

    Reads ``org_id`` and ``org_name`` from ``TEST_TABLE``, strips any
    parenthesised part of the name (full-width or ASCII parentheses), then
    joins the first character of every item returned by ``py`` with
    ``py_style.FIRST_LETTER`` and upper-cases the result.

    Returns:
        A Confluence wrapping the resulting Stream.
    """
    sql = "SELECT org_id, org_name FROM {tb_test}".format(tb_test=TEST_TABLE)
    inp = MysqlInput(ENGINE_RD, sql)

    # The previous pattern "(.*)|\(.*\)" began with an ASCII capture group
    # that matched the entire name, so every name was blanked. The first
    # alternative must match full-width parentheses; the raw string also
    # avoids the invalid "\(" escape.
    vm1 = transform.ValueMap({
        OrgInfo.org_name.name: lambda x: re.sub(r"（.*）|\(.*\)", "", x)
    })

    vm2 = transform.ValueMap({
        OrgInfo.org_name_py.name: (
            lambda name: "".join(
                [item[0] for item in py(name, style=py_style.FIRST_LETTER)]
            ).upper(),
            OrgInfo.org_name.name
        )
    })

    sk = transform.MapSelectKeys({
        OrgInfo.org_id.name: None,
        OrgInfo.org_name_py.name: None
    })

    s = Stream(inp, transform=(vm1, vm2, sk))
    return Confluence(s)
def conflu1(fund_ids=None):
    """Combine the y/x/d fund_info streams with a per-field source priority.

    Args:
        fund_ids: Forwarded unchanged to every stream builder.

    Returns:
        A Confluence built with ``on=["fund_id"]`` and the priority table
        below as ``prio_l1``.
    """
    def src(code):
        # Each priority entry is a (column, value) pair on source_id.
        return ("source_id", code)

    y_info = stream_y_fund_info(fund_ids)
    x_010002 = stream_x_fund_info_010002(fund_ids)
    x_010003 = stream_x_fund_info_010003(fund_ids)
    x_010004 = stream_x_fund_info_010004(fund_ids)
    x_010005 = stream_x_fund_info_010005(fund_ids)
    d_020001 = stream_d_fund_info_020001(fund_ids)
    d_020002 = stream_d_fund_info_020002(fund_ids)

    # Priority level -> field -> source to take that field from.
    priority = {
        0: {
            "fund_name": src("000001"),
            "fund_full_name": src("010002"),
            "reg_time": src("010003"),
            "reg_code": src("010002"),
        },
        1: {
            "reg_code": src("010003"),
            "fund_full_name": src("010003"),
        },
        2: {
            "fund_full_name": src("010004"),
        },
        3: {
            "fund_full_name": src("010005"),
        },
    }

    return Confluence(
        y_info, x_010002, x_010003, x_010004, x_010005, d_020001, d_020002,
        on=["fund_id"], prio_l1=priority,
    )
def conflu_total_asset_mgt_scale_(cls):
    """Build a single-stream Confluence carrying total_asset_mgt_scale per org.

    The outer query left-joins from the org_info table so that every primary
    key gets updated, which handles changes in the source tables. The
    subquery sums, per org, the ``asset_scale`` of each mapped fund
    (``org_type_code = 1``) at that fund's latest ``statistic_date``.

    Returns:
        A Confluence wrapping the resulting Stream.
    """
    query_template = (
        "SELECT t1.org_id, t2.total_asset_mgt_scale "
        "FROM {tb_test} t1 "
        "LEFT JOIN (SELECT oi.org_id, SUM(fas.asset_scale) total_asset_mgt_scale FROM {tb_test} oi "
        "JOIN base.fund_org_mapping fom ON oi.org_id = fom.org_id "
        "JOIN base.fund_asset_scale fas ON fom.fund_id = fas.fund_id "
        "JOIN (SELECT fund_id, MAX(statistic_date) md FROM base.fund_asset_scale GROUP BY fund_id ) fas_latest "
        "ON fas.fund_id = fas_latest.fund_id AND fas.statistic_date = fas_latest.md "
        "WHERE fom.org_type_code = 1 "
        "GROUP BY org_id) t2 "
        "ON t1.org_id = t2.org_id"
    )
    source = MysqlInput(ENGINE_RD, query_template.format(tb_test=TEST_TABLE))

    rename_columns = transform.MapSelectKeys({
        "org_id": OrgInfo.org_id.name,
        "total_asset_mgt_scale": OrgInfo.total_asset_mgt_scale.name,
    })

    return Confluence(Stream(source, [rename_columns]))
def confluence(cls):
    """Combine the 020001 and 020003 streams.

    Returns:
        A Confluence built with the four-column ``on`` list below.
    """
    join_keys = ["fund_id", "data_source", "statistic_date", "subject_id"]
    return Confluence(cls.stream_020001(), cls.stream_020003(), on=join_keys)
def confluence(cls):
    """Wrap the 000001 stream in a Confluence.

    Returns:
        A Confluence over that single stream, built with ``on=["fund_id"]``.
    """
    return Confluence(cls.stream_000001(), on=["fund_id"])
def confluence_2(cls):
    """Wrap the name stream in a Confluence.

    Returns:
        A Confluence over that single stream, built with ``on=["org_id"]``.
    """
    return Confluence(cls.stream_name(), on=["org_id"])
def confluence(cls):
    """Combine the 03xxxx (type1, type2) and 04xxxx (type2) streams.

    Returns:
        A Confluence built with ``on=["fund_id", "org_type_code"]``.
    """
    return Confluence(
        cls.stream_03xxxx_type1(),
        cls.stream_03xxxx_type2(),
        cls.stream_04xxxx_type2(),
        on=["fund_id", "org_type_code"],
    )
def main():
    """Write the 020004 stream to the BondRating table.

    The stream is wrapped in a Confluence and its dataframe is handed to
    ``io.to_sql`` with ``engine_w`` and ``chunksize=500``.
    """
    ratings = Confluence(stream_020004())
    io.to_sql(BondRating.__tablename__, engine_w, ratings.dataframe, chunksize=500)