def main():
    """Rebuild the strategy/issuance/structure type columns and upsert them into fund_info."""
    print("fetching get_data...")
    ftm = pd.read_sql(sql_ftm, engine)
    if len(ftm) == 0:
        print("there is no get_data to be updated...")
        return None
    fi = pd.read_sql(sql_fi, engine)

    # typestandard_code == 1: strategy type, stored as "<type>_<subtype>".
    ftm_stra = ftm.loc[ftm["typestandard_code"] == 1,
                       ["fund_id", "type_name", "stype_name"]]
    ftm_stra["fund_type_strategy"] = (
        ftm_stra["type_name"] + "_" + ftm_stra["stype_name"])
    ftm_stra = ftm_stra[["fund_id", "fund_type_strategy"]]

    # typestandard_code == 3: issuance type (only the top-level type is kept).
    ftm_issu = ftm.loc[ftm["typestandard_code"] == 3,
                       ["fund_id", "type_name", "stype_name"]]
    ftm_issu = ftm_issu[["fund_id", "type_name"]]
    ftm_issu.columns = ["fund_id", "fund_type_issuance"]

    # typestandard_code == 4: structure type plus hierarchy.
    ftm_stru = ftm.loc[ftm["typestandard_code"] == 4,
                       ["fund_id", "type_name", "stype_name"]]
    # BUG FIX: the original assigned a nested list ([[...]]), which silently
    # turned the columns into a MultiIndex and broke the merges below.
    ftm_stru.columns = ["fund_id", "fund_type_structure", "structure_hierarchy"]

    tmp = fi[["fund_id"]].copy()
    result = pd.merge(tmp, ftm_stra, on="fund_id", how="right")
    result = pd.merge(result, ftm_issu, on="fund_id", how="right")
    result = pd.merge(result, ftm_stru, on="fund_id", how="right")

    print("get_data to db...")
    io.to_sql("fund_info", engine, result, "update")
def main():
    """Backfill org_info.region from the address field where region is missing
    or inconsistent with the address."""
    sql = "SELECT org_id, address, region FROM base.org_info"
    with engine_rd.connect() as conn:
        df_oi = pd.read_sql(sql, conn)

    # Keep orgs whose region is NOT contained in address; NaNs are filled with
    # two distinct sentinels so the containment test can never match on them.
    # Then drop rows without an address to extract from.
    mask = list(
        map(lambda region, address: region not in address,
            df_oi["region"].fillna("__NONE1"),
            df_oi["address"].fillna("__NONE2")))
    # .copy() avoids pandas' SettingWithCopy ambiguity on the assignments below,
    # which operated on a slice in the original.
    df = df_oi.loc[mask].dropna(subset=["address"]).copy()

    # Extract a region from the address: prefer a city match, fall back to a province.
    df["region_1"] = df["address"].apply(lambda x: re.search(CITIES_re, x))
    df["region_1"] = df["region_1"].apply(
        lambda m: m.groups()[0] if m is not None else None)
    df["region_2"] = df["address"].apply(
        lambda x: re.search(PROVINCES_CHN_re, x))
    df["region_2"] = df["region_2"].apply(
        lambda m: m.groups()[0] if m is not None else None)
    df["region"] = df["region_1"].fillna(df["region_2"])

    df = df.drop(columns=["region_1", "region_2", "address"])
    df = df.dropna(subset=["region"])

    # The context manager closes the connection; the explicit close() the
    # original called after the `with` block was redundant and is removed.
    with engine_wt.connect() as conn:
        io.to_sql("org_info", conn, df)
def start_cal(oids, freq):
    """Calculate org indexes for the given org ids at frequency `freq`
    ('w' weekly / 'm' monthly) and persist dynamic and static results.

    Args:
        oids: iterable of org ids to process.
        freq: "w" or "m"; selects the target table pair.
    """
    table = {
        "w": {
            "dynamic": "org_weekly_index",
            "static": "org_weekly_index_static"
        },
        "m": {
            "dynamic": "org_monthly_index",
            "static": "org_monthly_index_static"
        }
    }
    cols = ["sample_num", "index_value", "index_id", "index_name", "org_id", "date"]

    oi = OrgIndex(freq, engine_rd, engine_rd)
    tasks = list(oi.loop_by_orgs(oids, CALDATE))
    for task in tasks:
        for results in task:
            d, s = results
            if d is not None:
                # A single reset_index suffices; the original reset twice,
                # which only added a throwaway "index" column that the column
                # selection immediately dropped.
                d = d.reset_index()[cols].rename(
                    columns={"date": "statistic_date"})
                io.to_sql(table[freq]["dynamic"], engine_rd, d)
            # NOTE(review): `s` is written even when `d` is None — looks
            # intentional (static results always present), but worth confirming.
            s = s.reset_index()[cols].rename(columns={"date": "statistic_date"})
            io.to_sql(table[freq]["static"], engine_rd, s)
def write(cls):
    """Confluence the fund-info streams, upsert the result into
    base_public.fund_info, and echo the written frame."""
    confluence = cls.confluence()
    frame = confluence.dataframe
    io.to_sql("base_public.fund_info", ENGINE_RD, frame, type="update")
    print(frame)
def test():
    """Refresh base_test.fund_manager_mapping_test: delete existing rows for the
    affected fund ids, then insert the freshly confluenced records."""
    c = Streams.conflu()
    # The original had a bare `c.dataframe` expression statement here — a
    # no-op (the attribute is accessed again below anyway) — removed.
    io.delete("base_test.fund_manager_mapping_test", ENGINE_RD,
              c.dataframe[["fund_id"]].drop_duplicates())
    io.to_sql("base_test.fund_manager_mapping_test", ENGINE_RD, c.dataframe)
def _merge_records(self, table, id_right, ids_wrong, engine):
    """Merge duplicate rows of `table` under the surviving id.

    Single-PK tables: combine the right row with the wrong rows (right row's
    missing values backfilled from the wrong rows), upsert the merged row and
    delete the wrong rows. Multi-PK tables: rewrite the id column in place
    with UPDATE IGNORE (PK collisions are skipped).

    Args:
        table: fully-qualified table name.
        id_right: the id to keep.
        ids_wrong: SQL-rendered tuple of ids to fold into id_right.
        engine: SQLAlchemy engine/connection to operate on.
    """
    pks = get_pk(table, engine)
    if len(pks) == 1:
        # NOTE(review): SQL built by string formatting — acceptable only
        # because ids come from internal id lists, not untrusted input.
        sql_right = "SELECT * FROM {tb} WHERE {id_col} = '{id_right}'".format(
            tb=table, id_col=self.ID_COLUMN, id_right=id_right)
        sql_wrong = "SELECT * FROM {tb} WHERE {id_col} IN {ids_wrong}".format(
            tb=table, id_col=self.ID_COLUMN, ids_wrong=ids_wrong)
        df_right = pd.read_sql(sql_right, engine)
        df_wrong = pd.read_sql(sql_wrong, engine)
        df_right = df_right.applymap(
            replace_abnormalstr)  # treat empty/abnormal strings as missing
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported equivalent. Backfill so the "right" row inherits any value
        # it lacks from the wrong rows, then keep only that first row.
        df_res = pd.concat([df_right, df_wrong], ignore_index=True).bfill()[:1]
        sql_del = "DELETE FROM {tb} WHERE {id_col} IN {ids_wrong}".format(
            tb=table, id_col=self.ID_COLUMN, ids_wrong=ids_wrong)
        io.to_sql(table, engine, df_res, type="update")  # write merged row back
        engine.execute(sql_del)
    else:
        sql_upt = "UPDATE IGNORE {tb} SET {id_col} = '{id_right}' WHERE {id_col} IN {ids_wrong}".format(
            tb=table,
            id_col=self.ID_COLUMN,
            id_right=id_right,
            ids_wrong=ids_wrong)
        engine.execute(sql_upt)
def main():
    """Re-derive fund_info.data_freq from net-value history, classifying each
    fund as monthly/weekly/daily; unclassifiable funds default to "其他"."""
    sql_ids_used = "SELECT DISTINCT fund_id FROM fund_info \
WHERE fund_id IN (SELECT fund_id FROM (SELECT fund_id, COUNT(fund_id) cnt FROM fund_nv_data_standard \
GROUP BY fund_id HAVING cnt >= 3) as T)"
    ids_to_update = [x[0] for x in engine_rd.execute(sql_ids_used).fetchall()]
    fi = get_original(ids_to_update)

    # Later updates win: daily overrides weekly overrides monthly.
    result = is_monthly(ids_to_update)
    result.update(is_weekly(ids_to_update))
    result.update(is_daily(ids_to_update))

    # dict.get replaces the original `in result.keys()` membership test, and
    # the redundant data_freq2 intermediate column is gone.
    fi["data_freq"] = [result.get(id_) for id_ in fi["fund_id"]]
    fi["data_freq"] = fi["data_freq"].fillna("其他")
    io.to_sql("fund_info", engine_wt, fi[["fund_id", "data_freq"]], "update")
def main():
    """Crawl bond ratings in batches of 500 ids and append each batch's result
    to d_bond_rating."""
    all_ids = get_id()
    batch_size = 500
    for start in range(0, len(all_ids), batch_size):
        batch = all_ids[start:start + batch_size]
        res = crwal(batch)
        print(len(res), res)
        io.to_sql('d_bond_rating', engine_private, res)
def write(cls):
    """Upsert the confluenced portfolio-asset records, then refresh org names."""
    frame = cls.confluence().dataframe
    io.to_sql("base_public.org_portfolio_asset", ENGINE_RD, frame, type="update")
    cls.org_name()
def main():
    """Write every fund-type-source confluence stream into base.fund_type_source.

    The s01 streams are built and written first; the s02 streams are only built
    afterwards, preserving the original two-phase ordering.
    """
    target = "base.fund_type_source"
    for stream in (s01.conflu_ts104(), s01.conflu_ts103(), s01.conflu_ts102()):
        io.to_sql(target, ENGINE_RD, stream.dataframe)
    for stream in (s02.conflu_ts20X(), s02.conflu_ts30X(),
                   s02.conflu_ts40X(), s02.conflu_ts501()):
        io.to_sql(target, ENGINE_RD, stream.dataframe)
def main():
    """Calculate monthly static fund indexes for groups 1..13 in parallel and
    persist each group's result."""
    # The context manager guarantees the pool is torn down even if map raises;
    # the original leaked worker processes on error.
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        results = pool.map(calculate, range(1, 14))
    for result in results:
        io.to_sql("fund_month_index_static", engine_rd, result)
def write(cls):
    """Upsert the confluenced stock-position records, then refresh org names."""
    frame = cls.confluence().dataframe
    io.to_sql("base_public.org_position_stock", ENGINE_RD, frame, type="update")
    cls.org_name()
def main(export_path=None):
    """Calculate index groups 1..13 in parallel (optionally exporting to
    `export_path`) and persist each result.

    Args:
        export_path: optional path forwarded to each `calculate` call.
    """
    worker = partial(calculate, export_path=export_path)
    # Context manager cleans the pool up even when map raises; the original
    # leaked worker processes on error.
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        results = pool.map(worker, range(1, 14))
    for result in results:
        io.to_sql(table.name, engine_rd, result)
def fetch_nv(fids):
    """Copy org_info rows for the given org ids into the test database.

    Args:
        fids: iterable of org_id strings.
    """
    # Build a quoted, comma-separated id list for the IN clause.
    # NOTE(review): string-built SQL — acceptable only because ids come from
    # our own database, never from untrusted input.
    fids = "'" + "','".join(fids) + "'"
    print(fids)
    sql = "select org_id, org_name, org_name_en, org_name_py, org_full_name, \
org_category, found_date, base_date, org_code, reg_code, \
reg_time, is_reg_now, is_member, member_type, master_strategy, \
fund_num, manage_type, \
asset_mgt_scale, property, reg_capital, real_capital, \
real_capital_proportion, legal_person, is_qualified, \
qualifying_way, region, prov, city, area, address, \
reg_address, org_web, final_report_time, currency, \
initiation_time, law_firm_name, lawyer_name, phone, profile, \
legal_person_resume, prize, team, major_shareholder, \
shareholder_structure, special_tips, investment_idea, email, \
entry_time, update_time FROM org_info WHERE org_id in ({fids})".format(
        fids=fids)
    df = pd.read_sql(sql, engine)
    # NOTE(review): the selected columns do not include fund_total_num, so this
    # rename is currently a no-op; kept in case the column is re-added upstream.
    df.rename(columns={"fund_total_num": "total_fund_num"}, inplace=True)
    print(df)
    io.to_sql('test_gt.org_info', engine_test_gt, df, type="update")
def _migrate_data(self, ids_to_mig):
    """Fold records stored under wrong ids onto the surviving right id.

    Args:
        ids_to_mig: dict<str: list>
            <id_right: [id_wrong1, id_wrong2, ...,]>

    Returns:
        True on completion.
    """
    for id_right, ids_wrong in ids_to_mig.items():
        ids_wrong = sf.SQL.values4sql(ids_wrong)

        # Tables whose wrong-id rows are MOVED onto the right id.
        # UPDATE IGNORE skips rows that would collide on the primary key;
        # those leftovers are handled later by the DelHelper class.
        for table in self.MOD_NOT_KEPP_LIST.get(self._id_type, []):
            # The two special tables only differ in the id column name; the
            # original duplicated the whole UPDATE block for each.
            if table == "config_private.sync_source":
                id_col = "`pk`"
            elif table == "id_match":
                id_col = "`matched_id`"
            else:
                id_col = None
            if id_col is not None:
                sql_upt = "UPDATE IGNORE {tb} SET {id_col} = '{id_right}' WHERE {id_col} IN {ids_wrong}".format(
                    tb=table, id_col=id_col, id_right=id_right,
                    ids_wrong=ids_wrong)
                engine_base.execute(sql_upt)
            else:
                self._merge_records(table, id_right, ids_wrong, engine_base)
            print("Mov:", table)

        # Tables whose wrong-id rows are KEPT: re-insert a copy under the
        # right id (duplicates silently ignored).
        for table in self.MOD_AND_KEEP_LIST.get(self._id_type, []):
            if table == "config_private.sync_source":
                col = "pk"
            elif table == "id_match":
                col = "matched_id"
            else:
                col = self.ID_COLUMN.replace("`", "")
            sql_sel = "SELECT * FROM {tb} WHERE {id_col} IN {ids_wrong}".format(
                tb=table, id_col=col, ids_wrong=ids_wrong)
            df = pd.read_sql(sql_sel, engine_base)
            df[col] = id_right
            # NOTE(review): original carried a "MAY BE WRONG" marker here —
            # confirm that ignore-insert (not update) is the intended mode.
            io.to_sql(table, engine_base, df, type="ignore")
    return True
def main():
    """Fetch the CSI H11001 (China Bond Index) performance sheet and upsert it
    into market_index.

    Index detail page:
    http://www.csindex.com.cn/zh-CN/indices/index-detail/H11001
    """
    # The commented-out legacy FTP fetch path was dead code and is removed.
    result = fetch_parse_xlx(
        "http://www.csindex.com.cn/uploads/file/autofile/perf/H11001perf.xls")
    io.to_sql("market_index", engine, result, "update")
def main():
    """Run both fund-info confluence passes and upsert each into
    base_public.fund_info (the first pass drops its source_id column)."""
    first = StreamsMain.confluence()
    io.to_sql("base_public.fund_info", ENGINE_RD,
              first.dataframe.drop(["source_id"], axis=1), type='update')
    second = StreamsMain.confluence_2()
    io.to_sql("base_public.fund_info", ENGINE_RD, second.dataframe,
              type='update')
def sync(fids):
    """Copy rows for the given fund ids from the read DB to the write DB.

    Args:
        fids: iterable of fund_id values.
    """
    fid_list = list(fids)
    # BUG FIX: str(tuple(...)) renders a 1-element tuple as "('x',)" — the
    # trailing comma is invalid SQL inside IN (...). Build that case explicitly.
    if len(fid_list) == 1:
        fids_sql = "('{}')".format(fid_list[0])
    else:
        fids_sql = str(tuple(fid_list))
    sql = "SELECT * FROM {tb} WHERE fund_id IN {fids}".format(tb=TABLE_TO_SYNC,
                                                              fids=fids_sql)
    print("fetching...")
    print(sql)
    df = pd.read_sql(sql, ENGINE_R)
    print("writing...")
    io.to_sql(TABLE_TO_SYNC, ENGINE_W, df)
    print("done")
def main(export_path=None):
    """Calculate weekly static fund indexes for groups 1..13 in parallel and
    persist each result.

    Args:
        export_path: optional path forwarded to each `calculate` call.
    """
    worker = partial(calculate, export_path=export_path)
    # Context manager guarantees pool cleanup even when map raises; the
    # original leaked worker processes on error.
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        results = pool.map(worker, range(1, 13 + 1))
    for result in results:
        io.to_sql("fund_weekly_index_static", engine_wt, result)
def main():
    """Build all bond-info source streams, confluence them on bond_id, and
    persist the merged frame."""
    streams = (stream_000001(), stream_010001(), stream_020001(),
               stream_020002(), stream_020003())
    merged = Confluence(*streams, on=[BondInfo.bond_id.name])
    io.to_sql(BondInfo.__tablename__, engine_w, merged.dataframe)
def write(cls):
    """Upsert portfolio-industry rows — keeping only non-ERROR types with a
    non-negligible proportion — then refresh org names."""
    frame = cls.confluence().dataframe
    keep = (frame["type"] != "ERROR") & (frame["proportion"] >= 0.00001)
    io.to_sql("base_public.org_portfolio_industry", ENGINE_RD,
              frame.loc[keep], type="update")
    cls.org_name()
def main():
    """Match ids for each private data source and append the results to id_match."""
    data_sources = ["020001", "020002", "020003"]
    try:
        for data_source in data_sources:
            print(data_source)
            # Compute once: the original called match() twice per source
            # (once to print, once to write), doubling the matching work.
            matched = match(data_source)
            print(matched)
            io.to_sql(IdMatch.__tablename__, engine_wt, matched)
    except Exception as e:
        # NOTE(review): a failure aborts the remaining sources (try wraps the
        # whole loop, as in the original) — confirm that is intended.
        print(data_source, e)
def main():
    """On quarter-end rank dates, compute mutual-fund rankings and persist them."""
    from utils.database import io
    from utils.algorithm.ranking.ranking import CalculateHelper

    # The original assigned dt.date.today() to t and immediately overwrote it;
    # only the derived last rank date is ever used.
    t = CalculateHelper.last_rank_date(dt.date.today())
    if (t.month, t.day) in {(3, 31), (6, 30), (9, 30), (12, 31)}:
        print("RANK DATE: ", t)
        f = MutRanking(t)
        res = f.rank_all
        io.to_sql("base_public.fund_rank", cfg.load_engine()["2Gb"], res)
def main():
    """Confluence all person-info source streams on person_id and persist the
    merged frame."""
    streams = (stream_010001(), stream_020001(), stream_020002(),
               stream_020001_op())
    merged = base.Confluence(*streams, on=[PersonInfo.person_id.name])
    io.to_sql(PersonInfo.__schema_table__, ENGINE, merged.dataframe)
def main():
    """Recompute the adjusted NAV (swanav) for every fund and persist it to
    fund_nv."""
    fund_ids = DataSetup.init_funds(DataSetup, engine)
    for fund_id in fund_ids:
        print(fund_id)
        nv, dividend, split = DataSetup.init_data(DataSetup, fund_id, engine)
        calculator = AdjustedNvCalculator(nv, dividend, split)
        result = calculator.cal_by_nv()[["fund_id", "date", "adjusted_nav"]]
        result.columns = ["fund_id", "statistic_date", "swanav"]
        io.to_sql("fund_nv", engine, result)
def test1():
    """Backfill quarter-end mutual-fund rankings for 2016-2017."""
    from utils.database import io
    for year in range(2016, 2018):
        for month in (3, 6, 9, 12):
            last_day = cld.monthrange(year, month)[1]
            rank_date = dt.date(year, month, last_day)
            print("RANK DATE: ", rank_date)
            ranking = MutRanking(rank_date)
            io.to_sql("base_public.fund_rank", cfg.load_engine()["2Gb"],
                      ranking.rank_all)
def cal(start=None, end=None):
    """Compute daily Fama three-factor results for each Chinese business day in
    [start, end] and persist them; dates lacking data are skipped."""
    start = start or dt.date(2015, 4, 1)
    end = end or dt.date.today()
    business_days = (ts.date()
                     for ts in pd.date_range(start, end, freq=const.bday_chn))
    for day in business_days:
        try:
            prox = resultproxy.Fama3(day, "d")
            io.to_sql("factor_style_d", engine, prox.result)
            print(prox.result)
        except DataError:
            # No usable data for this date — move on.
            continue
def to_db(self, dataframe):
    """For each record of `dataframe`, look up the matching row in the new DB
    and upsert it into the old DB.

    Args:
        dataframe: records used only to build the per-row match condition;
            fully-empty rows and duplicates are discarded first.
    """
    dataframe = dataframe.dropna(how='all').drop_duplicates()
    # iterrows replaces the original C-style range(len(...)) + .iloc loop.
    for _, row in dataframe.iterrows():
        # generate_condition expects a one-row DataFrame, not a Series.
        single = row.to_frame().T
        condition = self.generate_condition(single)
        sql = "SELECT * FROM {tb} WHERE {criterion}".format(
            tb=self.new, criterion=condition)
        # Distinct name: the original reused `df` for both the input row and
        # the query result, shadowing one with the other.
        matched = pd.read_sql(sql, self.engine_new)
        print(matched)
        io.to_sql(self.old, self.engine_old, matched, type="update")
def cal(start=None, end=None):
    """Compute weekly Carhart four-factor results for each Friday in
    [start, end] and persist them; weeks lacking data are skipped."""
    start = start or dt.date(2015, 4, 1)
    end = end or dt.date.today()
    fridays = (ts.date() for ts in pd.date_range(start, end, freq="W-FRI"))
    for friday in fridays:
        try:
            prox = resultproxy.Carhart4(friday, "w")
            io.to_sql("factor_style_w", engine, prox.result)
            print(prox.result)
        except DataError:
            # No usable data for this week — move on.
            continue
def main():
    """Build the main org-info stream plus its derivative streams and write
    each confluence result to the test table."""
    confluences = [
        StreamsMain.conflu(),
        StreamsDerivative.conflu_place(),
        StreamsDerivative.conflu_pinyin(),
        StreamsDerivative.conflu_is_reg_now(),
        StreamsDerivative.conflu_is_member(),
        StreamsDerivative.conflu_fund_num(),
        StreamsDerivative.conflu_master_strategy(),
    ]
    for conflu in confluences:
        io.to_sql(TEST_TABLE, ENGINE_RD, conflu.dataframe)