def calculate(idx, export_path=None):
    """Build the weekly performance index for PE index ``idx``.

    Walks every year from the index's first Monday up to the processing
    date, computes a mean weekly return across component funds, chains the
    returns into an index series based at 1000, and returns the rows as one
    DataFrame.

    :param idx: index identifier understood by ``sf.PEIndex`` / ``sf.SQL_PEIndex``
    :param export_path: if given, the per-fund nav samples used for each
        year are also exported to an Excel workbook under this path
    :return: ``pd.DataFrame`` with statistic_date / index_value / funds_num
        plus the index meta columns
    """
    dfs = pd.DataFrame()
    PEIndex = sf.PEIndex(idx)
    first_date = PEIndex.firstmonday
    result_r = {}        # year -> np.array of weekly index returns
    components_num = {}  # year -> np.array of component counts per week
    components = {}      # year -> DataFrame of the nav samples used
    for year in range(first_date.timetuple().tm_year, _process_date.year + 1):
        if year == _process_date.year:
            month = _process_date.month
            day = tu.date_of_weekday(_process_date, 0, (0, 0)).day
            if day > _process_date.day:
                # Monday of the current week falls in the previous month,
                # so the anchor month must be rolled back by one.
                # NOTE(review): if _process_date is in January this leaves
                # month == 0 and dt.date() below raises ValueError -- confirm
                # the job never runs on such a date, or roll the year back too.
                month -= 1
        else:
            month = 12
            day = 31
        date_s = dt.date(year, month, day)
        sql_i = sf.SQL_PEIndex(idx, year).yeardata_w["nv"]
        sql_mindate = sf.SQL_PEIndex(idx, year).yeardata_w["t_min"]
        conn = engine_rd.connect()
        su.tic("Getting Data")
        d = pd.read_sql(sql_i, conn)
        d.index = range(len(d))
        t_min = pd.read_sql(sql_mindate, conn)["statistic_date_earliest"].tolist()
        t_min = [time.mktime(x.timetuple()) for x in t_min]
        # Fix: the close was commented out, leaking one connection per
        # year iterated; the sibling monthly jobs close theirs.
        conn.close()
        su.tic("Preprocessing...")
        # Work with POSIX timestamps so dates can be compared numerically.
        d["statistic_date"] = d["statistic_date"].apply(
            lambda x: time.mktime(x.timetuple()))
        d_dd = d.drop_duplicates("fund_id")
        idx_slice = d_dd.index.tolist()  # start row of each fund's slice
        idx_slice.append(len(d))
        ids = d_dd["fund_id"].tolist()
        last_monday = date_s - dt.timedelta(
            cld.weekday(date_s.year, date_s.month, date_s.day))
        t_std = tu.timeseries_std(last_monday, "a", 52, extend=1)
        if year == first_date.timetuple().tm_year:
            t_std = t_std[:-1]
        su.tic("Slicing")
        t_std_long = tu.timeseries_std(
            last_monday,
            tu.periods_in_interval(last_monday, dt.date(year - 1, 11, 30), 12))
        t_std_long_p1m = [(x + relativedelta(months=1)).timestamp()
                          for x in tu.tr(t_std_long)]
        # Compare the funds' real earliest dates against the standard series.
        real_p1m = su.compare(t_min, t_std_long)
        # Compare the +1-month shifted standard series against the standard series.
        p1m_std = su.compare(t_std_long_p1m, t_std)
        data_used = [p1m_std[x - 1] for x in real_p1m]
        su.tic("Grouping...")
        ds = [
            d[idx_slice[i]:idx_slice[i + 1]]
            for i in range(len(idx_slice) - 1)
        ]
        ts = [x["statistic_date"].tolist() for x in ds]
        navs = [x["nav"].tolist() for x in ds]
        su.tic("Matching...")
        matchs = [tu.outer_match4index_w(x, t_std, False) for x in ts]
        su.tic("Getting Result...")
        idx_matchs = [x[1] for x in matchs]
        nav_matchs = [[
            navs[i][idx] if idx is not None else None
            for idx in idx_matchs[i].values()
        ] for i in range(len(idx_matchs))]
        su.tic("Calculating Index...")
        nvs = pd.DataFrame(nav_matchs).T.astype(float).as_matrix()
        print(nvs.shape)
        # Blank out observations older than each fund's usable window.
        for i in range(len(ids)):
            nvs[data_used[i] + 1:, i] = np.nan
        rs = nvs[:-1] / nvs[1:] - 1
        rs[rs > 10] = np.nan  # discard implausible weekly returns
        rs[rs < -1] = np.nan
        r = np.nanmean(rs, axis=1)
        r[np.isnan(r)] = 0  # weeks with no data contribute zero return
        result_r[year] = r
        components_num[year] = np.sum(~np.isnan(rs), axis=1)
        # log samples
        tmp = pd.DataFrame(nvs, columns=ids).T
        tmp["fund_id"] = tmp.index
        tmp = tmp[[tmp.columns[-1], *tmp.columns[:-1]]]
        components[year] = tmp
        su.tic("Year:{0}, Done...".format(year))
    values_r = []
    values_num = []
    # Stitch the per-year return series together, oldest week first.
    for year in range(first_date.timetuple().tm_year, date_s.year + 1):
        if len(values_r) == 0:
            values_r = result_r[year].tolist()[::-1]
            values_num = components_num[year].tolist()[::-1]
        else:
            values_r.extend(result_r[year].tolist()[::-1])
            values_num.extend(components_num[year].tolist()[::-1])
    result = (np.array(values_r) + 1).cumprod() * 1000  # index based at 1000
    result = result.tolist()
    result.insert(0, 1000)
    values_num.insert(0, 0)
    tag = tu.timeseries_std(dt.datetime(year, month, day),
                            tu.periods_in_interval(
                                dt.datetime(year, month, day),
                                dt.datetime(first_date.year, 1, 1), 12),
                            52, extend=5)[::-1]
    tag = [x for x in tag if x >= first_date.timestamp()]
    tag = [dt.date.fromtimestamp(x) for x in tag]
    op = pd.DataFrame(list(zip(tag, result, values_num)))
    op.columns = ["statistic_date", "index_value", "funds_num"]
    cols = [
        "index_id", "index_name", "typestandard_code", "typestandard_name",
        "type_code", "type_name", "stype_code", "stype_name", "index_method",
        "data_source", "data_source_name"
    ]
    values = [
        PEIndex.id, PEIndex.name, PEIndex.typestandard["code"],
        PEIndex.typestandard["name"], PEIndex.type["code"],
        PEIndex.type["name"], PEIndex.stype["code"], PEIndex.stype["name"],
        1, 0, "私募云通"
    ]
    col_dict = dict(zip(cols, values))
    for col, val in col_dict.items():
        op[col] = val
    dfs = dfs.append(op)
    if export_path is not None:
        # Re-label the sample columns with their week-end dates, consuming
        # the tag list from the newest date backwards.
        tmp = tag.copy()
        for year in sorted(components.keys(), reverse=True):
            components[year].columns = [
                "fund_id",
                *[tmp.pop() for i in range(len(components[year].columns) - 2)],
                tmp[-1]
            ]
        io.export_to_xl(
            components,
            "{sd}_{index_name}_w_samples".format(
                sd=last_monday.strftime("%Y%m%d"), index_name=PEIndex.id),
            export_path)
    return dfs
def calculate(idx):
    """Build the monthly performance index for PE index ``idx``.

    Computes mean monthly fund returns per year, then rebases the most
    recent ``adjust_periods + 1`` months on the index value already stored
    in ``fund_month_index_static`` and returns only those refreshed rows.

    :param idx: index identifier understood by ``sf.PEIndex`` / ``sf.SQL_PEIndex``
    :return: ``pd.DataFrame`` with statistic_date / index_value / funds_num
        plus the index meta columns (last, partial row dropped)
    """
    dfs = pd.DataFrame()
    PEIndex = sf.PEIndex(idx)
    first_year = PEIndex.firstyear
    result_r = {}        # year -> np.array of monthly index returns
    components_num = {}  # year -> np.array of component counts per month
    for year in range(first_year, process_date.year + 1):
        if year == process_date.timetuple().tm_year:
            month = process_date.month
        else:
            month = 12
        sql_i = sf.SQL_PEIndex(PEIndex.idx, year).yeardata_m
        conn = engine_rd.connect()
        # Last calendar day of the month preceding `month`.
        date_s = dt.date(year, month, 1) - dt.timedelta(1)
        su.tic("Getting Data")
        d = pd.read_sql(sql_i, conn)
        conn.close()
        su.tic("Preprocessing...")
        # Work with POSIX timestamps so dates can be matched numerically.
        d["statistic_date"] = d["statistic_date"].apply(
            lambda x: time.mktime(x.timetuple()))
        d_dd = d.drop_duplicates("fund_id")
        idx_slice = d_dd.index.tolist()  # start row of each fund's slice
        idx_slice.append(len(d))
        t_std = tu.timeseries_std(dt.datetime(year, month, 10), month, 12, 1,
                                  use_lastday=True)
        t_std1 = t_std[:-1]
        su.tic("Grouping...")
        ds = [
            d[idx_slice[i]:idx_slice[i + 1]]
            for i in range(len(idx_slice) - 1)
        ]
        ts = [x["statistic_date"].tolist() for x in ds]
        navs = [x["nav"].tolist() for x in ds]
        su.tic("Matching...")
        # Three match passes (forward 7d, backward 7d, monthly) merged.
        matchs1 = [tu.outer_match4index_f7(x, t_std1, False) for x in ts]
        matchs2 = [tu.outer_match4index_b7(x, t_std1) for x in ts]
        matchs3 = [tu.outer_match4index_m(x, t_std, False) for x in ts]
        matchs = [
            su.merge_result(x1, x2, x3)
            for x1, x2, x3 in zip(matchs1, matchs2, matchs3)
        ]
        su.tic("Getting Result...")
        t_matchs = [x[0] for x in matchs]
        t_matchs = [tu.tr(x) for x in t_matchs]
        idx_matchs = [x[1] for x in matchs]
        nav_matchs = [[
            navs[i][idx] if idx is not None else None
            for idx in idx_matchs[i].values()
        ] for i in range(len(idx_matchs))]
        su.tic("Calculating Index...")
        nvs = pd.DataFrame(nav_matchs).T.astype(float).as_matrix()
        rs = nvs[:-1] / nvs[1:] - 1
        rs[rs > 30] = np.nan  # discard implausible monthly returns
        rs[rs < -1] = np.nan
        r = np.nanmean(rs, axis=1)
        r[np.isnan(r)] = 0  # months with no data contribute zero return
        result_r[year] = r
        components_num[year] = np.sum(~np.isnan(rs), axis=1)
        su.tic("Year:{0}, Done...".format(year))
    values_r = []
    values_num = []
    # Stitch the per-year return series together, oldest month first.
    for year in range(first_year, process_date.timetuple().tm_year + 1):
        if len(values_r) == 0:
            values_r = result_r[year].tolist()[::-1]
            values_num = components_num[year].tolist()[::-1]
        else:
            values_r.extend(result_r[year].tolist()[::-1])
            values_num.extend(components_num[year].tolist()[::-1])
    adjust_periods = 1
    # Month-end date whose stored index value serves as the chaining base.
    date_tmp = date_s - relativedelta.relativedelta(months=adjust_periods + 1)
    date_tmp = dt.date(date_tmp.year, date_tmp.month,
                       cld.monthrange(date_tmp.year, date_tmp.month)[1])
    sql_base = "SELECT index_value FROM fund_month_index_static WHERE index_id = '{idx_id}' \
    AND statistic_date = '{sd}'".format(idx_id=PEIndex.id, sd=date_tmp)
    base = pd.read_sql(sql_base, engine_rd).get_value(0, "index_value")
    result = (np.array(values_r)[-(adjust_periods + 1) - 1:] + 1).cumprod() * base
    result = result.tolist()
    values_num = values_num[-(adjust_periods + 1) - 1:]
    # Fix: the original used dt.datetime(year, month + 1, 10), which raises
    # ValueError when month == 12; roll December over to January of the
    # next year instead.
    anchor = dt.datetime(year + month // 12, month % 12 + 1, 10)
    tag = tu.timeseries_std(
        anchor,
        tu.periods_in_interval(anchor, dt.datetime(first_year, 1, 10), 12),
        12)[::-1]
    # 864000 s = 10 days, shifting the 10th-of-month anchors back.
    tag = [dt.date.fromtimestamp(x - 864000) for x in tag]
    tag = tag[-(adjust_periods + 1) - 1:]
    op = pd.DataFrame(list(zip(tag, result, values_num)))
    op.columns = ["statistic_date", "index_value", "funds_num"]
    cols = [
        "index_id", "index_name", "typestandard_code", "typestandard_name",
        "type_code", "type_name", "stype_code", "stype_name", "index_method",
        "data_source", "data_source_name"
    ]
    values = [
        PEIndex.id, PEIndex.name, PEIndex.typestandard["code"],
        PEIndex.typestandard["name"], PEIndex.type["code"],
        PEIndex.type["name"], PEIndex.stype["code"], PEIndex.stype["name"],
        1, 0, "私募云通"
    ]
    col_dict = dict(zip(cols, values))
    for col, val in col_dict.items():
        op[col] = val
    dfs = dfs.append(op[:-1])
    return dfs
def calculate(idx, export_path=None):
    """Build the full monthly performance index history for PE index ``idx``.

    Computes mean monthly fund returns per year from the index's first year
    up to the processing date, chains them into an index series based at
    1000, and returns the rows as one DataFrame.

    :param idx: index identifier understood by ``sf.PEIndex`` / ``sf.SQL_PEIndex``
    :param export_path: if given, the per-fund nav samples used for each
        year are also exported to an Excel workbook under this path
    :return: ``pd.DataFrame`` with statistic_date / index_value / funds_num
        plus the index meta columns (last, partial row dropped)
    """
    dfs = pd.DataFrame()
    PEIndex = sf.PEIndex(idx)
    first_year = PEIndex.firstyear
    result_r = {}        # year -> np.array of monthly index returns
    components_num = {}  # year -> np.array of component counts per month
    components = {}      # year -> DataFrame of the nav samples used
    for year in range(first_year, process_date.year + 1):
        if year == process_date.timetuple().tm_year:
            month = process_date.month
        else:
            month = 12
        sql_i = sf.SQL_PEIndex(PEIndex.idx, year).yeardata_m
        conn = engine_rd.connect()
        su.tic("Getting Data")
        d = pd.read_sql(sql_i, conn)
        conn.close()
        su.tic("Preprocessing...")
        # Work with POSIX timestamps so dates can be matched numerically.
        d["statistic_date"] = d["statistic_date"].apply(
            lambda x: time.mktime(x.timetuple()))
        d_dd = d.drop_duplicates("fund_id")
        idx_slice = d_dd.index.tolist()  # start row of each fund's slice
        idx_slice.append(len(d))
        ids = d_dd["fund_id"].tolist()
        t_std = tu.timeseries_std(dt.datetime(year, month, 10), month, 12, 1,
                                  use_lastday=True)
        t_std1 = t_std[:-1]
        su.tic("Grouping...")
        ds = [
            d[idx_slice[i]:idx_slice[i + 1]]
            for i in range(len(idx_slice) - 1)
        ]
        ts = [x["statistic_date"].tolist() for x in ds]
        navs = [x["nav"].tolist() for x in ds]
        su.tic("Matching...")
        # Three match passes (forward 7d, backward 7d, monthly) merged.
        matchs1 = [tu.outer_match4index_f7(x, t_std1, False) for x in ts]
        matchs2 = [tu.outer_match4index_b7(x, t_std1) for x in ts]
        matchs3 = [tu.outer_match4index_m(x, t_std, False) for x in ts]
        matchs = [
            su.merge_result(x1, x2, x3)
            for x1, x2, x3 in zip(matchs1, matchs2, matchs3)
        ]
        su.tic("Getting Result...")
        t_matchs = [x[0] for x in matchs]
        t_matchs = [tu.tr(x) for x in t_matchs]
        idx_matchs = [x[1] for x in matchs]
        nav_matchs = [[
            navs[i][idx] if idx is not None else None
            for idx in idx_matchs[i].values()
        ] for i in range(len(idx_matchs))]
        su.tic("Calculating Index...")
        nvs = pd.DataFrame(nav_matchs).T.astype(float).as_matrix()
        rs = nvs[:-1] / nvs[1:] - 1
        rs[rs > 30] = np.nan  # discard implausible monthly returns
        rs[rs < -1] = np.nan
        r = np.nanmean(rs, axis=1)
        r[np.isnan(r)] = 0  # months with no data contribute zero return
        result_r[year] = r
        components_num[year] = np.sum(~np.isnan(rs), axis=1)
        # log samples
        tmp = pd.DataFrame(nvs, columns=ids).T
        tmp["fund_id"] = tmp.index
        tmp = tmp[[tmp.columns[-1], *tmp.columns[:-1]]]
        components[year] = tmp
        su.tic("Year:{0}, Done...".format(year))
    values_r = []
    values_num = []
    # Stitch the per-year return series together, oldest month first.
    for year in range(first_year, process_date.timetuple().tm_year + 1):
        if len(values_r) == 0:
            values_r = result_r[year].tolist()[::-1]
            values_num = components_num[year].tolist()[::-1]
        else:
            values_r.extend(result_r[year].tolist()[::-1])
            values_num.extend(components_num[year].tolist()[::-1])
    result = (np.array(values_r) + 1).cumprod() * 1000  # index based at 1000
    result = result.tolist()
    result.insert(0, 1000)
    values_num.insert(0, 0)
    # Fix: the original used dt.datetime(year, month + 1, 10), which raises
    # ValueError when month == 12; roll December over to January of the
    # next year instead.
    anchor = dt.datetime(year + month // 12, month % 12 + 1, 10)
    tag = tu.timeseries_std(
        anchor,
        tu.periods_in_interval(anchor, dt.datetime(first_year, 1, 10), 12),
        12)[::-1]
    # 864000 s = 10 days, shifting the 10th-of-month anchors back.
    tag = [dt.date.fromtimestamp(x - 864000) for x in tag]
    op = pd.DataFrame(list(zip(tag, result, values_num)))
    op.columns = ["statistic_date", "index_value", "funds_num"]
    cols = [
        "index_id", "index_name", "typestandard_code", "typestandard_name",
        "type_code", "type_name", "stype_code", "stype_name", "index_method",
        "data_source", "data_source_name"
    ]
    values = [
        PEIndex.id, PEIndex.name, PEIndex.typestandard["code"],
        PEIndex.typestandard["name"], PEIndex.type["code"],
        PEIndex.type["name"], PEIndex.stype["code"], PEIndex.stype["name"],
        1, 0, "私募云通"
    ]
    col_dict = dict(zip(cols, values))
    for col, val in col_dict.items():
        op[col] = val
    dfs = dfs.append(op[:-1])
    if export_path is not None:
        # Re-label the sample columns with their month-end dates, consuming
        # the tag list from the newest date backwards.
        tmp = tag.copy()
        for year in sorted(components.keys(), reverse=True):
            print(year, len(tmp))
            components[year].columns = [
                "fund_id",
                *[tmp.pop() for i in range(len(components[year].columns) - 2)],
                tmp[-1]
            ]
        io.export_to_xl(
            components,
            "{sd}_{index_name}_m_samples".format(sd=tag[-2].strftime("%Y%m%d"),
                                                 index_name=PEIndex.id),
            export_path)
    return dfs