def testWeeksDaysAlgebra(self):
    """Check division, in-place addition and normalization of week/day periods.

    Covers:
      * dividing a weeks period by an integer (2 weeks / 2, 1 week / 7);
      * accumulating day and week periods with ``+=``;
      * that a 7-day period keeps its length/units until ``normalize()`` is
        called, after which it is reported as 1 week.
    """
    twoWeeks = Period(length=2, units=TimeUnits.Weeks)
    oneWeek = Period(length=1, units=TimeUnits.Weeks)
    threeDays = Period(length=3, units=TimeUnits.Days)
    oneDay = Period(length=1, units=TimeUnits.Days)

    n = 2
    flag = twoWeeks / n == oneWeek
    self.assertTrue(
        flag,
        "division error: {0} / {1:d}"
        " not equal to {2}".format(twoWeeks, n, oneWeek))

    n = 7
    # Divide by ``n`` (not a literal) so the failure message below always
    # reports the divisor that was actually used.
    flag = oneWeek / n == oneDay
    self.assertTrue(
        flag,
        "division error: {0} / {1:d}"
        " not equal to {2}".format(oneWeek, n, oneDay))

    # ``total`` rather than ``sum`` to avoid shadowing the builtin.
    total = threeDays
    total += oneDay
    flag = total == Period(length=4, units=TimeUnits.Days)
    self.assertTrue(
        flag,
        "sum error: {0}"
        " + {1}"
        " != {2}".format(threeDays, oneDay,
                         Period(length=4, units=TimeUnits.Days)))

    total += oneWeek
    flag = total == Period(length=11, units=TimeUnits.Days)
    self.assertTrue(
        flag,
        "sum error: {0}"
        " + {1}"
        " + {2}"
        " != {3}".format(threeDays, oneDay, oneWeek,
                         Period(length=11, units=TimeUnits.Days)))

    # A freshly built 7-day period is not normalized implicitly.
    sevenDays = Period(length=7, units=TimeUnits.Days)
    flag = sevenDays.length() == 7
    self.assertTrue(
        flag,
        "normalization error: sevenDays.length"
        " is {0:d}"
        " instead of 7".format(sevenDays.length()))

    flag = sevenDays.units() == TimeUnits.Days
    self.assertTrue(
        flag,
        "normalization error: sevenDays.units"
        " is {0:d}"
        " instead of {1:d}".format(sevenDays.units(), TimeUnits.Days))

    normalizedSevenDays = sevenDays.normalize()
    flag = normalizedSevenDays.length() == 1
    self.assertTrue(
        flag,
        "normalization error: normalizedSevenDays.length"
        " is {0:d}"
        " instead of 1".format(normalizedSevenDays.length()))

    flag = normalizedSevenDays.units() == TimeUnits.Weeks
    # The message names the object under test (it previously said
    # "TwelveMonths", a copy-paste leftover).
    self.assertTrue(
        flag,
        "normalization error: normalizedSevenDays.units"
        " is {0:d}"
        " instead of {1:d}".format(normalizedSevenDays.units(),
                                   TimeUnits.Weeks))
def prepare_data(engine: SqlEngine,
                 factors: Union[Transformer, Iterable[object]],
                 start_date: str,
                 end_date: str,
                 frequency: str,
                 universe: Universe,
                 benchmark: int,
                 warm_start: int = 0):
    """Load factor, return, industry and benchmark data and join them.

    Args:
        engine: data source used for every ``fetch_*_range`` call.
        factors: a ``Transformer``, or anything accepted by ``Transformer(...)``.
        start_date: first schedule date, formatted ``%Y-%m-%d``.
        end_date: last schedule date.
        frequency: period string used both for the schedule step and, through
            ``map_freq``, for the return horizon.
        universe: universe passed through to the engine.
        benchmark: benchmark identifier passed to ``fetch_benchmark_range``.
        warm_start: when positive, ``start_date`` is moved back by this many
            ``frequency`` periods on the 'china.sse' calendar.

    Returns:
        A 3-tuple ``(dates, return_frame, feature_frame)`` where ``dates`` is
        the list of ``%Y-%m-%d`` schedule dates, ``return_frame`` holds
        ``trade_date``/``code``/``dx`` and ``feature_frame`` holds the
        identifying columns, ``weight``, ``isOpen``, the industry columns and
        the transformer's factor columns.
    """
    join_keys = ['trade_date', 'code']

    if warm_start > 0:
        # Shift the start backwards by ``warm_start`` whole periods.
        base_period = Period(frequency)
        look_back = Period(length=-warm_start * base_period.length(),
                           units=base_period.units())
        shifted_start = advanceDateByCalendar('china.sse', start_date, look_back)
        start_date = shifted_start.strftime('%Y-%m-%d')

    schedule = makeSchedule(start_date,
                            end_date,
                            frequency,
                            calendar='china.sse',
                            dateRule=BizDayConventions.Following,
                            dateGenerationRule=DateGeneration.Forward)
    dates = [day.strftime('%Y-%m-%d') for day in schedule]

    horizon = map_freq(frequency)

    transformer = factors if isinstance(factors, Transformer) else Transformer(factors)

    raw_factors = engine.fetch_factor_range(universe, factors=transformer, dates=dates)
    factor_df = raw_factors.sort_values(join_keys)
    alpha_logger.info("factor data loading finished")

    return_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)
    alpha_logger.info("return data loading finished")

    industry_df = engine.fetch_industry_range(universe, dates=dates)
    alpha_logger.info("industry data loading finished")

    benchmark_df = engine.fetch_benchmark_range(benchmark, dates=dates)
    alpha_logger.info("benchmark data loading finished")

    # Factors and returns must both be present; benchmark rows are optional
    # (left join), and a missing benchmark weight is treated as zero.
    merged = pd.merge(factor_df, return_df, on=join_keys).dropna()
    merged = pd.merge(merged, benchmark_df, on=join_keys, how='left')
    merged = pd.merge(merged, industry_df, on=join_keys)
    merged['weight'] = merged['weight'].fillna(0.)

    return_columns = ['trade_date', 'code', 'dx']
    feature_columns = ['trade_date', 'code', 'weight', 'isOpen',
                       'industry_code', 'industry'] + transformer.names
    return dates, merged[return_columns], merged[feature_columns]
def _map_horizon(frequency: str) -> int:
    """Convert a frequency string into a return horizon.

    The frequency is parsed with ``Period``; the horizon is
    ``multiplier * length - 1`` where the multiplier is 1 for business days
    and calendar days, 5 for weeks and 22 for months.

    Raises:
        ValueError: if the parsed unit is none of the above.
    """
    period = Period(frequency)
    units = period.units()
    count = period.length()

    # (unit, multiplier) pairs; 5 and 22 are presumably trading days per
    # week/month -- confirm against the return-data convention.
    multipliers = (
        (TimeUnits.BDays, 1),
        (TimeUnits.Days, 1),
        (TimeUnits.Weeks, 5),
        (TimeUnits.Months, 22),
    )
    for known_unit, factor in multipliers:
        if units == known_unit:
            return factor * count - 1

    raise ValueError(
        '{0} is an unrecognized frequency rule'.format(frequency))
def prepare_data(engine: SqlEngine,
                 factors: Union[Transformer, Iterable[object]],
                 start_date: str,
                 end_date: str,
                 frequency: str,
                 universe: Universe,
                 benchmark: int,
                 warm_start: int = 0,
                 fit_target: Union[Transformer, object] = None):
    """Load factor, target, industry and benchmark data and join them.

    Args:
        engine: data source used for every ``fetch_*`` call.
        factors: a ``Transformer``, or anything accepted by ``Transformer(...)``.
        start_date: first schedule date, formatted ``%Y-%m-%d``.
        end_date: last schedule date.
        frequency: period string used for the schedule step and, through
            ``map_freq``, for the return horizon.
        universe: universe passed through to the engine.
        benchmark: benchmark identifier passed to ``fetch_benchmark_range``.
        warm_start: when positive, ``start_date`` is moved back by this many
            ``frequency`` periods on the 'china.sse' calendar.
        fit_target: when ``None`` the target is the engine's forward return
            (``fetch_dx_return_range``); otherwise it is fetched with
            ``fetch_factor_range_forward`` using this expression.

    Returns:
        A 3-tuple ``(dates, target_frame, feature_frame)`` where ``dates`` is
        the list of ``%Y-%m-%d`` schedule dates, ``target_frame`` holds
        ``trade_date``/``code``/``dx`` and ``feature_frame`` holds the
        identifying columns, ``weight``, the industry columns and the
        transformer's factor columns.
    """
    if warm_start > 0:
        # Shift the start backwards by ``warm_start`` whole periods.
        p = Period(frequency)
        p = Period(length=-warm_start * p.length(), units=p.units())
        start_date = advanceDateByCalendar('china.sse', start_date, p).strftime('%Y-%m-%d')

    dates = makeSchedule(start_date,
                         end_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Forward)
    dates = [d.strftime('%Y-%m-%d') for d in dates]

    horizon = map_freq(frequency)

    if isinstance(factors, Transformer):
        transformer = factors
    else:
        transformer = Transformer(factors)

    factor_df = engine.fetch_factor_range(universe,
                                          factors=transformer,
                                          dates=dates).sort_values(['trade_date', 'code'])
    alpha_logger.info("factor data loading finished")

    if fit_target is None:
        target_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)
    else:
        # Fetch one extra period past the last schedule date, then keep only
        # the schedule dates.
        one_more_date = advanceDateByCalendar('china.sse', dates[-1], frequency)
        target_df = engine.fetch_factor_range_forward(universe,
                                                      factors=fit_target,
                                                      dates=dates + [one_more_date])
        target_df = target_df[target_df.trade_date.isin(dates)]
        # Forward-fill gaps within each code.  ``ffill()`` is the supported
        # spelling of the deprecated ``fillna(method='pad')``.
        target_df = target_df.groupby('code').apply(lambda x: x.ffill())
    alpha_logger.info("fit target data loading finished")

    industry_df = engine.fetch_industry_range(universe, dates=dates)
    alpha_logger.info("industry data loading finished")

    benchmark_df = engine.fetch_benchmark_range(benchmark, dates=dates)
    alpha_logger.info("benchmark data loading finished")

    # Factors and targets must both be present; benchmark rows are optional
    # (left join), and a missing benchmark weight is treated as zero.
    df = pd.merge(factor_df, target_df, on=['trade_date', 'code']).dropna()
    df = pd.merge(df, benchmark_df, on=['trade_date', 'code'], how='left')
    df = pd.merge(df, industry_df, on=['trade_date', 'code'])
    df['weight'] = df['weight'].fillna(0.)
    df.dropna(inplace=True)

    return dates, \
        df[['trade_date', 'code', 'dx']], \
        df[['trade_date', 'code', 'weight', 'industry_code', 'industry'] + transformer.names]
def fetch_predict_phase(engine,
                        alpha_factors: Union[Transformer, Iterable[object]],
                        ref_date,
                        frequency,
                        universe,
                        batch=1,
                        neutralized_risk: Iterable[str] = None,
                        risk_model: str = 'short',
                        pre_process: Iterable[object] = None,
                        post_process: Iterable[object] = None,
                        warm_start: int = 0,
                        fillna: str = None,
                        fit_target: Union[Transformer, object] = None):
    """Build the processed prediction inputs for ``ref_date``.

    Args:
        engine: data source used for every ``fetch_*`` call.
        alpha_factors: a ``Transformer``, or anything accepted by
            ``Transformer(...)``.
        ref_date: prediction date, formatted ``%Y-%m-%d``.
        frequency: period string for the schedule step and return horizon.
        universe: universe passed through to the engine.
        batch: number of most recent dates processed together.
        neutralized_risk: risk column names used as ``risk_factors`` in
            ``factor_processing``; falsy disables neutralization.
        risk_model: risk model name passed to ``fetch_risk_model_range``.
        pre_process: forwarded to ``factor_processing``.
        post_process: forwarded to ``factor_processing``.
        warm_start: extra periods of history to load before the batch window.
        fillna: when truthy, missing factor values are filled with the
            per-``trade_date`` median before remaining NaN rows are dropped.
        fit_target: when ``None`` the target is the engine's forward return;
            otherwise it is fetched with ``fetch_factor_range_forward``.

    Returns:
        ``{'x_names': [...], 'predict': {'x': DataFrame, 'code': ..., 'y': ...}}``.
        When the latest available date is not ``ref_date``, ``x`` is an empty
        frame and ``code``/``y`` are ``None``.
    """
    if isinstance(alpha_factors, Transformer):
        transformer = alpha_factors
    else:
        transformer = Transformer(alpha_factors)

    p = Period(frequency)
    p = Period(length=-(warm_start + batch - 1) * p.length(), units=p.units())

    start_date = advanceDateByCalendar('china.sse', ref_date, p, BizDayConventions.Following)
    dates = makeSchedule(start_date,
                         ref_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Backward)

    horizon = map_freq(frequency)

    factor_df = engine.fetch_factor_range(universe, factors=transformer, dates=dates)

    if fillna:
        factor_df = factor_df.groupby('trade_date').apply(
            lambda x: x.fillna(x.median())).reset_index(drop=True).dropna()
    else:
        factor_df = factor_df.dropna()

    if fit_target is None:
        target_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)
    else:
        # Fetch one extra period past the last schedule date, then keep only
        # the schedule dates.
        one_more_date = advanceDateByCalendar('china.sse', dates[-1], frequency)
        target_df = engine.fetch_factor_range_forward(universe,
                                                      factors=fit_target,
                                                      dates=dates + [one_more_date])
        target_df = target_df[target_df.trade_date.isin(dates)]
        # ``ffill()`` is the supported spelling of the deprecated
        # ``fillna(method='pad')``.
        target_df = target_df.groupby('code').apply(lambda x: x.ffill())

    names = transformer.names

    if neutralized_risk:
        risk_df = engine.fetch_risk_model_range(universe, dates=dates, risk_model=risk_model)[1]
        # Risk columns that are also factor columns already come from
        # factor_df; fetching them again would duplicate them in the merge.
        used_neutralized_risk = list(set(neutralized_risk).difference(names))
        risk_df = risk_df[['trade_date', 'code'] + used_neutralized_risk].dropna()
        train_x = pd.merge(factor_df, risk_df, on=['trade_date', 'code'])
        train_x = pd.merge(train_x, target_df, on=['trade_date', 'code'], how='left')
    else:
        train_x = pd.merge(factor_df, target_df, on=['trade_date', 'code'], how='left')

    # Rows may lack a value only in the last column (the target is
    # left-joined, so it can be missing on the prediction date).
    train_x.dropna(inplace=True, subset=train_x.columns[:-1])

    # Extract the risk exposures AFTER dropping rows so they stay row-aligned
    # with x_values / y_values, which are sliced with the same indices below.
    if neutralized_risk:
        risk_exp = train_x[neutralized_risk].values.astype(float)
    else:
        risk_exp = None

    x_values = train_x[names].values.astype(float)
    y_values = train_x[['dx']].values.astype(float)

    date_label = pd.DatetimeIndex(train_x.trade_date).to_pydatetime()
    dates = np.unique(date_label)

    if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
        end = dates[-1]
        start = dates[-batch] if batch <= len(dates) else dates[0]

        # Process the whole batch window together ...
        left_index = bisect.bisect_left(date_label, start)
        right_index = bisect.bisect_right(date_label, end)
        this_raw_x = x_values[left_index:right_index]
        this_raw_y = y_values[left_index:right_index]
        sub_dates = date_label[left_index:right_index]

        if risk_exp is not None:
            this_risk_exp = risk_exp[left_index:right_index]
        else:
            this_risk_exp = None

        ne_x = factor_processing(this_raw_x,
                                 pre_process=pre_process,
                                 risk_factors=this_risk_exp,
                                 post_process=post_process)

        ne_y = factor_processing(this_raw_y,
                                 pre_process=pre_process,
                                 risk_factors=this_risk_exp,
                                 post_process=post_process)

        # ... then keep only the rows belonging to the prediction date.
        inner_left_index = bisect.bisect_left(sub_dates, end)
        inner_right_index = bisect.bisect_right(sub_dates, end)

        ne_x = ne_x[inner_left_index:inner_right_index]
        ne_y = ne_y[inner_left_index:inner_right_index]

        left_index = bisect.bisect_left(date_label, end)
        right_index = bisect.bisect_right(date_label, end)

        codes = train_x.code.values[left_index:right_index]
    else:
        ne_x = None
        ne_y = None
        codes = None

    ret = dict()
    ret['x_names'] = transformer.names
    ret['predict'] = {
        'x': pd.DataFrame(ne_x, columns=transformer.names),
        'code': codes,
        # ne_y is None when there is no data for ref_date; calling
        # .flatten() unconditionally raised AttributeError in that case.
        'y': ne_y.flatten() if ne_y is not None else None,
    }

    return ret
def fetch_train_phase(engine,
                      alpha_factors: Union[Transformer, Iterable[object]],
                      ref_date,
                      frequency,
                      universe,
                      batch=1,
                      neutralized_risk: Iterable[str] = None,
                      risk_model: str = 'short',
                      pre_process: Iterable[object] = None,
                      post_process: Iterable[object] = None,
                      warm_start: int = 0,
                      fit_target: Union[Transformer, object] = None) -> dict:
    """Build the processed training set for the window ending before ``ref_date``.

    Args:
        engine: data source used for every ``fetch_*`` call.
        alpha_factors: a ``Transformer``, or anything accepted by
            ``Transformer(...)``.
        ref_date: reference date, formatted ``%Y-%m-%d``.
        frequency: period string for the schedule step and return horizon.
        universe: universe passed through to the engine.
        batch: number of dates in the training window.
        neutralized_risk: risk column names forwarded to ``_merge_df``.
        risk_model: risk model name forwarded to ``_merge_df``.
        pre_process: forwarded to ``factor_processing``.
        post_process: forwarded to ``factor_processing``.
        warm_start: extra periods of history to load before the window.
        fit_target: when ``None`` the target is the engine's forward return;
            otherwise it is fetched with ``fetch_factor_range_forward``.

    Returns:
        ``{'x_names': [...], 'train': {'x': DataFrame, 'y': ..., 'code': ...}}``.

    Raises:
        ValueError: (via ``pyFinAssert``) when ``ref_date`` is the only date
            with data, so nothing earlier is available for training.
    """
    if isinstance(alpha_factors, Transformer):
        transformer = alpha_factors
    else:
        transformer = Transformer(alpha_factors)

    p = Period(frequency)
    p = Period(length=-(warm_start + batch) * p.length(), units=p.units())

    start_date = advanceDateByCalendar('china.sse', ref_date, p, BizDayConventions.Following)
    dates = makeSchedule(start_date,
                         ref_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Backward)

    horizon = map_freq(frequency)

    factor_df = engine.fetch_factor_range(universe, factors=transformer, dates=dates)
    if fit_target is None:
        target_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)
    else:
        # Fetch one extra period past the last schedule date, then keep only
        # the schedule dates.
        one_more_date = advanceDateByCalendar('china.sse', dates[-1], frequency)
        target_df = engine.fetch_factor_range_forward(universe,
                                                      factors=fit_target,
                                                      dates=dates + [one_more_date])
        target_df = target_df[target_df.trade_date.isin(dates)]
        # ``ffill()`` is the supported spelling of the deprecated
        # ``fillna(method='pad')``.
        target_df = target_df.groupby('code').apply(lambda x: x.ffill())

    df = pd.merge(factor_df, target_df, on=['trade_date', 'code']).dropna()

    target_df, factor_df = df[['trade_date', 'code', 'dx']], df[['trade_date', 'code'] + transformer.names]

    target_df, dates, date_label, risk_exp, x_values, y_values, _, _, codes = \
        _merge_df(engine, transformer.names, factor_df, target_df, universe, dates, risk_model,
                  neutralized_risk)

    if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
        # ref_date itself is excluded from training: the window ends on the
        # date before it.
        pyFinAssert(len(dates) >= 2, ValueError,
                    "No previous data for training for the date {0}".format(ref_date))
        end = dates[-2]
        start = dates[-batch - 1] if batch <= len(dates) - 1 else dates[0]
    else:
        end = dates[-1]
        start = dates[-batch] if batch <= len(dates) else dates[0]

    index = (date_label >= start) & (date_label <= end)
    this_raw_x = x_values[index]
    this_raw_y = y_values[index]
    this_code = codes[index]
    if risk_exp is not None:
        this_risk_exp = risk_exp[index]
    else:
        this_risk_exp = None

    ne_x = factor_processing(this_raw_x,
                             pre_process=pre_process,
                             risk_factors=this_risk_exp,
                             post_process=post_process)

    ne_y = factor_processing(this_raw_y,
                             pre_process=pre_process,
                             risk_factors=this_risk_exp,
                             post_process=post_process)

    ret = dict()
    ret['x_names'] = transformer.names
    ret['train'] = {'x': pd.DataFrame(ne_x, columns=transformer.names), 'y': ne_y, 'code': this_code}

    return ret
def fetch_predict_phase(engine,
                        alpha_factors: Iterable[object],
                        ref_date,
                        frequency,
                        universe,
                        batch,
                        neutralized_risk: Iterable[str] = None,
                        risk_model: str = 'short',
                        pre_process: Iterable[object] = None,
                        post_process: Iterable[object] = None,
                        warm_start: int = 0):
    """Build the processed factor matrix used to predict on ``ref_date``.

    Args:
        engine: data source used for every ``fetch_*`` call.
        alpha_factors: expressions wrapped in a ``Transformer``.
        ref_date: prediction date, formatted ``%Y-%m-%d``.
        frequency: period string for the schedule step.
        universe: universe passed through to the engine.
        batch: number of most recent dates processed together; clamped to the
            number of dates that actually have data.
        neutralized_risk: risk column names used as ``risk_factors`` in
            ``factor_processing``; falsy disables neutralization.
        risk_model: risk model name passed to ``fetch_risk_model_range``.
        pre_process: forwarded to ``factor_processing``.
        post_process: forwarded to ``factor_processing``.
        warm_start: extra periods of history to load before the batch window.

    Returns:
        ``{'x_names': [...], 'predict': {'x': ..., 'code': ...}}``; ``x`` and
        ``code`` are ``None`` when the latest available date is not
        ``ref_date``.
    """
    transformer = Transformer(alpha_factors)

    p = Period(frequency)
    p = Period(length=-(warm_start + batch) * p.length(), units=p.units())

    start_date = advanceDateByCalendar('china.sse', ref_date, p, BizDayConventions.Following)
    dates = makeSchedule(start_date,
                         ref_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Backward)

    factor_df = engine.fetch_factor_range(universe, factors=transformer, dates=dates).dropna()

    names = transformer.names

    if neutralized_risk:
        risk_df = engine.fetch_risk_model_range(universe, dates=dates, risk_model=risk_model)[1]
        # Risk columns that are also factor columns already come from
        # factor_df; fetching them again would duplicate them in the merge.
        used_neutralized_risk = list(set(neutralized_risk).difference(names))
        risk_df = risk_df[['trade_date', 'code'] + used_neutralized_risk].dropna()
        train_x = pd.merge(factor_df, risk_df, on=['trade_date', 'code'])
        risk_exp = train_x[neutralized_risk].values.astype(float)
    else:
        train_x = factor_df.copy()
        risk_exp = None

    # Computed for both branches: previously x_values was only bound when
    # neutralized_risk was given, so the plain path raised NameError.
    x_values = train_x[names].values.astype(float)

    # Labels come from train_x (not factor_df) so the bisect positions below
    # index the same rows as x_values / risk_exp even when the inner merge
    # with the risk data dropped rows.
    date_label = pd.DatetimeIndex(train_x.trade_date).to_pydatetime()
    dates = np.unique(date_label)

    if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
        end = dates[-1]
        # Clamp the window to the available history instead of raising
        # IndexError when fewer than ``batch`` dates have data.
        start = dates[-batch] if batch <= len(dates) else dates[0]

        # Process the whole batch window together ...
        left_index = bisect.bisect_left(date_label, start)
        right_index = bisect.bisect_right(date_label, end)
        this_raw_x = x_values[left_index:right_index]
        sub_dates = date_label[left_index:right_index]

        if risk_exp is not None:
            this_risk_exp = risk_exp[left_index:right_index]
        else:
            this_risk_exp = None

        ne_x = factor_processing(this_raw_x,
                                 pre_process=pre_process,
                                 risk_factors=this_risk_exp,
                                 post_process=post_process)

        # ... then keep only the rows belonging to the prediction date.
        inner_left_index = bisect.bisect_left(sub_dates, end)
        inner_right_index = bisect.bisect_right(sub_dates, end)

        ne_x = ne_x[inner_left_index:inner_right_index]

        left_index = bisect.bisect_left(date_label, end)
        right_index = bisect.bisect_right(date_label, end)

        codes = train_x.code.values[left_index:right_index]
    else:
        ne_x = None
        codes = None

    ret = dict()
    ret['x_names'] = transformer.names
    ret['predict'] = {'x': ne_x, 'code': codes}

    return ret
def fetch_train_phase(engine,
                      alpha_factors: Iterable[object],
                      ref_date,
                      frequency,
                      universe,
                      batch,
                      neutralized_risk: Iterable[str] = None,
                      risk_model: str = 'short',
                      pre_process: Iterable[object] = None,
                      post_process: Iterable[object] = None,
                      warm_start: int = 0) -> dict:
    """Build the processed training set for the window ending before ``ref_date``.

    Args:
        engine: data source used for every ``fetch_*`` call.
        alpha_factors: expressions wrapped in a ``Transformer``.
        ref_date: reference date, formatted ``%Y-%m-%d``.
        frequency: period string for the schedule step and return horizon.
        universe: universe passed through to the engine.
        batch: number of dates in the training window; clamped to the number
            of dates that actually have data.
        neutralized_risk: risk column names forwarded to ``_merge_df``.
        risk_model: risk model name forwarded to ``_merge_df``.
        pre_process: forwarded to ``factor_processing``.
        post_process: forwarded to ``factor_processing``.
        warm_start: extra periods of history to load before the window.

    Returns:
        ``{'x_names': [...], 'train': {'x': ..., 'y': ...}}``.

    Raises:
        IndexError: when ``ref_date`` is the only date with data, so there is
            no earlier date to end the training window on.
    """
    transformer = Transformer(alpha_factors)

    p = Period(frequency)
    p = Period(length=-(warm_start + batch + 1) * p.length(), units=p.units())

    start_date = advanceDateByCalendar('china.sse', ref_date, p, BizDayConventions.Following)
    dates = makeSchedule(start_date,
                         ref_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Backward)

    horizon = _map_horizon(frequency)

    factor_df = engine.fetch_factor_range(universe, factors=transformer, dates=dates)
    return_df = engine.fetch_dx_return_range(universe, dates=dates, horizon=horizon)

    df = pd.merge(factor_df, return_df, on=['trade_date', 'code']).dropna()

    return_df, factor_df = df[['trade_date', 'code', 'dx']], df[['trade_date', 'code', 'isOpen'] + transformer.names]

    return_df, dates, date_label, risk_exp, x_values, y_values, _, _ = \
        _merge_df(engine, transformer.names, factor_df, return_df, universe, dates, risk_model, neutralized_risk)

    if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
        # ref_date itself is excluded from training: the window ends on the
        # date before it.
        end = dates[-2]
        # Clamp to the earliest available date instead of raising IndexError
        # when fewer than ``batch`` earlier dates have data.
        start = dates[-batch - 1] if batch <= len(dates) - 1 else dates[0]
    else:
        end = dates[-1]
        start = dates[-batch] if batch <= len(dates) else dates[0]

    index = (date_label >= start) & (date_label <= end)
    this_raw_x = x_values[index]
    this_raw_y = y_values[index]
    if risk_exp is not None:
        this_risk_exp = risk_exp[index]
    else:
        this_risk_exp = None

    ne_x = factor_processing(this_raw_x,
                             pre_process=pre_process,
                             risk_factors=this_risk_exp,
                             post_process=post_process)

    ne_y = factor_processing(this_raw_y,
                             pre_process=pre_process,
                             risk_factors=this_risk_exp,
                             post_process=post_process)

    ret = dict()
    ret['x_names'] = transformer.names
    ret['train'] = {'x': ne_x, 'y': ne_y}

    return ret
def testYearsMonthsAlgebra(self):
    """Check division, in-place addition and normalization of year/month periods.

    Covers:
      * dividing a one-year period by an integer (1Y / 4, 1Y / 2);
      * accumulating month and year periods with ``+=``;
      * that a 12-month period keeps its length/units until ``normalize()``
        is called, after which it is reported as 1 year;
      * that normalizing 30 days / 30 business days keeps the original units.
    """
    oneYear = Period(length=1, units=TimeUnits.Years)
    sixMonths = Period(length=6, units=TimeUnits.Months)
    threeMonths = Period(length=3, units=TimeUnits.Months)

    n = 4
    flag = oneYear / n == threeMonths
    self.assertTrue(
        flag,
        "division error: {0} / {1:d}"
        " not equal to {2}".format(oneYear, n, threeMonths))

    n = 2
    flag = oneYear / n == sixMonths
    self.assertTrue(
        flag,
        "division error: {0} / {1:d}"
        " not equal to {2}".format(oneYear, n, sixMonths))

    # ``total`` rather than ``sum`` to avoid shadowing the builtin.
    total = threeMonths
    total += sixMonths
    flag = total == Period(length=9, units=TimeUnits.Months)
    self.assertTrue(
        flag,
        "sum error: {0}"
        " + {1}"
        " != {2}".format(threeMonths, sixMonths,
                         Period(length=9, units=TimeUnits.Months)))

    total += oneYear
    flag = total == Period(length=21, units=TimeUnits.Months)
    self.assertTrue(
        flag,
        "sum error: {0}"
        " + {1}"
        " + {2}"
        " != {3}".format(threeMonths, sixMonths, oneYear,
                         Period(length=21, units=TimeUnits.Months)))

    # A freshly built 12-month period is not normalized implicitly.
    twelveMonths = Period(length=12, units=TimeUnits.Months)
    flag = twelveMonths.length() == 12
    self.assertTrue(
        flag,
        "normalization error: TwelveMonths.length"
        " is {0:d}"
        " instead of 12".format(twelveMonths.length()))

    flag = twelveMonths.units() == TimeUnits.Months
    self.assertTrue(
        flag,
        "normalization error: TwelveMonths.units"
        " is {0:d}"
        " instead of {1:d}".format(twelveMonths.units(), TimeUnits.Months))

    normalizedTwelveMonths = Period(length=12, units=TimeUnits.Months)
    normalizedTwelveMonths = normalizedTwelveMonths.normalize()
    flag = normalizedTwelveMonths.length() == 1
    # Report the normalized period's values: the messages previously printed
    # the un-normalized ``twelveMonths`` and so hid the actual failure.
    self.assertTrue(
        flag,
        "normalization error: normalizedTwelveMonths.length"
        " is {0:d}"
        " instead of 1".format(normalizedTwelveMonths.length()))

    flag = normalizedTwelveMonths.units() == TimeUnits.Years
    self.assertTrue(
        flag,
        "normalization error: normalizedTwelveMonths.units"
        " is {0:d}"
        " instead of {1:d}".format(normalizedTwelveMonths.units(),
                                   TimeUnits.Years))

    thirtyDays = Period(length=30, units=TimeUnits.Days)
    normalizedThirtyDays = thirtyDays.normalize()
    flag = normalizedThirtyDays.units() == TimeUnits.Days
    self.assertTrue(
        flag,
        "normalization error: ThirtyDays.units"
        " is {0:d}"
        " instead of {1:d}".format(normalizedThirtyDays.units(),
                                   TimeUnits.Days))

    thirtyBDays = Period(length=30, units=TimeUnits.BDays)
    normalizedThirtyBDays = thirtyBDays.normalize()
    flag = normalizedThirtyBDays.units() == TimeUnits.BDays
    self.assertTrue(
        flag,
        "normalization error: ThirtyBDays.units"
        " is {0:d}"
        " instead of {1:d}".format(normalizedThirtyBDays.units(),
                                   TimeUnits.BDays))