def initial_env(self):
    """Build the price environment: ETF close prices aligned to the prediction dates."""
    df_env = pd.DataFrame(index=self.df_pred.index)
    df_stk = djq_data_processor.get_data(self.etf)
    df_stk = df_stk.set_index('date')
    df_env = df_env.join(df_stk.close)
    # Fill gaps forward first, then backward for any leading NaNs
    df_env = df_env.fillna(method='ffill')
    df_env = df_env.fillna(method='backfill')
    return df_env
def initial_env(self):
    """Build the price environment: one close-price column per ETF, aligned to the prediction dates."""
    df_env = pd.DataFrame(index=self.df_pred.index)
    for name, etf_name in self.etf_names.items():
        try:
            df_stk = djq_data_processor.get_data(etf_name, inx=False)
        except Exception:
            raise ValueError('Cannot find the file!')
        df_stk = df_stk[['date', 'close']]
        df_stk = df_stk.set_index('date')
        df_stk = df_stk.rename(columns={'close': name})
        df_stk = df_stk.sort_values('date')
        # df_stk = df_stk.pct_change() + 1
        df_env = df_env.join(df_stk)
    # Fill gaps forward first, then backward for any leading NaNs
    df_env = df_env.fillna(method='ffill')
    df_env = df_env.fillna(method='backfill')
    return df_env
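# Illustrative sketch (not part of the original module): how initial_env aligns an ETF's
# close prices to the prediction-date index and fills gaps. The dates, column name and
# prices below are invented for the example; only pandas is assumed.
import pandas as pd

def demo_initial_env():
    pred_index = pd.Index(['2021-01-04', '2021-01-05', '2021-01-06', '2021-01-07'], name='date')
    df_env = pd.DataFrame(index=pred_index)
    # Pretend this came from djq_data_processor.get_data(); one trading day is missing.
    df_stk = pd.DataFrame({'date': ['2021-01-04', '2021-01-06', '2021-01-07'],
                           'close': [10.0, 10.4, 10.2]}).set_index('date')
    df_env = df_env.join(df_stk.rename(columns={'close': 'etf_a'}))
    # Gaps inherited from the join are filled forward, then backward for leading NaNs.
    df_env = df_env.fillna(method='ffill').fillna(method='backfill')
    return df_env

# demo_initial_env() ->
#             etf_a
# 2021-01-04   10.0
# 2021-01-05   10.0   <- forward-filled from the previous close
# 2021-01-06   10.4
# 2021-01-07   10.2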
def cls_to_weighted_pct(self, df):
    """
    The estimated result is a class interval number; map each class back to its
    corresponding pct change, then compute the index pct change using market-value weights.
    :param df: pandas.DataFrame, estimated result, one column per stock code
    :return: numpy.ndarray, estimated change of the index for each row of df
    """
    df2 = df.astype(float)
    for stk in df.columns:
        # Look up the pct change associated with the predicted tier of this stock
        df2[stk] = [self.book[self.book.code == stk]['tier' + str(int(c))].values[0]
                    for c in df[stk]]
    mkt = djq_data_processor.get_data('market')
    mkt = mkt.set_index('code')
    return np.average(df2, weights=mkt.loc[df.columns]['mktcap'], axis=1)
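# Illustrative sketch (not part of the class): mapping predicted class labels back to
# per-tier pct changes and weighting them by market cap, as cls_to_weighted_pct does.
# The codes, tier values and market caps below are invented for the example.
import numpy as np
import pandas as pd

book = pd.DataFrame({'code': ['000001', '000002'],
                     'tier0': [-0.02, -0.03],   # expected pct change for each predicted class
                     'tier1': [0.00, 0.01],
                     'tier2': [0.03, 0.04]})
pred = pd.DataFrame({'000001': [2, 1], '000002': [0, 2]})   # predicted classes per day
mapped = pred.astype(float)
for stk in pred.columns:
    mapped[stk] = [book[book.code == stk]['tier' + str(int(c))].values[0] for c in pred[stk]]
mktcap = pd.Series([300.0, 100.0], index=['000001', '000002'])  # market-value weights
index_chg = np.average(mapped, weights=mktcap[pred.columns], axis=1)
# index_chg[0] = (0.03 * 300 + (-0.03) * 100) / 400 = 0.015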
def mkt_cmp(df, mkt='399300', start_date='2020-01-01'):
    '''
    Draw the portfolio price chart compared with the market, then draw the linear
    regression line of portfolio returns against market returns and show the beta/alpha values.
    :param df: pandas.Series, index = date, time series of the NPV of the portfolio
    :param mkt: str with length=6, China stock index code number
    :param start_date: 'YYYY-mm-dd'
    :return: None
    '''
    assert isinstance(df, pd.Series)
    plt.figure(figsize=(10, 8))
    df.index = pd.to_datetime(df.index)
    df_mkt = djq_data_processor.get_data(mkt, inx=True)
    # df_mkt.date = df_mkt.date.dt.strftime('%Y/%m/%d')
    df_mkt = df_mkt.set_index('date')
    df_mkt.index = pd.to_datetime(df_mkt.index)
    if not set(df.index) & set(df_mkt.index):
        print('Input data error')
        return
    df = df[df.index >= start_date]
    df_mkt = df_mkt.loc[list(df.index)]
    df_mkt['close'] = df_mkt['close'] / df_mkt['close'][0]
    ax1 = plt.subplot(2, 1, 1)
    # ax1.xaxis.set_major_locator(ticker.MultipleLocator(base=10))
    plt.plot(df)
    plt.plot(df_mkt.close)
    plt.legend(['portfolio_profit', 'market_profit'], loc='best')
    # plt.xticks(rotation=330)
    df_chg = df.pct_change().dropna()
    mkt_chg = df_mkt.close.pct_change().dropna()
    plt.subplot(2, 1, 2)
    plt.scatter(mkt_chg, df_chg)
    cov_a_b = np.cov(df_chg, mkt_chg)[0][1]
    beta = cov_a_b / mkt_chg.var()
    alpha = df_chg.mean() - beta * mkt_chg.mean()
    x = np.linspace(min(mkt_chg), max(mkt_chg), 50)
    y1 = beta * x + alpha
    plt.plot(x, y1, color='red')
    plt.title('Security Characteristic Line with beta=%.4f, alpha=%.4f' % (beta, alpha))
    plt.show()
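# Illustrative usage sketch: computing beta/alpha the same way mkt_cmp does, from two
# NPV series. The dates and values are made up; the matplotlib plotting is omitted here.
import numpy as np
import pandas as pd

dates = pd.date_range('2020-01-01', periods=5, freq='D')
portfolio = pd.Series([1.00, 1.01, 1.00, 1.03, 1.05], index=dates)
market = pd.Series([1.00, 1.02, 0.99, 1.01, 1.02], index=dates)

df_chg = portfolio.pct_change().dropna()     # daily portfolio returns
mkt_chg = market.pct_change().dropna()       # daily market returns
beta = np.cov(df_chg, mkt_chg)[0][1] / mkt_chg.var()
alpha = df_chg.mean() - beta * mkt_chg.mean()
print('beta=%.4f, alpha=%.4f' % (beta, alpha))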
def data_prepare(self, code, drop=True, real_time=True):
    """
    :param code: China stock code with 6 digits, read from the local data set
    :param drop: set True to drop the rows of days with missing information
    :param real_time: add real-time data as the newest row when the market is open
    :return: train_data, train_label, test_data, test_label, df_train, df_test, class thresholds
    """
    xlst = zsys.ohlcVLst + zsys.stcokcharts_indicators + zsys.last_data  # + zsys.TDS_talib_indicators_all
    train_start = '2012-01-01'
    train_end = '2019-12-31'
    n_pca = 50
    pca = True
    min_profit_ratio = self.target_day // 3
    # Parse the data_params options: feature list, date range, minimum profit, PCA/LDA
    for part in self.data_params:
        if part.startswith('xlst'):
            xlst_name = part[5:].split('+')
            xlst = []
            for name in xlst_name:
                xlst += {'ohlcV': zsys.ohlcVLst,
                         'chart': zsys.stcokcharts_indicators,
                         'last': zsys.last_data,
                         'all': zsys.TDS_talib_indicators_all,
                         'ohlc': zsys.ohlcLst,
                         'talib5': zsys.TDS_talib_multi_indicators_5,
                         'talib15': zsys.TDS_talib_multi_indicators_15,
                         'talib30': zsys.TDS_talib_multi_indicators_30,
                         'talibstatic': zsys.TDS_talib_multi_indicators_Static}[name]
        elif part.startswith('date'):
            start, end = part[5:].split('-')
            train_start = start + '-01-01'
            train_end = end + '-12-31'
        elif part.startswith('minprofit'):
            min_profit_ratio = int(part[10:])
        elif part.startswith('pca'):
            # e.g. 'pca50' -> 50 components (the original checked part[4:], an off-by-one)
            if part[3:].isnumeric():
                n_pca = int(part[3:])
        elif part.startswith('lda'):
            pca = False
            n_pca = self.classify - 1

    # Data preparation
    try:
        df = djq_data_processor.get_data(code)
    except Exception:
        raise ValueError('Cannot find the file!')
    if real_time and list(df['date'])[-1] != time.strftime('%Y-%m-%d'):
        # Estimate how much of the trading day (09:30-11:30, 13:00-15:00) has elapsed
        open_time = time.strptime(time.strftime('%Y-%m-%d') + ' 09:30:00', '%Y-%m-%d %H:%M:%S')
        now = time.strptime(time.strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
        diff = min(max(0, time.mktime(now) - time.mktime(open_time)), 2 * 60 * 60)
        open_time = time.strptime(time.strftime('%Y-%m-%d') + ' 13:00:00', '%Y-%m-%d %H:%M:%S')
        diff += min(max(0, time.mktime(now) - time.mktime(open_time)), 2 * 60 * 60)
        new = ts.get_realtime_quotes([code])
        if new['date'][0] != list(df['date'])[-1] and diff and float(new['open'][0]):
            # Scale today's partial volume up to a full-day estimate
            time_multiple = 4 * 60 * 60 / diff
            line = pd.Series(dict(zip(['date', 'open', 'high', 'low', 'close', 'volume'],
                                      [time.strftime('%Y-%m-%d'),
                                       float(new['open'][0]),
                                       float(new['high'][0]),
                                       float(new['low'][0]),
                                       float(new['price'][0]),
                                       time_multiple * float(new['volume'][0][:-2])])))
            df = df.append(line, ignore_index=True)
            df = df.sort_values('date', ascending=False)
            df = df.reset_index(drop=True)
    if df.shape[0] < 252:
        raise ValueError('Not enough train data!')
    df = get_all_finanical_indicators(df)
    djq_data_processor.get_label(df, target_day=self.target_day)
    df = df[df.date >= train_start]
    if drop:
        df = df.dropna()
    # transfer_label_to_classification(df, classify=self.classify)
    df['y_pct_change'] = df['y'].copy()
    df_train = df[df.date <= train_end].copy()
    if df_train.shape[0] < 252:
        raise ValueError('Not enough train data!')
    df_test = df[df.date > train_end].copy()
    # Bin the training labels into equal-sized classes; reuse the same thresholds for the test set
    split, thresholds = pd.qcut(df_train['y'], self.classify,
                                labels=range(self.classify), retbins=True)
    if thresholds[-2] < min_profit_ratio:
        pass
        # raise ValueError('Too small profit!')
    df_train['y'] = np.array(split)
    df_test['y'] = np.array(pd.cut(df_test['y'], thresholds, labels=range(self.classify)))

    # Data cleaning: standardise, then reduce dimensionality
    std = preprocessing.StandardScaler()
    x_train = df_train[xlst].values
    x_train = std.fit_transform(x_train)
    if pca:
        dimension_reducer = PCA(n_components=n_pca)
        dimension_reducer.fit(x_train)
    else:
        # LDA dimension reduction (supervised, needs the class labels)
        dimension_reducer = LDA(n_components=n_pca)
        dimension_reducer.fit(x_train, df_train['y'])
    x_train = dimension_reducer.transform(x_train)
    # The test set is processed with the same transforms as the training set
    x_test = df_test[xlst].values
    if x_test.shape[0]:
        x_test = std.transform(x_test)
        x_test = dimension_reducer.transform(x_test)
    return (self.subclassifiers_transfer(code, x_train), df_train['y'].values,
            self.subclassifiers_transfer(code, x_test), df_test['y'].values,
            df_train, df_test, thresholds)
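# Illustrative sketch (not part of the class): how data_prepare turns the continuous label
# into classes. qcut on the training labels returns both the class assignments and the bin
# thresholds, and the same thresholds are reused with cut on the test labels. The label
# values below are synthetic.
import numpy as np
import pandas as pd

classify = 3
y_train = pd.Series([-4.0, -1.0, 0.5, 1.0, 2.5, 6.0])
y_test = pd.Series([-2.0, 0.8, 5.0])

split, thresholds = pd.qcut(y_train, classify, labels=range(classify), retbins=True)
y_train_cls = np.array(split)                                   # equal-sized classes 0..classify-1
y_test_cls = np.array(pd.cut(y_test, thresholds, labels=range(classify)))
# Test samples outside the training range map to NaN; thresholds[-2] is the lower edge of
# the top class, which data_prepare compares against min_profit_ratio.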