def prep_data_rf():
    """
    Calculate daily indicators and store them in rf_data_research.

    Multiprocessing is used in this function to speed up the process.

    Returns:
        None.
    """
    stock_list = get_data.get_sql_key()
    stock_list_f = get_data.get_sql_key(name='data_finance')
    stock_list = list(set(stock_list) & set(stock_list_f))
    stock_list_list = divide_list(stock_list, 24)
    print('Start')
    p = Pool()
    for i in range(24):
        p.apply_async(prep_data_for_rf, args=(stock_list_list[i], i,))
    p.close()
    p.join()
    con = db.connect('D:\\Data\\rf_data_research.sqlite')
    cur = con.cursor()
    for i in range(24):
        data_temp = get_data.get_from_sql(name='rf_data_research_' + str(i))
        for stock in data_temp:
            data_temp[stock].to_sql(
                name=stock,
                con=con,
                if_exists='replace',
                index=False
            )
        os.remove('D:\\Data\\rf_data_research_' + str(i) + '.sqlite')
    con.commit()
    cur.close()
    con.close()
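# prep_data_rf relies on a divide_list helper that is defined elsewhere in the
# repo. A minimal sketch of its assumed behavior (split a list into n roughly
# equal chunks, the last chunk absorbing the remainder, mirroring the manual
# slicing done in main() below); the name divide_list_sketch is hypothetical:
def divide_list_sketch(items, n):
    unit = int(len(items) / n)
    chunks = [items[i * unit: (i + 1) * unit] for i in range(n - 1)]
    chunks.append(items[(n - 1) * unit:])  # remainder goes to the last chunk
    return chunks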
def give_relative_return():
    """
    Calculate daily relative return cross-sectionally based on the data
    gathered.

    Multiprocessing is used in this function to speed up the process.

    Returns:
        None.
    """
    result = get_data.get_from_sql(stock_id='All_Data',
                                   name='rf_data_research_target')
    result = result[result['date'] >= '20070101']
    result = result.sort_values(by=['date']).reset_index(drop=True)
    sub_result = {}
    cutting = list(result['date'].drop_duplicates().index)
    unit = int(len(cutting) / 24)
    for i in range(24):
        if i != 23:
            sub_result[i] = result.loc[cutting[i * unit]:
                                       cutting[(i + 1) * unit] - 1]
        else:
            sub_result[i] = result.loc[cutting[i * unit]:]
    dict_industry = get_data.get_industry_stock_list()
    industry_list = list(dict_industry.keys()) + ['None']
    cal_relative_return(sub_result[0], industry_list, 0)
    p = Pool()
    print('Start pooling')
    for i in range(24):
        p.apply_async(cal_relative_return,
                      args=(sub_result[i], industry_list, i, ))
    p.close()
    p.join()
    print('Done pooling')
    final = pd.DataFrame()
    con = db.connect('D:\\Data\\rf_data_research_target.sqlite')
    for i in range(24):
        final = final.append(
            get_data.get_from_sql(stock_id='Processed_data_' + str(i),
                                  name='rf_data_research_target'))
    final.to_sql(
        name='Processed_data',
        con=con,
        if_exists='replace',
        index=False
    )
    con.commit()
    con.close()
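# cal_relative_return itself is defined elsewhere. From how give_relative_return
# merges the per-chunk 'Processed_data_<i>' tables, and from the 0.7/0.3
# thresholds applied to columns like 'industry_return_1d' and 'all_return_1d'
# in calc_accr_rf below, it is assumed to turn each day's raw forward returns
# into cross-sectional percentile ranks. A hedged sketch under that assumption
# (the _sketch name and the exact grouping are inferred, not taken from source):
def cal_relative_return_sketch(chunk, industry_list, i):
    out = chunk.copy()
    for col in ['return_1d', 'return_5d', 'return_20d']:
        # percentile of the stock's return among all stocks on that day
        out['all_' + col] = chunk.groupby('date')[col].rank(pct=True)
        # percentile within the stock's industry on that day
        out['industry_' + col] = chunk.groupby(['date', 'industry'])[col].rank(pct=True)
        # percentile within the stock's index bucket (hs300/zz500/None)
        out['index_' + col] = chunk.groupby(['date', 'index'])[col].rank(pct=True)
    # The real function is assumed to also persist `out` to the
    # 'Processed_data_<i>' table that give_relative_return merges above.
    return out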
def main():
    ts.set_token('267addf63a14adcfc98067fc253fbd72a728461706acf9474c0dae29')
    pro = ts.pro_api()
    dict_300 = {}
    for i in range(14):
        dict_300[str(2007 + i) + '0101'] = list(
            pro.index_weight(index_code='399300.SZ',
                             start_date=str(2007 + i) + '0101',
                             end_date=str(2007 + i) + '0110')['con_code'].iloc[:300])
        dict_300[str(2007 + i) + '0701'] = list(
            pro.index_weight(index_code='399300.SZ',
                             start_date=str(2007 + i) + '0625',
                             end_date=str(2007 + i) + '0701')['con_code'].iloc[:300])
    dict_500 = {}
    for i in range(14):
        dict_500[str(2007 + i) + '0101'] = list(
            pro.index_weight(index_code='000905.SH',
                             start_date=str(2007 + i) + '0101',
                             end_date=str(2007 + i) + '0201')['con_code'].iloc[:500])
        dict_500[str(2007 + i) + '0701'] = list(
            pro.index_weight(index_code='000905.SH',
                             start_date=str(2007 + i) + '0625',
                             end_date=str(2007 + i) + '0710')['con_code'].iloc[:500])
    calendar = pro.trade_cal(exchange='')
    calendar = calendar[calendar['is_open'] == 1]['cal_date']
    dict_industry = get_data.get_industry_stock_list()
    stock_list = get_data.get_sql_key()
    # prep_data_for_rf(stock_list, dict_industry, calendar, 1, dict_300, dict_500)
    stock_list_list = []
    length = int(len(stock_list) / 24)
    for i in range(24):
        if i == 23:
            stock_list_list.append(stock_list[i * length:])
        else:
            stock_list_list.append(stock_list[i * length: (i + 1) * length])
    p = Pool()
    for i in range(24):
        p.apply_async(prep_data_for_rf,
                      args=(stock_list_list[i], dict_industry, calendar, i,
                            dict_300, dict_500, ))
    p.close()
    p.join()
    data = {}
    for i in range(24):
        data_temp = get_data.get_from_sql(name='rf_temp_' + str(i))
        data = {**data, **data_temp}
    con = db.connect('D:\\Data\\rf_data_d.sqlite')
    cur = con.cursor()
    for stock in data:
        data[stock].to_sql(
            name=stock,
            con=con,
            if_exists='replace',
            index=False
        )
    con.commit()
    cur.close()
    con.close()
    return None
def prep_data_for_rf(stock_list, dict_industry, calendar, i, dict_300, dict_500):
    con = db.connect('D:\\Data\\rf_temp_' + str(i) + '.sqlite')
    cur = con.cursor()
    for stock in stock_list:
        try:
            df = get_data.get_from_sql(stock_id=stock)
            df_f = get_data.get_from_sql(stock_id=stock, name='data_finance')
            df.set_index('trade_date', inplace=True)
            df_f.set_index('ann_date', inplace=True)
            listed_date = df.index[0]
            # Skip the first 13 months after listing to avoid IPO noise.
            usable = max(change_month(listed_date, 13), '20070101')
            if (usable > '20200101') or (usable > str(df.index[-1])):
                continue
            if 'turnover_rate_f' not in df.columns:
                df['turnover_rate_f'] = df['turnover_rate']
            first_date = df.index[df.index >= usable][0]
            current_date = first_date
            df['pct_chg'] = (df['close'] / df['pre_close'] - 1) * 100
            data = pd.DataFrame()
            while (current_date <= '20200101') and (current_date < df.index[-30]):
                # stock_date = df.index[df.index >= current_date][0]
                '''
                if stock_date != current_date:
                    # If the stock is not trading in the first trading day
                    # of the month, we don't collect its data and won't do
                    # trading on that stock this month.
                    continue
                '''
                last_trading_day = df.index[df.index < current_date][-1]
                existing_data = df.loc[: last_trading_day]
                last_year = change_month(current_date, -12)
                last_month = change_month(current_date, -1)
                last_year = df.index[df.index >= last_year][0]
                last_month = df.index[df.index >= last_month][0]
                last_year_df = df.loc[last_year: last_trading_day]
                last_month_df = df.loc[last_month: last_trading_day]
                f_date = df_f.index[df_f.index <= current_date][0]
                next_date = df.index[df.index > current_date][0]
                next_5d = df.index[df.index > current_date][4]
                last_5d = df.index[df.index < current_date][-5]
                price = df.loc[last_trading_day, 'close'] * df.loc[last_trading_day, 'adj_factor']
                if df_f.loc[f_date, 'rd_exp'] and df.loc[last_trading_day, 'total_mv'] \
                        and df.loc[last_trading_day, 'pe_ttm']:
                    rd_exp_to_earning = df_f.loc[f_date, 'rd_exp'] \
                        / df.loc[last_trading_day, 'total_mv'] \
                        * df.loc[last_trading_day, 'pe_ttm']
                else:
                    rd_exp_to_earning = np.nan
                if df_f.loc[f_date, 'fcfe'] and df.loc[last_trading_day, 'total_mv']:
                    fcfe = df_f.loc[f_date, 'fcfe'] / df.loc[last_trading_day, 'total_mv'] / 10000
                else:
                    fcfe = np.nan
                if df.index[df.index > next_date].shape[0] == 0:
                    break
                # Forward returns (labels) and recent backward returns.
                return_rate_1d = df.loc[next_date, 'close'] / df.loc[next_date, 'pre_close'] - 1
                return_rate_5d = df.loc[next_5d, 'close'] * df.loc[next_5d, 'adj_factor'] \
                    / (df.loc[current_date, 'close'] * df.loc[current_date, 'adj_factor']) - 1
                return_rate_1m = cal_return(current_date, 1, 0, df)
                return_last_1d = df.loc[last_trading_day, 'close'] \
                    / df.loc[last_trading_day, 'pre_close'] - 1
                return_last_5d = price / (df.loc[last_5d, 'close'] * df.loc[last_5d, 'adj_factor']) - 1
                # Hand-rolled RSI on adjusted prices, plus a folded version that
                # treats the overbought/oversold bands symmetrically.
                if np.mean(np.abs(existing_data['close'][-3:] - existing_data['pre_close'][-3:])) == 0:
                    rsi_3 = 100
                else:
                    rsi_3 = 100 * np.mean(existing_data['adj_factor'][-3:]
                                          * np.maximum(existing_data['close'][-3:]
                                                       - existing_data['pre_close'][-3:], 0)) \
                        / np.mean(existing_data['adj_factor'][-3:]
                                  * np.abs(existing_data['close'][-3:]
                                           - existing_data['pre_close'][-3:]))
                if rsi_3 > 70:
                    rsi_3_adj = 50 - rsi_3
                elif rsi_3 > 50:
                    rsi_3_adj = rsi_3 - 50
                elif rsi_3 > 30:
                    rsi_3_adj = 30 - rsi_3
                else:
                    rsi_3_adj = 20 + rsi_3
                if np.mean(np.abs(existing_data['close'][-14:] - existing_data['pre_close'][-14:])) == 0:
                    rsi_14 = 100
                else:
                    rsi_14 = 100 * np.mean(existing_data['adj_factor'][-14:]
                                           * np.maximum(existing_data['close'][-14:]
                                                        - existing_data['pre_close'][-14:], 0)) \
                        / np.mean(existing_data['adj_factor'][-14:]
                                  * np.abs(existing_data['close'][-14:]
                                           - existing_data['pre_close'][-14:]))
                if rsi_14 > 70:
                    rsi_14_adj = 50 - rsi_14
                elif rsi_14 > 50:
                    rsi_14_adj = rsi_14 - 50
                elif rsi_14 > 30:
                    rsi_14_adj = 30 - rsi_14
                else:
                    rsi_14_adj = 20 + rsi_14
                rsi_28 = 100 * np.mean(existing_data['adj_factor'][-28:]
                                       * np.maximum(existing_data['close'][-28:]
                                                    - existing_data['pre_close'][-28:], 0)) \
                    / np.mean(existing_data['adj_factor'][-28:]
                              * np.abs(existing_data['close'][-28:]
                                       - existing_data['pre_close'][-28:]))
                if rsi_28 > 70:
                    rsi_28_adj = 50 - rsi_28
                elif rsi_28 > 50:
                    rsi_28_adj = rsi_28 - 50
                elif rsi_28 > 30:
                    rsi_28_adj = 30 - rsi_28
                else:
                    rsi_28_adj = 20 + rsi_28
                obv = np.sum(np.sign(last_month_df['close'] - last_month_df['pre_close'])
                             * last_month_df['vol'])
                if last_month_df.shape[0] <= 10:
                    # Too few observations last month; leave the monthly features missing.
                    return_var_month_realized = np.nan
                    return_skew_month_realized = np.nan
                    return_kurt_month_realized = np.nan
                    avg_tr_last_month = np.nan
                    avg_tr_last_month_avg_tr_last_year = np.nan
                    return_var_month = np.nan
                    return_skew_month = np.nan
                    return_kurt_month = np.nan
                    return_d_var_month = np.nan
                    return_u_var_month = np.nan
                    return_d_var_var_month = np.nan
                    t_t_1 = np.nan
                    max_return_last_month = np.nan
                    corr_vol_close_month = np.nan
                    corr_vol_high_month = np.nan
                    corr_vol_open_month = np.nan
                    corr_vol_low_month = np.nan
                    high_open_month = np.nan
                    close_low_month = np.nan
                    trend_strength_month = np.nan
                else:
                    if np.isnan(last_month_df['turnover_rate_f']).all():
                        last_month_df.loc[:, 'turnover_rate_f'] = last_month_df['turnover_rate']
                    return_var_month_realized = np.nanmean(last_month_df['pct_chg'] ** 2)
                    return_skew_month_realized = np.nanmean(last_month_df['pct_chg'] ** 3) \
                        / (return_var_month_realized ** 1.5)
                    return_kurt_month_realized = np.nanmean(last_month_df['pct_chg'] ** 4) \
                        / (return_var_month_realized ** 2)
                    avg_tr_last_month = np.nanmean(last_month_df['turnover_rate_f'])
                    avg_tr_last_month_avg_tr_last_year = np.nanmean(
                        last_month_df['turnover_rate_f']) / np.nanmean(last_year_df['turnover_rate_f'])
                    return_var_month = last_month_df['pct_chg'].var()
                    return_skew_month = last_month_df['pct_chg'].skew()
                    return_kurt_month = last_month_df['pct_chg'].kurt()
                    return_d_var_month = d_var(last_month_df['pct_chg'])
                    return_u_var_month = u_var(last_month_df['pct_chg'])
                    return_d_var_var_month = return_d_var_month / return_var_month
                    t_t_1 = cal_return(current_date, 0, -1, df)
                    max_return_last_month = np.nanmax(last_month_df['pct_chg'])
                    corr_vol_close_month = corrcoef(last_month_df['vol'],
                                                    last_month_df['adj_factor'] * last_month_df['close'])
                    corr_vol_high_month = corrcoef(last_month_df['vol'],
                                                   last_month_df['adj_factor'] * last_month_df['high'])
                    corr_vol_open_month = corrcoef(last_month_df['vol'],
                                                   last_month_df['adj_factor'] * last_month_df['open'])
                    corr_vol_low_month = corrcoef(last_month_df['vol'],
                                                  last_month_df['adj_factor'] * last_month_df['low'])
                    high_open_month = np.nanmean(last_month_df['high'] / last_month_df['open'])
                    close_low_month = np.nanmean(last_month_df['close'] / last_month_df['low'])
                    trend_strength_month = (last_month_df['close'][-1] - last_month_df['pre_close'][0]) \
                        / np.nansum(np.abs(last_month_df['close'] - last_month_df['pre_close']))
                if last_year_df.shape[0] <= 20:
                    return_var_year_realized = np.nan
                    return_skew_year_realized = np.nan
                    return_kurt_year_realized = np.nan
                    return_var_year = np.nan
                    return_skew_year = np.nan
                    return_kurt_year = np.nan
                    return_d_var_year = np.nan
                    return_u_var_year = np.nan
                    return_d_var_var_year = np.nan
                    std_tr_last_year = np.nan
                    avg_abs_return_tr_last_year = np.nan
                    close_last_year_high = np.nan
                    max_return_last_year = np.nan
                    corr_vol_close_year = np.nan
                    corr_vol_high_year = np.nan
                    corr_vol_open_year = np.nan
                    corr_vol_low_year = np.nan
                    high_open_year = np.nan
                    close_low_year = np.nan
                    trend_strength_year = np.nan
                    ma5_price = np.nan
                    ma20_price = np.nan
                    ma20_ma5 = np.nan
                    SO_k = np.nan
                else:
                    if np.isnan(last_year_df['turnover_rate_f']).all():
                        last_year_df.loc[:, 'turnover_rate_f'] = last_year_df['turnover_rate']
                    return_var_year_realized = np.nanmean(last_year_df['pct_chg'] ** 2)
                    return_skew_year_realized = np.nanmean(last_year_df['pct_chg'] ** 3) \
                        / (return_var_year_realized ** 1.5)
                    return_kurt_year_realized = np.nanmean(last_year_df['pct_chg'] ** 4) \
                        / (return_var_year_realized ** 2)
                    return_var_year = last_year_df['pct_chg'].var()
                    return_skew_year = last_year_df['pct_chg'].skew()
                    return_kurt_year = last_year_df['pct_chg'].kurt()
                    return_d_var_year = d_var(last_year_df['pct_chg'])
                    return_u_var_year = u_var(last_year_df['pct_chg'])
                    return_d_var_var_year = return_d_var_year / return_var_year
                    std_tr_last_year = np.nanstd(last_year_df['turnover_rate_f'])
                    avg_abs_return_tr_last_year = np.nanmean(
                        np.abs(last_year_df['pct_chg']) / last_year_df['turnover_rate_f'])
                    close_last_year_high = df.loc[last_trading_day, 'close'] \
                        * df.loc[last_trading_day, 'adj_factor'] \
                        / np.nanmax(last_year_df['high'] * last_year_df['adj_factor'])
                    max_return_last_year = np.nanmax(last_year_df['pct_chg'])
                    corr_vol_close_year = corrcoef(last_year_df['vol'],
                                                   last_year_df['adj_factor'] * last_year_df['close'])
                    corr_vol_high_year = corrcoef(last_year_df['vol'],
                                                  last_year_df['adj_factor'] * last_year_df['high'])
                    corr_vol_open_year = corrcoef(last_year_df['vol'],
                                                  last_year_df['adj_factor'] * last_year_df['open'])
                    corr_vol_low_year = corrcoef(last_year_df['vol'],
                                                 last_year_df['adj_factor'] * last_year_df['low'])
                    high_open_year = np.nanmean(last_year_df['high'] / last_year_df['open'])
                    close_low_year = np.nanmean(last_year_df['close'] / last_year_df['low'])
                    trend_strength_year = (last_year_df['close'][-1] - last_year_df['pre_close'][0]) \
                        / np.nansum(np.abs(last_year_df['close'] - last_year_df['pre_close']))
                    ma5_price = (np.nanmean(last_year_df['close'][-5:]
                                            * last_year_df['adj_factor'][-5:]) - price) / price
                    ma20_price = (np.nanmean(last_year_df['close'][-20:]
                                             * last_year_df['adj_factor'][-20:]) - price) / price
                    ma20_ma5 = (np.nanmean(last_year_df['close'][-20:]
                                           * last_year_df['adj_factor'][-20:])
                                - np.nanmean(last_year_df['close'][-5:]
                                             * last_year_df['adj_factor'][-5:])) / price
                    SO_k = SO(last_year_df.iloc[-20:])
                if last_year_df.shape[0] > 120:
                    ma120_price = (np.nanmean(last_year_df['close'][-120:]
                                              * last_year_df['adj_factor'][-120:]) - price) / price
                    ma120_ma40 = (np.nanmean(last_year_df['close'][-120:]
                                             * last_year_df['adj_factor'][-120:])
                                  - np.nanmean(last_year_df['close'][-40:]
                                               * last_year_df['adj_factor'][-40:])) / price
                else:
                    ma120_price = np.nan
                    ma120_ma40 = np.nan
                if last_year_df.shape[0] > 60:
                    ma60_price = (np.nanmean(last_year_df['close'][-60:]
                                             * last_year_df['adj_factor'][-60:]) - price) / price
                    ma60_ma20 = (np.nanmean(last_year_df['close'][-60:]
                                            * last_year_df['adj_factor'][-60:])
                                 - np.nanmean(last_year_df['close'][-20:]
                                              * last_year_df['adj_factor'][-20:])) / price
                    # %D is the average of the three most recent 20-day %K windows.
                    SO_d = np.nanmean([SO(last_year_df.iloc[-20:]),
                                       SO(last_year_df.iloc[-40:-20]),
                                       SO(last_year_df.iloc[-60:-40])])
                    SO_k_d = SO_k - SO_d
                else:
                    ma60_price = np.nan
                    ma60_ma20 = np.nan
                    SO_d = np.nan
                    SO_k_d = np.nan
                info = {
                    'tick': stock,
                    'industry': industry_stock(stock, dict_industry),
                    'stock_value_cat': stock_to_cat(stock, current_date, dict_300, dict_500),
                    'date': current_date,
                    'return_rate_1d': return_rate_1d,
                    'return_rate_5d': return_rate_5d,
                    'return_rate_1m': return_rate_1m,
                    'close': price,
                    # Trend factors
                    # Reversal
                    'return_last_1d': return_last_1d,
                    'return_last_5d': return_last_5d,
                    't_6_t_12': cal_return(current_date, -6, -12, df),
                    't_12_t_36': cal_return(current_date, -12, -36, df),
                    't_12_t_18': cal_return(current_date, -12, -18, df),
                    # Momentum
                    't_1_t_6': cal_return(current_date, -1, -6, df),
                    't_t_1': t_t_1,
                    'return_month': cal_hist_month(df, current_date),
                    'SO_k': SO_k,
                    'SO_d': SO_d,
                    'SO_k_d': SO_k_d,
                    'ma5_price': ma5_price,
                    'ma20_price': ma20_price,
                    'ma20_ma5': ma20_ma5,
                    'ma60_price': ma60_price,
                    'ma60_ma20': ma60_ma20,
                    'ma120_price': ma120_price,
                    'ma120_ma40': ma120_ma40,
                    # Liquidity
                    'std_tr_last_year': std_tr_last_year,
                    'avg_tr_last_month': avg_tr_last_month,
                    'avg_tr_last_month_avg_tr_last_year': avg_tr_last_month_avg_tr_last_year,
                    # Technical
                    'rsi_3': rsi_3,
                    'rsi_3_adj': rsi_3_adj,
                    'rsi_14': rsi_14,
                    'rsi_14_adj': rsi_14_adj,
                    'rsi_28': rsi_28,
                    'rsi_28_adj': rsi_28_adj,
                    'obv': obv,
                    'close_last_year_high': close_last_year_high,
                    'max_return_last_month': max_return_last_month,
                    'max_return_last_year': max_return_last_year,
                    'avg_abs_return_tr_last_year': avg_abs_return_tr_last_year,
                    'ln_mv_t': math.log(df.loc[last_trading_day, 'total_mv']),
                    'ln_mv_c': math.log(df.loc[last_trading_day, 'circ_mv']),
                    'mv_c_mv_t': df.loc[last_trading_day, 'circ_mv']
                                 / df.loc[last_trading_day, 'total_mv'],
                    'return_var_month_realized': return_var_month_realized,
                    'return_skew_month_realized': return_skew_month_realized,
                    'return_kurt_month_realized': return_kurt_month_realized,
                    'return_var_month': return_var_month,
                    'return_skew_month': return_skew_month,
                    'return_kurt_month': return_kurt_month,
                    'return_d_var_month': return_d_var_month,
                    'return_u_var_month': return_u_var_month,
                    'return_d_var_var_month': return_d_var_var_month,
                    'return_var_year_realized': return_var_year_realized,
                    'return_skew_year_realized': return_skew_year_realized,
                    'return_kurt_year_realized': return_kurt_year_realized,
                    'return_var_year': return_var_year,
                    'return_skew_year': return_skew_year,
                    'return_kurt_year': return_kurt_year,
                    'return_d_var_year': return_d_var_year,
                    'return_u_var_year': return_u_var_year,
                    'return_d_var_var_year': return_d_var_var_year,
                    'corr_vol_close_month': corr_vol_close_month,
                    'corr_vol_high_month': corr_vol_high_month,
                    'corr_vol_open_month': corr_vol_open_month,
                    'corr_vol_low_month': corr_vol_low_month,
                    'high_open_month': high_open_month,
                    'close_low_month': close_low_month,
                    'trend_strength_month': trend_strength_month,
                    'corr_vol_close_year': corr_vol_close_year,
                    'corr_vol_high_year': corr_vol_high_year,
                    'corr_vol_open_year': corr_vol_open_year,
                    'corr_vol_low_year': corr_vol_low_year,
                    'high_open_year': high_open_year,
                    'close_low_year': close_low_year,
                    'trend_strength_year': trend_strength_year,
                    # Value factors
                    'pe': df.loc[last_trading_day, 'pe_ttm'],
                    'ps': df.loc[last_trading_day, 'ps_ttm'],
                    'pb': df.loc[last_trading_day, 'pb'],
                    'fcfe': fcfe,
                    'rd_exp_to_earning': rd_exp_to_earning,
                }
                # The remaining value factors share their column names with
                # data_finance, so copy them over in a loop.
                for col in ['current_ratio', 'quick_ratio', 'cash_ratio', 'inv_turn',
                            'ar_turn', 'ca_turn', 'fa_turn', 'assets_turn', 'tax_to_ebt',
                            'ocf_to_or', 'ocf_to_opincome', 'ca_to_assets',
                            'tbassets_to_totalassets', 'int_to_talcap',
                            'currentdebt_to_debt', 'longdeb_to_debt', 'ocf_to_shortdebt',
                            'debt_to_eqt', 'tangibleasset_to_debt', 'tangasset_to_intdebt',
                            'tangibleasset_to_netdebt', 'ocf_to_debt', 'ocf_to_interestdebt',
                            'longdebt_to_workingcapital', 'ebitda_to_debt', 'cash_to_liqdebt',
                            'cash_to_liqdebt_withinterest', 'q_netprofit_margin',
                            'q_gsprofit_margin', 'q_exp_to_sales', 'q_profit_to_gr',
                            'q_saleexp_to_gr', 'q_adminexp_to_gr', 'q_finaexp_to_gr',
                            'q_impair_to_gr_ttm', 'q_gc_to_gr', 'q_op_to_gr', 'q_roe',
                            'q_dt_roe', 'q_npta', 'q_opincome_to_ebt',
                            'q_investincome_to_ebt', 'q_dtprofit_to_profit',
                            'q_salescash_to_or', 'q_ocf_to_sales', 'q_ocf_to_or',
                            'ocf_yoy', 'roe_yoy', 'q_gr_yoy', 'q_sales_yoy', 'q_op_yoy',
                            'q_profit_yoy', 'q_netprofit_yoy', 'equity_yoy']:
                    info[col] = df_f.loc[f_date, col]
                data = data.append(info, ignore_index=True)
                current_date = next_date
            if data.shape[0] > 0:
                data.to_sql(
                    name=stock,
                    con=con,
                    if_exists='replace',
                    index=False
                )
                con.commit()
        except Exception as e:
            print(stock)
            print(repr(e))
    cur.close()
    con.close()
    print(str(i) + ' done')
    return None
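# The feature code above leans on several small helpers defined elsewhere in the
# repo (change_month, cal_return, d_var/u_var, corrcoef, SO). Hedged sketches of
# their assumed contracts, inferred from how they are called rather than copied
# from the originals; the _sketch suffix marks them as hypothetical stand-ins:
def change_month_sketch(date, k):
    """Shift a 'YYYYMMDD' string by k months (k may be negative)."""
    y, m = int(date[:4]), int(date[4:6]) + k
    y, m = y + (m - 1) // 12, (m - 1) % 12 + 1
    return '%04d%02d%s' % (y, m, date[6:])

def cal_return_sketch(date, a, b, df):
    """Adjusted-close return from month date+b to month date+a (b < a)."""
    start = df.index[df.index >= change_month_sketch(date, b)][0]
    end = df.index[df.index >= change_month_sketch(date, a)][0]
    return (df.loc[end, 'close'] * df.loc[end, 'adj_factor']) \
        / (df.loc[start, 'close'] * df.loc[start, 'adj_factor']) - 1

def d_var_sketch(pct_chg):
    """Downside semivariance: mean squared return over down days.
    u_var is assumed to mirror this over up days; corrcoef is assumed to be
    a NaN-tolerant Pearson correlation."""
    neg = pct_chg[pct_chg < 0]
    return np.nanmean(neg ** 2) if neg.shape[0] else np.nan

def SO_sketch(window):
    """Stochastic oscillator %K over an adjusted OHLC window."""
    high = np.nanmax(window['high'] * window['adj_factor'])
    low = np.nanmin(window['low'] * window['adj_factor'])
    close = window['close'].iloc[-1] * window['adj_factor'].iloc[-1]
    return 100 * (close - low) / (high - low)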
def forward(self, _x):
    x, _ = self.lstm(_x)  # _x is input, size (seq_len, batch, input_size)
    s, b, h = x.shape     # x is output, size (seq_len, batch, hidden_size)
    x = x.view(s * b, h)
    x = self.fc(x)
    x = x.view(s, b, -1)  # reshape back to (seq_len, batch, output_size)
    return x


model = LSTM_Regression(CHARACTER_FOR_TRAIN, 8, output_size=1, num_layers=2).cuda()
loss_function = nn.MSELoss().cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
industry_dict = get_data.load_obj('industry_dict')
data = get_data.get_from_sql(name='LSTM_data')
for industry in industry_dict:
    for stock in industry_dict[industry]['con_code']:
        data_train = torch.from_numpy(
            data[stock + '_data_train'].values.astype(np.float32).reshape(
                -1, 1, CHARACTER_FOR_TRAIN)).cuda()
        target_train = torch.from_numpy(
            data[stock + '_target_train'].values.astype(
                np.float32)[:, 0].reshape(-1, 1, 1)).cuda()
        if data_train.shape[0] == 0:
            continue
        for i in range(num_epochs):
            out = model(data_train)
            loss = loss_function(out, target_train)
            # Backward pass and parameter update.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
import sys
# The project packages live under this path; make it importable before the
# project imports below.
sys.path.append(r'D:\Code\Hybrid Model')

import math
import time
import sqlite3 as db
from multiprocessing import Pool

import numpy as np
import pandas as pd
import tushare as ts
from sklearn.model_selection import train_test_split

import functions.get_data as get_data
from Process.genBasicData import genData
from Utils import feaUtils
from Utils import pathUtils  # referenced below; assumed to sit in Utils alongside feaUtils

data = get_data.get_from_sql(stock_id='test_data_801010.SI', name='rf_data_industry')
train_data = genData(pathUtils.train_path)
test_data = genData(pathUtils.test_path)
param = {'max_depth': 3,
         'learning_rate': 0.01,
         'silent': 1,
         'objective': 'binary:logistic',
         'eval_metric': 'auc',
         'scale_pos_weight': 10,
         'subsample': 0.8,
         'min_child_weight': 1,
         'n_estimators': 1}
# features = [i for i in list(train_data.columns) if i not in ["ID", "y"]]
features = feaUtils.train_fea
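# A hedged sketch of how the param dict would feed the native xgboost API. The
# 'y' label column follows the commented-out feature line above; num_boost_round
# is an illustrative choice, and sklearn-style keys such as 'n_estimators' are
# simply ignored (with a warning) by xgb.train:
import xgboost as xgb

dtrain = xgb.DMatrix(train_data[features], label=train_data['y'])
dtest = xgb.DMatrix(test_data[features], label=test_data['y'])
booster = xgb.train(param, dtrain, num_boost_round=100,
                    evals=[(dtrain, 'train'), (dtest, 'eval')])
pred = booster.predict(dtest)  # probabilities under binary:logistic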
def regressor_test(self, test_x):
    b_row = test_x.shape[0]
    h = self.sigmoid(np.dot(test_x, self.w) + self.b[:b_row, :])
    result = np.dot(h, self.beta)
    return result

def classifisor_test(self, test_x):
    b_row = test_x.shape[0]
    h = self.sigmoid(np.dot(test_x, self.w) + self.b[:b_row, :])
    result = np.dot(h, self.beta)
    result = [item.tolist().index(max(item.tolist())) for item in result]
    return result


stdsc = StandardScaler()
stockdata = get_data.get_from_sql(minimum_data=500)
example = stockdata['600419.SH']
adjust_factor = ['low', 'close', 'open', 'high']
for item in adjust_factor:
    example[item + '_adj'] = example[item] * example['adj_factor']
example_1 = example[[
    'low_adj', 'close_adj', 'open_adj', 'high_adj', 'pct_chg', 'pe_ttm',
    'vol', 'turnover_rate', 'dv_ttm', 'float_share', 'turnover_rate_f',
    'pb', 'ps_ttm', 'volume_ratio'
]]
for i in range(1, 11):
    example_1['close_adj_last_' + str(i)] = example_1['close_adj'].shift(i)
    example_1['open_adj_last_' + str(i)] = example_1['open_adj'].shift(i)
    example_1['high_adj_last_' + str(i)] = example_1['high_adj'].shift(i)
    example_1['low_adj_last_' + str(i)] = example_1['low_adj'].shift(i)
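# Only the test-time methods of the (assumed) extreme-learning-machine class
# appear above; the training counterpart is not shown. A hedged sketch of the
# matching fit step for the regressor, following the standard ELM closed form
# (random w/b stay fixed, the output weights beta are solved by least squares);
# elm_fit_sketch is a hypothetical name:
def elm_fit_sketch(self, train_x, train_y):
    h = self.sigmoid(np.dot(train_x, self.w) + self.b[:train_x.shape[0], :])
    # beta = pinv(H) @ y, the usual ELM output-weight solution
    self.beta = np.dot(np.linalg.pinv(h), train_y)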
def prep_data_for_rf(stock_list, const):
    """
    Calculate the daily indicators for stocks in the given stock_list.

    Args:
        stock_list (list): The stocks that need to be calculated.
        const (int): Multiprocessing id.

    Returns:
        None.
    """
    con = db.connect('D:\\Data\\rf_data_research_' + str(const) + '.sqlite')
    for stock in stock_list:
        try:
            print(stock, const)
            df = get_data.get_from_sql(stock_id=stock)
            df_f = get_data.get_from_sql(stock_id=stock, name='data_finance')
            df.set_index('trade_date', inplace=True)
            df_f.set_index('ann_date', inplace=True)
            df_f = df_f[::-1]
            df_f = df_f.drop(['ts_code', 'end_date', 'dt_eps'], axis=1)
            # Align announcements with trading days and forward-fill so every
            # trading day sees the latest published financials.
            df_m = pd.merge(df_f, df, right_index=True, left_index=True, how='outer')
            df_m.fillna(method='ffill', inplace=True)
            df_m['pct_chg'] = (df_m['close'] / df_m['pre_close'] - 1) * 100
            for i in ['high', 'low', 'close', 'open']:
                df_m[i + '_adj'] = df_m[i] * df_m['adj_factor']
            df_m['pre_close_adj'] = df_m['close_adj'] / (df_m['pct_chg'] / 100 + 1)
            df_m['MACD'] = talib.MACD(df_m['close_adj'])[1]
            df_m['WILLR'] = talib.WILLR(df_m['high_adj'], df_m['low_adj'], df_m['close_adj'])
            df_m['AD'] = talib.AD(df_m['high_adj'], df_m['low_adj'], df_m['close_adj'], df_m['vol'])
            df_m['RSI_3'] = talib.RSI(df_m['close_adj'], timeperiod=3)
            df_m['RSI_14'] = talib.RSI(df_m['close_adj'], timeperiod=14)
            df_m['RSI_28'] = talib.RSI(df_m['close_adj'], timeperiod=28)
            df_m['CCI_3'] = talib.CCI(df_m['high_adj'], df_m['low_adj'], df_m['close_adj'], timeperiod=3)
            df_m['CCI_14'] = talib.CCI(df_m['high_adj'], df_m['low_adj'], df_m['close_adj'], timeperiod=14)
            df_m['CCI_28'] = talib.CCI(df_m['high_adj'], df_m['low_adj'], df_m['close_adj'], timeperiod=28)
            # Moving-average gaps, expressed relative to the current level.
            df_m['SMA5'] = (talib.MA(df_m['close_adj'], timeperiod=5) - df_m['close_adj']) / df_m['close_adj']
            df_m['SMA20'] = (talib.MA(df_m['close_adj'], timeperiod=20) - df_m['close_adj']) / df_m['close_adj']
            df_m['SMA60'] = (talib.MA(df_m['close_adj'], timeperiod=60) - df_m['close_adj']) / df_m['close_adj']
            df_m['SMA20_5'] = (talib.MA(df_m['close_adj'], timeperiod=20) - df_m['SMA5']) / df_m['SMA5']
            df_m['SMA60_5'] = (talib.MA(df_m['close_adj'], timeperiod=60) - df_m['SMA5']) / df_m['SMA5']
            df_m['SMA60_20'] = (talib.MA(df_m['close_adj'], timeperiod=60) - df_m['SMA20']) / df_m['SMA20']
            df_m['SMA5_tr'] = talib.MA(df_m['turnover_rate'], timeperiod=5)
            df_m['SMA20_tr'] = talib.MA(df_m['turnover_rate'], timeperiod=20)
            df_m['SMA60_tr'] = talib.MA(df_m['turnover_rate'], timeperiod=60)
            df_m['SMA250_tr'] = talib.MA(df_m['turnover_rate'], timeperiod=250)
            df_m['SMA5_tr_tr'] = (talib.MA(df_m['turnover_rate'], timeperiod=5)
                                  - df_m['turnover_rate']) / df_m['turnover_rate']
            df_m['SMA20_tr_tr'] = (talib.MA(df_m['turnover_rate'], timeperiod=20)
                                   - df_m['turnover_rate']) / df_m['turnover_rate']
            df_m['SMA60_tr_tr'] = (talib.MA(df_m['turnover_rate'], timeperiod=60)
                                   - df_m['turnover_rate']) / df_m['turnover_rate']
            df_m['SMA250_tr_tr'] = (talib.MA(df_m['turnover_rate'], timeperiod=250)
                                    - df_m['turnover_rate']) / df_m['turnover_rate']
            df_m['SMA20_SMA5_tr'] = (talib.MA(df_m['turnover_rate'], timeperiod=20)
                                     - df_m['SMA5_tr']) / df_m['SMA5_tr']
            df_m['SMA60_SMA5_tr'] = (talib.MA(df_m['turnover_rate'], timeperiod=60)
                                     - df_m['SMA5_tr']) / df_m['SMA5_tr']
            df_m['SMA250_SMA5_tr'] = (talib.MA(df_m['turnover_rate'], timeperiod=250)
                                      - df_m['SMA5_tr']) / df_m['SMA5_tr']
            df_m['SMA60_SMA20_tr'] = (talib.MA(df_m['turnover_rate'], timeperiod=60)
                                      - df_m['SMA20_tr']) / df_m['SMA20_tr']
            df_m['SMA250_SMA20_tr'] = (talib.MA(df_m['turnover_rate'], timeperiod=250)
                                       - df_m['SMA20_tr']) / df_m['SMA20_tr']
            df_m['corr_close_vol_20'] = talib.CORREL(df_m['close_adj'], df_m['vol'], timeperiod=20)
            df_m['corr_close_vol_60'] = talib.CORREL(df_m['close_adj'], df_m['vol'], timeperiod=60)
            df_m['pct_chg_STD_20'] = talib.STDDEV(df_m['pct_chg'], timeperiod=20)
            df_m['pct_chg_STD_60'] = talib.STDDEV(df_m['pct_chg'], timeperiod=60)
            df_m['pct_chg_STD_250'] = talib.STDDEV(df_m['pct_chg'], timeperiod=250)
            df_m['OBV'] = talib.OBV(df_m['close_adj'], df_m['vol'])
            # Momentum over ~1/1/3/12 trading months, normalised by the lagged close.
            df_m['MOM_5'] = talib.MOM(df_m['close_adj'], timeperiod=5) / df_m['close_adj'].shift(5)
            df_m['MOM_20'] = talib.MOM(df_m['close_adj'], timeperiod=20) / df_m['close_adj'].shift(20)
            df_m['MOM_60'] = talib.MOM(df_m['close_adj'], timeperiod=60) / df_m['close_adj'].shift(60)
            df_m['MOM_250'] = talib.MOM(df_m['close_adj'], timeperiod=250) / df_m['close_adj'].shift(250)
            df_m['t_1_t_6'] = (talib.MOM(df_m['close_adj'], timeperiod=100)
                               / df_m['close_adj'].shift(100)).shift(20)
            df_m['t_6_t_12'] = (talib.MOM(df_m['close_adj'], timeperiod=120)
                                / df_m['close_adj'].shift(120)).shift(120)
            df_m['close_kurt_20'] = df_m['pct_chg'].rolling(20).kurt()
            df_m['close_kurt_60'] = df_m['pct_chg'].rolling(60).kurt()
            df_m['close_skew_20'] = df_m['pct_chg'].rolling(20).skew()
            df_m['close_skew_60'] = df_m['pct_chg'].rolling(60).skew()
            df_m['open_close_5'] = (df_m['close_adj'] / df_m['open_adj'] - 1).rolling(5).mean()
            df_m['open_close_20'] = (df_m['close_adj'] / df_m['open_adj'] - 1).rolling(20).mean()
            df_m['open_close_60'] = (df_m['close_adj'] / df_m['open_adj'] - 1).rolling(60).mean()
            df_m['open_high_5'] = (df_m['high_adj'] / df_m['open_adj'] - 1).rolling(5).mean()
            df_m['open_high_20'] = (df_m['high_adj'] / df_m['open_adj'] - 1).rolling(20).mean()
            df_m['open_high_60'] = (df_m['high_adj'] / df_m['open_adj'] - 1).rolling(60).mean()
            df_m['close_low_5'] = (df_m['close_adj'] / df_m['low_adj'] - 1).rolling(5).mean()
            df_m['close_low_20'] = (df_m['close_adj'] / df_m['low_adj'] - 1).rolling(20).mean()
            df_m['close_low_60'] = (df_m['close_adj'] / df_m['low_adj'] - 1).rolling(60).mean()
            df_m['open_pre_close_5'] = (df_m['open_adj'] / df_m['pre_close_adj'] - 1).rolling(5).mean()
            df_m['open_pre_close_20'] = (df_m['open_adj'] / df_m['pre_close_adj'] - 1).rolling(20).mean()
            df_m['open_pre_close_60'] = (df_m['open_adj'] / df_m['pre_close_adj'] - 1).rolling(60).mean()
            df_m['trend_strength_20'] = df_m['MOM_20'] / np.abs(
                df_m['close_adj'] - df_m['pre_close_adj']).rolling(20).sum()
            # Forward returns used as labels downstream.
            df_m['return_1d'] = (df_m['close_adj'].shift(-1) - df_m['close_adj']) / df_m['close_adj']
            df_m['return_5d'] = (df_m['close_adj'].shift(-5) - df_m['close_adj']) / df_m['close_adj']
            df_m['return_20d'] = (df_m['close_adj'].shift(-20) - df_m['close_adj']) / df_m['close_adj']
            df_m.to_sql(
                name=stock,
                con=con,
                if_exists='replace',
                index_label='date'
            )
            con.commit()
        except Exception as e:
            print(stock)
            print(repr(e))
    con.close()
def gather_target_df():
    """
    Gather the return information from the downloaded sql and give each day
    a label.

    Returns:
        None.
    """
    con = db.connect('D:\\Data\\rf_data_research_target.sqlite')
    dict_industry = get_data.get_industry_stock_list()
    data_list = get_data.get_sql_key()
    data_f_list = get_data.get_sql_key(name='data_finance')
    data_l = set(data_list)
    for i in dict_industry:
        dict_industry[i] = list(dict_industry[i]['con_code'])
        data_l = data_l - set(dict_industry[i])
    dict_industry['None'] = list(data_l)
    # Calculate future returns and give each row an industry label.
    result = pd.DataFrame()
    for industry in dict_industry:
        for stock in dict_industry[industry]:
            if (stock not in data_f_list) or (stock not in data_list):
                continue
            df = get_data.get_from_sql(stock_id=stock)
            df_m = pd.DataFrame()
            df_m['close_adj'] = df['close'] * df['adj_factor']
            df_m['return_1d'] = (df_m['close_adj'].shift(-1) - df_m['close_adj']) / df_m['close_adj']
            df_m['return_5d'] = (df_m['close_adj'].shift(-5) - df_m['close_adj']) / df_m['close_adj']
            df_m['return_20d'] = (df_m['close_adj'].shift(-20) - df_m['close_adj']) / df_m['close_adj']
            df_m['tick'] = stock
            df_m = df_m[['return_1d', 'return_5d', 'return_20d', 'tick']]
            df_m['industry'] = industry
            df_m['date'] = df['trade_date']
            result = result.append(df_m)
    adjustment_day = get_adjustment_day()
    # Give each row an index (hs300 / zz500) label.
    ts.set_token('267addf63a14adcfc98067fc253fbd72a728461706acf9474c0dae29')
    pro = ts.pro_api()
    dict_300 = {}
    for i in range(14):
        dict_300[str(2007 + i) + '0101'] = list(
            pro.index_weight(index_code='399300.SZ',
                             start_date=str(2007 + i) + '0101',
                             end_date=str(2007 + i) + '0110')['con_code'].iloc[:300])
        if i != 13:
            dict_300[str(2007 + i) + '0701'] = list(
                pro.index_weight(index_code='399300.SZ',
                                 start_date=str(2007 + i) + '0625',
                                 end_date=str(2007 + i) + '0701')['con_code'].iloc[:300])
    dict_500 = {}
    for i in range(14):
        dict_500[str(2007 + i) + '0101'] = list(
            pro.index_weight(index_code='000905.SH',
                             start_date=str(2007 + i) + '0101',
                             end_date=str(2007 + i) + '0301')['con_code'].iloc[:500])
        if i != 13:
            dict_500[str(2007 + i) + '0701'] = list(
                pro.index_weight(index_code='000905.SH',
                                 start_date=str(2007 + i) + '0625',
                                 end_date=str(2007 + i) + '0710')['con_code'].iloc[:500])
    result['index'] = 'None'
    adjustment_day = pd.Series(adjustment_day)
    result.dropna(axis=0, inplace=True)
    for t in dict_300:
        if adjustment_day[adjustment_day < t].shape[0] == 0:
            start_date = '20070101'
            end_date = adjustment_day[adjustment_day > t].iloc[0]
        elif adjustment_day[adjustment_day > t].shape[0] == 0:
            start_date = adjustment_day.iloc[-1]
            end_date = '20200501'
        else:
            start_date = adjustment_day[adjustment_day < t].iloc[-1]
            end_date = adjustment_day[adjustment_day > t].iloc[0]
        temp = result[(result['date'] > start_date) & (result['date'] <= end_date)]
        for stock in dict_300[t]:
            member = temp[temp['tick'] == stock]
            result.loc[member.index, 'index'] = 'hs300'
        for stock in dict_500[t]:
            member = temp[temp['tick'] == stock]
            result.loc[member.index, 'index'] = 'zz500'
    result.to_sql(
        name='All_Data',
        con=con,
        if_exists='replace',
        index=False
    )
    con.commit()
    con.close()
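# get_adjustment_day is defined elsewhere. From how its output is compared
# against the '0101'/'0701' keys of dict_300/dict_500 above, it is assumed to
# return the semiannual index-rebalancing boundary dates as 'YYYYMMDD' strings.
# A hedged sketch of that assumed contract (the real function may pull the
# dates from the trading calendar instead):
def get_adjustment_day_sketch():
    days = []
    for year in range(2007, 2021):
        days += [str(year) + '0101', str(year) + '0701']
    return days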
def compute_discrete():
    data_dict = get_data.get_sql_key(name='rf_data_d')
    correct_rate = {}
    correct_rate[5] = []
    correct_rate[1] = []
    correct_rate[2] = []
    correct_rate[3] = []
    correct_rate[4] = []
    count = 1
    length = len(data_dict)
    unit = int(length / 100)
    for stock in data_dict:
        if count % unit == 0:
            print(count / unit)
        count += 1
        data = get_data.get_from_sql(name='rf_data_d', stock_id=stock)
        data = data[data['date'] > '20120101']
        if data.shape[0] < 1500:
            continue
        # Discretise the RSI family: oversold (<30) and strong momentum
        # (50-70) count as bullish, the other two bands as bearish.
        temp_data = data[['rsi_3', 'rsi_14', 'rsi_28']]
        temp_data[temp_data < 30] = 1
        temp_data[(temp_data >= 30) & (temp_data < 50)] = -1
        temp_data[(temp_data >= 50) & (temp_data < 70)] = 1
        temp_data[temp_data >= 70] = -1
        data[['rsi_3', 'rsi_14', 'rsi_28']] = temp_data
        index = [
            'rsi_3', 'ma20_price', 'return_last_1d', 'return_month',
            't_6_t_12', 'rsi_28', 'ma60_ma20', 'SO_k_d', 'rsi_14', 'obv',
            't_t_1', 't_12_t_18', 't_1_t_6', 'corr_vol_close_month',
            'ma120_price', 'ma120_ma40', 'ma60_price', 't_12_t_36',
            'ma5_price', 'corr_vol_close_year', 'ma20_ma5'
        ]
        data_x = data[index]
        data_x[data_x > 0] = 1
        data_x[data_x < 0] = -1
        data_y = (data['return_rate_1m'] > 0).astype(int)
        data_x[np.isnan(data_x)] = 0
        # print('*' * 50)
        # print(stock)
        for i in range(5, 0, -1):
            x_train = data_x.iloc[:-i * 200]
            y_train = data_y.iloc[:-i * 200]
            x_test = data_x.iloc[-i * 200:-(i - 1) * 200 - 1]
            y_test = data_y.iloc[-i * 200:-(i - 1) * 200 - 1]
            classifier = RandomForestClassifier(
                min_samples_leaf=100,
                n_estimators=200,
                random_state=0,
                n_jobs=-1,
                class_weight='balanced_subsample')
            classifier.fit(x_train, y_train)
            y_pre = classifier.predict(x_test)
            # print(i)
            # print(np.nanmean((y_pre == y_test)))
            correct_rate[i].append(np.nanmean((y_pre == y_test)))
        '''
        importances = list(classifier.feature_importances_)
        # List of tuples with variable and importance
        feature_importances = [(feature, round(importance, 2))
                               for feature, importance
                               in zip(list(x_train.columns), importances)]
        # Sort the feature importances by most important first
        feature_importances = sorted(feature_importances,
                                     key=lambda x: x[1], reverse=True)[:10]
        # Print out the feature and importances
        [print('Variable: {:20} Importance: {}'.format(*pair))
         for pair in feature_importances]
        '''
    for i in correct_rate:
        print(np.mean(correct_rate[i]))
def compute_talib():
    data_dict = get_data.get_sql_key(name='data')
    correct_rate = {}
    correct_rate[5] = []
    correct_rate[1] = []
    correct_rate[2] = []
    correct_rate[3] = []
    correct_rate[4] = []
    count = 1
    length = len(data_dict)
    unit = int(length / 100)
    for stock in data_dict:
        if count % unit == 0:
            print(count / unit)
        count += 1
        data = get_data.get_from_sql(stock_id=stock)
        # data = data[data['trade_date'] > '20120101']
        if data.shape[0] < 1500:
            continue
        adjusted_close = data['close'] * data['adj_factor']
        adjusted_high = data['high'] * data['adj_factor']
        adjusted_low = data['low'] * data['adj_factor']
        adjusted_open = data['open'] * data['adj_factor']
        data_x = pd.DataFrame()
        data_x['MACD'] = talib.MACD(adjusted_close)[1]
        data_x['RSI'] = talib.RSI(adjusted_close)
        data_x['WILLR'] = talib.WILLR(adjusted_high, adjusted_low, adjusted_close)
        data_y = (data['pct_chg'].shift(-1) > 0).astype(int)
        data_x[np.isnan(data_x)] = 0
        # print('*' * 50)
        # print(stock)
        for i in range(5, 0, -1):
            x_train = data_x.iloc[100:-i * 200]
            y_train = data_y.iloc[100:-i * 200]
            x_test = data_x.iloc[-i * 200:-(i - 1) * 200 - 1]
            y_test = data_y.iloc[-i * 200:-(i - 1) * 200 - 1]
            classifier = RandomForestClassifier(
                min_samples_leaf=100,
                n_estimators=200,
                random_state=0,
                n_jobs=-1,
                class_weight='balanced_subsample')
            classifier.fit(x_train, y_train)
            y_pre = classifier.predict(x_test)
            # print(i)
            # print(np.nanmean((y_pre == y_test)))
            correct_rate[i].append(np.nanmean((y_pre == y_test)))
        '''
        importances = list(classifier.feature_importances_)
        # List of tuples with variable and importance
        feature_importances = [(feature, round(importance, 2))
                               for feature, importance
                               in zip(list(x_train.columns), importances)]
        # Sort the feature importances by most important first
        feature_importances = sorted(feature_importances,
                                     key=lambda x: x[1], reverse=True)[:10]
        # Print out the feature and importances
        [print('Variable: {:20} Importance: {}'.format(*pair))
         for pair in feature_importances]
        '''
    for i in correct_rate:
        print(np.mean(correct_rate[i]))
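# compute_discrete and compute_talib share the same evaluation scheme: train on
# everything up to a cutoff, then test on five successive 200-day windows. A
# compact restatement of that split, usable for either feature set:
def walk_forward_splits(data_x, data_y, n_windows=5, window=200):
    for i in range(n_windows, 0, -1):
        x_train, y_train = data_x.iloc[:-i * window], data_y.iloc[:-i * window]
        x_test = data_x.iloc[-i * window:-(i - 1) * window - 1]
        y_test = data_y.iloc[-i * window:-(i - 1) * window - 1]
        yield x_train, y_train, x_test, y_test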
if __name__ == '__main__':
    industry_dict = get_data.load_obj('industry_dict')
    model = SimpleNet(232, 50, 20, 1)
    if torch.cuda.is_available():
        model = model.cuda()
    criterion = nn.MSELoss().cuda()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    epoch = 0
    count = 0
    for i in range(num_epochs):
        for industry in industry_dict:
            data_train = get_data.get_from_sql(
                stock_id=industry + '_data_train',
                name='CNN_industry').values.astype(np.float32)
            if data_train.shape[0] == 0:
                continue
            target_train = get_data.get_from_sql(
                stock_id=industry + '_target_train',
                name='CNN_industry').values[:, 0:1].astype(np.float32)
            characteristic = torch.from_numpy(data_train)
            label = torch.from_numpy(target_train)
            if characteristic.shape[0] < 2:
                continue
            if torch.cuda.is_available():
                characteristic = characteristic.cuda()
                label = label.cuda()
            else:
                characteristic = Variable(characteristic)
                label = Variable(label)
            # Forward pass and parameter update.
            out = model(characteristic)
            loss = criterion(out, label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
def calc_accr_rf(stock_list, d):
    """
    Create a random forest for each stock in the stock_list and output the
    result.

    Args:
        stock_list (list): Given stock list to operate on.
        d (int): Multiprocessing id.

    Returns:
        None.
    """
    warnings.filterwarnings("ignore")
    result_stock = pd.DataFrame()
    con = db.connect('D:\\Data\\rf_data_research_target.sqlite')
    target_list = [
        'industry_return_1d', 'industry_return_5d', 'industry_return_20d',
        'index_return_1d', 'index_return_5d', 'index_return_20d',
        'all_return_1d', 'all_return_5d', 'all_return_20d'
    ]
    target_list_abs = ['return_1d', 'return_5d', 'return_20d']
    train_end_date = '20180101'
    for stock in stock_list:
        try:
            data_train_x = get_data.get_from_sql(stock_id=stock, name='rf_data_research')
            data_train_x.rename(columns={'index': 'date'}, inplace=True)
            data_train_x.drop(['return_1d', 'return_5d', 'return_20d'], inplace=True, axis=1)
            data_train_y = pd.read_sql_query(
                sql="SELECT * FROM Processed_data WHERE tick = '" + stock + "'",
                con=con)
            data_set = pd.merge(data_train_x, data_train_y, left_on='date',
                                right_on='date', how='inner')
            data_set.fillna(0, inplace=True)
            temp_dict = {'tick': stock}
            temp_dict['index'] = data_train_y['index'].iloc[-1]
            temp_dict['industry'] = data_train_y['industry'].iloc[-1]
            for target in target_list:
                data_set_working = data_set[list(data_train_x.columns) + [target, 'return_1d']]
                data_set_working = data_set_working[data_set_working[target] != 0]
                train_data_set = data_set_working[data_set_working['date'] < train_end_date]
                if train_data_set.shape[0] < 500:
                    break
                test_data_set = data_set_working[data_set_working['date'] >= train_end_date]
                if test_data_set.shape[0] < 100:
                    break
                # Label cross-sectional winners (rank >= 0.7) as 1 and losers
                # (rank <= 0.3) as 0; drop the middle of the ranking.
                train_data_set['target'] = -1
                train_data_set.loc[train_data_set[train_data_set[target] >= 0.7].index, 'target'] = 1
                train_data_set.loc[train_data_set[train_data_set[target] <= 0.3].index, 'target'] = 0
                train_data_set = train_data_set[train_data_set['target'] >= 0]
                train_x = train_data_set[data_train_x.columns].drop('date', axis=1)
                train_x[np.isinf(train_x)] = 0
                train_y = train_data_set['target']
                test_x = test_data_set[data_train_x.columns].drop('date', axis=1)
                test_x[np.isinf(test_x)] = 0
                test_y = (test_data_set[target] > 0.5).astype(int)
                classifier = RandomForestClassifier(
                    min_samples_leaf=100,
                    n_estimators=200,
                    random_state=0,
                    n_jobs=-1,
                    class_weight='balanced_subsample')
                classifier.fit(train_x, train_y)
                y_pre = classifier.predict(test_x)
                temp_dict[target] = round(np.nanmean(y_pre == test_y), 4)
                prob_y = classifier.predict_proba(test_x)
                prob_y = prob_y[:, 1]
                # Push the most confident 30% of each side to hard 0/1 labels.
                if not (prob_y <= 0.5).all():
                    prob_y[prob_y >= np.percentile(prob_y[prob_y > 0.5], 70)] = 1
                if not (prob_y >= 0.5).all():
                    prob_y[prob_y <= np.percentile(prob_y[prob_y < 0.5], 30)] = 0
                temp_dict[target + '_enhanced'] = round(np.nanmean((prob_y == test_y)) / 0.3, 4)
                if 'return_1d' in target:
                    temp_dict[target + '_return'] = np.sum(y_pre * test_data_set['return_1d'])
                    temp_dict[target + '_return_compound'] = np.prod(
                        y_pre * test_data_set['return_1d'] + 1)
                    y_pre[y_pre == 0] = -1
                    temp_dict[target + '_return_ls'] = np.sum(y_pre * test_data_set['return_1d'])
                    temp_dict[target + '_return_compound_ls'] = np.prod(
                        y_pre * test_data_set['return_1d'] + 1)
                    stor_prob_y = prob_y.copy()
                    prob_y[prob_y < 1] = 0
                    temp_dict[target + '_enhanced_return'] = np.sum(
                        prob_y * test_data_set['return_1d'])
                    temp_dict[target + '_enhanced_return_compound'] = np.prod(
                        prob_y * test_data_set['return_1d'] + 1)
                    stor_prob_y[stor_prob_y == 0] = -1
                    stor_prob_y[(stor_prob_y >= 0) & (stor_prob_y < 1)] = 0
                    temp_dict[target + '_enhanced_return_ls'] = np.sum(
                        stor_prob_y * test_data_set['return_1d'])
                    temp_dict[target + '_enhanced_return_compound_ls'] = np.prod(
                        stor_prob_y * test_data_set['return_1d'] + 1)
            for target in target_list_abs:
                data_set_working = data_set[list(data_train_x.columns) + [target]]
                data_set_working = data_set_working[data_set_working[target] != 0]
                train_data_set = data_set_working[data_set_working['date'] < train_end_date]
                if train_data_set.shape[0] < 500:
                    break
                test_data_set = data_set_working[data_set_working['date'] >= train_end_date]
                if test_data_set.shape[0] < 100:
                    break
                # Absolute-return targets: label by the sign of the return.
                train_data_set['target'] = -1
                train_data_set.loc[train_data_set[train_data_set[target] >= 0].index, 'target'] = 1
                train_data_set.loc[train_data_set[train_data_set[target] <= 0].index, 'target'] = 0
                train_data_set = train_data_set[train_data_set['target'] >= 0]
                train_x = train_data_set[data_train_x.columns].drop('date', axis=1)
                train_x[np.isinf(train_x)] = 0
                train_y = train_data_set['target']
                test_x = test_data_set[data_train_x.columns].drop('date', axis=1)
                test_x[np.isinf(test_x)] = 0
                test_y = (test_data_set[target] >= 0).astype(int)
                classifier = RandomForestClassifier(
                    min_samples_leaf=200,
                    n_estimators=500,
                    random_state=0,
                    n_jobs=-1,
                    class_weight='balanced_subsample')
                classifier.fit(train_x, train_y)
                y_pre = classifier.predict(test_x)
                temp_dict[target] = round(np.nanmean(y_pre == test_y), 4)
                prob_y = classifier.predict_proba(test_x)
                prob_y = prob_y[:, 1]
                if not (prob_y <= 0.5).all():
                    prob_y[prob_y >= np.percentile(prob_y[prob_y > 0.5], 70)] = 1
                if not (prob_y >= 0.5).all():
                    prob_y[prob_y <= np.percentile(prob_y[prob_y < 0.5], 30)] = 0
                temp_dict[target + '_enhanced'] = round(np.nanmean((prob_y == test_y)) / 0.3, 4)
                if 'return_1d' in target:
                    temp_dict[target + '_return'] = np.sum(y_pre * test_data_set['return_1d'])
                    temp_dict[target + '_return_compound'] = np.prod(
                        y_pre * test_data_set['return_1d'] + 1)
                    y_pre[y_pre == 0] = -1
                    temp_dict[target + '_return_ls'] = np.sum(y_pre * test_data_set['return_1d'])
                    temp_dict[target + '_return_compound_ls'] = np.prod(
                        y_pre * test_data_set['return_1d'] + 1)
                    stor_prob_y = prob_y.copy()
                    prob_y[prob_y < 1] = 0
                    temp_dict[target + '_enhanced_return'] = np.sum(
                        prob_y * test_data_set['return_1d'])
                    temp_dict[target + '_enhanced_return_compound'] = np.prod(
                        prob_y * test_data_set['return_1d'] + 1)
                    stor_prob_y[stor_prob_y == 0] = -1
                    stor_prob_y[(stor_prob_y >= 0) & (stor_prob_y < 1)] = 0
                    temp_dict[target + '_enhanced_return_ls'] = np.sum(
                        stor_prob_y * test_data_set['return_1d'])
                    temp_dict[target + '_enhanced_return_compound_ls'] = np.prod(
                        stor_prob_y * test_data_set['return_1d'] + 1)
            result_stock = result_stock.append(temp_dict, ignore_index=True)
        except Exception as e:
            print(stock)
            print(repr(e))
    result_stock.to_csv('D:\\output\\rf_result' + str(d) + '.csv')
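# calc_accr_rf writes one CSV per worker id d. A hedged sketch of the driver
# that is assumed to dispatch it, mirroring the Pool fan-out used by
# prep_data_rf and main() above (run_calc_accr_rf_sketch is a hypothetical name):
def run_calc_accr_rf_sketch():
    stock_list = get_data.get_sql_key(name='rf_data_research')
    chunks = divide_list(stock_list, 24)  # same helper as prep_data_rf
    p = Pool()
    for d in range(24):
        p.apply_async(calc_accr_rf, args=(chunks[d], d,))
    p.close()
    p.join()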
def prep_data_for_rf(stock_list, dict_industry, calendar, i, dict_300, dict_500):
    con = db.connect('D:\\Data\\rf_temp_' + str(i) + '.sqlite')
    cur = con.cursor()
    data = pd.DataFrame()
    for stock in stock_list:
        try:
            df = get_data.get_from_sql(stock_id=stock)
            df_f = get_data.get_from_sql(stock_id=stock, name='data_finance')
            df.set_index('trade_date', inplace=True)
            df_f.set_index('ann_date', inplace=True)
            listed_date = df.index[0]
            usable = max(change_month(listed_date, 13), '20070101')
            if usable > '20200101':
                continue
            if 'turnover_rate_f' not in df.columns:
                df['turnover_rate_f'] = df['turnover_rate']
            first_date = calendar[calendar >= usable].iloc[0]
            current_date = first_date
            while (current_date <= '20200131') and (current_date < df.index[-1]):
                stock_date = df.index[df.index >= current_date][0]
                '''
                if stock_date != current_date:
                    # If the stock is not trading in the first trading day
                    # of the month, we don't collect its data and won't do
                    # trading on that stock this month.
                    continue
                '''
                last_trading_day = df.index[df.index < current_date][-1]
                last_year = change_month(current_date, -12)
                last_month = change_month(current_date, -1)
                last_year = df.index[df.index >= last_year][0]
                last_month = df.index[df.index >= last_month][0]
                last_year_df = df.loc[last_year: last_trading_day]
                last_month_df = df.loc[last_month: last_trading_day]
                f_date = df_f.index[df_f.index <= current_date][0]
                next_date = calendar[calendar >= change_month(current_date, 1)].iloc[0]
                price = df.loc[last_trading_day, 'close'] * df.loc[last_trading_day, 'adj_factor']
                if df_f.loc[f_date, 'rd_exp'] and df.loc[last_trading_day, 'total_mv'] \
                        and df.loc[last_trading_day, 'pe_ttm']:
                    rd_exp_to_earning = df_f.loc[f_date, 'rd_exp'] \
                        / df.loc[last_trading_day, 'total_mv'] \
                        * df.loc[last_trading_day, 'pe_ttm']
                else:
                    rd_exp_to_earning = np.nan
                if df_f.loc[f_date, 'fcfe'] and df.loc[last_trading_day, 'total_mv']:
                    fcfe = df_f.loc[f_date, 'fcfe'] / df.loc[last_trading_day, 'total_mv'] / 10000
                else:
                    fcfe = np.nan
                if df.index[df.index > next_date].shape[0] == 0:
                    break
                return_rate = cal_return(current_date, 1, 0, df)
                if last_month_df.shape[0] <= 10:
                    return_var_month_realized = np.nan
                    return_skew_month_realized = np.nan
                    return_kurt_month_realized = np.nan
                    avg_tr_last_month = np.nan
                    avg_tr_last_month_avg_tr_last_year = np.nan
                    return_var_month = np.nan
                    return_skew_month = np.nan
                    return_kurt_month = np.nan
                    return_d_var_month = np.nan
                    return_u_var_month = np.nan
                    return_d_var_var_month = np.nan
                    t_t_1 = np.nan
                    max_return_last_month = np.nan
                    corr_vol_close_month = np.nan
                    corr_vol_high_month = np.nan
                    corr_vol_open_month = np.nan
                    corr_vol_low_month = np.nan
                    high_open_month = np.nan
                    close_low_month = np.nan