Example #1
def prep_data_rf():
    """
    Calculate daily indicators and store them in rf_data_research.

    Multiprocessing is used in this function to speed up the process.

    Returns:
        None.

    """
    stock_list = get_data.get_sql_key()
    stock_list_f = get_data.get_sql_key(name='data_finance')
    stock_list = list(set(stock_list) & set(stock_list_f))
    stock_list_list = divide_list(stock_list, 24)
    print('Start')
    p = Pool()
    for i in range(24):
        p.apply_async(prep_data_for_rf, args=(stock_list_list[i], i,))
    p.close()
    p.join()
    con = db.connect('D:\\Data\\rf_data_research.sqlite')
    cur = con.cursor()
    for i in range(24):
        data_temp = get_data.get_from_sql(name='rf_data_research_'+str(i))
        for stock in data_temp:
            data_temp[stock].to_sql(
                name=stock,
                con=con,
                if_exists='replace',
                index=False
                )
        os.remove('D:\\Data\\rf_data_research_' + str(i) + '.sqlite')
    con.commit()
    cur.close()
    con.close()
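
Note: `divide_list` is called above but not defined anywhere in these examples. A minimal sketch consistent with its usage (split a list into n roughly equal chunks, one per worker) might look like the following; the project's actual implementation may differ:

def divide_list(lst, n):
    # Sketch, not the project's actual helper: split lst into n contiguous
    # chunks; the last chunk absorbs the remainder.
    unit = len(lst) // n
    chunks = [lst[i * unit: (i + 1) * unit] for i in range(n - 1)]
    chunks.append(lst[(n - 1) * unit:])
    return chunks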
Example #2
def give_relative_return():
    """
    Calculate daily relative return cross-sectionally based on the data gathered.

    Multiprocessing is used in this function to speed up the process.

    Returns:
        None.

    """
    result = get_data.get_from_sql(stock_id='All_Data', name='rf_data_research_target')
    result = result[result['date'] >= '20070101']
    result = result.sort_values(by=['date']).reset_index(drop=True)
    sub_result = {}
    cutting = list(result['date'].drop_duplicates().index)
    unit = int(len(cutting) / 24)
    for i in range(24):
        if i != 23:
            sub_result[i] = result.loc[cutting[i * unit]: cutting[(i + 1) * unit] - 1]
        else:
            sub_result[i] = result.loc[cutting[i * unit]:]
    dict_industry = get_data.get_industry_stock_list()
    industry_list = list(dict_industry.keys()) + ['None']
    # cal_relative_return(sub_result[0], industry_list, 0)  # leftover serial test run; the pool below already covers i == 0
    p = Pool()
    print('Start pooling')
    for i in range(24):
        p.apply_async(cal_relative_return, args=(sub_result[i], industry_list, i, ))
    p.close()
    p.join()
    print('Done pooling')
    final = pd.DataFrame()
    con = db.connect('D:\\Data\\rf_data_research_target.sqlite')
    for i in range(24):
        final = final.append(get_data.get_from_sql(stock_id='Processed_data_'+str(i), name='rf_data_research_target'))
    final.to_sql(
                name='Processed_data',
                con=con,
                if_exists='replace',
                index=False
                )
    con.commit()
    con.close()
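
The chunking above splits rows by trading day, not by raw row count, so a single day is never shared between two workers. In miniature, with illustrative values only:

import pandas as pd

demo = pd.DataFrame({'date': ['d1', 'd1', 'd2', 'd3', 'd3', 'd4'], 'x': range(6)})
cutting = list(demo['date'].drop_duplicates().index)  # [0, 2, 3, 5]: first row of each day
unit = int(len(cutting) / 2)                          # 2 chunks of 2 trading days each
part_0 = demo.loc[cutting[0]: cutting[1 * unit] - 1]  # rows 0-2: days d1 and d2, complete
part_1 = demo.loc[cutting[1 * unit]:]                 # rows 3-5: days d3 and d4, complete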
Example #3
def main():
    ts.set_token('267addf63a14adcfc98067fc253fbd72a728461706acf9474c0dae29')
    pro = ts.pro_api()
    dict_300 = {}
    for i in range(14):
        dict_300[str(2007+i)+'0101'] = list(pro.index_weight(index_code='399300.SZ',
                                                             start_date=str(2007+i)+'0101',
                                                             end_date=str(2007+i)+'0110')['con_code'].iloc[:300])
        dict_300[str(2007+i)+'0701'] = list(pro.index_weight(index_code='399300.SZ',
                                                             start_date=str(2007+i)+'0625',
                                                             end_date=str(2007+i)+'0701')['con_code'].iloc[:300])
    dict_500 = {}
    for i in range(14):
        dict_500[str(2007+i)+'0101'] = list(pro.index_weight(index_code='000905.SH',
                                                             start_date=str(2007+i)+'0101',
                                                             end_date=str(2007+i)+'0201')['con_code'].iloc[:500])
        dict_500[str(2007+i)+'0701'] = list(pro.index_weight(index_code='000905.SH',
                                                             start_date=str(2007+i)+'0625',
                                                             end_date=str(2007+i)+'0710')['con_code'].iloc[:500])
    calendar = pro.trade_cal(exchange='')
    calendar = calendar[calendar['is_open'] == 1]['cal_date']
    dict_industry = get_data.get_industry_stock_list()
    stock_list = get_data.get_sql_key()
    # prep_data_for_rf(stock_list, dict_industry, calendar, 1, dict_300, dict_500)
    stock_list_list = []
    length = int(len(stock_list) / 24)
    for i in range(24):
        if i == 23:
            stock_list_list.append(stock_list[i*length:])
        else:
            stock_list_list.append(stock_list[i*length: (i+1)*length])
    p = Pool()
    for i in range(24):
        p.apply_async(prep_data_for_rf, args=(stock_list_list[i], dict_industry, calendar, i, dict_300, dict_500, ))
    p.close()
    p.join()
    data = {}
    for i in range(24):
        data_temp = get_data.get_from_sql(name='rf_temp_' + str(i))
        data = {**data, **data_temp}
    con = db.connect('D:\\Data\\rf_data_d.sqlite')
    cur = con.cursor()
    for stock in data:
        data[stock].to_sql(
            name=stock,
            con=con,
            if_exists='replace',
            index=False
            )
    con.commit()
    cur.close()
    con.close()
    return None
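
`stock_to_cat` (used in Example #4) consumes the semiannual snapshots built here, keyed by 'YYYYMMDD' strings. Its implementation is not shown in these examples; a plausible sketch of the date-keyed lookup it has to perform, offered purely as an assumption:

def stock_to_cat(stock, date, dict_300, dict_500):
    # Sketch, not the original: find the most recent snapshot key on or
    # before `date` ('YYYYMMDD' strings sort chronologically) and label the stock.
    keys = sorted(k for k in dict_300 if k <= date)
    if not keys:
        return 'None'
    key = keys[-1]
    if stock in dict_300[key]:
        return 'hs300'
    if stock in dict_500.get(key, []):
        return 'zz500'
    return 'None'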
Example #4
def prep_data_for_rf(stock_list, dict_industry, calendar, i, dict_300, dict_500):
    con = db.connect('D:\\Data\\rf_temp_'+str(i)+'.sqlite')
    cur = con.cursor()
    for stock in stock_list:
        try:
            df = get_data.get_from_sql(stock_id=stock)
            df_f = get_data.get_from_sql(stock_id=stock, name='data_finance')
            df.set_index('trade_date', inplace=True)
            df_f.set_index('ann_date', inplace=True)
            listed_date = df.index[0]
            usable = max(change_month(listed_date, 13), '20070101')
            if (usable > '20200101') or (usable > str(df.index[-1])):
                continue
            if 'turnover_rate_f' not in df.columns:
                df['turnover_rate_f'] = df['turnover_rate']
            first_date = df.index[df.index >= usable][0]
            current_date = first_date
            df['pct_chg'] = (df['close'] / df['pre_close'] - 1) * 100
            data = pd.DataFrame()
            while (current_date <= '20200101') and (current_date < df.index[-30]):
                # stock_date = df.index[df.index >= current_date][0]
                '''
                if stock_date != current_date:
                    # If the stock is not trading in the first trading day of the month,
                    # we don't collect its data and won't do trading on that stock this month.
                    continue
                '''
                last_trading_day = df.index[df.index < current_date][-1]
                existing_data = df.loc[: last_trading_day]
                last_year = change_month(current_date, -12)
                last_month = change_month(current_date, -1)
                last_year = df.index[df.index >= last_year][0]
                last_month = df.index[df.index >= last_month][0]
                last_year_df = df.loc[last_year: last_trading_day]
                last_month_df = df.loc[last_month: last_trading_day]
                f_date = df_f.index[df_f.index <= current_date][0]
                next_date = df.index[df.index > current_date][0]
                next_5d = df.index[df.index > current_date][4]
                last_5d = df.index[df.index < current_date][-5]
            
                price = df.loc[last_trading_day, 'close'] * df.loc[last_trading_day, 'adj_factor']
                if df_f.loc[f_date, 'rd_exp'] and df.loc[last_trading_day, 'total_mv'] and df.loc[last_trading_day, 'pe_ttm']:
                    rd_exp_to_earning = df_f.loc[f_date, 'rd_exp']\
                        / df.loc[last_trading_day, 'total_mv']\
                        * df.loc[last_trading_day, 'pe_ttm']
                else:
                    rd_exp_to_earning = np.nan
                if df_f.loc[f_date, 'fcfe'] and df.loc[last_trading_day, 'total_mv']:
                    fcfe = df_f.loc[f_date, 'fcfe'] / df.loc[last_trading_day, 'total_mv'] / 10000
                else:
                    fcfe = np.nan
            
                if df.index[df.index > next_date].shape[0] == 0:
                    break
                return_rate_1d = df.loc[next_date, 'close'] / df.loc[next_date, 'pre_close'] - 1
                return_rate_5d = df.loc[next_5d, 'close'] * df.loc[next_5d, 'adj_factor'] / (df.loc[current_date, 'close'] * df.loc[current_date, 'adj_factor']) - 1
                return_rate_1m = cal_return(current_date, 1, 0, df)
            
                return_last_1d = df.loc[last_trading_day, 'close'] / df.loc[last_trading_day, 'pre_close'] - 1
                return_last_5d = price / (df.loc[last_5d, 'close'] * df.loc[last_5d, 'adj_factor']) - 1
            
                if np.mean(np.abs(existing_data['close'][-3:] - existing_data['pre_close'][-3:])) == 0:
                    rsi_3 = 100
                else:
                    rsi_3 = 100 * np.mean(existing_data['adj_factor'][-3:] * np.maximum(existing_data['close'][-3:] - existing_data['pre_close'][-3:], 0)) / np.mean(existing_data['adj_factor'][-3:] * np.abs(existing_data['close'][-3:] - existing_data['pre_close'][-3:]))
                if rsi_3 > 70:
                    rsi_3_adj = 50 - rsi_3
                elif rsi_3 > 50:
                    rsi_3_adj = rsi_3 - 50
                elif rsi_3 > 30:
                    rsi_3_adj = 30 - rsi_3 
                else:
                    rsi_3_adj = 20 + rsi_3 
            
                if np.mean(np.abs(existing_data['close'][-14:] - existing_data['pre_close'][-14:])) == 0:
                    rsi_14 = 100
                else:
                    rsi_14 = 100 * np.mean(existing_data['adj_factor'][-14:] * np.maximum(existing_data['close'][-14:] - existing_data['pre_close'][-14:], 0)) / np.mean(existing_data['adj_factor'][-14:] * np.abs(existing_data['close'][-14:] - existing_data['pre_close'][-14:]))
                if rsi_14 > 70:
                    rsi_14_adj = 50 - rsi_14
                elif rsi_14 > 50:
                    rsi_14_adj = rsi_14 - 50
                elif rsi_14 > 30:
                    rsi_14_adj = 30 - rsi_14 
                else:
                    rsi_14_adj = 20 + rsi_14 
            
                if np.mean(np.abs(existing_data['close'][-28:] - existing_data['pre_close'][-28:])) == 0:
                    # guard the zero-range case, matching rsi_3 and rsi_14 above
                    rsi_28 = 100
                else:
                    rsi_28 = 100 * np.mean(existing_data['adj_factor'][-28:] * np.maximum(existing_data['close'][-28:] - existing_data['pre_close'][-28:], 0)) / np.mean(existing_data['adj_factor'][-28:] * np.abs(existing_data['close'][-28:] - existing_data['pre_close'][-28:]))
                if rsi_28 > 70:
                    rsi_28_adj = 50 - rsi_28
                elif rsi_28 > 50:
                    rsi_28_adj = rsi_28 - 50
                elif rsi_28 > 30:
                    rsi_28_adj = 30 - rsi_28 
                else:
                    rsi_28_adj = 20 + rsi_28 
            
                obv = np.sum(np.sign(last_month_df['close'] - last_month_df['pre_close']) * last_month_df['vol'])
            
                if last_month_df.shape[0] <= 10:
                    return_var_month_realized = np.nan
                    return_skew_month_realized = np.nan
                    return_kurt_month_realized = np.nan
                    avg_tr_last_month = np.nan
                    avg_tr_last_month_avg_tr_last_year = np.nan
                    return_var_month = np.nan
                    return_skew_month = np.nan
                    return_kurt_month = np.nan
                    return_d_var_month = np.nan
                    return_u_var_month = np.nan
                    return_d_var_var_month = np.nan
                    t_t_1 = np.nan
                    max_return_last_month = np.nan
                    corr_vol_close_month = np.nan
                    corr_vol_high_month = np.nan
                    corr_vol_open_month = np.nan
                    corr_vol_low_month = np.nan
                    high_open_month = np.nan
                    close_low_month = np.nan
                    trend_strength_month = np.nan
                else:
                    if np.isnan(last_month_df['turnover_rate_f']).all():
                        # fill the column (the original `.loc['turnover_rate_f']` would create a row instead)
                        last_month_df['turnover_rate_f'] = last_month_df['turnover_rate']
                    return_var_month_realized = np.nanmean(last_month_df['pct_chg'] ** 2)
                    return_skew_month_realized = np.nanmean(last_month_df['pct_chg'] ** 3)\
                        / (return_var_month_realized ** 1.5)
                    return_kurt_month_realized = np.nanmean(last_month_df['pct_chg'] ** 4)\
                        / (return_var_month_realized ** 2)
                    avg_tr_last_month = np.nanmean(last_month_df['turnover_rate_f'])
                    avg_tr_last_month_avg_tr_last_year = np.nanmean(
                        last_month_df['turnover_rate_f']) / np.nanmean(last_year_df['turnover_rate_f'])
                    return_var_month = last_month_df['pct_chg'].var()
                    return_skew_month = last_month_df['pct_chg'].skew()
                    return_kurt_month = last_month_df['pct_chg'].kurt()
                    return_d_var_month = d_var(last_month_df['pct_chg'])
                    return_u_var_month = u_var(last_month_df['pct_chg'])
                    return_d_var_var_month = return_d_var_month / return_var_month
                    t_t_1 = cal_return(current_date, 0, -1, df)
                    max_return_last_month = np.nanmax(last_month_df['pct_chg'])
                    corr_vol_close_month = corrcoef(last_month_df['vol'],
                                                    last_month_df['adj_factor'] * last_month_df['close'])
                    corr_vol_high_month = corrcoef(last_month_df['vol'],
                                                   last_month_df['adj_factor'] * last_month_df['high'])
                    corr_vol_open_month = corrcoef(last_month_df['vol'],
                                                   last_month_df['adj_factor'] * last_month_df['open'])
                    corr_vol_low_month = corrcoef(last_month_df['vol'],
                                                  last_month_df['adj_factor'] * last_month_df['low'])
                    high_open_month = np.nanmean(last_month_df['high'] / last_month_df['open'])
                    close_low_month = np.nanmean(last_month_df['close'] / last_month_df['low'])
                    trend_strength_month = (last_month_df['close'][-1] - last_month_df['pre_close'][0])\
                        / np.nansum(np.abs(last_month_df['close'] - last_month_df['pre_close']))
                if last_year_df.shape[0] <= 20:
                    return_var_year_realized = np.nan
                    return_skew_year_realized = np.nan
                    return_kurt_year_realized = np.nan
                    return_var_year = np.nan
                    return_skew_year = np.nan
                    return_kurt_year = np.nan
                    return_d_var_year = np.nan
                    return_u_var_year = np.nan
                    return_d_var_var_year = np.nan
                    std_tr_last_year = np.nan
                    avg_abs_return_tr_last_year = np.nan
                    close_last_year_high = np.nan
                    max_return_last_year = np.nan
                    corr_vol_close_year = np.nan
                    corr_vol_high_year = np.nan
                    corr_vol_open_year = np.nan
                    corr_vol_low_year = np.nan
                    high_open_year = np.nan
                    close_low_year = np.nan
                    trend_strength_year = np.nan
                    # default every moving-average and stochastic feature so the
                    # info dict below never references an unbound name on this branch
                    ma5_price = np.nan
                    ma20_price = np.nan
                    ma20_ma5 = np.nan
                    ma60_price = np.nan
                    ma60_ma20 = np.nan
                    ma120_price = np.nan
                    ma120_ma40 = np.nan
                    SO_k = np.nan
                    SO_d = np.nan
                    SO_k_d = np.nan
                else:
                    if np.isnan(last_year_df['turnover_rate_f']).all():
                        # fill the column (the original `.loc['turnover_rate_f']` would create a row instead)
                        last_year_df['turnover_rate_f'] = last_year_df['turnover_rate']
                    return_var_year_realized = np.nanmean(last_year_df['pct_chg'] ** 2)
                    return_skew_year_realized = np.nanmean(last_year_df['pct_chg'] ** 3)\
                        / (return_var_year_realized ** 1.5)
                    return_kurt_year_realized = np.nanmean(last_year_df['pct_chg'] ** 4)\
                        / (return_var_year_realized ** 2)
                    return_var_year = last_year_df['pct_chg'].var()
                    return_skew_year = last_year_df['pct_chg'].skew()
                    return_kurt_year = last_year_df['pct_chg'].kurt()
                    return_d_var_year = d_var(last_year_df['pct_chg'])
                    return_u_var_year = u_var(last_year_df['pct_chg'])
                    return_d_var_var_year = return_d_var_year / return_var_year
                    std_tr_last_year = np.nanstd(last_year_df['turnover_rate_f'])
                    avg_abs_return_tr_last_year = np.nanmean(np.abs(last_year_df['pct_chg'])
                                                             / last_year_df['turnover_rate_f'])
                    close_last_year_high = df.loc[last_trading_day, 'close']\
                        * df.loc[last_trading_day, 'adj_factor']\
                        / np.nanmax(last_year_df['high'] * last_year_df['adj_factor'])
                    max_return_last_year = np.nanmax(last_year_df['pct_chg'])
                    corr_vol_close_year = corrcoef(last_year_df['vol'],
                                                   last_year_df['adj_factor'] * last_year_df['close'])
                    corr_vol_high_year = corrcoef(last_year_df['vol'],
                                                  last_year_df['adj_factor'] * last_year_df['high'])
                    corr_vol_open_year = corrcoef(last_year_df['vol'],
                                                  last_year_df['adj_factor'] * last_year_df['open'])
                    corr_vol_low_year = corrcoef(last_year_df['vol'],
                                                 last_year_df['adj_factor'] * last_year_df['low'])
                    high_open_year = np.nanmean(last_year_df['high'] / last_year_df['open'])
                    close_low_year = np.nanmean(last_year_df['close'] / last_year_df['low'])
                    trend_strength_year = (last_year_df['close'][-1] - last_year_df['pre_close'][0])\
                        / np.nansum(np.abs(last_year_df['close'] - last_year_df['pre_close']))
                    ma5_price = (np.nanmean(last_year_df['close'][-5:] * last_year_df['adj_factor'][-5:])
                                  - price)\
                        / price    
                    ma20_price = (np.nanmean(last_year_df['close'][-20:] * last_year_df['adj_factor'][-20:])
                                  - price)\
                        / price
                    ma20_ma5 = (np.nanmean(last_year_df['close'][-20:] * last_year_df['adj_factor'][-20:])
                                - np.nanmean(last_year_df['close'][-5:] * last_year_df['adj_factor'][-5:]))\
                        / price
                    SO_k = SO(last_year_df.iloc[-20:])
                    if last_year_df.shape[0] > 120:
                        ma120_price = (np.nanmean(last_year_df['close'][-120:] * last_year_df['adj_factor'][-120:])
                                       - price)\
                            / price
                        ma120_ma40 = (np.nanmean(last_year_df['close'][-120:] * last_year_df['adj_factor'][-120:])
                                      - np.nanmean(last_year_df['close'][-40:] * last_year_df['adj_factor'][-40:]))\
                            / price
                    else:
                        ma120_price = np.nan
                        ma120_ma40 = np.nan
                    if last_year_df.shape[0] > 60:
                        ma60_price = (np.nanmean(last_year_df['close'][-60:] * last_year_df['adj_factor'][-60:])
                                      - price)\
                            / price
                        ma60_ma20 = (np.nanmean(last_year_df['close'][-60:] * last_year_df['adj_factor'][-60:])
                                     - np.nanmean(last_year_df['close'][-20:] * last_year_df['adj_factor'][-20:]))\
                            / price
                        # %D: average %K over three consecutive 20-day windows
                        # (the original third slice [-60:-20] overlapped the first two)
                        SO_d = np.nanmean([SO(last_year_df.iloc[-20:]),
                                           SO(last_year_df.iloc[-40:-20]),
                                           SO(last_year_df.iloc[-60:-40])])
                        SO_k_d = SO_k - SO_d
                    else:
                        ma60_price = np.nan
                        ma60_ma20 = np.nan
                        SO_d = np.nan
                        SO_k_d = np.nan
            
                info = {
                    'tick': stock,
                    'industry': industry_stock(stock, dict_industry),
                    'stock_value_cat': stock_to_cat(stock, current_date, dict_300, dict_500),
                    'date': current_date,
                    'return_rate_1d': return_rate_1d,
                    'return_rate_5d': return_rate_5d,
                    'return_rate_1m': return_rate_1m,
                    'close': price,
                    # Trend factors
                    # Reversal
                    'return_last_1d': return_last_1d,
                    'return_last_5d': return_last_5d,
                    't_6_t_12': cal_return(current_date, -6, -12, df),
                    't_12_t_36': cal_return(current_date, -12, -36, df),
                    't_12_t_18': cal_return(current_date, -12, -18, df),
                    # Momentum
                    't_1_t_6': cal_return(current_date, -1, -6, df),
                    't_t_1': t_t_1,
                    'return_month': cal_hist_month(df, current_date),
                    'SO_k': SO_k,
                    'SO_d': SO_d,
                    'SO_k_d': SO_k_d,
                    'ma5_price': ma5_price,
                    'ma20_price': ma20_price,
                    'ma20_ma5': ma20_ma5,
                    'ma60_price': ma60_price,
                    'ma60_ma20': ma60_ma20,
                    'ma120_price': ma120_price,
                    'ma120_ma40': ma120_ma40,
                    # Liquidity
                    'std_tr_last_year': std_tr_last_year,
                    'avg_tr_last_month': avg_tr_last_month,
                    'avg_tr_last_month_avg_tr_last_year': avg_tr_last_month_avg_tr_last_year,
                    # Technical
                    'rsi_3': rsi_3,
                    'rsi_3_adj': rsi_3_adj,
                    'rsi_14': rsi_14,
                    'rsi_14_adj': rsi_14_adj,
                    'rsi_28': rsi_28,
                    'rsi_28_adj': rsi_28_adj,
                    'obv': obv,
                    'close_last_year_high': close_last_year_high,
                    'max_return_last_month': max_return_last_month,
                    'max_return_last_year': max_return_last_year,
                    'avg_abs_return_tr_last_year': avg_abs_return_tr_last_year,
                    'ln_mv_t': math.log(df.loc[last_trading_day, 'total_mv']),
                    'ln_mv_c': math.log(df.loc[last_trading_day, 'circ_mv']),
                    'mv_c_mv_t': df.loc[last_trading_day, 'circ_mv'] / df.loc[last_trading_day, 'total_mv'],
                    'return_var_month_realized': return_var_month_realized,
                    'return_skew_month_realized': return_skew_month_realized,
                    'return_kurt_month_realized': return_kurt_month_realized,
                    'return_var_month': return_var_month,
                    'return_skew_month': return_skew_month,
                    'return_kurt_month': return_kurt_month,
                    'return_d_var_month': return_d_var_month,
                    'return_u_var_month': return_u_var_month,
                    'return_d_var_var_month': return_d_var_var_month,
                    'return_var_year_realized': return_var_year_realized,
                    'return_skew_year_realized': return_skew_year_realized,
                    'return_kurt_year_realized': return_kurt_year_realized,
                    'return_var_year': return_var_year,
                    'return_skew_year': return_skew_year,
                    'return_kurt_year': return_kurt_year,
                    'return_d_var_year': return_d_var_year,
                    'return_u_var_year': return_u_var_year,
                    'return_d_var_var_year': return_d_var_var_year,
                    'corr_vol_close_month': corr_vol_close_month,
                    'corr_vol_high_month': corr_vol_high_month,
                    'corr_vol_open_month': corr_vol_open_month,
                    'corr_vol_low_month': corr_vol_low_month,
                    'high_open_month': high_open_month,
                    'close_low_month': close_low_month,
                    'trend_strength_month': trend_strength_month,
                    'corr_vol_close_year': corr_vol_close_year,
                    'corr_vol_high_year': corr_vol_high_year,
                    'corr_vol_open_year': corr_vol_open_year,
                    'corr_vol_low_year': corr_vol_low_year,
                    'high_open_year': high_open_year,
                    'close_low_year': close_low_year,
                    'trend_strength_year': trend_strength_year,      
            
                    # Value factors
                    'pe': df.loc[last_trading_day, 'pe_ttm'],
                    'ps': df.loc[last_trading_day, 'ps_ttm'],
                    'pb': df.loc[last_trading_day, 'pb'],
                    'current_ratio': df_f.loc[f_date, 'current_ratio'],
                    'quick_ratio': df_f.loc[f_date, 'quick_ratio'],
                    'cash_ratio': df_f.loc[f_date, 'cash_ratio'],
                    'inv_turn': df_f.loc[f_date, 'inv_turn'],
                    'ar_turn': df_f.loc[f_date, 'ar_turn'],
                    'ca_turn': df_f.loc[f_date, 'ca_turn'],
                    'fa_turn': df_f.loc[f_date, 'fa_turn'],
                    'assets_turn': df_f.loc[f_date, 'assets_turn'],
                    'fcfe': fcfe,
                    'tax_to_ebt': df_f.loc[f_date, 'tax_to_ebt'],
                    'ocf_to_or': df_f.loc[f_date, 'ocf_to_or'],
                    'ocf_to_opincome': df_f.loc[f_date, 'ocf_to_opincome'],
                    'ca_to_assets': df_f.loc[f_date, 'ca_to_assets'],
                    'tbassets_to_totalassets': df_f.loc[f_date, 'tbassets_to_totalassets'],
                    'int_to_talcap': df_f.loc[f_date, 'int_to_talcap'],
                    'currentdebt_to_debt': df_f.loc[f_date, 'currentdebt_to_debt'],
                    'longdeb_to_debt': df_f.loc[f_date, 'longdeb_to_debt'],
                    'ocf_to_shortdebt': df_f.loc[f_date, 'ocf_to_shortdebt'],
                    'debt_to_eqt': df_f.loc[f_date, 'debt_to_eqt'],
                    'tangibleasset_to_debt': df_f.loc[f_date, 'tangibleasset_to_debt'],
                    'tangasset_to_intdebt': df_f.loc[f_date, 'tangasset_to_intdebt'],
                    'tangibleasset_to_netdebt': df_f.loc[f_date, 'tangibleasset_to_netdebt'],
                    'ocf_to_debt': df_f.loc[f_date, 'ocf_to_debt'],
                    'ocf_to_interestdebt': df_f.loc[f_date, 'ocf_to_interestdebt'],
                    'longdebt_to_workingcapital': df_f.loc[f_date, 'longdebt_to_workingcapital'],
                    'ebitda_to_debt': df_f.loc[f_date, 'ebitda_to_debt'],
                    'cash_to_liqdebt': df_f.loc[f_date, 'cash_to_liqdebt'],
                    'cash_to_liqdebt_withinterest': df_f.loc[f_date, 'cash_to_liqdebt_withinterest'],
                    'q_netprofit_margin': df_f.loc[f_date, 'q_netprofit_margin'],
                    'q_gsprofit_margin': df_f.loc[f_date, 'q_gsprofit_margin'],
                    'q_exp_to_sales': df_f.loc[f_date, 'q_exp_to_sales'],
                    'q_profit_to_gr': df_f.loc[f_date, 'q_profit_to_gr'],
                    'q_saleexp_to_gr': df_f.loc[f_date, 'q_saleexp_to_gr'],
                    'q_adminexp_to_gr': df_f.loc[f_date, 'q_adminexp_to_gr'],
                    'q_finaexp_to_gr': df_f.loc[f_date, 'q_finaexp_to_gr'],
                    'q_impair_to_gr_ttm': df_f.loc[f_date, 'q_impair_to_gr_ttm'],
                    'q_gc_to_gr': df_f.loc[f_date, 'q_gc_to_gr'],
                    'q_op_to_gr': df_f.loc[f_date, 'q_op_to_gr'],
                    'q_roe': df_f.loc[f_date, 'q_roe'],
                    'q_dt_roe': df_f.loc[f_date, 'q_dt_roe'],
                    'q_npta': df_f.loc[f_date, 'q_npta'],
                    'q_opincome_to_ebt': df_f.loc[f_date, 'q_opincome_to_ebt'],
                    'q_investincome_to_ebt': df_f.loc[f_date, 'q_investincome_to_ebt'],
                    'q_dtprofit_to_profit': df_f.loc[f_date, 'q_dtprofit_to_profit'],
                    'q_salescash_to_or': df_f.loc[f_date, 'q_salescash_to_or'],
                    'q_ocf_to_sales': df_f.loc[f_date, 'q_ocf_to_sales'],
                    'q_ocf_to_or': df_f.loc[f_date, 'q_ocf_to_or'],
                    'ocf_yoy': df_f.loc[f_date, 'ocf_yoy'],
                    'roe_yoy': df_f.loc[f_date, 'roe_yoy'],
                    'q_gr_yoy': df_f.loc[f_date, 'q_gr_yoy'],
                    'q_sales_yoy': df_f.loc[f_date, 'q_sales_yoy'],
                    'q_op_yoy': df_f.loc[f_date, 'q_op_yoy'],
                    'q_profit_yoy': df_f.loc[f_date, 'q_profit_yoy'],
                    'q_netprofit_yoy': df_f.loc[f_date, 'q_netprofit_yoy'],
                    'equity_yoy': df_f.loc[f_date, 'equity_yoy'],
                    'rd_exp_to_earning': rd_exp_to_earning
                }
                data = data.append(info, ignore_index=True)
                current_date = next_date
            if data.shape[0] > 0:
                data.to_sql(
                    name=stock,
                    con=con,
                    if_exists='replace',
                    index=False
                    )
                con.commit()
        except Exception as e:
            print(stock)
            print(repr(e))
    cur.close()
    con.close()
    print(str(i)+" done")
    return None
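
Several helpers used above (`change_month`, `d_var`, `u_var`, `corrcoef`, `SO`) are defined elsewhere in the project. Minimal sketches consistent with how they are called here, offered as assumptions rather than the originals:

import numpy as np
import pandas as pd

def change_month(date, offset):
    # Sketch: shift a 'YYYYMMDD' string by `offset` months (ignores day-of-month clipping).
    months = int(date[:4]) * 12 + int(date[4:6]) - 1 + offset
    return '%04d%02d%s' % (months // 12, months % 12 + 1, date[6:])

def d_var(x):
    # Downside semivariance: mean squared negative return.
    return np.nanmean(np.minimum(np.asarray(x, dtype=float), 0) ** 2)

def u_var(x):
    # Upside semivariance: mean squared positive return.
    return np.nanmean(np.maximum(np.asarray(x, dtype=float), 0) ** 2)

def corrcoef(a, b):
    # Pairwise-NaN-tolerant Pearson correlation of two aligned series.
    return pd.Series(a).corr(pd.Series(b))

def SO(window_df):
    # Stochastic oscillator %K over the given window, on adjusted prices.
    high = np.nanmax(window_df['high'] * window_df['adj_factor'])
    low = np.nanmin(window_df['low'] * window_df['adj_factor'])
    close = (window_df['close'] * window_df['adj_factor']).iloc[-1]
    return 100 * (close - low) / (high - low)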
Example #5
    def forward(self, _x):
        x, _ = self.lstm(_x)  # _x is input, size (seq_len, batch, input_size)
        s, b, h = x.shape  # x is output, size (seq_len, batch, hidden_size)
        x = x.view(s * b, h)
        x = self.fc(x)
        x = x.view(s, b, -1)  # reshape back to (seq_len, batch, output_size)
        return x


model = LSTM_Regression(CHARACTER_FOR_TRAIN, 8, output_size=1,
                        num_layers=2).cuda()
loss_function = nn.MSELoss().cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

industry_dict = get_data.load_obj('industry_dict')
data = get_data.get_from_sql(name='LSTM_data')

for industry in industry_dict:
    for stock in industry_dict[industry]['con_code']:
        data_train = torch.from_numpy(
            data[stock + '_data_train'].values.astype(np.float32).reshape(
                -1, 1, CHARACTER_FOR_TRAIN)).cuda()
        target_train = torch.from_numpy(
            data[stock + '_target_train'].values.astype(
                np.float32)[:, 0].reshape(-1, 1, 1)).cuda()
        if data_train.shape[0] == 0:
            continue
        for i in range(num_epochs):
            out = model(data_train)
            loss = loss_function(out, target_train)
            # complete the standard update step (the excerpt breaks off here)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
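
`LSTM_Regression` is not defined in this excerpt. A minimal definition consistent with the `forward` shown above and the call `LSTM_Regression(CHARACTER_FOR_TRAIN, 8, output_size=1, num_layers=2)`; the constructor argument names are assumptions:

import torch.nn as nn

class LSTM_Regression(nn.Module):
    # Sketch reconstructed from usage, not the original class.
    def __init__(self, input_size, hidden_size, output_size=1, num_layers=2):
        super().__init__()
        # batch_first=False, so inputs are (seq_len, batch, input_size) as in forward()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, _x):
        x, _ = self.lstm(_x)
        s, b, h = x.shape
        x = self.fc(x.view(s * b, h))
        return x.view(s, b, -1)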
Example #6
import sys
sys.path.append(r'D:\Code\Hybrid Model')  # must run before the project imports below

from sklearn.model_selection import train_test_split
from Process.genBasicData import genData
from Utils import feaUtils, pathUtils  # pathUtils is assumed to live in Utils; the snippet uses it but never imports it
import sqlite3 as db
import time
from multiprocessing import Pool
import pandas as pd
import tushare as ts
import numpy as np
import functions.get_data as get_data
import math


data = get_data.get_from_sql(stock_id='test_data_801010.SI', name='rf_data_industry')
train_data = genData(pathUtils.train_path)
test_data = genData(pathUtils.test_path)

param = {'max_depth': 3,
         'learning_rate': 0.01,
         'silent': 1,
         'objective': 'binary:logistic',
         "eval_metric":"auc",
         "scale_pos_weight":10,
         "subsample":0.8,
         "min_child_weight":1,
         "n_estimators": 1}

# features = [i for i in list(train_data.columns) if i not in ["ID","y"]]
features = feaUtils.train_fea
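
The snippet stops after defining `param` and `features`. A minimal continuation sketch, assuming xgboost's sklearn wrapper and the binary label column 'y' implied by the commented-out line above (older xgboost versions accept the 'silent' key; newer ones only warn about it):

import xgboost as xgb

# Sketch continuation, not from the original snippet.
clf = xgb.XGBClassifier(**param)
clf.fit(train_data[features], train_data['y'])
pred = clf.predict_proba(test_data[features])[:, 1]  # probability of the positive class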
Example #7
    def regressor_test(self, test_x):
        b_row = test_x.shape[0]
        h = self.sigmoid(np.dot(test_x, self.w) + self.b[:b_row, :])
        result = np.dot(h, self.beta)
        return result

    def classifisor_test(self, test_x):
        b_row = test_x.shape[0]
        h = self.sigmoid(np.dot(test_x, self.w) + self.b[:b_row, :])
        result = np.dot(h, self.beta)
        result = [item.tolist().index(max(item.tolist())) for item in result]
        return result


stdsc = StandardScaler()
stockdata = get_data.get_from_sql(minimum_data=500)
example = stockdata['600419.SH']
adjust_factor = ['low', 'close', 'open', 'high']
for item in adjust_factor:
    example[item + '_adj'] = example[item] * example['adj_factor']
example_1 = example[[
    'low_adj', 'close_adj', 'open_adj', 'high_adj', 'pct_chg', 'pe_ttm', 'vol',
    'turnover_rate', 'dv_ttm', 'float_share', 'turnover_rate_f', 'pb',
    'ps_ttm', 'volume_ratio'
]]
for i in range(1, 11):
    example_1['close_adj_last_' + str(i)] = example_1['close_adj'].shift(i)
    example_1['open_adj_last_' + str(i)] = example_1['open_adj'].shift(i)
    example_1['high_adj_last_' + str(i)] = example_1['high_adj'].shift(i)
    example_1['low_adj_last_' + str(i)] = example_1['low_adj'].shift(i)
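
The class methods above rely on `self.sigmoid`, `self.w`, `self.b`, and `self.beta`, none of which are shown. Below is a standard sigmoid plus a one-shot Extreme Learning Machine fit that would produce such weights, as a sketch under those assumptions (the snippet's per-row slice `self.b[:b_row, :]` suggests a bias matrix with one row per sample; the broadcast bias below is the more common formulation):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def elm_fit(train_x, train_y, n_hidden, seed=0):
    # Sketch of ELM training: random hidden-layer weights, then
    # least-squares output weights via the pseudo-inverse.
    rng = np.random.default_rng(seed)
    w = rng.standard_normal((train_x.shape[1], n_hidden))
    b = rng.standard_normal((1, n_hidden))
    h = sigmoid(train_x @ w + b)
    beta = np.linalg.pinv(h) @ train_y
    return w, b, beta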
Example #8
def prep_data_for_rf(stock_list, const):
    """
    Calculate the daily indicators for stocks in the given stock_list.

    Args:
        stock_list (list): The stocks that need to be calculated.
        const (int): Multiprocessing worker id.

    Returns:
        None.

    """
    con = db.connect('D:\\Data\\rf_data_research_'+str(const)+'.sqlite')
    for stock in stock_list:
        try:
            print(stock, const)
            df = get_data.get_from_sql(stock_id=stock)
            df_f = get_data.get_from_sql(stock_id=stock, name='data_finance')
            df.set_index('trade_date', inplace=True)
            df_f.set_index('ann_date', inplace=True)
            df_f = df_f[::-1]
            df_f = df_f.drop(['ts_code', 'end_date', 'dt_eps'], axis=1)
            df_m = pd.merge(df_f, df, right_index = True, left_index=True, how='outer')
            df_m.fillna(method='ffill', inplace=True)
            df_m['pct_chg'] = (df_m['close'] / df_m['pre_close'] - 1) * 100
            for i in ['high', 'low', 'close', 'open']:
                df_m[i + '_adj'] = df_m[i] * df_m['adj_factor']
            df_m['pre_close_adj'] = df_m['close_adj'] / (df_m['pct_chg'] / 100 + 1)
            df_m['MACD'] = talib.MACD(df_m['close_adj'])[1]
            df_m['WILLR'] = talib.WILLR(df_m['high_adj'], df_m['low_adj'],
                                        df_m['close_adj'])
            df_m['AD'] = talib.AD(df_m['high_adj'], df_m['low_adj'], df_m['close_adj'],
                                  df_m['vol'])
            df_m['RSI_3'] = talib.RSI(df_m['close_adj'], timeperiod=3)
            df_m['RSI_14'] = talib.RSI(df_m['close_adj'], timeperiod=14)
            df_m['RSI_28'] = talib.RSI(df_m['close_adj'], timeperiod=28)
            df_m['CCI_3'] = talib.CCI(df_m['high_adj'],
                                      df_m['low_adj'],
                                      df_m['close_adj'],
                                      timeperiod=3)
            df_m['CCI_14'] = talib.CCI(df_m['high_adj'],
                                       df_m['low_adj'],
                                       df_m['close_adj'],
                                       timeperiod=14)
            df_m['CCI_28'] = talib.CCI(df_m['high_adj'],
                                       df_m['low_adj'],
                                       df_m['close_adj'],
                                       timeperiod=28)

            df_m['SMA5'] = (talib.MA(df_m['close_adj'], timeperiod=5) - df_m['close_adj']) / df_m['close_adj']
            df_m['SMA20'] = (talib.MA(df_m['close_adj'], timeperiod=20) - df_m['close_adj']) / df_m['close_adj']
            df_m['SMA60'] = (talib.MA(df_m['close_adj'], timeperiod=60) - df_m['close_adj']) / df_m['close_adj']
            df_m['SMA20_5'] = (talib.MA(df_m['close_adj'], timeperiod=20) - df_m['SMA5']) / df_m['SMA5']
            df_m['SMA60_5'] = (talib.MA(df_m['close_adj'], timeperiod=60) - df_m['SMA5']) / df_m['SMA5']
            df_m['SMA60_20'] = (talib.MA(df_m['close_adj'], timeperiod=60) - df_m['SMA20']) / df_m['SMA20']

            df_m['SMA5_tr'] = talib.MA(df_m['turnover_rate'], timeperiod=5)
            df_m['SMA20_tr'] = talib.MA(df_m['turnover_rate'], timeperiod=20)
            df_m['SMA60_tr'] = talib.MA(df_m['turnover_rate'], timeperiod=60)
            df_m['SMA250_tr'] = talib.MA(df_m['turnover_rate'], timeperiod=250)
            df_m['SMA5_tr_tr'] = (talib.MA(df_m['turnover_rate'], timeperiod=5)
                                  - df_m['turnover_rate']) / df_m['turnover_rate']
            df_m['SMA20_tr_tr'] = (talib.MA(df_m['turnover_rate'], timeperiod=20)
                                   - df_m['turnover_rate']) / df_m['turnover_rate']
            df_m['SMA60_tr_tr'] = (talib.MA(df_m['turnover_rate'], timeperiod=60)
                                   - df_m['turnover_rate']) / df_m['turnover_rate']
            df_m['SMA250_tr_tr'] = (talib.MA(df_m['turnover_rate'], timeperiod=250)
                                    - df_m['turnover_rate']) / df_m['turnover_rate']
            df_m['SMA20_SMA5_tr'] = (talib.MA(df_m['turnover_rate'], timeperiod=20)
                                     - df_m['SMA5_tr']) / df_m['SMA5_tr']
            df_m['SMA60_SMA5_tr'] = (talib.MA(df_m['turnover_rate'], timeperiod=60)
                                     - df_m['SMA5_tr']) / df_m['SMA5_tr']
            df_m['SMA250_SMA5_tr'] = (talib.MA(df_m['turnover_rate'], timeperiod=250)
                                      - df_m['SMA5_tr']) / df_m['SMA5_tr']
            df_m['SMA60_SMA20_tr'] = (talib.MA(df_m['turnover_rate'], timeperiod=60)
                                      - df_m['SMA20_tr']) / df_m['SMA20_tr']
            df_m['SMA250_SMA20_tr'] = (talib.MA(df_m['turnover_rate'], timeperiod=250)
                                       - df_m['SMA20_tr']) / df_m['SMA20_tr']

            df_m['corr_close_vol_20'] = talib.CORREL(df_m['close_adj'],
                                                     df_m['vol'],
                                                     timeperiod=20)
            df_m['corr_close_vol_60'] = talib.CORREL(df_m['close_adj'],
                                                     df_m['vol'],
                                                     timeperiod=60)
            df_m['pct_chg_STD_20'] = talib.STDDEV(df_m['pct_chg'], timeperiod=20)
            df_m['pct_chg_STD_60'] = talib.STDDEV(df_m['pct_chg'], timeperiod=60)
            df_m['pct_chg_STD_250'] = talib.STDDEV(df_m['pct_chg'], timeperiod=250)
            df_m['OBV'] = talib.OBV(df_m['close_adj'], df_m['vol'])  # OBV signs by price change; passing pct_chg was a slip
            df_m['MOM_5'] = talib.MOM(df_m['close_adj'], timeperiod=5) / df_m['close_adj'].shift(5)
            df_m['MOM_20'] = talib.MOM(df_m['close_adj'], timeperiod=20) / df_m['close_adj'].shift(20)
            df_m['MOM_60'] = talib.MOM(df_m['close_adj'], timeperiod=60) / df_m['close_adj'].shift(60)
            df_m['MOM_250'] = talib.MOM(df_m['close_adj'], timeperiod=250) / df_m['close_adj'].shift(250)
            df_m['t_1_t_6'] = (talib.MOM(df_m['close_adj'], timeperiod=100) / df_m['close_adj'].shift(100)).shift(20)
            df_m['t_6_t_12'] = (talib.MOM(df_m['close_adj'], timeperiod=120) / df_m['close_adj'].shift(120)).shift(120)

            df_m['close_kurt_20'] = df_m['pct_chg'].rolling(20).kurt()
            df_m['close_kurt_60'] = df_m['pct_chg'].rolling(60).kurt()
            df_m['close_skew_20'] = df_m['pct_chg'].rolling(20).skew()
            df_m['close_skew_60'] = df_m['pct_chg'].rolling(60).skew()

            df_m['open_close_5'] = (df_m['close_adj'] / df_m['open_adj'] -
                                    1).rolling(5).mean()
            df_m['open_close_20'] = (df_m['close_adj'] / df_m['open_adj'] -
                                     1).rolling(20).mean()
            df_m['open_close_60'] = (df_m['close_adj'] / df_m['open_adj'] -
                                     1).rolling(60).mean()
            df_m['open_high_5'] = (df_m['high_adj'] / df_m['open_adj'] -
                                   1).rolling(5).mean()
            df_m['open_high_20'] = (df_m['high_adj'] / df_m['open_adj'] -
                                    1).rolling(20).mean()
            df_m['open_high_60'] = (df_m['high_adj'] / df_m['open_adj'] -
                                    1).rolling(60).mean()  # was mislabeled 'open_close_60', clobbering the column set above
            df_m['close_low_5'] = (df_m['close_adj'] / df_m['low_adj'] -
                                   1).rolling(5).mean()  # divide by low_adj (the original reused open_adj)
            df_m['close_low_20'] = (df_m['close_adj'] / df_m['low_adj'] -
                                    1).rolling(20).mean()
            df_m['close_low_60'] = (df_m['close_adj'] / df_m['low_adj'] -
                                    1).rolling(60).mean()
            df_m['open_pre_close_5'] = (df_m['open_adj'] / df_m['pre_close_adj'] -
                                        1).rolling(5).mean()
            df_m['open_pre_close_20'] = (df_m['open_adj'] / df_m['pre_close_adj'] -
                                         1).rolling(20).mean()
            df_m['open_pre_close_60'] = (df_m['open_adj'] / df_m['pre_close_adj'] -
                                         1).rolling(60).mean()

            df_m['trend_strength_20'] = df_m['MOM_20'] / np.abs(
                df_m['close_adj'] - df_m['pre_close_adj']).rolling(20).sum()

            df_m['return_1d'] = (df_m['close_adj'].shift(-1) - df_m['close_adj']) / df_m['close_adj']
            df_m['return_5d'] = (df_m['close_adj'].shift(-5) - df_m['close_adj']) / df_m['close_adj']
            df_m['return_20d'] = (df_m['close_adj'].shift(-20) - df_m['close_adj']) / df_m['close_adj']

            df_m.to_sql(
                name=stock,
                con=con,
                if_exists='replace',
                index_label='date'
            )
            con.commit()
        except Exception as e:
            print(stock)
            print(repr(e))
    con.close()
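
The outer merge plus forward fill at the top of this function is what aligns each announcement-dated financial row with every later trading day. In miniature, with illustrative values only:

import pandas as pd

df_f = pd.DataFrame({'roe': [5.0]}, index=['20200331'])           # ann_date-indexed financials
df = pd.DataFrame({'close': [10, 11]}, index=['20200401', '20200402'])  # trade_date-indexed prices
df_m = pd.merge(df_f, df, left_index=True, right_index=True, how='outer')
df_m.fillna(method='ffill', inplace=True)  # both trading days now carry roe=5.0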
Example #9
def gather_target_df():
    """
    Gather the return information from the downloaded SQL data and give each row an industry and index label.

    Returns:
        None.

    """
    con = db.connect('D:\\Data\\rf_data_research_target.sqlite')

    dict_industry = get_data.get_industry_stock_list()
    data_list = get_data.get_sql_key()
    data_f_list = get_data.get_sql_key(name='data_finance')
    data_l = set(data_list)

    for i in dict_industry:
        dict_industry[i] = list(dict_industry[i]['con_code'])
        data_l = data_l - set(dict_industry[i])
    dict_industry['None'] = list(data_l)

    # calculate future return and give industry label
    result = pd.DataFrame()
    for industry in dict_industry:
        for stock in dict_industry[industry]:
            if (stock not in data_f_list) or (stock not in data_list):
                continue
            df = get_data.get_from_sql(stock_id=stock)
            df_m = pd.DataFrame()
            df_m['close_adj'] = df['close'] * df['adj_factor']
            df_m['return_1d'] = (df_m['close_adj'].shift(-1) - df_m['close_adj']) / df_m['close_adj']
            df_m['return_5d'] = (df_m['close_adj'].shift(-5) - df_m['close_adj']) / df_m['close_adj']
            df_m['return_20d'] = (df_m['close_adj'].shift(-20) - df_m['close_adj']) / df_m['close_adj']
            df_m['tick'] = stock
            df_m = df_m[['return_1d', 'return_5d', 'return_20d', 'tick']]
            df_m['industry'] = industry
            df_m['date'] = df['trade_date']
            result = result.append(df_m)
    adjustment_day = get_adjustment_day()

    # Give index label
    ts.set_token('267addf63a14adcfc98067fc253fbd72a728461706acf9474c0dae29')
    pro = ts.pro_api()
    dict_300 = {}
    for i in range(14):
        dict_300[str(2007+i)+'0101'] = list(pro.index_weight(index_code='399300.SZ',
                                                             start_date=str(2007+i)+'0101',
                                                             end_date=str(2007+i)+'0110')['con_code'].iloc[:300])
        if i != 13:
            dict_300[str(2007+i)+'0701'] = list(pro.index_weight(index_code='399300.SZ',
                                                                 start_date=str(2007+i)+'0625',
                                                                 end_date=str(2007+i)+'0701')['con_code'].iloc[:300])
    dict_500 = {}
    for i in range(14):
        dict_500[str(2007+i)+'0101'] = list(pro.index_weight(index_code='000905.SH',
                                                             start_date=str(2007+i)+'0101',
                                                             end_date=str(2007+i)+'0301')['con_code'].iloc[:500])
        if i != 13:
            dict_500[str(2007+i)+'0701'] = list(pro.index_weight(index_code='000905.SH',
                                                                 start_date=str(2007+i)+'0625',
                                                                 end_date=str(2007+i)+'0710')['con_code'].iloc[:500])
            
    result['index'] = 'None'
    adjustment_day = pd.Series(adjustment_day)
    result.dropna(axis=0, inplace=True)
    
    for t in dict_300:
        if adjustment_day[adjustment_day < t].shape[0] == 0:
            start_date = '20070101'
            end_date = adjustment_day[adjustment_day > t].iloc[0]
        elif adjustment_day[adjustment_day > t].shape[0] == 0:
            start_date = adjustment_day.iloc[-1]
            end_date = '20200501'
        else:
            start_date = adjustment_day[adjustment_day < t].iloc[-1]
            end_date = adjustment_day[adjustment_day > t].iloc[0]
        temp = result[(result['date'] > start_date) & (result['date'] <= end_date)]
        for stock in dict_300[t]:
            # select within the window without reassigning `temp` (the original
            # overwrote temp each iteration, leaving it empty after the first stock)
            result.loc[temp[temp['tick'] == stock].index, 'index'] = 'hs300'
        for stock in dict_500[t]:
            result.loc[temp[temp['tick'] == stock].index, 'index'] = 'zz500'

    result.to_sql(
        name='All_Data',
        con=con,
        if_exists='replace',
        index=False
    )
    con.commit()
    con.close()
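
`get_adjustment_day` is not shown in these examples. Given how its output is compared against the 'YYYYMMDD' snapshot keys above, a plausible sketch (an assumption, not the original) is simply the semiannual boundaries:

def get_adjustment_day():
    # Sketch: semiannual index-adjustment boundaries as 'YYYYMMDD' strings, 2007-2020.
    days = []
    for year in range(2007, 2021):
        days += [str(year) + '0101', str(year) + '0701']
    return days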
Example #10
def compute_discrete():
    data_dict = get_data.get_sql_key(name='rf_data_d')
    correct_rate = {}
    correct_rate[5] = []
    correct_rate[1] = []
    correct_rate[2] = []
    correct_rate[3] = []
    correct_rate[4] = []
    count = 1
    length = len(data_dict)
    unit = max(1, int(length / 100))  # avoid modulo-by-zero below when there are fewer than 100 keys
    for stock in data_dict:
        if count % unit == 0:
            print(count / unit)
        count += 1
        data = get_data.get_from_sql(name='rf_data_d', stock_id=stock)
        data = data[data['date'] > '20120101']
        if data.shape[0] < 1500:
            continue
        temp_data = data[['rsi_3', 'rsi_14', 'rsi_28']]
        temp_data[temp_data < 30] = 1
        temp_data[(temp_data >= 30) & (temp_data < 50)] = -1
        temp_data[(temp_data >= 50) & (temp_data < 70)] = 1
        temp_data[temp_data >= 70] = -1
        data[['rsi_3', 'rsi_14', 'rsi_28']] = temp_data
        index = [
            'rsi_3', 'ma20_price', 'return_last_1d', 'return_month',
            't_6_t_12', 'rsi_28', 'ma60_ma20', 'SO_k_d',
            'rsi_14', 'obv', 't_t_1', 't_12_t_18', 't_1_t_6',
            'corr_vol_close_month', 'ma120_price', 'ma120_ma40', 'ma60_price',
            't_12_t_36', 'ma5_price', 'corr_vol_close_year', 'ma20_ma5'
        ]  # 'return_month' appeared twice, which would duplicate the column in data_x
        data_x = data[index]
        data_x[data_x > 0] = 1
        data_x[data_x < 0] = -1
        data_y = (data['return_rate_1m'] > 0).astype(int)
        data_x[np.isnan(data_x)] = 0
        # print('*' * 50)
        # print(stock)
        for i in range(5, 0, -1):
            x_train = data_x.iloc[:-i * 200]
            y_train = data_y.iloc[:-i * 200]
            x_test = data_x.iloc[-i * 200:-(i - 1) * 200 - 1]
            y_test = data_y.iloc[-i * 200:-(i - 1) * 200 - 1]
            classifier = RandomForestClassifier(
                min_samples_leaf=100,
                n_estimators=200,
                random_state=0,
                n_jobs=-1,
                class_weight='balanced_subsample')
            classifier.fit(x_train, y_train)
            y_pre = classifier.predict(x_test)
            # print(i)
            # print(np.nanmean((y_pre == y_test)))
            correct_rate[i].append(np.nanmean((y_pre == y_test)))
            '''
            importances = list(classifier.feature_importances_)
    
            # List of tuples with variable and importance
            feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(list(x_train.columns), importances)]
    
            # Sort the feature importances by most important first
            feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)[:10]
    
            # Print out the feature and importances 
            [print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances]
            '''
    for i in correct_rate:
        print(np.mean(correct_rate[i]))
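
The four sequential RSI masks near the top of this function encode the bucket map <30 → 1, [30, 50) → -1, [50, 70) → 1, >=70 → -1 (the sequence is safe because the replacement values 1 and -1 fall below every later threshold). An equivalent formulation with pd.cut, for reference:

import numpy as np
import pandas as pd

rsi = pd.Series([10.0, 35.0, 55.0, 80.0])
buckets = pd.cut(rsi, bins=[-np.inf, 30, 50, 70, np.inf],
                 labels=[1, -1, 1, -1], right=False, ordered=False)
# buckets -> [1, -1, 1, -1], matching the masks' boundary behavior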
Example #11
def compute_talib():
    data_dict = get_data.get_sql_key(name='data')
    correct_rate = {}
    correct_rate[5] = []
    correct_rate[1] = []
    correct_rate[2] = []
    correct_rate[3] = []
    correct_rate[4] = []
    count = 1
    length = len(data_dict)
    unit = max(1, int(length / 100))  # avoid modulo-by-zero below when there are fewer than 100 keys
    for stock in data_dict:
        if count % unit == 0:
            print(count / unit)
        count += 1
        data = get_data.get_from_sql(stock_id=stock)
        # data = data[data['trade_date'] > '20120101']
        if data.shape[0] < 1500:
            continue
        adjusted_close = data['close'] * data['adj_factor']
        adjusted_high = data['high'] * data['adj_factor']
        adjusted_low = data['low'] * data['adj_factor']
        adjusted_open = data['open'] * data['adj_factor']
        data_x = pd.DataFrame()
        data_x['MACD'] = talib.MACD(adjusted_close)[1]
        data_x['RSI'] = talib.RSI(adjusted_close)
        data_x['WILLR'] = talib.WILLR(adjusted_high, adjusted_low,
                                      adjusted_close)  # WILLR takes high, low, close (not open)
        data_y = (data['pct_chg'].shift(-1) > 0).astype(int)
        data_x[np.isnan(data_x)] = 0
        # print('*' * 50)
        # print(stock)
        for i in range(5, 0, -1):
            x_train = data_x.iloc[100:-i * 200]
            y_train = data_y.iloc[100:-i * 200]
            x_test = data_x.iloc[-i * 200:-(i - 1) * 200 - 1]
            y_test = data_y.iloc[-i * 200:-(i - 1) * 200 - 1]
            classifier = RandomForestClassifier(
                min_samples_leaf=100,
                n_estimators=200,
                random_state=0,
                n_jobs=-1,
                class_weight='balanced_subsample')
            classifier.fit(x_train, y_train)
            y_pre = classifier.predict(x_test)
            # print(i)
            # print(np.nanmean((y_pre == y_test)))
            correct_rate[i].append(np.nanmean((y_pre == y_test)))
            '''
            importances = list(classifier.feature_importances_)
    
            # List of tuples with variable and importance
            feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(list(x_train.columns), importances)]
    
            # Sort the feature importances by most important first
            feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)[:10]
    
            # Print out the feature and importances 
            [print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances]
            '''
    for i in correct_rate:
        print(np.mean(correct_rate[i]))
Example #12
if __name__ == '__main__':
    industry_dict = get_data.load_obj('industry_dict')
    model = SimpleNet(232, 50, 20, 1)
    if torch.cuda.is_available():
        model = model.cuda()

    criterion = nn.MSELoss().cuda()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    epoch = 0
    count = 0

    for i in range(num_epochs):
        for industry in industry_dict:
            data_train = get_data.get_from_sql(
                stock_id=industry + '_data_train',
                name='CNN_industry').values.astype(np.float32)
            if data_train.shape[0] == 0:
                continue
            target_train = get_data.get_from_sql(
                stock_id=industry + '_target_train',
                name='CNN_industry').values[:, 0:1].astype(np.float32)
            characteristic = torch.from_numpy(data_train)
            label = torch.from_numpy(target_train)
            if characteristic.shape[0] < 2:
                continue
            if torch.cuda.is_available():
                characteristic = characteristic.cuda()
                label = label.cuda()
            else:
                characteristic = Variable(characteristic)
                label = Variable(label)
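
`SimpleNet` is not defined in this excerpt. From the call `SimpleNet(232, 50, 20, 1)`, a minimal three-layer MLP sketch (layer sizes taken from the call, everything else an assumption):

import torch.nn as nn

class SimpleNet(nn.Module):
    # Sketch reconstructed from usage, not the original class.
    def __init__(self, in_dim, n_hidden_1, n_hidden_2, out_dim):
        super().__init__()
        self.layer = nn.Sequential(
            nn.Linear(in_dim, n_hidden_1), nn.ReLU(),
            nn.Linear(n_hidden_1, n_hidden_2), nn.ReLU(),
            nn.Linear(n_hidden_2, out_dim))

    def forward(self, x):
        return self.layer(x)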
Example #13
def calc_accr_rf(stock_list, d):
    """
    Create a random forest for each stock in the stock_list and output the result.

    Args:
        stock_list (list): Given stock list to operate on.
        d (int): Multiprocessing id.

    Returns:
        None.

    """
    warnings.filterwarnings("ignore")
    result_stock = pd.DataFrame()
    con = db.connect('D:\\Data\\rf_data_research_target.sqlite')
    target_list = [
        'industry_return_1d', 'industry_return_5d', 'industry_return_20d',
        'index_return_1d', 'index_return_5d', 'index_return_20d',
        'all_return_1d', 'all_return_5d', 'all_return_20d'
    ]
    target_list_abs = ['return_1d', 'return_5d', 'return_20d']
    train_end_date = '20180101'

    for stock in stock_list:
        try:
            data_train_x = get_data.get_from_sql(stock_id=stock,
                                                 name='rf_data_research')
            data_train_x.rename(columns={'index': 'date'}, inplace=True)
            data_train_x.drop(['return_1d', 'return_5d', 'return_20d'],
                              inplace=True,
                              axis=1)
            data_train_y = pd.read_sql_query(
                sql="SELECT * FROM Processed_data WHERE tick = '" + stock +
                "'",
                con=con)
            data_set = pd.merge(data_train_x,
                                data_train_y,
                                left_on='date',
                                right_on='date',
                                how='inner')
            data_set.fillna(0, inplace=True)
            temp_dict = {'tick': stock}
            temp_dict['index'] = data_train_y['index'].iloc[-1]
            temp_dict['industry'] = data_train_y['industry'].iloc[-1]

            for target in target_list:
                data_set_working = data_set[list(data_train_x.columns) +
                                            [target, 'return_1d']]
                data_set_working = data_set_working[
                    data_set_working[target] != 0]

                train_data_set = data_set_working[
                    data_set_working['date'] < train_end_date]
                if train_data_set.shape[0] < 500:
                    break
                test_data_set = data_set_working[
                    data_set_working['date'] >= train_end_date]
                if test_data_set.shape[0] < 100:
                    break
                train_data_set['target'] = -1
                train_data_set.loc[train_data_set[
                    train_data_set[target] >= 0.7].index, 'target'] = 1
                train_data_set.loc[train_data_set[
                    train_data_set[target] <= 0.3].index, 'target'] = 0
                train_data_set = train_data_set[train_data_set['target'] >= 0]
                train_x = train_data_set[data_train_x.columns].drop('date',
                                                                    axis=1)
                train_x[np.isinf(train_x)] = 0
                train_y = train_data_set['target']

                test_x = test_data_set[data_train_x.columns].drop('date',
                                                                  axis=1)
                test_x[np.isinf(test_x)] = 0
                test_y = (test_data_set[target] > 0.5).astype(int)

                classifier = RandomForestClassifier(
                    min_samples_leaf=100,
                    n_estimators=200,
                    random_state=0,
                    n_jobs=-1,
                    class_weight='balanced_subsample')
                classifier.fit(train_x, train_y)
                y_pre = classifier.predict(test_x)
                temp_dict[target] = round(np.nanmean(y_pre == test_y), 4)

                prob_y = classifier.predict_proba(test_x)
                prob_y = prob_y[:, 1]
                if not (prob_y <= 0.5).all():
                    prob_y[
                        prob_y >= np.percentile(prob_y[prob_y > 0.5], 70)] = 1
                if not (prob_y >= 0.5).all():
                    prob_y[
                        prob_y <= np.percentile(prob_y[prob_y < 0.5], 30)] = 0
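                # Only the tail picks forced to exactly 0 or 1 above can
                # match test_y, so dividing by 0.3 rescales the raw hit
                # rate to the ~30% of samples the thresholds select.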
                temp_dict[target + '_enhanced'] = round(
                    np.nanmean((prob_y == test_y)) / 0.3, 4)

                if 'return_1d' in target:
                    temp_dict[target + '_return'] = np.sum(
                        y_pre * test_data_set['return_1d'])
                    temp_dict[target + '_return_compound'] = np.prod(
                        y_pre * test_data_set['return_1d'] + 1)
                    y_pre[y_pre == 0] = -1
                    temp_dict[target + '_return_ls'] = np.sum(
                        y_pre * test_data_set['return_1d'])
                    temp_dict[target + '_return_compound_ls'] = np.prod(
                        y_pre * test_data_set['return_1d'] + 1)

                    # prob_y will keep only high-confidence longs below;
                    # stor_prob_y keeps a copy so the low-confidence tail
                    # can also be shorted for the long-short variants.
                    stor_prob_y = prob_y.copy()
                    prob_y[prob_y < 1] = 0
                    temp_dict[target + '_enhanced_return'] = np.sum(
                        prob_y * test_data_set['return_1d'])
                    temp_dict[target + '_enhanced_return_compound'] = np.prod(
                        prob_y * test_data_set['return_1d'] + 1)
                    stor_prob_y[stor_prob_y == 0] = -1
                    stor_prob_y[(stor_prob_y >= 0) & (stor_prob_y < 1)] = 0
                    temp_dict[target + '_enhanced_return_ls'] = np.sum(
                        stor_prob_y * test_data_set['return_1d'])
                    temp_dict[target +
                              '_enhanced_return_compound_ls'] = np.prod(
                                  stor_prob_y * test_data_set['return_1d'] + 1)

            for target in target_list_abs:
                data_set_working = data_set[list(data_train_x.columns) +
                                            [target]]
                data_set_working = data_set_working[
                    data_set_working[target] != 0]

                train_data_set = data_set_working[
                    data_set_working['date'] < train_end_date]
                if train_data_set.shape[0] < 500:
                    break
                test_data_set = data_set_working[
                    data_set_working['date'] >= train_end_date]
                if test_data_set.shape[0] < 100:
                    break
                train_data_set['target'] = -1
                train_data_set.loc[
                    train_data_set[target] >= 0, 'target'] = 1
                train_data_set.loc[
                    train_data_set[target] <= 0, 'target'] = 0
                train_data_set = train_data_set[train_data_set['target'] >= 0]
                train_x = train_data_set[data_train_x.columns].drop('date',
                                                                    axis=1)
                train_x[np.isinf(train_x)] = 0
                train_y = train_data_set['target']

                test_x = test_data_set[data_train_x.columns].drop('date',
                                                                  axis=1)
                test_x[np.isinf(test_x)] = 0
                test_y = (test_data_set[target] >= 0).astype(int)

                classifier = RandomForestClassifier(
                    min_samples_leaf=200,
                    n_estimators=500,
                    random_state=0,
                    n_jobs=-1,
                    class_weight='balanced_subsample')
                classifier.fit(train_x, train_y)
                y_pre = classifier.predict(test_x)
                temp_dict[target] = round(np.nanmean(y_pre == test_y), 4)

                prob_y = classifier.predict_proba(test_x)
                prob_y = prob_y[:, 1]
                if not (prob_y <= 0.5).all():
                    prob_y[
                        prob_y >= np.percentile(prob_y[prob_y > 0.5], 70)] = 1
                if not (prob_y >= 0.5).all():
                    prob_y[
                        prob_y <= np.percentile(prob_y[prob_y < 0.5], 30)] = 0
                temp_dict[target + '_enhanced'] = round(
                    np.nanmean((prob_y == test_y)) / 0.3, 4)

                if 'return_1d' in target:
                    temp_dict[target + '_return'] = np.sum(
                        y_pre * test_data_set['return_1d'])
                    temp_dict[target + '_return_compound'] = np.prod(
                        y_pre * test_data_set['return_1d'] + 1)
                    y_pre[y_pre == 0] = -1
                    temp_dict[target + '_return_ls'] = np.sum(
                        y_pre * test_data_set['return_1d'])
                    temp_dict[target + '_return_compound_ls'] = np.prod(
                        y_pre * test_data_set['return_1d'] + 1)

                    stor_prob_y = prob_y.copy()
                    prob_y[prob_y < 1] = 0
                    temp_dict[target + '_enhanced_return'] = np.sum(
                        prob_y * test_data_set['return_1d'])
                    temp_dict[target + '_enhanced_return_compound'] = np.prod(
                        prob_y * test_data_set['return_1d'] + 1)
                    stor_prob_y[stor_prob_y == 0] = -1
                    stor_prob_y[(stor_prob_y >= 0) & (stor_prob_y < 1)] = 0
                    temp_dict[target + '_enhanced_return_ls'] = np.sum(
                        stor_prob_y * test_data_set['return_1d'])
                    temp_dict[target +
                              '_enhanced_return_compound_ls'] = np.prod(
                                  stor_prob_y * test_data_set['return_1d'] + 1)
            result_stock = pd.concat(
                [result_stock, pd.DataFrame([temp_dict])],
                ignore_index=True)
        except Exception as e:
            print(stock)
            print(repr(e))
    result_stock.to_csv('D:\\output\\rf_result' + str(d) + '.csv')
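
A minimal driver sketch for calc_accr_rf, following the Pool dispatch pattern used elsewhere in this codebase; divide_list, get_data.get_sql_key, and the worker count of 24 are assumptions carried over from the other examples, not part of calc_accr_rf itself.

from multiprocessing import Pool

def run_calc_accr_rf(n_workers=24):
    # Split the stock universe into one chunk per worker (assumed helpers).
    stock_list = get_data.get_sql_key()
    stock_list_list = divide_list(stock_list, n_workers)
    p = Pool()
    for i in range(n_workers):
        # Each worker writes its own rf_result<i>.csv, keyed by i.
        p.apply_async(calc_accr_rf, args=(stock_list_list[i], i,))
    p.close()
    p.join()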
Exemple #14
0
def prep_data_for_rf(stock_list, dict_industry, calendar, i, dict_300, dict_500):
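    """
    Build the monthly indicator table for one chunk of stocks and store it
    in a per-process temp database.

    Args:
        stock_list (list): Given stock list to operate on.
        dict_industry (dict): Mapping from industry name to its stock list.
        calendar (pd.Series): Trading calendar as 'YYYYMMDD' strings.
        i (int): Multiprocessing id, used to name the temp database.
        dict_300 (dict): CSI 300 membership lists keyed by date.
        dict_500 (dict): CSI 500 membership lists keyed by date.

    Returns:
        None.

    """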
    con = db.connect('D:\\Data\\rf_temp_'+str(i)+'.sqlite')
    cur = con.cursor()
    data = pd.DataFrame()
    for stock in stock_list:
        try:
            df = get_data.get_from_sql(stock_id=stock)
            df_f = get_data.get_from_sql(stock_id=stock, name='data_finance')
            df.set_index('trade_date', inplace=True)
            df_f.set_index('ann_date', inplace=True)
            listed_date = df.index[0]
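            # Require 13 months of post-listing history (a trailing year
            # plus a buffer), and never start before the 2007 sample start.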
            usable = max(change_month(listed_date, 13), '20070101')
            if usable > '20200101':
                continue
            if 'turnover_rate_f' not in df.columns:
                df['turnover_rate_f'] = df['turnover_rate']
            first_date = calendar[calendar >= usable].iloc[0]
            current_date = first_date
            while (current_date <= '20200131') and (current_date < df.index[-1]):
                stock_date = df.index[df.index >= current_date][0]
                '''
                if stock_date != current_date:
                    # If the stock is not trading on the first trading day
                    # of the month, skip its data and do not trade that
                    # stock this month.
                    continue
                '''
                last_trading_day = df.index[df.index < current_date][-1]
                last_year = change_month(current_date, -12)
                last_month = change_month(current_date, -1)
                last_year = df.index[df.index >= last_year][0]
                last_month = df.index[df.index >= last_month][0]
                last_year_df = df.loc[last_year: last_trading_day]
                last_month_df = df.loc[last_month: last_trading_day]
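                # Latest financial report announced on or before
                # current_date. Taking element [0] assumes df_f is sorted
                # with the newest ann_date first; use [-1] if ascending.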
                f_date = df_f.index[df_f.index <= current_date][0]
                next_date = calendar[calendar >= change_month(current_date, 1)].iloc[0]
                
                price = df.loc[last_trading_day, 'close'] * df.loc[last_trading_day, 'adj_factor']
                if df_f.loc[f_date, 'rd_exp'] and df.loc[last_trading_day, 'total_mv'] and df.loc[last_trading_day, 'pe_ttm']:
                    rd_exp_to_earning = df_f.loc[f_date, 'rd_exp']\
                        / df.loc[last_trading_day, 'total_mv']\
                        * df.loc[last_trading_day, 'pe_ttm']
                else:
                    rd_exp_to_earning = np.nan
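                # fcfe is reported in CNY while total_mv is in 10k CNY, so
                # divide by 10000 to get a clean ratio (assumed tushare units).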
                if df_f.loc[f_date, 'fcfe'] and df.loc[last_trading_day, 'total_mv']:
                    fcfe = df_f.loc[f_date, 'fcfe'] / df.loc[last_trading_day, 'total_mv'] / 10000
                else:
                    fcfe = np.nan
                    
                if df.index[df.index > next_date].shape[0] == 0:
                    break
                return_rate = cal_return(current_date, 1, 0, df)

                if last_month_df.shape[0] <= 10:
                    return_var_month_realized = np.nan
                    return_skew_month_realized = np.nan
                    return_kurt_month_realized = np.nan
                    avg_tr_last_month = np.nan
                    avg_tr_last_month_avg_tr_last_year = np.nan
                    return_var_month = np.nan
                    return_skew_month = np.nan
                    return_kurt_month = np.nan
                    return_d_var_month = np.nan
                    return_u_var_month = np.nan
                    return_d_var_var_month = np.nan
                    t_t_1 = np.nan
                    max_return_last_month = np.nan
                    corr_vol_close_month = np.nan
                    corr_vol_high_month = np.nan
                    corr_vol_open_month = np.nan
                    corr_vol_low_month = np.nan
                    high_open_month = np.nan
                    close_low_month = np.nan