def factor_wash(factor_data, stockList, industry_code, date):
    """Clean raw factor data and return the cleaned result.

    The data is passed through four steps in order: extreme-value removal,
    missing-value filling, neutralization and standardization.

    Args:
        factor_data: Factor values to clean; fed through each helper in turn.
        stockList: Forwarded unchanged to ``tools.replace_nan_indu``.
        industry_code: Forwarded unchanged to ``tools.replace_nan_indu``.
        date: Forwarded unchanged to ``tools.replace_nan_indu``.

    Returns:
        The output of the final standardization step.
    """
    # Remove extreme values.
    # TODO: private_tools currently handles only a single Series here;
    # replace this call once DataFrame-level support is finished.
    factor_data = winsorize_med(factor_data, scale=5, inf2nan=False, axis=0)

    # Fill missing values (already supported by private_tools).
    factor_data = tools.replace_nan_indu(factor_data, stockList, industry_code, date)

    # Neutralize (already supported by private_tools).
    factor_data = neutralize(factor_data, mkt_cap=False, industry=False)

    # Standardize.
    # TODO: private_tools currently handles only a single Series here;
    # replace this call once DataFrame-level support is finished.
    factor_data = standardlize(factor_data, axis=0)

    # Hand the cleaned data back to the caller; without this the whole
    # pipeline result was discarded and the function returned None.
    return factor_data
def data_preprocessing(factor_data, stockList, industry_code, date):
    """Run the standard factor-cleaning pipeline and return the result.

    Stages, in order: extreme-value removal, missing-value filling,
    neutralization against ``'sw_l1'`` and ``'market_cap'`` as of ``date``,
    and standardization.

    Args:
        factor_data: Factor values to clean.
        stockList: Forwarded to ``replace_nan_indu``.
        industry_code: Forwarded to ``replace_nan_indu``.
        date: Forwarded to ``replace_nan_indu`` and to ``neutralize``.

    Returns:
        The output of the final standardization step.
    """
    # Stage 1: remove extreme values.
    trimmed = winsorize_med(factor_data, scale=5, inf2nan=False, axis=0)

    # Stage 2: fill missing values.
    filled = replace_nan_indu(trimmed, stockList, industry_code, date)

    # Stage 3: neutralize.
    neutral = neutralize(
        filled,
        how=['sw_l1', 'market_cap'],
        date=date,
        axis=0,
    )

    # Stage 4: standardize.
    return standardlize(neutral, axis=0)
def clean_factor(factors, date):
    """Fill, winsorize, standardize and neutralize ``factors``.

    Args:
        factors: Factor values; must support ``.mean()`` and ``.fillna()``.
        date: Converted with ``str()`` and passed to ``neutralize``.

    Returns:
        The output of ``neutralize`` applied to the cleaned factors.
    """
    # Missing entries are replaced by the result of factors.mean().
    fill_values = factors.mean()
    filled = factors.fillna(fill_values)

    clipped = winsorize_med(filled, scale=3, inclusive=True, inf2nan=True, axis=0)
    scaled = standardlize(clipped, inf2nan=True, axis=0)

    return neutralize(scaled, ['sw_l1', 'pe_ratio'], date=str(date), axis=0)
def _build_regressor(method, use_grid_search):
    """Return the regressor selected by ``method``, or None if it is unknown.

    When ``use_grid_search`` is true the base estimator is wrapped in a
    ``GridSearchCV`` (4-fold ``KFold``) over a small per-method grid.
    """
    if not use_grid_search:
        # Plain estimators with fixed hyper-parameters.
        if method == 'svr':
            from sklearn.svm import SVR
            return SVR(C=100, gamma=1)
        if method == 'lr':
            from sklearn.linear_model import LinearRegression
            return LinearRegression()
        if method == 'ridge':
            from sklearn.linear_model import Ridge
            return Ridge(random_state=42, alpha=100)
        if method == 'rf':
            from sklearn.ensemble import RandomForestRegressor
            return RandomForestRegressor(random_state=42, n_estimators=500, n_jobs=-1)
        return None

    # Grid-searched estimators.
    para_grid = {}
    if method == 'svr':
        from sklearn.svm import SVR
        para_grid = {'C': [10, 100], 'gamma': [0.1, 1, 10]}
        base_model = SVR()
    elif method == 'lr':
        from sklearn.linear_model import LinearRegression
        base_model = LinearRegression()
    elif method == 'ridge':
        from sklearn.linear_model import Ridge
        para_grid = {'alpha': [1, 10, 100]}
        base_model = Ridge()
    elif method == 'rf':
        from sklearn.ensemble import RandomForestRegressor
        para_grid = {'n_estimators': [100, 500, 1000]}
        base_model = RandomForestRegressor()
    else:
        return None

    from sklearn.model_selection import GridSearchCV
    return GridSearchCV(base_model, para_grid, cv=KFold(n_splits=4), n_jobs=-1)


def handle_data(context, data):
    """Fit a regression on ``log_mcap`` and rebalance on its residuals.

    Does nothing unless ``g.if_trade`` is set. Otherwise it:

    1. increments ``g.__tradeCount``;
    2. builds a training frame with ``get_df_train`` (up to
       ``context.previous_date``) and a current frame with
       ``get_fundamentals``;
    3. winsorizes, standardizes and neutralizes both frames;
    4. fits the regressor chosen by ``g.method`` / ``g.__gridserach`` on the
       ``g.__factorList`` columns against ``log_mcap``;
    5. ranks stocks by residual (actual minus predicted ``log_mcap``),
       ascending, and picks the slice belonging to ``g.group``;
    6. sells held stocks outside that slice and buys the missing ones with
       the available cash split evenly across the open slots.

    Args:
        context: Strategy context; ``previous_date`` and ``portfolio`` are read.
        data: Unused here.

    Raises:
        ValueError: If ``g.method`` is not one of 'svr', 'lr', 'ridge', 'rf'.
    """
    if not g.if_trade:
        return

    # Record the number of trading rounds.
    g.__tradeCount = g.__tradeCount + 1

    # Assemble the training set.
    yesterday = context.previous_date
    df_train = get_df_train(g.__q, yesterday, g.__trainlength, g.__intervals)
    df_train = initialize_df(df_train)

    # Day-T cross-section (test set).
    df = get_fundamentals(g.__q, date=None)
    df = initialize_df(df)

    # Outlier handling.
    for fac in g.__winsorizeList:
        df_train[fac] = winsorize_med(df_train[fac], scale=5, inclusive=True, inf2nan=True, axis=0)
        df[fac] = winsorize_med(df[fac], scale=5, inclusive=True, inf2nan=True, axis=0)

    # Standardization.
    for fac in g.__standardizeList:
        df_train[fac] = standardlize(df_train[fac], inf2nan=True, axis=0)
        df[fac] = standardlize(df[fac], inf2nan=True, axis=0)

    # Neutralization (industry).
    df_train = neutralize(df_train, g.__industry_set)
    df = neutralize(df, g.__industry_set)

    # Training set (includes the validation folds) and its target.
    X_trainval = df_train[g.__factorList].fillna(0)
    y_trainval = df_train[['log_mcap']].fillna(0)

    # Test set and its target, indexed by security code.
    X = df[g.__factorList].fillna(0)
    y = df[['log_mcap']]
    y.index = df['code']
    y = y.fillna(0)

    model = _build_regressor(g.method, g.__gridserach != False)
    if model is None:
        # Previously this path fell through to model.fit() and died with a
        # NameError; keep the flag update and fail with a clear message.
        g.__scoreWrite = False
        raise ValueError(
            "unsupported g.method %r; expected 'svr', 'lr', 'ridge' or 'rf'" % (g.method,)
        )

    # Fit on the training set, then predict the current cross-section.
    model.fit(X_trainval, y_trainval)
    y_pred = model.predict(X)

    # New factor: actual value minus predicted value (the residual).
    factor = y - pd.DataFrame(y_pred, index=y.index, columns=['log_mcap'])

    # Sort residuals ascending. sort_index(by=...) was removed from modern
    # pandas; fall back to it only where sort_values does not exist.
    if hasattr(factor, 'sort_values'):
        factor = factor.sort_values(by='log_mcap')
    else:
        factor = factor.sort_index(by='log_mcap')

    # For group testing: size each group as a quantile of the universe.
    if g.invest_by_group == True:
        g.stocknum = int(len(factor.index) * g.quantile)

    start = g.stocknum * (g.group - 1)
    end = g.stocknum * g.group
    stockset = list(factor.index[start:end])
    target_lookup = set(stockset)

    current_data = get_current_data()

    # Sell: holdings that left the target slice, unless at the upper limit.
    sell_list = list(context.portfolio.positions.keys())
    held_lookup = set(sell_list)
    for stock in sell_list:
        if (
            stock not in target_lookup
            and stock in g.__feasible_stocks
            and current_data[stock].last_price != current_data[stock].high_limit
        ):
            order_target_value(stock, 0)

    # Allocate buying cash evenly across the open slots.
    held_count = len(context.portfolio.positions)
    if held_count >= g.stocknum:
        # Portfolio already full: the old loop placed a zero-value order for
        # every candidate here and its `num == 0` break could never fire.
        return
    num = g.stocknum - held_count
    cash = context.portfolio.cash / num

    # Buy: target stocks not held before this call, unless at the lower limit.
    for stock in stockset[:g.stocknum]:
        if stock in held_lookup:
            continue
        if current_data[stock].last_price == current_data[stock].low_limit:
            continue
        order_target_value(stock, cash)
        num -= 1
        if num == 0:
            break