def make_model(input_shape, nb_epochs=100, batch_size=128, lr=0.01, n_layers=1,
               n_hidden=16, rate_dropout=0.3):
    """Train a WindPuller on the "ultimate" dataset, evaluate, persist, and
    dump (prediction, label) pairs to a tab-separated file.

    Args:
        input_shape: [window, n_features] list describing one sample.
        nb_epochs, batch_size, lr, n_layers, n_hidden, rate_dropout:
            training hyper-parameters forwarded to WindPuller / fit.

    Side effects: writes the model to /output, TensorBoard logs to
    /output/logs, and predictions to ``output.<window>`` in the CWD.
    """
    model_path = '/output/model.{}.{}c.{}l.{}'.format(
        input_shape[0], n_hidden, n_layers, nb_epochs)
    wp = WindPuller(input_shape=input_shape, lr=lr, n_layers=n_layers,
                    n_hidden=n_hidden, rate_dropout=rate_dropout)
    train_set, test_set = read_ultimate("/dataset/", input_shape)
    wp.fit(train_set.images, train_set.labels,
           batch_size=batch_size,
           nb_epoch=nb_epochs,
           shuffle=False,
           verbose=1,
           validation_data=(test_set.images, test_set.labels),
           callbacks=[
               TensorBoard(log_dir='/output/logs', histogram_freq=100),
               ModelCheckpoint(filepath=model_path + '.best',
                               save_best_only=True, mode='min'),
           ])
    scores = wp.evaluate(test_set.images, test_set.labels, verbose=0)
    print('Test loss:', scores[0])
    print('Test accuracy:', scores[1])
    wp.model.save(model_path)

    # Reload the saved model and confirm it scores identically.
    saved_wp = wp.load_model(model_path)
    scores = saved_wp.evaluate(test_set.images, test_set.labels, verbose=0)
    print('Test loss:', scores[0])
    print('test accuracy:', scores[1])

    flat_pred = numpy.reshape(saved_wp.predict(test_set.images, 1024), [-1])
    paired = numpy.array([flat_pred, test_set.labels]).transpose()
    with open('output.' + str(input_shape[0]), 'w') as fp:
        for row_idx in range(paired.shape[0]):
            for cell in paired[row_idx]:
                fp.write(str(cell) + "\t")
            fp.write('\n')
def evaluate_model(model_path, code, output_dir, input_shape=None):
    """Evaluate a saved WindPuller model on one instrument and write a
    per-day report CSV (Close / Return / Position / Capital).

    Args:
        model_path: path of the saved model to load.
        code: instrument code; features are extracted from
            ``dataset/<code>.csv`` and the report goes to ``output/<code>.csv``.
        output_dir: directory the extracted features are written to / read from.
        input_shape: [window, n_features]; defaults to [30, 61].  Fixed: the
            original used a mutable default list shared across calls.

    NOTE(review): a second ``evaluate_model`` with a different signature is
    defined later in this module and shadows this one at import time —
    confirm which definition callers expect.
    """
    if input_shape is None:
        input_shape = [30, 61]
    extract_from_file("dataset/%s.csv" % code, output_dir, code)
    train_set, test_set = read_feature(output_dir, input_shape, code)
    saved_wp = WindPuller(input_shape).load_model(model_path)
    scores = saved_wp.evaluate(test_set.images, test_set.labels, verbose=0)
    print('Test loss:', scores[0])
    print('test accuracy:', scores[1])
    pred = saved_wp.predict(test_set.images, 1024)
    [cr, cap] = calculate_cumulative_return(test_set.labels, pred)

    # Pull date/close from the raw data file to index the report.
    # assumes the last `days_for_test` rows align with test_set — TODO confirm
    days_for_test = 700
    tmp = pd.read_csv('dataset/%s.csv' % code, delimiter='\t')
    date = tmp['date'][-days_for_test:]
    close = tmp['close'][-days_for_test:]
    output = pd.DataFrame(
        {
            'Return': test_set.labels,
            'Position': pred.reshape(-1),
            'Capital': cap.reshape(-1),
            'Close': close.values
        },
        index=date,
        columns=['Close', 'Return', 'Position', 'Capital'])
    output.to_csv('output/%s.csv' % code)
def make_model_type3(input_shape, nb_epochs=100, batch_size=128, lr=0.01,
                     n_layers=1, n_hidden=16, rate_dropout=0.3):
    """Train a type-3 WindPuller on the TXF minute data and save it.

    Fixed: removed the unused local ``windowSize`` (assigned, never read).

    Args:
        input_shape: list whose first element is the window size in minutes.
            WARNING: mutated in place (``input_shape[0] = numFeatures``).
        nb_epochs, batch_size, lr, n_layers, n_hidden, rate_dropout:
            training hyper-parameters.
    """
    # Path is keyed on the window size, so build it before input_shape
    # is overwritten below.
    model_path = 'model.%s' % input_shape[0]
    X_train, y_train, X_val, Y_val, X_test, y_test, numFeatures = \
        ft.generateDataSetTXF(os.getcwd(), input_wind_size=input_shape[0],
                              toDataSet=False)
    # NOTE(review): this overwrites the window dimension with the feature
    # count, while the sibling helpers assign numFeatures to input_shape[1]
    # — confirm the type-3 model really expects that.
    input_shape[0] = numFeatures
    wp = WindPuller(input_shape=input_shape, modelType=3, lr=lr,
                    n_layers=n_layers, n_hidden=n_hidden,
                    rate_dropout=rate_dropout)
    wp.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epochs,
           shuffle=True, verbose=1, validation_data=(X_val, Y_val))
    scores = wp.evaluate(X_test, y_test, verbose=0)
    print('Test loss:', scores[0])
    print('Test accuracy:', scores[1])
    wp.model.save(model_path)
def load_model_type(model_path, input_shape, _modelTyp=0, nb_epochs=100,
                    batch_size=128, lr=0.01, n_layers=1, n_hidden=16,
                    rate_dropout=0.3):
    """Load a saved model, score it on the TXF test set, and dump
    (prediction, label) pairs to ``output.<window>``.

    NOTE(review): ``_modelTyp``, ``nb_epochs`` and ``batch_size`` are
    accepted but ignored — the WindPuller is always built with
    ``modelType=1``.  Confirm whether that is intentional.
    """
    train_set, validation_set, test_set, numFeatures = ft.generateDataSetTXF(
        os.getcwd(), input_wind_size=input_shape[0], toDataSet=True)
    # Feature count comes from the extractor; shape is mutated in place.
    input_shape[1] = numFeatures
    wp = WindPuller(input_shape=input_shape, modelType=1, lr=lr,
                    n_layers=n_layers, n_hidden=n_hidden,
                    rate_dropout=rate_dropout)
    saved_wp = wp.load_model(model_path)
    scores = saved_wp.evaluate(test_set.images, test_set.labels, verbose=0)
    print('Test loss:', scores[0])
    print('test accuracy:', scores[1])
    flat_pred = np.reshape(saved_wp.predict(test_set.images, 1024), [-1])
    pairs = np.array([flat_pred, test_set.labels]).transpose()
    with open('output.' + str(input_shape[0]), 'w') as fp:
        for row_idx in range(pairs.shape[0]):
            for cell in pairs[row_idx]:
                fp.write(str(cell) + "\t")
            fp.write('\n')
def make_model_type(input_shape, _modelTyp=0, nb_epochs=100, batch_size=128,
                    lr=0.01, n_layers=1, n_hidden=16, rate_dropout=0.3):
    """Train a WindPuller of the requested model type on the TXF dataset
    and save it as ``model.<window>``.

    Args:
        input_shape: [window, n_features]; element 1 is overwritten in
            place with the extractor's feature count.
        _modelTyp: forwarded to WindPuller as ``modelType``.
        Remaining arguments are standard training hyper-parameters.
    """
    model_path = 'model.%s' % input_shape[0]
    train_set, validation_set, test_set, numFeatures = ft.generateDataSetTXF(
        os.getcwd(), input_wind_size=input_shape[0], toDataSet=True)
    input_shape[1] = numFeatures
    wp = WindPuller(input_shape=input_shape, modelType=_modelTyp, lr=lr,
                    n_layers=n_layers, n_hidden=n_hidden,
                    rate_dropout=rate_dropout)
    wp.fit(train_set.images, train_set.labels,
           batch_size=batch_size,
           nb_epoch=nb_epochs,
           shuffle=True,
           verbose=1,
           validation_data=(validation_set.images, validation_set.labels))
    scores = wp.evaluate(test_set.images, test_set.labels, verbose=0)
    print('Test loss:', scores[0])
    print('Test accuracy:', scores[1])
    wp.model.save(model_path)
def model_predict(model_path, code, input_shape=None):
    """Extract features for one instrument and print the model's signals.

    Args:
        model_path: path of the saved model to load.
        code: instrument code; reads ``dataset/<code>.csv`` and the
            extracted feature file ``./<code>_feature_only.<window>``.
        input_shape: [window, n_features]; defaults to [30, 83].  Fixed:
            the original used a mutable default list shared across calls.
    """
    if input_shape is None:
        input_shape = [30, 83]
    extractfeatureonly_from_file("dataset/%s.csv" % code, code)
    ultimate_features = numpy.loadtxt(
        "%s/%s_feature_only.%s" % (".", code, str(input_shape[0])))
    ultimate_features = numpy.reshape(
        ultimate_features, [-1, input_shape[0], input_shape[1]])
    saved_wp = WindPuller(input_shape).load_model(model_path)
    pred = saved_wp.predict(ultimate_features, 1024)
    # One signal per line, same format as the original index loop.
    for p in pred:
        print(str(p))
def simple_predict_tomorrow():
    """Generate daily close signals with the long ("buy") and short
    ("sell") models for every data file in ``data_dir`` and save one CSV
    per instrument under ``./signal_close/``.

    Relies on module-level globals: ``data_dir``, ``feature_dir``,
    ``input_shape``, ``model_path_buy``, ``model_path_sell``.
    """
    signal_dir = './signal_close/'
    date = get_date_list()
    files = os.listdir(data_dir)

    # 0. Load both models once up front.
    wp_buy = WindPuller(input_shape).load_model(model_path_buy)
    wp_sell = WindPuller(input_shape).load_model(model_path_sell)

    # 1. Extract features for all instruments.
    days_for_test = len(date)
    extract_all_features(data_dir, feature_dir, days_for_test)

    for (idf, f) in enumerate(files):
        # 2. Read this instrument's test-set features.
        output_prefix = f.split('.')[0]
        test_set = read_features(feature_dir, input_shape, output_prefix)

        # 3. Predict long and short signals over the test window.
        signal_buy = wp_buy.predict(test_set.images, 1024)[-days_for_test:]
        signal_sell = wp_sell.predict(test_set.images, 1024)[-days_for_test:]

        # 4. Persist the signals, indexed by trading date.
        f_path_signal = os.path.join(signal_dir, f)
        data_signal = pd.DataFrame(
            {
                'signal_close_buy': signal_buy.reshape(-1),
                'signal_close_sell': signal_sell.reshape(-1)
            },
            index=date)
        data_signal.to_csv(f_path_signal)
        print('%d 指数%s处理完毕' % (idf, output_prefix))
        print('-' * 50)
    print('全部处理完毕!')
    print('=' * 80)
def make_separate_model(nb_epochs=100, batch_size=128, lr=0.01, n_layers=1,
                        n_hidden=14, rate_dropout=0.3, input_shape=[30, 73]):
    """Train one model per instrument code from "./ultimate_feature",
    evaluating, saving and dumping predictions for each.

    NOTE(review): the prediction file ``output.<window>`` is rewritten on
    every loop iteration (same window size for all codes), so only the
    last code's predictions survive — confirm whether that is intended.
    """
    train_sets, test_sets = read_separate_feature("./ultimate_feature")
    wp = WindPuller(input_shape=input_shape, lr=lr, n_layers=n_layers,
                    n_hidden=n_hidden, rate_dropout=rate_dropout)
    wp.build_model()
    for code, train_set in train_sets.items():
        test_set = test_sets[code]
        # Shape is taken from the actual tensors for this code.
        input_shape = [train_set.images.shape[1], train_set.images.shape[2]]
        print(input_shape)
        model_path = 'model.%s' % code
        print(train_set.images.shape)
        wp.fit(train_set.images, train_set.labels,
               batch_size=batch_size,
               nb_epoch=nb_epochs,
               shuffle=False,
               verbose=1,
               validation_data=(test_set.images, test_set.labels),
               callbacks=[
                   TensorBoard(histogram_freq=1000),
                   ModelCheckpoint(filepath=model_path + '.best.checkpoints',
                                   save_best_only=True, mode='min'),
               ])
        scores = wp.evaluate(test_set.images, test_set.labels, verbose=0)
        print('Test loss:', scores[0])
        print('Test accuracy:', scores[1])
        wp.model.save(model_path)

        # Reload and re-score to sanity-check the saved artifact.
        saved_wp = wp.load_model(model_path)
        scores = saved_wp.evaluate(test_set.images, test_set.labels, verbose=0)
        print('Test loss:', scores[0])
        print('test accuracy:', scores[1])

        flat_pred = numpy.reshape(saved_wp.predict(test_set.images, 1024), [-1])
        paired = numpy.array([flat_pred, test_set.labels]).transpose()
        with open('output.' + str(input_shape[0]), 'w') as fp:
            for row_idx in range(paired.shape[0]):
                for cell in paired[row_idx]:
                    fp.write(str(cell) + "\t")
                fp.write('\n')
def evaluate_model(model_path, code, input_shape=None):
    """Evaluate a saved model on one instrument and print per-day
    change-rate, position advice and cumulative return.

    Args:
        model_path: path of the saved model to load.
        code: instrument code; features come from ``dataset/<code>.csv``.
        input_shape: [window, n_features]; defaults to [30, 83].  Fixed:
            the original used a mutable default list shared across calls.

    NOTE(review): this shadows the earlier ``evaluate_model`` (different
    signature) defined above in this module.
    """
    if input_shape is None:
        input_shape = [30, 83]
    extract_from_file("dataset/%s.csv" % code, code)
    train_set, test_set = read_feature(".", input_shape, code)
    saved_wp = WindPuller(input_shape).load_model(model_path)
    scores = saved_wp.evaluate(test_set.images, test_set.labels, verbose=0)
    print('Test loss:', scores[0])
    print('test accuracy:', scores[1])
    pred = saved_wp.predict(test_set.images, 1024)
    cr = calculate_cumulative_return(test_set.labels, pred)
    print("changeRate\tpositionAdvice\tprincipal\tcumulativeReturn")
    for i in range(len(test_set.labels)):
        print(str(test_set.labels[i]) + "\t" + str(pred[i]) + "\t" +
              str(cr[i] + 1.) + "\t" + str(cr[i]))
def load_model_type3(input_shape, lr=0.01, n_layers=1, n_hidden=16,
                     rate_dropout=0.3):
    """Load a saved type-3 model, score it on the TXF test split, and dump
    (prediction, label) pairs to ``output.<window>``.

    Fixed: the original referenced ``lr``, ``n_layers``, ``n_hidden``,
    ``rate_dropout``, ``test_set``, ``X_test`` and ``y_test`` without
    defining them, guaranteeing a NameError.  The hyper-parameters are now
    keyword arguments (defaults match the sibling helpers, so existing
    single-argument calls keep working) and the test data is regenerated
    the same way ``make_model_type3`` does.
    """
    # Model and output paths are keyed on the window size; capture them
    # before input_shape is mutated below.
    window_size = input_shape[0]
    model_path = 'model.%s' % window_size
    _, _, _, _, X_test, y_test, numFeatures = ft.generateDataSetTXF(
        os.getcwd(), input_wind_size=window_size, toDataSet=False)
    input_shape[0] = numFeatures  # mirrors make_model_type3's mutation
    # NOTE(review): the original built this WindPuller with modelType=2
    # even though make_model_type3 trains with modelType=3 — preserved
    # as-is; confirm which type the saved architecture expects.
    wp = WindPuller(input_shape=input_shape, modelType=2, lr=lr,
                    n_layers=n_layers, n_hidden=n_hidden,
                    rate_dropout=rate_dropout)
    saved_wp = wp.load_model(model_path)
    scores = saved_wp.evaluate(X_test, y_test, verbose=0)
    print('Test loss:', scores[0])
    print('test accuracy:', scores[1])
    pred = saved_wp.predict(X_test, 1024)
    pred = numpy.reshape(pred, [-1])
    result = numpy.array([pred, y_test]).transpose()
    with open('output.' + str(window_size), 'w') as fp:
        for i in range(result.shape[0]):
            for val in result[i]:
                fp.write(str(val) + "\t")
            fp.write('\n')
def predict_tomorrow(model_path="model.30.best", extract_all=False):
    """For each index: extract features, predict tomorrow's signal with a
    trained model, and save a capital-curve CSV.

    Original docstring (translated): 1. extract features for every
    instrument in the three datasets; 2. read the one-row validation set;
    3. load the trained model and predict signals on it; 4. save results.

    Args:
        model_path: saved Keras model to load.
        extract_all: if truthy, process every file in ``./newdata/``;
            otherwise only the three indices 000016/000300/000905.

    Fixed: ``np.str`` (a deprecated alias removed in NumPy >= 1.20, which
    raised AttributeError) is replaced with the builtin ``str``; boolean
    comparison ``== True`` replaced with truthiness.
    """
    # 1. Feature extraction.
    data_dir = './newdata/'
    output_dir = './output09/'
    feature_dir = './stock_features/'
    if not (os.path.exists(output_dir)):
        os.mkdir(output_dir)
    # The test window starts on 2017-09-01.
    df = pd.read_csv('dataset/000300.csv', index_col='date', parse_dates=True)
    days_for_test = df.shape[0] - df.index.get_loc('2017-09-01')
    extract_all_features(data_dir, feature_dir, days_for_test, extract_all)

    # 2. Read features.
    input_shape = [30, 61]
    file_list = os.listdir(data_dir)
    if extract_all:
        column_names = [s.split(sep='.')[0] for s in file_list]
    else:
        # Otherwise test only the three main indices.
        column_names = ['000016', '000300', '000905']

    # Load the model once for all instruments.
    wp = WindPuller(input_shape).load_model(model_path)
    for f in column_names:
        _, test_set = read_feature(feature_dir, input_shape, f)
        tmp = pd.read_csv('dataset/%s.csv' % f)
        val = test_set
        pred = wp.predict(val.images, 1024)
        print(pred[-1])
        [cr, cap] = calculate_cumulative_return_cost(val.labels, pred)
        # Rows of the raw file covered by the test window (plus one lead row).
        index = range(tmp.shape[0] - days_for_test - 1, tmp.shape[0])

        # 1. Save the capital-curve data.
        date = tmp['date'].iloc[index]
        close = tmp['close'].iloc[index]
        buy_hold = close / close.iloc[0] - 1
        output = pd.DataFrame(
            {
                'Close': close.values,
                'Pct_change': np.concatenate(([np.nan], val.labels)),
                'Position': np.concatenate(([np.nan], pred.reshape(-1))),
                'Cum_return': cr.reshape(-1),
                'Buy_hold': buy_hold.values
            },
            index=date,
            columns=[
                'Close', 'Pct_change', 'Position', 'Cum_return', 'Buy_hold'
            ])
        # np.str was removed in NumPy 1.20; builtin str is the documented
        # replacement.
        names = pd.read_csv('指数名称.csv',
                            dtype={
                                'code': str,
                                'name': str
                            },
                            engine='python')
        names.set_index('code', inplace=True)
        names = names.to_dict()['name']
        n = names[f]
        # Write the per-stock CSV.
        cap_line_dir = os.path.join(output_dir, 'stocks')
        if not (os.path.exists(cap_line_dir)):
            os.mkdir(cap_line_dir)
        cap_line_f = os.path.join(cap_line_dir, '%s_test.csv' % n)
        output.to_csv(cap_line_f)
        # 2. Performance stats / capital curve / report (currently disabled).
        #print('当前处理 %s_%s_test\n' % (f, n))
        #calc_perf(output, f, n, 'test', output_dir)
        print('计算完毕')
        print('=' * 50)
def test_model(model_path="model.30.best", extract_all=True, days_for_test=False):
    """Extract features, run the trained model on the train and test
    splits of every instrument, and save capital-curve CSVs and reports.

    Original docstring (translated): 1. extract features for every
    instrument; 2. read training and validation sets; 3. load the trained
    model and predict on both; 4. plot and save the capital curves.

    Args:
        model_path: saved Keras model to load.
        extract_all: if truthy, process every file in ``./dataset/``;
            otherwise only the three indices 000016/000300/000905.
        days_for_test: size of the test window; when falsy it is derived
            from the 2017-09-01 cut-off in dataset/000001.csv.

    Fixed: ``np.str`` (removed in NumPy >= 1.20, AttributeError at
    runtime) replaced with builtin ``str``; ``== True`` / ``== False``
    comparisons replaced with truthiness tests (same outcomes for the
    bool/int values used here).
    """
    # 1. Feature extraction.
    data_dir = './dataset/'
    output_dir = './output09/'
    feature_dir = './stock_features/'
    if not (os.path.exists(output_dir)):
        os.mkdir(output_dir)
    if not days_for_test:
        # The test window starts on 2017-09-01.
        df = pd.read_csv('dataset/000001.csv', index_col='date',
                         parse_dates=True)
        days_for_test = df.shape[0] - df.index.get_loc('2017-09-01')
    extract_all_features(data_dir, feature_dir, days_for_test)

    # 2. Read features.
    input_shape = [30, 61]
    file_list = os.listdir(data_dir)
    if extract_all:
        column_names = [s.split(sep='.')[0] for s in file_list]
    else:
        # Otherwise test only the three main indices.
        column_names = ['000016', '000300', '000905']

    wp = WindPuller(input_shape).load_model(model_path)
    for f in column_names:
        train_set, test_set = read_feature(feature_dir, input_shape, f)
        data_set = {'train': train_set, 'test': test_set}
        tmp = pd.read_csv('dataset/%s.csv' % f)
        for key in data_set:
            # 3. Predict on this split and persist the results.
            print('当前处理 %s_%s\n' % (f, key))
            val = data_set[key]
            pred = wp.predict(val.images, 1024)
            [cr, cap] = calculate_cumulative_return_cost(val.labels, pred)
            # Row range of the raw file matching this split.
            if key == 'train':
                index = range(input_shape[0] - 1,
                              input_shape[0] + pred.shape[0])
            elif key == 'test':
                index = range(tmp.shape[0] - days_for_test - 1, tmp.shape[0])

            # 1). Save the capital-curve data.
            date = tmp['date'].iloc[index]
            close = tmp['close'].iloc[index]
            buy_hold = close / close.iloc[0] - 1
            output = pd.DataFrame(
                {
                    'Close': close.values,
                    'Pct_change': np.concatenate(([np.nan], val.labels)),
                    'Position': np.concatenate(([np.nan], pred.reshape(-1))),
                    'Cum_return': cr.reshape(-1),
                    'Buy_hold': buy_hold.values
                },
                index=date,
                columns=[
                    'Close', 'Pct_change', 'Position', 'Cum_return', 'Buy_hold'
                ])
            # np.str was removed in NumPy 1.20; builtin str is the
            # documented replacement.
            names = pd.read_csv('指数名称.csv',
                                dtype={
                                    'code': str,
                                    'name': str
                                },
                                engine='python')
            names.set_index('code', inplace=True)
            names = names.to_dict()['name']
            n = names[f]
            # Write the per-stock, per-split CSV.
            cap_line_dir = os.path.join(output_dir, 'stocks')
            if not (os.path.exists(cap_line_dir)):
                os.mkdir(cap_line_dir)
            cap_line_f = os.path.join(cap_line_dir, '%s_%s.csv' % (n, key))
            output.to_csv(cap_line_f)

            # 2). Performance stats, capital curve, investment report.
            print('开始计算策略表现 %s_%s_%s\n' % (f, n, key))
            calc_perf(output, f, n, key, output_dir)
            print('计算完毕')
            print('=' * 50)
def paper_test():
    """Paper-trading backtest over the recorded 14:57 snapshots.

    Original docstring (translated): for each day, read the 14:57 data,
    merge it with the stored database, generate fresh features, load the
    trained models, and predict the signals.

    Fixes:
      * ``df1['volume'] == df1['volume'] * 80 / 79`` used ``==`` (a no-op
        comparison) instead of ``=``, so the volume rescale was never
        applied.
      * The CSV header was written without a trailing newline, so the
        first data row landed on the header line.
      * ``DataFrame.append`` (removed in pandas 2.0) replaced with the
        equivalent ``pd.concat``.
    """
    merged_data_dir = './paper_merge'
    signal_dir = './paper_signals'
    date = get_date_list()
    files = os.listdir(tsl_data_dir)

    # 0. Load both models once.
    wp_buy = WindPuller(input_shape).load_model(model_path_buy)
    wp_sell = WindPuller(input_shape).load_model(model_path_sell)

    for (idx, d) in enumerate(date):
        print('当前处理日期\t%s' % d)
        for (idf, f) in enumerate(files):
            # 1. Read this instrument's new intraday data for date d.
            f_path1 = os.path.join(tsl_data_dir, f)
            df1 = pd.read_csv(f_path1)
            # .copy() so the rescale below writes to an owned frame, not
            # a view of df1.
            df1 = df1[df1['date'] == d].copy()
            # BUGFIX: was '==' (comparison discarded); the 80/79 volume
            # correction now actually takes effect.
            df1['volume'] = df1['volume'] * 80 / 79

            # 2. Read the stored historical data.
            f_path2 = os.path.join(data_dir, f)
            df2 = pd.read_csv(f_path2)

            # 3. Merge: drop rows from date d onward, append the new day.
            df2 = df2.iloc[:int(np.flatnonzero(df2.date == d))]
            df3 = pd.concat([df2, df1], ignore_index=True)
            df3 = df3[df2.columns]

            # 4. Save the merged data.
            f_path_merged = os.path.join(merged_data_dir, f)
            df3.to_csv(f_path_merged, index=False)

            # 5. Extract features for the single appended day.
            output_prefix = f.split('.')[0]
            extract_from_file(idx, f_path_merged, feature_dir, output_prefix, 1)

            # 6. Read the freshly extracted features back.
            test_set = read_features(feature_dir, input_shape, output_prefix)

            # 7. Predict the most recent signal with each model.
            signal_buy = float(wp_buy.predict(test_set.images, 1024)[-1])
            signal_sell = float(wp_sell.predict(test_set.images, 1024)[-1])

            # 8. Append the signals to this instrument's result file.
            f_path_signal = os.path.join(signal_dir, f)
            if idx == 0:
                # BUGFIX: header now ends with a newline so the first
                # data row starts on its own line.
                title = 'date,signal_buy,signal_sell'
                with open(f_path_signal, 'a') as file:
                    file.write(title + '\n')
            write = '%s,%.2f,%.2f\n' % (d, signal_buy, signal_sell)
            with open(f_path_signal, 'a') as file:
                file.write(write)
            n_read = idx * len(files) + idf + 1
            print('当前处理第%d个文件,剩余%d个文件,请耐心等待...'
                  % (n_read, len(files) * len(date) - n_read))
            print('-' * 50)
    print('\n全部处理完毕!')
    print('=' * 80)
def main():
    """End-of-day trading driver.

    Original docstring (translated): fetch the 14:57 quotes for the three
    indices each day, correct and append them to the local data files,
    extract features, compute signals and save them; after the 15:01
    close, repeat the same steps with the closing data.

    NOTE(review): calls ``predict_tomorrow(wp_buy, wp_sell,
    is_last_column=...)``, which does not match the signature of the
    ``predict_tomorrow(model_path, extract_all)`` defined in this file —
    confirm which variant is meant to be in scope.
    """
    # 1. Load the trained Keras models.
    print('=' * 80)
    print('%s\t加载keras训练完的模型' % (datetime.now().strftime('%H:%M:%S')))
    set_gpu_fraction()
    model_path_buy = 'model.30.buy'
    wp_buy = WindPuller(input_shape).load_model(model_path_buy)
    model_path_sell = 'model.30.sell'
    wp_sell = WindPuller(input_shape).load_model(model_path_sell)
    print('\n%s\t模型加载完毕\n' % (datetime.now().strftime('%H:%M:%S')))

    # 2. Poll for the 14:57 quote.
    print('=' * 80)
    print('%s\t开始查询实时行情数据,将返回14:57的第一笔数据' %
          datetime.now().strftime('%H:%M:%S'))
    running = 1
    # Give up polling at 15:01.
    stop_time = datetime.now().replace(hour=15, minute=1, second=0,
                                       microsecond=0)
    while running:
        print('时间未到,请耐心等待数据...')
        running, data = get_realtime_data()
        if running == 1:
            time.sleep(3)
        # Stop once the 14:57 data arrives or the deadline passes.
        running = running and datetime.now() < stop_time
    print('%s\t查询数据完毕,开始合并数据\n' % datetime.now().strftime('%H:%M:%S'))
    print('=' * 80)

    # 3. Update the two local files: append one 14:56 row to the raw
    # data, and one 4-column row to the signal-comparison file.
    update_csv(data)

    # 4. Extract the newest features.
    print('%s\t开始提取特征\n' % datetime.now().strftime('%H:%M:%S'))
    extract_all_features(data_dir, feature_dir, days_for_test=1,
                         extract_all=False)
    print('%s\t特征提取完毕\n' % datetime.now().strftime('%H:%M:%S'))
    print('=' * 80)

    # 5. Predict tomorrow's signals from the intraday snapshot.
    predict_tomorrow(wp_buy, wp_sell, is_last_column=False)
    print('请等待15:01程序会继续获取当日行情数据进行计算和预测...')
    print('=' * 80)

    # 6. Wait until 15:01:20 and fetch the post-close quotes once.
    stop_time = stop_time.replace(second=20)
    while datetime.now() < stop_time:
        time.sleep(3)
        print('等待中,请勿中断...')
    print('=' * 25, '开始获取当日收盘后行情', '=' * 25)
    _, data = get_realtime_data()

    # 7. Update the two local files with the closing prices: amend the
    # 14:57 raw row, append 2 columns to the comparison file's last row.
    update_close_csv(data)

    # 8. Extract features again from the closing data.
    print('%s\t开始提取特征\n' % datetime.now().strftime('%H:%M:%S'))
    extract_all_features(data_dir, feature_dir, days_for_test=1,
                         extract_all=False)
    print('%s\t特征提取完毕\n' % datetime.now().strftime('%H:%M:%S'))
    print('=' * 80)

    # 9. Predict tomorrow's signals from the post-close data.
    predict_tomorrow(wp_buy, wp_sell, is_last_column=True)
    print('%s\t完成!\n' % datetime.now().strftime('%H:%M:%S'))
    print('=' * 80)