Beispiel #1
0
def main():
    """Load archived backtest results and plot statistics and orders.

    Reads config.json, locates the archived statistics/order/portfolio
    CSV files under ./output/031502, and uses the SSE 50 index
    (000016.SH) as the benchmark reference for the statistics plot.
    """
    with open('config.json', 'r', encoding='utf-8') as f:
        config = json.load(f)

    output_dir = os.path.join(sys.path[0], 'output')
    # Archive directory holding one backtest run's results.
    archive_dir = os.path.join(output_dir, '031502')
    # Benchmark: the SSE 50 index (000016.SH).
    ref_files = search_file(output_dir, '000016.SH')

    statistics_files = search_file(archive_dir, 'statistics')
    order_files = search_file(archive_dir, 'order')
    portfolio_files = search_file(archive_dir, 'portfolio')

    # Load the benchmark and re-index it by calendar date.
    reference = pd.read_csv(ref_files[0], index_col=0)
    date_index = pd.Series([
        arrow.get(str(d), 'YYYYMMDD').date()
        for d in reference['trade_date'].values
    ])
    reference = reference.set_index(date_index)

    statistics_list = [pd.read_csv(p, index_col=0) for p in statistics_files]
    order_list = [pd.read_csv(p, index_col=0) for p in order_files]
    portfolio_list = [pd.read_csv(p, index_col=0) for p in portfolio_files]

    plot_statistics(statistics_list, reference=reference, save=True)
    plot_order(order_list)
Beispiel #2
0
 def _load_data(self, ):
     """Load the latest CSV data for every symbol in the stock pool.

     For each symbol the first file matching it under ``self.data_path``
     is read into a DataFrame and appended to ``self.stock_data_list``.
     Read failures are printed and skipped (best effort), then the
     preprocessed flag is cleared.
     """
     for symbol in self.stock_pool:
         matches = search_file(self.data_path, symbol)
         try:
             frame = pd.read_csv(matches[0], )
             self.stock_data_list.append(frame)
         except Exception as err:
             print(err, )
     self.preprocessed = False
def main():
    """Repeatedly train the decision model on random sub-periods.

    Loads config.json and the previously saved per-stock prediction
    results, builds the global training date range, then runs 100
    training rounds, each on a randomly chosen 200-trading-day window.
    Results are persisted every fifth round.
    """
    with open('config.json', 'r', encoding='utf-8') as f:
        config = json.load(f)
    # Prepare trading quotes and the trading calendar.
    calender, history, all_quote = prepare_train(config, download=False)

    # Load the previously saved per-stock prediction results.
    stock_list = config['data']['stock_code']
    results_path = os.path.join(sys.path[0], 'saved_results')
    predict_results_dict = {}
    for item in stock_list:
        csv_files = search_file(results_path, item)
        data = pd.read_csv(csv_files[0])
        predict_results_dict[item] = data

    # Global training range; training windows are sampled inside it.
    global_stop_date = arrow.get(config['training']['train_deadline'], 'YYYYMMDD')
    global_start_date = calender[int(config['preprocess']['train_pct'] * len(calender))]
    global_training_range = [
        d for d in calender if global_start_date <= d < global_stop_date
    ]
    # Fixed length (in trading days) of each decision-training window.
    train_len = 200

    # Pick 100 random windows of train_len days inside the global range.
    for round_idx in range(100):
        choose_start = random.choice(global_training_range[:-train_len])
        choose_range = [d for d in global_training_range if d >= choose_start][:train_len]

        # Persist results once every 5 rounds.
        save_or_not = round_idx % 5 == 4

        # Train the decision model on the chosen window.
        train_decision(config=config,
                       save=save_or_not,
                       calender=calender,
                       history=history,
                       predict_results_dict=predict_results_dict,
                       test_mode=False,
                       start_date=choose_start.date(),
                       stop_date=choose_range[-1].date(),
                       load=True,
                       episode_steps=config['training']['episode_steps'],
                       model='HER' if config['training']['env_mode'] == 'goal' else "TD3")

    print("A lot of work to do ...")
Beispiel #4
0
def train_decision(config=None,
                   save=False,
                   load=False,
                   calender=None,
                   history=None,
                   predict_results_dict=None,
                   test_mode=False,
                   start_date=None,
                   stop_date=None,
                   episode_steps=1000,
                   model='DDPG'):
    """
    Train the decision model: build the portfolio/prediction environment
    from the given data and run RL training on it.

    Parameters:
        config: configuration dict,
        save: whether to save results (trade history / model files),
        load: whether to resume from a previously saved model file,
        calender: trading-day calendar,
        history: quote history,
        predict_results_dict: per-stock prediction result DataFrames,
        test_mode: if True, first drive the env with random actions,
        start_date / stop_date: trading period for this run,
        episode_steps: total timesteps passed to model.learn,
        model: RL algorithm to use — 'DDPG', 'TD3' or 'HER'

    NOTE(review): if ``model`` is none of the three handled values, the
    local name ``model`` is never rebound and the evaluation loop at the
    end raises NameError.
    """
    # First convert the string dates in the prediction data to
    # datetime.date objects and use them as the row index.

    MODEL = model

    predict_dict = {}
    for k, v in predict_results_dict.items():
        assert isinstance(v['predict_date'].iloc[0], str)
        tmp = v['predict_date'].apply(
            lambda x: arrow.get(x, 'YYYY-MM-DD').date())
        predict_dict[k] = v.rename(index=tmp)

    env = Portfolio_Prediction_Env(config=config,
                                   calender=calender,
                                   stock_history=history,
                                   window_len=1,
                                   prediction_history=predict_dict,
                                   start_trade_date=start_date,
                                   stop_trade_date=stop_date,
                                   save=save)

    # Test mode: run one episode with random weight/offer actions.
    if test_mode:
        obs = env.reset()
        # check_env(env)
        for i in range(1000):
            # Random portfolio weights and price offers, 6 assets each.
            W = np.random.uniform(0.0, 1.0, size=(6, ))
            offer = np.random.uniform(-10.0, 10.0, size=(6, ))
            obs, reward, done, infos = env.step(np.hstack((W, offer)))
            # env.render()
            if done:
                env.save_history()
                break
        env.close()

    # Training mode
    if MODEL == "DDPG":
        # Add exploration noise.
        n_actions = env.action_space.shape
        param_noise = None
        # Ornstein-Uhlenbeck noise, suited to inertial control systems.
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(0.5) *
                                                    np.ones(n_actions))

        model_path = search_file(
            os.path.join(sys.path[0], 'saved_models', MODEL), MODEL)
        if len(model_path) > 0 and load:
            model = DDPG.load(
                model_path[0],
                env=env,
                policy=CustomDDPGPolicy,
                param_noise=param_noise,
                action_noise=action_noise,
                # tensorboard_log='./tb_log',
            )
        else:
            model = DDPG(
                policy=CustomDDPGPolicy,
                env=env,
                verbose=1,
                param_noise=param_noise,
                action_noise=action_noise,
                # tensorboard_log='./tb_log',
            )
        # Number of training timesteps.
        model.learn(total_timesteps=episode_steps, )
        model.save(
            os.path.join(sys.path[0], 'saved_models', MODEL, MODEL + '.h5'))

    elif MODEL == 'TD3':
        n_actions = env.action_space.shape[-1]
        # Ornstein-Uhlenbeck noise, suited to inertial control systems.
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(0.5) *
                                                    np.ones(n_actions))

        model_path = search_file(
            os.path.join(sys.path[0], 'saved_models', MODEL), MODEL)
        if len(model_path) > 0 and load:
            model = TD3.load(
                model_path[0],
                env=env,
                policy=CustomTD3Policy,
                action_noise=action_noise,
                # tensorboard_log='./tb_log',
            )
        else:
            model = TD3(
                policy=CustomTD3Policy,
                env=env,
                verbose=1,
                action_noise=action_noise,
                # tensorboard_log='./tb_log',
            )
        # Number of training timesteps.
        model.learn(total_timesteps=episode_steps, )
        model.save(
            os.path.join(sys.path[0], 'saved_models', MODEL, MODEL + '.h5'))

    elif MODEL == "HER":
        # HER requires the env to be a GoalEnv (original note below).
        """
        env必须是GoalEnv
        """
        model_class = DDPG

        # Available strategies (cf paper): future, final, episode, random
        goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

        # Wrap the model
        model = HER(policy=CustomDDPGPolicy,
                    env=env,
                    model_class=model_class,
                    n_sampled_goal=4,
                    goal_selection_strategy=goal_selection_strategy,
                    verbose=1)
        model.learn(total_timesteps=episode_steps, )
        model.save(
            os.path.join(sys.path[0], 'saved_models', MODEL, MODEL + '.h5'))

    obs = env.reset()
    # Evaluation: run the trained model for one episode.
    for i in range(1000):
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        # env.render(info=info)
        if done:
            if save:
                env.save_history()
            env.reset()
            break

    env.close()
Beispiel #5
0
def main(results_dir='D:\\GitHub\\Quantitative-analysis-with-Deep-Learning\\quantitative_analysis_with_deep_learning\\saved_results',
         pattern='.csv'):
    """Plot training curves for every saved result file.

    Args:
        results_dir: directory searched (via ``search_file``) for result
            files.  Defaults to the original hard-coded path so existing
            no-argument callers behave identically.
        pattern: filename substring to match (default '.csv').
    """
    for path in search_file(results_dir, pattern):
        data = load_data(path)
        plot_training(data)
def train_forecasting(config=None,
                      save=False,
                      calender=None,
                      history=None,
                      forecasting_deadline=None):
    """
    Train the forecasting (price-prediction) model, one model per stock.

        Parameters:
            config: configuration dict
            save: whether to save results locally
            calender: trading-day calendar
            history: per-stock quote history (aligned with stock_code list)
            forecasting_deadline: cut-off date ('YYYYMMDD' string) for the
                step-by-step prediction range

        Returns:
            dict mapping stock code -> results DataFrame (predictions
            plus per-epoch loss/accuracy columns).
    """
    assert config is not None

    data_pro = DataProcessor(date_col=config['data']['date_col'],
                             daily_quotes=config['data']['daily_quotes'],
                             target_col=config['data']['target'])
    stock_list = config['data']['stock_code']
    assert len(stock_list) == len(history)
    # Encode the trading dates into embedding vectors.
    (date_list, embeddings_list) = data_pro.encode_date_embeddings(calender)

    # Per-stock prediction results.
    predict_results_dict = {}

    # Build and train one model per stock in the pool.
    for idx, data in zip(stock_list, history):

        # Compute technical indicators and fill NaNs.
        data_tec = data_pro.cal_technical_indicators(data,
                                                     date_index=date_list)
        data_tec = data_pro.fill_nan(data_tec)

        # Compute Fourier-transform features and fill NaNs.
        data_fft = data_pro.cal_fft(data, )
        data_fft = data_pro.fill_nan(data_fft)

        # Compute daily quotes.
        daily_quotes = data_pro.cal_daily_quotes(data)

        # Split off the non-quote features and fill NaNs.
        daily_other_features = data_pro.split_quote_and_others(data)
        daily_other_features = data_pro.fill_nan(daily_other_features)

        assert data_tec.shape[0] == data_fft.shape[
            0] == daily_other_features.shape[0]

        # Concatenate technical indicators, FFT features and the other
        # extra features into one feature matrix.
        extra_features = np.concatenate(
            [data_tec, data_fft, daily_other_features], axis=1).astype(float)

        # Replace infinities.
        extra_features_no_nan_inf = data_pro.fill_inf(extra_features)

        # Log-compress features whose magnitude exceeds the threshold.
        scaled_extra_features = data_pro.convert_log(
            pd.DataFrame(extra_features_no_nan_inf),
            trigger=config['data']['log_threshold'])

        # Select the label column: real price, difference, or percent.
        real_price = daily_quotes.values[:, 0]
        if config['preprocess']['predict_type'] == 'real':
            y = real_price
        elif config['preprocess']['predict_type'] == 'diff':
            y = daily_quotes.values[:, 1]
        elif config['preprocess']['predict_type'] == 'pct':
            y = daily_quotes.values[:, 2]
        else:
            raise ValueError(
                'Please input right prediction type: real/diff/pct .')

        # Build a date/price index used as the global index of this dataset.
        date_index = pd.to_datetime(date_list, format='%Y%m%d').date
        date_price_index = pd.DataFrame(
            {
                'date': date_list,
                'price': real_price,
                'idx': range(len(date_index))
            },
            index=date_index)
        # Feature order is [daily quotes, date embeddings, extra features];
        # the label is [y].
        assert len(daily_quotes) == len(embeddings_list) == len(
            scaled_extra_features)
        x = np.concatenate([
            daily_quotes.values, embeddings_list, scaled_extra_features.values
        ],
                           axis=1)
        # Determine train/test date ranges; the model is trained and
        # predicts iteratively inside the test range.
        date_range_dict = data_pro.split_train_test_date(
            date_price_index=date_price_index,
            train_pct=config['preprocess']['train_pct'],
            validation_pct=config['preprocess']['validation_pct'])
        # Unpack the training / validation / prediction date ranges.
        total_train_daterange = date_range_dict['train']
        validation_daterange = date_range_dict['validation']
        step_by_step_train_daterange = date_range_dict['predict']

        # Slice features and labels into window_len / predict_len windows.
        total_x_train, total_y_train, _ = data_pro.window_data_sliceing(
            x, y, date_price_index, date_price_index.index.values)
        # (Original Chinese notes below: weight files are named
        #  "YYYYMMDD_hhmmss-loss-val_loss-acc-val_acc-stock-end_date.h5";
        #  workflow is: load the newest saved weights if any, otherwise do
        #  a full training pass, then predict strictly step by step up to
        #  the latest date to preserve temporal causality.)
        """
        定义参数文件命名方式:
            YYYYMMDD_hhmmss-loss-val_loss-acc-val_acc-stock_symbol-end_date.h5
            loss:训练误差,val loss:验证误差,acc:准确率,val acc:验证准确率,stock:代码:end date:训练数据截止日期

        训练流程:
            1.从save model path中查找权重文件,有则解析文件名,加载最新,无则直接【全量训练】
            2.从最新文件名,获得end_date,根据已有数据的latest date,计算出还需要预测几个window
            3.加载最新权重,训练之后预测1个window,写入文件或return
            4.直到预测到latest date为止,保存权重。
            5.预测一定是step by step的,为了避免信息泄露,确保时序因果性
        """

        # Look up already-saved weight files matching the stock code idx.
        model_para_path = search_file(config['training']['save_model_path'],
                                      idx)

        # Weight files exist: parse their names and pick the newest.
        if len(model_para_path) > 0:
            try:
                parser_list = [
                    parse_filename(filename=filename)
                    for filename in model_para_path
                ]
            except Exception as e:
                print(e)
            parser_list = [s for s in parser_list if s is not None]

            # Find the most recently trained weights.
            tmp = arrow.get(0)
            for d in parser_list:
                if tmp < d['end_date']:
                    tmp = d['end_date']
                    latest_file = d
            # Locate the newest weight file on disk by its timestamp.
            timestamps = latest_file['train_date'].format('YYYYMMDD_HHmmss')
            latest_date = latest_file['end_date'].date()
            latest_file = search_file(config['training']['save_model_path'],
                                      timestamps)[0]
        else:
            latest_date = total_train_daterange[-1]
            latest_file = None

        # Trim the head of step_by_step_train_daterange based on latest_file.
        if latest_file is not None:
            latest_idx = np.where(step_by_step_train_daterange <= latest_date)
            # latest_date may be exactly the day before the range starts.
            if latest_idx[0].shape[0] > 0:
                step_by_step_train_daterange = step_by_step_train_daterange[
                    latest_idx[0][-1] + 1:]

        # Trim the tail of step_by_step_train_daterange at the deadline.
        if forecasting_deadline is not None:
            step_by_step_end_date = arrow.get(forecasting_deadline,
                                              'YYYYMMDD').date()
            temp_idx = np.where(
                step_by_step_train_daterange <= step_by_step_end_date)
            step_by_step_train_daterange = step_by_step_train_daterange[:
                                                                        temp_idx[
                                                                            0]
                                                                        [-1]]
        # Model definition and training: a (mandatory) full training pass
        # saves weights first; incremental training follows as needed.
        '''
            模型定义与训练
            全量训练(必须)之后,保存权重,然后根据需要进行增量训练。
        '''
        stock_name = idx
        model = LSTM_Model(config, name=stock_name)

        # Input/output dimensions.
        input_shape = (config['preprocess']['window_len'], x.shape[-1])
        output_shape = (config['preprocess']['predict_len'], )
        batch_size = config['training']['batch_size']

        # Build the model from the input/output dimensions.
        model.build_model(
            input_shape=input_shape,
            output_shape=output_shape,
        )

        # Results DataFrame columns: predicted values plus training MSE
        # and accuracy metrics.
        col_names = []
        for i in range(config['preprocess']['predict_len']):
            col_name = 'pred_' + str(i)
            col_names.append(col_name)
        col_names = ['predict_date'] + col_names + [
            'epoch_loss', 'epoch_val_loss', 'epoch_acc', 'epoch_val_acc'
        ]

        # Load an existing results DataFrame for this stock, or start fresh.
        results_path = search_file(config['prediction']['save_result_path'],
                                   idx)
        if len(results_path) == 0:
            results_df = pd.DataFrame(columns=col_names, )
        else:
            results_df = pd.read_csv(results_path[0])
        # Full (from-scratch) training when no saved weights exist.
        """
        # 全量训练,改为使用普通方法训练,节省时间
        """
        if latest_file is None:
            # Generate training data.
            X, Y = data_pro.get_window_data(
                total_x_train,
                total_y_train,
                date_price_index,
                total_train_daterange,
            )

            # (Disabled) sampling validation data from the training data
            # weakens validation; sampling from the future breaks the
            # causal ordering of time-series data.
            # val_idx = random.sample(range(len(X)), int(config['preprocess']['validation_pct'] * len(X)))
            # val_X = np.array([X[v_i] for v_i in val_idx])
            # val_Y = np.array([Y[v_i] for v_i in val_idx])

            # Validation data comes from not-yet-trained samples: the last
            # predict_len points covered by the final prediction window.
            val_X = []
            val_Y = []
            val_idx_start = date_price_index['idx'].loc[
                total_train_daterange[-1]] + 1
            for i in range(config['preprocess']['predict_len']):
                val_X_i, val_Y_i = data_pro.get_window_data(
                    total_x_train,
                    total_y_train,
                    date_price_index,
                    single_window=date_price_index['date'].iloc[val_idx_start +
                                                                i],
                )
                val_X.append(val_X_i)
                val_Y.append(val_Y_i)

            val_X = np.array(val_X)
            val_Y = np.array(val_Y)

            # Train and record the loss/accuracy metrics.
            epoch_loss, epoch_val_loss, epoch_acc, epoch_val_acc = \
                model.train_model(  X, Y,
                                    val_x=val_X,
                                    val_y=val_Y,
                                    save_model=True,
                                    end_date=arrow.get(latest_date).format('YYYYMMDD'))
            # Predict one step ahead.
            pred_x, _ = data_pro.get_window_data(
                total_x_train,
                total_y_train,
                date_price_index,
                single_window=total_train_daterange[-batch_size])
            result = model.predict_one_step(pred_x, )

            row_data = [step_by_step_train_daterange[0]] + list(
                result.reshape((-1, ))) + [
                    epoch_loss, epoch_val_loss, epoch_acc, epoch_val_acc
                ]
            # Append this prediction to the results.
            results_df = add_to_df(results_df, col_names, row_data)
        else:
            # Resume: load the existing weights.
            model.load_model_weight(latest_file)
        # Step-by-step incremental training with one prediction per step.
        """
        # 按步全量训练,并预测
        """
        for date_step in step_by_step_train_daterange:
            try:
                # Skip a step whose prediction is already in the results.
                recent_date = results_df['predict_date'].iloc[-1]
                if arrow.get(recent_date, 'YYYY-MM-DD').date() == date_step:
                    continue
            except Exception as e:
                pass
            # Generate training data for all dates before date_step.
            temp_idx = np.where(date_price_index.index.values <= date_step)
            current_step = date_price_index.index.values[:temp_idx[0][-1]]
            X, Y = data_pro.get_window_data(
                total_x_train,
                total_y_train,
                date_price_index,
                current_step,
            )

            # (Disabled) sample validation data from the training data.
            # val_idx = random.sample(range(len(X)), int(config['preprocess']['validation_pct'] * len(X)))
            # val_X = np.array([X[v_i] for v_i in val_idx])
            # val_Y = np.array([Y[v_i] for v_i in val_idx])

            # Validation data comes from not-yet-trained samples: the last
            # predict_len points covered by the final prediction window.
            val_X = []
            val_Y = []
            val_idx_start = date_price_index['idx'].loc[date_step] + 1
            for i in range(config['preprocess']['predict_len']):

                val_X_i, val_Y_i = data_pro.get_window_data(
                    total_x_train,
                    total_y_train,
                    date_price_index,
                    single_window=date_price_index['date'].iloc[val_idx_start +
                                                                i],
                )

                if val_Y_i is None:
                    # On the final round the future validation labels are
                    # not available yet; pad with zeros.
                    val_Y_i = [0, 0, 0, 0, 0]
                val_X.append(val_X_i)
                val_Y.append(val_Y_i)

            val_X = np.array(val_X)
            val_Y = np.array(val_Y)

            # Save the weights on the final training round only.
            if date_step == step_by_step_train_daterange[-1]:
                save_model_value = True
            else:
                save_model_value = False

            # Train and record the loss/accuracy metrics.
            epoch_loss, epoch_val_loss, epoch_acc, epoch_val_acc = \
                model.train_model(  X, Y,
                                    val_x=val_X,
                                    val_y=val_Y,
                                    save_model=save_model_value,
                                    end_date=arrow.get(date_step).format('YYYYMMDD'))

            # Predict one step ahead.  NOTE(review): the original author
            # questioned whether this window really is the last window —
            # worth verifying against get_window_data's contract.
            pred_x, _ = data_pro.get_window_data(
                total_x_train,
                total_y_train,
                date_price_index,
                single_window=current_step[-batch_size])
            result = model.predict_one_step(pred_x, )
            temp_idx = np.where(date_price_index.index.values <= date_step)
            current_date = date_price_index.index.values[temp_idx[0][-1]]
            row_data = [current_date] + list(result.reshape((-1, ))) + [
                epoch_loss, epoch_val_loss, epoch_acc, epoch_val_acc
            ]
            # Append this prediction to the results.
            results_df = add_to_df(results_df, col_names, row_data)

            # Save the results roughly once per trading year (250 steps).
            training_idx = np.where(
                step_by_step_train_daterange <= current_date)
            if training_idx[0][-1] % 250 == 249:
                now = arrow.now().format('YYYYMMDD_HHmmss')
                save_path = os.path.join(
                    config['prediction']['save_result_path'], now + '-' + idx +
                    '-' + current_date.strftime('%Y%m%d') + '.csv')
                results_df.to_csv(save_path)

            print('[Predict] Prediction of %s is saved to file.' %
                  current_date.strftime("%Y%m%d"))

            # Final round: weights were saved above; persist the results too.
            if save_model_value:
                now = arrow.now().format('YYYYMMDD_HHmmss')
                save_path = os.path.join(
                    config['prediction']['save_result_path'], now + '-' + idx +
                    '-' + step_by_step_train_daterange[-1].strftime('%Y%m%d') +
                    '.csv')
                results_df.to_csv(save_path)

        # Visualization (not fully implemented yet, per original note).
        if config['visualization']['draw_graph']:
            # Prediction length = label length * number of prediction steps.
            predict_len = output_shape[0] * config['prediction'][
                'predict_steps']
            assert predict_len <= date_range_dict['predict'].shape[0]
            predict_data = data_pro.predict_data_x(
                x, date_price_index, date_range_dict['predict'][:predict_len])
            results = model.predict_future(predict_data)

            real_results = data_pro.cal_daily_price(
                date_price_index, date_range_dict['train'][-1], results)
            data_vis = DataVisualiser(config, name=stock_name)
            data_vis.plot_prediction(date_range_dict=date_range_dict,
                                     prediction=real_results,
                                     date_price_index=date_price_index)

        # Drop junk 'Unnamed' columns picked up from CSV round-trips.
        try:
            results_df = results_df.drop(columns=[
                x for x in results_df.columns if x.startswith('Unnamed')
            ])
        except Exception as e:
            pass

        predict_results_dict[idx] = results_df

    return predict_results_dict
Beispiel #7
0
    def download_stock(self, download_mode: str, start_date: str, date_col):
        """
        Refresh every record in the stock pool up to the current date.

        'total' mode re-downloads the full daily history of every stock;
        'additional' mode appends only the rows newer than what is
        already stored on disk, falling back to a full download when no
        local file exists.
        """
        self.download_mode = download_mode
        self.start_date = start_date
        self.date_col = date_col

        # Parse "code (symbol)" pairs from the stock-list file, stopping
        # at the first blank line.
        stock_dict = {}
        with open(self.stock_list_file, encoding='UTF-8') as f:
            for line in f:
                if line == '\n':
                    break
                fields = list(line.rstrip('\n').split())
                stock_dict[fields[0]] = fields[1][1:-1]

        if self.download_mode == 'total':
            # Full download for every stock in the pool.
            for code, symbol in stock_dict.items():
                dd = DailyDownloader(start_date=self.start_date,
                                     end_date=self.current_date,
                                     stock_code=str(symbol),
                                     save_dir=self.data_path)
                daily_data = dd.downloadDaily(save=True)
                print('Complete %s %s total downloading from %s to %s.' %
                      (code, symbol, self.start_date, self.current_date))

        elif self.download_mode == 'additional':
            # Incremental download: fetch only the missing tail, which is
            # faster than re-downloading everything.
            for code, symbol in stock_dict.items():
                try:
                    matches = search_file(self.data_path, symbol)
                    if len(matches) == 0:
                        # No local file for this stock yet: full download.
                        dd = DailyDownloader(start_date=self.start_date,
                                             end_date=self.current_date,
                                             stock_code=str(symbol),
                                             save_dir=self.data_path)
                        daily_data = dd.downloadDaily(save=True)
                        print(
                            'Complete %s %s total downloading from %s to %s.' %
                            (code, symbol, self.start_date, self.current_date))
                    else:
                        old_data = pd.read_csv(matches[0])
                        last_recorded = str(old_data[self.date_col].iloc[-1])
                        next_day = arrow.get(
                            last_recorded,
                            'YYYYMMDD').shift(days=1).format('YYYYMMDD')
                        if last_recorded != self.current_date:
                            dd = DailyDownloader(
                                start_date=next_day,
                                end_date=self.current_date,
                                stock_code=str(symbol),
                            )
                            additional_data = dd.downloadDaily(save=False)
                            # Append the new rows to the existing CSV.
                            additional_data.to_csv(matches[0],
                                                   mode='a+',
                                                   header=False,
                                                   index=True)
                            print(
                                'Complete %s %s additional downloading from %s to %s.'
                                % (code, symbol, last_recorded,
                                   self.current_date))
                        else:
                            print('Stock data %s %s is up to date.' %
                                  (code, symbol))
                except Exception as e:
                    print(e)