def main(model_name, algo, testRange, isTargetPositionFixed, isDiscrete):
    panda_env = PandaGraspGymEnv(urdfRoot=object_data.getDataPath(), isRendering=True, useIK=True, isDiscrete=isDiscrete,
                                 numControlledJoints=7, isTargetPositionFixed=isTargetPositionFixed)
    env = DummyVecEnv([lambda: panda_env])

    if algo == "DDPG":
        model = DDPG.load(model_name)
    else:
        model = DQN.load(model_name)
    obs = env.reset()

    images = []
    img = env.get_images()

    for i in range(testRange):
        images.append(img)
        action, _states = model.predict(obs, deterministic=True)
        print("Step: {} Action: {}".format(i, action))
        obs, rewards, done, info = env.step(action)
        env.render(mode='human')
        img = env.get_images()

    os.makedirs(gif_dir, exist_ok=True)
    imageio.mimsave(gif_dir + model_name + '.gif', [np.array(img[0]) for i, img in enumerate(images) if i % 2 == 0], fps=29)
Example #2
def test2():
    import gym
    import datetime as dt
    import matplotlib.pyplot as plt

    from stable_baselines.common.policies import MlpPolicy, CnnPolicy, MlpLstmPolicy, ActorCriticPolicy, LstmPolicy
    from stable_baselines.common.vec_env import DummyVecEnv
    from stable_baselines.common.evaluation import evaluate_policy
    from stable_baselines import PPO2, PPO1, A2C, DQN, TD3, SAC

    import pandas as pd

    from lutils.stock import LTdxHq

    ltdxhq = LTdxHq()
    df = ltdxhq.get_k_data_1min('603636') # 000032 300142 603636 
    # df = ltdxhq.get_k_data_5min('603636')
    # df = ltdxhq.get_k_data_daily('603636')
    ltdxhq.close()

    df = StockDataFrame(df) # .rename(columns={'vol': 'volume'}))

    # df = df.rename(columns={'open': 'Open', 'close': 'Close', 'high': 'High', 'low': 'Low', 'vol': 'Volume'})
    df.index = pd.to_datetime(df.index)
    df1 = df[:-240]
    df2 = df[-240:]

    # The algorithms require a vectorized environment to run
    env = DummyVecEnv([lambda: LStockDailyEnv(df1)])
    eval_env = DummyVecEnv([lambda: LStockDailyEnv(df2)])

    # policy_kwargs = dict(net_arch=[64, 'lstm', dict(vf=[128, 128, 128], pi=[64, 64])])
    policy_kwargs = dict(net_arch=[128, 'lstm', dict(vf=[256, 256], pi=[256, 256])])
    
    model = A2C('MlpLstmPolicy', env, verbose=1, policy_kwargs=policy_kwargs)
    model.learn(total_timesteps=20000)

    # episode_rewards, _  = evaluate_policy(model, eval_env, n_eval_episodes=1, render=True, return_episode_rewards=True) # EVAL_EPS
    # print(mean_reward)

    is_recurrent = model.policy.recurrent
    obs = eval_env.reset()
    # if is_recurrent:
    #     zero_completed_obs = np.zeros((model.n_envs,) + model.observation_space.shape)
    #     zero_completed_obs[0, :] = obs
    #     obs = zero_completed_obs

    net_worths = []
    done, state = False, None
    while not done:
        action, state = model.predict(obs, state=state, deterministic=True)
        obs, reward, done, _info = eval_env.step(action)
        net_worths.append(_info[0]['net_worth'])
        # if is_recurrent:
        #     obs[0, :] = new_obs
        # else:
        #     obs = new_obs
        eval_env.render()

    plt.plot(net_worths)
    plt.show()
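The commented-out zero_completed_obs lines above point at a stable-baselines quirk: a recurrent policy trained with several parallel environments expects a batch of model.n_envs observations at prediction time, even when evaluating on a single environment. A minimal sketch of that padding pattern, assuming a recurrent model and a single-env eval_env like the one created above:

import numpy as np

# Place the single evaluation observation in row 0 of a zero-padded batch of
# model.n_envs observations; the remaining rows stay zero.
obs = eval_env.reset()
zero_completed_obs = np.zeros((model.n_envs,) + model.observation_space.shape)
zero_completed_obs[0, :] = obs[0]

state, done = None, False
while not done:
    action, state = model.predict(zero_completed_obs, state=state, deterministic=True)
    new_obs, reward, done, _info = eval_env.step([action[0]])
    zero_completed_obs[0, :] = new_obs[0]  # keep feeding the real observation through row 0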
Example #3
            if discreteActionsSpace:
                #this needs fixing
                sampledAction, logProbSampledAction, logProbsAll = policy.getSampledActions(
                    obs[l])
                additionalInfos = [logProbsAll]
            else:
                sampledAction, logProbSampledAction, actionsMean, actionLogStd = sess.run(
                    [
                        actionFinalOp, sampledLogProbsOp, actionMeanOp,
                        actionLogStdOp
                    ],
                    feed_dict={obsPh: np.expand_dims(obs[l], 0)})
                additionalInfos[0][l] = actionsMean
                additionalInfos[1][l] = actionLogStd
            nextObss, rews, nextDones, infoss = env.step(sampledAction)
            nextObs, rewards[l], nextDone, infos = nextObss[0], rews[
                0], nextDones[0], infoss[0]
            sampledLogProb[l] = logProbSampledAction[0]

            if dones[l]:

                summaryRet, summaryLen = sess.run([epTotalRewSum, epLenSum],
                                                  feed_dict={
                                                      epTotalRewPh: epTotalRew,
                                                      epLenPh: epLen
                                                  })
                globalStep = e * args.epoch_len + l
                writer.add_summary(summaryRet, globalStep)
                writer.add_summary(summaryLen, globalStep)
                epTotalTrainRews.append(epTotalRew)
Example #4
import gym

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2

# Create the environment
env = gym.make('CartPole-v1')
env = DummyVecEnv([lambda: env])

# Create the model
model = PPO2(MlpPolicy, env, verbose=1)

# Train the model
model.learn(total_timesteps=10000)

# Save the model
model.save('sample')

# Delete the model
del model

# Load the model
model = PPO2.load('sample')

# Test the model
state = env.reset()
for i in range(200):
    env.render()
    action, _ = model.predict(state)
    state, rewards, done, info = env.step(action)
    if done:
        break
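For a quantitative check after reloading, stable-baselines also provides evaluate_policy; a minimal sketch using the model and env from the snippet above:

from stable_baselines.common.evaluation import evaluate_policy

# Average episode reward over a handful of evaluation episodes
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
print('mean reward: {:.2f} +/- {:.2f}'.format(mean_reward, std_reward))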
Example #5
    'n_steps': int(params['n_steps']),
    'gamma': params['gamma'],
    'learning_rate': params['learning_rate'],
    'ent_coef': params['ent_coef'],
    'cliprange': params['cliprange'],
    'noptepochs': int(params['noptepochs']),
    'lam': params['lam'],
}

if curr_idx == -1:
    model = PPO2(MlpLnLstmPolicy, train_env, verbose=0, nminibatches=1,
            tensorboard_log=Path("./tensorboard").name, **model_params)
else:
    model = PPO2.load('./agents/ppo2_' + reward_strategy + '_' + str(curr_idx) + '.pkl', env=train_env)

for idx in range(curr_idx + 1, 10):
    print('[', idx, '] Training for: ', train_len, ' time steps')

    model.learn(total_timesteps=train_len)

    obs = test_env.reset()
    done, reward_sum = False, 0

    while not done:
        action, _states = model.predict(obs)
        obs, reward, done, info = test_env.step(action)
        reward_sum += reward

    print('[', idx, '] Total reward: ', reward_sum, ' (' + reward_strategy + ')')
    model.save('./agents/ppo2_' + reward_strategy + '_' + str(idx) + '.pkl')
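The model_params dict at the top of this example is usually the output of a hyperparameter search; a hedged sketch of how such a dict might be sampled with optuna (the function name and value ranges are illustrative, not taken from the original project; suggest_loguniform/suggest_uniform are the older optuna API matching this era of code):

import optuna

def sample_ppo2_params(trial):
    # Same keys as the params dict above; the search ranges are illustrative only.
    return {
        'n_steps': int(trial.suggest_loguniform('n_steps', 16, 2048)),
        'gamma': trial.suggest_loguniform('gamma', 0.9, 0.9999),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-2),
        'ent_coef': trial.suggest_loguniform('ent_coef', 1e-8, 1e-1),
        'cliprange': trial.suggest_uniform('cliprange', 0.1, 0.4),
        'noptepochs': int(trial.suggest_loguniform('noptepochs', 1, 48)),
        'lam': trial.suggest_uniform('lam', 0.8, 1.0),
    }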
def stock_trade_US(stock_file_train, no_of_test_trading_days):
    df_train = pd.read_csv(stock_file_train)
    # df_train = df_train.sort_values('date')

    # The algorithms require a vectorized environment to run
    env_train = DummyVecEnv([lambda: StockTradingEnv_US(df_train)])

    total_timesteps = int(4e4)
    # total_timesteps = int(1e5)

    model = PPO2('MlpPolicy',
                 env_train,
                 verbose=0,
                 tensorboard_log='./log',
                 seed=12345).learn(total_timesteps=total_timesteps)

    # Random Agent, after training
    # mean_reward, std_reward = evaluate_policy(model, env_train, n_eval_episodes=100)
    # print(f"after training, mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

    # -----------------Test Model --------------------------------------

    import sys
    sys.stdout = open(
        f'./output/output_SPY_{total_timesteps}_days_{no_of_test_trading_days}.txt',
        'wt')

    day_profits = []
    buy_hold_profit = []

    df_test_raw = pd.read_csv(stock_file_train.replace('train', 'test'))
    #start from random day
    # df_test = df_test_raw.iloc[200:].reset_index(drop=True)
    df_test = df_test_raw

    df_test = df_test.drop(['Adj Close'], axis=1)

    env_test = DummyVecEnv([lambda: StockTradingEnv_US(df_test)])
    obs = env_test.reset()
    no_of_shares = 0
    buy_hold_commission = 0
    for n in range(len(df_test) - 1):
        if n > no_of_test_trading_days:
            break

        action, _states = model.predict(obs)

        # let agent start with a buy all
        # if n == 0:
        #     action[0][0] = 0
        #     action[0][1] = 1

        obs, rewards, done, info = env_test.step(action)
        profit = env_test.render()
        day_profits.append(profit)

        if n == 0:
            buy_hold_profit.append(0)
            no_of_shares = INITIAL_ACCOUNT_BALANCE // df_test.iloc[0]['Close']
            buy_hold_commission = no_of_shares * df_test.iloc[0][
                'Close'] * 0.001
            print('Buy ' + str(no_of_shares) + ' shares and hold')
        else:
            buy_hold_profit_per_step = no_of_shares * (
                df_test.iloc[n]['Close'] -
                df_test.iloc[0]['Close']) - buy_hold_commission
            buy_hold_profit.append(buy_hold_profit_per_step)
            print('Buy and Hold: ' + '*' * 40)
            print('No of shares: ' + str(no_of_shares) +
                  ' average cost per share ' + str(df_test.iloc[0]['Close']))
            print('profit is ' + str(buy_hold_profit_per_step))

        if done:
            break

    good_model = False
    if day_profits[-1] > buy_hold_profit[-1]:
        good_model = True

    return day_profits, buy_hold_profit, good_model, model, total_timesteps
Example #7
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy, CnnLnLstmPolicy, CnnPolicy, CnnLstmPolicy
from stable_baselines.common.vec_env import SubprocVecEnv, DummyVecEnv
from stable_baselines import PPO2, A2C
from sonic_util import make_env
from gym.wrappers import Monitor


env = DummyVecEnv([lambda: make_env(level_name='LabyrinthZone.Act1', \
                stack=False, scale_rew=True)])

modelname = 'sonicppo'
model = PPO2(CnnPolicy, env, n_steps=4500, verbose=1)
model.load("./checkpoint" + modelname)

obs = env.reset()
done = False
reward = 0

while not done:
    actions, _ = model.predict(obs)
    obs, rew, done, info = env.step(actions)
    reward += rew
    env.render()
env.close()
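The Monitor wrapper imported above is never used in this snippet; a minimal sketch of how it is commonly applied to record evaluation videos (the output directory is illustrative, and whether recording actually works depends on the underlying env's rgb_array rendering):

from gym.wrappers import Monitor

# Wrap the raw (un-vectorized) env before handing it to DummyVecEnv.
raw_env = make_env(level_name='LabyrinthZone.Act1', stack=False, scale_rew=True)
raw_env = Monitor(raw_env, './videos', force=True)
env = DummyVecEnv([lambda: raw_env])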
Example #8
    def optimize_params(self,
                        trial,
                        n_prune_evals_per_trial: int = 2,
                        n_tests_per_eval: int = 1):
        train_provider, test_provider = self.data_provider.split_data_train_test(
            self.train_split_percentage)
        train_provider, validation_provider = train_provider.split_data_train_test(
            self.train_split_percentage)

        del test_provider

        train_env = DummyVecEnv([lambda: TradingEnv(train_provider)])
        validation_env = DummyVecEnv([lambda: TradingEnv(validation_provider)])

        model_params = self.optimize_agent_params(trial)
        model = self.Model(self.Policy,
                           train_env,
                           verbose=self.model_verbose,
                           nminibatches=1,
                           tensorboard_log=self.tensorboard_path,
                           **model_params)

        last_reward = -np.finfo(np.float16).max
        n_steps_per_eval = int(
            len(train_provider.data_frame) / n_prune_evals_per_trial)

        for eval_idx in range(n_prune_evals_per_trial):
            try:
                model.learn(n_steps_per_eval)
            except AssertionError:
                raise

            rewards = []
            n_episodes, reward_sum = 0, 0.0

            trades = train_env.get_attr('trades')

            if len(trades[0]) < 1:
                self.logger.info(
                    f'Pruning trial for not making any trades: {eval_idx}')
                raise optuna.structs.TrialPruned()

            state = None
            obs = validation_env.reset()
            while n_episodes < n_tests_per_eval:
                action, state = model.predict(obs, state=state)
                obs, reward, done, _ = validation_env.step([action])

                reward_sum += reward[0]

                if all(done):
                    rewards.append(reward_sum)
                    reward_sum = 0.0
                    n_episodes += 1
                    obs = validation_env.reset()

            last_reward = np.mean(rewards)
            trial.report(-1 * last_reward, eval_idx)

            if trial.should_prune(eval_idx):
                raise optuna.structs.TrialPruned()

        return -1 * last_reward
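A method like optimize_params above is normally driven by an optuna study; a hedged sketch of that driver (the optimizer instance and study name are assumptions, not part of the original):

import optuna

# optimize_params returns the negated mean validation reward, so the study minimizes it.
study = optuna.create_study(study_name='trading_params', pruner=optuna.pruners.MedianPruner())
study.optimize(optimizer.optimize_params, n_trials=20)
print(study.best_params)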
    def write_to_scheduler(self, action):
        print (action)


    def save_model(self):
        model.save("ppo2_model") 



#model = PPO2(policy="MlpPolicy", tensorboard_log="./ppo2_tensorborad/",env=env,learning_rate=0.00025, lam=0.8, n_steps=30, nminibatches=1)
#model = DQN(policy="MlpPolicy", tensorboard_log="./dqn_tensorborad2/", batch_size=1, gamma=0.1, exploration_fraction=0.1, env=env)
#model.learn(total_timesteps=int(1e+4), seed=0)


env = DummyVecEnv([lambda: EnviromentExample()])
model = PPO2.load("ppo2_model.pkl")
obs = env.reset()

while True:
    action, _ = model.predict(obs)

    obs, reward, done, _ = env.step(action)
    if done:
        break


del model, env



Example #10
    def run_test(self,
                 model,
                 validation=True,
                 finetune=False,
                 out_file=False,
                 verbose=True):
        """
        Validates a trained model.
        
        Args:
            model (stable_baselines model): Model to be tested.
            validation(bool): Whether or not the model is to be validated on 
                validation data. Defaults to True.
            finetune(bool): Whether or not the model is to be tested on 
                external dataset. Defaults to False.
            out_file(bool): Whether or not to write model stats to output file.
                Defaults to False.
            verbose(bool): Whether or not to print model stats.
                Defaults to True.
            
        Returns:
            Mean of the total reward of the model.
        """
        f = None
        if out_file:
            f = open(f"stories/{self.timestamp}-{self.mode}-BTC.csv", "w+")
            # a CSV header row matching write_to_story's columns would be written here
        env = None

        if not finetune:
            if validation:
                env = DummyVecEnv([
                    lambda: SimulatedEnv(self.val_ds, self.initial_invest, self
                                         .mode)
                ])
            else:
                env = DummyVecEnv([
                    lambda: SimulatedEnv(self.test_ds, self.initial_invest,
                                         self.mode)
                ])
        else:
            data = historical_yahoo("NKE")
            self.logger.debug('Downloaded Data from yahoo finance.')

            _, test_data = train_val_test_split_finetune(data)
            env = DummyVecEnv([
                lambda: SimulatedEnv(test_data, self.initial_invest, self.mode)
            ])
            self.logger.debug('Downloaded Data from yahoo finance.')

        total_reward = []

        for e in range(self.test_episodes):
            # Reset the environment at every episode.
            state = env.reset()
            # Initialize variable to get reward stats.
            for _ in range(0, 180):
                action, _states = model.predict(state)
                next_state, reward, done, info = env.step(action)

                if out_file:
                    self.write_to_story(f, action[0], info[0])

                total_reward.append(reward)
                state = next_state

                if done:
                    if info[0]['cur_val'] < self.initial_invest:
                        self.losses = self.losses + 1
                    if verbose:
                        self.print_stats(e, info)
                    if out_file:
                        f.write("-1,-1,-1,-1,-1,-1\n")
                    break
        self.losses = 0
        return np.mean(total_reward)
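A hedged sketch of how run_test is typically driven (the trainer object holding this method is an assumption; only the keyword arguments documented above are used):

# Validate first, then test on held-out data while writing the story file.
val_reward = trainer.run_test(model, validation=True, out_file=False, verbose=True)
test_reward = trainer.run_test(model, validation=False, out_file=True, verbose=True)
print('validation reward: {:.4f}, test reward: {:.4f}'.format(val_reward, test_reward))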
Example #11
print('Training Started...')
print('-'*100)
state = obs

for frame_idx in range(1, num_frames + 1):
    epsilon = epsilon_by_frame(frame_idx)
    state = torch.FloatTensor(state).to(device)
    act,w_act = policy_net_act.act(state, epsilon)
    forecast.append(act)
    state_order = torch.cat([state,torch.unsqueeze(act,0)],1).to(device)
    order,dec,w_ord = policy_net_order.act(state_order, epsilon)
    action_list.append(dec)
    action = np.array([dec,order])
    action = np.expand_dims(action,axis=1)
    action = action.T
    next_state,reward,done,_ = env.step(action)
    bal, s_held, s_sold, cost, sales, net, prof = env.render()
        
    replay_buffer.push(state,state_order,act,dec,reward,next_state,done)
    
    frame_idx += 1
    state = next_state
    
    if len(replay_buffer) > replay_initial:
        ord_l, act_l, TD_Loss = compute_td_loss(batch_size)
    
    if (frame_idx%1000)==0:
        weights_act.append(w_act);weights_ord.append(w_ord)
        print('Step-', str(frame_idx), '/', str(num_frames), '| Profit-', prof,'| Model Loss-', ord_l)
        torch.save({'model_state_dict': policy_net_act.state_dict(), 'optimizer_state_dict': optimizer_order.state_dict(), 'loss': TD_Loss},checkpoint_name+'/policy_net_act.pth.tar') #save PolicyNet
        torch.save({'model_state_dict': policy_net_order.state_dict(), 'optimizer_state_dict': optimizer_order.state_dict(), 'loss': TD_Loss},checkpoint_name+'/policy_net_order.pth.tar') #save PolicyNet
def main():

    args = get_args()
    choose_device(args.device)
    set_global_seeds(args.seed)

    env_id = args.env
    exp_id = args.exp_id
    algo = args.algo
    env_name = env_id[:-3]
    env_index = env_list.index(env_id)

    # Pass CustomEnv arguments: follow this for your CustomEnv if reward not known prior to training
    env_kwargs = {} if args.env_kwargs is None else args.env_kwargs
    if (args.env_kwargs is not None) and (env_id in ['AirSim-v0']):
        if 'rew_land' in env_kwargs:
            if (int(env_kwargs['rew_land']) in [500, 1000, 10000]):
                env_success[-1] = int(env_kwargs['rew_land'])
            else:
                raise ValueError(
                    'Given env reward not acceptable. Please try again')

    params = [exp_id, env_name.lower()]
    folder = [exp_id, env_name.lower(), args.algo.lower()]
    tensorboard_path, monitor_path, callback_path = None, None, None

    if args.tensorboard:
        tensorboard_path = "tensorboard/{}_{}".format(*params)
        make_dir(tensorboard_path)

    # if args.train_RL: # Begin training here (location of this condition also decides experiment performance)

    # Load hyperparameters from yaml file
    with open('hyperparams/{}.yml'.format(args.algo), 'r') as f:
        hyperparams_dict = yaml.safe_load(f)
        if env_id in list(hyperparams_dict.keys()):
            hyperparams = hyperparams_dict[env_id]
        else:
            raise ValueError("Hyperparameters not found for {}-{}".format(
                args.algo, env_id))

    if args.hyperparams is not None:
        # Overwrite hyperparams if needed
        hyperparams.update(args.hyperparams)

    # OPTIONAL: Print saved hyperparams
    saved_hyperparams = OrderedDict([(key, hyperparams[key])
                                     for key in sorted(hyperparams.keys())])
    if args.verbose > 0:
        pprint(saved_hyperparams)

    if args.n_envs > 1:
        # if args.verbose:
        print("Overwriting n_envs with n={}".format(args.n_envs))
        n_envs = args.n_envs
    else:
        n_envs = hyperparams.get('n_envs', 1)

    # choose Monitor log path according to multiprocessing setting
    if args.monitor:
        if n_envs == 1:
            monitor_path = 'logs/single/{}_{}_{}'.format(*folder)
        else:
            if algo not in ['dqn', 'her', 'sac', 'td3']:
                monitor_path = 'logs/multi/{}_{}_{}'.format(*folder)
        make_dir(monitor_path)

    if int(float(args.timesteps_RL)) > 0:
        # if args.verbose:
        print("Overwriting n_timesteps with n={}".format(
            int(float(args.timesteps_RL))))
        n_timesteps = int(float(args.timesteps_RL))
    else:
        n_timesteps = int(hyperparams['n_timesteps'])

    # Convert to python object if needed
    if 'policy_kwargs' in hyperparams.keys() and isinstance(
            hyperparams['policy_kwargs'], str):
        hyperparams['policy_kwargs'] = eval(hyperparams['policy_kwargs'])

    if 'n_envs' in hyperparams.keys():
        del hyperparams['n_envs']
    del hyperparams['n_timesteps']  #To avoid error

    env_wrapper = get_wrapper_class(hyperparams)
    if 'env_wrapper' in hyperparams.keys():
        del hyperparams['env_wrapper']

    # if (algo=='ppo2' and ('learning_rate' in hyperparams.keys())):
    #     hyperparams['learning_rate'] = linear_schedule(hyperparams['learning_rate'])

    def create_env(n_envs, eval_env=False):
        if algo in ['a2c', 'acer', 'acktr', 'ppo2']:
            if n_envs > 1:
                env = SubprocVecEnv([
                    make_env(env_id,
                             i,
                             args.seed,
                             log_dir=monitor_path,
                             wrapper_class=env_wrapper,
                             env_kwargs=env_kwargs) for i in range(n_envs)
                ])
            else:
                env = DummyVecEnv([
                    make_env(env_id,
                             0,
                             args.seed,
                             log_dir=monitor_path,
                             wrapper_class=env_wrapper,
                             env_kwargs=env_kwargs)
                ])
            env = DummyVecEnv([lambda: gym.make(env_id, **env_kwargs)])
            if env_wrapper is not None:
                env = env_wrapper(env)
        elif ((algo in ['dqn', 'her', 'sac', 'td3']) and n_envs > 1):
            raise ValueError(
                "Error: {} does not support multiprocessing!".format(algo))
        elif ((algo in ['ddpg', 'ppo1', 'trpo', 'gail']) and n_envs > 1):
            raise ValueError(
                "Error: {} uses MPI for multiprocessing!".format(algo))
        else:
            env = make_vec_env(env_id,
                               n_envs=n_envs,
                               seed=args.seed,
                               monitor_dir=monitor_path,
                               wrapper_class=env_wrapper,
                               env_kwargs=env_kwargs)

        if args.normalize:  # choose from multiple options
            # env = VecNormalize(env, clip_obs=np.inf)
            env = VecNormalize(env, norm_reward=False, clip_obs=np.inf)
            # env = VecNormalize(env, norm_reward=False, clip_obs=np.inf, **normalize_kwargs)
        return env

    # Zoo: env = SubprocVecEnv([make_env(env_id, i, seed, log_dir, wrapper_class=env_wrapper, env_kwargs=env_kwargs) for i in range(n_envs)])
    # Zoo: env = DummyVecEnv([make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper, env_kwargs=env_kwargs)])
    env = create_env(n_envs)

    # if args.train_RL: # checking impact of the if-condition position on experiment reproducibility

    callback, callback_path = [], "callbacks/{}_{}_{}".format(*folder)
    save_freq, eval_freq = 100 * episode_len[env_index], 100 * episode_len[
        env_index]
    save_freq, eval_freq = max(save_freq // n_envs,
                               1), max(eval_freq // n_envs, 1)
    make_dir(callback_path)
    if args.check_callback:
        callback.append(
            CheckpointCallback(save_freq=save_freq,
                               save_path=callback_path,
                               name_prefix='rl_model',
                               verbose=1))
    if args.eval_callback:
        callback.append(
            EvalCallback(create_env(1, eval_env=True),
                         best_model_save_path=callback_path,
                         log_path=callback_path,
                         eval_freq=eval_freq,
                         verbose=1))

    model = (algo_list[args.algo])(env=env,
                                   seed=args.seed,
                                   tensorboard_log=tensorboard_path,
                                   n_cpu_tf_sess=1,
                                   verbose=args.verbose,
                                   **hyperparams)
    print('\nTraining {} on {} now... \n'.format(algo, env_id))

    start_time = time.time()
    model.learn(total_timesteps=n_timesteps, callback=callback)
    total_time = time.time() - start_time

    if args.normalize:
        env.save(os.path.join(callback_path, "vec_normalize.pkl"))

    if n_envs > 1 or (algo in ['ddpg', 'trpo', 'gail']):
        print("Took {:.2f}s for multiprocessed version - {:.2f} FPS".format(
            total_time, n_timesteps / total_time))
    else:
        print("Took {:.2f}s for single process version - {:.2f} FPS".format(
            total_time, n_timesteps / total_time))

    env = DummyVecEnv([make_env(env_id, 0, args.seed, env_kwargs=env_kwargs)])

    if args.normalize:
        env = VecNormalize.load(
            os.path.join(callback_path, "vec_normalize.pkl"), env)
        env.training = False
        env.norm_reward = False
        env.seed(args.seed)

    # Evaluate RL model - choose either best model or last available model
    model = (algo_list[algo]).load(os.path.join(callback_path, 'best_model'))
    # model = (algo_list[algo]).load("models/{}_{}_{}".format(*folder))
    model.set_env(env)
    evaluate('policy', model, env_id, env, algo, 100)

    if args.monitor:
        results_plotter.plot_results([monitor_path], n_timesteps,
                                     results_plotter.X_TIMESTEPS,
                                     "{} {}".format(algo, env_id))
        plot_results(monitor_path)

    if args.test:
        print('\nTesting policy...\n')
        obs = env.reset()
        episode_reward, total_reward = 0, 0
        done_count, success_count = 0, 0
        for _ in range(n_timesteps):
            action, _states = model.predict(obs, deterministic=True)
            if isinstance(env.action_space, gym.spaces.Box):
                action = np.clip(action, env.action_space.low,
                                 env.action_space.high)
            obs, rewards, dones, info = env.step(action)
            episode_reward += rewards
            env.render()
            if dones:
                done_count += 1
                success_count = check_success(env_index, env_success,
                                              success_count)
                total_reward += episode_reward
                episode_reward = 0
                env.reset()
        print('\n{}/{} successful episodes'.format(success_count, done_count))
        average_reward = total_reward / done_count
        print('\nAverage reward: {}'.format(average_reward))
        env.close()
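One detail of this script worth isolating is the VecNormalize round trip: the normalization statistics are saved after training and reloaded, frozen, for evaluation. A minimal sketch of that pattern, assuming stable-baselines >= 2.9 and any env factory make_env_fn (a placeholder name):

import numpy as np
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize

# Training: wrap the vec env and keep the running statistics updated.
train_env = VecNormalize(DummyVecEnv([make_env_fn]), norm_reward=False, clip_obs=np.inf)
# ... train the model on train_env ...
train_env.save("vec_normalize.pkl")

# Evaluation: reload the statistics and freeze them.
eval_env = VecNormalize.load("vec_normalize.pkl", DummyVecEnv([make_env_fn]))
eval_env.training = False     # do not update the running mean/std during evaluation
eval_env.norm_reward = False  # report unnormalized rewards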
Example #13
        commands,
        render=False,
        on_rack=False,
    )])
if normalize:
    env = VecNormalize(env,
                       clip_obs=1000.0,
                       clip_reward=1000.0,
                       training=False)
    env.load_running_average(workDirectory + "/resultats/" + name_resume +
                             "/normalizeData")

images = []
obs = env.reset()
img = env.render(mode='rgb_array')
for i in range(15 * 2 * 10):
    images.append(img)
    action, _ = model.predict(obs, deterministic=True)
    obs, _, _, _ = env.step(action)
    img = env.render(mode='rgb_array')
    print("frame " + str(i) + "/" + str(2 * 150))
if args.dir is None:
    imageio.mimsave(
        workDirectory + "/resultats/" + name_resume + "/video/" + name_resume +
        ".gif", [np.array(img) for i, img in enumerate(images) if i % 2 == 0],
        fps=50)
else:
    imageio.mimsave(
        args.dir,
        [np.array(img) for i, img in enumerate(images) if i % 2 == 0],
        fps=50)
Example #14
import gym

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO1

from env import OsmoEnv, NUMCONC

if __name__ == "__main__":
    env = DummyVecEnv([lambda: OsmoEnv()])
    model = PPO1(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=50000)
    model.save("PPO1_baselines")

    for i in range(10):
        observation = env.reset()
        done = False
        while not done:
            action, _ = model.predict(observation)
            observation, _, done, info = env.step(action)
        else:
            print(info)
Example #15
train_env = DummyVecEnv([lambda: TradingEnv(train_df)])
test_env = DummyVecEnv([lambda: TradingEnv(test_df)])

model = PPO2(MlpPolicy, train_env, verbose=1)
model.learn(total_timesteps=TOTAL_TIME_STEPS)
model.save(save_path="./saved_model/ppo_{}_{}.pkl".format(
    asset_name, TOTAL_TIME_STEPS),
           cloudpickle=True)

obs = train_env.reset()

# back testing on training data
done = False
while not done:
    action, _states = model.predict(obs)
    obs, rewards, done, info = train_env.step(action)
    train_env.render(title=name[:-13],
                     mode=DISPLAY_MODE,
                     filename='LB_{}_LF_{}_{}_{}_train.txt'.format(
                         LOOKBACK_WINDOW_SIZE, LOOKFORWARD_WINDOW_SIZE,
                         TOTAL_TIME_STEPS, asset_name))

done = False
model.set_env(test_env)
obs = test_env.reset()

# back testing on testing data
while not done:
    action, _states = model.predict(obs)
    obs, rewards, done, info = test_env.step(action)
    test_env.render(title=name[:-13],
Example #16
def test2():
    import gym
    import datetime as dt
    import matplotlib.pyplot as plt

    from stable_baselines.common.policies import MlpPolicy, CnnPolicy, MlpLstmPolicy, ActorCriticPolicy, LstmPolicy
    from stable_baselines.common.vec_env import DummyVecEnv
    from stable_baselines.common.evaluation import evaluate_policy
    from stable_baselines import PPO2, PPO1, A2C, DQN, TD3, SAC

    import pandas as pd

    from lutils.stock import LTdxHq

    code = '603636'  # 000032 300142 603636 600519
    ltdxhq = LTdxHq()
    # df = ltdxhq.get_k_data_1min('603636') # 000032 300142 603636 600519
    # df = ltdxhq.get_k_data_5min('603636')
    df = ltdxhq.get_k_data_daily(code, end='2020-01-01')
    eval_df = ltdxhq.get_k_data_daily(code, start='2020-01-01')
    ltdxhq.close()

    df = StockDataFrame(df)  # .rename(columns={'vol': 'volume'}))
    env = DummyVecEnv([lambda: LStockDailyEnv(df)])

    # policy_kwargs = dict(net_arch=[64, 'lstm', dict(vf=[128, 128, 128], pi=[64, 64])])
    policy_kwargs = dict(
        net_arch=[128, 'lstm', dict(vf=[256, 256], pi=[256, 256])])

    model = A2C('MlpLstmPolicy', env, verbose=1, policy_kwargs=policy_kwargs)
    model.learn(total_timesteps=100000)
    model.save('ppo_stock')

    eval_env = DummyVecEnv([lambda: LStockDailyEnv(StockDataFrame(eval_df))])
    # episode_rewards, _  = evaluate_policy(model, eval_env, n_eval_episodes=1, render=True, return_episode_rewards=True) # EVAL_EPS

    # is_recurrent = model.policy.recurrent
    obs = eval_env.reset()

    net_worths = []
    actions = []
    done, state = False, None
    # while not done:
    for _ in range(NEXT_OBSERVATION_SIZE, eval_df.shape[0]):
        action, state = model.predict(obs, state=state, deterministic=True)
        obs, reward, done, _info = eval_env.step(action)
        net_worths.append(_info[0]['net_worth'])
        # if is_recurrent:
        #     obs[0, :] = new_obs
        # else:
        #     obs = new_obs

        # if action[0] < Actions.Buy: # Buy
        #     actions.append(1)
        # elif action[0] < Actions.Sell: # Sell
        #     actions.append(2)
        # else:
        #     actions.append(0)
        actions.append(action[0])
        eval_env.render()

    print(net_worths)
    plt.plot(net_worths)
    plt.show()
Example #17
def main(env, load_path, fig_path):

    # skip over 1-baxter-no-penalty (no log monitor.csv)
    if load_path == "1-baxter-no-penalty":
        plot = False
    else:
        plot = True

    # arguments
    print("env %s; load_path %s; fig_path %s;" % (env, load_path, fig_path))
    log_path = os.getcwd() + "/log/" + load_path
    os.makedirs(os.getcwd() + "/figs/", exist_ok=True)
    fig_path = os.getcwd() + "/figs/" + fig_path
    load_path = os.getcwd() + "/models/" + load_path

    # make environment, flattened environment, vectorized environment
    env = gym.make(env)
    env = gym.wrappers.FlattenDictWrapper(env, ['observation', 'achieved_goal', 'desired_goal'])
    env = DummyVecEnv([lambda: env])

    # load model
    model = PPO2.load(load_path, env=env)
    obs_initial = env.reset()
    obs = obs_initial

    # plot results
    if plot:
        plot_results(fig_path, log_path)

    # initializations
    niter = 10
    counter = 0
    timestep = 0
    results = [[[0,0,0] for i in range(100)], [[0,0,0,0] for i in range(100)]]
    current = [[[0,0,0] for i in range(100)], [[0,0,0,0] for i in range(100)]]
    print("==============================")

    # check initial positions and quaternions
    print("grip", env.envs[0].env.env.sim.data.get_site_xpos('grip'))
    print("box", env.envs[0].env.env.sim.data.get_site_xpos('box'))
    print("tool", env.envs[0].env.env.sim.data.get_site_xpos('tool'))
    print("mocap", env.envs[0].env.env.sim.data.mocap_pos)
    print("quat", env.envs[0].env.env.sim.data.mocap_quat)
    print("==============================")

    # mocap quaternion check
    for i in range(5):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        quat = env.envs[0].env.env.sim.data.mocap_quat
        print("obs", obs)
        print("quat", quat)
    print("==============================")

    # start rendering
    dists = []
    box_goal_pos = np.array([0.6, 0.05, -0.17])
    while True:
        if counter == niter:
            break
        action, _states = model.predict(obs)
        obs_old = obs
        obs, rewards, dones, info = env.step(action)
        quaternion = env.envs[0].env.env.sim.data.mocap_quat
        if obs.all() == obs_initial.all():
            if counter % 10 == 0:
                xyzs = current[0]
                quats = current[1]
                print(xyzs)
                print(quats)
                filename = log_path + "/" + "results_" + str(counter) + ".txt"
                os.makedirs(log_path + "/", exist_ok=True)
                file = open(filename, 'w+')
                for xyz, quat in zip(xyzs, quats):
                    for coord in xyz:
                        file.write(str(coord) + " ")
                    for quat_coord in quat:
                        file.write(str(quat_coord) + " ")
                    file.write("\n")
                file.close()

            box_end_pos = np.array(obs_old[0][3:6].tolist())
            print(box_end_pos)
            print(np.shape(box_end_pos))
            print(box_goal_pos)
            print(np.shape(box_goal_pos))
            dists.append(np.linalg.norm(box_goal_pos - box_end_pos))
            current = [[[0,0,0] for i in range(100)], [[0,0,0,0] for i in range(100)]]
            timestep = 0
            counter += 1
        print(timestep)
        print("obs", obs)
        print("quat", quaternion)

        # for average trajectory, smoothed
        for i in range(3):
            results[0][timestep][i] += obs[0][:3].tolist()[i]
        for j in range(4):
            results[1][timestep][j] += quaternion[0].tolist()[j]

        # for current trajectory
        for i in range(3):
            current[0][timestep][i] += obs[0][:3].tolist()[i]
        for j in range(4):
            current[1][timestep][j] += quaternion[0].tolist()[j]

        timestep += 1
        env.render()

    # smooth paths by taking average, and calculate mean distance to goal state
    for timestep in range(100):
        for i in range(3):
            results[0][timestep][i] /= niter
        for j in range(4):
            results[1][timestep][j] /= niter
    dist = np.mean(dists)

    # print and write to file
    xyzs = results[0]
    quats = results[1]
    filename = log_path + "/" + "results_avg.txt"
    os.makedirs(log_path + "/", exist_ok=True)
    file = open(filename, 'w+')
    for xyz, quat in zip(xyzs, quats):
        for coord in xyz:
            file.write(str(coord) + " ")
        for quat_coord in quat:
            file.write(str(quat_coord) + " ")
        file.write("\n")
    file.close()

    # print average distances
    print("average distance of box from end goal: %f" % dist)
    os.makedirs(model_folder)

policy = ''
model_tag = ''
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

env = DummyVecEnv([lambda: ActionMaskEnv(10, 10)])

model = PPO2(get_policy(policy), env, verbose=0, nminibatches=1, tensorboard_log=tensorboard_folder)
model.learn(total_timesteps=10000000, tb_log_name='PPO2' + model_tag)

model.save(model_folder + "PPO2" + model_tag)
del model
model = PPO2.load(model_folder + "PPO2" + model_tag)

done = False
states = None
action_masks = []
obs = env.reset()

while not done:
    action, states = model.predict(obs, states, action_mask=action_masks)
    obs, _, done, infos = env.step(action)
    env.render()
    action_masks.clear()
    for info in infos:
        env_action_mask = info.get('action_mask')
        action_masks.append(env_action_mask) 
Example #19
    def test(self,
             model_epoch: int = 0,
             render_env: bool = True,
             render_report: bool = True,
             save_report: bool = False):
        train_provider, test_provider = self.data_provider.split_data_train_test(
            self.train_split_percentage)

        del train_provider

        init_envs = DummyVecEnv(
            [make_env(test_provider) for _ in range(self.n_envs)])

        model_path = path.join('data', 'agents',
                               f'{self.study_name}__{model_epoch}.pkl')
        model = self.Model.load(model_path, env=init_envs)

        test_env = DummyVecEnv([make_env(test_provider) for _ in range(1)])

        self.logger.info(f'Testing model ({self.study_name}__{model_epoch})')

        zero_completed_obs = np.zeros((self.n_envs, ) +
                                      init_envs.observation_space.shape)
        zero_completed_obs[0, :] = test_env.reset()

        state = None
        rewards = []

        for _ in range(len(test_provider.data_frame)):
            action, state = model.predict(zero_completed_obs, state=state)
            obs, reward, done, info = test_env.step([action[0]])

            zero_completed_obs[0, :] = obs

            rewards.append(reward)

            if render_env:
                test_env.render(mode='human')

            if done:
                net_worths = pd.DataFrame({
                    'Date': info[0]['timestamps'],
                    'Balance': info[0]['net_worths'],
                })

                net_worths.set_index('Date', drop=True, inplace=True)
                returns = net_worths.pct_change()[1:]

                if render_report:
                    qs.plots.snapshot(returns.Balance,
                                      title='RL Trader Performance')

                if save_report:
                    reports_path = path.join(
                        'data', 'reports',
                        f'{self.study_name}__{model_epoch}.html')
                    qs.reports.html(returns.Balance, file=reports_path)

        self.logger.info(
            f'Finished testing model ({self.study_name}__{model_epoch}): ${"{:.2f}".format(np.sum(rewards))}'
        )
Example #20
def test_check_nan():
    """Test VecCheckNan Object"""

    env = DummyVecEnv([NanAndInfEnv])
    env = VecCheckNan(env, raise_exception=True)

    env.step([[0]])

    try:
        env.step([[float('NaN')]])
    except ValueError:
        pass
    else:
        assert False

    try:
        env.step([[float('inf')]])
    except ValueError:
        pass
    else:
        assert False

    try:
        env.step([[-1]])
    except ValueError:
        pass
    else:
        assert False

    try:
        env.step([[1]])
    except ValueError:
        pass
    else:
        assert False

    env.step(np.array([[0, 1], [0, 1]]))
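The NanAndInfEnv helper is not shown in this snippet; a plausible sketch of the kind of environment the test assumes, emitting NaN for negative actions and inf for positive ones (the real helper lives in the stable-baselines test suite and may differ):

import gym
import numpy as np
from gym import spaces


class NanAndInfEnv(gym.Env):
    """Toy env whose observation becomes NaN or inf depending on the action sign."""

    def __init__(self):
        super(NanAndInfEnv, self).__init__()
        self.action_space = spaces.Box(low=-np.inf, high=np.inf, shape=(1,), dtype=np.float64)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(1,), dtype=np.float64)

    def step(self, action):
        if np.all(np.array(action) > 0):
            obs = float('inf')   # should trigger VecCheckNan
        elif np.all(np.array(action) < 0):
            obs = float('nan')   # should trigger VecCheckNan
        else:
            obs = 0.0
        return np.array([obs]), 0.0, False, {}

    def reset(self):
        return np.array([0.0])

    def render(self, mode='human'):
        pass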
Example #21
    # n_cpu = 4
    # env = SubprocVecEnv([lambda: env for i in range(n_cpu)])
    #model = SAC.load('sac_hallway_new2')
    #model = SAC.load("sac_hallway_new")
    #model = DDPG.load("ddpg_hallway_depth_jerry")
    model = A2C.load("trpo_hallway_depth_1")
    obs = env.reset()

    
    
    rewards = []
    for i in range(1):
        print("Episode:" , i)
        obs = env.reset()
        total_reward = 0
        cumulated_tom_episode_reward = 0
        cumulated_jerry_episode_reward = 0
        done = False
        while not done:
            action, _states = model.predict(obs)
            #print('AAAAAAAAction:', action)
            obs,reward,done,info= env.step(action)
            total_reward+=reward
            rewards.append(total_reward)
        print("Episode_rewards:", total_reward)
    print(rewards)
    print('Episode ended. The total reward achieved in this test is  :: ',str(total_reward))
    # obs = env.reset()

    time.sleep(5)
    env.close()
def test_model_manipulation(request, model_class, storage_method,
                            store_format):
    """
    Test if the algorithm (with a given policy) can be loaded and saved without any issues, the environment switching
    works and that the action prediction works

    :param model_class: (BaseRLModel) A RL model
    :param storage_method: (str) Should file be saved to a file ("path") or to a buffer 
        ("file-like")
    :param store_format: (str) Save format, either "zip" or "cloudpickle".
    """

    # Use postfix ".model" so we can remove the file later
    model_fname = './test_model_{}.model'.format(request.node.name)
    store_as_cloudpickle = store_format == "cloudpickle"

    try:
        env = DummyVecEnv([lambda: IdentityEnv(10)])

        # create and train
        model = model_class(policy="MlpPolicy", env=env)
        model.learn(total_timesteps=50000)

        # predict and measure the acc reward
        acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            # Test action probability method
            model.action_probability(obs)
            obs, reward, _, _ = env.step(action)
            acc_reward += reward
        acc_reward = sum(acc_reward) / N_TRIALS

        # test action probability for given (obs, action) pair
        env = model.get_env()
        obs = env.reset()
        observations = np.array([obs for _ in range(10)])
        observations = np.squeeze(observations)
        actions = np.array([env.action_space.sample() for _ in range(10)])
        actions_probas = model.action_probability(observations,
                                                  actions=actions)
        assert actions_probas.shape == (len(actions), 1), actions_probas.shape
        assert actions_probas.min() >= 0, actions_probas.min()
        assert actions_probas.max() <= 1, actions_probas.max()

        # saving
        if storage_method == "path":  # saving to a path
            model.save(model_fname, cloudpickle=store_as_cloudpickle)
        else:  # saving to a file-like object (BytesIO in this case)
            b_io = BytesIO()
            model.save(b_io, cloudpickle=store_as_cloudpickle)
            model_bytes = b_io.getvalue()
            b_io.close()

        del model, env

        # loading
        if storage_method == "path":  # loading from path
            model = model_class.load(model_fname)
        else:
            b_io = BytesIO(
                model_bytes
            )  # loading from file-like object (BytesIO in this case)
            model = model_class.load(b_io)
            b_io.close()

        # changing environment (note: this can be done at loading)
        env = DummyVecEnv([lambda: IdentityEnv(10)])
        model.set_env(env)

        # predict the same output before saving
        loaded_acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            loaded_acc_reward += reward
        loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS
        assert abs(acc_reward - loaded_acc_reward) < 0.1, "Error: the prediction seems to have changed between " \
                                                          "loading and saving"

        # learn post loading
        model.learn(total_timesteps=100)

        # validate no reset post learning
        loaded_acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            loaded_acc_reward += reward
        loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS
        assert abs(acc_reward - loaded_acc_reward) < 0.1, "Error: the prediction seems to have changed between " \
                                                          "pre learning and post learning"

        # predict new values
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, _, _, _ = env.step(action)

        del model, env

    finally:
        if os.path.exists(model_fname):
            os.remove(model_fname)
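The request, model_class, storage_method and store_format arguments suggest the test is parametrized; a hedged sketch of the pytest decorators such a signature implies (the model list is illustrative, not the original suite's):

import pytest
from stable_baselines import A2C, ACKTR, PPO2

@pytest.mark.parametrize("model_class", [A2C, ACKTR, PPO2])
@pytest.mark.parametrize("storage_method", ["path", "file-like"])
@pytest.mark.parametrize("store_format", ["zip", "cloudpickle"])
def test_model_manipulation(request, model_class, storage_method, store_format):
    ...  # body as in the function above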
def test_model_manipulation(model_class):
    """
    Test if the algorithm can be loaded and saved without any issues, the environment switching
    works and that the action prediction works

    :param model_class: (BaseRLModel) A model
    """
    try:
        env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

        # create and train
        model = model_class(policy="MlpPolicy", env=env)
        model.learn(total_timesteps=NUM_TIMESTEPS, seed=0)

        # predict and measure the acc reward
        acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            acc_reward += reward
        acc_reward = sum(acc_reward) / N_TRIALS

        # saving
        model.save("./test_model")

        del model, env

        # loading
        model = model_class.load("./test_model")

        # changing environment (note: this can be done at loading)
        env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])
        model.set_env(env)

        # predict the same output before saving
        loaded_acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            loaded_acc_reward += reward
        loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS

        with pytest.warns(None) as record:
            act_prob = model.action_probability(obs)

        if model_class in [DDPG, SAC]:
            # check that only one warning was raised
            assert len(record) == 1, "No warning was raised for {}".format(
                model_class)
            assert act_prob is None, "Error: action_probability should be None for {}".format(
                model_class)
        else:
            assert act_prob[0].shape == (1, 1) and act_prob[1].shape == (1, 1), \
                "Error: action_probability not returning correct shape"

        # test action probability for given (obs, action) pair
        # must return zero and raise a warning or raise an exception if not defined
        env = model.get_env()
        obs = env.reset()
        observations = np.array([obs for _ in range(10)])
        observations = np.squeeze(observations)
        observations = observations.reshape((-1, 1))
        actions = np.array([env.action_space.sample() for _ in range(10)])

        if model_class == DDPG:
            with pytest.raises(ValueError):
                model.action_probability(observations, actions=actions)
        else:
            with pytest.warns(UserWarning):
                actions_probas = model.action_probability(observations,
                                                          actions=actions)
            assert actions_probas.shape == (len(actions),
                                            1), actions_probas.shape
            assert np.all(actions_probas == 0.0), actions_probas

        # assert <15% diff
        assert abs(acc_reward - loaded_acc_reward) / max(acc_reward, loaded_acc_reward) < 0.15, \
            "Error: the prediction seems to have changed between loading and saving"

        # learn post loading
        model.learn(total_timesteps=100, seed=0)

        # validate no reset post learning
        # This test was failing from time to time for no good reason
        # other than bad luck
        # We should change this test
        # loaded_acc_reward = 0
        # set_global_seeds(0)
        # obs = env.reset()
        # for _ in range(N_TRIALS):
        #     action, _ = model.predict(obs)
        #     obs, reward, _, _ = env.step(action)
        #     loaded_acc_reward += reward
        # loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS
        # # assert <10% diff
        # assert abs(acc_reward - loaded_acc_reward) / max(acc_reward, loaded_acc_reward) < 0.1, \
        #     "Error: the prediction seems to have changed between pre learning and post learning"

        # predict new values
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, _, _, _ = env.step(action)

        # Free memory
        del model, env

    finally:
        if os.path.exists("./test_model"):
            os.remove("./test_model")
Example #24
def test_model_manipulation(model_class, storage_method):
    """
    Test if the algorithm (with a given policy) can be loaded and saved without any issues, the environment switching
    works and that the action prediction works

    :param model_class: (BaseRLModel) A RL model
    """

    try:
        env = DummyVecEnv([lambda: IdentityEnv(10)])

        # create and train
        model = model_class(policy="MlpPolicy", env=env)
        model.learn(total_timesteps=50000, seed=0)

        # predict and measure the acc reward
        acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            # Test action probability method
            model.action_probability(obs)
            obs, reward, _, _ = env.step(action)
            acc_reward += reward
        acc_reward = sum(acc_reward) / N_TRIALS

        # saving
        if storage_method == "path":  # saving to a path
            model.save("./test_model")
        else:  # saving to a file-like object (BytesIO in this case)
            b_io = BytesIO()
            model.save(b_io)
            model_bytes = b_io.getvalue()
            b_io.close()

        del model, env

        # loading
        if storage_method == "path":  # loading from path
            model = model_class.load("./test_model")
        else:
            b_io = BytesIO(
                model_bytes
            )  # loading from file-like object (BytesIO in this case)
            model = model_class.load(b_io)
            b_io.close()

        # changing environment (note: this can be done at loading)
        env = DummyVecEnv([lambda: IdentityEnv(10)])
        model.set_env(env)

        # predict the same output before saving
        loaded_acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            loaded_acc_reward += reward
        loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS
        assert abs(acc_reward - loaded_acc_reward) < 0.1, "Error: the prediction seems to have changed between " \
                                                          "loading and saving"

        # learn post loading
        model.learn(total_timesteps=100, seed=0)

        # validate no reset post learning
        loaded_acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            loaded_acc_reward += reward
        loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS
        assert abs(acc_reward - loaded_acc_reward) < 0.1, "Error: the prediction seems to have changed between " \
                                                          "pre learning and post learning"

        # predict new values
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, _, _, _ = env.step(action)

        del model, env

    finally:
        if os.path.exists("./test_model"):
            os.remove("./test_model")
Example #25
                     tensorboard_log="./" + SAVE_DIR[-3:] + '_' + str(tstep) +
                     "_tensorboard/")
        model.learn(total_timesteps=tstep, log_interval=128)
        # model.learn(total_timesteps=tstep)
        model_name = common_fileName_prefix + str(tstep) + '-' + str(
            modelNo) + "-model.model"
        model.save(path.join(SAVE_DIR, model_name), cloudpickle=True)

        obs = testEnv.reset()

        # Test for consecutive 2000 days
        for testNo in range(365 * 5):
            action, _states = model.predict(obs)
            if np.isnan(action).any():
                print(testNo)
            obs, rewards, done, info = testEnv.step(action)
            if done:
                print("Done")
                break
            profit_list.append(info[0]['profit'])
            act_profit_list.append(info[0]['actual_profit'])
            singleDay_record = testEnv.render(mode="detail")
            singleDay_record['testNo'] = testNo
            singleDay_record['rewards'] = rewards[0]
            detail_list.append(singleDay_record)

            if testNo % 365 == 0:
                print("\n============= TESTING " + str(testNo) +
                      " =============\n")
                testEnv.render()
Example #26
# calmar_obs = calmar_env.reset()
omega_obs = omega_env.reset()

profit_net_worths = [10000]
sortino_net_worths = [10000]
# calmar_net_worths = [10000]
omega_net_worths = [10000]

done = False
while not done:
    profit_action, profit_states = profit_model.predict(profit_obs)
    sortino_action, sortino_states = sortino_model.predict(sortino_obs)
    # calmar_action, calmar_states = calmar_model.predict(calmar_obs)
    omega_action, omega_states = omega_model.predict(omega_obs)

    profit_obs, profit_reward, done, info = profit_env.step(profit_action)
    sortino_obs, sortino_reward, done, info = sortino_env.step(sortino_action)
    # calmar_obs, calmar_reward, done, info = calmar_env.step(calmar_action)
    omega_obs, omega_reward, done, info = omega_env.step(omega_action)

    profit_net_worths.append(profit_net_worths[-1] + profit_reward[0])
    sortino_net_worths.append(sortino_net_worths[-1] + sortino_reward[0])
    # calmar_net_worths.append(calmar_net_worths[-1] + calmar_reward[0])
    omega_net_worths.append(omega_net_worths[-1] + omega_reward[0])

with open('./research/results/profit_net_worths_4.pkl', 'wb') as handle:
    pickle.dump(profit_net_worths, handle)

with open('./research/results/sortino_net_worths_4.pkl', 'wb') as handle:
    pickle.dump(sortino_net_worths, handle)
class StableBaselinesTradingStrategy(TradingStrategy):
    """A trading strategy capable of self tuning, training, and evaluating with stable-baselines.

    Arguments:
        environments: An instance of a trading environments for the agent to trade within.
        model: The RL model to create the agent with.
            Defaults to DQN.
        policy: The RL policy to train the agent's model with.
            Defaults to 'MlpPolicy'.
        model_kwargs: Any additional keyword arguments to adjust the model.
        kwargs: Optional keyword arguments to adjust the strategy.
    """
    def __init__(self,
                 environment: TradingEnvironment,
                 model: BaseRLModel = DQN,
                 policy: Union[str, BasePolicy] = 'MlpPolicy',
                 model_kwargs: any = {},
                 **kwargs):
        self._model = model
        self._model_kwargs = model_kwargs

        self.environment = environment
        self._agent = self._model(policy, self._environment,
                                  **self._model_kwargs)

    @property
    def environment(self) -> 'TradingEnvironment':
        """A `TradingEnvironment` instance for the agent to trade within."""
        return self._environment

    @environment.setter
    def environment(self, environment: 'TradingEnvironment'):
        self._environment = DummyVecEnv([lambda: environment])

    def restore_agent(self, path: str):
        """Deserialize the strategy's learning agent from a file.

        Arguments:
            path: The `str` path of the file the agent specification is stored in.
        """
        self._agent = self._model.load(path, env=self._environment,
                                       **self._model_kwargs)

    def save_agent(self, path: str):
        """Serialize the learning agent to a file for restoring later.

        Arguments:
            path: The `str` path of the file to store the agent specification in.
        """
        self._agent.save(path)

    def tune(self,
             steps: int = None,
             episodes: int = None,
             callback: Callable[[pd.DataFrame], bool] = None) -> pd.DataFrame:
        raise NotImplementedError

    def run(
        self,
        steps: int = None,
        episodes: int = None,
        episode_callback: Callable[[pd.DataFrame],
                                   bool] = None) -> pd.DataFrame:
        if steps is None and episodes is None:
            raise ValueError(
                'You must set the number of `steps` or `episodes` to run the strategy.'
            )

        steps_completed = 0
        episodes_completed = 0
        average_reward = 0

        obs, state, dones = self._environment.reset(), None, [False]

        performance = {}

        while (steps is not None and
               (steps == 0 or steps_completed < steps)) or (
                   episodes is not None and episodes_completed < episodes):
            actions, state = self._agent.predict(obs, state=state, mask=dones)
            obs, rewards, dones, info = self._environment.step(actions)

            steps_completed += 1
            # incremental running mean of the per-step reward
            average_reward += (rewards[0] - average_reward) / steps_completed

            exchange_performance = info[0].get('exchange').performance
            performance = exchange_performance if len(
                exchange_performance) > 0 else performance

            if dones[0]:
                if episode_callback is not None and episode_callback(
                        self._environment._exchange.performance):
                    break

                episodes_completed += 1
                obs = self._environment.reset()

        print("Finished running strategy.")
        print("Total episodes: {} ({} timesteps).".format(
            episodes_completed, steps_completed))
        print("Average reward: {}.".format(average_reward))

        return performance
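
A minimal usage sketch for the class above; environment stands in for any TradingEnvironment instance, and the PPO2/'MlpPolicy' choice and save path are illustrative rather than prescribed by the class:

from stable_baselines import PPO2

strategy = StableBaselinesTradingStrategy(environment=environment,
                                          model=PPO2,
                                          policy='MlpPolicy',
                                          model_kwargs={'verbose': 1})
performance = strategy.run(steps=10000)              # or episodes=...
strategy.save_agent('./agents/ppo2_trading_agent')   # hypothetical path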
Example #28
    # Observed Player board
    observation = env.reset()
    # Init new Result
    result = Result()
    done = False
    # Amount of moves used to finish the game
    rounds = 0
    while not done:
        rounds += 1
        # Get a random action from the action space
        if randomAgent:
            action = random.choice(env.envs[0].env.available_actions)
        else:
            # Otherwise let the trained agent choose the action
            action, _states = model.predict(observation)
        # Agent performs a step
        nextObservation, reward, done, info = env.step(action)
        # Renders the Game state with radar board
        if choiceRender:
            env.render()
        score += reward
        # Add step to result Object
        result.append_history(rounds, action, nextObservation, reward, done, info)
        observation = nextObservation
        # Game is done
        if done:
            print("End of game: Rounds", rounds, "Score", score)
            # Store amount of rounds in result object
            result.set_rounds(rounds)
            # Add current result object to all results
            results.append(result)
print('Finished')
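
A short summary sketch for the loop above, assuming the custom Result class exposes the round count stored via set_rounds() as a rounds attribute (an assumption about its interface):

# The `rounds` attribute name is assumed; adjust to the actual Result interface.
rounds_per_game = [result.rounds for result in results]
print("Games played:", len(results))
if rounds_per_game:
    print("Average rounds per game:", sum(rounds_per_game) / len(rounds_per_game))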
Example #29
import gym
from gym import spaces
import numpy as np

from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv

# n_cpu = 4
# total_timesteps = 200000000
# # total_timesteps = 200000
# env = SubprocVecEnv([lambda: gym.make('WalkingSpider-v0') for i in range(n_cpu)])
# model = PPO2(MlpPolicy, env, verbose=1)
# model.learn(total_timesteps=total_timesteps)
# model.save("experience_learned/ppo2_WalkingSpider_v0_testing")
# del model # remove to demonstrate saving and loading

# Enjoy the trained agent
model = PPO2.load("experience_learned/ppo2_WalkingSpider_v0_testing_3")
print("Enjoy trained agent")
env = DummyVecEnv([lambda: gym.make('WalkingSpider-v0')])
obs = env.reset()
while True:  # runs until interrupted; DummyVecEnv resets the env automatically when an episode ends
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()

# Random Environment
# env = gym.make('WalkingSpider-v0')
# env.reset()
# for _ in range(1000):
#     env.render()
#     observation, reward, done, info = env.step(env.action_space.sample()) # take a random action

# print("Obs Shape ", observation, " Action Shape ", env.action_space.sample().shape)
Example #30
def test_model_manipulation(model_class):
    """
    Test that the algorithm can be saved and loaded without issues, that environment
    switching works, and that action prediction still works afterwards.

    :param model_class: (BaseRLModel) The model class to test
    """
    try:
        env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

        # create and train
        model = model_class(policy="MlpPolicy", env=env)
        model.learn(total_timesteps=NUM_TIMESTEPS, seed=0)

        # predict and measure the accumulated reward
        acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            acc_reward += reward
        acc_reward = sum(acc_reward) / N_TRIALS

        # saving
        model.save("./test_model")

        del model, env

        # loading
        model = model_class.load("./test_model")

        # changing environment (note: this can be done at loading)
        env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])
        model.set_env(env)

        # check that the predictions match the ones made before saving
        loaded_acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            loaded_acc_reward += reward
        loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS
        # assert <10% diff
        assert abs(acc_reward - loaded_acc_reward) / max(acc_reward, loaded_acc_reward) < 0.1, \
            "Error: the prediction seems to have changed between loading and saving"

        # learn post loading
        model.learn(total_timesteps=100, seed=0)

        # validate no reset post learning
        loaded_acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            loaded_acc_reward += reward
        loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS
        # assert <10% diff
        assert abs(acc_reward - loaded_acc_reward) / max(acc_reward, loaded_acc_reward) < 0.1, \
            "Error: the prediction seems to have changed between pre learning and post learning"

        # predict new values
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, _, _, _ = env.step(action)

        # Free memory
        del model, env

    finally:
        if os.path.exists("./test_model"):
            os.remove("./test_model")
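
A sketch of how this helper might be driven, assuming it lives in a pytest module and that the listed continuous-action algorithms are available in the installed stable-baselines version:

import pytest
from stable_baselines import A2C, PPO2, SAC

@pytest.mark.parametrize("model_class", [A2C, PPO2, SAC])
def test_box_action_models(model_class):
    # IdentityEnvBox has a continuous action space, so only
    # continuous-capable algorithms are exercised here.
    test_model_manipulation(model_class)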