def main(model_name, algo, testRange, isTargetPositionFixed, isDiscrete):
    panda_env = PandaGraspGymEnv(urdfRoot=object_data.getDataPath(),
                                 isRendering=True,
                                 useIK=True,
                                 isDiscrete=isDiscrete,
                                 numControlledJoints=7,
                                 isTargetPositionFixed=isTargetPositionFixed)

    env = DummyVecEnv([lambda: panda_env])

    if algo == "DDPG":
        model = DDPG.load(model_name)
    else:
        model = DQN.load(model_name)

    obs = env.reset()
    images = []
    img = env.get_images()

    for i in range(testRange):
        images.append(img)
        action, _states = model.predict(obs, deterministic=True)
        print("Step: {} Action: {}".format(i, action))
        obs, rewards, done, info = env.step(action)
        env.render(mode='human')
        img = env.get_images()

    os.makedirs(gif_dir, exist_ok=True)
    imageio.mimsave(gif_dir + model_name + '.gif',
                    [np.array(img[0]) for i, img in enumerate(images) if i % 2 == 0],
                    fps=29)
def test2():
    import gym
    import datetime as dt
    import matplotlib.pyplot as plt
    from stable_baselines.common.policies import MlpPolicy, CnnPolicy, MlpLstmPolicy, ActorCriticPolicy, LstmPolicy
    from stable_baselines.common.vec_env import DummyVecEnv
    from stable_baselines.common.evaluation import evaluate_policy
    from stable_baselines import PPO2, PPO1, A2C, DQN, TD3, SAC
    import pandas as pd
    from lutils.stock import LTdxHq

    ltdxhq = LTdxHq()
    df = ltdxhq.get_k_data_1min('603636')  # 000032 300142 603636
    # df = ltdxhq.get_k_data_5min('603636')
    # df = ltdxhq.get_k_data_daily('603636')
    ltdxhq.close()

    df = StockDataFrame(df)  # .rename(columns={'vol': 'volume'}))
    # df = df.rename(columns={'open': 'Open', 'close': 'Close', 'high': 'High', 'low': 'Low', 'vol': 'Volume'})
    df.index = pd.to_datetime(df.index)
    df1 = df[:-240]
    df2 = df[-240:]

    # The algorithms require a vectorized environment to run
    env = DummyVecEnv([lambda: LStockDailyEnv(df1)])
    eval_env = DummyVecEnv([lambda: LStockDailyEnv(df2)])

    # policy_kwargs = dict(net_arch=[64, 'lstm', dict(vf=[128, 128, 128], pi=[64, 64])])
    policy_kwargs = dict(net_arch=[128, 'lstm', dict(vf=[256, 256], pi=[256, 256])])
    model = A2C('MlpLstmPolicy', env, verbose=1, policy_kwargs=policy_kwargs)
    model.learn(total_timesteps=20000)

    # episode_rewards, _ = evaluate_policy(model, eval_env, n_eval_episodes=1, render=True, return_episode_rewards=True)  # EVAL_EPS
    # print(mean_reward)

    is_recurrent = model.policy.recurrent
    obs = eval_env.reset()
    # if is_recurrent:
    #     zero_completed_obs = np.zeros((model.n_envs,) + model.observation_space.shape)
    #     zero_completed_obs[0, :] = obs
    #     obs = zero_completed_obs

    net_worths = []
    done, state = False, None
    while not done:
        action, state = model.predict(obs, state=state, deterministic=True)
        obs, reward, done, _info = eval_env.step(action)
        net_worths.append(_info[0]['net_worth'])
        # if is_recurrent:
        #     obs[0, :] = new_obs
        # else:
        #     obs = new_obs
        eval_env.render()

    plt.plot(net_worths)
    plt.show()
if discreteActionsSpace:
    # this needs fixing
    sampledAction, logProbSampledAction, logProbsAll = policy.getSampledActions(obs[l])
    additionalInfos = [logProbsAll]
else:
    sampledAction, logProbSampledAction, actionsMean, actionLogStd = sess.run(
        [actionFinalOp, sampledLogProbsOp, actionMeanOp, actionLogStdOp],
        feed_dict={obsPh: np.expand_dims(obs[l], 0)})
    additionalInfos[0][l] = actionsMean
    additionalInfos[1][l] = actionLogStd

nextObss, rews, nextDones, infoss = env.step(sampledAction)
nextObs, rewards[l], nextDone, infos = nextObss[0], rews[0], nextDones[0], infoss[0]
sampledLogProb[l] = logProbSampledAction[0]

if dones[l]:
    summaryRet, summaryLen = sess.run(
        [epTotalRewSum, epLenSum],
        feed_dict={epTotalRewPh: epTotalRew, epLenPh: epLen})
    globalStep = e * args.epoch_len + l
    writer.add_summary(summaryRet, globalStep)
    writer.add_summary(summaryLen, globalStep)
    epTotalTrainRews.append(epTotalRew)
import gym
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2

# Create the environment
env = gym.make('CartPole-v1')
env = DummyVecEnv([lambda: env])

# Create the model
model = PPO2(MlpPolicy, env, verbose=1)

# Train the model
model.learn(total_timesteps=10000)

# Save the model
model.save('sample')

# Delete the model
del model

# Load the model
model = PPO2.load('sample')

# Test the model
state = env.reset()
for i in range(200):
    env.render()
    action, _ = model.predict(state)
    state, rewards, done, info = env.step(action)
    if done:
        break
    'n_steps': int(params['n_steps']),
    'gamma': params['gamma'],
    'learning_rate': params['learning_rate'],
    'ent_coef': params['ent_coef'],
    'cliprange': params['cliprange'],
    'noptepochs': int(params['noptepochs']),
    'lam': params['lam'],
}

if curr_idx == -1:
    model = PPO2(MlpLnLstmPolicy,
                 train_env,
                 verbose=0,
                 nminibatches=1,
                 tensorboard_log=Path("./tensorboard").name,
                 **model_params)
else:
    model = PPO2.load('./agents/ppo2_' + reward_strategy + '_' + str(curr_idx) + '.pkl',
                      env=train_env)

for idx in range(curr_idx + 1, 10):
    print('[', idx, '] Training for: ', train_len, ' time steps')

    model.learn(total_timesteps=train_len)

    obs = test_env.reset()
    done, reward_sum = False, 0

    while not done:
        action, _states = model.predict(obs)
        obs, reward, done, info = test_env.step(action)
        reward_sum += reward

    print('[', idx, '] Total reward: ', reward_sum, ' (' + reward_strategy + ')')

    model.save('./agents/ppo2_' + reward_strategy + '_' + str(idx) + '.pkl')
def stock_trade_US(stock_file_train, no_of_test_trading_days):
    df_train = pd.read_csv(stock_file_train)
    # df_train = df_train.sort_values('date')

    # The algorithms require a vectorized environment to run
    env_train = DummyVecEnv([lambda: StockTradingEnv_US(df_train)])

    total_timesteps = int(4e4)
    # total_timesteps = int(1e5)
    model = PPO2('MlpPolicy', env_train, verbose=0, tensorboard_log='./log',
                 seed=12345).learn(total_timesteps=total_timesteps)

    # Random Agent, after training
    # mean_reward, std_reward = evaluate_policy(model, env_train, n_eval_episodes=100)
    # print(f"after training, mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

    # ----------------- Test Model --------------------------------------
    import sys
    sys.stdout = open(
        f'./output/output_SPY_{total_timesteps}_days_{no_of_test_trading_days}.txt', 'wt')

    day_profits = []
    buy_hold_profit = []
    df_test_raw = pd.read_csv(stock_file_train.replace('train', 'test'))

    # start from random day
    # df_test = df_test_raw.iloc[200:].reset_index(drop=True)
    df_test = df_test_raw
    df_test = df_test.drop(['Adj Close'], axis=1)
    env_test = DummyVecEnv([lambda: StockTradingEnv_US(df_test)])
    obs = env_test.reset()

    no_of_shares = 0
    buy_hold_commission = 0
    for n in range(len(df_test) - 1):
        if n > no_of_test_trading_days:
            break
        action, _states = model.predict(obs)
        # let agent start with a buy all
        # if n == 0:
        #     action[0][0] = 0
        #     action[0][1] = 1
        obs, rewards, done, info = env_test.step(action)
        profit = env_test.render()
        day_profits.append(profit)

        if n == 0:
            buy_hold_profit.append(0)
            no_of_shares = INITIAL_ACCOUNT_BALANCE // df_test.iloc[0]['Close']
            buy_hold_commission = no_of_shares * df_test.iloc[0]['Close'] * 0.001
            print('Buy ' + str(no_of_shares) + ' shares and hold')
        else:
            buy_hold_profit_per_step = no_of_shares * (
                df_test.iloc[n]['Close'] - df_test.iloc[0]['Close']) - buy_hold_commission
            buy_hold_profit.append(buy_hold_profit_per_step)
            print('Buy and Hold: ' + '*' * 40)
            print('No of shares: ' + str(no_of_shares) +
                  ' average cost per share ' + str(df_test.iloc[0]['Close']))
            print('profit is ' + str(buy_hold_profit_per_step))

        if done:
            break

    good_model = False
    if day_profits[-1] > buy_hold_profit[-1]:
        good_model = True

    return day_profits, buy_hold_profit, good_model, model, total_timesteps
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy, CnnLnLstmPolicy, CnnPolicy, CnnLstmPolicy
from stable_baselines.common.vec_env import SubprocVecEnv, DummyVecEnv
from stable_baselines import PPO2, A2C
from sonic_util import make_env
from gym.wrappers import Monitor

env = DummyVecEnv([lambda: make_env(level_name='LabyrinthZone.Act1',
                                    stack=False, scale_rew=True)])

modelname = 'sonicppo'
# load is a classmethod: assign the loaded checkpoint, otherwise the loaded weights are discarded
model = PPO2.load("./checkpoint" + modelname, env=env)

obs = env.reset()
done = False
reward = 0

while not done:
    actions, _ = model.predict(obs)
    obs, rew, done, info = env.step(actions)
    reward += rew
    env.render()

env.close()
def optimize_params(self, trial, n_prune_evals_per_trial: int = 2, n_tests_per_eval: int = 1):
    train_provider, test_provider = self.data_provider.split_data_train_test(
        self.train_split_percentage)
    train_provider, validation_provider = train_provider.split_data_train_test(
        self.train_split_percentage)

    del test_provider

    train_env = DummyVecEnv([lambda: TradingEnv(train_provider)])
    validation_env = DummyVecEnv([lambda: TradingEnv(validation_provider)])

    model_params = self.optimize_agent_params(trial)
    model = self.Model(self.Policy,
                       train_env,
                       verbose=self.model_verbose,
                       nminibatches=1,
                       tensorboard_log=self.tensorboard_path,
                       **model_params)

    last_reward = -np.finfo(np.float16).max
    n_steps_per_eval = int(len(train_provider.data_frame) / n_prune_evals_per_trial)

    for eval_idx in range(n_prune_evals_per_trial):
        try:
            model.learn(n_steps_per_eval)
        except AssertionError:
            raise

        rewards = []
        n_episodes, reward_sum = 0, 0.0

        trades = train_env.get_attr('trades')

        if len(trades[0]) < 1:
            self.logger.info(f'Pruning trial for not making any trades: {eval_idx}')
            raise optuna.structs.TrialPruned()

        state = None
        obs = validation_env.reset()
        while n_episodes < n_tests_per_eval:
            action, state = model.predict(obs, state=state)
            obs, reward, done, _ = validation_env.step([action])

            reward_sum += reward[0]

            if all(done):
                rewards.append(reward_sum)
                reward_sum = 0.0
                n_episodes += 1
                obs = validation_env.reset()

        last_reward = np.mean(rewards)
        trial.report(-1 * last_reward, eval_idx)

        if trial.should_prune(eval_idx):
            raise optuna.structs.TrialPruned()

    return -1 * last_reward
def write_to_scheduler(self, action):
    print(action)

def save_model(self):
    model.save("ppo2_model")

# model = PPO2(policy="MlpPolicy", tensorboard_log="./ppo2_tensorborad/", env=env, learning_rate=0.00025, lam=0.8, n_steps=30, nminibatches=1)
# model = DQN(policy="MlpPolicy", tensorboard_log="./dqn_tensorborad2/", batch_size=1, gamma=0.1, exploration_fraction=0.1, env=env)
# model.learn(total_timesteps=int(1e+4), seed=0)

env = DummyVecEnv([lambda: EnviromentExample()])
model = PPO2.load("ppo2_model.pkl")

obs = env.reset()
while True:
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)
    if done:
        break

del model, env
def run_test(self, model, validation=True, finetune=False, out_file=False, verbose=True):
    """
    Validates a trained model.

    Args:
        model (stable_baselines model): Model to be tested.
        validation (bool): Whether or not the model is to be validated on validation data. Defaults to True.
        finetune (bool): Whether or not the model is to be tested on an external dataset. Defaults to False.
        out_file (bool): Whether or not to write model stats to an output file. Defaults to False.
        verbose (bool): Whether or not to print model stats. Defaults to True.
    Returns:
        Mean of the total reward of the model.
    """
    f = None
    if out_file:
        f = open(f"stories/{self.timestamp}-{self.mode}-BTC.csv", "w+")
        f.write()

    env = None
    if not finetune:
        if validation:
            env = DummyVecEnv([
                lambda: SimulatedEnv(self.val_ds, self.initial_invest, self.mode)
            ])
        else:
            env = DummyVecEnv([
                lambda: SimulatedEnv(self.test_ds, self.initial_invest, self.mode)
            ])
    else:
        data = historical_yahoo("NKE")
        self.logger.debug('Downloaded Data from yahoo finance.')
        _, test_data = train_val_test_split_finetune(data)
        env = DummyVecEnv([
            lambda: SimulatedEnv(test_data, self.initial_invest, self.mode)
        ])
        self.logger.debug('Downloaded Data from yahoo finance.')

    total_reward = []

    for e in range(self.test_episodes):
        # Reset the environment at every episode.
        state = env.reset()
        # Initialize variable to get reward stats.
        for _ in range(0, 180):
            action, _states = model.predict(state)
            next_state, reward, done, info = env.step(action)

            if out_file:
                self.write_to_story(f, action[0], info[0])

            total_reward.append(reward)
            state = next_state

            if done:
                if info[0]['cur_val'] < self.initial_invest:
                    self.losses = self.losses + 1
                if verbose:
                    self.print_stats(e, info)
                if out_file:
                    f.write("-1,-1,-1,-1,-1,-1\n")
                break

    self.losses = 0
    return np.mean(total_reward)
print('Training Started...')
print('-' * 100)

state = obs
for frame_idx in range(1, num_frames + 1):
    epsilon = epsilon_by_frame(frame_idx)
    state = torch.FloatTensor(state).to(device)

    act, w_act = policy_net_act.act(state, epsilon)
    forecast.append(act)

    state_order = torch.cat([state, torch.unsqueeze(act, 0)], 1).to(device)
    order, dec, w_ord = policy_net_order.act(state_order, epsilon)
    action_list.append(dec)

    action = np.array([dec, order])
    action = np.expand_dims(action, axis=1)
    action = action.T

    next_state, reward, done, _ = env.step(action)
    bal, s_held, s_sold, cost, sales, net, prof = env.render()
    replay_buffer.push(state, state_order, act, dec, reward, next_state, done)

    frame_idx += 1
    state = next_state

    if len(replay_buffer) > replay_initial:
        ord_l, act_l, TD_Loss = compute_td_loss(batch_size)

    if (frame_idx % 1000) == 0:
        weights_act.append(w_act)
        weights_ord.append(w_ord)
        print('Step-', str(frame_idx), '/', str(num_frames),
              '| Profit-', prof, '| Model Loss-', ord_l)
        # save PolicyNet checkpoints
        torch.save({'model_state_dict': policy_net_act.state_dict(),
                    'optimizer_state_dict': optimizer_order.state_dict(),
                    'loss': TD_Loss}, checkpoint_name + '/policy_net_act.pth.tar')
        torch.save({'model_state_dict': policy_net_order.state_dict(),
                    'optimizer_state_dict': optimizer_order.state_dict(),
                    'loss': TD_Loss}, checkpoint_name + '/policy_net_order.pth.tar')
def main():
    args = get_args()
    choose_device(args.device)
    set_global_seeds(args.seed)

    env_id = args.env
    exp_id = args.exp_id
    algo = args.algo
    env_name = env_id[:-3]
    env_index = env_list.index(env_id)

    # Pass CustomEnv arguments: follow this for your CustomEnv if reward not known prior to training
    env_kwargs = {} if args.env_kwargs is None else args.env_kwargs
    if (args.env_kwargs is not None) and (env_id in ['AirSim-v0']):
        if 'rew_land' in env_kwargs:
            if int(env_kwargs['rew_land']) in [500, 1000, 10000]:
                env_success[-1] = int(env_kwargs['rew_land'])
            else:
                raise ValueError('Given env reward not acceptable. Please try again')

    params = [exp_id, env_name.lower()]
    folder = [exp_id, env_name.lower(), args.algo.lower()]
    tensorboard_path, monitor_path, callback_path = None, None, None
    if args.tensorboard:
        tensorboard_path = "tensorboard/{}_{}".format(*params)
        make_dir(tensorboard_path)

    # if args.train_RL:  # Begin training here (location of this condition also decides experiment performance)
    # Load hyperparameters from yaml file
    with open('hyperparams/{}.yml'.format(args.algo), 'r') as f:
        hyperparams_dict = yaml.safe_load(f)
        if env_id in list(hyperparams_dict.keys()):
            hyperparams = hyperparams_dict[env_id]
        else:
            raise ValueError("Hyperparameters not found for {}-{}".format(args.algo, env_id))

    if args.hyperparams is not None:
        # Overwrite hyperparams if needed
        hyperparams.update(args.hyperparams)

    # OPTIONAL: Print saved hyperparams
    saved_hyperparams = OrderedDict([(key, hyperparams[key]) for key in sorted(hyperparams.keys())])
    if args.verbose > 0:
        pprint(saved_hyperparams)

    if args.n_envs > 1:
        # if args.verbose:
        print("Overwriting n_envs with n={}".format(args.n_envs))
        n_envs = args.n_envs
    else:
        n_envs = hyperparams.get('n_envs', 1)

    # choose Monitor log path according to multiprocessing setting
    if args.monitor:
        if n_envs == 1:
            monitor_path = 'logs/single/{}_{}_{}'.format(*folder)
        else:
            if algo not in ['dqn', 'her', 'sac', 'td3']:
                monitor_path = 'logs/multi/{}_{}_{}'.format(*folder)
        make_dir(monitor_path)

    if int(float(args.timesteps_RL)) > 0:
        # if args.verbose:
        print("Overwriting n_timesteps with n={}".format(int(float(args.timesteps_RL))))
        n_timesteps = int(float(args.timesteps_RL))
    else:
        n_timesteps = int(hyperparams['n_timesteps'])

    # Convert to python object if needed
    if 'policy_kwargs' in hyperparams.keys() and isinstance(hyperparams['policy_kwargs'], str):
        hyperparams['policy_kwargs'] = eval(hyperparams['policy_kwargs'])

    if 'n_envs' in hyperparams.keys():
        del hyperparams['n_envs']
    del hyperparams['n_timesteps']  # To avoid error

    env_wrapper = get_wrapper_class(hyperparams)
    if 'env_wrapper' in hyperparams.keys():
        del hyperparams['env_wrapper']

    # if (algo == 'ppo2' and ('learning_rate' in hyperparams.keys())):
    #     hyperparams['learning_rate'] = linear_schedule(hyperparams['learning_rate'])

    def create_env(n_envs, eval_env=False):
        if algo in ['a2c', 'acer', 'acktr', 'ppo2']:
            if n_envs > 1:
                env = SubprocVecEnv([
                    make_env(env_id, i, args.seed,
                             log_dir=monitor_path,
                             wrapper_class=env_wrapper,
                             env_kwargs=env_kwargs) for i in range(n_envs)
                ])
            else:
                env = DummyVecEnv([
                    make_env(env_id, 0, args.seed,
                             log_dir=monitor_path,
                             wrapper_class=env_wrapper,
                             env_kwargs=env_kwargs)
                ])
                env = DummyVecEnv([lambda: gym.make(env_id, **env_kwargs)])
                if env_wrapper is not None:
                    env = env_wrapper(env)
        elif (algo in ['dqn', 'her', 'sac', 'td3']) and n_envs > 1:
            raise ValueError("Error: {} does not support multiprocessing!".format(algo))
        elif (algo in ['ddpg', 'ppo1', 'trpo', 'gail']) and n_envs > 1:
            raise ValueError("Error: {} uses MPI for multiprocessing!".format(algo))
        else:
            env = make_vec_env(env_id, n_envs=n_envs, seed=args.seed,
                               monitor_dir=monitor_path,
                               wrapper_class=env_wrapper,
                               env_kwargs=env_kwargs)

        if args.normalize:
            # choose from multiple options
            # env = VecNormalize(env, clip_obs=np.inf)
            env = VecNormalize(env, norm_reward=False, clip_obs=np.inf)
            # env = VecNormalize(env, norm_reward=False, clip_obs=np.inf, **normalize_kwargs)
        return env
        # Zoo: env = SubprocVecEnv([make_env(env_id, i, seed, log_dir, wrapper_class=env_wrapper, env_kwargs=env_kwargs) for i in range(n_envs)])
        # Zoo: env = DummyVecEnv([make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper, env_kwargs=env_kwargs)])

    env = create_env(n_envs)

    # if args.train_RL:  # checking impact of the if-condition position on experiment reproducibility
    callback, callback_path = [], "callbacks/{}_{}_{}".format(*folder)
    save_freq, eval_freq = 100 * episode_len[env_index], 100 * episode_len[env_index]
    save_freq, eval_freq = max(save_freq // n_envs, 1), max(eval_freq // n_envs, 1)
    make_dir(callback_path)
    if args.check_callback:
        callback.append(
            CheckpointCallback(save_freq=save_freq,
                               save_path=callback_path,
                               name_prefix='rl_model',
                               verbose=1))
    if args.eval_callback:
        callback.append(
            EvalCallback(create_env(1, eval_env=True),
                         best_model_save_path=callback_path,
                         log_path=callback_path,
                         eval_freq=eval_freq,
                         verbose=1))

    model = (algo_list[args.algo])(env=env,
                                   seed=args.seed,
                                   tensorboard_log=tensorboard_path,
                                   n_cpu_tf_sess=1,
                                   verbose=args.verbose,
                                   **hyperparams)

    print('\nTraining {} on {} now... \n'.format(algo, env_id))

    start_time = time.time()
    model.learn(total_timesteps=n_timesteps, callback=callback)
    total_time = time.time() - start_time

    if args.normalize:
        env.save(os.path.join(callback_path, "vec_normalize.pkl"))

    if n_envs > 1 or (algo in ['ddpg', 'trpo', 'gail']):
        print("Took {:.2f}s for multiprocessed version - {:.2f} FPS".format(
            total_time, n_timesteps / total_time))
    else:
        print("Took {:.2f}s for single process version - {:.2f} FPS".format(
            total_time, n_timesteps / total_time))

    env = DummyVecEnv([make_env(env_id, 0, args.seed, env_kwargs=env_kwargs)])
    if args.normalize:
        env = VecNormalize.load(os.path.join(callback_path, "vec_normalize.pkl"), env)
        env.training = False
        env.norm_reward = False
    env.seed(args.seed)

    # Evaluate RL model - choose either best model or last available model
    model = (algo_list[algo]).load(os.path.join(callback_path, 'best_model'))
    # model = (algo_list[algo]).load("models/{}_{}_{}".format(*folder))
    model.set_env(env)
    evaluate('policy', model, env_id, env, algo, 100)

    if args.monitor:
        results_plotter.plot_results([monitor_path], n_timesteps,
                                     results_plotter.X_TIMESTEPS,
                                     "{} {}".format(algo, env_id))
        plot_results(monitor_path)

    if args.test:
        print('\nTesting policy...\n')
        # initialize test counters (not set elsewhere in this function)
        episode_reward, total_reward = 0, 0
        done_count, success_count = 0, 0
        obs = env.reset()
        for _ in range(n_timesteps):
            action, _states = model.predict(obs, deterministic=True)
            if isinstance(env.action_space, gym.spaces.Box):
                action = np.clip(action, env.action_space.low, env.action_space.high)
            obs, rewards, dones, info = env.step(action)
            episode_reward += rewards
            env.render()
            if dones:
                done_count += 1
                success_count = check_success(env_index, env_success, success_count)
                total_reward += episode_reward
                episode_reward = 0
                env.reset()

        print('\n{}/{} successful episodes'.format(success_count, done_count))
        average_reward = total_reward / done_count
        print('\nAverage reward: {}'.format(average_reward))

    env.close()
    commands,
    render=False,
    on_rack=False,
)])

if normalize:
    env = VecNormalize(env, clip_obs=1000.0, clip_reward=1000.0, training=False)
    env.load_running_average(workDirectory + "/resultats/" + name_resume + "/normalizeData")

images = []
obs = env.reset()
img = env.render(mode='rgb_array')

for i in range(15 * 2 * 10):
    images.append(img)
    action, _ = model.predict(obs, deterministic=True)
    obs, _, _, _ = env.step(action)
    img = env.render(mode='rgb_array')
    print("frame " + str(i) + "/" + str(2 * 150))

if args.dir == None:
    imageio.mimsave(
        workDirectory + "/resultats/" + name_resume + "/video/" + name_resume + ".gif",
        [np.array(img) for i, img in enumerate(images) if i % 2 == 0],
        fps=50)
else:
    imageio.mimsave(
        args.dir,
        [np.array(img) for i, img in enumerate(images) if i % 2 == 0],
        fps=50)
import gym
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO1

from env import OsmoEnv, NUMCONC

if __name__ == "__main__":
    env = DummyVecEnv([lambda: OsmoEnv()])
    model = PPO1(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=50000)
    model.save("PPO1_baselines")

    for i in range(10):
        observation = env.reset()
        done = False
        while not done:
            action, _ = model.predict(observation)
            observation, _, done, info = env.step(action)
        else:
            print(info)
train_env = DummyVecEnv([lambda: TradingEnv(train_df)])
test_env = DummyVecEnv([lambda: TradingEnv(test_df)])

model = PPO2(MlpPolicy, train_env, verbose=1)
model.learn(total_timesteps=TOTAL_TIME_STEPS)
model.save(save_path="./saved_model/ppo_{}_{}.pkl".format(asset_name, TOTAL_TIME_STEPS),
           cloudpickle=True)

obs = train_env.reset()

# back testing on training data
done = False
while not done:
    action, _states = model.predict(obs)
    obs, rewards, done, info = train_env.step(action)
    train_env.render(title=name[:-13],
                     mode=DISPLAY_MODE,
                     filename='LB_{}_LF_{}_{}_{}_train.txt'.format(
                         LOOKBACK_WINDOW_SIZE, LOOKFORWARD_WINDOW_SIZE,
                         TOTAL_TIME_STEPS, asset_name))

done = False
model.set_env(test_env)
obs = test_env.reset()

# back testing on testing data
while not done:
    action, _states = model.predict(obs)
    obs, rewards, done, info = test_env.step(action)
    test_env.render(title=name[:-13],
def test2():
    import gym
    import datetime as dt
    import matplotlib.pyplot as plt
    from stable_baselines.common.policies import MlpPolicy, CnnPolicy, MlpLstmPolicy, ActorCriticPolicy, LstmPolicy
    from stable_baselines.common.vec_env import DummyVecEnv
    from stable_baselines.common.evaluation import evaluate_policy
    from stable_baselines import PPO2, PPO1, A2C, DQN, TD3, SAC
    import pandas as pd
    from lutils.stock import LTdxHq

    code = '603636'  # 000032 300142 603636 600519

    ltdxhq = LTdxHq()
    # df = ltdxhq.get_k_data_1min('603636')  # 000032 300142 603636 600519
    # df = ltdxhq.get_k_data_5min('603636')
    df = ltdxhq.get_k_data_daily(code, end='2020-01-01')
    eval_df = ltdxhq.get_k_data_daily(code, start='2020-01-01')
    ltdxhq.close()

    df = StockDataFrame(df)  # .rename(columns={'vol': 'volume'}))

    env = DummyVecEnv([lambda: LStockDailyEnv(df)])

    # policy_kwargs = dict(net_arch=[64, 'lstm', dict(vf=[128, 128, 128], pi=[64, 64])])
    policy_kwargs = dict(net_arch=[128, 'lstm', dict(vf=[256, 256], pi=[256, 256])])
    model = A2C('MlpLstmPolicy', env, verbose=1, policy_kwargs=policy_kwargs)
    model.learn(total_timesteps=100000)
    model.save('ppo_stock')

    eval_env = DummyVecEnv([lambda: LStockDailyEnv(StockDataFrame(eval_df))])
    # episode_rewards, _ = evaluate_policy(model, eval_env, n_eval_episodes=1, render=True, return_episode_rewards=True)  # EVAL_EPS

    # is_recurrent = model.policy.recurrent
    obs = eval_env.reset()

    net_worths = []
    actions = []
    done, state = False, None
    # while not done:
    for _ in range(NEXT_OBSERVATION_SIZE, eval_df.shape[0]):
        action, state = model.predict(obs, state=state, deterministic=True)
        obs, reward, done, _info = eval_env.step(action)
        net_worths.append(_info[0]['net_worth'])
        # if is_recurrent:
        #     obs[0, :] = new_obs
        # else:
        #     obs = new_obs

        # if action[0] < Actions.Buy:  # Buy
        #     actions.append(1)
        # elif action[0] < Actions.Sell:  # Sell
        #     actions.append(2)
        # else:
        #     actions.append(0)
        actions.append(action[0])

        eval_env.render()

    print(net_worths)
    plt.plot(net_worths)
    plt.show()
def main(env, load_path, fig_path):
    # skip over 1-baxter-no-penalty (no log monitor.csv)
    if load_path == "1-baxter-no-penalty":
        plot = False
    else:
        plot = True

    # arguments
    print("env %s; load_path %s; fig_path %s;" % (env, load_path, fig_path))
    log_path = os.getcwd() + "/log/" + load_path
    os.makedirs(os.getcwd() + "/figs/" + "/", exist_ok=True)
    fig_path = os.getcwd() + "/figs/" + "/" + fig_path
    load_path = os.getcwd() + "/models/" + load_path

    # make environment, flattened environment, vectorized environment
    env = gym.make(env)
    env = gym.wrappers.FlattenDictWrapper(env, ['observation', 'achieved_goal', 'desired_goal'])
    env = DummyVecEnv([lambda: env])

    # load model
    model = PPO2.load(load_path, env=env)
    obs_initial = env.reset()
    obs = obs_initial

    # plot results
    if plot:
        plot_results(fig_path, log_path)

    # initializations
    niter = 10
    counter = 0
    timestep = 0
    results = [[[0, 0, 0] for i in range(100)], [[0, 0, 0, 0] for i in range(100)]]
    current = [[[0, 0, 0] for i in range(100)], [[0, 0, 0, 0] for i in range(100)]]
    print("==============================")

    # check initial positions and quaternions
    print("grip", env.envs[0].env.env.sim.data.get_site_xpos('grip'))
    print("box", env.envs[0].env.env.sim.data.get_site_xpos('box'))
    print("tool", env.envs[0].env.env.sim.data.get_site_xpos('tool'))
    print("mocap", env.envs[0].env.env.sim.data.mocap_pos)
    print("quat", env.envs[0].env.env.sim.data.mocap_quat)
    print("==============================")

    # mocap quaternion check
    for i in range(5):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        quat = env.envs[0].env.env.sim.data.mocap_quat
        print("obs", obs)
        print("quat", quat)
    print("==============================")

    # start rendering
    dists = []
    box_goal_pos = np.array([0.6, 0.05, -0.17])
    while True:
        if counter == niter:
            break
        action, _states = model.predict(obs)
        obs_old = obs
        obs, rewards, dones, info = env.step(action)
        quaternion = env.envs[0].env.env.sim.data.mocap_quat
        if obs.all() == obs_initial.all():
            if counter % 10 == 0:
                xyzs = current[0]
                quats = current[1]
                print(xyzs)
                print(quats)
                filename = log_path + "/" + "results_" + str(counter) + ".txt"
                os.makedirs(log_path + "/", exist_ok=True)
                file = open(filename, 'w+')
                for xyz, quat in zip(xyzs, quats):
                    for coord in xyz:
                        file.write(str(coord) + " ")
                    for quat_coord in quat:
                        file.write(str(quat_coord) + " ")
                    file.write("\n")
                file.close()

            box_end_pos = np.array(obs_old[0][3:6].tolist())
            print(box_end_pos)
            print(np.shape(box_end_pos))
            print(box_goal_pos)
            print(np.shape(box_goal_pos))
            dists.append(np.linalg.norm(box_goal_pos - box_end_pos))

            current = [[[0, 0, 0] for i in range(100)], [[0, 0, 0, 0] for i in range(100)]]
            timestep = 0
            counter += 1

        print(timestep)
        print("obs", obs)
        print("quat", quaternion)

        # for average trajectory, smoothed
        for i in range(3):
            results[0][timestep][i] += obs[0][:3].tolist()[i]
        for j in range(4):
            results[1][timestep][j] += quaternion[0].tolist()[j]

        # for current trajectory
        for i in range(3):
            current[0][timestep][i] += obs[0][:3].tolist()[i]
        for j in range(4):
            current[1][timestep][j] += quaternion[0].tolist()[j]

        timestep += 1
        env.render()

    # smooth paths by taking average, and calculate mean distance to goal state
    for timestep in range(100):
        for i in range(3):
            results[0][timestep][i] /= niter
        for j in range(4):
            results[1][timestep][j] /= niter
    dist = np.mean(dists)

    # print and write to file
    xyzs = results[0]
    quats = results[1]
    filename = log_path + "/" + "results_avg.txt"
    os.makedirs(log_path + "/", exist_ok=True)
    file = open(filename, 'w+')
    for xyz, quat in zip(xyzs, quats):
        for coord in xyz:
            file.write(str(coord) + " ")
        for quat_coord in quat:
            file.write(str(quat_coord) + " ")
        file.write("\n")
    file.close()

    # print average distances
    print("average distance of box from end goal: %f" % dist)
os.makedirs(model_folder)

policy = ''
model_tag = ''
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

env = DummyVecEnv([lambda: ActionMaskEnv(10, 10)])
model = PPO2(get_policy(policy), env, verbose=0, nminibatches=1,
             tensorboard_log=tensorboard_folder)
model.learn(total_timesteps=10000000, tb_log_name='PPO2' + model_tag)
model.save(model_folder + "PPO2" + model_tag)

del model

model = PPO2.load(model_folder + "PPO2" + model_tag)

done = False
states = None
action_masks = []
obs = env.reset()
while not done:
    action, states = model.predict(obs, states, action_mask=action_masks)
    obs, _, done, infos = env.step(action)
    env.render()

    action_masks.clear()
    for info in infos:
        env_action_mask = info.get('action_mask')
        action_masks.append(env_action_mask)
def test(self, model_epoch: int = 0, render_env: bool = True,
         render_report: bool = True, save_report: bool = False):
    train_provider, test_provider = self.data_provider.split_data_train_test(
        self.train_split_percentage)

    del train_provider

    init_envs = DummyVecEnv([make_env(test_provider) for _ in range(self.n_envs)])

    model_path = path.join('data', 'agents', f'{self.study_name}__{model_epoch}.pkl')
    model = self.Model.load(model_path, env=init_envs)

    test_env = DummyVecEnv([make_env(test_provider) for _ in range(1)])

    self.logger.info(f'Testing model ({self.study_name}__{model_epoch})')

    zero_completed_obs = np.zeros((self.n_envs,) + init_envs.observation_space.shape)
    zero_completed_obs[0, :] = test_env.reset()

    state = None
    rewards = []

    for _ in range(len(test_provider.data_frame)):
        action, state = model.predict(zero_completed_obs, state=state)
        obs, reward, done, info = test_env.step([action[0]])

        zero_completed_obs[0, :] = obs

        rewards.append(reward)

        if render_env:
            test_env.render(mode='human')

        if done:
            net_worths = pd.DataFrame({
                'Date': info[0]['timestamps'],
                'Balance': info[0]['net_worths'],
            })

            net_worths.set_index('Date', drop=True, inplace=True)
            returns = net_worths.pct_change()[1:]

            if render_report:
                qs.plots.snapshot(returns.Balance, title='RL Trader Performance')

            if save_report:
                reports_path = path.join('data', 'reports',
                                         f'{self.study_name}__{model_epoch}.html')
                qs.reports.html(returns.Balance, file=reports_path)

    self.logger.info(
        f'Finished testing model ({self.study_name}__{model_epoch}): ${"{:.2f}".format(np.sum(rewards))}')
def test_check_nan():
    """Test VecCheckNan Object"""
    env = DummyVecEnv([NanAndInfEnv])
    env = VecCheckNan(env, raise_exception=True)

    env.step([[0]])

    try:
        env.step([[float('NaN')]])
    except ValueError:
        pass
    else:
        assert False

    try:
        env.step([[float('inf')]])
    except ValueError:
        pass
    else:
        assert False

    try:
        env.step([[-1]])
    except ValueError:
        pass
    else:
        assert False

    try:
        env.step([[1]])
    except ValueError:
        pass
    else:
        assert False

    env.step(np.array([[0, 1], [0, 1]]))
# n_cpu = 4
# env = SubprocVecEnv([lambda: env for i in range(n_cpu)])

# model = SAC.load('sac_hallway_new2')
# model = SAC.load("sac_hallway_new")
# model = DDPG.load("ddpg_hallway_depth_jerry")
model = A2C.load("trpo_hallway_depth_1")

obs = env.reset()
rewards = []

for i in range(1):
    print("Episode:", i)
    obs = env.reset()
    total_reward = 0
    cumulated_tom_episode_reward = 0
    cumulated_jerry_episode_reward = 0
    done = False
    while not done:
        # predict returns (action, states); keep only the action for the env step
        action, _states = model.predict(obs)
        # print('AAAAAAAAction:', action)
        obs, reward, done, info = env.step(action)
        total_reward += reward

    rewards.append(total_reward)
    print("Episode_rewards:", total_reward)
    print(rewards)
    print('Episode ended. The total reward achieved in this test is :: ', str(total_reward))
    # obs = env.reset()
    time.sleep(5)

env.close()
def test_model_manipulation(request, model_class, storage_method, store_format):
    """
    Test if the algorithm (with a given policy) can be loaded and saved without any issues,
    the environment switching works and that the action prediction works

    :param model_class: (BaseRLModel) A RL model
    :param storage_method: (str) Should file be saved to a file ("path") or to a buffer ("file-like")
    :param store_format: (str) Save format, either "zip" or "cloudpickle".
    """
    # Use postfix ".model" so we can remove the file later
    model_fname = './test_model_{}.model'.format(request.node.name)
    store_as_cloudpickle = store_format == "cloudpickle"

    try:
        env = DummyVecEnv([lambda: IdentityEnv(10)])

        # create and train
        model = model_class(policy="MlpPolicy", env=env)
        model.learn(total_timesteps=50000)

        # predict and measure the acc reward
        acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            # Test action probability method
            model.action_probability(obs)
            obs, reward, _, _ = env.step(action)
            acc_reward += reward
        acc_reward = sum(acc_reward) / N_TRIALS

        # test action probability for given (obs, action) pair
        env = model.get_env()
        obs = env.reset()
        observations = np.array([obs for _ in range(10)])
        observations = np.squeeze(observations)
        actions = np.array([env.action_space.sample() for _ in range(10)])
        actions_probas = model.action_probability(observations, actions=actions)
        assert actions_probas.shape == (len(actions), 1), actions_probas.shape
        assert actions_probas.min() >= 0, actions_probas.min()
        assert actions_probas.max() <= 1, actions_probas.max()

        # saving
        if storage_method == "path":  # saving to a path
            model.save(model_fname, cloudpickle=store_as_cloudpickle)
        else:  # saving to a file-like object (BytesIO in this case)
            b_io = BytesIO()
            model.save(b_io, cloudpickle=store_as_cloudpickle)
            model_bytes = b_io.getvalue()
            b_io.close()

        del model, env

        # loading
        if storage_method == "path":  # loading from path
            model = model_class.load(model_fname)
        else:  # loading from file-like object (BytesIO in this case)
            b_io = BytesIO(model_bytes)
            model = model_class.load(b_io)
            b_io.close()

        # changing environment (note: this can be done at loading)
        env = DummyVecEnv([lambda: IdentityEnv(10)])
        model.set_env(env)

        # predict the same output before saving
        loaded_acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            loaded_acc_reward += reward
        loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS
        assert abs(acc_reward - loaded_acc_reward) < 0.1, "Error: the prediction seems to have changed between " \
                                                          "loading and saving"

        # learn post loading
        model.learn(total_timesteps=100)

        # validate no reset post learning
        loaded_acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            loaded_acc_reward += reward
        loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS
        assert abs(acc_reward - loaded_acc_reward) < 0.1, "Error: the prediction seems to have changed between " \
                                                          "pre learning and post learning"

        # predict new values
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, _, _, _ = env.step(action)

        del model, env
    finally:
        if os.path.exists(model_fname):
            os.remove(model_fname)
def test_model_manipulation(model_class):
    """
    Test if the algorithm can be loaded and saved without any issues, the environment switching
    works and that the action prediction works

    :param model_class: (BaseRLModel) A model
    """
    try:
        env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

        # create and train
        model = model_class(policy="MlpPolicy", env=env)
        model.learn(total_timesteps=NUM_TIMESTEPS, seed=0)

        # predict and measure the acc reward
        acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            acc_reward += reward
        acc_reward = sum(acc_reward) / N_TRIALS

        # saving
        model.save("./test_model")

        del model, env

        # loading
        model = model_class.load("./test_model")

        # changing environment (note: this can be done at loading)
        env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])
        model.set_env(env)

        # predict the same output before saving
        loaded_acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            loaded_acc_reward += reward
        loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS

        with pytest.warns(None) as record:
            act_prob = model.action_probability(obs)

        if model_class in [DDPG, SAC]:
            # check that only one warning was raised
            assert len(record) == 1, "No warning was raised for {}".format(model_class)
            assert act_prob is None, "Error: action_probability should be None for {}".format(model_class)
        else:
            assert act_prob[0].shape == (1, 1) and act_prob[1].shape == (1, 1), \
                "Error: action_probability not returning correct shape"

        # test action probability for given (obs, action) pair
        # must return zero and raise a warning or raise an exception if not defined
        env = model.get_env()
        obs = env.reset()
        observations = np.array([obs for _ in range(10)])
        observations = np.squeeze(observations)
        observations = observations.reshape((-1, 1))
        actions = np.array([env.action_space.sample() for _ in range(10)])

        if model_class == DDPG:
            with pytest.raises(ValueError):
                model.action_probability(observations, actions=actions)
        else:
            with pytest.warns(UserWarning):
                actions_probas = model.action_probability(observations, actions=actions)
            assert actions_probas.shape == (len(actions), 1), actions_probas.shape
            assert np.all(actions_probas == 0.0), actions_probas

        # assert <15% diff
        assert abs(acc_reward - loaded_acc_reward) / max(acc_reward, loaded_acc_reward) < 0.15, \
            "Error: the prediction seems to have changed between loading and saving"

        # learn post loading
        model.learn(total_timesteps=100, seed=0)

        # validate no reset post learning
        # This test was failing from time to time for no good reason
        # other than bad luck
        # We should change this test
        # loaded_acc_reward = 0
        # set_global_seeds(0)
        # obs = env.reset()
        # for _ in range(N_TRIALS):
        #     action, _ = model.predict(obs)
        #     obs, reward, _, _ = env.step(action)
        #     loaded_acc_reward += reward
        # loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS
        # # assert <10% diff
        # assert abs(acc_reward - loaded_acc_reward) / max(acc_reward, loaded_acc_reward) < 0.1, \
        #     "Error: the prediction seems to have changed between pre learning and post learning"

        # predict new values
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, _, _, _ = env.step(action)

        # Free memory
        del model, env
    finally:
        if os.path.exists("./test_model"):
            os.remove("./test_model")
def test_model_manipulation(model_class, storage_method):
    """
    Test if the algorithm (with a given policy) can be loaded and saved without any issues,
    the environment switching works and that the action prediction works

    :param model_class: (BaseRLModel) A RL model
    """
    try:
        env = DummyVecEnv([lambda: IdentityEnv(10)])

        # create and train
        model = model_class(policy="MlpPolicy", env=env)
        model.learn(total_timesteps=50000, seed=0)

        # predict and measure the acc reward
        acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            # Test action probability method
            model.action_probability(obs)
            obs, reward, _, _ = env.step(action)
            acc_reward += reward
        acc_reward = sum(acc_reward) / N_TRIALS

        # saving
        if storage_method == "path":  # saving to a path
            model.save("./test_model")
        else:  # saving to a file-like object (BytesIO in this case)
            b_io = BytesIO()
            model.save(b_io)
            model_bytes = b_io.getvalue()
            b_io.close()

        del model, env

        # loading
        if storage_method == "path":  # loading from path
            model = model_class.load("./test_model")
        else:  # loading from file-like object (BytesIO in this case)
            b_io = BytesIO(model_bytes)
            model = model_class.load(b_io)
            b_io.close()

        # changing environment (note: this can be done at loading)
        env = DummyVecEnv([lambda: IdentityEnv(10)])
        model.set_env(env)

        # predict the same output before saving
        loaded_acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            loaded_acc_reward += reward
        loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS
        assert abs(acc_reward - loaded_acc_reward) < 0.1, "Error: the prediction seems to have changed between " \
                                                          "loading and saving"

        # learn post loading
        model.learn(total_timesteps=100, seed=0)

        # validate no reset post learning
        loaded_acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            loaded_acc_reward += reward
        loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS
        assert abs(acc_reward - loaded_acc_reward) < 0.1, "Error: the prediction seems to have changed between " \
                                                          "pre learning and post learning"

        # predict new values
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, _, _, _ = env.step(action)

        del model, env
    finally:
        if os.path.exists("./test_model"):
            os.remove("./test_model")
tensorboard_log="./" + SAVE_DIR[-3:] + '_' + str(tstep) + "_tensorboard/") model.learn(total_timesteps=tstep, log_interval=128) # model.learn(total_timesteps=tstep) model_name = common_fileName_prefix + str(tstep) + '-' + str( modelNo) + "-model.model" model.save(path.join(SAVE_DIR, model_name), cloudpickle=True) obs = testEnv.reset() # Test for consecutive 2000 days for testNo in range(365 * 5): action, _states = model.predict(obs) if np.isnan(action).any(): print(testNo) obs, rewards, done, info = testEnv.step(action) if done: print("Done") break profit_list.append(info[0]['profit']) act_profit_list.append(info[0]['actual_profit']) singleDay_record = testEnv.render(mode="detail") singleDay_record['testNo'] = testNo singleDay_record['rewards'] = rewards[0] detail_list.append(singleDay_record) if testNo % 365 == 0: print("\n============= TESTING " + str(testNo) + " =============\n") testEnv.render()
# calmar_obs = calmar_env.reset()
omega_obs = omega_env.reset()

profit_net_worths = [10000]
sortino_net_worths = [10000]
# calmar_net_worths = [10000]
omega_net_worths = [10000]

done = False
while not done:
    profit_action, profit_states = profit_model.predict(profit_obs)
    sortino_action, sortino_states = sortino_model.predict(sortino_obs)
    # calmar_action, calmar_states = calmar_model.predict(calmar_obs)
    omega_action, omega_states = omega_model.predict(omega_obs)

    profit_obs, profit_reward, done, info = profit_env.step(profit_action)
    sortino_obs, sortino_reward, done, info = sortino_env.step(sortino_action)
    # calmar_obs, calmar_reward, done, info = calmar_env.step(calmar_action)
    omega_obs, omega_reward, done, info = omega_env.step(omega_action)

    profit_net_worths.append(profit_net_worths[-1] + profit_reward[0])
    sortino_net_worths.append(sortino_net_worths[-1] + sortino_reward[0])
    # calmar_net_worths.append(calmar_net_worths[-1] + calmar_reward[0])
    omega_net_worths.append(omega_net_worths[-1] + omega_reward[0])

with open('./research/results/profit_net_worths_4.pkl', 'wb') as handle:
    pickle.dump(profit_net_worths, handle)

with open('./research/results/sortino_net_worths_4.pkl', 'wb') as handle:
    pickle.dump(sortino_net_worths, handle)
class StableBaselinesTradingStrategy(TradingStrategy):
    """A trading strategy capable of self tuning, training, and evaluating with stable-baselines.

    Arguments:
        environment: An instance of a trading environment for the agent to trade within.
        model: The RL model to create the agent with. Defaults to DQN.
        policy: The RL policy to train the agent's model with. Defaults to 'MlpPolicy'.
        model_kwargs: Any additional keyword arguments to adjust the model.
        kwargs: Optional keyword arguments to adjust the strategy.
    """

    def __init__(self,
                 environment: TradingEnvironment,
                 model: BaseRLModel = DQN,
                 policy: Union[str, BasePolicy] = 'MlpPolicy',
                 model_kwargs: any = {},
                 **kwargs):
        self._model = model
        self._model_kwargs = model_kwargs

        self.environment = environment
        self._agent = self._model(policy, self._environment, **self._model_kwargs)

    @property
    def environment(self) -> 'TradingEnvironment':
        """A `TradingEnvironment` instance for the agent to trade within."""
        return self._environment

    @environment.setter
    def environment(self, environment: 'TradingEnvironment'):
        self._environment = DummyVecEnv([lambda: environment])

    def restore_agent(self, path: str):
        """Deserialize the strategy's learning agent from a file.

        Arguments:
            path: The `str` path of the file the agent specification is stored in.
        """
        self._agent = self._model.load(path, self._environment, self._model_kwargs)

    def save_agent(self, path: str):
        """Serialize the learning agent to a file for restoring later.

        Arguments:
            path: The `str` path of the file to store the agent specification in.
        """
        self._agent.save(path)

    def tune(self,
             steps: int = None,
             episodes: int = None,
             callback: Callable[[pd.DataFrame], bool] = None) -> pd.DataFrame:
        raise NotImplementedError

    def run(self,
            steps: int = None,
            episodes: int = None,
            episode_callback: Callable[[pd.DataFrame], bool] = None) -> pd.DataFrame:
        if steps is None and episodes is None:
            raise ValueError('You must set the number of `steps` or `episodes` to run the strategy.')

        steps_completed = 0
        episodes_completed = 0
        average_reward = 0

        obs, state, dones = self._environment.reset(), None, [False]
        performance = {}

        while (steps is not None and (steps == 0 or steps_completed < steps)) or \
              (episodes is not None and episodes_completed < episodes):
            actions, state = self._agent.predict(obs, state=state, mask=dones)
            obs, rewards, dones, info = self._environment.step(actions)

            steps_completed += 1
            average_reward -= average_reward / steps_completed
            average_reward += rewards[0] / (steps_completed + 1)

            exchange_performance = info[0].get('exchange').performance
            performance = exchange_performance if len(exchange_performance) > 0 else performance

            if dones[0]:
                if episode_callback is not None and episode_callback(self._environment._exchange.performance):
                    break

                episodes_completed += 1
                obs = self._environment.reset()

        print("Finished running strategy.")
        print("Total episodes: {} ({} timesteps).".format(episodes_completed, steps_completed))
        print("Average reward: {}.".format(average_reward))

        return performance
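# A minimal usage sketch for the class above, not part of the original source.
# Assumptions: `exchange_env` stands in for an already-configured TradingEnvironment
# instance, and the agent, file path, and step count are illustrative only.
from stable_baselines import PPO2

strategy = StableBaselinesTradingStrategy(environment=exchange_env,   # hypothetical env instance
                                          model=PPO2,
                                          policy='MlpPolicy',
                                          model_kwargs={'verbose': 1})

# Optionally restore a previously saved agent before running.
# strategy.restore_agent('./agents/ppo2_trading.pkl')

performance = strategy.run(steps=10000)  # run for a fixed number of timesteps
strategy.save_agent('./agents/ppo2_trading.pkl')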
# Observed player board
observation = env.reset()

# Init new Result
result = Result()

done = False
# Amount of moves used to finish the game
rounds = 0

while not done:
    rounds += 1

    # Get a random action from the action space
    if randomAgent:
        action = random.choice(env.envs[0].env.available_actions)

    # Agent performs a step
    if not randomAgent:
        action, _states = model.predict(observation)

    nextObservation, reward, done, info = env.step(action)

    # Renders the game state with radar board
    if choiceRender:
        env.render()

    score += reward

    # Add step to result object
    result.append_history(rounds, action, nextObservation, reward, done, info)

    observation = nextObservation

    # Game is done
    if done:
        print("End of game: Rounds", rounds, "Score", score)
        # Store amount of rounds in result object
        result.set_rounds(rounds)
        # Add current result object to all results
        results.append(result)

print('Finished')
from gym import spaces
import numpy as np

# n_cpu = 4
# total_timesteps = 200000000
# # total_timesteps = 200000
# env = SubprocVecEnv([lambda: gym.make('WalkingSpider-v0') for i in range(n_cpu)])
# model = PPO2(MlpPolicy, env, verbose=1)
# model.learn(total_timesteps=total_timesteps)
# model.save("experience_learned/ppo2_WalkingSpider_v0_testing")
# del model  # remove to demonstrate saving and loading

# Enjoy trained agent
model = PPO2.load("experience_learned/ppo2_WalkingSpider_v0_testing_3")
print("Enjoy trained agent")
env = DummyVecEnv([lambda: gym.make('WalkingSpider-v0')])
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()

# Random Environment
# env = gym.make('WalkingSpider-v0')
# env.reset()
# for _ in range(1000):
#     env.render()
#     observation, reward, done, info = env.step(env.action_space.sample())  # take a random action
#     print("Obs Shape ", observation, " Action Shape ", env.action_space.sample().shape)
def test_model_manipulation(model_class):
    """
    Test if the algorithm can be loaded and saved without any issues, the environment switching
    works and that the action prediction works

    :param model_class: (BaseRLModel) A model
    """
    try:
        env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

        # create and train
        model = model_class(policy="MlpPolicy", env=env)
        model.learn(total_timesteps=NUM_TIMESTEPS, seed=0)

        # predict and measure the acc reward
        acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            acc_reward += reward
        acc_reward = sum(acc_reward) / N_TRIALS

        # saving
        model.save("./test_model")

        del model, env

        # loading
        model = model_class.load("./test_model")

        # changing environment (note: this can be done at loading)
        env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])
        model.set_env(env)

        # predict the same output before saving
        loaded_acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            loaded_acc_reward += reward
        loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS

        # assert <10% diff
        assert abs(acc_reward - loaded_acc_reward) / max(acc_reward, loaded_acc_reward) < 0.1, \
            "Error: the prediction seems to have changed between loading and saving"

        # learn post loading
        model.learn(total_timesteps=100, seed=0)

        # validate no reset post learning
        loaded_acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            loaded_acc_reward += reward
        loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS

        # assert <10% diff
        assert abs(acc_reward - loaded_acc_reward) / max(acc_reward, loaded_acc_reward) < 0.1, \
            "Error: the prediction seems to have changed between pre learning and post learning"

        # predict new values
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, _, _, _ = env.step(action)

        # Free memory
        del model, env
    finally:
        if os.path.exists("./test_model"):
            os.remove("./test_model")