def test_channel_first_env(tmp_path):
    # test_cnn uses an environment with HxWxC observations that get transposed, but we
    # also want to work with CxHxW envs directly, without the transposing wrapper.
    SAVE_NAME = "cnn_model.zip"

    # Create environment with transposed images (CxHxW).
    # If the underlying CNN processes the data in the wrong format,
    # it will raise an error about negative dimension sizes while creating the convolutions.
    env = FakeImageEnv(screen_height=40, screen_width=40, n_channels=1, discrete=True, channel_first=True)

    model = A2C("CnnPolicy", env, n_steps=100).learn(250)

    assert not is_vecenv_wrapped(model.get_env(), VecTransposeImage)

    obs = env.reset()
    action, _ = model.predict(obs, deterministic=True)

    model.save(tmp_path / SAVE_NAME)
    del model

    model = A2C.load(tmp_path / SAVE_NAME)

    # Check that the prediction is the same
    assert np.allclose(action, model.predict(obs, deterministic=True)[0])

    os.remove(str(tmp_path / SAVE_NAME))
def a2c(path):
    env = make_env(HumanPlayer())
    eval_env = make_env(RandomPlayer())
    model = A2C.load(path, env, verbose=1)

    mean, std = evaluate_policy(model, eval_env, n_eval_episodes=10)
    print(f"Loaded policy: mean={mean:.2f} +/- {std}")

    # Show how well we learned by playing a game:
    obs = env.reset()
    done = False
    while not done:
        action, _state = model.predict(obs)
        obs, reward, done, info = env.step(action)
        print(f"{info['turn']: <4} | Reward: {reward: >4} | {info['winner']}")
        env.render()
    print("done")
def evaluate(params):
    # Load saved model
    model = A2C.load(exp_name, env=env)
    results = np.zeros(shape=(0, 0))
    obs = env.reset()

    # Evaluate the agent
    episode_reward = 0
    for _ in range(params.get("test_episodes")):
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        episode_reward += reward
        if done or info.get('is_success', False):
            # Record the result before zeroing the episode reward,
            # and default 'is_success' to False so a missing key does not count as a success.
            result = ("Reward:", episode_reward, "Success?", info.get('is_success', False))
            results = np.append(results, result, axis=None)
            episode_reward = 0.0
            obs = env.reset()
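As an aside, stable-baselines3 ships a helper that performs this reset/step bookkeeping itself; a minimal equivalent sketch, assuming the same `model` and `env` as above:

from stable_baselines3.common.evaluation import evaluate_policy

# evaluate_policy runs n_eval_episodes full episodes and aggregates rewards,
# replacing the manual episode-reward accounting above.
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10, deterministic=True)
print(f"Reward: {mean_reward:.2f} +/- {std_reward:.2f}")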
def load_state(run_dir):
    """
    Function that loads the previously saved state of training.
    """
    # Look for the latest saved state
    state_dir = os.path.join(run_dir, "saved_states")
    i = max([int(f.name) for f in os.scandir(state_dir) if f.is_dir()])
    load_dir = os.path.join(state_dir, str(i))

    policy_load_path = os.path.join(load_dir, 'policy')
    rm_load_path = os.path.join(load_dir, 'rm.pth')
    data_buff_load_path = os.path.join(run_dir, 'data_buff.pth')
    args_path = os.path.join(run_dir, "config.json")

    with open(args_path) as f:
        args = argparse.Namespace()
        args.__dict__.update(json.load(f))

    reward_model = pickle.load(open(rm_load_path, 'rb'))
    data_buffer = pickle.load(open(data_buff_load_path, 'rb'))
    policy = A2C.load(path=policy_load_path)
    return reward_model, policy, data_buffer, i + 1
end_loop = args.start_iter + args.step
print("START, END", start_loop, end_loop)
for i in range(start_loop, end_loop):
    print("EVAL", i)
    avg_dis_reward_run = []
    for j in range(0, 10):
        print("SEED", j)
        # NOTE: these buffers must be loaded for env.set_N below to work:
        # lambd = np.load(f"./{args.folder}/buffers/lambda_{args.algo}_{j}.npy")
        # N = np.load(f"./{args.folder}/buffers/N_{args.algo}_{j}.npy")
        model_name = f"./{args.folder}/models/model_{args.algo}_{j}_{i}"
        # print("Lambd N i", lambd[i], N[i])
        env.set_N(int(N[i]), list(lambd[i]))
        if args.algo == 0:
            model = PPO.load(model_name, env)
        elif args.algo == 1:
            model = A2C.load(model_name, env)
        elif args.algo == 2:
            model = SAC.load(model_name, env)
        elif args.algo == 3:
            # NOTE: this branch assumes `model` was loaded elsewhere
            # before the threshold vector is applied.
            thres_vec = np.load(f"./{args.folder}/buffers/thresvec_{args.env_name}_{j}.npy")
            model.set_threshold_vec(thres_vec[i])
        avg_dis_reward = 0.0
        for k in range(100):
            env.seed(k)
            obs = env.reset()
            reward_traj = []
            dis_reward = 0.0
            for t in range(int(1e3)):
                if args.algo == 3:
def __init__(self, model_path, env):
    self.model = A2C.load(model_path, env)
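For context, a minimal sketch of how such a wrapper might be used end to end; the class name AgentWrapper and the act method are assumptions, while A2C.load and model.predict are standard stable-baselines3 calls:

from stable_baselines3 import A2C

class AgentWrapper:
    def __init__(self, model_path, env):
        # Restore the trained policy and bind it to the given environment
        self.model = A2C.load(model_path, env)

    def act(self, obs):
        # deterministic=True returns the greedy action for reproducible rollouts
        action, _states = self.model.predict(obs, deterministic=True)
        return action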
import gym
from stable_baselines3 import A2C
from stable_baselines3.a2c import MlpPolicy, CnnPolicy
from stable_baselines3.common.cmd_util import make_vec_env

# Parallel environments
# env = make_vec_env('SpaceInvaders-v0', n_envs=4)
# env = gym.make('SpaceInvaders-v0')
env = gym.make('Pong-v0')

# model = A2C(MlpPolicy, env, verbose=1)
# model = A2C(CnnPolicy, env, verbose=1)
model = A2C.load("a2c_pong")
# model.set_env(env)
# model.learn(total_timesteps=50000)
# model.save("a2c_pong")
# del model  # remove to demonstrate saving and loading

obs = env.reset()
score = 0
wins = 0
while True:
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    # env.render()
    score = score + 1
    if rewards > 0:
def process(file):
    env = gym.make('PerigeeRaising-Continuous3D-v0')
    env = NormalizeObservationSpace(env, lambda o: o / env.unwrapped.observation_space.high)
    env = Monitor(env)
    env.seed(42)

    agent = A2C.load(file)
    agent.policy.action_dist = SquashedDiagGaussianDistribution(get_action_dim(env.action_space))
    evaluate_policy(agent, env, n_eval_episodes=1)

    hist_sc_state = env.unwrapped.hist_sc_state
    hist_action = env.unwrapped.hist_action

    time = np.array(list(map(lambda sc_state: sc_state.getDate().durationFrom(hist_sc_state[0].getDate()),
                             hist_sc_state))) / 3600.0  # Convert to hours
    a = np.array(list(map(lambda sc_state: sc_state.getA(), hist_sc_state))) / 1000.0  # Convert to km
    e = np.array(list(map(lambda sc_state: sc_state.getE(), hist_sc_state)))
    mass = np.array(list(map(lambda sc_state: sc_state.getMass(), hist_sc_state)))
    ra = a * (1.0 + e)
    rp = a * (1.0 - e)
    v = np.array(list(map(lambda sc_state: sc_state.getPVCoordinates().getVelocity().toArray(), hist_sc_state)))
    h = np.array(list(map(lambda sc_state: sc_state.getPVCoordinates().getMomentum().toArray(), hist_sc_state)))
    angle_f_v = list(map(lambda q: np.degrees(np.arccos(
        np.dot(q[0], q[1]) / np.linalg.norm(q[0]) / (np.linalg.norm(q[1]) + 1e-10)
    )), zip(v, hist_action)))
    hist_action_plane = list(map(lambda q: q[1] - np.dot(q[1], q[0]) * q[0] / (np.linalg.norm(q[0]) ** 2),
                                 zip(h, hist_action)))
    angle_fp_v = list(map(lambda q: np.degrees(np.arccos(
        np.dot(q[0], q[1] * [1, 1, 0]) / np.linalg.norm(q[0]) / (np.linalg.norm(q[1] * [1, 1, 0]) + 1e-10)
    )), zip(v, hist_action_plane)))

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.ticklabel_format(axis='y', style='plain', useOffset=ra[0])
    axs.set_xlim(time[0], time[-1])
    axs.set_ylim(ra[0] - 20.0, ra[0] + 20.0)
    axs.grid(True)
    axs.set_xlabel("time (h)")
    axs.set_ylabel("ra (km)")
    axs.plot(time, ra, "k")
    plt.tight_layout()
    fig.savefig("plan_ra.pdf", format="pdf")
    plt.close(fig)

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.ticklabel_format(axis='y', style='plain', useOffset=rp[0])
    axs.set_xlim(time[0], time[-1])
    axs.set_ylim(rp[0] - 5.0, rp[0] + 35.0)
    axs.grid(True)
    axs.set_xlabel("time (h)")
    axs.set_ylabel("rp (km)")
    axs.plot(time, rp, "k")
    plt.tight_layout()
    fig.savefig("plan_rp.pdf", format="pdf")
    plt.close(fig)

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.ticklabel_format(axis='y', style='plain', useOffset=mass[0])
    axs.set_xlim(time[0], time[-1])
    axs.set_ylim(mass[0] - 0.04, mass[0])
    axs.grid(True)
    axs.set_xlabel("time (h)")
    axs.set_ylabel("mass (kg)")
    axs.plot(time, mass, "k")
    plt.tight_layout()
    fig.savefig("plan_m.pdf", format="pdf")
    plt.close(fig)

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.ticklabel_format(axis='y', style='plain')
    axs.set_xlim(time[0], time[-1])
    axs.set_ylim(-1.3, 1.3)
    axs.grid(True)
    axs.set_xlabel("time (h)")
    axs.set_ylabel("action")
    l1, l2, l3 = axs.plot(time[0:-1], hist_action, "k")
    l1.set_color("#000000")
    l2.set_color("#777777")
    l3.set_color("#BBBBBB")
    axs.legend(["Act1", "Act2", "Act3"], loc='upper left')
    plt.tight_layout()
    fig.savefig("plan_action.pdf", format="pdf")
    plt.close(fig)
                    type=str, help='Help (default: ..)', metavar='')
ARGS = parser.parse_args()

#### Load the model from file ##############################
algo = ARGS.exp.split("-")[2]
if os.path.isfile(ARGS.exp + '/success_model.zip'):
    path = ARGS.exp + '/success_model.zip'
elif os.path.isfile(ARGS.exp + '/best_model.zip'):
    path = ARGS.exp + '/best_model.zip'
else:
    print("[ERROR]: no model under the specified path", ARGS.exp)
if algo == 'a2c':
    model = A2C.load(path)
elif algo == 'ppo':
    model = PPO.load(path)
elif algo == 'sac':
    model = SAC.load(path)
elif algo == 'td3':
    model = TD3.load(path)
elif algo == 'ddpg':
    model = DDPG.load(path)

#### Parameters to recreate the environment ################
env_name = ARGS.exp.split("-")[1] + "-aviary-v0"
OBS = ObservationType.KIN if ARGS.exp.split("-")[3] == 'kin' else ObservationType.RGB
if ARGS.exp.split("-")[4] == 'rpm':
    ACT = ActionType.RPM
# Step 3.b Pass through normalization and frame stacking (optional)
env = VecFrameStack(env, n_stack=custom_params['FRAME_STACK'])  # Use 1 for now because we use image
if not custom_params['USING_VAE']:
    env = VecTransposeImage(env)  # Uncomment if using 3d obs
if custom_params['USING_NORMALIZATION']:
    env = VecNormalize.load(osp.join(results_dir, "vec_normalization.pkl"), env)

# Load the agent
if custom_params['algo'] == 'sac':
    model = SAC.load(osp.join(results_dir, "best_model", "best_model.zip"))
elif custom_params['algo'] == 'a2c':
    model = A2C.load(osp.join(results_dir, "best_model", "best_model.zip"))
elif custom_params['algo'] == 'dqn':
    model = DQN.load(osp.join(results_dir, "best_model", "best_model.zip"))
elif custom_params['algo'] == 'ppo':
    model = PPO.load(osp.join(results_dir, "best_model", "best_model.zip"))
else:
    raise ValueError("Unknown algorithm: {}".format(custom_params['algo']))

# Use the saved statistics, but do not update them at test time
env.training = False
# Reward normalization is not needed at test time
env.norm_reward = False

obs = env.reset()
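The training-side counterpart of this pattern saves the normalization statistics so VecNormalize.load can restore them at test time; a one-line sketch, assuming a VecNormalize-wrapped `env` and the same `results_dir` layout:

# Hypothetical training-side counterpart: persist the running observation/return
# statistics that VecNormalize.load restores above.
env.save(osp.join(results_dir, "vec_normalization.pkl"))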
from stable_baselines3.common.evaluation import evaluate_policy

# Create environment
# env = gym.make('LunarLander-v2')
env = gym.make('ransim-v0')

# Instantiate the agent
model = A2C('MlpPolicy', env, verbose=1)
# Train the agent
model.learn(total_timesteps=int(2e3), eval_log_path='log_msa')
# Save the agent
model.save("a2c_ran")
del model  # delete trained model to demonstrate loading

# Load the trained agent
model = A2C.load("a2c_ran")

# Evaluate the agent
# mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10, return_episode_rewards=False)
episode_rewards, episode_lengths = evaluate_policy(model, env, n_eval_episodes=10, return_episode_rewards=True)
# print('mean_reward: %.3f std_reward: %.3f' % (mean_reward, std_reward))
msa = 1

'''# Enjoy trained agent
obs = env.reset()
for i in range(2000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()'''
                  n_steps=config["n_steps"],
                  vf_coef=config["vf_coef"],
                  ent_coef=config["ent_coef"],
                  max_grad_norm=config["max_grad_norm"],
                  learning_rate=lr,
                  rms_prop_eps=config["epsilon"],
                  use_rms_prop=config["use_rms_prop"],
                  use_sde=config["use_sde"],
                  normalize_advantage=config["normalize_advantage"],
                  verbose=config["verbose"],
                  tensorboard_log="tb/{}/".format(config["session_ID"]),
                  policy_kwargs=dict(net_arch=[int(config["policy_hid_dim"]), int(config["policy_hid_dim"])]))
    model.learn(learn_total_steps)
    model.save("learned/{}".format(config["session_ID"]))
    env.save("learned/{}.pkl".format(config["session_ID"]))
    env.close()
else:
    model = A2C.load("learned/{}".format(config["session_ID"]))
    env = DummyVecEnv([lambda: HumanoidBulletEnv(animate=True, max_steps=env_max_steps)])
    env = VecNormalize.load("learned/{}.pkl".format(config["session_ID"]), env)
    env.training = False
    env.norm_reward = False

    obs = env.reset()
    for i in range(env_max_steps):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
    env.close()
def load_model(self):
    print('Loading model from: {}'.format(self.model_path))
    model = A2C.load(self.model_path)
    model.set_env(self.env)
    model.tensorboard_log = self.log_dir
    return model
model = A2C('CnnPolicy', env,
            gamma=0.8,
            learning_rate=5e-4,
            verbose=1,
            tensorboard_log="logs/")
model.learn(total_timesteps=int(2e5))
model.save("a2c_highway")
# model = A2C('CnnPolicy', env).learn(total_timesteps=int(2e5))
# model.save("a2c_highway_basic")
# model.save("a2c_highway_policy5")

# Record video
# env.configure({"policy_frequency": 15, "duration": 20 * 15})
# model = A2C.load("a2c_highway_policy5")
model = A2C.load("a2c_highwayv0")
# model = A2C.load("a2c_highway_basic")
# env.configure({"policy_frequency": 15, "duration": 20 * 15})
# video_length = 2 * env.config["duration"]
# env = VecVideoRecorder(env, "videos/",
#                        record_video_trigger=lambda x: x == 0, video_length=video_length,
#                        name_prefix="dqn-agent")

evaluate(env, model)
for _ in range(5):
    obs = env.reset()
    done = False
    while not done:
        action, _ = model.predict(obs)
        print(action)
import gym
from stable_baselines3 import A2C
from stable_baselines3.a2c import MlpPolicy
from stable_baselines3.common.env_util import make_vec_env

# Parallel environments
env = make_vec_env('CartPole-v1', n_envs=4)

model = A2C(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=25000)
model.save("a2c_cartpole")

del model  # remove to demonstrate saving and loading

model = A2C.load("a2c_cartpole")

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
def process(file):
    env = gym.make('PerigeeRaising-Continuous3D-v0')
    env.unwrapped._ref_sv[2] = 0.0
    env.unwrapped._ref_sv[3] = 0.0
    env.unwrapped._ref_sv[4] = 0.0
    env = NormalizeObservationSpace(env, lambda o: o / env.unwrapped.observation_space.high)
    env = Monitor(env)
    env.seed(42)

    agent = A2C.load(file)
    agent.policy.action_dist = SquashedDiagGaussianDistribution(get_action_dim(env.action_space))
    evaluate_policy(agent, env, n_eval_episodes=1)

    hist_sc_state = env.unwrapped.hist_sc_state
    hist_action = env.unwrapped.hist_action
    x = np.array(list(map(lambda sc_state: sc_state.getPVCoordinates().getPosition().getX(),
                          hist_sc_state))) / 1000.0  # Convert to km
    y = np.array(list(map(lambda sc_state: sc_state.getPVCoordinates().getPosition().getY(),
                          hist_sc_state))) / 1000.0  # Convert to km

    env2 = gym.make('PerigeeRaising-Continuous3D-v0')
    env2.unwrapped._ref_sv[0] = 11000000.0 / 1.05
    env2.unwrapped._ref_sv[1] = 0.05
    env2.unwrapped._ref_sv[2] = 0.0
    env2.unwrapped._ref_sv[3] = 0.0
    env2.unwrapped._ref_sv[4] = 0.0
    env2 = NormalizeObservationSpace(env2, lambda o: o / env2.unwrapped.observation_space.high)
    env2 = Monitor(env2)
    env2.seed(42)

    agent = A2C.load(file)
    agent.policy.action_dist = SquashedDiagGaussianDistribution(get_action_dim(env.action_space))
    evaluate_policy(agent, env2, n_eval_episodes=1)

    hist_sc_state2 = env2.unwrapped.hist_sc_state
    hist_action2 = env2.unwrapped.hist_action
    x2 = np.array(list(map(lambda sc_state: sc_state.getPVCoordinates().getPosition().getX(),
                           hist_sc_state2))) / 1000.0  # Convert to km
    y2 = np.array(list(map(lambda sc_state: sc_state.getPVCoordinates().getPosition().getY(),
                           hist_sc_state2))) / 1000.0  # Convert to km

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.set_xlim(-12000, 12000)
    axs.set_ylim(-12000, 12000)
    axs.grid(False)
    axs.plot(x, y, "k", zorder=2)
    l2, = axs.plot(x2, y2, zorder=1)
    l2.set_color("#777777")
    axs.legend(["Before", "After"], loc='upper right', frameon=False, bbox_to_anchor=(0.0, 1.0))
    im = mpimg.imread('earth.png')
    plt.imshow(im, extent=[-6400, 6400, -6400, 6400], interpolation="none")
    axs.set_aspect('equal')
    plt.text(11000, 0, "Pericenter")
    plt.text(-18500, 0, "Apocenter")
    plt.axis('off')
    plt.tight_layout()
    fig.savefig("orbit.pdf", format="pdf")
    plt.close(fig)
def compare_models(ticker):
    # Initialize structures for evaluation
    train_data_path = '../data/{}_train.csv'.format(ticker.lower())
    val_data_path = '../data/{}_validation.csv'.format(ticker.lower())
    train_data = pd.read_csv(train_data_path)
    val_data = pd.read_csv(val_data_path)
    val_data['Date'] = pd.to_datetime(val_data['Date'])
    env = SingleStockTradingEnv(train_data_path, engineer_features,
                                initial_value=INITIAL_PORTFOLIO_VALUE,
                                borrowing=BORROWING, long_only=LONG_ONLY)

    # Run evaluation for just the RL agent
    rl_checkpoint_path = 'checkpoints/{}_rl_no_restrictions'.format(ticker.lower())
    a2c = A2C.load(rl_checkpoint_path)
    rl_portfolio_values, rl_agent_holdings, rl_agent_actions, rl_goal_num_shares, rl_fig = evaluate(
        a2c, ticker, val_data, INITIAL_PORTFOLIO_VALUE, BORROWING, LONG_ONLY,
        use_gp=False, plot=True, show_plots=False, save_plots=False,
        env_type='no_restrictions')

    # Get features for the GPs
    lookback = 5
    train_features = engineer_features(train_data, lookback=lookback)

    # Turn data in dataframes into model inputs
    X_train = torch.Tensor(
        train_features.drop(['Date', 'Volume', 'Returns', 'Close', f'Open -{lookback}'], axis=1).values)
    y_train = torch.Tensor(train_features['Returns'].values)

    gp_params = {
        'n_train': 20,
        'training_iter': 10,
        'cvar_limit': -5,  # maximum loss tolerance %
        'gp_limit': 0.3,   # predicted magnitude of GPR such that GP takes over
        'data': {
            'X_train': X_train[-20:],
            'y_train': y_train[-20:]
        }  # last month's worth of data?
    }

    # Run evaluation for the RL w/ GP agent
    a2c_gp = TradingAgent(use_gp=True, gp_params=gp_params, policy='MlpPolicy', env=env)
    a2c_gp.load(rl_path=rl_checkpoint_path)
    a2c_gp.learn(5000)
    a2c_gp.save(rl_path='checkpoints/{}_a2c_gp_no_restrictions_rl'.format(ticker.lower()),
                gp_path='checkpoints/{}_a2c_gp_no_restrictions_gp'.format(ticker.lower()))
    gp_portfolio_values, gp_agent_holdings, gp_agent_actions, gp_goal_num_shares, gp_fig = evaluate(
        a2c_gp, ticker, val_data, INITIAL_PORTFOLIO_VALUE, BORROWING, LONG_ONLY,
        use_gp=True, plot=True, show_plots=False, save_plots=False,
        env_type='no_restrictions')

    # Plot some quantities that might be interesting to look at
    comp_fig = plt.figure(figsize=(20, 5))
    plt.plot(val_data['Date'].iloc[6:],
             np.exp(val_data['Returns'].iloc[6:].cumsum()) * INITIAL_PORTFOLIO_VALUE,
             label='Buy and Hold')
    plt.plot(val_data['Date'].iloc[6:], rl_portfolio_values, label='A2C')
    plt.plot(val_data['Date'].iloc[6:], gp_portfolio_values, label='A2C + GP')
    plt.title('Performance Comparison - {}'.format(ticker))
    plt.xlabel('Date')
    plt.ylabel('Portfolio Value')
    plt.legend()

    actions_fig = plt.figure(figsize=(20, 5))
    plt.plot(val_data['Date'].iloc[6:], rl_agent_actions, label='RL Actions')
    plt.plot(val_data['Date'].iloc[6:], gp_agent_actions, label='GP Actions')
    plt.title('Actions Comparison - {}'.format(ticker))
    plt.legend()

    shares_fig = plt.figure(figsize=(20, 5))
    plt.plot(val_data['Date'].iloc[6:], rl_agent_holdings, label='RL Current # Shares')
    plt.plot(val_data['Date'].iloc[6:], gp_goal_num_shares, label='GP Target # Shares')
    plt.plot(val_data['Date'].iloc[6:], gp_agent_holdings, label='GP Current # Shares')
    plt.title('Holdings Comparison - {}'.format(ticker))
    plt.legend()
    # plt.show()

    # Save figures
    # rl_fig.savefig('figures/{}_rl_base_no_restrictions.pdf'.format(ticker.lower()), bbox_inches='tight')
    gp_fig.savefig('figures/{}_rl_with_gp_no_restrictions.pdf'.format(ticker.lower()),
                   bbox_inches='tight')
    comp_fig.savefig('figures/{}_rl_gp_comparison_no_restrictions.pdf'.format(ticker.lower()),
                     bbox_inches='tight')
    actions_fig.savefig('figures/{}_actions_comparison_no_restrictions.pdf'.format(ticker.lower()),
                        bbox_inches='tight')
    shares_fig.savefig('figures/{}_num_shares_comparison_no_restrictions.pdf'.format(ticker.lower()),
                       bbox_inches='tight')

    # Calculate and output Sharpe ratios (assume risk-free rate is 0)
    base_log_returns = np.diff(np.log(val_data['Adj Close']))
    base_daily_vol = np.std(base_log_returns)
    base_sharpe = np.sqrt(252) * np.mean(base_log_returns) / base_daily_vol
    rl_log_returns = np.diff(np.log(rl_portfolio_values))
    rl_daily_vol = np.std(rl_log_returns)
    rl_sharpe = np.sqrt(252) * np.mean(rl_log_returns) / rl_daily_vol
    gp_log_returns = np.diff(np.log(gp_portfolio_values))
    gp_daily_vol = np.std(gp_log_returns)
    gp_sharpe = np.sqrt(252) * np.mean(gp_log_returns) / gp_daily_vol
    print('Base: {:.4f}, {:.4f}\tA2C: {:.4f}, {:.4f}\tA2C+GP: {:.4f}, {:.4f}'.format(
        base_sharpe, base_daily_vol, rl_sharpe, rl_daily_vol, gp_sharpe, gp_daily_vol))
    # Download from given bucket (gcloud configured with privileges)
    client = gcloud.init_storage_client()
    bucket_name = args.model.split('/')[2]
    model_path = args.model.split(bucket_name + '/')[-1]
    gcloud.read_from_bucket(client, bucket_name, model_path)
    model_path = './' + model_path
else:
    model_path = args.model

model = None
if args.algorithm == 'DQN':
    model = DQN.load(model_path, tensorboard_log=args.tensorboard)
elif args.algorithm == 'DDPG':
    model = DDPG.load(model_path, tensorboard_log=args.tensorboard)
elif args.algorithm == 'A2C':
    model = A2C.load(model_path, tensorboard_log=args.tensorboard)
elif args.algorithm == 'PPO':
    model = PPO.load(model_path, tensorboard_log=args.tensorboard)
elif args.algorithm == 'SAC':
    model = SAC.load(model_path, tensorboard_log=args.tensorboard)
elif args.algorithm == 'TD3':
    model = TD3.load(model_path, tensorboard_log=args.tensorboard)
else:
    raise RuntimeError('Algorithm specified is not registered.')

model.set_env(env)

# ---------------------------------------------------------------------------- #
#        Calculating total training timesteps based on number of episodes      #
# ---------------------------------------------------------------------------- #
n_timesteps_episode = env.simulator._eplus_one_epi_len / \
def load(self, rl_path, gp_path=None):
    self.rl = A2C.load(rl_path)
    if gp_path is not None:
        state_dict = torch.load(gp_path)
        self.gp.load_state_dict(state_dict)
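A possible call site for this loader, mirroring the checkpoint naming used in `compare_models` above; the ticker prefix and the `agent` instance are assumptions:

# Hypothetical usage: restore the RL policy alone, or together with the GP state dict.
agent.load(rl_path='checkpoints/aapl_rl_no_restrictions',
           gp_path='checkpoints/aapl_a2c_gp_no_restrictions_gp')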
    # Download from given bucket (gcloud configured with privileges)
    client = gcloud.init_storage_client()
    bucket_name = args.model.split('/')[2]
    model_path = args.model.split(bucket_name + '/')[-1]
    gcloud.read_from_bucket(client, bucket_name, model_path)
    model_path = './' + model_path
else:
    model_path = args.model

model = None
if args.algorithm == 'DQN':
    model = DQN.load(model_path)
elif args.algorithm == 'DDPG':
    model = DDPG.load(model_path)
elif args.algorithm == 'A2C':
    model = A2C.load(model_path)
elif args.algorithm == 'PPO':
    model = PPO.load(model_path)
elif args.algorithm == 'SAC':
    model = SAC.load(model_path)
elif args.algorithm == 'TD3':
    model = TD3.load(model_path)
else:
    raise RuntimeError('Algorithm specified is not registered.')

# ---------------------------------------------------------------------------- #
#                             Execute loaded agent                             #
# ---------------------------------------------------------------------------- #
for i in range(args.episodes):
    obs = env.reset()
    rewards = []
@classmethod
def load(cls, filename, **kwargs):
    # cls as first parameter implies this is a classmethod; the decorator
    # is added here for correctness.
    rlberry_a2c_wrapper = cls(**kwargs)
    rlberry_a2c_wrapper.wrapped = A2CStableBaselines.load(filename)
    return rlberry_a2c_wrapper
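A usage sketch for this classmethod loader; the wrapper class name RlberryA2CWrapper, the constructor kwargs, and the saved filename are assumptions:

# Hypothetical usage: rebuild the wrapper, then restore the underlying SB3 model.
agent = RlberryA2CWrapper.load("a2c_cartpole.zip", env=env)
action, _ = agent.wrapped.predict(obs)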
import gym
import os
from stable_baselines3.common.monitor import Monitor as M
from stable_baselines3 import A2C
from random import randint
from csv import reader

model = A2C.load("./best_models/combined_600_1000")
env = gym.make('LunarLander-v2')

# Read csv file as a list of lists
with open('./moderate_dataset/urgan_test_samples.csv', 'r') as read_obj:
    # Pass the file object to reader() to get the reader object
    csv_reader = reader(read_obj)
    # Pass reader object to list() to get a list of lists
    list_of_rows = list(csv_reader)
    test_samples = [[float(j) for j in i] for i in list_of_rows]

TEST_LEVEL_NUMS = 20
cumulated_reward_ls = []
last_reward_ls = []
for i in range(TEST_LEVEL_NUMS):
    env.load_terrain(test_samples[i])
    init_position = randint(1, 18)
    env.set_initial_x(init_position)
    # Logs will be saved in log_dir/monitor.csv
    obs = env.reset()
env_id = 'PongNoFrameskip-v4'
video_folder = 'logs/videos/'
video_length = 1000
nEnv = 8

startFresh = False
if startFresh:
    env = make_atari_env(env_id, n_envs=nEnv, seed=0)
    env = VecFrameStack(env, n_stack=4)
    env.reset()
    model = A2C('CnnPolicy', env, verbose=1)
    model.learn(total_timesteps=25000)
    model.save("a2c_pong_{}".format(model.num_timesteps))
    record_video(env_id, model, video_length=500, prefix='ac2_' + env_id, video_folder='videos/')
else:
    env = make_atari_env(env_id, n_envs=nEnv, seed=0)
    env = VecFrameStack(env, n_stack=4)
    env.reset()
    trained_model = A2C.load("a2c_pong_200000", verbose=1)
    trained_model.set_env(env)
    trained_model.learn(total_timesteps=1000, reset_num_timesteps=False)
    trained_model.save("a2c_pong_{}".format(trained_model.num_timesteps))
    record_video(env_id, trained_model, video_length=500, prefix='ac2_' + env_id, video_folder='videos/')
model = PPO('MlpPolicy', env=env, verbose=1)
model.learn(total_timesteps=timesteps)
model.save("model_cups")


def act(env, model):
    # env is deterministic, as in: if I say "go right" the gripper will go right all the time.
    obs = env.reset()
    for i in range(100):
        env.render()
        action, _states = model.predict(obs, deterministic=True)
        # print(action)
        obs, reward, done, info = env.step(action)
        if done:
            print('[FINAL] obs=', obs, 'reward=', reward, 'done=', done)
            break


type = "DQN"
TIME_STEPS = 50000
env = gym.make('CupsWorld-v0')
# train(env, type, TIME_STEPS)
if type == "A2C":
    model = A2C.load('model_cups')
elif type == "DQN":
    model = DQN.load('model_cups')
elif type == "PPO":
    model = PPO.load('model_cups')
act(env, model)
env.close()
def __init__(self):
    self.env = A2CAgent.create_env(1)
    self.model = A2C.load(MODEL_PATH)
def process(file):
    env = gym.make('PerigeeRaising-Continuous3D-v0', use_perturbations=True, perturb_action=True)
    env = NormalizeObservationSpace(env, lambda o: o / env.unwrapped.observation_space.high)
    env = Monitor(env)
    env.seed(42)

    agent = A2C.load(file)
    agent.policy.action_dist = SquashedDiagGaussianDistribution(get_action_dim(env.action_space))
    evaluate_policy(agent, env, n_eval_episodes=1)

    hist_sc_state = env.unwrapped.hist_sc_state
    hist_action = env.unwrapped.hist_action
    time = np.array(list(map(lambda sc_state: sc_state.getDate().durationFrom(hist_sc_state[0].getDate()),
                             hist_sc_state))) / 3600.0  # Convert to hours
    a = np.array(list(map(lambda sc_state: sc_state.getA(), hist_sc_state))) / 1000.0  # Convert to km
    e = np.array(list(map(lambda sc_state: sc_state.getE(), hist_sc_state)))
    mass = np.array(list(map(lambda sc_state: sc_state.getMass(), hist_sc_state)))
    ra = a * (1.0 + e)
    rp = a * (1.0 - e)

    env2 = gym.make('PerigeeRaising-Continuous3D-v0')
    env2 = NormalizeObservationSpace(env2, lambda o: o / env2.unwrapped.observation_space.high)
    env2 = Monitor(env2)
    env2.seed(42)

    agent = A2C.load(file)
    agent.policy.action_dist = SquashedDiagGaussianDistribution(get_action_dim(env.action_space))
    evaluate_policy(agent, env2, n_eval_episodes=1)

    hist_sc_state2 = env2.unwrapped.hist_sc_state
    hist_action2 = env2.unwrapped.hist_action
    time2 = np.array(list(map(lambda sc_state: sc_state.getDate().durationFrom(hist_sc_state2[0].getDate()),
                              hist_sc_state2))) / 3600.0  # Convert to hours
    a2 = np.array(list(map(lambda sc_state: sc_state.getA(), hist_sc_state2))) / 1000.0  # Convert to km
    e2 = np.array(list(map(lambda sc_state: sc_state.getE(), hist_sc_state2)))
    mass2 = np.array(list(map(lambda sc_state: sc_state.getMass(), hist_sc_state2)))
    ra2 = a2 * (1.0 + e2)
    rp2 = a2 * (1.0 - e2)

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.ticklabel_format(axis='y', style='plain', useOffset=ra[0])
    axs.set_xlim(time[0], time[-1])
    axs.set_ylim(ra[0] - 20.0, ra[0] + 20.0)
    axs.grid(True)
    axs.set_xlabel("time (h)")
    axs.set_ylabel("ra (km)")
    l2, = axs.plot(time2, ra2, "--")
    l2.set_color("#777777")
    axs.plot(time, ra, "k")
    axs.legend(["Planned", "Real"], loc='upper left')
    plt.tight_layout()
    fig.savefig("real_ra.pdf", format="pdf")
    plt.close(fig)

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.ticklabel_format(axis='y', style='plain', useOffset=rp[0])
    axs.set_xlim(time[0], time[-1])
    axs.set_ylim(rp[0] - 5.0, rp[0] + 35.0)
    axs.grid(True)
    axs.set_xlabel("time (h)")
    axs.set_ylabel("rp (km)")
    l2, = axs.plot(time2, rp2, "--")
    l2.set_color("#777777")
    axs.plot(time, rp, "k")
    axs.legend(["Planned", "Real"], loc='upper left')
    plt.tight_layout()
    fig.savefig("real_rp.pdf", format="pdf")
    plt.close(fig)

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.ticklabel_format(axis='y', style='plain', useOffset=mass[0])
    axs.set_xlim(time[0], time[-1])
    axs.set_ylim(mass[0] - 0.04, mass[0])
    axs.grid(True)
    axs.set_xlabel("time (h)")
    axs.set_ylabel("mass (kg)")
    l2, = axs.plot(time2, mass2, "--")
    l2.set_color("#777777")
    axs.plot(time, mass, "k")
    axs.legend(["Planned", "Real"], loc='upper right')
    plt.tight_layout()
    fig.savefig("real_m.pdf", format="pdf")
    plt.close(fig)

    fig, axs = plt.subplots(1, 1, figsize=(4.8, 3.0))
    axs.ticklabel_format(axis='y', style='plain')
    axs.set_xlim(time[0], time[-1])
    axs.set_ylim(-1.3, 1.3)
    axs.grid(True)
    axs.set_xlabel("time (h)")
    axs.set_ylabel("action")
    l1, l2, l3 = axs.plot(time[0:-1], hist_action)
    l1.set_color("#000000")
    l2.set_color("#777777")
    l3.set_color("#BBBBBB")
    axs.legend(["Act1", "Act2", "Act3"], loc='upper left')
    plt.tight_layout()
    fig.savefig("real_action.pdf", format="pdf")
    plt.close(fig)
                 learning_rate=linear_schedule(0.0001),
                 seed=1,
                 gamma=gamma)
    model.learn(total_timesteps=1000000, callback=reward_callback)
    model.save(current_path + "/models/model_" + dt_string)
elif mode == "train_on_pretrained":
    # Load the most recently saved pre-trained agent
    model_files = [f for f in listdir(current_path + "/models")
                   if isfile(join(current_path + "/models", f))]
    model_pre_trained = A2C.load(current_path + "/models/" + model_files[0])
    model_pre_trained.set_env(env=env)
    model_pre_trained.learn(total_timesteps=1000000, callback=reward_callback)
elif mode == "test":
    total_test_episodes = 100
    model_files = [f for f in listdir(current_path + "/models")
                   if isfile(join(current_path + "/models", f))]
    model = A2C.load(current_path + "/models/" +
            env,
            gamma=0.8,
            learning_rate=5e-4,
            verbose=1,
            tensorboard_log="logs/")
model.learn(total_timesteps=int(2e5))
model.save("a2c_multiv3")
# model = A2C('CnnPolicy', env).learn(total_timesteps=int(2e5))
# model.save("a2c_highway_basic")
# model.save("a2c_highway_policy5")

# Record video
# env.configure({"policy_frequency": 15, "duration": 20 * 15})
# model = A2C.load("a2c_highway_policy5")
model = A2C.load("a2c_multiv2")
# model = A2C.load("a2c_highway_basic")
# env.configure({"policy_frequency": 15, "duration": 20 * 15})
# video_length = 2 * env.config["duration"]
# env = VecVideoRecorder(env, "videos/",
#                        record_video_trigger=lambda x: x == 0, video_length=video_length,
#                        name_prefix="dqn-agent")

evaluate(env, model)
print("End")
for _ in range(5):
    obs = env.reset()
    done = False
    while not done:
        action, _ = model.predict(obs)
        print(action)