def run(seed, episodes, evaluation_episodes, batch_size, gamma, inverting_gradients, initial_memory_threshold,
        replay_memory_size, epsilon_steps, tau_actor, tau_actor_param, use_ornstein_noise, learning_rate_actor,
        learning_rate_actor_param, epsilon_final, zero_index_gradients, initialise_params, scale_actions,
        clip_grad, split, indexed, layers, multipass, weighted, average, random_weighted, render_freq,
        save_freq, save_dir, save_frames, visualise, action_input_layer, title):
    if save_freq > 0 and save_dir:
        save_dir = os.path.join(save_dir, title + "{}".format(str(seed)))
        os.makedirs(save_dir, exist_ok=True)
    assert not (save_frames and visualise)
    if visualise:
        assert render_freq > 0
    if save_frames:
        assert render_freq > 0
        vidir = os.path.join(save_dir, "frames")
        os.makedirs(vidir, exist_ok=True)

    env = gym.make('Platform-v0')
    initial_params_ = [3., 10., 400.]
    if scale_actions:
        for a in range(env.action_space.spaces[0].n):
            initial_params_[a] = 2. * (initial_params_[a] - env.action_space.spaces[1].spaces[a].low) / (
                    env.action_space.spaces[1].spaces[a].high - env.action_space.spaces[1].spaces[a].low) - 1.

    env = ScaledStateWrapper(env)  # scale the state space to [-1, 1]
    env = PlatformFlattenedActionWrapper(env)  # flatten the action space
    if scale_actions:  # scale the action parameters to [-1, 1]
        env = ScaledParameterisedActionWrapper(env)

    dir = os.path.join(save_dir, title)
    env = Monitor(env, directory=os.path.join(dir, str(seed)), video_callable=False, write_upon_reset=False,
                  force=True)
    env.seed(seed)
    np.random.seed(seed)
    print(env.observation_space)

    from agents.pdqn import PDQNAgent
    from agents.pdqn_split import SplitPDQNAgent
    from agents.pdqn_multipass import MultiPassPDQNAgent
    assert not (split and multipass)
    agent_class = PDQNAgent
    if split:
        agent_class = SplitPDQNAgent
    elif multipass:
        agent_class = MultiPassPDQNAgent
    agent = agent_class(env.observation_space.spaces[0], env.action_space,
                        batch_size=batch_size,
                        learning_rate_actor=learning_rate_actor,
                        learning_rate_actor_param=learning_rate_actor_param,
                        epsilon_steps=epsilon_steps,
                        gamma=gamma,
                        tau_actor=tau_actor,
                        tau_actor_param=tau_actor_param,
                        clip_grad=clip_grad,
                        indexed=indexed,
                        weighted=weighted,
                        average=average,
                        random_weighted=random_weighted,
                        initial_memory_threshold=initial_memory_threshold,
                        use_ornstein_noise=use_ornstein_noise,
                        replay_memory_size=replay_memory_size,
                        epsilon_final=epsilon_final,
                        inverting_gradients=inverting_gradients,
                        actor_kwargs={'hidden_layers': layers,
                                      'action_input_layer': action_input_layer},
                        actor_param_kwargs={'hidden_layers': layers,
                                            'squashing_function': False,
                                            'output_layer_init_std': 0.0001},
                        zero_index_gradients=zero_index_gradients,
                        seed=seed)

    if initialise_params:
        initial_weights = np.zeros((env.action_space.spaces[0].n, env.observation_space.spaces[0].shape[0]))
        initial_bias = np.zeros(env.action_space.spaces[0].n)
        for a in range(env.action_space.spaces[0].n):
            initial_bias[a] = initial_params_[a]
        agent.set_action_parameter_passthrough_weights(initial_weights, initial_bias)
    print(agent)

    max_steps = 250
    total_reward = 0.
    returns = []
    start_time = time.time()
    video_index = 0
    # agent.epsilon_final = 0.
    # agent.epsilon = 0.
    # agent.noise = None
    for i in range(episodes):
        if save_freq > 0 and save_dir and i % save_freq == 0:
            agent.save_models(os.path.join(save_dir, str(i)))
        state, _ = env.reset()
        state = np.array(state, dtype=np.float32, copy=False)
        if visualise and i % render_freq == 0:
            env.render()

        act, act_param, all_action_parameters = agent.act(state)
        action = pad_action(act, act_param)

        episode_reward = 0.
        agent.start_episode()
        for j in range(max_steps):
            ret = env.step(action)
            (next_state, steps), reward, terminal, _ = ret
            next_state = np.array(next_state, dtype=np.float32, copy=False)

            next_act, next_act_param, next_all_action_parameters = agent.act(next_state)
            next_action = pad_action(next_act, next_act_param)
            agent.step(state, (act, all_action_parameters), reward, next_state,
                       (next_act, next_all_action_parameters), terminal, steps)
            act, act_param, all_action_parameters = next_act, next_act_param, next_all_action_parameters
            action = next_action
            state = next_state

            episode_reward += reward
            if visualise and i % render_freq == 0:
                env.render()

            if terminal:
                break
        agent.end_episode()

        if save_frames and i % render_freq == 0:
            video_index = env.unwrapped.save_render_states(vidir, title, video_index)

        returns.append(episode_reward)
        total_reward += episode_reward
        if i % 100 == 0:
            print('{0:5s} R:{1:.4f} r100:{2:.4f}'.format(str(i), total_reward / (i + 1),
                                                         np.array(returns[-100:]).mean()))
    end_time = time.time()
    print("Took %.2f seconds" % (end_time - start_time))
    env.close()
    if save_freq > 0 and save_dir:
        agent.save_models(os.path.join(save_dir, str(i)))

    returns = env.get_episode_rewards()
    print("Ave. return =", sum(returns) / len(returns))
    print("Ave. last 100 episode return =", sum(returns[-100:]) / 100.)

    np.save(os.path.join(dir, title + "{}".format(str(seed))), returns)

    if evaluation_episodes > 0:
        print("Evaluating agent over {} episodes".format(evaluation_episodes))
        agent.epsilon_final = 0.
        agent.epsilon = 0.
        agent.noise = None
        evaluation_returns = evaluate(env, agent, evaluation_episodes)
        print("Ave. evaluation return =", sum(evaluation_returns) / len(evaluation_returns))
        np.save(os.path.join(dir, title + "{}e".format(str(seed))), evaluation_returns)
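
# The training loop above relies on a `pad_action` helper that is not defined in this file.
# Below is a minimal sketch, assuming the action layout produced by
# PlatformFlattenedActionWrapper: three discrete actions, each with a single continuous
# parameter, packaged as (discrete_index, [param_0, param_1, param_2]). This is an
# illustrative assumption, not necessarily the exact helper used by the repository.
def pad_action(act, act_param):
    # Zero-fill the parameter slots of the unselected actions and copy the chosen
    # parameter into its slot, matching the tuple layout expected by env.step().
    params = [np.zeros((1,), dtype=np.float32) for _ in range(3)]
    params[act][:] = act_param
    return (act, params)
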
def run(seed, episodes, evaluation_episodes, batch_size, gamma, inverting_gradients, initial_memory_threshold,
        replay_memory_size, epsilon_steps, epsilon_final, tau_actor, tau_actor_param, tau_actor_param_critic,
        use_ornstein_noise, learning_rate_actor, learning_rate_actor_param, learning_rate_actor_param_critic,
        reward_scale, clip_grad, title, scale_actions, zero_index_gradients, split, layers, multipass, indexed,
        weighted, average, random_weighted, render_freq, action_input_layer, initialise_params, save_freq,
        save_dir, save_frames, visualise):
    env = gym.make('Goal-v0')
    env = GoalObservationWrapper(env)

    if save_freq > 0 and save_dir:
        save_dir = os.path.join(save_dir, title + "{}".format(str(seed)))
        os.makedirs(save_dir, exist_ok=True)
    assert not (save_frames and visualise)
    if visualise:
        assert render_freq > 0
    if save_frames:
        assert render_freq > 0
        vidir = os.path.join(save_dir, "frames")
        os.makedirs(vidir, exist_ok=True)

    if scale_actions:
        kickto_weights = np.array([[-0.375, 0.5, 0, 0.0625, 0],
                                   [0, 0, 0.8333333333333333333, 0, 0.111111111111111111111111]])
        shoot_goal_left_weights = np.array([0.857346647646219686, 0])
        shoot_goal_right_weights = np.array([-0.857346647646219686, 0])
    else:
        xfear = 50.0 / PITCH_LENGTH
        yfear = 50.0 / PITCH_WIDTH
        caution = 5.0 / PITCH_WIDTH
        kickto_weights = np.array([[2.5, 1, 0, xfear, 0],
                                   [0, 0, 1 - caution, 0, yfear]])
        shoot_goal_left_weights = np.array([GOAL_WIDTH / 2 - 1, 0])
        shoot_goal_right_weights = np.array([-GOAL_WIDTH / 2 + 1, 0])

    initial_weights = np.zeros((4, 20))  # np.zeros((4, 17))
    initial_weights[0, [10, 11, 14, 15]] = kickto_weights[0, 1:]
    initial_weights[1, [10, 11, 14, 15]] = kickto_weights[1, 1:]
    initial_weights[2, 16] = shoot_goal_left_weights[1]
    initial_weights[3, 16] = shoot_goal_right_weights[1]

    initial_bias = np.zeros((4,))
    initial_bias[0] = kickto_weights[0, 0]
    initial_bias[1] = kickto_weights[1, 0]
    initial_bias[2] = shoot_goal_left_weights[0]
    initial_bias[3] = shoot_goal_right_weights[0]

    if not scale_actions:
        # rescale initial action-parameters for a scaled state space
        for a in range(env.action_space.spaces[0].n):
            mid = (env.observation_space.spaces[0].high + env.observation_space.spaces[0].low) / 2.
            initial_bias[a] += np.sum(initial_weights[a] * mid)
            initial_weights[a] = initial_weights[a] * env.observation_space.spaces[0].high - initial_weights[a] * mid

    env = GoalFlattenedActionWrapper(env)
    if scale_actions:
        env = ScaledParameterisedActionWrapper(env)
    env = ScaledStateWrapper(env)

    dir = os.path.join(save_dir, title)
    env = Monitor(env, directory=os.path.join(dir, str(seed)), video_callable=False, write_upon_reset=False,
                  force=True)
    env.seed(seed)
    np.random.seed(seed)

    assert not (split and multipass)
    agent_class = HHQNAgent
    agent = agent_class(
        observation_space=env.observation_space.spaces[0],
        action_space=env.action_space,
        batch_size=batch_size,
        learning_rate_actor=learning_rate_actor,  # 0.0001
        learning_rate_actor_param=learning_rate_actor_param,  # 0.001
        learning_rate_actor_param_critic=learning_rate_actor_param_critic,
        epsilon_steps=epsilon_steps,
        epsilon_final=epsilon_final,
        gamma=gamma,
        clip_grad=clip_grad,
        indexed=indexed,
        average=average,
        random_weighted=random_weighted,
        tau_actor=tau_actor,
        weighted=weighted,
        tau_actor_param=tau_actor_param,
        tau_actor_param_critic=tau_actor_param_critic,
        initial_memory_threshold=initial_memory_threshold,
        use_ornstein_noise=use_ornstein_noise,
        replay_memory_size=replay_memory_size,
        inverting_gradients=inverting_gradients,
        actor_kwargs={'hidden_layers': layers,
                      'output_layer_init_std': 1e-5,
                      'action_input_layer': action_input_layer},
        actor_param_kwargs={'hidden_layers': layers,
                            'output_layer_init_std': 1e-5,
                            'squashing_function': False},
        zero_index_gradients=zero_index_gradients,
        seed=seed)

    if initialise_params:
        agent.set_action_parameter_passthrough_weights(initial_weights, initial_bias)
    print(agent)

    max_steps = 150
    total_reward = 0.
    returns = []
    start_time = time.time()
    video_index = 0
    Reward = []
    possibility = []
    for i in range(episodes):
        if save_freq > 0 and save_dir and i % save_freq == 0:
            agent.save_models(os.path.join(save_dir, str(i)))
        state, _ = env.reset()
        state = np.array(state, dtype=np.float32, copy=False)

        act, act_param, all_action_parameters = agent.act(state)
        action = pad_action(act, act_param)

        if visualise and i % render_freq == 0:
            env.render()

        episode_reward = 0.
        agent.start_episode()
        for j in range(max_steps):
            ret = env.step(action)
            (next_state, steps), reward, terminal, _ = ret
            next_state = np.array(next_state, dtype=np.float32, copy=False)

            next_act, next_act_param, next_all_action_parameters = agent.act(next_state)
            next_action = pad_action(next_act, next_act_param)
            r = reward * reward_scale
            agent.step(state, (act, all_action_parameters), r, next_state,
                       (next_act, next_all_action_parameters), terminal, steps)
            act, act_param, all_action_parameters = next_act, next_act_param, next_all_action_parameters
            action = next_action
            state = next_state

            episode_reward += reward
            if visualise and i % render_freq == 0:
                env.render()

            if terminal:
                break
        agent.end_episode()

        if save_frames:
            video_index = env.unwrapped.save_render_states(vidir, title, video_index)

        returns.append(episode_reward)
        total_reward += episode_reward
        if (i + 1) % 100 == 0:
            print('{0:5s} R:{1:.5f} P(S):{2:.4f}'.format(str(i + 1), total_reward / (i + 1),
                                                         (np.array(returns) == 50.).sum() / len(returns)))
            Reward.append(total_reward / (i + 1))
            possibility.append((np.array(returns) == 50.).sum() / len(returns))
    plot_reward(Reward)
    plot_p(possibility)
    end_time = time.time()
    print("Training took %.2f seconds" % (end_time - start_time))
    env.close()
    if save_freq > 0 and save_dir:
        agent.save_models(os.path.join(save_dir, str(i)))

    returns = env.get_episode_rewards()
    np.save(os.path.join(dir, title + "{}".format(str(seed))), returns)

    if evaluation_episodes > 0:
        print("Evaluating agent over {} episodes".format(evaluation_episodes))
        agent.epsilon_final = 0.
        agent.epsilon = 0.
        agent.noise = None
        evaluation_returns = evaluate(env, agent, evaluation_episodes)
        print("Ave. evaluation return =", sum(evaluation_returns) / len(evaluation_returns))
        print("Ave. evaluation prob. =", sum(evaluation_returns == 50.) / len(evaluation_returns))
        np.save(os.path.join(dir, title + "{}e".format(str(seed))), evaluation_returns)
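
# `evaluate` is called after training but is not defined in this file. The sketch below
# assumes the (state, steps) observation tuple returned by the wrapped Goal/Platform
# environments and a greedy agent (epsilon and noise are already zeroed by the caller).
# Returning a NumPy array keeps comparisons such as `evaluation_returns == 50.` valid.
# Treat this as an illustrative reconstruction rather than the repository's exact code.
def evaluate(env, agent, episodes=1000):
    returns = []
    for _ in range(episodes):
        state, _ = env.reset()
        terminal = False
        total_reward = 0.
        while not terminal:
            state = np.array(state, dtype=np.float32, copy=False)
            act, act_param, all_action_parameters = agent.act(state)
            action = pad_action(act, act_param)
            (state, _), reward, terminal, _ = env.step(action)
            total_reward += reward
        returns.append(total_reward)
    return np.array(returns)
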
def run(seed, episodes, evaluation_episodes, batch_size, gamma, inverting_gradients, initial_memory_threshold,
        replay_memory_size, epsilon_steps, tau_actor, tau_actor_param, use_ornstein_noise, learning_rate_actor,
        learning_rate_actor_param, epsilon_final, zero_index_gradients, initialise_params, scale_actions,
        clip_grad, split, indexed, layers, multipass, weighted, average, random_weighted, render_freq,
        save_freq, save_dir, save_frames, visualise, action_input_layer, title, window):
    pic_name = filename_generator("./results/imgs/", "capacity60-5-10", seed, title)
    print(pic_name)

    if save_freq > 0 and save_dir:
        save_dir = os.path.join(save_dir, title + "{}".format(str(seed)))
        os.makedirs(save_dir, exist_ok=True)
    assert not (save_frames and visualise)
    if visualise:
        assert render_freq > 0
    if save_frames:
        assert render_freq > 0
        vidir = os.path.join(save_dir, "frames")
        os.makedirs(vidir, exist_ok=True)

    env = gym.make('Cloud-v0')
    # initial_params_ = [0.0, 0.0, 0.0]
    initial_params_ = [0.5, 0.5, 0.5]
    # initial_params_ = [1.0, 1.0, 1.0]
    if scale_actions:
        for a in range(env.action_space.spaces[0].n):
            initial_params_[a] = 2. * (initial_params_[a] - env.action_space.spaces[1].spaces[a].low) / (
                    env.action_space.spaces[1].spaces[a].high - env.action_space.spaces[1].spaces[a].low) - 1.

    env = ScaledStateWrapper(env)  # scale the state space to [-1, 1]
    env = PlatformFlattenedActionWrapper(env)  # flatten the action space
    if scale_actions:  # scale the action parameters to [-1, 1]
        env = ScaledParameterisedActionWrapper(env)

    # dir = os.path.join(save_dir, title)
    # env = Monitor(env, directory=os.path.join(dir, str(seed)), video_callable=False, write_upon_reset=False, force=True)
    # env.seed(seed)
    np.random.seed(seed)
    print(env.observation_space)

    from agents.pdqn import PDQNAgent
    from agents.pdqn_split import SplitPDQNAgent
    from agents.pdqn_multipass import MultiPassPDQNAgent
    assert not (split and multipass)
    agent_class = PDQNAgent
    if split:
        agent_class = SplitPDQNAgent
    elif multipass:
        agent_class = MultiPassPDQNAgent
    agent = agent_class(env.observation_space.spaces[0], env.action_space,
                        batch_size=batch_size,
                        learning_rate_actor=learning_rate_actor,
                        learning_rate_actor_param=learning_rate_actor_param,
                        epsilon_steps=epsilon_steps,
                        gamma=gamma,
                        tau_actor=tau_actor,
                        tau_actor_param=tau_actor_param,
                        clip_grad=clip_grad,
                        indexed=indexed,
                        weighted=weighted,
                        average=average,
                        random_weighted=random_weighted,
                        initial_memory_threshold=initial_memory_threshold,
                        use_ornstein_noise=use_ornstein_noise,
                        replay_memory_size=replay_memory_size,
                        epsilon_final=epsilon_final,
                        inverting_gradients=inverting_gradients,
                        actor_kwargs={'hidden_layers': layers,
                                      'action_input_layer': action_input_layer},
                        actor_param_kwargs={'hidden_layers': layers,
                                            'squashing_function': False,
                                            'output_layer_init_std': 0.0001},
                        zero_index_gradients=zero_index_gradients,
                        seed=seed,
                        spot_bound=-0.4167)  # <=8

    if initialise_params:
        initial_weights = np.zeros((env.action_space.spaces[0].n, env.observation_space.spaces[0].shape[0]))
        initial_bias = np.zeros(env.action_space.spaces[0].n)
        for a in range(env.action_space.spaces[0].n):
            initial_bias[a] = initial_params_[a]
        agent.set_action_parameter_passthrough_weights(initial_weights, initial_bias)
    print(agent)

    max_steps = 5000
    total_reward = 0.
    returns = []
    start_time = time.time()
    video_index = 0
    # agent.epsilon_final = 0.
    # agent.epsilon = 0.
    # agent.noise = None
    best = -float("inf")
    for i in range(episodes):
        if save_freq > 0 and save_dir and i % save_freq == 0:
            agent.save_models(os.path.join(save_dir, str(i)))
        state = env.reset()
        state = np.array(state, dtype=np.float32, copy=False)

        act, act_param, all_action_parameters = agent.act(state)
        action = pad_action(act, act_param)

        episode_reward = 0.
        agent.start_episode()
        for j in range(max_steps):
            # execute the action in the environment and observe the result
            ret = env.step(action)
            next_state, reward, terminal, _ = ret
            next_state = np.array(next_state, dtype=np.float32, copy=False)

            # choose the next action according to the next state
            next_act, next_act_param, next_all_action_parameters = agent.act(next_state)
            next_action = pad_action(next_act, next_act_param)  # package the action and its parameters

            # add the sample to the replay memory and learn
            agent.step(state, (act, all_action_parameters), reward, next_state,
                       (next_act, next_all_action_parameters), terminal, time_steps=1)

            # carry the state and action over to the next step
            act, act_param, all_action_parameters = next_act, next_act_param, next_all_action_parameters
            action = next_action
            state = next_state

            episode_reward += reward  # accumulate the episode reward
            if terminal:
                break
        agent.end_episode()

        returns.append(episode_reward)
        total_reward += episode_reward
        if episode_reward > best:
            best = episode_reward
            with open('results/res.txt', "w") as f:
                f.write(str(best * 500.0))
        print('Episode{0:5s} R:{1:.4f} Avg:{2:.4f} r10:{3:.4f}'.format(
            str(i), episode_reward, total_reward / (i + 1), np.array(returns[-window:]).mean()))
        if visualise and i % window == 0 and i != 0:
            plot_window_reward(returns, filename=pic_name, window=window)
            # plot_reward(returns, filename=pic_name)
    end_time = time.time()
    print("Took %.2f seconds" % (end_time - start_time))
    print(best * 500.0)
    # env.close()
    if save_freq > 0 and save_dir:
        agent.save_models(os.path.join(save_dir, str(i)))

    print("Ave. return =", sum(returns) / len(returns))
    print("Ave. last 100 episode return =", sum(returns[-100:]) / 100.)
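
# `plot_window_reward` and `filename_generator` come from a plotting module that is not
# shown here. The sketch below illustrates one plausible implementation of the windowed
# reward curve using matplotlib: returns are averaged over non-overlapping windows of
# `window` episodes and written to `filename`. The exact signature and styling are
# assumptions for illustration only.
import matplotlib
matplotlib.use("Agg")  # render to a file without requiring a display
import matplotlib.pyplot as plt


def plot_window_reward(returns, filename, window=100):
    returns = np.asarray(returns, dtype=np.float64)
    n = len(returns) // window
    if n == 0:
        return
    means = returns[:n * window].reshape(n, window).mean(axis=1)
    plt.figure()
    plt.plot(np.arange(1, n + 1) * window, means)
    plt.xlabel("episode")
    plt.ylabel("mean return (window={})".format(window))
    plt.savefig(filename)
    plt.close()
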
def run(seed, episodes, evaluation_episodes, batch_size, gamma, inverting_gradients, initial_memory_threshold,
        replay_memory_size, scale_actions, epsilon_steps, epsilon_final, tau_actor, tau_critic,
        use_ornstein_noise, learning_rate_actor, learning_rate_critic, reward_scale, clip_grad,
        initialise_params, layers, save_dir, title):
    env = gym.make('Goal-v0')
    env = GoalObservationWrapper(env)

    if scale_actions:
        kickto_weights = np.array([[-0.375, 0.5, 0, 0.0625, 0],
                                   [0, 0, 0.8333333333333333333, 0, 0.111111111111111111111111]])
        shoot_goal_left_weights = np.array([0.857346647646219686, 0])
        shoot_goal_right_weights = np.array([-0.857346647646219686, 0])
    else:
        xfear = 50.0 / PITCH_LENGTH
        yfear = 50.0 / PITCH_WIDTH
        caution = 5.0 / PITCH_WIDTH
        kickto_weights = np.array([[2.5, 1, 0, xfear, 0],
                                   [0, 0, 1 - caution, 0, yfear]])
        shoot_goal_left_weights = np.array([GOAL_WIDTH / 2 - 1, 0])
        shoot_goal_right_weights = np.array([-GOAL_WIDTH / 2 + 1, 0])

    initial_weights = np.zeros((4, 17))
    initial_weights[0, [10, 11, 14, 15]] = kickto_weights[0, 1:]
    initial_weights[1, [10, 11, 14, 15]] = kickto_weights[1, 1:]
    initial_weights[2, 16] = shoot_goal_left_weights[1]
    initial_weights[3, 16] = shoot_goal_right_weights[1]

    initial_bias = np.zeros((4,))
    initial_bias[0] = kickto_weights[0, 0]
    initial_bias[1] = kickto_weights[1, 0]
    initial_bias[2] = shoot_goal_left_weights[0]
    initial_bias[3] = shoot_goal_right_weights[0]

    env = GoalFlattenedActionWrapper(env)
    if scale_actions:
        env = ScaledParameterisedActionWrapper(env)
    env = ScaledStateWrapper(env)

    dir = os.path.join(save_dir, title)
    env = Monitor(env, directory=os.path.join(dir, str(seed)), video_callable=False, write_upon_reset=False,
                  force=True)
    print(env.action_space)
    print(env.observation_space)
    env.seed(seed)
    np.random.seed(seed)

    agent = PADDPGAgent(
        observation_space=env.observation_space.spaces[0],
        action_space=env.action_space,
        batch_size=batch_size,
        learning_rate_actor=learning_rate_actor,
        learning_rate_critic=learning_rate_critic,
        epsilon_steps=epsilon_steps,
        epsilon_final=epsilon_final,
        gamma=gamma,
        clip_grad=clip_grad,
        tau_actor=tau_actor,
        tau_critic=tau_critic,
        initial_memory_threshold=initial_memory_threshold,
        use_ornstein_noise=use_ornstein_noise,
        replay_memory_size=replay_memory_size,
        inverting_gradients=inverting_gradients,
        n_step_returns=False,
        adam_betas=(0.9, 0.999),
        critic_kwargs={'hidden_layers': layers,
                       'init_type': "kaiming"},
        actor_kwargs={'hidden_layers': layers,
                      'init_type': "kaiming",
                      # 'init_std': 1e-5,  # 0.0001,
                      'squashing_function': False},
        seed=seed)

    if initialise_params:
        agent.set_action_parameter_passthrough_weights(initial_weights, initial_bias)
    print(agent)

    max_steps = 150
    total_reward = 0.
    returns = []
    start_time = time.time()
    log_f = open("log_paddpg_GoalEnv.txt", "w+")
    for i in range(episodes):
        state, _ = env.reset()
        state = np.array(state, dtype=np.float32, copy=False)

        act, act_param, all_actions, all_action_parameters = agent.act(state)
        action = pad_action(act, act_param)

        episode_reward = 0.
        agent.start_episode()
        for j in range(max_steps):
            ret = env.step(action)
            (next_state, steps), reward, terminal, _ = ret
            next_state = np.array(next_state, dtype=np.float32, copy=False)

            next_act, next_act_param, next_all_actions, next_all_action_parameters = agent.act(next_state)
            next_action = pad_action(next_act, next_act_param)
            r = reward * reward_scale
            agent.step(state, (act, act_param, all_actions, all_action_parameters), r, next_state,
                       (next_act, next_act_param, next_all_actions, next_all_action_parameters), terminal,
                       optimise=True)
            act, act_param, all_actions, all_action_parameters = next_act, next_act_param, next_all_actions, next_all_action_parameters
            action = next_action
            state = next_state

            episode_reward += reward
            if terminal:
                break
        agent.end_episode()

        returns.append(episode_reward)
        total_reward += episode_reward
        if (i + 1) % 100 == 0:
            print('{0:5s} R:{1:.5f} P(S):{2:.4f}'.format(str(i + 1), total_reward / (i + 1),
                                                         (np.array(returns) == 50.).sum() / len(returns)))
        # log columns, left to right: episode number, episode reward, average reward over all
        # episodes so far, mean return over the last 100 episodes, and success rate
        log_f.write('{},{},{},{},{}\n'.format(i, episode_reward, total_reward / (i + 1),
                                              np.array(returns[-100:]).mean(),
                                              (np.array(returns) == 50.).sum() / len(returns)))
        log_f.flush()
    end_time = time.time()
    print("Took %.2f seconds" % (end_time - start_time))
    env.close()
    print(agent)

    returns = env.get_episode_rewards()
    np.save(os.path.join(dir, title + "{}".format(str(seed))), returns)

    if evaluation_episodes > 0:
        print("Evaluating agent over {} episodes".format(evaluation_episodes))
        agent.epsilon_final = 0.
        agent.epsilon = 0.
        agent.noise = None
        evaluation_returns = evaluate(env, agent, evaluation_episodes)
        print("Ave. evaluation return =", sum(evaluation_returns) / len(evaluation_returns))
        print("Ave. evaluation prob. =", sum(evaluation_returns == 50.) / len(evaluation_returns))
        np.save(os.path.join(dir, title + "{}e".format(str(seed))), evaluation_returns)
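
# The comma-separated log written above can be inspected without any of the plotting
# helpers in this repository. The snippet below is an illustrative post-processing
# example using only NumPy; `summarise_paddpg_log` is a hypothetical helper, and the
# column layout simply mirrors the log_f.write call: episode index, episode reward,
# running average reward, mean return over the last 100 episodes, success rate.
def summarise_paddpg_log(path="log_paddpg_GoalEnv.txt"):
    data = np.loadtxt(path, delimiter=",", ndmin=2)
    print("final success rate:", data[-1, 4])
    print("best mean return over 100 episodes:", data[:, 3].max())
    return data
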
def run(seed, episodes, evaluation_episodes, batch_size, gamma, inverting_gradients, initial_memory_threshold,
        replay_memory_size, save_dir, epsilon_steps, epsilon_final, tau_actor, tau_critic, use_ornstein_noise,
        learning_rate_actor, learning_rate_critic, clip_grad, layers, initialise_params, title):
    env = gym.make('Platform-v0')
    env = ScaledStateWrapper(env)

    initial_params_ = [3., 10., 400.]
    for a in range(env.action_space.spaces[0].n):
        initial_params_[a] = 2. * (initial_params_[a] - env.action_space.spaces[1].spaces[a].low) / (
                env.action_space.spaces[1].spaces[a].high - env.action_space.spaces[1].spaces[a].low) - 1.

    env = PlatformFlattenedActionWrapper(env)
    env = ScaledParameterisedActionWrapper(env)

    dir = os.path.join(save_dir, title)
    env = Monitor(env, directory=os.path.join(dir, str(seed)), video_callable=False, write_upon_reset=False,
                  force=True)
    env.seed(seed)
    np.random.seed(seed)

    agent = PADDPGAgent(observation_space=env.observation_space.spaces[0],
                        action_space=env.action_space,
                        batch_size=batch_size,
                        learning_rate_actor=learning_rate_actor,
                        learning_rate_critic=learning_rate_critic,
                        epsilon_steps=epsilon_steps,
                        epsilon_final=epsilon_final,
                        gamma=gamma,
                        clip_grad=clip_grad,
                        tau_actor=tau_actor,
                        tau_critic=tau_critic,
                        initial_memory_threshold=initial_memory_threshold,
                        use_ornstein_noise=use_ornstein_noise,
                        replay_memory_size=replay_memory_size,
                        inverting_gradients=inverting_gradients,
                        adam_betas=(0.9, 0.999),
                        critic_kwargs={'hidden_layers': layers,
                                       'init_type': "kaiming"},
                        actor_kwargs={'hidden_layers': layers,
                                      'init_type': "kaiming",
                                      'init_std': 0.0001,
                                      'squashing_function': False},
                        seed=seed)
    print(agent)

    if initialise_params:
        initial_weights = np.zeros((env.action_space.spaces[0].n, env.observation_space.spaces[0].shape[0]))
        initial_bias = np.zeros(env.action_space.spaces[0].n)
        for a in range(env.action_space.spaces[0].n):
            initial_bias[a] = initial_params_[a]
        agent.set_action_parameter_passthrough_weights(initial_weights, initial_bias)

    max_steps = 250
    total_reward = 0.
    returns = []
    start_time = time.time()
    for i in range(episodes):
        state, _ = env.reset()
        state = np.array(state, dtype=np.float32, copy=False)

        act, act_param, all_actions, all_action_parameters = agent.act(state)
        action = pad_action(act, act_param)

        episode_reward = 0.
        agent.start_episode()
        for j in range(max_steps):
            ret = env.step(action)
            (next_state, steps), reward, terminal, _ = ret
            next_state = np.array(next_state, dtype=np.float32, copy=False)

            next_act, next_act_param, next_all_actions, next_all_action_parameters = agent.act(next_state)
            next_action = pad_action(next_act, next_act_param)
            agent.step(state, (act, act_param, all_actions, all_action_parameters), reward, next_state,
                       (next_act, next_act_param, next_all_actions, next_all_action_parameters), terminal, steps)
            act, act_param, all_actions, all_action_parameters = next_act, next_act_param, next_all_actions, next_all_action_parameters
            action = next_action
            state = next_state  # .copy()

            episode_reward += reward
            if terminal:
                break
        agent.end_episode()

        returns.append(episode_reward)
        total_reward += episode_reward
        if (i + 1) % 100 == 0:
            print('{0:5s} R:{1:.5f}'.format(str(i + 1), total_reward / (i + 1)))
    end_time = time.time()
    print("Took %.2f seconds" % (end_time - start_time))
    env.close()

    returns = env.get_episode_rewards()
    print("Ave. return =", sum(returns) / len(returns))
    print("Ave. last 100 episode return =", sum(returns[-100:]) / 100.)
    np.save(os.path.join(dir, title + "{}".format(str(seed))), returns)

    if evaluation_episodes > 0:
        print("Evaluating agent over {} episodes".format(evaluation_episodes))
        agent.epsilon_final = 0.
        agent.epsilon = 0.
        agent.noise = None
        evaluation_returns = evaluate(env, agent, evaluation_episodes)
        print("Ave. evaluation return =", sum(evaluation_returns) / len(evaluation_returns))
        np.save(os.path.join(dir, title + "{}e".format(str(seed))), evaluation_returns)
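
# Example invocation of the Platform PADDPG experiment above. The hyperparameter values
# are illustrative placeholders rather than the tuned settings used for any reported
# results; adjust them to match your own configuration before running.
if __name__ == '__main__':
    run(seed=0,
        episodes=20000,
        evaluation_episodes=1000,
        batch_size=32,
        gamma=0.9,
        inverting_gradients=True,
        initial_memory_threshold=128,
        replay_memory_size=10000,
        save_dir="results/platform",
        epsilon_steps=1000,
        epsilon_final=0.01,
        tau_actor=0.01,
        tau_critic=0.01,
        use_ornstein_noise=True,
        learning_rate_actor=0.0001,
        learning_rate_critic=0.001,
        clip_grad=10.,
        layers=(128,),
        initialise_params=True,
        title="PADDPG")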