def main(env_id, dim_latent, render, num_process, lr_p, lr_v, gamma, polyak,
         target_action_noise_std, target_action_noise_clip, explore_size,
         memory_size, step_per_iter, batch_size, min_update_step, update_step,
         max_iter, eval_iter, save_iter, action_noise, policy_update_delay,
         model_path, log_path, seed):
    base_dir = log_path + env_id + "/TD3_encoder_exp{}".format(seed)
    writer = SummaryWriter(base_dir)

    td3 = TD3(env_id,
              dim_latent=dim_latent,
              render=render,
              num_process=num_process,
              memory_size=memory_size,
              lr_p=lr_p,
              lr_v=lr_v,
              gamma=gamma,
              polyak=polyak,
              target_action_noise_std=target_action_noise_std,
              target_action_noise_clip=target_action_noise_clip,
              explore_size=explore_size,
              step_per_iter=step_per_iter,
              batch_size=batch_size,
              min_update_step=min_update_step,
              update_step=update_step,
              action_noise=action_noise,
              policy_update_delay=policy_update_delay,
              seed=seed,
              model_path='trained_models')  # note: hard-coded; the model_path argument is not used here

    for i_iter in range(1, 6):
        td3.eval(i_iter, render=True)
        torch.cuda.empty_cache()
def main(env_id, render, num_process, lr_p, lr_v, gamma, polyak,
         target_action_noise_std, target_action_noise_clip, explore_size,
         memory_size, step_per_iter, batch_size, min_update_step, update_step,
         test_epochs, action_noise, policy_update_delay, model_path, seed):
    td3 = TD3(env_id,
              render=render,
              num_process=num_process,
              memory_size=memory_size,
              lr_p=lr_p,
              lr_v=lr_v,
              gamma=gamma,
              polyak=polyak,
              target_action_noise_std=target_action_noise_std,
              target_action_noise_clip=target_action_noise_clip,
              explore_size=explore_size,
              step_per_iter=step_per_iter,
              batch_size=batch_size,
              min_update_step=min_update_step,
              update_step=update_step,
              action_noise=action_noise,
              policy_update_delay=policy_update_delay,
              seed=seed,
              model_path=model_path)

    for i_iter in range(1, test_epochs + 1):
        td3.eval(i_iter)
def get_td3_agent(*, d_state, d_action, discount, device, value_tau, value_loss,
                  policy_lr, value_lr, policy_n_units, value_n_units,
                  policy_n_layers, value_n_layers, policy_activation,
                  value_activation, agent_grad_clip, td3_policy_delay,
                  tdg_error_weight, td_error_weight, td3_expl_noise):
    return TD3(d_state=d_state,
               d_action=d_action,
               device=device,
               gamma=discount,
               tau=value_tau,
               value_loss=value_loss,
               policy_lr=policy_lr,
               value_lr=value_lr,
               policy_n_layers=policy_n_layers,
               value_n_layers=value_n_layers,
               value_n_units=value_n_units,
               policy_n_units=policy_n_units,
               policy_activation=policy_activation,
               value_activation=value_activation,
               grad_clip=agent_grad_clip,
               policy_delay=td3_policy_delay,
               tdg_error_weight=tdg_error_weight,
               td_error_weight=td_error_weight,
               expl_noise=td3_expl_noise)
def main():
    device = torch.device("cuda:0")

    # Twin Q-networks: state-action input (3 + 1 = 4) -> scalar Q-value
    value_function_1 = Sequential(Linear(in_features=4, out_features=128), ReLU(),
                                  Linear(in_features=128, out_features=128), ReLU(),
                                  Linear(in_features=128, out_features=128), ReLU(),
                                  Linear(in_features=128, out_features=1)).to(device)
    value_function_2 = Sequential(Linear(in_features=4, out_features=128), ReLU(),
                                  Linear(in_features=128, out_features=128), ReLU(),
                                  Linear(in_features=128, out_features=128), ReLU(),
                                  Linear(in_features=128, out_features=1)).to(device)

    # Deterministic policy: state input (3) -> single action
    policy_function = Sequential(Linear(in_features=3, out_features=128), ReLU(),
                                 Linear(in_features=128, out_features=128), ReLU(),
                                 Linear(in_features=128, out_features=128), ReLU(),
                                 Linear(in_features=128, out_features=1)).to(device)

    optimizer_value_1 = Adam(params=value_function_1.parameters(), lr=0.0003)
    optimizer_value_2 = Adam(params=value_function_2.parameters(), lr=0.0003)
    optimizer_policy = Adam(params=policy_function.parameters(), lr=0.0003)

    agent = TD3(value_net_1=value_function_1,
                value_net_2=value_function_2,
                policy_net=policy_function,
                optimizer_value_net_1=optimizer_value_1,
                optimizer_value_net_2=optimizer_value_2,
                optimizer_policy_net=optimizer_policy,
                lr_scheduler_value_net_1=None,
                lr_scheduler_value_net_2=None,
                lr_scheduler_policy_net=None,
                gamma=0.99,
                noise_std_f=lambda x: 0.1,
                target_policy_smoothing_std=0.2,
                target_policy_smoothing_bound=0.5,
                policy_update_frequency=2,
                tau=0.005,
                min_action=-2,
                max_action=2,
                replay_buffer_size=10000,
                replay_batch_size=64,
                start_training_at=1000,
                device=device,
                verbose=True)

    run_td3(agent, render=True)
def main(env_name, seed, hyper_params, eval_episodes=10):
    env = gym.make(env_name)
    state_dim = sum(list(env.observation_space.shape))
    action_dim = sum(list(env.action_space.shape))
    action_max = float(env.action_space.high[0])

    torch.manual_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    kwargs = {
        'device': device,
        'state_dim': state_dim,
        'action_dim': action_dim,
        'action_max': action_max,
        'gamma': hyper_params['gamma'],
        'tau': hyper_params['tau'],
        'lr': hyper_params['lr'],
        'policy_noise': hyper_params['policy_noise'] * action_max,
        'noise_clip': hyper_params['noise_clip'] * action_max,
        'policy_freq': hyper_params['policy_freq']
    }
    agent = TD3(**kwargs)

    file_dir = os.path.abspath(os.path.dirname(__file__))
    save_dir = os.path.join(file_dir, 'results', env_name, 'seed' + str(seed),
                            'learned_model')
    agent.load(save_dir)

    env.seed(seed + 100)
    episode_rewards = []
    for _ in range(eval_episodes):
        state = env.reset()
        done = False
        sum_rewards = 0
        while not done:
            env.render()
            action = agent.rollout_actor.deterministic_action(state)
            next_state, reward, done, _ = env.step(action)
            sum_rewards += reward
            state = next_state
        episode_rewards.append(sum_rewards)
        print(f'Episode: {len(episode_rewards)} Sum Rewards: {sum_rewards:.3f}')

    avg_reward = np.mean(episode_rewards)
    print('\n---------------------------------------')
    print(f'Evaluation over {eval_episodes} episodes: {avg_reward:.3f}')
    print('---------------------------------------')
def start_training(args):
    env = build_env(args)
    td3 = TD3(state_dim=env.observation_space.shape[0],
              action_num=env.action_space.shape[0],
              lr=args.learning_rate,
              batch_size=args.batch_size,
              device=args.gpu)
    load_params(td3, args)
    run_training_loop(env, td3, args)
    env.close()
def start_test_run(args):
    env = build_env(args)
    td3 = TD3(state_dim=env.observation_space.shape[0],
              action_num=env.action_space.shape[0],
              lr=args.learning_rate,
              batch_size=args.batch_size,
              device=args.gpu)
    load_params(td3, args)

    rewards = td3.evaluate_policy(env, render=True, save_video=args.save_video)
    print('rewards: ', rewards)
    mean = np.mean(rewards)
    median = np.median(rewards)
    print('mean: {mean}, median: {median}'.format(mean=mean, median=median))
    env.close()
def get_algorithm(*argv, **kwargs):
    if args.algorithm == 'pg':
        return PG(*argv, **kwargs)
    if args.algorithm == 'ddpg':
        return DDPG(*argv, **kwargs)
    if args.algorithm == 'td3':
        return TD3(*argv, **kwargs)
    if args.algorithm == 'rbi':
        return RBI(*argv, **kwargs)
    if args.algorithm == 'drbi':
        return DRBI(*argv, **kwargs)
    if args.algorithm == 'ppo':
        return PPO(*argv, **kwargs)
    if args.algorithm == 'sacq':
        return SACQ(*argv, **kwargs)
    if args.algorithm == 'sspg':
        return SSPG(*argv, **kwargs)
    raise NotImplementedError
def run_td3(agent: TD3, render: bool = True):
    env = gym.make("Pendulum-v0")
    draw = env.render if render else lambda: ...

    # Train forever.
    while True:
        next_state = env.reset()
        reward = 0
        done = False
        ret = 0
        while True:
            action = agent.train_step(state=next_state.flatten(),
                                      reward=reward,
                                      episode_ended=done)
            if done:
                break
            next_state, reward, done, info = env.step(action)
            ret += reward
            draw()
min_epsilon = 0.1
EXPLORE = 200
BUFFER_SIZE = 100000
RANDOM_SEED = 51234
MINIBATCH_SIZE = 64  # 32  # 5

with tf.Session() as sess:
    np.random.seed(RANDOM_SEED)
    tf.set_random_seed(RANDOM_SEED)

    env = gym.make(ENV_NAME)
    state_dim = np.size(env.reset())  # 2  # env.observation_space.shape[0]
    action_dim = 1  # env.action_space.shape[0]

    ddpg = TD3(sess, state_dim, action_dim, max_action, min_action,
               ACTOR_LEARNING_RATE, CRITIC_LEARNING_RATE, TAU, RANDOM_SEED,
               device=DEVICE)
    sess.run(tf.global_variables_initializer())
    ddpg.load()

    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
    ruido = OUNoise(action_dim, mu=0.0)
    llegadas = 0
    init_state = np.zeros(state_dim)

    irradiancias = list([1000.])  # list([1000., 500., 1000., 500., 900., 600., 800., 400., 100.])
    # irradiancias = list([1000., 1000., 800., 700.])  # list([100., 200., 300., 400., 500., 600., 700., 800., 900., 1000])
    temperaturas = list([25.])
    drone.load_level(params["level_name"])
    return drone


################################################################################
# Exec

env = init_drone_env(params)
env.start_race(params["race_tier"])
env.initialize_drone()
env.takeoff_with_moveOnSpline()
env.get_ground_truth_gate_poses()
print([[en.position.x_val, en.position.y_val, en.position.z_val]
       for en in env.gate_poses_ground_truth])
raise  # NOTE: bare raise left in; execution stops here after printing the gate poses

state_dim = env.observation_space[0]
action_dim = env.action_space[0]
max_action = float(env.action_high)

total_timesteps = 0
timesteps_since_eval = 0
episode_num = 0
done = True

writer = SummaryWriter(log_dir="./logs")
policy = TD3(state_dim, action_dim, max_action, env)
eval(policy, env, writer, params)
time.sleep(2.0)
    'rl_method': args.rl_method,
    'delayed_reward_threshold': args.delayed_reward_threshold,
    'net': args.net,
    'num_steps': args.num_steps,
    'lr': args.lr,
    'output_path': output_path,
    'reuse_models': args.reuse_models
}

# Start reinforcement learning
learner = None
common_params.update({
    'stock_code': stock_code,
    'chart_data': chart_data,
    'training_data': training_data,
    'min_trading_unit': min_trading_unit,
    'max_trading_unit': max_trading_unit
})
if args.rl_method == 'td3':
    learner = TD3(**{
        **common_params,
        'value_network_path': value_network_path,
        'policy_network_path': policy_network_path
    })
if learner is not None:
    learner.run(balance=args.balance,
                num_epoches=args.num_epoches,
                discount_factor=args.discount_factor,
                start_epsilon=args.start_epsilon,
                learning=args.learning)
                    help='number of simulation steps per update (default: 1)')
parser.add_argument('--dir', default="runs",
                    help='logging directory to create folder containing tensorboard and logging files')
parser.add_argument('--cuda', action="store_true",
                    help='run on CUDA (default: False)')
args = parser.parse_args()

env = gym.make(args.env)
env.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)
torch.backends.cudnn.deterministic = True

agent = TD3(env.observation_space.shape[0], env.action_space.shape[0],
            env.action_space, args)

LOG_DIR = '{}/{}_TD3_{}'.format(args.dir,
                                datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"),
                                args.env)
writer = SummaryWriter(logdir=LOG_DIR)
LOG = Logger(LOG_DIR)
LOG.create("q_values")
LOG.create("estimated_r")
LOG.create("test_reward")
LOG.create("train_reward")

total_numsteps = 0
for i_episode in itertools.count(1):
    episode_reward = 0
    episode_steps = 0
    done = False
    state = env.reset()
                hidden_dim=crit_hid_dim,
                output_non_linearity=crit_out_non_linear)

# Agent
lr = 3e-5
gamma = 0.99
tau = 0.01
policy_freq = 2
rb_max_size = 1e6
rb_batch_size = 64
agent = TD3(actor, critic, reward_fun,
            gamma=gamma,
            tau=tau,
            policy_freq=policy_freq,
            max_buffer_size=rb_max_size,
            batch_size=rb_batch_size,
            lr=lr)

# Training
show = False
train_agent(agent, desc, file_name, runs, episodes, time_steps, test_episodes,
episode_length = deque(maxlen=10)

kwargs = {
    "state_dim": state_dim,
    "action_dim": action_dim,
    "max_action": max_action,
    "discount": args.discount,
    "tau": args.tau,
    "policy": args.policy
}
# Target policy smoothing is scaled wrt the action scale
kwargs["policy_noise"] = args.policy_noise * max_action
kwargs["noise_clip"] = args.noise_clip * max_action
kwargs["policy_freq"] = args.policy_freq

policy = TD3.TD3(**kwargs)
replay_buffer = ReplayBuffer(state_dim, action_dim, max_size=int(1e5))

# Evaluate untrained policy
evaluations = [eval_policy(policy, args.env_name, args.seed)]

state, done = env.reset(), False
episode_reward = 0
episode_timesteps = 0
episode_num = 0

for t in range(int(args.max_timesteps)):
    episode_timesteps += 1
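# The snippet above (and the one with agent.rollout_actor further below) calls an
# eval_policy() helper that is not shown. The following is a minimal sketch of such
# a helper, assuming the classic gym API used throughout these snippets and that the
# policy exposes a select_action(state) method; the original helper may differ.
import gym
import numpy as np


def eval_policy(policy, env_name, seed, eval_episodes=10):
    eval_env = gym.make(env_name)
    eval_env.seed(seed + 100)  # evaluate on a differently seeded copy of the env
    avg_reward = 0.
    for _ in range(eval_episodes):
        state, done = eval_env.reset(), False
        while not done:
            action = policy.select_action(np.array(state))
            state, reward, done, _ = eval_env.step(action)
            avg_reward += reward
    avg_reward /= eval_episodes
    print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}")
    return avg_reward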
import gym

if __name__ == '__main__':
    env_id = 'Pendulum-v0'  # Pendulum-v0, MountainCarContinuous-v0
    env = gym.make(env_id)

    agent = TD3(env,
                h_layers=[64, 64],
                seed=0,
                steps_per_epoch=4000,
                epochs=10,
                max_ep_len=1000,
                batch_size=100,
                start_steps=10000,
                update_after=1000,
                update_every=50,
                replay_size=int(1e5),
                gamma=0.99,
                polyak=0.995,
                lr_a=1e-3,
                lr_c=1e-3,
                act_noise=0.1,
                target_noise=0.2,
                noise_clip=0.5,
                policy_delay=2,
                save_freq=1,
                save_path='./checkpoints/')

    # training
    agent.train()

    # test
def main(env_name, low_list, high_list):
    sess = tf.Session()
    K.set_session(sess)

    # Define environment
    env = gym.make(env_name)

    td3 = TD3(env, sess,
              low_action_bound_list=low_list,
              high_action_bound_list=high_list)

    # Main loop
    num_episodes = 2000
    max_episode_len = 1000
    scores_deque = deque(maxlen=50)

    for i in range(num_episodes):
        total_reward = 0
        current_state = env.reset()

        for step in range(max_episode_len):
            current_state = current_state.reshape((1, td3.state_dim))
            action = td3.act(i, current_state)
            if td3.action_dim == 1:
                action = action.reshape((1, td3.action_dim))
            elif td3.action_dim > 1:
                action = action.reshape((1, td3.action_dim))[0]

            next_state, reward, done, info = env.step(action)
            next_state = next_state.reshape((1, td3.state_dim))
            total_reward += reward

            td3.replay_buffer.add(current_state, action, reward, next_state, done)
            current_state = next_state

            td3.train_critic()
            # Delayed training for policy
            if (step % 2) == 0:
                td3.train_actor()
                td3.update_target_models()

            if done:
                break

        scores_deque.append(total_reward)
        score_average = np.mean(scores_deque)
        print('Episode {}, Reward {}, Avg reward:{}'.format(i, total_reward, score_average))

        if score_average >= -300:
            td3.actor_model.save_weights('model_{}.h5'.format(env_name))

            # Display when finished
            current_state = env.reset()
            for step in range(1000):
                env.render()
                current_state = current_state.reshape((1, td3.state_dim))
                action = td3.act(i, current_state)
                if td3.action_dim == 1:
                    action = action.reshape((1, td3.action_dim))
                elif td3.action_dim > 1:
                    action = action.reshape((1, td3.action_dim))[0]
                next_state, reward, done, info = env.step(action)
                next_state = next_state.reshape((1, td3.state_dim))
                current_state = next_state
                if done:
                    break
            break
# Adding this line if we don't want the right click to put a red point
Config.set('input', 'mouse', 'mouse,multitouch_on_demand')
Config.set('graphics', 'resizable', False)
Config.set('graphics', 'width', '1429')
Config.set('graphics', 'height', '660')

# Introducing last_x and last_y, used to keep the last point in memory when we draw the sand on the map
last_x = 0
last_y = 0
n_points = 0
length = 0
max_action = 45

# Getting our AI, which we call "brain", and that contains our neural network that represents our Q-function
# brain = Dqn(5, 3, 0.9)
brain = TD3(5, 1, max_action)
# action2rotation = [0, 5, -5]
replay_buffer = ReplayBuffer()
last_reward = 0
scores = []
im = CoreImage("./images/MASK1.png")

# Initializing the map
first_update = True


def init():
    global sand
    global goal_x
    global goal_y
    global first_update
def __init__(self, policy_primitive_learning_rate, policy_movement_learning_rate,
             policy_model_arch, critic_learning_rate, critic_model_arch,
             target_smoothing_stddev, tau, exploration_prob, state_size,
             action_size, goal_size, n_simulations,
             movement_exploration_prob_ratio, policy_bottleneck_size,
             policy_default_layer_size, critic_default_layer_size):
    self.movement_exploration_prob_ratio = movement_exploration_prob_ratio

    full_policy_model = keras.models.model_from_yaml(
        policy_model_arch.pretty(resolve=True),
        custom_objects=custom_objects
    )
    if not isinstance(full_policy_model.layers[-1], NormalNoise):
        raise ValueError("Last layer of the policy must be of type NormalNoise")
    noise_layers_indices = [
        i for i, layer in enumerate(full_policy_model.layers)
        if isinstance(layer, NormalNoise)
    ]
    if len(noise_layers_indices) > 2:
        raise ValueError("More than 2 NormalNoise layers have been found in the policy")
    self.has_movement_primitive = len(noise_layers_indices) == 2

    if self.has_movement_primitive:
        primitive_policy_model = keras.models.Sequential(
            full_policy_model.layers[:noise_layers_indices[0] + 1])
        movement_policy_model = keras.models.Sequential(
            full_policy_model.layers[noise_layers_indices[0] + 1:noise_layers_indices[1] + 1])
        self.primitive_size = primitive_policy_model.layers[-2].units
        self.primitive_td3 = TD3(
            policy_learning_rate=policy_primitive_learning_rate,
            policy_model=primitive_policy_model,
            critic_learning_rate=critic_learning_rate,
            critic_model=keras.models.model_from_yaml(
                critic_model_arch.pretty(resolve=True),
                custom_objects=custom_objects
            ),
            target_smoothing_stddev=target_smoothing_stddev,
            tau=tau,
            policy_state_size=state_size + goal_size,
            critic_state_size=state_size + goal_size,
            action_size=self.primitive_size,
            n_simulations=n_simulations,
        )
        self.movement_td3 = TD3(
            policy_learning_rate=policy_movement_learning_rate,
            policy_model=movement_policy_model,
            critic_learning_rate=critic_learning_rate,
            critic_model=keras.models.model_from_yaml(
                critic_model_arch.pretty(resolve=True),
                custom_objects=custom_objects
            ),
            target_smoothing_stddev=target_smoothing_stddev,
            tau=tau,
            policy_state_size=self.primitive_size,
            critic_state_size=state_size + goal_size,
            action_size=int(action_size),
            n_simulations=n_simulations,
        )
    else:
        movement_policy_model = full_policy_model
        self.primitive_td3 = None
        self.primitive_size = None
        self.movement_td3 = TD3(
            policy_learning_rate=policy_movement_learning_rate,
            policy_model=movement_policy_model,
            critic_learning_rate=critic_learning_rate,
            critic_model=keras.models.model_from_yaml(
                critic_model_arch.pretty(resolve=True),
                custom_objects=custom_objects
            ),
            target_smoothing_stddev=target_smoothing_stddev,
            tau=tau,
            policy_state_size=state_size + goal_size,
            critic_state_size=state_size + goal_size,
            action_size=int(action_size),
            n_simulations=n_simulations,
        )

    self.n_simulations = n_simulations
    self.exploration_prob = exploration_prob
else:
    raise Exception('Unknown env')

obs_size, act_size = env.observation_space.shape[0], env.action_space.shape[0]
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

env.seed(args.seed)
env.action_space.seed(args.seed)
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)

# if args.agent == 'td3':
agent = TD3(device, obs_size, act_size)
train(agent, env, n_episodes=100, n_random_episodes=10)
plt.plot(episode_list, reward_list, label='td3')
# plt.savefig('ddpg_stoch2.png')

# elif args.agent == 'ddpg':
np.save('td3_episode_list.npy', episode_list)
np.save('td3_reward_list.npy', reward_list)
episode_list = []
reward_list = []
agent = DDPG(device, obs_size, act_size)
train(agent, env, n_episodes=100, n_random_episodes=10)
plt.plot(episode_list, reward_list, label='ddpg')

plt.legend()
plt.savefig('ddpg_stoch2.png')
def main(env_name, seed, hyper_params):
    env = gym.make(env_name)
    state_dim = sum(list(env.observation_space.shape))
    action_dim = sum(list(env.action_space.shape))
    action_max = float(env.action_space.high[0])

    torch.manual_seed(seed)
    np.random.seed(seed)
    env.seed(seed)
    env.action_space.np_random.seed(seed)

    device = torch.device('cuda' if hyper_params['use_cuda'] else 'cpu')

    kwargs = {
        'device': device,
        'state_dim': state_dim,
        'action_dim': action_dim,
        'action_max': action_max,
        'gamma': hyper_params['gamma'],
        'tau': hyper_params['tau'],
        'lr': hyper_params['lr'],
        'policy_noise': hyper_params['policy_noise'] * action_max,
        'noise_clip': hyper_params['noise_clip'] * action_max,
        'exploration_noise': hyper_params['exploration_noise'] * action_max,
        'policy_freq': hyper_params['policy_freq']
    }
    agent = TD3(**kwargs)
    replay_buffer = ReplayBuffer(state_dim, action_dim, device, max_size=int(1e6))

    file_dir = os.path.abspath(os.path.dirname(__file__))
    save_dir = os.path.join(file_dir, 'results', env_name, 'seed' + str(seed))
    os.makedirs(save_dir, exist_ok=True)

    # Evaluate the untrained policy once before training
    evals = [eval_policy(agent.rollout_actor, env_name, seed)]

    state = env.reset()
    episode_reward = 0
    episode_time_step = 0
    episode_num = 0
    episode_start = time.time()

    for t in range(hyper_params['max_time_step']):
        episode_time_step += 1

        # Random exploration for the initial time steps, then the learned policy
        if t < hyper_params['initial_time_step']:
            action = env.action_space.sample()
        else:
            action = agent.rollout_actor.select_action(state)

        next_state, reward, done, _ = env.step(action)
        # Do not treat time-limit terminations as true terminal states
        done_buffer = done if episode_time_step < env._max_episode_steps else False
        replay_buffer.add(state, next_state, action, reward, done_buffer)

        state = next_state
        episode_reward += reward

        if t >= hyper_params['initial_time_step']:
            agent.train(replay_buffer, batch_size=hyper_params['batch_size'])

        if done:
            print(f'Total T: {t + 1} Episode Num: {episode_num + 1} Reward: {episode_reward:.3f}',
                  f'(Frame/sec {episode_time_step / (time.time() - episode_start):.3f})')
            # Reset environment
            state = env.reset()
            episode_reward = 0
            episode_time_step = 0
            episode_num += 1
            episode_start = time.time()

        if (t + 1) % hyper_params['eval_freq'] == 0:
            test_start = time.time()
            # Test policy
            evals.append(eval_policy(agent.rollout_actor, env_name, seed))
            test_time = time.time() - test_start
            episode_start += test_time

    evals = np.array(evals)
    np.savetxt(os.path.join(save_dir, 'Episode_Rewards.txt'), evals)

    plt.figure()
    time_step = np.arange(len(evals)) * hyper_params['eval_freq']
    plt.plot(time_step, evals)
    plt.xlabel('Time Steps')
    plt.ylabel('Episode Rewards')
    plt.grid()
    file_name = 'Episode_Rewards.png'
    file_path = os.path.join(save_dir, file_name)
    plt.savefig(file_path)
    plt.close()

    model_path = os.path.join(save_dir, 'learned_model')
    os.makedirs(model_path, exist_ok=True)
    agent.save(model_path)
def main():
    env = gym.make('BipedalWalker-v3')

    # Set seed for reproducible results
    seed = 1
    env.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    random.seed(seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    buffer_size = 1000000
    batch_size = 100
    noise = 0.1

    # Uncomment to use GPU, but errors exist if GPU is not supported anymore.
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device = torch.device("cpu")

    policy = TD3(state_dim, action_dim, max_action, env, device)
    try:
        print("Loading previous model")
        policy.load()
    except Exception:
        print('No previous model to load. Training from scratch.')

    buffer = ExperienceReplay(buffer_size, batch_size, device)

    save_score = 400
    episodes = 650
    timesteps = 2000
    best_reward = -1 * sys.maxsize
    scores_over_episodes = []

    for episode in range(episodes):
        avg_reward = 0
        state = env.reset()
        for i in range(timesteps):
            # As in TD3: select an action and add exploration noise
            action = policy.select_action(state) + np.random.normal(
                0, max_action * noise, size=action_dim)
            action = action.clip(env.action_space.low, env.action_space.high)

            # Take the action in the environment
            next_state, reward, done, _ = env.step(action)
            buffer.store_transition(state, action, reward, next_state, done)

            state = next_state
            avg_reward += reward

            env.render()
            if len(buffer) > batch_size:
                policy.train(buffer, i)

            if done or i > timesteps:
                scores_over_episodes.append(avg_reward)
                print('Episode ', episode, 'finished with reward:', avg_reward)
                print('Finished at timestep ', i)
                break

        if np.mean(scores_over_episodes[-50:]) > save_score:
            print('Saving agent - past 50 scores gave better avg than ', save_score)
            best_reward = np.mean(scores_over_episodes[-50:])
            save_score = best_reward
            policy.save()
            break  # Saved agent. Break out of episodes and end; 400 is pretty good.

        if episode >= 0 and avg_reward > best_reward:
            print('Saving agent - score for this episode was better than best-known score.')
            best_reward = avg_reward
            policy.save()  # Save current policy + optimizer

    fig = plt.figure()
    plt.plot(np.arange(1, len(scores_over_episodes) + 1), scores_over_episodes)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.show()
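# The ExperienceReplay class used above is not shown. Below is a minimal sketch of a
# buffer with the same interface (store_transition, __len__, and a sample() method
# that policy.train() could call); it is an assumption for illustration and the
# original implementation may differ.
import random
from collections import deque

import numpy as np
import torch


class ExperienceReplay:
    def __init__(self, buffer_size, batch_size, device):
        self.memory = deque(maxlen=int(buffer_size))
        self.batch_size = batch_size
        self.device = device

    def store_transition(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, float(done)))

    def sample(self):
        # Uniformly sample a minibatch and convert it to tensors on the target device
        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        as_tensor = lambda x: torch.as_tensor(x, dtype=torch.float32, device=self.device)
        return tuple(map(as_tensor, (states, actions, rewards, next_states, dones)))

    def __len__(self):
        return len(self.memory)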
# Action noise
if args.ou_noise:
    a_noise = OrnsteinUhlenbeckProcess(action_dim,
                                       mu=args.ou_mu,
                                       theta=args.ou_theta,
                                       sigma=args.ou_sigma)
else:
    a_noise = GaussianNoise(action_dim, sigma=args.gauss_sigma)

for run in range(7, 8):  # args.nbRuns
    memory = Memory(args.mem_size, state_dim, action_dim)

    # Agent
    if args.use_td3:
        print("RUNNING : TD3")
        # TD3
        agent = TD3(state_dim, action_dim, max_action, memory, args)
    else:
        print("RUNNING : DDPG")
        # DDPG
        agent = DDPG(state_dim, action_dim, max_action, memory, args)

    if args.mode == 'train':
        train(run,
              n_episodes=args.n_episodes,
              output=args.output,
              debug=args.debug,
              render=False)  # hard-coded override
    else:
        raise RuntimeError('undefined mode {}'.format(args.mode))
from collections import deque

import tensorflow as tf
from cpprb import ReplayBuffer, PrioritizedReplayBuffer
from dm_control import suite
from IPython.display import clear_output
from tqdm import trange

from td3 import TD3

BUFFER_SIZE = int(1e5)
STATE_DIM = 5
ACTION_DIM = 1
BATCH_SIZE = 256

env = suite.load(domain_name='cartpole', task_name='swingup')
action_spec = env.action_spec()

agent = TD3(STATE_DIM, ACTION_DIM, max_action=action_spec.maximum)
print('Running on ', agent.device)

rb = ReplayBuffer(BUFFER_SIZE,
                  {"obs": {"shape": (STATE_DIM,)},
                   "act": {"shape": ACTION_DIM},
                   "rew": {},
                   "next_obs": {"shape": (STATE_DIM,)},
                   "done": {}})

n_episodes = 3
max_t = 1e3
print_every = 2

scores_deque = deque(maxlen=print_every)
scores = []
for i_episode in trange(1, int(n_episodes) + 1):
    time_step = env.reset()
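# The episode loop above is cut off. Below is a hedged, self-contained sketch of how
# a single dm_control episode could be run against the cpprb buffer created above.
# The agent methods get_action() and train() are assumptions for illustration and
# may not match the actual td3.TD3 interface.
import numpy as np


def flatten_observation(time_step):
    # dm_control observations are an OrderedDict of arrays; flatten to a vector.
    return np.concatenate([np.ravel(v) for v in time_step.observation.values()])


def run_episode(env, agent, rb, batch_size, max_t=1000):
    time_step = env.reset()
    state = flatten_observation(time_step)
    score = 0.0
    for _ in range(int(max_t)):
        action = agent.get_action(state)        # assumed agent API
        time_step = env.step(action)
        next_state = flatten_observation(time_step)
        reward = time_step.reward or 0.0
        done = time_step.last()

        rb.add(obs=state, act=action, rew=reward, next_obs=next_state, done=done)
        if rb.get_stored_size() > batch_size:
            agent.train(rb.sample(batch_size))  # assumed agent API

        state = next_state
        score += reward
        if done:
            break
    return score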