def _train(args):
    if not os.path.exists("./results"):
        os.makedirs("./results")
    if not os.path.exists(args.model_dir):
        os.makedirs(args.model_dir)

    # Launch the env with our helper function
    env = launch_env()
    print("Initialized environment")

    # Wrappers
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)  # to make the images from 160x120x3 into 3x160x120
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")

    # Set seeds
    seed(args.seed)

    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    policy = DDPG(state_dim, action_dim, max_action, net_type="cnn")
    replay_buffer = ReplayBuffer(args.replay_buffer_max_size)
    print("Initialized DDPG")

    # Evaluate untrained policy
    evaluations = [evaluate_policy(env, policy)]

    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    done = True
    episode_reward = None
    env_counter = 0
    reward = 0
    episode_timesteps = 0

    print("Starting training")
    while total_timesteps < args.max_timesteps:
        print("timestep: {} | reward: {}".format(total_timesteps, reward))

        if done:
            if total_timesteps != 0:
                print(("Total T: %d Episode Num: %d Episode T: %d Reward: %f") %
                      (total_timesteps, episode_num, episode_timesteps, episode_reward))
                policy.train(replay_buffer, episode_timesteps, args.batch_size,
                             args.discount, args.tau)

                # Evaluate episode
                if timesteps_since_eval >= args.eval_freq:
                    timesteps_since_eval %= args.eval_freq
                    evaluations.append(evaluate_policy(env, policy))
                    print("rewards at time {}: {}".format(total_timesteps, evaluations[-1]))

                    if args.save_models:
                        policy.save(filename='{}_{}'.format('ddpg', total_timesteps),
                                    directory=args.model_dir)
                    np.savez("./results/rewards.npz", evaluations)

            # Reset environment
            env_counter += 1
            obs = env.reset()
            done = False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # Select action randomly or according to policy
        if total_timesteps < args.start_timesteps:
            action = env.action_space.sample()
        else:
            action = policy.predict(np.array(obs))
            if args.expl_noise != 0:
                action = (action + np.random.normal(
                    0, args.expl_noise, size=env.action_space.shape[0])
                ).clip(env.action_space.low, env.action_space.high)

        # Perform action
        new_obs, reward, done, _ = env.step(action)

        if episode_timesteps >= args.env_timesteps:
            done = True

        done_bool = 0 if episode_timesteps + 1 == args.env_timesteps else float(done)
        episode_reward += reward

        # Store data in replay buffer
        replay_buffer.add(obs, new_obs, action, reward, done_bool)

        obs = new_obs

        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1

    print("Training done, about to save..")
    policy.save(filename='ddpg', directory=args.model_dir)
    print("Finished saving..should return now!")
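
# NOTE: `evaluate_policy` is called by both training loops in this file but is not defined
# in this section. The sketch below is an assumed, illustrative implementation (deterministic
# rollouts averaged over a few episodes); `eval_episodes` and `max_timesteps` are hypothetical
# parameters, and the gym-style `env.step` / `policy.predict` interfaces mirror their use above.
def evaluate_policy(env, policy, eval_episodes=10, max_timesteps=500):
    """Run the current policy without exploration noise and return the mean episode return."""
    returns = []
    for _ in range(eval_episodes):
        obs = env.reset()
        done = False
        total = 0.0
        steps = 0
        while not done and steps < max_timesteps:
            action = policy.predict(np.array(obs))
            obs, reward, done, _ = env.step(action)
            total += reward
            steps += 1
        returns.append(total)
    return np.mean(returns)
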
def _train(args):
    if not os.path.exists("./results"):
        os.makedirs("./results")
    if not os.path.exists(args.model_dir):
        os.makedirs(args.model_dir)

    # Launch the env with our helper function
    env = launch_env()
    print("Initialized environment")

    # Wrappers
    env = ResizeWrapper(env)
    env = GrayscaleWrapper(env)
    env = NormalizeWrapper(env)
    env = FrameStack(env, 4)
    env = DtRewardWrapper(env)
    env = ActionWrapper(env)
    print("Initialized Wrappers")

    # Set seeds
    seed(args.seed)

    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Init training data
    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    episode_reward = 0
    env_counter = 0
    reward = 0
    episode_timesteps = 0
    avg_episodes = 100

    # Keep track of the best reward over time
    best_reward = -np.inf

    # Keep track of train_rewards
    train_rewards = []

    # To print mean actions per episode
    mean_action = []

    # To keep track of moving averages
    moving_avgs = []

    # Summary writer for tensorboard
    writer = SummaryWriter(log_dir="reinforcement/pytorch/runs")

    # Initialize policy
    if args.policy not in policies:
        raise ValueError("Policy {} is not available, choose one of: {}".format(
            args.policy, list(policies.keys())))
    policy = policies[args.policy](state_dim, action_dim, max_action, args.per, args.gradclip)

    # Evaluate untrained policy
    evaluations = [evaluate_policy(env, policy)]
    moving_avgs.append(evaluations[0])
    writer.add_scalar("Timesteps/EvaluationReward", evaluations[0], total_timesteps)

    # Initialize ReplayBuffer
    if args.per:
        print("Training with Prioritized Experience Replay")
        replay_buffer = PrioritizedReplayBuffer(
            args.replay_buffer_max_size,
            args.batch_size,
            args.seed,
            initial_beta=0.5,
            delta_beta=2 / args.max_timesteps,
        )
    else:
        replay_buffer = ReplayBuffer(args.replay_buffer_max_size, args.batch_size, args.seed)

    # Load previous policy
    if args.load_initial_policy:
        # Disable random start steps
        args.start_timesteps = 0

        # Load training data
        checkpoint = load_training_state(args.model_dir, args.policy + "_training")
        evaluations = checkpoint["evaluations"]
        total_timesteps = checkpoint["total_timesteps"]
        train_rewards = checkpoint["train_rewards"]
        episode_num = checkpoint["episode_num"]
        best_reward = checkpoint["best_reward"]
        moving_avgs = checkpoint["moving_avgs"]

        # Load policy
        policy.load(args.model_dir, args.policy)

    print("Starting training")
    obs = env.reset()

    while total_timesteps < args.max_timesteps:

        # Select action randomly or according to policy
        if total_timesteps < args.start_timesteps:
            action = env.action_space.sample()
        else:
            action = policy.predict(np.array(obs))
            action = add_noise(action, args.expl_noise,
                               env.action_space.low, env.action_space.high)
        mean_action.append(action)

        # Perform action
        new_obs, reward, done, _ = env.step(action)

        # Update episode reward
        episode_reward += reward

        # Store data in replay buffer
        replay_buffer.add(obs, action, reward, new_obs, float(done))

        # Update network
        if len(replay_buffer) >= args.batch_size:
            policy.update(replay_buffer, args.discount, args.tau)

        # Update env
        obs = new_obs
        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1

        if episode_timesteps >= args.env_timesteps:
            done = True

        if done:
            print(("Total T: %d Episode Num: %d \nMean actions: %.2f %.2f "
                   "Episode T: %d Reward: %.1f Moving Average: %.1f") % (
                total_timesteps,
                episode_num,
                np.mean(np.array(mean_action), axis=0)[0],
                np.mean(np.array(mean_action), axis=0)[1],
                episode_timesteps,
                episode_reward,
                moving_avgs[-1],
            ))
            train_rewards.append(episode_reward)
            moving_avgs.append(moving_average(train_rewards, avg_episodes))
            writer.add_scalar("Timesteps/Rewards", episode_reward, total_timesteps)
            writer.add_scalar("Timesteps/MovingAverage", moving_avgs[-1], total_timesteps)
            # writer.add_scalar("Episode/Wheel1Mean", np.mean(np.array(mean_action), axis=0)[0], episode_num)
            # writer.add_scalar("Episode/Wheel2Mean", np.mean(np.array(mean_action), axis=0)[1], episode_num)

            # Evaluate episode
            if timesteps_since_eval >= args.eval_freq:
                timesteps_since_eval %= args.eval_freq
                eval_reward = evaluate_policy(env, policy)
                evaluations.append(eval_reward)
                writer.add_scalar("Timesteps/EvaluationReward", eval_reward, total_timesteps)
                print("\n-+-+-+-+-+-+-+-+-+-+ Evaluation reward at time {}: {} +-+-+-+-+-+-+-+-+-+-"
                      .format(total_timesteps, eval_reward))

                np.savetxt(
                    "reinforcement/pytorch/results/eval_rewards_" + args.policy + ".csv",
                    np.array(evaluations),
                    delimiter=",",
                )
                np.savetxt(
                    "reinforcement/pytorch/results/train_rewards_" + args.policy + ".csv",
                    np.array(train_rewards),
                    delimiter=",",
                )
                np.savetxt(
                    "reinforcement/pytorch/results/moving_averages_" + args.policy + ".csv",
                    np.array(moving_avgs),
                    delimiter=",",
                )

                # Save the policy according to the best reward over training
                if eval_reward > best_reward:
                    best_reward = eval_reward
                    policy.save(args.model_dir, args.policy)
                    save_training_state(
                        args.model_dir,
                        args.policy + "_training",
                        best_reward,
                        total_timesteps,
                        evaluations,
                        train_rewards,
                        episode_num,
                        moving_avgs,
                    )
                    print("-+-+-+-+-+-+-+-+-+-+ Model saved +-+-+-+-+-+-+-+-+-+-\n")

            # Reset environment
            mean_action = []
            obs = env.reset()
            env_counter += 1
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

    print("Finished..should return now!")
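
# NOTE: `add_noise` and `moving_average` are used by the second training loop but are not
# defined in this section. The sketches below are assumed implementations for illustration:
# zero-mean Gaussian exploration noise clipped to the action bounds (mirroring the inline
# noise logic of the first loop), and a running mean over the last `n` episode rewards.
def add_noise(action, expl_noise, low, high):
    """Add Gaussian exploration noise and clip the result to the action-space bounds."""
    if expl_noise != 0:
        action = action + np.random.normal(0, expl_noise, size=np.array(action).shape)
    return np.clip(action, low, high)


def moving_average(rewards, n):
    """Mean of the last `n` episode rewards (or of all rewards while fewer than `n` exist)."""
    window = rewards[-n:]
    return float(np.mean(window))
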