def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic,
          normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise,
          popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory,
          tau=0.01, eval_env=None, param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations, batch_size=batch_size,
                 action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0
        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        # saver.restore(tf.get_default_session(), '/home/projectvenom/Documents/AIPilot/AIPilot-ProjectVenom-master/model_SMode_exp4/Exp4_1/Exp4_SMode_best')
        roll_distance = deque([])
        roll_success_rate = deque([])
        roll_return = deque([])
        max_success_rate = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1
                    # episode_distance.append(info['distance'])

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        if len(roll_distance) < 2000:
                            roll_distance.append(info['distance'])
                        else:
                            roll_distance.popleft()
                            roll_distance.append(info['distance'])
                        success_rate = 1 if info['distance'] <= 1 else 0
                        if len(roll_success_rate) < 2000:
                            roll_success_rate.append(success_rate)
                        else:
                            roll_success_rate.popleft()
                            roll_success_rate.append(success_rate)
                        if len(roll_return) < 2000:
                            roll_return.append(episode_reward)
                        else:
                            roll_return.popleft()
                            roll_return.append(episode_reward)

                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

            saver.save(tf.get_default_session(), '/home/projectvenom/Documents/AIPilot/AIPilot-ProjectVenom-master/model/Exp5_mv')
            if np.mean(roll_success_rate) > max_success_rate:
                max_success_rate = np.mean(roll_success_rate)
                saver.save(tf.get_default_session(), '/home/projectvenom/Documents/AIPilot/AIPilot-ProjectVenom-master/model/Exp5_mv_best')

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            # only for test
            combined_stats['AI_success_rate'] = np.mean(roll_success_rate)
            combined_stats['AI_distance'] = np.mean(roll_distance)
            combined_stats['AI_return'] = np.mean(roll_return)
            # only for test
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums)}

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)

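
# Usage sketch (not from the original project): how a train() function with the signature above is
# typically wired up. It assumes the classic OpenAI Baselines DDPG helpers (baselines.ddpg.models
# Actor/Critic, baselines.ddpg.memory.Memory, baselines.ddpg.noise); the environment id and all
# hyperparameter values below are illustrative assumptions, not this project's settings.
import gym
import numpy as np
from baselines.ddpg.models import Actor, Critic
from baselines.ddpg.memory import Memory
from baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise


def run_train_example():
    env = gym.make('Pendulum-v0')
    nb_actions = env.action_space.shape[-1]
    # Replay buffer, networks and exploration noise as expected by the train() variant above.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    actor = Actor(nb_actions, layer_norm=True)
    critic = Critic(layer_norm=True)
    action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                sigma=0.2 * np.ones(nb_actions))
    train(env=env, nb_epochs=500, nb_epoch_cycles=20, render_eval=False, reward_scale=1.0,
          render=False, param_noise=None, actor=actor, critic=critic, normalize_returns=False,
          normalize_observations=True, critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3,
          action_noise=action_noise, popart=False, gamma=0.99, clip_norm=None,
          nb_train_steps=50, nb_rollout_steps=100, nb_eval_steps=100, batch_size=64,
          memory=memory)
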
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic,
          normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise,
          popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory,
          saved_model_basename, restore_model_name, crowdai_client, crowdai_token, reward_shaping,
          feature_embellishment, relative_x_pos, relative_z_pos,
          tau=0.01, eval_env=None, param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations, batch_size=batch_size,
                 action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    saved_model_dir = 'saved-models/'
    if saved_model_basename is None:
        saved_model_basename = ''.join(random.choices(string.ascii_lowercase + string.digits, k=8))
    saved_model_path = saved_model_dir + saved_model_basename
    if restore_model_name:
        restore_model_path = restore_model_name
        if not pathlib.Path(restore_model_path + '.index').is_file():
            restore_model_path = saved_model_dir + restore_model_name
    max_to_keep = 500
    eval_reward_threshold_to_keep = 300
    saver = tf.train.Saver(max_to_keep=max_to_keep)
    adam_optimizer_store = dict()
    adam_optimizer_store['actor_optimizer'] = dict()
    adam_optimizer_store['critic_optimizer'] = dict()

    # eval_episode_rewards_history = deque(maxlen=100)
    # episode_rewards_history = deque(maxlen=100)

    with U.single_threaded_session() as sess:
        try:
            if restore_model_name:
                logger.info("Restoring from model at", restore_model_path)
                # saver.restore(sess, tf.train.latest_checkpoint(model_path))
                saver.restore(sess, restore_model_path)
            else:
                logger.info("Creating new model")
                sess.run(tf.global_variables_initializer())  # this should happen here and not in the agent right?
        except InvalidArgumentError as exc:
            if "Assign requires shapes of both tensors to match." in str(exc):
                print("Unable to restore model from {:s}.".format(restore_model_path))
                print("Chances are you're trying to restore a model with reward embellishment into an "
                      "environment without reward embellishment (or vice versa). Unfortunately this "
                      "isn't supported (yet).")
                print(exc.message)
                sys.exit()
            else:
                raise exc

        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()
        agent.reset()

        # restore adam optimizer
        try:
            if restore_model_name:
                logger.info("Restoring pkl file with adam state", restore_model_path)
                # saver.restore(sess, tf.train.latest_checkpoint(model_path))
                adam_optimizer_store = pickle.load(open(restore_model_path + ".pkl", "rb"))
                agent.actor_optimizer.m = adam_optimizer_store['actor_optimizer']['m']
                agent.actor_optimizer.v = adam_optimizer_store['actor_optimizer']['v']
                agent.actor_optimizer.t = adam_optimizer_store['actor_optimizer']['t']
                agent.critic_optimizer.m = adam_optimizer_store['critic_optimizer']['m']
                agent.critic_optimizer.v = adam_optimizer_store['critic_optimizer']['v']
                agent.critic_optimizer.t = adam_optimizer_store['critic_optimizer']['t']
                if 'param_noise' in adam_optimizer_store:
                    agent.param_noise = adam_optimizer_store['param_noise']
        except:
            print("Unable to restore adam state from {:s}.".format(restore_model_path))

        obs = env.reset()
        done = False
        episode_reward = 0.
        # episode_step = 0
        # episodes = 0
        # t = 0

        # epoch_episode_steps = []
        # epoch_episode_eval_rewards = []
        # epoch_episode_eval_steps = []
        # epoch_start_time = time.time()
        # epoch_actions = []
        # epoch_episodes = 0
        for epoch in range(nb_epochs):
            start_time = time.time()
            epoch_episode_rewards = []
            epoch_qs = []
            eval_episode_rewards = []
            eval_qs = []
            eval_steps = []
            epoch_actor_losses = []
            epoch_critic_losses = []
            worth_keeping = False
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    # new_obs, r, done, info = env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    new_obs, r, done, info = env.step(action)
                    # t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    # episode_step += 1

                    # Book-keeping.
                    # epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        # episode_rewards_history.append(episode_reward)
                        # epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        # episode_step = 0
                        # epoch_episodes += 1
                        # episodes += 1
                        agent.reset()
                        obs = env.reset()

                # Train.
                # epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        # epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

            # Submit to crowdai competition. What a hack. :)
            # if crowdai_client is not None and crowdai_token is not None and eval_env is not None:
            crowdai_submit_count = 0
            if crowdai_client is not None and crowdai_token is not None:
                eval_obs_dict = crowdai_client.env_create(crowdai_token, env_id="ProstheticsEnv")
                eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
                    eval_obs_dict,
                    reward_shaping=reward_shaping,
                    reward_shaping_x=1.,
                    feature_embellishment=feature_embellishment,
                    relative_x_pos=relative_x_pos,
                    relative_z_pos=relative_z_pos)
                while True:
                    action, _ = agent.pi(eval_obs_projection, apply_noise=False, compute_Q=False)
                    submit_action = prosthetics_env.openai_to_crowdai_submit_action(action)
                    clipped_submit_action = np.clip(submit_action, 0., 1.)
                    actions_equal = clipped_submit_action == submit_action
                    if not np.all(actions_equal):
                        logger.debug("crowdai_submit_count:", crowdai_submit_count)
                        logger.debug("  openai-action:", action)
                        logger.debug("  submit-action:", submit_action)
                    crowdai_submit_count += 1
                    [eval_obs_dict, reward, done, info] = crowdai_client.env_step(clipped_submit_action.tolist(), True)
                    # [eval_obs_dict, reward, done, info] = crowdai_client.env_step(agent.pi(eval_obs_projection, apply_noise=False, compute_Q=False), True)
                    eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
                        eval_obs_dict,
                        reward_shaping=reward_shaping,
                        reward_shaping_x=1.,
                        feature_embellishment=feature_embellishment,
                        relative_x_pos=relative_x_pos,
                        relative_z_pos=relative_z_pos)
                    if done:
                        logger.debug("done: crowdai_submit_count:", crowdai_submit_count)
                        eval_obs_dict = crowdai_client.env_reset()
                        if not eval_obs_dict:
                            break
                        logger.debug("done: eval_obs_dict exists after reset")
                        eval_obs_dict, eval_obs_projection = prosthetics_env.transform_observation(
                            eval_obs_dict,
                            reward_shaping=reward_shaping,
                            reward_shaping_x=1.,
                            feature_embellishment=feature_embellishment,
                            relative_x_pos=relative_x_pos,
                            relative_z_pos=relative_z_pos)
                crowdai_client.submit()
                return  # kids, don't try any of these (expedient hacks) at home!

            if eval_env:
                eval_episode_reward_mean, eval_q_mean, eval_step_mean = evaluate_n_episodes(
                    3, eval_env, agent, nb_eval_steps, render_eval)
                if eval_episode_reward_mean >= eval_reward_threshold_to_keep:
                    worth_keeping = True

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            if nb_epochs and nb_epoch_cycles and nb_train_steps > 0:
                # stats = agent.get_stats()
                # combined_stats = stats.copy()
                combined_stats = {}
                combined_stats['train/epoch_episode_reward_mean'] = np.mean(epoch_episode_rewards)
                # combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
                # combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
                # combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
                combined_stats['train/epoch_Q_mean'] = np.mean(epoch_qs)
                combined_stats['train/epoch_loss_actor'] = np.mean(epoch_actor_losses)
                combined_stats['train/epoch_loss_critic'] = np.mean(epoch_critic_losses)
                # combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
                combined_stats['train/epoch_duration'] = duration
                # combined_stats['epoch/steps_per_second'] = float(t) / float(duration)
                # combined_stats['total/episodes'] = episodes
                # combined_stats['rollout/episodes'] = epoch_episodes
                # combined_stats['rollout/actions_std'] = np.std(epoch_actions)
                # combined_stats['memory/rss'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
            else:
                combined_stats = {}

            # Evaluation statistics.
            if eval_env:
                combined_stats['eval/epoch_episode_reward_mean'] = eval_episode_reward_mean  # np.mean(eval_episode_rewards)
                # combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                # combined_stats['eval/epoch_episode_reward_std'] = np.std(eval_episode_rewards)
                combined_stats['eval/epoch_Q_mean'] = eval_q_mean  # np.mean(eval_qs)
                # combined_stats['eval/episodes'] = len(eval_episode_rewards)
                combined_stats['eval/steps_mean'] = eval_step_mean  # np.mean(eval_steps)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums)}

            # Total statistics.
            # combined_stats['total/epochs'] = epoch + 1
            # combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.info('')
            logger.info('Epoch', epoch)
            logger.dump_tabular()
            logdir = logger.get_dir()

            if worth_keeping and rank == 0 and nb_epochs and nb_epoch_cycles and nb_train_steps and nb_rollout_steps:
                logger.info('Saving model to', saved_model_dir + saved_model_basename + '-' + str(epoch))
                saver.save(sess, saved_model_path, global_step=epoch, write_meta_graph=False)
                adam_optimizer_store['actor_optimizer']['m'] = agent.actor_optimizer.m
                adam_optimizer_store['actor_optimizer']['v'] = agent.actor_optimizer.v
                adam_optimizer_store['actor_optimizer']['t'] = agent.actor_optimizer.t
                adam_optimizer_store['critic_optimizer']['m'] = agent.critic_optimizer.m
                adam_optimizer_store['critic_optimizer']['v'] = agent.critic_optimizer.v
                adam_optimizer_store['critic_optimizer']['t'] = agent.critic_optimizer.t
                adam_optimizer_store['param_noise'] = agent.param_noise
                pickle.dump(adam_optimizer_store, open((saved_model_path + "-" + str(epoch) + ".pkl"), "wb"))
                old_epoch = epoch - max_to_keep
                if old_epoch >= 0:
                    try:
                        os.remove(saved_model_path + "-" + str(old_epoch) + ".pkl")
                    except OSError:
                        pass

            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)

def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic,
          normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise,
          popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory,
          tau=0.01, eval_env=None, param_noise_adaption_interval=50, tensorboard_directory=None):
    rank = MPI.COMM_WORLD.Get_rank()

    # assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = 1
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations, batch_size=batch_size,
                 action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    if not os.path.exists(tensorboard_directory):
        os.makedirs(tensorboard_directory)
    else:
        for file in os.listdir(tensorboard_directory):
            file_path = os.path.join(tensorboard_directory, file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
            except Exception as e:
                print(e)

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        (episode_summaries, individual_summaries, batch_summaries,
         episode_reward_pl, qfunc_loss_pl, actions_pl, prices_pl, individual_reward_pl,
         individual_pnl_pl, individual_tc_pl, individual_estimated_q_pl) = build_summaries(env.action_space.shape[0])
        sess.graph.finalize()
        writer = tf.summary.FileWriter(tensorboard_directory, sess.graph)
        agent.reset()
        obs_state = env.reset()
        obs = obs_state.features
        if eval_env is not None:
            eval_obs = eval_env.reset().features
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0
        train_steps = 0
        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    # assert max_action.shape == action.shape
                    new_obs_state, r, done, info = env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    new_obs = new_obs_state.features
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    summary = sess.run(individual_summaries, feed_dict={
                        actions_pl: action,
                        prices_pl: obs_state.price,
                        individual_reward_pl: r,
                        individual_pnl_pl: info['pnl'],
                        individual_tc_pl: info['tc'],
                        individual_estimated_q_pl: q[0, 0]
                    })
                    writer.add_summary(summary, t)

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs
                    obs_state = new_obs_state

                    if done:
                        # Episode done.
                        summary = sess.run(episode_summaries, feed_dict={episode_reward_pl: episode_reward})
                        writer.add_summary(summary, episodes)

                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs_state = env.reset()
                        obs = obs_state.features

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    train_steps += 1
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    summary = sess.run(batch_summaries, feed_dict={qfunc_loss_pl: cl})
                    writer.add_summary(summary, train_steps)
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        eval_obs = eval_obs.features
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset().features
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            print("EPOCH_EPISODE_REWARDS:", len(epoch_episode_rewards))
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums)}

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)

def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic,
          normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise,
          popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory,
          tau=0.01, eval_env=None, param_noise_adaption_interval=50, restore=True):
    rank = MPI.COMM_WORLD.Get_rank()

    # assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    # max_action = env.action_space.high
    # logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, (env.action_space.shape[0] - 2,),
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations, batch_size=batch_size,
                 observation_range=(env.observation_space.low[0], env.observation_space.high[0]),
                 action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up saving stuff only for a single worker.
    home = expanduser("~")
    savingModelPath = home + "/Documents/saved_models_OpenAI_gym/"
    if rank == 0:
        saver = tf.train.Saver(keep_checkpoint_every_n_hours=1)
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        # from https://github.com/openai/baselines/issues/162#issuecomment-397356482 and
        # https://www.tensorflow.org/api_docs/python/tf/train/import_meta_graph
        if restore == True:  # restoring doesn't actually work
            logger.info("Restoring from saved model")
            saver = tf.train.import_meta_graph(savingModelPath + "ddpg_test_model.meta")
            saver.restore(sess, tf.train.latest_checkpoint(savingModelPath))
        else:
            logger.info("Starting from scratch!")
            sess.run(tf.global_variables_initializer())  # this should happen here and not in the agent right?

        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0
        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            start_time_epoch = time.time()
            for cycle in range(nb_epoch_cycles):
                start_time_cycle = time.time()
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    if t_rollout == nb_rollout_steps - 2:
                        print("break here")
                    start_time_rollout = time.time()
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    # e.g. action = array([ 0.02667301, 0.9654905 , -0.5694418 , -0.40275186], dtype=float32)
                    np.set_printoptions(precision=3)
                    print("selected (unscaled) action: " + str(action))  # e.g. [ 0.04 -0.662 -0.538  0.324]

                    # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    target = np.insert(action, 3, [0.0, 0.0])
                    target = scale_range(target, -1, 1, env.action_space.low, env.action_space.high)
                    # e.g. target = array([0.17346749, 0.24137263, 0.10763955, 0.83703685, 1.8033525, 1.8763105], dtype=float32)
                    # we keep the roll & pitch angle fixed
                    target[3] = 0.0
                    target[4] = np.pi / 2

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert target.shape == env.action_space.shape
                    new_obs, r, done, info = env.step(target)
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                    logger.info('runtime rollout-step {0}.{1}.{2}: {3}s'.format(epoch, cycle, t_rollout, time.time() - start_time_rollout))
                # for rollout_steps

                # Train.
                print("Training the Agent")
                start_time_train = time.time()
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):  # 50 iterations
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()  # e.g. 0.7446093559265137
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)  # e.g. 25.988863
                    epoch_actor_losses.append(al)  # e.g. -0.008966461
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

                logger.info('runtime training actor & critic: {}s'.format(time.time() - start_time_train))

                # Saving the trained model
                if saver is not None:
                    logger.info("saving the trained model")
                    start_time_save = time.time()
                    saver.save(sess, savingModelPath + "ddpg_test_model")
                    logger.info('runtime saving: {}s'.format(time.time() - start_time_save))

                logger.info('runtime epoch-cycle {0}: {1}s'.format(cycle, time.time() - start_time_cycle))
            # for epoch_cycles

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums)}

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)

            # Saving the trained model
            if saver is not None:
                logger.info("saving the trained model")
                start_time_save = time.time()
                saver.save(sess, savingModelPath + "ddpg_model_epochSave", global_step=epoch)
                logger.info('runtime saving: {}s'.format(time.time() - start_time_save))

            logger.info('runtime epoch {0}: {1}s'.format(epoch, time.time() - start_time_epoch))

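
# The variant above maps the actor's [-1, 1] outputs onto the arm's joint limits with a
# scale_range() helper that is not defined in this excerpt. A plausible implementation
# (an assumption for illustration, not necessarily the project's own code) is a plain
# linear rescaling:
import numpy as np


def scale_range(x, old_low, old_high, new_low, new_high):
    """Linearly map x (elementwise) from [old_low, old_high] to [new_low, new_high]."""
    x = np.asarray(x, dtype=np.float32)
    return new_low + (x - old_low) * (new_high - new_low) / (old_high - old_low)

# e.g. scale_range(np.zeros(6), -1, 1, env.action_space.low, env.action_space.high)
# would return the midpoint of each joint's range.
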
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic,
          normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise,
          popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory,
          tau=0.01, eval_env=None, param_noise_adaption_interval=50, **kwargs):
    # print("kwargs:", kwargs)
    rank = MPI.COMM_WORLD.Get_rank()
    print("rank:", rank)

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations, batch_size=batch_size,
                 action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        # --------------- AMEND: For saving and restoring the model. added by xlv ------------------
        if kwargs['restore'] == True and kwargs['restore_path'] != None:
            logger.info("Restoring from saved model")
            restore_path = kwargs['restore_path']
            saver = tf.train.import_meta_graph(restore_path + "trained_model.meta")
            saver.restore(sess, tf.train.latest_checkpoint(restore_path))
        else:
            logger.info("Starting from scratch!")
            sess.run(tf.global_variables_initializer())
        # ----------------------------------------------------------------------------------------
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = eval_obs = env.reset()
        # if eval_env is not None:
        #     eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0
        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0

        # every 30 epochs plot statistics and save it.
        nb_epochs_unit = 30
        ddpg_rewards = []
        eval_ddpg_rewards = []
        ddpg_suc_percents = []
        eval_suc_percents = []
        # ---- AMEND: added by xlv to calculate success percent -----
        suc_num = 0
        episode_num = 0
        # -----------------------------------------------------------
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    # new_obs, r, done, info = env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    new_obs, r, done, suc, info = env.step(max_action * action)
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1
                        # --- AMEND: added by xlv to calculate success percent ---
                        episode_num += 1
                        if suc:
                            suc_num += 1
                        # -------------------------------------------------------
                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                # eval_episode_rewards = []
                # eval_qs = []
                # if eval_env is not None:
                #     eval_episode_reward = 0.
                #     for t_rollout in range(nb_eval_steps):
                #         eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                #         eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                #         if render_eval:
                #             eval_env.render()
                #         eval_episode_reward += eval_r
                #
                #         eval_qs.append(eval_q)
                #         if eval_done:
                #             eval_obs = eval_env.reset()
                #             eval_episode_rewards.append(eval_episode_reward)
                #             eval_episode_rewards_history.append(eval_episode_reward)
                #             eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            # if eval_env is not None:
            #     combined_stats['eval/return'] = eval_episode_rewards
            #     combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
            #     combined_stats['eval/Q'] = eval_qs
            #     combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums)}

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)

            # ------------------------------ plot statistics every nb_epochs_unit -----------------------------------
            ddpg_rewards.append(np.mean(episode_rewards_history))
            if (epoch + 1) % nb_epochs_unit == 0:
                ddpg_suc_percents.append(suc_num / episode_num)

                # ---------- Evaluate for 5 iters -----------------------
                nb_eval_epochs = 5
                nb_eval_epoch_cycles = 5
                eval_episode_num = 0
                eval_suc_num = 0
                eval_episode_reward = 0
                eval_episode_step = 0
                eval_epoch_episode_rewards = []
                eval_epoch_episode_steps = []
                for i_epoch in range(nb_eval_epochs):
                    logger.log("********** Start Evaluation. Iteration %i ************" % i_epoch)
                    for i_cycle in range(nb_eval_epoch_cycles):
                        for t_rollout in range(nb_rollout_steps):
                            eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                            assert eval_action.shape == env.action_space.shape
                            eval_obs, eval_r, eval_done, eval_suc, eval_info = env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                            eval_episode_reward += eval_r
                            eval_episode_step += 1

                            if eval_done:
                                eval_obs = env.reset()
                                eval_epoch_episode_rewards.append(eval_episode_reward)
                                eval_episode_rewards_history.append(eval_episode_reward)
                                eval_epoch_episode_steps.append(eval_episode_step)
                                eval_episode_reward = 0
                                eval_episode_step = 0
                                eval_episode_num += 1
                                if eval_suc:
                                    eval_suc_num += 1

                    logger.record_tabular("Eval_EpRewMean", np.mean(eval_episode_rewards_history))
                    logger.record_tabular("Eval_EpNumUntilNow", eval_episode_num)
                    logger.record_tabular("Eval_EpNumSuc", eval_suc_num)
                    logger.record_tabular("Eval_EpSucPercent", eval_suc_num / eval_episode_num)
                    logger.dump_tabular()
                eval_ddpg_rewards.append(np.mean(eval_episode_rewards_history))
                eval_suc_percents.append(eval_suc_num / eval_episode_num)
                # ----------------------------------------------------------------------------------------------

                # --------------------- plotting and saving -------------------------
                if saver is not None:
                    logger.info("saving the trained model")
                    start_time_save = time.time()
                    if epoch + 1 == nb_epochs:
                        saver.save(sess, kwargs['MODEL_DIR'] + "/trained_model")
                    else:
                        saver.save(sess, kwargs['MODEL_DIR'] + "/iter_" + str((epoch + 1) // nb_epochs_unit))

                plot_performance(range(len(ddpg_rewards)), ddpg_rewards,
                                 ylabel=r'avg reward per DDPG learning step',
                                 xlabel='ddpg iteration',
                                 figfile=os.path.join(kwargs['FIGURE_DIR'], 'ddpg_reward'),
                                 title='TRAIN')
                plot_performance(range(len(ddpg_suc_percents)), ddpg_suc_percents,
                                 ylabel=r'overall success percentage per algorithm step under DDPG',
                                 xlabel='algorithm iteration',
                                 figfile=os.path.join(kwargs['FIGURE_DIR'], 'success_percent'),
                                 title="TRAIN")
                plot_performance(range(len(eval_ddpg_rewards)), eval_ddpg_rewards,
                                 ylabel=r'avg reward per DDPG eval step',
                                 xlabel='ddpg iteration',
                                 figfile=os.path.join(kwargs['FIGURE_DIR'], 'eval_ddpg_reward'),
                                 title='EVAL')
                plot_performance(range(len(eval_suc_percents)), eval_suc_percents,
                                 ylabel=r'overall eval success percentage per algorithm step under DDPG',
                                 xlabel='algorithm iteration',
                                 figfile=os.path.join(kwargs['FIGURE_DIR'], 'eval_success_percent'),
                                 title="EVAL")

                # save data which is accumulated UNTIL iter i
                with open(kwargs['RESULT_DIR'] + '/ddpg_reward_' + 'iter_' + str((epoch + 1) // nb_epochs_unit) + '.pickle', 'wb') as f2:
                    pickle.dump(ddpg_rewards, f2)
                with open(kwargs['RESULT_DIR'] + '/success_percent_' + 'iter_' + str((epoch + 1) // nb_epochs_unit) + '.pickle', 'wb') as fs:
                    pickle.dump(ddpg_suc_percents, fs)
                # save evaluation data accumulated until iter i
                with open(kwargs['RESULT_DIR'] + '/eval_ddpg_reward_' + 'iter_' + str((epoch + 1) // nb_epochs_unit) + '.pickle', 'wb') as f_er:
                    pickle.dump(eval_ddpg_rewards, f_er)
                with open(kwargs['RESULT_DIR'] + '/eval_success_percent_' + 'iter_' + str((epoch + 1) // nb_epochs_unit) + '.pickle', 'wb') as f_es:
                    pickle.dump(eval_suc_percents, f_es)

def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic,
          normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise,
          popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory,
          aux_apply, aux_tasks, tc_lambda, prop_lambda, caus_lambda, repeat_lambda,
          tau=0.01, eval_env=None, param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))

    # Setup aux tasks' lambdas
    aux_lambdas = {'tc': tc_lambda, 'prop': prop_lambda, 'caus': caus_lambda, 'repeat': repeat_lambda}

    # Create agent
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations, batch_size=batch_size,
                 action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
                 reward_scale=reward_scale, aux_tasks=aux_tasks, aux_lambdas=aux_lambdas)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.make_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0
        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            ep_rollout_times = []
            ep_train_times = []
            for cycle in range(nb_epoch_cycles):
                rollout_startt = time.time()
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape
                    # print("action mean:{} -- Q: {}".format(np.mean(action), q))

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # for the first 5 cycles just gather data
                if epoch == 0 and cycle < 5:
                    continue

                train_startt = time.time()
                ep_rollout_times.append(train_startt - rollout_startt)

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_aux_losses = {}
                epoch_aux_losses['grads/actor_grads'] = []
                epoch_aux_losses['grads/critic_grads'] = []
                epoch_aux_losses['grads/aux_grads'] = []
                for name in aux_tasks:
                    epoch_aux_losses['aux/' + name] = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al, auxl = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    for name, value in auxl.items():
                        if 'grads' in name:
                            epoch_aux_losses['grads/' + name].append(np.abs(value))
                        else:
                            epoch_aux_losses['aux/' + name].append(np.abs(value))
                    agent.update_target_net()

                ep_train_times.append(time.time() - train_startt)

                if eval_env is not None:
                    # Evaluate.
                    eval_episode_rewards = []
                    eval_qs = []
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

            print('rollout avg time (s): {}'.format(np.mean(ep_rollout_times)))
            print('train avg time (s): {}'.format(np.mean(ep_train_times)))

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Auxiliary statistics.
            if aux_tasks is not None:
                for name, values in epoch_aux_losses.items():
                    combined_stats[name] = np.mean(values)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums)}

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)

def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50, use_vision=False): rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high ).all() # we assume symmetric actions. max_action = env.action_space.high logger.info( 'scaling actions by {} before executing in env'.format(max_action)) if use_vision: obs_shape = (100, 100, 3) # TODO: make adjustable else: obs_shape = env.observation_space["observation"].shape agent = DDPG(actor, critic, memory, env.observation_space["observation"].shape, obs_shape, env.action_space.shape, env.observation_space["desired_goal"].shape, env.observation_space["desired_goal"].shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver() else: saver = None step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) with U.single_threaded_session() as sess: # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs_dict = env.reset() obs = obs_dict goal = obs_dict["desired_goal"] goalobs = obs_dict["desired_goal"] if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): print("starting epoch", epoch, "of", nb_epochs) for cycle in range(nb_epoch_cycles): print(" starting rollout", cycle, "of", nb_epoch_cycles) # Perform rollouts. for t_rollout in range(nb_rollout_steps): print(" starting step", t_rollout, "of", nb_rollout_steps) # Predict next action. if use_vision: obs_train = obs["pixels"] obs_train = cv2.resize(obs_train, (100, 100)) else: #TODO: Make this take correct shape obs_train = obs["observation"] action, q = agent.pi(obs_train, goalobs, state=obs["observation"], apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape new_obs, r, done, info = env.step( max_action * action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if use_vision: new_obs_train = new_obs["pixels"] new_obs_train = cv2.resize(new_obs_train, (100, 100)) else: #TODO: Make this take correct shape new_obs_train = new_obs["observation"] t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs_train, action, r, new_obs_train, done, obs["observation"], new_obs["observation"], goal, goalobs) obs = new_obs if done: # Episode done. 
epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs_dict = env.reset() obs = obs_dict goal = obs_dict["desired_goal"] goalobs = obs_dict["desired_goal"] # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): print(" >>> starting train step", t_train, "of", nb_train_steps) # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_obs = eval_env.reset() goalobs = eval_obs["desired_goal"] eval_episode_reward = 0. for t_rollout in range(nb_eval_steps): print(" >>> starting eval step", t_rollout, "of", nb_eval_steps) if use_vision: eval_obs_train = eval_obs["pixels"] eval_obs_train = cv2.resize( eval_obs_train, (100, 100)) else: # TODO: Make this take correct shape eval_obs_train = eval_obs["observation"] eval_action, eval_q = agent.pi(eval_obs_train, goalobs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step( max_action * eval_action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: eval_obs = eval_env.reset() goalobs = eval_obs["desired_goal"] eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append( eval_episode_reward) eval_episode_reward = 0. # Log stats. epoch_train_duration = time.time() - epoch_start_time duration = time.time() - start_time # stats = agent.get_stats() # TODO: add bback in combined_stats = {} # for key in sorted(stats.keys()): # TODO: add back in # combined_stats[key] = mpi_mean(stats[key]) # Rollout statistics. combined_stats['rollout/return'] = mpi_mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = mpi_mean( np.mean(episode_rewards_history)) combined_stats['rollout/episode_steps'] = mpi_mean( epoch_episode_steps) combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes) combined_stats['rollout/actions_mean'] = mpi_mean(epoch_actions) combined_stats['rollout/actions_std'] = mpi_std(epoch_actions) combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs) # Train statistics. combined_stats['train/loss_actor'] = mpi_mean(epoch_actor_losses) combined_stats['train/loss_critic'] = mpi_mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = mpi_mean( epoch_adaptive_distances) # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = mpi_mean(eval_episode_rewards) combined_stats['eval/return_history'] = mpi_mean( np.mean(eval_episode_rewards_history)) combined_stats['eval/Q'] = mpi_mean(eval_qs) combined_stats['eval/episodes'] = mpi_mean( len(eval_episode_rewards)) # Total statistics. 
combined_stats['total/duration'] = mpi_mean(duration) combined_stats['total/steps_per_second'] = mpi_mean( float(t) / float(duration)) combined_stats['total/episodes'] = mpi_mean(episodes) combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f)
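# A small helper (sketch only) mirroring the use_vision branches in the rollout and eval loops
# above: pick either the resized pixel observation or the raw state vector from a dict-style
# observation. The (100, 100) target size and the "pixels"/"observation" keys follow that code.
import cv2

def extract_train_obs(obs_dict, use_vision, size=(100, 100)):
    if use_vision:
        return cv2.resize(obs_dict["pixels"], size)
    return obs_dict["observation"]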
def train_return(env, param_noise, actor, critic, memory, nb_epochs=250, nb_epoch_cycles=20, reward_scale=1., render=False, normalize_returns=False, normalize_observations=True, critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3, action_noise=None, popart=False, gamma=0.99, clip_norm=None, nb_train_steps=50, nb_rollout_steps=2048, batch_size=64, tau=0.01, param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()
    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations, batch_size=batch_size,
                 action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
                 reward_scale=reward_scale)
    # Set up logging stuff only for a single worker.
    episode_rewards_history = deque(maxlen=100)

    # with U.single_threaded_session() as sess:
    # The session context above was commented out in the original, leaving `sess` undefined;
    # assume an already-active default TensorFlow session instead so the calls below have one.
    sess = tf.get_default_session()

    # Prepare everything.
    agent.initialize(sess)
    sess.graph.finalize()
    agent.reset()
    obs = env.reset()
    episode_reward = 0.
    episode_step = 0
    episodes = 0
    t = 0

    epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0
    for epoch in range(nb_epochs):
        print('epoch number:', epoch)
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            for t_rollout in range(nb_rollout_steps):
                # Predict next action.
                action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                assert action.shape == env.action_space.shape

                # Execute next action.
                if rank == 0 and render:
                    env.render()
                assert max_action.shape == action.shape
                new_obs, r, done, info = env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                t += 1
                if rank == 0 and render:
                    env.render()
                episode_reward += r
                episode_step += 1

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                agent.store_transition(obs, action, r, new_obs, done)
                obs = new_obs

                if done:
                    # Episode done.
                    epoch_episode_rewards.append(episode_reward)
                    episode_rewards_history.append(episode_reward)
                    epoch_episode_steps.append(episode_step)
                    episode_reward = 0.
                    episode_step = 0
                    epoch_episodes += 1
                    episodes += 1
                    agent.reset()
                    obs = env.reset()

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            for t_train in range(nb_train_steps):
                # Adapt param noise, if necessary.
                if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)
                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()
    return agent
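# Usage sketch for train_return: because the function expects an active default TensorFlow
# session, one is opened at the call site. Names such as env, actor, critic, memory and
# param_noise are assumed to be constructed beforehand, as in the surrounding snippets.
with U.single_threaded_session():
    trained_agent = train_return(env, param_noise, actor, critic, memory)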
class DDPGAgent(Policy): def __init__(self, env, agent_index, sess, action_range=(-1., 1.), reward_scale=0.1, critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3, popart=False, gamma=0.975, clip_norm=10, batch_size=64, memory_size=1e6, tau=0.01, normalize_returns=False, normalize_observations=False, noise_type="adaptive-param_0.1", layer_norm=True, nb_layers=2, nb_neurons=64, activation='tanh', **network_kwargs): super(DDPGAgent, self).__init__(agent_index) # self.sess = sess self.nb_actions = env.action_space[agent_index].n print('agent action_space ' + str(env.action_space[agent_index].n)) self.state_size = env.observation_space[agent_index].shape self.action_range = action_range with tf.variable_scope('ddpg_' + str(agent_index)): critic = Critic(name='critic_' + str(agent_index), layer_norm=layer_norm, nb_layers=nb_layers, nb_neurons=nb_neurons) actor = Actor(self.nb_actions, name='actor_' + str(agent_index), layer_norm=layer_norm, nb_neurons=nb_neurons, activation=activation) memory = Memory(limit=int(memory_size), action_shape=(self.nb_actions, ), observation_shape=self.state_size) action_noise = None param_noise = None if noise_type is not None: for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise = AdaptiveParamNoiseSpec( initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = NormalActionNoise( mu=np.zeros(self.nb_actions), sigma=float(stddev) * np.ones(self.nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise( mu=np.zeros(self.nb_actions), sigma=float(stddev) * np.ones(self.nb_actions), dt=env.world.dt, theta=0.1) else: raise RuntimeError('unknown noise type "{}"'.format( current_noise_type)) self.agent = DDPG(actor, critic, memory, self.state_size, (self.nb_actions, ), action_range=self.action_range, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(self.agent.__dict__.items())) self.agent.initialize(sess) self.agent.reset() def action(self, obs, apply_noise=False, compute_Q=False): if compute_Q: return self.agent.pi(obs, apply_noise=apply_noise, compute_Q=compute_Q) else: return self.agent.pi(obs, apply_noise=apply_noise, compute_Q=compute_Q)[0] def reset(self): return self.agent.reset()
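# A standalone sketch of the noise-spec parsing done in DDPGAgent.__init__ above. The string
# grammar ("adaptive-param_0.2", "normal_0.1", "ou_0.2", comma-separated) and the noise classes
# follow that code; the OU variant is shown with its default dt/theta for brevity.
import numpy as np
from baselines.ddpg.noise import AdaptiveParamNoiseSpec, NormalActionNoise, OrnsteinUhlenbeckActionNoise

def parse_noise(noise_type, nb_actions):
    action_noise, param_noise = None, None
    for current in (noise_type or '').split(','):
        current = current.strip()
        if current in ('', 'none'):
            continue
        name, stddev = current.split('_')
        stddev = float(stddev)
        if name == 'adaptive-param':
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=stddev, desired_action_stddev=stddev)
        elif name == 'normal':
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=stddev * np.ones(nb_actions))
        elif name == 'ou':
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=stddev * np.ones(nb_actions))
        else:
            raise ValueError('unknown noise type "{}"'.format(current))
    return action_noise, param_noise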
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50): rank = MPI.COMM_WORLD.Get_rank() #print(np.abs(env.action_space.low)) #print(np.abs(env.action_space.high)) #assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. max_action = env.action_space.high logger.info('scaling actions by {} before executing in env'.format(max_action)) if load_memory: memory=pickle.load(open("/home/vaisakhs_shaj/Desktop/BIG-DATA/memory1000000.pickle","rb")) ''' samps = memoryPrev.sample(batch_size=memoryPrev.nb_entries) print(len(samps['obs0'][1])) for i in range(memoryPrev.nb_entries): memory.append(samps['obs0'][i], samps['actions'][i], samps['rewards'][i], samps['obs1'][i], samps['terminals1'][i]) ''' print("=============memory loaded================") agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) envs = [make_env(seed) for seed in range(nproc)] envs = SubprocVecEnv(envs) ''' # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver() else: saver = None ''' saver=tf.train.Saver() step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=10) with U.make_session() as sess: # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() if restore: filename="/home/vaisakhs_shaj/Desktop/MODEL/tfSteps"+str(15000)+".model" saver.restore(sess,filename) print("loaded!!!!!!!!!!!!!") #p=[v.name for v in tf.all_variables()] #print(p) obs = envs.reset() if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_reward3 = 0. episode_step = 0 episode_step3 = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = deque(maxlen=10) epoch_episode_steps3 = deque(maxlen=10) epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 learning_starts = 10000 for epoch in range(nb_epochs): print("cycle-memory") print(max_action) for cycle in range(nb_epoch_cycles): print(cycle,"-",memory.nb_entries,end=" ") sys.stdout.flush() # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. action = np.stack([agent.pi(obs[i], apply_noise=True, compute_Q=False)[0] for i in range(nproc)]) q = np.stack([agent.pi(obs[i], apply_noise=True, compute_Q=True)[1] for i in range(nproc)]) # action, q = agent.pi(obs, apply_noise=True, compute_Q=True) #assert action.shape == env.action_space.shape #print(i) # Execute next action in parallel. 
if rank == 0 and render: env.render() #assert max_action.shape == action.shape new_obs, r, done, info = envs.step(action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if rank == 0 and render: env.render() #print(r) #print(r[1]) sys.stdout.flush() episode_reward += r[1] #episode_reward3 += r[2] episode_step += 1 #episode_step3 += 1 ''' if episode_step==300: e=episode_step re=episode_reward if episode_step>300: episode_step=e episode_reward=re ''' #print(episode_step) book_keeping_obs=obs obs = new_obs #print(envs[1]) #print(episode_reward) # Book-keeping in parallel. epoch_actions.append(np.mean(action)) epoch_qs.append(np.mean(q)) for i in range(nproc): agent.store_transition(book_keeping_obs[i], action[i], r[i], new_obs[i], done[i]) #print(done) if done[i]: # Episode done. #print("====done====",episode_reward) if i==1: epoch_episode_rewards.append(episode_reward) #rint(epoch_episode_rewards) #episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. #episode_reward3 = 0 episode_step = 0 epoch_episodes += 1 episodes += 1 ''' if i==2: #epoch_episode_rewards.append(episode_reward3) #rint(epoch_episode_rewards) episode_rewards_history.append(episode_reward3) epoch_episode_steps3.append(episode_step3) episode_reward3 = 0 episode_step3 = 0 ''' agent.reset() temp = envs.reset() obs[i]=temp[i] ''' Variables in TensorFlow only have values inside sessions. Once the session is over, the variables are lost. saver,save and saver .restore depends on session and has to be inside the session. ''' # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_episode_reward = 0. for t_rollout in range(nb_eval_steps): eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_rl eval_qs.append(eval_q) if eval_done: eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append(eval_episode_reward) eval_episode_reward = 0. #print(episode_rewards_history) if (t)%20000 == 0: fname="/home/vaisakhs_shaj/Desktop/BIG-DATA/memoryStill"+str(memory.nb_entries)+".pickle" pickle.dump(memory,open(fname,"wb"),protocol=-1) if t % 5000 == 0: print("=======saving interim model==========") filename="/home/vaisakhs_shaj/Desktop/MODEL/tfSteps"+str(t)+".model" saver.save(sess,filename) mpi_size = MPI.COMM_WORLD.Get_size() # Log stats. 
# XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean(episode_rewards_history) combined_stats['rollout/episode_steps2'] = np.mean(epoch_episode_steps) combined_stats['rollout/episode_steps3'] = np.mean(epoch_episode_steps3) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = np.mean(eval_episode_rewards) combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history) combined_stats['eval/Q'] = np.mean(eval_qs) combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s'%x) combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()])) combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)} # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() print(logdir) if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f)
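# The parallel rollout in this variant queries agent.pi twice per worker, once with
# compute_Q=False for the action and once with compute_Q=True for the Q value. A sketch of
# gathering both from a single call per worker; it assumes the same agent.pi signature used
# above, returning (action, q) when compute_Q=True.
import numpy as np

def parallel_actions(agent, obs, nproc):
    results = [agent.pi(obs[i], apply_noise=True, compute_Q=True) for i in range(nproc)]
    actions = np.stack([a for a, _ in results])
    qs = np.stack([q_val for _, q_val in results])
    return actions, qs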
def train(env_id, env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50, my_render=True, eg_explore=True,reward_param_scaling=1.0, reward_param_thr = 70, reward_param_type='const'): print('Start training for env: '+env_id) #change to your dir of choice for saving save_path = os.getcwd() print('Save data at '+save_path+'. Change to your desired path.') dump_name = 'sav_ddpg_'+env_id+'.reward_'+reward_param_type+'_'+str(reward_param_scaling)+'.pkl' append_num = 0 while os.path.exists(os.path.join(save_path,dump_name)): dump_name = 'sav_ddpg_'+env_id+'.reward_'+reward_param_type+'_'+str(reward_param_scaling)+'.'+str(append_num)+'.pkl' append_num+=1 rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. max_action = env.action_space.high logger.info('scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver() else: saver = None step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) with U.single_threaded_session() as sess: # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_com_sav = [] epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): # collect data for saving plot save_data = {'act': [], 'obs': [], 'qpos':[], 'rew':[], # reward for this episode 'freq_com':[], # communication frequency 'act_ts': [], 'obs_ts': [], 'qpos_ts': [], 'rew_ts': [], # reward for this episode 'freq_com_ts': [], # communication frequency 'comm_r_factor':reward_param_scaling, 'eplen_ts':[] # len of test episodes } # decay the exploration e_greed = 0.5 - 0.1 * np.log10( (t%10000) + 1) explore_switch = (t < 20000 and eg_explore and e_greed > 0) print('total steps: '+str(t)+', eps greedy rate: '+str(e_greed)+', explore is '+str(explore_switch)) for cycle in range(nb_epoch_cycles): # Perform rollouts. # init u_old, don't forget to change test also u_old = 1.0 * env.action_space.sample() / max_action num_no_com = 0 for t_rollout in range(nb_rollout_steps): # Predict next action. 
# edit this to be param version a_raw, q = agent.pi(np.concatenate([obs,u_old],axis=0), apply_noise=True, compute_Q=True) a0 = a_raw[0] a1 = a_raw[1] # eps greedy, flip the coin # make eps decay first 10k updates dice_greed = np.random.uniform() if explore_switch and dice_greed < e_greed: com = ( np.random.uniform() > 0.5 ) else: com = (a0 > a1) # action according to com switch if com: r_com = 0.0 action = np.copy(a_raw[2:]) #motor cmd else: if reward_param_type=='const': r_com = 1. # const reward elif reward_param_type=='linear': r_com = (1.0 / (nb_rollout_steps - reward_param_thr)) * (nb_rollout_steps - num_no_com) # linear interp reward elif reward_param_type=='inv': r_com = 1.0 / (1.0 + (np.maximum(num_no_com - reward_param_thr, 0))) # inv decay reward else: print('no such reward type!') assert 1==0 r_com = reward_param_scaling * r_com action = np.copy(u_old) num_no_com += 1 assert action.shape == env.action_space.shape assert max_action.shape == action.shape new_obs, r, done, info = env.step(max_action * action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if rank == 0 and render: pass # env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(a_raw) epoch_qs.append(q) agent.store_transition(np.concatenate([obs,u_old],axis=0), a_raw, r+r_com, np.concatenate([np.squeeze(new_obs), action],axis=0) , done) obs = np.squeeze(new_obs) save_data['act'].append(np.array(action)) save_data['obs'].append(np.array(obs)) if hasattr(env.unwrapped, 'data'): save_data['qpos'].append(np.array(env.unwrapped.data.qpos)) u_old = np.copy(action) if done: # Episode done. epoch_com_sav.append(np.asarray(1.0*num_no_com/episode_step)) epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs = env.reset() print('communication savings: ' + str(num_no_com)) # check com number # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_episode_reward = 0. for t_rollout in range(nb_eval_steps): eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append(eval_episode_reward) eval_episode_reward = 0. 
mpi_size = MPI.COMM_WORLD.Get_size() # log stuff save_data['rew'].append(np.mean(epoch_episode_rewards)) save_data['freq_com'].append(np.mean(epoch_com_sav)) duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean(episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = eval_episode_rewards combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s'%x) combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()])) combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)} # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f) ###=============================================== # test the fully-trained agent env = env.unwrapped print('*Final testing*') n_test = 1 n_ts_rollout = 500 # obs = env.env.reset() for i_test in range(n_test): if i_test%50==0: print('test iteration: '+str(i_test)) obs = env.reset() # take some actions # start with small during test time u_old = 0 * env.action_space.sample() / max_action num_no_com = 0 ts_step = 0 ts_reward = 0 for i_test_rollout in range(n_ts_rollout): # Predict next action. # edit this to be param version a_raw, q = agent.pi(np.concatenate([obs,u_old],axis=0), apply_noise=False, compute_Q=True) a0 = a_raw[0] a1 = a_raw[1] com = (a0 > a1) # action according to com switch if com: action = np.copy(a_raw[2:]) else: action = np.copy(u_old) num_no_com += 1 assert action.shape == env.action_space.shape new_obs, r, done, info = env.step(max_action * action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) # print('Done: '+str(done)) ts_reward += r # do i really need to change this? change back to r only ts_step += 1 # record trajectory # save_data['rew'].append(np.array(r)) # need to change here, what's a good performance measure? 
save_data['act_ts'].append(max_action *action) # record the actual u save_data['obs_ts'].append(np.array(obs)) u_old = np.copy(action) obs = np.copy(new_obs) # update obs # # store episode rew as performance measure # save_data['eplen_ts'].append(np.array(i_test_rollout+1)) # save_data['rew_ts'].append(np.array(ts_reward)) # save_data['freq_com_ts'].append(np.array(1.0*num_no_com/(i_test_rollout+1))) agent.reset() # doesn't matter if not stochastic # plot the trajectory ### states xs = np.asarray(save_data['obs_ts']) ths = np.arctan2(xs[:, 1], xs[:, 0]) ### control us = np.asarray(save_data['act_ts']) id_seg = 0 horz_plt = 500 plt.figure(figsize=[15, 20]) plt.subplot(211) plt.plot(ths[id_seg * horz_plt:(id_seg + 1) * horz_plt], label='th') plt.plot(xs[:, 2][id_seg * horz_plt:(id_seg + 1) * horz_plt], color='g', label='th_dot') plt.legend() plt.title('state plot') plt.subplot(212) plt.plot(us[id_seg * horz_plt:(id_seg + 1) * horz_plt], color='r') plt.title('control plot') plt.show()
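# The communication reward used in the rollout of this variant, factored into a small function
# for reference (a sketch; the names nb_rollout_steps, reward_param_thr and
# reward_param_scaling are the ones used in that loop).
import numpy as np

def communication_reward(reward_param_type, num_no_com, nb_rollout_steps,
                         reward_param_thr, reward_param_scaling):
    if reward_param_type == 'const':
        r_com = 1.0  # constant penalty/bonus whenever the old action is reused
    elif reward_param_type == 'linear':
        r_com = (1.0 / (nb_rollout_steps - reward_param_thr)) * (nb_rollout_steps - num_no_com)
    elif reward_param_type == 'inv':
        r_com = 1.0 / (1.0 + np.maximum(num_no_com - reward_param_thr, 0))
    else:
        raise ValueError('no such reward type: {}'.format(reward_param_type))
    return reward_param_scaling * r_com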
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50, perform=False, expert=None, save_networks=False): rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high ).all() # we assume symmetric actions. max_action = env.action_space.high logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale, expert=expert, save_networks=save_networks) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver() else: saver = None step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) with U.single_threaded_session() as sess: # Prepare everything. network_saving_dir = os.path.join('./saved_networks', env.env.spec.id) + '/' if not os.path.exists(network_saving_dir): os.makedirs(network_saving_dir) agent.initialize(sess, saver, network_saving_dir, 10000, 30000) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 small_buffer = [] big_buffer = [] for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): if not perform: # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape new_obs, r, done, info = env.step( max_action * action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: # Episode done. epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs = env.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. 
if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_episode_reward = 0. for t_rollout in range(nb_eval_steps): old_eval_obs = eval_obs eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step( max_action * eval_action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if perform: small_buffer.append([ old_eval_obs, eval_action, eval_r, eval_obs, eval_done ]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append( eval_episode_reward) eval_episode_reward = 0. if perform and len(small_buffer) > 0: big_buffer.append(small_buffer) small_buffer = [] if len(big_buffer ) > 0 and len(big_buffer) % 1000 == 0: expert_dir = os.path.join( './expert', env.env.spec.id) + '/' if not os.path.exists(expert_dir): os.makedirs(expert_dir) pwritefile = open( os.path.join(expert_dir, 'expert.pkl'), 'wb') pickle.dump(big_buffer, pwritefile, -1) pwritefile.close() logger.info('Expert data saved!') return # Log stats. epoch_train_duration = time.time() - epoch_start_time duration = time.time() - start_time combined_stats = {} if not perform: stats = agent.get_stats() for key in sorted(stats.keys()): combined_stats[key] = mpi_mean(stats[key]) # Rollout statistics. if not perform: combined_stats['rollout/return'] = mpi_mean( epoch_episode_rewards) combined_stats['rollout/return_history'] = mpi_mean( np.mean(episode_rewards_history)) combined_stats['rollout/episode_steps'] = mpi_mean( epoch_episode_steps) combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes) combined_stats['rollout/actions_mean'] = mpi_mean( epoch_actions) combined_stats['rollout/actions_std'] = mpi_std(epoch_actions) combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs) # Train statistics. combined_stats['train/loss_actor'] = mpi_mean( epoch_actor_losses) combined_stats['train/loss_critic'] = mpi_mean( epoch_critic_losses) combined_stats['train/param_noise_distance'] = mpi_mean( epoch_adaptive_distances) # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = mpi_mean(eval_episode_rewards) combined_stats['eval/return_history'] = mpi_mean( np.mean(eval_episode_rewards_history)) combined_stats['eval/Q'] = mpi_mean(eval_qs) combined_stats['eval/episodes'] = mpi_mean( len(eval_episode_rewards)) if not perform: # Total statistics. combined_stats['total/duration'] = mpi_mean(duration) combined_stats['total/steps_per_second'] = mpi_mean( float(t) / float(duration)) combined_stats['total/episodes'] = mpi_mean(episodes) combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f)
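# A sketch (hypothetical helper, not part of the original) of reading back the expert.pkl
# written above. Each entry of the outer list is one evaluation episode, and each transition
# is [obs, action, reward, next_obs, done], matching the append order in the perform branch.
import os
import pickle

def load_expert_episodes(expert_dir):
    with open(os.path.join(expert_dir, 'expert.pkl'), 'rb') as f:
        episodes = pickle.load(f)
    transitions = [step for episode in episodes for step in episode]
    return episodes, transitions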
class DDPGEnvLearner(EnvLearner): def __init__(self, env_in): EnvLearner.__init__(self, env_in) # from baselines.ddpg.models import Actor, Critic # Parse noise_type action_noise = None param_noise = None noise_type = 'adaptive-param_0.2' layer_norm = True nb_actions = self.state_dim for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise = AdaptiveParamNoiseSpec( initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise( mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError( 'unknown noise type "{}"'.format(current_noise_type)) # Configure components. self.buff_len = 10 self.buffer = deque(self.buff_init * self.buff_len, maxlen=self.buff_len) obs_space = (self.buff_init[0].size * self.buff_len, ) self.memory = Memory(limit=int(1e6), action_shape=env_in.observation_space.shape, observation_shape=obs_space) self.critic = models.Critic(layer_norm=layer_norm) self.actor = models.Actor(nb_actions, layer_norm=layer_norm) self.agent = DDPG(self.actor, self.critic, self.memory, obs_space, env_in.observation_space.shape, gamma=0.99, tau=0.01, normalize_returns=False, normalize_observations=True, batch_size=64, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=1e-2, actor_lr=1e-5, critic_lr=1e-5, enable_popart=False, clip_norm=None, reward_scale=1.) def initialize(self, session, load=False): self.sess = session if not load: self.sess.run(tf.global_variables_initializer()) self.agent.initialize(self.sess) def train(self, train, total_steps, valid=None, log_interval=10, early_stopping=-1, saver=None, save_str=None): G, yS, yR, yD, X, S, A = self.__prep_data__(train, batch_size=0) X = X[0] S = S[0] self.agent.reset() # max_action = self.env.action_space.high batch_size = 64 t = 0 episode_reward = 0 episode_step = 0 episodes = 0 epoch_episodes = 0 epoch_episode_rewards = [] nb_epoch_cycles = 10 nb_rollout_steps = 100 nb_epochs = int(len(train) / (nb_epoch_cycles * nb_rollout_steps)) nb_train_steps = total_steps param_noise_adaption_interval = 50 i = 0 for epoch in range(nb_epochs): start_time = time.time() for cycle in range(nb_epoch_cycles): # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. # (obs_in, action_in, _, new_obs_in, done, episode_step) = train[i] # obs = np.array([np.concatenate([obs_in/self.state_mul_const, # action_in/self.act_mul_const])]).flatten() obs = X[i] done = train[i][4] action, q = self.agent.pi(obs, apply_noise=True, compute_Q=True) r = -np.linalg.norm(S[i] / self.state_mul_const - action) / action.shape[0] # if not done and i < len(train): # new_obs = np.array([np.concatenate([new_obs_in / self.state_mul_const, # train[i][1] / self.act_mul_const])]).flatten() # else: # new_obs = np.array([np.concatenate([new_obs_in / self.state_mul_const, # np.zeros_like(action_in)])]).flatten() if i < len(train): new_obs = X[i + 1] else: new_obs = np.zeros_like(X[i]) t += 1 i += 1 episode_reward += r episode_step += 1 # Book-keeping. self.agent.store_transition(obs, action, r, new_obs, done) if done: # Episode done. 
epoch_episode_rewards.append(episode_reward) episode_reward = 0. epoch_episodes += 1 episodes += 1 self.agent.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if self.memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = self.agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = self.agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) self.agent.update_target_net() print('Epoch ' + str(epoch) + '/' + str(nb_epochs) + ' with avg rew of: ' + str(sum(epoch_episode_rewards) / len(epoch_episode_rewards)) + ' in ' + str(time.time() - start_time) + 's') if epoch % log_interval == 0 and epoch > 0: if saver is not None and save_str is not None: save_path = saver.save(self.sess, 'models/' + str(save_str) + '.ckpt') print("Model saved in path: %s" % save_path) if saver is not None and save_str is not None: save_path = saver.save(self.sess, 'models/' + str(save_str) + '.ckpt') print("Model saved in path: %s" % save_path) def step(self, obs_in, action_in, episode_step, save=True, buff=None): import copy obs = obs_in / self.state_mul_const action = action_in / self.act_mul_const if save: if episode_step == 0: self.buffer = deque(self.buff_init * self.buff_len, maxlen=self.buff_len) self.buffer.append( np.array([np.concatenate([obs, action])]).flatten()) else: if buff is None: buff = copy.copy(self.buffer) if episode_step == 0: buff = deque(self.buff_init * self.buff_len, maxlen=self.buff_len) buff.append(np.array([np.concatenate([obs, action])]).flatten()) if buff is not None: x = np.array([np.concatenate(buff).flatten()])[0] else: x = np.array([np.concatenate(self.buffer).flatten()])[0] new_obs, _ = self.agent.pi(x, apply_noise=True, compute_Q=True) return new_obs
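# A standalone sketch of the observation/action history pattern used by DDPGEnvLearner above:
# keep the last buff_len concatenated (obs, action) vectors in a bounded deque and flatten them
# into one model input. The dimensions below are illustrative assumptions.
from collections import deque
import numpy as np

buff_len = 10
obs_dim, act_dim = 4, 2
init_frame = np.zeros(obs_dim + act_dim)
history = deque([init_frame] * buff_len, maxlen=buff_len)

def push_and_flatten(history, obs, action):
    # Append the newest (obs, action) pair and return the flattened window of length
    # buff_len * (obs_dim + act_dim), the shape the learned model consumes.
    history.append(np.concatenate([obs, action]))
    return np.concatenate(history)

x = push_and_flatten(history, np.zeros(obs_dim), np.zeros(act_dim))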
def train(env_id, env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50, my_render=True, eg_explore=True,reward_param_scaling=1.0, reward_param_thr = 70, reward_param_type='const'): #save data #################################### full_path = txt_path + '_etc_RL.txt' file = open(full_path,'w') print('Start training for env: '+env_id) #change to your dir of choice for saving save_path = os.getcwd() print('Save data at '+save_path+'. Change to your desired path.') dump_name = 'sav_ddpg_'+env_id+'.reward_'+reward_param_type+'_'+str(reward_param_scaling)+'.pkl' append_num = 0 while os.path.exists(os.path.join(save_path,dump_name)): dump_name = 'sav_ddpg_'+env_id+'.reward_'+reward_param_type+'_'+str(reward_param_scaling)+'.'+str(append_num)+'.pkl' append_num+=1 rank = MPI.COMM_WORLD.Get_rank() print('second rank is ',rank) assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions.##############danny max_action = env.action_space.high logger.info('scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver(max_to_keep = 1) else: saver = None step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) with U.single_threaded_session() as sess: # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_com_sav = [] epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 count_fwj = 0 # used to count how many step() is played for epoch in range(nb_epochs): print(nb_epochs) # collect data for saving plot save_data = {'act': [], 'obs': [], 'qpos':[], 'rew':[], # reward for this episode 'freq_com':[], # communication frequency 'act_ts': [], 'obs_ts': [], 'qpos_ts': [], 'rew_ts': [], # reward for this episode 'freq_com_ts': [], # communication frequency 'comm_r_factor':reward_param_scaling, 'eplen_ts':[] # len of test episodes } # decay the exploration e_greed = 0.5 - 0.1 * np.log10( (t%10000) + 1) explore_switch = (t < 20000 and eg_explore and e_greed > 0) print('total steps: '+str(t)+', eps greedy rate: '+str(e_greed)+', explore is '+str(explore_switch)) for cycle in range(nb_epoch_cycles): # Perform rollouts. 
# init u_old, don't forget to change test also u_old = 1.0 * env.action_space.sample() / max_action num_no_com = 0 for t_rollout in range(nb_rollout_steps): count_fwj+=1 print ('test played is ###################',count_fwj) # Predict next action. # edit this to be param version if len(obs) is not 6: obs.append(obs[2]-obs[0]) obs.append(obs[3]-obs[1]) a_raw, q = agent.pi(np.concatenate([obs,u_old],axis=0), apply_noise=False, compute_Q=True) else: #a_1 = np.zeros() a_raw, q = agent.pi(np.concatenate([obs,u_old],axis=0), apply_noise=False, compute_Q=True) print('value of q is',q) if count_fwj % 50000 == 0: saver.save(sess,'./home/test_3_ar.ckpt',global_step =1) a0 = a_raw[0] a1 = a_raw[1] # eps greedy, flip the coin # make eps decay first 10k updates dice_greed = np.random.uniform() if explore_switch and dice_greed < e_greed: com = ( np.random.uniform() > 0.5 ) else: com = (a0 > a1) # action according to com switch if com: r_com = 0.0 action = np.copy(a_raw[2:]) #No communication num_no_com += 1 #No communication else: if reward_param_type=='const': r_com = 1. # const reward elif reward_param_type=='linear': r_com = (1.0 / (nb_rollout_steps - reward_param_thr)) * (nb_rollout_steps - num_no_com) # linear interp reward elif reward_param_type=='inv': r_com = 1.0 / (1.0 + (np.maximum(num_no_com - reward_param_thr, 0))) # inv decay reward else: print('no such reward type!') assert 1==0 r_com = reward_param_scaling * r_com #action = np.copy(u_old) action = np.copy(a_raw[2:]) num_no_com += 1 assert action.shape == env.action_space.shape assert max_action.shape == action.shape new_obs, r, done, info = env.step(action) print(done) file.write(str(new_obs)+',q_value_is,'+str(q)+',step_reward,'+str(r)+',action used,' + str(max_action*action)+'\n') t += 1 if rank == 0 and render: pass episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(a_raw) epoch_qs.append(q) agent.store_transition(np.concatenate([obs,u_old],axis=0), a_raw, r+r_com, np.concatenate([np.squeeze(new_obs), action],axis=0) , done) obs = np.squeeze(new_obs) save_data['act'].append(np.array(action)) save_data['obs'].append(np.array(obs)) if hasattr(env.unwrapped, 'data'): save_data['qpos'].append(np.array(env.unwrapped.data.qpos)) u_old = np.copy(action) if done: # Episode done. epoch_com_sav.append(np.asarray(1.0*num_no_com/episode_step)) epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 print('one game finished, count is =================================',env.count) file.write('count is,'+str(env.count)) file.write('done is,' + str(done)) file.write('long term reward is,' + str(env.long_term_reward)) file.write('#'*12+'one game finished\n') agent.reset() obs = env.reset() #end of loop nb_rollout print('communication savings: ' + str(num_no_com)) # check com number # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_episode_reward = 0. 
for t_rollout in range(nb_eval_steps): eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append(eval_episode_reward) eval_episode_reward = 0. mpi_size = MPI.COMM_WORLD.Get_size() # log stuff save_data['rew'].append(np.mean(epoch_episode_rewards)) save_data['freq_com'].append(np.mean(epoch_com_sav)) duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean(episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = eval_episode_rewards combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s'%x) combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()])) combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)} # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f) ###=============================================== # test the fully-trained agent env = env.unwrapped
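# The observation padding near the top of this variant compares "len(obs) is not 6", which
# tests identity against an int rather than inequality; a sketch of the same padding written
# with an ordinary !=. The layout (four base components extended with two relative offsets) is
# taken from that code and is otherwise an assumption.
import numpy as np

def pad_relative_offsets(obs):
    obs = list(obs)
    if len(obs) != 6:
        obs.append(obs[2] - obs[0])
        obs.append(obs[3] - obs[1])
    return np.asarray(obs)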
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, overwrite_memory, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, logdir, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50, agentName=None, resume=0): rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high ).all() # we assume symmetric actions. max_action = env.action_space.high logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver() else: saver = None step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) logF = open(os.path.join(logdir, 'log.txt'), 'a') logStats = open(os.path.join(logdir, 'log_stats.txt'), 'a') with U.single_threaded_session() as sess: # Prepare everything. if (resume == 0): agent.initialize(sess) else: #restore = "{}-{}".format(agentName,resume) agent.initialize(sess, path=os.path.abspath(logdir), restore=agentName, itr=resume, overwrite=overwrite_memory) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape new_obs, r, done, info = env.step( max_action * action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: # Episode done. print("Epoch " + str(epoch) + " episodes " + str(episodes) + " steps " + str(episode_step) + " reward " + str(episode_reward)) epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs = env.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. 
if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_episode_reward = 0. for t_rollout in range(nb_eval_steps): eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step( max_action * eval_action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: print("Eval reward " + str(eval_episode_reward)) eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append( eval_episode_reward) eval_episode_reward = 0. # Log stats. epoch_train_duration = time.time() - epoch_start_time duration = time.time() - start_time stats = agent.get_stats() combined_stats = {} for key in sorted(stats.keys()): combined_stats[key] = mpi_mean(stats[key]) # Rollout statistics. combined_stats['rollout/return'] = mpi_mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = mpi_mean( np.mean(episode_rewards_history)) combined_stats['rollout/episode_steps'] = mpi_mean( epoch_episode_steps) combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes) combined_stats['rollout/actions_mean'] = mpi_mean(epoch_actions) combined_stats['rollout/actions_std'] = mpi_std(epoch_actions) combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs) # Train statistics. combined_stats['train/loss_actor'] = mpi_mean(epoch_actor_losses) combined_stats['train/loss_critic'] = mpi_mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = mpi_mean( epoch_adaptive_distances) # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = mpi_mean(eval_episode_rewards) combined_stats['eval/return_history'] = mpi_mean( np.mean(eval_episode_rewards_history)) combined_stats['eval/Q'] = mpi_mean(eval_qs) combined_stats['eval/episodes'] = mpi_mean( len(eval_episode_rewards)) # Total statistics. combined_stats['total/duration'] = mpi_mean(duration) combined_stats['total/steps_per_second'] = mpi_mean( float(t) / float(duration)) combined_stats['total/episodes'] = mpi_mean(episodes) combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') # logdir = logger.get_dir() if rank == 0: logF.write(str(combined_stats["rollout/return"]) + "\n") json.dump(combined_stats, logStats) logF.flush() logStats.flush() # agent.save(path = os.path.abspath(logdir), name = agentName, overwrite = overwrite_memory) agent.save(path=logdir, name=agentName, overwrite=overwrite_memory) logger.info("agent {} saved".format(agent.itr.eval())) if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f)
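# json.dump above is handed values coming from mpi_mean; if those are numpy scalars rather than
# plain Python floats, the standard json encoder rejects them. A small sketch (an assumption
# about the helpers' return types) that coerces the dict first and writes one JSON object per
# line to the stats file.
import json
import numpy as np

def dump_stats_line(stats, fh):
    plain = {k: (float(v) if isinstance(v, (np.floating, np.integer, np.ndarray)) else v)
             for k, v in stats.items()}
    fh.write(json.dumps(plain) + "\n")
    fh.flush()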
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50, callback=None, pretrained='none'): rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high ).all() # we assume symmetric actions. max_action = env.action_space.high logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Copy an env for evaluation env_eval = copy.deepcopy(env.env) # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver() else: saver = None step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) with U.single_threaded_session() as sess: # Prepare everything. agent.initialize(sess) sess.graph.finalize() # load pretrained agent if possible if pretrained == 'none': logger.info('Training from scratch...') else: logger.info('Loading pretrained model from {}'.format(pretrained)) #assert os.path.exists(pretrained) saver.restore(sess, pretrained) agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 total_time = 0 start_time = time.time() total_time_record = [] epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() #epochxposdict = [] epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape new_obs, r, done, info = env.step( max_action * action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 total_time += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: # Episode done. epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) total_time_record.append(total_time) #epochxposdict.append(info['pos'][0]) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs = env.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. 
if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: # eval for one episode eval_episode_reward = 0.0 eval_done = False eval_obs = eval_env.reset() while not eval_done: eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step( max_action * eval_action) eval_episode_reward += eval_r eval_qs.append(eval_q) eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append(eval_episode_reward) """ eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_episode_reward = 0. for t_rollout in range(nb_eval_steps): eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append(eval_episode_reward) eval_episode_reward = 0. """ # Log stats. epoch_train_duration = time.time() - epoch_start_time duration = time.time() - start_time stats = agent.get_stats() combined_stats = {} for key in sorted(stats.keys()): combined_stats[key] = mpi_mean(stats[key]) # Rollout statistics. combined_stats['rollout/return'] = mpi_mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = mpi_mean( np.mean(episode_rewards_history)) combined_stats['rollout/episode_steps'] = mpi_mean( epoch_episode_steps) combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes) combined_stats['rollout/actions_mean'] = mpi_mean(epoch_actions) combined_stats['rollout/actions_std'] = mpi_std(epoch_actions) combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs) # Train statistics. combined_stats['train/loss_actor'] = mpi_mean(epoch_actor_losses) combined_stats['train/loss_critic'] = mpi_mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = mpi_mean( epoch_adaptive_distances) # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = mpi_mean(eval_episode_rewards) combined_stats['eval/return_history'] = mpi_mean( np.mean(eval_episode_rewards_history)) combined_stats['eval/Q'] = mpi_mean(eval_qs) combined_stats['eval/episodes'] = mpi_mean( len(eval_episode_rewards)) # Total statistics. 
combined_stats['total/duration'] = mpi_mean(duration) combined_stats['total/steps_per_second'] = mpi_mean( float(t) / float(duration)) combined_stats['total/episodes'] = mpi_mean(episodes) combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f) # Call the callback if callback is not None: if callback(locals(), globals()): # callback returns a boolean value break # Evaluate the policy on env to record trajs eval_rewards, eval_steps, trajs_obs, trajs_actions = evaluate( env_eval, agent=agent) if callback is not None: callback.final_call(locals(), globals())
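# --- Note: `evaluate(env_eval, agent=agent)` above is defined elsewhere in the original
# repository. The sketch below is a minimal, assumed version of that interface: deterministic
# rollouts (no exploration noise) that also record the visited observations and actions.
# The default episode count is an assumption; the return order matches the call site above.
import numpy as np

def evaluate(env, agent, nb_episodes=10):
    """Roll out the deterministic policy and record per-episode trajectories."""
    max_action = env.action_space.high
    rewards, steps, trajs_obs, trajs_actions = [], [], [], []
    for _ in range(nb_episodes):
        obs, done = env.reset(), False
        episode_reward, episode_steps = 0., 0
        ep_obs, ep_actions = [], []
        while not done:
            # No exploration noise at evaluation time.
            action, _ = agent.pi(obs, apply_noise=False, compute_Q=False)
            ep_obs.append(obs)
            ep_actions.append(action)
            obs, r, done, _ = env.step(max_action * action)
            episode_reward += r
            episode_steps += 1
        rewards.append(episode_reward)
        steps.append(episode_steps)
        trajs_obs.append(np.asarray(ep_obs))
        trajs_actions.append(np.asarray(ep_actions))
    return rewards, steps, trajs_obs, trajs_actions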
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50): rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. max_action = env.action_space.high logger.info('scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver() else: saver = None step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) with U.single_threaded_session() as sess: # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape new_obs, r, done, info = env.step(max_action * action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: # Episode done. epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs = env.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_episode_reward = 0. 
for t_rollout in range(nb_eval_steps): eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append(eval_episode_reward) eval_episode_reward = 0. mpi_size = MPI.COMM_WORLD.Get_size() # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean(episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) # Evaluation statistics. Reduce the eval lists to scalars here so as_scalar() below does not fail on list inputs. if eval_env is not None: combined_stats['eval/return'] = np.mean(eval_episode_rewards) combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history) combined_stats['eval/Q'] = np.mean(eval_qs) combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s'%x) combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()])) combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)} # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f)
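# --- Note: U.single_threaded_session(), used by every variant above, comes from
# baselines.common.tf_util. The sketch below is an assumed re-implementation of the idea
# (a TF1-style session limited to one op-parallelism thread so several MPI workers on the
# same machine do not oversubscribe the CPU), not the library source.
import tensorflow as tf

def single_threaded_session():
    """A TF1 session restricted to a single intra-op and inter-op thread."""
    config = tf.ConfigProto(
        inter_op_parallelism_threads=1,
        intra_op_parallelism_threads=1)
    return tf.Session(config=config)

# Used as a context manager, mirroring the training loops above:
#     with single_threaded_session() as sess:
#         agent.initialize(sess)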
def train(env, num_timesteps, nb_trials, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, test_interval, batch_size, memory, output, load_file, save=False, tau=0.01, evaluation=False, param_noise_adaption_interval=50): rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high ).all() # we assume symmetric actions. observation_range = [env.observation_space.low, env.observation_space.high] max_action = env.action_space.high logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale, observation_range=observation_range) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver() else: saver = None trial_return_history = deque(maxlen=100) eval_trial_return_history = deque(maxlen=100) with U.single_threaded_session() as sess: # Prepare everything. agent.initialize(sess) sess.graph.finalize() #dir_path = os.path.dirname(os.path.realpath(__file__)) #tf.summary.FileWriter(dir_path, sess.graph) trial = 0 ts = 0 if load_file != '': saver.restore(sess, load_file) start_time = time.time() trial_returns = [] trial_steps = [] actions = [] qs = [] train_actor_losses = [] train_critic_losses = [] train_adaptive_distances = [] while True: test = (test_interval >= 0 and trial % (test_interval + 1) == test_interval) if not test: # Perform rollout. env.set_test(test=False) obs = env.reset() agent.reset() done = 0 trial_return = 0. trial_step = 0 while done == 0: # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape new_obs, r, done, info = env.step( max_action * action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) ts += 1 if rank == 0 and render: env.render() trial_return += r trial_step += 1 # Book-keeping. actions.append(action) qs.append(q) agent.store_transition( obs, action, r, new_obs, done == 2) # terminal indicator is 2 obs = new_obs # Train. if memory.nb_entries >= batch_size: for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if trial % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() train_adaptive_distances.append(distance) cl, al = agent.train() train_critic_losses.append(cl) train_actor_losses.append(al) agent.update_target_net() # Episode done. trial_steps.append(trial_step) trial_returns.append(trial_return) trial_return_history.append(trial_return) else: # Evaluate. eval_trial_return = 0. 
eval_trial_steps = 0 if evaluation is not None: env.set_test(test=True) eval_obs = env.reset() agent.reset() eval_done = 0 while eval_done == 0: eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = env.step(max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: env.render() eval_trial_return += eval_r eval_trial_steps += 1 # Episode done. eval_trial_return_history.append(eval_trial_return) # Log stats. duration = time.time() - start_time combined_stats = {} if memory.nb_entries > 0: # Print only if learning was happening stats = agent.get_stats() for key in sorted(stats.keys()): combined_stats[key] = mpi_mean(stats[key]) # Rollout statistics. combined_stats['rollout/Q_mean'] = mpi_mean(qs) combined_stats['rollout/actions_mean'] = mpi_mean(actions) combined_stats['rollout/actions_std'] = mpi_std(actions) combined_stats['rollout/trial_steps'] = mpi_mean(trial_steps) combined_stats['rollout/return'] = mpi_mean(trial_returns) combined_stats['rollout/return_history'] = mpi_mean(trial_return_history) # Train statistics. combined_stats['train/loss_actor'] = mpi_mean(train_actor_losses) combined_stats['train/loss_critic'] = mpi_mean(train_critic_losses) combined_stats['train/param_noise_distance'] = mpi_mean(train_adaptive_distances) # Evaluation statistics. if evaluation is not None: combined_stats['eval/Q'] = mpi_mean(eval_q) combined_stats['eval/return'] = eval_trial_return combined_stats['eval/return_history'] = mpi_mean(eval_trial_return_history) combined_stats['eval/steps'] = eval_trial_steps # Total statistics. combined_stats['total/duration'] = mpi_mean(duration) combined_stats['total/steps_per_second'] = mpi_mean(float(ts) / float(duration)) combined_stats['total/trials'] = trial combined_stats['total/steps'] = ts for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if evaluation and hasattr(env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) # Reset statistics. trial_returns = [] trial_steps = [] actions = [] qs = [] train_actor_losses = [] train_critic_losses = [] train_adaptive_distances = [] # End of evaluate and log statistics # Check if this is the last trial trial += 1 if nb_trials and trial >= nb_trials: break if num_timesteps and ts >= num_timesteps: break # Saving policy and value function if save and saver and output != '': saver.save(sess, './%s' % output)
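# --- Note: the last variant interleaves evaluation with training through
# `test = (test_interval >= 0 and trial % (test_interval + 1) == test_interval)`.
# A small standalone check of that schedule (illustrative only, not part of the training code):
def is_test_trial(trial, test_interval):
    """True on every (test_interval + 1)-th trial; never True if test_interval < 0."""
    return test_interval >= 0 and trial % (test_interval + 1) == test_interval

# With test_interval=3, trials 0-2 collect experience and train, trial 3 runs a
# deterministic evaluation, and the pattern then repeats.
assert [t for t in range(8) if is_test_trial(t, 3)] == [3, 7]
# With a negative test_interval, evaluation trials are disabled entirely.
assert not any(is_test_trial(t, -1) for t in range(100))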