def train(env_id, num_frames, seed):
    from baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy
    from baselines.trpo_mpi import trpo_mpi
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        return CnnPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space)

    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    env = wrap_train(env)
    num_timesteps = int(num_frames / 4 * 1.1)
    env.seed(workerseed)

    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=512, max_kl=0.001, cg_iters=10,
                   cg_damping=1e-3, max_timesteps=num_timesteps, gamma=0.98, lam=1.0,
                   vf_iters=3, vf_stepsize=1e-4, entcoeff=0.00)
    env.close()
def train(env_id, num_frames, seed):
    from baselines.ppo1 import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)

    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    env = wrap_train(env)
    num_timesteps = int(num_frames / 4 * 1.1)
    env.seed(workerseed)

    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=256,
                        clip_param=0.2, entcoeff=0.01,
                        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
                        gamma=0.99, lam=0.95,
                        schedule='linear')
    env.close()
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])

    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() if seed is not None else None
    set_global_seeds(workerseed)
    env = make_atari(env_id)

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)

    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    env = wrap_deepmind(env)
    env.seed(workerseed)

    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=int(num_timesteps * 1.1),
                        timesteps_per_actorbatch=256,
                        clip_param=0.2, entcoeff=0.01,
                        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
                        gamma=0.99, lam=0.95,
                        schedule='linear')
    env.close()
def test_function():
    with tf.Graph().as_default():
        x = tf.placeholder(tf.int32, (), name="x")
        y = tf.placeholder(tf.int32, (), name="y")
        z = 3 * x + 2 * y
        lin = function([x, y], z, givens={y: 0})

        with single_threaded_session():
            initialize()
            assert lin(2) == 6
            assert lin(2, 2) == 10
def test_multikwargs():
    with tf.Graph().as_default():
        x = tf.placeholder(tf.int32, (), name="x")
        with tf.variable_scope("other"):
            x2 = tf.placeholder(tf.int32, (), name="x")
        z = 3 * x + 2 * x2
        lin = function([x, x2], z, givens={x2: 0})

        with single_threaded_session():
            initialize()
            assert lin(2) == 6
            assert lin(2, 2) == 10
def test_function():
    tf.reset_default_graph()
    x = tf.placeholder(tf.int32, (), name="x")
    y = tf.placeholder(tf.int32, (), name="y")
    z = 3 * x + 2 * y
    lin = function([x, y], z, givens={y: 0})

    with single_threaded_session():
        initialize()
        assert lin(2) == 6
        assert lin(x=3) == 9
        assert lin(2, 2) == 10
        assert lin(x=2, y=3) == 12
def test_set_value():
    a = tf.Variable(42.)
    with single_threaded_session():
        set_value(a, 5)
        assert a.eval() == 5
        g = tf.get_default_graph()
        g.finalize()
        set_value(a, 6)
        assert a.eval() == 6

        # test the test
        try:
            assert a.eval() == 7
        except AssertionError:
            pass
        else:
            assert False, "assertion should have failed"
def test_multikwargs():
    tf.reset_default_graph()
    x = tf.placeholder(tf.int32, (), name="x")
    with tf.variable_scope("other"):
        x2 = tf.placeholder(tf.int32, (), name="x")
    z = 3 * x + 2 * x2
    lin = function([x, x2], z, givens={x2: 0})

    with single_threaded_session():
        initialize()
        assert lin(2) == 6
        assert lin(2, 2) == 10
        expt_caught = False
        try:
            lin(x=2)
        except AssertionError:
            expt_caught = True
        assert expt_caught
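# The tests above exercise baselines.common.tf_util.function with `givens`
# supplying defaults for omitted inputs. As a complementary, hedged sketch
# (not taken from the test suite), tf_util.function also accepts an `updates`
# list of ops that run as a side effect of every call; this example assumes
# the same TF1-style API used in the tests.
def example_function_with_updates():
    import tensorflow as tf
    from baselines.common.tf_util import function, initialize, single_threaded_session

    tf.reset_default_graph()
    x = tf.placeholder(tf.float32, (), name="x")
    counter = tf.Variable(0, dtype=tf.int32, name="counter")
    inc = tf.assign_add(counter, 1)  # side-effect op executed on every call
    y = 3 * x

    step = function([x], y, updates=[inc])
    with single_threaded_session():
        initialize()
        assert step(2) == 6
        assert step(2) == 6
        assert counter.eval() == 2  # the update op ran once per call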
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    env = make_mujoco_env(env_id, workerseed)
    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10,
                   cg_damping=0.1, max_timesteps=num_timesteps, gamma=0.99, lam=0.98,
                   vf_iters=5, vf_stepsize=1e-3)
    env.close()
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space,
                         hid_size=32, num_hid_layers=2)

    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10,
                   cg_damping=0.1, max_timesteps=num_timesteps, gamma=0.99, lam=0.98,
                   vf_iters=5, vf_stepsize=1e-3)
    env.close()
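# Hypothetical driver, included for illustration only: the train() helpers above
# are normally invoked from a small argparse-based entry point roughly like the
# sketch below (the flag names and defaults here are assumptions, not the repo's).
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='Hopper-v2')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
    args = parser.parse_args()
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)


if __name__ == '__main__':
    main()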
def train(env, nb_epochs, nb_episodes, episode_length, nb_train_steps, eval_freq, nb_eval_episodes, actor, critic, memory, gamma, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, clip_norm, batch_size, reward_scale, tau=0.01): """ Parameters ---------- nb_epochs : the number of epochs to train. nb_episodes : the number of episodes for each epoch. episode_length : the maximum number of steps for each episode. gamma : discount factor. tau : soft update coefficient. clip_norm : clip on the norm of the gradient. """ # Initialize DDPG agent (target network and replay buffer) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=None, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) # We need max_action because the NN output layer is a tanh. # So we must scale it back. max_action = env.action_space.high with U.single_threaded_session() as sess: agent.initialize(sess) # Setup summary writer writer = _setup_tf_summary() writer.add_graph(sess.graph) stats = EvaluationStatistics(tf_session=sess, tf_writer=writer) sess.graph.finalize() global_step = 0 obs = env.reset() agent.reset() for epoch in range(nb_epochs): for episode in range(nb_episodes): obs = env.reset() # Generate a trajectory for t in range(episode_length): # Select action a_t according to current policy and # exploration noise a_t, _ = agent.pi(obs, apply_noise=True, compute_Q=False) assert a_t.shape == env.action_space.shape # Execute action a_t and observe reward r_t and next state s_{t+1} new_obs, r_t, done, info = env.step(max_action * a_t) # Store transition in the replay buffer agent.store_transition(obs, a_t, r_t, new_obs, done) obs = new_obs if done: agent.reset() obs = env.reset() break # End episode # Training phase for t_train in range(nb_train_steps): critic_loss, actor_loss = agent.train() agent.update_target_net() # Plot statistics stats.add_critic_loss(critic_loss, global_step) stats.add_actor_loss(actor_loss, global_step) global_step += 1 # Evaluation phase if episode % eval_freq == 0: # Generate evaluation trajectories for eval_episode in range(nb_eval_episodes): obs = env.reset() for t in range(episode_length): env.render() # Select action a_t according to current policy and # exploration noise a_t, _ = agent.pi(obs, apply_noise=False, compute_Q=False) assert a_t.shape == env.action_space.shape # Execute action a_t and observe reward r_t and next state s_{t+1} obs, r_t, eval_done, info = env.step(max_action * a_t) stats.add_reward(r_t) if eval_done: obs = env.reset() break # Plot average reward stats.plot_reward(global_step)
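# A minimal numpy sketch (not part of the DDPG code above) of the soft target
# update that the `tau` argument controls: target weights slowly track the
# online weights, theta_target <- tau * theta + (1 - tau) * theta_target.
import numpy as np

def soft_update(online_params, target_params, tau=0.01):
    return [tau * w + (1.0 - tau) * w_t
            for w, w_t in zip(online_params, target_params)]

# With tau=0.01 the target moves 1% of the way toward the online value per update:
# soft_update([np.array([1.0])], [np.array([0.0])]) -> [array([0.01])]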
def evaluate(env, nb_episodes, reward_scale, render, param_noise, action_noise, actor, critic, memory, critic_l2_reg, normalize_returns=False, normalize_observations=True, weight_file=None): rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high ).all() # we assume symmetric actions. max_action = env.action_space.high logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, normalize_returns=normalize_returns, normalize_observations=normalize_observations, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) with U.single_threaded_session() as sess: agent.initialize(sess) if weight_file: saver = tf.train.Saver(actor.trainable_vars + critic.trainable_vars) saver.restore(sess, weight_file) agent.actor_optimizer.sync() agent.critic_optimizer.sync() # sess.graph.finalize() agent.reset() obs = env.reset() total_reward = 0.0 max_steps = 2000 for ep in range(nb_episodes): i = 0 done = False episode_reward = 0.0 while not done and i < max_steps: action, q, all_actions, sample = agent.pi(obs, apply_noise=False, compute_Q=True) assert action.shape == env.action_space.shape assert max_action.shape == action.shape obs, r, done, info = env.step(max_action * action) episode_reward += r # env.render() # print('Action:{}, reward:{}'.format(action, r)) # time.sleep(0.1) i += 1 total_reward += episode_reward logger.info("Episode:{}, reward:{}, steps:{}".format( ep, episode_reward, i)) if done: obs = env.reset() logger.info("Average reward:{}, total reward:{}, episodes:{}".format( (total_reward / nb_episodes), total_reward, nb_episodes))
def launch(env_name, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return, temperature, prioritization, binding, version, dump_buffer, n_cycles, rank_method, w_potential, w_linear, w_rotational, clip_energy, override_params={}, save_policies=True): # Fork for multi-CPU MPI implementation. if num_cpu > 1: #whoami = mpi_fork(num_cpu, binding) whoami = mpi_fork(num_cpu) if whoami == 'parent': sys.exit(0) import baselines.common.tf_util as U U.single_threaded_session().__enter__() rank = MPI.COMM_WORLD.Get_rank() # Configure logging if rank == 0: if logdir or logger.get_dir() is None: logger.configure(dir=logdir) else: logger.configure() logdir = logger.get_dir() assert logdir is not None os.makedirs(logdir, exist_ok=True) """ if logging: logdir = 'logs/'+str(env_name)+'-temperature'+str(temperature)+\ '-prioritization'+str(prioritization)+'-replay_strategy'+str(replay_strategy)+\ '-n_epochs'+str(n_epochs)+'-num_cpu'+str(num_cpu)+'-seed'+str(seed)+\ '-n_cycles'+str(n_cycles)+'-rank_method'+str(rank_method)+\ '-w_potential'+str(w_potential)+'-w_linear'+str(w_linear)+'-w_rotational'+str(w_rotational)+\ '-clip_energy'+str(clip_energy)+\ '-version'+str(version) else: logdir = osp.join(tempfile.gettempdir(), datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f")) if rank == 0: if logdir or logger.get_dir() is None: logger.configure(dir=logdir) else: logger.configure() logdir = logger.get_dir() assert logdir is not None os.makedirs(logdir, exist_ok=True) """ # Seed everything. rank_seed = seed + 1000000 * rank set_global_seeds(rank_seed) # Prepare params. params = config.DEFAULT_PARAMS params['env_name'] = env_name params['replay_strategy'] = replay_strategy params['temperature'] = temperature params['prioritization'] = prioritization params['binding'] = binding params['max_timesteps'] = n_epochs * params['n_cycles'] * params[ 'n_batches'] * num_cpu params['version'] = version params['dump_buffer'] = dump_buffer params['n_cycles'] = n_cycles params['rank_method'] = rank_method params['w_potential'] = w_potential params['w_linear'] = w_linear params['w_rotational'] = w_rotational params['clip_energy'] = clip_energy params['n_epochs'] = n_epochs params['num_cpu'] = num_cpu if params['dump_buffer']: params['alpha'] = 0 if env_name in config.DEFAULT_ENV_PARAMS: params.update(config.DEFAULT_ENV_PARAMS[env_name] ) # merge env-specific parameters in params.update( **override_params) # makes it possible to override any parameter with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f: json.dump(params, f) params = config.prepare_params(params) config.log_params(params, logger=logger) dims = config.configure_dims(params) policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return) rollout_params = { 'exploit': False, 'use_target_net': False, 'use_demo_states': True, 'compute_Q': False, 'T': params['T'], } eval_params = { 'exploit': True, 'use_target_net': params['test_with_polyak'], 'use_demo_states': False, 'compute_Q': True, 'T': params['T'], } for name in [ 'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps' ]: rollout_params[name] = params[name] eval_params[name] = params[name] rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params) rollout_worker.seed(rank_seed) evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params) evaluator.seed(rank_seed) train(logdir=logdir, policy=policy, rollout_worker=rollout_worker, evaluator=evaluator, n_epochs=n_epochs, 
n_test_rollouts=params['n_test_rollouts'], n_cycles=params['n_cycles'], n_batches=params['n_batches'], policy_save_interval=policy_save_interval, save_policies=save_policies, num_cpu=num_cpu, dump_buffer=dump_buffer, w_potential=params['w_potential'], w_linear=params['w_linear'], w_rotational=params['w_rotational'], rank_method=rank_method, clip_energy=clip_energy)
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50): rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. max_action = env.action_space.high logger.info('scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver() else: saver = None step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) with U.single_threaded_session() as sess: # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape new_obs, r, done, info = env.step(max_action * action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: # Episode done. epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs = env.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_episode_reward = 0. 
for t_rollout in range(nb_eval_steps): eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append(eval_episode_reward) eval_episode_reward = 0. mpi_size = MPI.COMM_WORLD.Get_size() # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean(episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = eval_episode_rewards combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s'%x) combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()])) combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)} # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f)
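# A standalone sketch of the MPI stat-averaging pattern used at the end of the
# training loop above (assumes mpi4py and scalar-valued stats): each rank
# contributes its local values through a pickle-based allreduce (default op is
# SUM), then divides by the world size so every rank ends up with the mean.
def mpi_average_stats(stats):
    import numpy as np
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    local = np.array([float(v) for v in stats.values()])
    summed = comm.allreduce(local)  # element-wise sum across ranks
    return {k: v / comm.Get_size() for k, v in zip(stats.keys(), summed)}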
def run_task(v): random.seed(v['seed']) np.random.seed(v['seed']) num_cpu = 1 if num_cpu > 1: try: whoami = mpi_fork(num_cpu, ['--bind-to', 'core']) print("fancy call succeeded") except CalledProcessError: print("fancy version of mpi call failed, try simple version") whoami = mpi_fork(num_cpu) if whoami == 'parent': sys.exit(0) import baselines.common.tf_util as U U.single_threaded_session().__enter__() # Configure logging rank = MPI.COMM_WORLD.Get_rank() logdir = '' if rank == 0: if logdir or logger_b.get_dir() is None: logger_b.configure(dir=logdir) else: logger_b.configure() logdir = logger_b.get_dir() assert logdir is not None os.makedirs(logdir, exist_ok=True) # Seed everything. rank_seed = v['seed'] + 1000000 * rank set_global_seeds(rank_seed) def make_env(): return PnPEnv() env = make_env() test_env = make_env() env.reset() # for _ in range(1000): # env.render() # import pdb; pdb.set_trace() # env.step(env.action_space.sample()) params = config.DEFAULT_PARAMS params['action_l2'] = v['action_l2'] params['max_u'] = v['max_u'] params['gamma'] = v['discount'] params['env_name'] = 'FetchReach-v0' params['replay_strategy'] = v['replay_strategy'] params['lr'] = v['lr'] params['layers'] = v['layers'] params['hidden'] = v['hidden'] params['n_cycles'] = v['n_cycles'] # cycles per epoch params['n_batches'] = v['n_batches'] # training batches per cycle params['batch_size'] = v[ 'batch_size'] # per mpi thread, measured in transitions and reduced to even multiple of chunk_length. params['n_test_rollouts'] = v[ 'n_test_rollouts'] # changed from 10 to 3 # number of test rollouts per epoch, each consists of rollout_batch_size rollouts # exploration params['random_eps'] = 0.3 # percentage of time a random action is taken params['noise_eps'] = v['action_noise'] params['goal_weight'] = v['goal_weight'] params['scope'] = 'ddpg3' params['sample_expert'] = v['sample_expert'] params['expert_batch_size'] = v['expert_batch_size'] params['bc_loss'] = v['bc_loss'] params['anneal_bc'] = v['anneal_bc'] params['gail_weight'] = v['gail_weight'] params['terminate_bootstrapping'] = v['terminate_bootstrapping'] params['mask_q'] = int(v['mode'] == 'pure_bc') params['two_qs'] = v['two_qs'] params['anneal_discriminator'] = v['anneal_discriminator'] params['two_rs'] = v['two_qs'] or v['anneal_discriminator'] params['with_termination'] = v['rollout_terminate'] if 'clip_dis' in v and v['clip_dis']: params['dis_bound'] = v['clip_dis'] with open(os.path.join(logger_b.get_dir(), 'params.json'), 'w') as f: json.dump(params, f) params['T'] = v['horizon'] params['to_goal'] = v['to_goal'] params = config.prepare_params(params) params['make_env'] = make_env config.log_params(params, logger=logger_b) dims = config.configure_dims(params) # prepare GAIL if v['use_s_p']: discriminator = GAIL(dims['o'] + dims['o'] + dims['g'] if not v['only_s'] else dims['o'] + dims['g'], dims['o'], dims['o'], dims['g'], 0., gail_loss=v['gail_reward'], use_s_p=True, only_s=v['only_s']) else: discriminator = GAIL(dims['o'] + dims['u'] + dims['g'] if not v['only_s'] else dims['o'] + dims['g'], dims['o'], dims['u'], dims['g'], 0., gail_loss=v['gail_reward'], only_s=v['only_s']) params['discriminator'] = discriminator # configure replay buffer for expert buffer params_expert = { k: params[k] for k in [ 'make_env', 'replay_k', 'discriminator', 'gail_weight', 'two_rs', 'with_termination' ] } params_expert[ 'replay_strategy'] = 'future' if v['relabel_expert'] else 'none' params_policy_buffer = { k: params[k] for k in [ 'make_env', 'replay_k', 'discriminator', 
'gail_weight', 'two_rs', 'with_termination' ] } params_policy_buffer['replay_strategy'] = 'future' params_empty = { k: params[k] for k in [ 'make_env', 'replay_k', 'discriminator', 'gail_weight', 'replay_strategy' ] } policy = config.configure_ddpg(dims=dims, params=params, clip_return=v['clip_return'], reuse=tf.AUTO_REUSE, env=env, to_goal=v['to_goal']) rollout_params = { 'exploit': False, 'use_target_net': False, 'use_demo_states': True, 'compute_Q': True, 'T': params['T'], 'weight': v['goal_weight'], 'rollout_terminate': v['rollout_terminate'], 'to_goal': v['to_goal'] } eval_params = { 'exploit': True, 'use_target_net': params['test_with_polyak'], 'use_demo_states': False, 'compute_Q': True, 'T': params['T'], 'weight': v['goal_weight'], 'rollout_terminate': v['rollout_terminate'], 'to_goal': v['to_goal'] } for name in [ 'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps' ]: rollout_params[name] = params[name] eval_params[name] = params[name] rollout_worker = RolloutWorker([env], policy, dims, logger_b, **rollout_params) # rollout_worker.seed(rank_seed) evaluator = RolloutWorker([env], policy, dims, logger_b, **eval_params) # evaluator.seed(rank_seed) n_traj = v['n_evaluation_traj'] logger.log("Initializing report and plot_policy_reward...") log_dir = logger.get_snapshot_dir() inner_log_dir = osp.join(log_dir, 'inner_iters') report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3) report.add_header("{}".format(EXPERIMENT_TYPE)) report.add_text(format_dict(v)) logger.log("Starting the outer iterations") logger.log("Generating heat map") def evaluate_pnp(env, policy, n_rollouts=100): goal_reached = [] distance_to_goal = [] for i in range(n_rollouts): traj = rollout(env, policy, max_path_length=v['horizon'], using_gym=True) goal_reached.append(np.max(traj['env_infos']['goal_reached'])) distance_to_goal.append(np.min(traj['env_infos']['distance'])) return np.mean(goal_reached), np.mean(distance_to_goal) from sandbox.experiments.goals.pick_n_place.pnp_expert import PnPExpert expert_policy = PnPExpert(env) expert_params = { 'exploit': not v['noisy_expert'], 'use_target_net': False, 'use_demo_states': False, 'compute_Q': False, 'T': params['T'], 'weight': v['goal_weight'], 'rollout_terminate': v['rollout_terminate'], 'to_goal': v['to_goal'] } for name in [ 'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps' ]: expert_params[name] = params[name] expert_params['noise_eps'] = v['expert_noise'] expert_params['random_eps'] = v['expert_eps'] expert_worker = RolloutWorker([env], expert_policy, dims, logger_b, **expert_params) input_shapes = dims_to_shapes(dims) expert_sample_transitions = config.configure_her(params_expert) buffer_shapes = { key: (v['horizon'] if key != 'o' else v['horizon'] + 1, *input_shapes[key]) for key, val in input_shapes.items() } buffer_shapes['g'] = (buffer_shapes['g'][0], 3 if not v['full_space_as_goal'] else 6) buffer_shapes['ag'] = (v['horizon'] + 1, 3 if not v['full_space_as_goal'] else 6) buffer_shapes['successes'] = (v['horizon'], ) expert_buffer = ReplayBuffer(buffer_shapes, int(1e6), v['horizon'], expert_sample_transitions) policy.expert_buffer = expert_buffer sample_transitions_relabel = config.configure_her(params_policy_buffer) for _ in range(v['num_demos']): # rollout is generated by expert policy episode = expert_worker.generate_rollouts( slice_goal=(3, 6) if v['full_space_as_goal'] else None) # and is stored into the current expert buffer expert_buffer.store_episode(episode) # TODO: what is subsampling_rate uninitialized_vars = 
[] for var in tf.global_variables(): try: tf.get_default_session().run(var) except tf.errors.FailedPreconditionError: uninitialized_vars.append(var) init_new_vars_op = tf.initialize_variables(uninitialized_vars) tf.get_default_session().run(init_new_vars_op) max_success, min_distance = evaluate_pnp(env, policy) outer_iter = 0 logger.record_tabular("Outer_iter", outer_iter) logger.record_tabular("Outer_Success", max_success) logger.record_tabular("MinDisToGoal", min_distance) logger.dump_tabular() for outer_iter in range(1, v['outer_iters']): logger.log("Outer itr # %i" % outer_iter) with ExperimentLogger(inner_log_dir, outer_iter, snapshot_mode='last', hold_outter_log=True): train( policy, discriminator, rollout_worker, v['inner_iters'], v['n_cycles'], v['n_batches'], v['n_batches_dis'], policy.buffer, expert_buffer, empty_buffer=empty_buffer if v['on_policy_dis'] else None, num_rollouts=v['num_rollouts'], feasible_states=feasible_states if v['query_expert'] else None, expert_policy=expert_policy if v['query_expert'] else None, agent_policy=policy if v['query_agent'] else None, train_dis_per_rollout=v['train_dis_per_rollout'], noise_expert=v['noise_dis_agent'], noise_agent=v['noise_dis_expert'], sample_transitions_relabel=sample_transitions_relabel if v['relabel_for_policy'] else None, outer_iter=outer_iter, annealing_coeff=v['annealing_coeff'], q_annealing=v['q_annealing']) print("evaluating policy performance") logger.log("Generating heat map") success, min_distance = evaluate_pnp(env, policy) logger.record_tabular("Outer_iter", outer_iter) logger.record_tabular("Outer_Success", max_success) logger.record_tabular("MinDisToGoal", min_distance) logger.dump_tabular() if success > max_success: print("% f >= %f, saving policy to params_best" % (success, max_success)) with open(osp.join(log_dir, 'params_best.pkl'), 'wb') as f: cloudpickle.dump({'env': env, 'policy': policy}, f) max_success = success report.save() report.new_row()
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, teacher, tau=0.01, eval_env=True, param_noise_adaption_interval=50): rank = MPI.COMM_WORLD.Get_rank() t = datetime.now().strftime('%H-%M') PATH = 'results/ddpg'.format(t) #assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. max_action = env.action_space logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver() else: saver = None step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) with U.single_threaded_session() as sess: # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 agent.restore_model(PATH) for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape new_obs, r, done, info = env.step( action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: # Episode done. epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs = env.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_episode_reward = 0. 
for t_rollout in range(nb_eval_steps): eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step( action) eval_env.background = get_q_background( eval_env, agent.q, eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append( eval_episode_reward) eval_episode_reward = 0. # Log stats. epoch_train_duration = time.time() - epoch_start_time duration = time.time() - start_time stats = agent.get_stats() combined_stats = {} for key in sorted(stats.keys()): combined_stats[key] = mpi_mean(stats[key]) # Rollout statistics. combined_stats['rollout/return'] = mpi_mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = mpi_mean( np.mean(episode_rewards_history)) combined_stats['rollout/episode_steps'] = mpi_mean( epoch_episode_steps) combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes) combined_stats['rollout/actions_mean'] = mpi_mean(epoch_actions) combined_stats['rollout/actions_std'] = mpi_std(epoch_actions) combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs) # Train statistics. combined_stats['train/loss_actor'] = mpi_mean(epoch_actor_losses) combined_stats['train/loss_critic'] = mpi_mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = mpi_mean( epoch_adaptive_distances) # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = mpi_mean(eval_episode_rewards) combined_stats['eval/return_history'] = mpi_mean( np.mean(eval_episode_rewards_history)) combined_stats['eval/Q'] = mpi_mean(eval_qs) combined_stats['eval/episodes'] = mpi_mean( len(eval_episode_rewards)) # Total statistics. combined_stats['total/duration'] = mpi_mean(duration) combined_stats['total/steps_per_second'] = mpi_mean( float(t) / float(duration)) combined_stats['total/episodes'] = mpi_mean(episodes) combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t agent.save_model(PATH, epoch) for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f)
def train(args, seed, writer=None):
    from baselines.ppo1 import pposgd_simple_gcn, gcn_policy
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)

    if args.env == 'molecule':
        env = gym.make('molecule-v0')
        # remember to call init() right after gym.make!
        env.init(data_type=args.dataset,
                 logp_ratio=args.logp_ratio,
                 qed_ratio=args.qed_ratio,
                 sa_ratio=args.sa_ratio,
                 recons_ratio=args.recons_ratio,
                 reward_step_total=args.reward_step_total,
                 is_normalize=args.normalize_adj,
                 reward_type=args.reward_type,
                 reward_target=args.reward_target,
                 has_feature=bool(args.has_feature),
                 is_conditional=bool(args.is_conditional),
                 conditional=args.conditional,
                 max_action=args.max_action,
                 min_action=args.min_action)
    elif args.env == 'graph':
        env = GraphEnv()
        # remember to call init() right after construction!
        env.init(reward_step_total=args.reward_step_total,
                 is_normalize=args.normalize_adj,
                 dataset=args.dataset)
    print(env.observation_space)

    # if not os.path.exists(args.traj_data_path):
    #     env.store_all_expert_trajs(args)

    def policy_fn(name, ob_space, ac_space):
        return gcn_policy.GCNPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    atom_type_num=env.atom_type_num,
                                    char_type_num=len(env.smile_chars),
                                    args=args)

    env.seed(workerseed)
    # print(device_lib.list_local_devices())

    pposgd_simple_gcn.learn(args, env, policy_fn,
                            max_timesteps=args.num_steps,
                            timesteps_per_actorbatch=256,
                            clip_param=0.2, entcoeff=0.01,
                            optim_epochs=8, optim_stepsize=args.lr, optim_batchsize=32,
                            gamma=1, lam=0.95,
                            schedule='linear',
                            writer=writer)
    env.close()
def main(): ''' Load and play trained policy ''' log_root = os.path.join(os.getcwd(), 'logs') extra_args = ExtraArgs(log_root=log_root) env = make_mujoco_env(extra_args.env_id, extra_args.seed) if isinstance(env.unwrapped, CeresEnv) and (len(extra_args.trained_cnet) > 0): env.unwrapped.init_ceres() env.unwrapped.init_constraint_prediction(extra_args.trained_cnet) episode_lengths = np.zeros(extra_args.max_episodes) episode_rewards = np.zeros(extra_args.max_episodes) ob = env.reset() do_save_render = extra_args.render and len(extra_args.save_render) > 0 if do_save_render: os.makedirs(extra_args.save_render, exist_ok=True) def save_render(i_step, max_step=300, verbose=True): n_digits = len(str(max_step)) do_save_step = (max_step <= 0) or (i_step <= max_step) if do_save_render and do_save_step: path_save = os.path.join(extra_args.save_render, str(i_step).zfill(n_digits) + '.png') env.unwrapped.save_render(path_save, verbose=verbose) ob_space = env.unwrapped.observation_space ac_space = env.unwrapped.action_space ob_space, policy_observation_filter= build_policy_observation_filter(extra_args, ob_space) env.unwrapped.set_ineq_margin(extra_args.conservative_exploration) if len(extra_args.trained_policy) > 0: assert os.path.exists(extra_args.trained_policy), 'Invalid path to model: \'{0}\''.format(extra_args.trained_policy) from ceres.baselines.ceres.mlp_policy_saver import MlpPolicySaver from baselines.common import tf_util as U sess = U.single_threaded_session() sess.__enter__() def policy_fn(name, ob_space, ac_space): return MlpPolicySaver(name, ob_space=ob_space, ac_space=ac_space, hid_size=extra_args.policy_hidden_size, num_hid_layers=extra_args.policy_hidden_layers) pi = policy_fn('pi', ob_space, ac_space) U.initialize() pi.restore_model(extra_args.trained_policy, session=sess) else: print('Invalid model path \'{0}\', use dummy agent'.format(extra_args.trained_policy)) pi = DummyPolicy('pi', ob_space, ac_space) time_total = 0. n_steps_global = -1 for i_episode in range(extra_args.max_episodes): print('Episode {0}'.format(i_episode)) time_episode_begin = time.time() ob = policy_observation_filter(ob) n_steps_global += 1 if extra_args.render: env.render() save_render(n_steps_global) done = False ep_rew = 0. i_step = 0 time.sleep(extra_args.play_step_duration) while not done: action, vpred = pi.act(True, ob) ob, rew, done, info = env.step(action) ob = policy_observation_filter(ob) ep_rew += rew i_step += 1 n_steps_global += 1 if extra_args.render: env.render() save_render(n_steps_global) time.sleep(extra_args.play_step_duration) episode_lengths[i_episode] = i_step episode_rewards[i_episode] = ep_rew time_episode = time.time() - time_episode_begin time_total += time_episode print(' Episode length: {0} (average {1:.1f}), episode reward {2:.1f} (average {5:.1f}), duration {3:.1f} ms (average {4:.1f})'.format(i_step, np.average(episode_lengths[:i_episode+1]), ep_rew, 1000.*time_episode, 1000.*time_total/(i_episode+1), np.average(episode_rewards[:i_episode+1]))) ob = env.reset()
def testModelPolicy(env, policy, eval_steps=4, gamma=1, render=False, checkpoint_file="tf_checkpoint/general/model.ckpt", restore_variables=False, save_variables=True, logdir=None, log=False, overwrite_log=False, theta=5, use_gp_env=False, gp_env=None, **kwargs): states = list() next_states = list() rewards = list() actions_one_hot = list() actions = list() timesteps = list() mask = None # statistics wins = 0 reward_list = list() paths = list() small_vel = 0 obs_size = 2 state_tf = tf.placeholder(tf.float32, (None, obs_size), name="states") policy_tf, _ = policy(state_tf) n_actions = 2 # Start TF session with U.single_threaded_session() as sess: # to save variables saver = tf.train.Saver() # initialize all if restore_variables: # Add ops to save and restore all the variables. saver.restore(sess, tf.train.latest_checkpoint(checkpoint_file)) else: init = tf.global_variables_initializer() sess.run(init) # make sure all variables are initialized sess.run(tf.assert_variables_initialized()) pi = make_pi(policy_tf, sess, state_tf, n_actions) for n in range(10): paths.append(list()) rewards_i = list() states_i = list() next_states_i = list() mask_i = list() actions_i_one_hot = list() actions_i = list() done = False # gamma_cum is gamma^t gamma_cum = 1 gamma = 1 cum_reward = 0 reward = 0 timesteps_i = 0 # Sampling logic state = env.reset() paths[n].append(state) while not done: # Select action a_t according to current policy a_t = pi(state) env.render() newState, reward, done, info = env.step(a_t) # add to the buffer to remember # rewards_i.append(reward*gamma_cum) rewards.append(reward * gamma_cum) paths[n].append(newState) # works with two actions # actions_i.append(a_t-1) actions.append(a_t - 1) # create a one hot vector with the taken action and add to the action matrix action_blank = np.zeros(n_actions) action_blank[a_t] = 1 # actions_i_one_hot.append(action_blank) actions_one_hot.append(action_blank) # calculation of the reward cum_reward += reward * gamma_cum gamma_cum = gamma_cum * gamma # states_i.append(np.append(np.append(state,action),theta)) states_i.append(state) next_states_i.append(np.array(newState - state)) state = newState timesteps_i += 1 if info["goal_reached"]: wins += 1 print(gamma_cum) if info["small_vel"]: print("Small vel") small_vel += 1 states.append(states_i) next_states.append(next_states_i) # rewards.append(rewards_i) timesteps.append(timesteps_i) reward_list.append(cum_reward) # actions_one_hot.append(actions_i_one_hot) # actions.append(actions_i) stats = { "states": states, "next_states": next_states, "rewards": rewards, "timesteps": timesteps, "reward_list": reward_list, "actions_one_hot": actions_one_hot, "actions": actions, "wins": wins, "paths": paths, "small_vel": small_vel, } # print(stats) print(np.mean(stats["reward_list"])) return stats
def launch(env_name, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return, binding, logging, version, n_cycles, note, override_params={}, save_policies=True): # Fork for multi-CPU MPI implementation. if num_cpu > 1: whoami = mpi_fork(num_cpu, binding) if whoami == 'parent': sys.exit(0) import baselines.common.tf_util as U U.single_threaded_session().__enter__() rank = MPI.COMM_WORLD.Get_rank() # Configure logging if logging: logdir = 'logs/' + str(env_name) + '-replay_strategy' + str( replay_strategy) + '-n_epochs' + str(n_epochs) + '-num_cpu' + str( num_cpu) + '-seed' + str(seed) + '-n_cycles' + str( n_cycles) + '-version' + str( version) + '-T-' + datetime.datetime.now().strftime( "%Y-%m-%d-%H-%M-%S") else: logdir = osp.join( tempfile.gettempdir(), datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f")) if rank == 0: if logdir or logger.get_dir() is None: logger.configure(dir=logdir) else: logger.configure() # use temp folder for other rank logdir = logger.get_dir() assert logdir is not None os.makedirs(logdir, exist_ok=True) # Seed everything. rank_seed = seed + 1000000 * rank set_global_seeds(rank_seed) # Prepare params. params = config.DEFAULT_PARAMS params['env_name'] = env_name params['replay_strategy'] = replay_strategy params['binding'] = binding params['max_timesteps'] = n_epochs * params['n_cycles'] * params[ 'n_batches'] * num_cpu params['version'] = version params['n_cycles'] = n_cycles params['num_cpu'] = num_cpu params['note'] = note or params['note'] if note: with open('params/' + env_name + '/' + note + '.json', 'r') as file: override_params = json.loads(file.read()) params.update(**override_params) if params['load_weight']: if type(params['load_weight']) is list: params['load_weight'] = params['load_weight'][seed] base = os.path.splitext(params['load_weight'])[0] policy_weight_file = open(base + '_weight.pkl', 'rb') pretrain_weights = pickle.load(policy_weight_file) policy_weight_file.close() else: pretrain_weights = None if env_name in config.DEFAULT_ENV_PARAMS: params.update(config.DEFAULT_ENV_PARAMS[env_name] ) # merge env-specific parameters in with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f: json.dump(params, f) params = config.prepare_params(params) config.log_params(params, logger=logger) dims = config.configure_dims(params) policy = config.configure_ddpg(dims=dims, params=params, pretrain_weights=pretrain_weights, clip_return=clip_return) render = False if params['collect_video']: render = 'rgb_array' rollout_params = { 'exploit': False, 'use_target_net': False, 'use_demo_states': True, 'compute_Q': False, 'T': params['T'], 'render': render, } eval_params = { 'exploit': True, 'use_target_net': params['test_with_polyak'], 'use_demo_states': False, 'compute_Q': True, 'T': params['T'], } for name in [ 'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps' ]: rollout_params[name] = params[name] eval_params[name] = params[name] rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params) rollout_worker.seed(rank_seed) evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params) evaluator.seed(rank_seed) train(logdir=logdir, policy=policy, rollout_worker=rollout_worker, evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'], n_cycles=params['n_cycles'], n_batches=params['n_batches'], policy_save_interval=policy_save_interval, save_policies=save_policies, num_cpu=num_cpu, collect_data=params['collect_data'], 
collect_video=params['collect_video'], goal_generation=params['goal_generation'], num_skills=params['num_skills'], use_skill_n=params['use_skill_n'], batch_size=params['_batch_size'], mi_r_scale=params['mi_r_scale'], mi_end_epoch=params['mi_end_epoch'], sk_r_scale=params['sk_r_scale'], no_train_mi=params['no_train_mi'])
def train_trpo(num_timesteps, eval_episodes, seed, horizon, out_dir='.', load_path=None, checkpoint_path_in=None, gamma=0.99, grid_size=5, first_zone=-1.0, second_zone=-10., action=2., timesteps_per_batch=500, rand_initial=True, clip_mean=False, direction='border', fail_prob=0.1, border_width=2., continuous=True, n_basis=None, num_layers=0, num_hidden=32, checkpoint_freq=20, init_logstd=-1, trainable_variance=False, trainable_bias=False): if n_basis is None: #n_basis = np.array([grid_size, 2 * grid_size]) n_basis = np.array([2 * grid_size, 4 * grid_size]) start_time = time.time() clip = None if clip_mean: clip = (-5, 5) rew_wights = [first_zone, second_zone, action] print(rew_wights) print(fail_prob) print(horizon) if continuous: dir = 'cont_gridworld' env = GridWorldAction(shape=[grid_size, grid_size], rew_weights=rew_wights, horizon=horizon, randomized_initial=rand_initial, fail_prob=fail_prob, border_width=border_width, n_bases=n_basis, direction=direction) env_eval = GridWorldAction(shape=[grid_size, grid_size], rew_weights=rew_wights, horizon=horizon, randomized_initial=rand_initial, fail_prob=fail_prob, border_width=border_width, n_bases=n_basis, direction=direction) else: dir = 'gridworld' env = GridWorld(gamma=gamma, rew_weights=rew_wights, fail_prob=fail_prob, horizon=horizon, shape=(grid_size, grid_size), randomized_initial=rand_initial, direction=direction) env_eval = GridWorld(gamma=gamma, rew_weights=rew_wights, fail_prob=fail_prob, horizon=horizon, shape=(grid_size, grid_size), randomized_initial=rand_initial, direction=direction) directory_output = (dir + '/trpo-rews-' + str(first_zone) + '_' + str(second_zone) + '_' + str(action)) + '/' + direction def eval_policy_closure(**args): return eval_and_render_policy(env_eval, **args) tf.set_random_seed(seed) sess = U.single_threaded_session() sess.__enter__() rank = MPI.COMM_WORLD.Get_rank() time_str = str(start_time) if rank == 0: logger.configure(dir=out_dir + '/' + directory_output + '/logs', format_strs=['stdout', 'csv'], suffix=time_str) else: logger.configure(format_strs=[]) logger.set_level(logger.DISABLED) network = mlp(num_hidden=num_hidden, num_layers=num_layers) trpo_mpi.learn(network=network, env=env, eval_policy=eval_policy_closure, timesteps_per_batch=timesteps_per_batch, max_kl=0.001, cg_iters=10, cg_damping=1e-3, total_timesteps=num_timesteps, gamma=gamma, lam=1.0, vf_iters=3, vf_stepsize=1e-4, checkpoint_freq=checkpoint_freq, checkpoint_dir_out=out_dir + '/' + directory_output + '/models/' + time_str + '/', load_path=load_path, checkpoint_path_in=checkpoint_path_in, eval_episodes=eval_episodes, init_logstd=init_logstd, trainable_variance=trainable_variance, trainable_bias=trainable_bias, clip=None) print('TOTAL TIME:', time.time() - start_time) print("Time taken: %f seg" % ((time.time() - start_time))) print("Time taken: %f hours" % ((time.time() - start_time) / 3600)) env.close()
def run(config): sess = U.single_threaded_session(gpu=False) sess.__enter__() rank = MPI.COMM_WORLD.Get_rank() is_chef = (rank == 0) workerseed = config.seed + 10000 * rank set_global_seeds(workerseed) if is_chef: logger.configure() else: logger.set_level(logger.DISABLED) config.render = False config.record = False env_name = config.env env = make_env(env_name, config) if is_chef and config.is_train: with open(osp.join(config.log_dir, "args.txt"), "a") as f: f.write("\nEnvironment argument:\n") for k in sorted(env.unwrapped._config.keys()): f.write("{}: {}\n".format(k, env.unwrapped._config[k])) networks = [] # build models if config.hrl: assert config.primitive_envs is not None and config.primitive_paths is not None logger.info('====== Module list ======') num_primitives = len(config.primitive_envs) for primitive_env_name, primitive_path in zip(config.primitive_envs, config.primitive_paths): logger.info('Env: {}, Dir: {}'.format(primitive_env_name, primitive_path)) meta_pi = MetaPolicy(name="%s/meta_pi" % env_name, env=env, ob_env_name=env_name, primitives=config.primitive_envs, config=config) meta_oldpi = MetaPolicy(name="%s/meta_oldpi" % env_name, env=env, ob_env_name=env_name, primitives=config.primitive_envs, config=config) primitive_pis = [ PrimitivePolicy(name="%s/pi" % primitive_env_name, env=env, ob_env_name=primitive_env_name, config=config) for primitive_env_name in config.primitive_envs ] trans_pis, trans_oldpis = None, None if config.use_trans: trans_pis = [ TransitionPolicy( name="%s/transition_pi" % primitive_env_name, env=env, ob_env_name=env_name if config.trans_include_task_obs else primitive_env_name, num_primitives=num_primitives, trans_term_activation=config.trans_term_activation, config=config) for primitive_env_name in config.primitive_envs ] trans_oldpis = [ TransitionPolicy( name="%s/transition_oldpi" % primitive_env_name, env=env, ob_env_name=env_name if config.trans_include_task_obs else primitive_env_name, num_primitives=num_primitives, trans_term_activation=config.trans_term_activation, config=config) for primitive_env_name in config.primitive_envs ] networks.extend(trans_pis) networks.extend(trans_oldpis) networks.append(meta_pi) networks.append(meta_oldpi) networks.extend(primitive_pis) # build proximity_predictor proximity_predictors = None if config.use_proximity_predictor: portion_start = [ float(v) for v in config.proximity_use_traj_portion_start ] portion_end = [ float(v) for v in config.proximity_use_traj_portion_end ] if len(portion_start) == 1: portion_start = portion_start * num_primitives if len(portion_end) == 1: portion_end = portion_end * num_primitives proximity_predictors = [ ProximityPredictor( name="%s/proximity_predictor" % primitive_env_name, path=path, env=env, ob_env_name=primitive_env_name, # make env for every primitive use_traj_portion_end=portion_end, use_traj_portion_start=portion_start, is_train=config.is_train, config=config ) for primitive_env_name, path, portion_start, portion_end in \ zip(config.primitive_envs, config.primitive_paths, portion_start, portion_end)] networks.extend(proximity_predictors) # build trainer from rl.trainer import Trainer trainer = Trainer(env, meta_pi, meta_oldpi, proximity_predictors, num_primitives, trans_pis, trans_oldpis, config) # build rollout rollout = rollouts.traj_segment_generator( # stochastic=config.is_train, config=config) env, meta_pi, primitive_pis, trans_pis, stochastic=True, config=config, proximity_predictors=proximity_predictors, ) else: # build vanilla TRPO policy = 
MlpPolicy(env=env, name="%s/pi" % env_name, ob_env_name=env_name, config=config) old_policy = MlpPolicy(env=env, name="%s/oldpi" % env_name, ob_env_name=env_name, config=config) networks.append(policy) networks.append(old_policy) # build trainer from rl.trainer_rl import RLTrainer trainer = RLTrainer(env, policy, old_policy, config) # build rollout rollout = rollouts.traj_segment_generator_rl( # env, policy, stochastic=config.is_train, config=config) env, policy, stochastic=not config.is_collect_state, config=config) # initialize models def load_model(load_model_path, var_list=None): if os.path.isdir(load_model_path): ckpt_path = tf.train.latest_checkpoint(load_model_path) else: ckpt_path = load_model_path if ckpt_path: U.load_state(ckpt_path, var_list) return ckpt_path if config.load_meta_path is not None: var_list = meta_pi.get_variables() + meta_oldpi.get_variables() ckpt_path = load_model(config.load_meta_path, var_list) logger.info( '* Load the meta policy from checkpoint: {}'.format(ckpt_path)) def tensor_description(var): description = '({} [{}])'.format( var.dtype.name, 'x'.join([str(size) for size in var.get_shape()])) return description var_list = [] for network in networks: var_list += network.get_variables() if is_chef: for var in var_list: logger.info('{} {}'.format(var.name, tensor_description(var))) if config.load_model_path is not None: # Load all the network if config.is_train: ckpt_path = load_model(config.load_model_path) if config.hrl: load_buffers(proximity_predictors, ckpt_path) else: ckpt_path = load_model(config.load_model_path, var_list) logger.info( '* Load all policies from checkpoint: {}'.format(ckpt_path)) elif config.is_train: ckpt_path = tf.train.latest_checkpoint(config.log_dir) if config.hrl: if ckpt_path: ckpt_path = load_model(ckpt_path) load_buffers(proximity_predictors, ckpt_path) else: # Only load the primitives for (primitive_name, primitive_pi) in zip(config.primitive_paths, primitive_pis): var_list = primitive_pi.get_variables() if var_list: primitive_path = osp.expanduser( osp.join(config.primitive_dir, primitive_name)) ckpt_path = load_model(primitive_path, var_list) logger.info("* Load module ({}) from {}".format( primitive_name, ckpt_path)) else: logger.info( "* Hard-coded module ({})".format(primitive_name)) logger.info("Loading modules is done.") else: if ckpt_path: ckpt_path = load_model(ckpt_path) else: logger.info('[!] Checkpoint for evaluation is not provided.') ckpt_path = load_model(config.log_dir, var_list) logger.info( "* Load all policies from checkpoint: {}".format(ckpt_path)) if config.is_train: trainer.train(rollout) else: if config.evaluate_proximity_predictor: trainer.evaluate_proximity_predictor(var_list) else: trainer.evaluate(rollout, ckpt_num=ckpt_path.split('/')[-1]) env.close()
def train( env: ConfMDP, policy: Policy, model_approximator: ModelApproximator, eval_steps: int = 4, eval_freq: int = 5, n_trajectories: int = 20, iteration_number: int = 2000, gamma: float = 1, render=False, checkpoint_file: str = "tf_checkpoint/general/model.ckpt", restore_variables: bool = False, save_variables: bool = True, logdir: str = None, log: bool = False, omega=5, kappa: float = 1e-5, training_set_size: int = 500, normalize_data: bool = False, dual_reg: float = 0.0, policy_reg: float = 0.0, exact: bool = False, num_processes: int = 1, load_data: bool = True, **kwargs, ): """ Runner for the REMPS algorithm. Setup logging, initialize agent, takes care of fitting or loading things. Executes the main training loop by managing workers :param env: Environment (Conf-MDP) :param policy: The agent policy :param model_approximator: the approximation of the model or the true model :param eval_steps: how many steps in order to perform evaluation :param eval_freq: the frequency of evaluation :param n_trajectories: number of trajectories to collect :param iteration_number: number of iterations of REMPS :param gamma: discount factor :param render: render or not episodes :param checkpoint_file: where to store checkpoints :param restore_variables: restore variables or not from checkpoint :param save_variables: save variables in checkpoint :param logdir: directory containing logs :param log: if true the agents logs the actions probability :param omega: initial environment parameters :param kappa: parameter of remps environment :param training_set_size: number of samples contained in the training set :param normalize_data: Whether to normalize data from the training set :param dual_reg: regularization on the dual :param policy_reg: regularization on the policy :param exact: whether the model approximation is exact or not :param num_processes: number of processing :param load_data: whether to load stored data :param kwargs: :return: """ # setup logging writer = tf.summary.FileWriter(logdir) logger.configure(dir=logdir, format_strs=["stdout", "csv"]) # setup agent agent = REMPS( policy=policy, model=model_approximator, env=env, kappa=kappa, projection_type=Projection.STATE_KERNEL, use_features=False, training_set_size=training_set_size, L2_reg_dual=dual_reg, L2_reg_loss=policy_reg, exact=exact, ) # create parallel samplers # Split work among workers n_steps = n_trajectories nb_episodes_per_worker = n_steps // num_processes inputQs = [Queue() for _ in range(num_processes)] outputQ = Queue() workers = [ SamplingWorker( policy, env, nb_episodes_per_worker, inputQs[i], outputQ, env.action_space.n, env.observation_space_size, ) for i in range(num_processes) ] # Start the workers for w in workers: w.start() # Collect data for model fitting # torcs model fitting needs to be done before the session initialization # due to multiprocessing issues if not load_data and isinstance(env, Torcs): if isinstance(env, Torcs): x, y, avg_rew, ret = collect_data( env, policy=policy, total_n_samples=training_set_size, n_params=2, initial_port=env.port + 1000, ) logger.log( f"Data collection terminate. Avg rew: {np.mean(avg_rew)}, Avg ret: {np.mean(ret)}", logger.INFO, ) with U.single_threaded_session() as sess: # initialization with session agent.initialize(sess, writer, omega) # to save variables saver = tf.train.Saver() # initialize all if restore_variables: # Add ops to save and restore all the variables. 
saver.restore(sess, tf.train.latest_checkpoint(checkpoint_file)) else: init = tf.global_variables_initializer() sess.run(init) # make sure all variables are initialized sess.run(tf.assert_variables_initialized()) logger.log("Collecting Data", level=logger.INFO) if not load_data and not isinstance(env, Torcs): x, y = run_env( env, episode_count=1, bins=200, omega_max=30, omega_min=1, n_samples_per_omega=500, policy=agent, grid=True, total_n_samples=training_set_size, ) # store data in the agent agent.store_data(x, y, normalize_data) logger.log("Data Stored", logger.INFO) # fit the model agent.fit() logger.log("Model fitted", logger.INFO) # set configurable parameters env.set_params(omega) get_parameters = U.GetFlat(agent.get_policy_params()) # ------------------------------------- # --------- Training Loop ------------- # ------------------------------------- for n in range(iteration_number): states = list() next_states = list() rewards = list() actions_one_hot = list() actions = list() timesteps = list() paths = list() # statistics wins = 0 small_vel = 0 traj = 0 confort_violation = 0 reward_list = list() policy_ws = get_parameters() # Run parallel sampling: # for each worker send message sample with # policy weights and environment parameters for i in range(num_processes): inputQs[i].put(("sample", policy_ws, omega)) # Collect results when ready with timed("sampling"): for i in range(num_processes): _, stats = outputQ.get() states.extend(stats["states"]) paths.extend(stats["paths"]) next_states.extend(stats["next_states"]) rewards.extend(stats["rewards"]) actions_one_hot.extend(stats["actions_one_hot"]) actions.extend(stats["actions"]) timesteps.extend(stats["timesteps"]) reward_list.extend(stats["reward_list"]) wins += stats["wins"] small_vel += stats["small_vel"] traj += stats["traj"] confort_violation += stats["confort_violation"] samples_data = { "actions": np.matrix(actions).transpose(), "actions_one_hot": np.array(actions_one_hot), "observations": states, "paths": paths, "rewards": np.transpose(np.expand_dims(np.array(rewards), axis=0)), "reward_list": reward_list, "timesteps": timesteps, "wins": (wins / traj) * 100, "omega": omega, "traj": traj, "confort_violation": confort_violation, } # print statistics logger.log(f"Training steps: {n}", logger.INFO) logger.log(f"Number of wins: {wins}", logger.INFO) logger.log(f"Percentage of wins: {(wins/n_trajectories)*100}", logger.INFO) logger.log(f"Average reward: {np.mean(reward_list)}", logger.INFO) logger.log(f"Avg timesteps: {np.mean(timesteps)}") # learning routine with timed("training"): omega = agent.train(samples_data) # Configure environments with # parameters returned by the agent env.set_params(omega) # Only TORCS: we kill torcs every 10 iterations due to a memory leak if n % 10 == 0 and isinstance(env, Torcs): print("Killing torcs") os.system("ps | grep torcs | awk '{print $1}' | xargs kill -9") # ------------------------------------- # --------- Evaluation ---------------- # ------------------------------------- if ((n + 1) % eval_freq) == 0: # for plotting eval_rewards = [] # evaluation loop for i in range(eval_steps): logger.log("Evaluating...", logger.INFO) state = env.reset() done = False # gamma_cum is gamma^t gamma_cum = 1 cum_reward = 0 t = 0 # here starts an episode while not done: if render: env.render() # sample one action at random action = agent.pi(state[np.newaxis, :], log=log) # observe the next state, reward etc newState, reward, done, info = env.step(action) cum_reward += reward * gamma_cum gamma_cum = gamma * 
gamma_cum state = newState if done: break t = t + 1 eval_rewards.append(cum_reward) # save variables if save_variables: save_path = saver.save(sess, checkpoint_file) logger.log(f"Steps: {n}", logger.INFO) logger.log(f"Model saved in path: {save_path}", logger.INFO) # Close the env env.close() # save variables if save_variables: save_path = saver.save(sess, checkpoint_file) logger.log(f"Model saved in path: {save_path}") # exit workers for i in range(num_processes): inputQs[i].put(("exit", None, None))
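The REMPS training loop above talks to its SamplingWorker processes through per-worker input queues and one shared output queue: every iteration it broadcasts ('sample', policy_weights, omega) and, at shutdown, ('exit', None, None). The worker side is not shown in this snippet; the following is only a sketch of the loop such a worker might run, where worker_loop, rollout_fn, and the shape of the stats payload are assumptions rather than the repository's actual implementation:

def worker_loop(input_q, output_q, rollout_fn):
    """Consume ('sample', weights, omega) messages until an 'exit' message arrives."""
    while True:
        message, weights, omega = input_q.get()
        if message == 'exit':
            break
        # rollout_fn is assumed to set the policy weights and environment
        # parameters, run the assigned episodes, and return a stats dict shaped
        # like the one unpacked from outputQ in the training loop above.
        stats = rollout_fn(weights, omega)
        output_q.put((None, stats))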
def main(): # use fixed random state rand_state = np.random.RandomState(1).get_state() np.random.set_state(rand_state) tf_set_seeds(np.random.randint(1, 2**31 - 1)) # Create the Create2 docker environment env = Create2DockerEnv(30, port='/dev/ttyUSB0', ir_window=20, ir_history=1, obs_history=1, dt=0.045, random_state=rand_state) env = NormalizedEnv(env) # Start environment processes env.start() # Create baselines TRPO policy function sess = U.single_threaded_session() sess.__enter__() def policy_fn(name, ob_space, ac_space): return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=32, num_hid_layers=2) # Create and start plotting process plot_running = Value('i', 1) shared_returns = Manager().dict({ "write_lock": False, "episodic_returns": [], "episodic_lengths": [], }) # Spawn plotting process pp = Process(target=plot_create2_docker, args=(env, 2048, shared_returns, plot_running)) pp.start() # Create callback function for logging data from baselines TRPO learn kindred_callback = create_callback(shared_returns) # Train baselines TRPO learn(env, policy_fn, max_timesteps=40000, timesteps_per_batch=2048, max_kl=0.05, cg_iters=10, cg_damping=0.1, vf_iters=5, vf_stepsize=0.001, gamma=0.995, lam=0.995, callback=kindred_callback) # Safely terminate plotter process plot_running.value = 0 # shutdown ploting process time.sleep(2) pp.join() env.close()
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50, restore=False): rank = MPI.COMM_WORLD.Get_rank() max_action = np.array([0.2, 0.2, 0.2, 0.2, 0.2, 0.2]) # min_action = np.array([-0.2, -0.2, -0.2, -0.2, -0.2, -0.2]) logger.info( 'scaling actions by {} before executing in env'.format(max_action)) model_directory = '/home/zhimin/PycharmProjects/RL_UA/Peg_in_Hole/1-baselines/baselines/ddpg/simulation_data' agent = DDPG(actor, critic, memory, env.state_dim, env.action_dim, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale, restore=restore) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) saver = tf.train.Saver() """Set up logging stuff only for a single worker""" # if rank == 0: # saver = tf.train.Saver() # else: # saver = None # eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) with U.single_threaded_session() as sess: """Prepare everything""" if restore: saver = tf.train.import_meta_graph(model_directory + 'model.meta') agent.restore_model(model_directory, saver, sess) else: agent.initialize(sess) sess.graph.finalize() """Agent Reset""" agent.reset() # episode_step = 0 # episodes = 0 # t = 0 """Force calibration""" # if env.robot_control.CalibFCforce() is False: # exit() delay_rate = np.power(10, 1 / nb_epochs) epoch_start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_adaptive_distances = [] epoch_episodes_discount_reward = [] epoch_episodes_average_reward = [] epoch_actions = [] epoch_qs = [] Force_moments = [] epoch_episodes = 0 Long_term_reward = -0.10 for epoch in range(nb_epochs): """Show the result for cycle 20 times and Save the model""" epoch_actor_losses = [] epoch_critic_losses = [] """Delay the learning rate""" epoch_actor_lr = actor_lr / delay_rate epoch_critic_lr = critic_lr / delay_rate for cycle in range(nb_epoch_cycles): """environment reset """ agent.reset() obs = env.reset() episode_reward = 0. episode_discount_reward = 0. q_value = 0. done = False forcement = [] Last_average_reward = 0. Number_episodes = 0. 
for t_rollout in range(nb_rollout_steps): """Predict next action""" action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape[0] == env.action_dim q_value += q """scale for execution in env""" new_obs, r, done, info, expert_action = env.step( action, t_rollout) episode_discount_reward += gamma * r """adapt_action_noise""" agent.feed_back_explore(action, expert_action) logger.info("The maximum force:" + str(max(abs(new_obs[0:3]))) + " The maximum moments:" + str(max(abs(new_obs[3:6])))) episode_reward += r delta = r - Long_term_reward # if memory.nb_entries >= batch_size and param_noise is not None: # agent.feed_back_explore(delta) Number_episodes = gamma + gamma * Number_episodes Last_average_reward = r + gamma * Last_average_reward """Plot the force and moments""" # if render: # forcement.append(new_obs[0:6]) # # print(forcement) # Force_moments.append(new_obs[0:6]) # env.plot_force(forcement, t_rollout+1) if epoch == 0 and cycle == 0: forcement.append(new_obs[0:6]) Force_moments.append(new_obs[0:6]) # env.plot_force(forcement, t_rollout + 1) if epoch == nb_epoch_cycles - 1 and cycle == nb_epoch_cycles - 1: forcement.append(new_obs[0:6]) Force_moments.append(new_obs[0:6]) # env.plot_force(forcement, t_rollout + 1) epoch_actions.append(action) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs """Episode done and start pull the pegs step by step""" if done: logger.info('Peg-in-hole assembly done!!!') epoch_episode_rewards.append(episode_reward) epoch_episodes_discount_reward.append( Last_average_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(t_rollout) epoch_episodes += 1 # pull_done = False # while pull_done is False and info: # pull_done, pull_safe = env.step_up() #Simulation env # pull_done, pull_safe = env.pull_up() #True env # # if pull_safe is False: # logger.info('Pull up the pegs failed for the exceed force!!!') # exit() break """Episode failed and start pull the pegs step by step""" if info is False: logger.info( 'Peg-in-hole assembly failed for the exceed force!!!' 
) # pull_done = False # while pull_done is False and info: # pull_done, pull_safe = env.step_up() # pull_done, pull_safe = env.pull_up() # True env # # if pull_safe is False: # logger.info('Peg-in-hole assembly failed for the exceed force!!!') # exit() break Long_term_reward = Last_average_reward / Number_episodes epoch_qs.append(q_value) env.save_figure('force_moment') epoch_episodes_average_reward.append(Long_term_reward) agent.feedback_adptive_explore() if t_rollout == nb_rollout_steps - 1: logger.info( 'Peg-in-hole assembly failed for exceed steps!!!') logger.info('The deepest position'.format(obs[8])) """train model for nb_train_steps times""" for t_train in range(nb_train_steps): cl, al = agent.train(epoch_actor_lr, epoch_critic_lr) epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() """Adapt param noise, if necessary""" if memory.nb_entries >= batch_size and param_noise is not None: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) """write the result into the summary""" agent.log_scalar("actor_loss", mpi_mean(epoch_actor_losses), epoch_episodes) agent.log_scalar("critic_loss", mpi_mean(epoch_critic_losses), epoch_episodes) agent.log_scalar("episode_score", mpi_mean(epoch_episode_rewards), epoch_episodes) agent.log_scalar("episode_steps", mpi_mean(epoch_episode_steps), epoch_episodes) agent.log_scalar("episode_average_reward", mpi_mean(epoch_episodes_average_reward), epoch_episodes) agent.log_scalar("episode_discount_score", mpi_mean(epoch_episodes_discount_reward), epoch_episodes) """Log stats.""" epoch_train_duration = time.time() - epoch_start_time stats = agent.get_stats() combined_stats = {} for key in sorted(stats.keys()): combined_stats[key] = mpi_mean(stats[key]) """Rollout statistics. compute the mean of the total nb_epoch_cycles""" combined_stats['rollout/return'] = mpi_mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = mpi_mean( np.mean(episode_rewards_history)) combined_stats['rollout/episode_steps'] = mpi_mean( epoch_episode_steps) combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes) combined_stats['rollout/actions_mean'] = mpi_mean(epoch_actions) combined_stats['rollout/actions_std'] = mpi_std(epoch_actions) combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs) """Train statistics""" combined_stats['train/loss_actor'] = mpi_mean(epoch_actor_losses) combined_stats['train/loss_critic'] = mpi_mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = mpi_mean( epoch_adaptive_distances) """save the model and the result""" saver.save(sess, model_directory + 'simulation_model') # re_rewards = pd.DataFrame(epoch_episode_rewards) # re_rewards.to_csv("re_rewards.csv", sep=',', header=False, index=False) re_forcement = pd.DataFrame(Force_moments) re_forcement.to_csv(model_directory + 'simulation_forcement', sep=',', header=False, index=False) # re_steps = pd.DataFrame(epoch_episode_steps) # re_steps.to_csv("re_steps.csv", sep=',', header=False, index=False) # nf = pd.read_csv("data.csv", sep=',', header=None) for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f)
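The peg-in-hole example decays its actor and critic step sizes with delay_rate = 10 ** (1 / nb_epochs), i.e. the learning rate is meant to shrink by a total factor of ten over the whole run. A compact sketch of that schedule with illustrative numbers (note the decay only compounds if it is applied to the previous epoch's value, as written here):

import numpy as np

nb_epochs = 20
actor_lr = 1e-4
delay_rate = np.power(10, 1.0 / nb_epochs)  # per-epoch decay factor; compounds to 10x over the run

lr = actor_lr
for epoch in range(nb_epochs):
    lr = lr / delay_rate  # divide the previous value so the decay accumulates
    # ... one epoch of training with learning rate `lr` ...
print(lr)  # roughly actor_lr / 10 after the final epoch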
def train_copos(env_id, compatible_policy, num_timesteps, timesteps_per_batch, seed, filepath, visualize, n_policy, retrace, trpo, entropy_bonus, epsilon, beta): import baselines.common.tf_util as U sess = U.single_threaded_session() sess.__enter__() rank = MPI.COMM_WORLD.Get_rank() if rank == 0: logger.configure(dir=filepath) else: logger.configure(format_strs=[]) logger.set_level(logger.DISABLED) workerseed = seed # + 10000 * MPI.COMM_WORLD.Get_rank() if compatible_policy: def policy_fn(name, ob_space, ac_space): return CompatibleMlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=32, num_hid_layers=2) else: assert (trpo) def policy_fn(name, ob_space, ac_space): return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=32, num_hid_layers=2) set_global_seeds(workerseed) env = gym.make(env_id) env.seed(workerseed) if beta < 0: nr_episodes = num_timesteps // timesteps_per_batch # Automatically compute beta based on initial entropy and number of iterations tmp_pi = policy_fn("tmp_pi", env.observation_space, env.action_space) sess.run(tf.global_variables_initializer()) tmp_ob = np.zeros((1, ) + env.observation_space.shape) entropy = sess.run(tmp_pi.pd.entropy(), feed_dict={tmp_pi.ob: tmp_ob}) beta = 2 * entropy / nr_episodes print("Initial entropy: " + str(entropy) + ", episodes: " + str(nr_episodes)) print("Automatically set beta: " + str(beta)) if visualize: # Load existing policy and visualize copos_mpi.visualize(env, policy_fn, timesteps_per_batch=timesteps_per_batch, epsilon=epsilon, beta=beta, cg_iters=10, cg_damping=0.1, max_timesteps=num_timesteps, gamma=0.99, lam=0.98, entcoeff=entropy_bonus, vf_iters=5, vf_stepsize=1e-3, TRPO=trpo, n_policy=n_policy, policy_type=1, filepath=filepath, session=sess, retrace=retrace) env.close() else: # Train policy and save it copos_mpi.learn(env, policy_fn, timesteps_per_batch=timesteps_per_batch, epsilon=epsilon, beta=beta, cg_iters=10, cg_damping=0.1, max_timesteps=num_timesteps, gamma=0.99, lam=0.98, entcoeff=entropy_bonus, vf_iters=5, vf_stepsize=1e-3, TRPO=trpo, n_policy=n_policy, policy_type=1, filepath=filepath, session=sess, retrace=retrace) env.close() saver = tf.train.Saver() saver.save(sess, filepath + "_final")
def train_copos(env_id, num_timesteps, seed, trial, hist_len, block_high, nsteps, method, hid_size, give_state, vf_iters): import baselines.common.tf_util as U sess = U.single_threaded_session() sess.__enter__() workerseed = seed * 10000 def policy_fn(name, ob_space, ac_space, ob_name): return CompatibleMlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=hid_size, num_hid_layers=2, ob_name=ob_name) set_global_seeds(workerseed) env = make_control_env(env_id, seed, hist_len=hist_len, block_high=block_high, version0=False, give_state=give_state) env.seed(workerseed) timesteps_per_batch = nsteps beta = -1 if beta < 0: nr_episodes = num_timesteps // timesteps_per_batch # Automatically compute beta based on initial entropy and number of iterations tmp_pi = policy_fn("tmp_pi", env.observation_space, env.action_space, ob_name="tmp_ob") sess.run(tf.global_variables_initializer()) tmp_ob = np.zeros((1, ) + env.observation_space.shape) entropy = sess.run(tmp_pi.pd.entropy(), feed_dict={tmp_pi.ob: tmp_ob}) beta = 2 * entropy / nr_episodes print("Initial entropy: " + str(entropy) + ", episodes: " + str(nr_episodes)) print("Automatically set beta: " + str(beta)) copos_mpi.learn(env, policy_fn, timesteps_per_batch=timesteps_per_batch, epsilon=0.01, beta=beta, cg_iters=10, cg_damping=0.1, method=method, max_timesteps=num_timesteps, gamma=0.99, lam=0.98, vf_iters=vf_iters, vf_stepsize=1e-3, trial=trial, crosskl_coeff=0.01, kl_target=0.01, sess=sess) env.close()
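Both COPOS launchers above derive the entropy-bonus coefficient automatically when beta < 0: they evaluate the entropy of a freshly initialized policy on a zero observation and spread twice that entropy budget over the planned number of batches. A small worked example of the arithmetic (the concrete numbers are illustrative, not taken from either script):

num_timesteps = 1_000_000
timesteps_per_batch = 2048
nr_episodes = num_timesteps // timesteps_per_batch  # 488 policy updates

initial_entropy = 2.84                    # e.g. entropy of an untrained diagonal Gaussian policy
beta = 2 * initial_entropy / nr_episodes  # ~0.0116 entropy reduction allowed per update
print(nr_episodes, beta)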
from baselines.trpo_mpi import trpo_mpi from baselines.ppo1.mlp_policy import MlpPolicy from baselines import logger import gym import tensorflow as tf import argparse import baselines.common.tf_util as U from baselines.common import set_global_seeds from mpi4py import MPI # parser parser = argparse.ArgumentParser() parser.add_argument('--environment', dest='environment', type=str, default='MountainCarContinuous-v0') parser.add_argument('--num_timesteps', dest='num_timesteps', type=int, default=10000) parser.add_argument('--seed', help='RNG seed', type=int, default=0) args = parser.parse_args() sess = U.single_threaded_session() sess.__enter__() rank = MPI.COMM_WORLD.Get_rank() if rank != 0: logger.set_level(logger.DISABLED) workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank() set_global_seeds(workerseed) # create the environment env = gym.make(str(args.environment)) # initial_observation = env.reset() def policy_fn(name, ob_space, ac_space): return MlpPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space, hid_size=32, num_hid_layers=2)
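The fragment above stops after defining policy_fn. A plausible continuation that actually starts training, using the same trpo_mpi.learn keyword arguments seen in the other TRPO examples in this collection (the specific hyperparameter values are assumptions, not taken from the original script):

env.seed(workerseed)
trpo_mpi.learn(env, policy_fn,
               timesteps_per_batch=1024,
               max_kl=0.01,
               cg_iters=10,
               cg_damping=0.1,
               max_timesteps=args.num_timesteps,
               gamma=0.99,
               lam=0.98,
               vf_iters=5,
               vf_stepsize=1e-3)
env.close()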
def learn(*, network, env, total_timesteps, num_cpu, allow_run_as_root, seed=None, eval_env=None, replay_strategy='future', save_interval=5, clip_return=True, demo_file=None, override_params=None, load_path=None, save_path=None, **kwargs): rank = MPI.COMM_WORLD.Get_rank() logger.info('before mpi_fork: rank', rank, 'num_cpu', MPI.COMM_WORLD.Get_size()) if num_cpu > 1: if allow_run_as_root: whoami = mpi_fork_run_as_root(num_cpu) else: whoami = mpi_fork(num_cpu) if whoami == 'parent': logger.info('parent exiting with code 0...') sys.exit(0) U.single_threaded_session().__enter__() rank = MPI.COMM_WORLD.Get_rank() num_cpu = MPI.COMM_WORLD.Get_size() logger.info('after mpi_fork: rank', rank, 'num_cpu', num_cpu) override_params = override_params or {} # Seed everything. rank_seed = seed + 1000000 * rank if seed is not None else None set_global_seeds(rank_seed) # Prepare params. params = config.DEFAULT_PARAMS env_name = env.spec.id params['env_name'] = env_name params['replay_strategy'] = replay_strategy if env_name in config.DEFAULT_ENV_PARAMS: params.update(config.DEFAULT_ENV_PARAMS[env_name] ) # merge env-specific parameters in params.update( **override_params) # makes it possible to override any parameter params['rollout_batch_size'] = env.num_envs params['num_cpu'] = num_cpu with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f: json.dump(params, f) params = config.prepare_params(params) if demo_file is not None: params['bc_loss'] = 1 params.update(kwargs) config.log_params(params, logger=logger) if num_cpu == 1: logger.warn() logger.warn('*** Warning ***') logger.warn( 'You are running HER with just a single MPI worker. This will work, but the ' + 'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' + 'were obtained with --num_cpu 19. This makes a significant difference and if you ' + 'are looking to reproduce those results, be aware of this. Please also refer to ' + 'https://github.com/openai/baselines/issues/314 for further details.' ) logger.warn('****************') logger.warn() dims = config.configure_dims(params) policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return) if load_path is not None: tf_util.load_variables(load_path) rollout_params = { 'exploit': False, 'use_target_net': False, 'use_demo_states': True, 'compute_Q': False, 'T': params['T'], } eval_params = { 'exploit': True, 'use_target_net': params['test_with_polyak'], 'use_demo_states': False, 'compute_Q': True, 'T': params['T'], } for name in [ 'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps' ]: rollout_params[name] = params[name] eval_params[name] = params[name] eval_env = eval_env or env rollout_worker = RolloutWorker(env, policy, dims, logger, monitor=True, **rollout_params) evaluator = RolloutWorker(eval_env, policy, dims, logger, **eval_params) n_cycles = params['n_cycles'] n_epochs = total_timesteps // n_cycles // rollout_worker.T // rollout_worker.rollout_batch_size logger.info("actual total timesteps : {}".format( n_epochs * n_cycles * rollout_worker.T * rollout_worker.rollout_batch_size)) return train(save_path=save_path, policy=policy, rollout_worker=rollout_worker, evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'], n_cycles=params['n_cycles'], n_batches=params['n_batches'], save_interval=save_interval, demo_file=demo_file)
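The epoch count in the HER learn above comes from repeated integer division, so the "actual total timesteps" it logs can be smaller than the requested total_timesteps. A quick worked example with illustrative Fetch-style settings:

total_timesteps = 512_345
n_cycles = 50           # params['n_cycles']
T = 50                  # episode horizon, rollout_worker.T
rollout_batch_size = 2  # envs per worker

n_epochs = total_timesteps // n_cycles // T // rollout_batch_size  # 102
actual = n_epochs * n_cycles * T * rollout_batch_size              # 510_000, slightly below the request
print(n_epochs, actual)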
def train(env, nb_epochs, nb_episodes, nb_epoch_cycles, episode_length, nb_train_steps, eval_freq, save_freq, nb_eval_episodes, actor, critic, memory, gamma, normalize_returns, normalize_observations, critic_l2_reg, action_noise, param_noise, popart, clip_norm, batch_size, reward_scale, action_repeat, full, exclude_centering_frame, visualize, fail_reward, num_processes, num_processes_to_wait, num_testing_processes, learning_session, min_buffer_length, integrator_accuracy=5e-5, max_env_traj=100, tau=0.01): """ Parameters ---------- nb_epochs : the number of epochs to train. nb_episodes : the number of episodes for each epoch. episode_length : the maximum number of steps for each episode. gamma : discount factor. tau : soft update coefficient. clip_norm : clip on the norm of the gradient. """ assert action_repeat > 0 assert nb_episodes >= num_processes # Get params from learning session checkpoint_dir = learning_session.checkpoint_dir log_dir = learning_session.log_dir training_step = learning_session.last_training_step # Initialize DDPG agent (target network and replay buffer) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=None, critic_l2_reg=critic_l2_reg, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale, training_step=training_step) # We need max_action because the NN output layer is a tanh. # So we must scale it back. max_action = env.action_space.high # Build Workers events = [Event() for _ in range(num_processes)] inputQs = [Queue() for _ in range(num_processes)] outputQ = Queue() # Split work among workers nb_episodes_per_worker = nb_episodes // num_processes workers = [ SamplingWorker(i, actor, critic, episode_length, nb_episodes_per_worker, action_repeat, max_action, gamma, tau, normalize_returns, batch_size, normalize_observations, param_noise, critic_l2_reg, popart, clip_norm, reward_scale, events[i], inputQs[i], outputQ, full, exclude_centering_frame, integrator_accuracy, max_env_traj, visualize, fail_reward) for i in range(num_processes) ] # Run the Workers for w in workers: w.start() # Create Round Robin tester tester = RoundRobinTester( num_testing_processes, actor, critic, episode_length, nb_eval_episodes, action_repeat, max_action, gamma, tau, normalize_returns, batch_size, normalize_observations, critic_l2_reg, popart, clip_norm, reward_scale, full, exclude_centering_frame, integrator_accuracy, max_env_traj, visualize, fail_reward) # Start training loop with U.single_threaded_session() as sess: agent.initialize(sess) writer = tf.summary.FileWriter(log_dir) writer.add_graph(sess.graph) # Initialize writer and statistics stats = EvaluationStatistics(tf_session=sess, tf_writer=writer) # setup saver saver = tf.train.Saver(max_to_keep=10, keep_checkpoint_every_n_hours=2) get_parameters = U.GetFlat(actor.trainable_vars) global_step = 0 obs = env.reset() agent.reset() # Processes waiting for a new sampling task waiting_indices = [i for i in range(num_processes)] for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # If we have sampling workers waiting, dispatch a sampling job if waiting_indices: actor_ws = get_parameters() # Run parallel sampling for i in waiting_indices: inputQs[i].put(('sample', actor_ws)) events[i].set() # Notify worker: sample baby, sample! 
waiting_indices.clear() # Collect results when ready for i in range(num_processes_to_wait): process_index, transitions = outputQ.get() waiting_indices.append(process_index) print('Collecting transition samples from Worker {}/{}'. format(i + 1, num_processes_to_wait)) for t in transitions: agent.store_transition(*t) # try to collect other samples if available for i in range(num_processes): try: process_index, transitions = outputQ.get_nowait() if process_index not in waiting_indices: waiting_indices.append(process_index) print('Collecting transition samples from Worker {}'. format(process_index)) for t in transitions: agent.store_transition(*t) except queue.Empty: # No sampling ready, keep on training. pass # Training phase if agent.memory.nb_entries > min_buffer_length: for _ in range(nb_train_steps): critic_loss, actor_loss = agent.train() agent.update_target_net() # Plot statistics stats.add_critic_loss(critic_loss, global_step) stats.add_actor_loss(actor_loss, global_step) global_step += 1 # Evaluation phase if cycle % eval_freq == 0: print("Cycle number: ", cycle + epoch * nb_epoch_cycles) print("Sending testing job...") actor_ws = get_parameters() # Send a testing job tester.test(actor_ws, global_step) # Print stats (if any) tester.log_stats(stats, logger) if cycle % save_freq == 0: # Save weights save_path = saver.save(sess, checkpoint_dir, global_step=global_step) print("Model saved in path: %s" % save_path) # Dump learning session learning_session.dump(agent.training_step) print("Learning session dumped to: %s" % str(learning_session.session_path)) else: print("Not enough entry in memory buffer") # Stop workers for i in range(num_processes): inputQs[i].put(('exit', None)) events[i].set() # Notify worker: exit! tester.close() # Stop testing workers env.close()
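The dispatch loop above blocks until num_processes_to_wait workers report back and then opportunistically drains any additional finished workers with get_nowait, swallowing queue.Empty so training never stalls on slow samplers. The same non-blocking drain pattern in isolation (drain_ready is an illustrative helper, not part of the original code):

import queue

def drain_ready(output_q, max_items):
    """Collect at most max_items results that are already available, without blocking."""
    results = []
    for _ in range(max_items):
        try:
            results.append(output_q.get_nowait())
        except queue.Empty:
            break  # nothing else ready; keep training
    return results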
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50, gamma_reward_shaping=0.1, start_reward_shaping=10000): logger.info(sys._getframe().f_code.co_name) rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high).all() max_action = env.action_space.high logger.info("scale actions by {} before executing in env".format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info("Using agent with the following configuration:") logger.info(str(agent.__dict__.items())) if rank == 0: saver = tf.train.Saver() else: saver = None step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) with U.single_threaded_session() as sess: agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 episode_sample = [] for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): for t_rollout in range(nb_eval_steps): action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape if rank == 0 and render: env.render() assert max_action.shape == action.shape new_obs, r, done, info = env.step(max_action * action) t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 epoch_actions.append(action) epoch_qs.append(q) episode_sample.append((obs, action, r, new_obs, done)) if t <= start_reward_shaping: agent.store_transition(obs, action, r, new_obs, done) if done: if t > start_reward_shaping: logger.info("start reward shaping") reward = r agent.store_transition(obs, action, reward, new_obs, done) # episode_sample.append() for i in range(len(episode_sample) - 1): obs_tmp, action_tmp, rew_tmp, new_obs_tmp, done_tmp = \ episode_sample[len(episode_sample) - i - 1] reward = round(reward * gamma_reward_shaping, 5) reward = reward + rew_tmp agent.store_transition(obs_tmp, action_tmp, reward, new_obs_tmp, done) epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. 
episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs = env.reset() obs = new_obs epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_episode_reward = 0. for t_rollout in range(nb_eval_steps): eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append(eval_episode_reward) eval_episode_reward = 0. mpi_size = MPI.COMM_WORLD.Get_size() duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats["rollout/return"] = np.mean(epoch_episode_rewards) combined_stats["rollout/return_history"] = np.mean(episode_rewards_history) combined_stats["rollout/episode_steps"] = np.mean(epoch_episode_steps) combined_stats["rollout/actions_mean"] = np.mean(epoch_actions) combined_stats["rollout/Q_mean"] = np.mean(epoch_qs) combined_stats["train/loss_actor"] = np.mean(epoch_actor_losses) combined_stats["train/loss_critic"] = np.mean(epoch_critic_losses) combined_stats["train/param_noise_distance"] = np.mean(epoch_adaptive_distances) combined_stats["total/duration"] = duration combined_stats["total/steps_per_second"] = float(t) / float(duration) combined_stats["total/episodes"] = episodes combined_stats["rollout/episodes"] = epoch_episodes combined_stats["rollour/actions_std"] = np.std(epoch_actions) if eval_env is not None: combined_stats["eval/return"] = eval_episode_rewards combined_stats["eval/return_history"] = np.mean(eval_episode_rewards_history) combined_stats["eval/Q"] = eval_qs combined_stats["eval/episodes"] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError("expected scalar, got %s" % x) combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()])) combined_stats = {k : v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums)} combined_stats["total/epochs"] = epoch + 1 combined_stats["total/steps"] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info("") logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, "get_state"): with open(os.path.join(logdir, "env_state.pkl"), "wb") as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, "get_state"): with open(os.path.join(logdir, "eval_env_state.pkl"), "wb") as f: pickle.dump(eval_env.get_state(), f)
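The episode-end branch guarded by start_reward_shaping in the snippet above walks the stored episode backwards and folds a decayed running sum of later rewards into each earlier transition, so the replay buffer effectively receives a discounted return-to-go instead of the raw per-step reward. A slightly simplified sketch of that backward pass (store_fn stands in for agent.store_transition; the original also re-stores the terminal step inside the loop):

def store_shaped_episode(episode, gamma_shaping, store_fn):
    """episode: list of (obs, action, reward, new_obs, done) tuples in time order."""
    obs, action, reward, new_obs, done = episode[-1]
    store_fn(obs, action, reward, new_obs, done)       # terminal transition kept with its raw reward
    shaped = reward
    for obs, action, r, new_obs, done in reversed(episode[:-1]):
        shaped = round(shaped * gamma_shaping, 5) + r  # decayed sum of later rewards plus this step's reward
        store_fn(obs, action, shaped, new_obs, done)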
def run(self): """Override Process.run()""" # Create environment env = create_environment( action_repeat=self.action_repeat, full=self.full, exclude_centering_frame=self.exclude_centering_frame, visualize=self.visualize, fail_reward=self.fail_reward, integrator_accuracy=self.integrator_accuracy) nb_actions = env.action_space.shape[-1] # keep tracks of the number of trajectory done num_traj = 0 env.seed(os.getpid()) set_global_seeds(os.getpid()) # Create OU Noise action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=0.2, theta=0.1) # Allocate ReplayBuffer memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) # Create DPPG agent agent = DDPG(self.actor, self.critic, memory, env.observation_space.shape, env.action_space.shape, gamma=self.gamma, tau=self.tau, normalize_returns=self.normalize_returns, normalize_observations=self.normalize_observations, batch_size=self.batch_size, action_noise=action_noise, param_noise=self.param_noise, critic_l2_reg=self.critic_l2_reg, enable_popart=self.popart, clip_norm=self.clip_norm, reward_scale=self.reward_scale) # Build the sampling logic fn sampling_fn = make_sampling_fn(agent, env, self.episode_length, self.action_repeat, self.max_action, self.nb_episodes, self.action_noise_prob) # Start TF session with U.single_threaded_session() as sess: agent.initialize(sess) set_parameters = U.SetFromFlat(self.actor.trainable_vars) # Start sampling-worker loop. while True: # self.event.wait() # Wait for a new message # self.event.clear() # Upon message receipt, mark as read message, actor_ws = self.inputQ.get() # Pop message if message == 'sample': # Set weights set_parameters(actor_ws) # Do sampling transitions = sampling_fn() self.outputQ.put((self.process_index, transitions)) # update number of trajectories num_traj += self.nb_episodes # restore environment if needed if num_traj >= self.max_env_traj: env.restore() num_traj = 0 elif message == 'exit': print('[Worker {}] Exiting...'.format(os.getpid())) env.close() break
def main(port, id, baud): # use fixed random state rand_state = np.random.RandomState(1).get_state() np.random.set_state(rand_state) tf_set_seeds(np.random.randint(1, 2**31 - 1)) # Create DXL Reacher1D environment env = DxlReacher1DEnv(setup='dxl_gripper_default', dxl_dev_path=port, idn=id, baudrate=baud, obs_history=1, dt=0.04, gripper_dt=0.01, rllab_box=False, episode_length_step=None, episode_length_time=2, max_torque_mag=100, control_type='torque', target_type='position', reset_type='zero', reward_type='linear', use_ctypes_driver=True, random_state=rand_state ) # The outputs of the policy function are sampled from a Gaussian. However, the actions in terms of torque # commands are in the range [-max_torque_mag, max_torque_mag]. NormalizedEnv wrapper scales action accordingly. # By default, it does not normalize observations or rewards. env = NormalizedEnv(env) # Start environment processes env.start() # Create baselines TRPO policy function sess = U.single_threaded_session() sess.__enter__() def policy_fn(name, ob_space, ac_space): return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=32, num_hid_layers=2) # create and start plotting process plot_running = Value('i', 1) shared_returns = Manager().dict({"write_lock": False, "episodic_returns": [], "episodic_lengths": [], }) # Plotting process pp = Process(target=plot_dxl_reacher, args=(env, 2048, shared_returns, plot_running)) pp.start() # Create callback function for logging data from baselines TRPO learn kindred_callback = create_callback(shared_returns) # Train baselines TRPO learn(env, policy_fn, max_timesteps=50000, timesteps_per_batch=2048, max_kl=0.05, cg_iters=10, cg_damping=0.1, vf_iters=5, vf_stepsize=0.001, gamma=0.995, lam=0.995, callback=kindred_callback, ) # Safely terminate plotter process plot_running.value = 0 # shutdown ploting process time.sleep(2) pp.join() # Shutdown the environment env.close()
def launch(env_name, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return, override_params={}, save_policies=True): # Fork for multi-CPU MPI implementation. if num_cpu > 1: whoami = mpi_fork(num_cpu) if whoami == 'parent': sys.exit(0) import baselines.common.tf_util as U U.single_threaded_session().__enter__() rank = MPI.COMM_WORLD.Get_rank() # Configure logging if rank == 0: if logdir or logger.get_dir() is None: logger.configure(dir=logdir) else: logger.configure() logdir = logger.get_dir() assert logdir is not None os.makedirs(logdir, exist_ok=True) # Seed everything. rank_seed = seed + 1000000 * rank set_global_seeds(rank_seed) # Prepare params. params = config.DEFAULT_PARAMS params['env_name'] = env_name params['replay_strategy'] = replay_strategy if env_name in config.DEFAULT_ENV_PARAMS: params.update(config.DEFAULT_ENV_PARAMS[env_name] ) # merge env-specific parameters in params.update( **override_params) # makes it possible to override any parameter with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f: json.dump(params, f) params = config.prepare_params(params) config.log_params(params, logger=logger) dims = config.configure_dims(params) policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return) rollout_params = { 'exploit': False, 'use_target_net': False, 'use_demo_states': True, 'compute_Q': False, 'T': params['T'], } eval_params = { 'exploit': True, 'use_target_net': params['test_with_polyak'], 'use_demo_states': False, 'compute_Q': True, 'T': params['T'], } for name in [ 'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps' ]: rollout_params[name] = params[name] eval_params[name] = params[name] rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params) rollout_worker.seed(rank_seed) evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params) evaluator.seed(rank_seed) train(logdir=logdir, policy=policy, rollout_worker=rollout_worker, evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'], n_cycles=params['n_cycles'], n_batches=params['n_batches'], policy_save_interval=policy_save_interval, save_policies=save_policies)
def main(): with U.single_threaded_session() as sess: batch_size = 64 current_noise_type = 'adaptive-param_0.2' _, stddev = current_noise_type.split('_') param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev)) param_noise_adaption_interval = 2 env = gym.make("Pendulum-v0") nb_actions = env.action_space.shape[-1] layer_norm = True # Configure components. memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) critic = Critic(layer_norm=layer_norm) actor = Actor(nb_actions, layer_norm=layer_norm) # Seed everything to make things reproducible. seed = int(1000000 * np.random.rand()) logger.info('seed={}, logdir={}'.format(seed, logger.get_dir())) tf.set_random_seed(seed) np.random.seed(seed) random.seed(seed) env.seed(seed) max_action = env.action_space.high logger.info('scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, batch_size=batch_size, param_noise=param_noise) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() for t in itertools.count(): episode_rewards = [] done = False while not done: env.render() # Take action and update exploration to the newest value action, q = agent.pi(obs, apply_noise=True, compute_Q=True) new_obs, rew, done, _ = env.step(max_action * action) # Book-keeping. agent.store_transition(obs, action, rew, new_obs, done) obs = new_obs episode_rewards.append(rew) if done: agent.reset() obs = env.reset() nb_train_steps = 100 epoch_adaptive_distances = [] epoch_critic_losses = [] epoch_actor_losses = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() if t % 10 == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", len(episode_rewards)) logger.record_tabular("mean episode reward", round(np.mean(episode_rewards), 1)) logger.record_tabular('train/loss_actor', round(np.mean(epoch_actor_losses))) logger.record_tabular('train/loss_critic', round(np.mean(epoch_critic_losses))) logger.record_tabular('train/param_noise_distance', round(np.mean(epoch_adaptive_distances))) logger.dump_tabular()
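The Pendulum example above adapts parameter-space noise every param_noise_adaption_interval training steps via agent.adapt_param_noise(), which measures how far the perturbed policy's actions drift from the unperturbed ones and feeds that distance back into the noise spec. A minimal sketch of that feedback rule, paraphrased rather than copied from baselines' AdaptiveParamNoiseSpec, so treat the details as an approximation:

class SimpleAdaptiveNoise:
    """Grow parameter noise while action drift stays below target, shrink it otherwise."""

    def __init__(self, initial_stddev=0.1, desired_action_stddev=0.2, coefficient=1.01):
        self.current_stddev = initial_stddev
        self.desired_action_stddev = desired_action_stddev
        self.coefficient = coefficient

    def adapt(self, distance):
        if distance > self.desired_action_stddev:
            self.current_stddev /= self.coefficient  # actions drifted too far: reduce noise
        else:
            self.current_stddev *= self.coefficient  # drift too small: inject more noise
        return self.current_stddev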
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50): rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high ).all() # we assume symmetric actions. max_action = env.action_space.high logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver() else: saver = None step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) with U.single_threaded_session() as sess: # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] mean_episode_rewards = [] # mean_100_episode_rewards = [] epoch_episodes = 0 for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape new_obs, r, done, info = env.step( max_action * action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: # Episode done. epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 mean_episode_rewards.append( np.mean(episode_rewards_history)) if episodes == 500: print( "epoch_episode_rewards*************************************" ) print(epoch_episode_rewards) print( "mean_episode_rewards*************************************" ) print(mean_episode_rewards) return agent.reset() obs = env.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. 
if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_episode_reward = 0. for t_rollout in range(nb_eval_steps): eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step( max_action * eval_action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append( eval_episode_reward) eval_episode_reward = 0. mpi_size = MPI.COMM_WORLD.Get_size() # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean( episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean( epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean( epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float( duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = eval_episode_rewards combined_stats['eval/return_history'] = np.mean( eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s' % x) combined_stats_sums = MPI.COMM_WORLD.allreduce( np.array([as_scalar(x) for x in combined_stats.values()])) combined_stats = { k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums) } # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f)
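The logging block at the end of this training loop reduces every scalar across MPI workers: each rank contributes its local value, MPI.COMM_WORLD.allreduce sums the stacked array elementwise, and dividing by the world size yields the mean that is actually recorded. A self-contained sketch of that pattern (run it under mpirun with mpi4py installed; the stat names are illustrative):

import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank, size = comm.Get_rank(), comm.Get_size()

# Each worker's local scalars, in a fixed key order so the reduced array lines up across ranks.
local_stats = {'rollout/return': 10.0 * rank, 'train/loss_actor': 0.5 + rank}
keys = sorted(local_stats.keys())

sums = comm.allreduce(np.array([local_stats[k] for k in keys]))  # default op is SUM
means = {k: v / size for k, v in zip(keys, sums)}
if rank == 0:
    print(means)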
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50, save_path=None, restore_path=None, hindsight_mode=None): rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high ).all() # we assume symmetric actions. max_action = env.action_space.high logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver() else: saver = None step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) with U.single_threaded_session() as sess: # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. transitions = [] for t_rollout in range(nb_rollout_steps): # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape new_obs, r, done, info = env.step( max_action * action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) transitions.append((obs, action, r, new_obs, done)) #agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: # Episode done. epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs = env.reset() # store regular transitions into replay memory for (obs, action, r, new_obs, done) in transitions: agent.store_transition(obs, action, r, new_obs, done) if hindsight_mode in ['final', 'future']: for (obs, action, r, new_obs, done) in replay_final(transitions, env.env): agent.store_transition(obs, action, r, new_obs, done) if hindsight_mode in ['future']: for (obs, action, r, new_obs, done) in replay_future(transitions, env.env): agent.store_transition(obs, action, r, new_obs, done) # store hindsight transitions. 
'''for i in range(3): # sample a random point in the trajectory idx = np.random.randint(0, len(transitions)) obs, action, r, new_obs, done = transitions[idx] # create a goal from that point goal = env.env.obs_to_goal(new_obs) for (obs, action, r, new_obs, done) in replay_with_goal(transitions[:idx+1], goal, env.env): agent.store_transition(obs, action, r, new_obs, done) obs, action, r, new_obs, done = transitions[-1] # store a "final" transition. goal = env.env.obs_to_goal(new_obs) for (obs, action, r, new_obs, done) in replay_with_goal(transitions, goal, env.env): agent.store_transition(obs, action, r, new_obs, done)''' # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_episode_reward = 0. for t_rollout in range(nb_eval_steps): eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step( max_action * eval_action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append( eval_episode_reward) eval_episode_reward = 0. # Log stats. epoch_train_duration = time.time() - epoch_start_time duration = time.time() - start_time stats = agent.get_stats() combined_stats = {} for key in sorted(stats.keys()): combined_stats[key] = mpi_mean(stats[key]) # Rollout statistics. combined_stats['reward'] = mpi_mean(epoch_episode_rewards) # combined_stats['rollout/return_history'] = mpi_mean(np.mean(episode_rewards_history)) combined_stats['episode_steps'] = mpi_mean(epoch_episode_steps) combined_stats['episodes'] = mpi_sum(epoch_episodes) # combined_stats['actions_mean'] = mpi_mean(epoch_actions) combined_stats['actions_std'] = mpi_std(epoch_actions) combined_stats['Q_mean'] = mpi_mean(epoch_qs) # Train statistics. combined_stats['policy_loss'] = mpi_mean(epoch_actor_losses) combined_stats['value_loss'] = mpi_mean(epoch_critic_losses) combined_stats['param_noise_distance'] = mpi_mean( epoch_adaptive_distances) # Evaluation statistics. if eval_env is not None: combined_stats['eval/reward'] = mpi_mean(eval_episode_rewards) # combined_stats['eval/return_history'] = mpi_mean(np.mean(eval_episode_rewards_history)) combined_stats['eval/Q_mean'] = mpi_mean(eval_qs) # combined_stats['eval/episodes'] = mpi_mean(len(eval_episode_rewards)) # Total statistics. 
# combined_stats['total/duration'] = mpi_mean(duration) combined_stats['total/steps_per_second'] = mpi_mean( float(t) / float(duration)) # combined_stats['total/episodes'] = mpi_mean(episodes) # combined_stats['total/epochs'] = epoch + 1 # combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f)
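The hindsight_mode branches above relabel each finished episode with goals it actually achieved: 'final' uses the state reached at the end of the trajectory, 'future' additionally samples later states from the same trajectory. replay_final and replay_future are not shown in this snippet; the following is only a sketch of the 'final' strategy, where obs_to_goal, set_goal, and recompute_reward are hypothetical helpers standing in for whatever the environment provides:

def replay_final_sketch(transitions, obs_to_goal, set_goal, recompute_reward):
    """Relabel every transition in an episode with the goal actually reached at the end."""
    _, _, _, last_obs, _ = transitions[-1]
    achieved_goal = obs_to_goal(last_obs)                # hypothetical: extract the achieved goal from an observation
    for obs, action, _, new_obs, done in transitions:
        relabeled_obs = set_goal(obs, achieved_goal)     # hypothetical: overwrite the goal slice of the observation
        relabeled_new = set_goal(new_obs, achieved_goal)
        r = recompute_reward(relabeled_new, achieved_goal)
        yield (relabeled_obs, action, r, relabeled_new, done)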
from baselines.common import set_global_seeds, tf_util as U
from baselines.agent.utility.general_utils import get_ee_points, get_position
from baselines.ppo1.mlp_policy import MlpPolicy
from baselines.common.mpi_fork import mpi_fork
from baselines.trpo_mpi import trpo_mpi
import baselines.common.tf_util as U

env = gym.make('GazeboModularScara3DOF-v3')
initial_observation = env.reset()
print("Initial observation: ", initial_observation)
env.render()

seed = 0
sess = U.single_threaded_session()
sess.__enter__()
rank = MPI.COMM_WORLD.Get_rank()
if rank != 0:
    logger.set_level(logger.DISABLED)
workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
set_global_seeds(workerseed)

def policy_fn(name, ob_space, ac_space):
    return MlpPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space,
                     hid_size=32, num_hid_layers=2)
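The fragment above ends after defining `policy_fn` and never starts training. A hedged sketch of how the run could continue with the already-imported `trpo_mpi`; the hyperparameter values below are illustrative, not taken from the original script.

env.seed(workerseed)
trpo_mpi.learn(env, policy_fn,
               timesteps_per_batch=1024,  # illustrative value
               max_kl=0.01,
               cg_iters=10,
               cg_damping=0.1,
               max_timesteps=1000000,     # illustrative value
               gamma=0.99,
               lam=0.98,
               vf_iters=5,
               vf_stepsize=1e-3)
env.close()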
def main():
    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create UR5 Reacher2D environment
    env = ReacherEnv(
        setup="UR5_default",
        host=None,
        dof=2,
        control_type="velocity",
        target_type="position",
        reset_type="zero",
        reward_type="precision",
        derivative_type="none",
        deriv_action_max=5,
        first_deriv_max=2,
        accel_max=1.4,
        speed_max=0.3,
        speedj_a=1.4,
        episode_length_time=4.0,
        episode_length_step=None,
        actuation_sync_period=1,
        dt=0.04,
        run_mode="multiprocess",
        rllab_box=False,
        movej_t=2.0,
        delay=0.0,
        random_state=rand_state
    )
    env = NormalizedEnv(env)
    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Spawn plotting process
    pp = Process(target=plot_ur5_reacher, args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(env, policy_fn,
          max_timesteps=150000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    env.close()
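The `create_callback` helper used above is defined elsewhere in the original script. As a rough, hypothetical sketch (assuming baselines TRPO invokes the callback as `callback(locals(), globals())` once per iteration, which is how `trpo_mpi.learn` uses its `callback` argument), it could copy the per-episode returns gathered in the rollout segment into the shared dict:

def create_callback(shared_returns):
    def kindred_callback(locals_, globals_):
        # 'seg' is the rollout segment produced by baselines' trajectory generator;
        # it is not defined yet on the very first callback invocation.
        seg = locals_.get('seg')
        if seg is None or shared_returns is None:
            return
        shared_returns['write_lock'] = True
        shared_returns['episodic_returns'] = shared_returns['episodic_returns'] + list(seg['ep_rets'])
        shared_returns['episodic_lengths'] = shared_returns['episodic_lengths'] + list(seg['ep_lens'])
        shared_returns['write_lock'] = False
    return kindred_callback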
def launch(env, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval,
           clip_return, fb, override_params={}, save_policies=True):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        try:
            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
        except CalledProcessError:
            # fancy version of mpi call failed, try simple version
            whoami = mpi_fork(num_cpu)

        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir, fb=fb)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env
    params['replay_strategy'] = replay_strategy
    if env in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this. Please also refer to ' +
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps',
                 'sg_regenerate', 'goals_noise_eps', 'goals_random_eps',
                 'n_subgoals', 'n_steps_per_subgoal']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir, policy=policy, rollout_worker=rollout_worker,
          evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'], n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval, save_policies=save_policies)
def train(env_id, num_timesteps, timesteps_per_batch, seed, num_cpu, resume,
          agentName, logdir, hid_size, num_hid_layers, noisy_nets, clip_param,
          entcoeff, optim_epochs, optim_batchsize, optim_stepsize, optim_schedule,
          desired_kl, gamma, lam, portnum, num_parallel):
    from baselines.ppo1 import mlp_policy, pposgd_parallel
    print("num cpu = " + str(num_cpu))
    if (num_cpu > 1) and (num_parallel > 1):
        print("num_cpu > 1 and num_parallel > 1 can't be used together at the moment!")
        exit(0)

    whoami = mpi_fork(num_cpu)
    if whoami == "parent":
        return
    rank = MPI.COMM_WORLD.Get_rank()

    sess = U.single_threaded_session()
    sess.__enter__()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    utils.portnum = portnum + rank
    workerseed = seed + 10000 * rank

    if utils.server_list != "":
        servers = utils.server_list.split(",")
        num_thread = utils.num_thread_list.split(",")
        tmp = 0
        a = 0
        snum = -1
        num_total = 0
        for t in num_thread:
            num_total += int(t)
        for t in num_thread:
            if rank < tmp + int(t):
                snum = a
                break
            tmp += int(t)
            a += 1
        if num_total != num_cpu:
            print("Sum of num_thread_list must be equal to num_cpu")
            quit()
        print("Connect to tcp://" + servers[snum] + ":" + str(utils.portnum))
        utils.server_ip = servers[snum]

    set_global_seeds(workerseed)
    if num_parallel > 1:
        env = CustomParallelEnv(num_parallel)
    else:
        env = gym.make(env_id)
        env.seed(seed)

    if logger.get_dir():
        if num_parallel <= 1:
            env = bench.Monitor(env, osp.join(logger.get_dir(), "monitor.json"))

    def policy_fn(name, ob_space, ac_space, noisy_nets=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=hid_size, num_hid_layers=num_hid_layers,
                                    noisy_nets=noisy_nets)

    gym.logger.setLevel(logging.WARN)

    pposgd_parallel.learn(env, policy_fn,
                          max_timesteps=num_timesteps,
                          timesteps_per_batch=timesteps_per_batch,
                          clip_param=clip_param,
                          entcoeff=entcoeff,
                          optim_epochs=optim_epochs,
                          optim_stepsize=optim_stepsize,
                          optim_batchsize=optim_batchsize,
                          schedule=optim_schedule,
                          desired_kl=desired_kl,
                          gamma=gamma,
                          lam=lam,
                          resume=resume,
                          noisy_nets=noisy_nets,
                          agentName=agentName,
                          logdir=logdir,
                          num_parallel=num_parallel,
                          num_cpu=num_cpu)
    if num_parallel <= 1:
        env.close()
def main(env_name, seed, run_num, data_saving_path, batch_size_per_process, num_iterations,
         autoencoder_base="./novelty_data/local/autoencoders/"):
    num_processes = MPI.COMM_WORLD.Get_size()
    num_timesteps_per_process = batch_size_per_process
    num_iterations_enforce = num_iterations

    import baselines.common.tf_util as U
    comm = MPI.COMM_WORLD
    mpi_rank = comm.Get_rank()

    tf.reset_default_graph()
    with U.single_threaded_session() as sess:
        autoencoder_list = []
        for i in range(run_num):
            autoencoder_model = load_model(autoencoder_base + env_name + '_autoencoder_seed_' +
                                           str(seed) + '_run_' + str(i) + '.h5')
            autoencoder_list.append(autoencoder_model)
        U.ALREADY_INITIALIZED.update(set(tf.global_variables()))

        logger.reset()
        # logger.configure(
        #     '../data/ppo_' + enforce_env_name + '_autoencoder_' + str(len(autoencoder_list)) +
        #     '_seed=' + str(seed) + '/' + str(st))
        logger.configure(data_saving_path)

        model = train(sess, env_name,
                      num_timesteps=num_iterations_enforce * num_processes * num_timesteps_per_process,
                      timesteps_per_actor=num_timesteps_per_process,
                      autoencoders=autoencoder_list,
                      seed=seed)

        if mpi_rank == 0:
            env = gym.make(env_name)
            env.env.novel_autoencoders = autoencoder_list

            if hasattr(env.env, 'disableViewer'):
                env.env.disableViewer = False

            env = wrappers.Monitor(env, logger.get_dir() + '/../results', force=True)

            obs = env.reset()
            step = 0
            while True:
                env.render()
                actions = model._act(False, obs)
                obs, _, done, _ = env.step(actions[0][0])
                env.render()
                if done:
                    obs = env.reset()
                    print("Visualization is Done")
                    break
                step += 1
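The `U.ALREADY_INITIALIZED.update(...)` call above keeps the Keras-loaded autoencoder weights from being re-initialized later: baselines' `tf_util.initialize()` only initializes variables it has not already seen. A paraphrased sketch of that behavior, not the verbatim library source, is shown below.

import tensorflow as tf

# Variables registered here are skipped by initialize().
ALREADY_INITIALIZED = set()

def initialize():
    # Initialize only the variables that have not been marked as already initialized.
    new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED
    tf.get_default_session().run(tf.variables_initializer(new_variables))
    ALREADY_INITIALIZED.update(new_variables)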
def launch(env_name, logdir, n_epochs, num_cpu, seed, replay_strategy,
           policy_save_interval, clip_return, override_params={}, save_policies=True):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this. Please also refer to ' +
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(
        logdir=logdir, policy=policy, rollout_worker=rollout_worker,
        evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
        n_cycles=params['n_cycles'], n_batches=params['n_batches'],
        policy_save_interval=policy_save_interval, save_policies=save_policies)