def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=args.policy_hidden_size,
                                    num_hid_layers=2)
    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)
    dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
    savedir_fname = learn(env,
                          policy_fn,
                          dataset,
                          max_iters=args.BC_max_iter,
                          ckpt_dir=args.checkpoint_dir,
                          log_dir=args.log_dir,
                          task_name=task_name,
                          verbose=True)
    avg_len, avg_ret = runner(env,
                              policy_fn,
                              savedir_fname,
                              timesteps_per_batch=1024,
                              number_trajs=10,
                              stochastic_policy=args.stochastic_policy,
                              save=args.save_sample,
                              reuse=True)

def test_coexistence(learn_fn, network_fn):
    '''
    Test if more than one model can exist at a time
    '''
    if learn_fn == 'deepq':
        # TODO enable multiple DQN models to be usable at the same time
        # github issue https://github.com/openai/baselines/issues/656
        return
    if network_fn.endswith('lstm') and learn_fn in ['acktr', 'trpo_mpi', 'deepq']:
        # TODO make acktr work with recurrent policies
        # and test
        # github issue: https://github.com/openai/baselines/issues/660
        return

    env = DummyVecEnv([lambda: gym.make('CartPole-v0')])
    learn = get_learn_function(learn_fn)

    kwargs = {}
    kwargs.update(network_kwargs[network_fn])
    kwargs.update(learn_kwargs[learn_fn])

    learn = partial(learn, env=env, network=network_fn, total_timesteps=0, **kwargs)
    make_session(make_default=True, graph=tf.Graph())
    model1 = learn(seed=1)
    make_session(make_default=True, graph=tf.Graph())
    model2 = learn(seed=2)

    model1.step(env.observation_space.sample())
    model2.step(env.observation_space.sample())

def train(num_timesteps, seed, model_path=None):
    env_id = 'Humanoid-v2'
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)
    env = make_mujoco_env(env_id, seed)

    # The parameters below were the best found in a simple random search;
    # they are good enough to make the humanoid walk, but we make no claim
    # that they are optimal.
    env = RewScale(env, 0.1)
    pi = pposgd_simple.learn(env, policy_fn,
                             max_timesteps=num_timesteps,
                             timesteps_per_actorbatch=2048,
                             clip_param=0.2, entcoeff=0.0,
                             optim_epochs=10,
                             optim_stepsize=3e-4,
                             optim_batchsize=64,
                             gamma=0.99,
                             lam=0.95,
                             schedule='linear',
                             )
    env.close()
    if model_path:
        U.save_state(model_path)
    return pi

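# The RewScale wrapper referenced above is not defined in this snippet.
# A minimal sketch, assuming the standard gym.RewardWrapper interface:
class RewScale(gym.RewardWrapper):
    def __init__(self, env, scale):
        gym.RewardWrapper.__init__(self, env)
        self.scale = scale

    def reward(self, r):
        # PPO is sensitive to reward magnitude; scaling Humanoid's reward
        # down by 10x keeps the value function in a trainable range.
        return r * self.scale
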
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    print('Evaluating {}'.format(args.env))
    bc_log = evaluate_env(args.env, args.seed, args.policy_hidden_size,
                          args.stochastic_policy, False, 'BC')
    print('Evaluation for {}'.format(args.env))
    print(bc_log)
    gail_log = evaluate_env(args.env, args.seed, args.policy_hidden_size,
                            args.stochastic_policy, True, 'gail')
    print('Evaluation for {}'.format(args.env))
    print(gail_log)
    plot(args.env, bc_log, gail_log, args.stochastic_policy)

def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)
    env = make_mujoco_env(env_id, seed)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        )
    env.close()

def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=args.policy_hidden_size,
                                    num_hid_layers=2)
    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
        dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
        reward_giver = TransitionClassifier(env, args.adversary_hidden_size,
                                            entcoeff=args.adversary_entcoeff)
        train(env,
              args.seed,
              policy_fn,
              reward_giver,
              dataset,
              args.algo,
              args.g_step,
              args.d_step,
              args.policy_entcoeff,
              args.num_timesteps,
              args.save_per_iter,
              args.checkpoint_dir,
              args.log_dir,
              args.pretrained,
              args.BC_max_iter,
              task_name
              )
    elif args.task == 'evaluate':
        runner(env,
               policy_fn,
               args.load_model_path,
               timesteps_per_batch=1024,
               number_trajs=10,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample
               )
    else:
        raise NotImplementedError
    env.close()

def test_serialization(learn_fn, network_fn):
    '''
    Test if the trained model can be serialized
    '''
    if network_fn.endswith('lstm') and learn_fn in ['acer', 'acktr', 'trpo_mpi', 'deepq']:
        # TODO make acktr work with recurrent policies
        # and test
        # github issue: https://github.com/openai/baselines/issues/660
        return

    def make_env():
        env = MnistEnv(episode_len=100)
        env.seed(10)
        return env

    env = DummyVecEnv([make_env])
    ob = env.reset().copy()
    learn = get_learn_function(learn_fn)

    kwargs = {}
    kwargs.update(network_kwargs[network_fn])
    kwargs.update(learn_kwargs[learn_fn])
    learn = partial(learn, env=env, network=network_fn, seed=0, **kwargs)

    with tempfile.TemporaryDirectory() as td:
        model_path = os.path.join(td, 'serialization_test_model')

        with tf.Graph().as_default(), make_session().as_default():
            model = learn(total_timesteps=100)
            model.save(model_path)
            mean1, std1 = _get_action_stats(model, ob)
            variables_dict1 = _serialize_variables()

        with tf.Graph().as_default(), make_session().as_default():
            model = learn(total_timesteps=0, load_path=model_path)
            mean2, std2 = _get_action_stats(model, ob)
            variables_dict2 = _serialize_variables()

        for k, v in variables_dict1.items():
            np.testing.assert_allclose(v, variables_dict2[k], atol=0.01,
                                       err_msg='saved and loaded variable {} value mismatch'.format(k))

        np.testing.assert_allclose(mean1, mean2, atol=0.5)
        np.testing.assert_allclose(std1, std2, atol=0.5)

def test_env_after_learn(algo):
    def make_env():
        # acktr requires too much RAM, fails on travis
        env = gym.make('CartPole-v1' if algo == 'acktr' else 'PongNoFrameskip-v4')
        return env

    make_session(make_default=True, graph=tf.Graph())
    env = SubprocVecEnv([make_env])

    learn = get_learn_function(algo)

    # Commenting out the following line resolves the issue, though crash happens at env.reset().
    learn(network='mlp', env=env, total_timesteps=0, load_path=None, seed=None)

    env.reset()
    env.close()

def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)
    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        # keyword name aligned with the other pposgd_simple.learn calls in this file
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        )
    env.close()

def main():
    set_global_seeds(1)
    args = parse_args()
    with U.make_session(4) as sess:  # noqa
        _, env = make_env(args.env)
        act = deepq.build_act(
            make_obs_ph=lambda name: U.Uint8Input(env.observation_space.shape, name=name),
            q_func=dueling_model if args.dueling else model,
            num_actions=env.action_space.n)
        U.load_state(os.path.join(args.model_dir, "saved"))
        wang2015_eval(args.env, act, stochastic=args.stochastic)

def test_microbatches():
    def env_fn():
        env = gym.make('CartPole-v0')
        env.seed(0)
        return env

    learn_fn = partial(learn, network='mlp', nsteps=32, total_timesteps=32, seed=0)

    env_ref = DummyVecEnv([env_fn])
    sess_ref = make_session(make_default=True, graph=tf.Graph())
    learn_fn(env=env_ref)
    vars_ref = {v.name: sess_ref.run(v) for v in tf.trainable_variables()}

    env_test = DummyVecEnv([env_fn])
    sess_test = make_session(make_default=True, graph=tf.Graph())
    learn_fn(env=env_test, model_fn=partial(MicrobatchedModel, microbatch_size=2))
    # learn_fn(env=env_test)
    vars_test = {v.name: sess_test.run(v) for v in tf.trainable_variables()}

    for v in vars_ref:
        np.testing.assert_allclose(vars_ref[v], vars_test[v], atol=3e-3)

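# Why the test above should pass, in miniature: averaging per-microbatch
# gradients of a mean loss reproduces the full-batch gradient, so the
# microbatched model takes the same update steps. Illustrative numpy-only
# sketch for a linear least-squares loss (names here are not from the test):
import numpy as np

X, y, w = np.random.randn(32, 4), np.random.randn(32), np.zeros(4)
# gradient of mean((Xw - y)^2) over the full batch
full_grad = 2 * X.T @ (X @ w - y) / len(X)
# same gradient computed on 16 microbatches of size 2, then averaged
micro_grads = [2 * Xb.T @ (Xb @ w - yb) / len(Xb)
               for Xb, yb in zip(np.split(X, 16), np.split(y, 16))]
np.testing.assert_allclose(full_grad, np.mean(micro_grads, axis=0), atol=1e-12)
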
def load(path, act_params, num_cpu=16):
    with open(path, "rb") as f:
        model_data = dill.load(f)
    act = deepq.build_act(**act_params)
    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()
    with tempfile.TemporaryDirectory() as td:
        arc_path = os.path.join(td, "packed.zip")
        with open(arc_path, "wb") as f:
            f.write(model_data)
        zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td)
        U.load_state(os.path.join(td, "model"))
    return ActWrapper(act)

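# Hypothetical usage of load() above: act_params must mirror the values used
# when the model was saved; `env`, `make_obs_ph`, and `model` below are
# illustrative placeholders, not names defined in this snippet.
act = load("saved_model.pkl", act_params={
    'make_obs_ph': make_obs_ph,
    'q_func': model,
    'num_actions': env.action_space.n,
})
obs, done = env.reset(), False
while not done:
    # act() expects a batch dimension and returns a batch of actions
    obs, rew, done, _ = env.step(act(obs[None])[0])
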
def model(inpt, num_actions, scope, reuse=False):
    """This model takes as input an observation and returns values of all actions."""
    with tf.variable_scope(scope, reuse=reuse):
        out = inpt
        out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.tanh)
        out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None)
        return out


if __name__ == '__main__':
    with U.make_session():
        # Create the environment
        env = gym.make("Acrobot-v1")
        exp_demo = []
        temp_list = []
        N = 1000
        # Create all the functions necessary to train the model
        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=lambda name: ObservationInput(env.observation_space, name=name),
            q_func=model,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
        )

def learn(env,
          q_func,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          callback=None,
          trained_model=None):
    """Train a deepq model.

    Parameters
    -------
    env : gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version
        at the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    num_cpu: int
        number of cpus to use for training
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If callback returns true training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model
    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.shape, name=name)

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10)
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }
    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    # note: this custom env's reset() returns an (obs, starttime, durationtime) tuple
    obs, starttime, durationtime = env.reset()
    #i = 0
    #noise = 0.01 * np.random.randn(4,8,301)
    #np.save("./noise", noise)
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")
        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            action = act(np.array(obs)[None], update_eps=exploration.value(t))[0]
            new_obs, rew, done, _ = env.step(action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs  #* (1 + noise[:,:,i].flatten())
            #i += 1

            episode_rewards[-1] += rew
            if done:
                #i = 0
                obs, starttime, durationtime = env.reset()
                episode_rewards.append(0.0)
            """
            obs = env.reset()
            with tempfile.TemporaryDirectory() as td:
                model_saved = False
                model_file = os.path.join(td, "model")
                for t in range(max_timesteps):
                    if callback is not None:
                        if callback(locals(), globals()):
                            break
                    # Take action and update exploration to the newest value
                    action = act(np.array(obs)[None], update_eps=exploration.value(t))[0]
                    new_obs, rew, done, _ = env.step(action)
                    # Store transition in the replay buffer.
                    replay_buffer.add(obs, action, rew, new_obs, float(done))
                    obs = new_obs

                    episode_rewards[-1] += rew
                    if done:
                        obs = env.reset()
                        episode_rewards.append(0.0)
            """
            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes, new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                            saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            U.load_state(model_file)
    return ActWrapper(act, act_params)

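# A minimal usage sketch for the learn() above. Note that this variant expects
# env.reset() to return an (obs, starttime, durationtime) tuple, so `MyEnv`
# below is a placeholder for such a custom environment, and `model` stands for
# a q_func like the MLP defined elsewhere in this file.
def stop_callback(lcl, _glb):
    # stop once the 100-episode mean reward clears a chosen threshold
    rewards = lcl['episode_rewards']
    return len(rewards) > 100 and np.mean(rewards[-101:-1]) >= 199

act = learn(MyEnv(), q_func=model, max_timesteps=100000,
            prioritized_replay=True, callback=stop_callback)
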
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render,
          param_noise, actor, critic, normalize_returns, normalize_observations,
          critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma,
          clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size,
          memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50):
    # Note: `load_memory`, `restore`, `nproc` and `make_env` are assumed to be
    # module-level globals in this script; they are not parameters of train().
    rank = MPI.COMM_WORLD.Get_rank()

    #print(np.abs(env.action_space.low))
    #print(np.abs(env.action_space.high))
    #assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    if load_memory:
        memory = pickle.load(
            open("/home/vaisakhs_shaj/Desktop/BIG-DATA/memoryNorm300000.pickle", "rb"))
    '''
    samps = memoryPrev.sample(batch_size=memoryPrev.nb_entries)
    print(len(samps['obs0'][1]))
    for i in range(memoryPrev.nb_entries):
        memory.append(samps['obs0'][i], samps['actions'][i], samps['rewards'][i],
                      samps['obs1'][i], samps['terminals1'][i])
    print("=============memory loaded================")
    '''
    agent = DDPG(actor, critic, memory, env.observation_space.shape,
                 env.action_space.shape, gamma=gamma, tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size, action_noise=action_noise,
                 param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart,
                 clip_norm=clip_norm, reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    envs = [make_env(seed) for seed in range(nproc)]
    envs = SubprocVecEnv(envs)
    '''
    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None
    '''
    saver = tf.train.Saver()

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=10)
    with U.make_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        if restore:
            filename = r"C:\Users\DELL\Desktop\MODELS\2d\tfSteps" + str(25000) + ".model"
            saver.restore(sess, filename)
            print("loaded!!!!!!!!!!!!!")
            #p=[v.name for v in tf.all_variables()]
            #print(p)
        obs = envs.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_reward3 = 0.
        episode_step = 0
        episode_step3 = 0
        episodes = 0
        t = 0
        epoch = 0
        start_time = time.time()
        epoch_episode_rewards = []
        epoch_episode_steps = deque(maxlen=10)
        epoch_episode_steps3 = deque(maxlen=10)
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        learning_starts = 10000
        for epoch in range(nb_epochs):
            print("cycle-memory")
            print(max_action)
            for cycle in range(nb_epoch_cycles):
                print(cycle, "-", memory.nb_entries, end=" ")
                sys.stdout.flush()
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action, one env at a time.
                    action = np.stack([agent.pi(obs[i], apply_noise=True, compute_Q=False)[0]
                                       for i in range(nproc)])
                    q = np.stack([agent.pi(obs[i], apply_noise=True, compute_Q=True)[1]
                                  for i in range(nproc)])
                    # action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    #assert action.shape == env.action_space.shape
                    #print(i)

                    # Execute next action in parallel.
                    if rank == 0 and render:
                        env.render()
                    #assert max_action.shape == action.shape
                    new_obs, r, done, info = envs.step(action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    #print(r)
                    #print(r[1])
                    sys.stdout.flush()
                    episode_reward += r[1]
                    #episode_reward3 += r[2]
                    episode_step += 1
                    #episode_step3 += 1
                    '''
                    if episode_step==300:
                        e=episode_step
                        re=episode_reward
                    if episode_step>300:
                        episode_step=e
                        episode_reward=re
                    '''
                    #print(episode_step)
                    book_keeping_obs = obs
                    obs = new_obs
                    #print(envs[1])
                    #print(episode_reward)

                    # Book-keeping in parallel.
                    epoch_actions.append(np.mean(action))
                    epoch_qs.append(np.mean(q))
                    for i in range(nproc):
                        agent.store_transition(book_keeping_obs[i], action[i], r[i],
                                               new_obs[i], done[i])
                        #print(done)
                        if done[i]:
                            # Episode done.
                            #print("====done====",episode_reward)
                            if i == 1:
                                epoch_episode_rewards.append(episode_reward)
                                #print(epoch_episode_rewards)
                                #episode_rewards_history.append(episode_reward)
                                epoch_episode_steps.append(episode_step)
                                episode_reward = 0.
                                #episode_reward3 = 0
                                episode_step = 0
                                epoch_episodes += 1
                                episodes += 1
                            '''
                            if i==2:
                                #epoch_episode_rewards.append(episode_reward3)
                                #print(epoch_episode_rewards)
                                episode_rewards_history.append(episode_reward3)
                                epoch_episode_steps3.append(episode_step3)
                                episode_reward3 = 0
                                episode_step3 = 0
                            '''
                            agent.reset()
                            temp = envs.reset()
                            obs[i] = temp[i]
                '''
                Variables in TensorFlow only have values inside sessions; once the
                session is over, the variables are lost. saver.save and saver.restore
                depend on a session and have to be called inside it.
                '''
                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r  # was `eval_rl`, a NameError
                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.
                #print(episode_rewards_history)
                if t % 7500 == 0:
                    fname = "/home/vaisakhs_shaj/Desktop/BIG-DATA/memoryNorm" + str(memory.nb_entries) + ".pickle"
                    pickle.dump(memory, open(fname, "wb"), protocol=-1)
                if t % 5000 == 0:
                    print("=======saving interim model==========")
                    filename = "/home/vaisakhs_shaj/Desktop/MODEL/normal/tfSteps" + str(t) + ".model"
                    saver.save(sess, filename)

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps2'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/episode_steps3'] = np.mean(epoch_episode_steps3)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = np.mean(eval_episode_rewards)
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = np.mean(eval_qs)
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {k: v / mpi_size for (k, v) in
                              zip(combined_stats.keys(), combined_stats_sums)}

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t
            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            print(logdir)
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)

def prepare_env(env_id, seed, num_cpu):
    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)
    return env

def learn(env,
          q_func,
          num_actions=4,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          param_noise=False,
          param_noise_threshold=0.05,
          callback=None):
    """Train a deepq model.

    Parameters
    -------
    env: pysc2.env.SC2Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version
        at the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    num_cpu: int
        number of cpus to use for training
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If callback returns true training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model
    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()

    def make_obs_ph(name):
        return U.BatchInput((32, 32), name=name)

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=num_actions,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        scope="deepq")
    # act_y, train_y, update_target_y, debug_y = deepq.build_train(
    #     make_obs_ph=make_obs_ph,
    #     q_func=q_func,
    #     num_actions=num_actions,
    #     optimizer=tf.train.AdamOptimizer(learning_rate=lr),
    #     gamma=gamma,
    #     grad_norm_clipping=10,
    #     scope="deepq_y"
    # )

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': num_actions,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        # replay_buffer_y = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
        # beta_schedule_y = LinearSchedule(prioritized_replay_beta_iters,
        #                                  initial_p=prioritized_replay_beta0,
        #                                  final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        # replay_buffer_y = ReplayBuffer(buffer_size)
        beta_schedule = None
        # beta_schedule_y = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()
    # update_target_y()

    episode_rewards = [0.0]
    saved_mean_reward = None

    obs = env.reset()
    # Select all marines first
    obs = env.step(actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])])

    player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
    screen = (player_relative == _PLAYER_NEUTRAL).astype(int)  #+ path_memory

    player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero()
    player = [int(player_x.mean()), int(player_y.mean())]

    # Re-centre the view on the marine by scrolling the screen.
    if player[0] > 16:
        screen = shift(LEFT, player[0] - 16, screen)
    elif player[0] < 16:
        screen = shift(RIGHT, 16 - player[0], screen)
    if player[1] > 16:
        screen = shift(UP, player[1] - 16, screen)
    elif player[1] < 16:
        screen = shift(DOWN, 16 - player[1], screen)

    reset = True
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join("model/", "mineral_shards")
        print(model_file)

        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                if param_noise_threshold >= 0.:
                    update_param_noise_threshold = param_noise_threshold
                else:
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = -np.log(
                        1. - exploration.value(t) + exploration.value(t) / float(num_actions))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            action = act(np.array(screen)[None], update_eps=update_eps, **kwargs)[0]
            # action_y = act_y(np.array(screen)[None], update_eps=update_eps, **kwargs)[0]
            reset = False

            coord = [player[0], player[1]]
            rew = 0

            if action == 0:  # UP
                if player[1] >= 8:
                    coord = [player[0], player[1] - 8]
                    #path_memory_[player[1] - 16 : player[1], player[0]] = -1
                elif player[1] > 0:
                    coord = [player[0], 0]
                    #path_memory_[0 : player[1], player[0]] = -1
                #else:
                #  rew -= 1
            elif action == 1:  # DOWN
                if player[1] <= 23:
                    coord = [player[0], player[1] + 8]
                    #path_memory_[player[1] : player[1] + 16, player[0]] = -1
                elif player[1] > 23:
                    coord = [player[0], 31]
                    #path_memory_[player[1] : 63, player[0]] = -1
                #else:
                #  rew -= 1
            elif action == 2:  # LEFT
                if player[0] >= 8:
                    coord = [player[0] - 8, player[1]]
                    #path_memory_[player[1], player[0] - 16 : player[0]] = -1
                elif player[0] < 8:
                    coord = [0, player[1]]
                    #path_memory_[player[1], 0 : player[0]] = -1
                #else:
                #  rew -= 1
            elif action == 3:  # RIGHT
                if player[0] <= 23:
                    coord = [player[0] + 8, player[1]]
                    #path_memory_[player[1], player[0] : player[0] + 16] = -1
                elif player[0] > 23:
                    coord = [31, player[1]]
                    #path_memory_[player[1], player[0] : 63] = -1

            if _MOVE_SCREEN not in obs[0].observation["available_actions"]:
                obs = env.step(actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])])

            new_action = [sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord])]
            # else:
            #   new_action = [sc2_actions.FunctionCall(_NO_OP, [])]

            obs = env.step(actions=new_action)

            player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
            new_screen = (player_relative == _PLAYER_NEUTRAL).astype(int)  #+ path_memory

            player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero()
            player = [int(player_x.mean()), int(player_y.mean())]

            if player[0] > 16:
                new_screen = shift(LEFT, player[0] - 16, new_screen)
            elif player[0] < 16:
                new_screen = shift(RIGHT, 16 - player[0], new_screen)
            if player[1] > 16:
                new_screen = shift(UP, player[1] - 16, new_screen)
            elif player[1] < 16:
                new_screen = shift(DOWN, 16 - player[1], new_screen)

            rew = obs[0].reward
            done = obs[0].step_type == environment.StepType.LAST

            # Store transition in the replay buffer.
            replay_buffer.add(screen, action, rew, new_screen, float(done))
            # replay_buffer_y.add(screen, action_y, rew, new_screen, float(done))

            screen = new_screen
            episode_rewards[-1] += rew
            reward = episode_rewards[-1]

            if done:
                obs = env.reset()
                player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
                screen = (player_relative == _PLAYER_NEUTRAL).astype(int)  #+ path_memory

                player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero()
                player = [int(player_x.mean()), int(player_y.mean())]

                # Select all marines first
                env.step(actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])])
                episode_rewards.append(0.0)
                #episode_minerals.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                    # experience_y = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                    # (obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y, weights_y, batch_idxes_y) = experience_y
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                    # obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y = replay_buffer_y.sample(batch_size)
                    # weights_y, batch_idxes_y = np.ones_like(rewards_y), None

                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
                # td_errors_y = train_y(obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y, weights_y)

                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes, new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()
                # update_target_y()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("reward", reward)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                            saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward

        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            U.load_state(model_file)
    return ActWrapper(act)

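# The shift() helper used above is not defined in this snippet. A plausible
# sketch (an assumption, not the verified original): it scrolls the 2D screen
# `number` cells in the given direction, zero-filling the vacated cells, so
# the selected unit stays centred before the frame is fed to the Q-network.
# The UP/DOWN/LEFT/RIGHT constants below are stand-ins for the script's own.
import numpy as np

UP, DOWN, LEFT, RIGHT = 0, 1, 2, 3

def shift(direction, number, matrix):
    if number <= 0:
        return matrix
    if direction == UP:
        matrix = np.roll(matrix, -number, axis=0)  # content moves up
        matrix[-number:, :] = 0
    elif direction == DOWN:
        matrix = np.roll(matrix, number, axis=0)   # content moves down
        matrix[:number, :] = 0
    elif direction == LEFT:
        matrix = np.roll(matrix, -number, axis=1)  # content moves left
        matrix[:, -number:] = 0
    elif direction == RIGHT:
        matrix = np.roll(matrix, number, axis=1)   # content moves right
        matrix[:, :number] = 0
    return matrix
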
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render,
          param_noise, actor, critic, normalize_returns, normalize_observations,
          critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma,
          clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size,
          memory, aux_apply, aux_tasks, tc_lambda, prop_lambda, caus_lambda,
          repeat_lambda, tau=0.01, eval_env=None, param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))

    # Setup aux tasks' lambdas
    aux_lambdas = {'tc': tc_lambda, 'prop': prop_lambda,
                   'caus': caus_lambda, 'repeat': repeat_lambda}

    # Create agent
    agent = DDPG(actor, critic, memory, env.observation_space.shape,
                 env.action_space.shape, gamma=gamma, tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size, action_noise=action_noise,
                 param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart,
                 clip_norm=clip_norm, reward_scale=reward_scale,
                 aux_tasks=aux_tasks, aux_lambdas=aux_lambdas)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.make_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0
        epoch = 0
        start_time = time.time()
        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            ep_rollout_times = []
            ep_train_times = []
            for cycle in range(nb_epoch_cycles):
                rollout_startt = time.time()
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape
                    #print("action mean:{} -- Q: {}".format(np.mean(action), q))

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # for the first 5 cycles just gather data
                if epoch == 0 and cycle < 5:
                    continue

                train_startt = time.time()
                ep_rollout_times.append(train_startt - rollout_startt)

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_aux_losses = {}
                epoch_aux_losses['grads/actor_grads'] = []
                epoch_aux_losses['grads/critic_grads'] = []
                epoch_aux_losses['grads/aux_grads'] = []
                for name in aux_tasks:
                    epoch_aux_losses['aux/' + name] = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al, auxl = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    for name, value in auxl.items():
                        if 'grads' in name:
                            epoch_aux_losses['grads/' + name].append(np.abs(value))
                        else:
                            epoch_aux_losses['aux/' + name].append(np.abs(value))
                    agent.update_target_net()
                ep_train_times.append(time.time() - train_startt)

                if eval_env is not None:
                    # Evaluate.
                    eval_episode_rewards = []
                    eval_qs = []
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r
                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

            print('rollout avg time (s): {}'.format(np.mean(ep_rollout_times)))
            print('train avg time (s): {}'.format(np.mean(ep_train_times)))

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Auxiliary statistics.
            if aux_tasks is not None:
                for name, values in epoch_aux_losses.items():
                    combined_stats[name] = np.mean(values)
            # Evaluation statistics.
            if eval_env is not None:
                # np.mean here (rather than the raw lists) keeps as_scalar below from raising
                combined_stats['eval/return'] = np.mean(eval_episode_rewards)
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = np.mean(eval_qs)
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {k: v / mpi_size for (k, v) in
                              zip(combined_stats.keys(), combined_stats_sums)}

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t
            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)

def train(args):
    env_id = args.env
    num_timesteps = args.num_timesteps
    seed = args.seed

    from baselines.ppo1 import mlp_policy
    import pposgd_simple_modified_final
    U.make_session(num_cpu=1).__enter__()

    # set random seed for tf, numpy.random, random
    # in common/misc_util.py
    set_global_seeds(seed)

    def policy_fn(name, ob_space, ac_space):
        # mlp: Multi-Layer Perceptron
        # state -> (num_hid_layers) fully-connected layers with (hid_size) units -> (action, predicted value)
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    # ================================== modification 1 ================================== #
    """
    ppo_learn
    input: replace "env" (env class) with "env_id" (string)
           add input "seed" (int)
    reason: to enable env.make() during training
    modification detail: move the following lines into learn()
        env = gym.make(env_id)
        env = bench.Monitor(env, logger.get_dir())
        env.seed(seed)
        env.close()
    """

    # ====================================== hyperparameter begins ====================================== #
    joint_optimization_iters = args.joint_iters
    design_iters = args.design_iters  # number of robots sampled when updating physical design
    policy_iters = args.policy_iters  # number of robots sampled when updating policy
    policy_episodes = 1  # for each robot, number of episodes conducted to update policy
    policy_timesteps = 1e5
    design_learning_rate = 1e-4
    # ======================================= hyperparameter ends ======================================= #

    if 'Ant' in env_id:
        robot_name = 'ant'
    elif 'Hopper' in env_id:
        robot_name = 'hopper'
    elif 'Walker' in env_id:
        robot_name = 'Walker2d'
    else:
        print('!' * 50)
        print('Unknown Environment')
        print('!' * 50)
        exit(1)
    robot = GMM(robot_name=robot_name, m=design_iters, learning_rate=design_learning_rate)
    # ================================== modification 1 ================================== #

    gym.logger.setLevel(logging.WARN)
    pposgd_simple_modified_final.learn(
        # =========== modified part begins =========== #
        env_id, seed,
        robot,  # robot class with GMM params
        joint_optimization_iters,  # total number of joint optimization iterations
        design_iters,  # number of samples when updating physical design in each joint optimization iteration
        policy_iters,
        # ============ modified part ends ============ #
        policy_fn,
        max_timesteps=policy_timesteps,
        timesteps_per_actorbatch=2048,
        clip_param=0.2, entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
    )

def __init__(self, policy, ob_space, ac_space, nenvs, nsteps,
             ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
             alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'):
    sess = tf_util.make_session()
    nbatch = nenvs * nsteps

    A = tf.placeholder(tf.int32, [nbatch])
    ADV = tf.placeholder(tf.float32, [nbatch])
    R = tf.placeholder(tf.float32, [nbatch])
    LR = tf.placeholder(tf.float32, [])

    step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
    train_model = policy(sess, ob_space, ac_space, nenvs * nsteps, nsteps, reuse=True)

    neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
    pg_loss = tf.reduce_mean(ADV * neglogpac)
    vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
    entropy = tf.reduce_mean(cat_entropy(train_model.pi))
    loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

    params = find_trainable_variables("model")
    grads = tf.gradients(loss, params)
    if max_grad_norm is not None:
        grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))
    trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
    _train = trainer.apply_gradients(grads)

    lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    def train(obs, states, rewards, masks, actions, values):
        advs = rewards - values
        for step in range(len(obs)):
            cur_lr = lr.value()
        td_map = {train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr}
        if states is not None:
            td_map[train_model.S] = states
            td_map[train_model.M] = masks
        policy_loss, value_loss, policy_entropy, _ = sess.run(
            [pg_loss, vf_loss, entropy, _train],
            td_map
        )
        return policy_loss, value_loss, policy_entropy

    def save(save_path):
        ps = sess.run(params)
        make_path(osp.dirname(save_path))
        joblib.dump(ps, save_path)

    def load(load_path):
        loaded_params = joblib.load(load_path)
        restores = []
        for p, loaded_p in zip(params, loaded_params):
            restores.append(p.assign(loaded_p))
        sess.run(restores)

    self.train = train
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    self.save = save
    self.load = load
    tf.global_variables_initializer().run(session=sess)

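# A hypothetical sketch of how this A2C Model is driven (in baselines the
# wiring lives inside a2c.learn; CnnPolicy, env, and runner here are
# placeholders, not names defined in this snippet). The train() closure takes
# a rollout batch and returns the three loss components.
model = Model(policy=CnnPolicy, ob_space=env.observation_space,
              ac_space=env.action_space, nenvs=16, nsteps=5,
              total_timesteps=int(80e6), lrschedule='linear')
obs, states, rewards, masks, actions, values = runner.run()
policy_loss, value_loss, policy_entropy = model.train(
    obs, states, rewards, masks, actions, values)
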
def train():
    from linear_schedule import Linear

    ledger = defaultdict(lambda: MovingAverage(Reporting.reward_average))

    M.config(file=os.path.join(RUN.log_directory, RUN.log_file))
    M.diff()
    with U.make_session(RUN.num_cpu), Logger(RUN.log_directory) as logger, \
            contextify(gym.make(G.env_name)) as env:
        env = ScaledFloatFrame(wrap_dqn(env))
        if G.seed is not None:
            env.seed(G.seed)
        logger.log_params(G=vars(G), RUN=vars(RUN), Reporting=vars(Reporting))
        inputs = TrainInputs(action_space=env.action_space, observation_space=env.observation_space)
        trainer = QTrainer(inputs=inputs, action_space=env.action_space,
                           observation_space=env.observation_space)
        if G.prioritized_replay:
            replay_buffer = PrioritizedReplayBuffer(size=G.buffer_size, alpha=G.alpha)
        else:
            replay_buffer = ReplayBuffer(size=G.buffer_size)

        class schedules:
            # note: it is important to have this start from the beginning.
            eps = Linear(G.n_timesteps * G.exploration_fraction, 1, G.final_eps)
            if G.prioritized_replay:
                beta = Linear(G.n_timesteps - G.learning_start, G.beta_start, G.beta_end)

        U.initialize()
        trainer.update_target()
        x = np.array(env.reset())
        ep_ind = 0
        M.tic('episode')
        for t_step in range(G.n_timesteps):
            # schedules
            eps = 0 if G.param_noise else schedules.eps[t_step]
            if G.prioritized_replay:
                beta = schedules.beta[t_step - G.learning_start]

            x0 = x
            M.tic('sample', silent=True)
            (action, *_), action_q, q = trainer.runner.act([x], eps)
            x, rew, done, info = env.step(action)
            ledger['action_q_value'].append(action_q.max())
            ledger['action_q_value/mean'].append(action_q.mean())
            ledger['action_q_value/var'].append(action_q.var())
            ledger['q_value'].append(q.max())
            ledger['q_value/mean'].append(q.mean())
            ledger['q_value/var'].append(q.var())
            ledger['timing/sample'].append(M.toc('sample', silent=True))
            # note: adding the sample to the buffer is identical between the prioritized and the standard replay strategy.
            replay_buffer.add(s0=x0, action=action, reward=rew, s1=x, done=float(done))

            logger.log(t_step, {
                'q_value': ledger['q_value'].latest,
                'q_value/mean': ledger['q_value/mean'].latest,
                'q_value/var': ledger['q_value/var'].latest,
                'q_value/action': ledger['action_q_value'].latest,
                'q_value/action/mean': ledger['action_q_value/mean'].latest,
                'q_value/action/var': ledger['action_q_value/var'].latest
            }, action=action, eps=eps, silent=True)

            if G.prioritized_replay:
                logger.log(t_step, beta=beta, silent=True)

            if done:
                ledger['timing/episode'].append(M.split('episode', silent=True))
                ep_ind += 1
                x = np.array(env.reset())
                ledger['rewards'].append(info['total_reward'])
                silent = (ep_ind % Reporting.print_interval != 0)
                logger.log(t_step, timestep=t_step, episode=green(ep_ind),
                           total_reward=ledger['rewards'].latest,
                           episode_length=info['timesteps'], silent=silent)
                logger.log(t_step, {
                    'total_reward/mean': yellow(ledger['rewards'].mean, lambda v: f"{v:.1f}"),
                    'total_reward/max': yellow(ledger['rewards'].max, lambda v: f"{v:.1f}"),
                    "time_spent_exploring": default(eps, percent),
                    "timing/episode": green(ledger['timing/episode'].latest, sec),
                    "timing/episode/mean": green(ledger['timing/episode'].mean, sec),
                }, silent=silent)
                try:
                    logger.log(t_step, {
                        "timing/sample": default(ledger['timing/sample'].latest, sec),
                        "timing/sample/mean": default(ledger['timing/sample'].mean, sec),
                        "timing/train": default(ledger['timing/train'].latest, sec),
                        "timing/train/mean": green(ledger['timing/train'].mean, sec),
                        "timing/log_histogram": default(ledger['timing/log_histogram'].latest, sec),
                        "timing/log_histogram/mean": default(ledger['timing/log_histogram'].mean, sec)
                    }, silent=silent)
                    if G.prioritized_replay:
                        logger.log(t_step, {
                            "timing/update_priorities": default(ledger['timing/update_priorities'].latest, sec),
                            "timing/update_priorities/mean": default(ledger['timing/update_priorities'].mean, sec)
                        }, silent=silent)
                except Exception as e:
                    pass
                if G.prioritized_replay:
                    logger.log(t_step, {"replay_beta": default(beta, lambda v: f"{v:.2f}")}, silent=silent)

            # note: learn here.
            if t_step >= G.learning_start and t_step % G.learn_interval == 0:
                if G.prioritized_replay:
                    experiences, weights, indices = replay_buffer.sample(G.replay_batch_size, beta)
                    logger.log_histogram(t_step, weights=weights)
                else:
                    experiences, weights = replay_buffer.sample(G.replay_batch_size), None
                M.tic('train', silent=True)
                x0s, actions, rewards, x1s, dones = zip(*experiences)
                td_error_val, loss_val = trainer.train(s0s=x0s, actions=actions, rewards=rewards,
                                                       s1s=x1s, dones=dones, sample_weights=weights)
                ledger['timing/train'].append(M.toc('train', silent=True))
                M.tic('log_histogram', silent=True)
                logger.log_histogram(t_step, td_error=td_error_val)
                ledger['timing/log_histogram'].append(M.toc('log_histogram', silent=True))
                if G.prioritized_replay:
                    M.tic('update_priorities', silent=True)
                    # note: `eps` here is the exploration epsilon; the usual choice is a
                    # small constant (e.g. 1e-6) so that no transition's priority
                    # collapses to zero once exploration has annealed away.
                    new_priorities = np.abs(td_error_val) + eps
                    replay_buffer.update_priorities(indices, new_priorities)
                    ledger['timing/update_priorities'].append(M.toc('update_priorities', silent=True))

            if t_step % G.target_network_update_interval == 0:
                trainer.update_target()

            if t_step % Reporting.checkpoint_interval == 0:
                U.save_state(os.path.join(RUN.log_directory, RUN.checkpoint))

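# The Linear schedule imported above is not defined in this snippet. A minimal
# sketch under the assumptions implied by its usage (`schedules.eps[t_step]`):
# it interpolates from `start` to `end` over `n` steps, clamping outside that
# range so negative or overflowing indices are safe.
class Linear:
    def __init__(self, n, start, end):
        self.n, self.start, self.end = n, start, end

    def __getitem__(self, t):
        frac = min(max(t, 0), self.n) / self.n
        return self.start + frac * (self.end - self.start)
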
def learn(env,
          q_func,
          num_actions=3,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          param_noise=False,
          param_noise_threshold=0.05,
          callback=None):
    """Train a deepq model.

    Parameters
    -------
    env: pysc2.env.SC2Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version
        at the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    num_cpu: int
        number of cpus to use for training
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If callback returns true training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model
    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()

    def make_obs_ph(name):
        return U.BatchInput((64, 64), name=name)

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=num_actions,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10)
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': num_actions,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None

    obs = env.reset()
    # Select all marines first
    player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
    screen = player_relative

    obs = init(env, player_relative, obs)

    group_id = 0
    reset = True
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")

        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                if param_noise_threshold >= 0.:
                    update_param_noise_threshold = param_noise_threshold
                else:
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = -np.log(
                        1. - exploration.value(t) + exploration.value(t) / float(num_actions))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            group_list = update_group_list(obs)

            if check_group_list(env, obs):
                obs = init(env, player_relative, obs)
                group_list = update_group_list(obs)

            # if(len(group_list) == 0):
            #   obs = init(env, player_relative, obs)
            #   group_list = update_group_list(obs)

            player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]

            i = 0
            friendly_y, friendly_x = (player_relative == _PLAYER_FRIENDLY).nonzero()
            enemy_y, enemy_x = (player_relative == _PLAYER_HOSTILE).nonzero()

            # Find the friendly unit closest to any enemy.
            danger_closest, danger_min_dist = None, None
            for e in zip(enemy_x, enemy_y):
                for p in zip(friendly_x, friendly_y):
                    dist = np.linalg.norm(np.array(p) - np.array(e))
                    if not danger_min_dist or dist < danger_min_dist:
                        danger_closest, danger_min_dist = p, dist

            # Find the closest pair of friendly units at least 2 cells apart.
            marine_closest, marine_min_dist = None, None
            for e in zip(friendly_x, friendly_y):
                for p in zip(friendly_x, friendly_y):
                    dist = np.linalg.norm(np.array(p) - np.array(e))
                    if not marine_min_dist or dist < marine_min_dist:
                        if dist >= 2:
                            marine_closest, marine_min_dist = p, dist

            if danger_min_dist is not None and danger_min_dist <= 5:
                obs = env.step(actions=[
                    sc2_actions.FunctionCall(_SELECT_POINT, [[0], danger_closest])])

                selected = obs[0].observation["screen"][_SELECTED]
                player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero()
                if len(player_y) > 0:
                    player = [int(player_x.mean()), int(player_y.mean())]
            elif marine_closest is not None and marine_min_dist <= 3:
                obs = env.step(actions=[
                    sc2_actions.FunctionCall(_SELECT_POINT, [[0], marine_closest])])

                selected = obs[0].observation["screen"][_SELECTED]
                player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero()
                if len(player_y) > 0:
                    player = [int(player_x.mean()), int(player_y.mean())]
            else:
                # If there is no marine in danger, select random
                while len(group_list) > 0:
                    # units = env._obs.observation.raw_data.units
                    # marine_list = []
                    # for unit in units:
                    #   if(unit.alliance == 1):
                    #     marine_list.append(unit)
                    group_id = np.random.choice(group_list)
                    #xy = [int(unit.pos.y - 10), int(unit.pos.x+8)]
                    #print("check xy : %s - %s" % (xy, player_relative[xy[0],xy[1]]))
                    obs = env.step(actions=[sc2_actions.FunctionCall(
                        _SELECT_CONTROL_GROUP, [[_CONTROL_GROUP_RECALL], [group_id]])])

                    selected = obs[0].observation["screen"][_SELECTED]
                    player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero()
                    if len(player_y) > 0:
                        player = [int(player_x.mean()), int(player_y.mean())]
                        break
                    else:
                        group_list.remove(group_id)

            # Re-centre the 64x64 view on the selected unit.
            if player[0] > 32:
                screen = shift(LEFT, player[0] - 32, screen)
            elif player[0] < 32:
                screen = shift(RIGHT, 32 - player[0], screen)
            if player[1] > 32:
                screen = shift(UP, player[1] - 32, screen)
            elif player[1] < 32:
                screen = shift(DOWN, 32 - player[1], screen)

            action = act(np.array(screen)[None], update_eps=update_eps, **kwargs)[0]
            reset = False
            rew = 0
            new_action = None

            player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]

            coord = [player[0], player[1]]

            enemy_y, enemy_x = (player_relative == _PLAYER_HOSTILE).nonzero()
            closest, min_dist = None, None
            for p in zip(enemy_x, enemy_y):
                dist = np.linalg.norm(np.array(player) - np.array(p))
                if not min_dist or dist < min_dist:
                    closest, min_dist = p, dist

            player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
            friendly_y, friendly_x = (player_relative == _PLAYER_FRIENDLY).nonzero()

            closest_friend, min_dist_friend = None, None
            for p in zip(friendly_x, friendly_y):
                dist = np.linalg.norm(np.array(player) - np.array(p))
np.linalg.norm(np.array(player) - np.array(p)) if not min_dist_friend or dist < min_dist_friend: closest_friend, min_dist_friend = p, dist if (closest == None): new_action = [sc2_actions.FunctionCall(_NO_OP, [])] elif (action == 0 and closest_friend != None and min_dist_friend < 5): # Friendly marine is too close => Sparse! diff = np.array(player) - np.array(closest_friend) norm = np.linalg.norm(diff) if (norm != 0): diff = diff / norm coord = np.array(player) + diff * 3 if (coord[0] < 0): coord[0] = 0 elif (coord[0] > 63): coord[0] = 63 if (coord[1] < 0): coord[1] = 0 elif (coord[1] > 63): coord[1] = 63 new_action = [ sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord]) ] elif (action <= 1): #Attack # nearest enemy coord = closest new_action = [ sc2_actions.FunctionCall(_ATTACK_SCREEN, [_NOT_QUEUED, coord]) ] #print("action : %s Attack Coord : %s" % (action, coord)) elif (action == 2): # Opposite direction from the enemy # nearest enemy opposite diff = np.array(player) - np.array(closest) norm = np.linalg.norm(diff) if (norm != 0): diff = diff / norm coord = np.array(player) + diff * 3 if (coord[0] < 0): coord[0] = 0 elif (coord[0] > 63): coord[0] = 63 if (coord[1] < 0): coord[1] = 0 elif (coord[1] > 63): coord[1] = 63 new_action = [ sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord]) ] elif (action == 4): #UP coord = [player[0], player[1] - 3] new_action = [ sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord]) ] elif (action == 5): #DOWN coord = [player[0], player[1] + 3] new_action = [ sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord]) ] elif (action == 6): #LEFT coord = [player[0] - 3, player[1]] new_action = [ sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord]) ] elif (action == 7): #RIGHT coord = [player[0] + 3, player[1]] new_action = [ sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord]) ] #print("action : %s Back Coord : %s" % (action, coord)) army_count = env._obs.observation.player_common.army_count try: if army_count > 0 and _ATTACK_SCREEN in obs[0].observation[ "available_actions"]: obs = env.step(actions=new_action) else: new_action = [sc2_actions.FunctionCall(_NO_OP, [])] obs = env.step(actions=new_action) except Exception: #print(e) pass # Do nothing player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] new_screen = player_relative rew += obs[0].reward done = obs[0].step_type == environment.StepType.LAST selected = obs[0].observation["screen"][_SELECTED] player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero() if (len(player_y) > 0): player = [int(player_x.mean()), int(player_y.mean())] if (player[0] > 32): new_screen = shift(LEFT, player[0] - 32, new_screen) elif (player[0] < 32): new_screen = shift(RIGHT, 32 - player[0], new_screen) if (player[1] > 32): new_screen = shift(UP, player[1] - 32, new_screen) elif (player[1] < 32): new_screen = shift(DOWN, 32 - player[1], new_screen) # Store transition in the replay buffer. replay_buffer.add(screen, action, rew, new_screen, float(done)) screen = new_screen episode_rewards[-1] += rew if done: print("Episode Reward : %s" % episode_rewards[-1]) obs = env.reset() player_relative = obs[0].observation["screen"][ _PLAYER_RELATIVE] screen = player_relative group_list = init(env, player_relative, obs) # Select all marines first #env.step(actions=[sc2_actions.FunctionCall(_SELECT_UNIT, [_SELECT_ALL])]) episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) U.load_state(model_file) return ActWrapper(act)
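# A minimal usage sketch for the SC2 `learn` above. The map name, interface
# format, and Q-network below are illustrative assumptions, not taken from this
# file; the sc2_env construction mirrors the setup used elsewhere in this
# document, and the 64x64 screen matches the (64, 64) BatchInput above.
import tensorflow as tf
import tensorflow.contrib.layers as layers
from pysc2.env import sc2_env

def q_model(inpt, num_actions, scope, reuse=False):
    # Simple flatten + MLP Q-function; a conv net would be the usual choice.
    with tf.variable_scope(scope, reuse=reuse):
        out = layers.flatten(inpt)
        out = layers.fully_connected(out, num_outputs=256, activation_fn=tf.nn.relu)
        return layers.fully_connected(out, num_outputs=num_actions, activation_fn=None)

aif = sc2_env.AgentInterfaceFormat(
    feature_dimensions=sc2_env.Dimensions(screen=64, minimap=64),
    use_feature_units=True)
with sc2_env.SC2Env(map_name="DefeatZerglingsAndBanelings",
                    agent_interface_format=aif, visualize=False) as env:
    act = learn(env, q_func=q_model, num_actions=3,
                max_timesteps=100000, prioritized_replay=True)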
def create_session(num_cpu=None): return U.make_session(num_cpu=num_cpu).__enter__()
from baselines.deepq.replay_buffer import ReplayBuffer from baselines.deepq.utils import ObservationInput from baselines.common.schedules import LinearSchedule def model(inpt, num_actions, scope, reuse=False): """This model takes as input an observation and returns values of all actions.""" with tf.variable_scope(scope, reuse=reuse): out = inpt out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.tanh) out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) return out if __name__ == '__main__': with U.make_session(8): # Create the environment env = gym.make("CartPole-v0") # Create all the functions necessary to train the model act, train, update_target, debug = deepq.build_train( make_obs_ph=lambda name: ObservationInput(env.observation_space, name=name), q_func=model, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=5e-4), ) # Create the replay buffer replay_buffer = ReplayBuffer(50000) # Create the schedule for exploration starting from 1 (every action is random) down to # 0.02 (98% of actions are selected according to values predicted by the model). exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)
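# To make the exploration annealing concrete: LinearSchedule interpolates
# linearly from initial_p to final_p over schedule_timesteps, then stays at
# final_p. A quick check of the schedule built above:
sched = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)
assert abs(sched.value(0) - 1.0) < 1e-9       # fully random at the start
assert abs(sched.value(5000) - 0.51) < 1e-9   # halfway: 1.0 + 0.5 * (0.02 - 1.0)
assert abs(sched.value(10000) - 0.02) < 1e-9  # floor reached
assert abs(sched.value(20000) - 0.02) < 1e-9  # stays at the floor afterwards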
import gym import tensorflow as tf from baselines.common import set_global_seeds, tf_util as U from baselines.ppo1 import mlp_policy, pposgd_simple env = gym.make("MountainCarContinuous-v0") def policy_fn(name, ob_space, ac_space): return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=64, num_hid_layers=2) # define the policy pi = policy_fn('pi', env.observation_space, env.action_space) # Define a TF session and restore graph sess = U.make_session(num_cpu=1) sess.__enter__() # Load the previously trained graph tf.train.Saver().restore(sess, '/tmp/experiments/continuous/PPO/models/TimeLimit_afterIter_80.model') # tf.train.Saver().restore(sess, '/tmp/experiments/continuous/PPO/models/TimeLimit_afterIter_24.model') env.render() while True: obs, done = env.reset(), False episode_rew = 0 while not done: env.render() obs, rew, done, _ = env.step(pi.act(True, obs)[0]) episode_rew += rew print("Episode reward", episode_rew)
def enjoy(env_id, seed): with tf.device('/cpu'): sess = U.make_session(num_cpu=1) sess.__enter__() env = gym.make(env_id) #env = gym.make('Mujoco-planar-snake-cars-cam-v1') #env = gym.make('Mujoco-planar-snake-cars-cam-dist-zigzag-v1') #env = gym.make('Mujoco-planar-snake-cars-cam-dist-random-v1') #env = gym.make('Mujoco-planar-snake-cars-cam-dist-line-v1') #env = gym.make('Mujoco-planar-snake-cars-cam-dist-circle-v1') #env = gym.make('Mujoco-planar-snake-cars-cam-dist-wave-v1') check_for_new_models = True # more steps #env._max_episode_steps = env.spec.max_episode_steps * 3 #obs = env.reset() max_timesteps = 3000000 # model_index = 254 # 251 # best # Select model file #check_for_new_models = False # # modelversion_in_k_ts = 2000 modelversion_in_k_ts = 3000 # good modelversion_in_k_ts = 2510 # better model_index = int(max_timesteps / 1000 / 10 - modelversion_in_k_ts / 10) # TODO: use the last saved model model_index = 0 print("action space", env.action_space) print("observation space", env.observation_space) gym.logger.setLevel(logging.WARN) # init load model_dir = get_model_dir(env_id, 'ppo') model_files = get_model_files(model_dir) #model_file = get_latest_model_file(model_dir) print('available models: ', len(model_files)) model_file = model_files[model_index] #model_file = model_files[75] logger.log("load model_file: %s" % model_file) sum_info = None pi = policy_fn('pi', env.observation_space, env.action_space) while True: # run one episode # TODO specify target velocity # only takes effect in angle envs #env.unwrapped.metadata['target_v'] = 0.05 env.unwrapped.metadata['target_v'] = 0.15 #env.unwrapped.metadata['target_v'] = 0.25 #env._max_episode_steps = env._max_episode_steps * 3 done, number_of_timesteps, info_collector = run_environment_episode( env, pi, seed, model_file, env._max_episode_steps, render=True, stochastic=False) info_collector.episode_info_print() check_model_file = get_latest_model_file(model_dir) if check_model_file != model_file and check_for_new_models: model_file = check_model_file logger.log('loading new model_file %s' % model_file) print('timesteps: %d, info: %s' % (number_of_timesteps, str(sum_info)))
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'): sess = tf_util.make_session() nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) train_model = policy(sess, ob_space, ac_space, nenvs * nsteps, nsteps, reuse=True) neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=train_model.pi, labels=A) pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) return policy_loss, value_loss, policy_entropy def save(save_path): ps = sess.run(params) make_path(osp.dirname(save_path)) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) sess.run(restores) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
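# The train() closure above weights the policy gradient by the advantage
# rewards - values. A toy NumPy illustration of the three loss terms, with
# made-up numbers and the constructor's default ent_coef=0.01, vf_coef=0.5:
import numpy as np

rewards = np.array([1.0, 0.5, 0.0])    # discounted returns R
values = np.array([0.8, 0.7, 0.1])     # critic predictions V(s)
advs = rewards - values                # ADV fed into the graph
neglogpac = np.array([0.9, 1.2, 2.0])  # -log pi(a|s) of the taken actions
pg_loss = np.mean(advs * neglogpac)    # policy-gradient surrogate
vf_loss = np.mean((values - rewards) ** 2)  # critic regression (mse)
entropy = 1.05                         # stand-in for cat_entropy(train_model.pi)
loss = pg_loss - entropy * 0.01 + vf_loss * 0.5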
def get_task_name(args): task_name = args.algo + "_gail." if args.pretrained: task_name += "with_pretrained." if args.traj_limitation != np.inf: task_name += "transition_limitation_%d." % args.traj_limitation task_name += args.env_id.split("-")[0] task_name = task_name + ".g_step_" + str(args.g_step) + ".d_step_" + str(args.d_step) + \ ".policy_entcoeff_" + str(args.policy_entcoeff) + ".adversary_entcoeff_" + str(args.adversary_entcoeff) task_name += ".seed_" + str(args.seed) return task_name args = argsparser() U.make_session(num_cpu=1).__enter__() set_global_seeds(args.seed) env = gym.make(args.env_id) def policy_fn(name, ob_space, ac_space, reuse=False): return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2) env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json")) env.seed(args.seed) gym.logger.setLevel(logging.WARN) task_name = get_task_name(args) args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
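# Worked example: with hypothetical args algo='trpo', pretrained=False,
# traj_limitation=np.inf, env_id='Hopper-v2', g_step=3, d_step=1,
# policy_entcoeff=0, adversary_entcoeff=1e-3, seed=0, get_task_name returns
#   'trpo_gail.Hopper.g_step_3.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0'
# (the transition_limitation segment only appears when traj_limitation is finite).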
subdir = (datetime.datetime.now() ).strftime("%m-%d-%Y-%H:%M:%S") + " " + args.comment tf_writer = tf.summary.FileWriter(os.path.join(args.log_dir, subdir), tf.get_default_graph()) value_summary = tf.Summary() qec_summary = tf.Summary() value_summary.value.add(tag='discount_reward_mean') value_summary.value.add(tag='non_discount_reward_mean') # value_summary.value.add(tag='episode') qec_summary.value.add(tag='qec_mean') qec_summary.value.add(tag='qec_fount') value_summary.value.add(tag='steps') value_summary.value.add(tag='episodes') with U.make_session(4) as sess: # EMDQN ec_buffer = [] buffer_size = int(1000000 / env.action_space.n) # input_dim = 1024 for i in range(env.action_space.n): ec_buffer.append( LRU_KNN_UCB(buffer_size, args.latent_dim, 'game', mode=args.mode)) # rng = np.random.RandomState(123456) # deterministic, erase 123456 for stochastic # rp = rng.normal(loc=0, scale=1. / np.sqrt(latent_dim), size=(latent_dim, input_dim)) qecwatch = [] update_counter = 0
def train(env_id, max_iter, inner_iter, seed, skilldim, tasknum, warmstart, mirror, dyn_params): from policy_transfer.meta_strategy_optimization import ars_mso U.make_session(num_cpu=1).__enter__() set_global_seeds(seed + MPI.COMM_WORLD.Get_rank()) env = gym.make(env_id) env.env.param_manager.activated_param = dyn_params env.env.param_manager.controllable_param = dyn_params if hasattr(env.env, 'obs_perm') and skilldim > 0: cur_perm = env.env.obs_perm beginid = len(cur_perm) obs_perm_base = np.concatenate( [cur_perm, np.arange(beginid, beginid + skilldim)]) env.env.obs_perm = obs_perm_base with open(logger.get_dir() + "/envinfo.txt", "w") as text_file: text_file.write(str(env.env.__dict__)) def policy_fn(name, ob_space, ac_space): return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=64, num_hid_layers=2) def policy_mirror_fn(name, ob_space, ac_space): return MirrorPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=64, num_hid_layers=2, observation_permutation=env.env.env.obs_perm, action_permutation=env.env.env.act_perm, soft_mirror=False) env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"), allow_early_resets=True) with open(logger.get_dir() + "/config.txt", "w") as text_file: text_file.write(str(locals())) if hasattr(env.env.env, "param_manager"): with open(logger.get_dir() + "/params.txt", "w") as text_file: text_file.write(str(env.env.env.param_manager.__dict__)) env.seed(seed + MPI.COMM_WORLD.Get_rank()) gym.logger.setLevel(logging.WARN) pol_func = policy_fn if mirror: pol_func = policy_mirror_fn if len(warmstart) != 0: if 'pickle' in warmstart: warmstart_params = pickle.load(open(warmstart, 'rb')) else: warmstart_params = joblib.load(warmstart) else: warmstart_params = None ars_mso.ars_optimize( env, pol_func, perturb_mag=0.02, learning_rate=0.005, eval_epoch=1, params_per_thread=8, top_perturb=8, maxiter=max_iter, callback=callback, init_policy_params=warmstart_params, skilldim=skilldim, task_num=tasknum, inner_iters=inner_iter, ) env.close()
from baselines.deepq.replay_buffer import ReplayBuffer from baselines.deepq.utils import ObservationInput from baselines.common.schedules import LinearSchedule def model(inpt, num_actions, scope, reuse=False): """This model takes as input an observation and returns values of all actions.""" with tf.variable_scope(scope, reuse=reuse): out = inpt out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.tanh) out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) return out if __name__ == '__main__': with U.make_session(num_cpu=8): # Create the environment env = gym.make("CartPole-v0") # Create all the functions necessary to train the model act, train, update_target, debug = deepq.build_train( make_obs_ph=lambda name: ObservationInput(env.observation_space, name=name), q_func=model, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=5e-4), ) # Create the replay buffer replay_buffer = ReplayBuffer(50000) # Create the schedule for exploration starting from 1 (every action is random) down to # 0.02 (98% of actions are selected according to values predicted by the model). exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)
def train_mirror_sig(env, num_timesteps, seed, obs_perm, act_perm): from baselines.ppo1 import mlp_mirror_policy, pposgd_mirror U.make_session(num_cpu=1).__enter__() set_global_seeds(seed) def policy_fn(name, ob_space, ac_space): return mlp_mirror_policy.MlpMirrorPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=64, num_hid_layers=3, gmm_comp=1, mirror_loss=True, observation_permutation=obs_perm, action_permutation=act_perm) env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"), allow_early_resets=True) env.seed(seed+MPI.COMM_WORLD.Get_rank()) gym.logger.setLevel(logging.WARN) previous_params = None iter_num = 0 last_iter = False # if initialize from previous runs #previous_params = joblib.load('') #env.env.env.assist_schedule = [] joblib.dump(str(env.env.env.__dict__), logger.get_dir() + '/env_specs.pkl', compress=True) reward_threshold = None while True: if not last_iter: rollout_length_thershold = env.env.env.assist_schedule[2][0] / env.env.env.dt else: rollout_length_thershold = None opt_pi, rew = pposgd_mirror.learn(env, policy_fn, max_timesteps=num_timesteps, timesteps_per_batch=int(2500), clip_param=0.2, entcoeff=0.0, optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', callback=callback, sym_loss_weight=4.0, positive_rew_enforce=False, init_policy_params = previous_params, reward_drop_bound=500, rollout_length_thershold = rollout_length_thershold, policy_scope='pi' + str(iter_num), return_threshold = reward_threshold, ) if iter_num == 0: reward_threshold = 0.7 * rew if last_iter: break iter_num += 1 opt_variable = opt_pi.get_variables() previous_params = {} for i in range(len(opt_variable)): cur_val = opt_variable[i].eval() previous_params[opt_variable[i].name] = cur_val # update the assist schedule for s in range(len(env.env.env.assist_schedule)-1): env.env.env.assist_schedule[s][1] = np.copy(env.env.env.assist_schedule[s+1][1]) env.env.env.assist_schedule[-1][1][0] *= 0.75 env.env.env.assist_schedule[-1][1][1] *= 0.75 if env.env.env.assist_schedule[-1][1][0] < 5.0: env.env.env.assist_schedule[-1][1][0] = 0.0 if env.env.env.assist_schedule[-1][1][1] < 5.0: env.env.env.assist_schedule[-1][1][1] = 0.0 zero_assist = True for s in range(len(env.env.env.assist_schedule)-1): for v in env.env.env.assist_schedule[s][1]: if v != 0.0: zero_assist = False print('Current Schedule: ', env.env.env.assist_schedule) if zero_assist: last_iter = True print('Entering Last Iteration!') env.close()
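# The schedule update at the bottom of the loop above decays the last assist
# entry geometrically and snaps it to zero once it drops below 5.0. The same
# rule in isolation, with an illustrative starting value of 40.0:
assist = 40.0
trace = []
while assist > 0.0:
    assist *= 0.75
    if assist < 5.0:  # same cutoff as above
        assist = 0.0
    trace.append(round(assist, 2))
# trace == [30.0, 22.5, 16.88, 12.66, 9.49, 7.12, 5.34, 0.0]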
def learn(env, q_func, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=1, checkpoint_freq=10000, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, num_cpu=16, param_noise=False, callback=None): """Train a deepq model. Parameters ------- env: gym.Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimize for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. batch_size: int size of a batch sampled from the replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: bool if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None, defaults to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. num_cpu: int number of cpus to use for training callback: (locals, globals) -> None function called at every step with the state of the algorithm. If the callback returns true, training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function.
""" # Create all the functions necessary to train the model sess = U.make_session(num_cpu=num_cpu) sess.__enter__() def make_obs_ph(name): return U.BatchInput(env.observation_space.shape, name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] reset = False new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. 
update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log("Saving model due to mean reward increase: {} -> {}".format( saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format(saved_mean_reward)) U.load_state(model_file) return ActWrapper(act, act_params)
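# A minimal end-to-end call of the learn() above on CartPole; the model and
# hyperparameters are illustrative (the function defaults), not prescribed by
# this file.
import gym
import tensorflow as tf
import tensorflow.contrib.layers as layers

def q_model(inpt, num_actions, scope, reuse=False):
    # Two-layer MLP Q-function, matching the pattern used elsewhere in this file.
    with tf.variable_scope(scope, reuse=reuse):
        out = layers.fully_connected(inpt, num_outputs=64, activation_fn=tf.nn.tanh)
        return layers.fully_connected(out, num_outputs=num_actions, activation_fn=None)

env = gym.make("CartPole-v0")
act = learn(env, q_func=q_model, max_timesteps=100000,
            exploration_fraction=0.1, exploration_final_eps=0.02,
            prioritized_replay=True)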
def evaluate_target_tracking(env_id): # One run occasionally fails for reasons not yet tracked down, so every configuration is run twice (two seeds). seed = [1, 2] max_timesteps = 3000000 # model select # #modelversion_in_k_ts = 2000 modelversion_in_k_ts = 3000 # good modelversion_in_k_ts = 2510 # better model_index = int(max_timesteps / 1000 / 10 - modelversion_in_k_ts / 10) # TODO: use the last saved model model_index = 0 # uncomment to select model by modelversion_in_k_ts # envs eval_env_id = [ 'Mujoco-planar-snake-cars-cam-dist-line-v1', 'Mujoco-planar-snake-cars-cam-dist-wave-v1', 'Mujoco-planar-snake-cars-cam-dist-zigzag-v1', 'Mujoco-planar-snake-cars-cam-dist-random-v1', ] grid = ParameterGrid(param_grid={'eval_env_id': eval_env_id, 'seed': seed}) paras = list(grid) render = False info_dict_collector = InfoDictCollector(None) # init load model_dir = get_model_dir(env_id, 'ppo') model_files = get_model_files(model_dir) model_file = model_files[model_index] # model_file = model_files[75] logger.log("load model_file: %s" % model_file) sess = U.make_session(num_cpu=1) sess.__enter__() gym.logger.setLevel(logging.WARN) env = gym.make(env_id) pi = policy_fn('pi', env.observation_space, env.action_space) env.close() with tf.device('/cpu'): for i, para in enumerate(paras): eval_env_id = para['eval_env_id'] seed = int(para['seed']) env = gym.make(eval_env_id) # 3000 timesteps, default for evaluation env._max_episode_steps = env._max_episode_steps * 3 done, number_of_timesteps, info_collector = \ run_environment_episode(env, pi, seed, model_file, env._max_episode_steps, render, stochastic=False) print('run {}/{} para: {}, timesteps: {}'.format( i, len(paras), para, number_of_timesteps)) info_dict_collector.add_info_collector(info_collector) env.close() modelversion = modelversion_in_k_ts info_dict_collector.following_eval_save(modelversion) # plot import_plots.evaluate_target_tracking()
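# sklearn's ParameterGrid (used above) expands a dict of lists into its cross
# product, which is what yields the 4 envs x 2 seeds = 8 evaluation runs:
from sklearn.model_selection import ParameterGrid

demo = list(ParameterGrid({'eval_env_id': ['line-v1', 'wave-v1'], 'seed': [1, 2]}))
assert len(demo) == 4
# e.g. {'eval_env_id': 'line-v1', 'seed': 1} is among the combinations.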
from baselines import deepq from baselines.deepq.replay_buffer import ReplayBuffer from baselines.common.schedules import LinearSchedule def model(inpt, num_actions, scope, reuse=False): """This model takes as input an observation and returns values of all actions.""" with tf.variable_scope(scope, reuse=reuse): out = inpt out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.tanh) out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) return out if __name__ == '__main__': with U.make_session(8): # Create the environment env = gym.make("CartPole-v0") # Create all the functions necessary to train the model act, train, update_target, debug = deepq.build_train( make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name), q_func=model, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=5e-4), ) # Create the replay buffer replay_buffer = ReplayBuffer(50000) # Create the schedule for exploration starting from 1 (every action is random) down to # 0.02 (98% of actions are selected according to values predicted by the model). exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)
def train_mirror(args, num_timesteps): from baselines.ppo1 import mlp_mirror_policy, mlp_mirror_norms_policy, pposgd_mirror U.make_session(num_cpu=1).__enter__() set_global_seeds(args.seed) env = gym.make(args.env) env.env._seed(args.seed + MPI.COMM_WORLD.Get_rank()) env.env.init_params(args) U.ALREADY_INITIALIZED = set() U.ALREADY_INITIALIZED.update(set(tf.global_variables())) # signed permutation: the magnitude is the source index and the sign flips that dimension; 0.0001 stands in for index 0 since -0 cannot carry a sign obs_per = np.array([ 0.0001, -1, 2, -3, -4, 11, 12, 13, 14, 15, 16, 5, 6, 7, 8, 9, 10, -17, 18, -19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, -30, 31, -32, -33, 40, 41, 42, 43, 44, 45, 34, 35, 36, 37, 38, 39, -46, 47, -48, 53, 54, 55, 56, 49, 50, 51, 52 ]) if env.env.include_additional_info: obs_per = np.concatenate((obs_per, np.array([58, 57]))) obs_per = np.concatenate((obs_per, np.array([59]))) obs_per = np.concatenate((obs_per, np.array([63, 64, -65, 60, 61, -62]))) obs_per = np.concatenate((obs_per, np.array([66, 67, -68]))) obs_per = np.concatenate((obs_per, np.array([72, 73, -74, 69, 70, -71]))) obs_per = np.concatenate((obs_per, np.array([75, 76, -77]))) obs_per = np.concatenate((obs_per, np.array([78, 79, -80]))) assert env.env.obs_dim == (57 + 3 + 3 * 6 + 3) assert env.env.act_dim == 21 # update the action/state permutations whenever the env's action/state layout changes def policy_fn(name, ob_space, ac_space): if env.env.env.state_self_standardize: return mlp_mirror_norms_policy.MlpMirrorNormsPolicy( name=name, ob_space=ob_space, ac_space=ac_space, hid_size=args.hsize, num_hid_layers=args.layers, gmm_comp=1, mirror_loss=True, observation_permutation=obs_per, action_permutation=np.array([ 5, 6, 7, 8, 9, 0.0001, 1, 2, 3, 4, -10, 11, -12, 17, 18, 19, 20, 13, 14, 15, 16 ])) else: return mlp_mirror_policy.MlpMirrorPolicy( name=name, ob_space=ob_space, ac_space=ac_space, hid_size=args.hsize, num_hid_layers=args.layers, gmm_comp=1, mirror_loss=True, observation_permutation=obs_per, action_permutation=np.array([ 5, 6, 7, 8, 9, 0.0001, 1, 2, 3, 4, -10, 11, -12, 17, 18, 19, 20, 13, 14, 15, 16 ])) env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"), allow_early_resets=True) env.seed(args.seed + MPI.COMM_WORLD.Get_rank()) gym.logger.setLevel(logging.WARN) joblib.dump(str(env.env.env.__dict__), logger.get_dir() + '/env_specs.pkl', compress=True) with open(logger.get_dir() + '/env_specs.txt', 'w') as f: pprint.pprint(env.env.env.__dict__, f) f.close() shutil.copyfile(env.env.env.model_file_name, logger.get_dir() + '/using_model.skel') cur_sym_loss = 3.0 iter_num = 0 previous_params = None # previous_params = joblib.load('') reward_threshold = None rollout_length_threshold = None pposgd_mirror.learn( env, policy_fn, max_timesteps=num_timesteps, timesteps_per_batch=int(2000), clip_param=args.clip, entcoeff=0.0, optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', callback=callback, sym_loss_weight=cur_sym_loss, init_policy_params=previous_params, reward_drop_bound=None, rollout_length_threshold=rollout_length_threshold, policy_scope='pi' + str(iter_num), return_threshold=reward_threshold, ) env.close()
""" with tf.variable_scope(scope, reuse=reuse): out = inpt out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.tanh) out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) return out if __name__ == '__main__': parser = argparse.ArgumentParser(description="Train DQN on cartpole using a custom mlp") parser.add_argument('--no-render', default=False, action="store_true", help="Disable rendering") parser.add_argument('--max-timesteps', default=50000, type=int, help="Maximum number of timesteps when not rendering") args = parser.parse_args() with tf_utils.make_session(8): # Create the environment env = gym.make("CartPole-v0") # Create all the functions necessary to train the model act, train, update_target, debug = deepq.build_train( make_obs_ph=lambda name: ObservationInput(env.observation_space, name=name), q_func=model, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=5e-4), ) # Create the replay buffer replay_buffer = ReplayBuffer(50000) # Create the schedule for exploration starting from 1 (every action is random) down to # 0.02 (98% of actions are selected according to values predicted by the model). exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)
return act.copy() def observation_wrapper(obs): pov = obs['pov'].astype(np.float32) / 255.0 - 0.5 #compass = obs['compassAngle'] #compass_channel = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*compass #compass_channel /= 180.0 #return np.concatenate([pov, compass_channel], axis=-1) return pov if __name__ == '__main__': with U.make_session(32): # Create the environment env = gym.make("MineRLTreechop-v0") spaces = env.observation_space.spaces['pov'] shape = list(spaces.shape) #shape[-1] += 1 # Create all the functions necessary to train the model act, train, update_target, debug = deepq.build_train( make_obs_ph=lambda name: U.BatchInput(shape, name=name), q_func=model, num_actions=5, gamma=0.99, optimizer=tf.train.AdamOptimizer(learning_rate=1e-3), ) # Create the replay buffer
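# observation_wrapper above rescales uint8 pixels into the range [-0.5, 0.5];
# a quick standalone check of that normalization:
import numpy as np

pov = np.array([[0, 128, 255]], dtype=np.uint8)
scaled = pov.astype(np.float32) / 255.0 - 0.5
assert scaled.min() == -0.5 and scaled.max() == 0.5  # 0 -> -0.5, 255 -> 0.5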
def main(): # tf.reset_default_graph() # config = tf.ConfigProto() # config.gpu_options.allow_growth = True FLAGS(sys.argv) # steps_left = FLAGS.timesteps logdir = "tensorboard" if (FLAGS.algorithm == "deepq"): logdir = "tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % ( FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction, FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time) elif (FLAGS.algorithm == "acktr"): logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % ( FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr, start_time) elif (FLAGS.algorithm == "BicNet"): logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % ( FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr, start_time) if (FLAGS.log == "tensorboard"): Logger.DEFAULT \ = Logger.CURRENT \ = Logger(dir=None, output_formats=[TensorBoardOutputFormat(logdir)]) elif (FLAGS.log == "stdout"): Logger.DEFAULT \ = Logger.CURRENT \ = Logger(dir=None, output_formats=[HumanOutputFormat(sys.stdout)]) AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat( feature_dimensions=sc2_env.Dimensions( screen=32, minimap=32 ), #feature_dimensions=sc2_env.Dimensions(screen=84, minimap=64) process both into 32*32 matrices use_feature_units=True) lr = FLAGS.lr buffer_size = 60000 # 50000; reduce it a bit, ideally about 1/10 of the training steps; 70000 test 200 70000 batch_size = 32 # 32 gamma = 0.99 num_agents = 2 #9 vector_obs_len = 736 #33 #4096 # 32*32 1024 output_len = 4 #3 hidden_vector_len = 128 #128 #1 tau = 0.001 # stddev = 0.1 sess = U.make_session() sess.__enter__() actor = tb.ActorNetwork(sess, lr, tau, batch_size, num_agents, vector_obs_len, output_len, hidden_vector_len) critic = tb.CriticNetwork(sess, lr, tau, gamma, actor.get_num_trainable_vars(), num_agents, vector_obs_len, output_len, hidden_vector_len) sess.run(tf.global_variables_initializer()) replay_buffer = ReplayBuffer(buffer_size) # action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(1), sigma=float(stddev) * np.ones(1)) action_noise = noise_OU.OU_noise(decay_period=FLAGS.timesteps - buffer_size) # while(steps_left > 0): with sc2_env.SC2Env( map_name="CollectMineralShards", #DefeatZerglingsAndBanelings # step_mul=step_mul, agent_interface_format=AGENT_INTERFACE_FORMAT, visualize=False, #True game_steps_per_episode=steps * step_mul) as env: learn( env, sess=sess, max_timesteps=FLAGS.timesteps, train_freq=1, save_freq=10000, target_network_update_freq=1, #1000 gamma=gamma, # callback=BicNet_callback, actor=actor, critic=critic, replay_buffer=replay_buffer, num_agents=num_agents, action_noise=action_noise, output_len=output_len, num_exploring=buffer_size #buffer_size )
def train(env_id, num_timesteps, seed, num_options, app, saves, wsaves, epoch, dc): from baselines.ppo15 import mlp_policy, pposgd_simple U.make_session(num_cpu=1).__enter__() set_global_seeds(seed) from gym.envs.registration import register # Potential Pendulum Env if (True): register( id='Pendulumnf-v0', entry_point='nfunk.envs_nf.pendulum_nf:PendulumEnv', max_episode_steps=400, #kwargs = vars(args), ) env = gym.make('Pendulumnf-v0') # Potential Scalar Env if (False): register( id='Scalarnf-v0', entry_point='nfunk.envs_nf.gym_scalar_nf:GymScalarEnv', max_episode_steps=400, #kwargs = vars(args), ) env = gym.make('Scalarnf-v0') if (False): env = gym.make(env_id) def policy_fn(name, ob_space, ac_space): return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=32, num_hid_layers=2, num_options=num_options, dc=dc) env = bench.Monitor( env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json")) env.seed(seed) gym.logger.setLevel(logging.WARN) if num_options == 1: optimsize = 64 elif num_options == 2: optimsize = 32 else: print("Only two options or primitive actions are currently supported.") sys.exit() pposgd_simple.learn(env, policy_fn, max_timesteps=num_timesteps, timesteps_per_batch=2048, clip_param=0.2, entcoeff=0.0, optim_epochs=10, optim_stepsize=3e-5, optim_batchsize=optimsize, gamma=0.99, lam=0.95, schedule='constant', num_options=num_options, app=app, saves=saves, wsaves=wsaves, epoch=epoch, seed=seed, dc=dc) env.close()
def train(env_id, num_timesteps, seed, num_options, app, saves, wsaves, epoch, dc, path, render, official, orig_ppo): U.make_session(num_cpu=1).__enter__() set_global_seeds(seed) # Episode len determines the length of the rollout. Remember: length of 20 means 1s. episode_len = 400 from gym.envs.registration import register # add the given path to sys.path so we load exactly the code version the policy was trained with sys.path.append(path) print(sys.path) from src_code import mlp_policy # Depending on the environment argument, the right environment is selected if (env_id == 'Pendulumnf-v0'): register( id='Pendulumnf-v0', entry_point='src_code.pendulum_nf:PendulumEnv', max_episode_steps=episode_len, #kwargs = vars(args), ) env = gym.make('Pendulumnf-v0') # Potential Scalar Env elif (env_id == 'Scalarnf-v0'): register( id='Scalarnf-v0', entry_point='src_code.gym_scalar_nf:GymScalarEnv', max_episode_steps=episode_len, #kwargs = vars(args), ) env = gym.make('Scalarnf-v0') elif (env_id == 'CartPole-v9'): register( id='CartPole-v9', entry_point='src_code.cartpole:CartPoleEnv', max_episode_steps=episode_len, #kwargs = vars(args), ) env = gym.make('CartPole-v9') else: env = gym.make(env_id) # Create the policies needed def policy_fn(name, ob_space, ac_space): return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=64, num_hid_layers=2, num_options=num_options, dc=dc) #was 64,32 or 15 env = bench.Monitor( env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json")) env.seed(seed) gym.logger.setLevel(logging.WARN) if num_options == 1: optimsize = 64 elif num_options == 2: optimsize = 32 else: print("Only two options or primitive actions are currently supported.") sys.exit() # Start the visualization script visual.learn(env, policy_fn, max_timesteps=num_timesteps, timesteps_per_batch=2048, clip_param=0.2, entcoeff=0.0, optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=optimsize, gamma=0.99, lam=0.95, schedule='constant', num_options=num_options, app=app, saves=saves, wsaves=wsaves, epoch=epoch, seed=seed, dc=dc, episode_len=episode_len, path=path, render=render, official=official, orig_ppo=orig_ppo) env.close()
else: container = None # Create and seed the env. env, monitored_env = make_env(args.env) if args.seed > 0: set_global_seeds(args.seed) env.unwrapped.seed(args.seed) if args.gym_monitor and savedir: env = gym.wrappers.Monitor(env, os.path.join(savedir, 'gym_monitor'), force=True) if savedir: with open(os.path.join(savedir, 'args.json'), 'w') as f: json.dump(vars(args), f) with U.make_session(4) as sess: # Create training graph and replay buffer def model_wrapper(img_in, num_actions, scope, **kwargs): actual_model = dueling_model if args.dueling else model return actual_model(img_in, num_actions, scope, layer_norm=args.layer_norm, **kwargs) act, train, update_target, debug = deepq.build_train( make_obs_ph=lambda name: U.Uint8Input(env.observation_space.shape, name=name), q_func=model_wrapper, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr, epsilon=1e-4), gamma=0.99, grad_norm_clipping=10, double_q=args.double_q, param_noise=args.param_noise )
def main(_): print("Used flags:", FLAGS) config = configparser.ConfigParser() config.read(FLAGS.config_file) timer = time.time() ps_hosts = FLAGS.ps_hosts.split(",") if FLAGS.ps_hosts else config.get(FLAGS.config, 'ps_hosts').split(",") worker_hosts = FLAGS.worker_hosts.split(",") if FLAGS.worker_hosts else config.get(FLAGS.config, 'worker_hosts').split(",") job = FLAGS.job_name task = FLAGS.task_index learning_rate = config.getfloat(FLAGS.config, 'learning_rate') batch_size = config.getint(FLAGS.config, 'batch_size') memory_size = config.getint(FLAGS.config, 'memory_size') target_update = config.getint(FLAGS.config, 'target_update') seed = FLAGS.seed if FLAGS.seed else config.getint(FLAGS.config, 'seed') max_comm_rounds = config.getint(FLAGS.config, 'comm_rounds') epochs = config.getint(FLAGS.config, 'start_epoch') end_epoch = config.getint(FLAGS.config, 'end_epoch') epoch_decay = config.getint(FLAGS.config, 'epoch_decay') # epoch_decay_rate = (epochs - end_epoch) / epoch_decay epoch = LinearSchedule(epoch_decay, end_epoch, epochs) backup = config.getint(FLAGS.config, 'backup') # unused in async sync = config.getboolean(FLAGS.config, 'sync') gradient_prio = False if not sync else config.getboolean(FLAGS.config, 'gradient_prio') sync_workers = len(worker_hosts)-backup mute = FLAGS.mute if FLAGS.mute else config.getboolean(FLAGS.config, 'mute') animate = 0 draw = 0 print("Config:\nps_hosts={}\nworker_hosts={}\njob_name={}\ntask_index={}\nlearning_rate={}\n" "batch_size={}\nmemory_size={}\ntarget_update={}\nseed={}\ncomm_rounds={}\nepochs={}\n" "end_epoch={}\nepoch_decay={}\nnbackup={}\nsync={}" .format(ps_hosts, worker_hosts, job, task, learning_rate, batch_size, memory_size, target_update, seed, max_comm_rounds, epochs, end_epoch, epoch_decay, backup, sync)) cluster = tf.train.ClusterSpec({'ps': ps_hosts, 'worker': worker_hosts}) chief = True if job == 'worker' and task == 0 else False print("/job:", job, "/task:", task, " - Chief: ", chief, sep='') # Create server server = tf.train.Server(cluster, job_name=job, task_index=task) run_code = "{}-{}-p-{}-w-{}-E-{}-b-{}-m-{}-N-{}-lr-{}-B-{}-s-{}-".\ format(datetime.now().strftime("%y%m%d-%H%M%S"), env_name, len(ps_hosts), len(worker_hosts), epochs, batch_size, memory_size, target_update, learning_rate, backup, seed) run_code += "-sync" if sync else "-async" # Set a unique random seed for each client seed = ((seed * 10) + task) random.seed(seed) if not mute: print("Run code:", run_code) # Start parameter servers if job == 'ps': server.join() # Start training with U.make_session(num_cpu=4, target=server.target) as sess: # Create the environment env = gym.make(env_name) env.seed(seed) tf.set_random_seed(seed) # Create all the functions necessary to train the model act, train, global_opt, update_target, update_weights, sync_opt, debug = deepq.build_train( make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name), q_func=model, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=learning_rate), # optimizer=tf.train.GradientDescentOptimizer(learning_rate=learning_rate), chief=chief, server=server, workers=sync_workers ) # Create the replay buffer replay_buffer = ReplayBuffer(memory_size) # Create the schedule for exploration starting from 1 (every action is random) down to # 0.02 (98% of actions are selected according to values predicted by the model). 
exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02) if not chief: if not mute: print("Worker {}/{} will sleep (4s) for chief to initialize variables".format(task+1, len(worker_hosts))) time.sleep(4) # Initialize the parameters and copy them to the target network. U.initialize(chief=chief) if chief: sess.run(debug['run_code'].assign(run_code)) if not mute: print("Set global run code to:", run_code) if not mute: print("initialized variables, sleeping for 2 sec") time.sleep(2) if not chief: while not sess.run(tf.is_variable_initialized(debug['run_code'])): if not mute: print("Global run code not yet initialized") time.sleep(2) run_code = str(sess.run(debug['run_code']).decode()) if run_code == '': if not mute: print("Run code empty. Trying to fetch again...") time.sleep(5) if not mute: print("Read global run code:", run_code) run_code += "(w" + str(task) + ")" print("Final run_code:", run_code) t_global_old = update_weights()[0][0] update_target() exp_gen = 1000 # For how many timesteps should we only generate experience (not train) t_start = exp_gen comm_rounds = 0 comm_rounds_global = 0 dt = 0 write_csv(run_code, log=["episode", "reward" + str(task), "avg_reward" + str(task), "t_global", "cr"]) episode_rewards = [0.0] cr_reward = 0 obs = env.reset() for t in itertools.count(): # Take action and update exploration to the newest value action = act(obs[None], update_eps=exploration.value(t))[0] new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew cr_reward += rew # Animate every <animate> episodes if not mute and chief and animate > 0 and (len(episode_rewards) % animate) == 0: if done: print("ep", len(episode_rewards), "ended with reward:", episode_rewards[-1]) env.render() if done: if not mute and chief and draw > 0 and len(episode_rewards) % draw == 0: env.render() avg_rew = np.round(np.mean(np.array(episode_rewards[-100:])), 1) write_csv(run_code, [len(episode_rewards), episode_rewards[-1], avg_rew, debug['t_global']()[0], comm_rounds_global]) obs = env.reset() episode_rewards.append(0) [converged] = sync_opt['check_converged']() is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= max_reward or converged if is_solved or comm_rounds >= max_comm_rounds: sync_opt['set_converged']([True]) if not mute: print("Converged was set to", sync_opt['check_converged']()[0]) write_csv_final(run_code, str(len(episode_rewards)), worker_hosts, chief, comm_rounds_global, mute) print("Converged after: ", len(episode_rewards), "episodes") print("Agent total steps:", t) print("Global steps: ", debug['t_global']()[0]) sec = round(time.time() - timer) print("Total time:", sec // 3600, "h", (sec % 3600) // 60, "min", sec % 60, "s") return else: if t >= exp_gen: # if t >= batch_size: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) td_error = train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards)) if t - t_start >= np.round(epoch.value(comm_rounds)): cr_old = comm_rounds_global # Apply gradients to weights in PS if sync: # Tell the ps we are done and want to submit score [[comm_rounds_global], [worker_count]] = sync_opt['request_submit']() if comm_rounds_global == comm_rounds: if worker_count <= sync_workers: # If allowed to submit score, do it [comm_rounds_global] = sync_opt['submit_score']([cr_reward]) if chief: [submits] = sync_opt['set_submit']([0]) while worker_count != sync_workers: if
sync_opt['check_converged']()[0]: if not mute: print("Other worker converged! Finishing in check_wc") break worker_count = sync_opt['check_wc']()[0] while sync_opt['check_submit']()[0] == -1: if sync_opt['check_converged']()[0]: if not mute: print("Other worker converged! Finishing in check_submit") break pass if sync_opt['check_converged']()[0]: if not mute: print("Other worker converged! Continuing before submit") continue # Now all eligible workers have sent their score and gradient round has started # Submit gradient # TODO 4th argument overrides everything else unless it is set to -1 in the code [[dt], [comm_rounds_global], [factor]] = global_opt([t - t_start], [t_global_old], [cr_reward], [1/len(worker_hosts)], [True]) submits = sync_opt['inc_submit']() if chief: while not sync_opt['check_submit']()[0] == sync_workers: if sync_opt['check_converged']()[0]: if not mute: print("Other worker converged! Finishing in check_submit (chief)") break pass # print("Round", comm_rounds, "finished") [w] = sync_opt['reset_wc']()[0] # print("Worker count reset to:", w) sync_opt['reset_score']() submits = sync_opt['set_submit']([-1]) # print("Submit round finished. Submits set to:", submits[0]) [r] = sync_opt['inc_comm_round']()[0] # print("New round started:", r) # Normal workers wait until GCR > CR if not chief: while sync_opt['check_round']()[0] <= comm_rounds: if sync_opt['check_converged']()[0]: if not mute: print("Other worker converged! Finishing in check_round") break # print("Worker submitted, waiting for next round:", comm_rounds + 1) # time.sleep(0.1) pass else: #elif worker_count > sync_workers: # If not allowed to submit score, wait for next round to start if not mute: print("Worker finished too late but before new round started (", comm_rounds_global, ")") print("WC(", worker_count, ") > N(", sync_workers, ")", sep="") target = np.floor(comm_rounds_global + 1) # +1 if x.0, +0.5 if x.5 while not sync_opt['check_round']()[0] >= target: pass elif comm_rounds_global > comm_rounds: # This means the worker is behind. Do nothing and start next round if not mute: print("Communication round ", comm_rounds, "missed. Actual round:", comm_rounds_global) # TODO How to handle round count when skipping rounds? comm_rounds = comm_rounds_global - 1 elif comm_rounds_global < comm_rounds: print("WARNING! Worker ahead of global:", comm_rounds, ">", comm_rounds_global) time.sleep(5) else: sync_opt['inc_comm_round']() [[dt], [comm_rounds_global], [factor]] = global_opt([t - t_start], [t_global_old], [0], [-1], [False]) # Update the local weights with the new global weights from PS t_global_old = update_weights()[0][0] comm_rounds += 1 # print("Round finished.
Increasing local comm_round to:", comm_rounds) cr_reward = 0 # TODO RE-ENABLE comm-rounds LOGGING # write_csv(run_code, [comm_rounds, t, dt, epoch.value(comm_rounds)], comm_rounds=True) t_start = t if t % target_update == 0: update_target() if not mute and done and len(episode_rewards) % 10 == 0: last_rewards = episode_rewards[-101:-1] logger.record_tabular("steps", t) logger.record_tabular("global steps", debug['t_global']()[0]) logger.record_tabular("communication rounds", comm_rounds) logger.record_tabular("episodes", len(episode_rewards)) logger.record_tabular("mean episode reward", np.round(np.mean(episode_rewards[-101:-1]), 4)) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) # logger.record_tabular("last gradient factor", np.round(factor, 4)) logger.dump_tabular() rew_ill = ['●' if x >= max_reward else str(int(np.floor(x / (max_reward/10)))) if x >= (max_reward/10) else '_' for x in last_rewards] streak = 0 for i in reversed(rew_ill): if i == "●": streak += 1 else: break #print("[" + ''.join(rew_ill) + "] ([● " + str(rew_ill.count('●')) + " | " + str(rew_ill.count('9')) + " | " + str(rew_ill.count('8')) + " | " + str(rew_ill.count('7')) + " | " + str(rew_ill.count('6')) + " | " + str(rew_ill.count('5')) + " | " + str(rew_ill.count('4')) + " | " + str(rew_ill.count('3')) + " | " + str(rew_ill.count('2')) + " | " + str(rew_ill.count('1')) + " | " + str(rew_ill.count('_')) + " _]/" + str(len(rew_ill)) + " {S:" + str(streak) + "})", sep='')
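For reference, a minimal, self-contained sketch (not part of the original script) of how the exploration schedule used above behaves. LinearSchedule is the implementation from baselines.common.schedules, and the constructor arguments mirror the call at the top of this loop.

from baselines.common.schedules import LinearSchedule

# Epsilon decays linearly from 1.0 to 0.02 over the first 10000 steps, then holds:
# value(t) = initial_p + min(t / schedule_timesteps, 1.0) * (final_p - initial_p)
exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)
for t in (0, 5000, 10000, 50000):
    print(t, exploration.value(t))  # -> 1.0, 0.51, 0.02, 0.02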
def train(env, policy, policy_init, n_episodes, horizon, seed, njobs=1, save_weights=False, **alg_args):
    if env.startswith('rllab.'):
        # Get env name and class
        env_name = re.match(r'rllab.(\S+)', env).group(1)
        env_rllab_class = rllab_env_from_name(env_name)

        # Define env maker
        def make_env():
            env_rllab = env_rllab_class()
            return Rllab2GymWrapper(env_rllab)

        # Used later
        env_type = 'rllab'
    else:
        # Normal gym env: determine whether it is Atari or not
        env_type = get_env_type(env)
        assert env_type is not None, "Env not recognized."
        # Define the correct env maker
        if env_type == 'atari':
            # Atari: custom env creation with the DeepMind wrappers
            def make_env():
                _env = make_atari(env)
                return wrap_deepmind(_env)
        else:
            # Not Atari: standard env creation
            def make_env():
                return gym.make(env)

    if policy == 'linear':
        hid_size = num_hid_layers = 0
    elif policy == 'nn':
        hid_size = [100, 50, 25]
        num_hid_layers = 3

    if policy_init == 'xavier':
        policy_initializer = tf.contrib.layers.xavier_initializer()
    elif policy_init == 'zeros':
        policy_initializer = U.normc_initializer(0.0)
    else:
        raise Exception('Unrecognized policy initializer.')

    if policy == 'linear' or policy == 'nn':
        def make_policy(name, ob_space, ac_space):
            return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                             hid_size=hid_size, num_hid_layers=num_hid_layers,
                             gaussian_fixed_var=True, use_bias=False, use_critic=False,
                             hidden_W_init=policy_initializer,
                             output_W_init=policy_initializer)
    elif policy == 'cnn':
        def make_policy(name, ob_space, ac_space):
            return CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                             gaussian_fixed_var=True, use_bias=False, use_critic=False,
                             hidden_W_init=policy_initializer,
                             output_W_init=policy_initializer)
    else:
        raise Exception('Unrecognized policy type.')

    sampler = ParallelSampler(make_policy, make_env, n_episodes, horizon, True,
                              n_workers=njobs, seed=seed)

    try:
        affinity = len(os.sched_getaffinity(0))
    except Exception:
        # os.sched_getaffinity is Linux-only; fall back to the requested job count
        affinity = njobs
    sess = U.make_session(affinity)
    sess.__enter__()

    set_global_seeds(seed)
    gym.logger.setLevel(logging.WARN)

    pois.learn(make_env, make_policy, n_episodes=n_episodes, horizon=horizon,
               sampler=sampler, save_weights=save_weights, **alg_args)

    sampler.close()
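A hypothetical invocation of this trainer (argument values are illustrative, not taken from the original script): a plain gym id goes through the gym.make branch above, while an 'rllab.'-prefixed name would select the Rllab2GymWrapper branch.

train(env='Swimmer-v2',     # any id that get_env_type recognizes
      policy='nn',          # MLP with hid_size = [100, 50, 25]
      policy_init='xavier',
      n_episodes=100,
      horizon=500,
      seed=0,
      njobs=4,
      save_weights=False)   # any extra keyword args are forwarded to pois.learn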
def behavioral_cloning_nn(num_epochs, num_layers, num_hidden, X, Y, validation=0.2,
                          lr=1e-4, l2=0., batch_size=128, init_logstd=1.,
                          state_dependent_variance=True, starting_point='',
                          discrete=False, beta=1.0):
    input_dim = X.shape[-1]
    output_dim = Y.shape[-1]
    observation_space = Box(low=-np.inf, high=np.inf, shape=(input_dim, ))
    if discrete:
        action_space = Discrete(n=len(np.unique(Y)))
    else:
        action_space = Box(low=-np.inf, high=np.inf, shape=(output_dim, ))

    tf.reset_default_graph()
    config = tf.ConfigProto(allow_soft_placement=True,
                            inter_op_parallelism_threads=8,
                            intra_op_parallelism_threads=8,
                            device_count={'CPU': 8})
    config.gpu_options.allow_growth = True
    sess = U.make_session(make_default=True, config=config)

    network = mlp(num_hidden=num_hidden, num_layers=num_layers)
    policy_train = build_policy(
        observation_space, action_space, network, l2=l2, lr=lr,
        trainable_variance=state_dependent_variance, init_logstd=init_logstd,
        beta=beta, state_dependent_variance=state_dependent_variance)()
    U.initialize()
    if starting_point != '':
        policy_train.load(starting_point)

    # dataset build
    states = X
    actions = Y
    if discrete:
        # Oversample minority classes to rebalance the training set
        print("Original Dataset Size:", states.shape[0])
        classes = np.unique(Y)
        class_counts = np.array([np.sum(Y == cl) for cl in classes])
        max_count = max(class_counts)
        ratios = class_counts / max_count
        print("Class Distribution:", class_counts / states.shape[0])
        print("Class ratios:", ratios)
        states_to_add = []
        actions_to_add = []
        for j, ratio in enumerate(ratios):
            if ratio != 1:
                # Add floor(1 / ratio) full copies of the class...
                for i in range(int(1 / ratio)):
                    states_to_add += states[actions == classes[j]].tolist()
                    actions_to_add += actions[actions == classes[j]].tolist()
                # ...plus a random subset covering the fractional remainder
                remaining = int((1 / ratio - int(1 / ratio)) * class_counts[j])
                all_indexes = np.arange(class_counts[j])
                random.shuffle(all_indexes)
                shuffled_indexes = all_indexes[0:remaining]
                states_to_add += states[actions == classes[j]][shuffled_indexes].tolist()
                actions_to_add += actions[actions == classes[j]][shuffled_indexes].tolist()
        states_to_add = np.array(states_to_add)
        actions_to_add = np.array(actions_to_add)
        states = np.concatenate([states, states_to_add], axis=0)
        actions = np.concatenate([actions, actions_to_add], axis=0)
        print("Oversampled Dataset Size", states.shape[0])

    dataset = list(zip(states, actions))
    random.shuffle(dataset)
    if validation > 0.:
        k = math.floor(validation * len(dataset))
        dataset_training = dataset[:-k]
        dataset_validation = dataset[-k:]
    else:
        dataset_training = dataset[:]

    # pre-processing statistics
    num_batches = len(dataset_training) // batch_size
    num_batches += (0 if len(dataset_training) % batch_size == 0 else 1)
    print('# batches: ', num_batches)
    print('# training samples: ', len(dataset_training))
    logger = {
        'training_samples': len(dataset_training),
        'batch_size': batch_size,
        'num_batches': num_batches,
        'num_epochs': num_epochs
    }
    if validation > 0.:
        print('# validation samples: ', len(dataset_validation))
        logger['validation_samples'] = len(dataset_validation)
        # validation samples built
        X_val, y_val = zip(*dataset_validation)
        X_val, y_val = np.array(X_val), np.array(y_val)

    # train + accuracy over epochs
    counter = 0
    best_loss = np.inf
    for epoch in trange(num_epochs):
        # train batches built
        random.shuffle(dataset_training)
        batches = []
        for i in range(num_batches):
            base = batch_size * i
            batches.append(dataset_training[base:base + batch_size])

        # track the best validation loss seen before this epoch's updates
        if validation > 0.:
            target = y_val
            accuracy, _, loss = policy_train.evaluate(X_val[:], target, False)
            if loss <= best_loss:
                best_loss = loss

        # train
        for batch in batches:
            batch_X, batch_y = zip(*batch)
            target = batch_y
            output = policy_train.fit(batch_X, target)
            # TensorBoard summary values (the writer that consumes these is not shown here)
            summaries = [
                tf.Summary.Value(tag="loss", simple_value=output[0]),
                tf.Summary.Value(tag="r2", simple_value=output[1])
            ]
            if not discrete:
                summaries += [
                    tf.Summary.Value(tag="mean_std", simple_value=output[2]),
                    tf.Summary.Value(tag="min_std", simple_value=output[3]),
                    tf.Summary.Value(tag="max_std", simple_value=output[4])
                ]
            else:
                summaries += [
                    tf.Summary.Value(tag="entropy", simple_value=output[2]),
                    tf.Summary.Value(tag="stochastic_accuracy", simple_value=output[3])
                ]
            counter += 1

        # validation
        if validation > 0.:
            target = y_val
            accuracy, _, loss = policy_train.evaluate(X_val[:], target, False)
            summary = tf.Summary(value=[
                tf.Summary.Value(tag="accuracy", simple_value=accuracy),
                tf.Summary.Value(tag="test_loss", simple_value=loss)
            ])
            if loss <= best_loss:
                best_loss = loss

    # final evaluation on the full (possibly oversampled) dataset
    batch_X, batch_Y = zip(*dataset)
    _, _, loss, ll = policy_train.evaluate(batch_X[:], batch_Y[:], False)
    logger['cost'] = loss
    logger['ll'] = ll
    return policy_train, logger, None
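A minimal usage sketch with synthetic data (shapes and hyperparameters are illustrative; it assumes the build_policy/mlp helpers and imports this function already relies on are in scope).

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(1000, 4).astype(np.float32)   # 1000 states, 4 features each
Y = rng.randn(1000, 2).astype(np.float32)   # matching 2-dimensional continuous actions

policy, log, _ = behavioral_cloning_nn(num_epochs=10, num_layers=2, num_hidden=64,
                                       X=X, Y=Y, validation=0.2, batch_size=128,
                                       discrete=False)
print('final cost:', log['cost'], 'log-likelihood:', log['ll'])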