def train(sess, env_id, num_timesteps, timesteps_per_actor, autoencoders, seed):
    """Train a PPO agent with a mirror-symmetric novelty MLP policy.

    Args:
        sess: TensorFlow session forwarded to the learner via ``session=``.
        env_id: Gym environment id passed to ``gym.make``.
        num_timesteps: Total environment steps (``max_timesteps``).
        timesteps_per_actor: Steps collected per actor batch.
        autoencoders: Novelty autoencoders attached to the raw env
            (``env.env.novel_autoencoders``).
        seed: Base RNG seed; may be ``None`` for nondeterministic runs.

    Returns:
        The trained model returned by ``pposgd_novelty.learn``.
    """
    # Only the two modules actually used are imported (the original pulled
    # in four additional unused baselines modules).
    from baselines.ppo1 import pposgd_novelty, mlp_policy_mirror_novelty

    rank = MPI.COMM_WORLD.Get_rank()
    # Distinct deterministic seed per MPI worker; tolerate seed=None.
    workerseed = seed * 50 + 10000 * rank if seed is not None else None
    set_global_seeds(workerseed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        # The permutations pair up left/right joint observation/action
        # indices so the mirror loss can compare the policy against its
        # reflected counterpart.
        return mlp_policy_mirror_novelty.MlpPolicyMirrorNovelty(
            name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=3,
            mirror_loss=True,
            observation_permutation=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                     15, 16, 17, 18, 11, 12, 13, 14,
                                     23, 24, 25, 26, 19, 20, 21, 22],
            action_permutation=[4, 5, 6, 7, 0, 1, 2, 3],
        )

    env.env.novel_autoencoders = autoencoders
    env = bench.Monitor(
        env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
    # Fix: the original called env.seed(seed + rank) unconditionally, which
    # raised TypeError for seed=None even though workerseed above allows it.
    if seed is not None:
        env.seed(seed + rank)
    print("AUTOENCODER LENGTH: ", len(autoencoders))

    model = pposgd_novelty.learn(
        env,
        policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=timesteps_per_actor,
        clip_param=0.2,
        entcoeff=0,
        optim_epochs=3,
        optim_stepsize=1e-3,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
        callback=callback,
        # Following params are kwargs forwarded by the learner.
        session=sess,
        gap=5,
        sym_loss_weight=1,
    )
    env.close()
    return model
def train(sess, env_id, num_timesteps, timesteps_per_actor, autoencoders, seed):
    """Train a PPO novelty policy, projecting against prior autoencoders.

    When ``autoencoders`` is non-empty the projection learner
    (``pposgd_novelty_projection``) is used; otherwise plain
    ``pposgd_novelty``. Both branches share identical hyperparameters.

    Args:
        sess: TensorFlow session forwarded to the learner via ``session=``.
        env_id: Gym environment id passed to ``gym.make``.
        num_timesteps: Total environment steps (``max_timesteps``).
        timesteps_per_actor: Steps collected per actor batch.
        autoencoders: Novelty autoencoders attached to the raw env
            (``env.env.novel_autoencoders``); may be empty.
        seed: Base RNG seed; may be ``None`` for nondeterministic runs.

    Returns:
        The trained model returned by the selected learner.
    """
    from baselines.ppo1 import pposgd_novelty, mlp_policy_novelty, \
        pposgd_novelty_projection

    rank = MPI.COMM_WORLD.Get_rank()
    # Distinct deterministic seed per MPI worker; tolerate seed=None.
    workerseed = seed * 50 + 10000 * rank if seed is not None else None
    set_global_seeds(workerseed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        return mlp_policy_novelty.MlpPolicyNovelty(
            name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=3,
        )

    env.env.novel_autoencoders = autoencoders
    env = bench.Monitor(
        env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
    # Fix: the original called env.seed(seed + rank) unconditionally, which
    # raised TypeError for seed=None even though workerseed above allows it.
    if seed is not None:
        env.seed(seed + rank)

    # Select the learner module; the original duplicated the entire
    # identical learn(...) keyword list in each branch.
    if len(autoencoders) == 0:
        print("NO AUTOENCODER!!")
        learner = pposgd_novelty
    else:
        print("AUTOENCODER LENGTH: ", len(autoencoders))
        learner = pposgd_novelty_projection

    model = learner.learn(
        env,
        policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=timesteps_per_actor,
        clip_param=0.2,
        entcoeff=0,
        optim_epochs=3,
        optim_stepsize=1e-3,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
        callback=callback,
        # Following params are kwargs forwarded by the learner.
        session=sess,
        gap=5,
    )
    env.close()
    return model