def _thunk():
    env = gym.make(env_id)
    env.seed(seed + rank)
    if str(env.__class__.__name__).find('TimeLimit') >= 0:
        env = TimeLimitMask(env)
    if n_reactive > 1:
        # Reactive policy needs fixed-length histories
        env = HistoryWrapper(env, n_reactive)
    if log_dir is not None:
        env = bench.Monitor(env,
                            os.path.join(log_dir, str(rank)),
                            allow_early_resets=allow_early_resets)
    return env
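# Usage sketch (an assumption, not part of the original source): closures like
# `_thunk` are produced by a factory and handed to a baselines vectorized
# environment, which calls each one in its own worker process. `make_env` is a
# hypothetical factory that returns a `_thunk`-style closure for a given rank.
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv


def build_vec_envs(env_id, seed, log_dir, num_processes=4):
    thunks = [make_env(env_id, seed, rank, log_dir)
              for rank in range(num_processes)]
    return SubprocVecEnv(thunks)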
def train(env_id, num_timesteps, seed, theta, name, decay, lr, time_param):
    import os.path
    import datetime
    print(name)
    if name != "":
        name = name + "theta-" + str(theta) + "-decay-" + str(decay) + \
            "-lr-" + str(lr) + "-time_param-" + str(time_param) + '/'
    print(name)
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2)

    tmp = os.path.join(
        './../LOG/' + name + 'mujoco_' + str(env_id) + '/',
        str(seed) + '--' +
        datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"))
    print(tmp)
    logger.configure(dir=tmp)
    env = bench.Monitor(env, tmp)
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    print("Starting training")
    pposgd_simple.learn(env,
                        policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2,
                        entcoeff=0.0,
                        optim_epochs=10,
                        optim_stepsize=lr,
                        optim_batchsize=64,
                        gamma=0.99,
                        lam=0.95,
                        schedule='linear',
                        theta=theta,
                        decay=decay,
                        time_param=time_param)
    env.close()
def _thunk():
    if "Custom" in env_id:
        env = gym.make(env_id, env_settings=env_settings)
    else:
        env = gym.make(env_id)
    is_atari = hasattr(gym.envs, 'atari') and isinstance(
        env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
    if is_atari:
        env = make_atari(env_id)
    env.seed(seed + rank)
    env = bench.Monitor(env, os.path.join(log_dir, str(rank)))
    if is_atari:
        env = wrap_deepmind(env,
                            clip_rewards=clip_rewards,
                            env_settings=env_settings)
    env = WrapPyTorch(env, env_settings=env_settings)
    return env
def run_baselines(env, seed, log_dir):
    """Create baselines model and training.

    Args:
        env (gym.Env): Environment of the task.
        seed (int): Random positive integer for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to output csv file.

    """
    ncpu = max(multiprocessing.cpu_count() // 2, 1)
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.compat.v1.Session(config=config).__enter__()

    # Set up logger for baselines
    configure(dir=log_dir,
              format_strs=['stdout', 'log', 'csv', 'tensorboard'])
    baselines_logger.info('rank {}: seed={}, logdir={}'.format(
        0, seed, baselines_logger.get_dir()))

    env = DummyVecEnv([
        lambda: bench.Monitor(
            env, baselines_logger.get_dir(), allow_early_resets=True)
    ])

    set_global_seeds(seed)
    policy = MlpPolicy
    ppo2.learn(policy=policy,
               env=env,
               nsteps=hyper_parameters['batch_size'],
               nminibatches=32,
               lam=0.95,
               gamma=0.99,
               noptepochs=10,
               log_interval=1,
               ent_coef=0.0,
               max_grad_norm=None,
               lr=3e-4,
               cliprange=0.2,
               total_timesteps=hyper_parameters['batch_size'] *
               hyper_parameters['n_epochs'])  # yapf: disable  # noqa: E501

    return osp.join(log_dir, 'progress.csv')
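# Note (an assumption inferred from the keys used above): `hyper_parameters`
# is a module-level dict that only needs 'batch_size' and 'n_epochs', e.g.
#
# hyper_parameters = {'batch_size': 2048, 'n_epochs': 500}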
def _thunk():
    env = gym.make(env_id, **kwargs)
    env.seed(seed + rank)
    # obs_shape = env.observation_space.shape
    # amirabdi: my understanding is that the timestep wrapper is for when
    # "time" is part of the state definition; it does not play a role here.
    # if add_timestep and len(obs_shape) == 1 and str(env).find('TimeLimit') > -1:
    #     env = AddTimestep(env)
    if log_dir is not None:
        env = bench.Monitor(env,
                            os.path.join(log_dir, str(rank)),
                            allow_early_resets=allow_early_resets)
    return env
def _thunk():
    if env_id.startswith("dm"):
        _, domain, task = env_id.split('.')
        env = dm_control2gym.make(domain_name=domain, task_name=task)
    else:
        env = gym.make(env_id)
    if 'micropolis' in env_id.lower():
        print("ENV RANK: ", rank)
        if rank == 0:
            env.setMapSize(map_width,
                           print_map=print_map,
                           parallel_gui=parallel_py2gui,
                           render_gui=render_gui,
                           empty_start=True,
                           noreward=noreward,
                           max_step=max_step,
                           rank=rank)
        else:
            env.setMapSize(map_width, rank=rank)
    is_atari = hasattr(gym.envs, 'atari') and isinstance(
        env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
    if is_atari:
        env = make_atari(env_id)
    env.seed(seed + rank)
    obs_shape = env.observation_space.shape
    if add_timestep and len(obs_shape) == 1 and str(env).find('TimeLimit') > -1:
        env = AddTimestep(env)
    if log_dir is not None:
        env = bench.Monitor(env,
                            os.path.join(log_dir, str(rank)),
                            allow_early_resets=allow_early_resets)
    if is_atari:
        env = wrap_deepmind(env)
    # If the input has shape (W,H,3), wrap for PyTorch convolutions
    obs_shape = env.observation_space.shape
    if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:
        env = TransposeImage(env)
    return env
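# A minimal sketch (an assumption; the repo's TransposeImage may differ):
# PyTorch convolutions expect channel-first (C, H, W) tensors, so the wrapper
# moves the channel axis of an (H, W, C) image observation to the front.
import gym
import numpy as np


class TransposeImageSketch(gym.ObservationWrapper):
    def __init__(self, env):
        super().__init__(env)
        h, w, c = self.observation_space.shape
        self.observation_space = gym.spaces.Box(
            low=self.observation_space.low.min(),
            high=self.observation_space.high.max(),
            shape=(c, h, w),
            dtype=self.observation_space.dtype)

    def observation(self, observation):
        # (H, W, C) -> (C, H, W)
        return np.transpose(observation, (2, 0, 1))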
def _thunk():
    env = gym_super_mario_bros.make(env_id)
    env.seed(seed + rank)
    env = BinarySpaceToDiscreteSpaceEnv(env, ACTIONS)
    if log_dir is not None:
        env = bench.Monitor(env, os.path.join(log_dir, str(rank)))
    env = ProcessFrameMario(env, reward_type=reward_type)
    env = smb_warp_frame(env)
    env = smb_scale_frame(env)
    env = smb_stack_and_repeat(env, stack_frames, action_repeat)
    env = WrapPyTorch(env)
    return env
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    reuse=reuse,
                                    hid_size=args.policy_hidden_size,
                                    num_hid_layers=2)

    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    print("task name: {}".format(task_name))
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
        # dataset = Mujoco_Dset(expert_path=args.expert_path,
        #                       traj_limitation=args.traj_limitation)
        # reward_giver = TransitionClassifier(env, args.adversary_hidden_size,
        #                                     entcoeff=args.adversary_entcoeff)
        train(env, args.seed, policy_fn, args.algo, args.policy_entcoeff,
              args.num_timesteps, args.save_per_iter, args.checkpoint_dir,
              args.log_dir, task_name)
    elif args.task == 'visualize':
        policy_run(env,
                   policy_fn,
                   args.checkpoint_dir,
                   number_rollouts=10,
                   stochastic_policy=args.stochastic_policy)
    elif args.task == 'evaluate':
        runner(env,
               policy_fn,
               args.load_model_path,
               timesteps_per_batch=1024,
               number_trajs=10,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()
def _thunk():
    """Creates an env and manually sets its seed, log directory and timestep."""
    # env_id = 'Reacher'
    env = gym.make(env_id)
    env.seed(seed + rank)
    obs_shape = env.observation_space.shape
    if add_timestep and len(obs_shape) == 1 and str(env).find('TimeLimit') > -1:
        env = AddTimestep(env)
    if log_dir is not None:
        env = bench.Monitor(env,
                            os.path.join(log_dir, str(rank)),
                            allow_early_resets=allow_early_resets)
    return env
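# A minimal sketch (an assumption; the repo's AddTimestep may differ): a gym
# ObservationWrapper that appends the elapsed step count to a 1-D observation,
# making "time" part of the state as described above.
import gym
import numpy as np


class AddTimestepSketch(gym.ObservationWrapper):
    def __init__(self, env):
        super().__init__(env)
        self.observation_space = gym.spaces.Box(
            low=np.append(self.observation_space.low, 0.0),
            high=np.append(self.observation_space.high, np.inf),
            dtype=self.observation_space.dtype)

    def observation(self, observation):
        # _elapsed_steps is maintained by gym's TimeLimit wrapper
        return np.append(observation, self.env._elapsed_steps)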
def train(args):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space):
        # Use the spaces passed in rather than closing over `env`
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=32,
                         num_hid_layers=2)

    env = bench.Monitor(
        env,
        logger.get_dir()
        and osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    task_name = "trpo." + args.env_id.split("-")[0] + "." + \
        ("%.2f" % args.entcoeff)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)

    trpo_mpi.learn(env,
                   policy_fn,
                   timesteps_per_batch=1024,
                   max_kl=0.01,
                   cg_iters=10,
                   cg_damping=0.1,
                   max_timesteps=args.num_timesteps,
                   gamma=0.99,
                   lam=0.98,
                   vf_iters=5,
                   vf_stepsize=1e-3,
                   sample_stochastic=args.sample_stochastic,
                   task_name=task_name,
                   save_per_iter=args.save_per_iter,
                   ckpt_dir=args.checkpoint_dir,
                   load_model_path=args.load_model_path)
    env.close()
def get_player(self, train=False):
    if self.env:
        return self.env
    if self.config['ENV_TYPE'] == 'Classic':
        env = gym.make(self.config['ENV_NAME'])
    elif self.config['ENV_TYPE'] == 'Atari':
        if train:
            env = make_atari(self.config['ENV_NAME'])
            env = bench.Monitor(env, self.logger.get_dir())
            env = deepq.wrap_atari_dqn(env)
        else:
            env = gym.make(self.config['ENV_NAME'])
            env = deepq.wrap_atari_dqn(env)
    else:
        raise Exception('Environment Type %s - Not Supported' %
                        self.config['ENV_TYPE'])
    return env
def _thunk():
    env = gym.make(env_id)
    env.seed(seed + rank)
    if str(env.__class__.__name__).find('TimeLimit') >= 0:
        env = TimeLimitMask(env)
    if log_dir is not None:
        env = bench.Monitor(env,
                            os.path.join(log_dir, str(rank)),
                            allow_early_resets=allow_early_resets)
    # obs_shape = env.observation_space.shape
    # if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:
    #     env = TransposeImage(env, op=[2, 0, 1])
    return env
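# A minimal sketch (an assumption; mirrors the TimeLimitMask wrapper found in
# pytorch-a2c-ppo-acktr-style repos): it flags transitions that ended because
# gym's TimeLimit expired, so the learner can treat them differently from true
# terminal states when bootstrapping.
import gym


class TimeLimitMaskSketch(gym.Wrapper):
    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        if done and self.env._max_episode_steps == self.env._elapsed_steps:
            # Episode ended by timeout, not by reaching a terminal state
            info['bad_transition'] = True
        return obs, reward, done, info

    def reset(self, **kwargs):
        return self.env.reset(**kwargs)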
def _thunk():
    env = gym.make(env_id)
    is_atari = hasattr(gym.envs, 'atari') and isinstance(
        env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
    if is_atari:
        env = make_atari(env_id)
    env.seed(seed + rank)
    if log_dir is not None:
        env = bench.Monitor(env, os.path.join(log_dir, str(rank)))
    if is_atari:
        env = wrap_deepmind(env)
    # If the input has shape (W,H,3), wrap for PyTorch convolutions
    print(env.observation_space)
    obs_shape = env.observation_space.shape
    if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:
        env = WrapPyTorch(env)
    return env
def train(env_id, args):
    from baselines.ppo1 import cnn_policy
    import baselines.common.tf_util as U
    if args.nokl:
        from baselines.ppo1 import nokl_pposgd_simple as pposgd_simple
    else:
        from baselines.ppo1 import pposgd_simple
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    print('_'.join([str(arg) for arg in vars(args)]))
    logdir = osp.join(
        './result/',
        '_'.join([str(getattr(args, arg)) for arg in vars(args)]))
    logger.configure(dir=logdir)
    workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = make_atari(env_id)

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space)

    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    env = wrap_deepmind(env)
    env.seed(workerseed)

    pposgd_simple.learn(env,
                        policy_fn,
                        max_timesteps=int(args.num_timesteps * 1.1),
                        timesteps_per_actorbatch=args.timesteps_per_actorbatch,
                        clip_param=args.clip,
                        entcoeff=args.entcoeff,
                        optim_epochs=args.optim_epochs,
                        optim_stepsize=args.optim_stepsize,
                        optim_batchsize=args.optim_batchsize,
                        gamma=0.99,
                        lam=0.95,
                        schedule='linear')
    env.close()
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID',
                        default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    parser.add_argument('--train-with-latency', type=int, default=0)
    parser.add_argument('--train-with-all-latency-mode', type=int, default=0)
    args = parser.parse_args()
    loggerid = "L" + (("M" + str(args.train_with_all_latency_mode))
                      if (args.train_with_all_latency_mode != 0)
                      else str(args.train_with_latency))
    loggerdir = "./data." + loggerid + "/"
    logger.configure(dir=loggerdir)
    set_global_seeds(args.seed)
    env = make_atari(args.env)
    env = bench.Monitor(env, logger.get_dir())
    env = deepq.wrap_atari_dqn(env)
    model = deepq.models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=bool(args.dueling),
    )
    act = deepq.learn(
        env,
        q_func=model,
        lr=1e-4,
        max_timesteps=args.num_timesteps,
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=bool(args.prioritized),
        print_freq=1,
        train_with_latency=args.train_with_latency,
        train_with_all_latency_mode=args.train_with_all_latency_mode)
    act.save(loggerdir + args.env + "." + loggerid + ".pkl")
    env.close()
def worker_process(remote: multiprocessing.connection.Connection, parameters,
                   worker_id, seed):
    """
    This function is used as the target of each worker process: it builds an
    environment instance and services the commands sent by the parent over
    the pipe.
    """
    # The Atari wrappers are imported from OpenAI baselines
    # https://github.com/openai/baselines
    log_dir = './log'
    if parameters['env_type'] == 'atari':
        env = make_atari(parameters['scene'])
        env = bench.Monitor(env,
                            os.path.join(log_dir, str(worker_id)),
                            allow_early_resets=False)
        env = wrap_deepmind(env, True)
    if parameters['env_type'] == 'warehouse':
        env = Warehouse(seed, parameters)
    if parameters['env_type'] == 'sumo':
        env = LoopNetwork(parameters, seed)
    if parameters['env_type'] == 'minigrid':
        env = gym.make(parameters['scene'])
        # env = RGBImgPartialObsWrapper(env, tile_size=12)  # Get pixel observations
        env = ImgObsWrapper(env)  # Get rid of the 'mission' field
        env = wrappers.GrayScaleObservation(env, keep_dim=True)  # Gray scale
        env = FeatureVectorWrapper(env)
    env.seed(seed)

    while True:
        cmd, data = remote.recv()
        if cmd == 'step':
            obs, reward, done, info = env.step(data)
            if done:
                obs = env.reset()
            remote.send((obs, reward, done, info))
        elif cmd == 'reset':
            remote.send(env.reset())
        elif cmd == 'action_space':
            remote.send(env.action_space.n)
        elif cmd == 'close':
            remote.close()
            break
        else:
            raise NotImplementedError
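# Usage sketch (an assumption, not part of the original source): the parent
# side pairs each worker with a Pipe and drives it using the string commands
# handled in the loop above ('step', 'reset', 'action_space', 'close').
import multiprocessing


def spawn_worker(parameters, worker_id, seed):
    parent_conn, child_conn = multiprocessing.Pipe()
    proc = multiprocessing.Process(target=worker_process,
                                   args=(child_conn, parameters, worker_id,
                                         seed))
    proc.start()
    return parent_conn, proc

# Example round trip:
# conn, proc = spawn_worker({'env_type': 'atari',
#                            'scene': 'PongNoFrameskip-v4'}, 0, 0)
# conn.send(('reset', None))
# obs = conn.recv()
# conn.send(('close', None))
# proc.join()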
def _thunk():
    env = gym.make(env_id)
    is_atari = hasattr(gym.envs, 'atari') and isinstance(
        env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
    if is_atari:
        # --> check that a NoFrameskip-v4 env has been chosen; NoopReset
        env = make_atari(env_id)
    if train_mode == "probe":
        # --> add the labels based on the RAM state to the info dict
        env = AtariARIWrapper(env)
    elif train_mode == "train_encoder":
        pass
    else:
        raise ValueError
    env.seed(seed + rank)
    if str(env.__class__.__name__).find('TimeLimit') >= 0:
        env = TimeLimitMask(env)
    if log_dir is not None:
        env = bench.Monitor(env,
                            os.path.join(log_dir, str(rank)),
                            allow_early_resets=False)
    # env = gym.wrappers.Monitor(env, '/home/cathrin/MA/datadump/videos/' + env_id, force=True)
    # wrap_deepmind applies, in the following order:
    # EpisodicLifeEnv; FireResetEnv; grayscaling (just for Pong: overlay scores);
    # ScaleObservations to [0, 1]; ClipRewards;
    # with frame stacking: MaxAndSkipAndFramestack; without: MaxAndSkipEnv
    env = wrap_deepmind(env,
                        downsample=downsample,
                        color=color,
                        frame_stack=frame_stack,
                        use_extended_wrapper=use_extended_wrapper,
                        train_mode=train_mode)
    # convert to pytorch-style (C, H, W)
    env = ImageToPyTorch(env)
    return env
def _thunk():
    print("CUSTOM GYM:", custom_gym)
    if custom_gym is not None and custom_gym != "":
        module = importlib.import_module(custom_gym, package=None)
        print("imported env '{}'".format(custom_gym))
    if env_id.startswith("dm"):
        _, domain, task = env_id.split('.')
        env = dm_control2gym.make(domain_name=domain, task_name=task)
    else:
        env = gym.make(env_id)
    is_atari = hasattr(gym.envs, 'atari') and isinstance(
        env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
    if is_atari:
        env = make_atari(env_id)
    env.seed(seed + rank)
    obs_shape = env.observation_space.shape
    if str(env.__class__.__name__).find('TimeLimit') >= 0:
        env = TimeLimitMask(env)
    if log_dir is not None:
        env = bench.Monitor(env,
                            os.path.join(log_dir, str(rank)),
                            allow_early_resets=allow_early_resets)
    if not navi:
        if is_atari:
            if len(env.observation_space.shape) == 3:
                env = wrap_deepmind(env)
        elif len(env.observation_space.shape) == 3:
            raise NotImplementedError(
                "CNN models work only for atari,\n"
                "please use a custom wrapper for a custom pixel input env.\n"
                "See wrap_deepmind for an example.")
    # If the input has shape (W,H,3), wrap for PyTorch convolutions
    obs_shape = env.observation_space.shape
    if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:
        env = TransposeImage(env, op=[2, 0, 1])
    return env
def train(env_id, num_timesteps, seed):
    from baselines.ppo1_cmaes_layer_pl import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank == 0:
        logger.configure(filename="PPO1-" + env_id,
                         format_strs=['stdout', 'log', 'csv'])
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = make_atari(env_id)

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space)

    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
    # test_env = bench.Monitor(test_env, logger.get_dir() and
    #                          osp.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    env = wrap_deepmind(env)
    env.seed(workerseed)

    pposgd_simple.learn(env,
                        policy_fn,
                        max_timesteps=int(num_timesteps * 1.1),
                        timesteps_per_actorbatch=256,
                        clip_param=0.1,
                        entcoeff=0.01,
                        optim_epochs=4,
                        optim_stepsize=1e-6,
                        optim_batchsize=64,
                        gamma=0.99,
                        lam=0.95,
                        schedule='linear',
                        seed=seed,
                        env_id=env_id)
    env.close()
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID',
                        default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    parser.add_argument('--checkpoint-freq', type=int, default=10000)
    parser.add_argument('--checkpoint-path', type=str, default=None)
    args = parser.parse_args()
    logger.configure()
    set_global_seeds(args.seed)
    env = make_atari(args.env)
    env = bench.Monitor(env, logger.get_dir())
    env = deepq.wrap_atari_dqn(env)
    model = deepq.models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=bool(args.dueling),
    )
    deepq.learn(
        env,
        q_func=model,
        lr=1e-4,
        max_timesteps=args.num_timesteps,
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=bool(args.prioritized),
        prioritized_replay_alpha=args.prioritized_replay_alpha,
        checkpoint_freq=args.checkpoint_freq,
        checkpoint_path=args.checkpoint_path,
    )
    env.close()
def _thunk():
    if env_id.startswith("dm"):
        _, domain, task = env_id.split('.')
        env = dm_control2gym.make(domain_name=domain, task_name=task)
    else:
        env = gym.make(env_id)
    is_atari = hasattr(gym.envs, 'atari') and isinstance(
        env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
    if is_atari:
        env = make_atari(env_id)
    env.seed(seed + rank)

    # Additional code
    env.init_dart()
    env.init_sim(True, False)
    # env.start_render()

    obs_shape = env.observation_space.shape
    if str(env.__class__.__name__).find('TimeLimit') >= 0:
        env = TimeLimitMask(env)
    if log_dir is not None:
        env = bench.Monitor(env,
                            os.path.join(log_dir, str(rank)),
                            allow_early_resets=allow_early_resets)
    if is_atari:
        if len(env.observation_space.shape) == 3:
            env = wrap_deepmind(env)
    elif len(env.observation_space.shape) == 3:
        raise NotImplementedError(
            "CNN models work only for atari,\n"
            "please use a custom wrapper for a custom pixel input env.\n"
            "See wrap_deepmind for an example.")
    # If the input has shape (W,H,3), wrap for PyTorch convolutions
    obs_shape = env.observation_space.shape
    if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:
        env = TransposeImage(env, op=[2, 0, 1])
    return env
def _thunk():
    print("CUSTOM GYM:", custom_gym)
    if custom_gym is not None and custom_gym != "":
        module = importlib.import_module(custom_gym, package=None)
        print("imported env '{}'".format(custom_gym))
    if env_id.startswith("dm"):
        _, domain, task = env_id.split('.')
        env = dm_control2gym.make(domain_name=domain, task_name=task)
    else:
        env = gym.make(env_id)
    is_atari = hasattr(gym.envs, 'atari') and isinstance(
        env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
    if is_atari:
        env = make_atari(env_id)
    if not is_atari and scale_img:
        env = WarpFrame(env, color_img)
    if duckietown:
        env = DuckietownRewardWrapper(env)
    if dt_discrete:
        env = DuckietownDiscreteWrapper(env)
    env = Normalize(env)
    env.seed(seed + rank)
    obs_shape = env.observation_space.shape
    if add_timestep and len(obs_shape) == 1 and str(env).find('TimeLimit') > -1:
        env = AddTimestep(env)
    if log_dir is not None:
        env = bench.Monitor(env,
                            os.path.join(log_dir, str(rank)),
                            allow_early_resets=True)
    if is_atari:
        env = wrap_deepmind(env)
    # If the input has shape (W,H,3), wrap for PyTorch convolutions
    obs_shape = env.observation_space.shape
    if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:
        env = WrapPyTorch(env)
    return env
def _thunk():
    env = EnvInterface(
        use_vision=use_vision,
        use_pos=use_pos,
        episode_length=episode_length,
        level=level_script,
    )
    random_seed(seed)
    env.seed(seed + rank)
    if log_dir is not None:
        # env = Monitor(env=env, filename=os.path.join(log_dir, str(rank)),
        #               allow_early_resets=True)
        env = bench.Monitor(env=env,
                            filename=os.path.join(log_dir, str(rank)),
                            allow_early_resets=True)
    return env
def __init__(self, sc2env=None, thread_num=999, log_data=False, brain=None,
             stop=None, t_queue=None, none_state=None):
    super(Environment, self).__init__()
    self.logger = logging.getLogger('sc2rl.' + __name__ + " | " +
                                    str(thread_num))
    self.start_time = time.time()
    self.episodes = 0
    self.rewards = []
    self.steps = []
    self.log_data = log_data
    self.brain = brain
    self.stop = stop
    if sc2env is not None:
        self.env = sc2env
    else:
        self.env = bench.Monitor(
            helpers.get_env_wrapper(render=FLAGS.render),
            os.path.join('logs/', '{}.monitor.json'.format(thread_num)))
    self.agent = Agent(self.env.action_space.n,
                       brain=brain,
                       t_queue=t_queue,
                       none_state=none_state)
def _thunk():
    if rep_type == "seg":
        n_channels = 1
    else:
        n_channels = 3
    env = VizdoomEnv(env_id=env_id,
                     scenario=scenario,
                     seed=seed,
                     rep_type=rep_type,
                     resolution=resolution,
                     n_channels=n_channels,
                     patch_count=patch_count,
                     reverse_green=reverse_green)
    env.seed(seed + rank)
    obs_shape = env.observation_space.shape
    if str(env.__class__.__name__).find('TimeLimit') >= 0:
        env = TimeLimitMask(env)
    if log_dir is not None:
        env = bench.Monitor(env,
                            os.path.join(log_dir, str(rank)),
                            allow_early_resets=allow_early_resets)
    # if is_atari:
    #     if len(env.observation_space.shape) == 3:
    #         env = wrap_deepmind(env)
    # elif len(env.observation_space.shape) == 3:
    #     raise NotImplementedError(
    #         "CNN models work only for atari,\n"
    #         "please use a custom wrapper for a custom pixel input env.\n"
    #         "See wrap_deepmind for an example.")

    # If the input has shape (W,H,3), wrap for PyTorch convolutions
    # obs_shape = env.observation_space.shape
    # if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:
    #     env = TransposeImage(env, op=[2, 0, 1])
    return env
def _thunk():
    if env_id == 'MinitaurKirEnv':
        # env = gym.make(env_id)
        env = MinitaurKirEnv(energy_weight=energy)
    elif env_id.startswith("dm"):
        _, domain, task = env_id.split('.')
        env = dm_control2gym.make(domain_name=domain, task_name=task)
    else:
        env = gym.make(env_id)
        env.energy_weight = energy
    # is_atari = hasattr(gym.envs, 'atari') and isinstance(
    #     env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
    # if is_atari:
    #     env = make_atari(env_id)
    env.seed(seed + rank)
    obs_shape = env.observation_space.shape
    if add_timestep and len(obs_shape) == 1 and str(env).find('TimeLimit') > -1:
        env = AddTimestep(env)
    if log_dir is not None:
        env = bench.Monitor(env,
                            os.path.join(log_dir, str(rank)),
                            allow_early_resets=allow_early_resets)
    # if is_atari:
    #     if len(env.observation_space.shape) == 3:
    #         env = wrap_deepmind(env)
    #     elif len(env.observation_space.shape) == 3:
    #         raise NotImplementedError(
    #             "CNN models work only for atari,\n"
    #             "please use a custom wrapper for a custom pixel input env.\n"
    #             "See wrap_deepmind for an example.")
    # If the input has shape (W,H,3), wrap for PyTorch convolutions
    obs_shape = env.observation_space.shape
    if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:
        env = TransposeImage(env)
    return env
def main(envName='BreakoutNoFrameskip-v4', bufferSize=10000, timesteps=3e6):
    # parser = argparse.ArgumentParser(
    #     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # parser.add_argument('--env', help='environment ID',
    #                     default='BreakoutNoFrameskip-v4')
    # parser.add_argument('--buffer', type=int, default=10000)
    # parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    # parser.add_argument('--prioritized', type=int, default=1)
    # parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
    # parser.add_argument('--dueling', type=int, default=1)
    # parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    # parser.add_argument('--checkpoint-freq', type=int, default=10000)
    # parser.add_argument('--checkpoint-path', type=str,
    #                     default=os.getcwd() + "/logs")
    # args = parser.parse_args()
    # logger.configure(dir=args.checkpoint_path)
    logger.configure(dir=os.getcwd() + "/logs/" + str(envName) + "_" +
                     str(bufferSize))
    set_global_seeds(0)
    env = make_atari(envName)
    env = bench.Monitor(env, logger.get_dir())
    env = deepq.wrap_atari_dqn(env)
    model = deepq.models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[512],
        dueling=bool(1),
    )
    act = deepq.learn(
        env,
        q_func=model,
        lr=1e-4,
        max_timesteps=int(timesteps),
        buffer_size=bufferSize,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=bool(1),
        prioritized_replay_alpha=0.6,
        checkpoint_freq=10000,
    )
    act.save(os.getcwd() + "/logs/" + str(envName) + "_" + str(bufferSize) +
             "/model.pkl")
    env.close()
def train(env_id, num_timesteps, seed):
    from baselines.ppo_abysmal2 import (mlp_policy, pposgd_simple_test,
                                        cnn_policy, capsule_policy)
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    # env = make_atari(env_id)
    env = gym.make('Abysmal-v0')

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        return capsule_policy.Capsule_policy(name=name,
                                             ob_space=ob_space,
                                             ac_space=ac_space)
        # return mlp_policy.MlpPolicy(name=name, ob_space=ob_space,
        #                             ac_space=ac_space, hid_size=256,
        #                             num_hid_layers=4)
        # return cnn_policy.CnnPolicy(name=name, ob_space=ob_space,
        #                             ac_space=ac_space)

    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
    # env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)
    # env = wrap_deepmind(env)
    # env.seed(workerseed)
    pposgd_simple_test.learn(env,
                             policy_fn,
                             max_timesteps=int(num_timesteps * 1.1),
                             timesteps_per_actorbatch=0,
                             clip_param=0.2,
                             entcoeff=0.01,
                             optim_epochs=10,
                             optim_stepsize=1e-3,
                             optim_batchsize=32,
                             gamma=0.99,
                             lam=0.95,
                             schedule='linear')
    env.close()
def train(env_id, num_timesteps, timesteps_per_batch, seed, num_cpu, hid_size,
          num_hid_layers, resume, agentName, logdir, desired_kl, gamma, lam,
          portnum, num_parallel):
    if num_parallel > 1:
        env = CustomParallelEnv(num_parallel)
    else:
        env = gym.make(env_id)
        env.seed(seed)  # TODO: add seed to the random env too
    if logger.get_dir():
        env = bench.Monitor(env,
                            os.path.join(logger.get_dir(), "monitor.json"))
    set_global_seeds(seed)
    gym.logger.setLevel(logging.WARN)

    with tf.Session(config=tf.ConfigProto()) as session:
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim,
                                        hid_size=hid_size,
                                        num_hid_layers=num_hid_layers)
        with tf.variable_scope("pi"):
            policy = GaussianMlpPolicy(ob_dim, ac_dim,
                                       hid_size=hid_size,
                                       num_hid_layers=num_hid_layers)

        learn(env,
              policy=policy,
              vf=vf,
              gamma=gamma,
              lam=lam,
              timesteps_per_batch=timesteps_per_batch,
              resume=resume,
              desired_kl=desired_kl,
              agentName=agentName,
              logdir=logdir,
              num_timesteps=num_timesteps,
              animate=False)

    env.close()
def ppo_baselines(log_dir, env_id, seed):
    """Create baselines model and training.

    Args:
        log_dir (str): Experiment log directory.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    # Set up TF Session
    ncpu = max(multiprocessing.cpu_count() // 2, 1)
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.compat.v1.Session(config=config).__enter__()

    # Set up baselines logger
    configure(dir=log_dir,
              format_strs=['stdout', 'log', 'csv', 'tensorboard'])
    baselines_logger.info('rank {}: seed={}, logdir={}'.format(
        0, seed, baselines_logger.get_dir()))

    set_global_seeds(seed)
    env = DummyVecEnv([
        lambda: bench.Monitor(gym.make(env_id),
                              baselines_logger.get_dir(),
                              allow_early_resets=True)
    ])

    ppo2.learn(network='mlp',
               env=env,
               nsteps=hyper_parameters['batch_size'],
               nminibatches=32,
               lam=0.95,
               gamma=0.99,
               noptepochs=10,
               log_interval=1,
               ent_coef=0.0,
               max_grad_norm=None,
               lr=3e-4,
               cliprange=0.2,
               total_timesteps=(hyper_parameters['batch_size'] *
                                hyper_parameters['n_epochs']))