def __init__(self, trial_context: PyTorchTrialContext) -> None:
    self.context = trial_context
    self.download_directory = f"/tmp/data-rank{self.context.distributed.get_rank()}"
    # self.logger = TorchWriter()
    self.n_stack = self.context.get_hparam("n_stack")
    self.env_name = self.context.get_hparam("env_name")
    self.num_envs = self.context.get_hparam("num_envs")
    self.rollout_size = self.context.get_hparam("rollout_size")
    self.curiousity = self.context.get_hparam("curiousity")
    self.lr = self.context.get_hparam("lr")
    self.icm_beta = self.context.get_hparam("icm_beta")
    self.value_coeff = self.context.get_hparam("value_coeff")
    self.entropy_coeff = self.context.get_hparam("entropy_coeff")
    self.max_grad_norm = self.context.get_hparam("max_grad_norm")

    env = make_atari_env(self.env_name, num_env=self.num_envs, seed=42)
    self.env = VecFrameStack(env, n_stack=self.n_stack)
    eval_env = make_atari_env(self.env_name, num_env=1, seed=42)
    self.eval_env = VecFrameStack(eval_env, n_stack=self.n_stack)

    # constants
    self.in_size = self.context.get_hparam("in_size")  # in_size
    self.num_actions = env.action_space.n

    def init_(m):
        return init(m, nn.init.orthogonal_, lambda x: nn.init.constant_(x, 0))

    self.feat_enc_net = self.context.Model(
        FeatureEncoderNet(self.n_stack, self.in_size))
    self.actor = self.context.Model(
        init_(nn.Linear(self.feat_enc_net.hidden_size, self.num_actions)))
    self.critic = self.context.Model(
        init_(nn.Linear(self.feat_enc_net.hidden_size, 1)))

    self.set_recurrent_buffers(self.num_envs)

    params = list(self.feat_enc_net.parameters()) + list(
        self.actor.parameters()) + list(self.critic.parameters())
    self.opt = self.context.Optimizer(torch.optim.Adam(params, self.lr))

    self.is_cuda = torch.cuda.is_available()
    self.storage = RolloutStorage(self.rollout_size,
                                  self.num_envs,
                                  self.env.observation_space.shape[0:-1],
                                  self.n_stack,
                                  is_cuda=self.is_cuda,
                                  value_coeff=self.value_coeff,
                                  entropy_coeff=self.entropy_coeff)

    obs = self.env.reset()
    self.storage.states[0].copy_(self.storage.obs2tensor(obs))

    self.writer = SummaryWriter(log_dir="/tmp/tensorboard")
    self.global_eval_count = 0
def train(env_id, num_timesteps, seed, policy):
    """
    Train PPO2 model for atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    """
    env = VecFrameStack(make_atari_env(env_id, 8, seed), 4)
    policy = {'cnn': CnnPolicy, 'lstm': CnnLstmPolicy,
              'lnlstm': CnnLnLstmPolicy, 'mlp': MlpPolicy}[policy]
    model = PPO2(policy=policy, env=env, n_steps=128, nminibatches=4,
                 lam=0.95, gamma=0.99, noptepochs=4, ent_coef=.01,
                 learning_rate=lambda f: f * 2.5e-4,
                 cliprange=lambda f: f * 0.1, verbose=1)
    model.learn(total_timesteps=num_timesteps)
def train(env_id, num_timesteps, seed, policy, n_envs=8, nminibatches=4, n_steps=128):
    """
    Train PPO2 model for atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    :param n_envs: (int) Number of parallel environments
    :param nminibatches: (int) Number of training minibatches per update. For recurrent
        policies, the number of environments run in parallel should be a multiple of nminibatches.
    :param n_steps: (int) The number of steps to run for each environment per update
        (i.e. batch size is n_steps * n_env where n_env is number of environment copies
        running in parallel)
    """
    env = make_atari_env(env_id, n_envs, seed)
    env = VecFrameStack(env, 4)
    policy = {'cnn': CnnPolicy, 'lstm': CnnLstmPolicy,
              'lnlstm': CnnLnLstmPolicy, 'mlp': MlpPolicy}[policy]
    model = PPO2(policy=policy, env=env, n_steps=n_steps, nminibatches=nminibatches,
                 lam=0.95, gamma=0.99, noptepochs=4, ent_coef=.01,
                 learning_rate=lambda f: f * 2.5e-4,
                 cliprange=lambda f: f * 0.1, verbose=1)
    model.learn(total_timesteps=num_timesteps)
    model.save('/serverdata/rohit/stablebaselines/atari/ppo/{}'.format(env_id), 'csv')
    env.close()
    # Free memory
    del model
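# A minimal usage sketch for the PPO2 train() helper above; the environment id,
# timestep budget, and policy name are illustrative assumptions, not values from
# the original script.
if __name__ == '__main__':
    train(env_id='BreakoutNoFrameskip-v4', num_timesteps=int(1e6), seed=0,
          policy='cnn', n_envs=8, nminibatches=4, n_steps=128)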
def train(env_id, num_timesteps, seed, policy, lr_schedule, num_env):
    """
    Train A2C model for atari environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...)
    :param lr_schedule: (str) The type of scheduler for the learning rate update
        ('linear', 'constant', 'double_linear_con', 'middle_drop' or 'double_middle_drop')
    :param num_env: (int) The number of environments
    """
    policy_fn = None
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = CnnLstmPolicy
    elif policy == 'lnlstm':
        policy_fn = CnnLnLstmPolicy
    if policy_fn is None:
        raise ValueError("Error: policy {} not implemented".format(policy))

    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)

    model = A2C(policy_fn, env, lr_schedule=lr_schedule)
    model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed)
    env.close()
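# A minimal usage sketch for the A2C train() helper above; the argument values
# (environment id, schedule, number of environments) are illustrative assumptions.
if __name__ == '__main__':
    train(env_id='PongNoFrameskip-v4', num_timesteps=int(1e6), seed=0,
          policy='cnn', lr_schedule='constant', num_env=16)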
def main():
    env_id = 'PongNoFrameskip-v4'
    # env_id = 'MsPacmanNoFrameskip-v4'
    # env_id = 'BreakoutNoFrameskip-v4'
    num_env = 16
    num_steps = 5
    num_batch = num_env * num_steps
    seed = 0

    env_args = {'episode_life': False, 'clip_rewards': False,
                'scale': False, 'transpose_image': True}
    env = VecFrameStack(make_atari_env(env_id, num_env, seed,
                                       wrapper_kwargs=env_args), 4)

    network = ConvVAE([84, 84], 2048)

    observs = []
    actions = []
    next_observs = []

    observ = env.reset()
    # Move channels to axis 1 (channels-first) for the PyTorch network
    observ = observ.transpose(0, 3, 2, 1)
    observ = tensor(observ)
    print(observ.shape)
    out = network(observ)[0]
    print(out.shape)
def train(env_id, num_timesteps, seed, policy, attack=False, n_envs=8, nminibatches=4, n_steps=128):
    model = PPO2.load("model.pkl")
    env = VecFrameStack(make_atari_env(env_id, n_envs, seed), 4)
    if attack:
        env = VecFrameStack(
            make_adversarial_atari_env(env_id, n_envs, seed, model), 4)
    policy = {'cnn': CnnPolicy, 'lstm': CnnLstmPolicy,
              'lnlstm': CnnLnLstmPolicy, 'mlp': MlpPolicy}[policy]
    # model = PPO2(policy=policy, env=env, n_steps=n_steps, nminibatches=nminibatches,
    #              lam=0.95, gamma=0.99, noptepochs=4, ent_coef=.01,
    #              learning_rate=lambda f: f * 2.5e-4, cliprange=lambda f: f * 0.1, verbose=1)
    # Attach the (possibly adversarial) environment to the loaded model before training,
    # otherwise learn() has no environment to act in
    model.set_env(env)
    model.learn(total_timesteps=num_timesteps)
    model.save("model")
    env.close()
    # Free memory
    del model
def test_generate(generate_env):
    model, policy, env_name, n_env, n_episodes = generate_env

    if n_env > 1:
        env = make_atari_env(env_name, num_env=n_env, seed=0)
        model = model(policy, env, verbose=0)
    else:
        model = model(policy, env_name, verbose=0)

    dataset = generate_expert_traj(model, 'expert', n_timesteps=1000,
                                   n_episodes=n_episodes,
                                   image_folder='test_recorded_images')

    assert set(dataset.keys()).issuperset(
        ['actions', 'obs', 'rewards', 'episode_returns', 'episode_starts'])
    assert sum(dataset['episode_starts']) == n_episodes
    assert len(dataset['episode_returns']) == n_episodes

    n_timesteps = len(dataset['episode_starts'])
    for key, val in dataset.items():
        if key != 'episode_returns':
            assert val.shape[0] == n_timesteps, \
                "inconsistent number of timesteps at '{}'".format(key)

    dataset_loaded = np.load('expert.npz')
    assert dataset.keys() == dataset_loaded.keys()
    for key in dataset.keys():
        assert (dataset[key] == dataset_loaded[key]).all(), \
            "different data at '{}'".format(key)
def train(env_id, num_timesteps, seed, policy, lr_schedule, num_cpu):
    """
    train an ACER model on atari

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...)
    :param lr_schedule: (str) The type of scheduler for the learning rate update
        ('linear', 'constant', 'double_linear_con', 'middle_drop' or 'double_middle_drop')
    :param num_cpu: (int) The number of cpu to train on
    """
    env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = CnnLstmPolicy
    else:
        warnings.warn("Policy {} not implemented".format(policy))
        return
    model = ACER(policy_fn, env, lr_schedule=lr_schedule, buffer_size=5000)
    model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed)
    env.close()
    # Free memory
    del model
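# A minimal usage sketch for the ACER train() helper above; environment id,
# timestep budget, schedule and process count are illustrative assumptions.
if __name__ == '__main__':
    train(env_id='SpaceInvadersNoFrameskip-v4', num_timesteps=int(1e6), seed=0,
          policy='cnn', lr_schedule='constant', num_cpu=4)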
def create_env(n_envs, eval_env=False):
    """
    Create the environment and wrap it if necessary

    :param n_envs: (int)
    :param eval_env: (bool) Whether is it an environment used for evaluation or not
    :return: (Union[gym.Env, VecEnv])
    """
    global hyperparams

    # Do not log eval env (issue with writing the same file)
    log_dir = None if eval_env else save_path

    if is_atari:
        if args.verbose > 0:
            print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=args.seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif algo_ in ['dqn', 'ddpg']:
        if hyperparams.get('normalize', False):
            print("WARNING: normalization not supported yet for DDPG/DQN")
        env = gym.make(env_id)
        env.seed(args.seed)
        if env_wrapper is not None:
            env = env_wrapper(env)
    else:
        if n_envs == 1:
            env = DummyVecEnv([make_env(env_id, 0, args.seed,
                                        wrapper_class=env_wrapper, log_dir=log_dir)])
        else:
            # env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
            # On most env, SubprocVecEnv does not help and is quite memory hungry
            env = DummyVecEnv([make_env(env_id, i, args.seed, log_dir=log_dir,
                                        wrapper_class=env_wrapper) for i in range(n_envs)])
        if normalize:
            if args.verbose > 0:
                if len(normalize_kwargs) > 0:
                    print("Normalization activated: {}".format(normalize_kwargs))
                else:
                    print("Normalizing input and reward")
            env = VecNormalize(env, **normalize_kwargs)
    # Optional Frame-stacking
    if hyperparams.get('frame_stack', False):
        n_stack = hyperparams['frame_stack']
        env = VecFrameStack(env, n_stack)
        print("Stacking {} frames".format(n_stack))
        del hyperparams['frame_stack']
    return env
def train(env_id, num_timesteps, seed, policy, lr_schedule, num_env,
          sil_update, sil_beta, tensorboard_log, tb_log_name):
    """
    Train A2C model for atari environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...)
    :param lr_schedule: (str) The type of scheduler for the learning rate update
        ('linear', 'constant', 'double_linear_con', 'middle_drop' or 'double_middle_drop')
    :param num_env: (int) The number of environments
    """
    policy_fn = None
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = CnnLstmPolicy
    elif policy == 'lnlstm':
        policy_fn = CnnLnLstmPolicy
    if policy_fn is None:
        raise ValueError("Error: policy {} not implemented".format(policy))

    env_args = {'episode_life': False, 'clip_rewards': False, 'scale': True}
    env = VecFrameStack(make_atari_env(env_id, num_env, seed,
                                       wrapper_kwargs=env_args), 4)

    model = SelfImitationA2C(policy_fn, env, lr_schedule=lr_schedule,
                             tensorboard_log=tensorboard_log, verbose=1,
                             sil_update=sil_update, sil_beta=sil_beta)
    model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed,
                tb_log_name=tb_log_name)
    env.close()
def test_generate(tmp_path, generate_env):
    model, policy, env_name, n_env, n_episodes = generate_env

    if n_env > 1:
        env = make_atari_env(env_name, num_env=n_env, seed=0)
        model = model(policy, env, verbose=0)
    else:
        model = model(policy, env_name, verbose=0)

    dataset = generate_expert_traj(model, str(tmp_path / 'expert'),
                                   n_timesteps=300, n_episodes=n_episodes,
                                   image_folder=str(tmp_path / 'test_recorded_images'))

    assert set(dataset.keys()).issuperset(
        ['actions', 'obs', 'rewards', 'episode_returns', 'episode_starts'])
    assert sum(dataset['episode_starts']) == n_episodes
    assert len(dataset['episode_returns']) == n_episodes

    n_timesteps = len(dataset['episode_starts'])
    for key, val in dataset.items():
        if key != 'episode_returns':
            assert val.shape[0] == n_timesteps, \
                "inconsistent number of timesteps at '{}'".format(key)

    dataset_loaded = np.load(str(tmp_path / 'expert.npz'), allow_pickle=True)
    assert dataset.keys() == dataset_loaded.keys()
    for key in dataset.keys():
        assert (dataset[key] == dataset_loaded[key]).all(), \
            "different data at '{}'".format(key)

    # Cleanup folder
    if os.path.isdir(str(tmp_path / 'test_recorded_images')):
        shutil.rmtree(str(tmp_path / 'test_recorded_images'))
def main(cfg, run_dir):
    run_name = make_run_name(cfg)
    output_dir = run_dir / run_name
    output_dir.mkdir(parents=True)
    with (output_dir / 'config.json').open('w') as fp:
        json.dump(cfg, fp, indent=2)

    # Setting log levels to cut out minor errors
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    tf.logging.set_verbosity(tf.logging.ERROR)

    log_dir = output_dir / cfg['log_dir']
    tensorboard_dir = output_dir / cfg['tb_dir']
    configure(log_dir=str(log_dir),
              format_strs=['log', 'csv', 'tensorboard'],
              tensorboard_dir=str(tensorboard_dir))

    # Create and wrap the environment
    logging.info('Starting {env_name}'.format(**cfg))
    env = make_atari_env(env_id=cfg['env_name'], num_env=8, seed=cfg['train_seed'])
    env = VecFrameStack(env, n_stack=4)
    if cfg['normalize']:
        env = VecNormalize(env)

    # Setting all known random seeds (Python, Numpy, TF, Gym if available)
    set_global_seeds(cfg['train_seed'])

    logging.info('Running {algo}'.format(**cfg))
    algo = get_algo(cfg['algo'])
    policy = cfg['policy_type']
    feature_extractor = get_network_builder(cfg['network'])
    attn_loss = get_loss(cfg['attn_loss'])()
    model = algo(
        policy=policy,
        env=env,
        verbose=1,
        learning_rate=lambda frac: 0.00025 * frac,
        attn_loss=attn_loss,
        attn_coef=cfg['attn_coef'],
        policy_kwargs={
            'cnn_extractor': feature_extractor,
        },
        tensorboard_log=str(tensorboard_dir),
    )

    logging.info('Training for {time_steps} steps'.format(**cfg))
    # Training
    model.learn(
        total_timesteps=cfg['time_steps'],
        log_interval=cfg['log_interval'],
        tb_log_name=None,
        callback=Callback(output_dir),
    )
def main():
    env_id = 'BreakoutNoFrameskip-v4'
    num_env = 5
    seed = 0
    env_args = {'episode_life': False, 'clip_rewards': False}
    env = VecFrameStack(make_atari_env(env_id, num_env, seed,
                                       wrapper_kwargs=env_args), 4)

    graph = tf.Graph()
    with graph.as_default():
        sess = tf_util.make_session(graph=graph)
        with tf.variable_scope('input', reuse=False):
            input_x, process_x = observation_input(env.observation_space, num_env)
            print(env.action_space.shape)
            pdtype = make_proba_dist_type(env.action_space)
            actions_ph = pdtype.sample_placeholder([num_env], name="action_ph")
            one_hot_actions = tf.one_hot(actions_ph, env.action_space.n)
        print(input_x, process_x)
        print('action', actions_ph, one_hot_actions)

        beta = 0.1
        mu, sigma_sq, recons_x = build_network(process_x, one_hot_actions)
        print(mu)
        print(sigma_sq)
        print(recons_x)

        with tf.name_scope('losses'):
            recons_loss = tf.losses.mean_squared_error(input_x, recons_x,
                                                       scope='recons_loss')
            kl_divergence = -tf.reduce_mean(
                0.5 * (tf.add(1., sigma_sq) - tf.pow(mu, 2) - tf.exp(sigma_sq)),
                name='kl_divergence')
            loss = tf.add(recons_loss, tf.multiply(kl_divergence, beta),
                          name='objective')
            print(loss)

        summary = utility.summary({recons_loss: 'recons_loss',
                                   kl_divergence: 'kl_divergence',
                                   mu: 'phi_mu',
                                   sigma_sq: 'sigma_sq',
                                   recons_x: 'recons_x',
                                   input_x: 'input_x',
                                   }, env.observation_space.shape)

        optimizer = tf.train.AdamOptimizer(learning_rate=0.0002, beta1=0.5)
        train_op = optimizer.minimize(loss)

        for event_file in LOG_DIR.glob('event*'):
            event_file.unlink()
        writer = tf.summary.FileWriter(LOG_DIR.as_posix(), sess.graph)

        sess.run(tf.global_variables_initializer())
        observ = env.reset()
        actions = [env.action_space.sample() for _ in range(num_env)]
        print(env.observation_space)
        print(observ.shape)
        recons_image, summary_ = sess.run([recons_x, summary],
                                          feed_dict={input_x: observ,
                                                     actions_ph: actions})
        writer.add_summary(summary_, 0)
def create_test_env(env_id, n_envs=1, is_atari=False, stats_path=None, seed=0,
                    log_dir='', should_render=True, hyperparams=None, env_kwargs=None):
    if hyperparams is None:
        hyperparams = {}
    if env_kwargs is None:
        env_kwargs = {}

    # Create the environment and wrap it if necessary
    if is_atari:
        print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    else:
        # start_method = 'spawn' for thread safe
        env = DummyVecEnv([make_env(env_id, i, seed, log_dir, wrapper_class=None,
                                    env_kwargs=env_kwargs) for i in range(n_envs)])

    # Load saved stats for normalizing input and rewards
    # And optionally stack frames
    if stats_path is not None:
        if hyperparams['normalize']:
            print("Loading running average")
            print("with params: {}".format(hyperparams['normalize_kwargs']))
            env = VecNormalize(env, training=False, **hyperparams['normalize_kwargs'])
            if os.path.exists(os.path.join(stats_path, 'vecnormalize.pkl')):
                env = VecNormalize.load(os.path.join(stats_path, 'vecnormalize.pkl'), env)
                # Deactivate training and reward normalization
                env.training = False
                env.norm_reward = False
            else:
                # Legacy:
                env.load_running_average(stats_path)

        n_stack = hyperparams.get('frame_stack', 0)
        if n_stack > 0:
            print("Stacking {} frames".format(n_stack))
            env = VecFrameStack(env, n_stack)
    return env
def train(env_id, num_timesteps, seed, num_cpu):
    """
    train an ACKTR model on atari

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param num_cpu: (int) The number of cpu to train on
    """
    env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
    model = ACKTR(CnnPolicy, env, nprocs=num_cpu)
    model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed)
    env.close()
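# A minimal usage sketch for the ACKTR train() helper above; the environment id,
# timestep budget and cpu count are illustrative values.
if __name__ == '__main__':
    train(env_id='BreakoutNoFrameskip-v4', num_timesteps=int(1e6), seed=0, num_cpu=8)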
def create_env(n_envs):
    """
    Create the environment and wrap it if necessary

    :param n_envs: (int)
    :return: (gym.Env)
    """
    global hyperparams

    if is_atari:
        if args.verbose > 0:
            print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=args.seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif args.algo in ['dqn', 'ddpg']:
        if hyperparams.get('normalize', False):
            print("WARNING: normalization not supported yet for DDPG/DQN")
        # No env_wrapper applied for now as not using make_env()
        env = gym.make(env_id)
        env.seed(args.seed)
    else:
        if n_envs == 1:
            env = DummyVecEnv([make_env(env_id, 0, args.seed,
                                        wrapper_class=env_wrapper)])
        else:
            # env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
            # On most env, SubprocVecEnv does not help and is quite memory hungry
            env = DummyVecEnv([make_env(env_id, i, args.seed, wrapper_class=env_wrapper)
                               for i in range(n_envs)])
        if normalize:
            if args.verbose > 0:
                print("Normalizing input and return")
            env = VecNormalize(env, **normalize_kwargs)
    # Optional Frame-stacking
    if hyperparams.get('frame_stack', False):
        n_stack = hyperparams['frame_stack']
        env = VecFrameStack(env, n_stack)
        print("Stacking {} frames".format(n_stack))
        del hyperparams['frame_stack']
    return env
def test_generate(generate_env):
    model, policy, env_name, n_env, n_episodes = generate_env

    if n_env > 1:
        env = make_atari_env(env_name, num_env=n_env, seed=0)
        model = model(policy, env, verbose=0)
    else:
        model = model(policy, env_name, verbose=0)

    generate_expert_traj(model, 'expert', n_timesteps=1000,
                         n_episodes=n_episodes,
                         image_folder='test_recorded_images')
def evaluate(self, n_episodes=2):
    logging.basicConfig(level=logging.INFO)
    env_id = 'BreakoutNoFrameskip-v4'
    num_env = 1
    n_stack = 4
    left_lives = 5
    seed = 0
    episodes = 0
    score = 0
    frames = 0
    frames_per_episode = list()
    scores = [list() for i in range(n_episodes)]

    env = make_atari_env(env_id, num_env=num_env, seed=seed)
    env = VecFrameStack(env, n_stack=n_stack)
    obs = env.reset()

    while (n_episodes - episodes) > 0:
        frames += 1
        action, _states = self.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
        score += rewards[0]
        if dones:
            logging.debug('You died')
            logging.debug(f'Score = {score}')
            scores[episodes].append(score)
            score = 0
            left_lives -= 1
            if not left_lives:
                logging.debug('Episode ended')
                logging.info(f'Scores per life: {scores[episodes]}')
                frames_per_episode.append(frames)
                frames = 0
                episodes += 1
                left_lives = 5

    s = list(map(sum, scores))
    avg_s = int(sum(s) / len(s))
    avg_f = int(sum(frames_per_episode) / len(frames_per_episode))
    logging.info(f'Played {n_episodes} episodes')
    logging.info(f'Scores per episode : {s}')
    logging.info(f'Average score per episode : {avg_s}')
    logging.info(f'Average number of frames per episode : {avg_f}')
    return avg_f, avg_s
def test_pretrain_images(tmp_path):
    env = make_atari_env("PongNoFrameskip-v4", num_env=1, seed=0)
    env = VecFrameStack(env, n_stack=4)
    model = PPO2('CnnPolicy', env)
    generate_expert_traj(model, str(tmp_path / 'expert_pong'), n_timesteps=0,
                         n_episodes=1,
                         image_folder=str(tmp_path / 'pretrain_recorded_images'))

    expert_path = str(tmp_path / 'expert_pong.npz')
    dataset = ExpertDataset(expert_path=expert_path, traj_limitation=1,
                            batch_size=32, sequential_preprocessing=True)
    model.pretrain(dataset, n_epochs=2)

    shutil.rmtree(str(tmp_path / 'pretrain_recorded_images'))
    env.close()
    del dataset, model, env
def __init__(self, env_list=default_envs, algos_list=default_algos):
    self.env_list = env_list
    self.algos_list = algos_list
    self.n_algos = len(self.algos_list)
    self.envs = dict()
    self.rewards = defaultdict(dict)
    # The models have to be stored for the ensemble
    self.models = defaultdict(dict)

    for env_name in self.env_list:
        new_env = make_atari_env(env_name, num_env=1, seed=0)
        new_env = VecFrameStack(new_env, n_stack=4)
        self.envs[env_name] = new_env

    for algo in self.algos_list:
        for env_name, env in self.envs.items():
            self.models[env_name][algo] = loader(algo, env_name)
def test():
    model = PPO2.load("model.pkl")
    sess = model.sess
    env = VecFrameStack(make_atari_env("SpaceInvadersNoFrameskip-v0", 1, 123), 4)

    pi = model.act_model
    action_dist = pi.action
    action_one = pi.deterministic_action

    o = env.reset()
    while True:
        env.render()
        # a, _, _, _ = pi.step(obs=o, deterministic=True)
        a = sess.run(action_one, {pi.obs_ph: o})
        o, r, d, _ = env.step(a)
def test_ppo(env_id, seed, path_to_policy_params, n_envs=1):
    """
    Roll out a saved PPO2 policy and record a video.

    env_id: type str, identifies each environment uniquely
    seed: initial random seed
    path_to_policy_params: path to the saved PPO2 policy parameters
    n_envs: number of envs to run in parallel
    """
    # Stack 4 frames for the vectorized environment
    # Note: PPO2 works only with vectorized environment
    env = VecFrameStack(make_atari_env(env_id=env_id, num_env=n_envs, seed=seed), 4)

    # Load the trained PPO2 model
    # The policy is CnnPolicy from stable baselines and has been trained for 2e7 time steps on Pong
    model = PPO2.load(path_to_policy_params)

    vr = video_recorder.VideoRecorder(env,
                                      base_path="./videos/Pong_test_without_attack",
                                      enabled=True)
    obs = env.reset()
    ep_rew = [0.0]
    ep = 0
    for i in range(50000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        ep_rew[-1] += rewards
        env.render()
        vr.capture_frame()
        if dones:
            obs = env.reset()
            print('Net reward for episode ', ep, ': ', ep_rew[-1])
            if (ep + 1) % 10 == 0:
                print('Mean reward for last 10 episodes: ', np.mean(ep_rew[-10:]))
            ep_rew.append(0.0)
            ep += 1
    print('Number of timesteps completed: ', i + 1)
    env.close()
    vr.close()
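# A minimal usage sketch for test_ppo() above; the checkpoint path is a placeholder
# assumption and must point to an existing PPO2 save file.
if __name__ == '__main__':
    test_ppo(env_id='PongNoFrameskip-v4', seed=0,
             path_to_policy_params='./models/ppo2_pong.pkl', n_envs=1)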
def run_gail():
    parser = argparse.ArgumentParser()
    parser.add_argument('expert', type=str, default=None, help='Expert path (*.npz)')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--note', type=str, default='test')
    parser.add_argument('--env', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--num-steps', type=int, default=1000000)
    parser.add_argument('--policy', type=str, default='CnnPolicy',
                        choices=['CnnPolicy', 'CnnLstmPolicy', 'CnnLnLstmPolicy',
                                 'MlpPolicy', 'MlpLstmPolicy', 'MlpLnLstmPolicy'],
                        help='Policy architecture')
    args = parser.parse_args()

    logger.configure(os.path.join('logs', args.env, args.note))
    logger.info(args)

    if 'NoFrameskip' in args.env:
        env = VecFrameStack(make_atari_env(args.env, 1, args.seed), 4)
    else:
        import gym
        env = gym.make(args.env)

    dataset = ExpertDataset(expert_path=args.expert, batch_size=128,
                            train_fraction=0.99, verbose=1)
    model = GAIL(args.policy, env, dataset, timesteps_per_batch=1280, verbose=1)
    model.learn(len(dataset.train_loader) * 1280)
def main():
    args = parser.parse_args()
    with open(args.config) as f:
        config = yaml.safe_load(f)
    set_seed(config['seed'])

    writer = None
    # Will ERROR if outdir already exists
    if not os.path.exists(config['outdir']):
        os.makedirs(config['outdir'])
        if config['use_tensorboard']:
            os.makedirs(os.path.join(config['outdir'], 'tensorboard'))
            writer = SummaryWriter(os.path.join(config['outdir'], 'tensorboard'))
        # save a copy of the config file
        shutil.copyfile(args.config, os.path.join(config['outdir'], 'config.yaml'))
    else:
        print("ERROR: directory './{}' already exists!".format(config['outdir']))
        raise EnvironmentError

    logger = get_logger(config)

    # create environment
    env = make_atari_env(config['task'], num_env=config['parallel_envs'],
                         seed=config['seed'])
    env = VecFrameStack(env, n_stack=config['state_frames'])

    # default device for torch tensors
    device = torch.device('cuda') if config['use_gpu'] else torch.device('cpu')

    # start training
    a2c = A2C(config, env, device, logger, writer)
    a2c.train()
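# A minimal config sketch listing only the keys that main() above actually reads;
# the real YAML config presumably contains further fields (e.g. A2C hyperparameters),
# and all values here are illustrative assumptions.
example_config = {
    'seed': 0,                      # global random seed
    'outdir': 'runs/pong_a2c',      # must not exist yet; created by main()
    'use_tensorboard': True,        # create a SummaryWriter under <outdir>/tensorboard
    'task': 'PongNoFrameskip-v4',   # Atari env id passed to make_atari_env
    'parallel_envs': 16,            # num_env for make_atari_env
    'state_frames': 4,              # n_stack for VecFrameStack
    'use_gpu': True,                # select cuda vs cpu torch device
}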
import argparse
from threading import Thread

from stable_baselines import PPO2
from stable_baselines.common.cmd_util import make_atari_env
from stable_baselines.common.vec_env import VecFrameStack

parser = argparse.ArgumentParser()
parser.add_argument("--angle", type=float, default=0.0)  # camera angle: 0 15 30 45 60
parser.add_argument("--system", type=str, default="Windows")
parser.add_argument("--factor", type=int, default=0)
args = parser.parse_args()

scale_factor_arr = [0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6]
scale_factor_ind = args.factor
game_system = args.system
env_name = f"jupong-3D-{game_system}-v0"

env = make_atari_env(env_name, num_env=1, seed=0)
env.envs[0].reset()
env.envs[0].scale_paddles(scale_factor_arr[scale_factor_ind])
env = VecFrameStack(env, n_stack=4)

save_path = f"ppo2_save/ppo2_save_cam_angle_{args.angle}_4"
model = PPO2.load(save_path, env=None)
model.set_env(env)


def process_environment(file_path, scale_factor_ind):
    reward_arr = []
    mean_reward = 0.0
    obs = env.reset()
    reward_sum = 0.0
    while True:
        action, _states = model.predict(obs)
def create_test_env(env_id, n_envs=1, is_atari=False, stats_path=None, seed=0,
                    log_dir='', should_render=True, hyperparams=None):
    """
    Create environment for testing a trained agent

    :param env_id: (str)
    :param n_envs: (int) number of processes
    :param is_atari: (bool)
    :param stats_path: (str) path to folder containing saved running averaged
    :param seed: (int) Seed for random number generator
    :param log_dir: (str) Where to log rewards
    :param should_render: (bool) For Pybullet env, display the GUI
    :param hyperparams: (dict) Additional hyperparams (ex: n_stack)
    :return: (gym.Env)
    """
    # HACK to save logs
    if log_dir is not None:
        os.environ["OPENAI_LOG_FORMAT"] = 'csv'
        os.environ["OPENAI_LOGDIR"] = os.path.abspath(log_dir)
        os.makedirs(log_dir, exist_ok=True)
        logger.configure()

    # Create the environment and wrap it if necessary
    env_wrapper = get_wrapper_class(hyperparams)
    if 'env_wrapper' in hyperparams.keys():
        del hyperparams['env_wrapper']

    if is_atari:
        print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif n_envs > 1:
        # start_method = 'spawn' for thread safe
        env = SubprocVecEnv([make_env(env_id, i, seed, log_dir, wrapper_class=env_wrapper)
                             for i in range(n_envs)])
    # Pybullet envs does not follow gym.render() interface
    elif "Bullet" in env_id:
        spec = gym.envs.registry.env_specs[env_id]
        try:
            class_ = load(spec.entry_point)
        except AttributeError:
            # Backward compatibility with gym
            class_ = load(spec._entry_point)

        # HACK: force SubprocVecEnv for Bullet env that does not
        # have a render argument
        render_name = None
        use_subproc = 'renders' not in inspect.getfullargspec(class_.__init__).args
        if not use_subproc:
            render_name = 'renders'
        # Dev branch of pybullet
        # use_subproc = use_subproc and 'render' not in inspect.getfullargspec(class_.__init__).args
        # if not use_subproc and render_name is None:
        #     render_name = 'render'

        # Create the env, with the original kwargs, and the new ones overriding them if needed
        def _init():
            # TODO: fix for pybullet locomotion envs
            env = class_(**{**spec._kwargs}, **{render_name: should_render})
            env.seed(0)
            if log_dir is not None:
                env = Monitor(env, os.path.join(log_dir, "0"), allow_early_resets=True)
            return env

        if use_subproc:
            env = SubprocVecEnv([make_env(env_id, 0, seed, log_dir,
                                          wrapper_class=env_wrapper)])
        else:
            env = DummyVecEnv([_init])
    else:
        env = DummyVecEnv([make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper)])

    # Load saved stats for normalizing input and rewards
    # And optionally stack frames
    if stats_path is not None:
        if hyperparams['normalize']:
            print("Loading running average")
            print("with params: {}".format(hyperparams['normalize_kwargs']))
            env = VecNormalize(env, training=False, **hyperparams['normalize_kwargs'])
            env.load_running_average(stats_path)

        n_stack = hyperparams.get('frame_stack', 0)
        if n_stack > 0:
            print("Stacking {} frames".format(n_stack))
            env = VecFrameStack(env, n_stack)
    return env
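# A minimal usage sketch for create_test_env() above; the environment id and
# hyperparams dict are illustrative assumptions (only keys the function reads).
test_env = create_test_env('BreakoutNoFrameskip-v4', n_envs=1, is_atari=True,
                           stats_path=None, seed=0, log_dir=None,
                           should_render=False, hyperparams={'normalize': False})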
from stable_baselines.common.cmd_util import make_atari_env
from stable_baselines.common.vec_env import VecFrameStack
from stable_baselines import ACER

# There already exists an environment generator
# that will make and wrap atari environments correctly.
# Here we are also multiprocessing training (num_env=4 => 4 processes)
env = make_atari_env('PongNoFrameskip-v4', num_env=4, seed=0)
# Frame-stacking with 4 frames
env = VecFrameStack(env, n_stack=4)

model = ACER('CnnPolicy', env, verbose=1)
model.learn(total_timesteps=25000)
# save
model.save("cnn_pong")

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
        hout = get_ops(get_ops(get_ops(hin, 8, 4), 4, 2), 3, 1)
        wout = get_ops(get_ops(get_ops(win, 8, 4), 4, 2), 3, 1)
        self.linear = 64 * hout * wout
        self.lin0 = nn.Linear(self.linear, 512, bias=True)
        self.lin1 = nn.Linear(512, actions)

    def forward(self, t):
        t = f.relu(self.conv0(t))
        t = f.relu(self.conv1(t))
        t = f.relu(self.conv2(t))
        # Flatten in NHWC order to match the TF weight layout transferred below
        # (note: reshape(-1) also folds in the batch dimension, so this assumes batch size 1)
        t = t.permute(0, 2, 3, 1).reshape(-1)
        t = f.relu(self.lin0(t))
        t = self.lin1(t)
        # Output will be the logits
        return t


env = VecFrameStack(make_atari_env(env_id='PongNoFrameskip-v4', num_env=1, seed=2), 4)
env.reset().shape
h, w = env.reset().shape[1], env.reset().shape[2]
dqn_net = network(h, w, env.action_space.n)
ppo_net = network(h, w, env.action_space.n)
ppo2_net = network(h, w, env.action_space.n)

# params ppo
# Transfer of weights from saved model to newly created network
# conv weights
x = np.transpose(ppo_params[1][0], [3, 2, 0, 1])
ppo_net.conv0.weight = torch.nn.parameter.Parameter(torch.tensor(x))
x = np.transpose(ppo_params[1][2], [3, 2, 0, 1])
ppo_net.conv1.weight = torch.nn.parameter.Parameter(torch.tensor(x))
x = np.transpose(ppo_params[1][4], [3, 2, 0, 1])
ppo_net.conv2.weight = torch.nn.parameter.Parameter(torch.tensor(x))
def main(cfg, model_path, video_path, visualization_method, n_gradient_samples, obs_style):
    set_global_seeds(cfg['eval_seed'])
    env = make_atari_env(cfg['env_name'], num_env=1, seed=cfg['eval_seed'])
    env = VecFrameStack(env, n_stack=4)  # stack 4 frames
    if cfg['normalize']:
        # Not setting training=False because that seems to ruin performance
        env = VecNormalize(env)
    model = get_algo(cfg['algo']).load(
        str(model_path),
        env,
        verbose=1,
        learning_rate=lambda frac: 0.00025 * frac,
        attn_loss=get_loss(cfg['attn_loss'])(),
        attn_coef=cfg['attn_coef'],
        policy_kwargs={'cnn_extractor': get_network_builder(cfg['network'])},
    )

    observations = []
    saliency_maps = []

    input_tensor = model.sess.graph.get_tensor_by_name("input/Ob:0")
    input_cast_tensor = model.sess.graph.get_tensor_by_name("input/Cast:0")
    a2_activations = model.sess.graph.get_tensor_by_name("model/a2/add:0")
    attn_tensor = model.sess.graph.get_tensor_by_name('model/attn:0')
    attn_tensor = tf.reduce_sum(attn_tensor, axis=-1)

    sr = SaliencyRenderer(
        sess=model.sess,
        gradient_source_tensor=input_cast_tensor,
        attention_tensor=a2_activations,
        selection_method='SUM',
    )

    obs = env.reset()
    for _ in tqdm(range(300), postfix='playing', ncols=76):
        if obs_style == 'human':
            stored_obs = np.stack(env.get_images()) / 255
        else:
            stored_obs = obs[:, :, :, -1].copy()
        observations.append(stored_obs)

        if visualization_method == 'conv2d_transpose':
            action, _states, attn = model.predict(obs, extra=attn_tensor)
            saliency_maps.append(attn)
        else:
            action, _states = model.predict(obs)
            smap = sr.get_basic_input_saliency_map(
                input_tensor,
                obs,
                n_gradient_samples=n_gradient_samples,
                gradient_sigma_spread=0.15,
                aggregation_method={
                    'simonyan': None,
                    'smoothgrad': 'smoothgrad',
                    'vargrad': 'vargrad',
                }[visualization_method])[..., -1]
            saliency_maps.append(smap)

        obs, rewards, dones, info = env.step(action)

    if visualization_method == 'conv2d_transpose':
        saliency_maps = render_attn(saliency_maps, 36, 8, 0)

    saliency_cutoff = max(np.percentile(attn, 99) for attn in saliency_maps)
    for smap in saliency_maps:
        smap /= saliency_cutoff
        np.clip(smap, a_min=0, a_max=1, out=smap)

    with VideoWriter(video_path, fps=10) as writer:
        for obs, smap in tqdm(zip(observations, saliency_maps),
                              postfix='writing video',
                              total=len(observations), ncols=76):
            if obs_style == 'human':
                b, h, w = obs.shape[:-1]
                assert obs.shape[-1] == 3
                resized_attn = np.stack(
                    [resize(smap[bb, ...], (h, w)) for bb in range(b)])
                frame = 0.5 * (obs + resized_attn[..., np.newaxis])
            else:
                frame = np.stack(
                    [
                        np.zeros_like(obs),
                        smap,
                        obs.astype(np.float32)  # / 255
                    ],
                    axis=-1)
                frame = resize(frame, (1, 160, 160, 3))
            writer.write_frame(frame)
    help="interval between saving model (default: 0, means don't save)")
args = parser.parse_args()

dtype = torch.float64
torch.set_default_dtype(dtype)
if args.cuda:
    args.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
else:
    args.device = torch.device('cpu')
args.num_threads = mp.cpu_count() - 1

"""environment"""
env = make_atari_env(args.env_name, 1, args.seed)
env = VecFrameStack(env, n_stack=4)

torch.manual_seed(args.seed)
np.random.seed(args.seed)
random.seed(args.seed)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

# load trajectory
args.expert_traj_path = "assets/expert_traj/{}_ppo_0.p".format(args.env_name)
expert_trajs, _, _ = pickle.load(open(args.expert_traj_path, "rb"))

imitator = GAILAtari(args, state_dim, action_dim)