Example #1
    def __init__(self, trial_context: PyTorchTrialContext) -> None:
        self.context = trial_context
        self.download_directory = f"/tmp/data-rank{self.context.distributed.get_rank()}"
        # self.logger = TorchWriter()
        self.n_stack = self.context.get_hparam("n_stack")
        self.env_name = self.context.get_hparam("env_name")
        self.num_envs = self.context.get_hparam("num_envs")
        self.rollout_size = self.context.get_hparam("rollout_size")
        self.curiousity = self.context.get_hparam("curiousity")
        self.lr = self.context.get_hparam("lr")
        self.icm_beta = self.context.get_hparam("icm_beta")
        self.value_coeff = self.context.get_hparam("value_coeff")
        self.entropy_coeff = self.context.get_hparam("entropy_coeff")
        self.max_grad_norm = self.context.get_hparam("max_grad_norm")

        env = make_atari_env(self.env_name, num_env=self.num_envs, seed=42)
        self.env = VecFrameStack(env, n_stack=self.n_stack)
        eval_env = make_atari_env(self.env_name, num_env=1, seed=42)
        self.eval_env = VecFrameStack(eval_env, n_stack=self.n_stack)

        # constants
        self.in_size = self.context.get_hparam("in_size")
        self.num_actions = env.action_space.n

        def init_(m):
            return init(m, nn.init.orthogonal_,
                        lambda x: nn.init.constant_(x, 0))

        self.feat_enc_net = self.context.Model(
            FeatureEncoderNet(self.n_stack, self.in_size))
        self.actor = self.context.Model(
            init_(nn.Linear(self.feat_enc_net.hidden_size, self.num_actions)))
        self.critic = self.context.Model(
            init_(nn.Linear(self.feat_enc_net.hidden_size, 1)))
        self.set_recurrent_buffers(self.num_envs)

        params = list(self.feat_enc_net.parameters()) + list(
            self.actor.parameters()) + list(self.critic.parameters())
        self.opt = self.context.Optimizer(torch.optim.Adam(params, self.lr))

        self.is_cuda = torch.cuda.is_available()
        self.storage = RolloutStorage(self.rollout_size,
                                      self.num_envs,
                                      self.env.observation_space.shape[0:-1],
                                      self.n_stack,
                                      is_cuda=self.is_cuda,
                                      value_coeff=self.value_coeff,
                                      entropy_coeff=self.entropy_coeff)

        obs = self.env.reset()
        self.storage.states[0].copy_(self.storage.obs2tensor(obs))

        self.writer = SummaryWriter(log_dir="/tmp/tensorboard")
        self.global_eval_count = 0
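The hyperparameters read via get_hparam() above live in the Determined experiment configuration rather than in this snippet. A minimal sketch of such a set follows; the values are placeholders chosen purely for illustration, not taken from the example.
# Placeholder values for illustration only; the real settings belong in the
# experiment configuration, not in this code.
hyperparameters = {
    "n_stack": 4,
    "env_name": "PongNoFrameskip-v4",
    "num_envs": 8,
    "rollout_size": 5,
    "curiousity": False,      # key spelling must match get_hparam("curiousity") above
    "lr": 2.5e-4,
    "icm_beta": 0.2,
    "value_coeff": 0.5,
    "entropy_coeff": 0.02,
    "max_grad_norm": 0.5,
    "in_size": 288,
}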
Example #2
def train(env_id, num_timesteps, seed, policy):
    """
    Train PPO2 model for atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    """

    env = VecFrameStack(make_atari_env(env_id, 8, seed), 4)
    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy
    }[policy]
    model = PPO2(policy=policy,
                 env=env,
                 n_steps=128,
                 nminibatches=4,
                 lam=0.95,
                 gamma=0.99,
                 noptepochs=4,
                 ent_coef=.01,
                 learning_rate=lambda f: f * 2.5e-4,
                 cliprange=lambda f: f * 0.1,
                 verbose=1)
    model.learn(total_timesteps=num_timesteps)
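A quick sketch of the schedule semantics assumed here: when stable-baselines PPO2 is given callables for learning_rate and cliprange, it calls them with the fraction of training remaining, which decays from 1.0 towards 0.0, so the lambdas above decay linearly from 2.5e-4 and 0.1.
learning_rate = lambda f: f * 2.5e-4
cliprange = lambda f: f * 0.1

for remaining in (1.0, 0.5, 0.1):
    # remaining = fraction of training left; both schedules shrink linearly with it
    print(remaining, learning_rate(remaining), cliprange(remaining))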
Example #3
def train(env_id, num_timesteps, seed, policy,
          n_envs=8, nminibatches=4, n_steps=128):
    """
    Train PPO2 model for atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    :param n_envs: (int) Number of parallel environments
    :param nminibatches: (int) Number of training minibatches per update. For recurrent policies,
        the number of environments run in parallel should be a multiple of nminibatches.
    :param n_steps: (int) The number of steps to run for each environment per update
        (i.e. batch size is n_steps * n_env where n_env is number of environment copies running in parallel)
    """
    env = make_atari_env(env_id, n_envs, seed)
    env = VecFrameStack(env, 4)
    policy = {'cnn': CnnPolicy, 'lstm': CnnLstmPolicy, 'lnlstm': CnnLnLstmPolicy, 'mlp': MlpPolicy}[policy]
    model = PPO2(policy=policy, env=env, n_steps=n_steps, nminibatches=nminibatches,
                 lam=0.95, gamma=0.99, noptepochs=4, ent_coef=.01,
                 learning_rate=lambda f: f * 2.5e-4, cliprange=lambda f: f * 0.1, verbose=1)
    model.learn(total_timesteps=num_timesteps)
    model.save('/serverdata/rohit/stablebaselines/atari/ppo/{}'.format(env_id), 'csv')

    env.close()
    # Free memory
    del model
Example #4
def train(env_id, num_timesteps, seed, policy, lr_schedule, num_env):
    """
    Train A2C model for atari environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...)
    :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant',
                                 'double_linear_con', 'middle_drop' or 'double_middle_drop')
    :param num_env: (int) The number of environments
    """
    policy_fn = None
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = CnnLstmPolicy
    elif policy == 'lnlstm':
        policy_fn = CnnLnLstmPolicy
    if policy_fn is None:
        raise ValueError("Error: policy {} not implemented".format(policy))

    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)

    model = A2C(policy_fn, env, lr_schedule=lr_schedule)
    model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed)
    env.close()
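A possible invocation of this helper, as a sketch: the environment id and values below are illustrative, and the policy classes are assumed to be imported at module level as in the snippet.
if __name__ == '__main__':
    train('BreakoutNoFrameskip-v4', num_timesteps=int(1e6), seed=0,
          policy='cnn', lr_schedule='constant', num_env=8)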
Example #5
def main():
  env_id = 'PongNoFrameskip-v4'
  # env_id = 'MsPacmanNoFrameskip-v4'
  # env_id = 'BreakoutNoFrameskip-v4'
  num_env = 16
  num_steps = 5
  num_batch = num_env * num_steps

  seed = 0
  env_args = {'episode_life': False, 'clip_rewards': False, 'scale': False,
              'transpose_image': True}
  env = VecFrameStack(make_atari_env(env_id, num_env, seed, wrapper_kwargs=env_args), 4)

  network = ConvVAE([84, 84], 2048)

  observs = []
  actions = []
  next_observs = []

  observ = env.reset()
  observ = observ.transpose(0, 3, 2, 1)
  observ = tensor(observ)
  print(observ.shape)
  out = network(observ)[0]
  print(out.shape)
Example #6
def train(env_id,
          num_timesteps,
          seed,
          policy,
          attack=False,
          n_envs=8,
          nminibatches=4,
          n_steps=128):

    model = PPO2.load("model.pkl")
    env = VecFrameStack(make_atari_env(env_id, n_envs, seed), 4)
    if attack:
        env = VecFrameStack(
            make_adversarial_atari_env(env_id, n_envs, seed, model), 4)

    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy
    }[policy]
    #    model = PPO2(policy=policy, env=env, n_steps=n_steps, nminibatches=nminibatches,
    #                lam=0.95, gamma=0.99, noptepochs=4, ent_coef=.01,
    #                 learning_rate=lambda f: f * 2.5e-4, cliprange=lambda f: f * 0.1, verbose=1)
    model.learn(total_timesteps=num_timesteps)
    model.save("model")
    env.close()
    # Free memory
    del model
Example #7
def test_generate(generate_env):
    model, policy, env_name, n_env, n_episodes = generate_env

    if n_env > 1:
        env = make_atari_env(env_name, num_env=n_env, seed=0)
        model = model(policy, env, verbose=0)
    else:
        model = model(policy, env_name, verbose=0)

    dataset = generate_expert_traj(model,
                                   'expert',
                                   n_timesteps=1000,
                                   n_episodes=n_episodes,
                                   image_folder='test_recorded_images')

    assert set(dataset.keys()).issuperset(
        ['actions', 'obs', 'rewards', 'episode_returns', 'episode_starts'])
    assert sum(dataset['episode_starts']) == n_episodes
    assert len(dataset['episode_returns']) == n_episodes
    n_timesteps = len(dataset['episode_starts'])
    for key, val in dataset.items():
        if key != 'episode_returns':
            assert val.shape[
                0] == n_timesteps, "inconsistent number of timesteps at '{}'".format(
                    key)

    dataset_loaded = np.load('expert.npz')
    assert dataset.keys() == dataset_loaded.keys()
    for key in dataset.keys():
        assert (dataset[key] == dataset_loaded[key]
                ).all(), "different data at '{}'".format(key)
Example #8
def train(env_id, num_timesteps, seed, policy, lr_schedule, num_cpu):
    """
    train an ACER model on atari

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...)
    :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant',
                                 'double_linear_con', 'middle_drop' or 'double_middle_drop')
    :param num_cpu: (int) The number of cpu to train on
    """
    env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = CnnLstmPolicy
    else:
        warnings.warn("Policy {} not implemented".format(policy))
        return

    model = ACER(policy_fn, env, lr_schedule=lr_schedule, buffer_size=5000)
    model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed)
    env.close()
    # Free memory
    del model
Example #9
    def create_env(n_envs, eval_env=False):
        """
        Create the environment and wrap it if necessary
        :param n_envs: (int)
        :param eval_env: (bool) Whether it is an environment used for evaluation or not
        :return: (Union[gym.Env, VecEnv])
        """
        global hyperparams

        # Do not log eval env (issue with writing the same file)
        log_dir = None if eval_env else save_path

        if is_atari:
            if args.verbose > 0:
                print("Using Atari wrapper")
            env = make_atari_env(env_id, num_env=n_envs, seed=args.seed)
            # Frame-stacking with 4 frames
            env = VecFrameStack(env, n_stack=4)
        elif algo_ in ['dqn', 'ddpg']:
            if hyperparams.get('normalize', False):
                print("WARNING: normalization not supported yet for DDPG/DQN")
            env = gym.make(env_id)
            env.seed(args.seed)
            if env_wrapper is not None:
                env = env_wrapper(env)
        else:
            if n_envs == 1:
                env = DummyVecEnv([
                    make_env(env_id,
                             0,
                             args.seed,
                             wrapper_class=env_wrapper,
                             log_dir=log_dir)
                ])
            else:
                # env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
                # For most envs, SubprocVecEnv does not help and is quite memory hungry
                env = DummyVecEnv([
                    make_env(env_id,
                             i,
                             args.seed,
                             log_dir=log_dir,
                             wrapper_class=env_wrapper) for i in range(n_envs)
                ])
            if normalize:
                if args.verbose > 0:
                    if len(normalize_kwargs) > 0:
                        print("Normalization activated: {}".format(
                            normalize_kwargs))
                    else:
                        print("Normalizing input and reward")
                env = VecNormalize(env, **normalize_kwargs)
        # Optional Frame-stacking
        if hyperparams.get('frame_stack', False):
            n_stack = hyperparams['frame_stack']
            env = VecFrameStack(env, n_stack)
            print("Stacking {} frames".format(n_stack))
            del hyperparams['frame_stack']
        return env
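create_env() relies on a make_env() factory defined elsewhere in the training script. Below is a minimal sketch of what such a factory typically looks like; the helper in the original repository may differ in its details.
import os
import gym
from stable_baselines.bench import Monitor

def make_env(env_id, rank=0, seed=0, log_dir=None, wrapper_class=None):
    """Return a thunk that builds, seeds, optionally wraps and monitors one env."""
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        if wrapper_class is not None:
            env = wrapper_class(env)
        if log_dir is not None:
            env = Monitor(env, os.path.join(log_dir, str(rank)),
                          allow_early_resets=True)
        return env
    return _init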
Example #10
def train(env_id, num_timesteps, seed, policy, lr_schedule, num_env,
          sil_update, sil_beta,
          tensorboard_log, tb_log_name):
  """
  Train A2C model for atari environment, for testing purposes

  :param env_id: (str) Environment ID
  :param num_timesteps: (int) The total number of samples
  :param seed: (int) The initial seed for training
  :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...)
  :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant',
                               'double_linear_con', 'middle_drop' or 'double_middle_drop')
  :param num_env: (int) The number of environments
  """
  policy_fn = None
  if policy == 'cnn':
    policy_fn = CnnPolicy
  elif policy == 'lstm':
    policy_fn = CnnLstmPolicy
  elif policy == 'lnlstm':
    policy_fn = CnnLnLstmPolicy
  if policy_fn is None:
    raise ValueError("Error: policy {} not implemented".format(policy))

  env_args = {'episode_life': False, 'clip_rewards': False, 'scale': True}
  env = VecFrameStack(make_atari_env(env_id, num_env, seed, wrapper_kwargs=env_args), 4)

  model = SelfImitationA2C(policy_fn, env, lr_schedule=lr_schedule, tensorboard_log=tensorboard_log,
                           verbose=1, sil_update=sil_update, sil_beta=sil_beta)
  model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed, tb_log_name=tb_log_name)
  env.close()
Example #11
def test_generate(tmp_path, generate_env):
    model, policy, env_name, n_env, n_episodes = generate_env

    if n_env > 1:
        env = make_atari_env(env_name, num_env=n_env, seed=0)
        model = model(policy, env, verbose=0)
    else:
        model = model(policy, env_name, verbose=0)

    dataset = generate_expert_traj(model,
                                   str(tmp_path / 'expert'),
                                   n_timesteps=300,
                                   n_episodes=n_episodes,
                                   image_folder=str(tmp_path /
                                                    'test_recorded_images'))

    assert set(dataset.keys()).issuperset(
        ['actions', 'obs', 'rewards', 'episode_returns', 'episode_starts'])
    assert sum(dataset['episode_starts']) == n_episodes
    assert len(dataset['episode_returns']) == n_episodes
    n_timesteps = len(dataset['episode_starts'])
    for key, val in dataset.items():
        if key != 'episode_returns':
            assert val.shape[
                0] == n_timesteps, "inconsistent number of timesteps at '{}'".format(
                    key)

    dataset_loaded = np.load(str(tmp_path / 'expert.npz'), allow_pickle=True)
    assert dataset.keys() == dataset_loaded.keys()
    for key in dataset.keys():
        assert (dataset[key] == dataset_loaded[key]
                ).all(), "different data at '{}'".format(key)
    # Cleanup folder
    if os.path.isdir(str(tmp_path / 'test_recorded_images')):
        shutil.rmtree(str(tmp_path / 'test_recorded_images'))
Example #12
def main(cfg, run_dir):
    run_name = make_run_name(cfg)
    output_dir = run_dir / run_name
    output_dir.mkdir(parents=True)

    with (output_dir / 'config.json').open('w') as fp:
        json.dump(cfg, fp, indent=2)

    # Setting log levels to cut out minor errors
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    tf.logging.set_verbosity(tf.logging.ERROR)

    log_dir = output_dir / cfg['log_dir']
    tensorboard_dir = output_dir / cfg['tb_dir']

    configure(log_dir=str(log_dir),
              format_strs=['log', 'csv', 'tensorboard'],
              tensorboard_dir=str(tensorboard_dir))

    # Create and wrap the environment
    logging.info('Starting {env_name}'.format(**cfg))
    env = make_atari_env(env_id=cfg['env_name'],
                         num_env=8,
                         seed=cfg['train_seed'])
    env = VecFrameStack(env, n_stack=4)
    if cfg['normalize']:
        env = VecNormalize(env)

    # Setting all known random seeds (Python, Numpy, TF, Gym if available)
    set_global_seeds(cfg['train_seed'])

    logging.info('Running {algo}'.format(**cfg))

    algo = get_algo(cfg['algo'])
    policy = cfg['policy_type']
    feature_extractor = get_network_builder(cfg['network'])
    attn_loss = get_loss(cfg['attn_loss'])()
    model = algo(
        policy=policy,
        env=env,
        verbose=1,
        learning_rate=lambda frac: 0.00025 * frac,
        attn_loss=attn_loss,
        attn_coef=cfg['attn_coef'],
        policy_kwargs={
            'cnn_extractor': feature_extractor,
        },
        tensorboard_log=str(tensorboard_dir),
    )

    logging.info('Training for {time_steps} steps'.format(**cfg))

    # Training
    model.learn(
        total_timesteps=cfg['time_steps'],
        log_interval=cfg['log_interval'],
        tb_log_name=None,
        callback=Callback(output_dir),
    )
Example #13
def main():
  env_id = 'BreakoutNoFrameskip-v4'
  num_env = 5
  seed = 0
  env_args = {'episode_life': False, 'clip_rewards': False}
  env = VecFrameStack(make_atari_env(env_id, num_env, seed, wrapper_kwargs=env_args), 4)
  graph = tf.Graph()
  with graph.as_default():
    sess = tf_util.make_session(graph=graph)
    with tf.variable_scope('input', reuse=False):
      input_x, process_x = observation_input(env.observation_space, num_env)
      print(env.action_space.shape)
      pdtype = make_proba_dist_type(env.action_space)
      actions_ph = pdtype.sample_placeholder([num_env], name="action_ph")
      one_hot_actions = tf.one_hot(actions_ph, env.action_space.n)
      
    print(input_x, process_x)
    print('action', actions_ph, one_hot_actions)

    beta = 0.1
    mu, sigma_sq, recons_x = build_network(process_x, one_hot_actions)
    print(mu)
    print(sigma_sq)
    print(recons_x)

    with tf.name_scope('losses'):
      recons_loss = tf.losses.mean_squared_error(input_x, recons_x, scope='recons_loss')
      kl_divergence = -tf.reduce_mean(0.5 * (tf.add(1., sigma_sq) - tf.pow(mu, 2) - tf.exp(sigma_sq)),
                                      name='kl_divergence')
      loss = tf.add(recons_loss,
                    tf.multiply(
                      kl_divergence,
                      beta), name='objective')
      print(loss)
    summary = utility.summary({recons_loss: 'recons_loss',
                               kl_divergence: 'kl_divergence',
                               mu: 'phi_mu',
                               sigma_sq: 'sigma_sq',
                               recons_x: 'recons_x',
                               input_x: 'input_x',
                               }, env.observation_space.shape)
    optimizer = tf.train.AdamOptimizer(learning_rate=0.0002, beta1=0.5)
    train_op = optimizer.minimize(loss)

    for event_file in LOG_DIR.glob('event*'):
      event_file.unlink()
    writer = tf.summary.FileWriter(LOG_DIR.as_posix(), sess.graph)
    sess.run(tf.global_variables_initializer())

    observ = env.reset()
    actions = [env.action_space.sample() for _ in range(num_env)]
    print(env.observation_space)
    print(observ.shape)

    recons_image, summary_ = sess.run([recons_x, summary],
                                      feed_dict={input_x: observ,
                                                 actions_ph: actions})
    writer.add_summary(summary_, 0)
Example #14
def create_test_env(env_id,
                    n_envs=1,
                    is_atari=False,
                    stats_path=None,
                    seed=0,
                    log_dir='',
                    should_render=True,
                    hyperparams=None,
                    env_kwargs=None):

    if hyperparams is None:
        hyperparams = {}

    if env_kwargs is None:
        env_kwargs = {}

    # Create the environment and wrap it if necessary
    if is_atari:
        print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    else:
        # start_method = 'spawn' for thread safe
        env = DummyVecEnv([
            make_env(env_id,
                     i,
                     seed,
                     log_dir,
                     wrapper_class=None,
                     env_kwargs=env_kwargs) for i in range(n_envs)
        ])

    # Load saved stats for normalizing input and rewards
    # And optionally stack frames
    if stats_path is not None:
        if hyperparams['normalize']:
            print("Loading running average")
            print("with params: {}".format(hyperparams['normalize_kwargs']))
            env = VecNormalize(env,
                               training=False,
                               **hyperparams['normalize_kwargs'])

            if os.path.exists(os.path.join(stats_path, 'vecnormalize.pkl')):
                env = VecNormalize.load(
                    os.path.join(stats_path, 'vecnormalize.pkl'), env)
                # Deactivate training and reward normalization
                env.training = False
                env.norm_reward = False
            else:
                # Legacy:
                env.load_running_average(stats_path)

        n_stack = hyperparams.get('frame_stack', 0)
        if n_stack > 0:
            print("Stacking {} frames".format(n_stack))
            env = VecFrameStack(env, n_stack)
    return env
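The 'vecnormalize.pkl' file loaded above is produced on the training side. A minimal sketch of saving it, assuming the training env is (or wraps) a VecNormalize instance and a stable-baselines 2.x version that provides VecNormalize.save/.load:
import os
from stable_baselines.common.vec_env import VecNormalize

def save_normalization_stats(env, stats_path):
    # Persist the running observation/reward statistics next to the trained model
    if isinstance(env, VecNormalize):
        os.makedirs(stats_path, exist_ok=True)
        env.save(os.path.join(stats_path, 'vecnormalize.pkl'))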
Example #15
def train(env_id, num_timesteps, seed, num_cpu):
    """
    train an ACKTR model on atari

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param num_cpu: (int) The number of cpu to train on
    """
    env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
    model = ACKTR(CnnPolicy, env, nprocs=num_cpu)
    model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed)
    env.close()
Example #16
        def create_env(n_envs):
            """
            Create the environment and wrap it if necessary
            :param n_envs: (int)
            :return: (gym.Env)
            """
            global hyperparams

            if is_atari:
                if args.verbose > 0:
                    print("Using Atari wrapper")
                env = make_atari_env(env_id, num_env=n_envs, seed=args.seed)
                # Frame-stacking with 4 frames
                env = VecFrameStack(env, n_stack=4)
            elif args.algo in ['dqn', 'ddpg']:
                if hyperparams.get('normalize', False):
                    print(
                        "WARNING: normalization not supported yet for DDPG/DQN"
                    )
                # No env_wrapper applied for now as not using make_env()
                env = gym.make(env_id)
                env.seed(args.seed)
            else:
                if n_envs == 1:
                    env = DummyVecEnv([
                        make_env(env_id,
                                 0,
                                 args.seed,
                                 wrapper_class=env_wrapper)
                    ])
                else:
                    # env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
                    # For most envs, SubprocVecEnv does not help and is quite memory hungry
                    env = DummyVecEnv([
                        make_env(env_id,
                                 i,
                                 args.seed,
                                 wrapper_class=env_wrapper)
                        for i in range(n_envs)
                    ])
                if normalize:
                    if args.verbose > 0:
                        print("Normalizing input and return")
                    env = VecNormalize(env, **normalize_kwargs)
            # Optional Frame-stacking
            if hyperparams.get('frame_stack', False):
                n_stack = hyperparams['frame_stack']
                env = VecFrameStack(env, n_stack)
                print("Stacking {} frames".format(n_stack))
                del hyperparams['frame_stack']
            return env
Example #17
def test_generate(generate_env):
    model, policy, env_name, n_env, n_episodes = generate_env

    if n_env > 1:
        env = make_atari_env(env_name, num_env=n_env, seed=0)
        model = model(policy, env, verbose=0)
    else:
        model = model(policy, env_name, verbose=0)

    generate_expert_traj(model,
                         'expert',
                         n_timesteps=1000,
                         n_episodes=n_episodes,
                         image_folder='test_recorded_images')
Example #18
    def evaluate(self, n_episodes=2):

        logging.basicConfig(level=logging.INFO)

        env_id = 'BreakoutNoFrameskip-v4'
        num_env = 1
        n_stack = 4
        left_lives = 5
        seed = 0
        episodes = 0
        score = 0
        frames = 0
        frames_per_episode = list()
        scores = [list() for i in range(n_episodes)]

        env = make_atari_env(env_id, num_env=num_env, seed=seed)
        env = VecFrameStack(env, n_stack=n_stack)
        obs = env.reset()

        while (n_episodes - episodes) > 0:
            frames += 1
            action, _states = self.predict(obs)
            obs, rewards, dones, info = env.step(action)
            env.render()
            score += rewards[0]
            if dones:
                logging.debug('You died')
                logging.debug(f'Score = {score}')
                scores[episodes].append(score)
                score = 0
                left_lives -= 1
            if not left_lives:
                logging.debug('Episode ended')
                logging.info(f'Scores per life: {scores[episodes]}')
                frames_per_episode.append(frames)
                frames = 0
                episodes += 1
                left_lives = 5

        s = list(map(sum, scores))
        avg_s = int(sum(s) / len(s))
        avg_f = int(sum(frames_per_episode) / len(frames_per_episode))

        logging.info(f'Played {n_episodes} episodes')
        logging.info(f'Scores per episode : {s}')
        logging.info(f'Average score per episode : {avg_s}')
        logging.info(f'Average number of frames per episode : {avg_f}')

        return avg_f, avg_s
Example #19
def test_pretrain_images(tmp_path):
    env = make_atari_env("PongNoFrameskip-v4", num_env=1, seed=0)
    env = VecFrameStack(env, n_stack=4)
    model = PPO2('CnnPolicy', env)
    generate_expert_traj(model, str(tmp_path / 'expert_pong'), n_timesteps=0, n_episodes=1,
                         image_folder=str(tmp_path / 'pretrain_recorded_images'))

    expert_path = str(tmp_path / 'expert_pong.npz')
    dataset = ExpertDataset(expert_path=expert_path, traj_limitation=1, batch_size=32,
                            sequential_preprocessing=True)
    model.pretrain(dataset, n_epochs=2)

    shutil.rmtree(str(tmp_path / 'pretrain_recorded_images'))
    env.close()
    del dataset, model, env
Example #20
    def __init__(self, env_list=default_envs, algos_list=default_algos):
        self.env_list = env_list
        self.algos_list = algos_list
        self.n_algos = len(self.algos_list)
        self.envs = dict()
        self.rewards = defaultdict(dict)
        self.models = defaultdict(dict)  # the models must be kept for the ensemble

        for env_name in self.env_list:
            new_env = make_atari_env(env_name, num_env=1, seed=0)
            new_env = VecFrameStack(new_env, n_stack=4)
            self.envs[env_name] = new_env

        for algo in self.algos_list:
            for env_name, env in self.envs.items():
                self.models[env_name][algo] = loader(algo, env_name)
Example #21
def test():
    model = PPO2.load("model.pkl")
    sess = model.sess
    
    env = VecFrameStack(make_atari_env("SpaceInvadersNoFrameskip-v0", 1, 123), 4)

    pi = model.act_model
    action_dist = pi.action
    action_one = pi.deterministic_action

    o = env.reset()

    while(True):
        env.render()
        # a, _, _, _ = pi.step(obs=o, deterministic=True)
        a = sess.run(action_one, {pi.obs_ph: o})
        o, r, d, _ = env.step(a)
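For reference, the same greedy rollout can be written as a sketch that reuses the model and env built above with the higher-level predict() API, instead of running the deterministic-action tensor through the session by hand:
obs = env.reset()
while True:
    env.render()
    # deterministic=True picks the argmax action, like pi.deterministic_action above
    action, _states = model.predict(obs, deterministic=True)
    obs, rewards, dones, _info = env.step(action)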
Example #22
def test_ppo(env_id, seed, path_to_policy_params, n_envs = 1):
    
    """
     env_id: typr str, identifies each environment uniquely
     num_timesteps: number of timesteps to run the algorithm
     seed: initial random seed
     policy: policy to be followed (mlp, cnn, lstm, etc)
     n_env: number of envs to run in parallel
     nminibatches: number of minibatches of mini batch gradient descent (first-order optimization) to update the policy params
     n_steps: number of steps in each update
    """
    
    # Train PPO algorithm for num_timesteps
    # stack 4 frames for the vectorized environment
    # Note: PPO2 works only with vectorized environment
    env = VecFrameStack(make_atari_env(env_id = env_id, num_env = n_envs, seed=seed), 4)
    # define the policy
    # create model object for class PPO2
    # The policy is CnnPolicy from stable baselines and has been trained for 2e7 time steps on Pong
    
    model = PPO2.load(path_to_policy_params)
    video_path = "./videos/Pong_test_without_attack"
    vr = video_recorder.VideoRecorder(env, base_path=video_path, enabled=video_path is not None)
    
    obs = env.reset()
    ep_rew = [0.0]
    ep = 0
    for i in range(50000):
      action, _states = model.predict(obs)
      obs, rewards, dones, info = env.step(action)
      ep_rew[-1] += rewards
      env.render()
      vr.capture_frame()
      if dones:
        obs = env.reset()
        print('Net reward for episode ',ep,': ',ep_rew[-1])
        if((ep+1)%10 == 0):
          print('Mean reward for last 10 episodes: ',np.mean(ep_rew[-10:]))
        ep_rew.append(0.0)
        ep += 1
        print('Number of timesteps completed: ', i+1)
    env.close()
    vr.close()
Example #23
def run_gail():
    parser = argparse.ArgumentParser()
    parser.add_argument('expert',
                        type=str,
                        default=None,
                        help='Expert path (*.npz)')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--note', type=str, default='test')
    parser.add_argument('--env', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--num-steps', type=int, default=1000000)
    parser.add_argument('--policy',
                        type=str,
                        default='CnnPolicy',
                        choices=[
                            'CnnPolicy', 'CnnLstmPolicy', 'CnnLnLstmPolicy',
                            'MlpPolicy', 'MlpLstmPolicy', 'MlpLnLstmPolicy'
                        ],
                        help='Policy architecture')
    args = parser.parse_args()

    logger.configure(os.path.join('logs', args.env, args.note))
    logger.info(args)

    if 'NoFrameskip' in args.env:
        env = VecFrameStack(make_atari_env(args.env, 1, args.seed), 4)
    else:
        import gym
        env = gym.make(args.env)

    dataset = ExpertDataset(expert_path=args.expert,
                            batch_size=128,
                            train_fraction=0.99,
                            verbose=1)
    model = GAIL(args.policy,
                 env,
                 dataset,
                 timesteps_per_batch=1280,
                 verbose=1)
    model.learn(len(dataset.train_loader) * 1280)
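The expert file passed on the command line here is the .npz produced by generate_expert_traj, as in the test_generate examples above. A minimal sketch of creating one for Pong follows; the expert model in it is untrained and only illustrative, in practice a trained model would be loaded instead.
from stable_baselines import PPO2
from stable_baselines.common.cmd_util import make_atari_env
from stable_baselines.common.vec_env import VecFrameStack
from stable_baselines.gail import generate_expert_traj

env = make_atari_env('PongNoFrameskip-v4', num_env=1, seed=0)
env = VecFrameStack(env, n_stack=4)
expert = PPO2('CnnPolicy', env, verbose=0)  # in practice, load a trained expert here
generate_expert_traj(expert, 'expert_pong', n_timesteps=0, n_episodes=10)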
Example #24
def main():
    args = parser.parse_args()

    with open(args.config) as f:
        config = yaml.safe_load(f)

    set_seed(config['seed'])

    writer = None
    # Will ERROR if outdir already exists
    if not os.path.exists(config['outdir']):
        os.makedirs(config['outdir'])
        if config['use_tensorboard']:
            os.makedirs(os.path.join(config['outdir'], 'tensorboard'))
            writer = SummaryWriter(
                os.path.join(config['outdir'], 'tensorboard'))
        # save a copy of the config file
        shutil.copyfile(args.config,
                        os.path.join(config['outdir'], 'config.yaml'))
    else:
        print("ERROR: directory \'./{}\' already exists!".format(
            config['outdir']))
        raise EnvironmentError

    logger = get_logger(config)

    # create environment
    env = make_atari_env(config['task'],
                         num_env=config['parallel_envs'],
                         seed=config['seed'])
    env = VecFrameStack(env, n_stack=config['state_frames'])

    # default device for torch tensors
    device = torch.device('cuda') if config['use_gpu'] else torch.device('cpu')

    # start training
    a2c = A2C(config, env, device, logger, writer)
    a2c.train()
Example #25
from threading import Thread


parser = argparse.ArgumentParser()
parser.add_argument("--angle", type=float, default=0.0) # Kamerawinkel: 0 15 30 45 60
parser.add_argument("--system", type=str, default="Windows")
parser.add_argument("--factor", type=int, default=0)
args = parser.parse_args()

scale_factor_arr = [0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6]      
scale_factor_ind = args.factor

game_system = args.system
env_name = f"jupong-3D-{game_system}-v0"
env = make_atari_env(env_name, num_env=1, seed=0)
env.envs[0].reset()
env.envs[0].scale_paddles(scale_factor_arr[scale_factor_ind])
env = VecFrameStack(env, n_stack=4)

save_path = f"ppo2_save/ppo2_save_cam_angle_{args.angle}_4"
model = PPO2.load(save_path, env=None)
model.set_env(env)

def process_environment(file_path, scale_factor_ind):
    reward_arr = []
    mean_reward = 0.0
    obs = env.reset()
    reward_sum = 0.0
    while True:
        action, _states = model.predict(obs)
Example #26
def create_test_env(env_id,
                    n_envs=1,
                    is_atari=False,
                    stats_path=None,
                    seed=0,
                    log_dir='',
                    should_render=True,
                    hyperparams=None):
    """
    Create environment for testing a trained agent

    :param env_id: (str)
    :param n_envs: (int) number of processes
    :param is_atari: (bool)
    :param stats_path: (str) path to folder containing saved running averages
    :param seed: (int) Seed for random number generator
    :param log_dir: (str) Where to log rewards
    :param should_render: (bool) For Pybullet env, display the GUI
    :param hyperparams: (dict) Additional hyperparams (ex: n_stack); an optional
                        'env_wrapper' entry (a gym.Wrapper subclass) is applied to the env
    :return: (gym.Env)
    """
    # HACK to save logs
    if log_dir is not None:
        os.environ["OPENAI_LOG_FORMAT"] = 'csv'
        os.environ["OPENAI_LOGDIR"] = os.path.abspath(log_dir)
        os.makedirs(log_dir, exist_ok=True)
        logger.configure()

    # Create the environment and wrap it if necessary
    env_wrapper = get_wrapper_class(hyperparams)
    if 'env_wrapper' in hyperparams.keys():
        del hyperparams['env_wrapper']

    if is_atari:
        print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif n_envs > 1:
        # start_method = 'spawn' for thread safe
        env = SubprocVecEnv([
            make_env(env_id, i, seed, log_dir, wrapper_class=env_wrapper)
            for i in range(n_envs)
        ])
    # Pybullet envs do not follow the gym.render() interface
    elif "Bullet" in env_id:
        spec = gym.envs.registry.env_specs[env_id]
        try:
            class_ = load(spec.entry_point)
        except AttributeError:
            # Backward compatibility with gym
            class_ = load(spec._entry_point)
        # HACK: force SubprocVecEnv for Bullet env that does not
        # have a render argument
        render_name = None
        use_subproc = 'renders' not in inspect.getfullargspec(
            class_.__init__).args
        if not use_subproc:
            render_name = 'renders'
        # Dev branch of pybullet
        # use_subproc = use_subproc and 'render' not in inspect.getfullargspec(class_.__init__).args
        # if not use_subproc and render_name is None:
        #     render_name = 'render'

        # Create the env, with the original kwargs, and the new ones overriding them if needed
        def _init():
            # TODO: fix for pybullet locomotion envs
            env = class_(**{**spec._kwargs}, **{render_name: should_render})
            env.seed(0)
            if log_dir is not None:
                env = Monitor(env,
                              os.path.join(log_dir, "0"),
                              allow_early_resets=True)
            return env

        if use_subproc:
            env = SubprocVecEnv([
                make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper)
            ])
        else:
            env = DummyVecEnv([_init])
    else:
        env = DummyVecEnv(
            [make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper)])

    # Load saved stats for normalizing input and rewards
    # And optionally stack frames
    if stats_path is not None:
        if hyperparams['normalize']:
            print("Loading running average")
            print("with params: {}".format(hyperparams['normalize_kwargs']))
            env = VecNormalize(env,
                               training=False,
                               **hyperparams['normalize_kwargs'])
            env.load_running_average(stats_path)

        n_stack = hyperparams.get('frame_stack', 0)
        if n_stack > 0:
            print("Stacking {} frames".format(n_stack))
            env = VecFrameStack(env, n_stack)
    return env
Example #27
from stable_baselines.common.cmd_util import make_atari_env
from stable_baselines.common.vec_env import VecFrameStack
from stable_baselines import ACER

# There already exists an environment generator
# that will make and wrap atari environments correctly.
# Here we are also multiprocessing training (num_env=4 => 4 processes)
env = make_atari_env('PongNoFrameskip-v4', num_env=4, seed=0)
# Frame-stacking with 4 frames
env = VecFrameStack(env, n_stack=4)

model = ACER('CnnPolicy', env, verbose=1)
model.learn(total_timesteps=25000)

# save
model.save("cnn_pong")

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
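A quick shape check, as a sketch reusing the env built in this example: make_atari_env yields 84x84 grayscale frames, and VecFrameStack concatenates the last n_stack frames along the channel axis, so the batched observation has shape (num_env, 84, 84, 4).
obs = env.reset()
print(obs.shape)                      # expected: (4, 84, 84, 4)
print(env.observation_space.shape)    # expected: (84, 84, 4)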
Example #28
def get_ops(size, kernel, stride):
  # Output spatial size of a convolution with no padding
  return (size - kernel) // stride + 1

class network(nn.Module):
  def __init__(self, hin, win, actions):
    super().__init__()
    # Conv layers sized like the stable-baselines "nature CNN", so that the
    # saved PPO parameters transferred below fit these weights
    self.conv0 = nn.Conv2d(4, 32, kernel_size=8, stride=4)
    self.conv1 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
    self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
    hout, wout = get_ops(get_ops(get_ops(hin, 8, 4), 4, 2), 3, 1), get_ops(get_ops(get_ops(win, 8, 4), 4, 2), 3, 1)
    self.linear = 64 * hout * wout
    self.lin0 = nn.Linear(self.linear, 512, bias=True)
    self.lin1 = nn.Linear(512, actions)

  def forward(self, t):
    t = f.relu(self.conv0(t))
    t = f.relu(self.conv1(t))
    t = f.relu(self.conv2(t))
    t = t.permute(0,2,3,1).reshape(-1)
    t = f.relu(self.lin0(t))
    t = self.lin1(t)
    # Output will be the logits
    return t

env = VecFrameStack(make_atari_env(env_id = 'PongNoFrameskip-v4', num_env = 1, seed = 2), 4)
env.reset().shape
h, w = env.reset().shape[1], env.reset().shape[2]
dqn_net = network(h, w, env.action_space.n)
ppo_net = network(h, w, env.action_space.n)
ppo2_net = network(h, w, env.action_space.n)

# params ppo
# Transfer of weights from saved model to newly created network
# conv weights
x = np.transpose(ppo_params[1][0], [3,2,0,1])
ppo_net.conv0.weight = torch.nn.parameter.Parameter(torch.tensor(x))
x = np.transpose(ppo_params[1][2], [3,2,0,1])
ppo_net.conv1.weight = torch.nn.parameter.Parameter(torch.tensor(x))
x = np.transpose(ppo_params[1][4], [3,2,0,1])
ppo_net.conv2.weight = torch.nn.parameter.Parameter(torch.tensor(x))
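A side note on the transposes above, sketched with a hypothetical kernel shape: TensorFlow stores conv kernels as (height, width, in_channels, out_channels), while PyTorch's nn.Conv2d expects (out_channels, in_channels, height, width), which is exactly what np.transpose(x, [3, 2, 0, 1]) produces.
import numpy as np

tf_kernel = np.zeros((8, 8, 4, 32))                  # first-layer kernel in TF layout
torch_kernel = np.transpose(tf_kernel, [3, 2, 0, 1])
print(torch_kernel.shape)                            # (32, 4, 8, 8), PyTorch layout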
Example #29
def main(cfg, model_path, video_path, visualization_method, n_gradient_samples,
         obs_style):
    set_global_seeds(cfg['eval_seed'])

    env = make_atari_env(cfg['env_name'], num_env=1, seed=cfg['eval_seed'])
    env = VecFrameStack(env, n_stack=4)  # stack 4 frames
    if cfg['normalize']:
        # Not setting training=False because that seems to ruin performance
        env = VecNormalize(env)

    model = get_algo(cfg['algo']).load(
        str(model_path),
        env,
        verbose=1,
        learning_rate=lambda frac: 0.00025 * frac,
        attn_loss=get_loss(cfg['attn_loss'])(),
        attn_coef=cfg['attn_coef'],
        policy_kwargs={'cnn_extractor': get_network_builder(cfg['network'])},
    )

    observations = []
    saliency_maps = []

    input_tensor = model.sess.graph.get_tensor_by_name("input/Ob:0")
    input_cast_tensor = model.sess.graph.get_tensor_by_name("input/Cast:0")
    a2_activations = model.sess.graph.get_tensor_by_name("model/a2/add:0")

    attn_tensor = model.sess.graph.get_tensor_by_name('model/attn:0')
    attn_tensor = tf.reduce_sum(attn_tensor, axis=-1)

    sr = SaliencyRenderer(
        sess=model.sess,
        gradient_source_tensor=input_cast_tensor,
        attention_tensor=a2_activations,
        selection_method='SUM',
    )

    obs = env.reset()
    for _ in tqdm(range(300), postfix='playing', ncols=76):
        if obs_style == 'human':
            stored_obs = np.stack(env.get_images()) / 255
        else:
            stored_obs = obs[:, :, :, -1].copy()
        observations.append(stored_obs)

        if visualization_method == 'conv2d_transpose':
            action, _states, attn = model.predict(obs, extra=attn_tensor)
            saliency_maps.append(attn)
        else:
            action, _states = model.predict(obs)

            smap = sr.get_basic_input_saliency_map(
                input_tensor,
                obs,
                n_gradient_samples=n_gradient_samples,
                gradient_sigma_spread=0.15,
                aggregation_method={
                    'simonyan': None,
                    'smoothgrad': 'smoothgrad',
                    'vargrad': 'vargrad',
                }[visualization_method])[..., -1]

            saliency_maps.append(smap)

        obs, rewards, dones, info = env.step(action)

    if visualization_method == 'conv2d_transpose':
        saliency_maps = render_attn(saliency_maps, 36, 8, 0)

    saliency_cutoff = max(np.percentile(attn, 99) for attn in saliency_maps)
    for smap in saliency_maps:
        smap /= saliency_cutoff
        np.clip(smap, a_min=0, a_max=1, out=smap)

    with VideoWriter(video_path, fps=10) as writer:
        for obs, smap in tqdm(zip(observations, saliency_maps),
                              postfix='writing video',
                              total=len(observations),
                              ncols=76):
            if obs_style == 'human':
                b, h, w = obs.shape[:-1]
                assert obs.shape[-1] == 3
                resized_attn = np.stack(
                    [resize(smap[bb, ...], (h, w)) for bb in range(b)])
                frame = 0.5 * (obs + resized_attn[..., np.newaxis])
            else:
                frame = np.stack(
                    [
                        np.zeros_like(obs),
                        smap,
                        obs.astype(np.float32)  # / 255
                    ],
                    axis=-1)
                frame = resize(frame, (1, 160, 160, 3))
            writer.write_frame(frame)
Example #30
        help="interval between saving model (default: 0, means don't save)")

    args = parser.parse_args()

    dtype = torch.float64
    torch.set_default_dtype(dtype)

    if args.cuda:
        args.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
    else:
        args.device = torch.device('cpu')

    args.num_threads = mp.cpu_count() - 1
    """environment"""
    env = make_atari_env(args.env_name, 1, args.seed)
    env = VecFrameStack(env, n_stack=4)

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n

    # load trajectory
    args.expert_traj_path = "assets/expert_traj/{}_ppo_0.p".format(
        args.env_name)
    expert_trajs, _, _ = pickle.load(open(args.expert_traj_path, "rb"))

    imitator = GAILAtari(args, state_dim, action_dim)