Example 1
def train(num_episodes=1000, 
          save_every=100, 
          checkpoint_dir="checkpoints",
          tensorboard_dir="tensorboard",
          tboard_every=10):
    pol = Policy()
    writer = tf.contrib.summary.create_file_writer(tensorboard_dir)
    for j in range(1, num_episodes+1):
        random_secret = random.randint(0, config.max_guesses - 1)
        e = Episode(pol, random_secret)
        history = e.generate()

        print("Episode length: {}".format(len(history)))

        G = -1 

        optimizer = \
            tf.train.GradientDescentOptimizer(
                learning_rate=config.reinforce_alpha*G)

        for i in reversed(range(1, len(history))):
            history_so_far = history[:i]
            next_action, _ = history[i]
            with tfe.GradientTape() as tape:
                action_logits = pol(history_so_far, with_softmax=False)
                loss = tf.nn.softmax_cross_entropy_with_logits_v2(
                    labels=tf.one_hot(
                        tf.convert_to_tensor([next_action]),
                        config.max_guesses),
                    logits=action_logits
                )

            grads = tape.gradient(loss, pol.variables)
            optimizer.apply_gradients(zip(grads, pol.variables))

            G -= 1
            optimizer._learning_rate = G * config.reinforce_alpha
            optimizer._learning_rate_tensor = None
            # hack. Should be able to pass a callable as learning_rate, see
            # https://www.tensorflow.org/api_docs/python/tf/train/GradientDescentOptimizer#args
            # can I perhaps submit a PR to fix this bug?

            sys.stdout.write("{}/{}\r".format(len(history)-i, len(history)))

        if j % save_every == 0 or j == num_episodes:
            saver = tfe.Saver(pol.named_variables)
            save_path = os.path.join(checkpoint_dir, 
                                     "episode{}".format(
                                         str(j).zfill(len(str(num_episodes)))))
            saver.save(save_path)

        if j % tboard_every == 0:
            with writer.as_default():
                with tf.contrib.summary.always_record_summaries():
                    tf.contrib.summary.scalar('total_return', 
                                              tf.convert_to_tensor([G]), 
                                              step=j)
Example 2
def load_policies(directory):
    directory = f"saved_models/{directory}"
    policies = {}
    for file in os.listdir(directory):
        model = tf.keras.models.load_model(f"{directory}/{file}")
        pol = Policy(hex_config["size"])
        pol.model = model
        policies[file] = pol
    return policies
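A hypothetical call (the directory name below is illustrative only); load_policies returns a dict mapping each saved file name to a Policy wrapping the loaded Keras model.

policies = load_policies("run_01")  # reads every model under saved_models/run_01/
for name, pol in policies.items():
    print(name, pol.model)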
Example 3
 def __init__(self):
     self.episodes = config["episodes"]
     self.amount_of_players = config["amount_of_players"]
     self.epsilon = config["epsilon"]
     self.epsilon_decay_rate = self.epsilon / self.episodes
     self.states = []
     self.distributions = []
     self.rewards = []
     self.policy = Policy(hex_config["size"] ** 2)
Example 4
 def __init__(self, env):
     # Load your Model here
     self.sess = tf.Session()
     self.saver = tf.train.import_meta_graph('policy_model/.meta')
     self.action_size = env.action_space.shape[0]
     self.policy = Policy(env.observation_space.shape[0], self.action_size,
                          0.003, 10, -1.0, None)
     self.saver.restore(self.sess,
                        tf.train.latest_checkpoint('policy_model/'))
Example 5
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, clipping_range):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    # saver = tf.train.Saver()
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)

    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar,
                    clipping_range)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
        # saver.save(policy.sess, 'model_save', global_step=500)
    logger.close()
    policy.close_sess()
    val_func.close_sess()
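A hypothetical invocation of this training loop; the argument values below are illustrative only, not documented defaults.

main('Hopper-v1', num_episodes=1000, gamma=0.995, lam=0.98, kl_targ=0.003,
     batch_size=20, hid1_mult=10, policy_logvar=-1.0, clipping_range=None)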
Example 6
def load_policy(world, data):

    n = world.width * world.height

    if data == 'RAND':
        table = np.full((n, 4), 1. / 4.)
        return Policy(table)
    
    return None
Example 7
def load_policy(file_name):
  encoder = Encoder(in_channels = h.in_channels, feature_dim = h.feature_dim)  
  
  policy = Policy(encoder = encoder, feature_dim = h.feature_dim, num_actions = 15)
  policy.cuda()
  policy.load_state_dict(torch.load(MODEL_PATH + file_name + '.pt')["policy_state_dict"])
  policy.cuda() 

  return policy
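If a GPU is not guaranteed to be present, a variant of the loader (a sketch, not part of the original) can map the checkpoint to the CPU via torch.load's map_location argument and only move the model to CUDA when one is available.

def load_policy_cpu_safe(file_name):
    # Sketch reusing Encoder, Policy, h and MODEL_PATH from the example above.
    encoder = Encoder(in_channels=h.in_channels, feature_dim=h.feature_dim)
    policy = Policy(encoder=encoder, feature_dim=h.feature_dim, num_actions=15)
    # map_location='cpu' avoids a hard CUDA requirement while deserializing.
    state = torch.load(MODEL_PATH + file_name + '.pt', map_location='cpu')
    policy.load_state_dict(state["policy_state_dict"])
    if torch.cuda.is_available():
        policy = policy.cuda()
    return policy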
Example 8
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    #TODO Change init_gym for one of my functions
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    #TODO Find out what this does
    #Change wrappers.Monitor for a class of mine that controls the simulation
    #I don't think the wrapper is of any use for my example
    env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
Example 9
def record():
    '''
    This function generates a gif file for a single episode. This process may take some time.
    To watch the non-stop game play, please run the test() function.
    '''
    save_path = SAVE_DIR + ENV_NAME + "/" + AUXILIARY_TASK + "/"
    figure_path = FIGURE_DIR + ENV_NAME + "/" + AUXILIARY_TASK + "/"

    list_obs = []
    list_reward = []

    obs_mean_std = np.load(save_path + "obs_mean_std.npz")
    obs_mean = obs_mean_std["obs_mean"]
    obs_std = obs_mean_std["obs_std"]

    # Create environment.
    env = make_atari(ENV_NAME)
    obs_space = env.observation_space
    action_space = env.action_space

    # Build models.
    policy = Policy(obs_space, action_space, is_training=False)

    with tf.Session() as sess:
        # Load variables.
        saver_policy = tf.train.Saver(policy.trainable_variables)
        saver_policy.restore(sess, save_path + "policy")

        total_reward = 0
        obs = env.reset()
        while True:
            list_obs.append(obs)
            list_reward.append(total_reward)
            env.render()
            # Get observation.
            obs = (obs - obs_mean) / obs_std
            # Get action.
            action = sess.run(
                policy.action,
                feed_dict={policy.Obs: np.reshape(obs, [1, 1, *obs.shape])})
            action = np.squeeze(action, (0, 1))

            # Interact with the environment.
            obs, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                list_obs.append(obs)
                list_reward.append(total_reward)
                break
    env.close()

    # Record the gameplay.
    imageio.mimsave(
        figure_path + "gameplay.gif",
        [plot_obs(obs, reward) for obs, reward in zip(list_obs, list_reward)],
        fps=30)
Example 10
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """

    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name, False)
    if time_state:
        obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H-%M-%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)

    scaler = Scaler(obs_dim, env_name)
    val_func = NNValueFunction(obs_dim, env_name, True)
    arg = [obs_dim, act_dim, kl_targ, time_state, env_name]
    policy = Policy(obs_dim, act_dim, kl_targ, env_name, True)

    episode = 0

    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  arg,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        scaler.save()
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
Example 11
 def __init__(self,
              domain: Assignment1Domain,
              epsilon: Optional[float] = 0.01,
              max_iterations: Optional[float] = 1000,
              gamma: Optional[float] = 0.9):
     self._domain = domain
     self._policy = Policy(domain)
     self._gamma = gamma
     self._epsilon = epsilon
     self._max_iterations = max_iterations
Example 12
 def __init__(self, state_size, action_size, sample_num):
     sess = tf.Session()
     self.policy = Policy(sess, state_size, action_size, sample_num)
     self.state_batch = []
     self.action_batch = []
     self.reward_list = []
     self.step_list = []
     self.weight_bach = []
     self.sample_num = sample_num
     sess.run(tf.global_variables_initializer())
Example 13
 def __init__(self, policy_params, env_name, noise):
     self.env = gym.make(env_name)
     self.transform = transforms.Compose([
         transforms.ToPILImage(),
         transforms.Resize((128, 128)),
         # transforms.Grayscale(),
         transforms.ToTensor()
     ])
     self.noise = SharedNoiseTable(noise)
     self.policy = Policy(**policy_params)
Example 14
    def test_save_restore(self):
        pol = Policy()
        episode = [(0, 0), (1, 0), (2, 3)]
        expected = pol(episode).numpy()
        with tempfile.TemporaryDirectory() as tdir:
            path = os.path.join(tdir, "checkpt")

            saver = tfe.Saver(pol.named_variables)
            saver.save(path)

            pol2 = Policy()
            def diff():
                actual = pol2(episode).numpy()
                return np.linalg.norm(actual-expected)

            self.assertGreater(diff(), 0.0001)
            saver = tfe.Saver(pol2.named_variables)
            saver.restore(path)
            self.assertGreaterEqual(0.00001, diff())
Example 15
  def init(self):
    with open('../config/config.yaml') as f:
      _, config_features, _ = yaml.load_all(f)

    self.features = config_features
    self.redis = redis.StrictRedis('localhost')
    #self.redis.flushdb()
    self.redis.set_response_callback('HGETALL', self.hgetall_custom_callback)
    self.policy = Policy(self.redis)
    self.add_default_policy()
Example 16
    def from_saved_model(self, modelpath, hot_one=True):

        state = torch.load(realpath(modelpath))
        self.policy = Policy(state["params"]["layers"], device=cpu)
        self.policy.load_state_dict(state["state_dict"])

        self.hot_one = hot_one
        self.displayname = (state["params"]["rewarder"].__class__.__name__
                            if "rewarder" in state["params"] else "Saved model")
        self.loaded = "policy"
Example 17
    def __init__(self, name, globalP):
        self.env = Toy()
        self.name = name
        self.policy = Policy(name + '/Policy',
                             env=self.env,
                             state_shape=self.env.observation_shape,
                             n_actions=16)
        self.policy.build()

        self.pull_global_op = get_pull_global(globalP, self.policy)
        self.update_global_op = get_update_global(globalP, self.policy)
Example 18
def run_wrapper(model_dir,
                mode,
                input_frame,
                num_intentions=3,
                scale_x=1,
                scale_z=1,
                rate=28):
    rospy.init_node("joy_controller")
    controller = Controller(mode, scale_x, scale_z, rate)
    policy = Policy(mode, input_frame, 2, model_dir, num_intentions)
    controller.execute(policy)
Example 19
    def __init__(self, *args, **kwargs):
        super(QLearning, self).__init__(*args)

        self.max_episodes = kwargs.get("max_episodes", 200)
        self.alpha = kwargs.get("alpha", 0.9)
        self.Q = defaultdict(lambda: 0)
        self.epsilon = kwargs.get("epsilon", 0.8)

        self.V_evaluator = V_evaluator(
            self.environment, Policy(self.environment, self.Q),
            lambda state, action: self.Q[(state, action)])
Example 20
def main():
    """Builds a Policy object out of an inventory and policy file and optionally
    generates reachability tables in HTML or CSV formats."""

    parser = argparse.ArgumentParser(
        description=
        'Reads policies from a file and optionally translates them into HTML or CSV.'
    )
    parser.add_argument(
        'files',
        metavar='FILE',
        nargs='+',
        help=
        'Either an inventory file followed by a policy file, or a single file that combines both.'
    )
    parser.add_argument('--html',
                        dest='generate_html',
                        action='store_const',
                        const=True,
                        default=False,
                        help='Generate the html file.')
    parser.add_argument('--csv',
                        dest='generate_csv',
                        action='store_const',
                        const=True,
                        default=False,
                        help='Generate the csv file.')
    args = parser.parse_args()

    files = []
    try:
        for i in range(min(2, len(args.files))):
            files.append(open(args.files[i], 'r'))
    except IOError:
        print("Fehler: Datei(en) konnte(n) nicht gelesen werden.")
        sys.exit(1)

    policy_chars = "".join([file.read() for file in files])
    policy = Policy()
    try:
        PolicyBuilder.build(policy_chars, policy)

        prefix = args.files[-1].rsplit('.', 1)[0]

        if args.generate_html:
            html_file = open(prefix + '-reachability.html', 'w')
            html_file.write(policy.to_html())

        if args.generate_csv:
            csv_file = open(prefix + '-reachability.csv', 'w')
            csv_file.write(policy.vlans_to_csv())
    except PolicyException as exception:
        print("Fehler: %s" % exception)
Example 21
 def __init__(self):
     self.no_cells = Hyper.N * Hyper.N
     #self.results = np.zeros((2, int(Hyper.total_episodes / 100) + 1), dtype=np.int16)
     self.results = np.zeros((2, Hyper.total_episodes), dtype=np.int16)
     self.no_episodes = 0
     self.setup_display_dict()
     self.setup_env()
     self.setup_reward_dict()
     self.setup_action_dict()
     self.policy = Policy()
     self.timesteps_per_episode = []
     self.rewards_per_episode = []
Example 22
 def __init__(self,
              env: UnityMlFacade,
              device,
              seed,
              verbose=1,
              gamma=0.99,
              actor_learning_rate=0.001,
              critic_learning_rate=0.001,
              buffer_size=100000,
              batch_size=100,
              snapshot_window=5,
              hidden_layers_comma_sep='400,30'):
     self.env = env
     self.device = device
     self.seed = seed
     self.verbose = verbose
     self.gamma = gamma
     self.buffer_size = buffer_size
     self.batch_size = batch_size
     self.snapshot_window = snapshot_window
     self.policy_snapshots = deque(maxlen=self.snapshot_window)
     self.current_policy_snapshot = -1
     self.last_save = 0
     self.last_swap = 0
     self.action_size = self.env.action_space.shape[0] * self.env.num_agents
     self.state_size = self.env.observation_space.shape[0] * self.env.num_agents  # this should be 48
     hidden_layers = [int(layer_width) for layer_width in hidden_layers_comma_sep.split(',')]
     # create agent1
     self.player_policy = Policy(0, state_size=self.state_size, action_size=self.action_size,
                                 hidden_dims=hidden_layers, device=self.device,
                                 actor_learning_rate=actor_learning_rate,
                                 critic_learning_rate=critic_learning_rate,
                                 random_seed=seed)
     # create agent2
     self.opponent_policy = Policy(1, state_size=self.state_size, action_size=self.action_size,
                                   hidden_dims=hidden_layers, device=self.device,
                                   actor_learning_rate=actor_learning_rate,
                                   critic_learning_rate=critic_learning_rate,
                                   random_seed=seed)
     self.t_step = 0
Example 23
def test_policy():
    rospy.init_node('controller')
    con = Controller(None)
    con.register(TeleControl())
    #clf = None
    clf = Policy(config.TASK)
    con.register(AutoControl(clf, config.TASK, 'a'))

    try:
        con.run()
    finally:
        con.pub.publish(Twist())
    return clf
Example 24
    def __init__(self, num_actions=3, num_means=2, gamma=0.99):

        print(num_actions, num_means)

        self.basis_function = Basis_Function(num_means, num_means, num_actions,
                                             gamma)
        num_basis = self.basis_function._num_basis()

        self.policy = Policy(self.basis_function, num_basis)
        self.lstdq = LSTDQ(self.basis_function, gamma, self.policy)

        self.stop_criterium = 10**-5
        self.gamma = gamma
Example 25
 def policy_iteration(self, pol: Policy):
     pol = Policy(
         {s: {a: 1. / len(v)
              for a in v}
          for s, v in self.rewards.items()})
     v_old = self.get_state_value_func(pol)
     converge = False
     while not converge:
         pol = self.greedy_improved_policy(pol)
         v_new = self.iterative_policy_evaluation(pol)
         converge = is_equal(np.linalg.norm(v_new), np.linalg.norm(v_old))
         v_old = v_new
     return pol
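is_equal is not defined in this excerpt; a minimal tolerance-based stand-in (an assumption about its intent, not the original helper) might look like:

import numpy as np

def is_equal(a, b, tol=1e-8):
    # Treat two scalars as equal when they differ by less than tol.
    return bool(np.isclose(a, b, atol=tol))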
Example 26
def evaluate_agent(env, obs_dim, act_dim, num_episodes):

    policy = Policy(obs_dim, act_dim, 0.003)
    policy.restore_weights()

    scaler = Scaler(obs_dim)

    print("Restored weights, evaluating...")

    for i_episode in range(num_episodes):
        run_episode(env, policy, scaler, 100000, stochastic=True)

    env.kill()
Example 27
 def __init__(self, policy, mean_model=None, variance_model=None, x_norm=None, u_norm=None, y_norm=None):
     super(ExpectedDistanceProduction, self).__init__()
     from policy import Policy
     self.policy = Policy(7, 3, 4)
     self.mean = RewardFCPlain(7, 4, 3)
     self.variance = FCPositive(7, 4, 3)
     self.x_norm = Normalization(7)
     self.u_norm = Normalization(4)
     self.g_norm = Normalization(3)
     self.register_buffer(
         'weights',
         torch.FloatTensor([1.0, 1.0, 0.1])
     )
Example 28
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, TestNote):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    print('Testing Period:\n')
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))



    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    env.set_goals(0)

    now = datetime.now().strftime("%b-%d_%H:%M:%S")  # create unique directories (Greenwich/UTC time!!! changed utcnow to now)
    testname = now+'-'+TestNote
    logger = Logger(logname=env_name, now=testname)
    aigym_path = os.path.join('log-Test-files', env_name, testname)
    env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)
    # run a few episodes of untrained policy to initialize scaler:
    policy.load_model('/home/drl/PycharmProjects/warker_test/log-files/My3LineDirect-v1/Jan-10_07:51:34-A003-SpecGoal-itr15000-g0ExpNo5/checkpoint/My3LineDirect-v1-15000.ckpt')
    episode = 0

    observes, actions, rewards, unscaled_obs, states_x, states_y = rollout(
        env, policy, scaler, max_path_length=batch_size, animate=True)
    tmp = np.vstack((rewards, states_x, states_y))
    tmp1 = np.transpose(tmp)
    data = np.concatenate((observes, actions, tmp1), axis=1)
    trajectory = {}
    for j in range(data.shape[0]):
        for i in range(data.shape[1]):
            trajectory[i] = data[j][i]
        logger.log(trajectory)
        logger.write(display=False)


    logger.close()
    policy.close_sess()
    val_func.close_sess()

    print('End time:\n')
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
Example 29
def main():
    max_iteration = 5000
    episodes_per_batch = 20
    max_kl = 0.01
    init_logvar = -1
    policy_epochs = 5
    value_epochs = 10
    value_batch_size = 256
    gamma = 0.995
    lam = .97

    # initialize environment
    env = HumanoidEnv()
    env.seed(0)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    logger = Logger()

    # init qpos and qvel
    init_qpos = np.load('./mocap_expert_qpos.npy')
    init_qvel = np.load('./mocap_expert_qvel.npy')

    # policy function
    policy = Policy(obs_dim=obs_dim,
                    act_dim=act_dim,
                    max_kl=max_kl,
                    init_logvar=init_logvar,
                    epochs=policy_epochs,
                    logger=logger)

    session_to_restore = '/Users/sayvaz/Desktop/humanoid_gail_results/model_ego_inter/model_humanoid_ego_1700'
    stats_to_recover = '/Users/sayvaz/Desktop/humanoid_gail_results/model_ego_inter/stats_humanoid_ego_1700'
    scale, offset = policy.restore_session(
        session_to_restore=session_to_restore,
        stats_to_recover=stats_to_recover)

    # expert agent
    agent = ExpertAgent(env=env,
                        policy_function=policy,
                        scale=scale,
                        offset=offset,
                        init_qpos=init_qpos,
                        init_qvel=init_qvel,
                        logger=logger)

    agent.collect(episodes_per_batch=20)

    # close everything
    policy.close_session()
Example 30
def test():
    '''
    This function visualizes the game play. The environment will be reset immediately and the game will not be recorded.
    To record the game play, please run the record() function.
    '''
    save_path = SAVE_DIR + ENV_NAME + "/" + AUXILIARY_TASK + "/"

    obs_mean_std = np.load(save_path + "obs_mean_std.npz")
    obs_mean = obs_mean_std["obs_mean"]
    obs_std = obs_mean_std["obs_std"]

    # Create environment.
    env = make_atari(ENV_NAME)
    obs_space = env.observation_space
    action_space = env.action_space

    # Build models.
    policy = Policy(obs_space, action_space, is_training=False)

    with tf.Session() as sess:
        # Load variables.
        saver_policy = tf.train.Saver(policy.trainable_variables)
        saver_policy.restore(sess, save_path + "policy")

        total_step = 0
        total_reward = 0
        while True:
            # Get observation.
            if total_step == 0:
                obs = env.reset()
            else:
                obs = obs_next
            obs = (obs - obs_mean) / obs_std
            env.render()
            # Get action.
            action = sess.run(
                policy.action,
                feed_dict={policy.Obs: np.reshape(obs, [1, 1, *obs.shape])})
            action = np.squeeze(action, (0, 1))

            # Interact with the environment.
            obs_next, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                # Reset environment.
                print("Episodic reward: ", total_reward, sep="")
                obs_next = env.reset()
                total_reward = 0
            # Update step counter.
            total_step += 1
        env.close()