Code Example #1
File: train.py Project: egeromin/mastermind
def train(num_episodes=1000, 
          save_every=100, 
          checkpoint_dir="checkpoints",
          tensorboard_dir="tensorboard",
          tboard_every=10):
    pol = Policy()
    writer = tf.contrib.summary.create_file_writer(tensorboard_dir)
    for j in range(1, num_episodes+1):
        random_secret = random.randint(0, config.max_guesses - 1)
        e = Episode(pol, random_secret)
        history = e.generate()

        print("Episode length: {}".format(len(history)))

        G = -1 

        optimizer = \
            tf.train.GradientDescentOptimizer(
                learning_rate=config.reinforce_alpha*G)

        for i in reversed(range(1, len(history))):
            history_so_far = history[:i]
            next_action, _ = history[i]
            with tfe.GradientTape() as tape:
                action_logits = pol(history_so_far, with_softmax=False)
                loss = tf.nn.softmax_cross_entropy_with_logits_v2(
                    labels=tf.one_hot(
                        tf.convert_to_tensor([next_action]),
                        config.max_guesses),
                    logits=action_logits
                )

            grads = tape.gradient(loss, pol.variables)
            optimizer.apply_gradients(zip(grads, pol.variables))

            G -= 1
            optimizer._learning_rate = G * config.reinforce_alpha
            optimizer._learning_rate_tensor = None
            # hack. Should be able to pass a callable as learning_rate, see
            # https://www.tensorflow.org/api_docs/python/tf/train/GradientDescentOptimizer#args
            # can I perhaps submit a PR to fix this bug?

            sys.stdout.write("{}/{}\r".format(len(history)-i, len(history)))

        if j % save_every == 0 or j == num_episodes:
            saver = tfe.Saver(pol.named_variables)
            save_path = os.path.join(checkpoint_dir, 
                                     "episode{}".format(
                                         str(j).zfill(len(str(num_episodes)))))
            saver.save(save_path)

        if j % tboard_every == 0:
            with writer.as_default():
                with tf.contrib.summary.always_record_summaries():
                    tf.contrib.summary.scalar('total_return', 
                                              tf.convert_to_tensor([G]), 
                                              step=j)
Code Example #2
def load_policies(directory):
    directory = f"saved_models/{directory}"
    policies = {}
    for file in os.listdir(directory):
        model = tf.keras.models.load_model(f"{directory}/{file}")
        pol = Policy(hex_config["size"])
        pol.model = model
        policies[file] = pol
    return policies
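
A hypothetical call, assuming models were saved under saved_models/<run_name> with tf.keras.models.save_model ("run_42" is an illustrative directory name):

policies = load_policies("run_42")
for name, pol in policies.items():
    print(name, pol.model.count_params())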
Code Example #3
File: trainer.py Project: s0lvang/IT3105-project3
 def __init__(self):
     self.episodes = config["episodes"]
     self.amount_of_players = config["amount_of_players"]
     self.epsilon = config["epsilon"]
     self.epsilon_decay_rate = self.epsilon / self.episodes
     self.states = []
     self.distributions = []
     self.rewards = []
     self.policy = Policy(hex_config["size"] ** 2)
Code Example #4
File: finalmodel.py Project: parksang21/RL
 def __init__(self, env):
     # Load your Model here
     self.sess = tf.Session()
     self.saver = tf.train.import_meta_graph('policy_model/.meta')
     self.action_size = env.action_space.shape[0]
     self.policy = Policy(env.observation_space.shape[0], self.action_size,
                          0.003, 10, -1.0, None)
     self.saver.restore(self.sess,
                        tf.train.latest_checkpoint('policy_model/'))
Code Example #5
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, clipping_range):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    # saver = tf.train.Saver()
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)

    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar,
                    clipping_range)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculate discounted sum of rewards
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
        # saver.save(policy.sess, 'model_save', global_step=500)
    logger.close()
    policy.close_sess()
    val_func.close_sess()
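
For orientation, a hypothetical direct call with illustrative hyperparameters (argument names follow the signature above; the values are not the project's defaults):

if __name__ == "__main__":
    main(env_name='Hopper-v1', num_episodes=1000, gamma=0.995, lam=0.98,
         kl_targ=0.003, batch_size=20, hid1_mult=10, policy_logvar=-1.0,
         clipping_range=None)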
Code Example #6
def load_policy(world, data):

    n = world.width * world.height

    if data == 'RAND':
        table = np.full((n, 4), 1. / 4.)
        return Policy(table)
    
    return None
Code Example #7
def load_policy(file_name):
  encoder = Encoder(in_channels = h.in_channels, feature_dim = h.feature_dim)

  policy = Policy(encoder = encoder, feature_dim = h.feature_dim, num_actions = 15)
  policy.load_state_dict(torch.load(MODEL_PATH + file_name + '.pt')["policy_state_dict"])
  policy.cuda()

  return policy
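
If no GPU is available, a hedged variant of the loader above can map the checkpoint onto the CPU (map_location is a standard torch.load argument; load_policy_cpu is a hypothetical name):

def load_policy_cpu(file_name):
  # Hedged variant of load_policy above for machines without CUDA.
  encoder = Encoder(in_channels = h.in_channels, feature_dim = h.feature_dim)
  policy = Policy(encoder = encoder, feature_dim = h.feature_dim, num_actions = 15)
  state = torch.load(MODEL_PATH + file_name + '.pt', map_location='cpu')
  policy.load_state_dict(state["policy_state_dict"])
  return policy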
Code Example #8
File: train.py Project: panserbjorn/ControllerV-REP
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    #TODO Replace init_gym with one of my functions
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    #TODO Find out what this does
    #Replace wrappers.Monitor with a class of mine that controls the simulation
    #I don't think the wrapper is of any use for my example
    env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculate discounted sum of rewards
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
Code Example #9
File: test.py Project: ZhenkaiShou/project
def record():
    '''
    This function generates a gif file for a single episode. This process may take some time.
    To watch the non-stop game play, please run the test() function.
    '''
    save_path = SAVE_DIR + ENV_NAME + "/" + AUXILIARY_TASK + "/"
    figure_path = FIGURE_DIR + ENV_NAME + "/" + AUXILIARY_TASK + "/"

    list_obs = []
    list_reward = []

    obs_mean_std = np.load(save_path + "obs_mean_std.npz")
    obs_mean = obs_mean_std["obs_mean"]
    obs_std = obs_mean_std["obs_std"]

    # Create environment.
    env = make_atari(ENV_NAME)
    obs_space = env.observation_space
    action_space = env.action_space

    # Build models.
    policy = Policy(obs_space, action_space, is_training=False)

    with tf.Session() as sess:
        # Load variables.
        saver_policy = tf.train.Saver(policy.trainable_variables)
        saver_policy.restore(sess, save_path + "policy")

        total_reward = 0
        obs = env.reset()
        while True:
            list_obs.append(obs)
            list_reward.append(total_reward)
            env.render()
            # Get observation.
            obs = (obs - obs_mean) / obs_std
            # Get action.
            action = sess.run(
                policy.action,
                feed_dict={policy.Obs: np.reshape(obs, [1, 1, *obs.shape])})
            action = np.squeeze(action, (0, 1))

            # Interact with the environment.
            obs, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                list_obs.append(obs)
                list_reward.append(total_reward)
                break
    env.close()

    # Record the gameplay.
    imageio.mimsave(
        figure_path + "gameplay.gif",
        [plot_obs(obs, reward) for obs, reward in zip(list_obs, list_reward)],
        fps=30)
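
plot_obs is defined elsewhere in the project; a hypothetical minimal version that returns frames imageio.mimsave can consume (drawing the observation with the running reward as title) might look like this:

import io
import imageio
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np

def plot_obs(obs, reward):
    # Hypothetical helper: render one observation (last channel if frames are
    # stacked) with the accumulated reward, returned as an RGB frame.
    frame = np.asarray(obs)
    if frame.ndim == 3:
        frame = frame[..., -1]
    fig, ax = plt.subplots(figsize=(3, 3))
    ax.imshow(frame, cmap="gray")
    ax.set_title("return: {:.1f}".format(reward))
    ax.axis("off")
    buf = io.BytesIO()
    fig.savefig(buf, format="png")
    plt.close(fig)
    buf.seek(0)
    return imageio.imread(buf)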
Code Example #10
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """

    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name, False)
    if time_state:
        obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H-%M-%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)

    scaler = Scaler(obs_dim, env_name)
    val_func = NNValueFunction(obs_dim, env_name, True)
    arg = [obs_dim, act_dim, kl_targ, time_state, env_name]
    policy = Policy(obs_dim, act_dim, kl_targ, env_name, True)

    episode = 0

    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  arg,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculate discounted sum of rewards
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        scaler.save()
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
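
The add_disc_sum_rew and add_gae helpers are not shown in these excerpts; a hedged sketch of the standard computations the comments refer to (per trajectory, assuming NumPy arrays 'rewards' and 'values' of equal length):

import numpy as np

def discounted_sum(x, discount):
    # Backwards accumulation: y[t] = x[t] + discount * y[t+1].
    y = np.zeros_like(x, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        y[t] = running
    return y

def add_gae_sketch(trajectory, gamma, lam):
    # Generalized Advantage Estimation on TD residuals (sketch only,
    # not the project's actual add_gae implementation).
    rewards = trajectory['rewards']
    values = trajectory['values']
    deltas = rewards + gamma * np.append(values[1:], 0.0) - values
    trajectory['advantages'] = discounted_sum(deltas, gamma * lam)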
Code Example #11
 def __init__(self,
              domain: Assignment1Domain,
              epsilon: Optional[float] = 0.01,
              max_iterations: Optional[float] = 1000,
              gamma: Optional[float] = 0.9):
     self._domain = domain
     self._policy = Policy(domain)
     self._gamma = gamma
     self._epsilon = epsilon
     self._max_iterations = max_iterations
Code Example #12
 def __init__(self, state_size, action_size, sample_num):
     sess = tf.Session()
     self.policy = Policy(sess, state_size, action_size, sample_num)
     self.state_batch = []
     self.action_batch = []
     self.reward_list = []
     self.step_list = []
     self.weight_bach = []
     self.sample_num = sample_num
     sess.run(tf.global_variables_initializer())
Code Example #13
 def __init__(self, policy_params, env_name, noise):
     self.env = gym.make(env_name)
     self.transform = transforms.Compose([
         transforms.ToPILImage(),
         transforms.Resize((128, 128)),
         # transforms.Grayscale(),
         transforms.ToTensor()
     ])
     self.noise = SharedNoiseTable(noise)
     self.policy = Policy(**policy_params)
Code Example #14
    def test_save_restore(self):
        pol = Policy()
        episode = [(0, 0), (1, 0), (2, 3)]
        expected = pol(episode).numpy()
        with tempfile.TemporaryDirectory() as tdir:
            path = os.path.join(tdir, "checkpt")

            saver = tfe.Saver(pol.named_variables)
            saver.save(path)

            pol2 = Policy()
            def diff():
                actual = pol2(episode).numpy()
                return np.linalg.norm(actual-expected)

            self.assertGreater(diff(), 0.0001)
            saver = tfe.Saver(pol2.named_variables)
            saver.restore(path)
            self.assertGreaterEqual(0.00001, diff())
Code Example #15
  def init(self):
    with open('../config/config.yaml') as f:
      _, config_features, _ = yaml.load_all(f)

    self.features = config_features
    self.redis = redis.StrictRedis('localhost')
    #self.redis.flushdb()
    self.redis.set_response_callback('HGETALL', self.hgetall_custom_callback)
    self.policy = Policy(self.redis)
    self.add_default_policy()
Code Example #16
    def from_saved_model(self, modelpath, hot_one=True):

        state = torch.load(realpath(modelpath))
        self.policy = Policy(state["params"]["layers"], device=cpu)
        self.policy.load_state_dict(state["state_dict"])

        self.hot_one = hot_one
        self.displayname = (state["params"]["rewarder"].__class__.__name__
                            if "rewarder" in state["params"] else "Saved model")
        self.loaded = "policy"
Code Example #17
    def __init__(self, name, globalP):
        self.env = Toy()
        self.name = name
        self.policy = Policy(name + '/Policy',
                             env=self.env,
                             state_shape=self.env.observation_shape,
                             n_actions=16)
        self.policy.build()

        self.pull_global_op = get_pull_global(globalP, self.policy)
        self.update_global_op = get_update_global(globalP, self.policy)
Code Example #18
def run_wrapper(model_dir,
                mode,
                input_frame,
                num_intentions=3,
                scale_x=1,
                scale_z=1,
                rate=28):
    rospy.init_node("joy_controller")
    controller = Controller(mode, scale_x, scale_z, rate)
    policy = Policy(mode, input_frame, 2, model_dir, num_intentions)
    controller.execute(policy)
Code Example #19
    def __init__(self, *args, **kwargs):
        super(QLearning, self).__init__(*args)

        self.max_episodes = kwargs.get("max_episodes", 200)
        self.alpha = kwargs.get("alpha", 0.9)
        self.Q = defaultdict(lambda: 0)
        self.epsilon = kwargs.get("epsilon", 0.8)

        self.V_evaluator = V_evaluator(
            self.environment, Policy(self.environment, self.Q),
            lambda state, action: self.Q[(state, action)])
Code Example #20
def main():
    """Builds a Policy object out of an inventory and policy file and optionally
    generates reachability tables in HTML or CSV formats."""

    parser = argparse.ArgumentParser(
        description=
        'Reads policies from a file and optionally translates them into HTML or CSV.'
    )
    parser.add_argument(
        'files',
        metavar='FILE',
        nargs='+',
        help=
        'Either an inventory file followed by a policy file, or a single file that combines both.'
    )
    parser.add_argument('--html',
                        dest='generate_html',
                        action='store_const',
                        const=True,
                        default=False,
                        help='Generate the html file.')
    parser.add_argument('--csv',
                        dest='generate_csv',
                        action='store_const',
                        const=True,
                        default=False,
                        help='Generate the csv file.')
    args = parser.parse_args()

    files = []
    try:
        for i in range(min(2, len(args.files))):
            files.append(open(args.files[i], 'r'))
    except IOError:
        print("Error: file(s) could not be read.")
        sys.exit(1)

    policy_chars = "".join([file.read() for file in files])
    policy = Policy()
    try:
        PolicyBuilder.build(policy_chars, policy)

        prefix = args.files[-1].rsplit('.', 1)[0]

        if args.generate_html:
            html_file = open(prefix + '-reachability.html', 'w')
            html_file.write(policy.to_html())

        if args.generate_csv:
            csv_file = open(prefix + '-reachability.csv', 'w')
            csv_file.write(policy.vlans_to_csv())
    except PolicyException as exception:
        print("Error: %s" % exception)
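
An illustrative invocation, assuming the script above is saved as build_policy.py (hypothetical name; the input file names are placeholders):

python build_policy.py inventory.txt policy.txt --html --csv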
Code Example #21
 def __init__(self):
     self.no_cells = Hyper.N * Hyper.N
     #self.results = np.zeros((2, int(Hyper.total_episodes / 100) + 1), dtype=np.int16)
     self.results = np.zeros((2, Hyper.total_episodes), dtype=np.int16)
     self.no_episodes = 0
     self.setup_display_dict()
     self.setup_env()
     self.setup_reward_dict()
     self.setup_action_dict()
     self.policy = Policy()
     self.timesteps_per_episode = []
     self.rewards_per_episode = []
Code Example #22
 def __init__(self,
              env: [UnityMlFacade],
              device,
              seed,
              verbose=1,
              gamma=0.99,
              actor_learning_rate=0.001,
              critic_learning_rate=0.001,
              buffer_size=100000,
              batch_size=100,
              snapshot_window=5,
              hidden_layers_comma_sep='400,30'):
     self.env = env
     self.device = device
     self.seed = seed
     self.verbose = verbose
     self.gamma = gamma
     self.buffer_size = buffer_size
     self.batch_size = batch_size
     self.snapshot_window = snapshot_window
     self.policy_snapshots = deque(maxlen=self.snapshot_window)
     self.current_policy_snapshot = -1
     self.last_save = 0
     self.last_swap = 0
     self.action_size = self.env.action_space.shape[0] * self.env.num_agents
     self.state_size = self.env.observation_space.shape[0] * self.env.num_agents  # this should be 48
     hidden_layers = [int(layer_width) for layer_width in hidden_layers_comma_sep.split(',')]
     # create agent1
     self.player_policy = Policy(0, state_size=self.state_size, action_size=self.action_size,
                                 hidden_dims=hidden_layers, device=self.device,
                                 actor_learning_rate=actor_learning_rate,
                                 critic_learning_rate=critic_learning_rate,
                                 random_seed=seed)
     # create agent2
     self.opponent_policy = Policy(1, state_size=self.state_size, action_size=self.action_size,
                                   hidden_dims=hidden_layers, device=self.device,
                                   actor_learning_rate=actor_learning_rate,
                                   critic_learning_rate=critic_learning_rate,
                                   random_seed=seed)
     self.t_step = 0
Code Example #23
def test_policy():
    rospy.init_node('controller')
    con = Controller(None)
    con.register(TeleControl())
    #clf = None
    clf = Policy(config.TASK)
    con.register(AutoControl(clf, config.TASK, 'a'))

    try:
        con.run()
    finally:
        con.pub.publish(Twist())
    return clf
Code Example #24
    def __init__(self, num_actions=3, num_means=2, gamma=0.99):

        print(num_actions, num_means)

        self.basis_function = Basis_Function(num_means, num_means, num_actions,
                                             gamma)
        num_basis = self.basis_function._num_basis()

        self.policy = Policy(self.basis_function, num_basis)
        self.lstdq = LSTDQ(self.basis_function, gamma, self.policy)

        self.stop_criterium = 10**-5
        self.gamma = gamma
Code Example #25
File: mdp.py Project: guyu980/CME241
 def policy_iteration(self, pol: Policy):
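     # Note: the incoming pol argument is immediately replaced by a uniform-random policy over each state's actions.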
     pol = Policy(
         {s: {a: 1. / len(v)
              for a in v}
          for s, v in self.rewards.items()})
     v_old = self.get_state_value_func(pol)
     converge = False
     while not converge:
         pol = self.greedy_improved_policy(pol)
         v_new = self.iterative_policy_evaluation(pol)
         converge = is_equal(np.linalg.norm(v_new), np.linalg.norm(v_old))
         v_old = v_new
     return pol
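
is_equal is not shown in the excerpt; a hypothetical tolerance-based version consistent with how it is used above:

def is_equal(a, b, tol=1e-8):
    # Hypothetical helper: treat two floats as equal within a small tolerance.
    return abs(a - b) < tol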
Code Example #26
File: train.py Project: silverjoda/BioHex
def evaluate_agent(env, obs_dim, act_dim, num_episodes):

    policy = Policy(obs_dim, act_dim, 0.003)
    policy.restore_weights()

    scaler = Scaler(obs_dim)

    print("Restored weights, evaluating...")

    for i_episode in range(num_episodes):
        run_episode(env, policy, scaler, 100000, stochastic=True)

    env.kill()
Code Example #27
 def __init__(self, policy, mean_model=None, variance_model=None, x_norm=None, u_norm=None, y_norm=None):
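     # Note: in this excerpt the constructor arguments (policy, mean_model, variance_model and the norms) are not used; fixed-size submodules are built directly below.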
     super(ExpectedDistanceProduction, self).__init__()
     from policy import Policy
     self.policy = Policy(7, 3, 4)
     self.mean = RewardFCPlain(7, 4, 3)
     self.variance = FCPositive(7, 4, 3)
     self.x_norm = Normalization(7)
     self.u_norm = Normalization(4)
     self.g_norm = Normalization(3)
     self.register_buffer(
         'weights',
         torch.FloatTensor([1.0, 1.0, 0.1])
     )
Code Example #28
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, TestNote):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    print('Testing Period:\n')
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))

    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    env.set_goals(0)

    now = datetime.now().strftime("%b-%d_%H:%M:%S")  # create unique directories (changed utcnow, which is Greenwich/UTC time, to local now)
    testname = now+'-'+TestNote
    logger = Logger(logname=env_name, now=testname)
    aigym_path = os.path.join('log-Test-files', env_name, testname)
    env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)
    # run a few episodes of untrained policy to initialize scaler:
    policy.load_model('/home/drl/PycharmProjects/warker_test/log-files/My3LineDirect-v1/Jan-10_07:51:34-A003-SpecGoal-itr15000-g0ExpNo5/checkpoint/My3LineDirect-v1-15000.ckpt')
    episode = 0

    observes, actions, rewards, unscaled_obs, states_x, states_y = rollout(
        env, policy, scaler, max_path_length=batch_size, animate=True)
    tmp = np.vstack((rewards, states_x, states_y))
    tmp1 = np.transpose(tmp)
    data = np.concatenate((observes, actions, tmp1), axis=1)
    trajectory = {}
    for j in range(data.shape[0]):
        for i in range(data.shape[1]):
            trajectory[i] = data[j][i]
        logger.log(trajectory)
        logger.write(display=False)


    logger.close()
    policy.close_sess()
    val_func.close_sess()

    print('End time:\n')
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
Code Example #29
def main():
    max_iteration = 5000
    episodes_per_batch = 20
    max_kl = 0.01
    init_logvar = -1
    policy_epochs = 5
    value_epochs = 10
    value_batch_size = 256
    gamma = 0.995
    lam = .97

    # initialize environment
    env = HumanoidEnv()
    env.seed(0)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    logger = Logger()

    # init qpos and qvel
    init_qpos = np.load('./mocap_expert_qpos.npy')
    init_qvel = np.load('./mocap_expert_qvel.npy')

    # policy function
    policy = Policy(obs_dim=obs_dim,
                    act_dim=act_dim,
                    max_kl=max_kl,
                    init_logvar=init_logvar,
                    epochs=policy_epochs,
                    logger=logger)

    session_to_restore = '/Users/sayvaz/Desktop/humanoid_gail_results/model_ego_inter/model_humanoid_ego_1700'
    stats_to_recover = '/Users/sayvaz/Desktop/humanoid_gail_results/model_ego_inter/stats_humanoid_ego_1700'
    scale, offset = policy.restore_session(
        session_to_restore=session_to_restore,
        stats_to_recover=stats_to_recover)

    # expert agent
    agent = ExpertAgent(env=env,
                        policy_function=policy,
                        scale=scale,
                        offset=offset,
                        init_qpos=init_qpos,
                        init_qvel=init_qvel,
                        logger=logger)

    agent.collect(episodes_per_batch=20)

    # close everything
    policy.close_session()
Code Example #30
File: test.py Project: ZhenkaiShou/project
def test():
    '''
    This function visualizes the game play. The environment will be reset immediately and the game will not be recorded.
    To record the game play, please run the record() function.
    '''
    save_path = SAVE_DIR + ENV_NAME + "/" + AUXILIARY_TASK + "/"

    obs_mean_std = np.load(save_path + "obs_mean_std.npz")
    obs_mean = obs_mean_std["obs_mean"]
    obs_std = obs_mean_std["obs_std"]

    # Create environment.
    env = make_atari(ENV_NAME)
    obs_space = env.observation_space
    action_space = env.action_space

    # Build models.
    policy = Policy(obs_space, action_space, is_training=False)

    with tf.Session() as sess:
        # Load variables.
        saver_policy = tf.train.Saver(policy.trainable_variables)
        saver_policy.restore(sess, save_path + "policy")

        total_step = 0
        total_reward = 0
        while True:
            # Get observation.
            if total_step == 0:
                obs = env.reset()
            else:
                obs = obs_next
            obs = (obs - obs_mean) / obs_std
            env.render()
            # Get action.
            action = sess.run(
                policy.action,
                feed_dict={policy.Obs: np.reshape(obs, [1, 1, *obs.shape])})
            action = np.squeeze(action, (0, 1))

            # Interact with the environment.
            obs_next, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                # Reset environment.
                print("Episodic reward: ", total_reward, sep="")
                obs_next = env.reset()
                total_reward = 0
            # Update step counter.
            total_step += 1
        env.close()
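
For reference, a hedged sketch of how an obs_mean_std.npz with the keys loaded above could be produced (stand-in data only; the project's training code may compute these statistics differently):

import numpy as np

# Stand-in observations; in practice these would come from rollouts collected
# during training.
obs_batch = np.random.randint(0, 256, size=(128, 84, 84, 1)).astype(np.float32)
np.savez("obs_mean_std.npz",
         obs_mean=obs_batch.mean(axis=0),
         obs_std=obs_batch.std(axis=0) + 1e-8)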