Example #1
 def __init__(self, name, problem_server):
     self.name = name
     # need a handle to problem server so that it doesn't get GC'd (which
     # would kill the child process!)
     self.problem_server = problem_server
     self.problem_service = problem_server.service
     self.prob_meta, self.dom_meta = to_local(
         self.problem_service.get_meta())
     self.env_spec = to_local(self.problem_service.get_env_spec())
     self.dg_extra_dim = to_local(self.problem_service.get_dg_extra_dim())
     # will get filled in later
     self.policy = None
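
A note on to_local: every example on this page routes values received over an RPyC connection through to_local() before using them, but the helper itself is not shown here. The sketch below is a hypothetical minimal implementation, assuming to_local is just a thin wrapper around rpyc.utils.classic.obtain that deep-copies netrefs into the local process; the real helper in this codebase may differ.

import rpyc.utils.classic
from rpyc.core.netref import BaseNetref

def to_local(value):
    """Hypothetical sketch: materialise an RPyC netref as a local copy."""
    if isinstance(value, BaseNetref):
        # obtain() serialises the remote object and rebuilds it locally, so
        # later attribute access does not trigger network round-trips
        return rpyc.utils.classic.obtain(value)
    # plain local values pass through unchanged
    return value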
Example #2
 def exposed_batch_iter(self, batch_size, n_batches):
     """Sample <batch_size> elements from internal buffer."""
     batch_size = to_local(batch_size)
     n_batches = to_local(n_batches)
     # first convert replay buffer to a list so that we can shuffle and
     # take indices
     assert len(self.replay) > 0, 'need non-empty replay pool'
     ordered_buf = list(self.replay)
     shuffle(ordered_buf)  # in-place
     gen = cycle(ordered_buf)
     for batch_num in range(n_batches):
         rich_batch = list(islice(gen, batch_size))
         yield self.flatten_batch(rich_batch)
Example #3
    def _extend_replays(self, num_per_problem: int):
        """Extend the replays for //all// problems asynchronously."""
        # fire off extension methods
        results = []
        for problem in tqdm.tqdm(self.problems, desc='spawn extend'):
            get_action = self._make_get_action(problem)
            # rpyc.async_ in current RPyC (older versions called it rpyc.async)
            extend_replay = rpyc.async_(problem.problem_service.extend_replay)
            result = extend_replay(get_action, num_per_problem)
            # apparently I need to keep hold of the async ref according to the
            # RPyC docs (it's a weak reference or something). Also, I need a
            # background thread to serve each environment's requests (...this
            # may break things slightly).
            bg_thread = rpyc.utils.helpers.BgServingThread(
                problem.problem_server.conn)
            results.append((extend_replay, result, bg_thread))

        # Now we wait for results to come back. This is horribly inefficient
        # when some environments are much harder than others; oh well.
        succ_rates = []
        for _, result, bg_thread in tqdm.tqdm(results, desc='wait extend'):
            succ_rates.append(to_local(result.value))
            # always shut down cleanly
            bg_thread.stop()

        return succ_rates
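
Example #3 relies on the standard RPyC pattern of wrapping a remote callable with async_ and serving the connection from a background thread while waiting. The fragment below is an isolated, hypothetical sketch of that pattern (the port 18861 and slow_method are made up, not from this project).

import rpyc
from rpyc.utils.helpers import BgServingThread

conn = rpyc.connect('localhost', 18861)
# async_() returns a proxy that fires the call and immediately hands back
# an AsyncResult instead of blocking
slow_method = rpyc.async_(conn.root.slow_method)
# serve incoming requests (e.g. callbacks into our process) while we wait
bg_thread = BgServingThread(conn)
try:
    async_result = slow_method(42)
    async_result.wait()        # block until the remote call completes
    print(async_result.value)  # the return value (possibly still a netref)
finally:
    bg_thread.stop()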
Example #4
 def __init__(self,
              problems: List['fpg.SingleProblem'],
              weight_manager: PropNetworkWeights,
              summary_writer: Any,
              strategy: SupervisedObjective,
              kl_coeff: Union[None, float],
              batch_size: int = 64,
              lr: float = 0.001) -> None:
     # this needs to be acquired before figuring out an action from NN
     self._get_act_lock = threading.RLock()
     # gets incremented to deal with TF
     self.batches_seen = 0
     self.problems = problems
     self.weight_manager = weight_manager
     self.summary_writer = summary_writer
     self.batch_size = batch_size
     self.batch_size_per_problem = max(batch_size // len(problems), 1)
     self.strategy = strategy
     self.kl_coeff = kl_coeff
     self.max_len = max(
         to_local(problem.problem_service.get_max_len())
         for problem in self.problems)
     self.tf_init_done = False
     self.lr = lr
     self._init_tf()
Example #5
def get_problem_names(pddl_files):
    """Return a list of problem names from some PDDL files by spooling up
    a background process."""
    config = ProblemServiceConfig(pddl_files, None)
    server = ProblemServer(config)
    try:
        names = to_local(server.service.get_problem_names())
        assert isinstance(names, list)
        assert all(isinstance(name, str) for name in names)
    finally:
        server.stop()
    return names
Example #6
def eval_single(args, policy, problem_server, unique_prefix, elapsed_time,
                iter_num, weight_manager, scratch_dir):
    # now we evaluate the learned policy
    print('Evaluating policy')
    trial_results, paths = run_trials(policy,
                                      problem_server,
                                      args.rounds_eval,
                                      limit=args.limit_turns,
                                      det_sample=args.det_eval,
                                      show_time=args.show_eval_time)

    print('Trial results:')
    print('\n'.join('%s: %s' % (k, v) for k, v in trial_results.items()))

    out_dict = {
        'no_train': args.no_train,
        'args_problems': args.problems,
        'problem': to_local(problem_server.service.get_current_problem_name()),
        'timeout': args.timeout,
        'optimiser': args.optimiser.optimiser_name,
        'model': args.model,
        'model_opts': args.model_opts,
        'all_args': sys.argv[1:],
        # TODO: possibly add this. Not sure whether it's worthwhile given
        # that the supposed "convergence" measure might be spurious (e.g.
        # what if it just spikes up in reward briefly?).
        # convergence_* refers to first iteration at which best score was
        # encountered
        # 'convergence_time': convergence_time,
        # 'convergence_iters': convergence_iter,
        # elapsed_* also includes time/iterations spent looking for better
        # results after converging
        'elapsed_opt_time': elapsed_time,
        'elapsed_opt_iters': iter_num,
        'trial_paths': paths
    }
    out_dict.update(trial_results)
    result_path = path.join(scratch_dir, 'results.json')
    with open(result_path, 'w') as fp:
        dump(out_dict, fp, indent=2)
    # also write out lists of actions taken during final trial
    # TODO: should also write out some randomly chosen paths during training
    # TODO: also write out probabilities of each action for at least some paths
    # (or some states), and maybe even real Q-values of actions (would be
    # helpful!)
    actions_path = path.join(scratch_dir, 'trial-paths.txt')
    with open(actions_path, 'w') as fp:
        for alist in paths:
            fp.write(' -> '.join(alist))
            fp.write('\n\n')
Example #7
 def _make_batches(self, n_batches: int) -> Iterable[Dict[Any, Any]]:
     """Make a given number of batches for each problem."""
     batch_iters = []
     for problem in self.problems:
         service = problem.problem_service
         it = service.batch_iter(self.batch_size_per_problem, n_batches)
         batch_iters.append(it)
     combined = zip(*batch_iters)
     # yield a complete feed dict
     for combined_batch in combined:
         assert len(combined_batch) == len(self.problems)
         yield_val = {}
         for problem, batch in zip(self.problems, combined_batch):
             ph_obs_var, ph_q_values = self.obs_qv_inputs[problem.name]
             obs_tensor, qv_tensor = to_local(batch)
             yield_val[ph_obs_var] = obs_tensor
             yield_val[ph_q_values] = qv_tensor
         yield yield_val
Example #8
 def inner(obs):
     obs = to_local(obs)
     # if this times out then something really screwy is going on
     acquired = self._get_act_lock.acquire(timeout=60 * 30)
     assert acquired, "timed out waiting for the action lock"
     try:
         # each thread needs to have this call somewhere, per
         # https://www.tensorflow.org/versions/r0.12/api_docs/python/client/session_management
         with self.sess.as_default():
             # make sure it's 1D (need a different strategy for a batch
             # cache)
             assert obs.ndim == 1
             obs_bytes = obs.tobytes()
             if obs_bytes not in cache:
                 cache[obs_bytes] = get_action(obs)
             return cache[obs_bytes]
     finally:
         self._get_act_lock.release()
Example #9
    def _instantiate_net(self, single_prob_instance: 'fpg.SingleProblem'):
        # create two placeholders
        problem_service = single_prob_instance.problem_service
        policy = single_prob_instance.policy
        obs_dim = to_local(problem_service.get_obs_dim())
        obs_dtype_name = to_local(problem_service.get_obs_dtype_name())
        ph_obs_var = tf.placeholder(shape=[None, obs_dim],
                                    name='observation',
                                    dtype=obs_dtype_name)
        act_dist = policy.dist_info_sym(
            ph_obs_var, summary_collections=['sl-activations'])['prob']
        act_dim = to_local(problem_service.get_act_dim())
        ph_q_values = tf.placeholder(shape=[None, act_dim],
                                     name='q_values',
                                     dtype='float32')

        loss_parts = []

        # now the loss ops
        if self.strategy == SupervisedObjective.ANY_GOOD_ACTION:
            best_qv = tf.reduce_min(ph_q_values, axis=-1, keep_dims=True)
            # TODO: is 0.01 threshold too big? Hmm.
            act_labels = tf.cast(tf.less(tf.abs(ph_q_values - best_qv), 0.01),
                                 'float32')
            # act_labels = tf.cast(tf.equal(ph_q_values, best_qv), 'float32')
            label_sum = tf.reduce_sum(act_labels, axis=-1, keep_dims=True)
            act_label_dist = act_labels / label_sum
            # zero out disabled or dead-end actions!
            dead_end_value = to_local(
                problem_service.get_ssipp_dead_end_value())
            act_label_dist *= tf.cast(act_labels <= dead_end_value, 'float32')
            # XXX: this will obviously break if we have softmax; it'll spend
            # heaps of time trying to get all labels to be equal, and still
            # have (nonsense) nonzero loss afterwards :(
            xent = tf.reduce_mean(cross_entropy(act_dist, act_label_dist))
            loss_parts.append(('xent', xent))
        elif self.strategy == SupervisedObjective.MAX_ADVANTAGE:
            state_values = tf.reduce_min(ph_q_values, axis=-1)
            # is_nonzero = tf.greater(act_dist, 1e-4)
            # act_dist_nz = tf.where(is_nonzero, act_dist,
            #                        tf.ones_like(act_dist))
            # exp_q = act_dist_nz * (ph_q_values - state_values)
            exp_q = act_dist * ph_q_values
            exp_vs = tf.reduce_sum(exp_q, axis=-1)
            # state value is irrelevant to objective, but is included because
            # it ensures that zero loss = optimal policy
            q_loss = tf.reduce_mean(exp_vs - state_values)
            loss_parts.append(('qloss', q_loss))
            # XXX: need to look at whatever this is (and fix it if it's wrong)
            # if self.kl_coeff:
            #     assert self.kl_coeff > 0, \
            #         "negative entropy coefficient must be positive if supplied"
            #     is_nonzero = tf.equal(act_dist, 0.0)
            #     num_enabled = tf.reduce_sum(
            #         tf.cast(is_nonzero, tf.float32), axis=1)
            #     # clip so that really tiny values don't make our loss balloon!
            #     act_dist_clip = tf.clip_by_value(act_dist, 1e-10, 1.0)
            #     # also change all the zero values to ones, so that they count
            #     # as zero in summation below
            #     act_dist_clamp = tf.where(is_nonzero, act_dist_clip,
            #                               tf.ones_like(act_dist))
            #     xent = -tf.reduce_sum(
            #         tf.log(act_dist_clamp), axis=1) / num_enabled
            #     kl_div = -tf.log(num_enabled) + xent
            #     scale_kl_div = self.kl_coeff * tf.reduce_mean(kl_div)
            #     loss_parts.append(('scale-kld', scale_kl_div))
            #
            #     batch_neg_entropy = tf.reduce_sum(
            #         act_dist * tf.log(act_dist_clamp), axis=-1)
            #     # we allow drift of this many bits from uniform; otherwise,
            #     # apply entropy loss!
            #     num_enabled = tf.reduce_sum(
            #         tf.cast(act_dist > 1e-10, tf.float32), axis=1)
            #     allowed_bits = num_enabled - 1.5
            #     uniform_bits = tf.log(num_enabled) / tf.log(2.0)
            #     min_neg_entropy = -uniform_bits + allowed_bits
            #     batch_neg_ent_clip = tf.clip_by_value(batch_neg_entropy,
            #                                           min_neg_entropy, 0)
            #     batch_neg_ent_clip += min_neg_entropy
            #     # we want to maximise entropy, kinda
            #     ent_reg = self.neg_ent_coeff * tf.reduce_mean(
            #         batch_neg_ent_clip)
            #     loss_parts.append(('entreg', ent_reg))
        else:
            raise ValueError("Unknown strategy %s" % self.strategy)

        # regularisation
        # TODO: make this configurable!
        weights = self.weight_manager.all_weights
        l2_reg = 0.001 * sum(tf.nn.l2_loss(w) for w in weights)
        loss_parts.append(('l2reg', l2_reg))

        loss = sum(p[1] for p in loss_parts)

        return ph_obs_var, ph_q_values, loss, loss_parts
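
_instantiate_net only builds the graph; it returns the two placeholders, the total loss, and the named loss parts. The fragment below is a hypothetical sketch of how those return values might be trained, in the same TF1 graph/session style, assuming self.sess, self.lr, and the feed dicts produced by _make_batches in Example #7.

        # hypothetical training step built on the values returned above
        ph_obs_var, ph_q_values, loss, loss_parts = \
            self._instantiate_net(single_prob_instance)
        optimiser = tf.train.AdamOptimizer(learning_rate=self.lr)
        train_op = optimiser.minimize(loss)
        # track each loss component separately in TensorBoard
        for part_name, part_op in loss_parts:
            tf.summary.scalar('loss/' + part_name, part_op)
        self.sess.run(tf.global_variables_initializer())
        for feed_dict in self._make_batches(n_batches=100):
            _, loss_val = self.sess.run([train_op, loss], feed_dict=feed_dict)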
Example #10
 def _get_replay_sizes(self) -> List[int]:
     """Get the sizes of replay buffers for each problem."""
     rv = []
     for problem in self.problems:
         rv.append(to_local(problem.problem_service.get_replay_size()))
     return rv
Example #11
 def exposed_env_step(self, action):
     action = to_local(action)
     return self.env_wrapped.step(action)
Example #12
 def exposed_action_name(self, action_num):
     action_num = to_local(action_num)
     return self.env_raw.action_name(action_num)
Example #13
 def exposed_extend_replay(self, get_action, n_paths):
     """Extend the replay buffer using the given policy (represented as a
     function from flattened observation vectors to action numbers)."""
     n_paths = to_local(n_paths)
     return self.internal_extend_replay(get_action, n_paths)
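
The exposed_* methods in Examples #2, #11, #12 and #13 follow RPyC's service convention: only methods prefixed with exposed_ are callable from the remote side, and complex arguments arrive as netrefs (hence the to_local() calls). A minimal, self-contained sketch of hosting such a service; EchoService and the port 18861 are hypothetical, not from this codebase.

import rpyc
from rpyc.utils.server import ThreadedServer

class EchoService(rpyc.Service):
    """Hypothetical service; only exposed_* methods are reachable remotely."""

    def exposed_echo(self, value):
        # complex arguments from the client arrive as netrefs (simple
        # immutable values are copied), which is why the examples above
        # convert them with to_local() before use
        return value

if __name__ == '__main__':
    # serve on an arbitrary port; a client would use rpyc.connect()
    ThreadedServer(EchoService, port=18861).start()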
Example #14
 def reset(self):
     remote_obs = self._problem_service.env_reset()
     return to_local(remote_obs)
Example #15
 def __init__(self, problem_server):
     self._first_step = True
     self._problem_server = problem_server
     self._problem_service = problem_server.service
     spec = to_local(self._problem_service.get_env_spec())
     self._spec = spec