def make_dataflow_train(env):
    """Build the Q-learning training dataflow for *env*.

    Wires an epsilon-greedy action selector into a synchronized
    experience collector, then wraps that collector in a Q-learning
    experience-replay dataflow configured from global env settings.
    """
    rand_gen = random.gen_rng()

    def _outputs2action(outputs):
        # Epsilon-greedy exploration: with probability `exp_epsilon`
        # pick a uniformly random action, otherwise act greedily on Q.
        epsilon = env.runtime['exp_epsilon']
        if rand_gen.rand() > epsilon:
            return outputs['q_argmax']
        return rand_gen.choice(get_player_nr_actions())

    collector = rl.train.SynchronizedExperienceCollector(
        env,
        make_player,
        _outputs2action,
        nr_workers=get_env('dqn.collector.nr_workers'),
        # NOTE(review): predictors read the *workers* config key — looks
        # deliberate (one predictor per worker), but confirm.
        nr_predictors=get_env('dqn.collector.nr_workers'),
        predictor_output_names=get_env('dqn.collector.predictor_output_names'),
        mode=get_env('dqn.collector.mode'))

    # Rewards are clipped to [-1, 1] before entering the replay buffer.
    return rl.train.QLearningDataFlow(
        collector,
        target=get_env('dqn.collector.target'),
        maxsize=get_env('dqn.expreplay.maxsize'),
        batch_size=get_env('trainer.batch_size'),
        epoch_size=get_env('trainer.epoch_size'),
        gamma=get_env('dqn.gamma'),
        nr_td_steps=get_env('dqn.nr_td_steps'),
        reward_cb=lambda r: np.clip(r, -1, 1))
# --- Example 2 ---
    def __init__(self, owner_env, scheduler, desc, nr_ensembles, devices,
                 nr_epochs, epoch_size):
        """Set up bookkeeping for scheduling training of an ensemble of models.

        Args:
            owner_env: environment object that owns this trainer.
            scheduler: scheduler used to dispatch work.
            desc: model/experiment description object.
            nr_ensembles: number of ensemble members to train.
            devices: devices the ensemble members run on.
            nr_epochs: number of epochs per training run.
            epoch_size: number of steps per epoch.
        """

        self._owner_env = owner_env
        self._scheduler = scheduler
        # Rate-limited logger: emits scheduling progress at most every 2s.
        self._schedule_logger = EveryNSecondLogger(logger, 2)

        self._desc = desc
        self._nr_ensembles = nr_ensembles
        self._devices = devices
        self._nr_epochs = nr_epochs
        self._epoch_size = epoch_size

        # Per-ensemble-member environments, functions and dataflows;
        # the funcs list is guarded by its own lock.
        self._envs = []
        self._funcs = []
        self._funcs_lock = threading.Lock()
        self._dataflows = []

        # Shared pool of collected data; the condition shares the pool
        # lock so waiters can block until new data arrives.
        self._data_pool = []
        self._data_pool_last = 0  # number of data points used for training last time step
        self._data_pool_lock = threading.Lock()
        self._data_pool_cond = threading.Condition(lock=self._data_pool_lock)
        self._training_sets = []  # List of list of data.
        self._validation_set = []  # List of data.
        # Signals that a consumer is blocked waiting for more data.
        self._waiting_for_data = threading.Event()

        self._rng = random.gen_rng()
# --- Example 3 ---
 def proc():
     """Worker: capture the global RNG state under a faked RNG and report it."""
     # Install a freshly generated RNG as the process-global one for the
     # duration of the block.
     with fake_with_rng(tar.gen_rng()):
         time.sleep(0.5)
         rng_state = tar.get_rng().get_state()
         time.sleep(0.5)
         # Hand the captured state back to the parent via the queue.
         q.put(rng_state)
# --- Example 4 ---
 def __init__(self, action_meanings=None):
     """Initialize with an optional sequence of human-readable action names."""
     self._action_meanings = action_meanings
     # Private RNG so this instance's sampling is independent of others.
     self.__rng = random.gen_rng()
# --- Example 5 ---
 def __init__(self, env):
     """Bind to *env*; actual setup is deferred until first use."""
     # Lazy-init flag — real initialization presumably happens on demand
     # elsewhere in the class (not visible here).
     self.__initialized = False
     self.__rng = random.gen_rng()
     self._env = env