def get_batch(
    flags,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    buffers: Buffers,
    initial_agent_state_buffers,
    timings,
    lock=threading.Lock(),
):
    with lock:
        timings.time("lock")
        indices = [full_queue.get() for _ in range(flags.batch_size)]
        timings.time("dequeue")
    batch = {
        key: torch.stack([buffers[key][m] for m in indices], dim=1) for key in buffers
    }
    # NOTE: AttentionNet is batch first.
    initial_agent_state = tuple(
        torch.cat(ts, dim=0)
        for ts in zip(*[initial_agent_state_buffers[m] for m in indices])
    )
    timings.time("batch")
    for m in indices:
        free_queue.put(m)
    timings.time("enqueue")
    batch = {k: t.to(device=flags.device, non_blocking=True) for k, t in batch.items()}
    initial_agent_state = tuple(
        t.to(device=flags.device, non_blocking=True) for t in initial_agent_state
    )
    timings.time("device")
    return batch, initial_agent_state
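
The free_queue/full_queue handshake in these get_batch variants works on buffer indices: actors claim an index from free_queue, fill the pre-allocated rollout tensors at that index in place, and publish the index on full_queue; the learner stacks the claimed slots along dim=1 and hands the indices back. Below is a minimal single-process sketch of that wiring; the batch_size, the single "frame" key, and the stripped-down get_batch (no timings, no agent state) are illustrative assumptions rather than code from any of the examples.

import threading
import types

import torch
import torch.multiprocessing as mp


def get_batch_simple(flags, free_queue, full_queue, buffers, lock=threading.Lock()):
    # Stripped-down variant of the functions above: no timings, no agent state.
    with lock:
        indices = [full_queue.get() for _ in range(flags.batch_size)]
    # Stack along dim=1 so the result is [unroll_length + 1, batch_size, ...].
    batch = {k: torch.stack([buffers[k][m] for m in indices], dim=1) for k in buffers}
    for m in indices:
        free_queue.put(m)  # hand the buffer slots back to the actors
    return {k: t.to(device=flags.device, non_blocking=True) for k, t in batch.items()}


if __name__ == "__main__":
    flags = types.SimpleNamespace(batch_size=4, device="cpu")
    num_buffers, unroll_length = 8, 16
    buffers = {"frame": [torch.zeros(unroll_length + 1, 84, 84) for _ in range(num_buffers)]}

    free_queue, full_queue = mp.SimpleQueue(), mp.SimpleQueue()
    for m in range(num_buffers):
        free_queue.put(m)

    # Pretend actors already filled the first batch_size buffers and published them.
    for _ in range(flags.batch_size):
        full_queue.put(free_queue.get())

    batch = get_batch_simple(flags, free_queue, full_queue, buffers)
    print(batch["frame"].shape)  # torch.Size([17, 4, 84, 84])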
Example 2
def get_batch(
    flags,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    buffers: Buffers,
    initial_agent_state_buffers,
    timings,
    lock=threading.Lock(),
):
    with lock:
        timings.time("lock")
        indices = [full_queue.get() for _ in range(flags.batch_size)]
        timings.time("dequeue")
    batch = {
        key: torch.stack([buffers[key][m] for m in indices], dim=1) for key in buffers
    }
    initial_agent_state = [
        torch.stack([initial_agent_state_buffers[m][i][0] for m in indices], dim=0)
        for i in range(2)
    ]
    #print("initial_agent_state[0].shape: ", initial_agent_state[0].shape)
    timings.time("batch")
    for m in indices:
        free_queue.put(m)
    timings.time("enqueue")
    batch = {k: t.to(device=flags.device, non_blocking=True) for k, t in batch.items()}
    initial_agent_state = [t.to(device=flags.device, non_blocking=True) for t in initial_agent_state]
    timings.time("device")
    return batch, initial_agent_state
Example 3
def simulation_launch(cfg, bot, bot_id, objective_id, task_factory, loss, mcts,
                      signal):
    ping, sync, share = SimpleQueue(), SimpleQueue(), Queue()
    stats = Queue() if not bot_id else None

    sim = Simulation(cfg, bot, bot_id, objective_id, task_factory)

    critic = Process(  #Thread(#
        target=critic_launch,
        args=(
            cfg,
            bot,
            objective_id,
            task_factory,
            sim.task.update_goal,
            ping,
            sync,
            loss,
            share,
            stats,
        ))
    critic.start()

    sim.explore(ping, sync, loss, mcts, signal, share, stats)
    print("SIMULATION OVER")
    critic.join()

    # TODO : ping scoping refactor, following slush refactor, dtor refactor ( sim + crit )
    for q in [ping, sync, share, stats]:
        while q is not None and not q.empty():
            q.get()
Example 4
def get_batch(
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    buffers: Buffers,
    flags,
    timings,
    lock=threading.Lock()) -> typing.Dict[str, torch.Tensor]:
    with lock:
        timings.time('lock')
        indices = [full_queue.get() for _ in range(flags.batch_size)]
        timings.time('dequeue')
    batch = {
        key: torch.stack([buffers[key][m] for m in indices], dim=1)
        for key in buffers
    }
    timings.time('batch')
    for m in indices:
        free_queue.put(m)
    timings.time('enqueue')
    batch = {
        k: t.to(device=flags.device, non_blocking=True)
        for k, t in batch.items()
    }
    timings.time('device')
    return batch
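
The Buffers structure consumed here is typically a dict mapping each rollout key to a list of num_buffers pre-allocated tensors placed in shared memory, so actor processes can write rollouts in place and only indices travel through the queues. Below is a sketch of such an allocator, in the style of the torchbeast create_buffers helper; the keys, shapes and dtypes are illustrative assumptions.

from typing import Dict, List

import torch

Buffers = Dict[str, List[torch.Tensor]]


def create_buffers(unroll_length: int, num_buffers: int, obs_shape, num_actions: int) -> Buffers:
    T = unroll_length
    specs = dict(
        frame=dict(size=(T + 1, *obs_shape), dtype=torch.uint8),
        reward=dict(size=(T + 1,), dtype=torch.float32),
        done=dict(size=(T + 1,), dtype=torch.bool),
        policy_logits=dict(size=(T + 1, num_actions), dtype=torch.float32),
        baseline=dict(size=(T + 1,), dtype=torch.float32),
        action=dict(size=(T + 1,), dtype=torch.int64),
    )
    buffers: Buffers = {key: [] for key in specs}
    for _ in range(num_buffers):
        for key, spec in specs.items():
            # share_memory_() lets actor processes fill the tensor in place,
            # so only buffer indices have to travel through the queues.
            buffers[key].append(torch.empty(**spec).share_memory_())
    return buffers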
Example 5
def get_batch(
        flags,
        free_queue: mp.SimpleQueue,
        full_queue: mp.SimpleQueue,
        buffers: Buffers,
        timings,
        lock=threading.Lock(),
):
    with lock:
        timings.time("lock")
        indices = [full_queue.get() for _ in range(flags.batch_size)]
        timings.time("dequeue")
    batch = {
        key: torch.stack([buffers[key][m] for m in indices], dim=1)
        for key in buffers
    }
    timings.time("batch")
    for m in indices:
        free_queue.put(m)
    timings.time("enqueue")
    batch = {
        k: t.to(device=flags.device, non_blocking=True)
        for k, t in batch.items()
    }
    timings.time("device")
    return batch
Example 6
def agent_launch(bot_id, cfg, task_factory, encoder, Actor, Critic, stop_q, callback = None, goal_encoder = None):
    agent = Zer0Bot(bot_id, cfg, task_factory, encoder, Actor, Critic, goal_encoder)

    loss_gate, mcts, signal = zip(*[(
        Queue(), SimpleQueue(), SimpleQueue()
        ) for i in range(cfg['n_simulations'])])

    sims = [ Thread(#Process(#
        target=simulation_launch,
        args=(cfg, agent.bot, bot_id, i, task_factory, loss, seed, sig, )
        ) for i, (loss, seed, sig) in enumerate(zip(loss_gate, mcts, signal)) ]

    for sim in sims:
        sim.start()

    while stop_q.empty():
        scores = agent.train(loss_gate, mcts, signal)
        if callback is None:
            continue
        scores = callback(agent, scores)
        if scores is None:
            continue
        stop_q.put(scores)

    print("AGENT OVER")
    for seed, sim in zip(mcts, sims):
        seed.put(None)
        sim.join()

    for qs in [loss_gate, mcts, signal]:
        for q in qs:
            while not q.empty():
                q.get()
Example 7
def get_batch(
        flags,
        free_queue: mp.SimpleQueue,
        full_queue: mp.SimpleQueue,
        buffers,
        initial_agent_state_buffers,
        lock=threading.Lock(),
):
    with lock:
        indices = [full_queue.get() for _ in range(flags.batch_size)]
    batch = {
        key: torch.stack([buffers[key][m] for m in indices], dim=1)
        for key in buffers
    }
    initial_agent_state = (torch.cat(ts, dim=1) for ts in zip(
        *[initial_agent_state_buffers[m] for m in indices]))
    for m in indices:
        free_queue.put(m)
    batch = {
        k: t.to(device=flags.device, non_blocking=True)
        for k, t in batch.items()
    }
    initial_agent_state = tuple(
        t.to(device=flags.device, non_blocking=True)
        for t in initial_agent_state)
    return batch, initial_agent_state
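
The zip(*...) / torch.cat combination above regroups the per-rollout (h, c) state tuples across the selected indices before concatenating them into batched tensors; whether the batch lands on dim=0 or dim=1 depends on the network (the first get_batch above notes that AttentionNet is batch-first and uses dim=0, while the usual LSTM state layout (num_layers, batch, hidden) calls for dim=1 as here). A tiny stand-alone illustration with made-up shapes:

import torch

# Per-rollout LSTM state: a (h, c) tuple with shape (num_layers=1, B=1, hidden=4).
# Values here are illustrative only.
state_buffers = {m: (torch.full((1, 1, 4), float(m)),    # h
                     torch.full((1, 1, 4), float(-m)))   # c
                 for m in range(8)}
indices = [3, 5, 7]

# zip(*...) regroups [(h3, c3), (h5, c5), (h7, c7)] into ((h3, h5, h7), (c3, c5, c7)),
# and torch.cat along dim=1 produces batched (num_layers, batch, hidden) tensors.
h, c = (torch.cat(ts, dim=1) for ts in zip(*[state_buffers[m] for m in indices]))
print(h.shape, c.shape)  # torch.Size([1, 3, 4]) torch.Size([1, 3, 4])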
def read_img(path_queue: multiprocessing.JoinableQueue,
             data_queue: multiprocessing.SimpleQueue):
    torch.set_num_threads(1)
    while True:
        img_path = path_queue.get()
        img = Image.open(img_path)
        data_queue.put(T(img))
        path_queue.task_done()
def act(flags, actor_index: int, free_queue: mp.SimpleQueue,
        full_queue: mp.SimpleQueue, model: torch.nn.Module, buffers: Buffers,
        initial_agent_state_buffers, level_name):
    try:
        logging.info("Actor %i started.", actor_index)
        timings = prof.Timings()  # Keep track of how fast things are.
        seed = actor_index ^ int.from_bytes(os.urandom(4), byteorder="little")
        ######changed next line
        gym_env = create_env(flags, level_name, seed)
        env = environment.Environment(gym_env)
        env_output = env.initial()
        agent_state = model.initial_state(batch_size=1)
        agent_output, unused_state = model(env_output, agent_state)
        while True:
            index = free_queue.get()
            if index is None:
                break

            # Write old rollout end.
            for key in env_output:
                buffers[key][index][0, ...] = env_output[key]
            for key in agent_output:
                buffers[key][index][0, ...] = agent_output[key]
            for i, tensor in enumerate(agent_state):
                initial_agent_state_buffers[index][i][...] = tensor

            # Do new rollout.
            for t in range(flags.unroll_length):
                timings.reset()

                with torch.no_grad():
                    agent_output, agent_state = model(env_output, agent_state)

                timings.time("model")

                env_output = env.step(agent_output["action"])

                timings.time("step")

                for key in env_output:
                    buffers[key][index][t + 1, ...] = env_output[key]
                for key in agent_output:
                    buffers[key][index][t + 1, ...] = agent_output[key]

                timings.time("write")
            full_queue.put(index)

        if actor_index == 0:
            logging.info("Actor %i: %s", actor_index, timings.summary())

    except KeyboardInterrupt:
        pass  # Return silently.
    except Exception as e:
        logging.error("Exception in worker process %i", actor_index)
        traceback.print_exc()
        print()
        raise e
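
The act() loops above rely on an Environment wrapper whose initial() and step() return a dict of tensors with leading [T=1, B=1] dimensions, so each entry can be written into buffers[key][index][t, ...] and fed straight back to the model. The stand-in below illustrates that contract only; the exact keys, dtypes and the reset-on-done behaviour are assumptions, not the actual environment.Environment.

import torch


class DictEnvironment:
    """Minimal stand-in for the environment.Environment wrapper used by act()."""

    def __init__(self, gym_env):
        self.gym_env = gym_env
        self.episode_return = torch.zeros(1, 1)
        self.episode_step = torch.zeros(1, 1, dtype=torch.int64)

    @staticmethod
    def _frame(obs):
        t = torch.as_tensor(obs)
        return t.view(1, 1, *t.shape)  # add [T=1, B=1] dims

    def initial(self):
        self.episode_return = torch.zeros(1, 1)
        self.episode_step = torch.zeros(1, 1, dtype=torch.int64)
        return dict(
            frame=self._frame(self.gym_env.reset()),
            reward=torch.zeros(1, 1),
            done=torch.zeros(1, 1, dtype=torch.bool),
            episode_return=self.episode_return,
            episode_step=self.episode_step,
            last_action=torch.zeros(1, 1, dtype=torch.int64),
        )

    def step(self, action):
        obs, reward, done, _ = self.gym_env.step(action.item())
        self.episode_step = self.episode_step + 1
        self.episode_return = self.episode_return + reward
        out = dict(
            frame=self._frame(obs),
            reward=torch.tensor([[float(reward)]]),
            done=torch.tensor([[done]]),
            episode_return=self.episode_return,
            episode_step=self.episode_step,
            last_action=action.view(1, 1),
        )
        if done:  # report the finished episode, then reset for the next rollout step
            out["frame"] = self._frame(self.gym_env.reset())
            self.episode_return = torch.zeros(1, 1)
            self.episode_step = torch.zeros(1, 1, dtype=torch.int64)
        return out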
Example 10
def act(
    flags,
    actor_index: int,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    model: torch.nn.Module,
    buffers,
    initial_agent_state_buffers,
):
    try:
        logging.info("Actor %i started.", actor_index)

        gym_env = create_env(
            flags.env,
            savedir=flags.rundir,
            archivefile="nethack.%i.%%(pid)i.%%(time)s.zip" % actor_index,
        )
        env = ResettingEnvironment(gym_env)
        env_output = env.initial()
        agent_state = model.initial_state(batch_size=1)
        agent_output, unused_state = model(env_output, agent_state)
        while True:
            index = free_queue.get()
            if index is None:
                break

            # Write old rollout end.
            for key in env_output:
                buffers[key][index][0, ...] = env_output[key]
            for key in agent_output:
                buffers[key][index][0, ...] = agent_output[key]
            for i, tensor in enumerate(agent_state):
                initial_agent_state_buffers[index][i][...] = tensor

            # Do new rollout.
            for t in range(flags.unroll_length):
                with torch.no_grad():
                    agent_output, agent_state = model(env_output, agent_state)

                env_output = env.step(agent_output["action"])

                for key in env_output:
                    buffers[key][index][t + 1, ...] = env_output[key]
                for key in agent_output:
                    buffers[key][index][t + 1, ...] = agent_output[key]

            full_queue.put(index)

    except KeyboardInterrupt:
        pass  # Return silently.
    except Exception:
        logging.error("Exception in worker process %i", actor_index)
        traceback.print_exc()
        print()
        raise
Example 11
def act(i: int, free_queue: mp.SimpleQueue, full_queue: mp.SimpleQueue,
        model: torch.nn.Module, buffers: Buffers, flags):
    try:
        logging.info('Actor %i started.', i)
        timings = prof.Timings()  # Keep track of how fast things are.

        gym_env = Net.create_env(flags)
        seed = i ^ int.from_bytes(os.urandom(4), byteorder='little')
        gym_env.seed(seed)
        env = environment.Environment(gym_env)
        env_output = env.initial()
        agent_output = model(env_output)
        while True:
            index = free_queue.get()
            if index is None:
                break

            # Write old rollout end.
            for key in env_output:
                buffers[key][index][0, ...] = env_output[key]
            for key in agent_output:
                buffers[key][index][0, ...] = agent_output[key]

            # Do new rollout
            for t in range(flags.unroll_length):
                timings.reset()

                with torch.no_grad():
                    agent_output = model(env_output)

                timings.time('model')

                env_output = env.step(agent_output['action'])

                timings.time('step')

                for key in env_output:
                    buffers[key][index][t + 1, ...] = env_output[key]
                for key in agent_output:
                    buffers[key][index][t + 1, ...] = agent_output[key]

                timings.time('write')
            full_queue.put(index)

        if i == 0:
            logging.info('Actor %i: %s', i, timings.summary())

    except KeyboardInterrupt:
        pass  # Return silently.
    except Exception as e:
        logging.error('Exception in worker process %i', i)
        traceback.print_exc()
        print()
        raise e
Example 12
def main():
    episode = 0
    path = "/home/michael/dev/fyp/AIGym/MP-Conv-Pong/"
    mp.set_start_method('spawn')
    worker_count = 3  #mp.cpu_count()
    learning_rate = 1e-3
    model = Model(2)
    criterion = nn.CrossEntropyLoss(reduction="none")
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    if episode > 0:
        model.load_state_dict(torch.load(path + "Models/" + str(episode)))
        optimizer.load_state_dict(
            torch.load(path + "Optimizers/" + str(episode)))
    model.cuda()
    model.share_memory()

    envs = [gym.make("Pong-v0") for i in range(worker_count)]

    epoch_size = 1
    batch_save = 128
    best_score = None
    running_reward = None
    reward_queue = SimpleQueue()

    #Start workers
    workers = [
        Worker(envs[i], epoch_size, model, criterion, optimizer, reward_queue,
               str(i + 1)) for i in range(worker_count)
    ]
    [w.start() for w in workers]

    # Gather rewards
    while True:
        reward = reward_queue.get()
        if isinstance(reward, Exception):
            print(reward)
        else:
            episode += 1
            if (episode % batch_save == 0):
                torch.save(model.state_dict(), path + "Models/" + str(episode))
                torch.save(optimizer.state_dict(),
                           path + "Optimizers/" + str(episode))

            if best_score is None:
                best_score = reward
            elif reward > best_score:
                best_score = reward
            running_reward = reward if running_reward is None else running_reward * 0.95 + reward * 0.05
            if episode % 1 == 0:
                print(
                    "episode {:4.0f} complete - average reward = {:3.0f}, last score was = {:3.0f}, best score is = {:3.0f}"
                    .format(episode, running_reward, reward, best_score))
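
The reward-gathering loop above checks isinstance(reward, Exception), which implies the workers forward their own failures over the queue instead of dying silently (the Worker fragment further down does exactly that). A sketch of that worker side; the class layout and the stubbed-out play_epoch are assumptions.

import traceback

import torch.multiprocessing as mp


class Worker(mp.Process):
    def __init__(self, env, epoch_size, model, criterion, optimizer, reward_queue, name):
        super().__init__(name=name)
        self.env = env
        self.epoch_size = epoch_size
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer
        self.reward_queue = reward_queue

    def play_epoch(self):
        # Placeholder for the actual rollout + training step.
        return 0.0

    def run(self):
        try:
            while True:
                epoch_reward = self.play_epoch()
                self.reward_queue.put(epoch_reward)
        except Exception as e:
            # The parent loop checks isinstance(reward, Exception), so errors surface there.
            self.reward_queue.put(e)
            self.reward_queue.put(traceback.format_exc())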
Example 13
    def _generate_parallel(self, iteration, network, device, num_workers):
        q, r = divmod(self.remaining_games, num_workers)
        num_active_workers = Value('i', num_workers)
        resign_threshold = Value('d', self.resign_mgr.threshold())
        evaluator_mgr = BulkEvaluatorManager([network], device, num_workers)
        output_queue = SimpleQueue()

        # start the workers
        workers = []
        for worker_id in range(num_workers):
            num_games = q + 1 if worker_id < r else q
            evaluator = evaluator_mgr.get_evaluator(worker_id, 0)
            worker = Process(
                target=self._worker_job,
                args=(worker_id, num_games, num_active_workers,
                      resign_threshold, evaluator, output_queue),
            )
            workers.append(worker)
            worker.start()

        # start evaluator server
        server = evaluator_mgr.get_server(num_active_workers)
        server.start()

        # collect the examples generated by workers
        while num_active_workers.value > 0 or not output_queue.empty():
            examples, resign_value_history, result = output_queue.get()
            self.example_pool += examples
            self.game_length.append(len(examples))

            # add the history into resignation manager to update the threshold
            if resign_value_history is not None:
                self.resign_mgr.add(resign_value_history, result)
                resign_threshold.value = self.resign_mgr.threshold()

            self.remaining_games -= 1

            # periodically save the progress
            if (self.conf.GAMES_PER_ITERATION - self.remaining_games) \
                    % self.conf.EXAMPLE_POOL_SAVE_FREQUENCY == 0:
                self.save(iteration)
                log.info(
                    f'[iter={iteration}] ExamplePool: checkpoint saved, '
                    f'{self.remaining_games} games remaining'
                )

        for worker in workers:
            worker.join()
        server.join()
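
The collection loop above exits once the shared num_active_workers counter drops to zero and the queue is drained, which implies each worker decrements the counter after producing its share of games. A self-contained sketch of that termination protocol; the payload tuples are placeholders for (examples, resign_value_history, result).

from multiprocessing import Process, SimpleQueue, Value


def _worker_job(worker_id, num_games, num_active_workers, output_queue):
    # Produce all results first, then decrement the shared counter so the
    # collector's "counter > 0 or queue not empty" loop can terminate.
    for game in range(num_games):
        output_queue.put((worker_id, game))  # stand-in payload
    with num_active_workers.get_lock():
        num_active_workers.value -= 1


if __name__ == "__main__":
    num_workers = 4
    num_active_workers = Value('i', num_workers)
    output_queue = SimpleQueue()
    workers = [Process(target=_worker_job, args=(w, 3, num_active_workers, output_queue))
               for w in range(num_workers)]
    for w in workers:
        w.start()

    results = []
    while num_active_workers.value > 0 or not output_queue.empty():
        if not output_queue.empty():
            results.append(output_queue.get())

    for w in workers:
        w.join()
    print(len(results))  # 12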
Example 14
def get_batch(
        flags,
        free_queue: mp.SimpleQueue,
        full_queue: mp.SimpleQueue,
        buffers: Buffers,
        initial_agent_state_buffers,
        timings,
        lock=threading.Lock(),
):
    # need to make sure that we wait until batch_size trajectories/rollouts have been put into the queue
    with lock:
        timings.time("lock")
        # get the indices of actors "offering" trajectories/rollouts to be processed by the learner
        indices = [full_queue.get() for _ in range(flags.batch_size)]
        timings.time("dequeue")

    # create the batch as a dictionary for all the data in the buffers (see act() function for list of
    # keys), where each entry is a tensor of these values stacked across actors along the first dimension,
    # which I believe should be the "batch dimension" (see _format_frame())
    batch = {
        key: torch.stack([buffers[key][m] for m in indices], dim=1)
        for key in buffers
    }

    # similar thing for the initial agent states, where I think the tuples are concatenated to become torch tensors
    initial_agent_state = (torch.cat(ts, dim=1) for ts in zip(
        *[initial_agent_state_buffers[m] for m in indices]))
    timings.time("batch")

    # once the data has been "transferred" into batch and initial_agent_state,
    # signal that the data has been processed to the actors
    for m in indices:
        free_queue.put(m)
    timings.time("enqueue")

    # move the data to the right device (e.g. GPU)
    batch = {
        k: t.to(device=flags.device, non_blocking=True)
        for k, t in batch.items()
    }
    initial_agent_state = tuple(
        t.to(device=flags.device, non_blocking=True)
        for t in initial_agent_state)
    timings.time("device")

    return batch, initial_agent_state
Example 15
    def start(self, gui_queue=False):
        if gui_queue:
            self.gui_queue = SimpleQueue()
        else:
            self.gui_queue = None
        self.fpga_process = Process(target=self.BMI_core_func, name='fpga', args=(self.gui_queue,))  # , args=(self.pipe_jovian_side,)
        self.fpga_process.daemon = True
        self.fpga_process.start()
def get_batch(
        flags,
        free_queue: mp.SimpleQueue,
        full_queue: mp.SimpleQueue,
        buffers: Buffers,
        initial_agent_state_buffers,
        timings,
        lock=threading.Lock(),
):
    with lock:
        timings.time("lock")
        indices = [full_queue.get() for _ in range(flags.batch_size)]

        # TODO: Check whether emptying full_queue and then re-adding to it takes very long;
        #       it seems like the only way to ensure a batch of similar-length elements.
        # One problem with doing this is that a really short trajectory may never end up
        # being used. DON'T CHANGE THIS FOR NOW.

        timings.time("dequeue")
    batch = {
        key: torch.stack([buffers[key][m] for m in indices], dim=1)
        for key in buffers
    }
    initial_agent_state = (torch.cat(ts, dim=1) for ts in zip(
        *[initial_agent_state_buffers[m] for m in indices]))
    timings.time("batch")
    for m in indices:
        free_queue.put(m)
    timings.time("enqueue")
    batch = {
        k: t.to(device=flags.device, non_blocking=True)
        for k, t in batch.items()
    }
    initial_agent_state = tuple(
        t.to(device=flags.device, non_blocking=True)
        for t in initial_agent_state)
    timings.time("device")
    return batch, initial_agent_state
Example 17
    def __init__(self, factory_env, factory_mgr, n_tasks):
        self.pipe_cmd = Queue()  # we want to queue more data in a row
        self.pipe_data = [SimpleQueue() for _ in range(n_tasks + 1)]

        self.factory_mgr = factory_mgr

        # create thread ( in main process!! ) which will handle requests!
        self.com = RemoteTaskServer(factory_mgr, factory_env, self.pipe_cmd,
                                    self.pipe_data)

        self.dtb = {}
        self.lock = threading.RLock()

        #    def turnon():
        self.com.start()
Example 18
def train_ai2thor(model, args, rank=0, b=None):

    seed = args.seed + 10000 * rank
    torch.manual_seed(seed)
    np.random.seed(seed)

    # torch.cuda.set_device(rank)
    # device = torch.device(f'cuda:{rank}')
    device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
    # if torch.cuda.is_available():
    #     os.environ['DISPLAY'] = f':{rank}'

    model = model.to(device)
    model.share_memory()

    # Experience buffer
    storage = PPOBuffer(model.obs_shape,
                        args.steps,
                        args.num_workers,
                        args.state_size,
                        args.gamma,
                        device=device)
    storage.share_memory()

    #torch.multiprocessing.set_start_method('spawn')
    # start multiple processes
    ready_to_works = [Event() for _ in range(args.num_workers)]
    exit_flag = Value('i', 0)
    queue = SimpleQueue()

    processes = []
    # task_config_file = "config_files/multiMugTaskTrain.json"
    task_config_file = "config_files/multiMugTaskTrain.json"
    # start workers
    for worker_id in range(args.num_workers):
        print('START>>>>>>>>>>>>>>>>')
        p = Process(target=worker,
                    args=(worker_id, model, storage, ready_to_works[worker_id],
                          queue, exit_flag, args.use_priors, task_config_file))
        p.start()
        processes.append(p)

    # start trainer
    train_params = {
        "epochs": args.epochs,
        "steps": args.steps,
        "world_size": args.world_size,
        "num_workers": args.num_workers
    }
    ppo_params = {
        "clip_param": args.clip_param,
        "train_iters": args.train_iters,
        "mini_batch_size": args.mini_batch_size,
        "value_loss_coef": args.value_loss_coef,
        "entropy_coef": args.entropy_coef,
        "rnn_steps": args.rnn_steps,
        "lr": args.lr,
        "max_kl": args.max_kl
    }

    distributed = False
    if args.world_size > 1:
        if distributed == True:
            distributed = True
            # Initialize Process Group, distributed backend type
            dist_backend = 'nccl'
            # Url used to setup distributed training
            dist_url = "tcp://127.0.0.1:23456"
            print("Initialize Process Group... pid:", os.getpid())
            dist.init_process_group(backend=dist_backend,
                                    init_method=dist_url,
                                    rank=rank,
                                    world_size=args.world_size)
            # Make model DistributedDataParallel
            model = DistributedDataParallel(model,
                                            device_ids=[rank],
                                            output_device=rank)
    else:
        print('Distribution is not allowed')

    learner(model, storage, train_params, ppo_params, ready_to_works, queue,
            exit_flag, rank, distributed, b)

    for p in processes:
        print("process ", p.pid, " joined")
        p.join()
Example 19
                        epoch_reward += r
                        history["observation"].append(x.cpu())
                        history["reward"].append(torch.FloatTensor(1).fill_(r))

                self.reward_queue.put(epoch_reward)
                dataset = compileHistory(history)
                train(self.model, self.criterion, self.optimizer, dataset)

        except Exception as identifier:
            self.reward_queue.put(identifier)
            self.reward_queue.put(traceback.format_exc())


if __name__ == '__main__':
    mp.set_start_method('spawn')
    learning_rate = 1e-3
    model = Model(2)
    model.cuda()
    model.share_memory()
    criterion = nn.CrossEntropyLoss(reduction="none")
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    env = gym.make("Pong-v0")
    epoch_size = 1
    queue = SimpleQueue()
    worker = Worker(env, epoch_size, model, criterion, optimizer, queue,
                    "test")
    worker.start()
    while True:
        print(queue.get())
Example 20
def act(
    args,
    actor_index: int,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    model: torch.nn.Module,
    buffers: Buffers,
    initial_agent_state_buffers,
):
    try:
        logging.info("Actor %i started.", actor_index)
        timings = Timings()  # Keep track of how fast things are.

        gym_env = create_env(args)
        seed = actor_index ^ int.from_bytes(os.urandom(4), byteorder="little")
        gym_env.seed(seed)
        env = Environment(gym_env)
        def make_env(args):
            def thunk():
                env = create_env(args)
                return env
            return thunk
        envs = DummyVecEnv([make_env(args) for i in range(1)])
        
        env_output = env.initial()
        envs.reset()
        agent_state = model.initial_state(batch_size=1)
        agent_output, unused_state = model(env_output, agent_state)
        while True:
            index = free_queue.get()
            if index is None:
                break

            # Write old rollout end.
            for key in env_output:
                buffers[key][index][0, ...] = env_output[key]
            for key in agent_output:
                buffers[key][index][0, ...] = agent_output[key]
            for i, tensor in enumerate(agent_state):
                initial_agent_state_buffers[index][i][...] = tensor

            # Do new rollout.
            for t in range(args.unroll_length):
                timings.reset()

                with torch.no_grad():
                    agent_output, agent_state = model(env_output, agent_state)

                # timings.time("model")

                env_output = env.step(agent_output["action"])
                # env_output = env.step(agent_output["action"])
                # envs.step((torch.randint(0, envs.action_space.n, (envs.num_envs,))).numpy())
                assert agent_output["action"] == env_output["last_action"]
                timings.time("step")

                for key in env_output:
                    buffers[key][index][t + 1, ...] = env_output[key]
                for key in agent_output:
                    buffers[key][index][t + 1, ...] = agent_output[key]

                timings.time("write")
            full_queue.put(index)

        if actor_index == 0:
            logging.info("Actor %i: %s", actor_index, timings.summary())

    except KeyboardInterrupt:
        pass  # Return silently.
    except Exception as e:
        logging.error("Exception in worker process %i", actor_index)
        traceback.print_exc()
        print()
        raise e
Example 21
def train(config):
    task_queue = SimpleQueue()
    result_queue = SimpleQueue()
    stop = mp.Value('i', False)
    stats = SharedStats(config.state_dim)
    normalizers = [StaticNormalizer(config.state_dim) for _ in range(config.num_workers)]
    for normalizer in normalizers:
        normalizer.offline_stats.load(stats)

    workers = [Worker(id, normalizers[id], task_queue, result_queue, stop, config) for id in range(config.num_workers)]
    for w in workers: w.start()

    opt = cma.CMAOptions()
    opt['tolfun'] = -config.target
    opt['popsize'] = config.pop_size
    opt['verb_disp'] = 0
    opt['verb_log'] = 0
    opt['maxiter'] = sys.maxsize
    es = cma.CMAEvolutionStrategy(config.initial_weight, config.sigma, opt)

    total_steps = 0
    initial_time = time.time()
    training_rewards = []
    training_steps = []
    training_timestamps = []
    test_mean, test_ste = test(config, config.initial_weight, stats)
    logger.info('total steps %d, %f(%f)' % (total_steps, test_mean, test_ste))
    training_rewards.append(test_mean)
    training_steps.append(0)
    training_timestamps.append(0)
    while True:
        solutions = es.ask()
        for id, solution in enumerate(solutions):
            task_queue.put((id, solution))
        while not task_queue.empty():
            continue
        result = []
        while len(result) < len(solutions):
            if result_queue.empty():
                continue
            result.append(result_queue.get())
        result = sorted(result, key=lambda x: x[0])
        total_steps += np.sum([r[2] for r in result])
        cost = [r[1] for r in result]
        best_solution = solutions[np.argmin(cost)]
        elapsed_time = time.time() - initial_time
        test_mean, test_ste = test(config, best_solution, stats)
        logger.info('total steps %d, test %f(%f), best %f, elapsed time %f' %
            (total_steps, test_mean, test_ste, -np.min(cost), elapsed_time))
        training_rewards.append(test_mean)
        training_steps.append(total_steps)
        training_timestamps.append(elapsed_time)
        # with open('data/%s-best_solution_%s.bin' % (TAG, config.task), 'wb') as f:
        #     pickle.dump(solutions[np.argmin(result)], f)
        if config.max_steps and total_steps > config.max_steps:
            stop.value = True
            break

        cost = fitness_shift(cost)
        es.tell(solutions, cost)
        # es.disp()
        for normalizer in normalizers:
            stats.merge(normalizer.online_stats)
            normalizer.online_stats.zero()
        for normalizer in normalizers:
            normalizer.offline_stats.load(stats)

    stop.value = True
    for w in workers: w.join()
    return [training_rewards, training_steps, training_timestamps]
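
This loop pushes (id, solution) tasks and expects (id, cost, steps) tuples back: results are sorted by id, the cost is read at position 1 and the step count at position 2. A sketch of the worker side of that protocol; the class layout and the stubbed-out evaluate are assumptions.

import torch.multiprocessing as mp


class Worker(mp.Process):
    def __init__(self, id, normalizer, task_queue, result_queue, stop, config):
        super().__init__()
        self.id = id
        self.normalizer = normalizer
        self.task_queue = task_queue
        self.result_queue = result_queue
        self.stop = stop
        self.config = config

    def evaluate(self, solution):
        # Placeholder: run the policy parameterised by `solution` and return
        # (cost to minimise, environment steps used).
        return 0.0, 1

    def run(self):
        while not self.stop.value:
            if self.task_queue.empty():
                continue
            id, solution = self.task_queue.get()
            cost, steps = self.evaluate(solution)
            self.result_queue.put((id, cost, steps))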
Example 22
def act(i: int, free_queue: mp.SimpleQueue, full_queue: mp.SimpleQueue,
        model: torch.nn.Module, buffers: Buffers,
        episode_state_count_dict: dict, train_state_count_dict: dict,
        initial_agent_state_buffers, flags):
    try:
        log.info('Actor %i started.', i)
        timings = prof.Timings()

        gym_env = create_env(flags)
        seed = i ^ int.from_bytes(os.urandom(4), byteorder='little')
        gym_env.seed(seed)

        if flags.num_input_frames > 1:
            gym_env = FrameStack(gym_env, flags.num_input_frames)

        env = Environment(gym_env,
                          fix_seed=flags.fix_seed,
                          env_seed=flags.env_seed)

        env_output = env.initial()
        agent_state = model.initial_state(batch_size=1)
        agent_output, unused_state = model(env_output, agent_state)

        while True:
            index = free_queue.get()
            if index is None:
                break

            # Write old rollout end.
            for key in env_output:
                buffers[key][index][0, ...] = env_output[key]
            for key in agent_output:
                buffers[key][index][0, ...] = agent_output[key]
            for j, tensor in enumerate(agent_state):
                initial_agent_state_buffers[index][j][...] = tensor

            # Update the episodic state counts
            episode_state_key = tuple(env_output['frame'].view(-1).tolist())
            if episode_state_key in episode_state_count_dict:
                episode_state_count_dict[episode_state_key] += 1
            else:
                episode_state_count_dict.update({episode_state_key: 1})
            buffers['episode_state_count'][index][0, ...] = \
                torch.tensor(1 / np.sqrt(episode_state_count_dict.get(episode_state_key)))

            # Reset the episode state counts when the episode is over
            if env_output['done'][0][0]:
                episode_state_count_dict = dict()

            # Update the training state counts if you're doing count-based exploration
            if flags.model == 'count':
                train_state_key = tuple(env_output['frame'].view(-1).tolist())
                if train_state_key in train_state_count_dict:
                    train_state_count_dict[train_state_key] += 1
                else:
                    train_state_count_dict.update({train_state_key: 1})
                buffers['train_state_count'][index][0, ...] = \
                    torch.tensor(1 / np.sqrt(train_state_count_dict.get(train_state_key)))

            # Do new rollout
            for t in range(flags.unroll_length):
                timings.reset()

                with torch.no_grad():
                    agent_output, agent_state = model(env_output, agent_state)

                timings.time('model')

                env_output = env.step(agent_output['action'])

                timings.time('step')

                for key in env_output:
                    buffers[key][index][t + 1, ...] = env_output[key]

                for key in agent_output:
                    buffers[key][index][t + 1, ...] = agent_output[key]

                # Update the episodic state counts
                episode_state_key = tuple(
                    env_output['frame'].view(-1).tolist())
                if episode_state_key in episode_state_count_dict:
                    episode_state_count_dict[episode_state_key] += 1
                else:
                    episode_state_count_dict.update({episode_state_key: 1})
                buffers['episode_state_count'][index][t + 1, ...] = \
                    torch.tensor(1 / np.sqrt(episode_state_count_dict.get(episode_state_key)))

                # Reset the episode state counts when the episode is over
                if env_output['done'][0][0]:
                    episode_state_count_dict = dict()

                # Update the training state counts if you're doing count-based exploration
                if flags.model == 'count':
                    train_state_key = tuple(
                        env_output['frame'].view(-1).tolist())
                    if train_state_key in train_state_count_dict:
                        train_state_count_dict[train_state_key] += 1
                    else:
                        train_state_count_dict.update({train_state_key: 1})
                    buffers['train_state_count'][index][t + 1, ...] = \
                        torch.tensor(1 / np.sqrt(train_state_count_dict.get(train_state_key)))

                timings.time('write')
            full_queue.put(index)

        if i == 0:
            log.info('Actor %i: %s', i, timings.summary())

    except KeyboardInterrupt:
        pass
    except Exception as e:
        logging.error('Exception in worker process %i', i)
        traceback.print_exc()
        print()
        raise e
Example 23
def act(
    flags,
    actor_index: int,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    model: torch.nn.Module,
    buffers: Buffers,
    initial_agent_state_buffers,
):
    try:
        logging.info("Actor %i started.", actor_index)
        timings = prof.Timings()  # Keep track of how fast things are.

        gym_env = create_env(flags)
        seed = actor_index ^ int.from_bytes(os.urandom(4), byteorder="little")
        gym_env.seed(seed)
        env = environment.Environment(gym_env)
        env_output = env.initial()

        agent_state = model.initial_state(batch_size=1)
        mems, mem_padding = None, None
        agent_output, unused_state, mems, mem_padding, _ = model(
            env_output, agent_state, mems, mem_padding)
        while True:
            index = free_queue.get()
            if index is None:
                break

            # Explicitly making 'done' False to allow the loop to run is no longer
            # needed, since we now take a step out of the done state when we
            # arrive at 'done'.
            # env_output['done'] = torch.tensor([0], dtype=torch.uint8)

            # Write old rollout end.
            for key in env_output:
                buffers[key][index][0, ...] = env_output[key]
            for key in agent_output:
                buffers[key][index][0, ...] = agent_output[key]
            for i, tensor in enumerate(agent_state):
                initial_agent_state_buffers[index][i][...] = tensor

            # Do one new rollout, until flags.unroll_length
            t = 0
            while t < flags.unroll_length and not env_output['done'].item():
                # for t in range(flags.unroll_length):
                timings.reset()

                # Removed since this will never be true here (moved to after the loop)
                # if env_output['done'].item():
                #    mems = None

                with torch.no_grad():
                    agent_output, agent_state, mems, mem_padding, _ = model(
                        env_output, agent_state, mems, mem_padding)

                timings.time("model")

                # TODO: Shakti add action repeat?
                env_output = env.step(agent_output["action"])

                timings.time("step")

                for key in env_output:
                    buffers[key][index][t + 1, ...] = env_output[key]
                for key in agent_output:
                    buffers[key][index][t + 1, ...] = agent_output[key]

                timings.time("write")
                t += 1

            if env_output['done'].item():
                mems = None
                # Take arbitrary step to reset environment
                env_output = env.step(torch.tensor([2]))

            if t != flags.unroll_length:
                # TODO I checked and seems good but Shakti can you check as well?
                buffers['done'][index][t + 1:] = torch.tensor(
                    [True]).repeat(flags.unroll_length - t)

            full_queue.put(index)

        if actor_index == 0:
            logging.info("Actor %i: %s", actor_index, timings.summary())

    except KeyboardInterrupt:
        pass  # Return silently.
    except Exception as e:
        logging.error("Exception in worker process %i", actor_index)
        traceback.print_exc()
        # print()
        raise e
Example 24
def act(
    flags,
    game_params,
    actor_index: int,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    model: torch.nn.Module,
    buffers: Buffers,
):
    try:
        logging.info("Actor %i started.", actor_index)
        timings = prof.Timings()  # Keep track of how fast things are.

        seed = actor_index ^ int.from_bytes(os.urandom(4), byteorder="little")
        sc_env = init_game(game_params['env'],
                           flags.map_name,
                           random_seed=seed)
        obs_processer = IMPALA_ObsProcesser(action_table=model.action_table,
                                            **game_params['obs_processer'])
        env = environment.Environment(sc_env, obs_processer, seed)
        # initial rollout starts here
        env_output = env.initial()
        with torch.no_grad():
            agent_output = model.actor_step(env_output)

        while True:
            index = free_queue.get()
            if index is None:
                break

            # Write old rollout end.
            for key in env_output:
                buffers[key][index][0, ...] = env_output[key]
            for key in agent_output:
                if key not in ['sc_env_action']:  # no need to save this key on buffers
                    buffers[key][index][0, ...] = agent_output[key]

            # Do new rollout.
            for t in range(flags.unroll_length):
                timings.reset()

                env_output = env.step(agent_output["sc_env_action"])

                timings.time("step")

                with torch.no_grad():
                    agent_output = model.actor_step(env_output)

                timings.time("model")

                #env_output = env.step(agent_output["sc_env_action"])

                #timings.time("step")

                for key in env_output:
                    buffers[key][index][t + 1, ...] = env_output[key]
                for key in agent_output:
                    if key not in ['sc_env_action']:  # no need to save this key on buffers
                        buffers[key][index][t + 1, ...] = agent_output[key]
                # env_output will be like
                # s_{0}, ..., s_{T}
                # act_mask_{0}, ..., act_mask_{T}
                # discount_{0}, ..., discount_{T}
                # r_{-1}, ..., r_{T-1}
                # agent_output will be like
                # a_0, ..., a_T with a_t ~ pi(.|s_t)
                # log_pi(a_0|s_0), ..., log_pi(a_T|s_T)
                # so the learner can use (s_i, act_mask_i) to predict log_pi_i
                timings.time("write")
            full_queue.put(index)

        if actor_index == 0:
            logging.info("Actor %i: %s", actor_index, timings.summary())

    except KeyboardInterrupt:
        pass  # Return silently.
    except Exception as e:
        logging.error("Exception in worker process %i", actor_index)
        traceback.print_exc()
        print()
        raise e
Example 25
    def __init__(self,
 # brain configs
            encoder,
            brain_descriptions,
            n_actors, n_critics,
            n_history, state_size, action_size,
            n_step, floating_step, gamma,
 # agent configs
            update_goal,
            her_max_ratio,
            gae, gae_tau,
            freeze_delta, freeze_count,
            ):

        self.gates = []
        self.agents = []
        self.brains = []
        self.offsets = [0]
        self.total = 0

        encoder.share_memory()
        for bd in brain_descriptions:
            self.total += bd.count
            self.offsets.append(self.offsets[-1] + bd.count)
            self.gates.append(SimpleQueue())
            self.brains.append(Brain(
                xid=len(self.brains),
                ddpg=bd.ddpg,
                Actor=bd.Actor, Critic=bd.Critic, encoder=encoder, master=(not len(self.brains)),
                n_actors=n_actors, n_critics=n_critics,
                n_history=n_history, state_size=state_size, action_size=action_size,
                resample_delay=bd.resample_delay,
                lr_actor=bd.lr_actor, lr_critic=bd.lr_critic, clip_norm=bd.clip_norm,
                n_step=n_step, gamma=gamma, gae=gae,
                ppo_eps=bd.ppo_eps, dbgout=bd.dbgout,
                adv_on=bd.adv_on, adv_boost=bd.adv_boost,
                model_path=bd.model_path, save=bd.save, load=bd.load, delay=bd.delay,
                ))
            self.brains[-1].share_memory() # make it explicit

            self.agents.append(
                Agent(self.brains[-1],
                    replay_buffer=bd.replay_buffer, update_goal=update_goal,
                    n_groups=bd.n_groups,
                    n_step=n_step, floating_step=floating_step, gamma=gamma, good_reach=bd.good_reach,
                    sync_delta=bd.sync_delta, learning_delay=bd.learning_delay, learning_repeat=bd.learning_repeat, batch_size=bd.batch_size,
                    fresh_frac=bd.fresh_frac, optim_epochs=bd.optim_epochs,
                    replay_cleaning=bd.replay_cleaning, prob_treshold=bd.prob_treshold,
                    her_max_ratio=her_max_ratio,
                    gae=gae, gae_tau=gae_tau,
                    tau_replay_counter=bd.tau_replay_counter, tau_base=bd.tau_base, tau_final=bd.tau_final,
                    freeze_delta=freeze_delta, freeze_count=freeze_count,
                    ))
            continue

            self.agents.append(Process(
                    target=agent_launch,
                    args=(self.gates[-1], self.brains[-1],
                        bd.replay_buffer, update_goal,
                        bd.n_groups,
                        n_step, floating_step, gamma, bd.good_reach,
                        bd.sync_delta, bd.learning_delay, bd.learning_repeat, bd.batch_size,
                        bd.fresh_frac, bd.optim_epochs,
                        bd.replay_cleaning, bd.prob_treshold,
                        her_max_ratio,
                        gae, gae_tau,
                        bd.tau_replay_counter, bd.tau_base, bd.tau_final,
                        freeze_delta, freeze_count,
                        )))

            self.agents[-1].start()
Example 26
    def __init__(self, num):
        self.lock = threading.RLock()
        self.pipes = [SimpleQueue() for _ in range(num + 1)]
        self.counter = {"reset": [], "step": []}
Example 27
def act(
    flags,
    env: str,
    task: int,
    full_action_space: bool,
    actor_index: int,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    model: torch.nn.Module,
    buffers: Buffers,
    initial_agent_state_buffers,
):
    try:
        logging.info("Actor %i started.", actor_index)
        timings = prof.Timings()  # Keep track of how fast things are.

        # create the environment from command line parameters
        # => could also create a special one which operates on a list of games (which we need)
        gym_env = create_env(
            env,
            frame_height=flags.frame_height,
            frame_width=flags.frame_width,
            gray_scale=(flags.aaa_input_format == "gray_stack"),
            full_action_space=full_action_space,
            task=task)

        # generate a seed for the environment (NO HUMAN STARTS HERE!), could just
        # use this for all games wrapped by the environment for our application
        seed = actor_index ^ int.from_bytes(os.urandom(4), byteorder="little")
        gym_env.seed(seed)

        # wrap the environment, this is actually probably the point where we could
        # use multiple games, because the other environment is still one from Gym
        env = environment.Environment(gym_env)

        # get the initial frame, reward, done, return, step, last_action
        env_output = env.initial()

        # perform the first step
        agent_state = model.initial_state(batch_size=1)
        agent_output, unused_state = model(env_output, agent_state)
        while True:
            # get a buffer index from the queue for free buffers (?)
            index = free_queue.get()
            # termination signal (?) for breaking out of this loop
            if index is None:
                break

            # Write old rollout end.
            # the keys here are (frame, reward, done, episode_return, episode_step, last_action)
            for key in env_output:
                buffers[key][index][0, ...] = env_output[key]
            # here the keys are (policy_logits, baseline, action)
            for key in agent_output:
                buffers[key][index][0, ...] = agent_output[key]
            # I think the agent_state is just the RNN/LSTM state (which will be the "initial" state for the next step)
            # not sure why it's needed though because it really just seems to be the initial state before starting to
            # act; however, it might be randomly initialised, which is why we might want it...
            for i, tensor in enumerate(agent_state):
                initial_agent_state_buffers[index][i][...] = tensor

            # Do new rollout
            for t in range(flags.unroll_length):
                timings.reset()

                # forward pass without keeping track of gradients to get the agent action
                with torch.no_grad():
                    agent_output, agent_state = model(env_output, agent_state)

                timings.time("model")

                # agent acting in the environment
                env_output = env.step(agent_output["action"])

                timings.time("step")

                # writing the respective outputs of the current step (see above for the list of keys)
                for key in env_output:
                    buffers[key][index][t + 1, ...] = env_output[key]
                for key in agent_output:
                    buffers[key][index][t + 1, ...] = agent_output[key]

                timings.time("write")

            # after finishing a trajectory put the index in the "full queue",
            # presumably so that the data can be processed/sent to the learner
            full_queue.put(index)

        if actor_index == 0:
            logging.info("Actor %i: %s", actor_index, timings.summary())

    except KeyboardInterrupt:
        pass  # Return silently.
    except Exception as e:
        logging.error("Exception in worker process %i", actor_index)
        traceback.print_exc()
        print()
        raise e
Example 28
def act(
    flags,
    game_params,
    actor_index: int,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    model: torch.nn.Module,
    buffers: Buffers,
    initial_agent_state_buffers,
):
    try:
        logging.info("Actor %i started.", actor_index)
        timings = prof.Timings()  # Keep track of how fast things are.

        seed = actor_index ^ int.from_bytes(os.urandom(4), byteorder="little")
        sc_env = init_game(game_params['env'], flags.map_name, random_seed=seed)
        obs_processer = IMPALA_ObsProcesser_v2(env=sc_env, action_table=model.action_table, **game_params['obs_processer'])
        env = environment.Environment_v2(sc_env, obs_processer, seed)
        # initial rollout starts here
        env_output = env.initial() 
        new_res = model.spatial_processing_block.new_res
        agent_state = model.spatial_processing_block.conv_lstm._init_hidden(
            batch_size=1, image_size=(new_res, new_res)
        )
        
        with torch.no_grad():
            agent_output, new_agent_state = model.actor_step(env_output, *agent_state[0]) 

        agent_state = agent_state[0] # _init_hidden yields [(h,c)], whereas actor step only (h,c)
        while True:
            index = free_queue.get()
            if index is None:
                break

            # Write old rollout end. 
            for key in env_output:
                buffers[key][index][0, ...] = env_output[key]
            for key in agent_output:
                if key not in ['sc_env_action']: # no need to save this key on buffers
                    buffers[key][index][0, ...] = agent_output[key]
            
            # lstm state in syncro with the environment / input to the agent 
            # that's why agent_state = new_agent_state gets executed afterwards
            initial_agent_state_buffers[index][0][...] = agent_state[0]
            initial_agent_state_buffers[index][1][...] = agent_state[1]
            
            
            # Do new rollout.
            for t in range(flags.unroll_length):
                timings.reset()

                env_output = env.step(agent_output["sc_env_action"])
                
                timings.time("step")
                
                # update state
                agent_state = new_agent_state 
            
                with torch.no_grad():
                    agent_output, new_agent_state = model.actor_step(env_output, *agent_state)
                
                timings.time("model")
                
                #env_output = env.step(agent_output["sc_env_action"])

                #timings.time("step")

                for key in env_output:
                    buffers[key][index][t+1, ...] = env_output[key] 
                for key in agent_output:
                    if key not in ['sc_env_action']: # no need to save this key on buffers
                        buffers[key][index][t+1, ...] = agent_output[key] 
                # env_output will be like
                # s_{0}, ..., s_{T}
                # act_mask_{0}, ..., act_mask_{T}
                # discount_{0}, ..., discount_{T}
                # r_{-1}, ..., r_{T-1}
                # agent_output will be like
                # a_0, ..., a_T with a_t ~ pi(.|s_t)
                # log_pi(a_0|s_0), ..., log_pi(a_T|s_T)
                # so the learner can use (s_i, act_mask_i) to predict log_pi_i
                timings.time("write")
            full_queue.put(index)

        if actor_index == 0:
            logging.info("Actor %i: %s", actor_index, timings.summary())

    except KeyboardInterrupt:
        pass  # Return silently.
    except Exception as e:
        logging.error("Exception in worker process %i", actor_index)
        traceback.print_exc()
        print()
        raise e
Example 29
def train(config):
    task_queue = SimpleQueue()
    result_queue = SimpleQueue()
    stop = mp.Value('i', False)
    stats = SharedStats(config.state_dim)
    param = torch.FloatTensor(torch.from_numpy(config.initial_weight))
    param.share_memory_()
    n_params = len(param.numpy().flatten())
    if config.args.noise_type == 'lss':
        noise_sizes = [
            config.state_dim * config.hidden_size,
            config.hidden_size * config.hidden_size,
            config.hidden_size * config.action_dim
        ]
    else:
        noise_sizes = None
    noise_generator = NoiseGenerator(n_params,
                                     config.pop_size,
                                     config.args.noise,
                                     noise_sizes=noise_sizes)
    normalizers = [
        StaticNormalizer(config.state_dim) for _ in range(config.num_workers)
    ]
    for normalizer in normalizers:
        normalizer.offline_stats.load(stats)
    workers = [
        Worker(id, param, normalizers[id], task_queue, result_queue, stop,
               noise_generator, config) for id in range(config.num_workers)
    ]
    for w in workers:
        w.start()

    training_rewards = []
    training_steps = []
    training_timestamps = []
    initial_time = time.time()
    total_steps = 0
    iteration = 0
    while not stop.value:
        test_mean, test_ste = test(config, param.numpy(), stats)
        elapsed_time = time.time() - initial_time
        training_rewards.append(test_mean)
        training_steps.append(total_steps)
        training_timestamps.append(elapsed_time)
        logger.info('Test: total steps %d, %f(%f), elapsed time %d' %
                    (total_steps, test_mean, test_ste, elapsed_time))

        for i in range(config.pop_size):
            task_queue.put(i)
        rewards = []
        epsilons = []
        steps = []
        while len(rewards) < config.pop_size:
            if result_queue.empty():
                continue
            epsilon, fitness, step = result_queue.get()
            epsilons.append(epsilon)
            rewards.append(fitness)
            steps.append(step)

        total_steps += np.sum(steps)
        r_mean = np.mean(rewards)
        r_std = np.std(rewards)
        # rewards = (rewards - r_mean) / r_std
        logger.info('Train: iteration %d, %f(%f)' %
                    (iteration, r_mean, r_std / np.sqrt(config.pop_size)))
        iteration += 1
        # if r_mean > config.target:
        if config.max_steps and total_steps > config.max_steps:
            stop.value = True
            break
        for normalizer in normalizers:
            stats.merge(normalizer.online_stats)
            normalizer.online_stats.zero()
        for normalizer in normalizers:
            normalizer.offline_stats.load(stats)
        if config.args.reward_type == 'rank':
            rewards = fitness_shift(rewards)
        gradient = np.asarray(epsilons) * np.asarray(rewards).reshape((-1, 1))
        gradient = np.mean(gradient, 0) / config.sigma
        gradient -= config.weight_decay * gradient
        if config.args.opt == 'adam':
            gradient = config.opt.update(gradient)
        gradient = torch.FloatTensor(gradient)
        param.add_(config.learning_rate * gradient)

    for w in workers:
        w.join()
    return [training_rewards, training_steps, training_timestamps]
Example 30
def act(flags, gym_env, actor_index: int, free_queue: mp.SimpleQueue,
        full_queue: mp.SimpleQueue, buffers: Buffers, actor_buffers: Buffers,
        actor_model_queues: List[mp.SimpleQueue],
        actor_env_queues: List[mp.SimpleQueue]):
    try:
        logging.info("Actor %i started.", actor_index)
        timings = prof.Timings()  # Keep track of how fast things are.

        gym_env = gym_env
        #seed = actor_index ^ int.from_bytes(os.urandom(4), byteorder="little")
        #gym_env.seed(seed)
        if flags.agent in ["CNN"]:
            env = environment.Environment(gym_env, "image")
        elif flags.agent in ["NLM", "KBMLP", "GCN"]:
            if flags.state in ["relative", "integer", "block"]:
                env = environment.Environment(gym_env, "VKB")
            elif flags.state == "absolute":
                env = environment.Environment(gym_env, "absVKB")
        env_output = env.initial()
        for key in env_output:
            actor_buffers[key][actor_index][0] = env_output[key]
        while True:
            index = free_queue.get()
            if index is None:
                break

            # Write old rollout end.
            for key in actor_buffers:
                buffers[key][index][0] = actor_buffers[key][actor_index][0]

            # Do new rollout.
            for t in range(flags.unroll_length):
                timings.reset()

                actor_model_queues[actor_index].put(actor_index)
                env_info = actor_env_queues[actor_index].get()
                if env_info == "exit":
                    return

                timings.time("model")

                env_output = env.step(actor_buffers["action"][actor_index][0])

                timings.time("step")

                for key in actor_buffers:
                    buffers[key][index][t + 1] = actor_buffers[key][actor_index][0]
                for key in env_output:
                    buffers[key][index][t + 1, ...] = env_output[key]
                for key in env_output:
                    actor_buffers[key][actor_index][0] = env_output[key]

                timings.time("write")

            full_queue.put(index)

        if actor_index == 0:
            logging.info("Actor %i: %s", actor_index, timings.summary())

    except KeyboardInterrupt:
        pass  # Return silently.
    except Exception as e:
        logging.error("Exception in worker process %i", actor_index)
        traceback.print_exc()
        print()
        raise e