Example 1
    def __init__(self, args, data, opts):
        '''
        Several tasks need to be handled here: first, how to treat the initial
        task; second, building the table and finally completing the sampling
        process; third, deriving the model size from the sampling process and
        selecting the parameters that will actually be used.
        :param args:
        :param data:
        :param opts:
        '''
        self.args = args
        self.data = data
        self.opts = opts
        self.controller = Controller(args=self.args,
                                     task_num=self.opts.num_task)
        self.controller_optim = Adam(self.controller.parameters(),
                                     lr=args.controller_lr)
        cuda_condition = torch.cuda.is_available() and args.with_cuda
        self.device = torch.device("cuda" if cuda_condition else "cpu")
        self.controller = self.controller.to(self.device)

        self.tasks_config = []
        self.task_acc = []
        self.model_dict = []
        self.task_scope = 1  # =>reuse
        self.general_scope = 1  # =>new
        if self.args.adapt: self.task_scope += 1
        if self.args.fuse: self.general_scope += 1
        self.tensorboard_writer = SummaryWriter()
        self.iter = 0
Example 2
    def get_data(self, request, id):
        response = api.get_controller(request, id)
        data = json.loads(response.text)

        controller = Controller(data["id"], data["controller_name"],
                                data["class_name"], data["enabled"])
        return controller
Example 3
    def __init__(self, mdir, device, time_limit, explorer=False):
        """ Build vae, rnn, controller and environment. """
        self.explorer = explorer

        # Locate the saved vae, mdrnn and controller checkpoints
        vae_file, rnn_file, ctrl_file = \
            [join(mdir, m, 'best.tar') for m in ['vae', 'mdrnn', 'ctrl']]

        if self.explorer:
            ctrl_file = join(mdir, 'exp', 'best.tar')

        assert exists(vae_file) and exists(rnn_file),\
            "Either vae or mdrnn is untrained."

        vae_state, rnn_state = [
            torch.load(fname, map_location={'cuda:0': str(device)})
            for fname in (vae_file, rnn_file)
        ]

        for m, s in (('VAE', vae_state), ('MDRNN', rnn_state)):
            print("Loading {} at epoch {} "
                  "with test loss {}".format(m, s['epoch'], s['precision']))

        self.vae = VAE(3, LSIZE).to(device)
        self.vae.load_state_dict(vae_state['state_dict'])

        # MDRNNCell
        self.mdrnn = MDRNNCell(LSIZE, ASIZE, RSIZE, 5).to(device)
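        # The saved MDRNN was presumably trained as a full nn.LSTM, whose
        # parameter names carry an '_l0' layer suffix (e.g. 'weight_ih_l0');
        # strip('_l0') below removes those trailing characters so the keys
        # match the MDRNNCell's parameter names.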
        self.mdrnn.load_state_dict(
            {k.strip('_l0'): v
             for k, v in rnn_state['state_dict'].items()})

        self.controller = Controller(LSIZE, RSIZE, ASIZE).to(device)

        # load controller if it was previously saved
        if exists(ctrl_file):
            ctrl_state = torch.load(ctrl_file,
                                    map_location={'cuda:0': str(device)})
            print("Loading Controller with reward {}".format(
                ctrl_state['reward']))
            self.controller.load_state_dict(ctrl_state['state_dict'])

        self.env = gym.make('CarRacing-v0')
        self.device = device

        self.time_limit = time_limit

        self.mdrnn_notcell = MDRNN(LSIZE, ASIZE, RSIZE, 5)
        self.mdrnn_notcell.to(device)
        self.mdrnn_notcell.load_state_dict(rnn_state['state_dict'])
Example 4
    def __init__(self, device, time_limit, discrete_VAE):
        """ Build vae, rnn, controller and environment. """

        self.env = gym.make('CarRacing-v0')

        self.device = device

        self.time_limit = time_limit

        self.discrete_VAE = discrete_VAE

        # Because the representation is discrete, we increase the size of the
        # latent vector. Use a local copy so the module-level LSIZE constant is
        # not shadowed (assigning to LSIZE here would make it an unbound local
        # whenever discrete_VAE is False).
        lsize = 128 if self.discrete_VAE else LSIZE

        self.vae = VAE(3, lsize, 1024)
        self.mdrnn = MDRNNCell(lsize, ASIZE, RSIZE, 5)
        self.controller = Controller(lsize, RSIZE, ASIZE)
Example 5
    for s_id in range(rollouts):
        p_queue.put((s_id, best_guess))

    print("Evaluating...")
    for _ in tqdm(range(rollouts)):
        while r_queue.empty():
            sleep(.1)
        restimates.append(r_queue.get()[1])

    return best_guess, np.mean(restimates), np.std(restimates)

################################################################################
#                           Launch CMA                                         #
################################################################################
controller = Controller(LSIZE, RSIZE, ASIZE)  # dummy instance

# define current best and load parameters
cur_best = None
ctrl_file = join(ctrl_dir, 'best.tar')
print("Attempting to load previous best...")

if exists(ctrl_file):
    state = torch.load(ctrl_file, map_location={'cuda:0': 'cpu'})
    cur_best = - state['reward']
    controller.load_state_dict(state['state_dict'])
    print("Previous best was {}...".format(-cur_best))

parameters = controller.parameters()
es = cma.CMAEvolutionStrategy(flatten_parameters(parameters), 0.1,
                              {'popsize': pop_size})
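The CMA launch above relies on two helpers to move between the controller's tensors and the flat vector CMA-ES optimizes. A minimal sketch of what they could look like (the names flatten_parameters and load_parameters come from the snippet; these bodies are an assumption, not the project's actual implementation):

import numpy as np
import torch


def flatten_parameters(params):
    # concatenate an iterable of parameter tensors into one flat numpy vector
    return torch.cat([p.detach().view(-1) for p in params], dim=0).cpu().numpy()


def load_parameters(flat_params, controller):
    # copy a flat vector (e.g. a CMA-ES candidate) back into the controller
    flat = torch.as_tensor(np.asarray(flat_params), dtype=torch.float32)
    offset = 0
    for p in controller.parameters():
        n = p.numel()
        p.data.copy_(flat[offset:offset + n].view_as(p).to(p.device))
        offset += n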
Example 6
def train_explorer(logdir,
                   epochs=10,
                   n_samples=4,
                   pop_size=4,
                   display=True,
                   max_workers=10):
    results = {}
    results['best'] = []
    # multiprocessing variables
    num_workers = min(max_workers, n_samples * pop_size)
    time_limit = 1000

    # create the tmp dir if it does not exist, and clean it if it does
    tmp_dir = join(logdir, 'tmp_exp')
    if not exists(tmp_dir):
        mkdir(tmp_dir)
    else:
        for fname in listdir(tmp_dir):
            unlink(join(tmp_dir, fname))

    # create the exp dir if it does not exist
    explore_dir = join(logdir, 'explore')
    if not exists(explore_dir):
        mkdir(explore_dir)

    ################################################################################
    #                           Thread routines                                    #
    ################################################################################
    def slave_routine(p_queue, r_queue, e_queue, p_index):
        """ Thread routine.

        Threads interact with p_queue, the parameters queue, r_queue, the result
        queue and e_queue the end queue. They pull parameters from p_queue, execute
        the corresponding rollout, then place the result in r_queue.

        Each parameter has its own unique id. Parameters are pulled as tuples
        (s_id, params) and results are pushed as (s_id, result).  The same
        parameter can appear multiple times in p_queue, displaying the same id
        each time.

        As soon as e_queue is non-empty, the thread terminates.

        When multiple gpus are involved, the assigned gpu is determined by the
        process index p_index (gpu = p_index % n_gpus).

        :args p_queue: queue containing couples (s_id, parameters) to evaluate
        :args r_queue: where to place results (s_id, results)
        :args e_queue: as soon as not empty, terminate
        :args p_index: the process index
        """
        # init routine
        gpu = p_index % max(1, torch.cuda.device_count())  # avoid modulo by zero on CPU-only machines
        device = torch.device(
            'cuda:{}'.format(gpu) if torch.cuda.is_available() else 'cpu')

        # redirect streams
        sys.stdout = open(join(tmp_dir, str(getpid()) + '.out'), 'a')
        sys.stderr = open(join(tmp_dir, str(getpid()) + '.err'), 'a')

        with torch.no_grad():
            r_gen = RolloutGenerator(logdir, device, time_limit)

            while e_queue.empty():
                if p_queue.empty():
                    sleep(.1)
                else:
                    s_id, params = p_queue.get()
                    r_queue.put((s_id, r_gen.rollout(params)))

    ################################################################################
    #                Define queues and start workers                               #
    ################################################################################
    p_queue = Queue()
    r_queue = Queue()
    e_queue = Queue()

    for p_index in range(num_workers):
        Process(target=slave_routine,
                args=(p_queue, r_queue, e_queue, p_index)).start()

    ################################################################################
    #                           Evaluation                                         #
    ################################################################################
    def evaluate(solutions, results, rollouts=100):
        """ Give current controller evaluation.

        Evaluation is minus the cumulative reward averaged over `rollouts` runs.

        :args solutions: CMA set of solutions
        :args results: corresponding results
        :args rollouts: number of rollouts

        :returns: minus averaged cumulative reward
        """
        index_min = np.argmin(results)
        best_guess = solutions[index_min]
        restimates = []

        for s_id in range(rollouts):
            p_queue.put((s_id, best_guess))

        print("Evaluating...")
        for _ in tqdm(range(rollouts)):
            while r_queue.empty():
                sleep(.1)
            restimates.append(r_queue.get()[1])

        return best_guess, np.mean(restimates), np.std(restimates)

    ################################################################################
    #                           Launch CMA                                         #
    ################################################################################
    controller = Controller(LSIZE, RSIZE, ASIZE)  # dummy instance

    # define current best and load parameters
    cur_best = None
    ctrl_file = join(explore_dir, 'best.tar')
    print("Attempting to load previous best...")
    if exists(ctrl_file):
        state = torch.load(ctrl_file, map_location={'cuda:0': 'cpu'})
        cur_best = -state['reward']
        controller.load_state_dict(state['state_dict'])
        print("Previous best was {}...".format(-cur_best))

    parameters = controller.parameters()
    es = cma.CMAEvolutionStrategy(flatten_parameters(parameters), 0.1,
                                  {'popsize': pop_size})

    epoch = 0
    log_step = 3
    while not es.stop():
        if cur_best is not None and -cur_best > target_return:
            print("Already better than target, breaking...")
            break

        r_list = [0] * pop_size  # result list
        solutions = es.ask()

        # push parameters to queue
        for s_id, s in enumerate(solutions):
            for _ in range(n_samples):
                p_queue.put((s_id, s))

        # retrieve results
        if display:
            pbar = tqdm(total=pop_size * n_samples)
        for _ in range(pop_size * n_samples):
            while r_queue.empty():
                sleep(.1)
            r_s_id, r = r_queue.get()
            r_list[r_s_id] += r / n_samples
            if display:
                pbar.update(1)
        if display:
            pbar.close()

        es.tell(solutions, r_list)
        es.disp()

        # evaluation and saving
        if epoch % log_step == log_step - 1:
            best_params, best, std_best = evaluate(solutions, r_list)

            # log the best
            results['best'].append(best)

            print("Current evaluation: {}".format(best))
            if not cur_best or cur_best > best:
                cur_best = best
                print("Saving new best with value {}+-{}...".format(
                    -cur_best, std_best))
                load_parameters(best_params, controller)
                torch.save(
                    {
                        'epoch': epoch,
                        'reward': -cur_best,
                        'state_dict': controller.state_dict()
                    }, join(explore_dir, 'best.tar'))

            if -best > target_return:
                print(
                    "Terminating controller training with value {}...".format(
                        best))
                break

        epoch += 1

    es.result_pretty()
    e_queue.put('EOP')

    return results
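A minimal sketch of how the routine above might be invoked; 'exp_logs' is a hypothetical log directory, and the module-level names used inside train_explorer (Controller, LSIZE, RSIZE, ASIZE, target_return, cma, and so on) are assumed to be defined elsewhere in the file:

if __name__ == '__main__':
    results = train_explorer('exp_logs', epochs=10, n_samples=4,
                             pop_size=4, display=True, max_workers=10)
    print('Best evaluation per logging step:', results['best'])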
Example 7
    def __init__(self,
                 mdir,
                 device,
                 time_limit,
                 number_goals,
                 Forward_model,
                 hiddengoals: bool,
                 curiosityreward: bool,
                 static: bool):
        """ Build vae, rnn, controller and environment. """
        # Loading world model and vae
        vae_file, rnn_file, ctrl_file, Dtild_file, hiddenvae_file = [
            join(mdir, m, 'best.tar')
            for m in ['vae', 'mdrnn', 'ctrl', 'dtild', 'hiddenvae']
        ]

        assert exists(vae_file) and exists(
            rnn_file), "Either vae or mdrnn is untrained."

        vae_state, rnn_state, hiddenvae_state = [
            torch.load(fname, map_location={'cuda:0': str(device)})
            for fname in (vae_file, rnn_file, hiddenvae_file)
        ]

        for m, s in (('VAE', vae_state), ('MDRNN', rnn_state),
                     ('HiddenVAE', hiddenvae_state)):
            print("Loading {} at epoch {} "
                  "with test loss {}".format(m, s['epoch'], s['precision']))

        self.vae = VAE(3, LSIZE).to(device)
        self.vae.load_state_dict(vae_state['state_dict'])

        self.HiddenVAE = HiddenVAE(256, LSIZE).to(device)
        self.HiddenVAE.load_state_dict(hiddenvae_state['state_dict'])

        self.mdrnn = MDRNNCell(LSIZE, ASIZE, RSIZE, 5).to(device)
        self.mdrnn.load_state_dict(
            {k.strip('_l0'): v
             for k, v in rnn_state['state_dict'].items()})

        self.mdrnnBIG = MDRNN(LSIZE, ASIZE, RSIZE, 5).to(device)
        self.mdrnnBIG.load_state_dict(rnn_state["state_dict"])

        self.controller = Controller(256, 256, 6).to(device)

        self.env = gym.make('MiniGrid-MultiRoom-N6-v0')

        self.device = device
        self.number_goals = number_goals
        self.time_limit = time_limit

        self.vae_state = vae_state
        self.rnn_state = rnn_state
        self.hiddenvae_state = hiddenvae_state

        self.hiddengoals = hiddengoals
        self.curiosityreward = curiosityreward
        self.static = static
        self.Forward_model = Forward_model

        self.fmodel = Dtild(32, 256, 1, 32).to(device)
Example 8
    for s_id in range(rollouts):
        p_queue.put((s_id, best_guess))

    print("Evaluating...")
    for _ in tqdm(range(rollouts)):
        while r_queue.empty():
            sleep(.1)
        restimates.append(r_queue.get()[1])

    return best_guess, np.mean(restimates), np.std(restimates)


################################################################################
#                           Launch CMA                                         #
################################################################################
controller = Controller(LSIZE, RSIZE, ASIZE,
                        is_gate=args.is_gate)  # dummy instance

# define current best and load parameters
cur_best = None
ctrl_file = join(ctrl_dir, 'best.tar')
print("Attempting to load previous best...")
if exists(ctrl_file):
    state = torch.load(ctrl_file, map_location={'cuda:0': 'cpu'})
    cur_best = -state['reward']

    # changed so that we can load previous models even if the controller source
    # code has changed (e.g. new parameters were added)
    load_model_safe_(controller, state['state_dict'])
    try:
        print(controller.gates, controller.is_gate)
    except AttributeError:
Example 9
def train_C_given_M(mdrnnCell, latent_dim, hidden_dim, action_dim):

    # Parameters
    num_episode = 1
    batch_size = 1
    learning_rate = 0.01
    gamma = 0.99
    done_threshold = np.log(0.5)

    interim_policy = Controller(latent_dim, hidden_dim, action_dim)
    optimizer = torch.optim.RMSprop(interim_policy.parameters(),
                                    lr=learning_rate)

    # Batch History
    state_pool = []
    action_pool = []
    reward_pool = []
    steps = 0

    for e in range(num_episode):

        # initial latent and hidden states
        z_t = torch.randn(1, LSIZE)
        h_t = 2 * [torch.zeros(1, RSIZE)]

        for t in range(1000):

            # pick action using policy net given z_t, h_t
            mean_a_t = interim_policy(z_t, h_t[0])
            action_policy_std = 0.1
            cov = (action_policy_std ** 2) * torch.eye(action_dim)  # covariance = std^2
            stochastic_policy = MultivariateNormal(loc=mean_a_t,
                                                   covariance_matrix=cov)
            a_t = stochastic_policy.sample()

            mu, sigma, pi, r, d, n_h = mdrnnCell(a_t, z_t, h_t)
            # sample next z_t from N(mu, sigma)
            pi = pi.squeeze()
            mixt = Categorical(torch.exp(pi)).sample().item()

            z_t = mu[:, mixt, :]  # + sigma[:, mixt, :] * torch.randn_like(mu[:, mixt, :])
            h_t = n_h

            reward = -0.1
            if d >= done_threshold:
                done = True
            else:
                done = False

            state_pool.append((z_t, h_t))
            action_pool.append(a_t)
            reward_pool.append(reward)

            steps += 1
            if done:
                break

        # Update policy
        if e > 0 and e % batch_size == 0:

            # Discount reward
            running_add = 0
            for i in reversed(range(steps)):
                if reward_pool[i] == 0:
                    running_add = 0
                else:
                    running_add = running_add * gamma + reward_pool[i]
                    reward_pool[i] = running_add

            # Normalize reward
            reward_mean = np.mean(reward_pool)
            reward_std = np.std(reward_pool)
            for i in range(steps):
                reward_pool[i] = (reward_pool[i] - reward_mean) / reward_std

            # Gradient Descent
            optimizer.zero_grad()

            for i in range(steps):
                z_t, h_t = state_pool[i]
                action = action_pool[i]
                reward = reward_pool[i]

                mean_a_t = interim_policy(z_t, h_t[0])
                action_policy_std = 0.1
                cov = (action_policy_std ** 2) * torch.eye(action_dim)  # covariance = std^2
                stochastic_policy = MultivariateNormal(loc=mean_a_t,
                                                       covariance_matrix=cov)
                loss = -stochastic_policy.log_prob(action) * reward  # negative score function x reward
                # TODO: why do we need to use retain_graph here?
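                # (Likely answer: the stored z_t / h_t tensors all belong to one
                # graph built during the rollout, because h_t chains through
                # mdrnnCell; each backward() here re-traverses parts of that
                # shared graph, so its buffers must be kept for later steps.)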
                loss.backward(retain_graph=True)
                optimizer.step()

            state_pool = []
            action_pool = []
            reward_pool = []
            steps = 0

    return interim_policy
Example 10
    def __init__(self,
                 mdir,
                 device,
                 time_limit,
                 iteration_num=None,
                 video_dir=None):
        """ Build vae, rnn, controller and environment. """
        # Loading world model and vae
        vae_file, rnn_file, ctrl_file = [
            join(mdir, m, "best.tar") for m in ["vae", "mdrnn", "ctrl"]
        ]

        if iteration_num is not None:
            vae_file, rnn_file, ctrl_file = [
                join(mdir, m, "iter_{}".format(iteration_num), "best.tar")
                for m in ["vae", "mdrnn", "ctrl"]
            ]

        assert exists(vae_file) and exists(
            rnn_file), "Either vae or mdrnn is untrained."

        print("\nRollout Generator")

        vae_state, rnn_state = [
            torch.load(fname, map_location={"cuda:0": str(device)})
            for fname in (vae_file, rnn_file)
        ]

        print("Loading VAE from {}".format(vae_file))
        print("Loading RNN from {}".format(rnn_file))
        for m, s in (("VAE", vae_state), ("MDRNN", rnn_state)):
            print("Loading {} at epoch {} "
                  "with test loss {}".format(m, s["epoch"], s["precision"]))

        self.vae = VAE(3, LSIZE).to(device)
        self.vae.load_state_dict(vae_state["state_dict"])

        self.mdrnn = MDRNNCell(LSIZE, ASIZE, RSIZE, 5).to(device)
        self.mdrnn.load_state_dict(
            {k.strip("_l0"): v
             for k, v in rnn_state["state_dict"].items()})

        self.controller = Controller(LSIZE, RSIZE, ASIZE).to(device)

        # load controller if it was previously saved
        if exists(ctrl_file):
            print("Loading Controller from {}".format(ctrl_file))
            ctrl_state = torch.load(ctrl_file,
                                    map_location={"cuda:0": str(device)})
            print("Loading Controller with reward {}".format(
                ctrl_state["reward"]))
            self.controller.load_state_dict(ctrl_state["state_dict"])

        self.env = gym.make("BipedalWalkerHardcore-v2")
        self.device = device

        self.time_limit = time_limit
Example 11
                elif arg == 'fuzzy':
                    controller_fuzzy = True
                    mono = True
                    simulations /= 2
                else:
                    raise GetoptError()
    except GetoptError:
        usage()
        sys.exit(-1)

    #simulations = 5 if len(sys.argv) == 1 else int(sys.argv[1])
    for i in range(int(simulations * 2)):
        # create a controller
        if mono:
            control = FuzzyLogicController(
                log=log) if controller_fuzzy else Controller(log=log)
        else:
            control = Controller(
                log=log) if i % 2 == 0 else FuzzyLogicController(log=log)

        # create North-to-South and West-to-East lanes
        north2south = Lane(control,
                           S=15,
                           D=7,
                           name='North to South',
                           init_state=State.green)
        west2east = Lane(control,
                         S=15,
                         D=7,
                         name='West to East',
                         init_state=State.red)

Example 12
M = buildMemory('weights/2019.12.07/mdn_rnn_weights')
get_hidden = K.function(M.layers[0].input, M.layers[0].output)


# In[ ]:


print(M.summary())


# In[ ]:


controller = Controller(32+256, 3)
controller.set_weights(np.load('./weights/C_weights.npy'))


# $$\text{Controller}: \mathbb R^{288} \rightarrow \mathbb R^3 $$

# In[ ]:


print('controller shape:')


# In[ ]:


print(controller.shape)
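The controller in this example maps the 288-dimensional input (presumably the 32-dim VAE latent concatenated with the 256-dim RNN hidden state) to 3 actions. A minimal sketch of such a controller, assuming a single affine map with weights held in one flat numpy array (the actual Controller/set_weights implementation used above may differ):

import numpy as np


class LinearController:
    # hypothetical stand-in: a single affine map from R^288 to R^3
    def __init__(self, input_dim, action_dim):
        self.shape = (input_dim + 1, action_dim)  # extra row for the bias
        self.W = np.zeros(self.shape)

    def set_weights(self, flat_weights):
        self.W = np.asarray(flat_weights).reshape(self.shape)

    def __call__(self, z, h):
        x = np.concatenate([z, h, [1.0]])  # append a constant bias input
        return np.tanh(x @ self.W)         # three bounded actions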
Example 13
# Fix numeric divergence due to bug in Cudnn
torch.backends.cudnn.benchmark = True

device = torch.device("cuda" if cuda else "cpu")

trained = 0
#model = VAE(3, LSIZE).to(device)
vae_model = VAE(3, LSIZE)
vae_model = torch.nn.DataParallel(vae_model, device_ids=[7])
vae_model.cuda(7)
vae_model.eval()
mdrnn_model = MDRNNCell(LSIZE, ASIZE, RSIZE, 5)
mdrnn_model = torch.nn.DataParallel(mdrnn_model, device_ids=[7])
mdrnn_model.cuda(7)
mdrnn_model.eval()
controller = torch.nn.DataParallel(Controller(LSIZE, RSIZE, ASIZE)).cuda()

vis = visdom.Visdom(env='dream')
image_window = vis.image(
    np.random.rand(RED_SIZE * 10, RED_SIZE * 10),
    opts=dict(title='dream!', caption='dream.'),
)

# locate the dream and vae directories and reload the best vae checkpoint
dream_dir = join(args.logdir, 'dream')
vae_dir = join(args.logdir, 'vae')
reload_file = join(vae_dir, 'best.tar')
state = torch.load(reload_file)
print("Reloading model at epoch {}"
      ", with test error {}".format(state['epoch'], state['precision']))
vae_model.load_state_dict(state['state_dict'])
Example 14
def ctrl_exp_gen_data(rollouts,
                      datadir,
                      logdir,
                      noise_type,
                      device,
                      use_ctrl_exp,
                      exp_prob=.5,
                      randomness_factor=.1):
    """ 
    `randomness_factor` is the multiple of the parameters' current standard
    deviation used as the std of the Gaussian noise applied to them. This is a
    reasonable default; ideally the noise scale would be adapted from parameter
    distances across updates (see the OpenAI parameter-noise exploration work).

    Uses fixed parameters for the vae and mdrnn, though that may change.

    The `if use_ctrl_exp:` branches should eventually be moved into a module of
    their own, or at least made consistent with the explorer code path.
    """
    assert exists(logdir), "The directory does not exist..."
    exp_prob = float(exp_prob)

    env = gym.make("CarRacing-v0")
    seq_len = 1000

    if use_ctrl_exp:
        a_rollout = []

        #### Load controller and explorer
        ctrl_file = join(logdir, 'ctrl', 'best.tar')
        exp_file = join(logdir, 'exp', 'best.tar')

        controller = Controller(LSIZE, RSIZE, ASIZE).to(device)
        explorer = Controller(LSIZE, RSIZE, ASIZE).to(device)

        if exists(ctrl_file):
            ctrl_state = torch.load(ctrl_file,
                                    map_location={'cuda:0': str(device)})
            print("Loading Controller with reward {}".format(
                ctrl_state['reward']))
            controller.load_state_dict(ctrl_state['state_dict'])

        if exists(exp_file):
            exp_state = torch.load(exp_file,
                                   map_location={'cuda:0': str(device)})
            print("Loading Explorer with reward {}".format(
                exp_state['reward']))
            explorer.load_state_dict(exp_state['state_dict'])

        # Make the generators (this is unnecessary, should be organized some other way)
        ctrl_gen = RolloutGeneratorSingle(logdir, device, controller)
        exp_gen = RolloutGeneratorSingle(logdir, device, explorer)

        # for parameter noise exploration
        def update_params_noise(model, randomness_factor):
            """Return a noisy copy of `model` for parameter-space exploration.

            The noise std is the std of the model's current parameters
            multiplied by `randomness_factor`.
            """
            model_new = copy.deepcopy(model)  # needs the stdlib `copy` module imported above
            all_params = np.concatenate(
                [p.detach().cpu().numpy().ravel() for p in model.parameters()])
            print('Parameter mean: ', all_params.mean())
            print('Parameter std: ', all_params.std())
            std = all_params.std() * randomness_factor
            with torch.no_grad():
                for param in model_new.parameters():
                    param.add_(torch.randn_like(param) * std)
            return model_new

    for i in range(rollouts):
        obs = env.reset()  # keep the initial observation for the controller/explorer branch
        env.env.viewer.window.dispatch_events()

        s_rollout = []
        r_rollout = []
        d_rollout = []

        if use_ctrl_exp:
            # randomize the explorer and controller
            explorer_new = update_params_noise(explorer, randomness_factor)
            controller_new = update_params_noise(controller, randomness_factor)

            # initialize the hidden state for the model:
            hidden = [torch.zeros(1, RSIZE).to(device) for _ in range(2)]

        else:
            if noise_type == 'white':
                a_rollout = [env.action_space.sample() for _ in range(seq_len)]
            elif noise_type == 'brown':
                a_rollout = sample_continuous_policy(env.action_space, seq_len,
                                                     1. / 50)

        t = 0
        while True:

            if use_ctrl_exp:
                # explore or exploit:
                if random.uniform(0, 1) < exp_prob:
                    action, obs, hidden = ctrl_gen(obs, hidden)
                else:
                    action, obs, hidden = exp_gen(obs, hidden)
                a_rollout.append(action)
            else:
                action = a_rollout[t]

            t += 1
            s, r, done, _ = env.step(action)
            env.env.viewer.window.dispatch_events()
            s_rollout += [s]
            r_rollout += [r]
            d_rollout += [done]
            if done:
                print("> End of rollout {}, {} frames...".format(
                    i, len(s_rollout)))
                np.savez(join(datadir, 'rollout_{}'.format(i)),
                         observations=np.array(s_rollout),
                         rewards=np.array(r_rollout),
                         actions=np.array(a_rollout),
                         terminals=np.array(d_rollout))
                break