Example #1
    def step_dope(self, action, rollout, model=False, action_list=None):
        '''
        Steps the true environment, using dopamine models. The last environment in the proxy chain is the true environment,
        and has a different step function (it performs model updates and most state saving inside dopamine).
        '''
        if action_list is None:  # avoid a mutable default argument
            action_list = []
        if model:
            reward = self.computeReward(rollout, 1)
            action = self.models.currentModel().forward(
                self.current_state, reward[self.models.option_index])
        if len(self.proxy_chain) > 1:
            state, base_state, done, action_list = self.proxy_chain[-1].step(
                action, model=True, action_list=[action] + action_list)
            # the recursive proxy step returns (raw_state, factored_state) as its second value
            raw_state, factored_state = base_state
        else:
            raw_state, factored_state, done = self.proxy_chain[-1].step(action)
            action_list = [action] + action_list

        if done:
            self.reset_history()
        self.raw_state = (raw_state, factored_state)
        # TODO: implement multiprocessing support
        state, resp = self.stateExtractor.get_state(self.raw_state)
        self.extracted_state = pytorch_model.wrap(
            state, cuda=self.iscuda).unsqueeze(0)
        self.insert_extracted()
        self.insert_changepoint_queue(
            self.cp_state, pytorch_model.wrap(action, cuda=self.iscuda),
            pytorch_model.wrap(resp, cuda=self.iscuda))
        return self.extracted_state, self.raw_state, done, action_list
    def forward(self, x, resp):
        '''
        TODO: make use of time_estimator, link up Q vals and action probs
        TODO: clean up cuda = True to something that is actually true
        TODO: only accepts integer array input states of form (num in batch, num_vals)
        '''
        Qvals = []
        aprobs = []
        for xv in x:  # for each x in the batch, convert state to hash and get Q value
            if len(xv.shape) > 1:
                xv = xv[0]
            hsh = self.hash_function(xv)
            Qval, Aprob = self.get_Qval(hsh)
            Qvals.append(Qval)
            aprobs.append(Aprob)
        Qvals = torch.stack(Qvals, dim=0)
        aprobs = torch.stack(aprobs, dim=0)
        action_probs = pytorch_model.wrap(aprobs, cuda=self.iscuda)
        Q_vals = pytorch_model.wrap(Qvals, cuda=self.iscuda)
        # print(Q_vals.shape, action_probs.shape)
        values = Q_vals.max(dim=1)[0]
        probs = F.softmax(action_probs, dim=1)
        # print("probs", probs)
        log_probs = F.log_softmax(action_probs, dim=1)

        dist_entropy = -(log_probs * probs).sum(-1).mean()

        return values, dist_entropy, probs, Q_vals
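
The forward pass above implements a tabular policy: each integer state vector in the batch is hashed, get_Qval returns that entry's Q-values and action preferences, and the stacked results drive both the greedy value (max over actions) and the softmax action distribution. Neither hash_function nor get_Qval appears in this snippet; below is a minimal sketch of how such a table could be backed by a plain dictionary (the dictionary backing, default initialization, and class name are assumptions, not the repository's implementation).

import torch

class TabularQStore:
    """Sketch of a dictionary-backed Q table matching the calls in forward()."""

    def __init__(self, num_actions, init_q=0.0):
        self.num_actions = num_actions
        self.init_q = init_q
        self.table = {}  # state hash -> (Q-values, action preferences)

    def hash_function(self, xv):
        # integer state vector -> hashable key
        return tuple(int(v) for v in xv.tolist())

    def get_Qval(self, hsh):
        # unseen states receive a default entry so lookups never fail
        if hsh not in self.table:
            self.table[hsh] = (torch.full((self.num_actions,), self.init_q),
                               torch.zeros(self.num_actions))
        return self.table[hsh]
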
 def insert(self,
            reenter,
            extracted_state,
            current_state,
            epsilon,
            done,
            resp,
            action,
            changepoint_state,
            option_param,
            option_no,
            rewards=None,
            returns=None,
            action_probs=None,
            Qvals=None,
            value_preds=None,
            option_agnostic=False):
     if not reenter:
         if self.buffer_filled == self.buffer_steps:
             for i in range(len(self.option_agnostic)):
                 self.option_agnostic[i] = self.option_agnostic[i].roll(
                     -1, 0)
             for i in range(len(self.option_specific)):
                 self.option_specific[i] = self.option_specific[i].roll(
                     -1, 1)
             self.reset_values()
     else:
         self.buffer_filled -= 1  # if reentering, overwrite the same slot; do not reenter on the very first insert
     self.buffer_filled += int(self.buffer_filled < self.buffer_steps)
     # print(self.buffer_filled)
     self.extracted_state[self.buffer_filled - 1].copy_(
         extracted_state.squeeze().detach())
     self.current_state[self.buffer_filled - 1].copy_(
         current_state.squeeze().detach())
     self.resps[self.buffer_filled - 1].copy_(resp.squeeze().detach())
     self.actions[self.buffer_filled - 1].copy_(action.squeeze().detach())
     self.dones[self.buffer_filled - 1].copy_(
         pytorch_model.wrap(int(done), cuda=self.iscuda))
     self.epsilon[self.buffer_filled - 1].copy_(epsilon.squeeze().detach())
     self.changepoint_states[self.buffer_filled - 1].copy_(
         changepoint_state.squeeze().detach())
     self.option_param[self.buffer_filled - 1].copy_(
         option_param.squeeze().detach())
     self.option_num[self.buffer_filled - 1].copy_(
         pytorch_model.wrap(option_no, cuda=self.iscuda))
     if not option_agnostic:
         for oidx in range(self.num_options):
             self.value_preds[oidx, self.buffer_filled - 1].copy_(
                 value_preds[oidx].squeeze().detach())
             self.Qvals[oidx, self.buffer_filled - 1].copy_(
                 Qvals[oidx].squeeze().detach())
             self.action_probs[oidx, self.buffer_filled - 1].copy_(
                 action_probs[oidx].squeeze().detach())
             if rewards is not None:
                 self.rewards[oidx, self.buffer_filled - 1].copy_(
                     rewards[oidx].squeeze().detach())
             if returns is not None:
                 self.returns[oidx, self.buffer_filled - 1].copy_(
                     returns[oidx].squeeze().detach())
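
The insert above maintains a fixed-size rollout buffer: once buffer_filled reaches buffer_steps, every stored tensor is rolled one slot toward the front (option-specific tensors along dimension 1, option-agnostic ones along dimension 0) and the newest transition overwrites the final slot. A stripped-down sketch of the same mechanism with a single state tensor (buffer size and state shape here are made up):

import torch

class RollingBuffer:
    """Minimal fixed-size buffer that rolls old entries out once full."""

    def __init__(self, buffer_steps, state_dim):
        self.buffer_steps = buffer_steps
        self.buffer_filled = 0
        self.states = torch.zeros(buffer_steps, state_dim)

    def insert(self, state):
        if self.buffer_filled == self.buffer_steps:
            # full: shift everything back one slot, freeing the last position
            self.states = self.states.roll(-1, 0)
        self.buffer_filled += int(self.buffer_filled < self.buffer_steps)
        self.states[self.buffer_filled - 1].copy_(state.detach())

buf = RollingBuffer(buffer_steps=4, state_dim=3)
for t in range(6):
    buf.insert(torch.full((3,), float(t)))
# after six inserts the buffer holds the states from steps 2, 3, 4, 5
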
Example #4
 def reset_history(self):
     self.current_state = pytorch_model.wrap(np.zeros(
         (self.num_hist * int(np.prod(self.state_size)), )),
                                             cuda=self.iscuda)
     self.current_resp = pytorch_model.wrap(
         [[0 for i in range(len(self.stateExtractor.fnames))]
          for _ in range(self.num_hist)],
         cuda=self.iscuda).flatten()
Example #5
 def determine_step(self, state, reward):
     '''
     Runs every option's model on the given state and reward, and returns the wrapped list of resulting actions.
     '''
     actions = []
     for i in range(self.num_options):
         actions.append(self.models[i](state, reward))
     actions = pytorch_model.wrap(actions)  # wrap the collected actions instead of discarding the result
     return actions
def generate_soft_dataset(states, resps, true_environment, reward_fns, args):
    pre_load_weights = args.load_weights
    args.load_weights = True
    option_chain = OptionChain(true_environment, args.changepoint_dir,
                               args.train_edge, args)
    print(args.load_weights)
    environments = option_chain.initialize(args)
    proxy_environment = environments.pop(-1)
    proxy_chain = environments
    train_models = proxy_environment.models
    head, tail = get_edge(args.train_edge)
    if len(environments) > 1:
        # proxy environments and the true environment expose different properties
        num_actions = len(environments[-1].reward_fns)
    else:
        num_actions = environments[-1].num_actions
    state_class = GetState(head,
                           state_forms=list(
                               zip(args.state_names, args.state_forms)))
    proxy_environment.initialize(args,
                                 proxy_chain,
                                 reward_fns,
                                 state_class,
                                 behavior_policy=None)

    train_models.initialize(args, len(reward_fns), state_class, num_actions)
    train_models.session(args)
    proxy_environment.duplicate(args)  # assumes that we are loading weights
    args.load_weights = pre_load_weights

    soft_actions = [[] for i in range(train_models.num_options)]
    for oidx in range(train_models.num_options):
        train_models.option_index = oidx
        if args.model_form == 'population':
            train_models.currentModel().use_mean = True
        batch_size = 30  # evaluate the states in fixed-size minibatches
        for i in range(len(states) // batch_size + 1):
            state = states[i * batch_size:(i + 1) * batch_size]
            resp = resps[i * batch_size:(i + 1) * batch_size]
            values, dist_entropy, action_probs, Q_vals = train_models.determine_action(
                pytorch_model.wrap(state, cuda=args.cuda),
                pytorch_model.wrap(resp, cuda=args.cuda))
            # print (action_probs)
            values, action_probs, Q_vals = train_models.get_action(
                values, action_probs, Q_vals)
            soft_actions[oidx] += pytorch_model.unwrap(action_probs).tolist()
    print("soft actions", np.sum(np.array(soft_actions[0]), axis=0))
    for i in range(len(soft_actions)):
        soft_actions[i] = smooth_weight(soft_actions[i], args.weighting_lambda)
    return np.array(soft_actions)
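
smooth_weight is defined elsewhere in the repository; given that it is applied to the collected per-option action probabilities together with args.weighting_lambda, it plausibly blends each distribution with a uniform one, in the spirit of label smoothing. A hedged stand-in under that assumption (not the repository's actual definition):

import numpy as np

def smooth_weight(action_probs, weighting_lambda):
    # assumed behavior: mix every probability row with the uniform distribution
    probs = np.array(action_probs, dtype=np.float64)
    num_actions = probs.shape[-1]
    uniform = np.full_like(probs, 1.0 / num_actions)
    return (1.0 - weighting_lambda) * probs + weighting_lambda * uniform
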
Example #7
 def step(self, action):
     # TODO: action is a tensor, which might not be a safe assumption
     t = time.time()
     raw_state, raw_factor_state, done = self.screen.step(action,
                                                          render=True)
     self.reward = self.screen.reward
     factor_state = self.focus_model.forward(pytorch_model.wrap(
         raw_state, cuda=False).unsqueeze(0).unsqueeze(0),
                                             ret_numpy=True)
     for key in factor_state.keys():
         factor_state[key] *= 84
         factor_state[key] = (np.squeeze(factor_state[key]), (1.0, ))
     factor_state['Action'] = raw_factor_state['Action']
     self.factor_state = factor_state
     mode = 'a' if self.screen.itr != 0 else 'w'  # 'w' creates the file on the first iteration
     # use a context manager so the dump file is closed after every step
     with open(os.path.join(self.save_path, "focus_dumps.txt"),
               mode) as object_dumps:
         for key in factor_state.keys():
             object_dumps.write(
                 key + ":" + " ".join([str(fs) for fs in factor_state[key]]) +
                 "\t")  # TODO: attributes are limited to single floats
         object_dumps.write(
             "\n")  # TODO: recycling does not stop object dumping
     # print("elapsed ", time.time() - t)
     return raw_state, factor_state, done
 def compute_reward(self, states, actions, resps, precomputed=None):
     '''
     states must have at least two entries in the stack, to keep the number of rewards at num_states - 1
     assumes the ball is the last state
     assumed input shape: [state_size = num_stack * traj_dim]
     '''
     rewards = []
     # print(states.shape)
     for last_state, state, action, nextstate in zip(
             states, states[1:], actions, states[2:]):
         corr = state.squeeze()[:2]
         corr = corr - last_state.squeeze()[:2]
         # print(base, corr)
         norm_corr = corr
         if corr.norm() > 0:
             norm_corr = corr / corr.norm()
         r = -1e-2
         if (self.anydir and norm_corr.norm() > self.epsilon) or (
                 self.dir is not None and
             (self.dir - norm_corr).norm() < self.epsilon):
             r = 1
         # print(corr.cpu().numpy(), self.dir.cpu().numpy(), last_state.cpu().numpy(), r)
         # print(state, norm_corr, r)
         # print(corr, self.dir, (self.dir - norm_corr).norm(), r)
         # print(state, -abs(int(state[1])))
         rewards.append(r)
     return pytorch_model.wrap(rewards, cuda=True)
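
The reward loop above pays +1 when the normalized displacement of the tracked object's first two coordinates matches self.dir to within self.epsilon (or whenever the object moves at all when self.anydir is set), and -1e-2 otherwise. A standalone toy version of the per-step rule, assuming the same 2-D position convention:

import torch

def directional_reward(prev_pos, pos, target_dir, epsilon=1e-3):
    """+1 if the normalized step direction matches target_dir, else a small penalty."""
    corr = pos - prev_pos
    if corr.norm() > 0:
        corr = corr / corr.norm()
    return 1.0 if (target_dir - corr).norm() < epsilon else -1e-2

target = torch.tensor([1.0, 0.0])
print(directional_reward(torch.tensor([10.0, 5.0]),
                         torch.tensor([12.0, 5.0]), target))  # 1.0
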
def get_option_rewards(dataset_path, reward_fns, actions,
                       length_constraint=50000, raws=None, dumps=None):
    states, resps, raws, dumps = load_states(reward_fns[0].get_state,
                                             dataset_path,
                                             length_constraint=length_constraint,
                                             raws=raws,
                                             dumps=dumps)
    rewards = []
    for reward_fn in reward_fns:
        reward = reward_fn.compute_reward(pytorch_model.wrap(states, cuda=True), actions, None)
        rewards.append(reward.tolist())
    return rewards
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        args, num_inputs, num_outputs, factor = self.get_args(kwargs)
        # TODO: assumes images of size 84x84
        # TODO: only handles bounds as input, and no object shape. If useful, we would need both
        # TODO: valid input orders: 83, 75, 67, 59, 51, 43, 35
        self.scale = args.scale
        self.period = args.period
        self.order = args.order + 1  # num population repurposed for tile factor
        self.order_vector = []  # shape: [self.order ** 2, 2]
        for j in range(self.order):
            for k in range(self.order):
                self.order_vector.append(
                    [j / (self.order - 1), k / (self.order - 1)])
        self.order_vector = pytorch_model.wrap(self.order_vector,
                                               cuda=args.cuda).detach()

        self.viewsize = int((((self.order - 4) / 4 - 2) / 2 - 2))
        print("insize", self.insize)
        self.conv1 = nn.Conv2d(1, 2 * factor, 8, stride=4)
        self.conv2 = nn.Conv2d(2 * factor, 4 * factor, 4, stride=2)
        self.conv3 = nn.Conv2d(4 * factor, 8 * factor, 3, stride=1)
        self.linear1 = nn.Linear(8 * factor * self.viewsize * self.viewsize,
                                 self.insize)
        self.layers[-4] = self.conv1
        self.layers[-3] = self.conv2
        self.layers[-2] = self.conv3
        self.layers[-1] = self.linear1
        self.reset_parameters()
 def take_action(self, probs, q_vals):
     action = sample_actions(probs, deterministic=False)
     if np.random.rand() < self.epsilon:
         action = pytorch_model.wrap(np.random.randint(self.num_outputs,
                                                       size=probs.shape[0]),
                                     cuda=True)
     return action
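
sample_actions is imported from elsewhere in the repository; a minimal stand-in consistent with how it is called here (row-wise sampling from the probability matrix, or a greedy argmax when deterministic=True) might look like this:

import torch

def sample_actions(probs, deterministic=False):
    # probs: (batch, num_actions) rows that sum to 1
    if deterministic:
        return probs.argmax(dim=1)
    return torch.multinomial(probs, num_samples=1).squeeze(1)

The epsilon-greedy wrapper above then overrides the sampled actions with uniformly random ones with probability self.epsilon.
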
 def compute_reward(self, states, actions, resps):
     trajectory = pytorch_model.unwrap(states[:-1, :self.traj_dim])
     saliency_trajectory = pytorch_model.unwrap(states[:-1, self.traj_dim:])
     # print("states shape", trajectory.shape, saliency_trajectory.shape)
     assignments, cps = self.model.get_mode(trajectory, saliency_trajectory)
     rewards = []
     # print(assignments, cps)
     rewarded = False
     for asmt in assignments:
         # if asmt == self.desired_mode:
         #### DANGEROUS LINE ####
         if asmt == self.desired_mode and not rewarded:
             rewards.append(1)
             rewarded = True
         else:
             rewards.append(0)
     rewards.append(0)  # match the number of changepoints
     full_rewards = []
     lcp = 0
     lr = 0
     cps.append(len(trajectory))
     # print(cps, rewards)
     for cp, r in zip(cps, rewards):
         if self.seg_reward:  # reward copied over all time steps
             full_rewards += [r] * (cp - lcp)
         else:
             if r == 1 and cp == 0:
                 r = 0
             full_rewards += [0] * (cp - lcp - 1) + [r]
         lcp = cp
         lr = r
     # print(rewards, cps, full_rewards)
     return pytorch_model.wrap(np.array(full_rewards), cuda=self.cuda)
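
The second half of the function converts per-segment rewards into per-timestep rewards: with self.seg_reward the segment's reward is copied across every step of the segment, otherwise it is placed only at the segment's final step. A simplified sketch of that conversion (it omits the cp == 0 guard of the original):

def spread_segment_rewards(changepoints, seg_rewards, seg_reward=False):
    """Expand one reward per changepoint interval into one reward per timestep."""
    full_rewards, lcp = [], 0
    for cp, r in zip(changepoints, seg_rewards):
        if seg_reward:
            full_rewards += [r] * (cp - lcp)            # copy over the whole segment
        else:
            full_rewards += [0] * (cp - lcp - 1) + [r]  # only at the segment boundary
        lcp = cp
    return full_rewards

print(spread_segment_rewards([3, 7, 10], [0, 1, 0]))
# [0, 0, 0, 0, 0, 0, 1, 0, 0, 0]
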
 def compute_reward(self, states, actions, resps, precomputed=None):
     '''
     states must have at least two entries in the stack, to keep the number of rewards at num_states - 1
     assumes the ball is the last state
     assumed input shape: [state_size = num_stack * traj_dim]
     '''
     rewards = []
     # print(states.shape)
     # start = time.time()
     for last_state, state, action, nextstate in zip(
             states, states[1:], actions, states[2:]):
         last_state = last_state.squeeze()
         state = state.squeeze()
         nextstate = nextstate.squeeze()
         state_first = last_state[:2]
         state_second = state[:2]
         proximity = state[:2] - state[-2:]
         state_third = nextstate[:2]
         # s1 = time.time()
         # print("separate ", s1 - start)
         # print(state_second.shape, state.shape, state_first.shape)
         v1 = state_second - state_first
         v2 = state_third - state_second
         # print(state_first, state_second, state_third)
         rewarded = False
         if v1[0] > 0 and state_second[0] > 65:  # was moving down, below the blocks
             if torch.norm(v2 - self.desired_vel) == 0:
                 rewards.append(1)
                 rewarded = True
             else:
                 for v in self.desired_vels:
                     if torch.norm(v2 - v) == 0:
                         # print ("REWARD", v1, v2)
                         if self.anybounce:
                             rewards.append(1)
                         else:
                             rewards.append(0.25)
                         rewarded = True
         # s2 = time.time()
         # print("rew ", s1 - s2)
         if not rewarded:
             if self.form == 'dense':
                 # rewards.append(-abs(proximity[1] / (proximity[0] + .1) * .1))
                 rewards.append(-abs(proximity[0] + proximity[1]) * 0.001)
             elif self.form.find('xdense') != -1:  # elif so the 'dense' case does not also append in the else branch
                 if proximity[0] == 3 and self.form.find('neg') != -1:
                     rewards.append(-1)
                 else:
                     # rewards.append(-abs(proximity[1] / (proximity[0] + .1) * .1))
                     rewards.append(-abs(proximity[1]) * 0.001)
             else:
                 # print(state, proximity[0])
                 if proximity[0] > 3 and self.form.find('neg') != -1:
                     rewards.append(-1)
                 else:
                     rewards.append(0)
         # print("prewrap ", time.time() - s2)
     return pytorch_model.wrap(rewards, cuda=self.cuda)
def Q_criteria(models, values, dist_entropy, action_probs, Q_vals, optimizer,
               true_values, targets):
    # we should probably include the criteria
    # print(true_values, Q_vals.shape, pytorch_model.wrap(targets, cuda=True).squeeze().long().shape)
    # print(pytorch_model.wrap(targets, cuda=True).squeeze().long())
    # print(Q_vals.gather(1, pytorch_model.wrap(targets, cuda=True).unsqueeze(1).long()))
    # squeeze the gathered column so the difference stays elementwise rather than broadcasting to (N, N)
    loss = (
        Q_vals.gather(
            1,
            pytorch_model.wrap(targets, cuda=True).unsqueeze(1).long()).squeeze(1) -
        pytorch_model.wrap(true_values, cuda=True).squeeze()).pow(2).mean()
    # for optimizer in optimizers:
    optimizer.zero_grad()
    loss.backward()
    # for optimizer in optimizers:
    optimizer.step()
    return loss
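
Q_criteria is a plain squared error between the Q-value of the supervised target action (selected with gather) and the provided target value. A toy example of the tensor plumbing, with made-up values:

import torch

Q_vals = torch.tensor([[0.1, 0.9], [0.4, 0.2]], requires_grad=True)
targets = torch.tensor([1, 0])           # action indices to supervise
true_values = torch.tensor([1.0, 0.0])   # regression targets for those actions

chosen = Q_vals.gather(1, targets.unsqueeze(1)).squeeze(1)  # Q(s, a_target) per row
loss = (chosen - true_values).pow(2).mean()
loss.backward()
print(loss.item())  # mean of (0.9 - 1.0)^2 and (0.4 - 0.0)^2 = 0.085
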
 def get_trajectories(self, full_states):
     states = []
     resps = []
     for state in full_states:
         state, resp = self.state_class.get_state(state)
         states.append(state)
         resps.append(resp)
     return pytorch_model.wrap(np.stack(states), cuda=self.cuda)
 def __init__(self, args):
     super().__init__(None, args)
     self.queue_len = args.changepoint_queue_len
     self.rewards = pytorch_model.wrap(np.array(
         [0 for i in range(args.changepoint_queue_len)]),
                                       cuda=args.cuda).detach()
     self.rewards.requires_grad = False
     self.reward_filled = 0
     self.iscuda = args.cuda
    def generate_training_set(self,
                              states,
                              models,
                              changepoints,
                              match=False,
                              window=-1):
        trajectory = states[:, :self.traj_dim]
        saliency_trajectory = states[:, self.traj_dim:]

        # trajectory = states[:-1,:self.traj_dim]
        # saliency_trajectory = states[:-1,self.traj_dim:]
        assignments, changepoints = self.model.get_mode(
            trajectory, saliency_trajectory, models, changepoints)
        self.min = np.min(trajectory, axis=0)
        self.max = np.max(trajectory, axis=0)
        lcp, cp, ncp = changepoints[0], changepoints[1], changepoints[2]
        asmts = []
        for i in range(3, len(changepoints) - 1):
            # print(assignments[i-1],trajectory[lcp:cp+1].squeeze())
            asmts.append(
                (assignments[i - 3], trajectory[lcp:cp + 1],
                 trajectory[cp + 1:ncp], saliency_trajectory[lcp:cp + 1],
                 saliency_trajectory[cp + 1:ncp]))
            lcp, cp, ncp = cp, ncp, changepoints[i]
        asmts.append((assignments[i - 2], trajectory[lcp:cp + 1],
                      trajectory[cp + 1:ncp], saliency_trajectory[lcp:cp + 1],
                      saliency_trajectory[cp + 1:ncp]))
        if ncp != len(trajectory):
            lcp, cp, ncp = cp, ncp, len(trajectory)
            asmts.append(
                (assignments[i - 1], trajectory[lcp:cp + 1],
                 trajectory[cp + 1:ncp], saliency_trajectory[lcp:cp + 1],
                 saliency_trajectory[cp + 1:ncp]))
        self.modes = list(range(self.model.determiner.num_mappings))
        mode_data = {m: [] for m in range(self.model.determiner.num_mappings)}
        for asmt, databefore, dataafter, corrbefore, corrafter in asmts:
            if window < 0:
                data_use = databefore
                if match:
                    other_data = corrbefore
                    # print(data_use.shape, other_data.shape)
                    data_use = np.concatenate((data_use, other_data), axis=1)
            else:
                data_use = np.concatenate(
                    (databefore[-window:], dataafter[:window + 1]), axis=0)
                if match:
                    other_data = corrbefore[-window:] + corrafter[:window]
                    data_use = np.concatenate((data_use, other_data), axis=0)
            if asmt != -1:
                mode_data[asmt] += self.form_batch(data_use)
        total = 0
        for asmt in mode_data.keys():
            mode_data[asmt] = pytorch_model.wrap(mode_data[asmt])
            total += len(mode_data[asmt])  # count the pairs in each mode, not the number of modes
        self.pairs = mode_data
        arr = [v.squeeze() for v in self.pairs.values()]
        return total
 def __init__(self, **kwargs):
     super().__init__(**kwargs)
     args, num_inputs, num_outputs, factor = self.get_args(kwargs)
     self.l1 = nn.Linear(self.basis_size, args.num_population)
     self.value_bounds = args.value_bounds
     self.num_value_atoms = args.num_value_atoms
     self.dz = (self.value_bounds[1] - self.value_bounds[0]) / (
         self.num_value_atoms - 1)
     self.value_support = pytorch_model.wrap(
         [self.value_bounds[0] + (i * self.dz) for i in range(self.num_value_atoms)],
         cuda=args.cuda)
     self.value_support.requires_grad = False
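
The constructor builds a fixed support of num_value_atoms evenly spaced values over value_bounds, the usual construction for a categorical (C51-style) value distribution; a scalar value estimate is then the probability-weighted sum over that support. A small sketch of that relationship (the bounds, atom count, and random distribution are made up):

import torch

value_bounds = (-1.0, 1.0)
num_value_atoms = 5
dz = (value_bounds[1] - value_bounds[0]) / (num_value_atoms - 1)
value_support = torch.tensor(
    [value_bounds[0] + i * dz for i in range(num_value_atoms)])
# tensor([-1.0, -0.5, 0.0, 0.5, 1.0])

atom_probs = torch.softmax(torch.randn(num_value_atoms), dim=0)  # a predicted distribution
expected_value = (atom_probs * value_support).sum()
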
 def __init__(self, dim, mode, hist, name):
     super(LDSlearner, self).__init__()
     self.name = name
     self.As = nn.ModuleList([nn.Linear(dim, dim) for _ in range(hist - 1)])
     self.dim = dim  # dimension of a single state
     self.mode = mode
     self.hist = hist  # number of states in the reward function
     self.variance = pytorch_model.wrap([-1 for i in range(dim)])
     self.is_cuda = False
def supervised_criteria(models, values, dist_entropy, action_probs, Q_vals,
                        optimizer, true_values):
    loss = F.binary_cross_entropy(
        action_probs.squeeze(),
        pytorch_model.wrap(true_values, cuda=True).squeeze())  # TODO: cuda support required
    # entropy penalty (weight .01) pushing the predicted action distributions toward confidence
    loss += -(action_probs.squeeze() *
              torch.log(action_probs.squeeze() + 1e-10)).sum(dim=1).mean() * .01
    # print(action_probs[:5], true_values[:5], loss)
    optimizer.zero_grad()  # clear stale gradients before the backward pass
    loss.backward()
    optimizer.step()
    return loss
Example #21
 def getState(self):
     raw_state, raw_factor_state = self.screen.getState()
     if self.factor_state is None:
         factor_state = self.focus_model.forward(pytorch_model.wrap(
             raw_state, cuda=False).unsqueeze(0).unsqueeze(0),
                                                 ret_numpy=True)
         for key in factor_state.keys():
             factor_state[key] *= 84
             factor_state[key] = (np.squeeze(factor_state[key]), (1.0, ))
         factor_state['Action'] = raw_factor_state['Action']
         self.factor_state = factor_state
     factor_state = self.factor_state
     return raw_state, factor_state
    def compute_reward(self, states, actions):
        '''
        TODO: add support for multiple processes
        possibly make this non-iterative?
        '''
        rewards = []
        for state, action, nextstate in zip(states, actions, states[1:]):
            # print(state)
            if state - nextstate == 0:
                rewards.append(2)
            else:
                rewards.append(-1)
        return pytorch_model.wrap(rewards, cuda=True)
 def __init__(self, args, direc=0):
     super().__init__(None, args)
     self.traj_dim = 2  # SET THIS
     self.head, self.tail = get_edge(args.train_edge)
     self.name = args.reward_form
     self.anydir = direc == -1
     self.dir = None
     # map the direction index to a fixed displacement vector (direc == -1 means any direction counts)
     direction_vectors = {
         0: np.array([0, 0]),
         1: np.array([0, -1]),
         2: np.array([0, 1]),
         3: np.array([-1, 0]),
         4: np.array([1, 0])
     }
     if direc in direction_vectors:
         self.dir = pytorch_model.wrap(direction_vectors[direc], cuda=args.cuda)
         self.dir.requires_grad = False
     self.epsilon = 1e-3
    def compute_reward(self, states, actions, resps, precomputed=None):
        '''
        TODO: add support for multiple processes
        possibly make this non-iterative?
        '''
        rewards = []
        for state, action, nextstate in zip(states, actions, states[1:]):
            # print(state)
            if np.linalg.norm(state - self.target) == 0:
                rewards.append(1)
            else:
                rewards.append(-0.01)
        return pytorch_model.wrap(rewards, cuda=True)
    def __init__(self, vel, args):
        super().__init__(None, args)
        self.name = "Paddle->Ball"
        self.head, self.tail = "Ball", "Paddle"

        self.anybounce = False
        self.desired_vels = [
            pytorch_model.wrap([-2., -1.], cuda=args.cuda),
            pytorch_model.wrap([-1., -1.], cuda=args.cuda),
            pytorch_model.wrap([-1., 1.], cuda=args.cuda),
            pytorch_model.wrap([-2., 1.], cuda=args.cuda)
        ]
        if vel == -1:
            self.anybounce = True
        # select the target bounce velocity by index; out-of-range values keep the first entry
        self.desired_vel = (self.desired_vels[vel]
                            if 0 <= vel < len(self.desired_vels) else
                            self.desired_vels[0])
        self.form = args.reward_form
Example #26
def construct_tile_order(minmax, normalize, order):
    minvs, maxvs = minmax
    order_vectors = []
    for minv, maxv in zip(minvs, maxvs):
        order_vector = []
        # TODO: assumes integer differences between states, fix?
        numv = min(order, int(pytorch_model.unwrap(torch.ceil(maxv - minv) + 1)))
        for i in range(numv):
            if not normalize:
                order_vector.append(minv + i * (maxv - minv) / max(numv - 1, 1))
            else:
                order_vector.append(i / max(numv - 1, 1))
        order_vectors.append(pytorch_model.wrap(np.array(order_vector)).detach())
    for vec in order_vectors:
        vec.requires_grad = False
    return order_vectors
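
Each returned vector holds at most order evenly spaced tile centers between one dimension's observed minimum and maximum (or within [0, 1] when normalize is set). A usage sketch with made-up bounds for a 2-D state, assuming the repository's pytorch_model wrapper is importable:

import torch

# hypothetical per-dimension bounds for a 2-D state
minvs = torch.tensor([0.0, 4.0])
maxvs = torch.tensor([84.0, 8.0])

vectors = construct_tile_order((minvs, maxvs), normalize=False, order=5)
# vectors[0]: tensor([ 0., 21., 42., 63., 84.])  -- five centers spanning [0, 84]
# vectors[1]: tensor([4., 5., 6., 7., 8.])       -- five centers spanning [4, 8]
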
 def take_action(self, probs, q_vals):
     action = -1
     while action == -1:
         try:
             action = int(input(""))
         except ValueError as e:
             continue
         if action > self.num_outputs - 1:
             action = -1
     action = torch.tensor([action])
     if np.random.rand() < self.epsilon:
         action = pytorch_model.wrap(np.random.randint(self.num_outputs,
                                                       size=probs.shape[0]),
                                     cuda=True)
     return action
Example #28
 def get_trajectories(self, full_states):
     # print(self.head)
     obj_dumps = [s[1] for s in full_states]
     trajectory = get_individual_data(self.head, obj_dumps, pos_val_hash=1)
     # TODO: automatically determine if correlate pos_val_hash is 1 or 2
     # TODO: multiple tail support
     # TODO: Separation of Interference and Contingent objects
     if self.tail[0] == "Action":
         # print(obj_dumps, self.tail[0])
         merged = trajectory
         # correlate_trajectory = get_individual_data(self.tail[0], obj_dumps, pos_val_hash=2)
     else:
         correlate_trajectory = get_individual_data(self.tail[0], obj_dumps, pos_val_hash=1)
         merged = np.concatenate([trajectory, correlate_trajectory], axis=1)
         # print(pytorch_model.wrap(merged))
     return pytorch_model.wrap(merged).cuda()
 def __init__(self, **kwargs):
     super().__init__(**kwargs)
     '''
     factor is the order
     layers defines the variate (1 = univariate, 2 = paired, 3=all)
     object_extractors[0] is the current object
     the remainder are any correlate objects, with relative computations 
     computations are relative to pre_extracted state (just getters)
     '''
     self.order_vector = []
     for i in range(self.order):
         self.order_vector.append(np.pi * 2 * i / self.period)
     self.order_vector = pytorch_model.wrap(np.array(self.order_vector))
     self.order_vector.requires_grad = False
     self.train()
     self.reset_parameters()
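
The order vector built here is a set of angular frequencies 2*pi*i/period for i < order. This snippet does not show how it is consumed; one common use of such a frequency vector is to expand each (normalized) state component into cosine features, as in a Fourier basis. A hedged sketch of that idea (not necessarily how this class uses it):

import numpy as np

def fourier_features(x, order, period):
    # cosine features at multiples of 2*pi/period, one row per state component
    freqs = np.array([2 * np.pi * i / period for i in range(order)])
    return np.cos(np.outer(x, freqs)).flatten()

print(fourier_features(np.array([0.25, 0.5]), order=3, period=1.0).shape)  # (6,)
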
 def getState(self):
     raw_state = self.current_raw
     factor_state = {'Action': self.current_action}
     if self.factor_state is None:
         if self.focus_model is not None:
             factor_state = self.focus_model.forward(pytorch_model.wrap(
                 raw_state, cuda=True).unsqueeze(0).unsqueeze(0),
                                                     ret_numpy=True)
             for key in factor_state.keys():
                 factor_state[key] *= 84
                 factor_state[key] = (np.squeeze(factor_state[key]),
                                      (1.0, ))
         self.factor_state = factor_state
     else:
         factor_state = self.factor_state
     return raw_state, factor_state