def step_dope(self, action, rollout, model=False, action_list=[]):
    '''
    steps the true environment, using dopamine models. The last environment in the
    proxy chain is the true environment, and has a different step function
    (performs model updates and most state saving inside dopamine)
    '''
    if model:
        reward = self.computeReward(rollout, 1)
        action = self.models.currentModel().forward(
            self.current_state, reward[self.models.option_index])
    if len(self.proxy_chain) > 1:
        state, base_state, done, action_list = self.proxy_chain[-1].step(
            action, model=True, action_list=[action] + action_list)
    else:
        raw_state, factored_state, done = self.proxy_chain[-1].step(action)
        action_list = [action] + action_list
    if done:
        self.reset_history()
    self.raw_state = (raw_state, factored_state)
    # TODO: implement multiprocessing support
    state, resp = self.stateExtractor.get_state(self.raw_state)
    self.extracted_state = pytorch_model.wrap(
        state, cuda=self.iscuda).unsqueeze(0)
    self.insert_extracted()
    self.insert_changepoint_queue(
        self.cp_state,
        pytorch_model.wrap(action, cuda=self.iscuda),
        pytorch_model.wrap(resp, cuda=self.iscuda))
    return self.extracted_state, self.raw_state, done, action_list
def forward(self, x, resp):
    '''
    TODO: make use of time_estimator, link up Q vals and action probs
    TODO: clean up cuda=True to something that is actually true
    TODO: only accepts integer array input states of form (num in batch, num_vals)
    '''
    Qvals = []
    aprobs = []
    for xv in x:  # for each x in the batch, convert state to hash and get Q value
        if len(xv.shape) > 1:
            xv = xv[0]
        hsh = self.hash_function(xv)
        Qval, Aprob = self.get_Qval(hsh)
        Qvals.append(Qval)
        aprobs.append(Aprob)
    Qvals = torch.stack(Qvals, dim=0)
    aprobs = torch.stack(aprobs, dim=0)
    action_probs = pytorch_model.wrap(aprobs, cuda=self.iscuda)
    Q_vals = pytorch_model.wrap(Qvals, cuda=self.iscuda)
    values = Q_vals.max(dim=1)[0]
    probs = F.softmax(action_probs, dim=1)
    log_probs = F.log_softmax(action_probs, dim=1)
    dist_entropy = -(log_probs * probs).sum(-1).mean()
    return values, dist_entropy, probs, Q_vals
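# Usage sketch (not from the original module): a minimal, standalone illustration of
# how the quantities returned by forward() relate to a (batch, num_actions) block of
# Q-values: values take the per-state max, probs/log_probs come from a softmax over
# the same scores, and the entropy is averaged over the batch.
import torch
import torch.nn.functional as F

def summarize_q_rows(q_rows):
    # q_rows: (batch, num_actions) tensor of tabular Q-values
    values = q_rows.max(dim=1)[0]
    probs = F.softmax(q_rows, dim=1)
    log_probs = F.log_softmax(q_rows, dim=1)
    dist_entropy = -(log_probs * probs).sum(-1).mean()
    return values, dist_entropy, probs

# example: summarize_q_rows(torch.tensor([[0.1, 0.9], [0.5, 0.5]]))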
def insert(self, reenter, extracted_state, current_state, epsilon, done, resp,
           action, changepoint_state, option_param, option_no, rewards=None,
           returns=None, action_probs=None, Qvals=None, value_preds=None,
           option_agnostic=False):
    if not reenter:
        if self.buffer_filled == self.buffer_steps:
            # buffer is full: roll every stored tensor back one slot
            for i in range(len(self.option_agnostic)):
                self.option_agnostic[i] = self.option_agnostic[i].roll(-1, 0)
            for i in range(len(self.option_specific)):
                self.option_specific[i] = self.option_specific[i].roll(-1, 1)
            self.reset_values()
    else:
        # if reentering, subtract 1 so that we insert to the same location. Don't reenter at very first
        self.buffer_filled -= 1
    self.buffer_filled += int(self.buffer_filled < self.buffer_steps)
    self.extracted_state[self.buffer_filled - 1].copy_(extracted_state.squeeze().detach())
    self.current_state[self.buffer_filled - 1].copy_(current_state.squeeze().detach())
    self.resps[self.buffer_filled - 1].copy_(resp.squeeze().detach())
    self.actions[self.buffer_filled - 1].copy_(action.squeeze().detach())
    self.dones[self.buffer_filled - 1].copy_(pytorch_model.wrap(int(done), cuda=self.iscuda))
    self.epsilon[self.buffer_filled - 1].copy_(epsilon.squeeze().detach())
    self.changepoint_states[self.buffer_filled - 1].copy_(changepoint_state.squeeze().detach())
    self.option_param[self.buffer_filled - 1].copy_(option_param.squeeze().detach())
    self.option_num[self.buffer_filled - 1].copy_(pytorch_model.wrap(option_no, cuda=self.iscuda))
    if not option_agnostic:
        for oidx in range(self.num_options):
            self.value_preds[oidx, self.buffer_filled - 1].copy_(value_preds[oidx].squeeze().detach())
            self.Qvals[oidx, self.buffer_filled - 1].copy_(Qvals[oidx].squeeze().detach())
            self.action_probs[oidx, self.buffer_filled - 1].copy_(action_probs[oidx].squeeze().detach())
            if rewards is not None:
                self.rewards[oidx, self.buffer_filled - 1].copy_(rewards[oidx].squeeze().detach())
            if returns is not None:
                self.returns[oidx, self.buffer_filled - 1].copy_(returns[oidx].squeeze().detach())
def reset_history(self):
    self.current_state = pytorch_model.wrap(
        np.zeros((self.num_hist * int(np.prod(self.state_size)), )),
        cuda=self.iscuda)
    self.current_resp = pytorch_model.wrap(
        [[0 for i in range(len(self.stateExtractor.fnames))]
         for _ in range(self.num_hist)],
        cuda=self.iscuda).flatten()
def determine_step(self, state, reward):
    '''
    Queries every option model with the given state and reward and returns
    the resulting per-option actions.
    '''
    actions = []
    for i in range(self.num_options):
        actions.append(self.models[i](state, reward))
    actions = pytorch_model.wrap(actions)  # wrap the collected per-option actions
    return actions
def generate_soft_dataset(states, resps, true_environment, reward_fns, args):
    pre_load_weights = args.load_weights
    args.load_weights = True
    option_chain = OptionChain(true_environment, args.changepoint_dir,
                               args.train_edge, args)
    print(args.load_weights)
    environments = option_chain.initialize(args)
    proxy_environment = environments.pop(-1)
    proxy_chain = environments
    train_models = proxy_environment.models
    head, tail = get_edge(args.train_edge)
    if len(environments) > 1:
        # there is a difference in the properties of a proxy environment and the true environment
        num_actions = len(environments[-1].reward_fns)
    else:
        num_actions = environments[-1].num_actions
    state_class = GetState(head,
                           state_forms=list(zip(args.state_names, args.state_forms)))
    proxy_environment.initialize(args, proxy_chain, reward_fns, state_class,
                                 behavior_policy=None)
    train_models.initialize(args, len(reward_fns), state_class, num_actions)
    train_models.session(args)
    proxy_environment.duplicate(args)  # assumes that we are loading weights
    args.load_weights = pre_load_weights
    soft_actions = [[] for i in range(train_models.num_options)]
    for oidx in range(train_models.num_options):
        train_models.option_index = oidx
        if args.model_form == 'population':
            train_models.currentModel().use_mean = True
        for i in range(len(states) // 30 + 1):  # evaluate in mini-batches of 30 states
            state = states[i * 30:(i + 1) * 30]
            resp = resps[i * 30:(i + 1) * 30]
            values, dist_entropy, action_probs, Q_vals = train_models.determine_action(
                pytorch_model.wrap(state, cuda=args.cuda),
                pytorch_model.wrap(resp, cuda=args.cuda))
            values, action_probs, Q_vals = train_models.get_action(values, action_probs, Q_vals)
            soft_actions[oidx] += pytorch_model.unwrap(action_probs).tolist()
    print("soft actions", np.sum(np.array(soft_actions[0]), axis=0))
    for i in range(len(soft_actions)):
        soft_actions[i] = smooth_weight(soft_actions[i], args.weighting_lambda)
    return np.array(soft_actions)
def step(self, action):
    # TODO: action is a tensor, which might not be a safe assumption
    raw_state, raw_factor_state, done = self.screen.step(action, render=True)
    self.reward = self.screen.reward
    factor_state = self.focus_model.forward(
        pytorch_model.wrap(raw_state, cuda=False).unsqueeze(0).unsqueeze(0),
        ret_numpy=True)
    for key in factor_state.keys():
        factor_state[key] *= 84
        factor_state[key] = (np.squeeze(factor_state[key]), (1.0, ))
    factor_state['Action'] = raw_factor_state['Action']
    self.factor_state = factor_state
    # append after the first iteration; 'w' creates the file if it does not exist
    mode = 'a' if self.screen.itr != 0 else 'w'
    with open(os.path.join(self.save_path, "focus_dumps.txt"), mode) as object_dumps:
        for key in factor_state.keys():
            # TODO: attributes are limited to single floats
            object_dumps.write(
                key + ":" + " ".join([str(fs) for fs in factor_state[key]]) + "\t")
        object_dumps.write("\n")  # TODO: recycling does not stop object dumping
    return raw_state, factor_state, done
def compute_reward(self, states, actions, resps, precomputed=None):
    '''
    states must have at least two entries in the stack, so the number of rewards
    stays at num_states - 1. Assumes the ball is the last state.
    Assumed input shape: [state_size = num_stack * traj_dim]
    '''
    rewards = []
    for last_state, state, action, nextstate in zip(states, states[1:], actions, states[2:]):
        corr = state.squeeze()[:2] - last_state.squeeze()[:2]
        norm_corr = corr
        if corr.norm() > 0:
            norm_corr = corr / corr.norm()
        r = -1e-2
        if (self.anydir and norm_corr.norm() > self.epsilon) or (
                self.dir is not None and (self.dir - norm_corr).norm() < self.epsilon):
            r = 1
        rewards.append(r)
    return pytorch_model.wrap(rewards, cuda=True)
def get_option_rewards(dataset_path, reward_fns, actions, length_constraint=50000,
                       raws=None, dumps=None):
    states, resps, raws, dumps = load_states(reward_fns[0].get_state, dataset_path,
                                             length_constraint=length_constraint,
                                             raws=raws, dumps=dumps)
    rewards = []
    for reward_fn in reward_fns:
        reward = reward_fn.compute_reward(pytorch_model.wrap(states, cuda=True), actions, None)
        rewards.append(reward.tolist())
    return rewards
def __init__(self, **kwargs):
    super().__init__(**kwargs)
    args, num_inputs, num_outputs, factor = self.get_args(kwargs)
    # TODO: assumes images of size 84x84
    # TODO: only handles bounds as input, and no object shape. If useful, we would need both
    # TODO: valid input orders: 83, 75, 67, 59, 51, 43, 35
    self.scale = args.scale
    self.period = args.period
    self.order = args.order + 1  # num population repurposed for tile factor
    self.order_vector = []  # shape: [self.order ** 2, 2]
    for j in range(self.order):
        for k in range(self.order):
            self.order_vector.append([j / (self.order - 1), k / (self.order - 1)])
    self.order_vector = pytorch_model.wrap(self.order_vector, cuda=args.cuda).detach()
    self.viewsize = int((((self.order - 4) / 4 - 2) / 2 - 2))
    print("insize", self.insize)
    self.conv1 = nn.Conv2d(1, 2 * factor, 8, stride=4)
    self.conv2 = nn.Conv2d(2 * factor, 4 * factor, 4, stride=2)
    self.conv3 = nn.Conv2d(4 * factor, 8 * factor, 3, stride=1)
    self.linear1 = nn.Linear(8 * factor * self.viewsize * self.viewsize, self.insize)
    self.layers[-4] = self.conv1
    self.layers[-3] = self.conv2
    self.layers[-2] = self.conv3
    self.layers[-1] = self.linear1
    self.reset_parameters()
def take_action(self, probs, q_vals):
    action = sample_actions(probs, deterministic=False)
    if np.random.rand() < self.epsilon:
        action = pytorch_model.wrap(
            np.random.randint(self.num_outputs, size=probs.shape[0]), cuda=True)
    return action
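# Standalone sketch (assumed behavior, not the original sample_actions helper) of the
# epsilon-greedy rule above: sample from the given probabilities, then with probability
# epsilon replace the whole batch of actions with uniformly random ones.
import numpy as np
import torch

def epsilon_greedy(probs, epsilon):
    # probs: (batch, num_actions) tensor of action probabilities
    actions = torch.multinomial(probs, num_samples=1).squeeze(1)
    if np.random.rand() < epsilon:
        actions = torch.randint(probs.shape[1], (probs.shape[0],))
    return actions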
def compute_reward(self, states, actions, resps):
    trajectory = pytorch_model.unwrap(states[:-1, :self.traj_dim])
    saliency_trajectory = pytorch_model.unwrap(states[:-1, self.traj_dim:])
    assignments, cps = self.model.get_mode(trajectory, saliency_trajectory)
    rewards = []
    rewarded = False
    for asmt in assignments:
        # if asmt == self.desired_mode: #### DANGEROUS LINE ####
        if asmt == self.desired_mode and not rewarded:
            rewards.append(1)
            rewarded = True
        else:
            rewards.append(0)
    rewards.append(0)  # match the number of changepoints
    full_rewards = []
    lcp = 0
    cps.append(len(trajectory))
    for cp, r in zip(cps, rewards):
        if self.seg_reward:
            # reward copied over all time steps in the segment
            full_rewards += [r] * (cp - lcp)
        else:
            if r == 1 and cp == 0:
                r = 0
            full_rewards += [0] * (cp - lcp - 1) + [r]
        lcp = cp
    return pytorch_model.wrap(np.array(full_rewards), cuda=self.cuda)
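# Self-contained sketch of the reward spreading performed above: given one reward per
# segment and the changepoint indices that close each segment, either copy the reward
# over every time step of the segment (seg_reward) or place it only on the last step.
# Names are illustrative; the cp == 0 edge case of the original is omitted.
def spread_segment_rewards(segment_rewards, changepoints, length, seg_reward=False):
    boundaries = list(changepoints) + [length]
    full, last_cp = [], 0
    for cp, r in zip(boundaries, segment_rewards):
        if seg_reward:
            full += [r] * (cp - last_cp)
        else:
            full += [0] * (cp - last_cp - 1) + [r]
        last_cp = cp
    return full

# example: spread_segment_rewards([1, 0, 0], [3, 7], 10) -> [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]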
def compute_reward(self, states, actions, resps, precomputed=None):
    '''
    states must have at least two entries in the stack, so the number of rewards
    stays at num_states - 1. Assumes the ball is the last state.
    Assumed input shape: [state_size = num_stack * traj_dim]
    '''
    rewards = []
    for last_state, state, action, nextstate in zip(states, states[1:], actions, states[2:]):
        last_state = last_state.squeeze()
        state = state.squeeze()
        nextstate = nextstate.squeeze()
        state_first = last_state[:2]
        state_second = state[:2]
        proximity = state[:2] - state[-2:]
        state_third = nextstate[:2]
        v1 = state_second - state_first
        v2 = state_third - state_second
        rewarded = False
        if v1[0] > 0 and state_second[0] > 65:  # was moving down, below the blocks
            if torch.norm(v2 - self.desired_vel) == 0:
                rewards.append(1)
                rewarded = True
            else:
                for v in self.desired_vels:
                    if torch.norm(v2 - v) == 0:
                        if self.anybounce:
                            rewards.append(1)
                        else:
                            rewards.append(0.25)
                        rewarded = True
        if not rewarded:
            if self.form == 'dense':
                rewards.append(-abs(proximity[0] + proximity[1]) * 0.001)
            elif self.form.find('xdense') != -1:
                if proximity[0] == 3 and self.form.find('neg') != -1:
                    rewards.append(-1)
                else:
                    rewards.append(-abs(proximity[1]) * 0.001)
            else:
                if proximity[0] > 3 and self.form.find('neg') != -1:
                    rewards.append(-1)
                else:
                    rewards.append(0)
    return pytorch_model.wrap(rewards, cuda=self.cuda)
def Q_criteria(models, values, dist_entropy, action_probs, Q_vals, optimizer,
               true_values, targets):
    # squared error between the Q-value of each target action and the true value
    loss = (Q_vals.gather(
        1, pytorch_model.wrap(targets, cuda=True).unsqueeze(1).long()).squeeze(1) -
            pytorch_model.wrap(true_values, cuda=True).squeeze()).pow(2).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss
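# Standalone sketch of the squared-error objective computed in Q_criteria: gather the
# Q-value of the target action for each transition and regress it toward the provided
# true value. Tensor names here are illustrative.
import torch

def q_regression_loss(q_vals, target_actions, true_values):
    # q_vals: (batch, num_actions); target_actions: (batch,); true_values: (batch,)
    chosen = q_vals.gather(1, target_actions.unsqueeze(1).long()).squeeze(1)
    return (chosen - true_values).pow(2).mean()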
def get_trajectories(self, full_states):
    states = []
    resps = []
    for state in full_states:
        state, resp = self.state_class.get_state(state)
        states.append(state)
        resps.append(resp)
    return pytorch_model.wrap(np.stack(states), cuda=self.cuda)
def __init__(self, args):
    super().__init__(None, args)
    self.queue_len = args.changepoint_queue_len
    self.rewards = pytorch_model.wrap(
        np.array([0 for i in range(args.changepoint_queue_len)]),
        cuda=args.cuda).detach()
    self.rewards.requires_grad = False
    self.reward_filled = 0
    self.iscuda = args.cuda
def generate_training_set(self, states, models, changepoints, match=False, window=-1):
    trajectory = states[:, :self.traj_dim]
    saliency_trajectory = states[:, self.traj_dim:]
    assignments, changepoints = self.model.get_mode(
        trajectory, saliency_trajectory, models, changepoints)
    self.min = np.min(trajectory, axis=0)
    self.max = np.max(trajectory, axis=0)
    lcp, cp, ncp = changepoints[0], changepoints[1], changepoints[2]
    asmts = []
    for i in range(3, len(changepoints) - 1):
        asmts.append((assignments[i - 3],
                      trajectory[lcp:cp + 1], trajectory[cp + 1:ncp],
                      saliency_trajectory[lcp:cp + 1], saliency_trajectory[cp + 1:ncp]))
        lcp, cp, ncp = cp, ncp, changepoints[i]
    asmts.append((assignments[i - 2],
                  trajectory[lcp:cp + 1], trajectory[cp + 1:ncp],
                  saliency_trajectory[lcp:cp + 1], saliency_trajectory[cp + 1:ncp]))
    if ncp != len(trajectory):
        lcp, cp, ncp = cp, ncp, len(trajectory)
        asmts.append((assignments[i - 1],
                      trajectory[lcp:cp + 1], trajectory[cp + 1:ncp],
                      saliency_trajectory[lcp:cp + 1], saliency_trajectory[cp + 1:ncp]))
    self.modes = list(range(self.model.determiner.num_mappings))
    mode_data = {m: [] for m in range(self.model.determiner.num_mappings)}
    for asmt, databefore, dataafter, corrbefore, corrafter in asmts:
        if window < 0:
            data_use = databefore
            if match:
                other_data = corrbefore
                data_use = np.concatenate((data_use, other_data), axis=1)
        else:
            data_use = np.concatenate((databefore[-window:], dataafter[:window + 1]), axis=0)
            if match:
                other_data = corrbefore[-window:] + corrafter[:window]
                data_use = np.concatenate((data_use, other_data), axis=0)
        if asmt != -1:
            mode_data[asmt] += self.form_batch(data_use)
    total = 0
    for asmt in mode_data.keys():
        mode_data[asmt] = pytorch_model.wrap(mode_data[asmt])
        total += len(mode_data)
    self.pairs = mode_data
    arr = [v.squeeze() for v in self.pairs.values()]
    return total
def __init__(self, **kwargs):
    super().__init__(**kwargs)
    args, num_inputs, num_outputs, factor = self.get_args(kwargs)
    self.l1 = nn.Linear(self.basis_size, args.num_population)
    self.value_bounds = args.value_bounds
    self.num_value_atoms = args.num_value_atoms
    # evenly spaced atom support over the value bounds (distributional value head)
    self.dz = (self.value_bounds[1] - self.value_bounds[0]) / (self.num_value_atoms - 1)
    self.value_support = pytorch_model.wrap(
        [self.value_bounds[0] + (i * self.dz) for i in range(self.num_value_atoms)],
        cuda=args.cuda)
    self.value_support.requires_grad = False
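# Self-contained sketch (illustrative names) of the value support built above:
# num_value_atoms evenly spaced atoms spanning value_bounds, as used by categorical
# (C51-style) value heads; the expected value of a predicted distribution is the dot
# product of its probabilities with this support.
import torch

def build_value_support(value_bounds, num_value_atoms):
    low, high = value_bounds
    dz = (high - low) / (num_value_atoms - 1)
    support = torch.tensor([low + i * dz for i in range(num_value_atoms)])
    return support, dz

# expected value of a (batch, num_atoms) probability tensor p: (p * support).sum(dim=1)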
def __init__(self, dim, mode, hist, name):
    super(LDSlearner, self).__init__()
    self.name = name
    self.As = nn.ModuleList([nn.Linear(dim, dim) for _ in range(hist - 1)])
    self.dim = dim  # dimension of a single state
    self.mode = mode
    self.hist = hist  # number of states in the reward function
    self.variance = pytorch_model.wrap([-1 for i in range(dim)])
    self.is_cuda = False
def supervised_criteria(models, values, dist_entropy, action_probs, Q_vals,
                        optimizer, true_values):
    loss = F.binary_cross_entropy(
        action_probs.squeeze(),
        pytorch_model.wrap(true_values, cuda=True).squeeze())  # TODO: cuda support required
    # entropy regularization on the predicted action probabilities
    loss += -(action_probs.squeeze() *
              torch.log(action_probs.squeeze() + 1e-10)).sum(dim=1).mean() * .01
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss
def getState(self):
    raw_state, raw_factor_state = self.screen.getState()
    if self.factor_state is None:
        factor_state = self.focus_model.forward(
            pytorch_model.wrap(raw_state, cuda=False).unsqueeze(0).unsqueeze(0),
            ret_numpy=True)
        for key in factor_state.keys():
            factor_state[key] *= 84
            factor_state[key] = (np.squeeze(factor_state[key]), (1.0, ))
        factor_state['Action'] = raw_factor_state['Action']
        self.factor_state = factor_state
    factor_state = self.factor_state
    return raw_state, factor_state
def compute_reward(self, states, actions):
    '''
    TODO: add support for multiple processes
    TODO: possibly make this non-iterative
    '''
    rewards = []
    for state, action, nextstate in zip(states, actions, states[1:]):
        if state - nextstate == 0:
            rewards.append(2)
        else:
            rewards.append(-1)
    return pytorch_model.wrap(rewards, cuda=True)
def __init__(self, args, direc=0):
    super().__init__(None, args)
    self.traj_dim = 2  # SET THIS
    self.head, self.tail = get_edge(args.train_edge)
    self.name = args.reward_form
    self.anydir = direc == -1
    self.dir = None
    if direc == 0:
        self.dir = pytorch_model.wrap(np.array([0, 0]), cuda=args.cuda)
    elif direc == 1:
        self.dir = pytorch_model.wrap(np.array([0, -1]), cuda=args.cuda)
    elif direc == 2:
        self.dir = pytorch_model.wrap(np.array([0, 1]), cuda=args.cuda)
    elif direc == 3:
        self.dir = pytorch_model.wrap(np.array([-1, 0]), cuda=args.cuda)
    elif direc == 4:
        self.dir = pytorch_model.wrap(np.array([1, 0]), cuda=args.cuda)
    if self.dir is not None:
        self.dir.requires_grad = False
    self.epsilon = 1e-3
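# Standalone sketch of the directional check this reward applies per transition (see
# compute_reward above): normalize the displacement between consecutive positions and
# return 1 when it lies within epsilon of the desired unit direction, otherwise a small
# negative step cost. Argument names are illustrative.
import torch

def direction_reward(prev_pos, pos, desired_dir, epsilon=1e-3):
    corr = pos - prev_pos
    if corr.norm() > 0:
        corr = corr / corr.norm()
    return 1.0 if (desired_dir - corr).norm() < epsilon else -1e-2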
def compute_reward(self, states, actions, resps, precomputed=None):
    '''
    TODO: add support for multiple processes
    TODO: possibly make this non-iterative
    '''
    rewards = []
    for state, action, nextstate in zip(states, actions, states[1:]):
        if np.linalg.norm(state - self.target) == 0:
            rewards.append(1)
        else:
            rewards.append(-0.01)
    return pytorch_model.wrap(rewards, cuda=True)
def __init__(self, vel, args):
    super().__init__(None, args)
    self.name = "Paddle->Ball"
    self.head, self.tail = "Ball", "Paddle"
    self.anybounce = False
    self.desired_vels = [
        pytorch_model.wrap([-2., -1.], cuda=args.cuda),
        pytorch_model.wrap([-1., -1.], cuda=args.cuda),
        pytorch_model.wrap([-1., 1.], cuda=args.cuda),
        pytorch_model.wrap([-2., 1.], cuda=args.cuda)
    ]
    if vel == -1:
        self.anybounce = True
        self.desired_vel = self.desired_vels[0]
    if vel == 0:
        self.desired_vel = self.desired_vels[0]
    elif vel == 1:
        self.desired_vel = self.desired_vels[1]
    elif vel == 2:
        self.desired_vel = self.desired_vels[2]
    elif vel == 3:
        self.desired_vel = self.desired_vels[3]
    self.form = args.reward_form
def construct_tile_order(minmax, normalize, order):
    minvs, maxvs = minmax
    order_vectors = []
    for minv, maxv in zip(minvs, maxvs):
        order_vector = []
        # TODO: assumes integer differences between states, fix?
        numv = min(order, int(pytorch_model.unwrap(torch.ceil(maxv - minv) + 1)))
        for i in range(numv):
            if not normalize:
                order_vector.append(minv + i * (maxv - minv) / (max(numv - 1, 1)))
            else:
                order_vector.append(i / max(numv - 1, 1))
        order_vectors.append(pytorch_model.wrap(np.array(order_vector)).detach())
    for vec in order_vectors:
        vec.requires_grad = False
    return order_vectors
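# Numpy-only sketch of what construct_tile_order computes per state dimension: up to
# `order` evenly spaced grid points between the observed min and max (or in [0, 1]
# when normalized), later used as tiling/basis reference points.
import numpy as np

def tile_points(minv, maxv, order, normalize=False):
    numv = min(order, int(np.ceil(maxv - minv)) + 1)
    if normalize:
        return np.array([i / max(numv - 1, 1) for i in range(numv)])
    return np.array([minv + i * (maxv - minv) / max(numv - 1, 1) for i in range(numv)])

# example: tile_points(0.0, 4.0, order=3) -> array([0., 2., 4.])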
def take_action(self, probs, q_vals):
    action = -1
    while action == -1:
        try:
            action = int(input(""))
        except ValueError:
            continue
        if action > self.num_outputs - 1:
            action = -1
    action = torch.tensor([action])
    if np.random.rand() < self.epsilon:
        action = pytorch_model.wrap(
            np.random.randint(self.num_outputs, size=probs.shape[0]), cuda=True)
    return action
def get_trajectories(self, full_states):
    obj_dumps = [s[1] for s in full_states]
    trajectory = get_individual_data(self.head, obj_dumps, pos_val_hash=1)
    # TODO: automatically determine if correlate pos_val_hash is 1 or 2
    # TODO: multiple tail support
    # TODO: separation of interference and contingent objects
    if self.tail[0] == "Action":
        merged = trajectory
        # correlate_trajectory = get_individual_data(self.tail[0], obj_dumps, pos_val_hash=2)
    else:
        correlate_trajectory = get_individual_data(self.tail[0], obj_dumps, pos_val_hash=1)
        merged = np.concatenate([trajectory, correlate_trajectory], axis=1)
    return pytorch_model.wrap(merged).cuda()
def __init__(self, **kwargs):
    super().__init__(**kwargs)
    '''
    factor is the order
    layers defines the variate (1 = univariate, 2 = paired, 3 = all)
    object_extractors[0] is the current object; the remainder are any correlate
    objects, with relative computations
    computations are relative to the pre_extracted state (just getters)
    '''
    self.order_vector = []
    for i in range(self.order):
        self.order_vector.append(np.pi * 2 * i / self.period)
    self.order_vector = pytorch_model.wrap(np.array(self.order_vector))
    self.order_vector.requires_grad = False
    self.train()
    self.reset_parameters()
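# Standalone sketch of the phase vector built above (2*pi*i/period for each order term)
# and of one common way such a vector is used: cosine basis features of a scalar state
# value. The feature map itself is an assumption for illustration, not taken from the
# original class.
import numpy as np

def fourier_phases(order, period):
    return np.array([np.pi * 2 * i / period for i in range(order)])

def cosine_features(x, phases):
    # x: scalar state value; one cosine feature per phase multiplier
    return np.cos(x * phases)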
def getState(self):
    raw_state = self.current_raw
    factor_state = {'Action': self.current_action}
    if self.factor_state is None:
        if self.focus_model is not None:
            factor_state = self.focus_model.forward(
                pytorch_model.wrap(raw_state, cuda=True).unsqueeze(0).unsqueeze(0),
                ret_numpy=True)
            for key in factor_state.keys():
                factor_state[key] *= 84
                factor_state[key] = (np.squeeze(factor_state[key]), (1.0, ))
        self.factor_state = factor_state
    else:
        factor_state = self.factor_state
    return raw_state, factor_state