def compute_reward(self, states, actions, resps):
    trajectory = pytorch_model.unwrap(states[:-1, :self.traj_dim])
    saliency_trajectory = pytorch_model.unwrap(states[:-1, self.traj_dim:])
    # print("states shape", trajectory.shape, saliency_trajectory.shape)
    assignments, cps = self.model.get_mode(trajectory, saliency_trajectory)
    rewards = []
    # print(assignments, cps)
    rewarded = False
    for asmt in assignments:
        # if asmt == self.desired_mode: #### DANGEROUS LINE ####
        if asmt == self.desired_mode and not rewarded:
            rewards.append(1)
            rewarded = True
        else:
            rewards.append(0)
    rewards.append(0)  # match the number of changepoints
    full_rewards = []
    lcp = 0
    lr = 0
    cps.append(len(trajectory))
    # print(cps, rewards)
    for cp, r in zip(cps, rewards):
        if self.seg_reward:  # reward copied over all time steps
            full_rewards += [r] * (cp - lcp)
        else:
            if r == 1 and cp == 0:
                r = 0
            full_rewards += [0] * (cp - lcp - 1) + [r]
        lcp = cp
        lr = r
    # print(rewards, cps, full_rewards)
    return pytorch_model.wrap(np.array(full_rewards), cuda=self.cuda)
def fc_dqn_network(num_actions, network_type, state):
    """The fully connected network used to compute the agent's Q-values.

    Args:
      num_actions: int, number of actions.
      network_type: namedtuple, collection of expected values to return.
      state: `tf.Tensor`, contains the agent's current state.

    Returns:
      net: _network_type object containing the tensors output by the network.
    """
    # `minmax` is assumed to be set at module level before this function is called.
    q_values = gym_lib._basic_discrete_domain_network(
        pytorch_model.unwrap(minmax[0]), pytorch_model.unwrap(minmax[1]),
        num_actions, state)
    return network_type(q_values)
def unwrap_or_none(val):
    if val is not None:
        if isinstance(val, torch.Tensor):  # torch.tensor is a constructor, not a type
            return pytorch_model.unwrap(val)
        return val
    else:
        return -1.0
def determineChanged(self, states, actions, resps):
    change_indexes, ats, states = self.state_class.determine_delta_target(
        pytorch_model.unwrap(states))
    change = len(change_indexes) > 0
    if change:
        return change, states[0]
    return change, None
def remove_mean_batch(imgs, focus, nb_size=(5, 5)):
    in_np = isinstance(imgs, np.ndarray)
    if not in_np:
        imgs = pytorch_model.unwrap(imgs)
    focus = (focus * imgs.shape[2:]).astype(int)
    focus_mean = image_focus_mean(imgs, focus, nb_size)
    imgs = image_focus_subtract(imgs, focus, focus_mean, nb_size)
    return imgs if in_np else torch.from_numpy(imgs).float()
def forward(self, x, reward):
    '''
    TODO: make use of time_estimator, link up Q vals and action probs
    TODO: clean up cuda = True to something that is actually true
    '''
    x = pytorch_model.unwrap(x)
    # print(reward, x)
    return self.dope_dqn.step(reward[0], x)
def hash(self, state):
    '''
    assuming state of the form [changepoint state dim]
    '''
    state = self.normalize(state)
    basis = []
    for order_vector, val in zip(self.order_vectors, state):
        # could use any monotonically decreasing function
        basis.append(int(pytorch_model.unwrap(
            torch.exp(-(val - order_vector).pow(2)).argmax())))
    return tuple(basis)
def remove_mean_memory(self, imgs, focus):
    in_np = isinstance(imgs, np.ndarray)
    if not in_np:
        imgs = pytorch_model.unwrap(imgs)
    focus = (focus * imgs.shape[2:]).astype(int)
    focus_mean = image_focus_mean(imgs, focus, self.nb_size)
    self.mean = (self.n_mean * self.mean + focus.shape[0] * focus_mean) \
        / (self.n_mean + focus.shape[0])
    self.n_mean = self.n_mean + focus.shape[0]
    imgs = image_focus_subtract(imgs, focus, self.mean, self.nb_size)
    return imgs if in_np else torch.from_numpy(imgs).float()
def compute_reward(self, states, actions, resps, precomputed=None):
    self.reward_base = -0.01  # TODO: patchwork line
    trajectory = pytorch_model.unwrap(states[:-1, :self.traj_dim])
    saliency_trajectory = pytorch_model.unwrap(states[:-1, self.traj_dim:])
    # print("states shape", trajectory.shape, saliency_trajectory.shape)
    if precomputed is not None:
        assignments, cps = precomputed
    else:
        assignments, cps = self.model.get_mode(trajectory, saliency_trajectory)
    # print(assignments, cps, self.desired_mode)
    rewards = []
    # print(assignments, cps)
    rewarded = False
    for cp, asmt in zip(cps, assignments):
        # print(cp, asmt, asmt == self.desired_mode)
        if asmt == self.desired_mode:  #### DANGEROUS LINE ####
            # if asmt == self.desired_mode and not rewarded:
            rewards.append(1)
            # rewarded = True
        else:
            rewards.append(self.reward_base)
    # match the number of changepoints, first value ignored
    rewards = [self.reward_base] + rewards
    full_rewards = []
    lcp = 0
    lr = 0
    cps.append(len(trajectory))
    # print(len(cps), len(rewards), len(assignments), cps, rewards)
    for cp, r in zip(cps, rewards):
        if self.seg_reward:  # reward copied over all time steps
            full_rewards += [r] * (cp - lcp)
        else:
            if r == 1 and cp == 0:
                r = self.reward_base
            full_rewards += [self.reward_base] * (cp - lcp - 1) + [r]
        lcp = cp
        lr = r
    # print(full_rewards, trajectory)
    # print(np.concatenate((np.array(full_rewards).reshape(len(full_rewards), 1), trajectory), axis=1))
    # print(rewards, cps, full_rewards)
    return pytorch_model.wrap(np.array(full_rewards), cuda=self.cuda)
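# Minimal sketch (illustrative, not part of the original reward classes) of how the
# loop above spreads per-segment rewards over time steps when seg_reward is False:
# every step in a segment receives reward_base except the step at the segment's
# closing changepoint, which receives that segment's reward. Values here are made up.
reward_base = -0.01
cps = [3, 7, 10]                         # changepoints, with trajectory length appended
rewards = [reward_base, 1, reward_base]  # one reward per segment; +1 for the desired mode
full_rewards, lcp = [], 0
for cp, r in zip(cps, rewards):
    full_rewards += [reward_base] * (cp - lcp - 1) + [r]
    lcp = cp
print(full_rewards)  # 10 values with a single +1 at index 6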
def compute_reward(self, states, actions, resps):
    rewards = torch.zeros(len(states))
    change_indexes, ats, st = self.state_class.determine_delta_target(
        pytorch_model.unwrap(states))
    if len(change_indexes) > 0:
        dists = np.linalg.norm(self.parameters - st[0])
        rewards[change_indexes[0]] = (self.max_dist - dists) / self.max_dist
    rewards[states[:, -2] == 79] = -1.0
    if self.cuda:
        rewards = rewards.cuda()
    return rewards
def fc_rainbow_network(num_actions, num_atoms, support, network_type, state):
    """Build the deep network used to compute the agent's Q-value distributions.

    Args:
      num_actions: int, number of actions.
      num_atoms: int, the number of buckets of the value function distribution.
      support: tf.linspace, the support of the Q-value distribution.
      network_type: `namedtuple`, collection of expected values to return.
      state: `tf.Tensor`, contains the agent's current state.

    Returns:
      net: _network_type object containing the tensors output by the network.
    """
    print(minmax)
    net = gym_lib._basic_discrete_domain_network(
        pytorch_model.unwrap(minmax[0]), pytorch_model.unwrap(minmax[1]),
        num_actions, state, num_atoms=num_atoms)
    logits = tf.reshape(net, [-1, num_actions, num_atoms])
    probabilities = tf.contrib.layers.softmax(logits)
    q_values = tf.reduce_sum(support * probabilities, axis=2)
    return network_type(q_values, logits, probabilities)
def generate_soft_dataset(states, resps, true_environment, reward_fns, args):
    pre_load_weights = args.load_weights
    args.load_weights = True
    option_chain = OptionChain(true_environment, args.changepoint_dir,
                               args.train_edge, args)
    print(args.load_weights)
    environments = option_chain.initialize(args)
    proxy_environment = environments.pop(-1)
    proxy_chain = environments
    train_models = proxy_environment.models
    head, tail = get_edge(args.train_edge)
    if len(environments) > 1:
        # there is a difference in the properties of a proxy environment and the true environment
        num_actions = len(environments[-1].reward_fns)
    else:
        num_actions = environments[-1].num_actions
    state_class = GetState(head, state_forms=list(
        zip(args.state_names, args.state_forms)))
    proxy_environment.initialize(args, proxy_chain, reward_fns, state_class,
                                 behavior_policy=None)
    train_models.initialize(args, len(reward_fns), state_class, num_actions)
    train_models.session(args)
    proxy_environment.duplicate(args)  # assumes that we are loading weights
    args.load_weights = pre_load_weights
    soft_actions = [[] for i in range(train_models.num_options)]
    for oidx in range(train_models.num_options):
        train_models.option_index = oidx
        if args.model_form == 'population':
            train_models.currentModel().use_mean = True
        for i in range(len(states) // 30 + 1):
            state = states[i * 30:(i + 1) * 30]
            resp = resps[i * 30:(i + 1) * 30]
            values, dist_entropy, action_probs, Q_vals = train_models.determine_action(
                pytorch_model.wrap(state, cuda=args.cuda),
                pytorch_model.wrap(resp, cuda=args.cuda))
            # print(action_probs)
            values, action_probs, Q_vals = train_models.get_action(
                values, action_probs, Q_vals)
            soft_actions[oidx] += pytorch_model.unwrap(action_probs).tolist()
    print("soft actions", np.sum(np.array(soft_actions[0]), axis=0))
    for i in range(len(soft_actions)):
        soft_actions[i] = smooth_weight(soft_actions[i], args.weighting_lambda)
    return np.array(soft_actions)
def step(self, action):
    # TODO: action is tensor, might not be safe assumption
    # t = time.time()
    uaction = pytorch_model.unwrap(action.long())
    raw_state, reward, done, info = self.screen.step([uaction])
    # a = time.time()
    # print("screen step", a - t)
    raw_state = np.squeeze(raw_state)
    # raw_state[:10,:] = 0.0
    self.current_raw = raw_state
    raw_factor_state = {'Action': [[0.0, 0.0], (float(uaction), )]}
    self.current_action = action
    self.reward = reward[0]
    self.factor_state = raw_factor_state
    self.last_action = uaction
    # logging
    if len(self.save_path) > 0:
        if self.recycle > 0:
            state_path = os.path.join(self.save_path,
                                      str((self.itr % self.recycle) // 2000))
            count = self.itr % self.recycle
        else:
            state_path = os.path.join(self.save_path, str(self.itr // 2000))
            count = self.itr
        try:
            os.makedirs(state_path)
        except OSError:
            pass
        if self.itr != 0:
            object_dumps = open(
                os.path.join(self.save_path, "focus_dumps.txt"), 'a')
        else:
            # create file if it does not exist
            object_dumps = open(
                os.path.join(self.save_path, "focus_dumps.txt"), 'w')
        # use the factor state stored above (the bare name `factor_state` is undefined here)
        for key in self.factor_state.keys():
            writeable = list(self.factor_state[key][0]) + list(
                self.factor_state[key][1])
            # TODO: attributes are limited to single floats
            object_dumps.write(
                key + ":" + " ".join([str(fs) for fs in writeable]) + "\t")
        object_dumps.write("\n")  # TODO: recycling does not stop object dumping
        object_dumps.close()
        # imio.imsave(os.path.join(state_path, "state" + str(count % 2000) + ".png"), self.current_raw)
    self.itr += 1
    # print("elapsed ", time.time() - t)
    return raw_state, self.factor_state, done
def step(self, action):
    action = int(pytorch_model.unwrap(action[0]))
    # print("action", action)
    if action == 0:  # noop
        pass
    elif action % 2 == 1:
        v = self.current_state[(action - 1) // 2]
        if v != 0:
            ncs = self.current_state.copy()
            ncs[(action - 1) // 2] -= 1
            self.current_state = ncs
    elif action % 2 == 0:
        v = self.current_state[(action - 1) // 2]
        if v != self.num_states - 1:
            ncs = self.current_state.copy()
            ncs[(action - 1) // 2] += 1
            self.current_state = ncs
    done = False
    if self.terminal_state:
        done = True
        for i in range(self.num_dims):
            done = done and ((self.current_state[i] == self.num_states - 1)
                             or (self.current_state[i] == 0))
        if done:
            self.current_state = self.initial_state
    if len(self.save_path) != 0:
        state_path = os.path.join(self.save_path, str(self.itr // 2000))
        try:
            os.makedirs(state_path)
        except OSError:
            pass
        # imio.imsave(os.path.join(state_path, "state" + str(self.itr % 2000) + ".png"), self.current_state)
        # print(self.save_path, state_path)
        if self.itr != 0:
            object_dumps = open(self.save_path + "/object_dumps.txt", 'a')
        else:
            # create file if it does not exist
            object_dumps = open(self.save_path + "/object_dumps.txt", 'w')
        # print("writing", self.save_path + "/object_dumps.txt")
        object_dumps.write("chain:" + str(self.current_state[0]) + "\t\n")
        object_dumps.close()
    self.itr += 1
    if self.itr % 100 == 0:
        self.current_state = self.initial_state
        done = True
    # if done:
    #     self.current_state[0] = 0
    return self.current_state, {"chain": (self.current_state, 1)}, done
def construct_tile_order(minmax, normalize, order):
    minvs, maxvs = minmax
    order_vectors = []
    for minv, maxv in zip(minvs, maxvs):
        order_vector = []
        # TODO: assumes integer differences between states, fix?
        numv = min(order, int(pytorch_model.unwrap(torch.ceil(maxv - minv) + 1)))
        for i in range(numv):
            if not normalize:
                order_vector.append(minv + i * (maxv - minv) / (max(numv - 1, 1)))
            else:
                order_vector.append(i / max(numv - 1, 1))
        order_vectors.append(pytorch_model.wrap(np.array(order_vector)).detach())
    for vec in order_vectors:
        vec.requires_grad = False
    return order_vectors
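# Minimal sketch (illustrative, not part of the original code) of how order vectors
# like the ones built above discretize a continuous state into a hashable tuple,
# mirroring the RBF-style hash method defined earlier. The standalone tile_hash
# helper and the pre-normalized example state are assumptions for this sketch only.
import torch

def tile_hash(state, order_vectors):
    basis = []
    for order_vector, val in zip(order_vectors, state):
        # argmax of exp(-(val - anchors)^2) picks the anchor closest to val
        basis.append(int(torch.exp(-(val - order_vector).pow(2)).argmax()))
    return tuple(basis)

order_vectors = [torch.linspace(0.0, 1.0, 4), torch.linspace(0.0, 1.0, 4)]
state = torch.tensor([0.3, 0.9])  # assumed already normalized to [0, 1]
print(tile_hash(state, order_vectors))  # -> (1, 3)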
def forward(self, img, prev_out=None, ret_numpy=True, ret_extra=False):
    out = img
    for layer in self.layers:
        out = layer(out).detach()
    if prev_out is not None:  # apply prior filter if specified
        pfilter = prior_filter(prev_out, out.size())
        pfilter = self.preprocess(pfilter)
        out = torch.mul(out, pfilter)
    focus_out = self.argmax_xy(out)
    focus_out = focus_out if ret_numpy \
        else torch.from_numpy(focus_out).float()
    if ret_extra:
        return focus_out, pytorch_model.unwrap(out)
    return focus_out
def __init__(self, args, train_models):
    self.optimizers = []
    self.solutions = []
    self.weight_sharing = args.weight_sharing
    for i in range(len(train_models.models)):
        if args.load_weights and not args.freeze_initial:
            # TODO: initialize from non-population model
            xinit = pytorch_model.unwrap(train_models.models[i].mean.get_parameters())
            # TODO: parameter for sigma?
            sigma = 0.6  # pytorch_model.unwrap(torch.stack([train_models.models[i].networks[j].get_parameters() for j in range(train_models.models[i].num_population)]).var(dim=1).mean())
            print(xinit, sigma)
        else:
            # initializes in [-1, 1]
            xinit = (np.random.rand(train_models.currentModel().networks[0].count_parameters()) - 0.5) * 2
            sigma = 1.0
        # might be different than the population in the model...
        cmaes_params = {"popsize": args.num_population}
        cmaes = cma.CMAEvolutionStrategy(xinit, sigma, cmaes_params)
        self.optimizers.append(cmaes)
        self.solutions.append(cmaes.ask())
    # iterate over the models passed in (self.models is not defined in this constructor)
    for i in range(len(train_models.models)):
        self.assign_solutions(train_models, i)
def argmax_xy(self, out):
    out = pytorch_model.unwrap(out)
    batch_size = out.shape[0]
    row_size = out.shape[2]
    col_size = out.shape[3]
    if self.argmax_mode == 'first':  # first argmax
        argmax = np.argmax(out.reshape((batch_size, -1)), axis=1)
    elif self.argmax_mode == 'rand':  # random argmax for tie-breaking
        out = out.reshape((batch_size, -1))
        out_max = np.max(out, axis=1)
        argmax = np.array([np.random.choice(np.flatnonzero(line == line_max))
                           for line, line_max in zip(out, out_max)])
    else:
        raise ValueError('argmax_mode %s invalid' % (self.argmax_mode))
    argmax %= row_size * col_size  # in case of multiple filters
    argmax_coor = np.array([np.unravel_index(argmax_i, (row_size, col_size))
                            for argmax_i in argmax], dtype=float)
    argmax_coor = argmax_coor / np.array([row_size, col_size])
    return argmax_coor
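# Illustrative sketch (assumption, not original code): the random tie-breaking used in
# the 'rand' branch of argmax_xy, shown on a single 2D map with numpy only. One of the
# two tied maxima is chosen uniformly at random and returned as normalized (row, col).
import numpy as np

out = np.array([[0.0, 1.0],
                [1.0, 0.5]])  # tied maxima at (0, 1) and (1, 0)
flat = out.reshape(-1)
argmax = np.random.choice(np.flatnonzero(flat == flat.max()))
coord = np.unravel_index(argmax, out.shape)
print(np.array(coord, dtype=float) / np.array(out.shape))  # e.g. [0.  0.5] or [0.5 0. ]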
def train_dopamine(args, save_path, true_environment, train_models,
                   proxy_environment, proxy_chain, reward_classes, state_class,
                   num_actions, behavior_policy):
    print("#######")
    print("Training Options")
    print("#######")
    # if option_chain is not None: #TODO: implement this
    base_env = proxy_chain[0]
    base_env.set_save(0, args.save_dir, args.save_recycle)
    snum = args.num_stack
    args.num_stack = 1
    proxy_environment.initialize(args, proxy_chain, reward_classes, state_class,
                                 behavior_policy)
    args.num_stack = snum
    if args.save_models:
        save_to_pickle(os.path.join(save_path, "env.pkl"), proxy_environment)
    behavior_policy.initialize(args, num_actions)
    train_models.initialize(args, len(reward_classes), state_class,
                            proxy_environment.action_size)
    proxy_environment.set_models(train_models)
    proxy_environment.set_save(0, args.save_dir, args.save_recycle)
    state = pytorch_model.wrap(proxy_environment.getState(), cuda=args.cuda)
    cs, cr = proxy_environment.getHistState()
    hist_state = pytorch_model.wrap(cs, cuda=args.cuda)
    raw_state = base_env.getState()
    cp_state = proxy_environment.changepoint_state([raw_state])
    # print("initial_state (s, hs, rs, cps)", state, hist_state, raw_state, cp_state)
    # print(cp_state.shape, state.shape, hist_state.shape, state_class.shape)
    # rollouts = RolloutOptionStorage(args.num_processes, (state_class.shape,), proxy_environment.action_size, cr.flatten().shape[0],
    #     state.shape, hist_state.shape, args.buffer_steps, args.changepoint_queue_len, args.trace_len,
    #     args.trace_queue_len, args.dilated_stack, args.target_stack, args.dilated_queue_len,
    #     train_models.currentOptionParam().shape[1:], len(train_models.models), cp_state[0].shape,
    #     args.lag_num, args.cuda)
    option_actions = {option.name: collections.Counter()
                      for option in train_models.models}
    total_duration = 0
    total_elapsed = 0
    true_reward = 0
    ep_reward = 0
    start = time.time()
    fcnt = 0
    final_rewards = list()
    option_counter = collections.Counter()
    option_value = collections.Counter()
    print(hist_state)
    val = None
    train_models.currentModel().begin_episode(pytorch_model.unwrap(hist_state))
    for j in range(args.num_iters):
        raw_actions = []
        last_total_steps, total_steps = 0, 0
        for step in range(args.num_steps):
            # start = time.time()
            fcnt += 1
            total_steps += 1
            current_state, current_resp = proxy_environment.getHistState()
            estate = proxy_environment.getState()
            if args.true_environment:
                reward = pytorch_model.wrap([[base_env.reward]])
            else:
                reward = proxy_environment.computeReward(1)
            true_reward += base_env.reward
            ep_reward += base_env.reward
            # print(current_state, reward[train_models.option_index])
            action = train_models.currentModel().forward(
                current_state,
                pytorch_model.unwrap(reward[train_models.option_index]))
            # print("ap", action)
            action = pytorch_model.wrap([action])
            cp_state = proxy_environment.changepoint_state([raw_state])
            # print(state, action)
            # print("step states (cs, s, cps, act)", current_state, estate, cp_state, action)
            # print("step outputs (val, de, ap, qv, v, ap, qv)", values, dist_entropy, action_probs, Q_vals, v, ap, qv)
            state, raw_state, resp, done, action_list = proxy_environment.step(
                action, model=False)
            # render=len(args.record_rollouts) != 0, save_path=args.record_rollouts, itr=fcnt
            # print("step check (al, s)", action_list, state)
            # learning_algorithm.interUpdateModel(step)
            #### logging
            option_actions[train_models.currentName()][int(
                pytorch_model.unwrap(action.squeeze()))] += 1
            #### logging
            # print(train_models.currentModel().dope_rainbow)
            if done:
                # print("reached end")
                print("Episode Reward: ", ep_reward)
                ep_reward = 0
                train_models.currentModel().end_episode(
                    pytorch_model.unwrap(reward[train_models.option_index]))
                state, resp = proxy_environment.getHistState()
                train_models.currentModel().begin_episode(
                    pytorch_model.unwrap(state))
                # print(step)
                break
        # var = [v for v in tf.trainable_variables() if v.name == "Online/fully_connected/weights:0"][0]
        # nval = train_models.currentModel().sess.run(var)
        # if val is not None:
        #     print(var, np.sum(abs(nval - val)), train_models.currentModel().dope_rainbow.eval_mode)
        # val = nval
        current_state = proxy_environment.getHistState()
        # print(state, action)
        # print("step states (cs, s, cps, act)", current_state, estate, cp_state, action)
        # print("step outputs (val, de, ap, qv, v, ap, qv)", values, dist_entropy, action_probs, Q_vals, v, ap, qv)
        cp_state = proxy_environment.changepoint_state([raw_state])
        # print("states and actions (es, cs, a, m)", rollouts.extracted_state, rollouts.current_state, rollouts.actions, rollouts.masks)
        # print("actions and Qvals (qv, vp, ap)", rollouts.Qvals, rollouts.value_preds, rollouts.action_probs)
        total_duration += step + 1
        # print("rewards", rewards)
        # rollouts.insert_rewards(rewards)
        # print(rollouts.extracted_state)
        # print(rewards)
        # rollouts.compute_returns(args, values)
        # print("returns and rewards (rew, ret)", rollouts.rewards, rollouts.returns)
        # print("returns and return queue", rollouts.returns, rollouts.return_queue)
        # print("reward check (cs, rw, rol rw, rt", rollouts.current_state, rewards, rollouts.rewards, rollouts.returns)
        name = train_models.currentName()
        # print(name, rollouts.extracted_state, rollouts.rewards, rollouts.actions)
        #### logging
        option_counter[name] += step + 1
        option_value[name] += true_reward
        #### logging
        if j % args.save_interval == 0 and args.save_models and args.train:
            # no point in saving if not training
            print("=========SAVING MODELS==========")
            train_models.save(save_path)  # TODO: implement save_options
        #### logging
        if j % args.log_interval == 0:
            # print("Qvalue and state", pytorch_model.unwrap(Q_vals.squeeze()), pytorch_model.unwrap(current_state.squeeze()))
            # print("probs and state", pytorch_model.unwrap(action_probs.squeeze()), pytorch_model.unwrap(current_state.squeeze()))
            for name in train_models.names():
                if option_counter[name] > 0:
                    print(name, option_value[name] / option_counter[name],
                          [option_actions[name][i] / option_counter[name]
                           for i in range(len(option_actions[name]))])
                if j % (args.log_interval * 20) == 0:
                    option_value[name] = 0
                    option_counter[name] = 0
                    for i in range(len(option_actions[name])):
                        option_actions[name][i] = 0
            end = time.time()
            total_elapsed += total_duration
            log_stats = "Updates {}, num timesteps {}, FPS {}, reward {}".format(
                j, total_elapsed, int(total_elapsed / (end - start)),
                true_reward / (args.num_steps * args.log_interval))
            print(log_stats)
            true_reward = 0.0
            total_duration = 0
def unwrap_or_none(val):
    if val is not None:
        return pytorch_model.unwrap(val)
    else:
        return -1.0
def testRL(args, save_path, true_environment, proxy_chain, proxy_environment,
           state_class, behavior_policy, num_actions, reward_classes=None):
    print("#######")
    print("Evaluating Options")
    print("#######")
    # if option_chain is not None: #TODO: implement this
    base_env = proxy_chain[0]
    base_env.set_save(0, args.save_dir, args.save_recycle)
    if reward_classes is not None:
        proxy_environment.reward_fns = reward_classes
    args.changepoint_queue_len = max(args.changepoint_queue_len, args.num_iters)
    proxy_environment.initialize(args, proxy_chain, proxy_environment.reward_fns,
                                 proxy_environment.stateExtractor, behavior_policy)
    print(base_env.save_path)
    behavior_policy.initialize(args, num_actions)
    train_models = proxy_environment.models
    train_models.initialize(args, len(reward_classes), state_class, num_actions)
    proxy_environment.duplicate(args)
    proxy_environment.set_save(0, args.save_dir, args.save_recycle)
    state = pytorch_model.wrap(proxy_environment.getState(), cuda=args.cuda)
    resp = proxy_environment.getResp()
    print(state.shape)
    raw_state = base_env.getState()
    cs, cr = proxy_environment.getHistState()
    hist_state = pytorch_model.wrap(cs, cuda=args.cuda)
    cp_state = proxy_environment.changepoint_state([raw_state])
    rollouts = RolloutOptionStorage(
        args.num_processes, (state_class.shape, ), proxy_environment.action_size,
        cr.flatten().shape[0], state.shape, hist_state.shape, args.buffer_steps,
        args.changepoint_queue_len, args.trace_len, args.trace_queue_len,
        args.dilated_stack, args.target_stack, args.dilated_queue_len,
        train_models.currentOptionParam().shape[1:], len(train_models.models),
        cp_state[0].shape, args.lag_num, args.cuda)
    option_actions = {option.name: collections.Counter()
                      for option in train_models.models}
    total_duration = 0
    start = time.time()
    fcnt = 0
    final_rewards = list()
    option_counter = collections.Counter()
    option_value = collections.Counter()
    raw_states = dict()
    ep_reward = 0
    rollouts.set_parameters(args.num_iters * train_models.num_options)
    # if args.num_iters > rollouts.changepoint_queue_len:
    #     rollouts.set_changepoint_queue(args.num_iters)
    done = False
    for i in range(train_models.num_options):
        train_models.option_index = i
        train_models.currentModel().test = True
        raw_states[train_models.currentName()] = []
        for j in range(args.num_iters):
            fcnt += 1
            raw_actions = []
            rollouts.cuda()
            current_state, current_resp = proxy_environment.getHistState()
            values, dist_entropy, action_probs, Q_vals = train_models.determine_action(
                current_state.unsqueeze(0), current_resp.unsqueeze(0))
            v, ap, qv = train_models.get_action(values, action_probs, Q_vals)
            cp_state = proxy_environment.changepoint_state([raw_state])
            ep_reward += base_env.reward
            # print(ap, qv)
            action = behavior_policy.take_action(ap, qv)
            rollouts.insert(False, state, current_state,
                            pytorch_model.wrap(args.greedy_epsilon, cuda=args.cuda),
                            done, current_resp, action, cp_state[0],
                            train_models.currentOptionParam(),
                            train_models.option_index, None, None, action_probs,
                            Q_vals, values)
            state, raw_state, resp, done, action_list = proxy_environment.step(
                action, model=False)
            # render=len(args.record_rollouts) != 0, save_path=args.record_rollouts, itr=fcnt
            raw_states[train_models.currentName()].append(raw_state)
            option_actions[train_models.currentName()][int(
                pytorch_model.unwrap(action.squeeze()))] += 1
            if done:
                print("Episode Reward: ", ep_reward, " ", fcnt)
                ep_reward = 0
                # print("reached end")
        # proxy_environment.determine_swaps(length, needs_rewards=True)  # doesn't need to generate rewards
        print(args.num_iters)
        print(action_probs)
        print("Episode Reward: ", ep_reward, " ", fcnt)
        rewards = proxy_environment.computeReward(args.num_iters)
        # print(rewards.shape)
        print(rewards.sum())
        rollouts.insert_rewards(rewards, total_duration)
        total_duration += j
        rollouts.compute_returns(args, values)
        rollouts.cpu()
        save_rols = copy.copy(rollouts)
        save_to_pickle(os.path.join(args.save_dir, "rollouts.pkl"), save_rols)
        reward_total = rollouts.rewards.sum(
            dim=1)[train_models.option_index] / args.num_iters
        print("Rewards for Policy:", reward_total)
def save_actions(self, action_list):
    if len(self.save_path) > 0:
        for i, action in enumerate(action_list):
            self.save_files[i].write(
                str(int(pytorch_model.unwrap(action.squeeze()))) + '\n')
def trainRL(args, save_path, true_environment, train_models, learning_algorithm,
            proxy_environment, proxy_chain, reward_classes, state_class,
            behavior_policy):
    print("#######")
    print("Training Options")
    print("#######")
    # if option_chain is not None: #TODO: implement this
    base_env = proxy_chain[0]
    base_env.set_save(0, args.save_dir, args.save_recycle, args.single_save_dir)
    proxy_environment.initialize(args, proxy_chain, reward_classes, state_class,
                                 behavior_policy)
    if args.save_models:
        if args.env.find("Atari") != -1:
            screen = base_env.screen
            base_env.screen = None
        save_to_pickle(os.path.join(save_path, "env.pkl"), proxy_environment)
        if args.env.find("Atari") != -1:
            base_env.screen = screen
    behavior_policy.initialize(args, proxy_environment.action_size)
    print(reward_classes[0], reward_classes[0].parameter_minmax)
    if not args.load_weights:
        train_models.initialize(args, len(reward_classes), state_class,
                                proxy_environment.action_size,
                                parameter_minmax=reward_classes[0].parameter_minmax)
        proxy_environment.set_models(train_models)
    else:
        print("loading weights", len(reward_classes))
        train_models.initialize(args, len(reward_classes), state_class,
                                proxy_environment.action_size,
                                parameter_minmax=reward_classes[0].parameter_minmax)
        train_models.session(args)
        proxy_environment.duplicate(args)
    train_models.train()
    proxy_environment.set_save(0, args.save_dir, args.save_recycle)
    learning_algorithm.initialize(args, train_models, reward_classes=reward_classes)
    print(proxy_environment.get_names())
    state = pytorch_model.wrap(proxy_environment.getState(), cuda=args.cuda)
    cs, cr = proxy_environment.getHistState()
    hist_state = pytorch_model.wrap(cs, cuda=args.cuda)
    raw_state = base_env.getState()
    resp = proxy_environment.getResp()
    cp_state = proxy_environment.changepoint_state([raw_state])
    # print("initial_state (s, hs, rs, cps)", state, hist_state, raw_state, cp_state)
    # print(cp_state.shape, state.shape, hist_state.shape, state_class.shape)
    print(args.trace_len, args.trace_queue_len)
    args.buffer_clip = max(args.buffer_clip, args.reward_check)
    rollouts = RolloutOptionStorage(
        args.num_processes, (state_class.shape, ), proxy_environment.action_size,
        cr.flatten().shape[0], state.shape, hist_state.shape, args.buffer_steps,
        args.changepoint_queue_len, args.trace_len, args.trace_queue_len,
        args.dilated_stack, args.target_stack, args.dilated_queue_len,
        train_models.currentOptionParam().shape[1:], len(train_models.models),
        cp_state[0].shape, args.lag_num, args.cuda, return_form=args.return_form)
    option_actions = {option.name: collections.Counter()
                      for option in train_models.models}
    total_duration = 0
    total_elapsed = 0
    true_reward = 0
    ep_reward = 0
    sample_schedule = args.sample_schedule
    start = time.time()
    fcnt = 0
    final_rewards = list()
    average_rewards, average_counts = [], []
    option_counter = collections.Counter()
    option_value = collections.Counter()
    # keep the last states until end of trajectory (or until a reset), and dump when a reward is found
    trace_queue = []
    retest = False
    done = False
    for j in range(args.num_iters):
        rollouts.set_parameters(learning_algorithm.current_duration * args.reward_check)
        # print("set_parameters", state)
        raw_actions = []
        rollouts.cuda()
        last_total_steps, total_steps = 0, 0
        s = time.time()
        for step in range(learning_algorithm.current_duration):
            for m in range(args.reward_check):
                fcnt += 1
                total_steps += 1
                current_state, current_resp = proxy_environment.getHistState()
                estate = proxy_environment.getState()
                values, log_probs, action_probs, Q_vals = train_models.determine_action(
                    current_state.unsqueeze(0), current_resp.unsqueeze(0),
                    use_grad=False)
                v, ap, lp, qv = train_models.get_action(values, action_probs,
                                                        log_probs, Q_vals)
                # a = time.time()
                # print("choose action", a-s)
                # print(action_probs, Q_vals, ap, lp, qv)
                action = behavior_policy.take_action(ap, qv)
                cp_state = proxy_environment.changepoint_state([raw_state])
                # print(state, action)
                # print("before_insert", state)
                # print(current_state.reshape((4,84,84))[0].cpu().numpy().shape)
                # cv2.imshow('frame', current_state.reshape((4,84,84))[0].cpu().numpy())
                # if cv2.waitKey(1) & 0xFF == ord('q'):
                #     pass
                # print(action, true_environment.paddle.pos, true_environment.ball.vel, true_environment.ball.pos)
                if args.behavior_policy == "dem" or args.visualize:
                    cv2.imshow('frame', raw_state[0].reshape((84, 84)))
                    if cv2.waitKey(1) & 0xFF == ord('q'):
                        pass
                # cv2.imshow('frame', raw_state[0].reshape((84,84)))
                # if cv2.waitKey(1) & 0xFF == ord('q'):
                #     pass
                rollouts.insert(retest, state, current_state,
                                pytorch_model.wrap(args.greedy_epsilon, cuda=args.cuda),
                                done, current_resp, action, cp_state[0],
                                train_models.currentOptionParam(),
                                train_models.option_index, None, None,
                                action_probs, Q_vals, values)
                rollouts.insert_dilation(proxy_environment.swap)
                retest = False
                # print("step states (cs, ns, cps, act)", current_state, estate, cp_state, action)
                # print("step outputs (val, de, ap, qv, v, ap, qv)", values, dist_entropy, action_probs, Q_vals, v, ap, qv)
                trace_queue.append((current_state.clone().detach(),
                                    action.clone().detach()))
                state, raw_state, resp, done, action_list = proxy_environment.step(
                    action, model=False)
                # render=len(args.record_rollouts) != 0, save_path=args.record_rollouts, itr=fcnt
                # print(action_list)
                # s = time.time()
                # print("step time", s-a)
                # print("after step", state)
                true_reward += base_env.reward
                ep_reward += base_env.reward
                if args.reward_form == 'raw':
                    for rc in reward_classes:
                        rc.insert_reward(base_env.reward)
                        # print(base_env.reward)
                # print(action_list, action)
                # print("step check (al, s)", action_list, state)
                #### logging
                option_actions[train_models.currentName()][int(
                    pytorch_model.unwrap(action.squeeze()))] += 1
                #### logging
                if done:
                    print("Episode Reward: ", ep_reward, " ", fcnt, j)
                    ep_reward = 0
                    if not args.sample_duration > 0 or (args.done_swapping <= j):
                        # print("reached end")
                        # print(step)
                        if args.trace_queue_len > -1:
                            trace_queue = rollouts.insert_trace(trace_queue)
                            trace_queue = []
                        break
                    else:
                        # need to clear out trace queue
                        trace_queue = rollouts.insert_trace(trace_queue)
                        trace_queue = []
                # time.sleep(.1)
            # print(m, args.reward_check)
            # rl = time.time()
            # print("run loop", start - rl)
            rewards = proxy_environment.computeReward(m + 1)
            # print(rewards, proxy_environment.changepoint_queue)
            # print(rewards.sum())
            # a = time.time()
            # print("reward time", a-s)
            change, target = proxy_environment.determineChanged(m + 1)
            proxy_environment.determine_swaps(m + 1, needs_rewards=True)  # doesn't need to generate rewards
            # print("reward time", time.time() - start)
            # print("rewards", torch.sum(rewards))
            # reenter to get next value
            current_state, current_resp = proxy_environment.getHistState()
            values, log_probs, action_probs, Q_vals = train_models.determine_action(
                current_state.unsqueeze(0), current_resp.unsqueeze(0))
            v, ap, lp, qv = train_models.get_action(values, action_probs,
                                                    log_probs, Q_vals)
            action = behavior_policy.take_action(ap, qv)
            trace_queue.append((current_state.clone().detach(),
                                action.clone().detach()))
            cp_state = proxy_environment.changepoint_state([raw_state])
            # inserting the last state and unused action
            rollouts.insert(retest, state, current_state,
                            pytorch_model.wrap(args.greedy_epsilon, cuda=args.cuda),
                            done, current_resp, action, cp_state[0],
                            train_models.currentOptionParam(),
                            train_models.option_index, None, None,
                            action_probs, Q_vals, values)
            retest = True  # need to re-insert value with true state
            # ########
            rollouts.insert_hindsight_target(change, target)
            rollouts.insert_rewards(args, rewards)
            name = train_models.currentName()
            option_counter[name] += m + 1
            option_value[name] += rewards.sum(dim=1)[train_models.option_index]
            last_total_steps = total_steps
            completed = learning_algorithm.interUpdateModel(total_steps, rewards,
                                                            change, done)
            # rw = time.time()
            # print("rewards", rl - rw, start - rw)
            if completed or (done and not args.sample_duration > 0):
                break
            retest = args.buffer_steps > 0 or args.lag_num > 0  # if we roll, don't retest
            # print("steptime", time.time() - start)
            # start = time.time()
            # print(done)
            # print(rollouts.base_rollouts.extracted_state, rollouts.base_rollouts.rewards)
            # print("rew, state", rollouts.rewards[0,-50:], rollouts.extracted_state[-50:])
            # print("inserttime", time.time() - start)
            # print("states and actions (es, cs, a, m)", rollouts.extracted_state, rollouts.current_state, rollouts.actions, rollouts.masks)
            # print("actions and Qvals (qv, vp, ap)", rollouts.Qvals, rollouts.value_preds, rollouts.action_probs)
        # start = time.time()
        total_duration += total_steps
        # if done:
        #     trace_queue = rollouts.insert_trace(trace_queue)
        #     trace_queue = []  # insert first
        # else:
        #     trace_queue = rollouts.insert_trace(trace_queue)
        # print(rollouts.extracted_state)
        # print(rewards)
        # rollouts.compute_returns(args, values)  # don't need to compute returns because they are computed upon reward reception
        # print("returns and rewards (rew, ret)", rollouts.rewards, rollouts.returns)
        # print("returns and return queue", rollouts.returns, rollouts.return_queue)
        # print("reward check (cs, rw, rol rw, rt", rollouts.current_state, rewards, rollouts.rewards, rollouts.returns)
        # print(name, rollouts.extracted_state, rollouts.rewards, rollouts.actions)
        # n = 0
        # for obj in gc.get_objects():
        #     try:
        #         if torch.is_tensor(obj):
        #             n += 1
        #     except:
        #         pass
        # print("learning at", j, n)
        #### logging
        # print(rollouts.base_rollouts.rewards.shape)
        reward_total = rollouts.get_current(
            names=['rewards'])[0][train_models.option_index].sum(dim=0)
        # print("reward_total", reward_total.shape)
        final_rewards.append(reward_total)
        #### logging
        # start = time.time()
        learning_algorithm.step_counter += 1
        if j >= args.warm_up:  # TODO: clean up this to learning algorithm?
            value_loss, action_loss, dist_entropy, output_entropy, entropy_loss, action_log_probs = learning_algorithm.step(
                args, train_models, rollouts)
            if args.dist_interval != -1 and j % args.dist_interval == 0:
                learning_algorithm.distibutional_sparcity_step(args, train_models,
                                                               rollouts)
                # print("di", time.time() - start)
            if args.correlate_steps > 0 and j % args.diversity_interval == 0:
                loss = learning_algorithm.correlate_diversity_step(args,
                                                                   train_models,
                                                                   rollouts)
                # print("corr", time.time() - start)
            if args.greedy_epsilon_decay > 0 and j % args.greedy_epsilon_decay == 0 and j != 0:
                # TODO: more advanced greedy epsilon methods
                behavior_policy.epsilon = max(args.min_greedy_epsilon,
                                              behavior_policy.epsilon * 0.9)
                # print("eps", time.time() - start)
            if args.sample_schedule > 0 and j % sample_schedule == 0 and j != 0:
                learning_algorithm.sample_duration = (
                    j // args.sample_schedule + 1) * args.sample_duration
                learning_algorithm.reset_current_duration(
                    learning_algorithm.sample_duration, args.reward_check)
                args.changepoint_queue_len = max(learning_algorithm.max_duration,
                                                 args.changepoint_queue_len)
                sample_schedule = args.sample_schedule * (
                    j // args.sample_schedule + 1)  # sum([args.sample_schedule * (i+1) for i in range(j // args.sample_schedule + 1)])
            if args.retest_schedule > 0 and j % args.retest_schedule == 0 and j != 0:
                learning_algorithm.retest += 1
                learning_algorithm.reset_current_duration(
                    learning_algorithm.sample_duration, args.reward_check)
                args.changepoint_queue_len = max(learning_algorithm.max_duration,
                                                 args.changepoint_queue_len)
                # print("resample", time.time() - start)
            if j > args.done_swapping:
                learning_algorithm.reset_current_duration(
                    learning_algorithm.sample_duration, args.reward_check)
        else:
            value_loss, action_loss, dist_entropy, output_entropy, entropy_loss, action_log_probs = None, None, None, None, None, None
        parameter = proxy_environment.get_next_parameter()
        if args.reward_swapping:
            parameter = completed
        learning_algorithm.updateModel(parameter)
        # s = time.time()
        # print("learning step time", s-a)
        # n = 0
        # for obj in gc.get_objects():
        #     try:
        #         if torch.is_tensor(obj):
        #             n += 1
        #     except:
        #         pass
        # print("objects at", j, n)
        # print("update", time.time() - start)
        # print("learn time", time.time() - rw)
        if j % args.save_interval == 0 and args.save_models and args.train:
            # no point in saving if not training
            print("=========SAVING MODELS==========")
            train_models.save(save_path)  # TODO: implement save_options
        #### logging
        if j % args.log_interval == 0:
            print("Qvalue and state", pytorch_model.unwrap(Q_vals.squeeze()),
                  pytorch_model.unwrap(current_state.squeeze()))
            print("probs and state", pytorch_model.unwrap(action_probs.squeeze()),
                  pytorch_model.unwrap(current_state.squeeze()))
            for name in train_models.names():
                if option_counter[name] > 0:
                    print(name, option_value[name] / option_counter[name],
                          [option_actions[name][i] / option_counter[name]
                           for i in range(len(option_actions[name]))])
                # if j % (args.log_interval * 20) == 0:
                option_value[name] = 0
                option_counter[name] = 0
                for i in range(len(option_actions[name])):
                    option_actions[name][i] = 0
            end = time.time()
            final_rewards = torch.stack(final_rewards).detach()
            average_rewards.append(final_rewards.sum())
            average_counts.append(total_duration)
            acount = np.sum(average_counts)
            best_reward = true_reward
            true_reward = true_reward / total_steps
            mean_reward = true_reward
            if len(base_env.episode_rewards) > 0:
                true_reward = np.median(base_env.episode_rewards)
                mean_reward = np.mean(base_env.episode_rewards)
                best_reward = np.max(base_env.episode_rewards)
            el, vl, al = unwrap_or_none(entropy_loss), unwrap_or_none(
                value_loss), unwrap_or_none(action_loss)
            total_elapsed += total_duration
            log_stats = "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {}, value loss {}, policy loss {}, average_reward {}, true_reward median: {}, mean: {}, max: {}".format(
                j, total_elapsed, int(total_elapsed / (end - start)),
                final_rewards.mean(), np.median(final_rewards.cpu()),
                final_rewards.min(), final_rewards.max(), el, vl, al,
                torch.stack(average_rewards).sum() / acount, true_reward,
                mean_reward, best_reward)
            if acount > 300:
                average_counts.pop(0)
                average_rewards.pop(0)
            true_reward = 0.0
            print(log_stats)
            final_rewards = list()
            total_duration = 0
            #### logging
    if args.save_models and args.train:  # no point in saving if not training
        print("=========SAVING MODELS==========")
        train_models.save(save_path)  # TODO: implement save_options
    proxy_environment.close_files()
def forward(self, x, reward):
    x = pytorch_model.unwrap(x)
    # print(reward[0], x)
    return self.dope_rainbow.step(reward[0], x)
def remove_mean(imgs, focus, nb_size=(4, 9)):
    in_np = isinstance(imgs, np.ndarray)
    if not in_np:
        imgs = pytorch_model.unwrap(imgs)
    imgs = np.array(imgs)
    focus = (focus * imgs.shape[2:]).astype(int)
    # completion mirrors remove_mean_batch above: subtract the neighborhood mean
    # around each focus point and return in the input's original type
    focus_mean = image_focus_mean(imgs, focus, nb_size)
    imgs = image_focus_subtract(imgs, focus, focus_mean, nb_size)
    return imgs if in_np else torch.from_numpy(imgs).float()
def forward(self, img, ret_numpy=False, ret_extra=False):
    # out = img
    out = self.preprocess(img)
    for layer in self.layers:
        out = layer(out)
    return out if not ret_numpy else pytorch_model.unwrap(out)
def precompute(self, states, actions, resps):
    trajectory = pytorch_model.unwrap(states[:-1, :self.traj_dim])
    saliency_trajectory = pytorch_model.unwrap(states[:-1, self.traj_dim:])
    # print("states shape", trajectory.shape, saliency_trajectory.shape)
    assignments, cps = self.model.get_mode(trajectory, saliency_trajectory)
    return assignments, cps
def step(self, action):
    # TODO: action is tensor, might not be safe assumption
    # t = time.time()
    uaction = pytorch_model.unwrap(action.long())
    raw_state, reward, done, infos = self.screen.step([uaction])
    for info in infos:
        if 'episode' in info.keys():
            self.episode_rewards.append(info['episode']['r'])
    # a = time.time()
    # print("screen step", a - t)
    raw_state = np.squeeze(raw_state)
    # raw_state[:10,:] = 0.0
    self.current_raw = raw_state
    factor_state = {'Action': [[0.0, 0.0], (float(uaction), )]}
    self.current_action = action
    self.reward = reward[0]
    # cv2.imshow('frame', raw_state)
    # if cv2.waitKey(10000) & 0xFF == ord('q'):
    #     pass
    if self.focus_model is not None:
        factor_state = self.focus_model.forward(
            pytorch_model.wrap(raw_state.astype(float) / 255.0,
                               cuda=True).unsqueeze(0).unsqueeze(0),
            ret_numpy=True)
        # t = time.time()
        # print("model step", t - a)
        for key in factor_state.keys():
            factor_state[key] *= 84
            factor_state[key] = (np.squeeze(factor_state[key]), (1.0, ))
    self.factor_state = factor_state
    self.last_action = uaction
    # rs = raw_state.copy()
    time_dict = factor_state
    pval = ""
    for k in time_dict.keys():
        if k != 'Action' and k != 'Reward':
            raw_state[int(time_dict[k][0][0]), :] = 255
            raw_state[:, int(time_dict[k][0][1])] = 255
        if k == 'Action' or k == 'Reward':
            pval += k + ": " + str(time_dict[k][1]) + ", "
        else:
            pval += k + ": " + str(time_dict[k][0]) + ", "
    print(pval[:-2])
    raw_state = cv2.resize(raw_state, (336, 336))
    cv2.imshow('frame', raw_state)
    if cv2.waitKey(1) & 0xFF == ord(' ') & 0xFF == ord('c'):
        pass
    # logging
    if len(self.save_path) > 0:
        if self.recycle > 0:
            state_path = os.path.join(self.save_path,
                                      str((self.itr % self.recycle) // 2000))
            count = self.itr % self.recycle
        else:
            state_path = os.path.join(self.save_path, str(self.itr // 2000))
            count = self.itr
        try:
            os.makedirs(state_path)
        except OSError:
            pass
        if self.itr != 0:
            object_dumps = open(
                os.path.join(self.save_path, "focus_dumps.txt"), 'a')
        else:
            # create file if it does not exist
            object_dumps = open(
                os.path.join(self.save_path, "focus_dumps.txt"), 'w')
        for key in factor_state.keys():
            writeable = list(factor_state[key][0]) + list(factor_state[key][1])
            # TODO: attributes are limited to single floats
            object_dumps.write(
                key + ":" + " ".join([str(fs) for fs in writeable]) + "\t")
        object_dumps.write("\n")  # TODO: recycling does not stop object dumping
        object_dumps.close()
        imio.imsave(
            os.path.join(state_path, "state" + str(count % 2000) + ".png"),
            self.current_raw)
    self.itr += 1
    # print("elapsed ", time.time() - t)
    return raw_state, factor_state, done
def testRL(args, save_path, true_environment, proxy_chain, proxy_environment,
           state_class, behavior_policy, num_actions, reward_classes=None):
    print("#######")
    print("Evaluating Options")
    print("#######")
    # if option_chain is not None: #TODO: implement this
    base_env = proxy_chain[0]
    base_env.set_save(0, args.save_dir, args.save_recycle)
    if reward_classes is not None:
        proxy_environment.reward_fns = reward_classes
    args.changepoint_queue_len = max(args.changepoint_queue_len,
                                     args.num_iters * args.num_update_model)
    proxy_environment.initialize(args, proxy_chain, proxy_environment.reward_fns,
                                 state_class, behavior_policy)
    print(base_env.save_path)
    behavior_policy.initialize(args, num_actions)
    train_models = proxy_environment.models
    train_models.initialize(args, len(reward_classes), state_class, num_actions)
    proxy_environment.duplicate(args)
    proxy_environment.set_save(0, args.save_dir, args.save_recycle)
    state = pytorch_model.wrap(proxy_environment.getState(), cuda=args.cuda)
    resp = proxy_environment.getResp()
    print(state.shape)
    raw_state = base_env.getState()
    cs, cr = proxy_environment.getHistState()
    hist_state = pytorch_model.wrap(cs, cuda=args.cuda)
    cp_state = proxy_environment.changepoint_state([raw_state])
    rollouts = RolloutOptionStorage(
        args.num_processes, (state_class.shape, ), proxy_environment.action_size,
        cr.flatten().shape[0], state.shape, hist_state.shape, args.buffer_steps,
        args.changepoint_queue_len, args.trace_len, args.trace_queue_len,
        args.dilated_stack, args.target_stack, args.dilated_queue_len,
        train_models.currentOptionParam().shape[1:], len(train_models.models),
        cp_state[0].shape, args.lag_num, args.cuda)
    option_actions = {option.name: collections.Counter()
                      for option in train_models.models}
    total_duration = 0
    start = time.time()
    fcnt = 0
    final_rewards = list()
    option_counter = collections.Counter()
    option_value = collections.Counter()
    raw_states = dict()
    ep_reward = 0
    rollouts.set_parameters(args.num_iters * args.num_update_model)
    # if args.num_iters > rollouts.changepoint_queue_len:
    #     rollouts.set_changepoint_queue(args.num_iters)
    done = False
    ctr = 0
    raw_indexes = dict()
    for i in range(args.num_iters):
        train_models.option_index = np.random.randint(train_models.num_options)
        train_models.currentModel().test = True
        if train_models.currentName() not in raw_states:
            raw_states[train_models.currentName()] = []
            raw_indexes[train_models.currentName()] = []
        for j in range(args.num_update_model):
            raw_indexes[train_models.currentName()].append(ctr)
            ctr += 1
            fcnt += 1
            raw_actions = []
            rollouts.cuda()
            current_state, current_resp = proxy_environment.getHistState()
            values, log_probs, action_probs, Q_vals = train_models.determine_action(
                current_state.unsqueeze(0), current_resp.unsqueeze(0))
            v, ap, lp, qv = train_models.get_action(values, action_probs,
                                                    log_probs, Q_vals)
            cp_state = proxy_environment.changepoint_state([raw_state])
            ep_reward += base_env.reward
            # print(ap, qv)
            action = behavior_policy.take_action(ap, qv)
            # print(train_models.currentName(), action, qv.squeeze())
            rollouts.insert(False, state, current_state,
                            pytorch_model.wrap(args.greedy_epsilon, cuda=args.cuda),
                            done, current_resp, action, cp_state[0],
                            train_models.currentOptionParam(),
                            train_models.option_index, None, None, action_probs,
                            Q_vals, values)
            state, raw_state, resp, done, action_list = proxy_environment.step(
                action, model=False)
            # render=len(args.record_rollouts) != 0, save_path=args.record_rollouts, itr=fcnt
            print(train_models.currentName(), j, action)
            cv2.imshow('frame', raw_state[0])
            if cv2.waitKey(50) & 0xFF == ord('q'):
                break
            raw_states[train_models.currentName()].append(raw_state)
            option_actions[train_models.currentName()][int(
                pytorch_model.unwrap(action.squeeze()))] += 1
            if done:
                print("Episode Reward: ", ep_reward, " ", fcnt)
                ep_reward = 0
                # print("reached end")
        # proxy_environment.determine_swaps(length, needs_rewards=True)  # doesn't need to generate rewards
        # rewards = proxy_environment.computeReward(args.num_update_model)
        # print(rewards)
        if len(base_env.episode_rewards) > 0:
            true_reward = np.median(base_env.episode_rewards)
            mean_reward = np.mean(base_env.episode_rewards)
            best_reward = np.max(base_env.episode_rewards)
            print("true reward median: %f, mean: %f, max: %f" %
                  (true_reward, mean_reward, best_reward))
    print(args.num_iters)
    print(action_probs)
    print("Episode Reward: ", ep_reward, " ", fcnt)
    print(proxy_environment.reward_fns)
    rewards = proxy_environment.computeReward(args.num_iters * args.num_update_model)
    # print(rewards.shape)
    # print(rewards.sum())
    rollouts.insert_rewards(args, rewards)
    total_duration += j
    save_rols = copy.deepcopy(rollouts)
    if len(args.save_dir) > 0:
        save_to_pickle(os.path.join(args.save_dir, "rollouts.pkl"), save_rols)
    for i in range(train_models.num_options):
        print(rollouts.base_rollouts.rewards.shape, raw_indexes)
        reward_total = rollouts.base_rollouts.rewards.sum(
            dim=1)[i] / (args.num_iters * args.num_update_model)
        # print(rollouts.base_rollouts.rewards, raw_indexes, rollouts.base_rollouts.rewards.shape)
        reward_adjusted = rollouts.base_rollouts.rewards[
            i, np.array(raw_indexes[train_models.models[i].name]) + args.num_stack
        ].sum(dim=0) / len(raw_indexes[train_models.models[i].name])
        print("Num policy steps:", len(raw_indexes[train_models.models[i].name]))
        print("Rewards during Policy:", reward_adjusted)
        print("Rewards for Policy:", reward_total)