def get_states(args, true_environment, length_constraint=50000, raws=None, dumps=None):
    dataset_path = args.record_rollouts
    changepoint_path = args.changepoint_dir
    option_chain = OptionChain(true_environment, args.changepoint_dir, args.train_edge, args)
    environments = option_chain.initialize(args)
    print(environments)
    proxy_environment = environments.pop(-1)
    head, tail = get_edge(args.train_edge)
    if len(environments) > 1:
        # there is a difference in the properties of a proxy environment and the true environment
        num_actions = len(environments[-1].reward_fns)
    else:
        num_actions = environments[-1].num_actions
    state_class = GetState(head, state_forms=list(zip(args.state_names, args.state_forms)))
    use_raw = 'raw' in args.state_forms
    state_class.minmax = compute_minmax(state_class, dataset_path)
    states, resps, raws, dumps = load_states(state_class.get_state, dataset_path,
                                             length_constraint=length_constraint,
                                             use_raw=use_raw, raws=raws, dumps=dumps)
    return states, resps, num_actions, state_class, environments, raws, dumps
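# Illustrative call site (a sketch, not taken from the original code). It assumes a parsed
# `args` namespace exposing the fields read above (record_rollouts, changepoint_dir,
# train_edge, state_names, state_forms) and an already-constructed `true_environment`:
#
#   states, resps, num_actions, state_class, environments, raws, dumps = \
#       get_states(args, true_environment, length_constraint=50000)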
class BlockReward(ChangepointReward):
    # Reward defined over which block changed, relative to a target block position (self.parameters).
    def __init__(self, args):
        super().__init__(None, args)
        self.form = args.reward_form
        self.state_class = GetState(target="Block",
                                    state_forms=[("Block", "multifull"), ("Ball", "bounds")])  # should be a block state class
        self.parameters = np.array([0, 0])
        self.max_dist = np.linalg.norm([60, 20])
        self.cuda = args.cuda
        self.parameter_minmax = [np.array([0, 0]), np.array([84, 84])]

    def compute_reward(self, states, actions, resps):
        rewards = torch.zeros(len(states))
        change_indexes, ats, st = self.state_class.determine_delta_target(pytorch_model.unwrap(states))
        if len(change_indexes) > 0:
            # shaped reward: 1 at the target parameter, decaying linearly to 0 at max_dist
            dists = np.linalg.norm(self.parameters - st[0])
            rewards[change_indexes[0]] = (self.max_dist - dists) / self.max_dist
        rewards[states[:, -2] == 79] = -1.0
        if self.cuda:
            rewards = rewards.cuda()
        return rewards

    def determineChanged(self, states, actions, resps):
        change_indexes, ats, states = self.state_class.determine_delta_target(pytorch_model.unwrap(states))
        change = len(change_indexes) > 0
        if change:
            return change, states[0]
        return change, None

    def get_possible_parameters(self, state):
        last_shape = self.state_class.shapes[(self.state_class.names[0], self.state_class.fnames[0])][0]
        state = state[:last_shape]
        # print(state, last_shape, state.shape, self.state_class.shapes[(self.state_class.names[-1], self.state_class.fnames[-1])])
        state = state.view(-1, 3)
        idxes = state[:, 2].nonzero()[:, 0].squeeze()
        # print(idxes, state[idxes,:2])
        return state[idxes, :2]

    def get_trajectories(self, full_states):
        states = []
        resps = []
        for state in full_states:
            state, resp = self.state_class.get_state(state)
            states.append(state)
            resps.append(resp)
        return pytorch_model.wrap(np.stack(states), cuda=self.cuda)
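# A minimal, self-contained sketch of the shaping used in BlockReward.compute_reward above
# (illustration only, not part of the original class): the reward is 1.0 when the changed
# block lies exactly at the target parameter and decays linearly to 0.0 at max_dist,
# where max_dist = ||(60, 20)|| matches the constant set in __init__.
import numpy as np

def _block_shaped_reward(target_xy, changed_xy, max_dist=np.linalg.norm([60, 20])):
    # linear decay with Euclidean distance from the target block position
    dist = np.linalg.norm(np.asarray(target_xy, dtype=float) - np.asarray(changed_xy, dtype=float))
    return (max_dist - dist) / max_dist

# _block_shaped_reward([0, 0], [0, 0])   -> 1.0
# _block_shaped_reward([0, 0], [60, 20]) -> 0.0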
def __init__(self, model, args):
    '''
    model is a changepoint model
    '''
    self.name = args.train_edge
    self.head, self.tail = get_edge(args.train_edge)
    self.model = model
    self.cuda = args.cuda
    self.traj_dim = 2  # TODO: the dimension of the input trajectory is currently pre-set at 2, the dim of a location. Once we figure out dynamic setting, this can change
    self.parameter_minmax = [np.array([0]), np.array([84])]  # TODO: where does this come from?
    self.state_class = GetState(0, self.head,
                                state_forms=[(self.head, 'bounds'),
                                             *[(tail, 'bounds') for tail in self.tail]])  # TODO: technically, multibounds for both
def generate_soft_dataset(states, resps, true_environment, reward_fns, args):
    # Runs each trained option over the recorded states in minibatches of 30 and collects
    # its action probabilities as soft targets, smoothing them before returning.
    pre_load_weights = args.load_weights
    args.load_weights = True
    option_chain = OptionChain(true_environment, args.changepoint_dir, args.train_edge, args)
    print(args.load_weights)
    environments = option_chain.initialize(args)
    proxy_environment = environments.pop(-1)
    proxy_chain = environments
    train_models = proxy_environment.models
    head, tail = get_edge(args.train_edge)
    if len(environments) > 1:
        # there is a difference in the properties of a proxy environment and the true environment
        num_actions = len(environments[-1].reward_fns)
    else:
        num_actions = environments[-1].num_actions
    state_class = GetState(head, state_forms=list(zip(args.state_names, args.state_forms)))
    proxy_environment.initialize(args, proxy_chain, reward_fns, state_class, behavior_policy=None)
    train_models.initialize(args, len(reward_fns), state_class, num_actions)
    train_models.session(args)
    proxy_environment.duplicate(args)  # assumes that we are loading weights
    args.load_weights = pre_load_weights
    soft_actions = [[] for i in range(train_models.num_options)]
    for oidx in range(train_models.num_options):
        train_models.option_index = oidx
        if args.model_form == 'population':
            train_models.currentModel().use_mean = True
        for i in range(len(states) // 30 + 1):
            state = states[i * 30:(i + 1) * 30]
            resp = resps[i * 30:(i + 1) * 30]
            values, dist_entropy, action_probs, Q_vals = train_models.determine_action(
                pytorch_model.wrap(state, cuda=args.cuda),
                pytorch_model.wrap(resp, cuda=args.cuda))
            # print(action_probs)
            values, action_probs, Q_vals = train_models.get_action(values, action_probs, Q_vals)
            soft_actions[oidx] += pytorch_model.unwrap(action_probs).tolist()
    print("soft actions", np.sum(np.array(soft_actions[0]), axis=0))
    for i in range(len(soft_actions)):
        soft_actions[i] = smooth_weight(soft_actions[i], args.weighting_lambda)
    return np.array(soft_actions)
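# smooth_weight is defined elsewhere in the codebase. For illustration only, the sketch
# below shows one common way to smooth per-state action probabilities (mixing toward a
# uniform distribution); this is an assumption about, not a copy of, what smooth_weight does.
import numpy as np

def _smooth_probs(action_probs, lam):
    p = np.asarray(action_probs, dtype=float)      # shape: (num_states, num_actions)
    uniform = np.full_like(p, 1.0 / p.shape[-1])   # uniform distribution over actions
    return (1.0 - lam) * p + lam * uniform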
if args.load_weights:
    print(proxy_environment.models.cuda)
    proxy_environment.models.cuda(device=args.gpu)
    train_models = proxy_environment.models
else:
    train_models = MultiOption(len(reward_paths), models[args.model_form])
proxy_chain = environments
if len(environments) > 1:
    # there is a difference in the properties of a proxy environment and the true environment
    num_actions = len(environments[-1].reward_fns)
else:
    num_actions = environments[-1].num_actions
print(args.state_names, args.state_forms)
state_class = GetState(head, state_forms=list(zip(args.state_names, args.state_forms)))
state_class.minmax = compute_minmax(state_class, dataset_path, filename=args.focus_dumps_name)
if args.normalize:
    # hand-set normalization ranges: 'prox' features are signed offsets in [-84, 84],
    # 'bounds' features are pixel coordinates in [0, 84]
    minv = []
    maxv = []
    for f in args.state_forms:
        if f == 'prox':
            minv += [-84, -84]
            maxv += [84, 84]
        elif f == 'bounds':
            minv += [0, 0]
            maxv += [84, 84]
    state_class.minmax = np.stack((np.array(minv), np.array(maxv)))
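# Sketch of how stacked (min, max) bounds like the ones assembled above would typically
# be applied to map a state vector into [0, 1]. The normalization actually performed by
# the consuming code is not shown here, so the formula below is an assumption.
import numpy as np

minmax = np.stack((np.array([0.0, 0.0]), np.array([84.0, 84.0])))  # 'bounds' features on an 84x84 frame
state = np.array([42.0, 21.0])
normalized = (state - minmax[0]) / np.maximum(minmax[1] - minmax[0], 1e-8)  # guard against zero range
# normalized -> array([0.5 , 0.25])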
dataset_path = args.record_rollouts
changepoint_path = args.changepoint_dir
option_chain = OptionChain(true_environment, args.changepoint_dir, args.train_edge, args)
reward_paths = glob.glob(os.path.join(option_chain.save_dir, "*rwd.pkl"))
print(reward_paths)
reward_paths.sort(key=lambda x: int(x.split("__")[2]))
head, tail = get_edge(args.train_edge)
reward_classes = [load_from_pickle(pth) for pth in reward_paths]
# train_models = MultiOption(1, BasicModel)
train_models = MultiOption(len(reward_paths), models[args.model_form])
# learning_algorithm = DQN_optimizer()
learning_algorithm = learning_algorithms[args.optimizer_form]()
# learning_algorithm = DDPG_optimizer()
environments = option_chain.initialize(args)
print(environments)
proxy_environment = environments.pop(-1)
proxy_chain = environments
if len(environments) > 1:
    # there is a difference in the properties of a proxy environment and the true environment
    num_actions = len(environments[-1].reward_fns)
else:
    num_actions = environments[-1].num_actions
print(args.state_names, args.state_forms)
state_class = GetState(head, state_forms=list(zip(args.state_names, args.state_forms)))
state_class.minmax = compute_minmax(state_class, dataset_path)
behavior_policy = behavior_policies[args.behavior_policy]()
# behavior_policy = EpsilonGreedyProbs()
trainRL(args, option_chain.save_dir, true_environment, train_models, learning_algorithm,
        proxy_environment, proxy_chain, reward_classes, state_class, behavior_policy=behavior_policy)
    net_params=net_params,
)
paddle_model.set_parameters(params)
ball_model_net_params_path = 'ObjectRecognition/net_params/two_layer.json'
net_params = json.loads(open(ball_model_net_params_path).read())
params = load_param('ObjectRecognition/models/ball.npy')
ball_model = ModelFocusCNN(
    image_shape=(84, 84),
    net_params=net_params,
)
ball_model.set_parameters(params)
model = ModelCollectionDAG()
model.add_model('Paddle', paddle_model, [], augment_fn=util.remove_mean)
model.add_model('Ball', ball_model, ['Paddle'])
print(model)
state_function = GetState('Action', state_forms=[("Action", "feature")])
states, resps, raws, dumps = load_states(state_function.get_state, args.record_rollouts, use_raw=True)
print(states)
raws = pytorch_model.wrap(np.array(raws), cuda=False).unsqueeze(1)
dumps = model.forward(raws, ret_numpy=True)
focus_dumps = [{} for _ in range(len(states))]
for key in dumps:
    for i, val in enumerate(dumps[key]):
        focus_dumps[i][key] = val
focus_dumps_file = open(os.path.join(args.record_rollouts, "focus_dumps.txt"), 'w')
for action, factor_state in zip(states, focus_dumps):
    for key in factor_state.keys():
        focus_dumps_file.write(
reward_classes = [block_rewards()]
# reward_classes = [bounce_rewards(0), bounce_rewards(1), bounce_rewards(2), bounce_rewards(3)]
train_models = MultiOption(len(reward_paths), models[args.model_form])
learning_algorithm = learning_algorithms[args.optimizer_form]()
environments = option_chain.initialize(args)
environments.pop(-1)
proxy_chain = environments
if len(environments) > 1:
    # there is a difference in the properties of a proxy environment and the true environment
    num_actions = len(environments[-1].reward_fns)
else:
    num_actions = environments[-1].num_actions
print(args.state_names, args.state_forms)
state_class = GetState(num_actions, tail, state_forms=list(zip(args.state_names, args.state_forms)))
state_class.minmax = compute_minmax(state_class, dataset_path)
print(state_class.minmax)
behavior_policy = EpsilonGreedyQ()
# behavior_policy = EpsilonGreedyProbs()
trainRL(args, option_chain.save_dir, true_environment, train_models, learning_algorithm,
        proxy_chain, reward_classes, state_class, behavior_policy=behavior_policy)