def get_states(args,
               true_environment,
               length_constraint=50000,
               raws=None,
               dumps=None):
    dataset_path = args.record_rollouts
    changepoint_path = args.changepoint_dir
    option_chain = OptionChain(true_environment, args.changepoint_dir,
                               args.train_edge, args)
    environments = option_chain.initialize(args)
    print(environments)
    proxy_environment = environments.pop(-1)
    head, tail = get_edge(args.train_edge)
    if len(environments) > 1:  # there is a difference in the properties of a proxy environment and the true environment
        num_actions = len(environments[-1].reward_fns)
    else:
        num_actions = environments[-1].num_actions
    state_class = GetState(head,
                           state_forms=list(
                               zip(args.state_names, args.state_forms)))
    use_raw = 'raw' in args.state_forms
    state_class.minmax = compute_minmax(state_class, dataset_path)
    states, resps, raws, dumps = load_states(
        state_class.get_state,
        dataset_path,
        length_constraint=length_constraint,
        use_raw=use_raw,
        raws=raws,
        dumps=dumps)
    return states, resps, num_actions, state_class, environments, raws, dumps
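
A minimal sketch of how get_states might be invoked (the argument values are hypothetical; args is assumed to supply record_rollouts, changepoint_dir, train_edge, state_names, and state_forms, as used in the body above):

states, resps, num_actions, state_class, environments, raws, dumps = get_states(
    args, true_environment, length_constraint=10000)
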
Example 3
class BlockReward(ChangepointReward):
    def __init__(self, args):
        super().__init__(None, args)
        self.form = args.reward_form
        self.state_class = GetState(target="Block",
                                    state_forms=[
                                        ("Block", "multifull"),
                                        ("Ball", "bounds")
                                    ])  # should be a block state class
        self.parameters = np.array([0, 0])
        self.max_dist = np.linalg.norm([60, 20])
        self.cuda = args.cuda
        self.parameter_minmax = [np.array([0, 0]), np.array([84, 84])]

    def compute_reward(self, states, actions, resps):
        rewards = torch.zeros(len(states))
        change_indexes, ats, st = self.state_class.determine_delta_target(
            pytorch_model.unwrap(states))
        if len(change_indexes) > 0:
            dists = np.linalg.norm(self.parameters - st[0])
            rewards[change_indexes[0]] = (self.max_dist -
                                          dists) / self.max_dist
        rewards[states[:, -2] == 79] = -1.0
        if self.cuda:
            rewards = rewards.cuda()
        return rewards

    def determineChanged(self, states, actions, resps):
        change_indexes, ats, states = self.state_class.determine_delta_target(
            pytorch_model.unwrap(states))
        change = len(change_indexes) > 0
        if change:
            return change, states[0]
        return change, None

    def get_possible_parameters(self, state):
        last_shape = self.state_class.shapes[(self.state_class.names[0],
                                              self.state_class.fnames[0])][0]
        state = state[:last_shape]
        # print(state, last_shape, state.shape, self.state_class.shapes[(self.state_class.names[-1], self.state_class.fnames[-1])])
        state = state.view(-1, 3)
        idxes = state[:, 2].nonzero()[:, 0].squeeze()
        # print(idxes, state[idxes,:2])
        return state[idxes, :2]

    def get_trajectories(self, full_states):
        states = []
        resps = []
        for state in full_states:
            state, resp = self.state_class.get_state(state)
            states.append(state)
            resps.append(resp)
        return pytorch_model.wrap(np.stack(states), cuda=self.cuda)
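
The shaping term in compute_reward above scales linearly with the distance between self.parameters and the state at the first detected change (st[0]); a small numeric sketch (the positions are hypothetical):

import numpy as np

max_dist = np.linalg.norm([60, 20])                            # ~63.2, as set in __init__
dist = np.linalg.norm(np.array([0, 0]) - np.array([30, 10]))   # ~31.6 to a hypothetical change at (30, 10)
reward = (max_dist - dist) / max_dist                          # ~0.5; closer changes earn rewards nearer 1
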
Example 4
 def __init__(self, model, args):
     '''
     model is a changepoint model
     '''
     self.name = args.train_edge
     self.head, self.tail = get_edge(args.train_edge)
     self.model = model
     self.cuda = args.cuda
     self.traj_dim = 2 #TODO: the dimension of the input trajectory is currently pre-set at 2, the dim of a location. Once we figure out dynamic setting, this can change
     self.parameter_minmax = [np.array([0]), np.array([84])] # TODO: where does this come from?
     self.state_class = GetState(0, self.head, state_forms=[(self.head, 'bounds'), *[(tail, 'bounds') for tail in self.tail]]) # TODO: technically, multibounds for both
Example 5
def generate_soft_dataset(states, resps, true_environment, reward_fns, args):
    pre_load_weights = args.load_weights
    args.load_weights = True
    option_chain = OptionChain(true_environment, args.changepoint_dir,
                               args.train_edge, args)
    print(args.load_weights)
    environments = option_chain.initialize(args)
    proxy_environment = environments.pop(-1)
    proxy_chain = environments
    train_models = proxy_environment.models
    head, tail = get_edge(args.train_edge)
    if len(environments) > 1:  # there is a difference in the properties of a proxy environment and the true environment
        num_actions = len(environments[-1].reward_fns)
    else:
        num_actions = environments[-1].num_actions
    state_class = GetState(head,
                           state_forms=list(
                               zip(args.state_names, args.state_forms)))
    proxy_environment.initialize(args,
                                 proxy_chain,
                                 reward_fns,
                                 state_class,
                                 behavior_policy=None)

    train_models.initialize(args, len(reward_fns), state_class, num_actions)
    train_models.session(args)
    proxy_environment.duplicate(args)  # assumes that we are loading weights
    args.load_weights = pre_load_weights

    soft_actions = [[] for i in range(train_models.num_options)]
    for oidx in range(train_models.num_options):
        train_models.option_index = oidx
        if args.model_form == 'population':
            train_models.currentModel().use_mean = True
        for i in range(len(states) // 30 + 1):
            state = states[i * 30:(i + 1) * 30]
            resp = resps[i * 30:(i + 1) * 30]
            values, dist_entropy, action_probs, Q_vals = train_models.determine_action(
                pytorch_model.wrap(state, cuda=args.cuda),
                pytorch_model.wrap(resp, cuda=args.cuda))
            # print (action_probs)
            values, action_probs, Q_vals = train_models.get_action(
                values, action_probs, Q_vals)
            soft_actions[oidx] += pytorch_model.unwrap(action_probs).tolist()
    print("soft actions", np.sum(np.array(soft_actions[0]), axis=0))
    for i in range(len(soft_actions)):
        soft_actions[i] = smooth_weight(soft_actions[i], args.weighting_lambda)
    return np.array(soft_actions)
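
The inner loop above walks the states in fixed slices of 30; the same slicing pattern in isolation (the length 95 is arbitrary) shows why the extra +1 iteration is needed to pick up a trailing partial chunk:

chunks = [list(range(95))[i * 30:(i + 1) * 30] for i in range(95 // 30 + 1)]
# chunk lengths: [30, 30, 30, 5]; the final slice is empty when the length is an exact multiple of 30
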
Example 6
 if args.load_weights:
     print(proxy_environment.models.cuda)
     proxy_environment.models.cuda(device=args.gpu)
     train_models = proxy_environment.models
 else:
     train_models = MultiOption(len(reward_paths), models[args.model_form])
 proxy_chain = environments
 if len(environments) > 1:  # there is a difference in the properties of a proxy environment and the true environment
     num_actions = len(environments[-1].reward_fns)
 else:
     num_actions = environments[-1].num_actions
 print(args.state_names, args.state_forms)
 state_class = GetState(head,
                        state_forms=list(
                            zip(args.state_names, args.state_forms)))
 state_class.minmax = compute_minmax(state_class,
                                     dataset_path,
                                     filename=args.focus_dumps_name)
 if args.normalize:
     minv = []
     maxv = []
     for f in args.state_forms:
         if f == 'prox':
             minv += [-84, -84]
             maxv += [84, 84]
         elif f == 'bounds':
             minv += [0, 0]
             maxv += [84, 84]
     state_class.minmax = np.stack((np.array(minv), np.array(maxv)))
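
For instance, with a hypothetical args.state_forms of ['prox', 'bounds'], the loop above builds minv = [-84, -84, 0, 0] and maxv = [84, 84, 84, 84], so the stacked minmax is a 2 x 4 array:

import numpy as np

minv, maxv = [-84, -84, 0, 0], [84, 84, 84, 84]
normalization_minmax = np.stack((np.array(minv), np.array(maxv)))  # shape (2, 4): row 0 = mins, row 1 = maxes
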
Example 7
    dataset_path = args.record_rollouts
    changepoint_path = args.changepoint_dir
    option_chain = OptionChain(true_environment, args.changepoint_dir, args.train_edge, args)
    reward_paths = glob.glob(os.path.join(option_chain.save_dir, "*rwd.pkl"))
    print(reward_paths)
    reward_paths.sort(key=lambda x: int(x.split("__")[2]))

    head, tail = get_edge(args.train_edge)

    reward_classes = [load_from_pickle(pth) for pth in reward_paths]
    # train_models = MultiOption(1, BasicModel)
    train_models = MultiOption(len(reward_paths), models[args.model_form])
    # learning_algorithm = DQN_optimizer()
    learning_algorithm = learning_algorithms[args.optimizer_form]()
    # learning_algorithm = DDPG_optimizer()
    environments = option_chain.initialize(args)
    print(environments)
    proxy_environment = environments.pop(-1)
    proxy_chain = environments
    if len(environments) > 1: # there is a difference in the properties of a proxy environment and the true environment
        num_actions = len(environments[-1].reward_fns)
    else:
        num_actions = environments[-1].num_actions
    print(args.state_names, args.state_forms)
    state_class = GetState(head, state_forms=list(zip(args.state_names, args.state_forms)))
    state_class.minmax = compute_minmax(state_class, dataset_path)
    behavior_policy = behavior_policies[args.behavior_policy]()
    # behavior_policy = EpsilonGreedyProbs()
    trainRL(args, option_chain.save_dir, true_environment, train_models, learning_algorithm, proxy_environment,
            proxy_chain, reward_classes, state_class, behavior_policy=behavior_policy)
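
The sort of reward_paths above assumes double-underscore-delimited pickle names whose third field is a numeric index; a hypothetical illustration of the key:

paths = ['rollouts/Ball__Paddle__2__rwd.pkl', 'rollouts/Ball__Paddle__0__rwd.pkl']  # made-up file names
paths.sort(key=lambda x: int(x.split("__")[2]))
# -> ['rollouts/Ball__Paddle__0__rwd.pkl', 'rollouts/Ball__Paddle__2__rwd.pkl']
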
Example 8
     net_params=net_params,
 )
 paddle_model.set_parameters(params)
 ball_model_net_params_path = 'ObjectRecognition/net_params/two_layer.json'
 net_params = json.loads(open(ball_model_net_params_path).read())
 params = load_param('ObjectRecognition/models/ball.npy')
 ball_model = ModelFocusCNN(
     image_shape=(84, 84),
     net_params=net_params,
 )
 ball_model.set_parameters(params)
 model = ModelCollectionDAG()
 model.add_model('Paddle', paddle_model, [], augment_fn=util.remove_mean)
 model.add_model('Ball', ball_model, ['Paddle'])
 print(model)
 state_function = GetState('Action', state_forms=[("Action", "feature")])
 states, resps, raws, dumps = load_states(state_function.get_state,
                                          args.record_rollouts,
                                          use_raw=True)
 print(states)
 raws = pytorch_model.wrap(np.array(raws), cuda=False).unsqueeze(1)
 dumps = model.forward(raws, ret_numpy=True)
 focus_dumps = [{} for _ in range(len(states))]
 for key in dumps:
     for i, val in enumerate(dumps[key]):
         focus_dumps[i][key] = val
 focus_dumps_file = open(
     os.path.join(args.record_rollouts, "focus_dumps.txt"), 'w')
 for action, factor_state in zip(states, focus_dumps):
     for key in factor_state.keys():
         focus_dumps_file.write(
    reward_classes = [block_rewards()]
    # reward_classes = [bounce_rewards(0), bounce_rewards(1), bounce_rewards(2), bounce_rewards(3)]
    train_models = MultiOption(len(reward_paths), models[args.model_form])
    learning_algorithm = learning_algorithms[args.optimizer_form]()
    environments = option_chain.initialize(args)
    environments.pop(-1)
    proxy_chain = environments
    if len(environments) > 1:  # there is a difference in the properties of a proxy environment and the true environment
        num_actions = len(environments[-1].reward_fns)
    else:
        num_actions = environments[-1].num_actions
    print(args.state_names, args.state_forms)
    state_class = GetState(num_actions,
                           tail,
                           state_forms=list(
                               zip(args.state_names, args.state_forms)))
    state_class.minmax = compute_minmax(state_class, dataset_path)
    print(state_class.minmax)
    behavior_policy = EpsilonGreedyQ()
    # behavior_policy = EpsilonGreedyProbs()
    trainRL(args,
            option_chain.save_dir,
            true_environment,
            train_models,
            learning_algorithm,
            proxy_chain,
            reward_classes,
            state_class,
            behavior_policy=behavior_policy)
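
Across these snippets, state_forms is always a list of (object name, form) pairs; when it comes from the command line, the zip used above simply pairs the two argument lists (the values here are hypothetical):

state_names = ['Paddle', 'Ball']
state_forms = ['bounds', 'prox']
pairs = list(zip(state_names, state_forms))  # [('Paddle', 'bounds'), ('Ball', 'prox')]
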