def get_states(args,
               true_environment,
               length_constraint=50000,
               raws=None,
               dumps=None):
    dataset_path = args.record_rollouts
    changepoint_path = args.changepoint_dir
    option_chain = OptionChain(true_environment, args.changepoint_dir,
                               args.train_edge, args)
    environments = option_chain.initialize(args)
    print(environments)
    proxy_environment = environments.pop(-1)
    head, tail = get_edge(args.train_edge)
    # there is a difference in the properties of a proxy environment and the true environment
    if len(environments) > 1:
        num_actions = len(environments[-1].reward_fns)
    else:
        num_actions = environments[-1].num_actions
    state_class = GetState(head,
                           state_forms=list(
                               zip(args.state_names, args.state_forms)))
    use_raw = 'raw' in args.state_forms
    state_class.minmax = compute_minmax(state_class, dataset_path)
    states, resps, raws, dumps = load_states(
        state_class.get_state,
        dataset_path,
        length_constraint=length_constraint,
        use_raw=use_raw,
        raws=raws,
        dumps=dumps)
    return states, resps, num_actions, state_class, environments, raws, dumps
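
A minimal usage sketch, not part of the original code: it assumes `args` comes from the project's own argument parser with the fields used above (record_rollouts, changepoint_dir, train_edge, state_names, state_forms) and that `true_environment` has already been constructed; the length_constraint value is hypothetical.

# Hypothetical call; the returned tuple mirrors the return statement above.
states, resps, num_actions, state_class, environments, raws, dumps = get_states(
    args, true_environment, length_constraint=10000)
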
def pretrain(args, true_environment, desired, num_actions, state_class, states,
             resps, targets, criteria, reward_fns):
    # args = get_args()
    # true_environment = Paddle()
    # true_environment = PaddleNoBlocks()
    dataset_path = args.record_rollouts
    changepoint_path = args.changepoint_dir
    option_chain = OptionChain(true_environment, args.changepoint_dir,
                               args.train_edge, args)
    environments = option_chain.initialize(args)
    proxy_environment = environments.pop(-1)
    proxy_chain = environments
    if args.load_weights:
        train_models = proxy_environment.models
    else:
        train_models = MultiOption(1, models[args.model_form])
    head, tail = get_edge(args.train_edge)
    print(args.state_names, args.state_forms)
    print(state_class.minmax)
    # behavior_policy = EpsilonGreedyProbs()
    save_dir = args.save_graph
    if args.save_graph == "graph":
        save_dir = option_chain.save_dir
    proxy_environment.initialize(args,
                                 proxy_chain,
                                 reward_fns,
                                 state_class,
                                 behavior_policy=None)
    fit(args, save_dir, true_environment, train_models, state_class, desired,
        states, resps, targets, num_actions, criteria, proxy_environment,
        reward_fns)
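
A hedged sketch of how pretrain might be wired up, assuming states, resps, num_actions and state_class come from get_states above, while desired, targets, criteria and reward_fns are built elsewhere in the project (all names are taken from the parameter list, none are invented here).

# Hypothetical call; every argument is assumed to already exist.
pretrain(args, true_environment, desired, num_actions, state_class, states,
         resps, targets, criteria, reward_fns)
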
Example 3
 def __init__(self, args, reward_function, minmax=None):
     '''
     wraps a novelty reward over an existing reward function
     '''
     self.name = reward_function.name
     self.head, self.tail = get_edge(args.train_edge)
     self.reward_function = reward_function
     self.cuda = args.cuda
     # TODO: the dimension of the input trajectory is currently pre-set at 2, the
     # dim of a location; once we figure out dynamic setting, this can change
     self.traj_dim = reward_function.traj_dim
     self.novelty_decay = args.novelty_decay
Example 4
 def __init__(self, model, args):
     '''
     model is a changepoint model
     '''
     self.name = args.train_edge
     self.head, self.tail = get_edge(args.train_edge)
     self.model = model
     self.cuda = args.cuda
     # TODO: the dimension of the input trajectory is currently pre-set at 2, the
     # dim of a location; once we figure out dynamic setting, this can change
     self.traj_dim = 2
     self.parameter_minmax = [np.array([0]), np.array([84])]  # TODO: where does this come from?
     # TODO: technically, multibounds for both
     self.state_class = GetState(0, self.head, state_forms=[
         (self.head, 'bounds'), *[(tail, 'bounds') for tail in self.tail]])
def get_option_actions(pth, train_edge, num_actions, weighting_lambda, length_constraint=50000):
    head, tail = get_edge(train_edge)
    # read the recorded actions for the tail object, keeping only the most
    # recent `length_constraint` of them
    actions = []
    with open(os.path.join(pth, tail[0] + "_actions.txt"), 'r') as action_file:
        for act in action_file:
            actions.append(int(act))
            if len(actions) > length_constraint:
                actions.pop(0)
    actions = hot_actions(actions, num_actions)
    actions = smooth_weight(actions, weighting_lambda)
    return actions
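
A small usage sketch, assuming the same args object used elsewhere on this page and that the rollout directory in args.record_rollouts contains the "<tail>_actions.txt" file read above; num_actions is taken from get_states.

# Hypothetical call; returns one-hot actions smoothed by smooth_weight.
option_actions = get_option_actions(
    args.record_rollouts, args.train_edge, num_actions, args.weighting_lambda)
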
def generate_soft_dataset(states, resps, true_environment, reward_fns, args):
    pre_load_weights = args.load_weights
    args.load_weights = True
    option_chain = OptionChain(true_environment, args.changepoint_dir,
                               args.train_edge, args)
    print(args.load_weights)
    environments = option_chain.initialize(args)
    proxy_environment = environments.pop(-1)
    proxy_chain = environments
    train_models = proxy_environment.models
    head, tail = get_edge(args.train_edge)
    # there is a difference in the properties of a proxy environment and the true environment
    if len(environments) > 1:
        num_actions = len(environments[-1].reward_fns)
    else:
        num_actions = environments[-1].num_actions
    state_class = GetState(head,
                           state_forms=list(
                               zip(args.state_names, args.state_forms)))
    proxy_environment.initialize(args,
                                 proxy_chain,
                                 reward_fns,
                                 state_class,
                                 behavior_policy=None)

    train_models.initialize(args, len(reward_fns), state_class, num_actions)
    train_models.session(args)
    proxy_environment.duplicate(args)  # assumes that we are loading weights
    args.load_weights = pre_load_weights

    soft_actions = [[] for i in range(train_models.num_options)]
    for oidx in range(train_models.num_options):
        train_models.option_index = oidx
        if args.model_form == 'population':
            train_models.currentModel().use_mean = True
        # evaluate this option's policy in chunks of 30 states; ceil-divide so an
        # exact multiple of 30 does not produce an empty final chunk
        for i in range((len(states) + 29) // 30):
            state = states[i * 30:(i + 1) * 30]
            resp = resps[i * 30:(i + 1) * 30]
            values, dist_entropy, action_probs, Q_vals = train_models.determine_action(
                pytorch_model.wrap(state, cuda=args.cuda),
                pytorch_model.wrap(resp, cuda=args.cuda))
            # print (action_probs)
            values, action_probs, Q_vals = train_models.get_action(
                values, action_probs, Q_vals)
            soft_actions[oidx] += pytorch_model.unwrap(action_probs).tolist()
    print("soft actions", np.sum(np.array(soft_actions[0]), axis=0))
    for i in range(len(soft_actions)):
        soft_actions[i] = smooth_weight(soft_actions[i], args.weighting_lambda)
    return np.array(soft_actions)
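
A hedged usage sketch, assuming states and resps come from get_states and reward_classes holds the loaded reward functions, as in the other snippets on this page.

# Hypothetical call; yields an array of smoothed action probabilities per option.
soft_dataset = generate_soft_dataset(
    states, resps, true_environment, reward_classes, args)
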
 def __init__(self, args, direc=0):
     super().__init__(None, args)
     self.traj_dim = 2  # SET THIS
     self.head, self.tail = get_edge(args.train_edge)
     self.name = args.reward_form
     self.anydir = direc == -1
     self.dir = None
     # fixed 2D direction vectors indexed by direc; direc == -1 (anydir) leaves
     # self.dir as None
     direction_vectors = {
         0: np.array([0, 0]),
         1: np.array([0, -1]),
         2: np.array([0, 1]),
         3: np.array([-1, 0]),
         4: np.array([1, 0]),
     }
     if direc in direction_vectors:
         self.dir = pytorch_model.wrap(direction_vectors[direc], cuda=args.cuda)
         self.dir.requires_grad = False
     self.epsilon = 1e-3
Example 8
 def __init__(self, train_edge):
     self.head, self.tail = get_edge(train_edge)
Example 9
        else:
            true_environment = FocusEnvironment(model,
                                                display=args.display_focus)
    elif args.env.find('Atari') != -1:
        true_environment = FocusAtariEnvironment(model,
                                                 args.env[len("Atari"):],
                                                 args.seed, 0, args.save_dir)
    dataset_path = args.record_rollouts
    changepoint_path = args.changepoint_dir
    option_chain = OptionChain(true_environment, args.changepoint_dir,
                               args.train_edge, args)
    reward_paths = glob.glob(os.path.join(option_chain.save_dir, "*rwd.pkl"))
    print(reward_paths)
    reward_paths.sort(key=lambda x: int(x.split("__")[2]))

    head, tail = get_edge(args.train_edge)
    if args.reward_form == 'rawdist' and args.env == 'SelfPusher':
        true_environment.use_distance_reward()
        args.reward_form = 'raw'
    if args.reward_form != 'raw':
        reward_classes = [load_from_pickle(pth) for pth in reward_paths]
        for rc in reward_classes:
            if type(rc) == ChangepointMarkovReward:
                rc.markovModel = rc.markovModel.cuda(args.gpu)
    else:
        reward_classes = [RawReward(args)]
    # train_models = MultiOption(1, BasicModel)
    # learning_algorithm = DQN_optimizer()
    learning_algorithm = learning_algorithms[args.optimizer_form]()
    # learning_algorithm = DDPG_optimizer()
    environments = option_chain.initialize(args)
Example 10
 def __init__(self, args):
     super().__init__(None, args)
     self.traj_dim = 2  # SET THIS
     self.head, self.tail = get_edge(args.train_edge)
     self.name = "x"