def get_states(args,
               true_environment,
               length_constraint=50000,
               raws=None,
               dumps=None):
    dataset_path = args.record_rollouts
    changepoint_path = args.changepoint_dir
    option_chain = OptionChain(true_environment, args.changepoint_dir,
                               args.train_edge, args)
    environments = option_chain.initialize(args)
    print(environments)
    proxy_environment = environments.pop(-1)
    head, tail = get_edge(args.train_edge)
    if len(environments) > 1:  # there is a difference in the properties of a proxy environment and the true environment
        num_actions = len(environments[-1].reward_fns)
    else:
        num_actions = environments[-1].num_actions
    state_class = GetState(head, state_forms=list(zip(args.state_names, args.state_forms)))
    use_raw = 'raw' in args.state_forms
    state_class.minmax = compute_minmax(state_class, dataset_path)
    states, resps, raws, dumps = load_states(
        state_class.get_state,
        dataset_path,
        length_constraint=length_constraint,
        use_raw=use_raw,
        raws=raws,
        dumps=dumps)
    return states, resps, num_actions, state_class, environments, raws, dumps
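get_states fills state_class.minmax by scanning the recorded rollouts with compute_minmax. As a self-contained sketch of that per-dimension min/max idea (not the project's compute_minmax, which reads states from dataset_path):

import numpy as np

def minmax_over_states(states):
    # states: an iterable of equal-length 1-D state vectors
    arr = np.asarray(list(states), dtype=np.float64)
    # (2, D) layout: row 0 = per-dimension minima, row 1 = per-dimension maxima,
    # matching how the snippets here assign state_class.minmax
    return np.stack((arr.min(axis=0), arr.max(axis=0)))

demo = [np.array([0.0, 10.0]), np.array([-3.0, 7.5]), np.array([1.5, 12.0])]
print(minmax_over_states(demo))  # minima [-3.0, 7.5], maxima [1.5, 12.0]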
Example #2
    # NOTE: the snippet is truncated above; the guarding condition is not shown in
    # the original, so a load-existing-models flag (args.load_weights) is assumed here.
    if args.load_weights:
        train_models = proxy_environment.models
    else:
        train_models = MultiOption(len(reward_paths), models[args.model_form])
    proxy_chain = environments
    if len(environments) > 1:  # there is a difference in the properties of a proxy environment and the true environment
        num_actions = len(environments[-1].reward_fns)
    else:
        num_actions = environments[-1].num_actions
    print(args.state_names, args.state_forms)
    state_class = GetState(head, state_forms=list(zip(args.state_names, args.state_forms)))
    state_class.minmax = compute_minmax(state_class,
                                        dataset_path,
                                        filename=args.focus_dumps_name)
    if args.normalize:
        minv = []
        maxv = []
        for f in args.state_forms:
            if f == 'prox':
                minv += [-84, -84]
                maxv += [84, 84]
            elif f == 'bounds':
                minv += [0, 0]
                maxv += [84, 84]
        state_class.minmax = np.stack((np.array(minv), np.array(maxv)))
        print(state_class.minmax)

    behavior_policy = behavior_policies[args.behavior_policy]()
    train_models = MultiOption(len(reward_classes), models[args.model_form])
    environments = option_chain.initialize(args)
    proxy_environment = environments.pop(-1)
    proxy_chain = environments
    if len(environments) > 1:  # there is a difference in the properties of a proxy environment and the true environment
        num_actions = len(environments[-1].reward_fns)
    else:
        num_actions = environments[-1].num_actions
    print(args.state_names, args.state_forms)
    state_class = GetState(head, state_forms=list(zip(args.state_names, args.state_forms)))
    # softcomputed minmax (buggy)
    state_class.minmax = compute_minmax(state_class, dataset_path)
    # HARDCODED MINMAX AT 84,84!!!!
    minv = []
    maxv = []
    for f in args.state_forms:
        if f == 'prox':
            minv += [-84, -84]
            maxv += [84, 84]
        elif f == 'bounds':
            minv += [0, 0]
            maxv += [84, 84]
    state_class.minmax = np.stack((np.array(minv), np.array(maxv)))
    print("state class minmax", state_class.minmax)

    for reward_class in reward_classes:
        reward_class.traj_dim = state_class.shape
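The hardcoded bounds above assume 84x84 frames: the 'prox' dimensions span [-84, 84] and the 'bounds' dimensions span [0, 84]. Under that same assumption, a small sketch of how the stacked (2, D) minmax array can normalize a raw state vector into [0, 1]:

import numpy as np

# prox (x, y) then bounds (x, y), with the 84x84 frame bounds used above
minv = np.array([-84.0, -84.0, 0.0, 0.0])
maxv = np.array([84.0, 84.0, 84.0, 84.0])
minmax = np.stack((minv, maxv))

def normalize(state, minmax):
    lo, hi = minmax
    return (state - lo) / (hi - lo)  # elementwise rescale into [0, 1]

print(normalize(np.array([0.0, -42.0, 21.0, 84.0]), minmax))  # [0.5  0.25 0.25 1.]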
Example #4
# NOTE: this snippet is truncated at the top in the original; a wrapper such as the
# following (function name assumed) is needed for the indented body to parse.
def train_option(args, true_environment):
    dataset_path = args.record_rollouts
    changepoint_path = args.changepoint_dir
    option_chain = OptionChain(true_environment, args.changepoint_dir, args.train_edge, args)
    reward_paths = glob.glob(os.path.join(option_chain.save_dir, "*rwd.pkl"))
    print(reward_paths)
    reward_paths.sort(key=lambda x: int(x.split("__")[2]))

    head, tail = get_edge(args.train_edge)

    reward_classes = [load_from_pickle(pth) for pth in reward_paths]
    # train_models = MultiOption(1, BasicModel)
    train_models = MultiOption(len(reward_paths), models[args.model_form])
    # learning_algorithm = DQN_optimizer()
    learning_algorithm = learning_algorithms[args.optimizer_form]()
    # learning_algorithm = DDPG_optimizer()
    environments = option_chain.initialize(args)
    print(environments)
    proxy_environment = environments.pop(-1)
    proxy_chain = environments
    if len(environments) > 1: # there is a difference in the properties of a proxy environment and the true environment
        num_actions = len(environments[-1].reward_fns)
    else:
        num_actions = environments[-1].num_actions
    print(args.state_names, args.state_forms)
    state_class = GetState(head, state_forms=list(zip(args.state_names, args.state_forms)))
    state_class.minmax = compute_minmax(state_class, dataset_path)
    behavior_policy = behavior_policies[args.behavior_policy]()
    # behavior_policy = EpsilonGreedyProbs()
    trainRL(args, option_chain.save_dir, true_environment, train_models, learning_algorithm, proxy_environment,
            proxy_chain, reward_classes, state_class, behavior_policy=behavior_policy)
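The reward pickles in this last snippet are discovered with glob and ordered numerically by the third double-underscore field of each path. A quick illustration of that sort key on made-up filenames (the actual naming scheme depends on how the project writes its *rwd.pkl files):

paths = [
    "save__edge__10__rwd.pkl",
    "save__edge__2__rwd.pkl",
    "save__edge__1__rwd.pkl",
]
# int(...) gives numeric order (1, 2, 10) rather than lexicographic ('1', '10', '2')
paths.sort(key=lambda x: int(x.split("__")[2]))
print(paths)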