def __init__(self, base_environment, save_path, train_edge, args):
     '''
     OptionChain contains all of the requisite information: a sequence of proxy environments.
     Edges are stored in self.edges and mapped to their proxy environments in self.environments.
     TODO: proxy environments depend on the path used to reach them, so paths can overlap;
         replace the redundant overlap
     '''
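     # Sketch of the structures built below (inferred from this constructor, not stated
     # elsewhere in this excerpt): self.edges holds (head, tail) tuples and
     # self.environments maps each such edge to its ProxyEnvironment.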
     # self.nodes = {args.base_node: true_environment}
     # self.edges = edges #[]
     self.environments = dict()
     self.save_path = save_path
     self.base_environment = base_environment
     self.edges = set()
     self.nodes = dict()
     self.test = not args.train
     try:
         os.makedirs(save_path)
     except OSError:
         print("existing paths already")
         dirs = [
             d.split("/")[-1]
             for d in glob.glob(os.path.join(save_path, '*'))
         ]
         # TODO: currently loads all edges, though this has the potential to be unwieldy
         print(dirs)
         for d in dirs:
             # TODO: only single-tail edges are currently supported
             print(d, args.load_weights, train_edge)
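             # e.g. a directory named "Action->Paddle" yields the edge ("Action", "Paddle")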
             edge = (d.split("->")[0], d.split("->")[1])
             self.add_edge(edge)
             if d != train_edge or self.test:  # the train edge does not need to be loaded, unless testing, in which case the train edge is the test edge
                 print("loading", edge)
                 model_path = os.path.join(save_path, d)
                 models = MultiOption()
                 models.load(args, model_path)
                 has_test = True  # cleared below if no saved env.pkl is found
                 try:
                     proxy_env = load_from_pickle(
                         os.path.join(save_path, d, "env.pkl"))
                 except FileNotFoundError as e:
                     proxy_env = ProxyEnvironment(d)
                     has_test = False
                 proxy_env.set_models(models)
                 if has_test:
                     proxy_env.set_test()  # changes behavior policy to testing mode (no random actions)
                 proxy_env.name = d
                 print(proxy_env.__dict__)
                 self.environments[edge] = proxy_env
             elif d == train_edge and args.load_weights:
                 print("training", d)
                 model_path = os.path.join(save_path, d)
                 models = MultiOption()
                 models.load(args, model_path)
                 proxy_env = ProxyEnvironment(d)
                 self.environments[edge] = proxy_env
                 proxy_env.set_models(models)
             else:
                 self.environments[edge] = ProxyEnvironment(d)
     # in the case that the train edge does not have directories set up
     tedge = (train_edge.split("->")[0], train_edge.split("->")[1])
     if tedge not in self.edges:
         os.makedirs(os.path.join(save_path, train_edge))
         self.add_edge(tedge)
         self.environments[tedge] = ProxyEnvironment(tedge)
     self.save_dir = os.path.join(save_path, train_edge) + "/"
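     # A hedged sketch of the directory layout this constructor expects under save_path
     # (the "Action->Paddle" / "Paddle->Ball" edge names come from the example commands
     # further below; the exact weight filenames are decided by MultiOption.load() and
     # are an assumption here):
     #
     #   <save_path>/
     #       Action->Paddle/     # one directory per edge, named "<head>-><tail>"
     #           env.pkl         # pickled ProxyEnvironment (rebuilt fresh if missing)
     #           ...             # option model weights loaded via MultiOption.load()
     #       Paddle->Ball/
     #           ...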
                                                 args.env[len("Atari"):],
                                                 args.seed, 0, args.save_dir)
    dataset_path = args.record_rollouts
    changepoint_path = args.changepoint_dir
    option_chain = OptionChain(true_environment, args.changepoint_dir,
                               args.train_edge, args)
    reward_paths = glob.glob(os.path.join(option_chain.save_dir, "*rwd.pkl"))
    print(reward_paths)
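    # sort reward files numerically by the third "__"-delimited field of the filename
    # (assumed to be an index; the exact naming convention is not shown in this excerpt)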
    reward_paths.sort(key=lambda x: int(x.split("__")[2]))

    head, tail = get_edge(args.train_edge)
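    # 'rawdist' on SelfPusher is handled as the raw reward with the environment's
    # distance-based reward enabled, so reward_form is rewritten to 'raw' below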
    if args.reward_form == 'rawdist' and args.env == 'SelfPusher':
        true_environment.use_distance_reward()
        args.reward_form = 'raw'
    if args.reward_form != 'raw':
        reward_classes = [load_from_pickle(pth) for pth in reward_paths]
        for rc in reward_classes:
            if type(rc) == ChangepointMarkovReward:
                rc.markovModel = rc.markovModel.cuda(args.gpu)
    else:
        reward_classes = [RawReward(args)]
    # train_models = MultiOption(1, BasicModel)
    # learning_algorithm = DQN_optimizer()
    learning_algorithm = learning_algorithms[args.optimizer_form]()
    # learning_algorithm = DDPG_optimizer()
    environments = option_chain.initialize(args)
    print("ENVS: ", [e.name for e in environments])
    proxy_environment = environments.pop(-1)
    if args.load_weights:
        print(proxy_environment.models.cuda)
        proxy_environment.models.cuda(device=args.gpu)
Example #3
    # dp-gmm (if not included, default)
    # determiner (TODO: add determiner args later)
    # window (if used)
    # reward-form
    # train (used for trainable rewards)
    # segment
    # atari action->paddle: python get_reward.py --record-rollouts data/atarirandom/ --changepoint-dir data/atarigraph/ --train-edge "Action->Paddle" --transforms SVel SCorAvg --determiner overlap --reward-form markov --segment --train --num-stack 2 --focus-dumps-name focus_dumps.txt --dp-gmm atari
    # python get_reward.py --record-rollouts data/atarirandom/ --changepoint-dir data/atarigraph/ --train-edge "Action->Paddle" --transforms WProx --determiner prox --reward-form changepoint --num-stack 1 --focus-dumps-name focus_dumps.txt --dp-gmm atari
    # python get_reward.py --record-rollouts data/ataripaddle/ --changepoint-dir data/atarigraph/ --train-edge "Paddle->Ball" --transforms WProx --determiner prox --reward-form changepoint --num-stack 1 --focus-dumps-name focus_dumps.txt --dp-gmm atariball --period 5
    # python get_reward.py --record-rollouts data/pusherrandom/ --changepoint-dir data/fullpusher/ --train-edge "Action->Gripper" --transforms SVel SCorAvg --determiner overlap --reward-form markov --segment --train --num-stack 2 --gpu 1
    # python get_reward.py --record-rollouts data/extragripper/ --changepoint-dir data/pushergraphvec/ --train-edge "Gripper->Block" --transforms SProxVel --determiner merged --reward-form changepoint --segment --num-stack 2 --gpu 1 --cluster-model FDPGMM --period 9 --dp-gmm block --min-cluster 5
    # python get_reward.py --record-rollouts data/pusherrandom/ --changepoint-dir data/fullpusher/ --train-edge "Action->Gripper" --transforms SVel SCorAvg --determiner overlap --reward-form markov --segment --train --num-stack 2 --gpu 1 > pusher/reward_training.txt
    dataset_path = args.record_rollouts
    changepoints_path = args.record_rollouts  # dataset and changepoint paths are the same when creating rewards
    head, tail = get_edge(args.train_edge)
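    # the changepoint dictionary is expected at "<record_rollouts>/changepoints-<head>.pkl",
    # where <head> is whichever side of the train edge get_edge() designates as the head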
    cp_dict = load_from_pickle(
        os.path.join(changepoints_path, "changepoints-" + head + ".pkl"))
    changepoints, models = get_cp_models_from_dict(cp_dict)
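    # i=-1 with rng=args.num_iters appears to request the most recent num_iters records
    # from the focus dump file (an assumption from the argument names; read_obj_dumps is
    # not shown in this excerpt)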
    obj_dumps = read_obj_dumps(dataset_path,
                               i=-1,
                               rng=args.num_iters,
                               filename=args.focus_dumps_name)

    trajectory = get_individual_data(head, obj_dumps, pos_val_hash=1)
    # TODO: automatically determine if correlate pos_val_hash is 1 or 2
    # TODO: multiple tail support
    if tail[0] == "Action":
        correlate_trajectory = get_individual_data(tail[0],
                                                   obj_dumps,
                                                   pos_val_hash=2)
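        # allocate a (timesteps x num_action_values) buffer, presumably to one-hot encode
        # the discrete Action values (the filling step falls outside this excerpt)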
        new_ct = np.zeros(
            (len(correlate_trajectory), int(np.max(correlate_trajectory)) + 1))