def __init__(self, base_environment, save_path, train_edge, args):
    '''
    OptionChain contains all of the requisite information: a sequence of
    proxy environments in which edges are stored.
    TODO: proxy environments depend on the path used to reach them, which would
    have overlap; replace redundant overlap.
    '''
    # self.nodes = nodes  # {args.base_node: true_environment}
    # self.edges = edges  # []
    self.environments = dict()
    self.save_path = save_path
    self.base_environment = base_environment
    self.edges = set()
    self.nodes = dict()
    self.test = not args.train
    try:
        os.makedirs(save_path)
    except OSError:
        print("paths already exist")
    dirs = [d.split("/")[-1] for d in glob.glob(os.path.join(save_path, '*'))]
    # TODO: currently loads all edges, though this has the potential to be unwieldy
    print(dirs)
    for d in dirs:  # TODO: only single-tail edges currently
        print(d, args.load_weights, train_edge)
        edge = (d.split("->")[0], d.split("->")[1])
        self.add_edge(edge)
        if d != train_edge or self.test:
            # the train edge does not need to load, unless testing,
            # in which case the train edge is the test edge
            print("loading", edge)
            model_path = os.path.join(save_path, d)
            models = MultiOption()
            models.load(args, model_path)
            has_test = True
            try:
                proxy_env = load_from_pickle(os.path.join(save_path, d, "env.pkl"))
            except FileNotFoundError:
                proxy_env = ProxyEnvironment(d)
                has_test = False
            proxy_env.set_models(models)
            if has_test:
                proxy_env.set_test()  # changes behavior policy to testing mode (no random actions)
            proxy_env.name = d
            print(proxy_env.__dict__)
            self.environments[edge] = proxy_env
        elif d == train_edge and args.load_weights:
            print("training", d)
            model_path = os.path.join(save_path, d)
            models = MultiOption()
            models.load(args, model_path)
            proxy_env = ProxyEnvironment(d)
            self.environments[edge] = proxy_env
            proxy_env.set_models(models)
        else:
            self.environments[edge] = ProxyEnvironment(d)
    # in the case that the train edge does not have directories set up
    tedge = (train_edge.split("->")[0], train_edge.split("->")[1])
    if tedge not in self.edges:
        os.makedirs(os.path.join(save_path, train_edge))
        self.add_edge(tedge)
        self.environments[tedge] = ProxyEnvironment(tedge)
    self.save_dir = os.path.join(save_path, train_edge) + "/"
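# A minimal usage sketch of the constructor above (illustrative only: get_args,
# the Screen base environment, the data/graph directory, and the
# "Action->Paddle" edge name are assumptions, not fixed by this file):
#
#     args = get_args()                                   # hypothetical argument parser
#     true_environment = Screen()                         # hypothetical base environment
#     chain = OptionChain(true_environment, "data/graph", "Action->Paddle", args)
#     # chain.environments then maps ("Action", "Paddle") -> ProxyEnvironment,
#     # and chain.save_dir is "data/graph/Action->Paddle/"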
args.env[len("Atari"):], args.seed, 0, args.save_dir) dataset_path = args.record_rollouts changepoint_path = args.changepoint_dir option_chain = OptionChain(true_environment, args.changepoint_dir, args.train_edge, args) reward_paths = glob.glob(os.path.join(option_chain.save_dir, "*rwd.pkl")) print(reward_paths) reward_paths.sort(key=lambda x: int(x.split("__")[2])) head, tail = get_edge(args.train_edge) if args.reward_form == 'rawdist' and args.env == 'SelfPusher': true_environment.use_distance_reward() args.reward_form = 'raw' if args.reward_form != 'raw': reward_classes = [load_from_pickle(pth) for pth in reward_paths] for rc in reward_classes: if type(rc) == ChangepointMarkovReward: rc.markovModel = rc.markovModel.cuda(args.gpu) else: reward_classes = [RawReward(args)] # train_models = MultiOption(1, BasicModel) # learning_algorithm = DQN_optimizer() learning_algorithm = learning_algorithms[args.optimizer_form]() # learning_algorithm = DDPG_optimizer() environments = option_chain.initialize(args) print("ENVS: ", [e.name for e in environments]) proxy_environment = environments.pop(-1) if args.load_weights: print(proxy_environment.models.cuda) proxy_environment.models.cuda(device=args.gpu)
# dp-gmm (if not included, default)
# determiner (TODO: add determiner args later)
# window (if used)
# reward-form
# train (used for trainable rewards)
# segment
# atari action->paddle:
# python get_reward.py --record-rollouts data/atarirandom/ --changepoint-dir data/atarigraph/ --train-edge "Action->Paddle" --transforms SVel SCorAvg --determiner overlap --reward-form markov --segment --train --num-stack 2 --focus-dumps-name focus_dumps.txt --dp-gmm atari
# python get_reward.py --record-rollouts data/atarirandom/ --changepoint-dir data/atarigraph/ --train-edge "Action->Paddle" --transforms WProx --determiner prox --reward-form changepoint --num-stack 1 --focus-dumps-name focus_dumps.txt --dp-gmm atari
# python get_reward.py --record-rollouts data/ataripaddle/ --changepoint-dir data/atarigraph/ --train-edge "Paddle->Ball" --transforms WProx --determiner prox --reward-form changepoint --num-stack 1 --focus-dumps-name focus_dumps.txt --dp-gmm atariball --period 5
# python get_reward.py --record-rollouts data/pusherrandom/ --changepoint-dir data/fullpusher/ --train-edge "Action->Gripper" --transforms SVel SCorAvg --determiner overlap --reward-form markov --segment --train --num-stack 2 --gpu 1
# python get_reward.py --record-rollouts data/extragripper/ --changepoint-dir data/pushergraphvec/ --train-edge "Gripper->Block" --transforms SProxVel --determiner merged --reward-form changepoint --segment --num-stack 2 --gpu 1 --cluster-model FDPGMM --period 9 --dp-gmm block --min-cluster 5
# python get_reward.py --record-rollouts data/pusherrandom/ --changepoint-dir data/fullpusher/ --train-edge "Action->Gripper" --transforms SVel SCorAvg --determiner overlap --reward-form markov --segment --train --num-stack 2 --gpu 1 > pusher/reward_training.txt
dataset_path = args.record_rollouts
changepoints_path = args.record_rollouts  # these are the same for creating rewards
head, tail = get_edge(args.train_edge)
cp_dict = load_from_pickle(os.path.join(changepoints_path, "changepoints-" + head + ".pkl"))
changepoints, models = get_cp_models_from_dict(cp_dict)
obj_dumps = read_obj_dumps(dataset_path, i=-1, rng=args.num_iters,
                           filename=args.focus_dumps_name)
trajectory = get_individual_data(head, obj_dumps, pos_val_hash=1)
# TODO: automatically determine if correlate pos_val_hash is 1 or 2
# TODO: multiple tail support
if tail[0] == "Action":
    correlate_trajectory = get_individual_data(tail[0], obj_dumps, pos_val_hash=2)
    new_ct = np.zeros((len(correlate_trajectory), int(np.max(correlate_trajectory)) + 1))
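# new_ct above is sized as a one-hot table for a discrete correlate trajectory
# (one row per timestep, one column per action value). A minimal sketch of the
# conversion it sets up, assuming each entry of correlate_trajectory is an
# integer action index (illustrative continuation, not the repository's code):
#
#     for i, a in enumerate(correlate_trajectory):
#         new_ct[i, int(a)] = 1.0
#     correlate_trajectory = new_ct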