def get_states(args, true_environment, length_constraint=50000, raws=None, dumps=None):
    """Load recorded rollout states for the training edge.

    Builds the option chain for ``args.train_edge``, constructs the state
    extractor for the edge's head object, and loads up to
    ``length_constraint`` recorded states from ``args.record_rollouts``.

    Args:
        args: experiment namespace; reads ``record_rollouts``,
            ``changepoint_dir``, ``train_edge``, ``state_names`` and
            ``state_forms``.
        true_environment: base environment wrapped by the option chain.
        length_constraint: maximum number of states to load.
        raws: optional previously loaded raw frames, passed through to
            ``load_states``.
        dumps: optional previously loaded object dumps, passed through to
            ``load_states``.

    Returns:
        Tuple ``(states, resps, num_actions, state_class, environments,
        raws, dumps)`` where ``environments`` is the option chain with the
        proxy environment popped off the end.
    """
    dataset_path = args.record_rollouts
    option_chain = OptionChain(true_environment, args.changepoint_dir, args.train_edge, args)
    environments = option_chain.initialize(args)
    print(environments)
    # The last environment in the chain acts as the proxy for the trained
    # edge; the remainder stays in ``environments``.
    proxy_environment = environments.pop(-1)
    head, tail = get_edge(args.train_edge)
    if len(environments) > 1:
        # There is a difference in the properties of a proxy environment and
        # the true environment: proxies expose actions via their reward fns.
        num_actions = len(environments[-1].reward_fns)
    else:
        num_actions = environments[-1].num_actions
    state_class = GetState(head, state_forms=list(zip(args.state_names, args.state_forms)))
    use_raw = 'raw' in args.state_forms
    state_class.minmax = compute_minmax(state_class, dataset_path)
    states, resps, raws, dumps = load_states(
        state_class.get_state, dataset_path, length_constraint=length_constraint,
        use_raw=use_raw, raws=raws, dumps=dumps)
    return states, resps, num_actions, state_class, environments, raws, dumps
# NOTE(review): mangled fragment — this chunk begins mid-`if`: the first
# statement (`train_models = proxy_environment.models`) is the body of a
# branch whose condition lies before this chunk, followed by a dangling
# `else:`, and all original newlines have been collapsed onto one line.
# It cannot be safely re-indented without the missing preceding lines, so
# the code is left byte-identical.
# Once unflattened, this chunk: picks `train_models` (reuse the proxy
# environment's models, or build a fresh MultiOption per reward path),
# derives `num_actions` from the option chain (reward_fns for proxies),
# builds the GetState extractor, computes minmax from the focus dumps, and
# — when `args.normalize` is set — overrides minmax with fixed per-form
# ranges (prox: [-84, 84], bounds: [0, 84]; presumably 84x84 screen
# coordinates — TODO confirm) before instantiating the behavior policy.
train_models = proxy_environment.models else: train_models = MultiOption(len(reward_paths), models[args.model_form]) proxy_chain = environments if len( environments ) > 1: # there is a difference in the properties of a proxy environment and the true environment num_actions = len(environments[-1].reward_fns) else: num_actions = environments[-1].num_actions print(args.state_names, args.state_forms) state_class = GetState(head, state_forms=list( zip(args.state_names, args.state_forms))) state_class.minmax = compute_minmax(state_class, dataset_path, filename=args.focus_dumps_name) if args.normalize: minv = [] maxv = [] for f in args.state_forms: if f == 'prox': minv += [-84, -84] maxv += [84, 84] elif f == 'bounds': minv += [0, 0] maxv += [84, 84] state_class.minmax = np.stack((np.array(minv), np.array(maxv))) print(state_class.minmax) behavior_policy = behavior_policies[args.behavior_policy]()
# Build one model per restored reward class and wire up the option-chain
# environments; the last environment in the chain serves as the proxy.
train_models = MultiOption(len(reward_classes), models[args.model_form])
environments = option_chain.initialize(args)
proxy_environment = environments.pop(-1)
proxy_chain = environments
# There is a difference in the properties of a proxy environment and the
# true environment: proxies report their action count through reward_fns.
num_actions = (len(environments[-1].reward_fns)
               if len(environments) > 1
               else environments[-1].num_actions)
print(args.state_names, args.state_forms)
state_class = GetState(head, state_forms=list(zip(args.state_names, args.state_forms)))
# Soft-computed minmax first (flagged as buggy by the original author)...
state_class.minmax = compute_minmax(state_class, dataset_path)
# ...then deliberately overwritten with fixed per-form ranges
# (presumably 84x84 screen coordinates — TODO confirm).
minv = []
maxv = []
for f in args.state_forms:
    if f == 'prox':
        minv += [-84, -84]
        maxv += [84, 84]
    elif f == 'bounds':
        minv += [0, 0]
        maxv += [84, 84]
state_class.minmax = np.stack((np.array(minv), np.array(maxv)))
print("state class minmax", state_class.minmax)
# Each reward class tracks trajectories in the extractor's state space.
for reward_class in reward_classes:
    reward_class.traj_dim = state_class.shape
# Restore saved reward functions for the edge being trained and launch RL.
dataset_path = args.record_rollouts
changepoint_path = args.changepoint_dir
option_chain = OptionChain(true_environment, args.changepoint_dir, args.train_edge, args)
reward_paths = glob.glob(os.path.join(option_chain.save_dir, "*rwd.pkl"))
print(reward_paths)
# File names encode an ordering index in their third "__"-separated field.
reward_paths.sort(key=lambda x: int(x.split("__")[2]))
head, tail = get_edge(args.train_edge)
reward_classes = [load_from_pickle(pth) for pth in reward_paths]
# One model per restored reward function; optimizer chosen by name.
train_models = MultiOption(len(reward_paths), models[args.model_form])
learning_algorithm = learning_algorithms[args.optimizer_form]()
environments = option_chain.initialize(args)
print(environments)
# The last environment in the chain acts as the proxy for the trained edge.
proxy_environment = environments.pop(-1)
proxy_chain = environments
# There is a difference in the properties of a proxy environment and the
# true environment: proxies report their action count through reward_fns.
num_actions = (len(environments[-1].reward_fns)
               if len(environments) > 1
               else environments[-1].num_actions)
print(args.state_names, args.state_forms)
state_class = GetState(head, state_forms=list(zip(args.state_names, args.state_forms)))
state_class.minmax = compute_minmax(state_class, dataset_path)
behavior_policy = behavior_policies[args.behavior_policy]()
trainRL(args, option_chain.save_dir, true_environment, train_models,
        learning_algorithm, proxy_environment, proxy_chain, reward_classes,
        state_class, behavior_policy=behavior_policy)