def get_states(args, true_environment, length_constraint=50000, raws=None, dumps=None):
    """Load recorded states for the edge being trained.

    Builds the option chain for ``args.train_edge``, creates a ``GetState``
    extractor for the head of that edge, computes its min/max from the
    recorded rollouts and then loads the states through ``load_states``.

    Args:
        args: run configuration; ``record_rollouts``, ``changepoint_dir``,
            ``train_edge``, ``state_names`` and ``state_forms`` are read.
        true_environment: environment handed to ``OptionChain``.
        length_constraint: forwarded to ``load_states``.
        raws: forwarded to ``load_states``.
        dumps: forwarded to ``load_states``.

    Returns:
        ``(states, resps, num_actions, state_class, environments, raws, dumps)``
        where ``environments`` is the initialized chain without its last
        element.
    """
    dataset_path = args.record_rollouts
    changepoint_path = args.changepoint_dir
    chain = OptionChain(true_environment, changepoint_path, args.train_edge, args)
    environments = chain.initialize(args)
    print(environments)
    # The last entry is the proxy environment; it is removed so the returned
    # list only holds the environments below it.
    environments.pop(-1)
    head, _ = get_edge(args.train_edge)

    # A proxy environment exposes its actions through reward_fns, while the
    # true environment exposes num_actions directly.
    last_env = environments[-1]
    num_actions = len(last_env.reward_fns) if len(environments) > 1 else last_env.num_actions

    forms = list(zip(args.state_names, args.state_forms))
    state_class = GetState(head, state_forms=forms)
    use_raw = 'raw' in args.state_forms
    state_class.minmax = compute_minmax(state_class, dataset_path)

    loaded = load_states(
        state_class.get_state,
        dataset_path,
        length_constraint=length_constraint,
        use_raw=use_raw,
        raws=raws,
        dumps=dumps,
    )
    states, resps, raws, dumps = loaded
    return states, resps, num_actions, state_class, environments, raws, dumps
def pretrain(args, true_environment, desired, num_actions, state_class, states, resps, targets, criteria, reward_fns):
    """Set up the proxy environment for ``args.train_edge`` and run ``fit``.

    Args:
        args: run configuration; ``record_rollouts``, ``changepoint_dir``,
            ``train_edge``, ``load_weights``, ``model_form``, ``state_names``,
            ``state_forms`` and ``save_graph`` are read.
        true_environment: environment handed to ``OptionChain`` and ``fit``.
        desired, states, resps, targets, criteria: forwarded to ``fit``.
        num_actions: forwarded to ``fit``.
        state_class: state extractor; its ``minmax`` is printed and it is
            passed to the proxy environment and to ``fit``.
        reward_fns: passed to the proxy environment and to ``fit``.
    """
    dataset_path = args.record_rollouts  # read here but not used below
    changepoint_path = args.changepoint_dir
    option_chain = OptionChain(true_environment, changepoint_path, args.train_edge, args)

    # After popping the proxy environment, what remains is the chain below it.
    proxy_chain = option_chain.initialize(args)
    proxy_environment = proxy_chain.pop(-1)

    # Reuse the models stored on the proxy environment when loading weights,
    # otherwise start from a fresh single-option container.
    train_models = (
        proxy_environment.models
        if args.load_weights
        else MultiOption(1, models[args.model_form])
    )

    _head, _tail = get_edge(args.train_edge)  # result is not used below
    print(args.state_names, args.state_forms)
    print(state_class.minmax)

    # The literal "graph" selects the option chain's own save directory.
    save_dir = option_chain.save_dir if args.save_graph == "graph" else args.save_graph

    proxy_environment.initialize(args, proxy_chain, reward_fns, state_class, behavior_policy=None)
    fit(
        args,
        save_dir,
        true_environment,
        train_models,
        state_class,
        desired,
        states,
        resps,
        targets,
        num_actions,
        criteria,
        proxy_environment,
        reward_fns,
    )
def __init__(self, args, reward_function, minmax=None):
    """Wrap a novelty reward around an existing reward function.

    Args:
        args: run configuration; ``train_edge``, ``cuda`` and
            ``novelty_decay`` are read.
        reward_function: the reward being wrapped; its ``name`` and
            ``traj_dim`` are copied onto this object.
        minmax: accepted but not used by this constructor.
    """
    self.name = reward_function.name
    edge_head, edge_tail = get_edge(args.train_edge)
    self.head = edge_head
    self.tail = edge_tail
    self.reward_function = reward_function
    self.cuda = args.cuda
    # The trajectory dimension is inherited from the wrapped reward rather
    # than fixed here.
    self.traj_dim = reward_function.traj_dim
    self.novelty_decay = args.novelty_decay
def __init__(self, model, args):
    """Set up a reward built on a changepoint model.

    Args:
        model: a changepoint model, stored as ``self.model``.
        args: run configuration; ``train_edge`` and ``cuda`` are read.
    """
    self.name = args.train_edge
    edge_head, edge_tail = get_edge(args.train_edge)
    self.head = edge_head
    self.tail = edge_tail
    self.model = model
    self.cuda = args.cuda
    # TODO: the trajectory dimension is hard-coded to 2 (the dimension of a
    # location) until it can be set dynamically.
    self.traj_dim = 2
    # TODO: where do these bounds come from?
    lower_bound = np.array([0])
    upper_bound = np.array([84])
    self.parameter_minmax = [lower_bound, upper_bound]
    # 'bounds' form for the head followed by one 'bounds' form per tail entry.
    # TODO: technically, multibounds for both.
    bounds_forms = [(self.head, 'bounds')] + [(name, 'bounds') for name in self.tail]
    # NOTE(review): GetState receives a leading positional 0 here; confirm
    # this matches the current GetState signature.
    self.state_class = GetState(0, self.head, state_forms=bounds_forms)
def get_option_actions(pth, train_edge, num_actions, weighting_lambda, length_constraint = 50000):
    """Load recorded option actions for the first tail of ``train_edge``.

    Reads ``<pth>/<tail[0]>_actions.txt`` (one integer per line), keeps only
    the most recent ``length_constraint`` entries, and passes them through
    ``hot_actions`` and then ``smooth_weight``.

    Args:
        pth: directory containing the actions file.
        train_edge: edge description parsed by ``get_edge``; the first tail
            name selects the file.
        num_actions: forwarded to ``hot_actions``.
        weighting_lambda: forwarded to ``smooth_weight``.
        length_constraint: maximum number of trailing actions to keep.

    Returns:
        Whatever ``smooth_weight`` returns for the encoded actions.

    Raises:
        OSError: if the actions file cannot be opened.
        ValueError: if a line cannot be parsed as an integer.
    """
    from collections import deque

    _, tail = get_edge(train_edge)
    action_path = os.path.join(pth, tail[0] + "_actions.txt")

    # A bounded deque discards the oldest entry in O(1) once it is full, so
    # only the last `length_constraint` actions survive without the O(n)
    # cost of list.pop(0) on every line.
    recent = deque(maxlen=max(int(length_constraint), 0))
    # The context manager closes the file even if a line fails to parse.
    with open(action_path, 'r') as action_file:
        recent.extend(int(act) for act in action_file)

    actions = hot_actions(list(recent), num_actions)
    return smooth_weight(actions, weighting_lambda)
def generate_soft_dataset(states, resps, true_environment, reward_fns, args):
    """Produce smoothed action probabilities from already-trained options.

    Loads the option chain for ``args.train_edge`` with weight loading forced
    on, runs every option of the loaded models over ``states``/``resps`` in
    fixed-size batches, and smooths each option's action probabilities with
    ``smooth_weight``.

    Args:
        states: sequence of states, sliced into batches.
        resps: sequence aligned with ``states``, sliced identically.
        true_environment: environment handed to ``OptionChain``.
        reward_fns: passed to the proxy environment; its length is passed to
            ``train_models.initialize``.
        args: run configuration. ``load_weights`` is temporarily set to True
            while the chain is initialized and restored afterwards.

    Returns:
        ``np.array`` built from one smoothed entry per option.
    """
    batch_size = 30

    pre_load_weights = args.load_weights
    args.load_weights = True
    try:
        option_chain = OptionChain(true_environment, args.changepoint_dir, args.train_edge, args)
        print(args.load_weights)
        environments = option_chain.initialize(args)
        proxy_environment = environments.pop(-1)
        proxy_chain = environments
        train_models = proxy_environment.models
        head, _ = get_edge(args.train_edge)
        # A proxy environment exposes its actions through reward_fns, while
        # the true environment exposes num_actions directly.
        if len(environments) > 1:
            num_actions = len(environments[-1].reward_fns)
        else:
            num_actions = environments[-1].num_actions
        state_class = GetState(head, state_forms=list(zip(args.state_names, args.state_forms)))
        proxy_environment.initialize(args, proxy_chain, reward_fns, state_class, behavior_policy=None)
        train_models.initialize(args, len(reward_fns), state_class, num_actions)
        train_models.session(args)
        proxy_environment.duplicate(args)  # assumes that we are loading weights
    finally:
        # Restore the caller's setting even if initialization fails, so the
        # shared args object is never left with load_weights forced on.
        args.load_weights = pre_load_weights

    soft_actions = [[] for _ in range(train_models.num_options)]
    for oidx in range(train_models.num_options):
        train_models.option_index = oidx
        if args.model_form == 'population':
            train_models.currentModel().use_mean = True
        # Stepping by batch_size visits only non-empty batches; the model is
        # never called on an empty slice when len(states) is an exact
        # multiple of the batch size.
        for start in range(0, len(states), batch_size):
            state = states[start:start + batch_size]
            resp = resps[start:start + batch_size]
            values, dist_entropy, action_probs, Q_vals = train_models.determine_action(
                pytorch_model.wrap(state, cuda=args.cuda),
                pytorch_model.wrap(resp, cuda=args.cuda))
            values, action_probs, Q_vals = train_models.get_action(values, action_probs, Q_vals)
            soft_actions[oidx] += pytorch_model.unwrap(action_probs).tolist()

    print("soft actions", np.sum(np.array(soft_actions[0]), axis=0))
    soft_actions = [smooth_weight(option_probs, args.weighting_lambda) for option_probs in soft_actions]
    return np.array(soft_actions)
def __init__(self, args, direc=0):
    """Configure a reward tied to a direction code.

    Args:
        args: run configuration; ``train_edge``, ``reward_form`` and ``cuda``
            are read, and it is also passed to the parent constructor.
        direc: direction code. ``-1`` sets ``anydir``; ``0`` to ``4`` select
            one of the fixed 2-vectors below; any other value leaves
            ``self.dir`` as ``None``.
    """
    super().__init__(None, args)
    self.traj_dim = 2  # SET THIS
    edge_head, edge_tail = get_edge(args.train_edge)
    self.head = edge_head
    self.tail = edge_tail
    self.name = args.reward_form
    self.anydir = direc == -1

    # Direction code -> 2-vector stored in self.dir.
    direction_vectors = (
        (0, [0, 0]),
        (1, [0, -1]),
        (2, [0, 1]),
        (3, [-1, 0]),
        (4, [1, 0]),
    )
    self.dir = None
    for code, vector in direction_vectors:
        if direc == code:
            self.dir = pytorch_model.wrap(np.array(vector), cuda=args.cuda)
            self.dir.requires_grad = False
            break
    self.epsilon = 1e-3
def __init__(self, train_edge):
    """Store the head and tail obtained from ``get_edge(train_edge)``."""
    edge_head, edge_tail = get_edge(train_edge)
    self.head = edge_head
    self.tail = edge_tail
else: true_environment = FocusEnvironment(model, display=args.display_focus) elif args.env.find('Atari') != -1: true_environment = FocusAtariEnvironment(model, args.env[len("Atari"):], args.seed, 0, args.save_dir) dataset_path = args.record_rollouts changepoint_path = args.changepoint_dir option_chain = OptionChain(true_environment, args.changepoint_dir, args.train_edge, args) reward_paths = glob.glob(os.path.join(option_chain.save_dir, "*rwd.pkl")) print(reward_paths) reward_paths.sort(key=lambda x: int(x.split("__")[2])) head, tail = get_edge(args.train_edge) if args.reward_form == 'rawdist' and args.env == 'SelfPusher': true_environment.use_distance_reward() args.reward_form = 'raw' if args.reward_form != 'raw': reward_classes = [load_from_pickle(pth) for pth in reward_paths] for rc in reward_classes: if type(rc) == ChangepointMarkovReward: rc.markovModel = rc.markovModel.cuda(args.gpu) else: reward_classes = [RawReward(args)] # train_models = MultiOption(1, BasicModel) # learning_algorithm = DQN_optimizer() learning_algorithm = learning_algorithms[args.optimizer_form]() # learning_algorithm = DDPG_optimizer() environments = option_chain.initialize(args)
def __init__(self, args):
    """Initialize the reward named ``"x"`` for the edge in ``args.train_edge``.

    Args:
        args: run configuration; ``train_edge`` is read, and it is also
            passed to the parent constructor.
    """
    super().__init__(None, args)
    self.traj_dim = 2  # SET THIS
    edge_head, edge_tail = get_edge(args.train_edge)
    self.head = edge_head
    self.tail = edge_tail
    self.name = "x"