def pretrain(args, true_environment, desired, num_actions, state_class, states, resps, targets, criteria, reward_fns):
    '''
    Builds the option chain and proxy environment for the train edge, then fits the train models.
    '''
    # args = get_args()
    # true_environment = Paddle()
    # true_environment = PaddleNoBlocks()
    dataset_path = args.record_rollouts
    changepoint_path = args.changepoint_dir
    option_chain = OptionChain(true_environment, args.changepoint_dir, args.train_edge, args)
    environments = option_chain.initialize(args)
    proxy_environment = environments.pop(-1)  # the last environment in the chain is the one being trained
    proxy_chain = environments
    if args.load_weights:
        train_models = proxy_environment.models
    else:
        train_models = MultiOption(1, models[args.model_form])
    head, tail = get_edge(args.train_edge)
    print(args.state_names, args.state_forms)
    print(state_class.minmax)
    # behavior_policy = EpsilonGreedyProbs()
    save_dir = args.save_graph
    if args.save_graph == "graph":
        save_dir = option_chain.save_dir
    proxy_environment.initialize(args, proxy_chain, reward_fns, state_class, behavior_policy=None)
    fit(args, save_dir, true_environment, train_models, state_class, desired, states, resps, targets,
        num_actions, criteria, proxy_environment, reward_fns)
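# A minimal sketch of how pretrain might be invoked, assuming get_args() supplies the flags
# referenced above; desired, num_actions, state_class, states, resps, targets, criteria, and
# reward_fns are placeholders for data the caller must construct and are not defined here:
#
#     args = get_args()
#     true_environment = Paddle()
#     pretrain(args, true_environment, desired, num_actions, state_class,
#              states, resps, targets, criteria, reward_fns)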
            rc.markovModel = rc.markovModel.cuda(args.gpu)
    else:
        reward_classes = [RawReward(args)]
    # train_models = MultiOption(1, BasicModel)
    # learning_algorithm = DQN_optimizer()
    learning_algorithm = learning_algorithms[args.optimizer_form]()
    # learning_algorithm = DDPG_optimizer()
    environments = option_chain.initialize(args)
    print("ENVS: ", [e.name for e in environments])
    proxy_environment = environments.pop(-1)
    if args.load_weights:
        print(proxy_environment.models.cuda)
        proxy_environment.models.cuda(device=args.gpu)
        train_models = proxy_environment.models
    else:
        train_models = MultiOption(len(reward_paths), models[args.model_form])
    proxy_chain = environments
    if len(environments) > 1:  # a proxy environment's properties differ from the true environment's
        num_actions = len(environments[-1].reward_fns)
    else:
        num_actions = environments[-1].num_actions
    print(args.state_names, args.state_forms)
    state_class = GetState(head, state_forms=list(zip(args.state_names, args.state_forms)))
    state_class.minmax = compute_minmax(state_class, dataset_path, filename=args.focus_dumps_name)
    if args.normalize:
if __name__ == "__main__":
    # Example command lines:
    # python rl_template.py --model-form basic --optimizer-form DQN --record-rollouts "data/testchain/" --train-edge "Action->chain" --num-stack 1 --train --num-iters 10000 --save-dir data/test
    # python rl_template.py --model-form tab --optimizer-form TabQ --record-rollouts "data/testchain/" --train-edge "Action->chain" --num-stack 1 --train --num-iters 10000 --save-dir data/test
    # with 3 rewards:
    # python rl_template.py --model-form basic --optimizer-form DQN --record-rollouts "data/testchain/" --train-edge "Action->chain" --num-stack 1 --train --num-iters 1000 --save-dir data/test --num-update-model 1
    # python rl_template.py --model-form tab --optimizer-form TabQ --record-rollouts "data/testchain/" --train-edge "Action->chain" --num-stack 1 --train --num-iters 1000 --save-dir data/test --num-update-model 1
    args = get_args()
    true_environment = ChainMDP(30)
    # train_models = MultiOption(1, BasicModel)
    reward_classes = [RewardLeft(None, args), RewardCenter(None, args), RewardRight(None, args)]
    train_models = MultiOption(len(reward_classes), models[args.model_form])
    learning_algorithm = learning_algorithms[args.optimizer_form]()
    # here, train_edge acts as the save folder name
    option_chain = OptionChain(true_environment, args.record_rollouts, args.train_edge, args)
    minmax = (0, 30)
    state_class = GetRaw(3, minmax=minmax, state_shape=[1])
    behavior_policy = EpsilonGreedyQ()
    # behavior_policy = EpsilonGreedyProbs()
    # the last entry is None since the last environment has not been made yet
    proxy_chain = option_chain.initialize(args)
    proxy_chain.pop(-1)
    print(proxy_chain)
    trainRL(args, option_chain.save_dir,
def __init__(self, base_environment, save_path, train_edge, args):
    '''
    OptionChain contains all of the requisite information: a sequence of proxy environments
    whose edges are stored in self.edges.
    TODO: proxy environments depend on the path used to reach them, which can overlap;
    replace redundant overlap.
    '''
    # self.nodes = {args.base_node: true_environment}
    # self.edges = []
    self.environments = dict()
    self.save_path = save_path
    self.base_environment = base_environment
    self.edges = set()
    self.nodes = dict()
    self.test = not args.train
    try:
        os.makedirs(save_path)
    except OSError:
        print("path already exists")
    # TODO: currently loads all edges, though this has the potential to be unwieldy
    dirs = [d.split("/")[-1] for d in glob.glob(os.path.join(save_path, '*'))]
    print(dirs)
    for d in dirs:  # TODO: only single-tail edges currently
        print(d, args.load_weights, train_edge)
        edge = (d.split("->")[0], d.split("->")[1])
        self.add_edge(edge)
        if d != train_edge or self.test:
            # the train edge does not need to load, unless testing, in which case train_edge is the test edge
            print("loading", edge)
            model_path = os.path.join(save_path, d)
            models = MultiOption()
            models.load(args, model_path)
            has_test = True
            try:
                proxy_env = load_from_pickle(os.path.join(save_path, d, "env.pkl"))
            except FileNotFoundError as e:
                proxy_env = ProxyEnvironment(d)
                has_test = False
            proxy_env.set_models(models)
            if has_test:
                proxy_env.set_test()  # changes behavior policy to testing mode (no random actions)
            proxy_env.name = d
            print(proxy_env.__dict__)
            self.environments[edge] = proxy_env
        elif d == train_edge and args.load_weights:
            print("training", d)
            model_path = os.path.join(save_path, d)
            models = MultiOption()
            models.load(args, model_path)
            proxy_env = ProxyEnvironment(d)
            self.environments[edge] = proxy_env
            proxy_env.set_models(models)
        else:
            self.environments[edge] = ProxyEnvironment(d)
    # in case the train edge does not have a directory set up yet
    tedge = (train_edge.split("->")[0], train_edge.split("->")[1])
    if tedge not in self.edges:
        os.makedirs(os.path.join(save_path, train_edge))
        self.add_edge(tedge)
        self.environments[tedge] = ProxyEnvironment(tedge)
    self.save_dir = os.path.join(save_path, train_edge) + "/"
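# A minimal usage sketch, assuming a setup like the __main__ example above (true_environment,
# args.record_rollouts, and args.train_edge come from get_args()); initialize() is defined
# elsewhere in this class:
#
#     option_chain = OptionChain(true_environment, args.record_rollouts, args.train_edge, args)
#     proxy_chain = option_chain.initialize(args)  # chain of environments up to the train edge
#     proxy_chain.pop(-1)                          # the train-edge environment is not built yet
#     models_dir = option_chain.save_dir           # "<save_path>/<train_edge>/"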