def pretrain(args, true_environment, desired, num_actions, state_class, states,
             resps, targets, criteria, reward_fns):
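    '''
    Builds the option chain for the training edge, pops the proxy environment
    at the end of the chain, initializes it with the reward functions and
    state class, then hands everything to fit() for training.
    '''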
    # args = get_args()
    # true_environment = Paddle()
    # true_environment = PaddleNoBlocks()
    dataset_path = args.record_rollouts
    changepoint_path = args.changepoint_dir
    option_chain = OptionChain(true_environment, args.changepoint_dir,
                               args.train_edge, args)
    environments = option_chain.initialize(args)
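    # the last environment in the chain is the proxy environment being trained;
    # the earlier environments form the proxy chain it is built on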
    proxy_environment = environments.pop(-1)
    proxy_chain = environments
    if args.load_weights:
        train_models = proxy_environment.models
    else:
        train_models = MultiOption(1, models[args.model_form])
    head, tail = get_edge(args.train_edge)
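    # train_edge has the form "head->tail", e.g. "Action->chain"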
    print(args.state_names, args.state_forms)
    print(state_class.minmax)
    # behavior_policy = EpsilonGreedyProbs()
    save_dir = args.save_graph
    if args.save_graph == "graph":
        save_dir = option_chain.save_dir
    proxy_environment.initialize(args,
                                 proxy_chain,
                                 reward_fns,
                                 state_class,
                                 behavior_policy=None)
    fit(args, save_dir, true_environment, train_models, state_class, desired,
        states, resps, targets, num_actions, criteria, proxy_environment,
        reward_fns)
Example #2
             rc.markovModel = rc.markovModel.cuda(args.gpu)
 else:
     reward_classes = [RawReward(args)]
 # train_models = MultiOption(1, BasicModel)
 # learning_algorithm = DQN_optimizer()
 learning_algorithm = learning_algorithms[args.optimizer_form]()
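 # the optimizer form (e.g. DQN, TabQ, DDPG) selects which learning algorithm class to instantiate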
 # learning_algorithm = DDPG_optimizer()
 environments = option_chain.initialize(args)
 print("ENVS: ", [e.name for e in environments])
 proxy_environment = environments.pop(-1)
 if args.load_weights:
     print(proxy_environment.models.cuda)
     proxy_environment.models.cuda(device=args.gpu)
     train_models = proxy_environment.models
 else:
     train_models = MultiOption(len(reward_paths), models[args.model_form])
 proxy_chain = environments
 if len(environments) > 1:
     # there is a difference in the properties of a proxy environment and the true environment
     num_actions = len(environments[-1].reward_fns)
 else:
     num_actions = environments[-1].num_actions
 print(args.state_names, args.state_forms)
 state_class = GetState(head,
                        state_forms=list(
                            zip(args.state_names, args.state_forms)))
 state_class.minmax = compute_minmax(state_class,
                                     dataset_path,
                                     filename=args.focus_dumps_name)
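 # min/max bounds computed over the recorded rollouts are used to normalize the state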
 if args.normalize:
Example #3
if __name__ == "__main__":
    # Example command line:
    # python rl_template.py --model-form basic --optimizer-form DQN --record-rollouts "data/testchain/" --train-edge "Action->chain" --num-stack 1 --train --num-iters 10000 --save-dir data/test
    # python rl_template.py --model-form tab --optimizer-form TabQ --record-rollouts "data/testchain/" --train-edge "Action->chain" --num-stack 1 --train --num-iters 10000 --save-dir data/test
    # with 3 rewards: python rl_template.py --model-form basic --optimizer-form DQN --record-rollouts "data/testchain/" --train-edge "Action->chain" --num-stack 1 --train --num-iters 1000 --save-dir data/test --num-update-model 1
    # python rl_template.py --model-form tab --optimizer-form TabQ --record-rollouts "data/testchain/" --train-edge "Action->chain" --num-stack 1 --train --num-iters 1000 --save-dir data/test --num-update-model 1
    args = get_args()
    true_environment = ChainMDP(30)
    # train_models = MultiOption(1, BasicModel)
    reward_classes = [
        RewardLeft(None, args),
        RewardCenter(None, args),
        RewardRight(None, args)
    ]
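    # one option/model is trained per reward function (left, center, right)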
    train_models = MultiOption(len(reward_classes), models[args.model_form])
    learning_algorithm = learning_algorithms[args.optimizer_form]()
    option_chain = OptionChain(
        true_environment, args.record_rollouts, args.train_edge,
        args)  # here, train_edge should act like a save folder
    minmax = (0, 30)
    state_class = GetRaw(3, minmax=minmax, state_shape=[1])
    behavior_policy = EpsilonGreedyQ()
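    # epsilon-greedy exploration over Q-values: take a random action with probability epsilon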
    # behavior_policy = EpsilonGreedyProbs()
    # the last entry is None since the last environment is not yet made
    proxy_chain = option_chain.initialize(args)
    proxy_chain.pop(-1)
    print(proxy_chain)
    trainRL(args,
            option_chain.save_dir,
Example #4
 def __init__(self, base_environment, save_path, train_edge, args):
     '''
     An OptionChain contains all of the requisite information: a sequence of proxy environments.
     Edges are stored in self.edges as (head, tail) tuples.
     TODO: proxy environments depend on the path used to reach them, which can overlap;
         replace the redundant overlap.
     '''
     # self.nodes = nodes  # {args.base_node: true_environment}
     # self.edges = edges #[]
     self.environments = dict()
     self.save_path = save_path
     self.base_environment = base_environment
     self.edges = set()
     self.nodes = dict()
     self.test = not args.train
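     # create save_path if needed; if it already exists, load any previously trained edges found there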
     try:
         os.makedirs(save_path)
     except OSError:
         print("existing paths already")
         dirs = [
             d.split("/")[-1]
             for d in glob.glob(os.path.join(save_path, '*'))
         ]
         # TODO: currently loads all edges, though this has the potential to be unwieldy
         print(dirs)
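         # each directory name encodes an edge as "head->tail" and holds the saved models for that edge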
         for d in dirs:
             # TODO: only single tail edges currently
             print(d, args.load_weights, train_edge)
             edge = (d.split("->")[0], d.split("->")[1])
             self.add_edge(edge)
             if d != train_edge or self.test:  # the train edge does not need to load, unless testing, in which case train-edge is the test edge
                 print("loading", edge)
                 model_path = os.path.join(save_path, d)
                 models = MultiOption()
                 models.load(args, model_path)
                 has_test = True
                 try:
                     proxy_env = load_from_pickle(
                         os.path.join(save_path, d, "env.pkl"))
                 except FileNotFoundError as e:
                     proxy_env = ProxyEnvironment(d)
                     has_test = False
                 proxy_env.set_models(models)
                 if has_test:
                     # changes behavior policy to testing mode (no random actions)
                     proxy_env.set_test()
                 proxy_env.name = d
                 print(proxy_env.__dict__)
                 self.environments[edge] = proxy_env
             elif d == train_edge and args.load_weights:
                 print("training", d)
                 model_path = os.path.join(save_path, d)
                 models = MultiOption()
                 models.load(args, model_path)
                 proxy_env = ProxyEnvironment(d)
                 self.environments[edge] = proxy_env
                 proxy_env.set_models(models)
             else:
                 self.environments[edge] = ProxyEnvironment(d)
     # in the case that the train edge does not have directories set up
     tedge = (train_edge.split("->")[0], train_edge.split("->")[1])
     if tedge not in self.edges:
         os.makedirs(os.path.join(save_path, train_edge))
         self.add_edge(tedge)
         self.environments[tedge] = ProxyEnvironment(tedge)
     self.save_dir = os.path.join(save_path, train_edge) + "/"