def create_symbolic_action_distributions(action_space, base_output_size):
    if action_space == "full":
        bernoulli_dist = distributions.Bernoulli(base_output_size, 2)
        item_dist = distributions.Categorical(base_output_size, 6)
        quantity_dist = distributions.Categorical(base_output_size, 5)
        move_dist = distributions.Categorical(base_output_size, 4)
        # clear_dist = distributions.Categorical(base_output_size, 4)
        dist = distributions.DistributionGeneratorTuple(
            (bernoulli_dist, item_dist, quantity_dist, move_dist)
        )
    elif action_space == "move-only":
        bernoulli_dist = distributions.Bernoulli(base_output_size, 1)
        move_dist = distributions.Categorical(base_output_size, 4)
        dist = distributions.DistributionGeneratorTuple((bernoulli_dist, move_dist))
    elif action_space == "move-continuous":
        bernoulli_dist = distributions.Bernoulli(base_output_size, 1)
        move_dist = distributions.DiagGaussian(base_output_size, 2)
        dist = distributions.DistributionGeneratorTuple((bernoulli_dist, move_dist))
    elif action_space == "move-uniform":
        bernoulli_dist = distributions.Bernoulli(base_output_size, 1)
        move_x = distributions.Categorical(base_output_size, 9)
        move_y = distributions.Categorical(base_output_size, 9)
        dist = distributions.DistributionGeneratorTuple(
            (bernoulli_dist, move_x, move_y)
        )
    elif action_space == "rooms":
        action_dist = distributions.Categorical(base_output_size, 3)
        move_x = distributions.Categorical(base_output_size, 5)
        move_y = distributions.Categorical(base_output_size, 5)
        item_dist = distributions.Categorical(base_output_size, 6)
        quantity_dist = distributions.Categorical(base_output_size, 5)
        dist = distributions.DistributionGeneratorTuple(
            (action_dist, move_x, move_y, item_dist, quantity_dist)
        )
    else:
        # Guard against silently returning an unbound `dist` for an
        # unrecognised action space string.
        raise ValueError(f"Unknown action space: {action_space}")
    return dist
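# Usage sketch (illustrative only): the experiment functions below reach this
# factory through `common.create_symbolic_action_distributions(...)`. Here
# `base` is assumed to be a policy network exposing an `output_size` attribute,
# as produced by `common.create_networks` in those functions.
#
#     dist = create_symbolic_action_distributions(
#         variant["action_space"], base.output_size
#     )
#     control_dist = distributions.Categorical(base.output_size, action_space.n)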
def experiment(variant):
    common.initialise(variant)
    expl_envs, eval_envs = common.create_environments(variant)
    (
        obs_shape,
        obs_space,
        action_space,
        n,
        mlp,
        channels,
        fc_input,
    ) = common.get_spaces(expl_envs)

    obs_dim = obs_shape[1]
    qf = CNN(
        input_width=obs_dim,
        input_height=obs_dim,
        input_channels=channels,
        output_size=8,
        kernel_sizes=[8, 4],
        n_channels=[16, 32],
        strides=[4, 2],
        paddings=[0, 0],
        hidden_sizes=[256],
    )

    # CHANGE TO ORDINAL ACTION SPACE
    action_space = gym.spaces.Box(-np.inf, np.inf, (8,))
    expl_envs.action_space = action_space
    eval_envs.action_space = action_space

    base = common.create_networks(variant, n, mlp, channels, fc_input)

    bernoulli_dist = distributions.Bernoulli(base.output_size, 4)
    passenger_dist = distributions.Categorical(base.output_size, 5)
    delivered_dist = distributions.Categorical(base.output_size, 5)
    continuous_dist = distributions.DiagGaussian(base.output_size, 2)
    dist = distributions.DistributionGeneratorTuple(
        (bernoulli_dist, continuous_dist, passenger_dist, delivered_dist)
    )

    eval_policy = LearnPlanPolicy(
        ScriptedPolicy(qf, variant["always_return"]),
        num_processes=variant["num_processes"],
        vectorised=True,
        json_to_screen=expl_envs.observation_space.converter,
    )
    expl_policy = LearnPlanPolicy(
        ScriptedPolicy(qf, variant["always_return"]),
        num_processes=variant["num_processes"],
        vectorised=True,
        json_to_screen=expl_envs.observation_space.converter,
    )

    eval_path_collector = HierarchicalStepCollector(
        eval_envs,
        eval_policy,
        ptu.device,
        max_num_epoch_paths_saved=variant["algorithm_kwargs"]["num_eval_steps_per_epoch"],
        num_processes=variant["num_processes"],
        render=variant["render"],
        gamma=1,
        no_plan_penalty=variant.get("no_plan_penalty", False),
    )
    expl_path_collector = HierarchicalStepCollector(
        expl_envs,
        expl_policy,
        ptu.device,
        max_num_epoch_paths_saved=variant["num_steps"],
        num_processes=variant["num_processes"],
        render=variant["render"],
        gamma=variant["trainer_kwargs"]["gamma"],
        no_plan_penalty=variant.get("no_plan_penalty", False),
    )

    # added: created rollout(5,1,(4,84,84),Discrete(6),1), reset env and added obs to rollout[step]
    trainer = PPOTrainer(actor_critic=expl_policy.learner, **variant["trainer_kwargs"])
    # missing: by this point, rollout back in sync.
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_envs)
    # added: replay buffer is new
    algorithm = TorchIkostrikovRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_envs,
        evaluation_env=eval_envs,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"],
        # batch_size,
        # max_path_length,
        # num_epochs,
        # num_eval_steps_per_epoch,
        # num_expl_steps_per_train_loop,
        # num_trains_per_train_loop,
        # num_train_loops_per_epoch=1,
        # min_num_steps_before_training=0,
    )

    algorithm.to(ptu.device)  # missing: device back in sync
    algorithm.evaluate()
def experiment(variant):
    common.initialise(variant)
    expl_envs, eval_envs = common.create_environments(variant)
    (
        obs_shape,
        obs_space,
        action_space,
        n,
        mlp,
        channels,
        fc_input,
    ) = common.get_spaces(expl_envs)

    # # CHANGE TO ORDINAL ACTION SPACE
    # action_space = gym.spaces.Box(-np.inf, np.inf, (8,))
    # expl_envs.action_space = action_space
    # eval_envs.action_space = action_space

    ANCILLARY_GOAL_SIZE = variant["ancillary_goal_size"]
    SYMBOLIC_ACTION_SIZE = 12

    base = common.create_networks(variant, n, mlp, channels, fc_input)
    control_base = common.create_networks(
        variant, n, mlp, channels, fc_input + SYMBOLIC_ACTION_SIZE
    )  # for uvfa goal representation

    dist = common.create_symbolic_action_distributions(
        variant["action_space"], base.output_size
    )
    control_dist = distributions.Categorical(base.output_size, action_space.n)

    eval_learner = WrappedPolicy(
        obs_shape,
        action_space,
        ptu.device,
        base=base,
        deterministic=True,
        dist=dist,
        num_processes=variant["num_processes"],
        obs_space=obs_space,
    )

    planner = ENHSPPlanner()

    # multihead
    # eval_controller = CraftController(
    #     MultiPolicy(
    #         obs_shape,
    #         action_space,
    #         ptu.device,
    #         18,
    #         base=base,
    #         deterministic=True,
    #         num_processes=variant["num_processes"],
    #         obs_space=obs_space,
    #     )
    # )
    # expl_controller = CraftController(
    #     MultiPolicy(
    #         obs_shape,
    #         action_space,
    #         ptu.device,
    #         18,
    #         base=base,
    #         deterministic=False,
    #         num_processes=variant["num_processes"],
    #         obs_space=obs_space,
    #     )
    # )

    # uvfa
    eval_controller = CraftController(
        WrappedPolicy(
            obs_shape,
            action_space,
            ptu.device,
            base=control_base,
            dist=control_dist,
            deterministic=True,
            num_processes=variant["num_processes"],
            obs_space=obs_space,
            symbolic_action_size=SYMBOLIC_ACTION_SIZE,
        ),
        n=n,
    )
    expl_controller = CraftController(
        WrappedPolicy(
            obs_shape,
            action_space,
            ptu.device,
            base=control_base,
            dist=control_dist,
            deterministic=False,
            num_processes=variant["num_processes"],
            obs_space=obs_space,
            symbolic_action_size=SYMBOLIC_ACTION_SIZE,
        ),
        n=n,
    )

    function_env = gym.make(variant["env_name"])

    eval_policy = LearnPlanPolicy(
        eval_learner,
        planner,
        eval_controller,
        num_processes=variant["num_processes"],
        vectorised=True,
        env=function_env,
    )

    expl_learner = WrappedPolicy(
        obs_shape,
        action_space,
        ptu.device,
        base=base,
        deterministic=False,
        dist=dist,
        num_processes=variant["num_processes"],
        obs_space=obs_space,
    )

    expl_policy = LearnPlanPolicy(
        expl_learner,
        planner,
        expl_controller,
        num_processes=variant["num_processes"],
        vectorised=True,
        env=function_env,
    )

    eval_path_collector = ThreeTierStepCollector(
        eval_envs,
        eval_policy,
        ptu.device,
        ANCILLARY_GOAL_SIZE,
        SYMBOLIC_ACTION_SIZE,
        max_num_epoch_paths_saved=variant["algorithm_kwargs"]["num_eval_steps_per_epoch"],
        num_processes=variant["num_processes"],
        render=variant["render"],
        gamma=1,
        no_plan_penalty=True,
        meta_num_epoch_paths=variant["meta_num_steps"],
    )
    expl_path_collector = ThreeTierStepCollector(
        expl_envs,
        expl_policy,
        ptu.device,
        ANCILLARY_GOAL_SIZE,
        SYMBOLIC_ACTION_SIZE,
        max_num_epoch_paths_saved=variant["num_steps"],
        num_processes=variant["num_processes"],
        render=variant["render"],
        gamma=variant["trainer_kwargs"]["gamma"],
        no_plan_penalty=variant.get("no_plan_penalty", False),
        meta_num_epoch_paths=variant["meta_num_steps"],
    )

    # added: created rollout(5,1,(4,84,84),Discrete(6),1), reset env and added obs to rollout[step]
    learn_trainer = PPOTrainer(
        actor_critic=expl_policy.learner, **variant["trainer_kwargs"]
    )
    control_trainer = PPOTrainer(
        actor_critic=expl_policy.controller.policy, **variant["trainer_kwargs"]
    )
    trainer = MultiTrainer([control_trainer, learn_trainer])
    # missing: by this point, rollout back in sync.
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_envs)
    # added: replay buffer is new
    algorithm = TorchIkostrikovRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_envs,
        evaluation_env=eval_envs,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"],
        # batch_size,
        # max_path_length,
        # num_epochs,
        # num_eval_steps_per_epoch,
        # num_expl_steps_per_train_loop,
        # num_trains_per_train_loop,
        # num_train_loops_per_epoch=1,
        # min_num_steps_before_training=0,
    )

    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    common.initialise(variant)
    expl_envs, eval_envs = common.create_environments(variant)
    (
        obs_shape,
        obs_space,
        action_space,
        n,
        mlp,
        channels,
        fc_input,
    ) = common.get_spaces(expl_envs)

    # # CHANGE TO ORDINAL ACTION SPACE
    # action_space = gym.spaces.Box(-np.inf, np.inf, (8,))
    # expl_envs.action_space = action_space
    # eval_envs.action_space = action_space

    # This is the length of the action space for the learner.
    ANCILLARY_GOAL_SIZE = variant["ancillary_goal_size"]
    SYMBOLIC_ACTION_SIZE = 12
    GRID_SIZE = 31

    base = common.create_networks(variant, n, mlp, channels, fc_input)
    control_base = common.create_networks(
        variant, n, mlp, channels, fc_input + SYMBOLIC_ACTION_SIZE
    )  # for uvfa goal representation

    dist = common.create_symbolic_action_distributions(
        variant["action_space"], base.output_size
    )
    control_dist = distributions.Categorical(base.output_size, action_space.n)

    eval_learner = WrappedPolicy(
        obs_shape,
        action_space,
        ptu.device,
        base=base,
        deterministic=True,
        dist=dist,
        num_processes=variant["num_processes"],
        obs_space=obs_space,
    )

    planner = ENHSPPlanner()

    # Load the pretrained "collect" controller policy.
    filepath = "/home/achester/anaconda3/envs/goal-gen/.guild/runs/e77c75eed02e4b38a0a308789fbfcbd8/data/params.pkl"  # collect
    with open(filepath, "rb") as openfile:
        while True:
            try:
                policies = pickle.load(openfile)
            except EOFError:
                break

    loaded_collect_policy = policies["exploration/policy"]
    loaded_collect_policy.rnn_hxs = loaded_collect_policy.rnn_hxs[0].unsqueeze(0)
    eval_collect = CraftController(loaded_collect_policy, n=GRID_SIZE)
    expl_collect = CraftController(loaded_collect_policy, n=GRID_SIZE)

    # Load the pretrained "other" controller policy.
    filepath = "/home/achester/anaconda3/envs/goal-gen/.guild/runs/cf5c31afe0724acd8f6398d77a80443e/data/params.pkl"  # other
    # filepath = "/home/achester/Documents/symbolic-goal-generation/data/params.pkl"
    with open(filepath, "rb") as openfile:
        while True:
            try:
                policies = pickle.load(openfile)
            except EOFError:
                break

    loaded_other_policy = policies["exploration/policy"]
    loaded_other_policy.rnn_hxs = loaded_other_policy.rnn_hxs[0].unsqueeze(0)
    eval_other = CraftController(loaded_other_policy, n=GRID_SIZE)
    expl_other = CraftController(loaded_other_policy, n=GRID_SIZE)

    eval_controller = PretrainedController([eval_collect, eval_other])
    expl_controller = PretrainedController([expl_collect, expl_other])

    function_env = gym.make(variant["env_name"])

    eval_policy = LearnPlanPolicy(
        eval_learner,
        planner,
        eval_controller,
        num_processes=variant["num_processes"],
        vectorised=True,
        env=function_env,
    )

    expl_learner = WrappedPolicy(
        obs_shape,
        action_space,
        ptu.device,
        base=base,
        deterministic=False,
        dist=dist,
        num_processes=variant["num_processes"],
        obs_space=obs_space,
    )

    expl_policy = LearnPlanPolicy(
        expl_learner,
        planner,
        expl_controller,
        num_processes=variant["num_processes"],
        vectorised=True,
        env=function_env,
    )

    eval_path_collector = ThreeTierStepCollector(
        eval_envs,
        eval_policy,
        ptu.device,
        ANCILLARY_GOAL_SIZE,
        SYMBOLIC_ACTION_SIZE,
        max_num_epoch_paths_saved=variant["algorithm_kwargs"]["num_eval_steps_per_epoch"],
        num_processes=variant["num_processes"],
        render=variant["render"],
        gamma=1,
        no_plan_penalty=True,
        meta_num_epoch_paths=variant["meta_num_steps"],
    )
    expl_path_collector = ThreeTierStepCollector(
        expl_envs,
        expl_policy,
        ptu.device,
        ANCILLARY_GOAL_SIZE,
        SYMBOLIC_ACTION_SIZE,
        max_num_epoch_paths_saved=variant["num_steps"],
        num_processes=variant["num_processes"],
        render=variant["render"],
        gamma=variant["trainer_kwargs"]["gamma"],
        no_plan_penalty=variant.get("no_plan_penalty", False),
        meta_num_epoch_paths=variant["meta_num_steps"],
    )

    # Only the learner is trained here; the pretrained controllers are frozen.
    learn_trainer = PPOTrainer(
        actor_critic=expl_policy.learner, **variant["trainer_kwargs"]
    )
    control_trainer = DummyTrainer()
    trainer = MultiTrainer([control_trainer, learn_trainer])

    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_envs)
    algorithm = TorchIkostrikovRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_envs,
        evaluation_env=eval_envs,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"],
        # batch_size,
        # max_path_length,
        # num_epochs,
        # num_eval_steps_per_epoch,
        # num_expl_steps_per_train_loop,
        # num_trains_per_train_loop,
        # num_train_loops_per_epoch=1,
        # min_num_steps_before_training=0,
    )

    algorithm.to(ptu.device)
    algorithm.train()
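# Illustrative variant skeleton (values are placeholders, not tuned settings):
# it collects the keys read by the experiment functions above. The full
# contents of trainer_kwargs and algorithm_kwargs depend on the PPOTrainer and
# TorchIkostrikovRLAlgorithm signatures beyond the keys shown here.
example_variant = dict(
    env_name="...",                 # gym id passed to gym.make
    action_space="move-only",       # branch in create_symbolic_action_distributions
    ancillary_goal_size=8,          # length of the learner's goal/action vector
    num_processes=1,
    num_steps=2048,
    meta_num_steps=64,
    render=False,
    no_plan_penalty=False,
    replay_buffer_size=int(1e6),
    trainer_kwargs=dict(gamma=0.99),
    algorithm_kwargs=dict(num_eval_steps_per_epoch=1000),
)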