Example 1
def create_symbolic_action_distributions(action_space, base_output_size):
    if action_space == "full":
        bernoulli_dist = distributions.Bernoulli(base_output_size, 2)
        item_dist = distributions.Categorical(base_output_size, 6)
        quantity_dist = distributions.Categorical(base_output_size, 5)
        move_dist = distributions.Categorical(base_output_size, 4)
        # clear_dist = distributions.Categorical(base_output_size, 4)
        dist = distributions.DistributionGeneratorTuple(
            (bernoulli_dist, item_dist, quantity_dist, move_dist))
    elif action_space == "move-only":
        bernoulli_dist = distributions.Bernoulli(base_output_size, 1)
        move_dist = distributions.Categorical(base_output_size, 4)
        dist = distributions.DistributionGeneratorTuple(
            (bernoulli_dist, move_dist))
    elif action_space == "move-continuous":
        bernoulli_dist = distributions.Bernoulli(base_output_size, 1)
        move_dist = distributions.DiagGaussian(base_output_size, 2)
        dist = distributions.DistributionGeneratorTuple(
            (bernoulli_dist, move_dist))
    elif action_space == "move-uniform":
        bernoulli_dist = distributions.Bernoulli(base_output_size, 1)
        move_x = distributions.Categorical(base_output_size, 9)
        move_y = distributions.Categorical(base_output_size, 9)
        dist = distributions.DistributionGeneratorTuple(
            (bernoulli_dist, move_x, move_y))
    elif action_space == "rooms":
        action_dist = distributions.Categorical(base_output_size, 3)
        move_x = distributions.Categorical(base_output_size, 5)
        move_y = distributions.Categorical(base_output_size, 5)
        item_dist = distributions.Categorical(base_output_size, 6)
        quantity_dist = distributions.Categorical(base_output_size, 5)
        dist = distributions.DistributionGeneratorTuple(
            (action_dist, move_x, move_y, item_dist, quantity_dist))
    else:
        raise ValueError(f"Unknown action_space: {action_space}")
    return dist
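
A minimal usage sketch (the feature size of 512 is a hypothetical placeholder; assumes the same distributions module used above):

dist = create_symbolic_action_distributions("move-only", base_output_size=512)
# dist is a DistributionGeneratorTuple of (Bernoulli(512, 1), Categorical(512, 4))
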
def experiment(variant):
    common.initialise(variant)

    expl_envs, eval_envs = common.create_environments(variant)

    (
        obs_shape,
        obs_space,
        action_space,
        n,
        mlp,
        channels,
        fc_input,
    ) = common.get_spaces(expl_envs)

    obs_dim = obs_shape[1]

    qf = CNN(
        input_width=obs_dim,
        input_height=obs_dim,
        input_channels=channels,
        output_size=8,
        kernel_sizes=[8, 4],
        n_channels=[16, 32],
        strides=[4, 2],
        paddings=[0, 0],
        hidden_sizes=[256],
    )
    # CHANGE TO ORDINAL ACTION SPACE
    action_space = gym.spaces.Box(-np.inf, np.inf, (8, ))
    expl_envs.action_space = action_space
    eval_envs.action_space = action_space

    base = common.create_networks(variant, n, mlp, channels, fc_input)

    bernoulli_dist = distributions.Bernoulli(base.output_size, 4)
    passenger_dist = distributions.Categorical(base.output_size, 5)
    delivered_dist = distributions.Categorical(base.output_size, 5)
    continuous_dist = distributions.DiagGaussian(base.output_size, 2)
    dist = distributions.DistributionGeneratorTuple(
        (bernoulli_dist, continuous_dist, passenger_dist, delivered_dist))

    eval_policy = LearnPlanPolicy(
        ScriptedPolicy(qf, variant["always_return"]),
        num_processes=variant["num_processes"],
        vectorised=True,
        json_to_screen=expl_envs.observation_space.converter,
    )
    expl_policy = LearnPlanPolicy(
        ScriptedPolicy(qf, variant["always_return"]),
        num_processes=variant["num_processes"],
        vectorised=True,
        json_to_screen=expl_envs.observation_space.converter,
    )

    eval_path_collector = HierarchicalStepCollector(
        eval_envs,
        eval_policy,
        ptu.device,
        max_num_epoch_paths_saved=variant["algorithm_kwargs"]
        ["num_eval_steps_per_epoch"],
        num_processes=variant["num_processes"],
        render=variant["render"],
        gamma=1,
        no_plan_penalty=variant.get("no_plan_penalty", False),
    )
    expl_path_collector = HierarchicalStepCollector(
        expl_envs,
        expl_policy,
        ptu.device,
        max_num_epoch_paths_saved=variant["num_steps"],
        num_processes=variant["num_processes"],
        render=variant["render"],
        gamma=variant["trainer_kwargs"]["gamma"],
        no_plan_penalty=variant.get("no_plan_penalty", False),
    )
    # added: created rollout(5,1,(4,84,84),Discrete(6),1), reset env and added obs to rollout[step]

    trainer = PPOTrainer(actor_critic=expl_policy.learner,
                         **variant["trainer_kwargs"])
    # missing: by this point, rollout back in sync.
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_envs)
    # added: replay buffer is new
    algorithm = TorchIkostrikovRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_envs,
        evaluation_env=eval_envs,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"],
        # batch_size,
        # max_path_length,
        # num_epochs,
        # num_eval_steps_per_epoch,
        # num_expl_steps_per_train_loop,
        # num_trains_per_train_loop,
        # num_train_loops_per_epoch=1,
        # min_num_steps_before_training=0,
    )

    algorithm.to(ptu.device)
    # missing: device back in sync
    algorithm.evaluate()
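
A rough variant skeleton for the experiment above, listing only the keys this function reads directly (values are placeholders; common.initialise, common.create_environments, common.create_networks, PPOTrainer and the algorithm kwargs will expect further keys not shown here):

variant = dict(
    num_processes=4,                  # parallel env workers
    num_steps=2048,                   # cap on exploration paths saved per epoch
    render=False,
    always_return=True,               # forwarded to ScriptedPolicy
    replay_buffer_size=int(1e6),
    no_plan_penalty=False,
    algorithm_kwargs=dict(num_eval_steps_per_epoch=1000),
    trainer_kwargs=dict(gamma=0.99),
)
# experiment(variant)
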
Example 3
def experiment(variant):
    common.initialise(variant)

    expl_envs, eval_envs = common.create_environments(variant)

    (
        obs_shape,
        obs_space,
        action_space,
        n,
        mlp,
        channels,
        fc_input,
    ) = common.get_spaces(expl_envs)

    # # CHANGE TO ORDINAL ACTION SPACE
    # action_space = gym.spaces.Box(-np.inf, np.inf, (8,))
    # expl_envs.action_space = action_space
    # eval_envs.action_space = action_space
    ANCILLARY_GOAL_SIZE = variant["ancillary_goal_size"]
    SYMBOLIC_ACTION_SIZE = 12

    base = common.create_networks(variant, n, mlp, channels, fc_input)
    control_base = common.create_networks(
        variant, n, mlp, channels,
        fc_input + SYMBOLIC_ACTION_SIZE)  # for uvfa goal representation

    dist = common.create_symbolic_action_distributions(variant["action_space"],
                                                       base.output_size)

    control_dist = distributions.Categorical(base.output_size, action_space.n)

    eval_learner = WrappedPolicy(
        obs_shape,
        action_space,
        ptu.device,
        base=base,
        deterministic=True,
        dist=dist,
        num_processes=variant["num_processes"],
        obs_space=obs_space,
    )

    planner = ENHSPPlanner()

    # multihead
    # eval_controller = CraftController(
    #     MultiPolicy(
    #         obs_shape,
    #         action_space,
    #         ptu.device,
    #         18,
    #         base=base,
    #         deterministic=True,
    #         num_processes=variant["num_processes"],
    #         obs_space=obs_space,
    #     )
    # )

    # expl_controller = CraftController(
    #     MultiPolicy(
    #         obs_shape,
    #         action_space,
    #         ptu.device,
    #         18,
    #         base=base,
    #         deterministic=False,
    #         num_processes=variant["num_processes"],
    #         obs_space=obs_space,
    #     )
    # )

    # uvfa
    eval_controller = CraftController(
        WrappedPolicy(
            obs_shape,
            action_space,
            ptu.device,
            base=control_base,
            dist=control_dist,
            deterministic=True,
            num_processes=variant["num_processes"],
            obs_space=obs_space,
            symbolic_action_size=SYMBOLIC_ACTION_SIZE,
        ),
        n=n,
    )

    expl_controller = CraftController(
        WrappedPolicy(
            obs_shape,
            action_space,
            ptu.device,
            base=control_base,
            dist=control_dist,
            deterministic=False,
            num_processes=variant["num_processes"],
            obs_space=obs_space,
            symbolic_action_size=SYMBOLIC_ACTION_SIZE,
        ),
        n=n,
    )
    function_env = gym.make(variant["env_name"])

    eval_policy = LearnPlanPolicy(
        eval_learner,
        planner,
        eval_controller,
        num_processes=variant["num_processes"],
        vectorised=True,
        env=function_env,
    )

    expl_learner = WrappedPolicy(
        obs_shape,
        action_space,
        ptu.device,
        base=base,
        deterministic=False,
        dist=dist,
        num_processes=variant["num_processes"],
        obs_space=obs_space,
    )

    expl_policy = LearnPlanPolicy(
        expl_learner,
        planner,
        expl_controller,
        num_processes=variant["num_processes"],
        vectorised=True,
        env=function_env,
    )

    eval_path_collector = ThreeTierStepCollector(
        eval_envs,
        eval_policy,
        ptu.device,
        ANCILLARY_GOAL_SIZE,
        SYMBOLIC_ACTION_SIZE,
        max_num_epoch_paths_saved=variant["algorithm_kwargs"]
        ["num_eval_steps_per_epoch"],
        num_processes=variant["num_processes"],
        render=variant["render"],
        gamma=1,
        no_plan_penalty=True,
        meta_num_epoch_paths=variant["meta_num_steps"],
    )
    expl_path_collector = ThreeTierStepCollector(
        expl_envs,
        expl_policy,
        ptu.device,
        ANCILLARY_GOAL_SIZE,
        SYMBOLIC_ACTION_SIZE,
        max_num_epoch_paths_saved=variant["num_steps"],
        num_processes=variant["num_processes"],
        render=variant["render"],
        gamma=variant["trainer_kwargs"]["gamma"],
        no_plan_penalty=variant.get("no_plan_penalty", False),
        meta_num_epoch_paths=variant["meta_num_steps"],
    )
    # added: created rollout(5,1,(4,84,84),Discrete(6),1), reset env and added obs to rollout[step]

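    # MultiTrainer pairs a PPO update for the low-level controller with one for the
    # high-level learner; both PPOTrainer instances share the same trainer_kwargs.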
    learn_trainer = PPOTrainer(actor_critic=expl_policy.learner,
                               **variant["trainer_kwargs"])
    control_trainer = PPOTrainer(actor_critic=expl_policy.controller.policy,
                                 **variant["trainer_kwargs"])
    trainer = MultiTrainer([control_trainer, learn_trainer])
    # missing: by this point, rollout back in sync.
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_envs)
    # added: replay buffer is new
    algorithm = TorchIkostrikovRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_envs,
        evaluation_env=eval_envs,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"],
        # batch_size,
        # max_path_length,
        # num_epochs,
        # num_eval_steps_per_epoch,
        # num_expl_steps_per_train_loop,
        # num_trains_per_train_loop,
        # num_train_loops_per_epoch=1,
        # min_num_steps_before_training=0,
    )

    algorithm.to(ptu.device)

    algorithm.train()
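
In addition to the keys listed for Example 1, this experiment reads the following from variant (all values below are hypothetical placeholders, including the gym id):

variant.update(
    ancillary_goal_size=9,        # hypothetical: length of the learner's goal/action space
    action_space="move-only",     # selects a branch of common.create_symbolic_action_distributions
    env_name="CraftWorld-v0",     # hypothetical gym id used for the planner's function_env
    meta_num_steps=50,            # hypothetical meta_num_epoch_paths budget for the collectors
)
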
Example 4
def experiment(variant):
    common.initialise(variant)

    expl_envs, eval_envs = common.create_environments(variant)

    (
        obs_shape,
        obs_space,
        action_space,
        n,
        mlp,
        channels,
        fc_input,
    ) = common.get_spaces(expl_envs)

    # # CHANGE TO ORDINAL ACTION SPACE
    # action_space = gym.spaces.Box(-np.inf, np.inf, (8,))
    # expl_envs.action_space = action_space
    # eval_envs.action_space = action_space
    ANCILLARY_GOAL_SIZE = variant[
        "ancillary_goal_size"]  # This is the length of the action space for the learner
    SYMBOLIC_ACTION_SIZE = 12
    GRID_SIZE = 31

    base = common.create_networks(variant, n, mlp, channels, fc_input)
    control_base = common.create_networks(
        variant, n, mlp, channels,
        fc_input + SYMBOLIC_ACTION_SIZE)  # for uvfa goal representation

    dist = common.create_symbolic_action_distributions(variant["action_space"],
                                                       base.output_size)

    control_dist = distributions.Categorical(base.output_size, action_space.n)

    eval_learner = WrappedPolicy(
        obs_shape,
        action_space,
        ptu.device,
        base=base,
        deterministic=True,
        dist=dist,
        num_processes=variant["num_processes"],
        obs_space=obs_space,
    )

    planner = ENHSPPlanner()

    # collect

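    # Load a saved snapshot containing the pre-trained "collect" policy; the loop below
    # reads pickled objects until EOF and keeps the last one.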
    filepath = "/home/achester/anaconda3/envs/goal-gen/.guild/runs/e77c75eed02e4b38a0a308789fbfcbd8/data/params.pkl"  # collect
    with open(filepath, "rb") as openfile:
        while True:
            try:
                policies = pickle.load(openfile)
            except EOFError:
                break

    loaded_collect_policy = policies["exploration/policy"]
    loaded_collect_policy.rnn_hxs = loaded_collect_policy.rnn_hxs[0].unsqueeze(
        0)

    eval_collect = CraftController(loaded_collect_policy, n=GRID_SIZE)

    expl_collect = CraftController(loaded_collect_policy, n=GRID_SIZE)

    # other
    filepath = "/home/achester/anaconda3/envs/goal-gen/.guild/runs/cf5c31afe0724acd8f6398d77a80443e/data/params.pkl"  # other
    # filepath = "/home/achester/Documents/symbolic-goal-generation/data/params.pkl"
    with open(filepath, "rb") as openfile:
        while True:
            try:
                policies = pickle.load(openfile)
            except EOFError:
                break

    loaded_other_policy = policies["exploration/policy"]
    loaded_other_policy.rnn_hxs = loaded_other_policy.rnn_hxs[0].unsqueeze(0)

    eval_other = CraftController(loaded_other_policy, n=GRID_SIZE)
    expl_other = CraftController(loaded_other_policy, n=GRID_SIZE)

    eval_controller = PretrainedController([eval_collect, eval_other])
    expl_controller = PretrainedController([expl_collect, expl_other])

    function_env = gym.make(variant["env_name"])

    eval_policy = LearnPlanPolicy(
        eval_learner,
        planner,
        eval_controller,
        num_processes=variant["num_processes"],
        vectorised=True,
        env=function_env,
    )

    expl_learner = WrappedPolicy(
        obs_shape,
        action_space,
        ptu.device,
        base=base,
        deterministic=False,
        dist=dist,
        num_processes=variant["num_processes"],
        obs_space=obs_space,
    )

    expl_policy = LearnPlanPolicy(
        expl_learner,
        planner,
        expl_controller,
        num_processes=variant["num_processes"],
        vectorised=True,
        env=function_env,
    )

    eval_path_collector = ThreeTierStepCollector(
        eval_envs,
        eval_policy,
        ptu.device,
        ANCILLARY_GOAL_SIZE,
        SYMBOLIC_ACTION_SIZE,
        max_num_epoch_paths_saved=variant["algorithm_kwargs"]
        ["num_eval_steps_per_epoch"],
        num_processes=variant["num_processes"],
        render=variant["render"],
        gamma=1,
        no_plan_penalty=True,
        meta_num_epoch_paths=variant["meta_num_steps"],
    )
    expl_path_collector = ThreeTierStepCollector(
        expl_envs,
        expl_policy,
        ptu.device,
        ANCILLARY_GOAL_SIZE,
        SYMBOLIC_ACTION_SIZE,
        max_num_epoch_paths_saved=variant["num_steps"],
        num_processes=variant["num_processes"],
        render=variant["render"],
        gamma=variant["trainer_kwargs"]["gamma"],
        no_plan_penalty=variant.get("no_plan_penalty", False),
        meta_num_epoch_paths=variant["meta_num_steps"],
    )

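    # The pretrained controllers are not updated here: DummyTrainer fills the controller
    # slot in MultiTrainer, so only the learner is trained with PPO.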
    learn_trainer = PPOTrainer(actor_critic=expl_policy.learner,
                               **variant["trainer_kwargs"])
    control_trainer = DummyTrainer()
    trainer = MultiTrainer([control_trainer, learn_trainer])

    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_envs)

    algorithm = TorchIkostrikovRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_envs,
        evaluation_env=eval_envs,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"],
        # batch_size,
        # max_path_length,
        # num_epochs,
        # num_eval_steps_per_epoch,
        # num_expl_steps_per_train_loop,
        # num_trains_per_train_loop,
        # num_train_loops_per_epoch=1,
        # min_num_steps_before_training=0,
    )

    algorithm.to(ptu.device)

    algorithm.train()