Code Example #1
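    # Constructor of the A3C Worker (instantiated in Code Example #10): stores the
    # shared trainer, model path, and global episode counter, creates a per-worker
    # summary writer, picks the train or test scenario from `params`, and builds a
    # local A3CAgent around its own gym environment.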
    def __init__(
        self,
        name,
        s_size,
        a_size,
        trainer,
        model_path,
        global_episodes,
        env_name,
        seed,
        test,
        cell_units,
        params,
        testing_trial=False,
    ):
        self.name = "worker_" + str(name)
        self.number = name
        self.model_path = model_path
        self.trainer = trainer
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []
        self.summary_writer = tf.summary.FileWriter("train_" +
                                                    str(self.number))
        self.is_test = test

        self.s_size = s_size
        self.a_size = a_size
        self.params = params
        self.MINI_BATCH = 30
        self.REWARD_FACTOR = 0.001

        # Create the local copy of the network and the tensorflow op to copy global parameters to local network
        # self.update_local_ops = update_target_graph('global', self.name)

        self.testing_trial = testing_trial
        if not self.testing_trial:
            self.scenario_name = params["train_scenario_name"]
            self.attempt_limit = params["train_attempt_limit"]
        else:
            self.scenario_name = params["test_scenario_name"]
            self.attempt_limit = params["test_attempt_limit"]

        self.scenario = select_scenario(self.scenario_name,
                                        params["use_physics"])
        env = gym.make(env_name)

        self.agent = A3CAgent(env, self.s_size, self.a_size, self.name,
                              self.params)
        self.agent.env.reward_mode = params["reward_mode"]
        self.agent.env.use_physics = params["use_physics"]
        self.trial_count = 0
        self.agent.env.seed(seed)
Code Example #2
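# Builds one CausalChainCompact per scenario solution: each action in the
# solution is mapped to a lever/door state, CPT choice, attribute tuple, and
# expected ending state for the given trial's lever configuration.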
def generate_solutions_by_trial(scenario_name, trial_name):
    solution_chains = []
    scenario = select_scenario(scenario_name, use_physics=False)

    # TODO(mjedmonds): extract these from the environment/scenario somehow. these are hard-coded
    lever_cpt_choice = GRAPH_INT_TYPE(1)
    door_cpt_choice = GRAPH_INT_TYPE(0)
    lever_ending_state = GRAPH_INT_TYPE(ENTITY_STATES["LEVER_PUSHED"])
    door_ending_state = GRAPH_INT_TYPE(ENTITY_STATES["DOOR_OPENED"])

    scenario_solutions = scenario.solutions
    trial_levers = LEVER_CONFIGS[trial_name]
    for scenario_solution in scenario_solutions:
        solution_actions = []
        solution_states = []
        solution_cpt_choices = []
        solution_attributes = []
        solution_outcomes = []
        for action_log in scenario_solution:
            action_name = action_log.name
            state_name = action_name.split("_")[1]
            if state_name == "door":
                ending_state = door_ending_state
                cpt_choice = door_cpt_choice
            else:
                # determine position of lever based on role
                for trial_lever in trial_levers:
                    if (get_one_of(
                            trial_lever,
                        ["LeverRole", "LeverRoleEnum"]) == state_name):
                        state_name = trial_lever.LeverPosition.name
                ending_state = lever_ending_state
                cpt_choice = lever_cpt_choice

            action_name = "push_" + state_name
            attributes = (state_name, "GREY")
            solution_actions.append(action_name)
            solution_states.append(state_name)
            solution_attributes.append(attributes)
            solution_cpt_choices.append(cpt_choice)
            solution_outcomes.append(ending_state)
        solution_chains.append(
            CausalChainCompact(
                states=tuple(solution_states),
                actions=tuple(solution_actions),
                conditional_probability_table_choices=tuple(
                    solution_cpt_choices),
                outcomes=tuple(solution_outcomes),
                attributes=tuple(solution_attributes),
            ))
    return solution_chains
Code Example #3
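# Variant of the solution generator that emits tuples of CausalRelation links
# instead of CausalChainCompact objects; each link's precondition is derived
# from the previous link's attributes and causal relation type.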
def generate_solutions_by_trial_causal_relation(scenario_name, trial_name):
    solution_chains = []
    scenario = select_scenario(scenario_name, use_physics=False)

    # todo: extract these from the environment/scenario somehow. these are hard-coded
    lever_causal_relation_type = CausalRelationType.one_to_zero
    door_causal_relation_type = CausalRelationType.zero_to_one

    scenario_solutions = scenario.solutions
    trial_levers = LEVER_CONFIGS[trial_name]
    for scenario_solution in scenario_solutions:
        solution_chain = []
        precondition = None
        for action_log in scenario_solution:
            action_name = action_log.name
            state_name = action_name.split("_")[1]
            if state_name == "door":
                causal_relation = door_causal_relation_type
            else:
                # determine position of lever based on role
                for trial_lever in trial_levers:
                    if trial_lever.LeverRoleEnum == state_name:
                        state_name = trial_lever.LeverPosition.name
                causal_relation = lever_causal_relation_type

            action_name = "push"
            attributes = (state_name, "GREY")
            solution_chain.append(
                CausalRelation(action=Action(action_name, attributes[0], None),
                               attributes=attributes,
                               causal_relation_type=causal_relation,
                               precondition=precondition))
            # setup precondition for next link in chain
            precondition = (attributes, causal_relation[1])
        solution_chains.append(tuple(solution_chain))
    return solution_chains
Code Example #4
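# Extended version of Code Example #3 that also supports wildcard ("*") actions
# in a solution: wildcards accumulate a `delay`, and a CausalRelation is only
# emitted once the next concrete action (or the end of the solution) is reached.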
def generate_solutions_by_trial_causal_relation(scenario_name, trial_name):
    solution_chains = []
    scenario = select_scenario(scenario_name, use_physics=False)

    # TODO(mjedmonds): extract these from the environment/scenario somehow. these are hard-coded
    lever_causal_relation_type = CausalRelationType.one_to_zero
    door_causal_relation_type = CausalRelationType.zero_to_one

    scenario_solutions = get_one_of(scenario, ["SOLUTIONS", "solutions"])
    trial_levers = LEVER_CONFIGS[trial_name]
    for scenario_solution in scenario_solutions:
        solution_chain = []
        precondition = None

        attributes = None
        causal_relation = None
        delay = 0

        first = True

        # We can't know what the delay is until we see the next action. So we don't create the
        # causal relation for an action until we see the next non-wildcard action.
        for action_log in scenario_solution:
            action_name = action_log.name
            if action_name == "*":
                if first:
                    raise ValueError(
                        "Solutions cannot start with a wildcard action.")
                delay += 1
                continue
            elif not first:
                solution_chain.append(
                    CausalRelation(
                        action=Action(name="push",
                                      obj=attributes[0],
                                      params=None),
                        attributes=attributes,
                        causal_relation_type=causal_relation,
                        precondition=precondition,
                        delay=delay,
                    ))
                delay = 0
                precondition = (attributes, causal_relation[1])

            first = False

            state_name = action_name.split("_")[1]
            if state_name == "door":
                causal_relation = door_causal_relation_type
            else:
                # determine position of lever based on role
                for trial_lever in trial_levers:
                    if (get_one_of(
                            trial_lever,
                        ["LeverRole", "LeverRoleEnum"]) == state_name):
                        state_name = trial_lever.LeverPosition.name
                causal_relation = lever_causal_relation_type

            attributes = (state_name, "GREY")

        # Append the last action
        solution_chain.append(
            CausalRelation(
                action=Action("push", attributes[0], None),
                attributes=attributes,
                causal_relation_type=causal_relation,
                precondition=precondition,
                delay=delay,
            ))

        solution_chains.append(tuple(solution_chain))
    return solution_chains
Code Example #5
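# Entry point for meta-training a MAMLAgent across several OpenLock scenarios
# (env_list): per-environment params, memories, and figures are prepared, then
# each training iteration interleaves trials from every environment before a
# single meta-update is performed from the collected batches.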
def main():
    torch.set_default_tensor_type("torch.DoubleTensor")

    # general params
    # training params
    params_list = []
    env_list = ["CE3-CE4", "CE4"]
    memory_list = []
    fig_list = [create_reward_fig() for _ in env_list]
    log_list = [[] for _ in env_list]
    for env_name in env_list:
        # if len(sys.argv) < 2:
        #     # general params
        #     # training params
        #     # PICK ONE and comment others
        #     params = PARAMS['CE3-CE4']
        #     # params = PARAMS['CE3-CC4']
        #     # params = PARAMS['CC3-CE4']
        #     # params = PARAMS['CC3-CC4']
        #     # params = PARAMS['CE4']
        #     # params = PARAMS['CC4']
        # else:
        #     setting = sys.argv[1]
        #     params = PARAMS[IDX_TO_PARAMS[int(setting) - 1]]
        #     print('training_scenario: {}, testing_scenario: {}'.format(params['train_scenario_name'],
        #                                                                params['test_scenario_name']))
        #     params['reward_mode'] = sys.argv[2]
        memory_list.append(Memory())
        # generic
        params = PARAMS[env_name]
        params["gamma"] = 0.99
        params["reward_mode"] = "basic"
        # a2c
        params["epsilon"] = 0.95
        params["l2_reg"] = 1e-3
        # trpo
        params["max_kl"] = 1e-2
        params["damping"] = 1e-2
        # ppo
        params["clip_epsilon"] = 0.2
        params["optim_value_epochs"] = 1
        # maml
        params["backbone"] = "trpo"
        params["num_grad_update"] = 1
        params["lr_pre_update"] = 1e-3
        params["lr_meta_update"] = 1e-3
        params["pre_batch_size"] = 2048
        params["meta_batch_size"] = 2048

        # others
        params["use_gpu"] = True and torch.cuda.is_available()
        params["gpuid"] = int(sys.argv[4]) if len(sys.argv) >= 5 else 0

        params["Tensor"] = (
            torch.cuda.DoubleTensor if params["use_gpu"] else torch.DoubleTensor
        )
        params["ActionTensor"] = (
            torch.cuda.LongTensor if params["use_gpu"] else torch.LongTensor
        )

        random_seed = 1234
        params["use_physics"] = False
        # run to the full attempt limit, regardless of whether or not all solutions were found
        params["full_attempt_limit"] = False
        params["num_training_iters"] = 1000
        params["num_training_trials"] = params["train_num_trials"]
        params["train_attempt_limit"] = 700

        params["num_testing_iters"] = 1000
        params["num_testing_trials"] = params["test_num_trials"]
        params["test_attempt_limit"] = 700

        # RL specific settings
        params["data_dir"] = os.path.dirname(ROOT_DIR) + "/OpenLockRLResults/subjects"

        params_list.append(params)

    # TODO: we assume all the scenarios share the same observation space
    scenario = select_scenario(
        params["train_scenario_name"], use_physics=params["use_physics"]
    )

    env = gym.make("openlock-v1")
    env.use_physics = params["use_physics"]
    env.full_attempt_limit = params["full_attempt_limit"]
    # set up observation space
    env.observation_space = ObservationSpace(
        len(scenario.levers), append_solutions_remaining=False
    )
    # set reward mode
    env.reward_mode = params["reward_mode"]
    print("Reward mode: {}".format(env.reward_mode))
    np.random.seed(random_seed)
    env.seed(random_seed)

    # dummy agent
    agent = MAMLAgent(env, 1, 1, params, env_list, require_log=False)
    trial_selected = agent.setup_trial(
        scenario_name=params["train_scenario_name"],
        action_limit=params["train_action_limit"],
        attempt_limit=params["train_attempt_limit"],
    )
    env.reset()

    state_size = agent.env.observation_space.multi_discrete.shape[0]
    action_size = len(env.action_space)
    agent = MAMLAgent(env, state_size, action_size, params, env_list)
    save_path = os.path.join(
        params["data_dir"],
        "3rd_model_log/maml-{}-{}-{}".format(
            "_".join(env_list), params["reward_mode"], agent.subject_id
        ),
    )
    load_path = sys.argv[3] if len(sys.argv) >= 4 else ""  # path without '.*' suffix
    os.makedirs(save_path, exist_ok=True)
    reward_counter_list = [env.reward_strategy.counter for _ in env_list]
    reward_attempt_count_list = [env.reward_strategy.attempt_count for _ in env_list]

    agent.env.reset()
    if load_path:
        agent.load(load_path)
        print("load model from {}".format(load_path))

    agent.env.human_agent = False
    agent.type_tag = "{}-{}-MAML".format("_".join(env_list), params["reward_mode"])
    # train over multiple iterations over all trials
    for iter_num in range(params_list[0]["num_training_iters"]):
        for ind, env_name in enumerate(env_list):
            agent.env.completed_trials = []
            agent.env.scenario = None
            agent.env.cur_trial = None
            agent.env.reward_strategy.counter = reward_counter_list[ind]
            agent.env.reward_strategy.attempt_count = reward_attempt_count_list[ind]
            print("[Train] Now meta train on {}".format(env_name))
            params = params_list[ind]
            memory = memory_list[ind]
            agent._update_params_and_mem(params, memory)

            for trial_num in range(0, params_list[0]["num_training_trials"]):
                agent.run_trial_maml(
                    scenario_name=params["train_scenario_name"],
                    fig=fig_list[ind],
                    action_limit=params["train_action_limit"],
                    attempt_limit=params["train_attempt_limit"],
                    trial_count=trial_num,
                    iter_num=iter_num,
                    env_ind=ind,
                )
                fig_list[ind], log_list[ind] = agent.log_values(
                    [
                        agent.trial_length[ind],
                        agent.trial_percent_attempt_success[ind],
                        agent.trial_percent_solution_found[ind],
                        agent.average_trial_rewards[ind],
                        agent.attempt_rewards[ind],
                    ],
                    fig_list[ind],
                    [
                        "Attempt Count Per Trial",
                        "Percentage of Successful Attempts in Trial",
                        "Percentage of Solutions Found in Trial",
                        "Average Trial Reward",
                        "Attempt Reward",
                    ],
                    agent.type_tag + "-{}".format(env_name),
                )
        pickle.dump(
            (agent.type_tag, log_list, params),
            open(os.path.join(save_path, "log.pkl"), "wb"),
        )
        # update: collect a pre-update batch and a meta-update batch from every environment
        batch_a, batch_b = [], []
        for ind, env_name in enumerate(env_list):
            memory = memory_list[ind]
            params = params_list[ind]
            if len(memory) > params["pre_batch_size"] + params["meta_batch_size"]:
                print("[Update] Now do an update with {}".format(env_name))
                batch_a.append(memory.sample(params["pre_batch_size"]))
                batch_b.append(memory.sample(params["meta_batch_size"]))
                memory.clear()
        print("[Update] Now update")
        agent.update(batch_a, batch_b, iter_num)
        agent.save(save_path, iter_num)
    print(
        "Trial complete for subject {}. Average reward: {}".format(
            agent.logger.subject_id, agent.average_trial_rewards[-1]
        )
    )
    # save the per-environment reward figures
    for ind, env_name in enumerate(env_list):
        fig_list[ind].savefig(os.path.join(save_path, "log_{}.png".format(env_name)))
Code Example #6
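# Runs the full transfer protocol for an agent: train on the training scenario,
# plot and save the training rewards and weights, then (if a test scenario is
# configured) switch the environment to it and run the testing trials.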
def train_transfer_test_transfer(agent, fig=None):
    # train all training cases/trials
    params = agent.params
    trial_count = 0
    agent, trial_count = run_trials(
        agent,
        trial_count,
        params["train_num_iters"],
        params["train_num_trials"],
        params["train_scenario_name"],
        params["train_action_limit"],
        params["train_attempt_limit"],
        params["use_dynamic_epsilon"],
        params["dynamic_epsilon_max"],
        params["dynamic_epsilon_decay"],
        test_trial=False,
        fig=fig,
    )

    agent.plot_rewards(
        agent.rewards,
        agent.epsilons,
        agent.writer.subject_path + "/training_rewards.png",
    )
    agent.plot_rewards_trial_switch_points(
        agent.rewards,
        agent.epsilons,
        agent.trial_switch_points,
        agent.writer.subject_path + "/training_rewards_switch_points.png",
        plot_xticks=False,
    )
    agent.test_start_reward_idx = len(agent.rewards)
    agent.test_start_trial_count = trial_count

    agent.save_weights(agent.writer.subject_path + "/models",
                       "/training_final.cpkt",
                       sess=agent.sess)

    # testing trial
    # print "INFO: STARTING TESTING TRIAL"
    if params["test_scenario_name"] is not None:

        # setup testing trial
        scenario = select_scenario(params["test_scenario_name"],
                                   use_physics=params["use_physics"])
        agent.env.update_scenario(scenario)
        agent.env.set_action_limit(params["test_action_limit"])
        agent.env.observation_space = ObservationSpace(
            len(scenario.levers), append_solutions_remaining=False)

        agent, trial_count = run_trials(
            agent,
            trial_count,
            params["test_num_iters"],
            params["test_num_trials"],
            params["test_scenario_name"],
            params["test_action_limit"],
            params["test_attempt_limit"],
            params["use_dynamic_epsilon"],
            params["dynamic_epsilon_max"],
            params["dynamic_epsilon_decay"],
            test_trial=True,
        )

        agent.plot_rewards(
            agent.rewards[agent.test_start_reward_idx:],
            agent.epsilons[agent.test_start_reward_idx:],
            agent.writer.subject_path + "/testing_rewards.png",
            width=6,
            height=6,
        )
        agent.save_weights(agent.writer.subject_path + "/models",
                           "/testing_final.h5")

    return agent
Code Example #7
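# Entry point for training a DDPGAgent on OpenLock: selects params from the CLI
# (or defaults to CE3-CE4), configures the environment and observation space,
# then runs multi-trial training and testing via train_transfer_test_transfer.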
def main():
    # general params
    # training params
    if len(sys.argv) < 2:
        # general params
        # training params
        # PICK ONE and comment others
        params = PARAMS["CE3-CE4"]
        # params = PARAMS['CE3-CC4']
        # params = PARAMS['CC3-CE4']
        # params = PARAMS['CC3-CC4']
        # params = PARAMS['CE4']
        # params = PARAMS['CC4']
    else:
        setting = sys.argv[1]
        params = PARAMS[IDX_TO_PARAMS[int(setting) - 1]]
        print("training_scenario: {}, testing_scenario: {}".format(
            params["train_scenario_name"], params["test_scenario_name"]))
        params["reward_mode"] = sys.argv[2]

    human_decay_mean = 0.7429  # from human data
    human_decay_median = 0.5480  # from human data

    # RL specific settings
    random_seed = 1234
    params["use_physics"] = False
    # run to the full attempt limit, regardless of whether or not all solutions were found
    params["full_attempt_limit"] = True
    params["train_num_iters"] = 100
    params["test_num_iters"] = 10
    # params['epsilon_decay'] = 0.9955
    # params['epsilon_decay'] = 0.9999
    params["epsilon_decay"] = 0.99999
    params["dynamic_epsilon_decay"] = 0.9955
    params["dynamic_epsilon_max"] = 0.5
    params["use_dynamic_epsilon"] = True
    params["test_num_trials"] = 5

    params["data_dir"] = os.path.dirname(
        ROOT_DIR) + "/OpenLockRLResults/subjects"
    params["train_attempt_limit"] = 300
    params["test_attempt_limit"] = 300
    params["gamma"] = 0.8  # discount rate
    params["epsilon"] = 1.0  # exploration rate
    params["epsilon_min"] = 0.00
    params["learning_rate"] = 0.0005
    params["batch_size"] = 64

    # SINGLE TRIAL TRAINING
    # params['train_attempt_limit'] = 30000
    # params['epsilon_decay'] = 0.99995
    # params['use_dynamic_epsilon'] = False

    # dummy settings
    # params['train_num_iters'] = 10
    # params['test_num_iters'] = 10
    # params['train_attempt_limit'] = 30
    # params['test_attempt_limit'] = 30

    # human comparison settings
    # params['train_num_iters'] = 1
    # params['test_num_iters'] = 1
    # params['train_attempt_limit'] = 300000
    # params['test_attempt_limit'] = 300000
    # params['epsilon_decay'] = human_decay_mean
    # params['dynamic_epsilon_decay'] = human_decay_mean
    # params['dynamic_epsilon_max'] = 1
    # params['use_dynamic_epsilon'] = True

    scenario = select_scenario(params["train_scenario_name"],
                               use_physics=params["use_physics"])

    # setup initial env
    env = gym.make("openlock-v1")

    env.use_physics = params["use_physics"]
    env.full_attempt_limit = params["full_attempt_limit"]

    # set up observation space
    env.observation_space = ObservationSpace(len(scenario.levers),
                                             append_solutions_remaining=False)

    # set reward mode
    env.reward_mode = params["reward_mode"]
    print("Reward mode: {}".format(env.reward_mode))

    agent = DDPGAgent(env, 1, 1, params, None, "init")

    # create session/trial/experiment
    # TODO(mjedmonds): passing a fake agent here is a hack
    np.random.seed(random_seed)
    env.seed(random_seed)
    trial_selected = agent.setup_trial(
        scenario_name=params["train_scenario_name"],
        action_limit=params["train_action_limit"],
        attempt_limit=params["train_attempt_limit"],
    )

    env.reset()

    # setup agent
    state_size = agent.env.observation_space.multi_discrete.shape[0]
    action_size = len(agent.env.action_space)

    # agent = DQNAgent(state_size, action_size, params)

    sess = tf.Session()
    agent = DDPGAgent(env, state_size, action_size, params, sess, "DDPG")
    # update agent to be a properly initialized agent

    agent.env.reset()
    fig = create_reward_fig()
    agent.sess.run(tf.global_variables_initializer())

    # MULTI-TRIAL TRAINING, TESTING
    # runs through all training trials and testing trials
    agent = train_transfer_test_transfer(agent, fig)

    # SINGLE TRIAL TRAINING
    # agent, env, agent = train_single_trial(agent, env, agent, params, fig)

    agent.finish_subject()
    print("Training & testing complete for subject {}".format(
        agent.logger.subject_id))
Code Example #8
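    # Excerpt from the human-subject experiment setup: results are written to the
    # directory given by the human config JSON, the physics-based environment is
    # created via Agent.pre_instantiation_setup, and a HumanAgent runs each trial.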
    human_config_data = common.load_human_config_json()

    # params["data_dir"] = os.path.dirname(ROOT_DIR) + "/OpenLockResults/subjects"
    params["data_dir"] = human_config_data["HUMAN_SAVE_DIR"]
    params["src_dir"] = "/tmp/openlocklearner/" + str(hash(
        time.time())) + "/src/"
    params["use_physics"] = True
    params["effect_probabilities"] = generate_effect_probabilities()

    # this section randomly selects a testing and training scenario
    # train_scenario_name, test_scenario_name = select_random_scenarios()
    # params['train_scenario_name'] = train_scenario_name
    # params['test_scenario_name'] = test_scenario_name

    scenario = select_scenario(params["train_scenario_name"])

    # todo: this should not be part of OpenLockLearnerAgent
    env = Agent.pre_instantiation_setup(params)
    env.lever_index_mode = "role"

    # create session/trial/experiment manager
    agent = HumanAgent(params, env)

    atexit.register(agent.cleanup)

    # used for debugging, runs a specific scenario & trial
    # run_specific_trial_and_scenario(manager, 'CC3', 'trial5', params['train_action_limit'], params['train_attempt_limit'])

    for trial_num in range(0, params["train_num_trials"]):
        agent.run_trial_human(
Code Example #9
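# Entry point for training a DQNAgent on OpenLock: after the usual param and env
# setup, each trial's results are logged and pickled, the Q-network is updated
# from replay memory, and the target network is synced every `target_update`
# updates.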
def main():
    torch.set_default_tensor_type("torch.DoubleTensor")

    # general params
    # training params
    if len(sys.argv) < 2:
        # general params
        # training params
        # PICK ONE and comment others
        params = PARAMS["CE3-CE4"]
        # params = PARAMS['CE3-CC4']
        # params = PARAMS['CC3-CE4']
        # params = PARAMS['CC3-CC4']
        # params = PARAMS['CE4']
        # params = PARAMS['CC4']
    else:
        setting = sys.argv[1]
        params = PARAMS[IDX_TO_PARAMS[int(setting) - 1]]
        print("training_scenario: {}, testing_scenario: {}".format(
            params["train_scenario_name"], params["test_scenario_name"]))
        params["reward_mode"] = sys.argv[2]

    params["prioritized_replay"] = False
    params["max_mem_size"] = 10000
    params["eps_start"] = 0.90
    params["eps_end"] = 0.05
    params["eps_decay"] = 50
    params["gamma"] = 0.99
    params["learning_rate"] = 0.001
    params["epsilon"] = 0.95
    params["l2_reg"] = 1e-3
    params["batch_size"] = 2048
    params["target_update"] = 10
    params["use_gpu"] = True
    params["gpuid"] = int(sys.argv[5]) if len(sys.argv) >= 6 else 0

    random_seed = 1234
    params["use_physics"] = False
    params["full_attempt_limit"] = (
        False
    )  # run to the full attempt limit, regardless of whether or not all solutions were found
    params["num_training_iters"] = 200
    params["num_training_trials"] = params["train_num_trials"]
    params["train_attempt_limit"] = 700

    params["num_testing_iters"] = 200
    params["num_testing_trials"] = params["test_num_trials"]
    params["test_attempt_limit"] = 700

    # RL specific settings
    params["data_dir"] = os.path.dirname(
        ROOT_DIR) + "/OpenLockRLResults/subjects"

    scenario = select_scenario(params["train_scenario_name"],
                               use_physics=params["use_physics"])

    env = gym.make("openlock-v1")
    env.use_physics = params["use_physics"]
    env.full_attempt_limit = params["full_attempt_limit"]
    # set up observation space
    env.observation_space = ObservationSpace(len(scenario.levers),
                                             append_solutions_remaining=False)
    # set reward mode
    env.reward_mode = params["reward_mode"]
    print("Reward mode: {}".format(env.reward_mode))

    # set whether to index by role or position
    env.lever_index_mode = "role"
    # env.lever_index_mode = 'position'

    agent = DDQNAgent(env, 1, 1, params)

    # create session/trial/experiment
    # TODO: passing a fake agent here is a hack
    np.random.seed(random_seed)
    env.seed(random_seed)

    # dummy agent
    agent = DQNAgent(env, 1, 1, params, require_log=False)
    trial_selected = agent.setup_trial(
        scenario_name=params["train_scenario_name"],
        action_limit=params["train_action_limit"],
        attempt_limit=params["train_attempt_limit"],
    )
    env.reset()

    state_size = agent.env.observation_space.multi_discrete.shape[0]
    action_size = len(env.action_space)
    agent = DQNAgent(env, state_size, action_size, params)
    load_path = (
        sys.argv[3] if len(sys.argv) >= 4 and sys.argv[3] != "-" else ""
    )  # path without '.*' suffix
    transfer_tag = (
        sys.argv[4] if len(sys.argv) >= 5 and sys.argv[4] != "-" else ""
    )  # i.e. CC3toCC4
    save_path = os.path.join(
        params["data_dir"],
        "3rd_model_log/dqn-{}-{}-{}".format(
            transfer_tag if transfer_tag else params["train_scenario_name"],
            params["reward_mode"],
            agent.subject_id,
        ),
    )
    os.makedirs(save_path, exist_ok=True)

    agent.env.reset()
    if load_path:
        agent.load(load_path)
        print("load model from {}".format(load_path))

    agent.env.human_agent = False
    agent.type_tag = "{}-{}-DQN".format(
        transfer_tag if transfer_tag else params["train_scenario_name"],
        params["reward_mode"],
    )
    # train over multiple iterations over all trials
    fig = create_reward_fig()
    update_count = 0
    for iter_num in range(params["num_training_iters"]):
        agent.env.completed_trials = []
        for trial_num in range(0, params["num_training_trials"]):
            agent.run_trial_dqn(
                scenario_name=params["train_scenario_name"],
                fig=fig,
                action_limit=params["train_action_limit"],
                attempt_limit=params["train_attempt_limit"],
                trial_count=trial_num,
                iter_num=iter_num,
            )
            fig, data = agent.log_values(
                [
                    agent.trial_length,
                    agent.trial_percent_attempt_success,
                    agent.trial_percent_solution_found,
                    agent.average_trial_rewards,
                    agent.attempt_rewards,
                ],
                fig,
                [
                    "Attempt Count Per Trial",
                    "Percentage of Successful Attempts in Trial",
                    "Percentage of Solutions Found in Trial",
                    "Average Trial Reward",
                    "Attempt Reward",
                ],
                agent.type_tag,
            )
            pickle.dump(
                (agent.type_tag, data, params),
                open(os.path.join(save_path, "log.pkl"), "wb"),
            )
            # update
            if len(agent.memory) > agent.batch_size:
                batch = agent.memory.sample(agent.batch_size)
                print("update with bs:{}".format(len(batch.state)))
                agent.update(batch, iter_num)
                update_count += 1
                # sync the target network every `target_update` updates
                if (update_count + 1) % params["target_update"] == 0:
                    agent.target_q_net.load_state_dict(
                        agent.q_net.state_dict())
        agent.save(save_path, iter_num)
    print("Trial complete for subject {}. Average reward: {}".format(
        agent.logger.subject_id, agent.average_trial_rewards[-1]))
    fig.savefig(os.path.join(save_path, "log.png"))
Code Example #10
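# Entry point for the A3C setup: builds a temporary environment to read the
# state/action dimensions, creates the global AC_Network plus one Worker per CPU
# (see Code Example #1), and either restores a checkpoint for testing or starts
# each worker in its own thread for asynchronous training.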
def main(argv):
    global master_network
    global global_episodes

    reward_mode = None
    if len(sys.argv) < 2:
        # general params
        # training params
        # PICK ONE and comment others
        params = PARAMS["CE3-CE4"]
        reward_mode = "negative_change_state_partial_action_seq_solution_multiplier"
        # params = PARAMS['CE3-CC4']
        # params = PARAMS['CC3-CE4']
        # params = PARAMS['CC3-CC4']
        # params = PARAMS['CE4']
        # params = PARAMS['CC4']
    else:
        setting = sys.argv[1]
        params = PARAMS[IDX_TO_PARAMS[int(setting) - 1]]
        print("training_scenario: {}, testing_scenario: {}".format(
            params["train_scenario_name"], params["test_scenario_name"]))
        reward_mode = sys.argv[2]

    use_physics = False
    num_training_iters = 100

    # RL specific settings
    params["data_dir"] = os.path.dirname(
        ROOT_DIR) + "/OpenLockRLResults/subjects"
    params["train_attempt_limit"] = 300
    params["test_attempt_limit"] = 300
    params["use_physics"] = False
    params["num_training_iters"] = 100
    params["reward_mode"] = reward_mode

    # RL specific settings
    params["use_physics"] = False
    params["full_attempt_limit"] = (
        True
    )  # run to the full attempt limit, regardless of whether or not all solutions were found
    params["train_num_iters"] = 100
    params["test_num_iters"] = 10
    # params['epsilon_decay'] = 0.9955
    # params['epsilon_decay'] = 0.9999
    params["epsilon_decay"] = 0.9996
    params["dynamic_epsilon_decay"] = 0.999
    params["dynamic_epsilon_max"] = 0.1
    params["use_dynamic_epsilon"] = True
    params["test_num_trials"] = 5

    params["train_attempt_limit"] = 300
    params["test_attempt_limit"] = 300
    params["gamma"] = 0.8  # discount rate
    params["epsilon"] = 0.005  # exploration rate0.01 0.05
    params["epsilon_min"] = 0.001
    params["learning_rate"] = 0.0005
    params["batch_size"] = 64

    scenario = select_scenario(params["train_scenario_name"],
                               use_physics=use_physics)

    ENV_NAME = "openlock-v1"

    env = gym.make(ENV_NAME)
    env.reward_mode = reward_mode
    env.use_physics = params["use_physics"]
    env.full_attempt_limit = params["full_attempt_limit"]

    # set up observation space
    env.observation_space = ObservationSpace(len(scenario.levers),
                                             append_solutions_remaining=False)

    # set reward mode
    env.reward_mode = params["reward_mode"]
    print("Reward mode: {}".format(env.reward_mode))

    agent = A3CAgent(env, 1, 1, "init", params)
    # create session/trial/experiment
    trial_selected = agent.setup_trial(
        scenario_name=params["train_scenario_name"],
        action_limit=params["train_action_limit"],
        attempt_limit=params["train_attempt_limit"],
    )

    env.observation_space = ObservationSpace(len(scenario.levers))
    MODEL_DIR = "./OpenLockRLResults/subjects" + "/models"
    MONITOR_DIR = "./OpenLockRLResults/subjects" + "/monitor"

    MODEL_DIR = os.path.dirname(
        ROOT_DIR) + "/OpenLockRLResults/subjects" + "/models"
    MONITOR_DIR = os.path.dirname(
        ROOT_DIR) + "/OpenLockRLResults/subjects" + "/monitor"

    STATE_DIM = env.observation_space.multi_discrete.shape[0]
    ACTION_DIM = len(env.action_space)

    # delete temporary env
    env.close()

    tf.reset_default_graph()
    config = tf.ConfigProto(allow_soft_placement=True)

    if not os.path.exists(MODEL_DIR):
        os.makedirs(MODEL_DIR)

    with tf.device("/cpu:0"):
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)

        global_episodes = tf.Variable(0,
                                      dtype=tf.int32,
                                      name="global_episodes",
                                      trainable=False)
        trainer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
        master_network = AC_Network(STATE_DIM, ACTION_DIM, "global", None,
                                    CELL_UNITS)  # Generate global network
        # Set workers to number of available CPU threads
        num_workers = multiprocessing.cpu_count()

        # For testing and visualisation we only need one worker
        if TEST_MODEL:
            num_workers = 1
        # num_workers = 8  # or set your own worker count
        workers = []
        # Create worker causal_classes
        for i in range(num_workers):
            workers.append(
                Worker(
                    name=i,
                    s_size=STATE_DIM,
                    a_size=ACTION_DIM,
                    trainer=trainer,
                    model_path=MODEL_DIR,
                    global_episodes=global_episodes,
                    env_name=ENV_NAME,
                    seed=RANDOM_SEED,
                    test=TEST_MODEL,
                    cell_units=CELL_UNITS,
                    params=params,
                    testing_trial=TEST_MODEL,
                ))
        saver = tf.train.Saver(max_to_keep=num_workers)

        # Gym monitor
        if not TEST_MODEL:
            env = workers[0].get_env()
            env = gym.wrappers.Monitor(env,
                                       MONITOR_DIR,
                                       video_callable=False,
                                       force=True)

    with tf.Session(config=config) as sess:
        coord = tf.train.Coordinator()
        if LOAD_MODEL or TEST_MODEL:
            print("Loading Model...")
            ckpt = tf.train.get_checkpoint_state(MODEL_DIR)
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            sess.run(tf.global_variables_initializer())

        if TEST_MODEL:
            env = workers[0].get_env()
            # env = gym.wrappers.Monitor(env, MONITOR_DIR, force=True)
            workers[0].work(GAMMA, sess, coord, saver)
        else:
            # This is where the asynchronous magic happens.
            # Start the "work" process for each worker in a separate thread.
            print("Launching workers...")
            worker_threads = []
            for worker in workers:
                # bind the current worker via a default argument so each thread
                # runs its own worker (avoids the late-binding closure pitfall)
                worker_work = lambda w=worker: w.work(GAMMA, sess, coord, saver)
                t = threading.Thread(target=worker_work)
                t.start()
                time.sleep(1)
                worker_threads.append(t)
            coord.join(worker_threads)
Code Example #11
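# Entry point for K-shot adaptation of a meta-trained model: a MAML_K_Shot_Agent
# is created for a single scenario, an optional checkpoint is loaded, and the
# agent is fine-tuned with per-trial updates whenever the replay memory exceeds
# the batch size.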
def main():
    torch.set_default_tensor_type("torch.DoubleTensor")

    # general params
    # training params
    if len(sys.argv) < 2:
        # general params
        # training params
        # PICK ONE and comment others
        params = PARAMS["CE3-CE4"]
        # params = PARAMS['CE3-CC4']
        # params = PARAMS['CC3-CE4']
        # params = PARAMS['CC3-CC4']
        # params = PARAMS['CE4']
        # params = PARAMS['CC4']
    else:
        setting = sys.argv[1]
        params = PARAMS[IDX_TO_PARAMS[int(setting) - 1]]
        print("training_scenario: {}, testing_scenario: {}".format(
            params["train_scenario_name"], params["test_scenario_name"]))
        params["reward_mode"] = sys.argv[2]

    # a2c
    params["epsilon"] = 0.95
    params["l2_reg"] = 1e-3
    # trpo
    params["max_kl"] = 1e-2
    params["damping"] = 1e-2
    # ppo
    params["clip_epsilon"] = 0.2
    params["optim_value_epochs"] = 1
    # maml
    params["backbone"] = "trpo"

    # generic
    params["learning_rate"] = 0.01
    params["batch_size"] = 2048
    params["gamma"] = 0.99
    params["reward_mode"] = "basic"
    params["use_gpu"] = True
    params["gpuid"] = int(sys.argv[4]) if len(sys.argv) >= 5 else 0

    params["Tensor"] = (torch.cuda.DoubleTensor
                        if params["use_gpu"] else torch.DoubleTensor)
    params["ActionTensor"] = (torch.cuda.LongTensor
                              if params["use_gpu"] else torch.LongTensor)

    random_seed = 1234
    params["use_physics"] = False
    params["full_attempt_limit"] = (
        False
    )  # run to the full attempt limit, regardless of whether or not all solutions were found
    params["num_training_iters"] = r00
    params["num_training_trials"] = params["train_num_trials"]
    params["train_attempt_limit"] = 700

    params["num_testing_iters"] = r00
    params["num_testing_trials"] = params["test_num_trials"]
    params["test_attempt_limit"] = 700

    # RL specific settings
    params["data_dir"] = os.path.dirname(
        ROOT_DIR) + "/OpenLockRLResults/subjects"

    scenario = select_scenario(params["train_scenario_name"],
                               use_physics=params["use_physics"])

    env = gym.make("openlock-v1")
    env.use_physics = params["use_physics"]
    env.full_attempt_limit = params["full_attempt_limit"]
    # set up observation space
    env.observation_space = ObservationSpace(len(scenario.levers),
                                             append_solutions_remaining=False)
    # set reward mode
    env.reward_mode = params["reward_mode"]
    print("Reward mode: {}".format(env.reward_mode))
    np.random.seed(random_seed)
    env.seed(random_seed)

    # dummy agent
    agent = MAML_K_Shot_Agent(env, 1, 1, params, require_log=False)
    trial_selected = agent.setup_trial(
        scenario_name=params["train_scenario_name"],
        action_limit=params["train_action_limit"],
        attempt_limit=params["train_attempt_limit"],
    )
    env.reset()

    state_size = agent.env.observation_space.multi_discrete.shape[0]
    action_size = len(env.action_space)
    agent = MAML_K_Shot_Agent(env, state_size, action_size, params)
    save_path = os.path.join(
        params["data_dir"],
        "3rd_model_log/k_shot-{}-{}-{}".format(params["train_scenario_name"],
                                               params["reward_mode"],
                                               agent.subject_id),
    )
    # save_path = os.path.join(params['data_dir'], '3rd_model_log/k_shot-CC3-{}-{}-{}'.format(
    #                                    params['train_scenario_name'], params['reward_mode'],
    #                                    agent.subject_id))
    load_path = sys.argv[3] if len(
        sys.argv) >= 4 else ""  # path without '.*' suffix
    os.makedirs(save_path, exist_ok=True)

    agent.env.reset()
    if load_path:
        agent.load(load_path)
        print("load model from {}".format(load_path))
    else:
        print("[Warn] No meta-trained model found, will transfer from scratch")

    agent.env.human_agent = False
    agent.type_tag = "{}-K_Shot".format(params["train_scenario_name"])
    # train over multiple iterations over all trials
    fig = create_reward_fig()
    for iter_num in range(params["num_training_iters"]):
        agent.env.completed_trials = []
        for trial_num in range(0, params["num_training_trials"]):
            agent.run_trial_maml_k_shot(
                scenario_name=params["train_scenario_name"],
                fig=fig,
                action_limit=params["train_action_limit"],
                attempt_limit=params["train_attempt_limit"],
                trial_count=trial_num,
                iter_num=iter_num,
            )
            fig, data = agent.log_values(
                [
                    agent.trial_length,
                    agent.trial_percent_attempt_success,
                    agent.trial_percent_solution_found,
                    agent.average_trial_rewards,
                    agent.attempt_rewards,
                ],
                fig,
                [
                    "Attempt Count Per Trial",
                    "Percentage of Successful Attempts in Trial",
                    "Percentage of Solutions Found in Trial",
                    "Average Trial Reward",
                    "Attempt Reward",
                ],
                agent.type_tag,
            )
            pickle.dump(
                (agent.type_tag, data, params),
                open(os.path.join(save_path, "log.pkl"), "wb"),
            )
            # update
            if len(agent.memory) > params["batch_size"]:
                batch = agent.memory.sample()
                print("update with bs:{}".format(len(batch.state)))
                agent.update(batch, iter_num)
                agent.memory.clear()
        agent.save(save_path, iter_num)
    print("Trial complete for subject {}. Average reward: {}".format(
        agent.logger.subject_id, agent.average_trial_rewards[-1]))
    fig.savefig(os.path.join(save_path, "log.png"))