def __init__(
    self,
    name,
    s_size,
    a_size,
    trainer,
    model_path,
    global_episodes,
    env_name,
    seed,
    test,
    cell_units,
    params,
    testing_trial=False,
):
    self.name = "worker_" + str(name)
    self.number = name
    self.model_path = model_path
    self.trainer = trainer
    self.global_episodes = global_episodes
    self.increment = self.global_episodes.assign_add(1)
    self.episode_rewards = []
    self.episode_lengths = []
    self.episode_mean_values = []
    self.summary_writer = tf.summary.FileWriter("train_" + str(self.number))
    self.is_test = test

    self.s_size = s_size
    self.a_size = a_size
    self.params = params

    self.MINI_BATCH = 30
    self.REWARD_FACTOR = 0.001

    # Create the local copy of the network and the tensorflow op to copy global parameters to local network
    # self.update_local_ops = update_target_graph('global', self.name)

    self.testing_trial = testing_trial
    if not self.testing_trial:
        self.scenario_name = params["train_scenario_name"]
        self.attempt_limit = params["train_attempt_limit"]
    else:
        self.scenario_name = params["test_scenario_name"]
        self.attempt_limit = params["test_attempt_limit"]

    self.scenario = select_scenario(self.scenario_name, params["use_physics"])

    env = gym.make(env_name)
    self.agent = A3CAgent(env, self.s_size, self.a_size, self.name, self.params)
    self.agent.env.reward_mode = params["reward_mode"]
    self.agent.env.use_physics = params["use_physics"]
    self.trial_count = 0

    self.agent.env.seed(seed)
def generate_solutions_by_trial(scenario_name, trial_name):
    solution_chains = []
    scenario = select_scenario(scenario_name, use_physics=False)
    # TODO(mjedmonds): extract these from the environment/scenario somehow. these are hard-coded
    lever_cpt_choice = GRAPH_INT_TYPE(1)
    door_cpt_choice = GRAPH_INT_TYPE(0)
    lever_ending_state = GRAPH_INT_TYPE(ENTITY_STATES["LEVER_PUSHED"])
    door_ending_state = GRAPH_INT_TYPE(ENTITY_STATES["DOOR_OPENED"])

    scenario_solutions = scenario.solutions
    trial_levers = LEVER_CONFIGS[trial_name]
    for scenario_solution in scenario_solutions:
        solution_actions = []
        solution_states = []
        solution_cpt_choices = []
        solution_attributes = []
        solution_outcomes = []
        for action_log in scenario_solution:
            action_name = action_log.name
            state_name = action_name.split("_")[1]
            if state_name == "door":
                ending_state = door_ending_state
                cpt_choice = door_cpt_choice
            else:
                # determine position of lever based on role
                for trial_lever in trial_levers:
                    if (
                        get_one_of(trial_lever, ["LeverRole", "LeverRoleEnum"])
                        == state_name
                    ):
                        state_name = trial_lever.LeverPosition.name
                ending_state = lever_ending_state
                cpt_choice = lever_cpt_choice
            action_name = "push_" + state_name
            attributes = (state_name, "GREY")

            solution_actions.append(action_name)
            solution_states.append(state_name)
            solution_attributes.append(attributes)
            solution_cpt_choices.append(cpt_choice)
            solution_outcomes.append(ending_state)

        solution_chains.append(
            CausalChainCompact(
                states=tuple(solution_states),
                actions=tuple(solution_actions),
                conditional_probability_table_choices=tuple(solution_cpt_choices),
                outcomes=tuple(solution_outcomes),
                attributes=tuple(solution_attributes),
            )
        )
    return solution_chains
def generate_solutions_by_trial_causal_relation(scenario_name, trial_name):
    solution_chains = []
    scenario = select_scenario(scenario_name, use_physics=False)
    # TODO: extract these from the environment/scenario somehow. these are hard-coded
    lever_causal_relation_type = CausalRelationType.one_to_zero
    door_causal_relation_type = CausalRelationType.zero_to_one

    scenario_solutions = scenario.solutions
    trial_levers = LEVER_CONFIGS[trial_name]
    for scenario_solution in scenario_solutions:
        solution_chain = []
        precondition = None
        for action_log in scenario_solution:
            action_name = action_log.name
            state_name = action_name.split("_")[1]
            if state_name == "door":
                causal_relation = door_causal_relation_type
            else:
                # determine position of lever based on role
                for trial_lever in trial_levers:
                    if trial_lever.LeverRoleEnum == state_name:
                        state_name = trial_lever.LeverPosition.name
                causal_relation = lever_causal_relation_type
            action_name = "push"
            attributes = (state_name, "GREY")

            solution_chain.append(
                CausalRelation(
                    action=Action(action_name, attributes[0], None),
                    attributes=attributes,
                    causal_relation_type=causal_relation,
                    precondition=precondition,
                )
            )
            # setup precondition for next link in chain
            precondition = (attributes, causal_relation[1])

        solution_chains.append(tuple(solution_chain))
    return solution_chains
def generate_solutions_by_trial_causal_relation(scenario_name, trial_name):
    solution_chains = []
    scenario = select_scenario(scenario_name, use_physics=False)
    # TODO(mjedmonds): extract these from the environment/scenario somehow. these are hard-coded
    lever_causal_relation_type = CausalRelationType.one_to_zero
    door_causal_relation_type = CausalRelationType.zero_to_one

    scenario_solutions = get_one_of(scenario, ["SOLUTIONS", "solutions"])
    trial_levers = LEVER_CONFIGS[trial_name]
    for scenario_solution in scenario_solutions:
        solution_chain = []
        precondition = None
        attributes = None
        causal_relation = None
        delay = 0
        first = True
        # We can't know what the delay is until we see the next action. So we don't create the
        # causal relation for an action until we see the next non-wildcard action.
        for action_log in scenario_solution:
            action_name = action_log.name
            if action_name == "*":
                if first:
                    raise ValueError("Solutions cannot start with a wildcard action.")
                delay += 1
                continue
            elif not first:
                solution_chain.append(
                    CausalRelation(
                        action=Action(name="push", obj=attributes[0], params=None),
                        attributes=attributes,
                        causal_relation_type=causal_relation,
                        precondition=precondition,
                        delay=delay,
                    )
                )
                delay = 0
                precondition = (attributes, causal_relation[1])
            first = False

            state_name = action_name.split("_")[1]
            if state_name == "door":
                causal_relation = door_causal_relation_type
            else:
                # determine position of lever based on role
                for trial_lever in trial_levers:
                    if (
                        get_one_of(trial_lever, ["LeverRole", "LeverRoleEnum"])
                        == state_name
                    ):
                        state_name = trial_lever.LeverPosition.name
                causal_relation = lever_causal_relation_type
            attributes = (state_name, "GREY")

        # Append the last action
        solution_chain.append(
            CausalRelation(
                action=Action("push", attributes[0], None),
                attributes=attributes,
                causal_relation_type=causal_relation,
                precondition=precondition,
                delay=delay,
            )
        )
        solution_chains.append(tuple(solution_chain))
    return solution_chains
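# Illustrative usage sketch (added for clarity; not part of the original source).
# It assumes this module's imports are available and that LEVER_CONFIGS contains the
# trial key used below; "CC3" and "trial5" are the scenario/trial names mentioned in a
# debugging comment elsewhere in this repo. Each generator returns one chain per
# scenario solution, with each link carrying the lever/door attributes, the causal
# relation type, the precondition set by the previous link, and (in the wildcard-aware
# version above) the accumulated delay.
if __name__ == "__main__":
    example_chains = generate_solutions_by_trial_causal_relation("CC3", "trial5")
    for example_chain in example_chains:
        # print the action sequence of each reconstructed solution chain
        print([relation.action for relation in example_chain])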
def main():
    torch.set_default_tensor_type("torch.DoubleTensor")

    # general params
    # training params
    params_list = []
    env_list = ["CE3-CE4", "CE4"]
    memory_list = []
    fig_list = [create_reward_fig() for _ in env_list]
    log_list = [[] for _ in env_list]
    for env_name in env_list:
        # if len(sys.argv) < 2:
        #     # general params
        #     # training params
        #     # PICK ONE and comment others
        #     params = PARAMS['CE3-CE4']
        #     # params = PARAMS['CE3-CC4']
        #     # params = PARAMS['CC3-CE4']
        #     # params = PARAMS['CC3-CC4']
        #     # params = PARAMS['CE4']
        #     # params = PARAMS['CC4']
        # else:
        #     setting = sys.argv[1]
        #     params = PARAMS[IDX_TO_PARAMS[int(setting) - 1]]
        #     print('training_scenario: {}, testing_scenario: {}'.format(params['train_scenario_name'],
        #                                                                params['test_scenario_name']))
        #     params['reward_mode'] = sys.argv[2]
        memory_list.append(Memory())

        # generic
        params = PARAMS[env_name]
        params["gamma"] = 0.99
        params["reward_mode"] = "basic"
        # a2c
        params["epsilon"] = 0.95
        params["l2_reg"] = 1e-3
        # trpo
        params["max_kl"] = 1e-2
        params["damping"] = 1e-2
        # ppo
        params["clip_epsilon"] = 0.2
        params["optim_value_epochs"] = 1
        # maml
        params["backbone"] = "trpo"
        params["num_grad_update"] = 1
        params["lr_pre_update"] = 1e-3
        params["lr_meta_update"] = 1e-3
        params["pre_batch_size"] = 2048
        params["meta_batch_size"] = 2048
        # others
        params["use_gpu"] = True and torch.cuda.is_available()
        params["gpuid"] = int(sys.argv[4]) if len(sys.argv) >= 5 else 0
        params["Tensor"] = (
            torch.cuda.DoubleTensor if params["use_gpu"] else torch.DoubleTensor
        )
        params["ActionTensor"] = (
            torch.cuda.LongTensor if params["use_gpu"] else torch.LongTensor
        )

        random_seed = 1234
        params["use_physics"] = False
        params["full_attempt_limit"] = False  # run to the full attempt limit, regardless of whether or not all solutions were found
        params["num_training_iters"] = 1000
        params["num_training_trials"] = params["train_num_trials"]
        params["train_attempt_limit"] = 700
        params["num_testing_iters"] = 1000
        params["num_testing_trials"] = params["test_num_trials"]
        params["test_attempt_limit"] = 700

        # RL specific settings
        params["data_dir"] = os.path.dirname(ROOT_DIR) + "/OpenLockRLResults/subjects"
        params_list.append(params)

    # TODO: we assume all the scenarios share the same observation space
    scenario = select_scenario(
        params["train_scenario_name"], use_physics=params["use_physics"]
    )
    env = gym.make("openlock-v1")
    env.use_physics = params["use_physics"]
    env.full_attempt_limit = params["full_attempt_limit"]

    # set up observation space
    env.observation_space = ObservationSpace(
        len(scenario.levers), append_solutions_remaining=False
    )
    # set reward mode
    env.reward_mode = params["reward_mode"]
    print("Reward mode: {}".format(env.reward_mode))

    np.random.seed(random_seed)
    env.seed(random_seed)

    # dummy agent
    agent = MAMLAgent(env, 1, 1, params, env_list, require_log=False)
    trial_selected = agent.setup_trial(
        scenario_name=params["train_scenario_name"],
        action_limit=params["train_action_limit"],
        attempt_limit=params["train_attempt_limit"],
    )
    env.reset()

    state_size = agent.env.observation_space.multi_discrete.shape[0]
    action_size = len(env.action_space)
    agent = MAMLAgent(env, state_size, action_size, params, env_list)
    save_path = os.path.join(
        params["data_dir"],
        "3rd_model_log/maml-{}-{}-{}".format(
            "_".join(env_list), params["reward_mode"], agent.subject_id
        ),
    )
    load_path = sys.argv[3] if len(sys.argv) >= 4 else ""  # path without '.*' suffix
    os.makedirs(save_path, exist_ok=True)

    reward_counter_list = [env.reward_strategy.counter for _ in env_list]
    reward_attempt_count_list = [env.reward_strategy.attempt_count for _ in env_list]

    agent.env.reset()
    if load_path:
        agent.load(load_path)
        print("load model from {}".format(load_path))
    agent.env.human_agent = False
    agent.type_tag = "{}-{}-MAML".format("_".join(env_list), params["reward_mode"])

    # train over multiple iterations over all trials
    for iter_num in range(params_list[0]["num_training_iters"]):
        for ind, env_name in enumerate(env_list):
            agent.env.completed_trials = []
            agent.env.scenario = None
            agent.env.cur_trial = None
            agent.env.reward_strategy.counter = reward_counter_list[ind]
            agent.env.reward_strategy.attempt_count = reward_attempt_count_list[ind]
            print("[Train] Now meta train on {}".format(env_name))
            params = params_list[ind]
            memory = memory_list[ind]
            agent._update_params_and_mem(params, memory)
            for trial_num in range(0, params_list[0]["num_training_trials"]):
                agent.run_trial_maml(
                    scenario_name=params["train_scenario_name"],
                    fig=fig_list[ind],
                    action_limit=params["train_action_limit"],
                    attempt_limit=params["train_attempt_limit"],
                    trial_count=trial_num,
                    iter_num=iter_num,
                    env_ind=ind,
                )
                fig_list[ind], log_list[ind] = agent.log_values(
                    [
                        agent.trial_length[ind],
                        agent.trial_percent_attempt_success[ind],
                        agent.trial_percent_solution_found[ind],
                        agent.average_trial_rewards[ind],
                        agent.attempt_rewards[ind],
                    ],
                    fig_list[ind],
                    [
                        "Attempt Count Per Trial",
                        "Percentage of Successful Attempts in Trial",
                        "Percentage of Solutions Found in Trial",
                        "Average Trial Reward",
                        "Attempt Reward",
                    ],
                    agent.type_tag + "-{}".format(env_name),
                )
                pickle.dump(
                    (agent.type_tag, log_list, params),
                    open(os.path.join(save_path, "log.pkl"), "wb"),
                )

        # update
        for ind, env_name in enumerate(env_list):
            memory = memory_list[ind]
            params = params_list[ind]
            batch_a, batch_b = [], []
            if len(memory) > params["pre_batch_size"] + params["meta_batch_size"]:
                print("[Update] Now do an update with {}".format(env_name))
                batch_a.append(memory.sample(params["pre_batch_size"]))
                batch_b.append(memory.sample(params["meta_batch_size"]))
                memory.clear()
                print("[Update] Now update")
                agent.update(batch_a, batch_b, iter_num)

        agent.save(save_path, iter_num)
        print(
            "Trial complete for subject {}. Average reward: {}".format(
                agent.logger.subject_id, agent.average_trial_rewards[-1]
            )
        )

    fig_list[-1].savefig(os.path.join(save_path, "log.png"))
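# Example invocation (sketch; the script name below is illustrative, not from the source).
# The meta-training scenario list is hard-coded in env_list above. argv[3] is an optional
# model load path without its '.*' suffix and argv[4] an optional GPU id; argv[1] and
# argv[2] are only consumed if the commented-out argument-parsing block is re-enabled,
# but placeholders are still needed positionally when passing a checkpoint:
#   python train_maml.py 1 basic /path/to/checkpoint 0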
def train_transfer_test_transfer(agent, fig=None):
    # train all training cases/trials
    params = agent.params
    trial_count = 0
    agent, trial_count = run_trials(
        agent,
        trial_count,
        params["train_num_iters"],
        params["train_num_trials"],
        params["train_scenario_name"],
        params["train_action_limit"],
        params["train_attempt_limit"],
        params["use_dynamic_epsilon"],
        params["dynamic_epsilon_max"],
        params["dynamic_epsilon_decay"],
        test_trial=False,
        fig=fig,
    )

    agent.plot_rewards(
        agent.rewards,
        agent.epsilons,
        agent.writer.subject_path + "/training_rewards.png",
    )
    agent.plot_rewards_trial_switch_points(
        agent.rewards,
        agent.epsilons,
        agent.trial_switch_points,
        agent.writer.subject_path + "/training_rewards_switch_points.png",
        plot_xticks=False,
    )
    agent.test_start_reward_idx = len(agent.rewards)
    agent.test_start_trial_count = trial_count

    agent.save_weights(
        agent.writer.subject_path + "/models", "/training_final.cpkt", sess=agent.sess
    )

    # testing trial
    # print "INFO: STARTING TESTING TRIAL"
    if params["test_scenario_name"] is not None:
        # setup testing trial
        scenario = select_scenario(
            params["test_scenario_name"], use_physics=params["use_physics"]
        )
        agent.env.update_scenario(scenario)
        agent.env.set_action_limit(params["test_action_limit"])
        agent.env.observation_space = ObservationSpace(
            len(scenario.levers), append_solutions_remaining=False
        )

        agent, trial_count = run_trials(
            agent,
            trial_count,
            params["test_num_iters"],
            params["test_num_trials"],
            params["test_scenario_name"],
            params["test_action_limit"],
            params["test_attempt_limit"],
            params["use_dynamic_epsilon"],
            params["dynamic_epsilon_max"],
            params["dynamic_epsilon_decay"],
            test_trial=True,
        )

        agent.plot_rewards(
            agent.rewards[agent.test_start_reward_idx:],
            agent.epsilons[agent.test_start_reward_idx:],
            agent.writer.subject_path + "/testing_rewards.png",
            width=6,
            height=6,
        )
        agent.save_weights(agent.writer.subject_path + "/models", "/testing_final.h5")

    return agent
def main():
    # general params
    # training params
    if len(sys.argv) < 2:
        # general params
        # training params
        # PICK ONE and comment others
        params = PARAMS["CE3-CE4"]
        # params = PARAMS['CE3-CC4']
        # params = PARAMS['CC3-CE4']
        # params = PARAMS['CC3-CC4']
        # params = PARAMS['CE4']
        # params = PARAMS['CC4']
    else:
        setting = sys.argv[1]
        params = PARAMS[IDX_TO_PARAMS[int(setting) - 1]]
        print(
            "training_scenario: {}, testing_scenario: {}".format(
                params["train_scenario_name"], params["test_scenario_name"]
            )
        )
        params["reward_mode"] = sys.argv[2]

    human_decay_mean = 0.7429  # from human data
    human_decay_median = 0.5480  # from human data

    # RL specific settings
    random_seed = 1234
    params["use_physics"] = False
    params["full_attempt_limit"] = True  # run to the full attempt limit, regardless of whether or not all solutions were found
    params["train_num_iters"] = 100
    params["test_num_iters"] = 10
    # params['epsilon_decay'] = 0.9955
    # params['epsilon_decay'] = 0.9999
    params["epsilon_decay"] = 0.99999
    params["dynamic_epsilon_decay"] = 0.9955
    params["dynamic_epsilon_max"] = 0.5
    params["use_dynamic_epsilon"] = True
    params["test_num_trials"] = 5

    params["data_dir"] = os.path.dirname(ROOT_DIR) + "/OpenLockRLResults/subjects"
    params["train_attempt_limit"] = 300
    params["test_attempt_limit"] = 300
    params["gamma"] = 0.8  # discount rate
    params["epsilon"] = 1.0  # exploration rate
    params["epsilon_min"] = 0.00
    params["learning_rate"] = 0.0005
    params["batch_size"] = 64

    # SINGLE TRIAL TRAINING
    # params['train_attempt_limit'] = 30000
    # params['epsilon_decay'] = 0.99995
    # params['use_dynamic_epsilon'] = False

    # dummy settings
    # params['train_num_iters'] = 10
    # params['test_num_iters'] = 10
    # params['train_attempt_limit'] = 30
    # params['test_attempt_limit'] = 30

    # human comparison settings
    # params['train_num_iters'] = 1
    # params['test_num_iters'] = 1
    # params['train_attempt_limit'] = 300000
    # params['test_attempt_limit'] = 300000
    # params['epsilon_decay'] = human_decay_mean
    # params['dynamic_epsilon_decay'] = human_decay_mean
    # params['dynamic_epsilon_max'] = 1
    # params['use_dynamic_epsilon'] = True

    scenario = select_scenario(
        params["train_scenario_name"], use_physics=params["use_physics"]
    )

    # setup initial env
    env = gym.make("openlock-v1")

    env.use_physics = params["use_physics"]
    env.full_attempt_limit = params["full_attempt_limit"]

    # set up observation space
    env.observation_space = ObservationSpace(
        len(scenario.levers), append_solutions_remaining=False
    )
    # set reward mode
    env.reward_mode = params["reward_mode"]
    print("Reward mode: {}".format(env.reward_mode))

    agent = DDPGAgent(env, 1, 1, params, None, "init")
    # create session/trial/experiment
    # TODO(mjedmonds): passing a fake agent here is a hack
    np.random.seed(random_seed)
    env.seed(random_seed)
    trial_selected = agent.setup_trial(
        scenario_name=params["train_scenario_name"],
        action_limit=params["train_action_limit"],
        attempt_limit=params["train_attempt_limit"],
    )

    env.reset()

    # setup agent
    state_size = agent.env.observation_space.multi_discrete.shape[0]
    action_size = len(agent.env.action_space)
    # agent = DQNAgent(state_size, action_size, params)
    sess = tf.Session()
    agent = DDPGAgent(env, state_size, action_size, params, sess, "DDPG")
    # update agent to be a properly initialized agent

    agent.env.reset()

    fig = create_reward_fig()
    agent.sess.run(tf.global_variables_initializer())

    # MULTI-TRIAL TRAINING, TESTING
    # runs through all training trials and testing trials
    agent = train_transfer_test_transfer(agent, fig)

    # SINGLE TRIAL TRAINING
    # agent, env, agent = train_single_trial(agent, env, agent, params, fig)

    agent.finish_subject()
    print(
        "Training & testing complete for subject {}".format(agent.logger.subject_id)
    )
human_config_data = common.load_human_config_json()
# params["data_dir"] = os.path.dirname(ROOT_DIR) + "/OpenLockResults/subjects"
params["data_dir"] = human_config_data["HUMAN_SAVE_DIR"]
params["src_dir"] = "/tmp/openlocklearner/" + str(hash(time.time())) + "/src/"
params["use_physics"] = True
params["effect_probabilities"] = generate_effect_probabilities()

# this section randomly selects a testing and training scenario
# train_scenario_name, test_scenario_name = select_random_scenarios()
# params['train_scenario_name'] = train_scenario_name
# params['test_scenario_name'] = test_scenario_name

scenario = select_scenario(params["train_scenario_name"])

# TODO: this should not be part of OpenLockLearnerAgent
env = Agent.pre_instantiation_setup(params)
env.lever_index_mode = "role"

# create session/trial/experiment manager
agent = HumanAgent(params, env)
atexit.register(agent.cleanup)

# used for debugging, runs a specific scenario & trial
# run_specific_trial_and_scenario(manager, 'CC3', 'trial5', params['train_action_limit'], params['train_attempt_limit'])

for trial_num in range(0, params["train_num_trials"]):
    agent.run_trial_human(
def main():
    torch.set_default_tensor_type("torch.DoubleTensor")

    # general params
    # training params
    if len(sys.argv) < 2:
        # general params
        # training params
        # PICK ONE and comment others
        params = PARAMS["CE3-CE4"]
        # params = PARAMS['CE3-CC4']
        # params = PARAMS['CC3-CE4']
        # params = PARAMS['CC3-CC4']
        # params = PARAMS['CE4']
        # params = PARAMS['CC4']
    else:
        setting = sys.argv[1]
        params = PARAMS[IDX_TO_PARAMS[int(setting) - 1]]
        print(
            "training_scenario: {}, testing_scenario: {}".format(
                params["train_scenario_name"], params["test_scenario_name"]
            )
        )
        params["reward_mode"] = sys.argv[2]

    params["prioritized_replay"] = False
    params["max_mem_size"] = 10000
    params["eps_start"] = 0.90
    params["eps_end"] = 0.05
    params["eps_decay"] = 50
    params["gamma"] = 0.99
    params["learning_rate"] = 0.001
    params["epsilon"] = 0.95
    params["l2_reg"] = 1e-3
    params["batch_size"] = 2048
    params["target_update"] = 10
    params["use_gpu"] = True
    params["gpuid"] = int(sys.argv[5]) if len(sys.argv) >= 6 else 0

    random_seed = 1234
    params["use_physics"] = False
    params["full_attempt_limit"] = False  # run to the full attempt limit, regardless of whether or not all solutions were found
    params["num_training_iters"] = 200
    params["num_training_trials"] = params["train_num_trials"]
    params["train_attempt_limit"] = 700
    params["num_testing_iters"] = 200
    params["num_testing_trials"] = params["test_num_trials"]
    params["test_attempt_limit"] = 700

    # RL specific settings
    params["data_dir"] = os.path.dirname(ROOT_DIR) + "/OpenLockRLResults/subjects"

    scenario = select_scenario(
        params["train_scenario_name"], use_physics=params["use_physics"]
    )

    env = gym.make("openlock-v1")

    env.use_physics = params["use_physics"]
    env.full_attempt_limit = params["full_attempt_limit"]

    # set up observation space
    env.observation_space = ObservationSpace(
        len(scenario.levers), append_solutions_remaining=False
    )
    # set reward mode
    env.reward_mode = params["reward_mode"]
    print("Reward mode: {}".format(env.reward_mode))

    # set whether to index by role or position
    env.lever_index_mode = "role"
    # env.lever_index_mode = 'position'

    agent = DDQNAgent(env, 1, 1, params)
    # create session/trial/experiment
    # TODO: passing a fake agent here is a hack
    np.random.seed(random_seed)
    env.seed(random_seed)

    # dummy agent
    agent = DQNAgent(env, 1, 1, params, require_log=False)
    trial_selected = agent.setup_trial(
        scenario_name=params["train_scenario_name"],
        action_limit=params["train_action_limit"],
        attempt_limit=params["train_attempt_limit"],
    )
    env.reset()

    state_size = agent.env.observation_space.multi_discrete.shape[0]
    action_size = len(env.action_space)
    agent = DQNAgent(env, state_size, action_size, params)
    load_path = (
        sys.argv[3] if len(sys.argv) >= 4 and sys.argv[3] != "-" else ""
    )  # path without '.*' suffix
    transfer_tag = (
        sys.argv[4] if len(sys.argv) >= 5 and sys.argv[4] != "-" else ""
    )  # i.e. CC3toCC4
    save_path = os.path.join(
        params["data_dir"],
        "3rd_model_log/dqn-{}-{}-{}".format(
            transfer_tag if transfer_tag else params["train_scenario_name"],
            params["reward_mode"],
            agent.subject_id,
        ),
    )
    os.makedirs(save_path, exist_ok=True)

    agent.env.reset()
    if load_path:
        agent.load(load_path)
        print("load model from {}".format(load_path))
    agent.env.human_agent = False
    agent.type_tag = "{}-{}-DQN".format(
        transfer_tag if transfer_tag else params["train_scenario_name"],
        params["reward_mode"],
    )

    # train over multiple iterations over all trials
    fig = create_reward_fig()
    update_count = 0
    for iter_num in range(params["num_training_iters"]):
        agent.env.completed_trials = []
        for trial_num in range(0, params["num_training_trials"]):
            agent.run_trial_dqn(
                scenario_name=params["train_scenario_name"],
                fig=fig,
                action_limit=params["train_action_limit"],
                attempt_limit=params["train_attempt_limit"],
                trial_count=trial_num,
                iter_num=iter_num,
            )
            fig, data = agent.log_values(
                [
                    agent.trial_length,
                    agent.trial_percent_attempt_success,
                    agent.trial_percent_solution_found,
                    agent.average_trial_rewards,
                    agent.attempt_rewards,
                ],
                fig,
                [
                    "Attempt Count Per Trial",
                    "Percentage of Successful Attempts in Trial",
                    "Percentage of Solutions Found in Trial",
                    "Average Trial Reward",
                    "Attempt Reward",
                ],
                agent.type_tag,
            )
            pickle.dump(
                (agent.type_tag, data, params),
                open(os.path.join(save_path, "log.pkl"), "wb"),
            )

        # update
        if len(agent.memory) > agent.batch_size:
            batch = agent.memory.sample(agent.batch_size)
            print("update with bs:{}".format(len(batch.state)))
            agent.update(batch, iter_num)
            update_count += 1
            if (update_count + 1) % params["target_update"] == 0:
                agent.target_q_net.load_state_dict(agent.q_net.state_dict())

        agent.save(save_path, iter_num)
        print(
            "Trial complete for subject {}. Average reward: {}".format(
                agent.logger.subject_id, agent.average_trial_rewards[-1]
            )
        )

    fig.savefig(os.path.join(save_path, "log.png"))
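# Example invocation (sketch; the script name below is illustrative, not from the source).
# argv[1] selects the scenario pair via IDX_TO_PARAMS, argv[2] sets the reward mode,
# argv[3] is an optional model load path without its '.*' suffix ('-' to skip), argv[4]
# an optional transfer tag such as CC3toCC4 ('-' to skip), and argv[5] an optional GPU id:
#   python train_dqn.py 1 basic - CC3toCC4 0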
def main(argv):
    global master_network
    global global_episodes

    reward_mode = None
    if len(sys.argv) < 2:
        # general params
        # training params
        # PICK ONE and comment others
        params = PARAMS["CE3-CE4"]
        reward_mode = "negative_change_state_partial_action_seq_solution_multiplier"
        # params = PARAMS['CE3-CC4']
        # params = PARAMS['CC3-CE4']
        # params = PARAMS['CC3-CC4']
        # params = PARAMS['CE4']
        # params = PARAMS['CC4']
    else:
        setting = sys.argv[1]
        params = PARAMS[IDX_TO_PARAMS[int(setting) - 1]]
        print(
            "training_scenario: {}, testing_scenario: {}".format(
                params["train_scenario_name"], params["test_scenario_name"]
            )
        )
        reward_mode = sys.argv[2]

    use_physics = False
    num_training_iters = 100

    # RL specific settings
    params["data_dir"] = os.path.dirname(ROOT_DIR) + "/OpenLockRLResults/subjects"
    params["train_attempt_limit"] = 300
    params["test_attempt_limit"] = 300
    params["use_physics"] = False
    params["num_training_iters"] = 100
    params["reward_mode"] = reward_mode

    # RL specific settings
    params["use_physics"] = False
    params["full_attempt_limit"] = True  # run to the full attempt limit, regardless of whether or not all solutions were found
    params["train_num_iters"] = 100
    params["test_num_iters"] = 10
    # params['epsilon_decay'] = 0.9955
    # params['epsilon_decay'] = 0.9999
    params["epsilon_decay"] = 0.9996
    params["dynamic_epsilon_decay"] = 0.999
    params["dynamic_epsilon_max"] = 0.1
    params["use_dynamic_epsilon"] = True
    params["test_num_trials"] = 5

    params["train_attempt_limit"] = 300
    params["test_attempt_limit"] = 300
    params["gamma"] = 0.8  # discount rate
    params["epsilon"] = 0.005  # exploration rate (0.01 and 0.05 also used)
    params["epsilon_min"] = 0.001
    params["learning_rate"] = 0.0005
    params["batch_size"] = 64

    scenario = select_scenario(params["train_scenario_name"], use_physics=use_physics)

    ENV_NAME = "openlock-v1"
    env = gym.make(ENV_NAME)
    env.reward_mode = reward_mode

    env.use_physics = params["use_physics"]
    env.full_attempt_limit = params["full_attempt_limit"]

    # set up observation space
    env.observation_space = ObservationSpace(
        len(scenario.levers), append_solutions_remaining=False
    )
    # set reward mode
    env.reward_mode = params["reward_mode"]
    print("Reward mode: {}".format(env.reward_mode))

    agent = A3CAgent(env, 1, 1, "init", params)
    # create session/trial/experiment
    trial_selected = agent.setup_trial(
        scenario_name=params["train_scenario_name"],
        action_limit=params["train_action_limit"],
        attempt_limit=params["train_attempt_limit"],
    )

    env.observation_space = ObservationSpace(len(scenario.levers))

    MODEL_DIR = "./OpenLockRLResults/subjects" + "/models"
    MONITOR_DIR = "./OpenLockRLResults/subjects" + "/monitor"
    MODEL_DIR = os.path.dirname(ROOT_DIR) + "/OpenLockRLResults/subjects" + "/models"
    MONITOR_DIR = os.path.dirname(ROOT_DIR) + "/OpenLockRLResults/subjects" + "/monitor"

    STATE_DIM = env.observation_space.multi_discrete.shape[0]
    ACTION_DIM = len(env.action_space)

    # delete temporary env
    env.close()

    tf.reset_default_graph()
    config = tf.ConfigProto(allow_soft_placement=True)

    if not os.path.exists(MODEL_DIR):
        os.makedirs(MODEL_DIR)

    with tf.device("/cpu:0"):
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)

        global_episodes = tf.Variable(
            0, dtype=tf.int32, name="global_episodes", trainable=False
        )
        trainer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
        master_network = AC_Network(
            STATE_DIM, ACTION_DIM, "global", None, CELL_UNITS
        )  # Generate global network

        num_workers = (
            multiprocessing.cpu_count()
        )  # Set workers to number of available CPU threads

        # For testing and visualisation we only need one worker
        if TEST_MODEL:
            num_workers = 1
        # num_workers = 8  # or set your own worker count

        workers = []
        # Create worker causal_classes
        for i in range(num_workers):
            workers.append(
                Worker(
                    name=i,
                    s_size=STATE_DIM,
                    a_size=ACTION_DIM,
                    trainer=trainer,
                    model_path=MODEL_DIR,
                    global_episodes=global_episodes,
                    env_name=ENV_NAME,
                    seed=RANDOM_SEED,
                    test=TEST_MODEL,
                    cell_units=CELL_UNITS,
                    params=params,
                    testing_trial=TEST_MODEL,
                )
            )
        saver = tf.train.Saver(max_to_keep=num_workers)

        # Gym monitor
        if not TEST_MODEL:
            env = workers[0].get_env()
            env = gym.wrappers.Monitor(
                env, MONITOR_DIR, video_callable=False, force=True
            )

    with tf.Session(config=config) as sess:
        coord = tf.train.Coordinator()
        if LOAD_MODEL or TEST_MODEL:
            print("Loading Model...")
            ckpt = tf.train.get_checkpoint_state(MODEL_DIR)
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            sess.run(tf.global_variables_initializer())

        if TEST_MODEL:
            env = workers[0].get_env()
            # env = gym.wrappers.Monitor(env, MONITOR_DIR, force=True)
            workers[0].work(GAMMA, sess, coord, saver)
        else:
            # This is where the asynchronous magic happens.
            # Start the "work" process for each worker in a separate thread.
            print("Launching workers...")
            worker_threads = []
            for worker in workers:
                worker_work = lambda: worker.work(GAMMA, sess, coord, saver)
                t = threading.Thread(target=worker_work)
                t.start()
                time.sleep(1)
                worker_threads.append(t)
            coord.join(worker_threads)
def main():
    torch.set_default_tensor_type("torch.DoubleTensor")

    # general params
    # training params
    if len(sys.argv) < 2:
        # general params
        # training params
        # PICK ONE and comment others
        params = PARAMS["CE3-CE4"]
        # params = PARAMS['CE3-CC4']
        # params = PARAMS['CC3-CE4']
        # params = PARAMS['CC3-CC4']
        # params = PARAMS['CE4']
        # params = PARAMS['CC4']
    else:
        setting = sys.argv[1]
        params = PARAMS[IDX_TO_PARAMS[int(setting) - 1]]
        print(
            "training_scenario: {}, testing_scenario: {}".format(
                params["train_scenario_name"], params["test_scenario_name"]
            )
        )
        params["reward_mode"] = sys.argv[2]

    # a2c
    params["epsilon"] = 0.95
    params["l2_reg"] = 1e-3
    # trpo
    params["max_kl"] = 1e-2
    params["damping"] = 1e-2
    # ppo
    params["clip_epsilon"] = 0.2
    params["optim_value_epochs"] = 1
    # maml
    params["backbone"] = "trpo"
    # generic
    params["learning_rate"] = 0.01
    params["batch_size"] = 2048
    params["gamma"] = 0.99
    params["reward_mode"] = "basic"
    params["use_gpu"] = True
    params["gpuid"] = int(sys.argv[4]) if len(sys.argv) >= 5 else 0
    params["Tensor"] = (
        torch.cuda.DoubleTensor if params["use_gpu"] else torch.DoubleTensor
    )
    params["ActionTensor"] = (
        torch.cuda.LongTensor if params["use_gpu"] else torch.LongTensor
    )

    random_seed = 1234
    params["use_physics"] = False
    params["full_attempt_limit"] = False  # run to the full attempt limit, regardless of whether or not all solutions were found
    params["num_training_iters"] = 200  # assumed value; the original digit is illegible in the source
    params["num_training_trials"] = params["train_num_trials"]
    params["train_attempt_limit"] = 700
    params["num_testing_iters"] = 200  # assumed value; the original digit is illegible in the source
    params["num_testing_trials"] = params["test_num_trials"]
    params["test_attempt_limit"] = 700

    # RL specific settings
    params["data_dir"] = os.path.dirname(ROOT_DIR) + "/OpenLockRLResults/subjects"

    scenario = select_scenario(
        params["train_scenario_name"], use_physics=params["use_physics"]
    )

    env = gym.make("openlock-v1")

    env.use_physics = params["use_physics"]
    env.full_attempt_limit = params["full_attempt_limit"]

    # set up observation space
    env.observation_space = ObservationSpace(
        len(scenario.levers), append_solutions_remaining=False
    )
    # set reward mode
    env.reward_mode = params["reward_mode"]
    print("Reward mode: {}".format(env.reward_mode))

    np.random.seed(random_seed)
    env.seed(random_seed)

    # dummy agent
    agent = MAML_K_Shot_Agent(env, 1, 1, params, require_log=False)
    trial_selected = agent.setup_trial(
        scenario_name=params["train_scenario_name"],
        action_limit=params["train_action_limit"],
        attempt_limit=params["train_attempt_limit"],
    )
    env.reset()

    state_size = agent.env.observation_space.multi_discrete.shape[0]
    action_size = len(env.action_space)
    agent = MAML_K_Shot_Agent(env, state_size, action_size, params)
    save_path = os.path.join(
        params["data_dir"],
        "3rd_model_log/k_shot-{}-{}-{}".format(
            params["train_scenario_name"], params["reward_mode"], agent.subject_id
        ),
    )
    # save_path = os.path.join(params['data_dir'], '3rd_model_log/k_shot-CC3-{}-{}-{}'.format(
    #     params['train_scenario_name'], params['reward_mode'],
    #     agent.subject_id))
    load_path = sys.argv[3] if len(sys.argv) >= 4 else ""  # path without '.*' suffix
    os.makedirs(save_path, exist_ok=True)

    agent.env.reset()
    if load_path:
        agent.load(load_path)
        print("load model from {}".format(load_path))
    else:
        print("[Warn] No meta-trained model found, will transfer from scratch")
    agent.env.human_agent = False
    agent.type_tag = "{}-K_Shot".format(params["train_scenario_name"])

    # train over multiple iterations over all trials
    fig = create_reward_fig()
    for iter_num in range(params["num_training_iters"]):
        agent.env.completed_trials = []
        for trial_num in range(0, params["num_training_trials"]):
            agent.run_trial_maml_k_shot(
                scenario_name=params["train_scenario_name"],
                fig=fig,
                action_limit=params["train_action_limit"],
                attempt_limit=params["train_attempt_limit"],
                trial_count=trial_num,
                iter_num=iter_num,
            )
            fig, data = agent.log_values(
                [
                    agent.trial_length,
                    agent.trial_percent_attempt_success,
                    agent.trial_percent_solution_found,
                    agent.average_trial_rewards,
                    agent.attempt_rewards,
                ],
                fig,
                [
                    "Attempt Count Per Trial",
                    "Percentage of Successful Attempts in Trial",
                    "Percentage of Solutions Found in Trial",
                    "Average Trial Reward",
                    "Attempt Reward",
                ],
                agent.type_tag,
            )
            pickle.dump(
                (agent.type_tag, data, params),
                open(os.path.join(save_path, "log.pkl"), "wb"),
            )

        # update
        if len(agent.memory) > params["batch_size"]:
            batch = agent.memory.sample()
            print("update with bs:{}".format(len(batch.state)))
            agent.update(batch, iter_num)
            agent.memory.clear()

        agent.save(save_path, iter_num)
        print(
            "Trial complete for subject {}. Average reward: {}".format(
                agent.logger.subject_id, agent.average_trial_rewards[-1]
            )
        )

    fig.savefig(os.path.join(save_path, "log.png"))
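# Example invocation (sketch; the script name and checkpoint path below are illustrative,
# not from the source). argv[1] selects the scenario pair via IDX_TO_PARAMS, argv[2] sets
# the reward mode (overridden to "basic" above), argv[3] is the meta-trained model path
# without its '.*' suffix, and argv[4] an optional GPU id:
#   python train_maml_k_shot.py 1 basic /path/to/meta_model 0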