def load_database(n_parts, DB_ID, buffer_size, level):
    database = ExperienceBuffer(buffer_size, level)
    for i in range(0, n_parts):
        PATH = LOAD_PATH + DB_ID + '/SAC_training_level1_database_part_' + str(i) + '.p'
        with open(PATH, 'rb') as part_file:
            database.buffer += pickle.load(part_file)
    return database
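# Usage sketch with hypothetical values: reload a database that was previously saved
# in four pickled parts. The DB id, buffer size, and level below are placeholders for
# illustration only; LOAD_PATH is assumed to be defined at module level, as
# load_database expects.
if __name__ == '__main__':
    database = load_database(n_parts=4, DB_ID='demo_run', buffer_size=400000, level=1)
    print("Loaded transitions:", len(database.buffer))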
def train(params, log_dir, local_log, random_seed, trial, agent_id):
    # define device on which to run
    device = torch.device(params["DEVICE"])

    # create env and add Malmo-specific configuration
    env = make_env(params["DEFAULT_ENV_NAME"])
    env.configure(client_pool=[('127.0.0.1', 10000), ('127.0.0.1', 10001)])
    env.configure(allowDiscreteMovement=["move", "turn"])  # , log_level="INFO")
    env.configure(videoResolution=[84, 84])
    env.configure(stack_frames=4)
    env = wrap_env_malmo(env)
    if random_seed:
        env.seed(random_seed)
    print("Observation Space: ", env.observation_space)
    print("Action Space: ", env.action_space)

    # initialize agent
    buffer = ExperienceBuffer(params["REPLAY_SIZE"])
    net = DQN(env.observation_space.shape, env.action_space.n, params["DEVICE"]).to(device)
    tgt_net = DQN(env.observation_space.shape, env.action_space.n, params["DEVICE"]).to(device)
    epsilon = params["EPSILON_START"]
    gamma = params["GAMMA"]
    tau = params["SOFT_UPDATE_TAU"]
    agent = Agent('agent' + str(agent_id), env, buffer, net, tgt_net,
                  gamma, epsilon, tau, trial, log_dir, params)

    # other variables
    agent.optimizer = optim.Adam(agent.net.parameters(), lr=params["LEARNING_RATE"])
    agent.print_color = COLORS[agent_id]
    local_log[agent.alias + "-" + str(trial)] = {"rewards": [], "steps": []}

    # fill buffer with initial size - don't count these episodes
    agent.fill_buffer()

    # training loop
    ep_count = 0
    while not agent.completed:
        ep_count += 1
        episode_over = False
        episode_start = time.time()
        while not episode_over:
            # play step
            frame_start = time.time()
            episode_over, done_reward = agent.play_step(device=device)
            agent.frame_idx += 1

            #### The following methods run on an episode basis
            if done_reward is not None:
                # calculate episode speed
                agent.ep_speed = time.time() - episode_start
                # reset trackers
                episode_start = time.time()
                # save to local log as well
                local_log[agent.alias + "-" + str(trial)]["rewards"].append(agent.total_rewards[-1])
                local_log[agent.alias + "-" + str(trial)]["steps"].append(agent.total_steps[-1])
                if params["INDEPENDENT_EVALUATION"]:
                    offline_evaluation(params, agent)
                else:
                    online_evaluation(params, agent)
                ## check if the problem has been solved
                if agent.mean_reward is not None:
                    if agent.mean_reward > params["MEAN_REWARD_BOUND"]:
                        print(colored("%s solved in %d episodes!"
                                      % (agent.alias, len(agent.total_rewards)), agent.print_color))
                        agent.completed = True
                # if there is no sign of convergence, also break
                if len(agent.total_rewards) >= params["MAX_GAMES_PLAYED"]:
                    agent.completed = True

            #### The following methods run on a frame basis
            # decay epsilon linearly on frames
            agent.epsilon = max(params["EPSILON_FINAL"],
                                params["EPSILON_START"] -
                                (agent.frame_idx - params["REPLAY_START_SIZE"]) / params["EPSILON_DECAY_LAST_FRAME"])
            # update the target network at every frame using soft updates
            if params["SOFT"]:
                agent.soft_update_target_network()
            # or with hard updates at a fixed interval
            else:
                if agent.frame_idx % params["SYNC_TARGET_FRAMES"] == 0:
                    agent.hard_update_target_network()
            ## learn
            loss_t = agent.learn(device)
            # record
            agent.frame_speed = 1000 / (time.time() - frame_start)
            if params["DEBUG"]:
                agent.record_frame(loss_t.detach().item())  # detach required?

    # delete the buffer to force garbage collection later, it occupies too much memory
    del buffer
    # close the tensorboard writer
    agent.writer.close()
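# For reference, a minimal params dictionary covering the keys that train() above reads.
# All values are hypothetical placeholders, not the settings used in the experiments;
# the environment name in particular is only an assumption.
EXAMPLE_PARAMS = {
    "DEVICE": "cuda",
    "DEFAULT_ENV_NAME": "MinecraftBasic-v0",
    "REPLAY_SIZE": 10_000,
    "REPLAY_START_SIZE": 10_000,
    "LEARNING_RATE": 1e-4,
    "GAMMA": 0.99,
    "EPSILON_START": 1.0,
    "EPSILON_FINAL": 0.02,
    "EPSILON_DECAY_LAST_FRAME": 100_000,
    "SOFT": True,                 # soft (Polyak) target updates ...
    "SOFT_UPDATE_TAU": 0.005,
    "SYNC_TARGET_FRAMES": 1_000,  # ... or hard copies every N frames when SOFT is False
    "MEAN_REWARD_BOUND": 0.85,
    "MAX_GAMES_PLAYED": 2_000,
    "INDEPENDENT_EVALUATION": False,
    "DEBUG": False,
}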
agent = create_third_level_agent(concept_path, args.load_concept_id, args.n_concepts,
                                 noisy=noisy, n_heads=n_heads,
                                 init_log_alpha=args.init_log_alpha,
                                 latent_dim=args.vision_latent_dim,
                                 parallel=args.parallel_q_nets,
                                 lr=args.lr, lr_alpha=args.lr_alpha, lr_actor=args.lr_actor,
                                 min_entropy_factor=args.entropy_factor,
                                 lr_c=args.lr_c, lr_Alpha=args.lr_c_Alpha,
                                 entropy_update_rate=args.entropy_update_rate,
                                 init_Epsilon=args.init_epsilon_MC,
                                 delta_Epsilon=args.delta_epsilon_MC)

if args.load_id is not None:
    if args.load_best:
        agent.load(MODEL_PATH + env_name + '/best_', args.load_id)
    else:
        agent.load(MODEL_PATH + env_name + '/last_', args.load_id)

agents = collections.deque(maxlen=args.n_agents)
agents.append(agent)

os.makedirs(MODEL_PATH + env_name, exist_ok=True)

database = ExperienceBuffer(buffer_size, level=2)
trainer = Trainer(optimizer_kwargs=optimizer_kwargs)
returns = trainer.loop(env, agents, database, n_episodes=n_episodes, render=args.render,
                       max_episode_steps=n_steps_in_second_level_episode,
                       store_video=store_video, wandb_project=wandb_project,
                       MODEL_PATH=MODEL_PATH, train=(not args.eval),
                       initialization=initialization, init_buffer_size=init_buffer_size,
                       save_step_each=save_step_each, train_each=args.train_each,
                       n_step_td=n_step_td, train_n_MC=args.train_n_mc,
                       rest_n_MC=args.rest_n_mc, eval_MC=args.eval_MC)
G = returns.mean()
print("Mean episode return: {:.2f}".format(G))
parser.add_argument("--vision_latent_dim", default=DEFAULT_VISION_LATENT_DIM, type=int,
                    help="Dimensionality of feature vector added to inner state, default=" +
                         str(DEFAULT_VISION_LATENT_DIM))
args = parser.parse_args()

render_kwargs = {
    'pixels': {
        'width': 168,
        'height': 84,
        'camera_name': 'front_camera'
    }
}

database = ExperienceBuffer(args.buffer_size, level=3)
trainer = Trainer()

env_model_pairs = load_env_model_pairs(args.file)
n_envs = len(env_model_pairs)
n_episodes = (args.buffer_size * args.save_step_each) // args.n_steps
store_video = False

for env_number, (env_name, model_id) in enumerate(env_model_pairs.items()):
    task_database = ExperienceBuffer(args.buffer_size // n_envs, level=2)
    env = AntPixelWrapper(
        PixelObservationWrapper(gym.make(env_name).unwrapped,
                                pixels_only=False,
                                render_kwargs=render_kwargs.copy()))
def train(params, log_dir, local_log, random_seed, trial):
    # define device on which to run
    device = torch.device(params["DEVICE"])

    ## Marlo specifics: get join tokens
    env = init_environment(params["DEFAULT_ENV_NAME"])

    agents = []
    for aid in range(params["NUM_AGENTS"]):
        # initialize buffer
        if params["SHARING"] and params["PRIORITIZED_SHARING"]:
            buffer = ExperienceBufferGridImage(params["REPLAY_SIZE"])
        else:
            buffer = ExperienceBuffer(params["REPLAY_SIZE"])

        # initialize agent
        net = DQN(env.observation_space.shape, env.action_space.n, params["DEVICE"]).to(device)
        tgt_net = DQN(env.observation_space.shape, env.action_space.n, params["DEVICE"]).to(device)
        epsilon = params["EPSILON_START"]
        gamma = params["GAMMA"]
        tau = params["SOFT_UPDATE_TAU"]
        agent = Agent('agent' + str(aid), env, buffer, net, tgt_net,
                      gamma, epsilon, tau, trial, log_dir, params)

        # other variables
        agent.optimizer = optim.Adam(agent.net.parameters(), lr=params["LEARNING_RATE"])
        agent.print_color = COLORS[aid]
        local_log[agent.alias + "-" + str(trial)] = {"rewards": [], "steps": []}

        # fill buffer with initial size - don't count these episodes
        agent.fill_buffer()
        agents.append(agent)

    # training loop
    ep_count = 0
    while sum(map(lambda agent: agent.completed, agents)) != len(agents):
        # overall count of episodes
        ep_count += 1

        # sharing: before the agents act, do one round of experience sharing,
        # given a sharing interval and that it is not the first episode
        if params["SHARING"] and ep_count % params["SHARING_INTERVAL"] == 0 and ep_count > 0:
            if params["PRIORITIZED_SHARING"]:
                share(agents, params["BATCH_SIZE_TRANSFER"],
                      params["REPLAY_START_SIZE"], params["SHARING_THRESHOLD"])
            else:
                share_no_mask(agents, params["BATCH_SIZE_TRANSFER"], params["REPLAY_START_SIZE"])

        # each agent plays one episode
        for agent in agents:
            if not agent.completed:
                episode_over = False
                episode_start = time.time()
                while not episode_over:
                    # play step
                    frame_start = time.time()
                    episode_over, done_reward = agent.play_step(device=device)
                    agent.frame_idx += 1

                    #### The following methods run on an episode basis
                    if done_reward is not None:
                        # calculate episode speed
                        agent.ep_speed = 1 / (time.time() - episode_start)
                        # reset trackers
                        episode_start = time.time()
                        # save to local log as well
                        local_log[agent.alias + "-" + str(trial)]["rewards"].append(agent.total_rewards[-1])
                        local_log[agent.alias + "-" + str(trial)]["steps"].append(agent.total_steps[-1])
                        if params["INDEPENDENT_EVALUATION"]:
                            offline_evaluation(params, agent, log_dir)
                        else:
                            online_evaluation(params, agent, log_dir)
                        ## check if the problem has been solved
                        # a minimum number of episodes is needed to evaluate
                        if len(agent.total_rewards) >= params["NUMBER_EPISODES_MEAN"]:
                            # and the mean reward has to go above the boundary
                            if agent.mean_reward >= params["MEAN_REWARD_BOUND"]:
                                print(colored("%s solved in %d episodes!"
                                              % (agent.alias, len(agent.total_rewards)), agent.print_color))
                                agent.completed = True
                        # if there is no sign of convergence, also break
                        if len(agent.total_rewards) >= params["MAX_GAMES_PLAYED"]:
                            agent.completed = True

                    #### The following methods run on a frame basis
                    # decay epsilon linearly on frames
                    agent.epsilon = max(params["EPSILON_FINAL"],
                                        params["EPSILON_START"] -
                                        agent.frame_idx / params["EPSILON_DECAY_LAST_FRAME"])
                    # update the target network at every frame using soft updates
                    if params["SOFT"]:
                        agent.soft_update_target_network()
                    # or with hard updates at a fixed interval
                    else:
                        if agent.frame_idx % params["SYNC_TARGET_FRAMES"] == 0:
                            agent.hard_update_target_network()
                    ## learn
                    loss_t = agent.learn(device)
                    # record
                    agent.frame_speed = 1 / (time.time() - frame_start)
                    if params["DEBUG"]:
                        agent.record_frame(loss_t.detach().item())  # detach required?

    # delete the buffers to force garbage collection later, they occupy too much memory
    del buffer
    for agent in agents:
        del agent.exp_buffer
        # close the tensorboard writer
        agent.writer.close()
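# The SOFT flag above switches between Polyak-style soft target updates and periodic
# hard copies of the online network into the target network. The method names come from
# the training loops above; the Agent methods are assumed to do the equivalent of the
# minimal sketch below (the hard copy matches the inline load_state_dict call used in
# DQN_experiment further down).
def soft_update(net, tgt_net, tau):
    # target <- tau * online + (1 - tau) * target, applied parameter-wise
    for tgt_param, param in zip(tgt_net.parameters(), net.parameters()):
        tgt_param.data.copy_(tau * param.data + (1.0 - tau) * tgt_param.data)

def hard_update(net, tgt_net):
    # full copy of the online network weights into the target network
    tgt_net.load_state_dict(net.state_dict())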
def DQN_experiment(params, log_dir, random_seed=None):
    # define device on which to run
    device = torch.device(params["DEVICE"])

    # fix replay start size to be equal to replay size
    params["REPLAY_START_SIZE"] = params["REPLAY_SIZE"]

    ## initialize global variables
    # initialize local log trackers
    log_episodes_count = []
    log_ma_steps = []
    log_md_steps = []
    log_ma_rewards = []
    log_md_rewards = []
    colors = ['green', 'red', 'blue', 'yellow', 'cyan', 'magenta', 'grey', 'white']

    # run several trials and average the results to compensate for stochasticity
    for trial in range(params["NUM_TRIALS"]):
        # initialize environment
        agents = []
        # there needs to be one env per agent
        env = make_env(params["DEFAULT_ENV_NAME"])
        if random_seed:
            env.seed(random_seed)

        # initialize agents
        for idx in range(params["NUM_AGENTS"]):
            # initialize agent
            buffer = ExperienceBuffer(params["REPLAY_SIZE"], env)
            net = DQN(env.observation_space.shape[0], env.action_space.n, params["DEVICE"]).to(device)
            tgt_net = DQN(env.observation_space.shape[0], env.action_space.n, params["DEVICE"]).to(device)
            epsilon = params["EPSILON_START"]
            gamma = params["GAMMA"]
            tau = params["SOFT_UPDATE_TAU"]
            agent = Agent('agent' + str(idx + 1), env, buffer, net, tgt_net,
                          gamma, epsilon, tau, trial, log_dir)
            # other variables
            agent.optimizer = optim.Adam(agent.net.parameters(), lr=params["LEARNING_RATE"])
            agent.print_color = colors[idx]
            agents.append(agent)

        ######### training loop ################################
        ts = time.time()  # track start time

        ######### 1. Filling the replay buffer ################################
        # both agents fill their buffer prior to learning
        for agent in agents:
            while True:
                # add frame count
                agent.frame_idx += 1
                # play step
                episode_over, done_reward = agent.play_step(device=device)
                if params["DEBUG"]:
                    agent.record()
                # check if the minimum buffer size has been reached; until then,
                # keep collecting experience and do not learn
                if len(agent.exp_buffer) >= params["REPLAY_START_SIZE"]:
                    agent.reset()
                    break

        ######### 2. Agents start alternating ################################
        episode_start = time.time()
        ep_count = 0
        # while not all agents have completed:
        while sum(map(lambda agent: agent.completed, agents)) != len(agents):
            ep_count += 1
            # agents alternate
            for agent in agents:
                ## before the two agents act, do one round of experience sharing,
                ## given a sharing interval and that it is not the first episode
                if params["SHARING"] and ep_count % params["SHARING_INTERVAL"] == 0 and ep_count > 0:
                    # agent 1 requests
                    student, teacher = agents[0], agents[1]
                    transfer_mask = student.request_share(threshold=0)
                    transfer_batch = teacher.exp_buffer.sample_with_mask(student.steps[-1], transfer_mask)
                    student.exp_buffer.extend(transfer_batch)
                    # agent 2 requests
                    student, teacher = agents[1], agents[0]
                    transfer_mask = student.request_share(threshold=0)
                    transfer_batch = teacher.exp_buffer.sample_with_mask(student.steps[-1], transfer_mask)
                    student.exp_buffer.extend(transfer_batch)

                # check whether the agent has already completed the task;
                # if it has, go to the next agent
                if not agent.completed:
                    # play until the episode is over
                    episode_over = False
                    while not episode_over:
                        # add frame count
                        agent.frame_idx += 1
                        # play step
                        episode_over, done_reward = agent.play_step(device=device)
                        if done_reward is not None:
                            # calculate speed
                            agent.speed = (agent.frame_idx - agent.ts_frame) / (time.time() - ts)
                            agent.ts_frame = agent.frame_idx
                            ts = time.time()  # get time between episodes

                            ## verify completion and report metrics
                            if params["INDEPENDENT_EVALUATION"]:
                                if len(agent.total_rewards) % params["TRACKING_INTERVAL"] == 0:
                                    agent.test_rewards = []
                                    evaluation_start = time.time()
                                    for _ in range(100):
                                        done_reward = False
                                        while not done_reward:
                                            _, done_reward = agent.play_step(device=device, test=True)
                                        agent.test_rewards.append(done_reward)
                                    evaluation_time = time.time() - evaluation_start
                                    # only report after one episode ends
                                    agent.mean_reward = np.mean(agent.test_rewards)
                                    agent.std_reward = np.std(agent.test_rewards)
                                    # calculate elapsed time
                                    episode_end = time.time()
                                    episode_speed = params["TRACKING_INTERVAL"] / (episode_end - episode_start)
                                    episode_start = time.time()
                                    # report
                                    print(colored(
                                        "%s, %d: done %d episodes, mean reward %.2f, std reward %.2f, "
                                        "eps %.2f, speed %d f/s, ep_speed %.2f e/s, eval_time %.2f s" % (
                                            agent.alias, agent.frame_idx, len(agent.total_rewards),
                                            agent.mean_reward, agent.std_reward, agent.epsilon,
                                            agent.speed, episode_speed, evaluation_time
                                        ), agent.print_color))
                                ## check if the reward has improved since the last iteration
                                if agent.mean_reward is not None:
                                    if agent.mean_reward > params["MEAN_REWARD_BOUND"]:
                                        print(colored("%s solved in %d episodes!"
                                                      % (agent.alias, len(agent.total_rewards)), agent.print_color))
                                        # save final version
                                        # torch.save(agent.net.state_dict(), "weights/" + params["DEFAULT_ENV_NAME"] + "-" + agent.alias + "-best.dat")
                                        # mark as completed
                                        agent.completed = True
                                        # save local log
                                        log_episodes_count.append(len(agent.total_rewards))
                                        log_ma_steps.append(np.mean(agent.total_steps[-params["REPORTING_INTERVAL"]:]))

                            ## alternative approach: track evaluation using moving averages
                            else:
                                # only report after one episode ends
                                agent.mean_reward = np.mean(agent.total_rewards[-params["NUMBER_EPISODES_MEAN"]:])
                                agent.std_reward = np.std(agent.total_rewards[-params["NUMBER_EPISODES_MEAN"]:])
                                # calculate elapsed time
                                episode_end = time.time()
                                episode_speed = 1 / (episode_end - episode_start)
                                episode_start = time.time()
                                # report
                                if len(agent.total_rewards) % params["TRACKING_INTERVAL"] == 0:
                                    print(colored(
                                        "%s, %d: done %d episodes, mean reward %.2f, std reward %.2f, "
                                        "eps %.2f, speed %d f/s, ep_speed %.2f e/s" % (
                                            agent.alias, agent.frame_idx, len(agent.total_rewards),
                                            agent.mean_reward, agent.std_reward, agent.epsilon,
                                            agent.speed, episode_speed
                                        ), agent.print_color))
                                ## check if the reward has improved since the last iteration
                                if agent.mean_reward is not None:
                                    if agent.mean_reward > params["MEAN_REWARD_BOUND"]:
                                        print(colored("%s solved in %d episodes!"
                                                      % (agent.alias, len(agent.total_rewards)), agent.print_color))
                                        # save final version
                                        # torch.save(agent.net.state_dict(), "weights/" + params["DEFAULT_ENV_NAME"] + "-" + agent.alias + "-best.dat")
                                        # mark as completed
                                        agent.completed = True
                                        # save local log
                                        log_episodes_count.append(len(agent.total_rewards))
                                        log_ma_rewards.append(np.mean(agent.total_rewards[-params["REPORTING_INTERVAL"]:]))
                                        log_md_rewards.append(np.std(agent.total_rewards[-params["REPORTING_INTERVAL"]:]))
                                        log_ma_steps.append(np.mean(agent.total_steps[-params["REPORTING_INTERVAL"]:]))
                                        log_md_steps.append(np.std(agent.total_steps[-params["REPORTING_INTERVAL"]:]))

                            # if there is no sign of convergence, also break,
                            # but do not store the result
                            if len(agent.total_rewards) > params["MAX_GAMES_PLAYED"]:
                                agent.completed = True

                        # decay epsilon after the first episodes that fill the buffer
                        # decay epsilon linearly on frames
                        agent.epsilon = max(params["EPSILON_FINAL"],
                                            params["EPSILON_START"] -
                                            (agent.frame_idx - params["REPLAY_START_SIZE"]) / params["EPSILON_DECAY_LAST_FRAME"])

                        # update the target network at every frame using soft updates
                        if params["SOFT"]:
                            agent.soft_update_target_network()
                        # or with hard updates at a fixed interval
                        else:
                            if agent.frame_idx % params["SYNC_TARGET_FRAMES"] == 0:
                                agent.tgt_net.load_state_dict(agent.net.state_dict())

                        ## learn
                        # zero gradients
                        agent.optimizer.zero_grad()
                        # sample from buffer
                        batch = agent.exp_buffer.sample(params["BATCH_SIZE"])
                        # calculate loss
                        # kept as a method on the agent instead of a free-floating function
                        loss_t = agent.calc_loss(batch, device=device)
                        # calculate gradients
                        loss_t.backward()
                        # gradient clipping
                        if params["GRADIENT_CLIPPING"]:
                            nn.utils.clip_grad_norm_(agent.net.parameters(), params["GRAD_L2_CLIP"])
                        # optimize
                        agent.optimizer.step()
                        # track agent parameters, including the loss value
                        # detach the loss before extracting its value - not sure if needed, but better safe than sorry
                        if params["DEBUG"]:
                            agent.record(loss_t.detach().item())

        for agent in agents:
            agent.writer.close()

    # return local log with results
    local_log = {
        "episodes_count": log_episodes_count,
        "ma_steps": log_ma_steps,
        "md_steps": log_md_steps,
        "ma_rewards": log_ma_rewards,
        "md_rewards": log_md_rewards,
    }
    return local_log
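# The learn step above relies on agent.calc_loss. Below is a minimal sketch of the
# standard one-step DQN temporal-difference loss it is assumed to compute; the
# (states, actions, rewards, dones, next_states) batch layout and the use of the
# module-level torch/numpy imports are assumptions, not the repository's actual
# implementation.
def calc_loss(self, batch, device="cpu"):
    states, actions, rewards, dones, next_states = batch

    states_v = torch.tensor(np.array(states)).to(device)
    next_states_v = torch.tensor(np.array(next_states)).to(device)
    actions_v = torch.tensor(actions, dtype=torch.int64).to(device)
    rewards_v = torch.tensor(rewards, dtype=torch.float32).to(device)
    done_mask = torch.tensor(dones, dtype=torch.bool).to(device)

    # Q(s, a) for the actions that were actually taken
    state_action_values = self.net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)
    # max_a' Q_target(s', a'), with terminal states masked to zero
    with torch.no_grad():
        next_state_values = self.tgt_net(next_states_v).max(1)[0]
        next_state_values[done_mask] = 0.0
    # one-step TD target: r + gamma * max_a' Q_target(s', a')
    expected_values = rewards_v + self.gamma * next_state_values
    return nn.MSELoss()(state_action_values, expected_values)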
# Initialize Weights-and-Biases project
if wandb_project:
    wandb.init(project=project_name)
    # Log hyperparameters in the WandB project
    wandb.config.update(args)
    wandb.config.active_multitask = DEFAULT_ACTIVE_MULTITASK
    wandb.config.active_dc_torque = DAFAULT_DC_TORQUE

env = gym.make(args.env_name)
agent = generate_agent(env, args.load_id, args.load_best, actor_critic_kwargs)

database = ExperienceBuffer(args.buffer_size, level=1)
trainer = Trainer(optimizer_kwargs=optimizer_kwargs)
returns = trainer.loop(env, agent, database, n_episodes=n_episodes, render=args.render,
                       max_episode_steps=args.n_steps_in_episode,
                       store_video=store_video, wandb_project=wandb_project,
                       MODEL_PATH=MODEL_PATH, train=(not args.eval),
                       initialization=args.initialization,
                       init_buffer_size=args.init_buffer_size)