def start_experiment(experiment):
    # Seed all RNGs and set up logging before training starts.
    init_random_seeds(experiment.random_seed, cuda_determenistic=False)
    init_logger("logdir", experiment.logname)
    log().update_params(experiment.to_dict())
    try:
        train(experiment)
    finally:
        # Persist logs even if training crashes or is interrupted.
        log().save_logs()

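# Illustrative sketch only: the real Experiment class is defined elsewhere in the
# project. This hypothetical ExperimentConfig just lists the attributes that
# start_experiment()/train() read; field names beyond those and all default
# values are assumptions.
from dataclasses import dataclass, asdict
from typing import Any


@dataclass
class ExperimentConfig:
    logname: str
    env: Any                      # environment factory, called as env(random_state=...)
    hyperparams: Any              # must expose memory_config.memory_size
    pruner: Any
    stop_criterion: Any
    device: str = "cpu"
    random_seed: int = 42
    prune_percent: float = 20.0   # percent of remaining weights pruned per iteration
    prune_iters: int = 5
    episodes: int = 500
    opt_steps: int = 100_000

    def to_dict(self):
        return asdict(self)
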
def train(experiment):
    env = experiment.env(random_state=experiment.random_seed)
    memory = ReplayMemory(experiment.hyperparams.memory_config.memory_size)
    controller = ControllerDQN(env=env,
                               memory=memory,
                               params=experiment.hyperparams,
                               prune_percent=experiment.prune_percent,
                               pruner=experiment.pruner,
                               stop_criterion=experiment.stop_criterion,
                               device=experiment.device)
    agent = Agent(env, controller)

    # After every EXPLORE_ITERS training rollouts, run EXPLOIT_ITERS greedy rollouts.
    EXPLORE_ITERS = 1
    EXPLOIT_ITERS = 1
    episodes, prune_iters, opt_steps = experiment.episodes, experiment.prune_iters, experiment.opt_steps

    for iter in range(prune_iters):
        pbar = tqdm(range(episodes))
        # Fraction of weights still unpruned after `iter` pruning rounds.
        cur_percent = (1 - experiment.prune_percent / 100) ** iter
        explore_plot = "Explore_iter" + str(iter) + "_prune" + str(cur_percent)
        exploit_plot = "Exploit_iter" + str(iter) + "_prune" + str(cur_percent)
        log().add_plot(explore_plot, columns=("train_episode", "train_steps", "reward"))
        log().add_plot(exploit_plot, columns=("train_episode", "train_steps", "reward"))

        for episode in pbar:
            # Once in EXPLORE_ITERS train rollouts, do EXPLOIT_ITERS exploit rollouts.
            if episode % EXPLORE_ITERS == EXPLORE_ITERS - 1:
                for _ in range(EXPLOIT_ITERS):
                    pbar.set_description(
                        "Iter[{}/{}] Episode [{}/{}] Step[{}/{}] Exploit".format(
                            iter + 1, prune_iters, episode + 1, episodes,
                            controller.steps_done, opt_steps))
                    exploit(agent, episode, exploit_plot)

            pbar.set_description(
                "Iter[{}/{}] Episode [{}/{}] Step[{}/{}] Explore".format(
                    iter + 1, prune_iters, episode + 1, episodes,
                    controller.steps_done, opt_steps))
            explore(agent, episode, explore_plot)

            if controller.steps_done >= opt_steps:
                break
            if controller.optimization_completed() and not iter + 1 == prune_iters:
                # The stop criterion is ignored on the last pruning iteration.
                break

        torch.cuda.empty_cache()
        log().save_logs()
        log().save_model(controller.get_state(),
                         "model:iter{}:{}".format(iter, cur_percent))
        controller.prune()
        controller.reinit()

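# Illustrative sketch, not the project's ControllerDQN.prune()/pruner: iterative
# magnitude pruning keeps a boolean mask per weight tensor and, on each call,
# zeroes the smallest `prune_percent` percent of the weights that are still alive,
# so roughly (1 - prune_percent / 100) ** iter of the weights remain after `iter`
# rounds, matching `cur_percent` above. All names here are assumptions.
import torch


def magnitude_prune_sketch(net, masks, prune_percent):
    """Zero the smallest `prune_percent`% of still-unpruned weights in place.

    `masks` is assumed to map parameter name -> torch.bool tensor of the same shape.
    """
    with torch.no_grad():
        for name, param in net.named_parameters():
            if name not in masks:
                continue
            mask = masks[name]
            alive = param[mask].abs()
            if alive.numel() == 0:
                continue
            threshold = torch.quantile(alive, prune_percent / 100.0)
            mask &= param.abs() > threshold   # drop weights below the threshold
            param.mul_(mask)                  # keep pruned weights at exactly zero
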
def launch_after_training(params, net_state_dict, device, episodes, opt_steps):
    env = Assault(23)
    net = DQN(env.state_sz,
              env.action_sz,
              "vae",
              params["image_input"] == "True",
              device=device).to(device)
    net.load_state_dict(net_state_dict)
    # Greedy policy: always take the action with the highest Q-value.
    controller = FixedController(
        lambda state, explore: net(state.to(device)).max(1)[1].item())
    agent = Agent(env, controller)

    plot_name = "AfterTraining"
    log().add_plot(plot_name, columns=("train_episode", "train_steps", "reward"))

    pbar = tqdm(range(episodes))
    total_steps = 0
    for episode in pbar:
        pbar.set_description("Episode [{}/{}] Step[{}/{}] Exploit".format(
            episode + 1, episodes, total_steps, opt_steps))
        reward, steps = agent.rollout(train=False)
        total_steps += steps
        log().add_plot_point(plot_name, (episode, total_steps, reward))
        if total_steps >= opt_steps:
            break
    log().save_logs()

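# Hypothetical usage of launch_after_training(): the checkpoint path, episode
# counts, and params dict are placeholders; the state dict is assumed to be the
# one written by log().save_model() in train() above.
def replay_saved_model_example():
    import torch

    state = torch.load("logdir/model_iter4.pt", map_location="cpu")  # placeholder path
    launch_after_training(params={"image_input": "True"},
                          net_state_dict=state,
                          device=torch.device("cpu"),
                          episodes=100,
                          opt_steps=50_000)
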
def exploit(agent, train_episode, plot_name):
    # Greedy evaluation rollout: no exploration, no parameter updates.
    reward, steps = agent.rollout(train=False)
    log().add_plot_point(plot_name,
                         (train_episode, agent.controller.steps_done, reward))
    agent.controller.metrics["stability"].add(reward)

def explore(agent, train_episode, plot_name):
    # Training rollout: the controller explores and performs optimization steps.
    reward, steps = agent.rollout(train=True)
    log().add_plot_point(plot_name,
                         (train_episode, agent.controller.steps_done, reward))

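# Minimal Agent sketch (an assumption, not the project's Agent class) showing the
# rollout() contract that explore()/exploit() rely on: one full episode is played
# and (total_reward, steps) is returned; with train=True the controller is also
# allowed to store transitions and run optimization steps. The controller and
# environment method names below are assumptions.
class AgentSketch:
    def __init__(self, env, controller):
        self.env = env
        self.controller = controller

    def rollout(self, train):
        state = self.env.reset()
        total_reward, steps, done = 0.0, 0, False
        while not done:
            action = self.controller.act(state, explore=train)   # method name assumed
            state, reward, done = self.env.step(action)          # env API assumed
            total_reward += reward
            steps += 1
            if train:
                self.controller.optimize()                       # method name assumed
        return total_reward, steps
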
def add(self, value):
    # Store the values as a JSON-encoded list of Python scalars.
    log().add_plot_point(self.name, json.dumps([x.item() for x in value]))

def __init__(self, name):
    self.name = name
    log().add_plot(name, columns=("metric_value",))

def get_plot(self):
    return log().get_plot(self.name)

def add__(self, value):
    log().add_plot_point(self.name, value)
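
# Hypothetical usage of the metric methods above, assuming they belong to a class
# named Metric (the class name is an assumption; the "stability" key matches the
# metric used in exploit() above).
def stability_metric_example():
    import torch

    stability = Metric("stability")          # __init__ registers a "stability" plot
    stability.add(torch.tensor([1.0, 2.0]))  # add() JSON-encodes a list of scalars
    return stability.get_plot()              # returns the accumulated plot points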