import gym
import gym_minigrid.wrappers
import torch
from tqdm import tqdm

from all_the_tools.config import Config as C

# Project-local helpers; import paths assumed.
import wrappers
from history import History
from model import ActorCriticModel
from utils import n_step_bootstrapped_return


def main():
    config = C(
        horizon=32,
        discount=0.99,
    )

    env = gym.make("MiniGrid-Empty-Random-6x6-v0")
    env = gym_minigrid.wrappers.OneHotPartialObsWrapper(env)
    env = gym_minigrid.wrappers.ImgObsWrapper(env)
    env = wrappers.Batch(env)
    env = wrappers.Torch(env)

    model = ActorCriticModel(env.observation_space, env.action_space)
    optimizer = torch.optim.RMSprop(model.parameters(), 0.01, alpha=0.99, eps=1e-8)

    obs = env.reset()
    output = model(obs)
    for _ in tqdm(range(10000)):
        history = History()
        for i in range(config.horizon):
            transition = history.append_transition()
            transition.record(value=output.value)
            action = output.dist.sample()
            obs, reward, done, _ = env.step(action)
            transition.record(reward=reward, done=done)
            output_prime = model(obs)
            output = output_prime

        rollout = history.build()
        return_ = n_step_bootstrapped_return(
            rollout.reward, output_prime.value.detach(), rollout.done, config.discount
        )
        td_error = rollout.value - return_
        loss = td_error.pow(2).mean()

        optimizer.zero_grad()  # reset gradients before each update
        loss.backward()
        optimizer.step()
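# A minimal sketch (assumed, not the repo's actual utils implementation) of the
# n_step_bootstrapped_return helper used above: it walks the rollout backwards,
# bootstrapping from value_prime and masking the bootstrap at episode ends.
# Shapes are assumed to be reward_t, done_t: (batch, horizon); value_prime: (batch,).
import torch


def n_step_bootstrapped_return(reward_t, value_prime, done_t, discount):
    return_ = value_prime
    returns = torch.zeros_like(reward_t)
    for t in reversed(range(reward_t.size(1))):
        # Zero the bootstrap where the episode terminated at step t.
        return_ = reward_t[:, t] + discount * return_ * (1 - done_t[:, t].float())
        returns[:, t] = return_
    return returns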
from all_the_tools.config import Config as C

config = C(
    seed=42,
    env="MiniGrid-FourRooms-v0",
    episodes=100000,
    log_interval=10,
    transforms=[
        C(type="gridworld"),
    ],
    gamma=0.99,
    entropy_weight=1e-2,
    adv_norm=True,
    grad_clip_norm=1.0,
    horizon=32,
    workers=32,
    model=C(encoder=C(type="gridworld", base_channels=8, out_features=32), rnn=C(type="lstm")),
    opt=C(type="adam", lr=1e-3),
)
from all_the_tools.config import Config as C

config = C(
    num_epochs=100,
    batch_size=128,
    loss="bce",
    opt=C(
        type="adam",
        args=C(
            lr=0.0002,
            betas=(0.5, 0.999),
        ),
    ),
    dsc=C(
        num_steps=1,
        clip=None,
    ),
)
from all_the_tools.config import Config as C

epochs = 10
batch_size = 64

config = C(
    seed=42,
    image_size=256,
    crop_size=256,
    model="effnet-b0",
    train=C(
        epochs=epochs,
        batch_size=batch_size,
        loss=["ce"],
        opt=C(
            type="adam",
            lr=1e-3,
            momentum=0.9,
            weight_decay=1e-4,
            la=C(lr=0.5, steps=5),
            ema=C(mom=0.96, steps=5),
        ),
        sched=C(type="warmup_cosine", epochs_warmup=0),
    ),
    eval=C(batch_size=batch_size),
)
from all_the_tools.config import Config as C

config = C(
    seed=42,
    epochs=100,
    model=None,
    train=C(
        batch_size=32,
        optimizer=C(type="sgd", lr=1e-2, momentum=0.9, weight_decay=1e-4),
        scheduler=C(type="cosine"),
    ),
    eval=C(batch_size=1),
)
import torch
from torch import nn
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

from all_the_tools.config import Config as C

# Project-local helpers; import paths assumed.
import utils
import wrappers
from agent import Agent
from env import build_env
from history import History
from metrics import Stack
from vec_env import VecEnv


def main(**kwargs):
    config = C(
        random_seed=42,
        learning_rate=1e-4,
        horizon=32,
        discount=0.99,
        num_episodes=100000,
        num_workers=32,
        entropy_weight=1e-2,
        log_interval=100,
    )
    for k in kwargs:
        config[k] = kwargs[k]

    utils.random_seed(config.random_seed)
    writer = SummaryWriter(config.experiment_path)

    # build env
    env = VecEnv([build_env for _ in range(config.num_workers)])
    env = wrappers.TensorboardBatchMonitor(env, writer, log_interval=config.log_interval, fps_mul=0.5)
    env = wrappers.Torch(env)

    # build agent and optimizer
    agent = Agent(env.observation_space, env.action_space)
    optimizer = torch.optim.Adam(
        agent.parameters(), config.learning_rate * config.num_workers, betas=(0.0, 0.999)
    )

    # train
    metrics = {
        "episode/return": Stack(),
        "episode/length": Stack(),
        "rollout/reward": Stack(),
        "rollout/value_target": Stack(),
        "rollout/value": Stack(),
        "rollout/td_error": Stack(),
        "rollout/entropy": Stack(),
        "rollout/actor_loss": Stack(),
        "rollout/critic_loss": Stack(),
        "rollout/loss": Stack(),
    }

    episode = 0
    opt_step = 0
    pbar = tqdm(total=config.num_episodes)
    env.seed(config.random_seed)
    obs = env.reset()
    action = torch.zeros(config.num_workers, dtype=torch.int)
    memory = agent.zero_memory(config.num_workers)

    while episode < config.num_episodes:
        memory = tuple(x.detach() for x in memory)

        # Collect a fixed-horizon rollout. The original hid this behind a bare
        # collect_rollout() call; the body below is reconstructed (assumed)
        # from the sibling training script in this repo.
        history = History()
        for i in range(config.horizon):
            transition = history.append_transition()
            dist, value, memory_prime = agent(obs, action, memory)
            transition.record(value=value, entropy=dist.entropy())
            action = dist.sample()
            transition.record(log_prob=dist.log_prob(action))
            obs_prime, reward, done, info = env.step(action)
            transition.record(reward=reward, done=done)
            memory_prime = agent.reset_memory(memory_prime, done)
            obs, memory = obs_prime, memory_prime

            for i in info:
                if "episode" not in i:
                    continue
                episode += 1
                pbar.update()
                metrics["episode/return"].update(i["episode"]["r"])
                metrics["episode/length"].update(i["episode"]["l"])

        rollout = history.build()
        _, value_prime, _ = agent(obs_prime, action, memory_prime)

        # value_target = utils.n_step_bootstrapped_return(
        #     reward_t=rollout.reward,
        #     done_t=rollout.done,
        #     value_prime=value_prime.detach(),
        #     gamma=config.discount,
        # )
        value_target = utils.generalized_advantage_estimation(
            reward_t=rollout.reward,
            value_t=rollout.value.detach(),
            value_prime=value_prime.detach(),
            done_t=rollout.done,
            gamma=config.discount,
            lambda_=0.96,
        )
        value_target += rollout.value.detach()

        td_error = value_target - rollout.value
        critic_loss = td_error.pow(2)
        actor_loss = -rollout.log_prob * td_error.detach() - config.entropy_weight * rollout.entropy
        loss = actor_loss + 0.5 * critic_loss

        optimizer.zero_grad()
        loss.sum(1).mean().backward()
        # nn.utils.clip_grad_norm_(agent.parameters(), 0.01)
        optimizer.step()
        opt_step += 1

        metrics["rollout/reward"].update(rollout.reward.detach())
        metrics["rollout/value"].update(rollout.value.detach())
        metrics["rollout/value_target"].update(value_target.detach())
        metrics["rollout/td_error"].update(td_error.detach())
        metrics["rollout/entropy"].update(rollout.entropy.detach())
        metrics["rollout/actor_loss"].update(actor_loss.detach())
        metrics["rollout/critic_loss"].update(critic_loss.detach())
        metrics["rollout/loss"].update(loss.detach())

        if opt_step % 10 == 0:
            # td_error_std_normalized = td_error.std() / value_target.std()
            print("log rollout")

            total_norm = torch.norm(
                torch.stack([torch.norm(p.grad.detach(), 2.0) for p in agent.parameters()]), 2.0
            )
            writer.add_scalar("rollout/grad_norm", total_norm, global_step=episode)

            for k in [
                "rollout/reward",
                "rollout/value_target",
                "rollout/value",
                "rollout/td_error",
            ]:
                v = metrics[k].compute_and_reset()
                writer.add_scalar(f"{k}/mean", v.mean(), global_step=episode)
                writer.add_histogram(f"{k}/hist", v, global_step=episode)

            for k in [
                "rollout/entropy",
                "rollout/actor_loss",
                "rollout/critic_loss",
                "rollout/loss",
            ]:
                v = metrics[k].compute_and_reset()
                writer.add_scalar(f"{k}/mean", v.mean(), global_step=episode)

            writer.flush()
            # writer.add_scalar(
            #     "rollout/td_error_std_normalized", td_error_std_normalized, global_step=episode
            # )
            # writer.add_histogram("rollout/reward", rollout.reward, global_step=episode)

    env.close()
    writer.close()
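# A minimal sketch (assumed, not the repo's actual utils implementation) of
# generalized_advantage_estimation as called above: it returns lambda-weighted
# advantages, which the caller adds back to the value baseline to form the
# value target. Same (batch, horizon) shape conventions as the rollout tensors.
import torch


def generalized_advantage_estimation(reward_t, value_t, value_prime, done_t, gamma, lambda_):
    advantage = torch.zeros_like(reward_t)
    gae = torch.zeros_like(value_prime)
    value_next = value_prime
    for t in reversed(range(reward_t.size(1))):
        mask = 1 - done_t[:, t].float()
        # One-step TD error, with the bootstrap masked at episode boundaries.
        delta = reward_t[:, t] + gamma * value_next * mask - value_t[:, t]
        gae = delta + gamma * lambda_ * gae * mask
        advantage[:, t] = gae
        value_next = value_t[:, t]
    return advantage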
from all_the_tools.config import Config as C

config = C(
    seed=42,
    env="ReacherPyBulletEnv-v0",
    episodes=100000,
    log_interval=1000,
    transforms=[],
    gamma=0.99,
    entropy_weight=1e-2,
    horizon=8,
    workers=32,
    model=C(encoder=C(type="fc", out_features=32), rnn=C(type="noop")),
    opt=C(type="adam", lr=1e-3),
)
from all_the_tools.config import Config as C

config = C(
    epochs=20,
    dataset="mnist",
    image_size=32,
    batch_size=128,
    latent_size=128,
    model=C(base_features=16),
    opt=C(lr=2e-4),
)
from all_the_tools.config import Config as C

config = C(
    seed=42,
    env="Breakout-v0",
    episodes=100000,
    log_interval=100,
    transforms=[
        C(type="adj_max"),
        C(type="grayscale"),
        # C(type='resize', size=84),
        C(type="stack", k=4, dim=0),
        C(type="skip", k=4),
        C(type="normalize"),
    ],
    gamma=0.99,
    entropy_weight=1e-2,
    horizon=8,
    workers=32,
    model=C(
        encoder=C(
            pre=C(
                type="conv",
                base_channels=16,
                out_features=128,
            ),
            rnn=None,
        )
    ),
    opt=C(type="adam", lr=1e-3),
)
from all_the_tools.config import Config as C

epochs = 20
batch_size = 64
lr = 0.05 * batch_size / 256  # TODO: try "optimal" config

config = C(
    seed=42,
    epochs=epochs,
    image_size=256,
    model=C(backbone="resnet34"),
    train=C(
        batch_size=batch_size,
        random_resize_scale=2 / 3,
        opt=C(
            type="sgd",
            lr=lr,
            weight_decay=1e-4,
            look_ahead=None,
            sgd=C(momentum=0.9),
            ada_belief=C(weight_decouple=False),
        ),
        sched=C(
            type="cosine",
            multistep=C(steps=[round(epochs * 0.6), round(epochs * 0.8)]),
        ),
    ),
)
from all_the_tools.config import Config as C

k = 0.5
epochs = 1000
batch_size = 128

config = C(
    seed=42,
    epochs=epochs,
    epochs_warmup=int(epochs * 0.1),
    log_interval=int(epochs * 0.01),
    model="resnet50",
    train=C(
        batch_size=int(batch_size * k),
        num_labeled=4000,
        mix_match=C(weight_u=75.0, temp=0.5, alpha=0.75),
        opt=C(type="sgd", lr=0.1 * k, momentum=0.9, weight_decay=1e-4),
        sched=C(type="warmup_cosine"),
    ),
    eval=C(batch_size=int(batch_size * k)),
)
from all_the_tools.config import Config as C

config = C(epochs=50, batch_size=10, opt=C(lr=2.5e-4), sched=C(steps=[30]))
import torch
from torch import nn
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

from all_the_tools.config import Config as C

# Project-local helpers; import paths assumed.
import utils
import wrappers
from agent import Agent
from env import build_env
from history import History
from metrics import Stack
from utils import agg, select_action
from vec_env import VecEnv


def main(**kwargs):
    config = C(
        random_seed=42,
        learning_rate=1e-3,
        horizon=16,
        discount=0.995,
        num_observations=1000000,
        num_workers=32,
        entropy_weight=1e-2,
        episode_log_interval=100,
        opt_log_interval=10,
        average_reward_lr=0.001,
        clip_grad_norm=None,
        model=C(
            num_features=64,
            encoder=C(type="minigrid"),
            memory=C(type="lstm"),
        ),
    )
    for k in kwargs:
        config[k] = kwargs[k]

    utils.random_seed(config.random_seed)
    writer = SummaryWriter(config.experiment_path)

    # build env
    env = VecEnv([build_env for _ in range(config.num_workers)])
    env = wrappers.TensorboardBatchMonitor(
        env, writer, log_interval=config.episode_log_interval, fps_mul=0.5
    )
    env = wrappers.Torch(env)

    # build agent and optimizer
    agent = Agent(
        env.observation_space,
        env.action_space,
        **config.model,
    )
    optimizer = torch.optim.Adam(
        agent.parameters(),
        config.learning_rate,
        betas=(0.0, 0.999),
    )
    average_reward = 0

    # load state
    # state = torch.load("./state.pth")
    # agent.load_state_dict(state["agent"])
    # optimizer.load_state_dict(state["optimizer"])

    # train
    metrics = {
        "episode/return": Stack(),
        "episode/length": Stack(),
        "rollout/reward": Stack(),
        "rollout/value_target": Stack(),
        "rollout/value": Stack(),
        "rollout/td_error": Stack(),
        "rollout/entropy": Stack(),
        "rollout/actor_loss": Stack(),
        "rollout/critic_loss": Stack(),
        "rollout/loss": Stack(),
    }

    opt_step = 0
    observation_step = 0
    pbar = tqdm(total=config.num_observations)
    env.seed(config.random_seed)
    obs = env.reset()
    action = torch.zeros(config.num_workers, dtype=torch.int)
    memory = agent.zero_memory(config.num_workers)
    # r_stats = utils.RunningStats()

    while observation_step < config.num_observations:
        history = History()
        memory = agent.detach_memory(memory)

        for i in range(config.horizon):
            transition = history.append_transition()
            dist, value, memory_prime = agent(obs, action, memory)
            transition.record(value=value, entropy=dist.entropy())
            action = select_action(dist)
            transition.record(log_prob=dist.log_prob(action))
            obs_prime, reward, done, info = env.step(action)
            observation_step += config.num_workers
            pbar.update(config.num_workers)
            # for r in reward:
            #     r_stats.push(r)
            # reward = reward / r_stats.standard_deviation()
            transition.record(reward=reward, done=done)
            memory_prime = agent.reset_memory(memory_prime, done)
            obs, memory = obs_prime, memory_prime

            for i in info:
                if "episode" not in i:
                    continue
                metrics["episode/return"].update(i["episode"]["r"])
                metrics["episode/length"].update(i["episode"]["l"])

        rollout = history.build()
        _, value_prime, _ = agent(obs_prime, action, memory_prime)

        # value_target = utils.n_step_bootstrapped_return(
        #     reward_t=rollout.reward,
        #     done_t=rollout.done,
        #     value_prime=value_prime.detach(),
        #     discount=config.discount,
        # )
        advantage = utils.generalized_advantage_estimation(
            reward_t=rollout.reward,
            value_t=rollout.value.detach(),
            value_prime=value_prime.detach(),
            done_t=rollout.done,
            gamma=config.discount,
            lambda_=0.96,
        )
        value_target = advantage + rollout.value.detach()
        # value_target = utils.differential_n_step_bootstrapped_return(
        #     reward_t=rollout.reward,
        #     done_t=rollout.done,
        #     value_prime=value_prime.detach(),
        #     average_reward=average_reward,
        # )

        td_error = value_target - rollout.value
        critic_loss = 0.5 * td_error.pow(2)
        actor_loss = -rollout.log_prob * td_error.detach() - config.entropy_weight * rollout.entropy
        loss = actor_loss + critic_loss

        optimizer.zero_grad()
        agg(loss).backward()
        if config.clip_grad_norm is not None:
            nn.utils.clip_grad_norm_(agent.parameters(), config.clip_grad_norm)
        optimizer.step()
        average_reward += config.average_reward_lr * agg(td_error.detach())  # TODO: do not use td-error
        opt_step += 1

        metrics["rollout/reward"].update(rollout.reward.detach())
        metrics["rollout/value"].update(rollout.value.detach())
        metrics["rollout/value_target"].update(value_target.detach())
        metrics["rollout/td_error"].update(td_error.detach())
        metrics["rollout/entropy"].update(rollout.entropy.detach())
        metrics["rollout/actor_loss"].update(actor_loss.detach())
        metrics["rollout/critic_loss"].update(critic_loss.detach())
        metrics["rollout/loss"].update(loss.detach())

        if opt_step % config.opt_log_interval == 0:
            print("log metrics")
            writer.add_scalar("rollout/average_reward", average_reward, global_step=observation_step)

            grad_norm = torch.norm(
                torch.stack([torch.norm(p.grad.detach(), 2.0) for p in agent.parameters()]), 2.0
            )
            writer.add_scalar("rollout/grad_norm", grad_norm, global_step=observation_step)

            for k in [
                "rollout/reward",
                "rollout/value_target",
                "rollout/value",
                "rollout/td_error",
            ]:
                v = metrics[k].compute_and_reset()
                writer.add_scalar(f"{k}/mean", v.mean(), global_step=observation_step)
                writer.add_histogram(f"{k}/hist", v, global_step=observation_step)

            for k in [
                "rollout/entropy",
                "rollout/actor_loss",
                "rollout/critic_loss",
                "rollout/loss",
            ]:
                v = metrics[k].compute_and_reset()
                writer.add_scalar(f"{k}/mean", v.mean(), global_step=observation_step)

            for k in [
                "episode/return",
                "episode/length",
            ]:
                v = metrics[k].compute_and_reset()
                writer.add_scalar(f"{k}/mean", v.mean(), global_step=observation_step)
                writer.add_histogram(f"{k}/hist", v, global_step=observation_step)

            writer.flush()
            # torch.save(
            #     {
            #         "agent": agent.state_dict(),
            #         "optimizer": optimizer.state_dict(),
            #         "average_reward": average_reward,
            #     },
            #     "./state.pth",
            # )

    env.close()
    writer.close()
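# A minimal sketch (assumed) of the Stack metric the training scripts use for
# logging: it accumulates tensors between log steps and flattens them on
# compute_and_reset, so means and histograms cover the whole logging interval.
import torch


class Stack:
    def __init__(self):
        self.values = []

    def update(self, value):
        self.values.append(torch.as_tensor(value))

    def compute_and_reset(self):
        # Flatten to one 1-D tensor so scalar and (batch, horizon) updates mix.
        value = torch.cat([v.float().flatten() for v in self.values])
        self.values = []
        return value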
from all_the_tools.config import Config as C

config = C(
    seed=42,
    epochs=1000,
    log_interval=1000 // 50,
    train=C(
        num_labeled=250,
        x_batch_size=64,
        u_batch_size=64 * 7,
        u_weight=1.0,
        tau=0.95,
        opt=C(type="sgd", lr=0.03, momentum=0.9, weight_decay=5e-4),
        sched=C(type="cosine"),
    ),
    eval=C(batch_size=64),
)
from all_the_tools.config import Config as C

config = C(
    epochs=8 * 5,
    dataset="celeba",
    image_size=128,
    batch_size=64,
    latent_size=128,
    grow_min_level=1,
    opt=C(lr=1e-3, beta=(0.0, 0.99)),
)
from all_the_tools.config import Config as C

batch_size = 128
epochs = 1000

config = C(
    seed=42,
    epochs=epochs,
    log_interval=max(epochs // 200, 1),
    train=C(
        num_labeled=4000,
        x_batch_size=batch_size,
        u_batch_size=batch_size,
        student=C(dropout=0.35, opt=C(type="sgd", lr=0.3, momentum=0.9, weight_decay=5e-4)),
        teacher=C(dropout=0.5, opt=C(type="sgd", lr=0.125, momentum=0.9, weight_decay=5e-4)),
        sched=C(type="warmup_cosine"),
    ),
    eval=C(batch_size=batch_size),
)
from all_the_tools.config import Config as C

config = C(
    num_epochs=500,
    batch_size=64,
    image_size=256,
    loss="wass",
    opt=C(
        type="rmsprop",
        args=C(
            lr=0.00005,
        ),
    ),
    gen=C(
        base_channels=512,
        kernel_size=4,
    ),
    dsc=C(
        num_steps=5,
        weight_clip=0.01,
        base_channels=512,
        kernel_size=3,
    ),
)
from all_the_tools.config import Config as C

k = 0.5
epochs = 100
batch_size = 128

config = C(
    seed=42,
    epochs=epochs,
    log_interval=int(epochs * 0.1),
    model="resnet50",
    train=C(
        batch_size=int(batch_size * k),
        opt=C(type="sgd", lr=0.1 * k, momentum=0.9, weight_decay=1e-4),
        sched=C(type="multistep", epochs=[int(epochs * 0.6), int(epochs * 0.8)]),
    ),
    eval=C(batch_size=int(batch_size * k)),
)
from all_the_tools.config import Config as C

config = C(
    seed=42,
    train_steps=1000,
    resize_size=image_size,
    crop_size=image_size,
    dataset="coco",
    model=C(
        freeze_bn=batch_size < 16,
        backbone="resnet50",
        levels=[
            None,
            None,
            None,
            (0, 64),
            (64, 128),
            (128, 256),
            (256, 512),
            (512, float("inf")),
        ],
    ),
    train=C(
        epochs=90,
        batch_size=batch_size,
        acc_steps=acc_steps,
        opt=C(type="sgd", learning_rate=0.01, weight_decay=1e-4, momentum=0.9),
        sched=C(type="step", steps=[60, 80]),
    ),
    # sched=C(
    #     type='warmup_cosine',
    #     epochs_warmup=1)),
    eval=C(batch_size=batch_size * 2),
)
from all_the_tools.config import Config as C

config = C(
    num_epochs=500,
    batch_size=128,
    image_size=256,
    loss="bce",
    opt=C(
        type="adam",
        args=C(
            lr=0.0002,
            betas=(0.5, 0.999),
        ),
    ),
    gen=C(
        base_channels=512,
        kernel_size=4,
    ),
    dsc=C(
        num_steps=1,
        weight_clip=None,
        base_channels=512,
        kernel_size=3,
    ),
)
from all_the_tools.config import Config as C

config = C(
    num_epochs=100,
    batch_size=64,
    loss="wass",
    opt=C(
        type="rmsprop",
        args=C(
            lr=0.00005,
        ),
    ),
    dsc=C(
        num_steps=5,
        weight_clip=0.01,
    ),
)
from all_the_tools.config import Config as C

config = C(
    dataset="ffhq",
    num_epochs=10000,
    batches_in_epoch=512,
    batch_size=32,
    image_size=256,
    noise_size=noise_size,
    opt=C(
        type="adam",
        args=C(
            lr=0.002,
            betas=(0.0, 0.99),
            eps=1e-8,
        ),
    ),
    dsc=C(
        loss="logns",
        base_channels=32,
        max_channels=noise_size,
        reg_interval=16,
        r1_gamma=10,
        batch_std=4,
    ),
    gen=C(
        loss="logns",
        base_channels=32,
        max_channels=noise_size,
        reg_interval=8,
        pl_decay=0.01,
        pl_weight=1,  # TODO:
        ema=0.999,
    ),
)
import torch
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

from all_the_tools.config import Config as C

# Project-local helpers; import paths assumed.
import wrappers
from agent import Agent
from env import build_env
from history import History
from utils import n_step_bootstrapped_return, select_action, select_action_value
from vec_env import VecEnv


def main(**kwargs):
    config = C(
        horizon=32,
        discount=0.99,
        num_episodes=100000,
        num_workers=8,
        e_greedy_eps=0.9,
    )
    for k in kwargs:
        config[k] = kwargs[k]

    writer = SummaryWriter(config.experiment_path)

    env = VecEnv([build_env for _ in range(config.num_workers)])
    env = wrappers.TensorboardBatchMonitor(env, writer, log_interval=100, fps_mul=0.5)
    env = wrappers.Torch(env)

    model = Agent(env.observation_space, env.action_space)
    optimizer = torch.optim.RMSprop(model.parameters(), 1e-4 * config.num_workers)

    episode = 0
    pbar = tqdm(total=config.num_episodes)
    obs = env.reset()
    state = model.zero_state(config.num_workers)

    while episode < config.num_episodes:
        history = History()
        state = tuple(x.detach() for x in state)

        for i in range(config.horizon):
            transition = history.append_transition()
            action_value, state_prime = model(obs, state)
            action = select_action(action_value, eps=config.e_greedy_eps)
            transition.record(action_value_i=select_action_value(action_value, action))
            obs_prime, reward, done, info = env.step(action)
            transition.record(reward=reward, done=done)
            state_prime = model.reset_state(state_prime, done)
            obs, state = obs_prime, state_prime

            for i in info:
                if "episode" not in i:
                    continue
                episode += 1
                writer.add_scalar("episode/return", i["episode"]["r"], global_step=episode)
                writer.add_scalar("episode/length", i["episode"]["l"], global_step=episode)
                pbar.update()

        rollout = history.build()
        action_value_prime, _ = model(obs_prime, state_prime)
        action_prime = select_action(action_value_prime, eps=config.e_greedy_eps)
        return_ = n_step_bootstrapped_return(
            reward_t=rollout.reward,
            value_prime=select_action_value(action_value_prime, action_prime).detach(),
            done_t=rollout.done,
            discount=config.discount,
        )
        td_error = rollout.action_value_i - return_
        loss = td_error.pow(2)

        optimizer.zero_grad()
        loss.mean().backward()
        optimizer.step()

        writer.add_scalar("rollout/action_value_i", rollout.action_value_i.mean(), global_step=episode)
        writer.add_scalar("rollout/td_error", td_error.mean(), global_step=episode)
        writer.add_scalar("rollout/loss", loss.mean(), global_step=episode)

    env.close()
    writer.close()
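# A minimal sketch (assumed) of the action-selection helpers used by the
# Q-learning script above. Given e_greedy_eps=0.9 in its config, eps is read
# here as the probability of acting greedily (exploring with probability
# 1 - eps); the repo's actual convention may differ.
import torch


def select_action(action_value, eps):
    greedy = action_value.argmax(dim=-1)
    random = torch.randint(action_value.size(-1), size=greedy.size())
    explore = torch.rand(greedy.size()) >= eps
    return torch.where(explore, random, greedy)


def select_action_value(action_value, action):
    # Gather Q(s, a) for the selected actions.
    return action_value.gather(-1, action.long().unsqueeze(-1)).squeeze(-1)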
from all_the_tools.config import Config as C

config = C(
    seed=42,
    env="MountainCar-v0",
    episodes=100000,
    log_interval=100,
    transforms=[],
    gamma=0.99,
    entropy_weight=1e-2,
    adv_norm=False,
    grad_clip_norm=1.0,
    horizon=32,
    workers=32,
    model=C(encoder=C(type="fc", out_features=32), rnn=C(type="noop")),
    opt=C(type="adam", lr=1e-3),
)
from all_the_tools.config import Config as C

config = C(
    num_epochs=10000,
    batch_size=32,
    image_size=128,
    noise_size=256,
    opt=C(
        type="adam",
        args=C(
            lr=0.001,
            betas=(0.0, 0.99),
            eps=1e-8,
        ),
    ),
    dsc=C(
        loss="sp",
        num_steps=1,
        weight_clip=None,
        base_channels=16,
    ),
    gen=C(
        loss="sp",
        base_channels=16,
    ),
)
from all_the_tools.config import Config as C

config = C(
    seed=42,
    env="CartPole-v1",
    episodes=10000,
    log_interval=100,
    transforms=[],
    gamma=0.99,
    entropy_weight=1e-2,
    horizon=8,
    workers=32,
    model=C(encoder=C(type="fc", out_features=32)),
    opt=C(type="adam", lr=1e-3),
)
from all_the_tools.config import Config as C

config = C(
    dataset="wikiart",
    num_epochs=10000,
    batches_in_epoch=128,
    batch_size=64,
    image_size=128,
    noise_size=noise_size,
    r1_gamma=10,
    opt=C(
        type="adam",
        args=C(
            lr=0.0025,
            betas=(0.0, 0.99),
            eps=1e-8,
        ),
    ),
    dsc=C(
        loss="logns",
        base_channels=32,
        max_channels=noise_size,
        reg_interval=16,
        batch_std=8,
    ),
    gen=C(
        loss="logns",
        base_channels=32,
        max_channels=noise_size,
        reg_interval=8,
        ema=0.999,
    ),
)
from all_the_tools.config import Config as C

epochs = 50

config = C(
    seed=42,
    sample_rate=22050,
    model=C(num_mels=80, base_features=256),
    train=C(
        epochs=epochs,
        batch_size=32,
        clip_grad_norm=1.0,
        opt=C(type="adam", lr=1e-3, beta=(0.9, 0.999), eps=1e-6, weight_decay=1e-6),
        sched=C(type="warmup_cosine", epochs_warmup=0),
    ),
    eval=C(batch_size=32),
)