size=cfg["size"], max_ep_len=cfg["train"]["max_ep_len"], ) buffer = Buffer( num_env=num_env, maxlen=int(cfg["buffer"]["size"] / num_env), obs_shape=(4, ), device=cfg["buffer"]["device"], ) model = DQN(cfg["agent"]["rnn_size"]).cuda() pred = Predictor(buffer, cfg) if cfg["random"]: warmup = 1e8 else: cp = torch.load("models/dqn.pt") model.load_state_dict(cp) model.eval() pred.load() warmup = 0 actor = actor_iter(envs, model, pred, warmup, eps=0.01) reward = [] while len(reward) < 128: full_step = buffer.get_recent(2) step, hx, log = actor.send(full_step) buffer.append(step) if "reward" in log: reward.append(log["reward"]) wandb.log({"final_reward": np.mean(reward)})
from predictor import Predictor

if __name__ == "__main__":
    # Small CPU-only rollout used to watch a trained agent render in a terminal.
    cfg = load_cfg("default")
    cfg["env"] = "pol"
    num_env = cfg["agent"]["actors"]
    env = make_vec_envs(
        num=1,
        size=3,
        max_ep_len=cfg["train"]["max_ep_len"],
        seed=10,
    )
    model = DQN(cfg["agent"]["rnn_size"], device="cpu")
    pred = Predictor(None, cfg, device="cpu")
    actor = actor_iter(env, model, pred, 0, eps=0)
    buffer = Buffer(num_env=1, maxlen=2, obs_shape=(4,), device="cpu")

    # Restore the trained policy onto CPU before rolling out.
    checkpoint = torch.load("models/dqn.pt", map_location="cpu")
    model.load_state_dict(checkpoint)
    model.eval()
    pred.load()

    for _ in range(2000):
        full_step = buffer.get_recent(2, "cpu")
        step, hx, log_a = actor.send(full_step)
        buffer.append(step)
        # env.render()
        # Clear the terminal, then ask the first env worker to render in place.
        os.system("clear")
        env.remotes[0].send(('render', None))
        env.remotes[0].recv()
num_env = cfg["agent"]["actors"]
fstack = cfg["agent"]["frame_stack"]
envs = make_vec_envs(cfg["env"], num_env, cfg["seed"], cfg["train"]["max_ep_len"])

buffer = Buffer(
    num_env=num_env,
    maxlen=int(cfg["buffer"]["size"] / num_env),
    obs_shape=envs.observation_space.shape,
    device=cfg["buffer"]["device"],
)

# Policy network plus world-model components sharing the replay buffer.
model = DQN(envs.action_space.n, fstack).cuda().train()
wmse = WMSE(buffer, cfg)
pred = Predictor(buffer, wmse.encoder, envs.action_space.n, cfg)
learner = Learner(model, buffer, pred, cfg)
actor = actor_iter(
    envs, model, pred, cfg["buffer"]["warmup"], eps=cfg["agent"].get("eps")
)

# Schedule knobs: when training starts and how often each component runs/logs.
start_train = int(cfg["buffer"]["warmup"] / num_env)
log_every = cfg["train"]["log_every"]
train_every = cfg["train"]["learner_every"]
wmse_every = cfg["train"]["w_mse_every"]


def save():
    """Persist the DQN weights and the WMSE/predictor state to disk."""
    torch.save(model.state_dict(), "models/dqn.pt")
    wmse.save()
    pred.save()


# NOTE(review): the /4 presumably accounts for frame skip — confirm upstream.
count = trange(int(cfg["train"]["frames"] / 4 / num_env), smoothing=0.05)
for n_iter in count:
    full_step = buffer.get_recent(fstack + 1)
fstack = cfg["agent"]["frame_stack"] envs = make_vec_envs(cfg["env"], num_env, max_ep_len=cfg["train"]["max_ep_len"]) num_action = envs.action_space.n buffer = Buffer( num_env=num_env, maxlen=int(cfg["buffer"]["size"] / num_env), obs_shape=envs.observation_space.shape, device=cfg["buffer"]["device"], ) wmse = WMSE(buffer, cfg) idf = IDF(buffer=buffer, num_action=num_action) cpc = CPC(buffer=buffer, num_action=num_action) actor = actor_iter(envs, None, None, cfg["buffer"]["warmup"], eps=1) pretrain = int(cfg["buffer"]["warmup"] / num_env) for n_iter in trange(pretrain): step, hx, log = next(actor) buffer.append(step) # batch = 256 for i in trange(20000): cur_log = wmse.train() if i % 200 == 0: wandb.log(cur_log) torch.save(wmse.encoder.state_dict(), "models/conv_wmse.pt") # batch = 256 for i in trange(20000):
wandb.init(project="lwm", config=cfg)
num_env = cfg["agent"]["actors"]
fstack = cfg["agent"]["frame_stack"]
envs = make_vec_envs(cfg["env"], num_env, cfg["seed"])

buffer = Buffer(
    num_env=num_env,
    maxlen=int(cfg["buffer"]["size"] / num_env),
    obs_shape=envs.observation_space.shape,
    device=cfg["buffer"]["device"],
)
model = DQN(envs.action_space.n, fstack).cuda().train()
wmse = WMSE(buffer, cfg)
pred = Predictor(buffer, wmse.encoder, envs.action_space.n, cfg)

# Near-greedy rollout (tiny epsilon), no warmup: weights are restored below.
actor = actor_iter(envs, model, pred, 0, eps=0.001)
wmse.load()
pred.load()
checkpoint = torch.load("models/dqn.pt", map_location="cuda")
model.load_state_dict(checkpoint)
model.eval()

# Step until the first finished episode, log its reward, and stop.
while True:
    full_step = buffer.get_recent(fstack + 1)
    step, hx, log = actor.send(full_step)
    buffer.append(step)
    if "reward" in log:
        wandb.log({"final_reward": log["reward"]})
        break

wandb.save("models/dqn.pt")
envs = make_vec_envs( num=num_env, size=cfg["size"], max_ep_len=cfg["train"]["max_ep_len"], ) buffer = Buffer( num_env=num_env, maxlen=int(cfg["buffer"]["size"] / num_env), obs_shape=(4,), device=cfg["buffer"]["device"], ) model = DQN(cfg["agent"]["rnn_size"]).cuda().train() pred = Predictor(buffer, cfg) learner = Learner(model, buffer, pred, cfg) eps = cfg["agent"].get("eps") actor = actor_iter(envs, model, pred, cfg["buffer"]["warmup"], eps=eps) start_train = int(cfg["buffer"]["warmup"] / num_env) log_every = cfg["train"]["log_every"] train_every = cfg["train"]["learner_every"] count = trange(int(cfg["train"]["frames"] / num_env), smoothing=0.05) for n_iter in count: full_step = buffer.get_recent(2) step, hx, log = actor.send(full_step) buffer.append(step) if n_iter == start_train and cfg["add_ri"]: for i in trange(1000): cur_log = pred.train() if i % 100 == 0: