def __init__(self, gamma):
     self.s = []
     self.a = []
     self.r = []
     self.t = []
     self.ram = []
     self.gamma = gamma
     self.is_over = False
     self.device = get_device()
Example #2
 def __init__(self, gamma, tslice=4, snorm=255):
     self.s = []
     self.a = []
     self.r = []
     self.t = []
     self.ram = []
     self.gamma = gamma
     self.is_over = False
     self.device = get_device()
     self.tslice = tslice
     self.tslice_range = tint(np.arange(tslice) - tslice + 1)
     self.snorm = snorm
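
The tslice_range offsets (for tslice=4 they are [-3, -2, -1, 0]) are the usual frame-stacking indices: added to a time index they select the current frame plus the previous tslice - 1 frames, and snorm is the divisor used to rescale pixel values. A minimal sketch of that indexing in plain NumPy, with hypothetical frames and t standing in for the episode's stored states:

import numpy as np

tslice = 4
# Offsets covering the current frame and the tslice - 1 frames before it.
tslice_range = np.arange(tslice) - tslice + 1   # array([-3, -2, -1, 0])

# Hypothetical buffer of ten 84x84 frames.
frames = np.random.randint(0, 256, size=(10, 84, 84), dtype=np.uint8)

t = 7                                    # index of the "current" frame
stack = frames[t + tslice_range]         # shape (4, 84, 84): frames 4, 5, 6, 7
stack = stack.astype(np.float32) / 255   # snorm-style normalization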
Example #3
def fill_buffer_with_expert(env, replay_buffer):
    model_path = f"rainbow_atari_models/{ARGS.env_name}.pth"
    with open(model_path, "rb") as f:
        m = torch.load(f)

    device = mm.get_device()
    dqn = DQN(
        odict({
            "history_length": 4,
            "hidden_size": 256,
            "architecture": "data-efficient",
            "atoms": 51,
            "noisy_std": 0.1,
            "V_min": -10,
            "V_max": 10,
            "device": device,
        }), env.num_actions)
    dqn.load_state_dict(m)
    dqn.eval()
    dqn.to(device)

    rand_classes = np.zeros(replay_buffer.size)
    ram2class = {}
    totr = 0
    obs = env.reset()
    replay_buffer.new_episode(obs, env.enumber % 2)
    it = 0
    while replay_buffer.idx < replay_buffer.size - 10:
        action = dqn.act_e_greedy(torch.tensor(obs).float().to(device) / 255,
                                  epsilon=0.01)
        obs_ram = env.getRAM().tobytes()  # hashable key for the current emulator RAM state
        if obs_ram not in ram2class:
            ram2class[obs_ram] = np.random.randint(0, ARGS.num_rand_classes)
        rand_classes[replay_buffer.idx] = ram2class[obs_ram]
        obsp, r, done, tr = env.step(action)
        replay_buffer.add(obs, action, r, done, env.enumber % 2)
        obs = obsp
        totr += tr
        if done:
            totr = 0
            obs = env.reset()
            replay_buffer.new_episode(obs, env.enumber % 2)
        it += 1

    # Remove last episode from replay buffer, as it didn't end
    it = replay_buffer.idx
    curp = replay_buffer.p[it]
    while replay_buffer.p[it] == curp:
        replay_buffer._sumtree.set(it, 0)
        it -= 1
    print(f'went from {replay_buffer.idx} to {it} when deleting states')
    return rand_classes
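
A minimal sketch of how this buffer-filling helper might be driven; the environment factory and replay-buffer class names below are assumptions (the snippets only show their __init__ methods and the ARGS namespace they rely on):

# Hypothetical wiring; make_env, ReplayBuffer and ARGS come from the surrounding project.
env = make_env(ARGS.env_name)
replay_buffer = ReplayBuffer(seed=0, size=100_000)

# Fills the buffer with 0.01-greedy expert transitions and returns one random
# class label per stored state, keyed on the emulator RAM contents.
rand_classes = fill_buffer_with_expert(env, replay_buffer)
assert rand_classes.shape == (replay_buffer.size,)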
Example #4
 def __init__(self,
              seed,
              size,
              gamma=0.99,
              tslice=4,
              snorm=255,
              xdtype=torch.uint8):
     self.current_size = 0
     self.size = size
     self.device = get_device()
     self.rng = np.random.RandomState(seed)
     self.gamma = gamma
     self.hit_max = False
     self.current_episode = Episode(self.gamma, tslice=tslice, snorm=snorm)
     self.episodes = []
     self.tslice = tslice
     self.snorm = snorm
     self.xdtype = xdtype
 def __init__(self, seed, size, iwidth=84, near_strategy="both", extras=[]):
     self.rng = np.random.RandomState(seed)
     self.size = size
     self.near_strategy = near_strategy
     self.device = device = get_device()
     # Storing s,a,r,done,episode parity
     self.s = torch.zeros([size, iwidth, iwidth],
                          dtype=torch.uint8,
                          device=device)
     self.a = torch.zeros([size], dtype=torch.uint8, device=device)
     self.r = torch.zeros([size], dtype=torch.float32, device=device)
     self.t = torch.zeros([size], dtype=torch.uint8, device=device)
     self.p = torch.zeros([size], dtype=torch.uint8, device=device)
     self.idx = 0
     self.last_idx = 0
     self.maxidx = 0
     self.is_filling = True
     self._sumtree = SumTree(self.rng, size)
def fill_buffer_with_expert(dqn, env, replay_buffer):
  totr = 0
  obs = env.reset()
  it = 0
  device = mm.get_device()
  while not replay_buffer.hit_max:
    action = dqn.act_e_greedy(
        torch.tensor(obs).float().to(device) / 255, epsilon=0.01)
    obsp, r, done, tr = env.step(action)
    replay_buffer.add(obs, action, r, done)
    obs = obsp
    totr += tr
    if done:
      print("Done episode %d reward %d"%(totr, replay_buffer.current_size))
      totr = 0
      obs = env.reset()
      if args.test_run:
        break
    it += 1
  return dqn
def fill_buffer_with_expert(env, replay_buffer):
  model_path = f"rainbow_atari_models/{ARGS.env_name}.pth"
  with open(model_path, "rb") as f:
    m = torch.load(f)

  device = mm.get_device()
  dqn = DQN(
      odict({
          "history_length": 4,
          "hidden_size": 256,
          "architecture": "data-efficient",
          "atoms": 51,
          "noisy_std": 0.1,
          "V_min": -10,
          "V_max": 10,
          "device": device,
      }), env.num_actions)
  dqn.load_state_dict(m)
  dqn.eval()
  dqn.to(device)
  # RAM feature indices for this game (unused here; the ram_info call below is commented out)
  relevant_features = np.int32(
      sorted(atari_dict[env.env_name.replace("_", "").lower()].values()))

  totr = 0
  obs = env.reset()
  #for it in range(replay_buffer.size):
  it = 0
  while not replay_buffer.hit_max:
    action = dqn.act_e_greedy(
        torch.tensor(obs).float().to(device) / 255, epsilon=0.01)
    # obs_ram = env.getRAM()  # RAM capture disabled in this variant
    obsp, r, done, tr = env.step(action)
    replay_buffer.add(obs, action, r, done)
    # (the call previously also passed ram_info=obs_ram[relevant_features])
    obs = obsp
    totr += tr
    if done:
      print("Done episode %d reward %d"%(totr, replay_buffer.current_size))
      totr = 0
      obs = env.reset()
    it += 1
def load_expert(env_name, env):
  model_path = f"oracles/{env_name}.pth"
  with open(model_path, "rb") as f:
    m = torch.load(f)

  device = mm.get_device()
  dqn = DQN(
      odict({
          "history_length": 4,
          "hidden_size": 256,
          "architecture": "data-efficient",
          "atoms": 51,
          "noisy_std": 0.1,
          "V_min": -10,
          "V_max": 10,
          "device": device,
      }), env.num_actions)
  dqn.load_state_dict(m)
  dqn.eval()
  dqn.to(device)
  return dqn
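
Put together, load_expert and the fill_buffer_with_expert(dqn, env, replay_buffer) variant above suggest a two-step setup. A sketch under assumed names (make_env, ReplayBuffer, and the "pong" example are placeholders, not shown in these snippets):

env = make_env("pong")                              # hypothetical environment factory
replay_buffer = ReplayBuffer(seed=0, size=50_000, gamma=0.99)

dqn = load_expert("pong", env)                      # loads oracles/pong.pth onto the current device
fill_buffer_with_expert(dqn, env, replay_buffer)    # runs the 0.01-greedy expert until hit_max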
 def __init__(self,
              seed,
              size,
              value_callback=None,
              target_value_callback=None,
              Lambda=0.9,
              gamma=0.99,
              nbins=512):
     self.current_size = 0
     self.size = size
     self.device = get_device()
     self.rng = np.random.RandomState(seed)
     self.value_callback = value_callback
     self.target_value_callback = target_value_callback
     self.Lambda = Lambda
     self.gamma = gamma
     self.hit_max = False
     if self.Lambda > 0:
         self.lr = LambdaReturn(Lambda, gamma)
     self.current_episode = Episode(self.gamma)
     self.episodes = []
     self.vdiff_acc = np.zeros(nbins)
     self.vdiff_cnt = np.zeros(nbins)
     self.vdiff_bins = np.linspace(-2, 2, nbins - 1)
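
The vdiff_* arrays look like a running histogram over value differences clipped to [-2, 2]; with nbins - 1 edges, np.digitize yields nbins buckets including the two overflow buckets. One plausible update routine, sketched below; this helper is an assumption, not part of the snippet:

import numpy as np

nbins = 512
vdiff_acc = np.zeros(nbins)
vdiff_cnt = np.zeros(nbins)
vdiff_bins = np.linspace(-2, 2, nbins - 1)

def accumulate_vdiff(vdiff):
    # Hypothetical helper: bin a batch of value differences and track per-bin sums and counts.
    idx = np.digitize(vdiff, vdiff_bins)   # values below -2 fall in bin 0, above 2 in bin nbins - 1
    np.add.at(vdiff_acc, idx, vdiff)
    np.add.at(vdiff_cnt, idx, 1)

accumulate_vdiff(np.array([-0.5, 0.1, 3.0]))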