def restore(self):
    with open(
            "/home/yunke/prl_proj/panda_ws/src/franka_cal_sim/python/replay_buffer.txt",
            "r") as f:
        obs, actions, rewards, next_obs, terminals, weights = [], [], [], [], [], []
        # One transition per line: tab-separated columns, with observation
        # and action vectors stored as comma-joined floats.
        for line in f:
            cols = line.strip().split('\t')
            obs.append(np.array([float(v) for v in cols[0].split(',')]))
            actions.append(np.array([float(v) for v in cols[1].split(',')]))
            rewards.append(float(cols[2]))
            next_obs.append(np.array([float(v) for v in cols[3].split(',')]))
            # bool() of any non-empty string is True, so compare the literal.
            terminals.append(cols[4] == "True")
            weights.append(float(cols[5]))
    batch = SampleBatch({
        "obs": obs,
        "actions": actions,
        "rewards": rewards,
        "new_obs": next_obs,
        "dones": terminals,
        "weights": weights
    })
    # DEFAULT_POLICY_ID is assumed here: the snapshot file holds a single
    # policy's transitions.
    for i in range(len(obs)):
        self.replay_buffers[DEFAULT_POLICY_ID].add(
            pack_if_needed(obs[i]), actions[i], rewards[i],
            pack_if_needed(next_obs[i]), terminals[i])
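# A hedged sketch of the write-side counterpart to restore() above: one
# transition per line, tab-separated columns, vectors comma-joined, matching
# exactly the format restore() parses. persist() and the iteration over
# self.replay_buffers[DEFAULT_POLICY_ID]._storage are assumptions for
# illustration, not part of the original class.
def persist(self):
    path = "/home/yunke/prl_proj/panda_ws/src/franka_cal_sim/python/replay_buffer.txt"
    with open(path, "w") as f:
        for obs_t, action, reward, obs_tp1, done, weight in \
                self.replay_buffers[DEFAULT_POLICY_ID]._storage:
            f.write("\t".join([
                ",".join(str(v) for v in obs_t),
                ",".join(str(v) for v in action),
                str(reward),
                ",".join(str(v) for v in obs_tp1),
                str(done),
                str(weight),
            ]) + "\n")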
def step(self):
    with self.update_weights_timer:
        if self.remote_evaluators:
            weights = ray.put(self.local_evaluator.get_weights())
            for e in self.remote_evaluators:
                e.set_weights.remote(weights)

    with self.sample_timer:
        if self.remote_evaluators:
            batch = SampleBatch.concat_samples(
                ray_get_and_free(
                    [e.sample.remote() for e in self.remote_evaluators]))
        else:
            batch = self.local_evaluator.sample()

        # Handle everything as if multiagent
        if isinstance(batch, SampleBatch):
            batch = MultiAgentBatch({DEFAULT_POLICY_ID: batch}, batch.count)

        for policy_id, s in batch.policy_batches.items():
            for row in s.rows():
                self.replay_buffers[policy_id].add(
                    pack_if_needed(row["obs"]),
                    row["actions"],
                    row["rewards"],
                    pack_if_needed(row["new_obs"]),
                    row["dones"],
                    weight=None)

    if self.num_steps_sampled >= self.replay_starts:
        self._optimize()

    self.num_steps_sampled += batch.count
def step(self):
    with self.update_weights_timer:
        if self.remote_evaluators:
            weights = ray.put(self.local_evaluator.get_weights())
            for e in self.remote_evaluators:
                e.set_weights.remote(weights)

    with self.sample_timer:
        if self.remote_evaluators:
            batch = SampleBatch.concat_samples(
                ray.get(
                    [e.sample.remote() for e in self.remote_evaluators]))
        else:
            batch = self.local_evaluator.sample()

        # Handle everything as if multiagent
        if isinstance(batch, SampleBatch):
            batch = MultiAgentBatch({DEFAULT_POLICY_ID: batch}, batch.count)

        for policy_id, s in batch.policy_batches.items():
            for row in s.rows():
                self.replay_buffers[policy_id].add(
                    pack_if_needed(row["obs"]),
                    row["actions"],
                    row["rewards"],
                    pack_if_needed(row["new_obs"]),
                    row["dones"],
                    weight=None)

    if self.num_steps_sampled >= self.replay_starts:
        self._optimize()

    self.num_steps_sampled += batch.count
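# ray_get_and_free, used by several of the step() variants here (the variant
# directly above uses plain ray.get instead), behaves like ray.get but also
# frees the fetched object-store entries to bound memory use. A minimal
# sketch of the idea; the real helper in older RLlib versions
# (ray.rllib.utils.memory) batches the free() calls rather than issuing one
# per get.
import ray

def ray_get_and_free(object_ids):
    result = ray.get(object_ids)
    ray.internal.free(object_ids)  # eagerly release object-store memory
    return result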
def __call__(self, batch: SampleBatchType):
    for policy_id, s in batch.policy_batches.items():
        for row in s.rows():
            self.replay_buffers.buffers[policy_id].add(
                pack_if_needed(row["obs"]),
                row["actions"],
                row["rewards"],
                pack_if_needed(row["new_obs"]),
                row["dones"],
                weight=None)
    return batch
def __call__(self, batch: SampleBatchType):
    # Handle everything as if multiagent
    if isinstance(batch, SampleBatch):
        batch = MultiAgentBatch({DEFAULT_POLICY_ID: batch}, batch.count)
    for policy_id, s in batch.policy_batches.items():
        for row in s.rows():
            self.replay_buffers[policy_id].add(
                pack_if_needed(row["obs"]),
                row["actions"],
                row["rewards"],
                pack_if_needed(row["new_obs"]),
                row["dones"],
                weight=None)
    # Pass the batch through so downstream pipeline ops can consume it.
    return batch
def __call__(self, batch: SampleBatchType):
    for policy_id, s in batch.policy_batches.items():
        if policy_id in self.policies_to_train:
            for row in s.rows():
                if row["mode"] == MODE.best_response.value:
                    # Transition must be inserted in the reservoir buffer
                    self.reservoir_buffers.buffers[policy_id].add(
                        pack_if_needed(row["obs"]), row["actions"])
                    self.replay_buffers.steps[policy_id] += 1
                # Every row also goes to the replay buffer as a one-row batch.
                bb = SampleBatch({
                    'obs': row["obs"].reshape(1, -1),
                    'actions': row['actions'].reshape(1, -1),
                    'rewards': row['rewards'].reshape(1, -1),
                    'new_obs': row['new_obs'].reshape(1, -1),
                    'dones': np.array([row['dones']]),
                    "eps_id": np.array([row['eps_id']]),
                    'unroll_id': np.array([row['unroll_id']]),
                    'agent_index': np.array([row['agent_index']])
                })
                bb.compress(bulk=True)
                self.replay_buffers.buffers[policy_id].add_batch(bb)
                self.reservoir_buffers.steps[policy_id] += 1
    return batch
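# The row["mode"] checks above compare against MODE.best_response.value. A
# minimal MODE enum consistent with that usage; only the best_response name
# is taken from the code above, the member values and the second member are
# assumptions.
from enum import Enum

class MODE(Enum):
    best_response = 0
    average_policy = 1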
def step(self):
    with self.update_weights_timer:
        if self.workers.remote_workers():
            weights = ray.put(self.workers.local_worker().get_weights())
            for e in self.workers.remote_workers():
                e.set_weights.remote(weights)

    with self.sample_timer:
        if self.workers.remote_workers():
            batch = SampleBatch.concat_samples(
                ray_get_and_free([
                    e.sample.remote() for e in self.workers.remote_workers()
                ]))
        else:
            batch = self.workers.local_worker().sample()

        # Handle everything as if multiagent
        if isinstance(batch, SampleBatch):
            batch = MultiAgentBatch({DEFAULT_POLICY_ID: batch}, batch.count)

        for policy_id, s in batch.policy_batches.items():
            for row in s.rows():
                self.replay_buffers[policy_id].add(
                    pack_if_needed(row["obs"]),
                    row["actions"],
                    row["rewards"],
                    pack_if_needed(row["new_obs"]),
                    row["dones"],
                    None,
                    row["action_logp"],
                    # row["diversity_advantages"],
                    row["diversity_rewards"],
                    # row["diversity_value_targets"],
                    # row["my_logits"],
                    row["prev_actions"],
                    row["prev_rewards"])

    if self.num_steps_sampled >= self.replay_starts:
        self._optimize()

    self.num_steps_sampled += batch.count
def step(self, attention_score_dic=None):
    with self.update_weights_timer:
        if self.workers.remote_workers():
            weights = ray.put(self.workers.local_worker().get_weights())
            for e in self.workers.remote_workers():
                e.set_weights.remote(weights)

    with self.sample_timer:
        if self.workers.remote_workers():
            batch = SampleBatch.concat_samples(
                ray_get_and_free([
                    e.sample.remote() for e in self.workers.remote_workers()
                ]))
        else:
            batch = self.workers.local_worker().sample()

        # Handle everything as if multiagent
        if isinstance(batch, SampleBatch):
            batch = MultiAgentBatch({DEFAULT_POLICY_ID: batch}, batch.count)

        # Gamma Reward by LJJ (see the local history for the changes).
        for policy_id, s in batch.policy_batches.items():
            for row in s.rows():
                self.pre_replay_buffers[policy_id].add(
                    pack_if_needed(row["obs"]),
                    row["actions"],
                    row["rewards"],
                    pack_if_needed(row["new_obs"]),
                    row["dones"],
                    weight=None)

    if self.num_steps_presampled >= self.memory_thres:
        self._preprocess(batch, attention_score_dic)

    self.num_steps_presampled += batch.count
def step(self):
    with self.update_weights_timer:
        if self.workers.remote_workers():
            # !!!!! CHANGED FROM ORIGINAL !!!!
            # Does not sync policies we aren't training.
            weights = ray.put(self.workers.local_worker().get_weights(
                policies=self.workers_only_sync_policy_list))
            for e in self.workers.remote_workers():
                e.set_weights.remote(weights)

    with self.sample_timer:
        if self.workers.remote_workers():
            batch = SampleBatch.concat_samples(
                ray_get_and_free([
                    e.sample.remote() for e in self.workers.remote_workers()
                ]))
        else:
            batch = self.workers.local_worker().sample()

        # Handle everything as if multiagent
        if isinstance(batch, SampleBatch):
            batch = MultiAgentBatch({DEFAULT_POLICY_ID: batch}, batch.count)

        for policy_id, s in batch.policy_batches.items():
            for row in s.rows():
                self.replay_buffers[policy_id].add(
                    pack_if_needed(row["obs"]),
                    row["actions"],
                    row["rewards"],
                    pack_if_needed(row["new_obs"]),
                    row["dones"],
                    weight=None)

    if self.num_steps_sampled >= self.replay_starts:
        self._optimize()

    self.num_steps_sampled += batch.count
def step(self):
    with self.update_weights_timer:
        if self.remote_evaluators:
            weights = ray.put(self.local_evaluator.get_weights())
            for e in self.remote_evaluators:
                e.set_weights.remote(weights)

    with self.sample_timer:
        if self.remote_evaluators:
            batch = SampleBatch.concat_samples(
                ray.get(
                    [e.sample.remote() for e in self.remote_evaluators]))
        else:
            batch = self.local_evaluator.sample()

    for row in batch.rows():
        self.replay_buffer.add(
            pack_if_needed(row["obs"]),
            row["actions"],
            row["rewards"],
            pack_if_needed(row["new_obs"]),
            row["dones"],
            row["weights"])

    if len(self.replay_buffer) >= self.replay_starts:
        self._optimize()

    self.num_steps_sampled += batch.count
def add(self, **kwargs):
    assert len(kwargs) == len(self.expected_keys) and sorted(
        kwargs.keys()) == self.expected_keys
    # Compress any fields that support it before storing.
    for k in kwargs.keys():
        if k in self.can_pack_list:
            kwargs[k] = pack_if_needed(kwargs[k])
    data = [kwargs[k] for k in self.expected_keys]
    if len(self._storage) < self._maxsize:
        # Fill phase: append until the buffer reaches capacity.
        self._storage.append(data)
        self._est_size_bytes += sum(sys.getsizeof(d) for d in data)
    else:
        # Reservoir sampling: overwrite a uniformly random slot with
        # probability maxsize / (num_added + 1).
        idx = np.random.randint(0, self._num_added + 1)
        if idx < self._maxsize:
            self._storage[idx] = data
            self._evicted_hit_stats.push(self._hit_count[idx])
            self._hit_count[idx] = 0
    self._num_added += 1
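# Standalone toy illustrating the reservoir-sampling rule used by add()
# above: once the buffer fills, item number n replaces a uniformly random
# slot with probability maxsize / n, so every item seen so far remains in
# the buffer with equal probability. All names below are local to this
# sketch, not from the class above.
import numpy as np

def reservoir_insert(storage, maxsize, num_added, item):
    if len(storage) < maxsize:
        storage.append(item)           # fill phase
    else:
        idx = np.random.randint(0, num_added + 1)
        if idx < maxsize:              # keep with prob maxsize / (num_added + 1)
            storage[idx] = item
    return num_added + 1

storage, n = [], 0
for item in range(10000):
    n = reservoir_insert(storage, maxsize=100, num_added=n, item=item)
# Each of the 10000 items ends up in storage with probability ~= 100/10000.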
def __call__(self, batch: SampleBatchType):
    for policy_id, s in batch.policy_batches.items():
        if policy_id in self.policies_to_train:
            for row in s.rows():
                if row["mode"] == MODE.best_response.value:
                    # Transition must be inserted in the reservoir buffer
                    self.reservoir_buffers.buffers[policy_id].add(
                        pack_if_needed(row["obs"]), row["actions"])
                    self.replay_buffers.steps[policy_id] += 1
            # Insert each episode into the replay buffer as one batch.
            episode_ids = np.unique(s['eps_id'])
            for ep_id in episode_ids:
                sample_ids = np.where(s["eps_id"] == ep_id)
                bb = SampleBatch({
                    'obs': s["obs"][sample_ids],
                    'actions': s['actions'][sample_ids],
                    'rewards': s['rewards'][sample_ids],
                    'new_obs': s['new_obs'][sample_ids],
                    'dones': np.array(s['dones'][sample_ids]),
                    "eps_id": np.array(s['eps_id'][sample_ids]),
                    'unroll_id': np.array(s['unroll_id'][sample_ids]),
                    'agent_index': np.array(s['agent_index'][sample_ids])
                })
                bb.compress(bulk=True)
                self.replay_buffers.buffers[policy_id].add_batch(bb)
                self.reservoir_buffers.steps[policy_id] += bb.count
    return batch
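# Sketch of how store ops like the two __call__ variants above are typically
# driven: each sampled batch is pushed through the op, which writes its
# transitions into the buffers and returns the batch for downstream use.
# worker, store_op and optimize_fn are placeholders for illustration, not
# names taken from the snippets above.
def run_iteration(worker, store_op, optimize_fn, replay_starts, steps_sampled):
    batch = store_op(worker.sample())   # sample, then insert into buffers
    steps_sampled += batch.count
    if steps_sampled >= replay_starts:  # warm-up gate, as in the step() variants
        optimize_fn()
    return steps_sampled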