def _initialize(self, env):
    '''
    Inspect a gym-style env's observation/action spaces and cache the
    derived metadata on self (vector/visual dims, action dims,
    continuous/discrete flags, namedtuple obs types, EnvSpec).
    params:
        env: a gym-style environment exposing observation_space / action_space.
    '''
    # BUG FIX: action_space previously had to be (Box, Discrete), which made the
    # Tuple(Discrete) branch below unreachable; Tuple is now accepted as well.
    assert isinstance(env.observation_space, (Box, Discrete)) and isinstance(env.action_space, (Box, Discrete, Tuple)), 'action_space and observation_space must be one of available_type'
    # process observation
    ObsSpace = env.observation_space
    if isinstance(ObsSpace, Box):
        # only a 1-D Box contributes a vector dim; other ranks (e.g. images) contribute 0
        self.vector_dims = [ObsSpace.shape[0] if len(ObsSpace.shape) == 1 else 0]
        # self.obs_high = ObsSpace.high
        # self.obs_low = ObsSpace.low
    else:
        self.vector_dims = [int(ObsSpace.n)]
    if len(ObsSpace.shape) == 3:
        # rank-3 observation is treated as an image
        self.obs_type = 'visual'
        self.visual_dims = [list(ObsSpace.shape)]
    else:
        self.obs_type = 'vector'
        self.visual_dims = []
    self.vector_info_type = NamedTupleStaticClass.generate_obs_namedtuple(n_agents=self.n, item_nums=1 if self.obs_type == 'vector' else 0, name='vector')
    # BUG FIX: was name='vector', which collided with the vector namedtuple's name;
    # the Unity-side initialize_environment uses name='visual' for the visual counterpart.
    self.visual_info_type = NamedTupleStaticClass.generate_obs_namedtuple(n_agents=self.n, item_nums=1 if self.obs_type == 'visual' else 0, name='visual')
    # process action
    ActSpace = env.action_space
    if isinstance(ActSpace, Box):
        assert len(ActSpace.shape) == 1, 'if action space is continuous, the shape length of action must equal to 1'
        self.action_type = 'continuous'
        self._is_continuous = True
        self.a_dim = ActSpace.shape[0]
    elif isinstance(ActSpace, Tuple):
        # idiom: generator with all(...) instead of `all([...]) == True`
        assert all(isinstance(i, Discrete) for i in ActSpace), 'if action space is Tuple, each item in it must have type Discrete'
        self.action_type = 'Tuple(Discrete)'
        self._is_continuous = False
        # flattened action dim is the product of all branch sizes
        self.a_dim = int(np.asarray([i.n for i in ActSpace]).prod())
        discrete_action_dim_list = [i.n for i in ActSpace]
    else:
        self.action_type = 'discrete'
        self._is_continuous = False
        self.a_dim = env.action_space.n
        discrete_action_dim_list = [env.action_space.n]
    if not self._is_continuous:
        self.discrete_action_list = get_discrete_action_list(discrete_action_dim_list)
    self.reward_threshold = env.env.spec.reward_threshold  # reward threshold refer to solved
    self.EnvSpec = SingleAgentEnvArgs(
        obs_spec=ObsSpec(vector_dims=self.vector_dims, visual_dims=self.visual_dims),
        a_dim=self.a_dim,
        is_continuous=self._is_continuous,
        n_agents=self.n)
def normalize_vector_obs(self, func):
    '''
    Apply `func` to the 'vector' part of every stored obs and obs_
    namedtuple in the data buffer, rewriting the buffered lists in place.
    params:
        func: callable applied to each vector observation.
    '''
    assert 'obs' in self.data_buffer.keys(), "assert 'obs' in self.data_buffer.keys()"
    assert 'obs_' in self.data_buffer.keys(), "assert 'obs_' in self.data_buffer.keys()"
    for key in ('obs', 'obs_'):
        converted = [
            NamedTupleStaticClass.data_convert(func, item, keys=['vector'])
            for item in self.data_buffer[key]
        ]
        self.data_buffer[key] = converted
def _per_store(self, i: int, data: BatchExperiences) -> NoReturn:
    '''
    Push one experience into agent i's n-step staging queue, folding the new
    reward into the pending transitions and flushing matured ones to storage.
    params:
        i: agent index selecting which staging queue to use.
        data: the newest single-agent experience.
    '''
    # TODO: optimize
    q = self.queue[i]
    if len(q) == 0:  # staging queue empty: just stage the new experience
        q.append(data)
        return
    if len(q) == self.n:
        # queue reached n entries: the oldest transition has a full n-step
        # return accumulated, move it into the real buffer
        self._store_op(q.pop(0))
    if not NamedTupleStaticClass.check_equal(q[-1].obs_, data.obs):
        # Episode was truncated (non-regular done): discard the staged,
        # incomplete transitions so the buffer never holds sequences shorter
        # than n (done-terminated ones excepted — their (1-done)=0 makes the
        # inexact gamma exponent harmless).
        # NOTE(review): the original (Chinese) comment claimed the staged
        # experiences were stored before clearing, but the code discards them
        # — confirm the discard is intended.
        q.clear()
        q.append(data)
    else:
        _len = len(q)
        for j in range(_len):
            # fold the new reward, discounted by its distance, into every
            # pending transition, and advance their obs_/done to the newest
            q[j] = q[j]._replace(reward=q[j].reward + data.reward * (self.gamma**(_len - j)))
            q[j] = q[j]._replace(obs_=data.obs_)
            q[j] = q[j]._replace(done=data.done)
        q.append(data)
        if data.done:  # episode ended: flush the whole staging queue
            while q:  # (1-done) zeroes out any not-quite-n-step targets downstream
                self._store_op(q.pop())
def sample(self) -> BatchExperiences:
    '''
    Draw a random batch without replacement and transpose it from
    row-wise experiences to column-wise fields, i.e.
    [[s, a, r],[s, a, r]] -> [[s, s],[a, a],[r, r]].
    '''
    if self.is_lg_batch_size:
        n_sample = self.batch_size
    else:
        n_sample = self._size
    picked = np.random.choice(self._buffer[:self._size], size=n_sample, replace=False)
    return NamedTupleStaticClass.pack(picked.tolist())
def _data_process2dict(self, exps: BatchExperiences) -> BatchExperiences:
    '''
    Prepare a sampled batch for training: one-hot encode discrete actions,
    then run every field through self.data_convert.
    params:
        exps: the sampled batch of experiences.
    return:
        the converted batch.
    '''
    if not self.is_continuous:
        assert 'action' in exps._fields, "assert 'action' in exps._fields"
        one_hot_action = int2one_hot(exps.action.astype(np.int32), self.a_dim)
        exps = exps._replace(action=one_hot_action)
    assert 'obs' in exps._fields and 'obs_' in exps._fields, "'obs' in exps._fields and 'obs_' in exps._fields"
    return NamedTupleStaticClass.data_convert(self.data_convert, exps)
def _learn(self, function_dict: Dict) -> NoReturn:
    '''
    Run one learning pass over the on-policy data buffer: optionally add
    curiosity rewards, compute statistics, iterate sampled mini-batches
    through the provided train function, then write summaries and clear.
    params:
        function_dict: optional hooks —
            'calculate_statistics': called once before sampling.
            'train_function': called per mini-batch; returns summaries.
            'summary_dict': extra entries merged into the tensorboard summaries.
    '''
    _cal_stics = function_dict.get('calculate_statistics', lambda *args: None)
    _train = function_dict.get('train_function', lambda *args: None)  # training procedure
    _summary = function_dict.get('summary_dict', {})  # dict of values written to tensorboard
    self.intermediate_variable_reset()
    # self.data.normalize_vector_obs(self.normalize_vector_obs)
    if not self.is_continuous:
        # discrete actions are trained as one-hot vectors
        self.data.convert_action2one_hot(self.a_dim)
    if self.use_curiosity and not self.use_rnn:
        # compute intrinsic (curiosity) rewards over the whole episode and
        # fold them into the stored rewards
        curiosity_data = self.data.get_curiosity_data()
        curiosity_data = NamedTupleStaticClass.data_convert(self.data_convert, curiosity_data)
        cell_state = self.initial_cell_state(batch=self.n_agents)
        crsty_r, crsty_summaries = self.curiosity_model(curiosity_data, cell_state)
        self.data.update_reward(crsty_r.numpy())
        # self.data.r += crsty_r.numpy().reshape([self.data.eps_len, -1])
        self.summaries.update(crsty_summaries)
    _cal_stics()
    if self.use_rnn:
        all_data = self.data.sample_generater_rnn()
    else:
        all_data = self.data.sample_generater()
    for data, cell_state in all_data:
        data = NamedTupleStaticClass.data_convert(self.data_convert, data)
        cell_state = self.data_convert(cell_state)
        summaries = _train(data, cell_state)
        self.summaries.update(summaries)
    self.summaries.update(_summary)
    self.write_training_summaries(self.train_step, self.summaries)
    self.clear()
def get_all(self, return_index: bool = False) -> BatchExperiences:
    '''
    Fetch every experience in the priority tree, refresh the importance
    sampling weights, and return the packed batch (optionally with indexes).
    params:
        return_index: when True, also return the tree indexes.
    '''
    idxs, data_indx, p, data = self.tree.get_all()
    self.last_indexs = idxs
    if self.global_v and self.min_p < sys.maxsize:
        min_priority = self.min_p
    else:
        min_priority = p.min()
    self.IS_w = np.power(min_priority / p, self.beta)
    packed = NamedTupleStaticClass.pack(data.tolist())
    return (packed, idxs) if return_index else packed
def sample_generater(self, batch_size: int = None):
    '''
    create sampling data iterator without using rnn.
    params:
        batch_size: the batch size of training data; defaults to self.batch_size.
    return:
        yields (sampled data, (None,)) — the second item is a placeholder
        for the cell state used by the rnn variant.
    '''
    batch_size = batch_size or self.batch_size
    buffer = {}
    # flatten time-major storage: T * [B, N] => [T*B, N]
    for k in self.sample_data_type._fields:
        assert k in self.data_buffer.keys(), f"assert {k} in self.data_buffer.keys()"
        if isinstance(self.data_buffer[k][0], tuple):
            # namedtuple-valued field: concatenate each leaf across time
            buffer[k] = NamedTupleStaticClass.pack(self.data_buffer[k], func=np.concatenate)
            assert NamedTupleStaticClass.check_len(buffer[k], l=self.n_agents * self.eps_len), \
                f"shape of {k} not equal to {self.n_agents * self.eps_len}"
        else:
            buffer[k] = np.concatenate(self.data_buffer[k])
            assert buffer[k].shape[0] == self.n_agents * self.eps_len, \
                f"shape of {k} not equal to {self.n_agents * self.eps_len}"
    # shuffle once, then walk the permutation in contiguous mini-batches
    idxs = np.arange(self.eps_len * self.n_agents)
    np.random.shuffle(idxs)
    for i in range(0, self.eps_len * self.n_agents, batch_size * self.n_agents):
        _idxs = idxs[i:i + batch_size * self.n_agents]
        data = []
        for k in self.sample_data_type._fields:
            if isinstance(buffer[k], tuple):
                data.append(NamedTupleStaticClass.getbatchitems(buffer[k], _idxs))
            else:
                data.append(buffer[k][_idxs])
        yield self.sample_data_type._make(data), (None, )
def observation(self, observation: List[SingleModelInformation]):
    '''
    Rescale every visual observation (obs and obs_) from float [0, 1]
    to uint8 [0, 255] for each behavior, mutating the mapping in place.
    return:
        the same mapping with converted visual observations.
    '''
    def to_uint8(x):
        return np.asarray(x * 255).astype(np.uint8)

    def convert(visual):
        # plain arrays are converted directly; namedtuple bundles leaf-wise
        if isinstance(visual, np.ndarray):
            return to_uint8(visual)
        return NamedTupleStaticClass.data_convert(to_uint8, visual)

    for bn in self.behavior_names:
        new_obs = observation[bn].obs._replace(visual=convert(observation[bn].obs.visual))
        observation[bn] = observation[bn]._replace(obs=new_obs)
        new_obs_ = observation[bn].obs_._replace(visual=convert(observation[bn].obs_.visual))
        observation[bn] = observation[bn]._replace(obs_=new_obs_)
    return observation
def get_curiosity_data(self):
    '''
    Return the whole buffered episode as one BatchExperiences for the
    curiosity (intrinsic reward) module.
    '''
    # time-major to batch-major, then flatten: T * [B, N] => [B, T, N] => [B*T, N]
    def merge(x):
        return np.stack(x, axis=1).reshape(self.n_agents * self.eps_len, -1)

    data = {}
    for key in BatchExperiences._fields:
        assert key in self.data_buffer.keys(), f"assert {key} in self.data_buffer.keys()"
        stored = self.data_buffer[key]
        if isinstance(stored[0], tuple):
            # namedtuple-valued field: merge each leaf across time
            data[key] = NamedTupleStaticClass.pack(stored, func=merge)
            assert NamedTupleStaticClass.check_len(data[key], l=self.n_agents * self.eps_len), \
                f"shape of {key} not equal to {self.n_agents * self.eps_len}"
        else:
            data[key] = merge(stored)
            assert data[key].shape[0] == self.n_agents * self.eps_len, \
                f"shape of {key} not equal to {self.n_agents * self.eps_len}"
    return BatchExperiences(**data)
def _per_store(self, i: int, data: BatchExperiences) -> NoReturn:
    '''
    Append one experience to agent i's episode queue; flush the queue to
    storage whenever the episode is truncated (obs mismatch) or done.
    params:
        i: agent index selecting which episode queue to use.
        data: the newest single-agent experience.
    '''
    episode = self.queue[i]
    if not episode:
        episode.append(data)
        return
    if not NamedTupleStaticClass.check_equal(episode[-1].obs_, data.obs):
        # truncated episode: store what we have, then start a new one
        self._store_op(episode.copy())
        episode.clear()
        episode.append(data)
        return
    episode.append(data)
    if data.done:
        # regular episode end: store the completed episode and reset
        self._store_op(episode.copy())
        episode.clear()
def get_transitions(self, databuffer, data_name_list=['s', 'a', 'r', 's_', 'done']):
    '''
    Sample a batch from the replay buffer, one-hot encode discrete
    actions (preserving any leading batch/time dims), and convert it.
    params:
        databuffer: the replay buffer to sample from.
        data_name_list: unused; kept for interface compatibility.
    return:
        the converted batch.
    '''
    exps = databuffer.sample()  # draw a batch from the replay buffer
    if not self.is_continuous:
        assert 'action' in exps._fields, "assert 'action' in exps._fields"
        actions = exps.action.astype(np.int32)
        lead_shape = actions.shape
        # flatten, encode, then restore the leading dims with a trailing one-hot axis
        encoded = int2one_hot(actions.reshape(-1), self.a_dim).reshape(lead_shape + (-1, ))
        exps = exps._replace(action=encoded)
    return NamedTupleStaticClass.data_convert(self.data_convert, exps)
def sample(self, return_index: bool = False) -> Union[List, Tuple]:
    '''
    Prioritized sampling: stratify the total priority mass into equal
    slices, draw one priority per slice, fetch the matching experiences,
    and refresh the importance sampling weights.
    output: weights, [ss, visual_ss, as, rs, s_s, visual_s_s, dones]
    '''
    n_sample = self.batch_size if self.is_lg_batch_size else self._size
    # one uniform draw inside each of n_sample equal priority intervals
    edges = np.linspace(0, self.tree.total, n_sample + 1)
    ps = np.random.uniform(edges[:-1], edges[1:])
    idxs, data_indx, p, data = self.tree.get_batch_parallel(ps)
    self.last_indexs = idxs
    if self.global_v and self.min_p < sys.maxsize:
        min_priority = self.min_p
    else:
        min_priority = p.min()
    self.IS_w = np.power(min_priority / p, self.beta)
    packed = NamedTupleStaticClass.pack(data.tolist())
    return (packed, idxs) if return_index else packed
def sample(self) -> BatchExperiences:
    '''
    Sample trajectories for recurrent training: pick random trajectories,
    cut a random window of self.timestep steps from each, pad short
    windows, split off the burn-in prefix (stored on self.burn_in_data),
    and return the flattened training segment.
    '''
    n_sample = self.batch_size if self.is_lg_batch_size else self._size
    trajs = np.random.choice(self._buffer[:self._size], size=n_sample, replace=False)  # pick n_sample trajectories

    # padder factory: pre-pads/truncates each sequence to length l with value v -> [B, T, N]
    def f(v, l):  # [B, T, N]
        return lambda x: tf.keras.preprocessing.sequence.pad_sequences(
            x, padding='pre', dtype='float32', value=v, maxlen=l, truncating='pre')

    # cut a random window of at most self.timestep steps from a trajectory
    def truncate(traj):
        idx = np.random.randint(max(1, len(traj) - self.timestep + 1))  # [min, max)
        return traj[idx:idx + self.timestep]

    datas = []  # [B, variable-length timesteps, N]
    for traj in trajs:
        data = NamedTupleStaticClass.pack(truncate(traj))
        datas.append(data)
    sample_data = NamedTupleStaticClass.pack(datas)
    # pad 'done' with 1 (treat padded steps as terminal), everything else with 0
    sample_data = NamedTupleStaticClass.data_convert(
        f(v=1., l=self.timestep), sample_data, ['done'])  # [B, T, N]
    sample_data = NamedTupleStaticClass.data_convert(
        f(v=0., l=self.timestep), sample_data)  # [B, T, N]
    # leading burn_in_time_step steps warm up the rnn state; the rest trains
    burn_in_data = NamedTupleStaticClass.data_convert(
        lambda x: x[:, :self.burn_in_time_step], sample_data)
    train_data = NamedTupleStaticClass.data_convert(
        lambda x: x[:, self.burn_in_time_step:], sample_data)
    # flatten [B, T, ...] -> [B*T, ...] for both segments
    self.burn_in_data = NamedTupleStaticClass.data_convert(
        lambda x: tf.reshape(x, [-1, *x.shape[2:]]), burn_in_data)
    train_data = NamedTupleStaticClass.data_convert(
        lambda x: tf.reshape(x, [-1, *x.shape[2:]]), train_data)
    return train_data
def add(self, exps: BatchExperiences) -> NoReturn:
    '''
    Split a field-stacked batch ([s, s],[a, a],[r, r]) into individual
    experiences ([s, a, r],[s, a, r]) and store each one.
    '''
    singles = NamedTupleStaticClass.unpack(exps)
    for single in singles:
        self._store_op(single)
def add(self, exps: BatchExperiences) -> NoReturn:
    '''
    Split a field-stacked batch ([s, s],[a, a],[r, r]) into per-agent
    experiences ([s, a, r],[s, a, r]) and route each through its agent's
    staging queue.
    '''
    for agent_idx, single in enumerate(NamedTupleStaticClass.unpack(exps)):
        self._per_store(agent_idx, single)
def add(self, exps: BatchExperiences) -> NoReturn:
    '''
    Unpack the batched experiences and hand each one, with its agent
    index, to the per-agent store routine.
    '''
    unpacked = NamedTupleStaticClass.unpack(exps)
    for idx, exp in enumerate(unpacked):
        self._per_store(idx, exp)
def initialize_environment(self):
    '''
    Initialize the (Unity ML-Agents style) environment and collect the
    necessary metadata: per-behavior agent counts, observation indexes and
    dims (vector vs visual), action dims, discrete action lists, empty
    action tuples, and — for multi-agent setups — env copy counts.
    '''
    self.behavior_names = list(self.env.behavior_specs.keys())
    self.is_multi_agents = len(self.behavior_names) > 1
    self.first_bn = self.behavior_names[0]
    self.first_fbn = self.first_bn.replace('?', '_')  # filesystem/summary-safe variant of the name

    self.behavior_agents = defaultdict(int)    # behavior name -> number of agents
    self.behavior_ids = defaultdict(dict)      # behavior name -> agent_id -> index
    self.vector_idxs = defaultdict(list)       # indexes of rank-1 observations per behavior
    self.vector_dims = defaultdict(list)
    self.visual_idxs = defaultdict(list)       # indexes of rank-3 (image) observations per behavior
    self.visual_dims = defaultdict(list)
    self.a_dim = defaultdict(int)
    self.discrete_action_lists = {}
    self.is_continuous = {}
    self.empty_actiontuples = {}

    self.vector_info_type = {}
    self.visual_info_type = {}

    self.env.reset()
    for bn, spec in self.env.behavior_specs.items():
        # d: decision steps (agents awaiting actions); t: terminal steps
        d, t = self.env.get_steps(bn)
        self.behavior_agents[bn] = len(d)
        self.behavior_ids[bn] = d.agent_id_to_index

        for i, shape in enumerate(spec.observation_shapes):
            if len(shape) == 1:
                self.vector_idxs[bn].append(i)
                self.vector_dims[bn].append(shape[0])
            elif len(shape) == 3:
                self.visual_idxs[bn].append(i)
                self.visual_dims[bn].append(list(shape))
            else:
                raise ValueError(
                    "shape of observation cannot be understood.")
        self.vector_info_type[
            bn] = NamedTupleStaticClass.generate_obs_namedtuple(
                n_agents=self.behavior_agents[bn],
                item_nums=len(self.vector_idxs[bn]),
                name='vector')
        self.visual_info_type[
            bn] = NamedTupleStaticClass.generate_obs_namedtuple(
                n_agents=self.behavior_agents[bn],
                item_nums=len(self.visual_idxs[bn]),
                name='visual')

        action_spec = spec.action_spec
        if action_spec.is_continuous():
            self.a_dim[bn] = action_spec.continuous_size
            self.discrete_action_lists[bn] = None
            self.is_continuous[bn] = True
        elif action_spec.is_discrete():
            # flattened discrete dim is the product of all branch sizes
            self.a_dim[bn] = int(
                np.asarray(action_spec.discrete_branches).prod())
            self.discrete_action_lists[bn] = get_discrete_action_list(
                action_spec.discrete_branches)
            self.is_continuous[bn] = False
        else:
            raise NotImplementedError(
                "doesn't support continuous and discrete actions simultaneously for now."
            )

        self.empty_actiontuples[bn] = action_spec.empty_action(
            n_agents=self.behavior_agents[bn])

    if self.is_multi_agents:
        # behavior names are assumed to be "<controls>#<name>" — TODO confirm
        self.behavior_controls = defaultdict(int)
        for bn in self.behavior_names:
            self.behavior_controls[bn] = int(bn.split('#')[0])
        self.env_copys = self.behavior_agents[
            self.first_bn] // self.behavior_controls[self.first_bn]
def sample_generater_rnn(self, batch_size: int = None, rnn_time_step: int = None):
    '''
    create rnn sampling data iterator.
    params:
        batch_size: number of windows per yielded batch; defaults to self.batch_size.
        rnn_time_step: the length of time slide window
    return:
        yields (sampled data flattened to [B*T, N], cell states [B, N]).
    '''
    batch_size = batch_size or self.batch_size
    rnn_time_step = rnn_time_step or self.rnn_time_step
    # TODO: episode switches without a `done` flag need rigorous handling
    # T * [B, 1] => [T, B] => [B, T]
    done = np.asarray(self.data_buffer['done']).squeeze().transpose((1, 0))
    B, T = done.shape
    # done_dict: batch row -> time indexes where done is set
    done_dict = defaultdict(list)
    for i, j in zip(*np.where(done)):
        done_dict[i].append(j)
    available_sample_range = defaultdict(list)
    count = 0  # upper bound on the number of non-overlapping segments
    for i in range(B):
        idxs = [-1] + done_dict[i] + [T - 1]
        for x, y in zip(idxs[:-1], idxs[1:]):
            if y - rnn_time_step + 1 > x:
                # valid start range for a full window inside (x, y]
                # (original comment: "左开右开" — open at both ends; the range is
                # fed to random.randint, which is inclusive — TODO confirm intent)
                available_sample_range[i].append(
                    [x + 1, y - rnn_time_step + 1])
                count += (y - x) // 2  # NOTE(review): heuristic segment count — verify
    # prevent total_eps_num is smaller than batch_size
    # NOTE(review): if count == 0 this loops batch_size down to 0 and the
    # division below raises ZeroDivisionError — confirm count > 0 is guaranteed
    while batch_size > count:
        batch_size //= 2
    for _ in range(count // batch_size):
        samples = []
        sample_cs = []
        for i in range(batch_size):  # B
            # pick a random row, then a random valid window start within it
            batch_idx = random.choice(list(available_sample_range.keys()))
            sample_range = random.choice(available_sample_range[batch_idx])
            time_idx = random.randint(*sample_range)
            sample_exp = {}
            for k in self.sample_data_type._fields:
                assert k in self.data_buffer.keys(
                ), f"assert {k} in self.data_buffer.keys()"
                d = self.data_buffer[k][time_idx:time_idx + rnn_time_step]  # T * [B, N]
                if isinstance(self.data_buffer[k][0], tuple):
                    d = [
                        NamedTupleStaticClass.getitem(_d, batch_idx)
                        for _d in d
                    ]
                    sample_exp[k] = NamedTupleStaticClass.pack(d)  # [T, N]
                else:
                    d = [_d[batch_idx] for _d in d]
                    sample_exp[k] = np.asarray(d)
            samples.append(self.sample_data_type(**sample_exp))
            # cell state at the window's first step, one entry per state tensor
            sample_cs.append(
                (cs[time_idx][batch_idx] for cs in self.cell_state_buffer))
        cs = tuple(np.asarray(x) for x in zip(*sample_cs))  # [B, N]
        yield NamedTupleStaticClass.pack(
            samples, func=np.concatenate), cs  # [B*T, N]
def get_all(self) -> BatchExperiences:
    '''
    Return every stored experience, packed field-wise into one
    BatchExperiences.
    '''
    stored = self._buffer[:self._size]
    return NamedTupleStaticClass.pack(stored.tolist())
def add(self, exps: BatchExperiences) -> NoReturn:
    '''
    Unpack the field-stacked batch into individual experiences and hand
    them to add_batch in one call.
    input: [ss, visual_ss, as, rs, s_s, visual_s_s, dones]
    '''
    singles = list(NamedTupleStaticClass.unpack(exps))
    self.add_batch(singles)