コード例 #1
0
    def _initialize(self, env):
        '''
        Inspect ``env`` and record its observation/action layout on ``self``.

        Sets vector_dims, visual_dims, obs_type, vector_info_type,
        visual_info_type, action_type, _is_continuous, a_dim,
        discrete_action_list (discrete actions only), reward_threshold and
        EnvSpec.

        params:
            env: environment exposing observation_space, action_space and
                 env.spec.reward_threshold
        raises:
            AssertionError: if a space has an unsupported type or shape
        '''
        # Tuple is accepted for actions because the Tuple(Discrete) branch below handles it;
        # it was unreachable while the check only allowed Box/Discrete.
        assert isinstance(env.observation_space, (Box, Discrete)) and isinstance(env.action_space, (Box, Discrete, Tuple)), 'action_space and observation_space must be one of available_type'
        # process observation
        ObsSpace = env.observation_space
        if isinstance(ObsSpace, Box):
            # a Box that is not 1-D contributes a vector dimension of 0
            self.vector_dims = [ObsSpace.shape[0] if len(ObsSpace.shape) == 1 else 0]
        else:
            self.vector_dims = [int(ObsSpace.n)]
        if len(ObsSpace.shape) == 3:
            self.obs_type = 'visual'
            self.visual_dims = [list(ObsSpace.shape)]
        else:
            self.obs_type = 'vector'
            self.visual_dims = []

        self.vector_info_type = NamedTupleStaticClass.generate_obs_namedtuple(n_agents=self.n,
                                                                              item_nums=1 if self.obs_type == 'vector' else 0,
                                                                              name='vector')
        # the visual container is named 'visual' (it was 'vector' by copy-paste)
        self.visual_info_type = NamedTupleStaticClass.generate_obs_namedtuple(n_agents=self.n,
                                                                              item_nums=1 if self.obs_type == 'visual' else 0,
                                                                              name='visual')

        # process action
        ActSpace = env.action_space
        if isinstance(ActSpace, Box):
            assert len(ActSpace.shape) == 1, 'if action space is continuous, the shape length of action must equal to 1'
            self.action_type = 'continuous'
            self._is_continuous = True
            self.a_dim = ActSpace.shape[0]
        elif isinstance(ActSpace, Tuple):
            assert all(isinstance(i, Discrete) for i in ActSpace), 'if action space is Tuple, each item in it must have type Discrete'
            self.action_type = 'Tuple(Discrete)'
            self._is_continuous = False
            discrete_action_dim_list = [i.n for i in ActSpace]
            # the joint action count is the product of the branch sizes
            self.a_dim = int(np.asarray(discrete_action_dim_list).prod())
        else:
            self.action_type = 'discrete'
            self._is_continuous = False
            self.a_dim = env.action_space.n
            discrete_action_dim_list = [env.action_space.n]
        if not self._is_continuous:
            self.discrete_action_list = get_discrete_action_list(discrete_action_dim_list)

        self.reward_threshold = env.env.spec.reward_threshold  # reward threshold refer to solved
        self.EnvSpec = SingleAgentEnvArgs(
            obs_spec=ObsSpec(vector_dims=self.vector_dims,
                             visual_dims=self.visual_dims),
            a_dim=self.a_dim,
            is_continuous=self._is_continuous,
            n_agents=self.n
        )
コード例 #2
0
ファイル: on_policy_buffer.py プロジェクト: zhijie-ai/RLs
 def normalize_vector_obs(self, func):
     '''
     Apply ``func`` to the 'vector' entries of every stored observation.

     The 'obs' and 'obs_' lists in ``self.data_buffer`` are rebuilt element
     by element through ``NamedTupleStaticClass.data_convert``.

     params:
         func: callable handed to ``data_convert`` for the 'vector' key
     '''
     buf = self.data_buffer
     assert 'obs' in buf.keys(), "assert 'obs' in self.data_buffer.keys()"
     assert 'obs_' in buf.keys(), "assert 'obs_' in self.data_buffer.keys()"
     for name in ('obs', 'obs_'):
         converted = []
         for item in buf[name]:
             converted.append(
                 NamedTupleStaticClass.data_convert(func, item, keys=['vector']))
         buf[name] = converted
コード例 #3
0
ファイル: replay_buffer.py プロジェクト: zhijie-ai/RLs
    def _per_store(self, i: int, data: BatchExperiences) -> NoReturn:
        '''
        Push one experience into the i-th N-step staging queue and move
        experiences from that queue into the main buffer via ``self._store_op``.

        params:
            i: index into ``self.queue`` selecting the staging queue
            data: the newest experience; its obs, obs_, reward and done fields are read
        '''
        # TODO: optimize
        q = self.queue[i]
        if len(q) == 0:  # the N-step staging queue is empty, so just add the experience
            q.append(data)
            return

        # the queue already holds self.n items: the oldest one is complete, move it to the buffer
        # NOTE(review): if self.n == 1 this pop empties q and q[-1] below raises IndexError -- confirm n > 1 is guaranteed
        if len(q) == self.n:
            self._store_op(q.pop(0))
        if not NamedTupleStaticClass.check_equal(
                q[-1].obs_,
                data.obs):  # the new obs does not continue the last staged obs_ (cut off without a regular done): staged items are dropped and the queue restarts from data. NOTE(review): the original comment said the staged items get stored first, but the code only clears them -- confirm intent
            q.clear(
            )  # original author's rationale: keeps sequences shorter than N out of the buffer, except those ending in done, where (1-done) is 0 so an inexact gamma exponent does not matter
            q.append(data)
        else:
            _len = len(q)
            for j in range(_len):  # fold the new step into every staged experience before the new one is appended
                # add the new reward discounted by gamma ** (distance from staged item j to the new step)
                q[j] = q[j]._replace(reward=q[j].reward + data.reward *
                                     (self.gamma**(_len - j)))
                q[j] = q[j]._replace(obs_=data.obs_)
                q[j] = q[j]._replace(done=data.done)
            q.append(data)
            if data.done:  # done or not # the new experience is terminal, so flush the whole staging queue
                while q:  # original author's note: (1-done) zeroes out the otherwise incorrect n-step term
                    self._store_op(q.pop())
コード例 #4
0
ファイル: replay_buffer.py プロジェクト: zhijie-ai/RLs
 def sample(self) -> BatchExperiences:
     '''
     Draw experiences without replacement and pack them field-wise, i.e.
     change [[s, a, r],[s, a, r]] to [[s, s],[a, a],[r, r]].

     The draw size is ``self.batch_size`` when ``self.is_lg_batch_size`` is
     truthy, otherwise ``self._size``.
     '''
     if self.is_lg_batch_size:
         n_sample = self.batch_size
     else:
         n_sample = self._size
     stored = self._buffer[:self._size]
     picked = np.random.choice(stored, size=n_sample, replace=False)
     return NamedTupleStaticClass.pack(picked.tolist())
コード例 #5
0
ファイル: off_policy.py プロジェクト: zhijie-ai/RLs
 def _data_process2dict(self, exps: BatchExperiences) -> BatchExperiences:
     '''
     Prepare a sampled batch for training.

     For discrete action spaces the 'action' field is replaced by its one-hot
     encoding over ``self.a_dim`` classes; the batch is then passed through
     ``NamedTupleStaticClass.data_convert`` with ``self.data_convert``.

     params:
         exps: batch with at least the fields 'obs' and 'obs_'
               (and 'action' when actions are discrete)
     '''
     # TODO: optimize
     if not self.is_continuous:
         assert 'action' in exps._fields, "assert 'action' in exps._fields"
         int_actions = exps.action.astype(np.int32)
         exps = exps._replace(action=int2one_hot(int_actions, self.a_dim))
     field_names = exps._fields
     assert 'obs' in field_names and 'obs_' in field_names, "'obs' in exps._fields and 'obs_' in exps._fields"
     return NamedTupleStaticClass.data_convert(self.data_convert, exps)
コード例 #6
0
    def _learn(self, function_dict: Dict) -> NoReturn:
        '''
        Run one training pass over the data collected in ``self.data``.

        params:
            function_dict: optional hooks and extras --
                'calculate_statistics': called once, with no arguments, before batches are drawn
                'train_function': called as f(data, cell_state) for every batch
                'summary_dict': entries merged into the written summaries

        Only the summaries returned for the last batch are merged into
        ``self.summaries``. If no batch is produced, or the train function
        returns nothing, that merge is skipped.
        '''
        _cal_stics = function_dict.get('calculate_statistics', lambda *args: None)
        _train = function_dict.get('train_function', lambda *args: None)    # training step
        _summary = function_dict.get('summary_dict', {})    # dict of values written to tensorboard

        self.intermediate_variable_reset()

        if not self.is_continuous:
            self.data.convert_action2one_hot(self.a_dim)

        if self.use_curiosity and not self.use_rnn:
            curiosity_data = self.data.get_curiosity_data()
            curiosity_data = NamedTupleStaticClass.data_convert(self.data_convert, curiosity_data)
            cell_state = self.initial_cell_state(batch=self.n_agents)
            crsty_r, crsty_summaries = self.curiosity_model(curiosity_data, cell_state)
            self.data.update_reward(crsty_r.numpy())
            self.summaries.update(crsty_summaries)

        _cal_stics()

        if self.use_rnn:
            all_data = self.data.sample_generater_rnn()
        else:
            all_data = self.data.sample_generater()

        # Bound up front so an empty generator cannot leave the name undefined below.
        summaries = None
        for data, cell_state in all_data:
            data = NamedTupleStaticClass.data_convert(self.data_convert, data)
            cell_state = self.data_convert(cell_state)
            summaries = _train(data, cell_state)

        # dict.update(None) raises TypeError; the default train hook returns None.
        if summaries:
            self.summaries.update(summaries)
        self.summaries.update(_summary)

        self.write_training_summaries(self.train_step, self.summaries)

        self.clear()
コード例 #7
0
ファイル: replay_buffer.py プロジェクト: zhijie-ai/RLs
 def get_all(self, return_index: bool = False) -> BatchExperiences:
     '''
     Fetch everything held by ``self.tree`` as one packed batch.

     Side effects: stores the fetched indices on ``self.last_indexs`` and the
     weights ``(_min_p / p) ** self.beta`` on ``self.IS_w``.

     params:
         return_index: when True, return ``(batch, indices)`` instead of ``batch``
     '''
     idxs, _, p, data = self.tree.get_all()
     self.last_indexs = idxs
     if self.global_v and self.min_p < sys.maxsize:
         _min_p = self.min_p
     else:
         _min_p = p.min()
     self.IS_w = np.power(_min_p / p, self.beta)
     packed = NamedTupleStaticClass.pack(data.tolist())
     return (packed, idxs) if return_index else packed
コード例 #8
0
ファイル: on_policy_buffer.py プロジェクト: zhijie-ai/RLs
    def sample_generater(self, batch_size: int = None):
        '''
        create sampling data iterator without using rnn.

        Every field of ``self.sample_data_type`` is concatenated over the
        stored steps, the rows are shuffled once, and the shuffled rows are
        yielded in chunks of ``batch_size * self.n_agents``.

        params:
            batch_size: the batch size of training data; falls back to
                        ``self.batch_size`` when falsy
        return:
            generator of ``(sample_data_type instance, (None, ))`` pairs.
        '''
        batch_size = batch_size or self.batch_size
        n_rows = self.eps_len * self.n_agents
        chunk = batch_size * self.n_agents

        # T * [B, N] => [T*B, N]
        flat = {}
        for k in self.sample_data_type._fields:
            assert k in self.data_buffer.keys(), f"assert {k} in self.data_buffer.keys()"
            steps = self.data_buffer[k]
            if not isinstance(steps[0], tuple):
                flat[k] = np.concatenate(steps)
                assert flat[k].shape[0] == self.n_agents * self.eps_len, \
                    f"shape of {k} not equal to {self.n_agents * self.eps_len}"
                continue
            flat[k] = NamedTupleStaticClass.pack(steps, func=np.concatenate)
            assert NamedTupleStaticClass.check_len(flat[k], l=self.n_agents * self.eps_len), \
                f"shape of {k} not equal to {self.n_agents * self.eps_len}"

        order = np.arange(n_rows)
        np.random.shuffle(order)
        for start in range(0, n_rows, chunk):
            rows = order[start:start + chunk]
            picked = [
                NamedTupleStaticClass.getbatchitems(flat[k], rows)
                if isinstance(flat[k], tuple) else flat[k][rows]
                for k in self.sample_data_type._fields
            ]
            yield self.sample_data_type._make(picked), (None, )
コード例 #9
0
ファイル: wrappers.py プロジェクト: zhijie-ai/RLs
    def observation(self, observation: List[SingleModelInformation]):
        '''
        Convert the visual part of every behavior's ``obs`` and ``obs_`` to uint8.

        Each visual value is multiplied by 255 and cast to ``np.uint8``. A bare
        ndarray is converted directly; anything else is routed through
        ``NamedTupleStaticClass.data_convert``.

        params:
            observation: container indexed by behavior name; entries are
                         updated in place and the container is returned
        '''
        def to_uint8(x):
            return np.asarray(x * 255).astype(np.uint8)

        def convert(visual):
            if isinstance(visual, np.ndarray):
                return to_uint8(visual)
            return NamedTupleStaticClass.data_convert(to_uint8, visual)

        for bn in self.behavior_names:
            # 'obs' is processed before 'obs_', one write-back per field
            for field in ('obs', 'obs_'):
                part = getattr(observation[bn], field)
                updated = part._replace(visual=convert(part.visual))
                observation[bn] = observation[bn]._replace(**{field: updated})

        return observation
コード例 #10
0
ファイル: on_policy_buffer.py プロジェクト: zhijie-ai/RLs
    def get_curiosity_data(self):
        '''
        Return the data used by the curiosity mechanism.

        Every field of ``BatchExperiences`` is taken from ``self.data_buffer``,
        stacked along axis 1 and reshaped to ``n_agents * eps_len`` rows.
        '''
        n_rows = self.n_agents * self.eps_len

        # T * [B, N] => [B, T, N] => [B*T, N]
        def flatten(x):
            return np.stack(x, axis=1).reshape(n_rows, -1)

        data = {}
        for k in BatchExperiences._fields:
            assert k in self.data_buffer.keys(), f"assert {k} in self.data_buffer.keys()"
            steps = self.data_buffer[k]
            if not isinstance(steps[0], tuple):
                data[k] = flatten(steps)
                assert data[k].shape[0] == self.n_agents * self.eps_len, \
                    f"shape of {k} not equal to {self.n_agents * self.eps_len}"
                continue
            data[k] = NamedTupleStaticClass.pack(steps, func=flatten)
            assert NamedTupleStaticClass.check_len(data[k], l=self.n_agents * self.eps_len), \
                f"shape of {k} not equal to {self.n_agents * self.eps_len}"
        return BatchExperiences(**data)
コード例 #11
0
ファイル: replay_buffer.py プロジェクト: zhijie-ai/RLs
 def _per_store(self, i: int, data: BatchExperiences) -> NoReturn:
     '''
     Append ``data`` to the i-th trajectory queue and hand a copy of the queue
     to ``self._store_op`` whenever a trajectory ends.

     A trajectory ends when ``data.obs`` does not match the ``obs_`` of the
     last queued experience (the old queue is stored, ``data`` starts a new
     one) or when ``data.done`` is truthy (``data`` is included, then the
     queue is stored and cleared). An experience placed into an empty queue
     is only queued, whatever its ``done`` flag.
     '''
     traj = self.queue[i]
     continues = len(traj) != 0 and NamedTupleStaticClass.check_equal(
         traj[-1].obs_, data.obs)
     if not continues:
         if len(traj) != 0:
             # the new experience does not follow the queued ones
             self._store_op(traj.copy())
             traj.clear()
         traj.append(data)
         return
     traj.append(data)
     if data.done:
         self._store_op(traj.copy())
         traj.clear()
コード例 #12
0
 def get_transitions(self,
                     databuffer,
                     data_name_list=['s', 'a', 'r', 's_', 'done']):
     '''
     Sample a batch from ``databuffer`` and convert it for training.

     For discrete action spaces the 'action' field is cast to int32 and
     one-hot encoded over ``self.a_dim`` classes, keeping its leading shape.

     params:
         databuffer: object whose ``sample()`` returns the batch
         data_name_list: not used by this method
     '''
     batch = databuffer.sample()  # draw from the experience buffer
     if self.is_continuous:
         return NamedTupleStaticClass.data_convert(self.data_convert, batch)

     assert 'action' in batch._fields, "assert 'action' in exps._fields"
     actions = batch.action.astype(np.int32)
     leading_shape = actions.shape
     one_hot = int2one_hot(actions.reshape(-1), self.a_dim)
     batch = batch._replace(action=one_hot.reshape(leading_shape + (-1, )))
     return NamedTupleStaticClass.data_convert(self.data_convert, batch)
コード例 #13
0
ファイル: replay_buffer.py プロジェクト: zhijie-ai/RLs
 def sample(self, return_index: bool = False) -> Union[List, Tuple]:
     '''
     Draw a batch from ``self.tree``: the range [0, tree.total] is cut into
     equal intervals and one value is drawn uniformly from each.

     Side effects: stores the drawn indices on ``self.last_indexs`` and the
     weights ``(_min_p / p) ** self.beta`` on ``self.IS_w``.

     params:
         return_index: when True, return ``(batch, indices)`` instead of ``batch``
     output: [ss, visual_ss, as, rs, s_s, visual_s_s, dones]
     '''
     if self.is_lg_batch_size:
         n_sample = self.batch_size
     else:
         n_sample = self._size
     edges = np.linspace(0, self.tree.total, n_sample + 1)
     ps = np.random.uniform(edges[:-1], edges[1:])
     idxs, _, p, data = self.tree.get_batch_parallel(ps)
     self.last_indexs = idxs
     if self.global_v and self.min_p < sys.maxsize:
         _min_p = self.min_p
     else:
         _min_p = p.min()
     self.IS_w = np.power(_min_p / p, self.beta)
     packed = NamedTupleStaticClass.pack(data.tolist())
     return (packed, idxs) if return_index else packed
コード例 #14
0
ファイル: replay_buffer.py プロジェクト: zhijie-ai/RLs
    def sample(self) -> BatchExperiences:
        '''
        Sample trajectories, cut a random window of at most ``self.timestep``
        steps out of each, pad the windows to ``self.timestep`` and split them
        at ``self.burn_in_time_step``.

        The leading part is kept on ``self.burn_in_data``; the trailing part is
        returned. Both have their first two axes merged.
        '''
        n_sample = self.batch_size if self.is_lg_batch_size else self._size
        # pick n_sample trajectories
        trajs = np.random.choice(self._buffer[:self._size],
                                 size=n_sample,
                                 replace=False)

        def make_padder(fill_value, max_len):  # [B, T, N]
            def pad(x):
                return tf.keras.preprocessing.sequence.pad_sequences(
                    x,
                    padding='pre',
                    dtype='float32',
                    value=fill_value,
                    maxlen=max_len,
                    truncating='pre')
            return pad

        def random_window(traj):
            upper = max(1, len(traj) - self.timestep + 1)
            start = np.random.randint(upper)  # start lies in [0, upper)
            return traj[start:start + self.timestep]

        def leading_steps(x):
            return x[:, :self.burn_in_time_step]

        def trailing_steps(x):
            return x[:, self.burn_in_time_step:]

        def merge_first_two_axes(x):
            return tf.reshape(x, [-1, *x.shape[2:]])

        convert = NamedTupleStaticClass.data_convert

        # [B, variable number of time steps, N]
        windows = [NamedTupleStaticClass.pack(random_window(traj)) for traj in trajs]

        batch = NamedTupleStaticClass.pack(windows)
        # 'done' is padded with 1 first; the second pass (no keys given) pads with 0
        batch = convert(make_padder(1., self.timestep), batch, ['done'])  # [B, T, N]
        batch = convert(make_padder(0., self.timestep), batch)  # [B, T, N]

        burn_in_part = convert(leading_steps, batch)
        train_part = convert(trailing_steps, batch)

        self.burn_in_data = convert(merge_first_two_axes, burn_in_part)
        return convert(merge_first_two_axes, train_part)
コード例 #15
0
ファイル: replay_buffer.py プロジェクト: zhijie-ai/RLs
 def add(self, exps: BatchExperiences) -> NoReturn:
     '''
     Split a field-major batch into single experiences and store each one,
     i.e. change [s, s],[a, a],[r, r] to [s, a, r],[s, a, r].
     '''
     singles = NamedTupleStaticClass.unpack(exps)
     for single in singles:
         self._store_op(single)
コード例 #16
0
ファイル: replay_buffer.py プロジェクト: zhijie-ai/RLs
 def add(self, exps: BatchExperiences) -> NoReturn:
     '''
     Split a field-major batch into single experiences (change
     [s, s],[a, a],[r, r] to [s, a, r],[s, a, r]) and pass each one, with its
     position in the batch, to ``self._per_store``.
     '''
     singles = NamedTupleStaticClass.unpack(exps)
     for slot, single in enumerate(singles):
         self._per_store(slot, single)
コード例 #17
0
ファイル: replay_buffer.py プロジェクト: zhijie-ai/RLs
 def add(self, exps: BatchExperiences) -> NoReturn:
     '''
     Unpack ``exps`` into single experiences and pass each one, with its
     position in the batch, to ``self._per_store``.
     '''
     singles = NamedTupleStaticClass.unpack(exps)
     for slot, single in enumerate(singles):
         self._per_store(slot, single)
コード例 #18
0
ファイル: wrappers.py プロジェクト: zhijie-ai/RLs
    def initialize_environment(self):
        '''
        Initialize the environment and collect the information needed later,
        such as observation and action dimensions, for each behavior.

        The environment is reset once, then every behavior spec is inspected:
        rank-1 observation shapes are recorded as vector inputs, rank-3 shapes
        as visual inputs, and the action spec as continuous or discrete.

        raises:
            ValueError: an observation shape is neither rank 1 nor rank 3
            NotImplementedError: an action spec is neither purely continuous
                                 nor purely discrete
        '''
        self.behavior_names = list(self.env.behavior_specs.keys())
        self.is_multi_agents = len(self.behavior_names) > 1
        self.first_bn = self.behavior_names[0]
        self.first_fbn = self.first_bn.replace('?', '_')

        # per-behavior bookkeeping, keyed by behavior name
        self.behavior_agents = defaultdict(int)
        self.behavior_ids = defaultdict(dict)
        self.vector_idxs = defaultdict(list)
        self.vector_dims = defaultdict(list)
        self.visual_idxs = defaultdict(list)
        self.visual_dims = defaultdict(list)
        self.a_dim = defaultdict(int)
        self.discrete_action_lists = {}
        self.is_continuous = {}
        self.empty_actiontuples = {}

        self.vector_info_type = {}
        self.visual_info_type = {}

        self.env.reset()
        for bn, spec in self.env.behavior_specs.items():
            decision_steps, _ = self.env.get_steps(bn)
            self.behavior_agents[bn] = len(decision_steps)
            self.behavior_ids[bn] = decision_steps.agent_id_to_index

            # sort every observation into vector (rank 1) or visual (rank 3)
            for obs_idx, shape in enumerate(spec.observation_shapes):
                rank = len(shape)
                if rank == 1:
                    self.vector_idxs[bn].append(obs_idx)
                    self.vector_dims[bn].append(shape[0])
                elif rank == 3:
                    self.visual_idxs[bn].append(obs_idx)
                    self.visual_dims[bn].append(list(shape))
                else:
                    raise ValueError(
                        "shape of observation cannot be understood.")

            self.vector_info_type[bn] = NamedTupleStaticClass.generate_obs_namedtuple(
                n_agents=self.behavior_agents[bn],
                item_nums=len(self.vector_idxs[bn]),
                name='vector')
            self.visual_info_type[bn] = NamedTupleStaticClass.generate_obs_namedtuple(
                n_agents=self.behavior_agents[bn],
                item_nums=len(self.visual_idxs[bn]),
                name='visual')

            action_spec = spec.action_spec
            if action_spec.is_continuous():
                self.a_dim[bn] = action_spec.continuous_size
                self.discrete_action_lists[bn] = None
                self.is_continuous[bn] = True
            elif action_spec.is_discrete():
                branches = action_spec.discrete_branches
                self.a_dim[bn] = int(np.asarray(branches).prod())
                self.discrete_action_lists[bn] = get_discrete_action_list(
                    action_spec.discrete_branches)
                self.is_continuous[bn] = False
            else:
                raise NotImplementedError(
                    "doesn't support continuous and discrete actions simultaneously for now."
                )

            self.empty_actiontuples[bn] = action_spec.empty_action(
                n_agents=self.behavior_agents[bn])

        if not self.is_multi_agents:
            return

        # the integer before the first '#' in a behavior name is stored as its control count
        self.behavior_controls = defaultdict(int)
        for bn in self.behavior_names:
            prefix = bn.split('#')[0]
            self.behavior_controls[bn] = int(prefix)
        first_agents = self.behavior_agents[self.first_bn]
        self.env_copys = first_agents // self.behavior_controls[self.first_bn]
コード例 #19
0
ファイル: on_policy_buffer.py プロジェクト: zhijie-ai/RLs
    def sample_generater_rnn(self,
                             batch_size: int = None,
                             rnn_time_step: int = None):
        '''
        create rnn sampling data iterator.

        Windows of ``rnn_time_step`` consecutive steps are drawn at random so
        that no window extends past a step where ``done`` is set. Every yielded
        batch holds ``batch_size`` windows together with the cell states stored
        at each window's first step.

        params:
            batch_size: number of windows per batch; falls back to
                        ``self.batch_size`` when falsy, and is halved until it
                        no longer exceeds the number of available segments
            rnn_time_step: the length of time slide window; falls back to
                           ``self.rnn_time_step`` when falsy
        return:
            generator of ``(sampled data, cell states)`` pairs.
        '''
        batch_size = batch_size or self.batch_size
        rnn_time_step = rnn_time_step or self.rnn_time_step

        # TODO: episode switches that are not caused by done need rigorous handling
        # T * [B, 1] => [T, B] => [B, T]
        # reshape, not a bare squeeze(): squeeze() also drops the batch (or time)
        # axis when it has length 1, and the transpose below then fails.
        done = np.asarray(self.data_buffer['done'])
        done = done.reshape(done.shape[0], -1).transpose((1, 0))
        B, T = done.shape
        done_dict = defaultdict(list)
        for i, j in zip(*np.where(done)):
            done_dict[i].append(j)

        available_sample_range = defaultdict(list)
        count = 0  # upper bound on the number of non-overlapping segments
        for i in range(B):
            idxs = [-1] + done_dict[i] + [T - 1]
            for x, y in zip(idxs[:-1], idxs[1:]):
                if y - rnn_time_step + 1 > x:
                    # inclusive range of valid window start indices (random.randint includes both ends)
                    available_sample_range[i].append(
                        [x + 1, y - rnn_time_step + 1])
                    count += (y - x) // 2

        # prevent total_eps_num is smaller than batch_size
        # NOTE(review): when count is 0, batch_size is halved down to 0 and the
        # division below raises ZeroDivisionError -- confirm callers never hit this.
        while batch_size > count:
            batch_size //= 2

        candidate_rows = list(available_sample_range.keys())
        for _ in range(count // batch_size):
            samples = []
            sample_cs = []
            for i in range(batch_size):  # B
                batch_idx = random.choice(candidate_rows)
                sample_range = random.choice(available_sample_range[batch_idx])
                time_idx = random.randint(*sample_range)

                sample_exp = {}
                for k in self.sample_data_type._fields:
                    assert k in self.data_buffer.keys(
                    ), f"assert {k} in self.data_buffer.keys()"
                    d = self.data_buffer[k][time_idx:time_idx +
                                            rnn_time_step]  # T * [B, N]
                    if isinstance(self.data_buffer[k][0], tuple):
                        d = [
                            NamedTupleStaticClass.getitem(_d, batch_idx)
                            for _d in d
                        ]
                        sample_exp[k] = NamedTupleStaticClass.pack(d)  # [T, N]
                    else:
                        d = [_d[batch_idx] for _d in d]
                        sample_exp[k] = np.asarray(d)
                samples.append(self.sample_data_type(**sample_exp))

                # Materialize right away: a lazy generator expression would read
                # time_idx/batch_idx only when consumed after this loop, so every
                # sample would get the cell state of the last drawn window.
                sample_cs.append(
                    tuple(state_buf[time_idx][batch_idx]
                          for state_buf in self.cell_state_buffer))
            cs = tuple(np.asarray(x) for x in zip(*sample_cs))  # [B, N]
            yield NamedTupleStaticClass.pack(
                samples, func=np.concatenate), cs  # [B*T, N]
コード例 #20
0
ファイル: replay_buffer.py プロジェクト: zhijie-ai/RLs
 def get_all(self) -> BatchExperiences:
     '''
     Pack the first ``self._size`` entries of ``self._buffer`` into one batch.
     '''
     filled = self._buffer[:self._size]
     return NamedTupleStaticClass.pack(filled.tolist())
コード例 #21
0
ファイル: replay_buffer.py プロジェクト: zhijie-ai/RLs
 def add(self, exps: BatchExperiences) -> NoReturn:
     '''
     Unpack ``exps`` into a list of single experiences and hand the whole
     list to ``self.add_batch``.

     input: [ss, visual_ss, as, rs, s_s, visual_s_s, dones]
     '''
     singles = list(NamedTupleStaticClass.unpack(exps))
     self.add_batch(singles)