Ejemplo n.º 1
0
def print_data_bundle(data_bundle: DataBundle, title: str = None):
    """Log a short summary of every dataset held by ``data_bundle``.

    For each ``(name, dataset)`` pair yielded by
    ``data_bundle.iter_datasets()`` this logs, at INFO level, the dataset
    name, its length, the slice ``dataset[:5]`` and the value returned by
    ``dataset.print_field_meta()``.

    @params:
        data_bundle - the DataBundle whose datasets are reported.
        title - optional heading; when truthy it is logged first, at
            WARNING level.
    """
    if title:
        logger.warning(title)

    emit = logger.info
    for ds_name, ds in data_bundle.iter_datasets():
        name_line = 'dataset name : {}'.format(ds_name)
        emit(name_line)

        size_line = 'dataset len : {}'.format(len(ds))
        emit(size_line)

        emit('dataset example : ')
        preview = ds[:5]
        emit('\n{}'.format(preview))

        # The header is logged before print_field_meta() is invoked, so that
        # anything the call may emit itself appears under the header.
        emit('dataset 输出各个field的被设置成input和target的情况 : ')
        field_meta = ds.print_field_meta()
        emit('\n{}'.format(field_meta))
Ejemplo n.º 2
0
    def process(self, data_bundle: DataBundle):
        """Turn the raw datasets in ``data_bundle`` into indexed model inputs.

        Steps, in order:

        1. Build an identity tag map from the tags returned by
           ``get_data_bundle_tags`` and pass it to ``self._granularize``.
        2. Tokenize the ``raw_chars`` field into ``chars``.
        3. If ``self.bigrams`` / ``self.trigrams`` are set, derive ``bigrams``
           / ``trigrams`` fields from ``Const.CHAR_INPUT``.
        4. Index every input field collected so far.
        5. Add a sequence-length field (``Const.INPUT_LEN``).
        6. Register the input and target fields.

        Each dataset must provide a ``raw_chars`` field, which the
        tokenization step reads. A ``target`` label field is presumably
        required as well (it is registered as the target here) -- confirm
        against ``_granularize`` and ``get_data_bundle_tags``.

        .. csv-table::
            :header: "raw_chars", "target"

            "raw text of one document ...", "its class label"
            "...", "..."

        :param data_bundle: the DataBundle whose datasets are processed.
        :return: the processed DataBundle, with ``Const.INPUT_LEN`` plus the
            character field (and any n-gram fields) set as input and
            ``Const.TARGET`` set as target.
        """

        def _bigrams(chars):
            # Pair each token with its successor; '<eos>' completes the pair
            # for the final token so the output has one entry per token.
            return [c1 + c2 for c1, c2 in zip(chars, chars[1:] + ['<eos>'])]

        def _trigrams(chars):
            # Window of three tokens; zip() stops at len(chars), so surplus
            # '<eos>' padding on short sequences is ignored.
            return [
                c1 + c2 + c3
                for c1, c2, c3 in zip(chars,
                                      chars[1:] + ['<eos>'],
                                      chars[2:] + ['<eos>'] * 2)
            ]

        # The tag map is derived from the data rather than from a fixed
        # table; every tag maps to itself.
        targets_vocabs = get_data_bundle_tags(data_bundle)
        self.tag_map = {tag_name: tag_name for tag_name in targets_vocabs}
        data_bundle = self._granularize(data_bundle=data_bundle,
                                        tag_map=self.tag_map)

        # No clean / lower-case step is performed before tokenization.
        data_bundle = self._tokenize(data_bundle=data_bundle,
                                     field_name='raw_chars',
                                     new_field_name='chars')
        input_field_names = [Const.CHAR_INPUT]

        # Optional n-gram features built from the character field.
        if self.bigrams:
            for _, dataset in data_bundle.iter_datasets():
                dataset.apply_field(_bigrams,
                                    field_name=Const.CHAR_INPUT,
                                    new_field_name='bigrams')
            input_field_names.append('bigrams')
        if self.trigrams:
            for _, dataset in data_bundle.iter_datasets():
                dataset.apply_field(_trigrams,
                                    field_name=Const.CHAR_INPUT,
                                    new_field_name='trigrams')
            input_field_names.append('trigrams')

        # Index every input field, not only the characters: the n-gram fields
        # are registered as inputs below and would otherwise remain raw
        # strings.
        # NOTE(review): assumes _indexize accepts a list of field names (as
        # fastNLP's helper does) -- confirm against its definition.
        data_bundle = _indexize(data_bundle=data_bundle,
                                input_field_names=input_field_names)

        # Sequence length is computed from the character field.
        for dataset in data_bundle.datasets.values():
            dataset.add_seq_len(field_name=Const.CHAR_INPUT,
                                new_field_name=Const.INPUT_LEN)

        # Const.TARGET is registered as a target only, not as an input.
        data_bundle.set_input(Const.INPUT_LEN, *input_field_names)
        data_bundle.set_target(Const.TARGET)

        return data_bundle