Ejemplo n.º 1
0
    def process(self, data_bundle: DataBundle) -> DataBundle:
        """Run the preprocessing steps on ``data_bundle`` and return it.

        Steps, in order:

        1. ``self.convert_tag`` is applied to the ``Const.TARGET`` field of
           every dataset, overwriting that field.
        2. ``_add_words_field`` is called with ``lower=self.lower``.
        3. When ``self.word_shape`` is truthy, ``word_shape`` is applied to
           ``raw_words`` to fill ``word_shapes``, which is marked as input.
        4. Every digit character inside the items of ``Const.INPUT`` is
           replaced by ``'0'``.
        5. ``_indexize`` is called for the ``'target'`` field with
           ``self.vocabulary``.
        6. ``add_seq_len(Const.INPUT)`` is called on every dataset, then the
           input and target fields are registered on the bundle.

        :param data_bundle: bundle whose datasets are processed.
        :return: the same ``data_bundle`` object that was passed in.
        """

        def _zero_digits(tokens):
            # Build a new list in which each token has its digits masked as '0'.
            masked = []
            for token in tokens:
                masked.append(
                    ''.join('0' if ch.isdigit() else ch for ch in token))
            return masked

        # The dataset key is not needed here, only the dataset itself.
        for _, dataset in data_bundle.datasets.items():
            dataset.apply_field(
                self.convert_tag,
                field_name=Const.TARGET,
                new_field_name=Const.TARGET,
            )

        _add_words_field(data_bundle, lower=self.lower)

        if self.word_shape:
            data_bundle.apply_field(
                word_shape,
                field_name='raw_words',
                new_field_name='word_shapes',
            )
            data_bundle.set_input('word_shapes')

        data_bundle.apply_field(
            _zero_digits,
            field_name=Const.INPUT,
            new_field_name=Const.INPUT,
        )

        # NOTE(review): presumably indexes the tag field with the pipe's own
        # vocabulary -- confirm against _indexize.
        _indexize(
            data_bundle,
            target_field_names=['target'],
            vocabulary=self.vocabulary,
        )

        for _, dataset in data_bundle.datasets.items():
            dataset.add_seq_len(Const.INPUT)

        data_bundle.set_input(Const.TARGET, Const.INPUT, Const.INPUT_LEN)
        data_bundle.set_target(Const.TARGET, Const.INPUT_LEN)
        return data_bundle
Ejemplo n.º 2
0
    def process(self, data_bundle: DataBundle) -> DataBundle:
        """
        Process a DataBundle whose DataSets have fields shaped like

        .. csv-table::
           :header: "raw_words", "target"

           "[Nadim, Ladki]", "[B-PER, I-PER]"
           "[AL-AIN, United, Arab, ...]", "[B-LOC, B-LOC, I-LOC, ...]"
           "[...]", "[...]"

        :param ~fastNLP.DataBundle data_bundle: every DataSet in the bundle
            must contain ``raw_words`` and the tag field, both ``List[str]``.
            The bundle is modified in place.
            NOTE(review): the original note called the tag field ``ner`` while
            the table above and the code use ``target`` -- confirm which name
            callers must provide.
        :return DataBundle: the same bundle that was passed in.
        """

        def _mask_digits(words):
            # Each word keeps its length; only digit characters become '0'.
            return [
                ''.join('0' if symbol.isdigit() else symbol for symbol in word)
                for word in words
            ]

        # Convert the tags of every dataset, overwriting the target field.
        for _key, ds in data_bundle.datasets.items():
            ds.apply_field(self.convert_tag,
                           field_name=Const.TARGET,
                           new_field_name=Const.TARGET)

        _add_words_field(data_bundle, lower=self.lower)

        # Optional extra input derived from the raw words.
        if self.word_shape:
            data_bundle.apply_field(word_shape,
                                    field_name='raw_words',
                                    new_field_name='word_shapes')
            data_bundle.set_input('word_shapes')

        # Replace every digit with 0.
        data_bundle.apply_field(_mask_digits,
                                field_name=Const.INPUT,
                                new_field_name=Const.INPUT)

        # Index the bundle using the helper's default arguments.
        _indexize(data_bundle)

        for _key, ds in data_bundle.datasets.items():
            ds.add_seq_len(Const.INPUT)

        data_bundle.set_input(Const.TARGET, Const.INPUT, Const.INPUT_LEN)
        data_bundle.set_target(Const.TARGET, Const.INPUT_LEN)

        return data_bundle
Ejemplo n.º 3
0
 def process(self, data_bundle):
     """Index the bundle, derive the extra fields and register inputs/targets.

     The bundle is passed through ``_add_words_field`` and then ``_indexize``
     (with ``Const.INPUT``, ``"pos"`` and ``"deprel"`` as input field names).
     For every dataset, ``add_seq_len(Const.INPUT)`` is called and the
     ``"asp"`` field is indexed into ``"aspect"`` with the ``Const.INPUT``
     vocabulary. ``self._get_post`` and ``self._get_mask`` then fill the
     ``"post"`` and ``"aspmask"`` fields.

     :param data_bundle: the bundle to process.
     :return: the bundle returned by ``_indexize`` after the steps above.
         NOTE(review): presumably the same object as the argument -- confirm
         against the helpers.
     """
     bundle = _add_words_field(data_bundle)
     bundle = _indexize(
         data_bundle=bundle,
         input_field_names=[Const.INPUT, "pos", "deprel"],
     )

     for _, ds in bundle.datasets.items():
         ds.add_seq_len(Const.INPUT)
         # The aspect terms are indexed with the same vocabulary as the words.
         word_vocab = bundle.get_vocab(Const.INPUT)
         word_vocab.index_dataset(ds, field_name="asp", new_field_name="aspect")

     bundle.apply(self._get_post, new_field_name="post")
     bundle.apply(self._get_mask, new_field_name="aspmask")

     model_inputs = (
         Const.INPUT, Const.INPUT_LEN, "pos", "dephead", "deprel",
         "aspect", "fidx", "tidx", "post", "aspmask",
     )
     bundle.set_input(*model_inputs)
     bundle.set_target(Const.TARGET)
     return bundle