Example #1
    def process(self, data_bundle: DataBundle) -> DataBundle:

        # convert the tag scheme of the target field in every dataset
        for name, dataset in data_bundle.datasets.items():
            dataset.apply_field(self.convert_tag,
                                field_name=Const.TARGET,
                                new_field_name=Const.TARGET)

        # add the words field (optionally lower-cased) derived from raw_words
        _add_words_field(data_bundle, lower=self.lower)

        if self.word_shape:
            data_bundle.apply_field(word_shape,
                                    field_name='raw_words',
                                    new_field_name='word_shapes')
            data_bundle.set_input('word_shapes')

        # replace every digit character with 0
        data_bundle.apply_field(lambda chars: [
            ''.join(['0' if c.isdigit() else c for c in char])
            for char in chars
        ],
                                field_name=Const.INPUT,
                                new_field_name=Const.INPUT)

        # build vocabularies and map words / targets to indices
        _indexize(data_bundle,
                  target_field_names=['target'],
                  vocabulary=self.vocabulary)
        input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN]
        target_fields = [Const.TARGET, Const.INPUT_LEN]

        for name, dataset in data_bundle.datasets.items():
            dataset.add_seq_len(Const.INPUT)

        data_bundle.set_input(*input_fields)
        data_bundle.set_target(*target_fields)
        return data_bundle
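
As a side note, the digit normalisation applied to Const.INPUT above works token by token, replacing every digit character with '0'. A minimal standalone sketch of that step (outside any fastNLP pipe, on a made-up token list):

    # Standalone sketch of the digit-normalisation step above (illustrative only).
    def normalize_digits(words):
        # Replace every digit character inside each word with '0'.
        return [''.join('0' if c.isdigit() else c for c in word) for word in words]

    print(normalize_digits(["Nadim", "scored", "23", "points", "in", "1994"]))
    # -> ['Nadim', 'scored', '00', 'points', 'in', '0000']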
Example #2
    def process(self, data_bundle: DataBundle) -> DataBundle:
        """
        支持的DataSet的field为

        .. csv-table::
           :header: "raw_words", "target"

           "[Nadim, Ladki]", "[B-PER, I-PER]"
           "[AL-AIN, United, Arab, ...]", "[B-LOC, B-LOC, I-LOC, ...]"
           "[...]", "[...]"

        :param ~fastNLP.DataBundle data_bundle: 传入的DataBundle中的DataSet必须包含raw_words和ner两个field,且两个field的内容均为List[str]在传入DataBundle基础上原位修改。
        :return DataBundle:
        """
        # convert the tag scheme
        for name, dataset in data_bundle.datasets.items():
            dataset.apply_field(self.convert_tag,
                                field_name=Const.TARGET,
                                new_field_name=Const.TARGET)

        _add_words_field(data_bundle, lower=self.lower)

        if self.word_shape:
            data_bundle.apply_field(word_shape,
                                    field_name='raw_words',
                                    new_field_name='word_shapes')
            data_bundle.set_input('word_shapes')

        # replace all digits with 0
        data_bundle.apply_field(lambda chars: [
            ''.join(['0' if c.isdigit() else c for c in char])
            for char in chars
        ],
                                field_name=Const.INPUT,
                                new_field_name=Const.INPUT)

        # index
        _indexize(data_bundle)

        input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN]
        target_fields = [Const.TARGET, Const.INPUT_LEN]

        for name, dataset in data_bundle.datasets.items():
            dataset.add_seq_len(Const.INPUT)

        data_bundle.set_input(*input_fields)
        data_bundle.set_target(*target_fields)

        return data_bundle
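
The docstring above describes the expected input: every DataSet carries raw_words and a tag field, both List[str] per instance, and process() modifies the bundle in place. A hedged sketch of building such a DataBundle by hand (assuming fastNLP 0.5-style DataSet/Instance/DataBundle APIs; the instances are illustrative):

    # Hedged sketch of the input format the docstring describes (illustrative only).
    from fastNLP import DataSet, Instance
    from fastNLP.io import DataBundle

    ds = DataSet()
    ds.append(Instance(raw_words=["Nadim", "Ladki"], target=["B-PER", "I-PER"]))
    ds.append(Instance(raw_words=["He", "visited", "Paris"], target=["O", "O", "B-LOC"]))
    data_bundle = DataBundle(datasets={"train": ds})
    # data_bundle can now be handed to the pipe's process() shown above.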
Example #3
    def process(self, data_bundle: DataBundle):
        _add_chars_field(data_bundle, lower=False)

        data_bundle.apply_field(self.encoding_func, field_name=Const.TARGET, new_field_name=Const.TARGET)

        # replace all digits with 0
        data_bundle.apply_field(lambda chars: [''.join(['0' if c.isdigit() else c for c in char]) for char in chars],
                                field_name=Const.CHAR_INPUT, new_field_name=Const.CHAR_INPUT)

        # optionally add a bigram feature field
        input_field_names = [Const.CHAR_INPUT]
        if self.bigrams:
            data_bundle.apply_field(lambda chars:[c1+c2 for c1,c2 in zip(chars, chars[1:]+['<eos>'])],
                                    field_name=Const.CHAR_INPUT, new_field_name='bigrams')
            input_field_names.append('bigrams')

        # index
        _indexize(data_bundle, input_field_names=input_field_names, target_field_names=Const.TARGET)

        input_fields = [Const.TARGET, Const.INPUT_LEN] + input_field_names
        target_fields = [Const.TARGET, Const.INPUT_LEN]

        for name, dataset in data_bundle.datasets.items():
            dataset.add_seq_len(Const.CHAR_INPUT)

        data_bundle.set_input(*input_fields)
        data_bundle.set_target(*target_fields)

        return data_bundle
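
The optional bigram field built above pairs every character with its successor, padding the final position with '<eos>'. A standalone sketch of that construction (illustrative only, on a made-up character sequence):

    # Standalone sketch of the bigram construction used above (illustrative only).
    def char_bigrams(chars, eos='<eos>'):
        # Pair each character with the next one; the last character pairs with eos.
        return [c1 + c2 for c1, c2 in zip(chars, chars[1:] + [eos])]

    print(char_bigrams(list("中国人")))
    # -> ['中国', '国人', '人<eos>']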
Example #4
    def process(self, data_bundle: DataBundle) -> DataBundle:
        new_bundle = DataBundle()
        aspect_dict = {}
        mask_id = self.tokenizer.convert_tokens_to_ids([self.mask])[0]
        # pick the special tokens appropriate for the tokenizer
        if isinstance(self.tokenizer, BertTokenizer):
            cls = "[CLS]"
            sep = "[SEP]"
        else:
            cls = self.tokenizer.cls_token
            sep = self.tokenizer.sep_token
        # rebuild every dataset so that each aspect becomes a separate instance
        for name, ds in data_bundle.iter_datasets():
            new_ds = DataSet()
            for ins in ds:
                tokens = ins["tokens"]
                # BERT-style models take [CLS] ... [SEP]; XLNet appends its special tokens at the end
                if not isinstance(self.tokenizer, XLNetTokenizer):
                    tokens.insert(0, cls)
                    tokens.append(sep)
                    shift = 1
                else:
                    tokens.append(sep)
                    tokens.append(cls)
                    shift = 0

                starts = []
                ends = []
                for aspect in ins["aspects"]:
                    starts.append(aspect["from"] + shift)
                    ends.append(aspect["to"] + shift)
                for aspect in ins["aspects"]:
                    target = aspect["polarity"]
                    start = aspect["from"] + shift
                    end = aspect["to"] + shift
                    aspect_mask = [0] * len(tokens)
                    for i in range(start, end):
                        aspect_mask[i] = 1
                    pieces = []
                    piece_masks = []
                    # mark the aspect span with [[ ]] in the raw words
                    raw_words = tokens[shift:-1]
                    raw_words.insert(start - 1, "[[")
                    raw_words.insert(end, "]]")
                    # re-tokenize into word pieces and expand the aspect mask to piece level
                    for mask, token in zip(aspect_mask, tokens):
                        bpes = self.tokenizer.convert_tokens_to_ids(
                            self.tokenizer.tokenize(token))
                        pieces.extend(bpes)
                        piece_masks.extend([mask] * (len(bpes)))
                    new_ins = Instance(
                        tokens=pieces,
                        target=target,
                        aspect_mask=piece_masks,
                        raw_words=" ".join(raw_words),
                    )
                    new_ds.append(new_ins)
            new_bundle.set_dataset(new_ds, name)

        # build the target-label vocabulary and index the target field
        target_vocab = Vocabulary(padding=None, unknown=None)
        target_vocab.add_word_lst(
            ["neutral", "positive", "negative", "smooth"])
        target_vocab.index_dataset(*new_bundle.datasets.values(),
                                   field_name="target")

        new_bundle.set_target("target")
        new_bundle.set_input("tokens", "aspect_mask", "raw_words")
        new_bundle.apply_field(lambda x: len(x),
                               field_name="tokens",
                               new_field_name="seq_len")

        # new_bundle.set_vocab(vocab, 'tokens')
        if hasattr(self.tokenizer, "pad_token_id"):
            new_bundle.set_pad_val("tokens", self.tokenizer.pad_token_id)
        else:
            new_bundle.set_pad_val("tokens", self.tokenizer.pad_index)
        new_bundle.set_vocab(target_vocab, "target")

        return new_bundle
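
The inner loop above re-tokenises every token into word pieces and repeats the token-level aspect-mask value once per piece, so the mask stays aligned after subword tokenisation. A tokenizer-free sketch of that alignment (toy_tokenize is a stand-in for self.tokenizer.tokenize, introduced only for illustration):

    # Sketch of expanding a token-level mask to word-piece level (illustrative only).
    def toy_tokenize(token):
        # Pretend every token splits into one piece per three characters.
        return [token[i:i + 3] for i in range(0, len(token), 3)]

    tokens = ["the", "battery", "life", "is", "great"]
    aspect_mask = [0, 1, 1, 0, 0]  # "battery life" is the aspect span

    pieces, piece_masks = [], []
    for mask, token in zip(aspect_mask, tokens):
        bpes = toy_tokenize(token)
        pieces.extend(bpes)
        piece_masks.extend([mask] * len(bpes))

    print(pieces)       # ['the', 'bat', 'ter', 'y', 'lif', 'e', 'is', 'gre', 'at']
    print(piece_masks)  # [0, 1, 1, 1, 1, 1, 0, 0, 0]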
Example #5
    def process(self, data_bundle: DataBundle) -> DataBundle:
        """
        可以处理的DataSet需要包含raw_words列

        .. csv-table::
           :header: "raw_words"

           "上海 浦东 开发 与 法制 建设 同步"
           "新华社 上海 二月 十日 电 ( 记者 谢金虎 、 张持坚 )"
           "..."

        :param data_bundle:
        :return:
        """
        # copy raw_words into the chars field that the rest of the pipeline works on
        data_bundle.copy_field(Const.RAW_WORD, Const.CHAR_INPUT)

        # normalise alphabetic and digit spans before tokenisation
        if self.replace_num_alpha:
            data_bundle.apply_field(_find_and_replace_alpha_spans,
                                    Const.CHAR_INPUT, Const.CHAR_INPUT)
            data_bundle.apply_field(_find_and_replace_digit_spans,
                                    Const.CHAR_INPUT, Const.CHAR_INPUT)

        self._tokenize(data_bundle)
        input_field_names = [Const.CHAR_INPUT]
        target_field_names = []

        # build target / segment masks from word lengths, then flatten words into characters
        for name, dataset in data_bundle.datasets.items():
            dataset.apply_field(
                lambda chars: _word_lens_to_relay(map(len, chars)),
                field_name=Const.CHAR_INPUT,
                new_field_name=Const.TARGET)
            dataset.apply_field(
                lambda chars: _word_lens_to_start_seg_mask(map(len, chars)),
                field_name=Const.CHAR_INPUT,
                new_field_name='start_seg_mask')
            dataset.apply_field(
                lambda chars: _word_lens_to_end_seg_mask(map(len, chars)),
                field_name=Const.CHAR_INPUT,
                new_field_name='end_seg_mask')
            dataset.apply_field(lambda chars: list(chain(*chars)),
                                field_name=Const.CHAR_INPUT,
                                new_field_name=Const.CHAR_INPUT)
        target_field_names.append('start_seg_mask')
        input_field_names.append('end_seg_mask')
        if self.bigrams:
            for name, dataset in data_bundle.datasets.items():
                dataset.apply_field(
                    lambda chars:
                    [c1 + c2 for c1, c2 in zip(chars, chars[1:] + ['<eos>'])],
                    field_name=Const.CHAR_INPUT,
                    new_field_name='bigrams')
            input_field_names.append('bigrams')

        # index the character (and optional bigram) fields; the target field is not indexed
        _indexize(data_bundle, ['chars', 'bigrams'] if self.bigrams else ['chars'], [])

        # derive relay_target and relay_mask by clipping the target field with L
        func = partial(_clip_target, L=self.L)
        for name, dataset in data_bundle.datasets.items():
            res = dataset.apply_field(func, field_name='target')
            relay_target = [res_i[0] for res_i in res]
            relay_mask = [res_i[1] for res_i in res]
            dataset.add_field('relay_target',
                              relay_target,
                              is_input=True,
                              is_target=False,
                              ignore_type=False)
            dataset.add_field('relay_mask',
                              relay_mask,
                              is_input=True,
                              is_target=False,
                              ignore_type=False)
        input_field_names.append('relay_target')
        input_field_names.append('relay_mask')

        input_fields = [Const.TARGET, Const.INPUT_LEN] + input_field_names
        target_fields = [Const.TARGET, Const.INPUT_LEN] + target_field_names
        for name, dataset in data_bundle.datasets.items():
            dataset.add_seq_len(Const.CHAR_INPUT)

        data_bundle.set_input(*input_fields)
        data_bundle.set_target(*target_fields)

        return data_bundle
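
The helpers _word_lens_to_relay, _word_lens_to_start_seg_mask and _word_lens_to_end_seg_mask are not shown in this example; they consume the per-word lengths computed from the chars field. A hedged re-implementation of the two segment masks, assuming they simply flag the first and last character of every word:

    # Hedged sketch: assumed behaviour of the start/end segment-mask helpers
    # (the real _word_lens_to_* functions are not shown above).
    def word_lens_to_start_seg_mask(word_lens):
        mask = []
        for length in word_lens:
            mask.extend([1] + [0] * (length - 1))  # 1 on the first char of each word
        return mask

    def word_lens_to_end_seg_mask(word_lens):
        mask = []
        for length in word_lens:
            mask.extend([0] * (length - 1) + [1])  # 1 on the last char of each word
        return mask

    lens = [2, 2, 1, 3]  # word lengths of a made-up sentence
    print(word_lens_to_start_seg_mask(lens))  # [1, 0, 1, 0, 1, 1, 0, 0]
    print(word_lens_to_end_seg_mask(lens))    # [0, 1, 0, 1, 1, 0, 0, 1]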