def process(self, data_bundle: DataBundle) -> DataBundle:
    # Convert the raw tags into the configured encoding scheme.
    for name, dataset in data_bundle.datasets.items():
        dataset.apply_field(self.convert_tag, field_name=Const.TARGET,
                            new_field_name=Const.TARGET)

    _add_words_field(data_bundle, lower=self.lower)

    if self.word_shape:
        data_bundle.apply_field(word_shape, field_name='raw_words',
                                new_field_name='word_shapes')
        data_bundle.set_input('word_shapes')

    # Replace every digit with '0' so all numbers share one surface form.
    data_bundle.apply_field(
        lambda words: [''.join(['0' if c.isdigit() else c for c in word]) for word in words],
        field_name=Const.INPUT, new_field_name=Const.INPUT)

    _indexize(data_bundle, target_field_names=['target'], vocabulary=self.vocabulary)

    input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN]
    target_fields = [Const.TARGET, Const.INPUT_LEN]

    for name, dataset in data_bundle.datasets.items():
        dataset.add_seq_len(Const.INPUT)

    data_bundle.set_input(*input_fields)
    data_bundle.set_target(*target_fields)

    return data_bundle
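# Standalone sketch of the digit normalization above (toy words, not corpus
# data): every digit in every word collapses to '0'.
words = ['10-Feb', 'AL-AIN']
normalized = [''.join('0' if c.isdigit() else c for c in w) for w in words]
assert normalized == ['00-Feb', 'AL-AIN']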
def process(self, data_bundle: DataBundle) -> DataBundle: """ 支持的DataSet的field为 .. csv-table:: :header: "raw_words", "target" "[Nadim, Ladki]", "[B-PER, I-PER]" "[AL-AIN, United, Arab, ...]", "[B-LOC, B-LOC, I-LOC, ...]" "[...]", "[...]" :param ~fastNLP.DataBundle data_bundle: 传入的DataBundle中的DataSet必须包含raw_words和ner两个field,且两个field的内容均为List[str]在传入DataBundle基础上原位修改。 :return DataBundle: """ # 转换tag for name, dataset in data_bundle.datasets.items(): dataset.apply_field(self.convert_tag, field_name=Const.TARGET, new_field_name=Const.TARGET) _add_words_field(data_bundle, lower=self.lower) if self.word_shape: data_bundle.apply_field(word_shape, field_name='raw_words', new_field_name='word_shapes') data_bundle.set_input('word_shapes') # 将所有digit转为0 data_bundle.apply_field(lambda chars: [ ''.join(['0' if c.isdigit() else c for c in char]) for char in chars ], field_name=Const.INPUT, new_field_name=Const.INPUT) # index _indexize(data_bundle) input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN] target_fields = [Const.TARGET, Const.INPUT_LEN] for name, dataset in data_bundle.datasets.items(): dataset.add_seq_len(Const.INPUT) data_bundle.set_input(*input_fields) data_bundle.set_target(*target_fields) return data_bundle
def process(self, data_bundle: DataBundle):
    _add_chars_field(data_bundle, lower=False)

    # Encode the raw tags with self.encoding_func.
    data_bundle.apply_field(self.encoding_func, field_name=Const.TARGET,
                            new_field_name=Const.TARGET)

    # Replace every digit with '0'.
    data_bundle.apply_field(
        lambda chars: [''.join(['0' if c.isdigit() else c for c in char]) for char in chars],
        field_name=Const.CHAR_INPUT, new_field_name=Const.CHAR_INPUT)

    input_field_names = [Const.CHAR_INPUT]
    if self.bigrams:
        # Character bigrams; the last character is paired with '<eos>'.
        data_bundle.apply_field(
            lambda chars: [c1 + c2 for c1, c2 in zip(chars, chars[1:] + ['<eos>'])],
            field_name=Const.CHAR_INPUT, new_field_name='bigrams')
        input_field_names.append('bigrams')

    # Build vocabularies and index the input and target fields.
    _indexize(data_bundle, input_field_names=input_field_names,
              target_field_names=Const.TARGET)

    input_fields = [Const.TARGET, Const.INPUT_LEN] + input_field_names
    target_fields = [Const.TARGET, Const.INPUT_LEN]

    for name, dataset in data_bundle.datasets.items():
        dataset.add_seq_len(Const.CHAR_INPUT)

    data_bundle.set_input(*input_fields)
    data_bundle.set_target(*target_fields)

    return data_bundle
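# Standalone sketch of the bigram construction above (toy characters): each
# character is paired with its right neighbour, the last one with '<eos>'.
chars = list('上海浦东')
bigrams = [c1 + c2 for c1, c2 in zip(chars, chars[1:] + ['<eos>'])]
assert bigrams == ['上海', '海浦', '浦东', '东<eos>']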
def process(self, data_bundle: DataBundle) -> DataBundle:
    new_bundle = DataBundle()
    aspect_dict = {}
    mask_id = self.tokenizer.convert_tokens_to_ids([self.mask])[0]

    # Pick the special tokens: hardcode them for BertTokenizer, read the
    # cls_token/sep_token attributes otherwise.
    if isinstance(self.tokenizer, BertTokenizer):
        cls = "[CLS]"
        sep = "[SEP]"
    else:
        cls = self.tokenizer.cls_token
        sep = self.tokenizer.sep_token

    for name, ds in data_bundle.iter_datasets():
        new_ds = DataSet()
        for ins in ds:
            tokens = ins["tokens"]
            # BERT-style models put the cls token first; XLNet appends
            # sep and cls at the end, so aspect offsets only shift by one
            # in the BERT-style case.
            if not isinstance(self.tokenizer, XLNetTokenizer):
                tokens.insert(0, cls)
                tokens.append(sep)
                shift = 1
            else:
                tokens.append(sep)
                tokens.append(cls)
                shift = 0

            starts = []
            ends = []
            for aspect in ins["aspects"]:
                starts.append(aspect["from"] + shift)
                ends.append(aspect["to"] + shift)

            # One new instance per aspect in the sentence.
            for aspect in ins["aspects"]:
                target = aspect["polarity"]
                start = aspect["from"] + shift
                end = aspect["to"] + shift
                # Mark the aspect span on the token level ...
                aspect_mask = [0] * len(tokens)
                for i in range(start, end):
                    aspect_mask[i] = 1

                pieces = []
                piece_masks = []
                # Keep a human-readable copy with the aspect bracketed.
                raw_words = tokens[shift:-1]
                raw_words.insert(start - 1, "[[")
                raw_words.insert(end, "]]")

                # ... then expand tokens and mask to word pieces.
                for mask, token in zip(aspect_mask, tokens):
                    bpes = self.tokenizer.convert_tokens_to_ids(
                        self.tokenizer.tokenize(token))
                    pieces.extend(bpes)
                    piece_masks.extend([mask] * len(bpes))

                new_ins = Instance(
                    tokens=pieces,
                    target=target,
                    aspect_mask=piece_masks,
                    raw_words=" ".join(raw_words),
                )
                new_ds.append(new_ins)
        new_bundle.set_dataset(new_ds, name)

    target_vocab = Vocabulary(padding=None, unknown=None)
    target_vocab.add_word_lst(["neutral", "positive", "negative", "smooth"])
    target_vocab.index_dataset(*new_bundle.datasets.values(), field_name="target")

    new_bundle.set_target("target")
    new_bundle.set_input("tokens", "aspect_mask", "raw_words")
    new_bundle.apply_field(len, field_name="tokens", new_field_name="seq_len")

    # new_bundle.set_vocab(vocab, 'tokens')
    if hasattr(self.tokenizer, "pad_token_id"):
        new_bundle.set_pad_val("tokens", self.tokenizer.pad_token_id)
    else:
        new_bundle.set_pad_val("tokens", self.tokenizer.pad_index)

    new_bundle.set_vocab(target_vocab, "target")
    return new_bundle
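# Standalone sketch of the word-piece mask expansion above. The tokenizer and
# sentence are assumptions for illustration; any transformers tokenizer with
# tokenize/convert_tokens_to_ids behaves the same way.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokens = ['[CLS]', 'great', 'battery', 'life', '[SEP]']
aspect_mask = [0, 0, 1, 1, 0]  # marks the aspect span "battery life"
pieces, piece_masks = [], []
for mask, token in zip(aspect_mask, tokens):
    bpes = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(token))
    pieces.extend(bpes)
    piece_masks.extend([mask] * len(bpes))
# every word piece inherits the mask value of its source token
assert len(pieces) == len(piece_masks)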
def process(self, data_bundle: DataBundle) -> DataBundle: """ 可以处理的DataSet需要包含raw_words列 .. csv-table:: :header: "raw_words" "上海 浦东 开发 与 法制 建设 同步" "新华社 上海 二月 十日 电 ( 记者 谢金虎 、 张持坚 )" "..." :param data_bundle: :return: """ data_bundle.copy_field(Const.RAW_WORD, Const.CHAR_INPUT) if self.replace_num_alpha: data_bundle.apply_field(_find_and_replace_alpha_spans, Const.CHAR_INPUT, Const.CHAR_INPUT) data_bundle.apply_field(_find_and_replace_digit_spans, Const.CHAR_INPUT, Const.CHAR_INPUT) self._tokenize(data_bundle) input_field_names = [Const.CHAR_INPUT] target_field_names = [] for name, dataset in data_bundle.datasets.items(): dataset.apply_field( lambda chars: _word_lens_to_relay(map(len, chars)), field_name=Const.CHAR_INPUT, new_field_name=Const.TARGET) dataset.apply_field( lambda chars: _word_lens_to_start_seg_mask(map(len, chars)), field_name=Const.CHAR_INPUT, new_field_name='start_seg_mask') dataset.apply_field( lambda chars: _word_lens_to_end_seg_mask(map(len, chars)), field_name=Const.CHAR_INPUT, new_field_name='end_seg_mask') dataset.apply_field(lambda chars: list(chain(*chars)), field_name=Const.CHAR_INPUT, new_field_name=Const.CHAR_INPUT) target_field_names.append('start_seg_mask') input_field_names.append('end_seg_mask') if self.bigrams: for name, dataset in data_bundle.datasets.items(): dataset.apply_field( lambda chars: [c1 + c2 for c1, c2 in zip(chars, chars[1:] + ['<eos>'])], field_name=Const.CHAR_INPUT, new_field_name='bigrams') input_field_names.append('bigrams') _indexize(data_bundle, ['chars', 'bigrams'], []) func = partial(_clip_target, L=self.L) for name, dataset in data_bundle.datasets.items(): res = dataset.apply_field(func, field_name='target') relay_target = [res_i[0] for res_i in res] relay_mask = [res_i[1] for res_i in res] dataset.add_field('relay_target', relay_target, is_input=True, is_target=False, ignore_type=False) dataset.add_field('relay_mask', relay_mask, is_input=True, is_target=False, ignore_type=False) input_field_names.append('relay_target') input_field_names.append('relay_mask') input_fields = [Const.TARGET, Const.INPUT_LEN] + input_field_names target_fields = [Const.TARGET, Const.INPUT_LEN] + target_field_names for name, dataset in data_bundle.datasets.items(): dataset.add_seq_len(Const.CHAR_INPUT) data_bundle.set_input(*input_fields) data_bundle.set_target(*target_fields) return data_bundle