def process(self, data_bundle: DataBundle):
    _add_chars_field(data_bundle, lower=False)

    data_bundle.apply_field(self.encoding_func, field_name=Const.TARGET, new_field_name=Const.TARGET)

    # replace every digit with '0'
    data_bundle.apply_field(
        lambda chars: [''.join(['0' if c.isdigit() else c for c in char]) for char in chars],
        field_name=Const.CHAR_INPUT, new_field_name=Const.CHAR_INPUT)

    input_field_names = [Const.CHAR_INPUT]
    if self.bigrams:
        data_bundle.apply_field(
            lambda chars: [c1 + c2 for c1, c2 in zip(chars, chars[1:] + ['<eos>'])],
            field_name=Const.CHAR_INPUT, new_field_name='bigrams')
        input_field_names.append('bigrams')

    # index
    _indexize(data_bundle, input_field_names=input_field_names, target_field_names=Const.TARGET)

    input_fields = [Const.TARGET, Const.INPUT_LEN] + input_field_names
    target_fields = [Const.TARGET, Const.INPUT_LEN]

    for name, dataset in data_bundle.datasets.items():
        dataset.add_seq_len(Const.CHAR_INPUT)

    data_bundle.set_input(*input_fields)
    data_bundle.set_target(*target_fields)

    return data_bundle
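
# A standalone sketch (plain Python, independent of fastNLP) of the bigram
# transform used in the pipes above: each character is paired with its
# successor, and the last character is padded with '<eos>'. The variable
# names below are illustrative only.
_example_chars = ['上', '海', '人']
_example_bigrams = [c1 + c2 for c1, c2 in zip(_example_chars, _example_chars[1:] + ['<eos>'])]
assert _example_bigrams == ['上海', '海人', '人<eos>']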

def process(self, data_bundle: DataBundle) -> DataBundle:
    for name, dataset in data_bundle.datasets.items():
        dataset.apply_field(self.convert_tag, field_name=Const.TARGET, new_field_name=Const.TARGET)

    _add_words_field(data_bundle, lower=self.lower)

    if self.word_shape:
        data_bundle.apply_field(word_shape, field_name='raw_words', new_field_name='word_shapes')
        data_bundle.set_input('word_shapes')

    data_bundle.apply_field(
        lambda words: [''.join(['0' if c.isdigit() else c for c in word]) for word in words],
        field_name=Const.INPUT, new_field_name=Const.INPUT)

    # index
    _indexize(data_bundle)

    input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN]
    target_fields = [Const.TARGET, Const.INPUT_LEN]

    for name, dataset in data_bundle.datasets.items():
        dataset.add_seq_len(Const.INPUT)

    data_bundle.set_input(*input_fields)
    data_bundle.set_target(*target_fields)

    return data_bundle
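
# A standalone sketch of the digit-masking transform above: every digit
# character inside a token is replaced with '0', so years, counts and version
# numbers collapse onto shared vocabulary entries. Names are illustrative only.
_example_words = ['July', '15', '1987', 'v2.0']
_example_masked = [''.join(['0' if c.isdigit() else c for c in w]) for w in _example_words]
assert _example_masked == ['July', '00', '0000', 'v0.0']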
def process(self, data_bundle: DataBundle) -> DataBundle: """ 支持的DataSet的field为 .. csv-table:: :header: "raw_words", "target" "[Nadim, Ladki]", "[B-PER, I-PER]" "[AL-AIN, United, Arab, ...]", "[B-LOC, B-LOC, I-LOC, ...]" "[...]", "[...]" :param ~fastNLP.DataBundle data_bundle: 传入的DataBundle中的DataSet必须包含raw_words和ner两个field,且两个field的内容均为List[str]在传入DataBundle基础上原位修改。 :return DataBundle: """ # 转换tag for name, dataset in data_bundle.datasets.items(): dataset.apply_field(self.convert_tag, field_name=Const.TARGET, new_field_name=Const.TARGET) _add_words_field(data_bundle, lower=self.lower) if self.word_shape: data_bundle.apply_field(word_shape, field_name='raw_words', new_field_name='word_shapes') data_bundle.set_input('word_shapes') # 将所有digit转为0 data_bundle.apply_field(lambda chars: [ ''.join(['0' if c.isdigit() else c for c in char]) for char in chars ], field_name=Const.INPUT, new_field_name=Const.INPUT) # index _indexize(data_bundle) input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN] target_fields = [Const.TARGET, Const.INPUT_LEN] for name, dataset in data_bundle.datasets.items(): dataset.add_seq_len(Const.INPUT) data_bundle.set_input(*input_fields) data_bundle.set_target(*target_fields) return data_bundle

def process(self, data_bundle):
    data_bundle.copy_field(C.RAW_CHAR, C.CHAR_INPUT)
    input_fields = [C.TARGET, C.CHAR_INPUT, C.INPUT_LEN]
    target_fields = [C.TARGET, C.INPUT_LEN]

    if self.bigram:
        for dataset in data_bundle.datasets.values():
            dataset.apply_field(
                lambda chars: [c1 + c2 for c1, c2 in zip(chars, chars[1:] + ['<eos>'])],
                field_name=C.CHAR_INPUT, new_field_name='bigrams')

        # build the bigram vocabulary from the training split only; the other
        # splits are passed as no_create_entry_dataset so that bigrams seen
        # only in dev/test do not become regular training entries
        bigram_vocab = Vocabulary()
        bigram_vocab.from_dataset(
            data_bundle.get_dataset('train'), field_name='bigrams',
            no_create_entry_dataset=[ds for name, ds in data_bundle.datasets.items() if name != 'train'])
        bigram_vocab.index_dataset(*data_bundle.datasets.values(), field_name='bigrams')
        data_bundle.set_vocab(bigram_vocab, field_name='bigrams')
        input_fields.append('bigrams')

    _add_chars_field(data_bundle, lower=False)

    # index
    _indexize(data_bundle, input_field_names=C.CHAR_INPUT, target_field_names=C.TARGET)

    for name, dataset in data_bundle.datasets.items():
        dataset.set_pad_val(C.TARGET, self.target_pad_val)
        dataset.add_seq_len(C.CHAR_INPUT)

    data_bundle.set_input(*input_fields)
    data_bundle.set_target(*target_fields)

    return data_bundle

def process(self, data_bundle):
    data_bundle = _add_words_field(data_bundle)
    data_bundle = _indexize(data_bundle=data_bundle, input_field_names=[Const.INPUT, 'pos', 'deprel'])

    for name, dataset in data_bundle.datasets.items():
        dataset.add_seq_len(Const.INPUT)
        data_bundle.get_vocab(Const.INPUT).index_dataset(dataset, field_name='asp', new_field_name='aspect')

    data_bundle.apply(self._get_post, new_field_name='post')
    data_bundle.apply(self._get_mask, new_field_name='aspmask')

    data_bundle.set_input(Const.INPUT, Const.INPUT_LEN, 'pos', 'dephead', 'deprel',
                          'aspect', 'fidx', 'tidx', 'post', 'aspmask')
    data_bundle.set_target(Const.TARGET)

    return data_bundle
def process(self, data_bundle: DataBundle): """ 可处理的DataSet应具备如下的field .. csv-table:: :header: "raw_words", "target" "马晓旭意外受伤让国奥警惕 无奈大雨格外青睐殷家军记者傅亚雨沈阳报道 ... ", "体育" "...", "..." :param data_bundle: :return: """ # 根据granularity设置tag # 由原来的固定tagmap,修改为根据数据集获取tagmap targets_vocabs = get_data_bundle_tags(data_bundle) self.tag_map = {tag_name: tag_name for tag_name in targets_vocabs} data_bundle = self._granularize(data_bundle=data_bundle, tag_map=self.tag_map) # clean,lower # CWS(tokenize) data_bundle = self._tokenize(data_bundle=data_bundle, field_name='raw_chars', new_field_name='chars') input_field_names = [Const.CHAR_INPUT] # n-grams if self.bigrams: for name, dataset in data_bundle.iter_datasets(): dataset.apply_field( lambda chars: [c1 + c2 for c1, c2 in zip(chars, chars[1:] + ['<eos>'])], field_name=Const.CHAR_INPUT, new_field_name='bigrams') input_field_names.append('bigrams') if self.trigrams: for name, dataset in data_bundle.iter_datasets(): dataset.apply_field(lambda chars: [ c1 + c2 + c3 for c1, c2, c3 in zip(chars, chars[1:] + ['<eos>'], chars[ 2:] + ['<eos>'] * 2) ], field_name=Const.CHAR_INPUT, new_field_name='trigrams') input_field_names.append('trigrams') # index data_bundle = _indexize(data_bundle=data_bundle, input_field_names=Const.CHAR_INPUT) # add length for name, dataset in data_bundle.datasets.items(): dataset.add_seq_len(field_name=Const.CHAR_INPUT, new_field_name=Const.INPUT_LEN) # input_fields包含的字段名称 # input_fields = [Const.TARGET, Const.INPUT_LEN] + input_field_names input_fields = [Const.INPUT_LEN] + input_field_names target_fields = [Const.TARGET] data_bundle.set_input(*input_fields) data_bundle.set_target(*target_fields) return data_bundle
def process(self, data_bundle: DataBundle) -> DataBundle: """ 可以处理的DataSet需要包含raw_words列 .. csv-table:: :header: "raw_words" "上海 浦东 开发 与 法制 建设 同步" "新华社 上海 二月 十日 电 ( 记者 谢金虎 、 张持坚 )" "..." :param data_bundle: :return: """ data_bundle.copy_field(Const.RAW_WORD, Const.CHAR_INPUT) if self.replace_num_alpha: data_bundle.apply_field(_find_and_replace_alpha_spans, Const.CHAR_INPUT, Const.CHAR_INPUT) data_bundle.apply_field(_find_and_replace_digit_spans, Const.CHAR_INPUT, Const.CHAR_INPUT) self._tokenize(data_bundle) input_field_names = [Const.CHAR_INPUT] target_field_names = [] for name, dataset in data_bundle.datasets.items(): dataset.apply_field( lambda chars: _word_lens_to_relay(map(len, chars)), field_name=Const.CHAR_INPUT, new_field_name=Const.TARGET) dataset.apply_field( lambda chars: _word_lens_to_start_seg_mask(map(len, chars)), field_name=Const.CHAR_INPUT, new_field_name='start_seg_mask') dataset.apply_field( lambda chars: _word_lens_to_end_seg_mask(map(len, chars)), field_name=Const.CHAR_INPUT, new_field_name='end_seg_mask') dataset.apply_field(lambda chars: list(chain(*chars)), field_name=Const.CHAR_INPUT, new_field_name=Const.CHAR_INPUT) target_field_names.append('start_seg_mask') input_field_names.append('end_seg_mask') if self.bigrams: for name, dataset in data_bundle.datasets.items(): dataset.apply_field( lambda chars: [c1 + c2 for c1, c2 in zip(chars, chars[1:] + ['<eos>'])], field_name=Const.CHAR_INPUT, new_field_name='bigrams') input_field_names.append('bigrams') _indexize(data_bundle, ['chars', 'bigrams'], []) func = partial(_clip_target, L=self.L) for name, dataset in data_bundle.datasets.items(): res = dataset.apply_field(func, field_name='target') relay_target = [res_i[0] for res_i in res] relay_mask = [res_i[1] for res_i in res] dataset.add_field('relay_target', relay_target, is_input=True, is_target=False, ignore_type=False) dataset.add_field('relay_mask', relay_mask, is_input=True, is_target=False, ignore_type=False) input_field_names.append('relay_target') input_field_names.append('relay_mask') input_fields = [Const.TARGET, Const.INPUT_LEN] + input_field_names target_fields = [Const.TARGET, Const.INPUT_LEN] + target_field_names for name, dataset in data_bundle.datasets.items(): dataset.add_seq_len(Const.CHAR_INPUT) data_bundle.set_input(*input_fields) data_bundle.set_target(*target_fields) return data_bundle