Example #1
    def process(self,
                paths,
                train_ds: Iterable[str] = None,
                src_vocab_op: VocabularyOption = None,
                tgt_vocab_op: VocabularyOption = None,
                src_embed_op: EmbeddingOption = None):
        input_name, target_name = 'words', 'target'
        src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(
            **src_vocab_op)
        tgt_vocab = Vocabulary(unknown=None, padding=None) \
            if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op)

        info = DataBundle(datasets=self.load(paths))
        # Build vocabularies from the named training splits only, falling back
        # to every loaded split when train_ds is not given.
        _train_ds = [info.datasets[name] for name in train_ds
                     ] if train_ds else info.datasets.values()
        src_vocab.from_dataset(*_train_ds, field_name=input_name)
        tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
        src_vocab.index_dataset(*info.datasets.values(),
                                field_name=input_name,
                                new_field_name=input_name)
        tgt_vocab.index_dataset(*info.datasets.values(),
                                field_name=target_name,
                                new_field_name=target_name)
        info.vocabs = {input_name: src_vocab, target_name: tgt_vocab}

        if src_embed_op is not None:
            src_embed_op.vocab = src_vocab
            init_emb = EmbedLoader.load_with_vocab(**src_embed_op)
            info.embeddings[input_name] = init_emb

        for name, dataset in info.datasets.items():
            dataset.set_input(input_name)
            dataset.set_target(target_name)
        return info
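A hypothetical call, for orientation (the loader class name and file paths are placeholders; VocabularyOption mirrors Vocabulary's constructor arguments):

    pipe = MyClassificationLoader()  # any loader class defining process() above
    bundle = pipe.process(
        {'train': 'train.tsv', 'test': 'test.tsv'},
        train_ds=['train'],                             # fit vocabs on train only
        src_vocab_op=VocabularyOption(max_size=30000),  # cap source vocab size
    )
    train_set = bundle.datasets['train']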
Example #2
    def process(self,
                paths: Union[str, Dict[str, str]],
                src_vocab_opt: VocabularyOption = None,
                tgt_vocab_opt: VocabularyOption = None,
                src_embed_opt: EmbeddingOption = None,
                char_level_op=False):

        datasets = {}
        info = DataBundle()
        paths = check_dataloader_paths(paths)
        for name, path in paths.items():
            dataset = self.load(path)
            datasets[name] = dataset

        def wordtochar(words):
            # Split each lower-cased word into characters, inserting an empty
            # string between words (note: '' rather than a space) and dropping
            # the trailing separator.
            chars = []
            for word in words:
                word = word.lower()
                for char in word:
                    chars.append(char)
                chars.append('')
            chars.pop()
            return chars

        if char_level_op:
            for dataset in datasets.values():
                dataset.apply_field(wordtochar,
                                    field_name="words",
                                    new_field_name='chars')

        datasets["train"], datasets["dev"] = datasets["train"].split(
            0.1, shuffle=False)

        src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(
            **src_vocab_opt)
        src_vocab.from_dataset(datasets['train'], field_name='words')

        src_vocab.index_dataset(*datasets.values(), field_name='words')

        tgt_vocab = Vocabulary(unknown=None, padding=None) \
            if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
        tgt_vocab.from_dataset(datasets['train'], field_name='target')
        tgt_vocab.index_dataset(*datasets.values(), field_name='target')

        info.vocabs = {"words": src_vocab, "target": tgt_vocab}

        info.datasets = datasets

        if src_embed_opt is not None:
            embed = EmbedLoader.load_with_vocab(**src_embed_opt,
                                                vocab=src_vocab)
            info.embeddings['words'] = embed

        for name, dataset in info.datasets.items():
            dataset.set_input("words")
            dataset.set_target("target")

        return info
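A standalone trace of the wordtochar helper above, with its logic copied out of the method so it can be run directly:

    def word_to_char_demo(words):
        chars = []
        for word in words:
            chars.extend(word.lower())  # characters of the lowered word
            chars.append('')            # empty-string separator between words
        chars.pop()                     # drop the trailing separator
        return chars

    print(word_to_char_demo(['New', 'York']))
    # ['n', 'e', 'w', '', 'y', 'o', 'r', 'k']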
Example #3
    def process(self, paths):
        def get_seq_len(instance):
            return len(instance['article'])

        print('Start loading datasets...')
        start = time()

        # load datasets
        datasets = {}
        for name in paths:
            datasets[name] = self._load(paths[name])

            datasets[name].apply(get_seq_len, new_field_name='seq_len')

            # set input and target
            datasets[name].set_input('article', 'segment_id', 'cls_id')
            datasets[name].set_target(Const.TARGET)

            # set padding value
            datasets[name].set_pad_val('article', 0)
            datasets[name].set_pad_val('segment_id', 0)
            datasets[name].set_pad_val('cls_id', -1)
            datasets[name].set_pad_val(Const.TARGET, 0)

        print('Finished in {}'.format(timedelta(seconds=time() - start)))

        return DataBundle(datasets=datasets)
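A padding value of -1 for cls_id is presumably chosen so that padded positions cannot be confused with the real token index 0. A minimal sketch of downstream masking under that assumption (tensor values are made up):

    import torch
    cls_id = torch.tensor([[0, 8, 17], [0, 5, -1]])  # 2nd sample has a pad slot
    mask = cls_id != -1         # True where a real [CLS] position exists
    safe = cls_id.clamp(min=0)  # replace -1 so the ids are usable as indices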
Example #4
    def process(self,
                paths: Union[str, Dict[str, str]],
                src_vocab_opt: VocabularyOption = None,
                tgt_vocab_opt: VocabularyOption = None,
                src_embed_opt: EmbeddingOption = None):

        paths = check_dataloader_paths(paths)
        datasets = {}
        info = DataBundle()
        for name, path in paths.items():
            dataset = self.load(path)
            datasets[name] = dataset

        src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(
            **src_vocab_opt)
        src_vocab.from_dataset(datasets['train'], field_name='words')
        src_vocab.index_dataset(*datasets.values(), field_name='words')

        tgt_vocab = Vocabulary(unknown=None, padding=None) \
            if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
        tgt_vocab.from_dataset(datasets['train'], field_name='target')
        tgt_vocab.index_dataset(*datasets.values(), field_name='target')

        info.vocabs = {"words": src_vocab, "target": tgt_vocab}

        info.datasets = datasets

        if src_embed_opt is not None:
            embed = EmbedLoader.load_with_vocab(**src_embed_opt,
                                                vocab=src_vocab)
            info.embeddings['words'] = embed

        for name, dataset in info.datasets.items():
            dataset.set_input("words")
            dataset.set_target("target")

        return info
Example #5
    def process(self, paths: Union[str, Dict[str, str]],
                train_ds: Iterable[str] = None,
                src_vocab_op: VocabularyOption = None,
                tgt_vocab_op: VocabularyOption = None,
                embed_opt: EmbeddingOption = None,
                char_level_op=False,
                split_dev_op=True
                ):
        paths = check_dataloader_paths(paths)
        info = DataBundle(datasets=self.load(paths))
        src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(
            **src_vocab_op)
        tgt_vocab = Vocabulary(unknown=None, padding=None) \
            if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op)
        # Build vocabularies from the named training splits only, falling back
        # to every loaded split when train_ds is not given.
        _train_ds = [info.datasets[name]
                     for name in train_ds] if train_ds else info.datasets.values()

        def wordtochar(words):
            # Split each lower-cased word into characters, with '' between
            # words; the trailing separator is popped.
            chars = []
            for word in words:
                word = word.lower()
                for char in word:
                    chars.append(char)
                chars.append('')
            chars.pop()
            return chars

        input_name, target_name = 'words', 'target'
        info.vocabs = {}
        # Split the words field into characters instead of word-level tokens.
        if char_level_op:
            for dataset in info.datasets.values():
                dataset.apply_field(wordtochar,
                                    field_name="words",
                                    new_field_name='chars')
        # if embed_opt is not None:
        #     embed = EmbedLoader.load_with_vocab(**embed_opt, vocab=vocab)
        #     info.embeddings['words'] = embed
        else:
            # The word-level source vocabulary is only built when
            # char_level_op is False.
            src_vocab.from_dataset(*_train_ds, field_name=input_name)
            src_vocab.index_dataset(*info.datasets.values(),
                                    field_name=input_name,
                                    new_field_name=input_name)
            info.vocabs[input_name] = src_vocab

        tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
        tgt_vocab.index_dataset(
            *info.datasets.values(),
            field_name=target_name, new_field_name=target_name)

        info.vocabs[target_name] = tgt_vocab

        if split_dev_op:
            # Carve a 10% dev split off the train set (deterministic).
            info.datasets['train'], info.datasets['dev'] = \
                info.datasets['train'].split(0.1, shuffle=False)

        for name, dataset in info.datasets.items():
            dataset.set_input("words")
            dataset.set_target("target")

        return info
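A hypothetical call (class name and path are placeholders). Note that with char_level_op=True the word-level source vocabulary is skipped entirely, since it is built only in the else branch:

    bundle = MyLoader().process('data_dir/',
                                char_level_op=True,
                                split_dev_op=True)
    print(bundle.datasets.keys())  # includes a 'dev' split carved from train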
Example #6
def print_data_bundle(data_bundle: DataBundle, title: str = None):
    """ 打印输出data_bundle的信息.

    @params:
        data_bundle - 数据集DataBundle.
        title - 打印输出的标题信息.
    """
    if title:
        logger.warning(title)
    for name, dataset in data_bundle.iter_datasets():
        logger.info('dataset name : {}'.format(name))
        logger.info('dataset len : {}'.format(len(dataset)))
        logger.info('dataset example : ')
        logger.info('\n{}'.format(dataset[:5]))
        logger.info('input/target flags of each field in the dataset : ')
        logger.info('\n{}'.format(dataset.print_field_meta()))
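A hypothetical usage, assuming a DataBundle produced by one of the loaders above (the variable names are placeholders):

    bundle = my_loader.process({'train': 'train.txt', 'dev': 'dev.txt'})
    print_data_bundle(bundle, title='datasets after preprocessing')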
Example #7
    def load(self, paths):
        def get_seq_len(instance):
            return len(instance['text_id'])

        def sample(instance, candidate_num):
            # Keep only the first candidate_num candidates (currently unused).
            candidate_id = instance['candidate_id'][:candidate_num]
            return candidate_id

        def truncate_candidate_id(instance, max_len):
            candidate_id = []
            for i in range(len(instance['candidate_id'])):
                if len(instance['candidate_id'][i]) > max_len:
                    cur_id = instance['candidate_id'][i][:(max_len - 1)]
                    cur_id += self.sep_id
                else:
                    cur_id = instance['candidate_id'][i]
                candidate_id.append(cur_id)
            return candidate_id

        print('Start loading datasets...')
        start = time()

        # load datasets
        datasets = {}
        for name in paths:
            datasets[name] = self._load(paths[name])

            if name == 'train':
                datasets[name].apply(
                    lambda ins: truncate_candidate_id(ins, self.max_len),
                    new_field_name='candidate_id')

            # set input and target
            datasets[name].set_input('text_id', 'candidate_id', 'summary_id')

            # set padding value
            if self.encoder == 'bert':
                pad_id = 0
            else:
                pad_id = 1  # for RoBERTa
            datasets[name].set_pad_val('text_id', pad_id)
            datasets[name].set_pad_val('candidate_id', pad_id)
            datasets[name].set_pad_val('summary_id', pad_id)

        print('Finished in {}'.format(timedelta(seconds=time() - start)))

        return DataBundle(datasets=datasets)
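The pad-id switch above reflects the two encoders' vocabularies: BERT's [PAD] token has id 0, while RoBERTa's <pad> token has id 1. A quick check with the HuggingFace transformers library (an assumption; the source does not use it):

    from transformers import BertTokenizer, RobertaTokenizer
    print(BertTokenizer.from_pretrained('bert-base-uncased').pad_token_id)  # 0
    print(RobertaTokenizer.from_pretrained('roberta-base').pad_token_id)    # 1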
Example #8
def get_data_bundle_tags(data_bundle: DataBundle):
    """ 根据dataBundle获取tags.

    @params:
        data_bundle - DataBundle数据集.

    @return:
        On success - 数据标签的tag列表.
    """
    try:
        dataset = data_bundle.get_dataset('train')
        target_names = dataset.get_field(Const.TARGET).content
        target_names = list(set(target_names))
    except Exception:
        traceback.print_exc()
        logger.error('missing train dataset')
        raise Exception('missing train dataset')
    target_names.sort()
    return target_names
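A hypothetical usage: deriving the label set of the train split before sizing a classifier head (the tag values shown are made up):

    tags = get_data_bundle_tags(data_bundle)
    print(tags)              # e.g. ['negative', 'positive']
    num_classes = len(tags)  # size the output layer from the tag list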
Example #9
    def process_from_file(self, paths):
        """
        
        :param paths: 
        :return:
            Dataset包含以下的field
                chars:
                bigrams:
                trigrams:
                pre_chars:
                pre_bigrams:
                pre_trigrams:
                seg_targets:
                seg_masks:
                seq_lens:
                char_labels:
                char_heads:
                gold_word_pairs:
                seg_targets:
                seg_masks:
                char_labels:
                char_heads:
                pun_masks:
                gold_label_word_pairs:
        """
        paths = check_loader_paths(paths)
        data = DataBundle()

        for name, path in paths.items():
            dataset = self.load(path)
            data.datasets[name] = dataset

        char_labels_vocab = Vocabulary(padding=None, unknown=None)

        def process(dataset, char_label_vocab):
            dataset.apply(add_word_lst, new_field_name='word_lst')
            dataset.apply(lambda x: list(chain(*x['word_lst'])),
                          new_field_name='chars')
            dataset.apply(add_bigram,
                          field_name='chars',
                          new_field_name='bigrams')
            dataset.apply(add_trigram,
                          field_name='chars',
                          new_field_name='trigrams')
            dataset.apply(add_char_heads, new_field_name='char_heads')
            dataset.apply(add_char_labels, new_field_name='char_labels')
            dataset.apply(add_segs, new_field_name='seg_targets')
            dataset.apply(add_mask, new_field_name='seg_masks')
            dataset.add_seq_len('chars', new_field_name='seq_lens')
            dataset.apply(add_pun_masks, new_field_name='pun_masks')
            if len(char_label_vocab.word_count) == 0:
                char_label_vocab.from_dataset(dataset,
                                              field_name='char_labels')
            char_label_vocab.index_dataset(dataset, field_name='char_labels')
            new_dataset = add_root(dataset)
            new_dataset.apply(add_word_pairs,
                              new_field_name='gold_word_pairs',
                              ignore_type=True)
            # Bind the label vocabulary with functools.partial; rebinding the
            # module-level name via `global` would wrap it again on each call.
            add_label_word_pairs_fn = partial(add_label_word_pairs,
                                              label_vocab=char_label_vocab)
            new_dataset.apply(add_label_word_pairs_fn,
                              new_field_name='gold_label_word_pairs',
                              ignore_type=True)

            new_dataset.set_pad_val('char_labels', -1)
            new_dataset.set_pad_val('char_heads', -1)

            return new_dataset

        for name in list(paths.keys()):
            dataset = data.datasets[name]
            dataset = process(dataset, char_labels_vocab)
            data.datasets[name] = dataset

        data.vocabs['char_labels'] = char_labels_vocab

        char_vocab = Vocabulary(min_freq=2).from_dataset(
            data.datasets['train'],
            field_name='chars',
            no_create_entry_dataset=[
                data.get_dataset('dev'),
                data.get_dataset('test')
            ])
        bigram_vocab = Vocabulary(min_freq=3).from_dataset(
            data.datasets['train'],
            field_name='bigrams',
            no_create_entry_dataset=[
                data.get_dataset('dev'),
                data.get_dataset('test')
            ])
        trigram_vocab = Vocabulary(min_freq=5).from_dataset(
            data.datasets['train'],
            field_name='trigrams',
            no_create_entry_dataset=[
                data.get_dataset('dev'),
                data.get_dataset('test')
            ])

        for name in ['chars', 'bigrams', 'trigrams']:
            # No positional dataset is passed to from_dataset here, so every
            # word enters the 'pre_*' vocabularies as a no-create-entry word
            # (suitable for indexing pretrained embeddings).
            vocab = Vocabulary().from_dataset(field_name=name,
                                              no_create_entry_dataset=list(
                                                  data.datasets.values()))
            vocab.index_dataset(*data.datasets.values(),
                                field_name=name,
                                new_field_name='pre_' + name)
            data.vocabs['pre_{}'.format(name)] = vocab

        for name, vocab in zip(['chars', 'bigrams', 'trigrams'],
                               [char_vocab, bigram_vocab, trigram_vocab]):
            vocab.index_dataset(*data.datasets.values(),
                                field_name=name,
                                new_field_name=name)
            data.vocabs[name] = vocab

        for name, dataset in data.datasets.items():
            dataset.set_input('chars', 'bigrams', 'trigrams', 'seq_lens',
                              'char_labels', 'char_heads', 'pre_chars',
                              'pre_bigrams', 'pre_trigrams')
            dataset.set_target('gold_word_pairs', 'seq_lens', 'seg_targets',
                               'seg_masks', 'char_labels', 'char_heads',
                               'pun_masks', 'gold_label_word_pairs')

        return data
Example #10
    def process(self, data_bundle: DataBundle):
        """
        可处理的DataSet应具备如下的field

        .. csv-table::
            :header: "raw_words", "target"

            "马晓旭意外受伤让国奥警惕 无奈大雨格外青睐殷家军记者傅亚雨沈阳报道 ... ", "体育"
            "...", "..."

        :param data_bundle:
        :return:
        """
        # Set the tags according to the granularity; the tag map is derived
        # from the dataset instead of using a fixed, hard-coded map.
        targets_vocabs = get_data_bundle_tags(data_bundle)
        self.tag_map = {tag_name: tag_name for tag_name in targets_vocabs}
        data_bundle = self._granularize(data_bundle=data_bundle,
                                        tag_map=self.tag_map)
        # clean,lower

        # Chinese word segmentation (tokenize)
        data_bundle = self._tokenize(data_bundle=data_bundle,
                                     field_name='raw_chars',
                                     new_field_name='chars')
        input_field_names = [Const.CHAR_INPUT]

        # n-grams
        if self.bigrams:
            for name, dataset in data_bundle.iter_datasets():
                dataset.apply_field(
                    lambda chars:
                    [c1 + c2 for c1, c2 in zip(chars, chars[1:] + ['<eos>'])],
                    field_name=Const.CHAR_INPUT,
                    new_field_name='bigrams')
            input_field_names.append('bigrams')
        if self.trigrams:
            for name, dataset in data_bundle.iter_datasets():
                dataset.apply_field(lambda chars: [
                    c1 + c2 + c3
                    for c1, c2, c3 in zip(chars, chars[1:] + ['<eos>'], chars[
                        2:] + ['<eos>'] * 2)
                ],
                                    field_name=Const.CHAR_INPUT,
                                    new_field_name='trigrams')
            input_field_names.append('trigrams')

        # index
        data_bundle = _indexize(data_bundle=data_bundle,
                                input_field_names=Const.CHAR_INPUT)
        # add length
        for name, dataset in data_bundle.datasets.items():
            dataset.add_seq_len(field_name=Const.CHAR_INPUT,
                                new_field_name=Const.INPUT_LEN)

        # field names to mark as input
        # input_fields = [Const.TARGET, Const.INPUT_LEN] + input_field_names
        input_fields = [Const.INPUT_LEN] + input_field_names
        target_fields = [Const.TARGET]

        data_bundle.set_input(*input_fields)
        data_bundle.set_target(*target_fields)

        return data_bundle
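The bigram/trigram construction above pairs each character with its successor(s), padding the tail with '<eos>'. A standalone trace (the input characters are made up):

    chars = ['我', '爱', '北', '京']
    bigrams = [c1 + c2 for c1, c2 in zip(chars, chars[1:] + ['<eos>'])]
    # ['我爱', '爱北', '北京', '京<eos>']
    trigrams = [c1 + c2 + c3 for c1, c2, c3
                in zip(chars, chars[1:] + ['<eos>'], chars[2:] + ['<eos>'] * 2)]
    # ['我爱北', '爱北京', '北京<eos>', '京<eos><eos>']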
Example #11
    def process(self, paths):
        def truncate_articles(instance,
                              max_nsents=self.max_nsents,
                              max_ntokens=self.max_ntokens):
            article = [
                ' '.join(sent.lower().split()[:max_ntokens])
                for sent in instance['article']
            ]
            return article[:max_nsents]

        def truncate_labels(instance):
            label = list(
                filter(lambda x: x < len(instance['article']),
                       instance['label']))
            return label

        def bert_tokenize(instance, tokenizer, max_len, pad_value):
            article = instance['article']
            article = ' [SEP] [CLS] '.join(article)
            word_pieces = tokenizer.tokenize(article)[:(max_len - 2)]
            word_pieces = ['[CLS]'] + word_pieces + ['[SEP]']
            token_ids = tokenizer.convert_tokens_to_ids(word_pieces)
            while len(token_ids) < max_len:
                token_ids.append(pad_value)
            assert len(token_ids) == max_len
            return token_ids

        def get_seg_id(instance, max_len, sep_id):
            _segs = [-1] + [
                i for i, idx in enumerate(instance['article']) if idx == sep_id
            ]
            segs = [_segs[i] - _segs[i - 1] for i in range(1, len(_segs))]
            segment_id = []
            for i, length in enumerate(segs):
                if i % 2 == 0:
                    segment_id += length * [0]
                else:
                    segment_id += length * [1]
            while len(segment_id) < max_len:
                segment_id.append(0)
            return segment_id

        def get_cls_id(instance, cls_id):
            classification_id = [
                i for i, idx in enumerate(instance['article']) if idx == cls_id
            ]
            return classification_id

        def get_labels(instance):
            labels = [0] * len(instance['cls_id'])
            label_idx = list(
                filter(lambda x: x < len(instance['cls_id']),
                       instance['label']))
            for idx in label_idx:
                labels[idx] = 1
            return labels

        datasets = {}
        for name in paths:
            datasets[name] = self._load(paths[name])

            # remove empty samples
            datasets[name].drop(
                lambda ins: len(ins['article']) == 0 or len(ins['label']) == 0)

            # truncate articles
            datasets[name].apply(lambda ins: truncate_articles(
                ins, self.max_nsents, self.max_ntokens),
                                 new_field_name='article')

            # truncate labels
            datasets[name].apply(truncate_labels, new_field_name='label')

            # tokenize and convert tokens to id
            datasets[name].apply(lambda ins: bert_tokenize(
                ins, self.tokenizer, self.max_len, self.pad_id),
                                 new_field_name='article')

            # get segment id
            datasets[name].apply(
                lambda ins: get_seg_id(ins, self.max_len, self.sep_id),
                new_field_name='segment_id')

            # get classification id
            datasets[name].apply(lambda ins: get_cls_id(ins, self.cls_id),
                                 new_field_name='cls_id')

            # get label
            datasets[name].apply(get_labels, new_field_name='label')

            # rename fields
            datasets[name].rename_field('article', Const.INPUTS(0))
            datasets[name].rename_field('segment_id', Const.INPUTS(1))
            datasets[name].rename_field('cls_id', Const.INPUTS(2))
            datasets[name].rename_field('label', Const.TARGET)

            # set input and target
            datasets[name].set_input(Const.INPUTS(0), Const.INPUTS(1),
                                     Const.INPUTS(2))
            datasets[name].set_target(Const.TARGET)

            # set padding value (the article field was renamed above)
            datasets[name].set_pad_val(Const.INPUTS(0), 0)

        return DataBundle(datasets=datasets)
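A worked trace of the get_seg_id helper above, outside the class (the token ids are made up; 102 stands in for BERT's [SEP] id):

    article = [101, 5, 6, 102, 101, 7, 102]
    sep_id = 102
    _segs = [-1] + [i for i, idx in enumerate(article) if idx == sep_id]
    # _segs = [-1, 3, 6]; sentence lengths = [4, 3]
    segs = [_segs[i] - _segs[i - 1] for i in range(1, len(_segs))]
    segment_id = []
    for i, length in enumerate(segs):
        segment_id += length * [i % 2]  # alternate 0/1 per sentence
    print(segment_id)  # [0, 0, 0, 0, 1, 1, 1], then zero-padded to max_len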