Example #1
    def __init__(self, encoding_type: str = 'bioes'):
        """

        :param str encoding_type: supports the bio and bioes tag schemes
        """
        super().__init__()
        self._loader = ConllLoader(headers=['raw_chars', 'target'],
                                   indexes=[0, 1])

        assert encoding_type in ('bio', 'bioes')

        self._tag_converters = [iob2]
        if encoding_type == 'bioes':
            self._tag_converters.append(iob2bioes)
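A hedged, standalone sketch of what the _tag_converters chain above does: to_bio2 and bio_to_bioes below are simplified stand-ins for fastNLP's iob2 and iob2bioes helpers (the names and exact behaviour of the library functions are assumptions here), applied to a toy IOB1 tag sequence.

def to_bio2(tags):
    """Convert IOB1 tags to BIO2: every entity must start with a B- tag."""
    out = list(tags)
    for i, tag in enumerate(out):
        if tag.startswith('I-'):
            prev = out[i - 1] if i > 0 else 'O'
            # an I- tag that does not continue the previous entity opens a new one
            if prev == 'O' or prev[2:] != tag[2:]:
                out[i] = 'B-' + tag[2:]
    return out

def bio_to_bioes(tags):
    """Convert BIO2 tags to BIOES by marking single-token (S-) and entity-final (E-) tags."""
    out = []
    for i, tag in enumerate(tags):
        nxt = tags[i + 1] if i + 1 < len(tags) else 'O'
        if tag.startswith('B-'):
            out.append(('B-' if nxt.startswith('I-') else 'S-') + tag[2:])
        elif tag.startswith('I-'):
            out.append(('I-' if nxt.startswith('I-') else 'E-') + tag[2:])
        else:
            out.append(tag)
    return out

tags = ['I-PER', 'I-PER', 'O', 'I-LOC']
print(to_bio2(tags))                # ['B-PER', 'I-PER', 'O', 'B-LOC']
print(bio_to_bioes(to_bio2(tags)))  # ['B-PER', 'E-PER', 'O', 'S-LOC']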
Example #2
    def __init__(self, task: str = 'ner', encoding_type: str = 'bioes'):
        """
        Load an English corpus in the CoNLL-2003 format; information about the dataset can be found at
            https://www.clips.uantwerpen.be/conll2003/ner/. When task is pos, target in the returned DataSet is
            taken from column 2; when task is chunk, from column 3; when task is ner, from column 4. All
            "-DOCSTART- -X- O O" lines are skipped, so the dataset ends up smaller than the numbers reported in
            much of the literature; since "-DOCSTART- -X- O O" is only a document separator and should not be
            predicted, lines starting with -DOCSTART- are ignored.
        For the ner and chunk tasks the loaded target follows encoding_type; for pos the target is the pos column as-is.

        :param task: the tagging task to load, one of ner, pos, chunk
        :param encoding_type: tag scheme for the ner and chunk targets, bio or bioes
        """
        assert task in ('ner', 'pos', 'chunk')
        index = {'ner': 3, 'pos': 1, 'chunk': 2}[task]
        self._loader = ConllLoader(headers=['raw_words', 'target'],
                                   indexes=[0, index])
        self._tag_converters = []
        if task in ('ner', 'chunk'):
            self._tag_converters = [iob2]
            if encoding_type == 'bioes':
                self._tag_converters.append(iob2bioes)
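To make the indexes=[0, index] choice concrete, here is a minimal sketch; the sample line is a made-up CoNLL-2003 token row with columns word, POS, chunk, NER.

line = "EU NNP B-NP B-ORG"
columns = line.split()
task_to_column = {'ner': 3, 'pos': 1, 'chunk': 2}
for task, idx in task_to_column.items():
    # column 0 feeds raw_words, column idx feeds target for that task
    print(task, columns[0], columns[idx])
# ner EU B-ORG
# pos EU NNP
# chunk EU B-NP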
Example #3
def prepare_ptb(args):
    datas = {}
    datas["pos"] = (ConllLoader(headers=["words", "pos"],
                                indexes=[0, 1]).load(args.pos).datasets)
    chunk_data = (ConllLoader(headers=["words", "chunk"],
                              indexes=[0, 2]).load(args.chunk).datasets)
    chunk_data['train'], chunk_data['dev'] = chunk_data['train'].split(0.1)
    datas['chunk'] = chunk_data
    datas["ner"] = (ConllLoader(headers=["words", "ner"],
                                indexes=[0, 3]).load(args.ner).datasets)

    for ds in datas['chunk'].values():
        ds.apply_field(lambda x: iob2(x), 'chunk', 'chunk')
    for ds in datas['ner'].values():
        ds.apply_field(lambda x: iob2bioes(iob2(x)), 'ner', 'ner')

    vocabs = {}
    src_vocab = Vocabulary()
    for idx, task_name in enumerate(["pos", "chunk", "ner"]):
        data = datas[task_name]
        filter_docstart(data)
        vocab = Vocabulary(padding=None, unknown=None)
        vocab.from_dataset(*list(data.values()), field_name=task_name)
        src_vocab.from_dataset(*list(data.values()), field_name="words")
        vocabs[task_name] = vocab

    task_lst = []
    for idx, task_name in enumerate(["pos", "chunk", "ner"]):
        data = datas[task_name]
        src_vocab.index_dataset(*list(data.values()),
                                field_name="words",
                                new_field_name="words")
        vocabs[task_name].index_dataset(*list(data.values()),
                                        field_name=task_name,
                                        new_field_name=task_name)
        for ds in data.values():
            ds.apply_field(len, 'words', 'seq_len')
        task_lst.append(
            Task(idx, task_name, data["train"], data["dev"], data["test"]))
    vocabs["words"] = src_vocab
    return task_lst, vocabs
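A hypothetical driver for prepare_ptb; the attribute names on the namespace match the function (pos, chunk, ner), but the file paths are assumptions.

from argparse import Namespace

args = Namespace(pos='data/ptb_pos.conll',
                 chunk='data/conll2000_chunk.txt',
                 ner='data/conll2003_ner.txt')
task_lst, vocabs = prepare_ptb(args)
for name, vocab in vocabs.items():
    print(name, len(vocab))  # vocabulary sizes for pos, chunk, ner and words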
Example #4
    def load(self, path: str) -> DataSet:
        """
        Given a file path, read the data. The returned DataSet contains the following fields:
            raw_words: List[str]
            target: List[str]

        :param path:
        :return:
        """
        dataset = ConllLoader(headers=['raw_words', 'target'],
                              indexes=[3, 10]).load(path)

        def convert_to_bio(tags):
            bio_tags = []
            flag = None
            for tag in tags:
                label = tag.strip("()*")
                if '(' in tag:
                    bio_label = 'B-' + label
                    flag = label
                elif flag:
                    bio_label = 'I-' + flag
                else:
                    bio_label = 'O'
                if ')' in tag:
                    flag = None
                bio_tags.append(bio_label)
            return self.encoding_method(bio_tags)

        def convert_word(words):
            converted_words = []
            for word in words:
                word = word.replace('/.', '.')  # some trailing periods appear as "/."
                if not word.startswith('-'):
                    converted_words.append(word)
                    continue
                # these bracket tokens are escaped in the corpus; map them back to the original symbols
                tfrs = {
                    '-LRB-': '(',
                    '-RRB-': ')',
                    '-LSB-': '[',
                    '-RSB-': ']',
                    '-LCB-': '{',
                    '-RCB-': '}'
                }
                if word in tfrs:
                    converted_words.append(tfrs[word])
                else:
                    converted_words.append(word)
            return converted_words

        dataset.apply_field(convert_word,
                            field_name='raw_words',
                            new_field_name='raw_words')
        dataset.apply_field(convert_to_bio,
                            field_name='target',
                            new_field_name='target')

        return dataset
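A self-contained sketch of the bracket-to-BIO conversion performed by convert_to_bio above (the final self.encoding_method step is omitted), run on a made-up OntoNotes-style tag column.

def bracket_tags_to_bio(tags):
    bio_tags, flag = [], None
    for tag in tags:
        label = tag.strip("()*")
        if '(' in tag:
            bio_label = 'B-' + label  # an opening bracket starts an entity
            flag = label
        elif flag:
            bio_label = 'I-' + flag   # still inside the currently open entity
        else:
            bio_label = 'O'
        if ')' in tag:
            flag = None               # a closing bracket ends the entity
        bio_tags.append(bio_label)
    return bio_tags

print(bracket_tags_to_bio(['(PERSON*', '*)', '*', '(GPE)']))
# ['B-PERSON', 'I-PERSON', 'O', 'B-GPE']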
Example #5
class Conll2003DataLoader(DataSetLoader):
    def __init__(self, task: str = 'ner', encoding_type: str = 'bioes'):
        """
        Load an English corpus in the CoNLL-2003 format; information about the dataset can be found at
            https://www.clips.uantwerpen.be/conll2003/ner/. When task is pos, target in the returned DataSet is
            taken from column 2; when task is chunk, from column 3; when task is ner, from column 4. All
            "-DOCSTART- -X- O O" lines are skipped, so the dataset ends up smaller than the numbers reported in
            much of the literature; since "-DOCSTART- -X- O O" is only a document separator and should not be
            predicted, lines starting with -DOCSTART- are ignored.
        For the ner and chunk tasks the loaded target follows encoding_type; for pos the target is the pos column as-is.

        :param task: the tagging task to load, one of ner, pos, chunk
        :param encoding_type: tag scheme for the ner and chunk targets, bio or bioes
        """
        assert task in ('ner', 'pos', 'chunk')
        index = {'ner': 3, 'pos': 1, 'chunk': 2}[task]
        self._loader = ConllLoader(headers=['raw_words', 'target'],
                                   indexes=[0, index])
        self._tag_converters = []
        if task in ('ner', 'chunk'):
            self._tag_converters = [iob2]
            if encoding_type == 'bioes':
                self._tag_converters.append(iob2bioes)

    def load(self, path: str):
        dataset = self._loader.load(path)

        def convert_tag_schema(tags):
            for converter in self._tag_converters:
                tags = converter(tags)
            return tags

        if self._tag_converters:
            dataset.apply_field(convert_tag_schema,
                                field_name=Const.TARGET,
                                new_field_name=Const.TARGET)
        return dataset

    def process(self,
                paths: Union[str, Dict[str, str]],
                word_vocab_opt: VocabularyOption = None,
                lower: bool = False):
        """
        Read and process the data. Lines starting with '-DOCSTART-' are ignored.

        :param paths:
        :param word_vocab_opt: options used to initialize the word vocabulary
        :param lower: whether to lowercase all words.
        :return:
        """
        # read the data
        paths = check_dataloader_paths(paths)
        data = DataInfo()
        input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN]
        target_fields = [Const.TARGET, Const.INPUT_LEN]
        for name, path in paths.items():
            dataset = self.load(path)
            dataset.apply_field(lambda words: words,
                                field_name='raw_words',
                                new_field_name=Const.INPUT)
            if lower:
                dataset.words.lower()
            data.datasets[name] = dataset

        # construct the word vocabulary
        word_vocab = Vocabulary(
            min_freq=2) if word_vocab_opt is None else Vocabulary(
                **word_vocab_opt)
        word_vocab.from_dataset(data.datasets['train'],
                                field_name=Const.INPUT,
                                no_create_entry_dataset=[
                                    dataset
                                    for name, dataset in data.datasets.items()
                                    if name != 'train'
                                ])
        word_vocab.index_dataset(*data.datasets.values(),
                                 field_name=Const.INPUT,
                                 new_field_name=Const.INPUT)
        data.vocabs[Const.INPUT] = word_vocab

        # cap words
        cap_word_vocab = Vocabulary()
        cap_word_vocab.from_dataset(
            data.datasets['train'],
            field_name='raw_words',
            no_create_entry_dataset=[
                dataset for name, dataset in data.datasets.items()
                if name != 'train'
            ])
        cap_word_vocab.index_dataset(*data.datasets.values(),
                                     field_name='raw_words',
                                     new_field_name='cap_words')
        input_fields.append('cap_words')
        data.vocabs['cap_words'] = cap_word_vocab

        # build the target vocabulary
        target_vocab = Vocabulary(unknown=None, padding=None)
        target_vocab.from_dataset(*data.datasets.values(),
                                  field_name=Const.TARGET)
        target_vocab.index_dataset(*data.datasets.values(),
                                   field_name=Const.TARGET)
        data.vocabs[Const.TARGET] = target_vocab

        for name, dataset in data.datasets.items():
            dataset.add_seq_len(Const.INPUT, new_field_name=Const.INPUT_LEN)
            dataset.set_input(*input_fields)
            dataset.set_target(*target_fields)

        return data
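A hypothetical end-to-end call for the loader above; the file paths are assumptions, and Const.TARGET is the same fastNLP constant used inside the class.

loader = Conll2003DataLoader(task='ner', encoding_type='bioes')
data_info = loader.process({'train': 'conll2003/train.txt',
                            'dev': 'conll2003/dev.txt',
                            'test': 'conll2003/test.txt'},
                           lower=True)
print(data_info.datasets['train'])          # processed DataSets keyed by split
print(len(data_info.vocabs[Const.TARGET]))  # number of BIOES tags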
Example #6
 def process_from_file(self, paths):
     paths = check_loader_paths(paths)
     loader = ConllLoader(headers=['raw_chars', 'target'])
     data_bundle = loader.load(paths)
     return self.process(data_bundle)
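A hedged usage note: check_loader_paths also accepts a single folder (or file) path and expands it into a {'train': ..., 'dev': ..., 'test': ...} dict, so the call below works either way; pipe stands for an instance of the enclosing pipe class (not shown in the snippet) and the folder name is an assumption.

data_bundle = pipe.process_from_file('data/msra_ner/')  # or a dict of explicit paths
print(data_bundle)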
Example #7
 def __init__(self):
     self._loader = ConllLoader(
         headers=['words', 'pos_tags', 'heads', 'labels'],
         indexes=[1, 3, 6, 7])
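A runnable sketch of which CoNLL-X columns indexes=[1, 3, 6, 7] pick out (FORM, coarse POS, HEAD and DEPREL), using one line from the CTB example further below.

line = "3\t授予\t_\tVV\tVV\t_\t0\troot\t_\t_"
cols = line.split('\t')
print([cols[i] for i in (1, 3, 6, 7)])  # ['授予', 'VV', '0', 'root']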
Example #8
class CTBxJointPipe(Pipe):
    """
    The folder is expected to contain the following files:
        -train.conllx
        -dev.conllx
        -test.conllx
    Each file has the following content (sentences are separated by blank lines):
        1	费孝通	_	NR	NR	_	3	nsubjpass	_	_
        2	被	_	SB	SB	_	3	pass	_	_
        3	授予	_	VV	VV	_	0	root	_	_
        4	麦格赛赛	_	NR	NR	_	5	nn	_	_
        5	奖	_	NN	NN	_	3	dobj	_	_

        1	新华社	_	NR	NR	_	7	dep	_	_
        2	马尼拉	_	NR	NR	_	7	dep	_	_
        3	8月	_	NT	NT	_	7	dep	_	_
        4	31日	_	NT	NT	_	7	dep	_	_
        ...

    """
    def __init__(self):
        self._loader = ConllLoader(
            headers=['words', 'pos_tags', 'heads', 'labels'],
            indexes=[1, 3, 6, 7])

    def load(self, path: str):
        """
        Given a file path, read the data into a DataSet. The DataSet contains the following fields:
        words: list[str]
        pos_tags: list[str]
        heads: list[int]
        labels: list[str]

        :param path:
        :return:
        """
        dataset = self._loader._load(path)
        dataset.heads.int()
        return dataset

    def process_from_file(self, paths):
        """
        
        :param paths: 
        :return:
            The DataSet contains the following fields:
                chars:
                bigrams:
                trigrams:
                pre_chars:
                pre_bigrams:
                pre_trigrams:
                seg_targets:
                seg_masks:
                seq_lens:
                char_labels:
                char_heads:
                gold_word_pairs:
                pun_masks:
                gold_label_word_pairs:
        """
        paths = check_loader_paths(paths)
        data = DataBundle()

        for name, path in paths.items():
            dataset = self.load(path)
            data.datasets[name] = dataset

        char_labels_vocab = Vocabulary(padding=None, unknown=None)

        def process(dataset, char_label_vocab):
            dataset.apply(add_word_lst, new_field_name='word_lst')
            dataset.apply(lambda x: list(chain(*x['word_lst'])),
                          new_field_name='chars')
            dataset.apply(add_bigram,
                          field_name='chars',
                          new_field_name='bigrams')
            dataset.apply(add_trigram,
                          field_name='chars',
                          new_field_name='trigrams')
            dataset.apply(add_char_heads, new_field_name='char_heads')
            dataset.apply(add_char_labels, new_field_name='char_labels')
            dataset.apply(add_segs, new_field_name='seg_targets')
            dataset.apply(add_mask, new_field_name='seg_masks')
            dataset.add_seq_len('chars', new_field_name='seq_lens')
            dataset.apply(add_pun_masks, new_field_name='pun_masks')
            if len(char_label_vocab.word_count) == 0:
                char_label_vocab.from_dataset(dataset,
                                              field_name='char_labels')
            char_label_vocab.index_dataset(dataset, field_name='char_labels')
            new_dataset = add_root(dataset)
            new_dataset.apply(add_word_pairs,
                              new_field_name='gold_word_pairs',
                              ignore_type=True)
            global add_label_word_pairs
            add_label_word_pairs = partial(add_label_word_pairs,
                                           label_vocab=char_label_vocab)
            new_dataset.apply(add_label_word_pairs,
                              new_field_name='gold_label_word_pairs',
                              ignore_type=True)

            new_dataset.set_pad_val('char_labels', -1)
            new_dataset.set_pad_val('char_heads', -1)

            return new_dataset

        for name in list(paths.keys()):
            dataset = data.datasets[name]
            dataset = process(dataset, char_labels_vocab)
            data.datasets[name] = dataset

        data.vocabs['char_labels'] = char_labels_vocab

        char_vocab = Vocabulary(min_freq=2).from_dataset(
            data.datasets['train'],
            field_name='chars',
            no_create_entry_dataset=[
                data.get_dataset('dev'),
                data.get_dataset('test')
            ])
        bigram_vocab = Vocabulary(min_freq=3).from_dataset(
            data.datasets['train'],
            field_name='bigrams',
            no_create_entry_dataset=[
                data.get_dataset('dev'),
                data.get_dataset('test')
            ])
        trigram_vocab = Vocabulary(min_freq=5).from_dataset(
            data.datasets['train'],
            field_name='trigrams',
            no_create_entry_dataset=[
                data.get_dataset('dev'),
                data.get_dataset('test')
            ])

        for name in ['chars', 'bigrams', 'trigrams']:
            vocab = Vocabulary().from_dataset(field_name=name,
                                              no_create_entry_dataset=list(
                                                  data.datasets.values()))
            vocab.index_dataset(*data.datasets.values(),
                                field_name=name,
                                new_field_name='pre_' + name)
            data.vocabs['pre_{}'.format(name)] = vocab

        for name, vocab in zip(['chars', 'bigrams', 'trigrams'],
                               [char_vocab, bigram_vocab, trigram_vocab]):
            vocab.index_dataset(*data.datasets.values(),
                                field_name=name,
                                new_field_name=name)
            data.vocabs[name] = vocab

        for name, dataset in data.datasets.items():
            dataset.set_input('chars', 'bigrams', 'trigrams', 'seq_lens',
                              'char_labels', 'char_heads', 'pre_chars',
                              'pre_bigrams', 'pre_trigrams')
            dataset.set_target('gold_word_pairs', 'seq_lens', 'seg_targets',
                               'seg_masks', 'char_labels', 'char_heads',
                               'pun_masks', 'gold_label_word_pairs')

        return data
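A hypothetical usage of the pipe above; the folder layout (train/dev/test .conllx files) follows the class docstring, while the path itself is an assumption.

pipe = CTBxJointPipe()
data = pipe.process_from_file('data/ctb_joint/')
for name, ds in data.datasets.items():
    print(name, len(ds))  # number of sentences per split
print(len(data.vocabs['chars']), len(data.vocabs['char_labels']))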
Example #9
class ChineseNERLoader(DataSetLoader):
    """
    Load Chinese named-entity-recognition datasets, including PeopleDaily, MSRA-NER and Weibo. The data can be found at https://github.com/OYE93/Chinese-NLP-Corpus/tree/master/NER
    Make sure the input data is formatted as below: two columns, the first holding the character and the second the tag, with sentences separated by blank lines
        我 O
        们 O
        变 O
        而 O
        以 O
        书 O
        会 O
        ...

    """
    def __init__(self, encoding_type: str = 'bioes'):
        """

        :param str encoding_type: supports the bio and bioes tag schemes
        """
        super().__init__()
        self._loader = ConllLoader(headers=['raw_chars', 'target'],
                                   indexes=[0, 1])

        assert encoding_type in ('bio', 'bioes')

        self._tag_converters = [iob2]
        if encoding_type == 'bioes':
            self._tag_converters.append(iob2bioes)

    def load(self, path: str):
        dataset = self._loader.load(path)

        def convert_tag_schema(tags):
            for converter in self._tag_converters:
                tags = converter(tags)
            return tags

        if self._tag_converters:
            dataset.apply_field(convert_tag_schema,
                                field_name=Const.TARGET,
                                new_field_name=Const.TARGET)
        return dataset

    def process(self, paths, bigrams=False, trigrams=False):
        """

        :param paths:
        :param bool bigrams: whether to generate a bigram feature, [a, b, c, d] -> [ab, bc, cd, d<eos>]
        :param bool trigrams: whether to generate a trigram feature, [a, b, c, d] -> [abc, bcd, cd<eos>, d<eos><eos>]
        :return: DataBundle
            containing the following fields
                raw_chars: List[str]
                chars: List[int]
                seq_len: int, number of characters
                bigrams: List[int], optional
                trigrams: List[int], optional
                target: List[int]
        """
        paths = check_dataloader_paths(paths)
        data = DataBundle()
        input_fields = [Const.CHAR_INPUT, Const.INPUT_LEN, Const.TARGET]
        target_fields = [Const.TARGET, Const.INPUT_LEN]

        for name, path in paths.items():
            dataset = self.load(path)
            if bigrams:
                dataset.apply_field(lambda raw_chars: [
                    c1 + c2
                    for c1, c2 in zip(raw_chars, raw_chars[1:] + ['<eos>'])
                ],
                                    field_name='raw_chars',
                                    new_field_name='bigrams')

            if trigrams:
                dataset.apply_field(lambda raw_chars: [
                    c1 + c2 + c3
                    for c1, c2, c3 in zip(raw_chars, raw_chars[1:] + ['<eos>'],
                                          raw_chars[2:] + ['<eos>'] * 2)
                ],
                                    field_name='raw_chars',
                                    new_field_name='trigrams')
            data.datasets[name] = dataset

        char_vocab = Vocabulary().from_dataset(
            data.datasets['train'],
            field_name='raw_chars',
            no_create_entry_dataset=[
                dataset for name, dataset in data.datasets.items()
                if name != 'train'
            ])
        char_vocab.index_dataset(*data.datasets.values(),
                                 field_name='raw_chars',
                                 new_field_name=Const.CHAR_INPUT)
        data.vocabs[Const.CHAR_INPUT] = char_vocab

        target_vocab = Vocabulary(unknown=None, padding=None).from_dataset(
            data.datasets['train'], field_name=Const.TARGET)
        target_vocab.index_dataset(*data.datasets.values(),
                                   field_name=Const.TARGET)
        data.vocabs[Const.TARGET] = target_vocab

        if bigrams:
            bigram_vocab = Vocabulary().from_dataset(
                data.datasets['train'],
                field_name='bigrams',
                no_create_entry_dataset=[
                    dataset for name, dataset in data.datasets.items()
                    if name != 'train'
                ])
            bigram_vocab.index_dataset(*data.datasets.values(),
                                       field_name='bigrams',
                                       new_field_name='bigrams')
            data.vocabs['bigrams'] = bigram_vocab
            input_fields.append('bigrams')

        if trigrams:
            trigram_vocab = Vocabulary().from_dataset(
                data.datasets['train'],
                field_name='trigrams',
                no_create_entry_dataset=[
                    dataset for name, dataset in data.datasets.items()
                    if name != 'train'
                ])
            trigram_vocab.index_dataset(*data.datasets.values(),
                                        field_name='trigrams',
                                        new_field_name='trigrams')
            data.vocabs['trigrams'] = trigram_vocab
            input_fields.append('trigrams')

        for name, dataset in data.datasets.items():
            dataset.add_seq_len(Const.CHAR_INPUT)
            dataset.set_input(*input_fields)
            dataset.set_target(*target_fields)

        return data
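A standalone sketch of the bigram/trigram features built in process above, using the same zip construction on a made-up character sequence (no fastNLP required).

raw_chars = ['我', '们', '变', '而']
bigrams = [c1 + c2 for c1, c2 in zip(raw_chars, raw_chars[1:] + ['<eos>'])]
trigrams = [c1 + c2 + c3 for c1, c2, c3 in zip(raw_chars,
                                               raw_chars[1:] + ['<eos>'],
                                               raw_chars[2:] + ['<eos>'] * 2)]
print(bigrams)   # ['我们', '们变', '变而', '而<eos>']
print(trigrams)  # ['我们变', '们变而', '变而<eos>', '而<eos><eos>']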