def input_with_span_attr(datasets, vocabs):
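    # Derive three auxiliary label fields from the BMES-style 'target' tags:
    #   span_label       - the boundary prefix of each tag (its first character)
    #   attr_start_label - the attribute type (tag[2:]) where a span starts (S/B tags), else ATTR_NULL_TAG
    #   attr_end_label   - the attribute type (tag[2:]) where a span ends (S/E tags), else ATTR_NULL_TAG
    # Two vocabularies are then built from the training split; ATTR_NULL_TAG is assumed
    # to be a placeholder constant defined in the surrounding module.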
    datasets['train'].apply_field(lambda x: list(map(lambda y: y[0], x)), field_name='target',
                                  new_field_name='span_label')
    if 'dev' in datasets:
        datasets['dev'].apply_field(lambda x: list(map(lambda y: y[0], x)), field_name='target',
                                    new_field_name='span_label')
    datasets['test'].apply_field(lambda x: list(map(lambda y: y[0], x)), field_name='target',
                                 new_field_name='span_label')

    datasets['train'].apply_field(lambda x: list(map(lambda y: y[2:] if y[0] in ['S', 'B'] else ATTR_NULL_TAG, x)),
                                  field_name='target', new_field_name='attr_start_label')
    if 'dev' in datasets:
        datasets['dev'].apply_field(lambda x: list(map(lambda y: y[2:] if y[0] in ['S', 'B'] else ATTR_NULL_TAG, x)),
                                    field_name='target', new_field_name='attr_start_label')
    datasets['test'].apply_field(lambda x: list(map(lambda y: y[2:] if y[0] in ['S', 'B'] else ATTR_NULL_TAG, x)),
                                 field_name='target', new_field_name='attr_start_label')

    datasets['train'].apply_field(lambda x: list(map(lambda y: y[2:] if y[0] in ['S', 'E'] else ATTR_NULL_TAG, x)),
                                  field_name='target', new_field_name='attr_end_label')
    if 'dev' in datasets:
        datasets['dev'].apply_field(lambda x: list(map(lambda y: y[2:] if y[0] in ['S', 'E'] else ATTR_NULL_TAG, x)),
                                    field_name='target', new_field_name='attr_end_label')
    datasets['test'].apply_field(lambda x: list(map(lambda y: y[2:] if y[0] in ['S', 'E'] else ATTR_NULL_TAG, x)),
                                 field_name='target', new_field_name='attr_end_label')

    span_label_vocab = Vocabulary()
    attr_label_vocab = Vocabulary()
    span_label_vocab.from_dataset(datasets['train'], field_name='span_label')
    attr_label_vocab.from_dataset(datasets['train'], field_name=['attr_start_label', 'attr_end_label'])
    vocabs['span_label'] = span_label_vocab
    vocabs['attr_label'] = attr_label_vocab
    print(f"span label: {span_label_vocab.word2idx.keys()}")
    print(f"attr label: {attr_label_vocab.word2idx.keys()}")
    return datasets, vocabs
Example #2
    def process(self,
                paths: Union[str, Dict[str, str]],
                train_ds: Iterable[str] = None,
                src_vocab_op: VocabularyOption = None,
                tgt_vocab_op: VocabularyOption = None,
                embed_opt: EmbeddingOption = None,
                char_level_op=False):
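        # Load the datasets, optionally re-tokenize words into characters, build the word
        # vocabulary (when char_level_op is False) and the target vocabulary, index the
        # datasets, carve a dev set off the training data and return the resulting DataInfo.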
        paths = check_dataloader_paths(paths)
        datasets = {}
        info = DataInfo(datasets=self.load(paths))
        src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(
            **src_vocab_op)
        tgt_vocab = Vocabulary(unknown=None, padding=None) \
            if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op)
        _train_ds = [info.datasets[name] for name in train_ds
                     ] if train_ds else info.datasets.values()

        def wordtochar(words):
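            # lower-case every word and flatten it into a character sequence,
            # inserting an empty string between consecutive words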
            chars = []
            for word in words:
                word = word.lower()
                for char in word:
                    chars.append(char)
                chars.append('')
            chars.pop()
            return chars

        input_name, target_name = 'words', 'target'
        info.vocabs = {}
        # split the words into characters
        if char_level_op:
            for dataset in info.datasets.values():
                dataset.apply_field(wordtochar,
                                    field_name="words",
                                    new_field_name='chars')
        # if embed_opt is not None:
        #     embed = EmbedLoader.load_with_vocab(**embed_opt, vocab=vocab)
        #     info.embeddings['words'] = embed
        else:
            src_vocab.from_dataset(*_train_ds, field_name=input_name)
            src_vocab.index_dataset(*info.datasets.values(),
                                    field_name=input_name,
                                    new_field_name=input_name)
            info.vocabs[input_name] = src_vocab

        tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
        tgt_vocab.index_dataset(*info.datasets.values(),
                                field_name=target_name,
                                new_field_name=target_name)

        info.vocabs[target_name] = tgt_vocab

        info.datasets['train'], info.datasets['dev'] = info.datasets[
            'train'].split(0.1, shuffle=False)

        for name, dataset in info.datasets.items():
            dataset.set_input("words")
            dataset.set_target("target")

        return info
Example #3
    def test_bert_embed_eq_bert_piece_encoder(self):
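        # BertEmbedding with pool_method='first' and include_cls_sep=True should give
        # the same vectors as running BertWordPieceEncoder on the same sentences.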
        ds = DataSet({
            'words':
            ["this is a texta model vocab".split(), 'this is'.split()]
        })
        encoder = BertWordPieceEncoder(
            model_dir_or_name='test/data_for_tests/embedding/small_bert')
        encoder.eval()
        encoder.index_datasets(ds, field_name='words')
        word_pieces = torch.LongTensor(ds['word_pieces'].get([0, 1]))
        word_pieces_res = encoder(word_pieces)

        vocab = Vocabulary()
        vocab.from_dataset(ds, field_name='words')
        vocab.index_dataset(ds, field_name='words', new_field_name='words')
        ds.set_input('words')
        words = torch.LongTensor(ds['words'].get([0, 1]))
        embed = BertEmbedding(
            vocab,
            model_dir_or_name='test/data_for_tests/embedding/small_bert',
            pool_method='first',
            include_cls_sep=True,
            pooled_cls=False,
            min_freq=1)
        embed.eval()
        words_res = embed(words)

        # check that the word-piece handling works correctly
        self.assertEqual((word_pieces_res[0, :5] - words_res[0, :5]).sum(), 0)
        self.assertEqual((word_pieces_res[0, 6:] - words_res[0, 5:]).sum(), 0)
        self.assertEqual((word_pieces_res[1, :3] - words_res[1, :3]).sum(), 0)
Example #4
    def test_roberta_embed_eq_roberta_piece_encoder(self):
        # mainly check that the embedding results match those of the word-piece encoder
        weight_path = 'test/data_for_tests/embedding/small_roberta'
        ds = DataSet({
            'words': ["this is a texta a sentence".split(), 'this is'.split()]
        })
        encoder = RobertaWordPieceEncoder(model_dir_or_name=weight_path)
        encoder.eval()
        encoder.index_datasets(ds, field_name='words')
        word_pieces = torch.LongTensor(ds['word_pieces'].get([0, 1]))
        word_pieces_res = encoder(word_pieces)

        vocab = Vocabulary()
        vocab.from_dataset(ds, field_name='words')
        vocab.index_dataset(ds, field_name='words', new_field_name='words')
        ds.set_input('words')
        words = torch.LongTensor(ds['words'].get([0, 1]))
        embed = RobertaEmbedding(vocab,
                                 model_dir_or_name=weight_path,
                                 pool_method='first',
                                 include_cls_sep=True,
                                 pooled_cls=False)
        embed.eval()
        words_res = embed(words)

        # check that the word-piece handling works correctly
        self.assertEqual((word_pieces_res[0, :5] - words_res[0, :5]).sum(), 0)
        self.assertEqual((word_pieces_res[0, 6:] - words_res[0, 5:]).sum(), 0)
        self.assertEqual((word_pieces_res[1, :3] - words_res[1, :3]).sum(), 0)
Example #5
    def process(self,
                paths: Union[str, Dict[str, str]],
                src_vocab_opt: VocabularyOption = None,
                tgt_vocab_opt: VocabularyOption = None,
                src_embed_opt: EmbeddingOption = None,
                char_level_op=False):
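        # Same pipeline as the loader above: load every split, optionally convert words
        # to characters, split off a dev set, build and apply the word/target vocabularies
        # and attach an optional pretrained embedding to the returned DataBundle.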

        datasets = {}
        info = DataBundle()
        paths = check_dataloader_paths(paths)
        for name, path in paths.items():
            dataset = self.load(path)
            datasets[name] = dataset

        def wordtochar(words):
            chars = []
            for word in words:
                word = word.lower()
                for char in word:
                    chars.append(char)
                chars.append('')
            chars.pop()
            return chars

        if char_level_op:
            for dataset in datasets.values():
                dataset.apply_field(wordtochar,
                                    field_name="words",
                                    new_field_name='chars')

        datasets["train"], datasets["dev"] = datasets["train"].split(
            0.1, shuffle=False)

        src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(
            **src_vocab_opt)
        src_vocab.from_dataset(datasets['train'], field_name='words')

        src_vocab.index_dataset(*datasets.values(), field_name='words')

        tgt_vocab = Vocabulary(unknown=None, padding=None) \
            if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
        tgt_vocab.from_dataset(datasets['train'], field_name='target')
        tgt_vocab.index_dataset(*datasets.values(), field_name='target')

        info.vocabs = {"words": src_vocab, "target": tgt_vocab}

        info.datasets = datasets

        if src_embed_opt is not None:
            embed = EmbedLoader.load_with_vocab(**src_embed_opt,
                                                vocab=src_vocab)
            info.embeddings['words'] = embed

        for name, dataset in info.datasets.items():
            dataset.set_input("words")
            dataset.set_target("target")

        return info
Example #6
def load_sst2(dict_path, embedding_path=None):
    '''
    Load the SST-2 train/dev TSV files, tokenize, build word and label vocabularies,
    index both datasets and optionally load pretrained word embeddings.

    :param dict_path: directory containing train.tsv and dev.tsv, e.g. /remote-home/xnli/data/corpus/text_classification/SST-2/
    :param embedding_path: path to a glove 300d txt file
    :return: (train_data, dev_data), (vocab, label_vocab) and, when embedding_path is given, the pretrained embedding
    '''
    train_path = os.path.join(dict_path, 'train.tsv')
    dev_path = os.path.join(dict_path, 'dev.tsv')

    loader = CSVLoader(headers=('words', 'target'), sep='\t')
    train_data = loader.load(train_path).datasets['train']
    dev_data = loader.load(dev_path).datasets['train']

    train_data.apply_field(lambda x: x.split(),
                           field_name='words',
                           new_field_name='words')
    dev_data.apply_field(lambda x: x.split(),
                         field_name='words',
                         new_field_name='words')

    train_data.apply_field(lambda x: len(x),
                           field_name='words',
                           new_field_name='seq_len')
    dev_data.apply_field(lambda x: len(x),
                         field_name='words',
                         new_field_name='seq_len')

    vocab = Vocabulary(min_freq=2)
    vocab.from_dataset(train_data, field_name='words')
    vocab.from_dataset(dev_data, field_name='words')

    # pretrained_embedding = load_word_emb(embedding_path, 300, vocab)

    label_vocab = Vocabulary(padding=None,
                             unknown=None).from_dataset(train_data,
                                                        field_name='target')

    label_vocab.index_dataset(train_data, field_name='target')
    label_vocab.index_dataset(dev_data, field_name='target')

    vocab.index_dataset(train_data, field_name='words', new_field_name='words')
    vocab.index_dataset(dev_data, field_name='words', new_field_name='words')

    train_data.set_input(Const.INPUT, Const.INPUT_LEN)
    train_data.set_target(Const.TARGET)

    dev_data.set_input(Const.INPUT, Const.INPUT_LEN)
    dev_data.set_target(Const.TARGET)

    if embedding_path is not None:
        pretrained_embedding = load_word_emb(embedding_path, 300, vocab)
        return (train_data, dev_data), (vocab,
                                        label_vocab), pretrained_embedding

    else:
        return (train_data, dev_data), (vocab, label_vocab)
Example #7
def get_label_vocab(data_type='default'):
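    # Build a fixed label vocabulary by combining every prefix in `tagging_method`
    # (assumed to be a module-level list of scheme prefixes such as B/I or B/M/E/S)
    # with the entity types below, plus the 'O' tag.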
    label = [
        'family', 'education', 'money', 'med_exam', 'ID', 'contact', 'name',
        'time', 'location', 'profession'
    ]
    total_label = []

    for prefix in tagging_method:
        total_label.extend([prefix + '-' + ele for ele in label])
    total_label.append('O')
    print(total_label)
    label_ds = DataSet({'target': total_label})
    label_vocab = Vocabulary(unknown=None, padding=None)
    label_vocab.from_dataset(label_ds, field_name='target')
    label_vocab.index_dataset(label_ds, field_name='target')
    # label_vocab.add_word_lst(total_label)
    return label_vocab
Example #8
def prepare_ptb(args):
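    # Load POS/chunk/NER CoNLL data, convert chunk tags to BIO2 and NER tags to BIOES,
    # build one shared word vocabulary plus a per-task label vocabulary, index all
    # fields and wrap each task into a Task object. iob2, iob2bioes, filter_docstart
    # and Task are assumed to come from the surrounding project.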
    datas = {}
    datas["pos"] = (ConllLoader(headers=["words", "pos"],
                                indexes=[0, 1]).load(args.pos).datasets)
    chunk_data = (ConllLoader(headers=["words", "chunk"],
                              indexes=[0, 2]).load(args.chunk).datasets)
    chunk_data['train'], chunk_data['dev'] = chunk_data['train'].split(0.1)
    datas['chunk'] = chunk_data
    datas["ner"] = (ConllLoader(headers=["words", "ner"],
                                indexes=[0, 3]).load(args.ner).datasets)

    for ds in datas['chunk'].values():
        ds.apply_field(lambda x: iob2(x), 'chunk', 'chunk')
    for ds in datas['ner'].values():
        ds.apply_field(lambda x: iob2bioes(iob2(x)), 'ner', 'ner')

    vocabs = {}
    src_vocab = Vocabulary()
    for idx, task_name in enumerate(["pos", "chunk", "ner"]):
        data = datas[task_name]
        filter_docstart(data)
        vocab = Vocabulary(padding=None, unknown=None)
        vocab.from_dataset(*list(data.values()), field_name=task_name)
        src_vocab.from_dataset(*list(data.values()), field_name="words")
        vocabs[task_name] = vocab

    task_lst = []
    for idx, task_name in enumerate(["pos", "chunk", "ner"]):
        data = datas[task_name]
        src_vocab.index_dataset(*list(data.values()),
                                field_name="words",
                                new_field_name="words")
        vocabs[task_name].index_dataset(*list(data.values()),
                                        field_name=task_name,
                                        new_field_name=task_name)
        for ds in data.values():
            ds.apply_field(len, 'words', 'seq_len')
        task_lst.append(
            Task(idx, task_name, data["train"], data["dev"], data["test"]))
    vocabs["words"] = src_vocab
    return task_lst, vocabs
Example #9
    def test_from_dataset_no_entry(self):
        # test that no_create_entry is set correctly
        dataset = DataSet()
        start_char = 65
        num_samples = 10
        test_dataset = DataSet()
        for i in range(num_samples):
            char = [chr(start_char + i)] * 6
            ins = Instance(char=char)
            dataset.append(ins)
            ins = Instance(char=[c + c for c in char])
            test_dataset.append(ins)
        vocab = Vocabulary()
        vocab.from_dataset(dataset,
                           field_name='char',
                           no_create_entry_dataset=test_dataset)
        vocab.index_dataset(dataset, field_name='char')
        for i in range(num_samples):
            self.assertEqual(
                True,
                vocab._is_word_no_create_entry(
                    chr(start_char + i) + chr(start_char + i)))
Example #10
    def process(self, data_bundle):
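        # Character-level tagging pipeline: optionally add a bigram field (each char
        # joined with its successor, '<eos>' appended at the end), index chars and
        # targets, set pad values and sequence lengths, then mark input/target fields.
        # _add_chars_field and _indexize are assumed helpers from the surrounding module.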
        data_bundle.copy_field(C.RAW_CHAR, C.CHAR_INPUT)
        input_fields = [C.TARGET, C.CHAR_INPUT, C.INPUT_LEN]
        target_fields = [C.TARGET, C.INPUT_LEN]
        if self.bigram:
            for dataset in data_bundle.datasets.values():
                dataset.apply_field(
                    lambda chars:
                    [c1 + c2 for c1, c2 in zip(chars, chars[1:] + ['<eos>'])],
                    field_name=C.CHAR_INPUT,
                    new_field_name='bigrams')
            bigram_vocab = Vocabulary()
            bigram_vocab.from_dataset(
                data_bundle.get_dataset('train'),
                field_name='bigrams',
                no_create_entry_dataset=[
                    ds for name, ds in data_bundle.datasets.items()
                    if name != 'train'
                ])
            bigram_vocab.index_dataset(*data_bundle.datasets.values(),
                                       field_name='bigrams')
            data_bundle.set_vocab(bigram_vocab, field_name='bigrams')
            input_fields.append('bigrams')

        _add_chars_field(data_bundle, lower=False)

        # index
        _indexize(data_bundle,
                  input_field_names=C.CHAR_INPUT,
                  target_field_names=C.TARGET)

        for name, dataset in data_bundle.datasets.items():
            dataset.set_pad_val(C.TARGET, self.target_pad_val)
            dataset.add_seq_len(C.CHAR_INPUT)

        data_bundle.set_input(*input_fields)
        data_bundle.set_target(*target_fields)

        return data_bundle
Example #11
    def process(self,
                paths: Union[str, Dict[str, str]],
                src_vocab_opt: VocabularyOption = None,
                tgt_vocab_opt: VocabularyOption = None,
                src_embed_opt: EmbeddingOption = None):
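        # Text-classification preprocessing: load each split, build the word/target
        # vocabularies from the training set, index every split and optionally attach
        # a pretrained word embedding.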
        
        paths = check_dataloader_paths(paths)
        datasets = {}
        info = DataBundle()
        for name, path in paths.items():
            dataset = self.load(path)
            datasets[name] = dataset

        src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
        src_vocab.from_dataset(datasets['train'], field_name='words')
        src_vocab.index_dataset(*datasets.values(), field_name='words')

        tgt_vocab = Vocabulary(unknown=None, padding=None) \
            if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
        tgt_vocab.from_dataset(datasets['train'], field_name='target')
        tgt_vocab.index_dataset(*datasets.values(), field_name='target')

        info.vocabs = {
            "words": src_vocab,
            "target": tgt_vocab
        }

        info.datasets = datasets

        if src_embed_opt is not None:
            embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
            info.embeddings['words'] = embed

        for name, dataset in info.datasets.items():
            dataset.set_input("words")
            dataset.set_target("target")

        return info
Example #12
def get_vocab(trainset, testset):
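    # Note: dep_vocab and pol_vocab built below are only used to index the deprel and
    # pol fields; they are not part of the values returned by this function.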
    # build the vocabularies and word2idx mappings
    # tok
    tok_vocab = Vocabulary()
    tok_vocab.from_dataset(trainset,
                           field_name="tok",
                           no_create_entry_dataset=testset)
    tok_vocab.index_dataset(trainset,
                            testset,
                            field_name="tok",
                            new_field_name="chars")
    tok_vocab.index_dataset(trainset,
                            testset,
                            field_name="asp",
                            new_field_name="aspect")
    # deprel
    dep_vocab = Vocabulary()
    dep_vocab.from_dataset(trainset, field_name="deprel")
    dep_vocab.index_dataset(trainset,
                            testset,
                            field_name="deprel",
                            new_field_name="depidx")
    # pol(target)
    pol_vocab = Vocabulary(padding=None, unknown=None)
    pol_vocab.from_dataset(trainset, field_name="pol")
    pol_vocab.index_dataset(trainset,
                            testset,
                            field_name="pol",
                            new_field_name="target")
    # pos
    pos_vocab = Vocabulary()
    pos_vocab.from_dataset(trainset, field_name="pos")
    pos_vocab.index_dataset(trainset,
                            testset,
                            field_name="pos",
                            new_field_name="posidx")
    # post
    max_len = max(max(trainset["seq_len"]), max(testset["seq_len"]))
    post_vocab = Vocabulary()
    post_vocab.add_word_lst(list(range(-max_len, max_len)))
    post_vocab.index_dataset(trainset,
                             testset,
                             field_name="post",
                             new_field_name="postidx")
    return tok_vocab, pos_vocab, post_vocab, trainset, testset
Example #13
    def test_from_dataset(self):
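        # Build a vocabulary from 0-, 1- and 2-dimensional character fields; with the
        # default <pad>=0 and <unk>=1 entries, the i-th distinct character gets index i + 2.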
        start_char = 65
        num_samples = 10

        # 0 dim
        dataset = DataSet()
        for i in range(num_samples):
            ins = Instance(char=chr(start_char + i))
            dataset.append(ins)
        vocab = Vocabulary()
        vocab.from_dataset(dataset, field_name='char')
        for i in range(num_samples):
            self.assertEqual(vocab.to_index(chr(start_char + i)), i + 2)
        vocab.index_dataset(dataset, field_name='char')

        # 1 dim
        dataset = DataSet()
        for i in range(num_samples):
            ins = Instance(char=[chr(start_char + i)] * 6)
            dataset.append(ins)
        vocab = Vocabulary()
        vocab.from_dataset(dataset, field_name='char')
        for i in range(num_samples):
            self.assertEqual(vocab.to_index(chr(start_char + i)), i + 2)
        vocab.index_dataset(dataset, field_name='char')

        # 2 dim
        dataset = DataSet()
        for i in range(num_samples):
            ins = Instance(char=[[chr(start_char + i) for _ in range(6)]
                                 for _ in range(6)])
            dataset.append(ins)
        vocab = Vocabulary()
        vocab.from_dataset(dataset, field_name='char')
        for i in range(num_samples):
            self.assertEqual(vocab.to_index(chr(start_char + i)), i + 2)
        vocab.index_dataset(dataset, field_name='char')
Example #14
class CNNText(torch.nn.Module):
    # The head of this snippet is reconstructed, assuming fastNLP's CNNText model,
    # which the demo at the bottom instantiates with num_classes, padding and dropout.
    def __init__(self, init_embed, num_classes, kernel_nums=(3, 4, 5),
                 kernel_sizes=(3, 4, 5), padding=0, dropout=0.5):
        super(CNNText, self).__init__()
        self.embed = encoder.Embedding(init_embed)
        self.conv_pool = encoder.ConvMaxpool(
            in_channels=self.embed.embedding_dim,
            out_channels=kernel_nums,
            kernel_sizes=kernel_sizes,
            padding=padding)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(sum(kernel_nums), num_classes)
    
    def forward(self, words, seq_len=None):
        x = self.embed(words)  # [N,L] -> [N,L,C]
        x = self.conv_pool(x)  # [N,L,C] -> [N,C]
        x = self.dropout(x)
        x = self.fc(x)  # [N,C] -> [N, N_class]
        return {C.OUTPUT: x}
    
    def predict(self, words, seq_len=None):
        output = self(words, seq_len)
        _, predict = output[C.OUTPUT].max(dim=1)
        return {C.OUTPUT: predict}


#demo version

trainData.apply(lambda x: x['data'].lower(), new_field_name='sentence')
trainData.apply(lambda x: x['sentence'].split(), new_field_name='words', is_input=True)
vocab = Vocabulary(min_freq=2)
vocab = vocab.from_dataset(trainData, field_name='words')
#change to index
vocab.index_dataset(trainData, field_name='words',new_field_name='words')
trainData.set_target('target')
model = CNNText((len(vocab),128), num_classes=20, padding=2, dropout=0.1)
train_data, dev_data = trainData.split(0.2)
trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data, loss=CrossEntropyLoss(), metrics=AccuracyMetric(), batch_size=16)
trainer.train()
Example #15
def load_resume_ner(path,
                    char_embedding_path=None,
                    bigram_embedding_path=None,
                    index_token=True,
                    normalize={
                        'char': True,
                        'bigram': True,
                        'word': False
                    }):
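    # Load the Resume NER corpus from BMES character files, add bigram and seq_len
    # fields, build char/bigram/label vocabularies (dev/test passed as no_create_entry
    # datasets), optionally index the fields and load pretrained static embeddings.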
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    train_path = os.path.join(path, 'train.char.bmes')
    dev_path = os.path.join(path, 'dev.char.bmes')
    test_path = os.path.join(path, 'test.char.bmes')

    loader = ConllLoader(['chars', 'target'])
    train_bundle = loader.load(train_path)
    dev_bundle = loader.load(dev_path)
    test_bundle = loader.load(test_path)

    datasets = dict()
    datasets['train'] = train_bundle.datasets['train']
    datasets['dev'] = dev_bundle.datasets['train']
    datasets['test'] = test_bundle.datasets['train']

    datasets['train'].apply_field(get_bigrams,
                                  field_name='chars',
                                  new_field_name='bigrams')
    datasets['dev'].apply_field(get_bigrams,
                                field_name='chars',
                                new_field_name='bigrams')
    datasets['test'].apply_field(get_bigrams,
                                 field_name='chars',
                                 new_field_name='bigrams')

    datasets['train'].add_seq_len('chars')
    datasets['dev'].add_seq_len('chars')
    datasets['test'].add_seq_len('chars')

    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary(padding=None, unknown=None)
    print(datasets.keys())
    print(len(datasets['dev']))
    print(len(datasets['test']))
    print(len(datasets['train']))
    char_vocab.from_dataset(
        datasets['train'],
        field_name='chars',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    bigram_vocab.from_dataset(
        datasets['train'],
        field_name='bigrams',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    if index_token:
        char_vocab.index_dataset(datasets['train'],
                                 datasets['dev'],
                                 datasets['test'],
                                 field_name='chars',
                                 new_field_name='chars')
        bigram_vocab.index_dataset(datasets['train'],
                                   datasets['dev'],
                                   datasets['test'],
                                   field_name='bigrams',
                                   new_field_name='bigrams')
        label_vocab.index_dataset(datasets['train'],
                                  datasets['dev'],
                                  datasets['test'],
                                  field_name='target',
                                  new_field_name='target')

    vocabs = {}
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab

    embeddings = {}
    if char_embedding_path is not None:
        char_embedding = StaticEmbedding(char_vocab,
                                         char_embedding_path,
                                         word_dropout=0.01,
                                         normalize=normalize['char'])
        embeddings['char'] = char_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab,
                                           bigram_embedding_path,
                                           word_dropout=0.01,
                                           normalize=normalize['bigram'])
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
Example #16
def load_weibo_ner(path,
                   unigram_embedding_path=None,
                   bigram_embedding_path=None,
                   index_token=True,
                   normlize={
                       'char': True,
                       'bigram': True,
                       'word': False
                   }):
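    # Weibo NER: keep only the character itself from each token (dropping the appended
    # word-segmentation tag), add bigrams and sequence lengths, then build char/bigram/label
    # vocabularies with dev/test treated as no_create_entry datasets; get_bigrams is
    # assumed to come from utils.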
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    loader = ConllLoader(['chars', 'target'])
    bundle = loader.load(path)

    datasets = bundle.datasets
    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))
    # print(*list(datasets.keys()))
    vocabs = {}
    word_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary(padding=None, unknown=None)

    for k, v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    word_vocab.from_dataset(
        datasets['train'],
        field_name='chars',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target',-100)
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = word_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(
        datasets['train'],
        field_name='bigrams',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        word_vocab.index_dataset(*list(datasets.values()),
                                 field_name='chars',
                                 new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()),
                                   field_name='bigrams',
                                   new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()),
                                  field_name='target',
                                  new_field_name='target')

    # for k,v in datasets.items():
    #     v.set_input('chars','bigrams','seq_len','target')
    #     v.set_target('target','seq_len')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}

    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(
            word_vocab,
            model_dir_or_name=unigram_embedding_path,
            word_dropout=0.01,
            normalize=normlize['char'])
        embeddings['char'] = unigram_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(
            bigram_vocab,
            model_dir_or_name=bigram_embedding_path,
            word_dropout=0.01,
            normalize=normlize['bigram'])
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
Example #17
def load_weibo_ner_old(path,
                       unigram_embedding_path=None,
                       bigram_embedding_path=None,
                       index_token=True,
                       normlize={
                           'char': True,
                           'bigram': True,
                           'word': False
                       }):
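    # Older variant of load_weibo_ner that reads the train/dev/test *.all.bmes files
    # explicitly instead of relying on a single bundle path.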
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    loader = ConllLoader(['chars', 'target'])
    # from fastNLP.io.file_reader import _read_conll
    # from fastNLP.core import Instance,DataSet
    # def _load(path):
    #     ds = DataSet()
    #     for idx, data in _read_conll(path, indexes=loader.indexes, dropna=loader.dropna,
    #                                 encoding='ISO-8859-1'):
    #         ins = {h: data[i] for i, h in enumerate(loader.headers)}
    #         ds.append(Instance(**ins))
    #     return ds
    # from fastNLP.io.utils import check_loader_paths
    # paths = check_loader_paths(path)
    # datasets = {name: _load(path) for name, path in paths.items()}
    datasets = {}
    train_path = os.path.join(path, 'train.all.bmes')
    dev_path = os.path.join(path, 'dev.all.bmes')
    test_path = os.path.join(path, 'test.all.bmes')
    datasets['train'] = loader.load(train_path).datasets['train']
    datasets['dev'] = loader.load(dev_path).datasets['train']
    datasets['test'] = loader.load(test_path).datasets['train']

    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))

    vocabs = {}
    word_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary(padding=None, unknown=None)

    for k, v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    word_vocab.from_dataset(
        datasets['train'],
        field_name='chars',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target',-100)
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = word_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(
        datasets['train'],
        field_name='bigrams',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        word_vocab.index_dataset(*list(datasets.values()),
                                 field_name='chars',
                                 new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()),
                                   field_name='bigrams',
                                   new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()),
                                  field_name='target',
                                  new_field_name='target')

    # for k,v in datasets.items():
    #     v.set_input('chars','bigrams','seq_len','target')
    #     v.set_target('target','seq_len')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}

    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(
            word_vocab,
            model_dir_or_name=unigram_embedding_path,
            word_dropout=0.01,
            normalize=normlize['char'])
        embeddings['char'] = unigram_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(
            bigram_vocab,
            model_dir_or_name=bigram_embedding_path,
            word_dropout=0.01,
            normalize=normlize['bigram'])
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
Example #18
def load_conllized_ontonote_POS(path, embedding_path=None):
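    # Read OntoNotes-style CoNLL files (word column 3, POS column 4), build a shared
    # word vocabulary over train/dev/test plus one label vocabulary per non-word header,
    # and set the input/target fields; load_word_emb is assumed from the surrounding module.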
    from fastNLP.io.loader import ConllLoader
    header2index = {'words': 3, 'POS': 4, 'NER': 10}
    headers = ['words', 'POS']

    if 'NER' in headers:
        print(
            'Warning! The NER labels read out by load_conllized_ontonote are not BIOES; '
            'they are the raw conll format and therefore wrong!'
        )
    indexes = list(map(lambda x: header2index[x], headers))

    loader = ConllLoader(headers, indexes)

    bundle = loader.load(path)

    # print(bundle.datasets)

    train_set = bundle.datasets['train']
    dev_set = bundle.datasets['dev']
    test_set = bundle.datasets['test']

    # train_set = loader.load(os.path.join(path,'train.txt'))
    # dev_set = loader.load(os.path.join(path, 'dev.txt'))
    # test_set = loader.load(os.path.join(path, 'test.txt'))

    # print(len(train_set))

    train_set.add_seq_len('words', 'seq_len')
    dev_set.add_seq_len('words', 'seq_len')
    test_set.add_seq_len('words', 'seq_len')

    # print(dataset['POS'])

    vocab = Vocabulary(min_freq=1)
    vocab.from_dataset(train_set, field_name='words')
    vocab.from_dataset(dev_set, field_name='words')
    vocab.from_dataset(test_set, field_name='words')

    vocab.index_dataset(train_set, field_name='words')
    vocab.index_dataset(dev_set, field_name='words')
    vocab.index_dataset(test_set, field_name='words')

    label_vocab_dict = {}

    for i, h in enumerate(headers):
        if h == 'words':
            continue
        label_vocab_dict[h] = Vocabulary(min_freq=1,
                                         padding=None,
                                         unknown=None)
        label_vocab_dict[h].from_dataset(train_set, field_name=h)

        label_vocab_dict[h].index_dataset(train_set, field_name=h)
        label_vocab_dict[h].index_dataset(dev_set, field_name=h)
        label_vocab_dict[h].index_dataset(test_set, field_name=h)

    train_set.set_input(Const.INPUT, Const.INPUT_LEN)
    train_set.set_target(headers[1])

    dev_set.set_input(Const.INPUT, Const.INPUT_LEN)
    dev_set.set_target(headers[1])

    test_set.set_input(Const.INPUT, Const.INPUT_LEN)
    test_set.set_target(headers[1])

    if len(headers) > 2:
        print('Warning: since there is more than one task, the target has to be set manually each time!')

    print('train:', len(train_set), 'dev:', len(dev_set), 'test:',
          len(test_set))

    if embedding_path is not None:
        pretrained_embedding = load_word_emb(embedding_path, 300, vocab)
        return (train_set, dev_set,
                test_set), (vocab, label_vocab_dict), pretrained_embedding
    else:
        return (train_set, dev_set, test_set), (vocab, label_vocab_dict)
Example #19
    def process(self,
                paths: Union[str, Dict[str, str]],
                char_vocab_opt: VocabularyOption = None,
                char_embed_opt: EmbeddingOption = None,
                bigram_vocab_opt: VocabularyOption = None,
                bigram_embed_opt: EmbeddingOption = None,
                L: int = 4):
        """
        The supported data format is one sample per line, with the words separated by spaces. For example

        Option::

            共同  创造  美好  的  新  世纪  ——  二○○一年  新年  贺词
            (  二○○○年  十二月  三十一日  )  (  附  图片  1  张  )
            女士  们  ,  先生  们  ,  同志  们  ,  朋友  们  :

        paths supports two formats: the first is a str, the second is a Dict[str, str].

        Option::

            # 1. str
            # 1.1 pass a concrete file path
            data = SigHanLoader('bmes').process('/path/to/cws/data.txt') # reads the content of data.txt
            # the result contains data.vocabs['chars']: a Vocabulary object,
            #             data.vocabs['target']: a Vocabulary object, may be absent depending on encoding_type
            #             data.embeddings['chars']: an Embedding object, only present when a pretrained embedding path is given
            #             data.datasets['train']: a DataSet object
            #                   with the fields:
            #                       raw_chars: list[str], each element is one Chinese character
            #                       chars: list[int], each element is the index of the corresponding character
            #                       target: list[int], varies with encoding_type
            # 1.2 pass a directory, which must contain a train.txt file
            data = SigHanLoader('bmes').process('path/to/cws/') # tries to read train.txt, test.txt and dev.txt in that directory
            # the result contains data.vocabs['chars']: a Vocabulary object
            #             data.vocabs['target']: a Vocabulary object
            #             data.embeddings['chars']: an Embedding object, only when a pretrained embedding path is given;
            #             data.datasets['train']: a DataSet object
            #                    with the fields:
            #                       raw_chars: list[str], each element is one Chinese character
            #                       chars: list[int], each element is the index of the corresponding character
            #                       target: list[int], varies with encoding_type
            #             data.datasets['dev']: a DataSet object, present when the directory contains dev.txt; same content as data.datasets['train']

            # 2. dict: the keys are file names and the values are the paths to read from; a 'train' key is required
            paths = {'train': '/path/to/train/train.txt', 'test':'/path/to/test/test.txt', 'dev':'/path/to/dev/dev.txt'}
            data = SigHanLoader(paths).process(paths)
            # the result is the same as when passing a directory, but several datasets can be passed in; the keys of data.datasets match the keys given here

        :param paths: a directory, a file path, or a dict.
        :param char_vocab_opt: options for building the chars vocabulary, defaults to min_freq=2
        :param char_embed_opt: options for loading the chars Embedding; by default no pretrained embedding is read
        :param bigram_vocab_opt: options for building the bigram vocabulary; bigrams are not used by default, and the bigrams field
            is only added when this parameter is given. It is a List[int] with the same length as chars; the bigrams of abcde are ab bc cd de e<eos>
        :param bigram_embed_opt: options for loading pretrained bigram embeddings, only effective when bigram_vocab_opt is given
        :param L: the segment length used when target_type is shift_relay
        :return:
        """
        # it is recommended to validate paths with check_dataloader_paths
        paths = check_dataloader_paths(paths)
        datasets = {}
        data = DataBundle()
        bigram = bigram_vocab_opt is not None
        for name, path in paths.items():
            dataset = self.load(path, bigram=bigram)
            datasets[name] = dataset
        input_fields = []
        target_fields = []
        # build the vocab
        char_vocab = Vocabulary(
            min_freq=2) if char_vocab_opt is None else Vocabulary(
                **char_vocab_opt)
        char_vocab.from_dataset(datasets['train'], field_name='raw_chars')
        char_vocab.index_dataset(*datasets.values(),
                                 field_name='raw_chars',
                                 new_field_name='chars')
        data.vocabs[Const.CHAR_INPUT] = char_vocab
        input_fields.extend([Const.CHAR_INPUT, Const.INPUT_LEN, Const.TARGET])
        target_fields.append(Const.TARGET)
        # build the target vocab
        if self.target_type == 'bmes':
            target_vocab = Vocabulary(unknown=None, padding=None)
            target_vocab.add_word_lst(['B'] * 4 + ['M'] * 3 + ['E'] * 2 +
                                      ['S'])
            target_vocab.index_dataset(*datasets.values(), field_name='target')
            data.vocabs[Const.TARGET] = target_vocab
        if char_embed_opt is not None:
            char_embed = EmbedLoader.load_with_vocab(**char_embed_opt,
                                                     vocab=char_vocab)
            data.embeddings['chars'] = char_embed
        if bigram:
            bigram_vocab = Vocabulary(**bigram_vocab_opt)
            bigram_vocab.from_dataset(datasets['train'], field_name='bigrams')
            bigram_vocab.index_dataset(*datasets.values(),
                                       field_name='bigrams')
            data.vocabs['bigrams'] = bigram_vocab
            if bigram_embed_opt is not None:
                bigram_embed = EmbedLoader.load_with_vocab(**bigram_embed_opt,
                                                           vocab=bigram_vocab)
                data.embeddings['bigrams'] = bigram_embed
            input_fields.append('bigrams')
        if self.target_type == 'shift_relay':
            func = partial(self._clip_target, L=L)
            for name, dataset in datasets.items():
                res = dataset.apply_field(func, field_name='target')
                relay_target = [res_i[0] for res_i in res]
                relay_mask = [res_i[1] for res_i in res]
                dataset.add_field('relay_target',
                                  relay_target,
                                  is_input=True,
                                  is_target=False,
                                  ignore_type=False)
                dataset.add_field('relay_mask',
                                  relay_mask,
                                  is_input=True,
                                  is_target=False,
                                  ignore_type=False)
        if self.target_type == 'shift_relay':
            input_fields.extend(['end_seg_mask'])
            target_fields.append('start_seg_mask')
        # add the datasets to the returned data bundle
        for name, dataset in datasets.items():
            dataset.set_input(*input_fields)
            dataset.set_target(*target_fields)
            data.datasets[name] = dataset

        return data
Example #20
def load_msra_ner_1(path,
                    char_embedding_path=None,
                    bigram_embedding_path=None,
                    index_token=True,
                    train_clip=False,
                    char_min_freq=1,
                    bigram_min_freq=1,
                    only_train_min_freq=0):
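    # MSRA NER: load the train(+dev) and test BMES files (optionally the clipped
    # variants), add bigrams and sequence lengths, build char/bigram/label vocabularies
    # and optional StaticEmbeddings with min-frequency filtering.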
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams
    if train_clip:
        train_path = os.path.join(path, 'train_dev.char.bmes_clip1')
        test_path = os.path.join(path, 'test.char.bmes_clip1')
    else:
        train_path = os.path.join(path, 'train_dev.char.bmes')
        test_path = os.path.join(path, 'test.char.bmes')

    loader = ConllLoader(['chars', 'target'])
    train_bundle = loader.load(train_path)
    test_bundle = loader.load(test_path)

    datasets = dict()
    datasets['train'] = train_bundle.datasets['train']
    datasets['test'] = test_bundle.datasets['train']

    datasets['train'].apply_field(get_bigrams,
                                  field_name='chars',
                                  new_field_name='bigrams')
    datasets['test'].apply_field(get_bigrams,
                                 field_name='chars',
                                 new_field_name='bigrams')

    datasets['train'].add_seq_len('chars')
    datasets['test'].add_seq_len('chars')

    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()
    print(datasets.keys())
    # print(len(datasets['dev']))
    print(len(datasets['test']))
    print(len(datasets['train']))
    char_vocab.from_dataset(datasets['train'],
                            field_name='chars',
                            no_create_entry_dataset=[datasets['test']])
    bigram_vocab.from_dataset(datasets['train'],
                              field_name='bigrams',
                              no_create_entry_dataset=[datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    if index_token:
        char_vocab.index_dataset(datasets['train'],
                                 datasets['test'],
                                 field_name='chars',
                                 new_field_name='chars')
        bigram_vocab.index_dataset(datasets['train'],
                                   datasets['test'],
                                   field_name='bigrams',
                                   new_field_name='bigrams')
        label_vocab.index_dataset(datasets['train'],
                                  datasets['test'],
                                  field_name='target',
                                  new_field_name='target')

    vocabs = {}
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab
    vocabs['label'] = label_vocab

    embeddings = {}
    if char_embedding_path is not None:
        char_embedding = StaticEmbedding(
            char_vocab,
            char_embedding_path,
            word_dropout=0.01,
            min_freq=char_min_freq,
            only_train_min_freq=only_train_min_freq)
        embeddings['char'] = char_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(
            bigram_vocab,
            bigram_embedding_path,
            word_dropout=0.01,
            min_freq=bigram_min_freq,
            only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
Example #21
def load_weibo_ner(path, unigram_embedding_path=None, bigram_embedding_path=None, index_token=True,
                   char_min_freq=1, bigram_min_freq=1, only_train_min_freq=0, char_word_dropout=0.01, train_dataset_rate=1.0):
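    # Variant of load_weibo_ner that reads BMOES files, supports a down-sampled training
    # set via train_dataset_rate, adds span/attribute label fields through
    # input_with_span_attr, and exposes min-frequency options for the static embeddings.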
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    loader = ConllLoader(['chars', 'target'])
    # bundle = loader.load(path)
    #
    # datasets = bundle.datasets

    # print(datasets['train'][:5])

    train_dataset_rate_suffix = '' if train_dataset_rate == 1.0 else f'_{train_dataset_rate}'
    train_path = os.path.join(path, f'train.char.bmoes{train_dataset_rate_suffix}')
    print(f"load train dataset: {train_path}")
    dev_path = os.path.join(path, 'dev.char.bmoes')
    test_path = os.path.join(path, 'test.char.bmoes')

    paths = {}
    paths['train'] = train_path
    paths['dev'] = dev_path
    paths['test'] = test_path

    datasets = {}

    for k, v in paths.items():
        bundle = loader.load(v)
        datasets[k] = bundle.datasets['train']

    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))
    # print(*list(datasets.keys()))
    vocabs = {}
    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()
    # TODO: add span_label, attr_start, attr_end
    datasets, vocabs = input_with_span_attr(datasets, vocabs)
    print(datasets.keys())
    print("dev:", len(datasets['dev']))
    print("test:", len(datasets['test']))
    print("train:", len(datasets['train']))

    for k, v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    char_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target',-100)
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        char_vocab.index_dataset(*list(datasets.values()), field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()), field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()), field_name='target', new_field_name='target')

    # for k,v in datasets.items():
    #     v.set_input('chars','bigrams','seq_len','target')
    #     v.set_target('target','seq_len')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}

    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(char_vocab, model_dir_or_name=unigram_embedding_path,
                                            word_dropout=char_word_dropout,
                                            min_freq=char_min_freq, only_train_min_freq=only_train_min_freq, )
        embeddings['char'] = unigram_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, model_dir_or_name=bigram_embedding_path,
                                           word_dropout=0.01,
                                           min_freq=bigram_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    print(f"train: {len(datasets['train'])}; dev: {len(datasets['dev'])}; test: {len(datasets['test'])}")
    return datasets, vocabs, embeddings
Example #22
    def process(self, paths, config, load_vocab_file=True):
        """
        :param paths: dict  path for each dataset
        :param load_vocab_file: bool  build vocab (False) or load vocab (True)
        :return: DataBundle
            datasets: dict  keys correspond to the paths dict
            vocabs: dict  key: vocab(if "train" in paths), domain(if domain=True), tag(if tag=True)
            embeddings: optional
        """

        vocab_size = config.vocab_size

        def _merge_abstracts(abstracts):
            merged = []
            for abstract in abstracts:
                merged.extend(abstract[:self.max_concat_len] + [SEP])
            if len(abstracts) == 0:
                assert merged == []
            return merged[:-1]

        def _pad_graph_inputs(graph_inputs):
            pad_text_wd = []
            max_len = config.max_graph_enc_steps

            for graph_input in graph_inputs:
                if len(graph_input) < max_len:
                    pad_num = max_len - len(graph_input)
                    graph_input.extend([PAD_TOKEN] * pad_num)
                else:
                    graph_input = graph_input[:max_len]
                pad_text_wd.append(graph_input)

            if len(pad_text_wd) == 0:
                pad_text_wd.append([PAD_TOKEN] * max_len)

            return pad_text_wd

        def _get_nbr_input_len(input_wd):
            enc_len = [
                min(len(text), config.max_graph_enc_steps) for text in input_wd
            ]
            if len(enc_len) == 0:
                enc_len = [0]
            return enc_len

        def _pad_article(text_wd):
            token_num = len(text_wd)
            max_len = config.max_enc_steps
            if config.neighbor_process == "sep":
                max_len += self.max_concat_len * self.max_concat_num
            if token_num < max_len:
                padding = [PAD_TOKEN] * (max_len - token_num)
                article = text_wd + padding
            else:
                article = text_wd[:max_len]
            return article

        def _split_list(input_list):
            return [text.split() for text in input_list]

        def sent_tokenize(abstract):
            abs_list = abstract.split(".")
            return [(abst + ".") for abst in abs_list[:-1]]

        def _article_token_mask(text_wd):
            max_enc_len = config.max_enc_steps
            if config.neighbor_process == "sep":
                max_enc_len += self.max_concat_len * self.max_concat_num
            token_num = len(text_wd)
            if token_num < max_enc_len:
                mask = [1] * token_num + [0] * (max_enc_len - token_num)
            else:
                mask = [1] * max_enc_len
            return mask

        def generate_article_input(text, abstracts):
            if config.neighbor_process == "sep":
                text_wd = text.split()[:config.max_enc_steps]
                text_wd.append(SEP)
                abstracts_wd = _merge_abstracts(abstracts)
                return text_wd + abstracts_wd
            else:
                return text.split()

        def generate_graph_inputs(graph_struct):

            graph_inputs_ = [
                graph_strut_dict[pid][config.graph_input_type]
                for pid in graph_struct
            ]
            return _split_list(graph_inputs_[1:])

        def generate_graph_structs(paper_id):
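            # Collect the n_hop neighbourhood of paper_id (capped at max_neighbor_num
            # nodes) and return an undirected adjacency dict; insertion order keeps the
            # source paper as the first key.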
            sub_graph_dict = {}
            sub_graph_set = []

            n_hop = config.n_hop
            max_neighbor_num = config.max_neighbor_num
            k_nbrs = _k_hop_neighbor(paper_id, n_hop, max_neighbor_num)
            for sub_g in k_nbrs:
                sub_graph_set += sub_g

            for node in sub_graph_set:
                sub_graph_dict[node] = []

            for sub_g in k_nbrs:
                for centre_node in sub_g:
                    nbrs = graph_strut_dict[centre_node]['references']
                    c_nbrs = list(set(nbrs).intersection(sub_graph_set))
                    sub_graph_dict[centre_node].extend(c_nbrs)
                    for c_nbr in c_nbrs:
                        sub_graph_dict[c_nbr].append(centre_node)
            # in python 3.6, the first in subgraph dict is source paper
            return sub_graph_dict

        def _k_hop_neighbor(paper_id, n_hop, max_neighbor):
            sub_graph = [[] for _ in range(n_hop + 1)]
            level = 0
            visited = set()
            q = deque()
            q.append([paper_id, level])
            curr_node_num = 0
            while len(q) != 0:
                paper_first = q.popleft()
                paper_id_first, level_first = paper_first
                if level_first > n_hop:
                    return sub_graph
                sub_graph[level_first].append(paper_id_first)
                curr_node_num += 1
                if curr_node_num > max_neighbor:
                    return sub_graph
                visited.add(paper_id_first)
                for pid in graph_strut_dict[paper_id_first]["references"]:
                    if pid not in visited and pid in graph_strut_dict:
                        q.append([pid, level_first + 1])
                        visited.add(pid)

            return sub_graph

        def generate_dgl_graph(paper_id, graph_struct, nodes_num):
            g = dgl.DGLGraph()
            assert len(graph_struct) == nodes_num

            g.add_nodes(len(graph_struct))
            pid2idx = {}
            for index, key_node in enumerate(graph_struct):
                pid2idx[key_node] = index
            assert pid2idx[paper_id] == 0

            for index, key_node in enumerate(graph_struct):
                neighbor = [pid2idx[node] for node in graph_struct[key_node]]
                # add self loop
                neighbor.append(index)
                key_nodes = [index] * len(neighbor)
                g.add_edges(key_nodes, neighbor)
            return g

        train_ds = None
        dataInfo = self.load(paths)

        # pop nodes in train graph in inductive setting
        if config.mode == "test" and self.setting == "inductive":
            dataInfo.datasets.pop("train")

        graph_strut_dict = {}
        for key, ds in dataInfo.datasets.items():
            for ins in ds:
                graph_strut_dict[ins["paper_id"]] = ins

        logger.info(f"the input graph G_v has {len(graph_strut_dict)} nodes")

        for key, ds in dataInfo.datasets.items():
            # process summary
            ds.apply(lambda x: x['abstract'].split(),
                     new_field_name='summary_wd')
            ds.apply(lambda x: sent_tokenize(x['abstract']),
                     new_field_name='abstract_sentences')
            # generate graph

            ds.apply(lambda x: generate_graph_structs(x["paper_id"]),
                     new_field_name="graph_struct")
            ds.apply(lambda x: generate_graph_inputs(x["graph_struct"]),
                     new_field_name='graph_inputs_wd')

            ds.apply(lambda x: len(x["graph_inputs_wd"]) + 1,
                     new_field_name="nodes_num")
            # pad input
            ds.apply(lambda x: generate_article_input(x['introduction'],
                                                      x["graph_inputs_wd"]),
                     new_field_name='input_wd')
            ds.apply(lambda x: _article_token_mask(x["input_wd"]),
                     new_field_name="enc_len_mask")
            ds.apply(lambda x: sum(x["enc_len_mask"]),
                     new_field_name="enc_len")
            ds.apply(lambda x: _pad_article(x["input_wd"]),
                     new_field_name="pad_input_wd")

            ds.apply(lambda x: _get_nbr_input_len(x["graph_inputs_wd"]),
                     new_field_name="nbr_inputs_len")

            ds.apply(lambda x: _pad_graph_inputs(x["graph_inputs_wd"]),
                     new_field_name="pad_graph_inputs_wd")
            if key == "train":
                train_ds = ds

        vocab_dict = {}
        if not load_vocab_file:
            logger.info("[INFO] Build new vocab from training dataset!")
            if train_ds is None:
                raise ValueError("Missing train file to build the vocabulary!")

            vocabs = Vocabulary(max_size=config.vocab_size - 2,
                                padding=PAD_TOKEN,
                                unknown=UNKNOWN_TOKEN)
            vocabs.from_dataset(train_ds,
                                field_name=["input_wd", "summary_wd"])
            vocabs.add_word(START_DECODING)
            vocabs.add_word(STOP_DECODING)
            vocab_dict["vocab"] = vocabs
            # save vocab
            with open(os.path.join(config.train_path, "vocab"),
                      "w",
                      encoding="utf8") as f:
                for w, idx in vocabs:
                    f.write(str(w) + "\t" + str(idx) + "\n")
            logger.info(
                "finished building the new vocab; please re-run the code with load_vocab_file=True"
            )
            exit(0)
        else:

            logger.info("[INFO] Load existing vocab from %s!" %
                        config.vocab_path)
            word_list = []
            cnt = 3  # offset reserved for the special tokens added below (pad, unk, etc.)
            if config.neighbor_process == "sep":
                cnt += 1

            with open(config.vocab_path, 'r', encoding='utf8') as vocab_f:
                for line in vocab_f:
                    pieces = line.split("\t")
                    word_list.append(pieces[0])
                    cnt += 1
                    if cnt > vocab_size:
                        break

            vocabs = Vocabulary(max_size=vocab_size,
                                padding=PAD_TOKEN,
                                unknown=UNKNOWN_TOKEN)
            vocabs.add_word_lst(word_list)
            vocabs.add(START_DECODING)
            vocabs.add(STOP_DECODING)
            if config.neighbor_process == "sep":
                vocabs.add(SEP)
            vocabs.build_vocab()
            vocab_dict["vocab"] = vocabs

        logger.info(f"vocab size = {len(vocabs)}")
        assert len(vocabs) == config.vocab_size
        dataInfo.set_vocab(vocabs, "vocab")

        for key, dataset in dataInfo.datasets.items():
            # do not process the training set in test mode
            if config.mode == "test" and key == "train":
                continue

            data_dict = {
                "enc_input": [],
                "nbr_inputs": [],
                "graph": [],
                "dec_input": [],
                "target": [],
                "dec_len": [],
                "article_oovs": [],
                "enc_input_extend_vocab": [],
            }
            logger.info(
                f"start constructing the model input for the {key} set, please wait..."
            )
            for instance in dataset:
                graph_inputs = instance["pad_graph_inputs_wd"]
                abstract_sentences = instance["summary_wd"]
                enc_input = instance["pad_input_wd"]
                enc_input, nbr_inputs, dec_input, target, dec_len, article_oovs, enc_input_extend_vocab = \
                    getting_full_info(enc_input, graph_inputs, abstract_sentences, dataInfo.vocabs['vocab'], config)
                graph = generate_dgl_graph(instance["paper_id"],
                                           instance["graph_struct"],
                                           instance["nodes_num"])
                data_dict["graph"].append(graph)
                data_dict["enc_input"].append(enc_input)
                data_dict["nbr_inputs"].append(nbr_inputs)
                data_dict["dec_input"].append(dec_input)
                data_dict["target"].append(target)
                data_dict["dec_len"].append(dec_len)
                data_dict["article_oovs"].append(article_oovs)
                data_dict["enc_input_extend_vocab"].append(
                    enc_input_extend_vocab)

            dataset.add_field("enc_input", data_dict["enc_input"])
            dataset.add_field("nbr_inputs", data_dict["nbr_inputs"])
            dataset.add_field("dec_input", data_dict["dec_input"])
            dataset.add_field("target", data_dict["target"])
            dataset.add_field("dec_len", data_dict["dec_len"])
            dataset.add_field("article_oovs", data_dict["article_oovs"])
            dataset.add_field("enc_input_extend_vocab",
                              data_dict["enc_input_extend_vocab"])

            dataset.add_field("graph", data_dict["graph"])
            # the graph field holds DGLGraph objects, so skip fastNLP's type/dim checks for it
            dataset.set_ignore_type('graph')
            dataset.set_input("graph")

            dataset.set_input("nbr_inputs_len", "nbr_inputs", "enc_len",
                              "enc_input", "enc_len_mask", "dec_input",
                              "dec_len", "article_oovs", "nodes_num",
                              "enc_input_extend_vocab")
            dataset.set_target("target", "article_oovs", "abstract_sentences")

            dataset.delete_field('graph_inputs_wd')
            dataset.delete_field('pad_graph_inputs_wd')
            dataset.delete_field('input_wd')
            dataset.delete_field('pad_input_wd')
        logger.info("------load dataset over---------")
        return dataInfo, vocabs
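
# A minimal, self-contained sketch of the two helpers above: a k-hop BFS over a toy
# citation dict, then a DGLGraph built with self-loops in which the source paper sits
# at index 0. The paper ids and reference lists below are invented for illustration.
from collections import deque
import dgl

toy_refs = {
    "P0": {"references": ["P1", "P2"]},
    "P1": {"references": ["P3"]},
    "P2": {"references": ["P1"]},
    "P3": {"references": []},
}

def toy_k_hop(paper_id, n_hop=2, max_neighbor=10):
    sub_graph = [[] for _ in range(n_hop + 1)]
    visited, q, count = {paper_id}, deque([(paper_id, 0)]), 0
    while q:
        pid, level = q.popleft()
        if level > n_hop or count >= max_neighbor:
            break
        sub_graph[level].append(pid)
        count += 1
        for nbr in toy_refs[pid]["references"]:
            if nbr not in visited:
                visited.add(nbr)
                q.append((nbr, level + 1))
    return sub_graph

levels = toy_k_hop("P0")                            # [['P0'], ['P1', 'P2'], ['P3']]
nodes = [pid for lvl in levels for pid in lvl]
pid2idx = {pid: i for i, pid in enumerate(nodes)}   # source paper -> index 0

g = dgl.DGLGraph()
g.add_nodes(len(nodes))
for pid in nodes:
    dst = [pid2idx[r] for r in toy_refs[pid]["references"] if r in pid2idx]
    dst.append(pid2idx[pid])                        # self loop, as in generate_dgl_graph
    g.add_edges([pid2idx[pid]] * len(dst), dst)
print(levels, g.number_of_edges())
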
Exemple #23
0
                        target = file2label[file]))
        else:
            train_dataset.append(Instance(raw_words = raw_words,
                        words = words,
                        seq_len = seq_len,
                        target = file2label[file]))

train_dataset.set_input('words', 'seq_len', 'target')
test_dataset.set_input('words', 'seq_len', 'target')

train_dataset.set_target('target')
test_dataset.set_target('target')

'''build vocabulary'''
vocab = Vocabulary()
vocab.from_dataset(train_dataset, field_name='words', no_create_entry_dataset=[test_dataset])
vocab.index_dataset(train_dataset, test_dataset, field_name='words')

target_vocab = Vocabulary(padding=None, unknown=None)
target_vocab.from_dataset(train_dataset, field_name='target', no_create_entry_dataset=[test_dataset])
target_vocab.index_dataset(train_dataset, test_dataset, field_name='target')

'''build bundle'''
data_dict = {"train":train_dataset, "test":test_dataset}
vocab_dict = {"words":vocab, "target":target_vocab}
data_bundle = DataBundle(vocab_dict, data_dict)
print(data_bundle)

'''build model'''
embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='en-base-uncased', include_cls_sep=True)
model = BertForSequenceClassification(embed, len(data_bundle.get_vocab('target')))
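
# Hedged continuation of this example: fitting the model with fastNLP's Trainer.
# The loss, metric, batch size, epoch count and the use of the test set for evaluation
# are assumptions for this sketch, not taken from the source snippet.
import torch
from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric

device = 0 if torch.cuda.is_available() else 'cpu'
trainer = Trainer(train_data=data_bundle.get_dataset('train'),
                  model=model,
                  loss=CrossEntropyLoss(),
                  metrics=AccuracyMetric(),
                  dev_data=data_bundle.get_dataset('test'),
                  batch_size=16,
                  n_epochs=3,
                  device=device)
trainer.train()
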
    def process(self,
                paths: Union[str, Dict[str, str]],
                word_vocab_opt: VocabularyOption = None,
                lower: bool = False):
        """
        读取并处理数据。数据中的'-DOCSTART-'开头的行会被忽略

        :param paths:
        :param word_vocab_opt: vocabulary的初始化值
        :param lower: 是否将所有字母转为小写。
        :return:
        """
        # load the data
        paths = check_dataloader_paths(paths)
        data = DataInfo()
        input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN]
        target_fields = [Const.TARGET, Const.INPUT_LEN]
        for name, path in paths.items():
            dataset = self.load(path)
            dataset.apply_field(lambda words: words,
                                field_name='raw_words',
                                new_field_name=Const.INPUT)
            if lower:
                dataset.words.lower()
            data.datasets[name] = dataset

        # construct the word vocab
        word_vocab = Vocabulary(
            min_freq=2) if word_vocab_opt is None else Vocabulary(
                **word_vocab_opt)
        word_vocab.from_dataset(data.datasets['train'],
                                field_name=Const.INPUT,
                                no_create_entry_dataset=[
                                    dataset
                                    for name, dataset in data.datasets.items()
                                    if name != 'train'
                                ])
        word_vocab.index_dataset(*data.datasets.values(),
                                 field_name=Const.INPUT,
                                 new_field_name=Const.INPUT)
        data.vocabs[Const.INPUT] = word_vocab

        # cap words
        cap_word_vocab = Vocabulary()
        cap_word_vocab.from_dataset(
            data.datasets['train'],
            field_name='raw_words',
            no_create_entry_dataset=[
                dataset for name, dataset in data.datasets.items()
                if name != 'train'
            ])
        cap_word_vocab.index_dataset(*data.datasets.values(),
                                     field_name='raw_words',
                                     new_field_name='cap_words')
        input_fields.append('cap_words')
        data.vocabs['cap_words'] = cap_word_vocab

        # build the target vocab
        target_vocab = Vocabulary(unknown=None, padding=None)
        target_vocab.from_dataset(*data.datasets.values(),
                                  field_name=Const.TARGET)
        target_vocab.index_dataset(*data.datasets.values(),
                                   field_name=Const.TARGET)
        data.vocabs[Const.TARGET] = target_vocab

        for name, dataset in data.datasets.items():
            dataset.add_seq_len(Const.INPUT, new_field_name=Const.INPUT_LEN)
            dataset.set_input(*input_fields)
            dataset.set_target(*target_fields)

        return data
Exemple #25
0
def load_weibo_ner(path,
                   unigram_embedding_path=None,
                   bigram_embedding_path=None,
                   index_token=True,
                   char_min_freq=1,
                   bigram_min_freq=1,
                   only_train_min_freq=0,
                   char_word_dropout=0.01):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    loader = ConllLoader(['chars', 'target'])

    train_path = os.path.join(path, 'weiboNER_2nd_conll.train')
    dev_path = os.path.join(path, 'weiboNER_2nd_conll.dev')
    test_path = os.path.join(path, 'weiboNER_2nd_conll.test')

    paths = {}
    paths['train'] = train_path
    paths['dev'] = dev_path
    paths['test'] = test_path

    datasets = {}

    for k, v in paths.items():
        bundle = loader.load(v)
        datasets[k] = bundle.datasets['train']

    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))
    # print(*list(datasets.keys()))
    vocabs = {}
    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()

    for k, v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    char_vocab.from_dataset(
        datasets['train'],
        field_name='chars',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target',-100)
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(
        datasets['train'],
        field_name='bigrams',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if index_token:
        char_vocab.index_dataset(*list(datasets.values()),
                                 field_name='chars',
                                 new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()),
                                   field_name='bigrams',
                                   new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()),
                                  field_name='target',
                                  new_field_name='target')

    # for k,v in datasets.items():
    #     v.set_input('chars','bigrams','seq_len','target')
    #     v.set_target('target','seq_len')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}

    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(
            char_vocab,
            model_dir_or_name=unigram_embedding_path,
            word_dropout=char_word_dropout,
            min_freq=char_min_freq,
            only_train_min_freq=only_train_min_freq,
        )
        embeddings['char'] = unigram_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(
            bigram_vocab,
            model_dir_or_name=bigram_embedding_path,
            word_dropout=0.01,
            min_freq=bigram_min_freq,
            only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
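
# Hedged usage of load_weibo_ner above. The data directory and the pretrained
# unigram/bigram vector files are placeholders, not paths from the source.
datasets, vocabs, embeddings = load_weibo_ner(
    path='data/WeiboNER',                    # must contain weiboNER_2nd_conll.{train,dev,test}
    unigram_embedding_path='data/uni.vec',   # assumed char-level pretrained vectors
    bigram_embedding_path='data/bi.vec',     # assumed bigram-level pretrained vectors
    index_token=True)
print(datasets['train'][0])                  # fields: chars, bigrams, target, seq_len
print(len(vocabs['char']), len(vocabs['bigram']), len(vocabs['label']))
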
Exemple #26
0
def load_ip_step2(path, char_embedding_path=None):
    train_path = os.path.join(path, 'train1.txt')
    dev_path = os.path.join(path, 'dev1.txt')
    test_path = os.path.join(path, 'test1.txt')

    # sample line: 播放徐秉龙的故事\t徐秉龙\talbum_film\t[0 0 0 0 1]  (text \t entity \t entity label \t ...)
    loader = myConllLoader()
    train_bundle = loader.load(train_path)
    dev_bundle = loader.load(dev_path)
    test_bundle = loader.load(test_path)

    datasets = dict()
    datasets['train'] = train_bundle.datasets['train']
    datasets['dev'] = dev_bundle.datasets['train']
    datasets['test'] = test_bundle.datasets['train']

    char_vocab = Vocabulary()
    entity_vocab = Vocabulary()
    logging.info('dev instance:{}'.format(len(datasets['dev'])))
    logging.info('test instance:{}'.format(len(datasets['test'])))
    logging.info('train instance:{}'.format(len(datasets['train'])))

    char_vocab.from_dataset(datasets['train'], datasets['dev'], datasets['test'], field_name='left_context')
    char_vocab.from_dataset(datasets['train'], datasets['dev'], datasets['test'], field_name='right_context')
    char_vocab.from_dataset(datasets['train'], datasets['dev'], datasets['test'], field_name='raw_entity')

    entity_vocab.from_dataset(datasets['train'], datasets['dev'], datasets['test'], field_name='raw_entity_label')

    char_vocab.index_dataset(datasets['train'],
                             datasets['dev'],
                             datasets['test'],
                             field_name='left_context',
                             new_field_name='left_chars')
    char_vocab.index_dataset(datasets['train'],
                             datasets['dev'],
                             datasets['test'],
                             field_name='right_context',
                             new_field_name='right_chars')
    char_vocab.index_dataset(datasets['train'],
                             datasets['dev'],
                             datasets['test'],
                             field_name='raw_entity',
                             new_field_name='entity_chars')
    entity_vocab.index_dataset(datasets['train'],
                               datasets['dev'],
                               datasets['test'],
                               field_name='raw_entity_label',
                               new_field_name='entity_label')

    vocabs = {}
    vocabs['char'] = char_vocab
    vocabs['entity'] = entity_vocab

    embeddings = {}
    if char_embedding_path is not None:
        bi_voc = dict()
        for k, v in char_vocab:
            bi_voc[k] = v
        embed_weight = build_pretrain_embedding(char_embedding_path, bi_voc, embedd_dim=200)
        embeddings['char'] = embed_weight

    return datasets, vocabs, embeddings
def load_toy_ner(path, char_embedding_path=None, bigram_embedding_path=None, index_token=True, train_clip=False):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    train_path = os.path.join(path, 'toy_train.bmoes')
    dev_path = os.path.join(path, 'toy_dev.bmoes')
    test_path = os.path.join(path, 'toy_test.bmoes')

    loader = ConllLoader(['chars', 'target'])
    train_bundle = loader.load(train_path)
    dev_bundle = loader.load(dev_path)
    test_bundle = loader.load(test_path)

    datasets = dict()
    datasets['train'] = train_bundle.datasets['train']
    datasets['dev'] = dev_bundle.datasets['train']
    datasets['test'] = test_bundle.datasets['train']

    datasets['train'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['dev'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['test'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')

    datasets['train'].add_seq_len('chars')
    datasets['dev'].add_seq_len('chars')
    datasets['test'].add_seq_len('chars')

    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary(padding=None, unknown=None)
    print(datasets.keys())
    print("dev:", len(datasets['dev']))
    print("test:", len(datasets['test']))
    print("train:", len(datasets['train']))
    char_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    if index_token:
        char_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                 field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                   field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                  field_name='target', new_field_name='target')

    vocabs = {}
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab
    # add span_label / attr_start / attr_end fields and their vocabularies
    datasets, vocabs = input_with_span_attr(datasets, vocabs)

    embeddings = {}
    if char_embedding_path is not None:
        char_embedding = StaticEmbedding(char_vocab, char_embedding_path, word_dropout=0.01, )
        embeddings['char'] = char_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, bigram_embedding_path, word_dropout=0.01)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
Exemple #28
0
data_set.rename_field('Sentiment', 'target')


def get_words(instance):
    ins = instance['raw_words'].split()
    if not ins:
        ins.append('nothing')
    return ins


data_set.apply(get_words, new_field_name='words')
#data_set.apply(lambda ins: ins['raw_words'].split(), new_field_name='words')
#data_set.drop(lambda ins:ins['raw_words'].strip()=='')

vocab = Vocabulary()
vocab.from_dataset(data_set, field_name='words')
vocab.index_dataset(data_set, field_name='words')

vocab_target = Vocabulary(unknown=None, padding=None)
vocab_target.from_dataset(data_set, field_name='target')
vocab_target.index_dataset(data_set, field_name='target')

data_set.set_input('words')
data_set.set_target('target')

train_data, dev_data = data_set.split(0.015)

# training
device = 0 if torch.cuda.is_available() else 'cpu'
'''
EMBED_DIM = 100
def load_resume_ner(path, char_embedding_path=None, bigram_embedding_path=None, index_token=True,
                    char_min_freq=1, bigram_min_freq=1, only_train_min_freq=0, train_dataset_rate=1.0):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    train_dataset_rate_suffix = '' if train_dataset_rate == 1.0 else f'_{train_dataset_rate}'
    train_path = os.path.join(path, f'train.char.bmoes{train_dataset_rate_suffix}')
    dev_path = os.path.join(path, 'dev.char.bmoes')
    test_path = os.path.join(path, 'test.char.bmoes')

    print(f"load train dataset: {train_path}")
    loader = ConllLoader(['chars', 'target'])
    train_bundle = loader.load(train_path)
    dev_bundle = loader.load(dev_path)
    test_bundle = loader.load(test_path)

    datasets = dict()
    datasets['train'] = train_bundle.datasets['train']
    datasets['dev'] = dev_bundle.datasets['train']
    datasets['test'] = test_bundle.datasets['train']

    datasets['train'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['dev'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['test'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')

    datasets['train'].add_seq_len('chars')
    datasets['dev'].add_seq_len('chars')
    datasets['test'].add_seq_len('chars')

    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()
    print(datasets.keys())
    print("dev:", len(datasets['dev']))
    print("test:", len(datasets['test']))
    print("train:", len(datasets['train']))
    char_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    if index_token:
        char_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                 field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                   field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                  field_name='target', new_field_name='target')

    vocabs = {}
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab
    # add span_label / attr_start / attr_end fields and their vocabularies
    datasets, vocabs = input_with_span_attr(datasets, vocabs)

    embeddings = {}
    if char_embedding_path is not None:
        char_embedding = StaticEmbedding(char_vocab, char_embedding_path, word_dropout=0.01,
                                         min_freq=char_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['char'] = char_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, bigram_embedding_path, word_dropout=0.01,
                                           min_freq=bigram_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
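
# Hedged usage of load_resume_ner; the data directory is a placeholder. Note how
# train_dataset_rate selects a suffixed training file (train.char.bmoes_0.1 for a
# 10% subsample) while dev/test are always read in full.
datasets, vocabs, embeddings = load_resume_ner(
    path='data/ResumeNER',
    char_embedding_path=None,        # skip pretrained vectors in this sketch
    bigram_embedding_path=None,
    index_token=True,
    train_dataset_rate=0.1)          # reads train.char.bmoes_0.1
print({k: len(v) for k, v in datasets.items()})
print(sorted(vocabs.keys()))         # char / bigram / label plus the span- and attr-level vocabs
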
Exemple #30
0
    def process(self,
                paths: Union[str, Dict[str, str]],
                word_vocab_opt: VocabularyOption = None,
                lower: bool = True) -> DataBundle:
        """
        读取并处理数据。返回的DataInfo包含以下的内容
            vocabs:
                word: Vocabulary
                target: Vocabulary
            datasets:
                train: DataSet
                    words: List[int], 被设置为input
                    target: int. label,被同时设置为input和target
                    seq_len: int. 句子的长度,被同时设置为input和target
                    raw_words: List[str]
                xxx(根据传入的paths可能有所变化)

        :param paths:
        :param word_vocab_opt: vocabulary的初始化值
        :param lower: 是否使用小写
        :return:
        """
        paths = check_dataloader_paths(paths)
        data = DataBundle()
        input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN]
        target_fields = [Const.TARGET, Const.INPUT_LEN]
        for name, path in paths.items():
            dataset = self.load(path)
            dataset.apply_field(lambda words: words,
                                field_name='raw_words',
                                new_field_name=Const.INPUT)
            if lower:
                dataset.words.lower()
            data.datasets[name] = dataset

        # construct the word vocab
        word_vocab = Vocabulary(
            min_freq=2) if word_vocab_opt is None else Vocabulary(
                **word_vocab_opt)
        word_vocab.from_dataset(data.datasets['train'],
                                field_name=Const.INPUT,
                                no_create_entry_dataset=[
                                    dataset
                                    for name, dataset in data.datasets.items()
                                    if name != 'train'
                                ])
        word_vocab.index_dataset(*data.datasets.values(),
                                 field_name=Const.INPUT,
                                 new_field_name=Const.INPUT)
        data.vocabs[Const.INPUT] = word_vocab

        # cap words
        cap_word_vocab = Vocabulary()
        cap_word_vocab.from_dataset(*data.datasets.values(),
                                    field_name='raw_words')
        cap_word_vocab.index_dataset(*data.datasets.values(),
                                     field_name='raw_words',
                                     new_field_name='cap_words')
        input_fields.append('cap_words')
        data.vocabs['cap_words'] = cap_word_vocab

        # build the target vocab
        target_vocab = Vocabulary(unknown=None, padding=None)
        target_vocab.from_dataset(*data.datasets.values(),
                                  field_name=Const.TARGET)
        target_vocab.index_dataset(*data.datasets.values(),
                                   field_name=Const.TARGET)
        data.vocabs[Const.TARGET] = target_vocab

        for name, dataset in data.datasets.items():
            dataset.add_seq_len(Const.INPUT, new_field_name=Const.INPUT_LEN)
            dataset.set_input(*input_fields)
            dataset.set_target(*target_fields)

        return data
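
# The extra cap_words field presumably keeps a cased view of the tokens next to the
# lowercased Const.INPUT ids. A tiny standalone sketch of that idea with a toy sentence
# and fresh Vocabulary objects (not the loader's actual data):
from fastNLP import Vocabulary

raw_words = ['Barack', 'Obama', 'visited', 'Paris']
lower_words = [w.lower() for w in raw_words]

word_vocab = Vocabulary()            # lowercased view -> Const.INPUT ('words')
word_vocab.add_word_lst(lower_words)

cap_vocab = Vocabulary()             # cased view -> 'cap_words'
cap_vocab.add_word_lst(raw_words)

print([word_vocab.to_index(w) for w in lower_words])
print([cap_vocab.to_index(w) for w in raw_words])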