Example #1
    def process(self,
                paths: Union[str, Dict[str, str]],
                train_ds: Iterable[str] = None,
                src_vocab_op: VocabularyOption = None,
                tgt_vocab_op: VocabularyOption = None,
                embed_opt: EmbeddingOption = None,
                char_level_op=False):
        paths = check_dataloader_paths(paths)
        datasets = {}
        info = DataInfo(datasets=self.load(paths))
        src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(
            **src_vocab_op)
        tgt_vocab = Vocabulary(unknown=None, padding=None) \
            if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op)
        _train_ds = [info.datasets[name] for name in train_ds
                     ] if train_ds else info.datasets.values()

        def wordtochar(words):
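            # split each word into lowercase characters; an empty string marks
            # the word boundary, and the trailing separator is popped at the end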
            chars = []
            for word in words:
                word = word.lower()
                for char in word:
                    chars.append(char)
                chars.append('')
            chars.pop()
            return chars

        input_name, target_name = 'words', 'target'
        info.vocabs = {}
        # split the words into characters
        if char_level_op:
            for dataset in info.datasets.values():
                dataset.apply_field(wordtochar,
                                    field_name="words",
                                    new_field_name='chars')
        else:
            src_vocab.from_dataset(*_train_ds, field_name=input_name)
            src_vocab.index_dataset(*info.datasets.values(),
                                    field_name=input_name,
                                    new_field_name=input_name)
            info.vocabs[input_name] = src_vocab
        # if embed_opt is not None:
        #     embed = EmbedLoader.load_with_vocab(**embed_opt, vocab=vocab)
        #     info.embeddings['words'] = embed

        tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
        tgt_vocab.index_dataset(*info.datasets.values(),
                                field_name=target_name,
                                new_field_name=target_name)

        info.vocabs[target_name] = tgt_vocab

        info.datasets['train'], info.datasets['dev'] = info.datasets[
            'train'].split(0.1, shuffle=False)

        for name, dataset in info.datasets.items():
            dataset.set_input("words")
            dataset.set_target("target")

        return info
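Most of the loaders in this collection follow the same core pattern: build a Vocabulary from the training split only (optionally registering the other splits via no_create_entry_dataset) and then index every split with it. A minimal self-contained sketch of that pattern; the data and field names below are illustrative and not taken from any specific example:

from fastNLP import DataSet, Vocabulary

ds_train = DataSet({'words': [['a', 'simple', 'example'], ['another', 'example']],
                    'target': ['pos', 'neg']})
ds_dev = DataSet({'words': [['a', 'held', 'out', 'example']],
                  'target': ['pos']})

src_vocab = Vocabulary()
src_vocab.from_dataset(ds_train, field_name='words',
                       no_create_entry_dataset=[ds_dev])  # dev-only words are flagged, not trained
src_vocab.index_dataset(ds_train, ds_dev, field_name='words', new_field_name='words')

tgt_vocab = Vocabulary(unknown=None, padding=None)
tgt_vocab.from_dataset(ds_train, field_name='target')
tgt_vocab.index_dataset(ds_train, ds_dev, field_name='target', new_field_name='target')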
Example #2
    def test_roberta_embed_eq_roberta_piece_encoder(self):
        # mainly checks that the embedding output matches the word piece encoder output
        weight_path = 'test/data_for_tests/embedding/small_roberta'
        ds = DataSet({
            'words': ["this is a texta a sentence".split(), 'this is'.split()]
        })
        encoder = RobertaWordPieceEncoder(model_dir_or_name=weight_path)
        encoder.eval()
        encoder.index_datasets(ds, field_name='words')
        word_pieces = torch.LongTensor(ds['word_pieces'].get([0, 1]))
        word_pieces_res = encoder(word_pieces)

        vocab = Vocabulary()
        vocab.from_dataset(ds, field_name='words')
        vocab.index_dataset(ds, field_name='words', new_field_name='words')
        ds.set_input('words')
        words = torch.LongTensor(ds['words'].get([0, 1]))
        embed = RobertaEmbedding(vocab,
                                 model_dir_or_name=weight_path,
                                 pool_method='first',
                                 include_cls_sep=True,
                                 pooled_cls=False)
        embed.eval()
        words_res = embed(words)

        # check that the word-piece outputs line up with the word-level embedding outputs
        self.assertEqual((word_pieces_res[0, :5] - words_res[0, :5]).sum(), 0)
        self.assertEqual((word_pieces_res[0, 6:] - words_res[0, 5:]).sum(), 0)
        self.assertEqual((word_pieces_res[1, :3] - words_res[1, :3]).sum(), 0)

    def test_bert_embed_eq_bert_piece_encoder(self):
        ds = DataSet({
            'words':
            ["this is a texta model vocab".split(), 'this is'.split()]
        })
        encoder = BertWordPieceEncoder(
            model_dir_or_name='test/data_for_tests/embedding/small_bert')
        encoder.eval()
        encoder.index_datasets(ds, field_name='words')
        word_pieces = torch.LongTensor(ds['word_pieces'].get([0, 1]))
        word_pieces_res = encoder(word_pieces)

        vocab = Vocabulary()
        vocab.from_dataset(ds, field_name='words')
        vocab.index_dataset(ds, field_name='words', new_field_name='words')
        ds.set_input('words')
        words = torch.LongTensor(ds['words'].get([0, 1]))
        embed = BertEmbedding(
            vocab,
            model_dir_or_name='test/data_for_tests/embedding/small_bert',
            pool_method='first',
            include_cls_sep=True,
            pooled_cls=False,
            min_freq=1)
        embed.eval()
        words_res = embed(words)

        # check that the word-piece outputs line up with the word-level embedding outputs
        self.assertEqual((word_pieces_res[0, :5] - words_res[0, :5]).sum(), 0)
        self.assertEqual((word_pieces_res[0, 6:] - words_res[0, 5:]).sum(), 0)
        self.assertEqual((word_pieces_res[1, :3] - words_res[1, :3]).sum(), 0)
Example #4
def preprocess():
    train_set = DataSet()
    for i in range(len(raw_train.data)):
        train_set.append(
            Instance(sentence=raw_train.data[i],
                     target=int(raw_train.target[i])))

    train_set.apply(lambda x: x['sentence'].translate(
        str.maketrans("", "", string.punctuation)).lower(),
                    new_field_name='sentence')
    train_set.apply(lambda x: x['sentence'].split(), new_field_name='words')
    train_set.apply(lambda x: len(x['words']), new_field_name='seq_len')

    test_set = DataSet()
    for i in range(len(raw_test.data)):
        test_set.append(
            Instance(sentence=raw_test.data[i],
                     target=int(raw_test.target[i])))

    test_set.apply(lambda x: x['sentence'].translate(
        str.maketrans("", "", string.punctuation)).lower(),
                   new_field_name='sentence')
    test_set.apply(lambda x: x['sentence'].split(), new_field_name='words')
    test_set.apply(lambda x: len(x['words']), new_field_name='seq_len')

    vocab = Vocabulary(min_freq=10)
    train_set.apply(lambda x: [vocab.add(word) for word in x['words']])
    test_set.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()
    vocab.index_dataset(train_set, field_name='words', new_field_name='words')
    vocab.index_dataset(test_set, field_name='words', new_field_name='words')

    return train_set, test_set, vocab
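preprocess() relies on module-level raw_train / raw_test objects that are not shown above. One plausible setup, assuming the 20 Newsgroups data used elsewhere in these examples, would be:

from sklearn.datasets import fetch_20newsgroups

raw_train = fetch_20newsgroups(subset='train')
raw_test = fetch_20newsgroups(subset='test')
train_set, test_set, vocab = preprocess()
print(len(train_set), len(test_set), len(vocab))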
def create_dataset():
        # categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space', 'rec.motorcycles']
        # categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space', 'rec.motorcycles', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale']
        categories = ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
                      'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball',
                      'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space',
                      'soc.religion.christian', 'talk.politics.guns',
                      'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']

        newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, data_home='../../..')
        newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, data_home='../../..')

        dataset = DataSet()

        for i in range(len(newsgroups_train.data)):
            if len(newsgroups_train.data[i]) <= 2000:
                dataset.append(Instance(raw_sentence=newsgroups_train.data[i], target=int(newsgroups_train.target[i])))
        for i in range(len(newsgroups_test.data)):
            if len(newsgroups_test.data[i]) <= 2000:
                dataset.append(Instance(raw_sentence=newsgroups_test.data[i], target=int(newsgroups_test.target[i])))

        dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')
        dataset.apply(lambda x: x['sentence'].split(), new_field_name='words')
        dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')

        vocab = Vocabulary(min_freq=2).from_dataset(dataset, field_name='words')
        vocab.index_dataset(dataset, field_name='words', new_field_name='words')

        dataset.set_input('words', 'seq_len')
        dataset.set_target('target')

        train_dev_data, test_data = dataset.split(0.1)
        train_data, dev_data = train_dev_data.split(0.1)

        return vocab, train_data, dev_data, test_data
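Note that DataSet.split(ratio) returns two datasets, with the second holding roughly ratio of the instances, so the two splits above leave about 81% of the data for training, 9% for development and 10% for testing.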
Example #6
    def process(self,
                paths: Union[str, Dict[str, str]],
                src_vocab_opt: VocabularyOption = None,
                tgt_vocab_opt: VocabularyOption = None,
                src_embed_opt: EmbeddingOption = None,
                char_level_op=False):

        datasets = {}
        info = DataBundle()
        paths = check_dataloader_paths(paths)
        for name, path in paths.items():
            dataset = self.load(path)
            datasets[name] = dataset

        def wordtochar(words):
            chars = []
            for word in words:
                word = word.lower()
                for char in word:
                    chars.append(char)
                chars.append('')
            chars.pop()
            return chars

        if char_level_op:
            for dataset in datasets.values():
                dataset.apply_field(wordtochar,
                                    field_name="words",
                                    new_field_name='chars')

        datasets["train"], datasets["dev"] = datasets["train"].split(
            0.1, shuffle=False)

        src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(
            **src_vocab_opt)
        src_vocab.from_dataset(datasets['train'], field_name='words')

        src_vocab.index_dataset(*datasets.values(), field_name='words')

        tgt_vocab = Vocabulary(unknown=None, padding=None) \
            if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
        tgt_vocab.from_dataset(datasets['train'], field_name='target')
        tgt_vocab.index_dataset(*datasets.values(), field_name='target')

        info.vocabs = {"words": src_vocab, "target": tgt_vocab}

        info.datasets = datasets

        if src_embed_opt is not None:
            embed = EmbedLoader.load_with_vocab(**src_embed_opt,
                                                vocab=src_vocab)
            info.embeddings['words'] = embed

        for name, dataset in info.datasets.items():
            dataset.set_input("words")
            dataset.set_target("target")

        return info
def get_train_dev_test_vocab():

    dataset_train = fetch_20newsgroups(subset='train', data_home='../../../')
    dataset_test = fetch_20newsgroups(subset='test', data_home='../../../')
    # dataset_train, dataset_test = get_text_classification_datasets()

    train_data = dataset_train.data
    train_target = dataset_train.target
    test_data = dataset_test.data
    test_target = dataset_test.target
    print(f'train dataset: {len(train_data)}')
    print(f'test dataset: {len(test_data)}')

    train_dataset = to_dataset(train_data, train_target)
    test_dataset = to_dataset(test_data, test_target)

    vocab = Vocabulary(min_freq=10).from_dataset(train_dataset,
                                                 field_name='words')
    print(f'Vocab size: {len(vocab)}')

    vocab.index_dataset(train_dataset,
                        field_name='words',
                        new_field_name='words')
    vocab.index_dataset(test_dataset,
                        field_name='words',
                        new_field_name='words')

    train_dataset.apply_field(lambda x: len(x),
                              field_name='words',
                              new_field_name='seq_len')
    test_dataset.apply_field(lambda x: len(x),
                             field_name='words',
                             new_field_name='seq_len')

    # Rename to suit inbuilt Model in fastNLP
    train_dataset.rename_field('words', Const.INPUT)
    train_dataset.rename_field('seq_len', Const.INPUT_LEN)
    train_dataset.rename_field('target', Const.TARGET)
    train_dataset.set_input(Const.INPUT, Const.INPUT_LEN)
    train_dataset.set_target(Const.TARGET)

    test_dataset.rename_field('words', Const.INPUT)
    test_dataset.rename_field('seq_len', Const.INPUT_LEN)
    test_dataset.rename_field('target', Const.TARGET)
    test_dataset.set_input(Const.INPUT, Const.INPUT_LEN)
    test_dataset.set_target(Const.TARGET)

    # Split into development dataset
    train_dataset, dev_dataset = train_dataset.split(0.1)

    return train_dataset, dev_dataset, test_dataset, vocab
Example #8
def load_sst2(dict_path, embedding_path=None):
    '''

    :param dict_path: /remote-home/xnli/data/corpus/text_classification/SST-2/
    :param embedding_path: glove 300d txt
    :return:
    '''
    train_path = os.path.join(dict_path, 'train.tsv')
    dev_path = os.path.join(dict_path, 'dev.tsv')

    loader = CSVLoader(headers=('words', 'target'), sep='\t')
    train_data = loader.load(train_path).datasets['train']
    dev_data = loader.load(dev_path).datasets['train']

    train_data.apply_field(lambda x: x.split(),
                           field_name='words',
                           new_field_name='words')
    dev_data.apply_field(lambda x: x.split(),
                         field_name='words',
                         new_field_name='words')

    train_data.apply_field(lambda x: len(x),
                           field_name='words',
                           new_field_name='seq_len')
    dev_data.apply_field(lambda x: len(x),
                         field_name='words',
                         new_field_name='seq_len')

    vocab = Vocabulary(min_freq=2)
    vocab.from_dataset(train_data, field_name='words')
    vocab.from_dataset(dev_data, field_name='words')

    # pretrained_embedding = load_word_emb(embedding_path, 300, vocab)

    label_vocab = Vocabulary(padding=None,
                             unknown=None).from_dataset(train_data,
                                                        field_name='target')

    label_vocab.index_dataset(train_data, field_name='target')
    label_vocab.index_dataset(dev_data, field_name='target')

    vocab.index_dataset(train_data, field_name='words', new_field_name='words')
    vocab.index_dataset(dev_data, field_name='words', new_field_name='words')

    train_data.set_input(Const.INPUT, Const.INPUT_LEN)
    train_data.set_target(Const.TARGET)

    dev_data.set_input(Const.INPUT, Const.INPUT_LEN)
    dev_data.set_target(Const.TARGET)

    if embedding_path is not None:
        pretrained_embedding = load_word_emb(embedding_path, 300, vocab)
        return (train_data, dev_data), (vocab,
                                        label_vocab), pretrained_embedding

    else:
        return (train_data, dev_data), (vocab, label_vocab)
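A hypothetical invocation of load_sst2; the path is a placeholder for a directory containing the tab-separated train.tsv and dev.tsv mentioned in the docstring:

(train_data, dev_data), (vocab, label_vocab) = load_sst2('/path/to/SST-2/')
print(len(train_data), len(dev_data), len(vocab), len(label_vocab))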
Example #9
def get_label_vocab(data_type='default'):
    label = [
        'family', 'education', 'money', 'med_exam', 'ID', 'contact', 'name',
        'time', 'location', 'profession'
    ]
    total_label = []

    for prefix in tagging_method:
        total_label.extend([prefix + '-' + ele for ele in label])
    total_label.append('O')
    print(total_label)
    label_ds = DataSet({'target': total_label})
    label_vocab = Vocabulary(unknown=None, padding=None)
    label_vocab.from_dataset(label_ds, field_name='target')
    label_vocab.index_dataset(label_ds, field_name='target')
    # label_vocab.add_word_lst(total_label)
    return label_vocab
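tagging_method is a module-level sequence of tag prefixes that is not shown in this example; assuming a BIOES-style scheme purely for illustration, a call could look like:

tagging_method = ['B', 'I', 'E', 'S']  # assumed prefixes, not from the original code
label_vocab = get_label_vocab()
print(len(label_vocab), label_vocab.idx2word)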
Example #10
def preprocess(batch=16):
    raw_data1 = []
    raw_data2 = []

    for i in range(len(traindata.data)):
        raw_data1.append(
            Instance(sentence=traindata.data[i],
                     label=int(traindata.target[i])))
    trainset = DataSet(raw_data1)
    trainset.apply(lambda x: pre(x['sentence']), new_field_name='words')

    for i in range(len(testdata.data)):
        raw_data2.append(
            Instance(sentence=testdata.data[i], label=int(testdata.target[i])))
    testset = DataSet(raw_data2)
    testset.apply(lambda x: pre(x['sentence']), new_field_name='words')

    global vocab
    vocab = Vocabulary(min_freq=1).from_dataset(trainset,
                                                testset,
                                                field_name='words')
    vocab.index_dataset(trainset,
                        testset,
                        field_name='words',
                        new_field_name='words')
    trainset.set_input('words')
    testset.set_input('words')

    trainset.apply(lambda x: int(x['label']),
                   new_field_name='target',
                   is_target=True)
    testset.apply(lambda x: int(x['label']),
                  new_field_name='target',
                  is_target=True)

    trainset.apply(lambda x: len(x['words']), new_field_name='seq_len')
    testset.apply(lambda x: len(x['words']), new_field_name='seq_len')

    global vocabsize
    vocabsize = len(vocab)
    sampler = BucketSampler(batch_size=batch, seq_len_field_name='seq_len')
    train_batch = Batch(batch_size=batch, dataset=trainset, sampler=sampler)
    test_batch = Batch(batch_size=batch, dataset=testset, sampler=sampler)

    return train_batch, test_batch, vocabsize
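Batch and BucketSampler here belong to the older fastNLP batching API (later superseded by DataSetIter); iterating train_batch yields (batch_x, batch_y) dictionaries built from the fields marked as input and target above.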
Example #11
def Get_Data_Vocab():
    dataset_train = fetch_20newsgroups(subset='train', data_home='../../..')
    dataset_test = fetch_20newsgroups(subset='test', data_home='../../..')

    train_data_raw = construct_dataset(dataset_train)
    test_data = construct_dataset(dataset_test)
    vocab = Vocabulary(min_freq=10).from_dataset(train_data_raw,
                                                 field_name='input')
    vocab.index_dataset(train_data_raw,
                        field_name='input',
                        new_field_name='input')
    vocab.index_dataset(test_data, field_name='input', new_field_name='input')
    train_data_raw.set_input("input")
    train_data_raw.set_target("target")
    test_data.set_input("input")
    test_data.set_target("target")
    dev_data, train_data = train_data_raw.split(0.8)

    return vocab, train_data, dev_data, test_data
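construct_dataset is not shown in this example; a minimal sketch of what it is assumed to do (tokenise each document into an 'input' field and keep the integer target) could look like:

from fastNLP import DataSet, Instance

def construct_dataset(bunch):
    ds = DataSet()
    for text, target in zip(bunch.data, bunch.target):
        ds.append(Instance(input=text.lower().split(), target=int(target)))
    return ds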
Example #12
def prepare_ptb(args):
    datas = {}
    datas["pos"] = (ConllLoader(headers=["words", "pos"],
                                indexes=[0, 1]).load(args.pos).datasets)
    chunk_data = (ConllLoader(headers=["words", "chunk"],
                              indexes=[0, 2]).load(args.chunk).datasets)
    chunk_data['train'], chunk_data['dev'] = chunk_data['train'].split(0.1)
    datas['chunk'] = chunk_data
    datas["ner"] = (ConllLoader(headers=["words", "ner"],
                                indexes=[0, 3]).load(args.ner).datasets)

    for ds in datas['chunk'].values():
        ds.apply_field(lambda x: iob2(x), 'chunk', 'chunk')
    for ds in datas['ner'].values():
        ds.apply_field(lambda x: iob2bioes(iob2(x)), 'ner', 'ner')

    vocabs = {}
    src_vocab = Vocabulary()
    for idx, task_name in enumerate(["pos", "chunk", "ner"]):
        data = datas[task_name]
        filter_docstart(data)
        vocab = Vocabulary(padding=None, unknown=None)
        vocab.from_dataset(*list(data.values()), field_name=task_name)
        src_vocab.from_dataset(*list(data.values()), field_name="words")
        vocabs[task_name] = vocab

    task_lst = []
    for idx, task_name in enumerate(["pos", "chunk", "ner"]):
        data = datas[task_name]
        src_vocab.index_dataset(*list(data.values()),
                                field_name="words",
                                new_field_name="words")
        vocabs[task_name].index_dataset(*list(data.values()),
                                        field_name=task_name,
                                        new_field_name=task_name)
        for ds in data.values():
            ds.apply_field(len, 'words', 'seq_len')
        task_lst.append(
            Task(idx, task_name, data["train"], data["dev"], data["test"]))
    vocabs["words"] = src_vocab
    return task_lst, vocabs
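A hypothetical call to prepare_ptb; args only needs pos / chunk / ner attributes pointing at CoNLL-formatted files (the paths below are placeholders, and the iob2 / iob2bioes / filter_docstart / Task helpers are assumed to come from the same module):

from argparse import Namespace

args = Namespace(pos='data/ptb_pos.txt',
                 chunk='data/conll2000_chunk.txt',
                 ner='data/conll2003_ner.txt')
task_lst, vocabs = prepare_ptb(args)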
Example #13
def get_data():
    dataset_train, dataset_test = get_text_classification_datasets()
    # print(dataset_train.data)

    dic_train = {
        "input" : dataset_train.data,
        "target" : dataset_train.target
    }
    dic_test = {
        "input" : dataset_test.data,
        "target" : dataset_test.target
    }

    dataset = DataSet(dic_train)
    test_data = DataSet(dic_test)

    dataset.apply_field(lambda x: re.sub(r'[{}]+'.format(string.punctuation), "", x.lower()), field_name='input', new_field_name='input')
    dataset.apply_field(lambda x: re.sub(r'[{}]+'.format(string.whitespace), " ", x), field_name='input', new_field_name='input')
    dataset.apply_field(lambda x: x.split(), field_name='input', new_field_name='words')

    test_data.apply_field(lambda x: re.sub(r'[{}]+'.format(string.punctuation), "", x.lower()), field_name='input', new_field_name='input')
    test_data.apply_field(lambda x: re.sub(r'[{}]+'.format(string.whitespace), " ", x), field_name='input', new_field_name='input')
    test_data.apply_field(lambda x: x.split(), field_name='input', new_field_name='words')


    # **************************
    dataset.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')
    test_data.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')
    dataset.rename_field('words', Const.INPUT)
    dataset.rename_field('seq_len', Const.INPUT_LEN)
    dataset.rename_field('target', Const.TARGET)
    
    test_data.rename_field('words', Const.INPUT)
    test_data.rename_field('seq_len', Const.INPUT_LEN)
    test_data.rename_field('target', Const.TARGET)

    # dataset.set_input(Const.INPUT, Const.INPUT_LEN)
    dataset.set_input(Const.INPUT)
    dataset.set_target(Const.TARGET)

    # test_data.set_input(Const.INPUT, Const.INPUT_LEN)
    test_data.set_input(Const.INPUT)
    test_data.set_target(Const.TARGET)
    # **************************

    # only use train for vocab or train+dev
    train_data, dev_data = dataset.split(0.1)
    # print(len(train_data), len(dev_data), len(test_data))
    # print(train_data[0])

    vocab = Vocabulary(min_freq=10).from_dataset(train_data, field_name=Const.INPUT)

    vocab.index_dataset(train_data, field_name=Const.INPUT,new_field_name=Const.INPUT)
    vocab.index_dataset(dev_data, field_name=Const.INPUT,new_field_name=Const.INPUT)
    vocab.index_dataset(test_data, field_name=Const.INPUT,new_field_name=Const.INPUT)

    # print(test_data[0])
    print(len(vocab))
    return vocab, train_data, dev_data, test_data
Example #14
    def test_from_dataset_no_entry(self):
        # test that the no_create_entry flag is set correctly
        dataset = DataSet()
        start_char = 65
        num_samples = 10
        test_dataset = DataSet()
        for i in range(num_samples):
            char = [chr(start_char + i)] * 6
            ins = Instance(char=char)
            dataset.append(ins)
            ins = Instance(char=[c + c for c in char])
            test_dataset.append(ins)
        vocab = Vocabulary()
        vocab.from_dataset(dataset,
                           field_name='char',
                           no_create_entry_dataset=test_dataset)
        vocab.index_dataset(dataset, field_name='char')
        for i in range(num_samples):
            self.assertEqual(
                True,
                vocab._is_word_no_create_entry(
                    chr(start_char + i) + chr(start_char + i)))
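As used here, no_create_entry_dataset adds words that occur only in the extra dataset(s) to the vocabulary but flags them as "no create entry"; embedding layers built on such a vocabulary (for example the StaticEmbedding calls elsewhere in these examples) can then avoid allocating dedicated trainable vectors for words never seen in training. The private _is_word_no_create_entry check above verifies exactly that flag.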
Example #15
    def process(self, data_bundle):
        data_bundle.copy_field(C.RAW_CHAR, C.CHAR_INPUT)
        input_fields = [C.TARGET, C.CHAR_INPUT, C.INPUT_LEN]
        target_fields = [C.TARGET, C.INPUT_LEN]
        if self.bigram:
            for dataset in data_bundle.datasets.values():
                dataset.apply_field(
                    lambda chars:
                    [c1 + c2 for c1, c2 in zip(chars, chars[1:] + ['<eos>'])],
                    field_name=C.CHAR_INPUT,
                    new_field_name='bigrams')
            bigram_vocab = Vocabulary()
            bigram_vocab.from_dataset(
                data_bundle.get_dataset('train'),
                field_name='bigrams',
                no_create_entry_dataset=[
                    ds for name, ds in data_bundle.datasets.items()
                    if name != 'train'
                ])
            bigram_vocab.index_dataset(*data_bundle.datasets.values(),
                                       field_name='bigrams')
            data_bundle.set_vocab(bigram_vocab, field_name='bigrams')
            input_fields.append('bigrams')

        _add_chars_field(data_bundle, lower=False)

        # index
        _indexize(data_bundle,
                  input_field_names=C.CHAR_INPUT,
                  target_field_names=C.TARGET)

        for name, dataset in data_bundle.datasets.items():
            dataset.set_pad_val(C.TARGET, self.target_pad_val)
            dataset.add_seq_len(C.CHAR_INPUT)

        data_bundle.set_input(*input_fields)
        data_bundle.set_target(*target_fields)

        return data_bundle
Example #16
    def process(self,
                paths: Union[str, Dict[str, str]],
                src_vocab_opt: VocabularyOption = None,
                tgt_vocab_opt: VocabularyOption = None,
                src_embed_opt: EmbeddingOption = None):
        
        paths = check_dataloader_paths(paths)
        datasets = {}
        info = DataBundle()
        for name, path in paths.items():
            dataset = self.load(path)
            datasets[name] = dataset

        src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
        src_vocab.from_dataset(datasets['train'], field_name='words')
        src_vocab.index_dataset(*datasets.values(), field_name='words')

        tgt_vocab = Vocabulary(unknown=None, padding=None) \
            if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
        tgt_vocab.from_dataset(datasets['train'], field_name='target')
        tgt_vocab.index_dataset(*datasets.values(), field_name='target')

        info.vocabs = {
            "words": src_vocab,
            "target": tgt_vocab
        }

        info.datasets = datasets

        if src_embed_opt is not None:
            embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
            info.embeddings['words'] = embed

        for name, dataset in info.datasets.items():
            dataset.set_input("words")
            dataset.set_target("target")

        return info
Example #17
    def preprocess():
        train_set = DataSet()
        for i in range(len(raw_train['data'])):
            di = transfer(raw_train['data'][i])
            train_set.append(
                Instance(sentence=di, target=int(raw_train['target'][i])))

        train_set.apply(lambda x: x['sentence'].lower(),
                        new_field_name='sentence')
        train_set.apply(lambda x: x['sentence'].split(),
                        new_field_name='words')
        train_set.apply(lambda x: len(x['words']), new_field_name='seq_len')

        test_set = DataSet()
        for i in range(len(raw_test['data'])):
            di = transfer(raw_test['data'][i])
            test_set.append(
                Instance(sentence=di, target=int(raw_test['target'][i])))

        test_set.apply(lambda x: x['sentence'].lower(),
                       new_field_name='sentence')
        test_set.apply(lambda x: x['sentence'].split(), new_field_name='words')
        test_set.apply(lambda x: len(x['words']), new_field_name='seq_len')

        word_dict = Vocabulary(min_freq=2)
        train_set.apply(lambda x: [word_dict.add(word) for word in x['words']])
        test_set.apply(lambda x: [word_dict.add(word) for word in x['words']])
        word_dict.build_vocab()
        word_dict.index_dataset(train_set,
                                field_name='words',
                                new_field_name='words')
        word_dict.index_dataset(test_set,
                                field_name='words',
                                new_field_name='words')

        return train_set, test_set, word_dict
Example #18
    def test_from_dataset(self):
        start_char = 65
        num_samples = 10

        # 0 dim
        dataset = DataSet()
        for i in range(num_samples):
            ins = Instance(char=chr(start_char + i))
            dataset.append(ins)
        vocab = Vocabulary()
        vocab.from_dataset(dataset, field_name='char')
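        # indices 0 and 1 are taken by the default '<pad>' and '<unk>' tokens,
        # so the first dataset character is expected at index 2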
        for i in range(num_samples):
            self.assertEqual(vocab.to_index(chr(start_char + i)), i + 2)
        vocab.index_dataset(dataset, field_name='char')

        # 1 dim
        dataset = DataSet()
        for i in range(num_samples):
            ins = Instance(char=[chr(start_char + i)] * 6)
            dataset.append(ins)
        vocab = Vocabulary()
        vocab.from_dataset(dataset, field_name='char')
        for i in range(num_samples):
            self.assertEqual(vocab.to_index(chr(start_char + i)), i + 2)
        vocab.index_dataset(dataset, field_name='char')

        # 2 dim
        dataset = DataSet()
        for i in range(num_samples):
            ins = Instance(char=[[chr(start_char + i) for _ in range(6)]
                                 for _ in range(6)])
            dataset.append(ins)
        vocab = Vocabulary()
        vocab.from_dataset(dataset, field_name='char')
        for i in range(num_samples):
            self.assertEqual(vocab.to_index(chr(start_char + i)), i + 2)
        vocab.index_dataset(dataset, field_name='char')
Example #19
    label_vocab['POS'] = Vocabulary().from_dataset(
        all_data['train']['POS-ctb9'], field_name='target')
    label_vocab['CWS'] = Vocabulary().from_dataset(
        all_data['train']['CWS-pku'], field_name='target')
    label_vocab['NER'] = Vocabulary().from_dataset(
        all_data['train']['NER-msra'], field_name='target')
    label_vocab['Parsing'] = torch.load('vocab/parsing_vocab')
    label_vocab['pos'] = Vocabulary().from_dataset(
        all_data['train']['Parsing-ctb9'], field_name='pos')

    for target in target_list:
        for task in task_list:
            all_data[target][task].drop(lambda ins: len(ins['words']) > 256)
            chars_vocab.index_dataset(all_data[target][task],
                                      field_name='words',
                                      new_field_name='chars')
            task_class = task.split('-')[0]
            all_data[target][task].apply(lambda ins: task_class,
                                         new_field_name='task_class')
            if task == 'Parsing-ctb9':
                label_vocab['Parsing'].index_dataset(
                    all_data[target]['Parsing-ctb9'], field_name='char_labels')
                label_vocab[task_class].index_dataset(all_data[target][task],
                                                      field_name='dep_label')
                label_vocab['pos'].index_dataset(
                    all_data[target]['Parsing-ctb9'], field_name='pos')
                label_vocab['POS'].index_dataset(
                    all_data[target]['Parsing-ctb9'], field_name='target')

                all_data[target][task].set_input('seq_len_for_wordlist',
    def process(self,
                paths: Union[str, Dict[str, str]],
                word_vocab_opt: VocabularyOption = None,
                lower: bool = False):
        """
        读取并处理数据。数据中的'-DOCSTART-'开头的行会被忽略

        :param paths:
        :param word_vocab_opt: vocabulary的初始化值
        :param lower: 是否将所有字母转为小写。
        :return:
        """
        # 读取数据
        paths = check_dataloader_paths(paths)
        data = DataInfo()
        input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN]
        target_fields = [Const.TARGET, Const.INPUT_LEN]
        for name, path in paths.items():
            dataset = self.load(path)
            dataset.apply_field(lambda words: words,
                                field_name='raw_words',
                                new_field_name=Const.INPUT)
            if lower:
                dataset.words.lower()
            data.datasets[name] = dataset

        # construct the word vocab
        word_vocab = Vocabulary(
            min_freq=2) if word_vocab_opt is None else Vocabulary(
                **word_vocab_opt)
        word_vocab.from_dataset(data.datasets['train'],
                                field_name=Const.INPUT,
                                no_create_entry_dataset=[
                                    dataset
                                    for name, dataset in data.datasets.items()
                                    if name != 'train'
                                ])
        word_vocab.index_dataset(*data.datasets.values(),
                                 field_name=Const.INPUT,
                                 new_field_name=Const.INPUT)
        data.vocabs[Const.INPUT] = word_vocab

        # cap words
        cap_word_vocab = Vocabulary()
        cap_word_vocab.from_dataset(
            data.datasets['train'],
            field_name='raw_words',
            no_create_entry_dataset=[
                dataset for name, dataset in data.datasets.items()
                if name != 'train'
            ])
        cap_word_vocab.index_dataset(*data.datasets.values(),
                                     field_name='raw_words',
                                     new_field_name='cap_words')
        input_fields.append('cap_words')
        data.vocabs['cap_words'] = cap_word_vocab

        # build the target vocab
        target_vocab = Vocabulary(unknown=None, padding=None)
        target_vocab.from_dataset(*data.datasets.values(),
                                  field_name=Const.TARGET)
        target_vocab.index_dataset(*data.datasets.values(),
                                   field_name=Const.TARGET)
        data.vocabs[Const.TARGET] = target_vocab

        for name, dataset in data.datasets.items():
            dataset.add_seq_len(Const.INPUT, new_field_name=Const.INPUT_LEN)
            dataset.set_input(*input_fields)
            dataset.set_target(*target_fields)

        return data
            kernel_sizes=kernel_sizes,
            padding=padding)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(sum(kernel_nums), num_classes)
    
    def forward(self, words, seq_len=None):
        x = self.embed(words)  # [N,L] -> [N,L,C]
        x = self.conv_pool(x)  # [N,L,C] -> [N,C]
        x = self.dropout(x)
        x = self.fc(x)  # [N,C] -> [N, N_class]
        return {C.OUTPUT: x}
    
    def predict(self, words, seq_len=None):
        output = self(words, seq_len)
        _, predict = output[C.OUTPUT].max(dim=1)
        return {C.OUTPUT: predict}


#demo version
# imports assumed for this fragment; the construction of trainData is not shown here
from fastNLP import Vocabulary, Trainer, CrossEntropyLoss, AccuracyMetric
from fastNLP.models import CNNText

trainData.apply(lambda x: x['data'].lower(), new_field_name='sentence')
trainData.apply(lambda x: x['sentence'].split(), new_field_name='words', is_input=True)
vocab = Vocabulary(min_freq=2)
vocab = vocab.from_dataset(trainData, field_name='words')
#change to index
vocab.index_dataset(trainData, field_name='words',new_field_name='words')
trainData.set_target('target')
model = CNNText((len(vocab),128), num_classes=20, padding=2, dropout=0.1)
train_data, dev_data = trainData.split(0.2)
trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data, loss=CrossEntropyLoss(), metrics=AccuracyMetric(), batch_size=16)
trainer.train()
Example #22
testset = DataSet()
for i in range(newsgroups_test.target.shape[0]):
    testset.append(
        Instance(raw_sentence=newsgroups_test.data[i].replace('\n', ' '),
                 target=int(newsgroups_test.target[i])))
testset.apply(lambda x: x['raw_sentence'].lower().translate(table),
              new_field_name='sentence')
testset.apply_field(lambda x: x.split(),
                    field_name='sentence',
                    new_field_name='words')
testset.apply_field(lambda x: len(x),
                    field_name='words',
                    new_field_name='seq_len')

vocab = Vocabulary(min_freq=10).from_dataset(dataset, field_name='words')
vocab.index_dataset(dataset, field_name='words', new_field_name='words')
vocab.index_dataset(testset, field_name='words', new_field_name='words')

#model = CNNText((len(vocab),50), num_classes=20, padding=2, dropout=0.1)
model = mycnn(len(vocab), 100, len(dataset.target))
#model = myrnn(len(vocab),100,20)
#model = LSTMText(len(vocab),64,20) #used

dataset.rename_field('words', Const.INPUT)
dataset.rename_field('target', Const.TARGET)
dataset.rename_field('seq_len', Const.INPUT_LEN)
dataset.set_input(Const.INPUT, Const.INPUT_LEN)
dataset.set_target(Const.TARGET)

testset.rename_field('words', Const.INPUT)
testset.rename_field('target', Const.TARGET)
Example #23
def load_resume_ner(path,
                    char_embedding_path=None,
                    bigram_embedding_path=None,
                    index_token=True,
                    normalize={
                        'char': True,
                        'bigram': True,
                        'word': False
                    }):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    train_path = os.path.join(path, 'train.char.bmes')
    dev_path = os.path.join(path, 'dev.char.bmes')
    test_path = os.path.join(path, 'test.char.bmes')

    loader = ConllLoader(['chars', 'target'])
    train_bundle = loader.load(train_path)
    dev_bundle = loader.load(dev_path)
    test_bundle = loader.load(test_path)

    datasets = dict()
    datasets['train'] = train_bundle.datasets['train']
    datasets['dev'] = dev_bundle.datasets['train']
    datasets['test'] = test_bundle.datasets['train']

    datasets['train'].apply_field(get_bigrams,
                                  field_name='chars',
                                  new_field_name='bigrams')
    datasets['dev'].apply_field(get_bigrams,
                                field_name='chars',
                                new_field_name='bigrams')
    datasets['test'].apply_field(get_bigrams,
                                 field_name='chars',
                                 new_field_name='bigrams')

    datasets['train'].add_seq_len('chars')
    datasets['dev'].add_seq_len('chars')
    datasets['test'].add_seq_len('chars')

    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary(padding=None, unknown=None)
    print(datasets.keys())
    print(len(datasets['dev']))
    print(len(datasets['test']))
    print(len(datasets['train']))
    char_vocab.from_dataset(
        datasets['train'],
        field_name='chars',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    bigram_vocab.from_dataset(
        datasets['train'],
        field_name='bigrams',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    if index_token:
        char_vocab.index_dataset(datasets['train'],
                                 datasets['dev'],
                                 datasets['test'],
                                 field_name='chars',
                                 new_field_name='chars')
        bigram_vocab.index_dataset(datasets['train'],
                                   datasets['dev'],
                                   datasets['test'],
                                   field_name='bigrams',
                                   new_field_name='bigrams')
        label_vocab.index_dataset(datasets['train'],
                                  datasets['dev'],
                                  datasets['test'],
                                  field_name='target',
                                  new_field_name='target')

    vocabs = {}
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab

    embeddings = {}
    if char_embedding_path is not None:
        char_embedding = StaticEmbedding(char_vocab,
                                         char_embedding_path,
                                         word_dropout=0.01,
                                         normalize=normalize['char'])
        embeddings['char'] = char_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab,
                                           bigram_embedding_path,
                                           word_dropout=0.01,
                                           normalize=normalize['bigram'])
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
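A hypothetical call; the directory is a placeholder and is expected to contain the train/dev/test *.char.bmes files loaded above:

datasets, vocabs, embeddings = load_resume_ner('/path/to/ResumeNER',
                                               char_embedding_path=None,
                                               bigram_embedding_path=None)
print(len(vocabs['char']), len(vocabs['bigram']), len(vocabs['label']))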
Example #24
def load_weibo_ner(path,
                   unigram_embedding_path=None,
                   bigram_embedding_path=None,
                   index_token=True,
                   normlize={
                       'char': True,
                       'bigram': True,
                       'word': False
                   }):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    loader = ConllLoader(['chars', 'target'])
    bundle = loader.load(path)

    datasets = bundle.datasets
    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))
    # print(*list(datasets.keys()))
    vocabs = {}
    word_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary(padding=None, unknown=None)

    for k, v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    word_vocab.from_dataset(
        datasets['train'],
        field_name='chars',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target',-100)
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = word_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(
        datasets['train'],
        field_name='bigrams',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        word_vocab.index_dataset(*list(datasets.values()),
                                 field_name='chars',
                                 new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()),
                                   field_name='bigrams',
                                   new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()),
                                  field_name='target',
                                  new_field_name='target')

    # for k,v in datasets.items():
    #     v.set_input('chars','bigrams','seq_len','target')
    #     v.set_target('target','seq_len')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}

    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(
            word_vocab,
            model_dir_or_name=unigram_embedding_path,
            word_dropout=0.01,
            normalize=normlize['char'])
        embeddings['char'] = unigram_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(
            bigram_vocab,
            model_dir_or_name=bigram_embedding_path,
            word_dropout=0.01,
            normalize=normlize['bigram'])
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
Example #25
def load_weibo_ner_old(path,
                       unigram_embedding_path=None,
                       bigram_embedding_path=None,
                       index_token=True,
                       normlize={
                           'char': True,
                           'bigram': True,
                           'word': False
                       }):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    loader = ConllLoader(['chars', 'target'])
    # from fastNLP.io.file_reader import _read_conll
    # from fastNLP.core import Instance,DataSet
    # def _load(path):
    #     ds = DataSet()
    #     for idx, data in _read_conll(path, indexes=loader.indexes, dropna=loader.dropna,
    #                                 encoding='ISO-8859-1'):
    #         ins = {h: data[i] for i, h in enumerate(loader.headers)}
    #         ds.append(Instance(**ins))
    #     return ds
    # from fastNLP.io.utils import check_loader_paths
    # paths = check_loader_paths(path)
    # datasets = {name: _load(path) for name, path in paths.items()}
    datasets = {}
    train_path = os.path.join(path, 'train.all.bmes')
    dev_path = os.path.join(path, 'dev.all.bmes')
    test_path = os.path.join(path, 'test.all.bmes')
    datasets['train'] = loader.load(train_path).datasets['train']
    datasets['dev'] = loader.load(dev_path).datasets['train']
    datasets['test'] = loader.load(test_path).datasets['train']

    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))

    vocabs = {}
    word_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary(padding=None, unknown=None)

    for k, v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    word_vocab.from_dataset(
        datasets['train'],
        field_name='chars',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target',-100)
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = word_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(
        datasets['train'],
        field_name='bigrams',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        word_vocab.index_dataset(*list(datasets.values()),
                                 field_name='chars',
                                 new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()),
                                   field_name='bigrams',
                                   new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()),
                                  field_name='target',
                                  new_field_name='target')

    # for k,v in datasets.items():
    #     v.set_input('chars','bigrams','seq_len','target')
    #     v.set_target('target','seq_len')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}

    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(
            word_vocab,
            model_dir_or_name=unigram_embedding_path,
            word_dropout=0.01,
            normalize=normlize['char'])
        embeddings['char'] = unigram_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(
            bigram_vocab,
            model_dir_or_name=bigram_embedding_path,
            word_dropout=0.01,
            normalize=normlize['bigram'])
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
def load_weibo_ner(path, unigram_embedding_path=None, bigram_embedding_path=None, index_token=True,
                   char_min_freq=1, bigram_min_freq=1, only_train_min_freq=0, char_word_dropout=0.01, train_dataset_rate=1.0):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    loader = ConllLoader(['chars', 'target'])
    # bundle = loader.load(path)
    #
    # datasets = bundle.datasets

    # print(datasets['train'][:5])

    train_dataset_rate_suffix = '' if train_dataset_rate == 1.0 else f'_{train_dataset_rate}'
    train_path = os.path.join(path, f'train.char.bmoes{train_dataset_rate_suffix}')
    print(f"load train dataset: {train_path}")
    dev_path = os.path.join(path, 'dev.char.bmoes')
    test_path = os.path.join(path, 'test.char.bmoes')

    paths = {}
    paths['train'] = train_path
    paths['dev'] = dev_path
    paths['test'] = test_path

    datasets = {}

    for k, v in paths.items():
        bundle = loader.load(v)
        datasets[k] = bundle.datasets['train']

    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))
    # print(*list(datasets.keys()))
    vocabs = {}
    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()
    # TODO: add span_label, attr_start, attr_end
    datasets, vocabs = input_with_span_attr(datasets, vocabs)
    print(datasets.keys())
    print("dev:", len(datasets['dev']))
    print("test:", len(datasets['test']))
    print("train:", len(datasets['train']))

    for k, v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    char_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target',-100)
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        char_vocab.index_dataset(*list(datasets.values()), field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()), field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()), field_name='target', new_field_name='target')

    # for k,v in datasets.items():
    #     v.set_input('chars','bigrams','seq_len','target')
    #     v.set_target('target','seq_len')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}

    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(char_vocab, model_dir_or_name=unigram_embedding_path,
                                            word_dropout=char_word_dropout,
                                            min_freq=char_min_freq, only_train_min_freq=only_train_min_freq, )
        embeddings['char'] = unigram_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, model_dir_or_name=bigram_embedding_path,
                                           word_dropout=0.01,
                                           min_freq=bigram_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    print(f"train: {len(datasets['train'])}; dev: {len(datasets['dev'])}; test: {len(datasets['test'])}")
    return datasets, vocabs, embeddings
def load_toy_ner(path, char_embedding_path=None, bigram_embedding_path=None, index_token=True, train_clip=False):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    train_path = os.path.join(path, 'toy_train.bmoes')
    dev_path = os.path.join(path, 'toy_dev.bmoes')
    test_path = os.path.join(path, 'toy_test.bmoes')

    loader = ConllLoader(['chars', 'target'])
    train_bundle = loader.load(train_path)
    dev_bundle = loader.load(dev_path)
    test_bundle = loader.load(test_path)

    datasets = dict()
    datasets['train'] = train_bundle.datasets['train']
    datasets['dev'] = dev_bundle.datasets['train']
    datasets['test'] = test_bundle.datasets['train']

    datasets['train'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['dev'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['test'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')

    datasets['train'].add_seq_len('chars')
    datasets['dev'].add_seq_len('chars')
    datasets['test'].add_seq_len('chars')

    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary(padding=None, unknown=None)
    print(datasets.keys())
    print("dev:", len(datasets['dev']))
    print("test:", len(datasets['test']))
    print("train:", len(datasets['train']))
    char_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    if index_token:
        char_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                 field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                   field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                  field_name='target', new_field_name='target')

    vocabs = {}
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab
    # TODO: add span_label, attr_start, attr_end
    datasets, vocabs = input_with_span_attr(datasets, vocabs)

    embeddings = {}
    if char_embedding_path is not None:
        char_embedding = StaticEmbedding(char_vocab, char_embedding_path, word_dropout=0.01, )
        embeddings['char'] = char_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, bigram_embedding_path, word_dropout=0.01)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
Example #28
def load_weibo_ner(path,
                   unigram_embedding_path=None,
                   bigram_embedding_path=None,
                   index_token=True,
                   char_min_freq=1,
                   bigram_min_freq=1,
                   only_train_min_freq=0,
                   char_word_dropout=0.01):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    loader = ConllLoader(['chars', 'target'])

    train_path = os.path.join(path, 'weiboNER_2nd_conll.train')
    dev_path = os.path.join(path, 'weiboNER_2nd_conll.dev')
    test_path = os.path.join(path, 'weiboNER_2nd_conll.test')

    paths = {}
    paths['train'] = train_path
    paths['dev'] = dev_path
    paths['test'] = test_path

    datasets = {}

    for k, v in paths.items():
        bundle = loader.load(v)
        datasets[k] = bundle.datasets['train']

    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))
    # print(*list(datasets.keys()))
    vocabs = {}
    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()

    for k, v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    char_vocab.from_dataset(
        datasets['train'],
        field_name='chars',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target',-100)
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(
        datasets['train'],
        field_name='bigrams',
        no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if index_token:
        char_vocab.index_dataset(*list(datasets.values()),
                                 field_name='chars',
                                 new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()),
                                   field_name='bigrams',
                                   new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()),
                                  field_name='target',
                                  new_field_name='target')

    # for k,v in datasets.items():
    #     v.set_input('chars','bigrams','seq_len','target')
    #     v.set_target('target','seq_len')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}

    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(
            char_vocab,
            model_dir_or_name=unigram_embedding_path,
            word_dropout=char_word_dropout,
            min_freq=char_min_freq,
            only_train_min_freq=only_train_min_freq,
        )
        embeddings['char'] = unigram_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(
            bigram_vocab,
            model_dir_or_name=bigram_embedding_path,
            word_dropout=0.01,
            min_freq=bigram_min_freq,
            only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
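A minimal, hypothetical usage sketch for load_weibo_ner; the dataset folder and embedding paths below are placeholders rather than values from the original example, and get_bigrams (imported from utils) is assumed to pair each character with its right neighbour.

# Hypothetical call with placeholder paths.
datasets, vocabs, embeddings = load_weibo_ner(
    '/path/to/WeiboNER',   # folder holding weiboNER_2nd_conll.{train,dev,test}
    unigram_embedding_path='/path/to/unigram_embeddings.txt',
    bigram_embedding_path='/path/to/bigram_embeddings.txt',
    index_token=True)
print(len(vocabs['char']), len(vocabs['bigram']), len(vocabs['label']))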
def load_resume_ner(path, char_embedding_path=None, bigram_embedding_path=None, index_token=True,
                    char_min_freq=1, bigram_min_freq=1, only_train_min_freq=0, train_dataset_rate=1.0):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    train_dataset_rate_suffix = '' if train_dataset_rate == 1.0 else f'_{train_dataset_rate}'
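    # e.g. train_dataset_rate=0.5 selects 'train.char.bmoes_0.5';
    # the full training file carries no suffix.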
    train_path = os.path.join(path, f'train.char.bmoes{train_dataset_rate_suffix}')
    dev_path = os.path.join(path, 'dev.char.bmoes')
    test_path = os.path.join(path, 'test.char.bmoes')

    print(f"load train dataset: {train_path}")
    loader = ConllLoader(['chars', 'target'])
    train_bundle = loader.load(train_path)
    dev_bundle = loader.load(dev_path)
    test_bundle = loader.load(test_path)

    datasets = dict()
    datasets['train'] = train_bundle.datasets['train']
    datasets['dev'] = dev_bundle.datasets['train']
    datasets['test'] = test_bundle.datasets['train']

    datasets['train'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['dev'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['test'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')

    datasets['train'].add_seq_len('chars')
    datasets['dev'].add_seq_len('chars')
    datasets['test'].add_seq_len('chars')

    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()
    print(datasets.keys())
    print("dev:", len(datasets['dev']))
    print("test:", len(datasets['test']))
    print("train:", len(datasets['train']))
    char_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    if index_token:
        char_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                 field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                   field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                  field_name='target', new_field_name='target')

    vocabs = {}
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab
    # TODO: add span_label, attr_start, attr_end
    datasets, vocabs = input_with_span_attr(datasets, vocabs)

    embeddings = {}
    if char_embedding_path is not None:
        char_embedding = StaticEmbedding(char_vocab, char_embedding_path, word_dropout=0.01,
                                         min_freq=char_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['char'] = char_embedding

    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, bigram_embedding_path, word_dropout=0.01,
                                           min_freq=bigram_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
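A similar hypothetical sketch for load_resume_ner; the paths are placeholders, and train_dataset_rate illustrates how a reduced training split would be selected.

# Hypothetical call with placeholder paths.
datasets, vocabs, embeddings = load_resume_ner(
    '/path/to/ResumeNER',
    char_embedding_path='/path/to/char_embeddings.txt',
    bigram_embedding_path='/path/to/bigram_embeddings.txt',
    index_token=True,
    train_dataset_rate=1.0)   # 0.5 would read 'train.char.bmoes_0.5' instead
print({name: len(ds) for name, ds in datasets.items()})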
Example #30
0
def load_conllized_ontonote_POS(path, embedding_path=None):
    from fastNLP.io.loader import ConllLoader
    header2index = {'words': 3, 'POS': 4, 'NER': 10}
    headers = ['words', 'POS']

    if 'NER' in headers:
        print(
            'Warning: the NER labels read by load_conllized_ontonote are not '
            'BIOES tags but raw conll-format labels, so they are incorrect!'
        )
    indexes = list(map(lambda x: header2index[x], headers))

    loader = ConllLoader(headers, indexes)

    bundle = loader.load(path)

    # print(bundle.datasets)

    train_set = bundle.datasets['train']
    dev_set = bundle.datasets['dev']
    test_set = bundle.datasets['test']

    # train_set = loader.load(os.path.join(path,'train.txt'))
    # dev_set = loader.load(os.path.join(path, 'dev.txt'))
    # test_set = loader.load(os.path.join(path, 'test.txt'))

    # print(len(train_set))

    train_set.add_seq_len('words', 'seq_len')
    dev_set.add_seq_len('words', 'seq_len')
    test_set.add_seq_len('words', 'seq_len')

    # print(dataset['POS'])

    vocab = Vocabulary(min_freq=1)
    vocab.from_dataset(train_set, field_name='words')
    vocab.from_dataset(dev_set, field_name='words')
    vocab.from_dataset(test_set, field_name='words')

    vocab.index_dataset(train_set, field_name='words')
    vocab.index_dataset(dev_set, field_name='words')
    vocab.index_dataset(test_set, field_name='words')

    label_vocab_dict = {}

    for i, h in enumerate(headers):
        if h == 'words':
            continue
        label_vocab_dict[h] = Vocabulary(min_freq=1,
                                         padding=None,
                                         unknown=None)
        label_vocab_dict[h].from_dataset(train_set, field_name=h)

        label_vocab_dict[h].index_dataset(train_set, field_name=h)
        label_vocab_dict[h].index_dataset(dev_set, field_name=h)
        label_vocab_dict[h].index_dataset(test_set, field_name=h)

    train_set.set_input(Const.INPUT, Const.INPUT_LEN)
    train_set.set_target(headers[1])

    dev_set.set_input(Const.INPUT, Const.INPUT_LEN)
    dev_set.set_target(headers[1])

    test_set.set_input(Const.INPUT, Const.INPUT_LEN)
    test_set.set_target(headers[1])

    if len(headers) > 2:
        print('Warning: more than one task is loaded, so the target field has to be set manually for each task!')

    print('train:', len(train_set), 'dev:', len(dev_set), 'test:',
          len(test_set))

    if embedding_path is not None:
        pretrained_embedding = load_word_emb(embedding_path, 300, vocab)
        return (train_set, dev_set,
                test_set), (vocab, label_vocab_dict), pretrained_embedding
    else:
        return (train_set, dev_set, test_set), (vocab, label_vocab_dict)
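A hypothetical usage sketch for load_conllized_ontonote_POS; the paths are placeholders, and load_word_emb is the project's own helper, invoked only when an embedding path is passed.

# Hypothetical call with placeholder paths.
(train_set, dev_set, test_set), (word_vocab, label_vocabs) = \
    load_conllized_ontonote_POS('/path/to/ontonotes.conll')
print(len(train_set), len(word_vocab), list(label_vocabs.keys()))

# Passing embedding_path additionally returns the matrix built by load_word_emb:
# splits, vocab_pair, pretrained = load_conllized_ontonote_POS(
#     '/path/to/ontonotes.conll', embedding_path='/path/to/embeddings.300d.txt')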