Example 1
def my_model_single_sentence(sentence):
    '''
    #kept for reference: this block reproduces the Pipe's processing steps,
    #in order to obtain the vocabularies built from the training set
    from fastNLP.io import WeiboNERLoader

    #load the raw data
    data_bundle = WeiboNERLoader().load()

    #here we need the Vocabulary (word table) built from the raw data
    import warnings
    from fastNLP import Vocabulary, logger
    from fastNLP.core.utils import iob2, iob2bioes
    from fastNLP.core.const import Const

    #tag encoding scheme: 'bio' or 'bioes'
    encoding_type: str = 'bio'

    if encoding_type == 'bio':
        convert_tag = iob2
    elif encoding_type == 'bioes':
        convert_tag = lambda words: iob2bioes(iob2(words))


    #convert the tags
    for name, dataset in data_bundle.datasets.items():
        dataset.apply_field(convert_tag, field_name=Const.TARGET, new_field_name=Const.TARGET)    
   
    #copy the raw_chars column to a chars column
    data_bundle.copy_field(field_name=Const.RAW_CHAR, new_field_name=Const.CHAR_INPUT, ignore_miss_dataset=True)


    input_field_names = [Const.CHAR_INPUT]
    target_field_names = Const.TARGET

    if isinstance(input_field_names, str):
        input_field_names = [input_field_names]
    if isinstance(target_field_names, str):
        target_field_names = [target_field_names]

    #build the source (chars) vocabulary
    for input_field_name in input_field_names:
        src_vocab = Vocabulary()
        src_vocab.from_dataset(*[ds for name, ds in data_bundle.iter_datasets() if 'train' in name],
                                field_name=input_field_name,
                                no_create_entry_dataset=[ds for name, ds in data_bundle.iter_datasets()
                                                        if ('train' not in name) and (ds.has_field(input_field_name))]
                                )
        src_vocab.index_dataset(*data_bundle.datasets.values(), field_name=input_field_name)
        data_bundle.set_vocab(src_vocab, input_field_name)
  
    #build the target (label) vocabulary
    for target_field_name in target_field_names:
        tgt_vocab = Vocabulary(unknown=None, padding=None)
        tgt_vocab.from_dataset(*[ds for name, ds in data_bundle.iter_datasets() if 'train' in name],
                                field_name=target_field_name,
                                no_create_entry_dataset=[ds for name, ds in data_bundle.iter_datasets()
                                                        if ('train' not in name) and (ds.has_field(target_field_name))]
                                )
        if len(tgt_vocab._no_create_word) > 0:
            warn_msg = f"There are {len(tgt_vocab._no_create_word)} `{target_field_name}` labels" \
                        f" in {[name for name in data_bundle.datasets.keys() if 'train' not in name]} " \
                        f"data set but not in train data set!.\n" \
                        f"These label(s) are {tgt_vocab._no_create_word}"
            warnings.warn(warn_msg)
            logger.warning(warn_msg)
        tgt_vocab.index_dataset(*[ds for ds in data_bundle.datasets.values() if ds.has_field(target_field_name)], field_name=target_field_name)
        data_bundle.set_vocab(tgt_vocab, target_field_name)

    input_fields = [Const.TARGET, Const.INPUT_LEN] + input_field_names
    target_fields = [Const.TARGET, Const.INPUT_LEN]
        
    for name, dataset in data_bundle.datasets.items():
        dataset.add_seq_len(Const.CHAR_INPUT)
        
    data_bundle.set_input(*input_fields)
    data_bundle.set_target(*target_fields)
    '''
    '''
    #the same processing in one call, using the built-in pipe:
    from fastNLP.io import WeiboNERPipe
    data_bundle = WeiboNERPipe().process_from_file()
    '''
    from fastNLP.io.loader.conll import CNNERLoader
    data_bundle = CNNERLoader().load("data/")

    from fastNLP.io.pipe.conll import _CNNERPipe
    data_bundle = _CNNERPipe(encoding_type='bio').process(data_bundle)

    src_vocab = data_bundle.get_vocab('chars')
    tgt_vocab = data_bundle.get_vocab('target')

    #at this point data processing is finished and both vocabularies are built
    #new data must be appended to data_bundle.get_dataset('test'), a fastNLP DataSet object
    #its fields are: raw_chars, target, chars, seq_len
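    #for example, an appended instance might look like this (the chars indices
    #are illustrative, they depend on the learned vocabulary):
    #  raw_chars=['今', '天'], target=[0, 0], chars=[152, 89], seq_len=2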

    from fastNLP import Instance

    my_raw_chars = list(sentence)
    my_target = [0] * len(sentence)  #dummy labels: the true tags are what we want to predict
    my_words = [src_vocab.to_index(ch) for ch in sentence]

    my_seq_len = len(sentence)

    ins = Instance()

    ins.add_field('raw_chars', my_raw_chars)
    ins.add_field('target', my_target)
    ins.add_field('chars', my_words)
    ins.add_field('seq_len', my_seq_len)

    data_bundle.get_dataset('test').append(ins)

    #load the trained model
    from fastNLP.io import ModelLoader

    loader = ModelLoader()

    model = loader.load_pytorch_model("./save/bilstmcrf_sec_ner.pkl")

    data_bundle.rename_field(
        'chars',
        'words')  #the BiLSTMCRF model's forward() expects `words` rather than `chars`, so the column must be renamed

    from fastNLP import SpanFPreRecMetric
    metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target'))

    #run the evaluation
    from fastNLP import Tester

    tester = Tester(data_bundle.get_dataset('test'), model, metrics=metric)

    #note: get_pred() is not part of the stock fastNLP Tester; it assumes a
    #locally modified Tester that also returns the raw per-batch predictions
    final = tester.get_pred()

    my_label = []
    #we only want the last row of the last batch in final, i.e. the instance appended above
    for i in final[-1][-1]:
        my_label.append(tgt_vocab.to_word(i.cpu().item()))

    output = ''
    for j in range(my_seq_len):
        output += sentence[j] + ' ' + my_label[j] + '\n'

    return output
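
A minimal usage sketch for the function above (the input sentence is illustrative; the trained model at ./save/bilstmcrf_sec_ner.pkl and the data/ directory from the example are assumed to exist):

print(my_model_single_sentence('明天我们去北京'))
#prints one '<char> <tag>' pair per line, e.g. '明 O'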
Example 2
def test_add_field(self):
    fields = {"x": [1, 2, 3], "y": [4, 5, 6]}
    ins = Instance(**fields)
    ins.add_field("z", [1, 1, 1])
    fields.update({"z": [1, 1, 1]})
    self.assertEqual(ins.fields, fields)
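
For context, a minimal sketch of the fastNLP Instance behavior the test above relies on (fields can be passed to the constructor or added later; they are stored in the .fields dict):

from fastNLP import Instance

ins = Instance(x=[1, 2, 3])    #fields can be set at construction time
ins.add_field('y', [4, 5, 6])  #or added afterwards
print(ins.fields)              #{'x': [1, 2, 3], 'y': [4, 5, 6]}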
Example 3
def my_model_passage(sentences):
    from fastNLP.io.loader.conll import CNNERLoader
    data_bundle = CNNERLoader().load("data/")

    from fastNLP.io.pipe.conll import _CNNERPipe
    data_bundle = _CNNERPipe(encoding_type='bio').process(data_bundle)

    src_vocab = data_bundle.get_vocab('chars')
    tgt_vocab = data_bundle.get_vocab('target')

    #the original test set holds 28 instances (two batches of 16 and 12); drop the
    #first 27 here, and the last remaining one after the new sentences are appended
    for _ in range(27):
        data_bundle.get_dataset('test').delete_instance(0)

    #at this point data processing is finished and both vocabularies are built
    #build the new dataset from the input sentences

    from fastNLP import Instance

    for sent in sentences:
        my_raw_chars = list(sent)
        my_target = [0] * len(sent)  #dummy labels: the true tags are what we want to predict
        my_words = [src_vocab.to_index(ch) for ch in sent]

        my_seq_len = len(sent)

        ins = Instance()

        ins.add_field('raw_chars', my_raw_chars)
        ins.add_field('target', my_target)
        ins.add_field('chars', my_words)
        ins.add_field('seq_len', my_seq_len)

        data_bundle.get_dataset('test').append(ins)

    #remove the last remaining original instance; appends go to the end, so it is still at index 0
    data_bundle.get_dataset('test').delete_instance(0)

    #load the trained model
    from fastNLP.io import ModelLoader

    loader = ModelLoader()

    model = loader.load_pytorch_model("./save/bilstmcrf_sec_ner.pkl")

    data_bundle.get_dataset('test').rename_field(
        'chars',
        'words')  #the BiLSTMCRF model's forward() expects `words` rather than `chars`, so the column must be renamed

    from fastNLP import SpanFPreRecMetric
    metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target'))

    #run the evaluation
    from fastNLP import Tester

    tester = Tester(data_bundle.get_dataset('test'), model, metrics=metric)

    #as in Example 1, get_pred() assumes a locally modified Tester that returns the raw per-batch predictions
    final = tester.get_pred()

    output = ''
    labels = []
    #this time we want every prediction in final
    #(the original test set was served in two batches, of sizes 16 and 12)
    for batch in final:
        for row in batch:
            labels.append([tgt_vocab.to_word(item.cpu().item()) for item in row])

    print(labels[0])    #debug output: the first predicted label sequence
    print(final[0][0])  #debug output: the corresponding raw prediction tensor

    for i in range(len(sentences)):
        for j in range(len(sentences[i])):
            output += sentences[i][j] + ' ' + labels[i][j] + '\n'

        output += '\n'
    return output
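
A usage sketch for the passage-level function (the sentences are illustrative; the same model file and data/ directory as in Example 1 are assumed):

sentences = ['明天我们去北京', '他在上海工作']
print(my_model_passage(sentences))
#prints one '<char> <tag>' pair per line, with a blank line between sentences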