Exemple #1
0
def load_dataset(
    data_dir='/remote-home/ygxu/workspace/Product_all',
    data_path='mr.task.train',
    bert_dir='/remote-home/ygxu/workspace/BERT/BERT_English_uncased_L-24_H-1024_A_16',
):
    """Read a labelled, tab-separated sentence file and index it with a BERT vocab.

    The file at ``os.path.join(data_dir, data_path)`` is read with the columns
    ``label`` and ``raw_sentence``. The returned DataSet then carries:

    * ``raw_sentence``       -- the lower-cased sentence
    * ``label_seq``          -- ``int(label)``, added with ``is_target=True``
    * ``bert_tokenize_list`` -- tokens produced by ``BertTokenizer.tokenize``
    * ``bert_tokenize``      -- ``"[CLS]"`` followed by those tokens, space-joined
    * ``tokens``             -- vocabulary indices of ``bert_tokenize_list``
      (added with ``is_input=True``; note that no ``[CLS]`` index is prepended)
    * ``masks``              -- a list of ``1.`` of the same length as ``tokens``
      (added with ``is_input=True``)

    Args:
        data_dir: Directory that contains the data file.
        data_path: File name of the data file inside ``data_dir``.
        bert_dir: Directory that contains the BERT ``vocab.txt``.

    Returns:
        The populated fastNLP ``DataSet``.
    """
    from pytorch_pretrained_bert import BertTokenizer

    def transfer_bert_to_fastnlp(ins):
        # "[CLS] tok1 tok2 ..." as one string; strip() keeps the result free of
        # trailing whitespace even for an empty token list.
        return ' '.join(['[CLS]'] + list(ins['bert_tokenize_list'])).strip()

    ds = DataSet.read_csv(os.path.join(data_dir, data_path),
                          headers=('label', 'raw_sentence'),
                          sep='\t')

    ds.apply(lambda x: x['raw_sentence'].lower(),
             new_field_name='raw_sentence')
    ds.apply(lambda x: int(x['label']),
             new_field_name='label_seq',
             is_target=True)

    vocab_path = os.path.join(bert_dir, 'vocab.txt')

    # One entry per line, in file order, so that list positions match the line
    # numbers of vocab.txt. Only the line terminator is removed: slicing with
    # [:-1] would chop a real character off the last token when the file does
    # not end with a newline. The encoding is explicit because BERT vocab files
    # are UTF-8, whatever the platform default is.
    with open(vocab_path, encoding='utf-8') as f:
        vocabs = [line.rstrip('\n') for line in f]

    # Build the vocabulary without fastNLP's own <unk>/<pad> entries, then
    # point unknown/padding at the tokens already present in the BERT vocab.
    vocab_bert = Vocabulary(unknown=None, padding=None)
    vocab_bert.add_word_lst(vocabs)
    vocab_bert.build_vocab()
    vocab_bert.unknown = '[UNK]'
    vocab_bert.padding = '[PAD]'

    tokenizer = BertTokenizer.from_pretrained(vocab_path)
    ds.apply(lambda x: tokenizer.tokenize(x['raw_sentence']),
             new_field_name='bert_tokenize_list')
    ds.apply(transfer_bert_to_fastnlp, new_field_name='bert_tokenize')
    ds.apply(lambda x:
             [vocab_bert.to_index(word) for word in x['bert_tokenize_list']],
             new_field_name='index_words',
             is_input=True)

    ds.rename_field('index_words', 'tokens')
    ds.apply(lambda x: [1.] * len(x['tokens']),
             new_field_name='masks',
             is_input=True)

    return ds
Exemple #2
0
    def test_fastnlp_advanced_tutorial(self):
        """Run the fastNLP advanced tutorial end to end as a smoke test.

        The body follows the tutorial notebook step by step: building
        Instances and DataSets, reading data from disk, transforming fields
        with ``DataSet.apply``, building vocabularies, and finally training
        and testing an ESIM model. Expression statements whose value is
        discarded are carried over from the notebook cells; they are kept
        where they index, slice or otherwise call into the API.

        The test runs inside ``tutorials/fastnlp_advanced_tutorial`` because
        the data files are opened by relative path; the previous working
        directory is restored afterwards, also when a step fails.
        """
        import copy
        import os

        import torch
        from fastNLP import AccuracyMetric
        from fastNLP import Adam
        from fastNLP import CrossEntropyLoss
        from fastNLP import DataSet
        from fastNLP import Instance
        from fastNLP import Tester
        from fastNLP import Trainer
        from fastNLP import Vocabulary
        from fastNLP.io.config_io import ConfigSection, ConfigLoader
        from fastNLP.models import CNNText
        from fastNLP.models import ESIM

        original_cwd = os.getcwd()
        os.chdir("tutorials/fastnlp_advanced_tutorial")
        try:
            # ### Instance
            # An Instance represents one sample and consists of one or more
            # fields (attributes / features), each with its own name and value.
            # Fields are defined at construction time with
            # "field_name=field_value" keyword arguments.

            # Build an Instance with the three fields premise, hypothesis, label.
            instance = Instance(premise='an premise example .',
                                hypothesis='an hypothesis example.',
                                label=1)

            data_set = DataSet([instance] * 5)
            data_set.append(instance)
            data_set[-2:]

            # An instance whose field type differs from the type of the
            # corresponding DataSet field (label is a str here) is appended on
            # a best-effort basis: a rejection is tolerated, not a failure.
            instance2 = Instance(premise='the second premise example .',
                                 hypothesis='the second hypothesis example.',
                                 label='1')
            try:
                data_set.append(instance2)
            except Exception:
                pass
            data_set[-2:]

            # An instance with a wrong field name ("premises") cannot be
            # appended to the DataSet.
            instance3 = Instance(premises='the third premise example .',
                                 hypothesis='the third hypothesis example.',
                                 label=1)
            try:
                data_set.append(instance3)
            except Exception:
                print('cannot append instance')
            data_set[-2:]

            # Besides text, a tensor can also be used as a field value.
            tensor_ins = Instance(image=torch.randn(5, 5), label=0)
            ds = DataSet()
            ds.append(tensor_ins)

            # Read data from a csv file into a DataSet. Any csv-like file,
            # i.e. one example per line, can be read this way.
            dataset = DataSet.read_csv('tutorial_sample_dataset.csv',
                                       headers=('raw_sentence', 'label'),
                                       sep='\t')
            # Size of the DataSet.
            len(dataset)

            # An integer index [k] returns the k-th sample ...
            dataset[0]

            # ... and that sample is an Instance.
            type(dataset[0])

            # A slice [a:b] returns the samples from a up to b.
            dataset[0:3]

            # Indices may be negative.
            dataset[-1]

            data_path = ['premise', 'hypothesis', 'label']

            # Read the three parallel files.
            with open(data_path[0]) as f:
                premise = f.readlines()

            with open(data_path[1]) as f:
                hypothesis = f.readlines()

            with open(data_path[2]) as f:
                label = f.readlines()

            assert len(premise) == len(hypothesis) and len(hypothesis) == len(
                label)

            # Assemble the DataSet line by line.
            data_set = DataSet()
            for prem, hyp, lab in zip(premise, hypothesis, label):
                # Strip surrounding whitespace, including the line terminator.
                prem = prem.strip()
                hyp = hyp.strip()
                # The label line is stored as read; it is converted with int()
                # further down.
                data_set.append(Instance(premise=prem, hypothesis=hyp,
                                         truth=lab))

            data_set[0]

            # ### Other DataSet operations
            # Once built, the contents of a DataSet can still be modified
            # through DataSet.apply().

            # Lower-case all text of the premise field.
            data_set.apply(lambda x: x['premise'].lower(),
                           new_field_name='premise')
            data_set[-2:]

            # Convert the label to int.
            data_set.apply(lambda x: int(x['truth']), new_field_name='truth')
            data_set[-2:]

            # Split the sentences on whitespace.
            def split_sent(ins):
                return ins['premise'].split()

            data_set.apply(split_sent, new_field_name='premise')
            data_set.apply(lambda x: x['hypothesis'].split(),
                           new_field_name='hypothesis')
            data_set[-2:]

            # Filter the data with DataSet.drop and a premise-length predicate.
            origin_data_set_len = len(data_set)
            data_set.drop(lambda x: len(x['premise']) <= 6)
            origin_data_set_len, len(data_set)

            # Add length information: one 1 per token of each sentence.
            data_set.apply(lambda x: [1] * len(x['premise']),
                           new_field_name='premise_len')
            data_set.apply(lambda x: [1] * len(x['hypothesis']),
                           new_field_name='hypothesis_len')
            data_set[-1]

            # Declare the input (feature) fields and the target (label) field.
            data_set.set_input("premise", "premise_len", "hypothesis",
                               "hypothesis_len")
            data_set.set_target("truth")

            # Rename a field.
            data_set.rename_field('truth', 'label')
            data_set[-1]

            # Split into training, validation and test sets.
            train_data, vad_data = data_set.split(0.5)
            dev_data, test_data = vad_data.split(0.4)
            len(train_data), len(dev_data), len(test_data)

            # Deep-copy the data sets; the copies are indexed with a second
            # vocabulary further down.
            train_data_2, dev_data_2 = copy.deepcopy(
                train_data), copy.deepcopy(dev_data)

            # Initialise the vocabulary: at most 10000 entries, a minimum word
            # frequency of 2, '<unk>' for unknown words and '<pad>' for padding.
            # The defaults of Vocabulary are max_size=None, min_freq=None,
            # unknown='<unk>', padding='<pad>'.
            vocab = Vocabulary(max_size=10000,
                               min_freq=2,
                               unknown='<unk>',
                               padding='<pad>')

            # Collect the words of the training set and build the vocabulary.
            train_data.apply(
                lambda x: [vocab.add(word) for word in x['premise']])
            train_data.apply(
                lambda x: [vocab.add(word) for word in x['hypothesis']])
            vocab.build_vocab()

            # Replace the words of every split by their vocabulary indices.
            train_data.apply(
                lambda x: [vocab.to_index(word) for word in x['premise']],
                new_field_name='premise')
            train_data.apply(
                lambda x: [vocab.to_index(word) for word in x['hypothesis']],
                new_field_name='hypothesis')
            dev_data.apply(
                lambda x: [vocab.to_index(word) for word in x['premise']],
                new_field_name='premise')
            dev_data.apply(
                lambda x: [vocab.to_index(word) for word in x['hypothesis']],
                new_field_name='hypothesis')
            test_data.apply(
                lambda x: [vocab.to_index(word) for word in x['premise']],
                new_field_name='premise')
            test_data.apply(
                lambda x: [vocab.to_index(word) for word in x['hypothesis']],
                new_field_name='hypothesis')
            train_data[-1], dev_data[-1], test_data[-1]

            # Read the vocab file, one entry per line.
            with open('vocab.txt') as f:
                vocabs = [line.strip() for line in f]

            # Instantiate a Vocabulary without default unknown/padding tokens,
            # add the entries read from the file and build it.
            vocab_bert = Vocabulary(unknown=None, padding=None)
            vocab_bert.add_word_lst(vocabs)
            vocab_bert.build_vocab()
            # Update the token text used for unknown and padding.
            vocab_bert.unknown = '[UNK]'
            vocab_bert.padding = '[PAD]'

            # Index the copied data sets with the second vocabulary.
            train_data_2.apply(
                lambda x: [vocab_bert.to_index(word) for word in x['premise']],
                new_field_name='premise')
            train_data_2.apply(
                lambda x: [
                    vocab_bert.to_index(word) for word in x['hypothesis']
                ],
                new_field_name='hypothesis')
            dev_data_2.apply(
                lambda x: [vocab_bert.to_index(word) for word in x['premise']],
                new_field_name='premise')
            dev_data_2.apply(
                lambda x: [
                    vocab_bert.to_index(word) for word in x['hypothesis']
                ],
                new_field_name='hypothesis')
            train_data_2[-1], dev_data_2[-1]

            # Step 1: load the model parameters (optional).
            args = ConfigSection()
            ConfigLoader().load_config("./data/config", {"esim_model": args})
            args["vocab_size"] = len(vocab)

            # Step 2: load the ESIM model.
            model = ESIM(**args.data)

            # Another example: load the CNN text classification model.
            cnn_text_model = CNNText(embed_num=len(vocab),
                                     embed_dim=50,
                                     num_classes=5,
                                     padding=2,
                                     dropout=0.1)

            trainer = Trainer(
                train_data=train_data,
                model=model,
                loss=CrossEntropyLoss(pred='pred', target='label'),
                metrics=AccuracyMetric(),
                n_epochs=3,
                batch_size=16,
                print_every=-1,
                validate_every=-1,
                dev_data=dev_data,
                use_cuda=False,
                optimizer=Adam(lr=1e-3, weight_decay=0),
                check_code_level=-1,
                metric_key='acc',
                use_tqdm=False,
            )
            trainer.train()

            tester = Tester(
                data=test_data,
                model=model,
                metrics=AccuracyMetric(),
                batch_size=args["batch_size"],
            )
            tester.test()
        finally:
            # Restore the working directory even when a step above raised, so
            # a failure here cannot break tests that run afterwards.
            os.chdir(original_cwd)