Example No. 1
    def test_save_and_load(self):
        model = CNNText((10, 10), 2)
        saver = ModelSaver('tmp')
        loader = ModelLoader()
        saver.save_pytorch(model)

        new_cnn = CNNText((10, 10), 2)
        loader.load_pytorch(new_cnn, 'tmp')

        new_model = loader.load_pytorch_model('tmp')

        for i in range(10):
            for j in range(10):
                self.assertEqual(model.embed.embed.weight[i, j],
                                 new_cnn.embed.embed.weight[i, j])
                self.assertEqual(model.embed.embed.weight[i, j],
                                 new_model["embed.embed.weight"][i, j])

        os.system('rm tmp')
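
A compact alternative to the element-by-element assertions above, as a minimal sketch (it assumes both weights are plain torch tensors and that load_pytorch_model returns a state dict, as the test suggests):

import torch

# compare the whole tensors at once instead of looping over indices
assert torch.equal(model.embed.embed.weight, new_cnn.embed.embed.weight)
assert torch.equal(model.embed.embed.weight, new_model["embed.embed.weight"])

os.remove('tmp')  # portable cleanup instead of os.system('rm tmp')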
Example No. 2
    def test_fastnlp_1min_tutorial(self):
        # tutorials/fastnlp_1min_tutorial.ipynb
        data_path = "test/data_for_tests/tutorial_sample_dataset.csv"
        ds = DataSet.read_csv(data_path,
                              headers=('raw_sentence', 'label'),
                              sep='\t')
        print(ds[1])

        # lowercase the raw sentence
        ds.apply(lambda x: x['raw_sentence'].lower(),
                 new_field_name='raw_sentence')
        # convert label to int
        ds.apply(lambda x: int(x['label']),
                 new_field_name='target',
                 is_target=True)

        def split_sent(ins):
            return ins['raw_sentence'].split()

        ds.apply(split_sent, new_field_name='words', is_input=True)

        # split into train / dev sets
        train_data, dev_data = ds.split(0.3)
        print("Train size: ", len(train_data))
        print("Test size: ", len(dev_data))

        from fastNLP import Vocabulary
        vocab = Vocabulary(min_freq=2)
        train_data.apply(lambda x: [vocab.add(word) for word in x['words']])

        # index the sentences with Vocabulary.to_index(word)
        train_data.apply(
            lambda x: [vocab.to_index(word) for word in x['words']],
            new_field_name='words',
            is_input=True)
        dev_data.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                       new_field_name='words',
                       is_input=True)

        from fastNLP.models import CNNText
        model = CNNText((len(vocab), 50),
                        num_classes=5,
                        padding=2,
                        dropout=0.1)

        from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam

        trainer = Trainer(model=model,
                          train_data=train_data,
                          dev_data=dev_data,
                          loss=CrossEntropyLoss(),
                          optimizer=Adam(),
                          metrics=AccuracyMetric(target='target'))
        trainer.train()
        print('Train finished!')
Example No. 3
    def test_fastnlp_10min_tutorial(self):
        # read data from csv into a DataSet
        sample_path = "test/data_for_tests/tutorial_sample_dataset.csv"
        dataset = CSVLoader(headers=['raw_sentence', 'label'], sep='\t')._load(sample_path)
        print(len(dataset))
        print(dataset[0])
        print(dataset[-3])

        dataset.append(Instance(raw_sentence='fake data', label='0'))
        # lowercase the raw sentence
        dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')
        # convert label to int
        dataset.apply(lambda x: int(x['label']), new_field_name='label')

        # split sentences on whitespace
        def split_sent(ins):
            return ins['raw_sentence'].split()

        dataset.apply(split_sent, new_field_name='words')

        # add sequence-length information
        dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')
        print(len(dataset))
        print(dataset[0])

        # filter out data with DataSet.drop(func)
        dataset.drop(lambda x: x['seq_len'] <= 3, inplace=True)
        print(len(dataset))

        # specify which fields of the DataSet should be converted to tensors
        # set target: the gold labels used when computing the loss and evaluating the model
        dataset.set_target("label")
        # set input: fields fed to the model's forward
        dataset.set_input("words", "seq_len")

        # split into test and train sets
        test_data, train_data = dataset.split(0.5)
        print(len(test_data))
        print(len(train_data))

        # build the vocabulary with Vocabulary.add(word)
        vocab = Vocabulary(min_freq=2)
        train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
        vocab.build_vocab()

        # index the sentences with Vocabulary.to_index(word)
        train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')
        test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')
        print(test_data[0])

        # these preprocessing utilities can also be used for projects such as reinforcement learning or GANs
        from fastNLP.core.batch import DataSetIter
        from fastNLP.core.sampler import RandomSampler

        batch_iterator = DataSetIter(dataset=train_data, batch_size=2, sampler=RandomSampler())
        for batch_x, batch_y in batch_iterator:
            print("batch_x has: ", batch_x)
            print("batch_y has: ", batch_y)
            break

        from fastNLP.models import CNNText
        model = CNNText((len(vocab), 50), num_classes=5, dropout=0.1)

        from fastNLP import Trainer
        from copy import deepcopy

        # rename the corresponding DataSet fields so that they match the parameter names of the model's forward
        train_data.rename_field('label', 'label_seq')
        test_data.rename_field('label', 'label_seq')

        loss = CrossEntropyLoss(target="label_seq")
        metric = AccuracyMetric(target="label_seq")

        # instantiate a Trainer with the model and data, then train
        # first overfit on test_data (to make sure the model implementation is correct)
        copy_model = deepcopy(model)
        overfit_trainer = Trainer(train_data=test_data, model=copy_model, loss=loss, batch_size=32, n_epochs=5,
                                  dev_data=test_data, metrics=metric, save_path=None)
        overfit_trainer.train()

        # train on train_data, validate on test_data
        trainer = Trainer(model=model, train_data=train_data, dev_data=test_data,
                          loss=CrossEntropyLoss(target="label_seq"),
                          metrics=AccuracyMetric(target="label_seq"),
                          save_path=None,
                          batch_size=32,
                          n_epochs=5)
        trainer.train()
        print('Train finished!')

        # use Tester to evaluate on test_data
        from fastNLP import Tester

        tester = Tester(data=test_data, model=model, metrics=AccuracyMetric(target="label_seq"),
                        batch_size=4)
        acc = tester.test()
        print(acc)
Example No. 4
def run_cnn():
    dataset_train_p2, dataset_test_p2 = get_text_classification_datasets()

    line_len = len(dataset_train_p2.data)
    with open("formalized_train_data.csv", "w") as file:
        for i in range(line_len):
            file.write(
                document2line(dataset_train_p2.data[i]) + "\t" +
                str(dataset_train_p2.target[i]) + '\n')

    line_len = len(dataset_test_p2.data)
    with open("formalized_test_data.csv", "w") as file2:
        for i in range(line_len):
            file2.write(
                document2line(dataset_test_p2.data[i]) + "\t" +
                str(dataset_test_p2.target[i]) + '\n')

    loader = CSVLoader(headers=('raw_sentence', 'label'), sep='\t')
    train_dataset = loader.load("./formalized_train_data.csv")
    test_dataset = loader.load("./formalized_test_data.csv")

    os.remove("./formalized_train_data.csv")
    os.remove("./formalized_test_data.csv")

    train_dataset.apply(lambda x: x['raw_sentence'].lower(),
                        new_field_name='sentence')
    train_dataset.apply(lambda x: x['sentence'].split(),
                        new_field_name='words',
                        is_input=True)

    test_dataset.apply(lambda x: x['raw_sentence'].lower(),
                       new_field_name='sentence')
    test_dataset.apply(lambda x: x['sentence'].split(),
                       new_field_name='words',
                       is_input=True)

    #train_dataset[0],test_dataset[0]

    from fastNLP import Vocabulary

    # use Vocabulary to count the words and convert word sequences into index sequences
    vocab = Vocabulary(min_freq=2).from_dataset(train_dataset,
                                                field_name='words')
    vocab.index_dataset(train_dataset,
                        field_name='words',
                        new_field_name='words')
    vocab.index_dataset(test_dataset,
                        field_name='words',
                        new_field_name='words')
    #train_dataset[0],test_dataset[0]

    # convert label to int and set it as the target
    train_dataset.apply(lambda x: int(x['label']),
                        new_field_name='target',
                        is_target=True)
    test_dataset.apply(lambda x: int(x['label']),
                       new_field_name='target',
                       is_target=True)

    #train_dataset[0],test_dataset[0]

    from fastNLP.models import CNNText
    embed_dim = 2048  #50
    model = CNNText((len(vocab), embed_dim),
                    num_classes=4,
                    padding=2,
                    dropout=0.1)
    model

    from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric

    # define the trainer and train
    trainer = Trainer(model=model,
                      train_data=train_dataset,
                      dev_data=test_dataset,
                      loss=CrossEntropyLoss(),
                      metrics=AccuracyMetric())
    trainer.train()
Example No. 5
    def test_tutorial(self):
        # read data from csv into a DataSet
        sample_path = "./data_for_tests/tutorial_sample_dataset.csv"
        dataset = DataSet.read_csv(sample_path,
                                   headers=('raw_sentence', 'label'),
                                   sep='\t')
        print(len(dataset))
        print(dataset[0])

        dataset.append(Instance(raw_sentence='fake data', label='0'))
        dataset.apply(lambda x: x['raw_sentence'].lower(),
                      new_field_name='raw_sentence')
        # convert label to int
        dataset.apply(lambda x: int(x['label']), new_field_name='label')

        # split sentences on whitespace
        def split_sent(ins):
            return ins['raw_sentence'].split()

        dataset.apply(split_sent, new_field_name='words')
        # add sequence-length information
        dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')
        # print(len(dataset))
        # print(dataset[0])

        # filter out data with DataSet.drop(func)
        dataset.drop(lambda x: x['seq_len'] <= 3)
        print(len(dataset))

        # specify which fields of the DataSet should be converted to tensors
        # set target: the gold labels used when computing the loss and evaluating the model
        dataset.set_target("label")
        # set input: fields fed to the model's forward
        dataset.set_input("words")

        # split into test and train sets
        test_data, train_data = dataset.split(0.5)
        # print(len(test_data))
        # print(len(train_data))

        # build the vocabulary with Vocabulary.add(word)
        vocab = Vocabulary(min_freq=2)
        train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
        vocab.build_vocab()

        # index the sentences with Vocabulary.to_index(word)
        train_data.apply(
            lambda x: [vocab.to_index(word) for word in x['words']],
            new_field_name='words')
        test_data.apply(
            lambda x: [vocab.to_index(word) for word in x['words']],
            new_field_name='words')
        print(test_data[0])

        model = CNNText(embed_num=len(vocab),
                        embed_dim=50,
                        num_classes=5,
                        padding=2,
                        dropout=0.1)

        from fastNLP import Trainer
        from copy import deepcopy

        # rename the corresponding DataSet fields so that they match the parameter names of the model's forward
        train_data.rename_field('words',
                                'word_seq')  # the input field matches the forward parameter name
        train_data.rename_field('label', 'label_seq')
        test_data.rename_field('words', 'word_seq')
        test_data.rename_field('label', 'label_seq')

        # instantiate a Trainer with the model and data, then train
        copy_model = deepcopy(model)
        overfit_trainer = Trainer(train_data=test_data,
                                  model=copy_model,
                                  loss=CrossEntropyLoss(pred="output",
                                                        target="label_seq"),
                                  metrics=AccuracyMetric(pred="predict",
                                                         target="label_seq"),
                                  n_epochs=10,
                                  batch_size=4,
                                  dev_data=test_data,
                                  save_path="./save")
        overfit_trainer.train()

        trainer = Trainer(train_data=train_data,
                          model=model,
                          loss=CrossEntropyLoss(pred="output",
                                                target="label_seq"),
                          metrics=AccuracyMetric(pred="predict",
                                                 target="label_seq"),
                          n_epochs=10,
                          batch_size=4,
                          dev_data=test_data,
                          save_path="./save")
        trainer.train()
        print('Train finished!')

        # evaluate with fastNLP's Tester
        tester = Tester(data=test_data,
                        model=model,
                        metrics=AccuracyMetric(pred="predict",
                                               target="label_seq"),
                        batch_size=4)
        acc = tester.test()
        print(acc)
Example No. 6
 def test_summary(self):
     model = CNNText(embed=(4, 4), num_classes=2, kernel_nums=(9,5), kernel_sizes=(1,3))
     # 4 * 4 + 4 * (9 * 1 + 5 * 3)  +  2 * (9 + 5 + 1) = 142
     self.assertSequenceEqual((142, 142, 0), summary(model))
     model.embed.requires_grad = False
     self.assertSequenceEqual((142, 126, 16), summary(model))
Example No. 7
 target_vocab_pkl_file = os.path.join(model_path, 'target_char.pkl')
 logger.warn('Loading dataset')
 data_bundle = load_serialize_obj(train_data_bundle_pkl_file)
 logger.warn('Fetching vocabularies')
 char_vocab = data_bundle.get_vocab('words')
 logger.info('char_vocab:{}'.format(char_vocab))
 target_vocab = data_bundle.get_vocab('target')
 logger.info('target_vocab:{}'.format(target_vocab))
 save_serialize_obj(char_vocab, char_vocab_pkl_file)
 save_serialize_obj(target_vocab, target_vocab_pkl_file)
 logger.info('Vocabulary serialized to: {}'.format(char_vocab_pkl_file))
 logger.warn('Selecting pretrained word embeddings')
 word2vec_embed = StaticEmbedding(char_vocab,
                                  model_dir_or_name='cn-char-fastnlp-100d')
 logger.warn('Building the neural network model')
 model = CNNText(word2vec_embed, num_classes=len(target_vocab))
 logger.info(model)
 logger.warn('Setting training hyperparameters')
 loss = CrossEntropyLoss()
 optimizer = Adam(
     [param for param in model.parameters() if param.requires_grad])
 # metric = AccuracyMetric()
 metric = ClassifyFPreRecMetric(
     tag_vocab=data_bundle.get_vocab(Const.TARGET),
     only_gross=False)  # with only_gross=False, per-label metric statistics are also returned
 device = 'cuda' if torch.cuda.is_available() else 'cpu'  # run on GPU if available; training is faster
 logger.info('device:{}'.format(device))
 batch_size = 32
 n_epochs = 10
 early_stopping = 10
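
Example No. 7 stops right after the hyperparameters are set; a plausible continuation, following the Trainer pattern used in the other examples here (the dataset names 'train' and 'dev', the EarlyStopCallback wiring and the omitted save_path are assumptions, not part of the original snippet):

from fastNLP import Trainer
from fastNLP.core.callback import EarlyStopCallback

# hypothetical continuation: wire the model, loss, optimizer and metric defined above into a Trainer
trainer = Trainer(train_data=data_bundle.get_dataset('train'),  # assumed dataset name
                  dev_data=data_bundle.get_dataset('dev'),      # assumed dataset name
                  model=model,
                  loss=loss,
                  optimizer=optimizer,
                  metrics=metric,
                  device=device,
                  batch_size=batch_size,
                  n_epochs=n_epochs,
                  callbacks=[EarlyStopCallback(early_stopping)])
trainer.train()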
Example No. 8
def train(args):
    text_data = TextData()
    with open(os.path.join(args.vocab_dir, args.vocab_data), 'rb') as fin:
        text_data = pickle.load(fin)
    vocab_size = text_data.vocab_size
    class_num = text_data.class_num
    # class_num = 1
    seq_len = text_data.max_seq_len
    print("(vocab_size,class_num,seq_len):({0},{1},{2})".format(
        vocab_size, class_num, seq_len))

    train_data = text_data.train_set
    val_data = text_data.val_set
    test_data = text_data.test_set
    train_data.set_input('words', 'seq_len')
    train_data.set_target('target')
    val_data.set_input('words', 'seq_len')
    val_data.set_target('target')

    test_data.set_input('words', 'seq_len')
    test_data.set_target('target')

    init_embeds = None
    if args.pretrain_model == "None":
        print("No pretrained model with be used.")
        print("vocabsize:{0}".format(vocab_size))
        init_embeds = (vocab_size, args.embed_size)
    elif args.pretrain_model == "word2vec":
        embeds_path = os.path.join(args.prepare_dir, 'w2v_embeds.pkl')
        print("Loading Word2Vec pretrained embedding from {0}.".format(
            embeds_path))
        with open(embeds_path, 'rb') as fin:
            init_embeds = pickle.load(fin)
    elif args.pretrain_model == 'glove':
        embeds_path = os.path.join(args.prepare_dir, 'glove_embeds.pkl')
        print(
            "Loading Glove pretrained embedding from {0}.".format(embeds_path))
        with open(embeds_path, 'rb') as fin:
            init_embeds = pickle.load(fin)
    elif args.pretrain_model == 'glove2wv':
        embeds_path = os.path.join(args.prepare_dir, 'glove2wv_embeds.pkl')
        print(
            "Loading Glove pretrained embedding from {0}.".format(embeds_path))
        with open(embeds_path, 'rb') as fin:
            init_embeds = pickle.load(fin)
    else:
        init_embeds = (vocab_size, args.embed_size)

    if args.model == "CNNText":
        print("Using CNN Model.")
        model = CNNText(init_embeds,
                        num_classes=class_num,
                        padding=2,
                        dropout=args.dropout)
    elif args.model == "StarTransformer":
        print("Using StarTransformer Model.")
        model = STSeqCls(init_embeds,
                         num_cls=class_num,
                         hidden_size=args.hidden_size)
    elif args.model == "MyCNNText":
        model = MyCNNText(init_embeds=init_embeds,
                          num_classes=class_num,
                          padding=2,
                          dropout=args.dropout)
        print("Using user defined CNNText")
    elif args.model == "LSTMText":
        print("Using LSTM Model.")
        model = LSTMText(init_embeds=init_embeds,
                         output_dim=class_num,
                         hidden_dim=args.hidden_size,
                         num_layers=args.num_layers,
                         dropout=args.dropout)
    elif args.model == "Bert":
        print("Using Bert Model.")
    else:
        print("Using default model: CNNText.")
        model = CNNText((vocab_size, args.embed_size),
                        num_classes=class_num,
                        padding=2,
                        dropout=0.1)
    print(model)
    if args.cuda:
        device = torch.device('cuda')
    else:
        device = None

    print("train_size:{0} ; val_size:{1} ; test_size:{2}".format(
        train_data.get_length(), val_data.get_length(),
        test_data.get_length()))

    if args.optim == "Adam":
        print("Using Adam as optimizer.")
        optimizer = fastnlp_optim.Adam(lr=0.001,
                                       weight_decay=args.weight_decay)
        if (args.model_suffix == "default"):
            args.model_suffix == args.optim
    else:
        print("No Optimizer will be used.")
        optimizer = None

    criterion = CrossEntropyLoss()
    metric = AccuracyMetric()
    model_save_path = os.path.join(args.model_dir, args.model,
                                   args.model_suffix)
    earlystop = EarlyStopCallback(args.patience)
    fitlog_back = FitlogCallback({"val": val_data, "train": train_data})
    trainer = Trainer(train_data=train_data,
                      model=model,
                      save_path=model_save_path,
                      device=device,
                      n_epochs=args.epochs,
                      optimizer=optimizer,
                      dev_data=val_data,
                      loss=criterion,
                      batch_size=args.batch_size,
                      metrics=metric,
                      callbacks=[fitlog_back, earlystop])
    trainer.train()
    print("Train Done.")

    tester = Tester(data=val_data,
                    model=model,
                    metrics=metric,
                    batch_size=args.batch_size,
                    device=device)
    tester.test()
    print("Test Done.")

    print("Predict the answer with best model...")
    acc = 0.0
    output = []
    data_iterator = Batch(test_data, batch_size=args.batch_size)
    for data_x, batch_y in data_iterator:
        i_data = Variable(data_x['words']).cuda()
        pred = model(i_data)[C.OUTPUT]
        pred = pred.sigmoid()
        # print(pred.shape)
        output.append(pred.cpu().data)
    output = torch.cat(output, 0).numpy()
    print(output.shape)
    print("Predict Done. {} records".format(len(output)))
    result_save_path = os.path.join(args.result_dir,
                                    args.model + "_" + args.model_suffix)
    with open(result_save_path + ".pkl", 'wb') as f:
        pickle.dump(output, f)
    output = output.squeeze()[:, 1].tolist()
    projectid = text_data.test_projectid.values
    answers = []
    count = 0
    for i in range(len(output)):
        if output[i] > 0.5:
            count += 1
    print("true sample count:{}".format(count))
    add_count = 0
    for i in range(len(projectid) - len(output)):
        output.append([0.13])
        add_count += 1
    print("Add {} default result in predict.".format(add_count))

    df = pd.DataFrame()
    df['projectid'] = projectid
    df['y'] = output
    df.to_csv(result_save_path + ".csv", index=False)
    print("Predict Done, results saved to {}".format(result_save_path))

    fitlog.finish()
Example No. 9
    def test_fastnlp_advanced_tutorial(self):
        import os
        os.chdir("tutorials/fastnlp_advanced_tutorial")

        from fastNLP import DataSet
        from fastNLP import Instance
        from fastNLP import Vocabulary
        from fastNLP import Trainer
        from fastNLP import Tester

        # ### Instance
        # An Instance represents one sample, made up of one or more fields (attributes/features); each field has its own name and value
        # when initializing an Instance you define its fields with the "field_name=field_value" syntax

        # In[2]:

        # build an Instance made up of three fields: premise, hypothesis and label
        instance = Instance(premise='an premise example .',
                            hypothesis='an hypothesis example.',
                            label=1)
        instance

        # In[3]:

        data_set = DataSet([instance] * 5)
        data_set.append(instance)
        data_set[-2:]

        # In[4]:

        # an instance can still be appended even if one of its field types differs from the dataset's corresponding field type
        instance2 = Instance(premise='the second premise example .',
                             hypothesis='the second hypothesis example.',
                             label='1')
        try:
            data_set.append(instance2)
        except:
            pass
        data_set[-2:]

        # In[5]:

        # if a field name does not match, the instance cannot be appended to the dataset
        instance3 = Instance(premises='the third premise example .',
                             hypothesis='the third hypothesis example.',
                             label=1)
        try:
            data_set.append(instance3)
        except:
            print('cannot append instance')
            pass
        data_set[-2:]

        # In[6]:

        # besides text, a tensor can also be used as the value of a field
        import torch
        tensor_ins = Instance(image=torch.randn(5, 5), label=0)
        ds = DataSet()
        ds.append(tensor_ins)
        ds

        from fastNLP import DataSet
        from fastNLP import Instance

        # read data from csv into a DataSet
        # any csv-like file, i.e. one example per line, can be read this way
        dataset = DataSet.read_csv('tutorial_sample_dataset.csv',
                                   headers=('raw_sentence', 'label'),
                                   sep='\t')
        # check the size of the DataSet
        len(dataset)

        # In[8]:

        # use an integer index [k] to get the k-th sample
        dataset[0]

        # In[9]:

        # the retrieved sample is an Instance
        type(dataset[0])

        # In[10]:

        # use a slice [a: b] to get the a-th through b-th samples
        dataset[0:3]

        # In[11]:

        # indices can also be negative
        dataset[-1]

        data_path = ['premise', 'hypothesis', 'label']

        # read the files
        with open(data_path[0]) as f:
            premise = f.readlines()

        with open(data_path[1]) as f:
            hypothesis = f.readlines()

        with open(data_path[2]) as f:
            label = f.readlines()

        assert len(premise) == len(hypothesis) and len(hypothesis) == len(
            label)

        # build the DataSet
        data_set = DataSet()
        for p, h, l in zip(premise, hypothesis, label):
            p = p.strip()  # strip trailing whitespace
            h = h.strip()  # strip trailing whitespace
            data_set.append(Instance(premise=p, hypothesis=h, truth=l))

        data_set[0]

        # ### Other DataSet operations
        # after the DataSet is built, its contents can still be modified; the interface is DataSet.apply()

        # In[13]:

        # lowercase all text in the premise field
        data_set.apply(lambda x: x['premise'].lower(),
                       new_field_name='premise')
        data_set[-2:]

        # In[14]:

        # convert label to int
        data_set.apply(lambda x: int(x['truth']), new_field_name='truth')
        data_set[-2:]

        # In[15]:

        # split sentences on whitespace
        def split_sent(ins):
            return ins['premise'].split()

        data_set.apply(split_sent, new_field_name='premise')
        data_set.apply(lambda x: x['hypothesis'].split(),
                       new_field_name='hypothesis')
        data_set[-2:]

        # In[16]:

        # filter data
        origin_data_set_len = len(data_set)
        data_set.drop(lambda x: len(x['premise']) <= 6)
        origin_data_set_len, len(data_set)

        # In[17]:

        # add length information
        data_set.apply(lambda x: [1] * len(x['premise']),
                       new_field_name='premise_len')
        data_set.apply(lambda x: [1] * len(x['hypothesis']),
                       new_field_name='hypothesis_len')
        data_set[-1]

        # In[18]:

        # set the input (feature) and target (label) fields
        data_set.set_input("premise", "premise_len", "hypothesis",
                           "hypothesis_len")
        data_set.set_target("truth")

        # In[19]:

        # rename a field
        data_set.rename_field('truth', 'label')
        data_set[-1]

        # In[20]:

        # split into train, dev and test sets
        train_data, vad_data = data_set.split(0.5)
        dev_data, test_data = vad_data.split(0.4)
        len(train_data), len(dev_data), len(test_data)

        # In[21]:

        # deep-copy the datasets
        import copy
        train_data_2, dev_data_2 = copy.deepcopy(train_data), copy.deepcopy(
            dev_data)
        del copy

        # initialize a vocabulary with max vocab_size 10000 and minimum word frequency 2; '<unk>' marks unknown words and '<pad>' the padding token
        # Vocabulary's default init parameters are max_size=None, min_freq=None, unknown='<unk>', padding='<pad>'
        vocab = Vocabulary(max_size=10000,
                           min_freq=2,
                           unknown='<unk>',
                           padding='<pad>')

        # build the vocabulary
        train_data.apply(lambda x: [vocab.add(word) for word in x['premise']])
        train_data.apply(
            lambda x: [vocab.add(word) for word in x['hypothesis']])
        vocab.build_vocab()

        # In[23]:

        # index the sentences with the vocabulary
        train_data.apply(
            lambda x: [vocab.to_index(word) for word in x['premise']],
            new_field_name='premise')
        train_data.apply(
            lambda x: [vocab.to_index(word) for word in x['hypothesis']],
            new_field_name='hypothesis')
        dev_data.apply(
            lambda x: [vocab.to_index(word) for word in x['premise']],
            new_field_name='premise')
        dev_data.apply(
            lambda x: [vocab.to_index(word) for word in x['hypothesis']],
            new_field_name='hypothesis')
        test_data.apply(
            lambda x: [vocab.to_index(word) for word in x['premise']],
            new_field_name='premise')
        test_data.apply(
            lambda x: [vocab.to_index(word) for word in x['hypothesis']],
            new_field_name='hypothesis')
        train_data[-1], dev_data[-1], test_data[-1]

        # read the vocab file
        with open('vocab.txt') as f:
            lines = f.readlines()
        vocabs = []
        for line in lines:
            vocabs.append(line.strip())

        # instantiate a Vocabulary
        vocab_bert = Vocabulary(unknown=None, padding=None)
        # add the vocabs list to the Vocabulary
        vocab_bert.add_word_lst(vocabs)
        # build the vocabulary
        vocab_bert.build_vocab()
        # update the token strings for unknown and padding
        vocab_bert.unknown = '[UNK]'
        vocab_bert.padding = '[PAD]'

        # In[25]:

        # index the sentences with the vocabulary
        train_data_2.apply(
            lambda x: [vocab_bert.to_index(word) for word in x['premise']],
            new_field_name='premise')
        train_data_2.apply(
            lambda x: [vocab_bert.to_index(word) for word in x['hypothesis']],
            new_field_name='hypothesis')
        dev_data_2.apply(
            lambda x: [vocab_bert.to_index(word) for word in x['premise']],
            new_field_name='premise')
        dev_data_2.apply(
            lambda x: [vocab_bert.to_index(word) for word in x['hypothesis']],
            new_field_name='hypothesis')
        train_data_2[-1], dev_data_2[-1]

        # step 1: load model hyperparameters (optional)
        from fastNLP.io.config_io import ConfigSection, ConfigLoader
        args = ConfigSection()
        ConfigLoader().load_config("./data/config", {"esim_model": args})
        args["vocab_size"] = len(vocab)
        args.data

        # In[27]:

        # step 2: load the ESIM model
        from fastNLP.models import ESIM
        model = ESIM(**args.data)
        model

        # In[28]:

        # another example: load the CNN text classification model
        from fastNLP.models import CNNText
        cnn_text_model = CNNText(embed_num=len(vocab),
                                 embed_dim=50,
                                 num_classes=5,
                                 padding=2,
                                 dropout=0.1)
        cnn_text_model

        from fastNLP import CrossEntropyLoss
        from fastNLP import Adam
        from fastNLP import AccuracyMetric
        trainer = Trainer(
            train_data=train_data,
            model=model,
            loss=CrossEntropyLoss(pred='pred', target='label'),
            metrics=AccuracyMetric(),
            n_epochs=3,
            batch_size=16,
            print_every=-1,
            validate_every=-1,
            dev_data=dev_data,
            use_cuda=False,
            optimizer=Adam(lr=1e-3, weight_decay=0),
            check_code_level=-1,
            metric_key='acc',
            use_tqdm=False,
        )
        trainer.train()

        tester = Tester(
            data=test_data,
            model=model,
            metrics=AccuracyMetric(),
            batch_size=args["batch_size"],
        )
        tester.test()

        os.chdir("../..")
Example No. 10
def train(args):
    text_data = TextData()
    with open(os.path.join(args.vocab_dir, args.vocab_data), 'rb') as fin:
        text_data = pickle.load(fin)
    vocab_size = text_data.vocab_size
    class_num = text_data.class_num
    seq_len = text_data.max_seq_len
    print("(vocab_size,class_num,seq_len):({0},{1},{2})".format(
        vocab_size, class_num, seq_len))

    train_data = text_data.train_set
    test_dev_data = text_data.test_set
    train_data.set_input('words', 'seq_len')
    train_data.set_target('target')
    test_dev_data.set_input('words', 'seq_len')
    test_dev_data.set_target('target')
    test_data, dev_data = test_dev_data.split(0.2)

    test_data = test_dev_data
    init_embeds = None
    if args.pretrain_model == "None":
        print("No pretrained model with be used.")
        print("vocabsize:{0}".format(vocab_size))
        init_embeds = (vocab_size, args.embed_size)
    elif args.pretrain_model == "word2vec":
        embeds_path = os.path.join(args.prepare_dir, 'w2v_embeds.pkl')
        print("Loading Word2Vec pretrained embedding from {0}.".format(
            embeds_path))
        with open(embeds_path, 'rb') as fin:
            init_embeds = pickle.load(fin)
    elif args.pretrain_model == 'glove':
        embeds_path = os.path.join(args.prepare_dir, 'glove_embeds.pkl')
        print(
            "Loading Glove pretrained embedding from {0}.".format(embeds_path))
        with open(embeds_path, 'rb') as fin:
            init_embeds = pickle.load(fin)
    elif args.pretrain_model == 'glove2wv':
        embeds_path = os.path.join(args.prepare_dir, 'glove2wv_embeds.pkl')
        print(
            "Loading Glove pretrained embedding from {0}.".format(embeds_path))
        with open(embeds_path, 'rb') as fin:
            init_embeds = pickle.load(fin)
    else:
        init_embeds = (vocab_size, args.embed_size)

    if args.model == "CNNText":
        print("Using CNN Model.")
        model = CNNText(init_embeds,
                        num_classes=class_num,
                        padding=2,
                        dropout=args.dropout)
    elif args.model == "StarTransformer":
        print("Using StarTransformer Model.")
        model = STSeqCls(init_embeds,
                         num_cls=class_num,
                         hidden_size=args.hidden_size)
    elif args.model == "MyCNNText":
        model = MyCNNText(init_embeds=init_embeds,
                          num_classes=class_num,
                          padding=2,
                          dropout=args.dropout)
        print("Using user defined CNNText")
    elif args.model == "LSTMText":
        print("Using LSTM Model.")
        model = LSTMText(init_embeds=init_embeds,
                         output_dim=class_num,
                         hidden_dim=args.hidden_size,
                         num_layers=args.num_layers,
                         dropout=args.dropout)
    elif args.model == "Bert":
        print("Using Bert Model.")
    else:
        print("Using default model: CNNText.")
        model = CNNText((vocab_size, args.embed_size),
                        num_classes=class_num,
                        padding=2,
                        dropout=0.1)
    print(model)
    if args.cuda:
        device = torch.device('cuda')
    else:
        device = None

    print("train_size:{0} ; dev_size:{1} ; test_size:{2}".format(
        train_data.get_length(), dev_data.get_length(),
        test_data.get_length()))

    if args.optim == "Adam":
        print("Using Adam as optimizer.")
        optimizer = fastnlp_optim.Adam(lr=0.001,
                                       weight_decay=args.weight_decay)
        if (args.model_suffix == "default"):
            args.model_suffix == args.optim
    else:
        print("No Optimizer will be used.")
        optimizer = None

    criterion = CrossEntropyLoss()
    metric = AccuracyMetric()
    model_save_path = os.path.join(args.model_dir, args.model,
                                   args.model_suffix)
    earlystop = EarlyStopCallback(args.patience)
    trainer = Trainer(train_data=train_data,
                      model=model,
                      save_path=model_save_path,
                      device=device,
                      n_epochs=args.epochs,
                      optimizer=optimizer,
                      dev_data=test_data,
                      loss=criterion,
                      batch_size=args.batch_size,
                      metrics=metric,
                      callbacks=[FitlogCallback(test_data), earlystop])
    trainer.train()
    print("Train Done.")

    tester = Tester(data=test_data,
                    model=model,
                    metrics=metric,
                    batch_size=args.batch_size,
                    device=device)
    tester.test()
    print("Test Done.")
    fitlog.finish()
Example No. 11
    test_dataset.set_input(Const.INPUT, Const.INPUT_LEN)
    test_dataset.set_target(Const.TARGET)

    # Split into development dataset
    train_dataset, dev_dataset = train_dataset.split(0.1)

    return train_dataset, dev_dataset, test_dataset, vocab


if __name__ == '__main__':

    # Usage
    train_data, dev_data, test_data, vocab = get_train_dev_test_vocab()

    model_cnn = CNNText((len(vocab), 50),
                        num_classes=20,
                        padding=2,
                        dropout=0.1)

    loss = CrossEntropyLoss(pred=Const.OUTPUT, target=Const.TARGET)

    metrics = AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET)

    trainer = Trainer(model=model_cnn,
                      train_data=train_data,
                      dev_data=dev_data,
                      loss=loss,
                      metrics=metrics)

    trainer.train()

    tester = Tester(test_data, model_cnn, metrics=AccuracyMetric())
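
The snippet ends right after the Tester is constructed; presumably the evaluation is launched the same way as in the other examples:

    tester.test()  # evaluate model_cnn on test_data with AccuracyMetric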