Example #1
def bilstm_text():
    w = pickle.load(open("weight.bin", "rb"))

    (vocab, train_data, dev_data, test_data) = read_data()

    model_lstm = MyBLSTMText(class_num=4,
                             vocab_size=len(vocab),
                             dropout=0.5,
                             embed_weights=w)
    loss = CrossEntropyLoss()
    metrics = AccuracyMetric()
    trainer = Trainer(model=model_lstm,
                      train_data=train_data,
                      dev_data=dev_data,
                      optimizer=Adam(lr=0.0015),
                      print_every=10,
                      use_tqdm=False,
                      device='cuda:0',
                      save_path="./lstm_model",
                      loss=loss,
                      metrics=metrics)
    # callbacks=[EarlyStopCallback(10)])

    trainer.train()

    tester = Tester(test_data, model_lstm, metrics=AccuracyMetric())
    tester.test()
Example #2
    def test_trainer_suggestion6(self):
        # Check that the error message correctly alerts the user.
        # Pass a redundant parameter here so that it duplicates an existing one.
        dataset = prepare_fake_dataset2('x1', 'x_unused')
        dataset.rename_field('x_unused', 'x2')
        dataset.set_input('x1', 'x2')
        dataset.set_target('y', 'x1')

        class Model(nn.Module):
            def __init__(self):
                super().__init__()
                self.fc = nn.Linear(5, 4)

            def forward(self, x1, x2):
                x1 = self.fc(x1)
                x2 = self.fc(x2)
                x = x1 + x2
                time.sleep(0.1)
                # loss = F.cross_entropy(x, y)
                return {'preds': x}

        model = Model()
        with self.assertRaises(NameError):
            trainer = Trainer(train_data=dataset,
                              model=model,
                              loss=CrossEntropyLoss(),
                              print_every=2,
                              dev_data=dataset,
                              metrics=AccuracyMetric(),
                              use_tqdm=False)
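The Trainer tests in this listing build their data with a prepare_fake_dataset2 helper that the snippets do not show. A minimal sketch of what such a helper might look like, assuming random 5-dimensional float inputs and a 4-class integer label y (an illustrative stand-in, not the actual fastNLP test utility):

import numpy as np
from fastNLP import DataSet

def prepare_fake_dataset2(*field_names, size=100, num_classes=4):
    # each requested field becomes a random 5-dim float vector per instance
    data = {name: np.random.randn(size, 5).tolist() for name in field_names}
    # 'y' is a random class label, matching the field the tests set as target
    data['y'] = np.random.randint(num_classes, size=size).tolist()
    return DataSet(data=data)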
Example #3
    def run1(self):
        # test distributed training
        print('local rank', get_local_rank())
        set_rng_seed(100)
        data_set = prepare_fake_dataset()
        data_set.set_input("x", flag=True)
        data_set.set_target("y", flag=True)

        model = NaiveClassifier(2, 2)

        trainer = DistTrainer(
            model=model,
            train_data=data_set,
            optimizer=SGD(lr=0.1),
            loss=CrossEntropyLoss(pred="predict", target="y"),
            batch_size_per_gpu=8,
            n_epochs=3,
            print_every=50,
            save_path=self.save_path,
        )
        trainer.train()
        """
        # 应该正确运行
        """
        if trainer.is_master and os.path.exists(self.save_path):
            shutil.rmtree(self.save_path)
Example #4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--methods",
                        "-m",
                        default="lstm",
                        choices=["rnn", "lstm", "cnn"])
    parser.add_argument("--n_epochs", "-n", default=5, type=int)
    parser.add_argument("--embedding", "-e", default=100, type=int)
    parser.add_argument("--category", "-c", default=4, type=int)
    parser.add_argument("--batch", "-b", default=4, type=int)
    parser.add_argument("--learning_rate", "-l", default=0.005, type=float)
    args = parser.parse_args()
    if args.category > 20 or args.category < 1:
        raise Exception("the number of category must be between 1 and 20")
    train_data, test_data, dic_size = handle_data(args.category)
    if args.methods == "rnn":
        model = rnn(dic_size, args.category)
        output = "rnn_model.pth"
    elif args.methods == "lstm":
        model = myLSTM(dic_size, args.category)
        output = "lstm_model.pth"
    else:
        #model = cnn(dic_size, args.category)
        model = torch.load("cnn_model.pth")
        output = "cnn_model.pth"
    trainer = Trainer(train_data,
                      model,
                      loss=CrossEntropyLoss(pred="pred", target='target'),
                      optimizer=SGD(model.parameters(), lr=args.learning_rate),
                      n_epochs=args.n_epochs,
                      dev_data=test_data,
                      metrics=AccuracyMetric(pred="pred", target='target'),
                      batch_size=args.batch)
    trainer.train()
    torch.save(model, output)
Example #5
    def test_collect_fn3(self):
        """
        测试应该会覆盖

        :return:
        """
        dataset = prepare_fake_dataset2('x1', 'x2')
        dataset.set_input('x1', 'x2')
        dataset.set_target('y')
        import torch
        def fn(ins_list):
            x = []
            for ind, ins in ins_list:
                x.append(ins['x1']+ins['x2'])
            x = torch.FloatTensor(x)
            return {'x1':torch.zeros_like(x)}, {'target':torch.zeros(x.size(0)).long(), 'y':x}
        dataset.add_collect_fn(fn)

        class Model(nn.Module):
            def __init__(self):
                super().__init__()
                self.fc = nn.Linear(5, 1, bias=False)

            def forward(self, x1):
                x1 = self.fc(x1)
                assert x1.sum() == 0, "x1 should have been replaced with zeros by the collect_fn"
                # loss = F.cross_entropy(x, y)
                return {'pred': x1}

        model = Model()
        trainer = Trainer(train_data=dataset, model=model, loss=CrossEntropyLoss(), print_every=2,
                          dev_data=dataset, metrics=AccuracyMetric(), use_tqdm=False, n_epochs=1)
        best_metric = trainer.train()['best_eval']['AccuracyMetric']['acc']
        self.assertTrue(best_metric==1)
Example #6
    def test_collect_fn2(self):
        """测试能否实现batch_x, batch_y"""
        dataset = prepare_fake_dataset2('x1', 'x2')
        dataset.set_input('x1', 'x2')
        dataset.set_target('y', 'x1')
        import torch
        def fn(ins_list):
            x = []
            for ind, ins in ins_list:
                x.append(ins['x1']+ins['x2'])
            x = torch.FloatTensor(x)
            return {'x':x}, {'target':x[:, :4].argmax(dim=-1)}
        dataset.add_collect_fn(fn)

        class Model(nn.Module):
            def __init__(self):
                super().__init__()
                self.fc = nn.Linear(5, 4)

            def forward(self, x1, x2, x):
                x1 = self.fc(x1)
                x2 = self.fc(x2)
                x = self.fc(x)
                sum_x = x1 + x2 + x
                time.sleep(0.1)
                # loss = F.cross_entropy(x, y)
                return {'pred': sum_x}

        model = Model()
        trainer = Trainer(train_data=dataset, model=model, loss=CrossEntropyLoss(), print_every=2,
                          dev_data=dataset, metrics=AccuracyMetric(), use_tqdm=False)
        trainer.train()
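As the two collect_fn tests above suggest, the function registered with DataSet.add_collect_fn receives a list of (index, instance) pairs and returns a (batch_x, batch_y) pair of dicts that are merged into the batch. A minimal sketch of that contract, applied to padding a hypothetical variable-length 'words' field (the field name and padding scheme are illustrative assumptions):

import torch

def pad_words_collect_fn(ins_list):
    # ins_list is a list of (index, instance) pairs, as in the tests above
    seqs = [ins['words'] for _, ins in ins_list]
    max_len = max(len(seq) for seq in seqs)
    padded = [seq + [0] * (max_len - len(seq)) for seq in seqs]
    batch_x = {'words': torch.LongTensor(padded)}   # extra/overriding input fields
    batch_y = {}                                    # extra/overriding target fields
    return batch_x, batch_y

# dataset.add_collect_fn(pad_words_collect_fn)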
Example #7
def train():

    train_data, dev_data, test_data, vocab = get_train_dev_test_vocab()

    model = CNNText(vocab_size=len(vocab), embedding_dim=50, output_size=20)
    model = torch.load(load_path)

    loss = CrossEntropyLoss(pred=Const.OUTPUT, target=Const.TARGET)

    metrics = AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET)
    '''
    trainer = Trainer(model=model,
                      train_data=train_data, 
                      dev_data=dev_data, 
                      loss=loss, 
                      metrics=metrics, 
                      n_epochs=100, 
                      save_path=checkpoint_path)

    trainer.train()
    '''

    tester = Tester(test_data, model, metrics=AccuracyMetric())

    tester.test()
Example #8
def train(config):
    train_data = pickle.load(open(os.path.join(config.data_path, config.train_name), "rb"))
    dev_data = pickle.load(open(os.path.join(config.data_path, config.dev_name), "rb"))
    test_data = pickle.load(open(os.path.join(config.data_path, config.test_name), "rb"))
    vocabulary = pickle.load(open(os.path.join(config.data_path, config.vocabulary_name), "rb"))
    # load w2v data
    weight = pickle.load(open(os.path.join(config.data_path, config.weight_name), "rb"))

    if config.task_name == "lstm":
        text_model = LSTM(vocab_size=len(vocabulary), embed_dim=config.embed_dim,
                         output_dim=config.class_num, hidden_dim=config.hidden_dim,
                         num_layers=config.num_layers, dropout=config.dropout)
    elif config.task_name == "lstm_maxpool":
        text_model = LSTM_maxpool(vocab_size=len(vocabulary), embed_dim=config.embed_dim,
                                  output_dim=config.class_num, hidden_dim=config.hidden_dim,
                                  num_layers=config.num_layers, dropout=config.dropout)
    elif config.task_name == "rnn":
        text_model = RNN(vocab_size=len(vocabulary), embed_dim=config.embed_dim,
                         output_dim=config.class_num, hidden_dim=config.hidden_dim,
                         num_layers=config.num_layers, dropout=config.dropout)
    elif config.task_name == "cnn":
        text_model = CNN(vocab_size=len(vocabulary), embed_dim=config.embed_dim,
                         class_num=config.class_num, kernel_num=config.kernel_num,
                         kernel_sizes=config.kernel_sizes, dropout=config.dropout,
                         static=config.static, in_channels=config.in_channels)
    elif config.task_name == "cnn_w2v":
        text_model = CNN_w2v(vocab_size=len(vocabulary), embed_dim=config.embed_dim,
                             class_num=config.class_num, kernel_num=config.kernel_num,
                             kernel_sizes=config.kernel_sizes, dropout=config.dropout,
                             static=config.static, in_channels=config.in_channels,
                             weight=weight)
    elif config.task_name == "rcnn":
        text_model = RCNN(vocab_size=len(vocabulary), embed_dim=config.embed_dim, 
                          output_dim=config.class_num, hidden_dim=config.hidden_dim, 
                          num_layers=config.num_layers, dropout=config.dropout)
    optimizer = Adam(lr=config.lr, weight_decay=config.weight_decay)
    timing = TimingCallback()
    early_stop = EarlyStopCallback(config.patience)
    accuracy = AccuracyMetric(pred='output', target='target')

    trainer = Trainer(train_data=train_data, model=text_model, loss=CrossEntropyLoss(),
                      batch_size=config.batch_size, check_code_level=0,
                      metrics=accuracy, n_epochs=config.epoch,
                      dev_data=dev_data, save_path=config.save_path,
                      print_every=config.print_every, validate_every=config.validate_every,
                      optimizer=optimizer, use_tqdm=False,
                      device=config.device, callbacks=[timing, early_stop])
    trainer.train()

    # test result
    tester = Tester(test_data, text_model, metrics=accuracy)
    tester.test()
Example #9
def train_TextRNN():
    model = TextRNN(TextRNNConfig)
    loss = CrossEntropyLoss(pred="pred", target="target")
    metrics = AccuracyMetric(pred="pred", target="target")
    trainer = Trainer(model=model,
                      train_data=dataset_train,
                      dev_data=dataset_dev,
                      loss=loss,
                      metrics=metrics,
                      batch_size=16,
                      n_epochs=20)
    trainer.train()
    tester = Tester(dataset_test, model, metrics)
    tester.test()
Example #10
def train_classifier():
    n_epochs = 50

    trainer = Trainer(train_data=fast_data.train_data,
                      dev_data=fast_data.test_data,
                      model=disc,
                      loss=CrossEntropyLoss(target='label_seq'),
                      metrics=AccuracyMetric(target='label_seq'),
                      n_epochs=n_epochs,
                      batch_size=batch_size,
                      optimizer=Adam(lr=0.001,
                                     weight_decay=0,
                                     model_params=disc.parameters()))
    trainer.train()
    print('Disc Train finished!')
Example #11
    def __init__(self, embed,label_vocab,pos_idx=31,
                Parsing_rnn_layers=3, Parsing_arc_mlp_size=500,
                Parsing_label_mlp_size=100,Parsing_use_greedy_infer=False,
                encoding_type='bmeso',embedding_dim=768,dropout=0.1,use_pos_embedding=True,
                use_average=True):
        super().__init__()
        self.embed = embed
        self.use_pos_embedding=use_pos_embedding
        self.use_average=use_average
        self.label_vocab=label_vocab
        self.pos_idx=pos_idx
        self.user_dict_weight=0.05
        embedding_dim_1=512
        embedding_dim_2=256
        
        
        self.layers_map={'CWS':'-1','POS':'-1','Parsing':'-1','NER':'-1'}
        #NER
        self.ner_linear=nn.Linear(embedding_dim,len(label_vocab['NER']))
        trans = allowed_transitions(label_vocab['NER'], encoding_type='bmeso', include_start_end=True)
        self.ner_crf = ConditionalRandomField(len(label_vocab['NER']), include_start_end_trans=True, allowed_transitions=trans)

        #parsing
        self.biaffine_parser=BertCharParser(
                    app_index=self.label_vocab['Parsing'].to_index('APP'),
                    vector_size=768,
                    num_label=len(label_vocab['Parsing']),
                    rnn_layers=Parsing_rnn_layers,
                    arc_mlp_size=Parsing_arc_mlp_size,
                    label_mlp_size=Parsing_label_mlp_size,
                    dropout=dropout,
                    use_greedy_infer=Parsing_use_greedy_infer)
        
        if self.use_pos_embedding:
            self.pos_embedding=nn.Embedding(len(self.label_vocab['pos']),embedding_dim, padding_idx=0)
        
        
        self.loss=CrossEntropyLoss(padding_idx=0)

        #CWS
        self.cws_mlp=MLP([embedding_dim, embedding_dim_1,embedding_dim_2, len(label_vocab['CWS'])], 'relu', output_activation=None)
        trans=allowed_transitions(label_vocab['CWS'],include_start_end=True)
        self.cws_crf = ConditionalRandomField(len(label_vocab['CWS']), include_start_end_trans=True, allowed_transitions=trans)

        #POS
        self.pos_mlp=MLP([embedding_dim, embedding_dim_1,embedding_dim_2, len(label_vocab['POS'])], 'relu', output_activation=None)
        trans=allowed_transitions(label_vocab['POS'],include_start_end=True)
        self.pos_crf = ConditionalRandomField(len(label_vocab['POS']), include_start_end_trans=True, allowed_transitions=trans)
Example #12
def cnn_text():
    w = pickle.load(open("weight.bin", "rb"))

    (vocab, train_data, dev_data, test_data) = read_data()

    model_cnn = MyCNNText(class_num=4, vocab_size=len(vocab), embed_weights=w)
    loss = CrossEntropyLoss()
    metrics = AccuracyMetric()
    trainer = Trainer(model=model_cnn,
                      train_data=train_data,
                      dev_data=dev_data,
                      batch_size=32,
                      print_every=10,
                      use_tqdm=False,
                      device='cuda:0',
                      save_path="./cnn_model",
                      loss=loss,
                      metrics=metrics)

    trainer.train()

    tester = Tester(test_data, model_cnn, metrics=AccuracyMetric())
    tester.test()
Example #13
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--method",
        default='cnn',
        help="train model and test it",
        choices=['cnn', 'cnn_glove', 'rnn', 'rnn_maxpool', 'rnn_avgpool'])
    parser.add_argument("--dataset",
                        default='1',
                        help="1: small dataset; 2: big dataset",
                        choices=['1', '2'])
    args = parser.parse_args()

    # Hyperparameters
    embedding_dim = 256
    batch_size = 32
    # RNN
    hidden_dim = 256
    # CNN
    kernel_sizes = (3, 4, 5)
    num_channels = (120, 160, 200)
    acti_function = 'relu'

    learning_rate = 1e-3
    train_patience = 8
    cate_num = 4

    # GloVe
    embedding_file_path = "glove.6B.100d.txt"

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    vocab = read_vocab("vocab.txt")
    print("vocabulary length:", len(vocab))
    train_data = DataSet().load("train_set")
    dev_data = DataSet().load("dev_set")
    test_data = DataSet().load("test_set")

    if (args.dataset == '1'):
        cate_num = 4
        num_channels = (48, 48, 48)
        embedding_dim = 128
        hidden_dim = 128
    elif (args.dataset == '2'):
        cate_num = 20

    if (args.method == 'cnn'):
        model = TextCNN(vocab_size=len(vocab),
                        embedding_dim=embedding_dim,
                        kernel_sizes=kernel_sizes,
                        num_channels=num_channels,
                        num_classes=cate_num,
                        activation=acti_function)
    elif (args.method == 'cnn_glove'):
        glove_embedding = EmbedLoader.load_with_vocab(embedding_file_path,
                                                      vocab)
        embedding_dim = glove_embedding.shape[1]
        print("GloVe embedding_dim:", embedding_dim)

        model = TextCNN_glove(vocab_size=len(vocab),
                              embedding_dim=embedding_dim,
                              kernel_sizes=kernel_sizes,
                              num_channels=num_channels,
                              num_classes=cate_num,
                              activation=acti_function)
        model.embedding.load_state_dict(
            {"weight": torch.from_numpy(glove_embedding)})
        model.constant_embedding.load_state_dict(
            {"weight": torch.from_numpy(glove_embedding)})
        model.constant_embedding.weight.requires_grad = False
        model.embedding.weight.requires_grad = True

    elif (args.method == 'rnn'):
        embedding_dim = 128
        hidden_dim = 128
        model = BiRNNText(vocab_size=len(vocab),
                          embedding_dim=embedding_dim,
                          output_dim=cate_num,
                          hidden_dim=hidden_dim)
    elif (args.method == 'rnn_maxpool'):
        model = BiRNNText_pool(vocab_size=len(vocab),
                               embedding_dim=embedding_dim,
                               output_dim=cate_num,
                               hidden_dim=hidden_dim,
                               pool_name="max")
    elif (args.method == 'rnn_avgpool'):
        model = BiRNNText_pool(vocab_size=len(vocab),
                               embedding_dim=embedding_dim,
                               output_dim=cate_num,
                               hidden_dim=hidden_dim,
                               pool_name="avg")

    tester = Tester(test_data, model, metrics=AccuracyMetric())

    trainer = Trainer(
        train_data=train_data,
        model=model,
        loss=CrossEntropyLoss(pred=Const.OUTPUT, target=Const.TARGET),
        metrics=AccuracyMetric(),
        n_epochs=80,
        batch_size=batch_size,
        print_every=10,
        validate_every=-1,
        dev_data=dev_data,
        optimizer=torch.optim.Adam(model.parameters(), lr=learning_rate),
        check_code_level=2,
        metric_key='acc',
        use_tqdm=True,
        callbacks=[EarlyStopCallback(train_patience)],
        device=device,
    )

    trainer.train()
    tester.test()
Example #14
            {
                'dev_matched': data_info.datasets['dev_matched'],
                'dev_mismatched': data_info.datasets['dev_mismatched']
            },
            verbose=1))

trainer = Trainer(train_data=data_info.datasets['train'],
                  model=model,
                  optimizer=optimizer,
                  num_workers=0,
                  batch_size=arg.batch_size,
                  n_epochs=arg.n_epochs,
                  print_every=-1,
                  dev_data=data_info.datasets[arg.devset_name],
                  metrics=AccuracyMetric(pred="pred", target="target"),
                  metric_key='acc',
                  device=[i for i in range(torch.cuda.device_count())],
                  check_code_level=-1,
                  callbacks=callbacks,
                  loss=CrossEntropyLoss(pred="pred", target="target"))
trainer.train(load_best_model=True)

tester = Tester(
    data=data_info.datasets[arg.testset_name],
    model=model,
    metrics=AccuracyMetric(),
    batch_size=arg.batch_size,
    device=[i for i in range(torch.cuda.device_count())],
)
tester.test()
Example #15
def train():
    train_data = pickle.load(open(opt.train_data_path, 'rb'))
    validate_data = pickle.load(open(opt.validate_data_path, 'rb'))

    vocab = pickle.load(open(opt.vocab, 'rb'))
    word2idx = vocab.word2idx
    idx2word = vocab.idx2word
    input_size = len(word2idx)

    vocab_size = opt.class_num
    class_num = opt.class_num

    embedding_dim = opt.embedding_dim

    if opt.model_name == "LSTMModel":
        model = utils.find_class_by_name(opt.model_name,
                                         [models])(input_size, vocab_size,
                                                   embedding_dim,
                                                   opt.use_word2vec,
                                                   opt.embedding_weight_path)
    elif opt.model_name == "B_LSTMModel":
        model = utils.find_class_by_name(opt.model_name,
                                         [models])(input_size, vocab_size,
                                                   embedding_dim,
                                                   opt.use_word2vec,
                                                   opt.embedding_weight_path)
    elif opt.model_name == "CNNModel":
        model = utils.find_class_by_name(opt.model_name,
                                         [models])(input_size, vocab_size,
                                                   embedding_dim,
                                                   opt.use_word2vec,
                                                   opt.embedding_weight_path)
    elif opt.model_name == "MyBertModel":
        #bert_dir = "./BertPretrain"
        #bert_dir = None
        #model = utils.find_class_by_name(opt.model_name, [models])(10, 0.1, 4, bert_dir)
        train_data.apply(lambda x: x['input_data'][:2500],
                         new_field_name='input_data')
        validate_data.apply(lambda x: x['input_data'][:2500],
                            new_field_name='input_data')

        model = utils.find_class_by_name(opt.model_name, [models])(
            input_size=input_size,
            hidden_size=512,
            hidden_dropout_prob=0.1,
            num_labels=class_num,
            use_word2vec=opt.use_word2vec,
            embedding_weight_path=opt.embedding_weight_path,
        )

    if not os.path.exists(opt.save_model_path):
        os.mkdir(opt.save_model_path)

    # define dataloader
    train_data.set_input('input_data', flag=True)
    train_data.set_target('target', flag=True)
    validate_data.set_input('input_data', flag=True)
    validate_data.set_target('target', flag=True)

    if opt.optimizer == 'SGD':
        _optimizer = SGD(lr=opt.learning_rate, momentum=0)
    elif opt.optimizer == 'SGD_momentum':
        _optimizer = SGD(lr=opt.learning_rate, momentum=0.9)
    elif opt.optimizer == 'Adam':
        _optimizer = Adam(lr=opt.learning_rate, weight_decay=0)

    overfit_trainer = Trainer(
        model=model,
        train_data=train_data,
        loss=CrossEntropyLoss(pred="output", target="target"),
        n_epochs=opt.epoch,
        batch_size=opt.batch_size,
        device=[0, 1, 2, 3],
        #device=None,
        dev_data=validate_data,
        metrics=AccuracyMetric(pred="output", target="target"),
        metric_key="+acc",
        validate_every=opt.validate_every,
        optimizer=_optimizer,
        callbacks=[EarlyStopCallback(opt.patience)],
        save_path=opt.save_model_path)

    overfit_trainer.train()
Example #16
# set the pad value for the input field
bundle.set_pad_val("words", 0)
bundle.set_input("words")
bundle.set_target("target")

model = BertForSentenceMatching(embed)
from fastNLP import AccuracyMetric
metrics = AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET)

# train the model
from fastNLP import Trainer
from fastNLP import CrossEntropyLoss
N_EPOCHS = 1
BATCH_SIZE = 16

trainer = Trainer(loss=CrossEntropyLoss(),
                  model=model,
                  train_data=bundle.get_dataset("train"),
                  dev_data=bundle.get_dataset("dev"),
                  metrics=metrics,
                  n_epochs=N_EPOCHS,
                  batch_size=BATCH_SIZE)
trainer.train()

from fastNLP import Tester

tester = Tester(bundle.get_dataset("test"), model, metrics)
tester.test()

# print result
Example #17
        chars = self.embedding(chars)
        outputs = self.mlp(chars)

        return {Const.OUTPUT: outputs}

embed = BertEmbedding(data.get_vocab(Const.CHAR_INPUT), model_dir_or_name='cn-wwm-ext',
                        pool_method='first', requires_grad=True, layers='11', include_cls_sep=False, dropout=0.5)

callbacks = [
                GradientClipCallback(clip_type='norm', clip_value=1),
                WarmupCallback(warmup=0.1, schedule='linear')
            ]

model = BertCNNER(embed, len(data.vocabs[Const.TARGET]))
optimizer = AdamW(model.parameters(), lr=3e-5)

for name, dataset in data.datasets.items():
    original_len = len(dataset)
    dataset.drop(lambda x:x['seq_len']>256, inplace=True)
    clipped_len = len(dataset)
    print("Delete {} instances in {}.".format(original_len-clipped_len, name))

trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(),
                  device=0, dev_data=data.datasets['test'], batch_size=6,
                  metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type),
                  loss=CrossEntropyLoss(reduction='sum'),
                  callbacks=callbacks, num_workers=2, n_epochs=5,
                  check_code_level=0, update_every=3)
trainer.train()

Example #18
def run_cnn():
    dataset_train_p2, dataset_test_p2 = get_text_classification_datasets()

    line_len = len(dataset_train_p2.data)
    with open("formalized_train_data.csv", "w") as file:
        for i in range(line_len):
            file.write(
                document2line(dataset_train_p2.data[i]) + "\t" +
                str(dataset_train_p2.target[i]) + '\n')

    line_len = len(dataset_test_p2.data)
    with open("formalized_test_data.csv", "w") as file2:
        for i in range(line_len):
            file2.write(
                document2line(dataset_test_p2.data[i]) + "\t" +
                str(dataset_test_p2.target[i]) + '\n')

    loader = CSVLoader(headers=('raw_sentence', 'label'), sep='\t')
    train_dataset = loader.load("./formalized_train_data.csv")
    test_dataset = loader.load("./formalized_test_data.csv")

    os.remove("./formalized_train_data.csv")
    os.remove("./formalized_test_data.csv")

    train_dataset.apply(lambda x: x['raw_sentence'].lower(),
                        new_field_name='sentence')
    train_dataset.apply(lambda x: x['sentence'].split(),
                        new_field_name='words',
                        is_input=True)

    test_dataset.apply(lambda x: x['raw_sentence'].lower(),
                       new_field_name='sentence')
    test_dataset.apply(lambda x: x['sentence'].split(),
                       new_field_name='words',
                       is_input=True)

    #train_dataset[0],test_dataset[0]

    from fastNLP import Vocabulary

    # Use the Vocabulary class to count words and convert word sequences into index sequences
    vocab = Vocabulary(min_freq=2).from_dataset(train_dataset,
                                                field_name='words')
    vocab.index_dataset(train_dataset,
                        field_name='words',
                        new_field_name='words')
    vocab.index_dataset(test_dataset,
                        field_name='words',
                        new_field_name='words')
    #train_dataset[0],test_dataset[0]

    # Convert the label to an integer and set it as the target
    train_dataset.apply(lambda x: int(x['label']),
                        new_field_name='target',
                        is_target=True)
    test_dataset.apply(lambda x: int(x['label']),
                       new_field_name='target',
                       is_target=True)

    #train_dataset[0],test_dataset[0]

    from fastNLP.models import CNNText
    embed_dim = 2048  #50
    model = CNNText((len(vocab), embed_dim),
                    num_classes=4,
                    padding=2,
                    dropout=0.1)
    model

    from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric

    # Define the trainer and run training
    trainer = Trainer(model=model,
                      train_data=train_dataset,
                      dev_data=test_dataset,
                      loss=CrossEntropyLoss(),
                      metrics=AccuracyMetric())
    trainer.train()
Example #19
class CharModel(nn.Module):
    def __init__(self,
                 embed,
                 label_vocab,
                 pos_idx,
                 Parsing_rnn_layers,
                 Parsing_arc_mlp_size,
                 Parsing_label_mlp_size,
                 Parsing_use_greedy_infer=False,
                 encoding_type='bmeso',
                 embedding_dim=768,
                 dropout=0.1,
                 use_pos_embedding=False,
                 use_average=False):
        super().__init__()
        self.embed = embed
        self.use_pos_embedding = use_pos_embedding
        self.use_average = use_average
        self.label_vocab = label_vocab
        self.pos_idx = pos_idx
        embedding_dim_1 = 512
        embedding_dim_2 = 256

        self.layers_map = {'CWS': '1', 'POS': '2', 'Parsing': '3', 'NER': '2'}
        #NER
        self.ner_linear = nn.Linear(embedding_dim, len(label_vocab['NER']))
        trans = allowed_transitions(label_vocab['NER'],
                                    encoding_type='bmeso',
                                    include_start_end=True)
        self.ner_crf = ConditionalRandomField(len(label_vocab['NER']),
                                              include_start_end_trans=True,
                                              allowed_transitions=trans)

        #parsing
        self.biaffine_parser = BertCharParser(
            vector_size=768,
            num_label=len(label_vocab['Parsing']),
            rnn_layers=Parsing_rnn_layers,
            arc_mlp_size=Parsing_arc_mlp_size,
            label_mlp_size=Parsing_label_mlp_size,
            dropout=dropout,
            use_greedy_infer=Parsing_use_greedy_infer)

        if self.use_pos_embedding:
            self.pos_embedding = nn.Embedding(len(self.label_vocab['pos']),
                                              embedding_dim,
                                              padding_idx=0)

        self.loss = CrossEntropyLoss(padding_idx=0)

        #CWS
        self.cws_mlp = MLP([
            embedding_dim, embedding_dim_1, embedding_dim_2,
            len(label_vocab['CWS'])
        ],
                           'relu',
                           output_activation=None)

        #POS
        self.pos_mlp = MLP([
            embedding_dim, embedding_dim_1, embedding_dim_2,
            len(label_vocab['POS'])
        ],
                           'relu',
                           output_activation=None)

    def _generate_embedding(self, feats, word_lens, seq_len, pos):
        new_feats = []
        batch_size = feats.size()[0]
        sentence_length = feats.size()[1]
        device = feats.device
        if not self.use_average:
            for i in range(batch_size):
                new_feats.append(torch.index_select(feats[i], 0, word_lens[i]))
            new_feats = torch.stack(new_feats, 0)
        else:
            for i in range(batch_size):
                feats_for_one_sample = []
                for j in range(word_lens.size()[1]):
                    if word_lens[i][j] == 0 and j != 0:
                        feats_for_one_word = torch.zeros(feats.size()[-1])
                    else:
                        if j == word_lens.size()[1] - 1 or word_lens[i][
                                j + 1] == 0:
                            index = range(word_lens[i][j], seq_len[i])
                        else:
                            index = range(word_lens[i][j], word_lens[i][j + 1])
                        index = torch.tensor(index).to(device)
                        feats_for_one_word = torch.index_select(
                            feats[i], 0, index)
                        word_len = feats_for_one_word.size()[0]
                        feats_for_one_word = torch.mean(feats_for_one_word,
                                                        dim=0)
                    feats_for_one_sample.append(feats_for_one_word)
                feats_for_one_sample = torch.stack(feats_for_one_sample, dim=0)
                new_feats.append(feats_for_one_sample)
            new_feats = torch.stack(new_feats, 0)
        if self.use_pos_embedding:
            pos_feats = self.pos_embedding(pos)
            new_feats = new_feats + pos_feats
        return new_feats

    def _generate_from_pos(self, paths, seq_len):
        device = paths.device
        word_lens = []
        batch_size = paths.size()[0]
        new_seq_len = []
        batch_pos = []
        for i in range(batch_size):
            word_len = []
            pos = []
            for j in range(seq_len[i]):
                tag = paths[i][j]
                tag = self.label_vocab['POS'].to_word(int(tag))
                if tag.startswith('<'):
                    continue
                tag1, tag2 = tag.split('-')
                tag2 = self.label_vocab['pos'].to_index(tag2)
                if tag1 == 'S' or tag1 == 'B':
                    word_len.append(j)
                    pos.append(tag2)
            if len(pos) == 1:
                word_len.append(seq_len[i] - 1)
                pos.append(tag2)
            new_seq_len.append(len(pos))
            word_lens.append(word_len)
            batch_pos.append(pos)
        max_len = max(new_seq_len)
        for i in range(batch_size):
            word_lens[i] = word_lens[i] + [0] * (max_len - new_seq_len[i])
            batch_pos[i] = batch_pos[i] + [0] * (max_len - new_seq_len[i])
        word_lens = torch.tensor(word_lens, device=device)
        batch_pos = torch.tensor(batch_pos, device=device)
        new_seq_len = torch.tensor(new_seq_len, device=device)
        return word_lens, batch_pos, new_seq_len

    def _decode_parsing(self, dep_head, dep_label, seq_len,
                        seq_len_for_wordlist, word_lens):
        device = dep_head.device
        heads = []
        labels = []
        batch_size = dep_head.size()[0]
        app_index = self.label_vocab['Parsing'].to_index('APP')

        max_len = seq_len.max()
        for i in range(batch_size):
            head = list(range(1, seq_len[i] + 1))
            label = [app_index] * int(seq_len[i])
            head[0] = 0

            for j in range(1, seq_len_for_wordlist[i]):
                if j + 1 == seq_len_for_wordlist[i]:
                    idx = seq_len[i] - 1
                else:
                    idx = word_lens[i][j + 1] - 1

                label[idx] = int(dep_label[i][j])
                root = dep_head[i][j]
                if root >= seq_len_for_wordlist[i] - 1:
                    head[idx] = int(seq_len[i] - 1)
                else:
                    try:
                        head[idx] = int(word_lens[i][root + 1] - 1)
                    except:
                        print(len(head), idx, word_lens.size(), i, root)

            head = head + [0] * int(max_len - seq_len[i])
            label = label + [0] * int(max_len - seq_len[i])

            heads.append(head)
            labels.append(label)
        heads = torch.tensor(heads, device=device)
        labels = torch.tensor(labels, device=device)

        return heads, labels

    def forward(self,
                chars,
                seq_len,
                task_class,
                target,
                seq_len_for_wordlist=None,
                dep_head=None,
                dep_label=None,
                pos=None,
                word_lens=None):
        task = task_class[0]
        mask = chars.ne(0)

        layers = self.layers_map[task]
        feats = self.embed(chars, layers)

        if task == 'Parsing':
            parsing_feats = self._generate_embedding(feats, word_lens, seq_len,
                                                     pos)
            loss_parsing = self.biaffine_parser(parsing_feats,
                                                seq_len_for_wordlist, dep_head,
                                                dep_label)

            return loss_parsing

        if task == 'NER':
            # TODO: is the relu needed here?
            feats = F.relu(self.ner_linear(feats))
            logits = F.log_softmax(feats, dim=-1)
            loss = self.ner_crf(logits, target, mask)
            return {'loss': loss}

        if task == 'CWS':
            feats = self.cws_mlp(feats)
            #logits=F.log_softmax(feats, dim=-1)
            #loss=self.cws_crf(logits, target, mask)
            loss = self.loss.get_loss(feats, target, seq_len)
            return {'loss': loss}

        if task == 'POS':
            feats = self.pos_mlp(feats)
            #logits=F.log_softmax(feats, dim=-1)
            #loss=self.pos_crf(logits, target, mask)
            loss = self.loss.get_loss(feats, target, seq_len)
            return {'loss': loss}

    def predict(self, chars, seq_len, task_class):
        task = task_class[0]
        mask = chars.ne(0)
        layers = self.layers_map[task]
        feats = self.embed(chars, layers)

        if task == 'Parsing':
            for sample in chars:
                sample[0] = self.pos_idx
            pos_feats = self.embed(chars, '2')
            pos_feats = self.pos_mlp(pos_feats)
            #logits = F.log_softmax(pos_feats, dim=-1)
            #paths, _ = self.pos_crf.viterbi_decode(logits, mask)
            paths = pos_feats.max(dim=-1)[1]

            word_lens, batch_pos, seq_len_for_wordlist = self._generate_from_pos(
                paths, seq_len)
            parsing_feats = self._generate_embedding(feats, word_lens, seq_len,
                                                     batch_pos)
            answer = self.biaffine_parser.predict(parsing_feats,
                                                  seq_len_for_wordlist)
            head_preds = answer['head_preds']
            label_preds = answer['label_preds']
            heads, labels = self._decode_parsing(head_preds, label_preds,
                                                 seq_len, seq_len_for_wordlist,
                                                 word_lens)

            return {'head_preds': heads, 'label_preds': labels, 'pred': paths}

        if task == 'CWS':
            feats = self.cws_mlp(feats)
            #logits = F.log_softmax(feats, dim=-1)
            #paths, _ = self.cws_crf.viterbi_decode(logits, mask)
            paths = feats.max(dim=-1)[1]
            return {'pred': paths}

        if task == 'POS':
            feats = self.pos_mlp(feats)
            #logits = F.log_softmax(feats, dim=-1)
            #paths, _ = self.pos_crf.viterbi_decode(logits, mask)
            paths = feats.max(dim=-1)[1]
            return {'pred': paths}
            #output=feats.max(dim=-1)[1]

        if task == 'NER':
            feats = F.relu(self.ner_linear(feats))
            logits = F.log_softmax(feats, dim=-1)
            paths, _ = self.ner_crf.viterbi_decode(logits, mask)
            return {'pred': paths}
Example #20
from create_dataset import create_dataset

from rnn import lstm
from fastNLP import Trainer
from fastNLP import CrossEntropyLoss
from fastNLP import AccuracyMetric

vocab, train_data, dev_data, test_data = create_dataset()

model = lstm(vocab_size=len(vocab),
             embedding_length=200,
             hidden_size=128,
             output_size=20)
model.cuda()

loss = CrossEntropyLoss(pred='pred', target='target')
metrics = AccuracyMetric(pred='pred', target='target')

trainer = Trainer(model=model,
                  train_data=train_data,
                  dev_data=dev_data,
                  loss=loss,
                  metrics=metrics,
                  save_path='./',
                  device=0,
                  n_epochs=20)
trainer.train()
Example #21
if torch.cuda.is_available():
    model.cuda()

optimizer = optim.AdamW(model.parameters(), lr=lr)

data = {}
for name in ['seen', 'unseen', 'desc']:
    data[name] = data_bundle.get_dataset(name)

callbacks = [GradientClipCallback(clip_type='value', clip_value=5), WarmupCallback(warmup=0.01, schedule='linear')]
callbacks.append(FitlogCallback(data=data, verbose=1))
train_data = data_bundle.get_dataset('train')
train_data.add_seq_len('input')

# from collections import Counter
# print(Counter(train_data.get_field('seq_len').content))
# exit(0)

sampler = BucketSampler()
clip_max_length(train_data, data_bundle)

trainer = Trainer(train_data=train_data, model=model,
                  optimizer=optimizer, loss=CrossEntropyLoss(),
                  batch_size=batch_size, sampler=sampler, drop_last=False, update_every=1,
                  num_workers=1, n_epochs=n_epochs, print_every=5,
                  dev_data=data_bundle.get_dataset('dev'), metrics=MonoMetric(),
                  metric_key='t10',
                  validate_every=-1, save_path='save_models/', use_tqdm=True, device=None,
                  callbacks=callbacks, check_code_level=0)
trainer.train(load_best_model=False)
fitlog.add_other(trainer.start_time, name='start_time')
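The fitlog calls above (FitlogCallback, fitlog.add_other) assume fitlog was initialized earlier in the script. A rough sketch of the usual preamble, based on common fitlog usage rather than on this example:

import fitlog

fitlog.set_log_dir('logs/')          # directory where fitlog stores experiment records
fitlog.add_hyper_in_file(__file__)   # record hyperparameters annotated in this file
# ... build data_bundle, model, optimizer, callbacks, trainer as above ...
# fitlog.finish()                    # mark the experiment as finished when done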
Example #22
vocab.from_dataset(train_dataset, field_name='words', no_create_entry_dataset=[test_dataset])
vocab.index_dataset(train_dataset, test_dataset, field_name='words')

target_vocab = Vocabulary(padding=None, unknown=None)
target_vocab.from_dataset(train_dataset, field_name='target', no_create_entry_dataset=[test_dataset])
target_vocab.index_dataset(train_dataset, test_dataset, field_name='target')

'''build bundle'''
data_dict = {"train":train_dataset, "test":test_dataset}
vocab_dict = {"words":vocab, "target":target_vocab}
data_bundle = DataBundle(vocab_dict, data_dict)
print(data_bundle)

'''build model'''
embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='en-base-uncased', include_cls_sep=True)
model = BertForSequenceClassification(embed, len(data_bundle.get_vocab('target')))
# model = BertForSequenceClassification(embed, 2)

device = 0 if torch.cuda.is_available() else 'cpu'
trainer = Trainer(data_bundle.get_dataset('train'), model,
                  optimizer=Adam(model_params=model.parameters(), lr=2e-5),
                  loss=CrossEntropyLoss(), device=device,
                  batch_size=8, dev_data=data_bundle.get_dataset('train'),
                  metrics=AccuracyMetric(), n_epochs=10, print_every=1)
trainer.train()

tester = Tester(data_bundle.get_dataset('test'), model, batch_size=128, metrics=AccuracyMetric())
tester.test()


Example #23
    "mysgd":
    lambda: MySGD(params=net.parameters(), lr=C.lr),
    "adam":
    lambda: tc.optim.Adam(params=net.parameters(), lr=C.lr),
    "sgd":
    lambda: tc.optim.SGD(params=net.parameters(), lr=C.lr),
}

optim = optims[C.optim]()

trainer = Trainer(
    train_data=data["train"],
    dev_data=data[C.valid_data],
    model=net,
    batch_size=C.batch_size,
    loss=CrossEntropyLoss(pred="pred", target="label"),
    metrics=AccuracyMetric(pred="pred", target="label"),
    optimizer=optim,
    n_epochs=C.n_epochs,
    save_path=os.path.join(C.model_save, C.model, "./"),
    device=C.gpus,
    use_tqdm=True,
    check_code_level=-1,
)

train_result = trainer.train(load_best_model=True)
logger.log("train: {0}".format(train_result))

print("Training done. Now testing.")
tester = Tester(
    data=test_data,
Example #24
if __name__ == "__main__":

    vocab = pickle.load(open(config.vocab_path, 'rb'))
    train_data = pickle.load(open(config.train_data_path, 'rb'))
    dev_data = pickle.load(open(config.dev_data_path, 'rb'))
    test_data = pickle.load(open(config.test_data_path, 'rb'))

    if config.model == "CNN":
        model = CNN(len(vocab), config.intput_size, config.class_num)
    elif config.model == "RNN":
        model = RNN(len(vocab), config.intput_size, config.hidden_size,
                    config.class_num, config.rnn_type)

    optimizer = Adam(lr=config.learning_rate, weight_decay=0)
    loss = CrossEntropyLoss(pred="output", target="target")
    metrics = AccuracyMetric(pred="output", target="target")
    trainer = Trainer(model=model,
                      n_epochs=config.epoch,
                      validate_every=config.validate_every,
                      optimizer=optimizer,
                      train_data=train_data,
                      dev_data=dev_data,
                      metrics=metrics,
                      loss=loss,
                      batch_size=config.batch_size,
                      device='cuda:0',
                      save_path=config.save_path,
                      metric_key="acc",
                      callbacks=[EarlyStopCallback(config.patience)])
    trainer.train()
Example #25
data_dict = {"train": train_dataset, "dev": dev_dataset, "test": test_dataset}
vocab_dict = {"words": vocab, "target": target_vocab}
data_bundle = DataBundle(vocab_dict, data_dict)
print(data_bundle)
'''build model'''
embed = BertEmbedding(data_bundle.get_vocab('words'),
                      model_dir_or_name='en-base-uncased',
                      include_cls_sep=True)
model = BertForSequenceClassification(embed,
                                      len(data_bundle.get_vocab('target')))
# model = BertForSequenceClassification(embed, 2)

device = 0 if torch.cuda.is_available() else 'cpu'
trainer = Trainer(data_bundle.get_dataset('train'),
                  model,
                  optimizer=Adam(model_params=model.parameters(), lr=2e-5),
                  loss=CrossEntropyLoss(target='target'),
                  device=device,
                  batch_size=8,
                  dev_data=data_bundle.get_dataset('dev'),
                  metrics=AccuracyMetric(target='target'),
                  n_epochs=2,
                  print_every=1)
trainer.train()

tester = Tester(data_bundle.get_dataset('test'),
                model,
                batch_size=128,
                metrics=AccuracyMetric())
tester.test()
Example #26
print("\t* Loading word embeddings...")
embeddings = util.load_pickle(os.path.normpath(args["data_dir"]),
                              args["embeddings_file"])
embeddings = torch.Tensor(embeddings)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = myESIM(embeddings.shape[0],
               embeddings.shape[1],
               300,
               embeddings=embeddings,
               dropout=0.5,
               num_classes=3,
               device=device).to(device)

trainer = Trainer(train_data=train_data,
                  model=model,
                  loss=CrossEntropyLoss(pred='pred', target='label'),
                  metrics=AccuracyMetric(),
                  n_epochs=10,
                  batch_size=32,
                  print_every=-1,
                  validate_every=-1,
                  dev_data=dev_data,
                  use_cuda=True,
                  optimizer=Adam(lr=0.0004, weight_decay=0),
                  check_code_level=-1,
                  metric_key='acc',
                  use_tqdm=False)

trainer.train()
# After training, model holds the best model on dev; save it
torch.save(model.state_dict(), '../data/checkpoints/best_model.pkl')
Example #27
            kernel_sizes=kernel_sizes,
            padding=padding)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(sum(kernel_nums), num_classes)
    
    def forward(self, words, seq_len=None):
        x = self.embed(words)  # [N,L] -> [N,L,C]
        x = self.conv_pool(x)  # [N,L,C] -> [N,C]
        x = self.dropout(x)
        x = self.fc(x)  # [N,C] -> [N, N_class]
        return {C.OUTPUT: x}
    
    def predict(self, words, seq_len=None):
        output = self(words, seq_len)
        _, predict = output[C.OUTPUT].max(dim=1)
        return {C.OUTPUT: predict}


#demo version

trainData.apply(lambda x: x['data'].lower(), new_field_name='sentence')
trainData.apply(lambda x: x['sentence'].split(), new_field_name='words', is_input=True)
vocab = Vocabulary(min_freq=2)
vocab = vocab.from_dataset(trainData, field_name='words')
#change to index
vocab.index_dataset(trainData, field_name='words',new_field_name='words')
trainData.set_target('target')
model = CNNText((len(vocab),128), num_classes=20, padding=2, dropout=0.1)
train_data, dev_data = trainData.split(0.2)
trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data, loss=CrossEntropyLoss(), metrics=AccuracyMetric(), batch_size=16)
trainer.train()
Example #28
dataset.rename_field('words', Const.INPUT)
dataset.rename_field('target', Const.TARGET)
dataset.rename_field('seq_len', Const.INPUT_LEN)
dataset.set_input(Const.INPUT, Const.INPUT_LEN)
dataset.set_target(Const.TARGET)

testset.rename_field('words', Const.INPUT)
testset.rename_field('target', Const.TARGET)
testset.rename_field('seq_len', Const.INPUT_LEN)
testset.set_input(Const.INPUT, Const.INPUT_LEN)
testset.set_target(Const.TARGET)

train_data, dev_data = dataset.split(0.1)

loss = CrossEntropyLoss(pred=Const.OUTPUT, target=Const.TARGET)
metrics = AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET)
trainer = Trainer(model=model,
                  train_data=train_data,
                  dev_data=dev_data,
                  loss=loss,
                  batch_size=16,
                  metrics=metrics,
                  n_epochs=20,
                  callbacks=[FitlogCallback(dataset)])
trainer.train()

tester = Tester(data=testset, model=model, metrics=metrics)
tester.test()

tester = Tester(data=train_data, model=model, metrics=metrics)
Example #29
 logger.warn('loading vocabularies')
 char_vocab = data_bundle.get_vocab('words')
 logger.info('char_vocab:{}'.format(char_vocab))
 target_vocab = data_bundle.get_vocab('target')
 logger.info('target_vocab:{}'.format(target_vocab))
 save_serialize_obj(char_vocab, char_vocab_pkl_file)
 save_serialize_obj(target_vocab, target_vocab_pkl_file)
 logger.info('vocabulary serialized to: {}'.format(char_vocab_pkl_file))
 logger.warn('selecting pretrained word embeddings')
 word2vec_embed = StaticEmbedding(char_vocab,
                                  model_dir_or_name='cn-char-fastnlp-100d')
 logger.warn('building the neural network model')
 model = CNNText(word2vec_embed, num_classes=len(target_vocab))
 logger.info(model)
 logger.warn('setting training hyperparameters')
 loss = CrossEntropyLoss()
 optimizer = Adam(
     [param for param in model.parameters() if param.requires_grad])
 # metric = AccuracyMetric()
 metric = ClassifyFPreRecMetric(
     tag_vocab=data_bundle.get_vocab(Const.TARGET),
     only_gross=False)  # with only_gross=False, per-label metric statistics are also returned
 device = 'cuda' if torch.cuda.is_available() else 'cpu'  # run on GPU if available; training is much faster
 logger.info('device:{}'.format(device))
 batch_size = 32
 n_epochs = 5
 early_stopping = 10
 trainer = Trainer(save_path=model_path,
                   train_data=data_bundle.get_dataset('train'),
                   model=model,
Example #30
def run_rnn():
    dataset_train_p2, dataset_test_p2 = get_text_classification_datasets()
    line_len = len(dataset_train_p2.data)
    with open("formalized_train_data.csv", "w") as file:
        for i in range(line_len):
            file.write(
                document2line(dataset_train_p2.data[i]) + "\t" +
                str(dataset_train_p2.target[i]) + '\n')

    line_len = len(dataset_test_p2.data)
    with open("formalized_test_data.csv", "w") as file2:
        for i in range(line_len):
            file2.write(
                document2line(dataset_test_p2.data[i]) + "\t" +
                str(dataset_test_p2.target[i]) + '\n')

    loader = CSVLoader(headers=('raw_sentence', 'label'), sep='\t')
    train_dataset = loader.load("./formalized_train_data.csv")
    test_dataset = loader.load("./formalized_test_data.csv")

    train_dataset.apply(lambda x: x['raw_sentence'].lower(),
                        new_field_name='sentence')
    train_dataset.apply(lambda x: x['sentence'].split(),
                        new_field_name='words',
                        is_input=True)

    test_dataset.apply(lambda x: x['raw_sentence'].lower(),
                       new_field_name='sentence')
    test_dataset.apply(lambda x: x['sentence'].split(),
                       new_field_name='words',
                       is_input=True)

    from fastNLP import Vocabulary

    # Use the Vocabulary class to count words and convert word sequences into index sequences
    vocab = Vocabulary(min_freq=2).from_dataset(train_dataset,
                                                field_name='words')
    vocab.index_dataset(train_dataset,
                        field_name='words',
                        new_field_name='words')
    vocab.index_dataset(test_dataset,
                        field_name='words',
                        new_field_name='words')
    # Convert the label to an integer and set it as the target
    train_dataset.apply(lambda x: int(x['label']),
                        new_field_name='target',
                        is_target=True)
    test_dataset.apply(lambda x: int(x['label']),
                       new_field_name='target',
                       is_target=True)

    embed_dim = 1024
    hidden_dim = 128
    layer = 4

    model = Rnn(len(vocab), embed_dim, hidden_dim, layer, 4)
    use_gpu = torch.cuda.is_available()  # check whether GPU acceleration is available
    if use_gpu:
        model = model.cuda()

    trainer = Trainer(model=model,
                      train_data=train_dataset,
                      dev_data=test_dataset,
                      loss=CrossEntropyLoss(),
                      n_epochs=100,
                      metrics=AccuracyMetric())
    trainer.train()