Example #1
 def test_train_use_pretrained_embedding(self):
     """ 模型训练,使用预训练embed.
     """
     print('{} test_train_use_pretrained_embedding {}'.format(
         '-' * 15, '-' * 15))
     # Load the dataset
     test_data_pickle = './data/aclImdb/test_data.pkl'
     test_data = load_serialize_obj(test_data_pickle)
     test_data = test_data[:1000]  # The full dataset is large and too slow on a CPU-only machine, so take a subset for training
     test_data_tokenized = get_tokenized_imdb(imdb_data=test_data)
     test_data_vocab = get_tokenized_vocab(test_data_tokenized)
     vocab_size = len(test_data_vocab)
     print('vocab len:{}'.format(vocab_size))  # vocab len:4345
     test_iter = get_imdb_data_iter(test_data,
                                    test_data_vocab,
                                    batch_size=8,
                                    shuffle=True)
     print('test_iter len:{}'.format(len(test_iter)))  # test_iter len:125
     # Build the model
     net = TextCNN(vocab_size=vocab_size, labels_size=2)
     print('Parameter count: {}'.format(get_parameter_number(
         net)))  # total:263.152 Thousand, trainable:263.152 Thousand
     # Initialize with the pretrained embedding
     glove_embedding = torchtext.vocab.GloVe(name='6B',
                                             dim=50,
                                             cache='./data/torchtext')
     print("glove_embedding 一共包含%d个词。" %
           len(glove_embedding.stoi))  # 一共包含400000个词。
     words = test_data_vocab.itos
     embed = load_pretrained_embedding(
         words=words,
         pretrained_vocab=glove_embedding)  # There are 1004 oov words.
     net.embedding.weight.data.copy_(embed)
     net.embedding.weight.requires_grad = False  # Loaded directly from pretrained vectors, so it does not need to be updated
     print('Parameter count: {}'.format(get_parameter_number(
         net)))  # total:263.152 Thousand, trainable:45.902 Thousand
     print(net)
     optimizer = torch.optim.Adam(
         filter(lambda p: p.requires_grad, net.parameters()))
     loss_func = nn.CrossEntropyLoss()
     # Train
     train_net(net,
               train_iter=test_iter,
               dev_iter=test_iter,
               max_epoch=2,
               optimizer=optimizer,
               loss_func=loss_func)
     init_file_path('./data/save/text_classify/textcnn/')
     # Save the model
     torch.save(net, f='./data/save/text_classify/textcnn/model.pkl')
     # Save the vocabulary
     save_vocab_words(
         test_data_vocab,
         file_name='./data/save/text_classify/textcnn/vocab_words.txt')
     save_serialize_obj(
         test_data_vocab,
         filename='./data/save/text_classify/textcnn/vocab.pkl')
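The snippet above relies on a project helper, load_pretrained_embedding, to build the initial weight matrix from GloVe. A minimal sketch of such a helper is shown below; the exact signature and OOV handling are assumptions, but the idea is simply to copy the GloVe vector for every in-vocabulary word, leave out-of-vocabulary words at zero, and report how many were missed:

import torch

def load_pretrained_embedding(words, pretrained_vocab):
    """Sketch: copy pretrained vectors for known words; unknown words stay zero."""
    embed = torch.zeros(len(words), pretrained_vocab.vectors.shape[1])
    oov_count = 0
    for i, word in enumerate(words):
        idx = pretrained_vocab.stoi.get(word)
        if idx is not None:
            embed[i] = pretrained_vocab.vectors[idx]
        else:
            oov_count += 1  # word not covered by the pretrained vocabulary
    if oov_count > 0:
        print('There are {} oov words.'.format(oov_count))
    return embed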
Example #2
    def test_train(self):
        """ 模型训练.
        """
        print('{} test_train {}'.format('-' * 15, '-' * 15))
        # Load the dataset
        data_file = './data/nlp/fr-en-small.txt'
        in_vocab, out_vocab, dataset = read_data(data_file, max_seq_len=7)
        print('in_vocab len:{}, out_vocab len:{}'.format(
            len(in_vocab), len(out_vocab)))  # in_vocab len:46, out_vocab len:38
        # Build the model
        encoder_net = Encoder(vocab_size=len(in_vocab),
                              embed_size=50,
                              num_hiddens=64,
                              num_layers=2,
                              drop_prob=0.5)
        print('encoder_net parameter count: {}'.format(get_parameter_number(
            encoder_net)))  # total:49.532 Thousand, trainable:49.532 Thousand
        decoder_net = Decoder(vocab_size=len(out_vocab),
                              embed_size=50,
                              num_hiddens=64,
                              num_layers=2,
                              attention_size=10,
                              drop_prob=0.5)
        print('decoder_net parameter count: {}'.format(get_parameter_number(
            decoder_net)))  # total:65.18 Thousand, trainable:65.18 Thousand
        # Train
        train_machine_translation_net(encoder=encoder_net,
                                      decoder=decoder_net,
                                      dataset=dataset,
                                      out_vocab=out_vocab,
                                      max_epoch=100)
        # 2020-04-12 16:48:01 I [train.py:74] epoch 100, loss 0.008
        # Save the model
        file_path = './data/save/machine_translation/attention'
        init_file_path(file_path)
        torch.save(encoder_net, f=os.path.join(file_path, 'encoder_net.pkl'))

        torch.save(decoder_net, f=os.path.join(file_path, 'decoder_net.pkl'))
        # Save the vocabulary
        save_vocab_words(in_vocab,
                         file_name=os.path.join(file_path,
                                                'in_vocab_words.txt'))
        save_serialize_obj(in_vocab,
                           filename=os.path.join(file_path, 'in_vocab.pkl'))

        save_vocab_words(out_vocab,
                         file_name=os.path.join(file_path,
                                                'out_vocab_words.txt'))
        save_serialize_obj(out_vocab,
                           filename=os.path.join(file_path, 'out_vocab.pkl'))
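Both of the training examples above persist the model with torch.save and the vocabularies with save_vocab_words / save_serialize_obj. A plausible, minimal version of those two helpers is sketched below (an assumption, not the project's actual code): the first writes one token per line in index order via vocab.itos, the second simply pickles the object.

import pickle

def save_serialize_obj(obj, filename):
    """Sketch: pickle an arbitrary Python object to disk."""
    with open(filename, 'wb') as f:
        pickle.dump(obj, f)

def save_vocab_words(vocab, file_name):
    """Sketch: write the vocabulary tokens, one per line, in index order."""
    with open(file_name, 'w', encoding='utf-8') as f:
        for word in vocab.itos:
            f.write(word + '\n')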
Example #3
 def test_train(self):
     """ 模型训练.
     """
     print('{} test_train {}'.format('-' * 15, '-' * 15))
      # Load the dataset
     test_data_pickle = './data/aclImdb/test_data.pkl'
     test_data = load_serialize_obj(test_data_pickle)
      test_data = test_data[:100]  # The full dataset is large and too slow on a CPU-only machine, so take a subset for training
     test_data_tokenized = get_tokenized_imdb(imdb_data=test_data)
     test_data_vocab = get_tokenized_vocab(test_data_tokenized)
     vocab_size = len(test_data_vocab)
     print('vocab len:{}'.format(vocab_size))  # vocab len:45098
     test_iter = get_imdb_data_iter(test_data,
                                    test_data_vocab,
                                    batch_size=8,
                                    shuffle=True)
     print('test_iter len:{}'.format(len(test_iter)))  # test_iter len:3218
      # Build the model
     net = BiLSTM(vocab_size=vocab_size, labels_size=2)
      print('Parameter count: {}'.format(get_parameter_number(
         net)))  # total:436.002 Thousand, trainable:436.002 Thousand
     print(net)
     optimizer = torch.optim.Adam(
         filter(lambda p: p.requires_grad, net.parameters()))
     loss_func = nn.CrossEntropyLoss()
      # Train
     train_net(net,
               train_iter=test_iter,
               dev_iter=test_iter,
               max_epoch=5,
               optimizer=optimizer,
               loss_func=loss_func)
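The classification examples above delegate the optimization loop to train_net. The real helper presumably also evaluates on dev_iter and logs progress; the sketch below shows only the core loop it must contain, assuming each batch from the iterator yields a (features, labels) pair:

def train_net(net, train_iter, dev_iter, max_epoch, optimizer, loss_func):
    """Sketch of the training loop; dev evaluation and logging omitted."""
    for epoch in range(1, max_epoch + 1):
        net.train()
        total_loss, n_batches = 0.0, 0
        for X, y in train_iter:
            optimizer.zero_grad()
            loss = loss_func(net(X), y)  # forward pass + cross-entropy loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            n_batches += 1
        print('epoch {}, loss {:.3f}'.format(epoch, total_loss / max(n_batches, 1)))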
Example #4
 def test_get_parameter_number(self):
     """ 统计神经网络参数个数.
     """
      print('{} test_get_parameter_number {}'.format('-' * 15, '-' * 15))
     net = LeNet()
     print(net)
     print(get_parameter_number(net))  # {'total': 44426, 'trainable': 44426}
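get_parameter_number only needs to sum tensor sizes over net.parameters(). The sketch below reproduces the dict output shown in this example ({'total': ..., 'trainable': ...}); the human-readable "xxx Thousand" strings seen in the other examples would come from an extra formatting step, which is assumed and omitted here:

def get_parameter_number(net):
    """Sketch: count all parameters and those with requires_grad=True."""
    total = sum(p.numel() for p in net.parameters())
    trainable = sum(p.numel() for p in net.parameters() if p.requires_grad)
    return {'total': total, 'trainable': trainable}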
Example #5
 def test_get_parameter_number(self):
     """ 统计神经网络参数个数.
     """
     print('{} test_get_parameter_number {}'.format('-' * 15, '-' * 15))
     net = BiLSTM(vocab_size=100000, labels_size=2)
     print(net)
     print(get_parameter_number(
         net))  # {'total': 454802, 'trainable': 454802}
Example #6
 def test_get_parameter_number(self):
     """ 统计神经网络参数个数.
     """
     print('{} test_get_parameter_number {}'.format('-' * 15, '-' * 15))
     net = ResNet18()
     print(net)
     print(get_parameter_number(
         net))  # {'total': 11178378, 'trainable': 11178378}
Example #7
 def test_get_parameter_number(self):
     """ 统计神经网络参数个数.
     """
     print('{} test_get_parameter_number {}'.format('-' * 15, '-' * 15))
     net = VGG11()
     print(net)
     print(get_parameter_number(
         net))  # {'total': 128806154, 'trainable': 128806154}
Example #8
 def test_get_parameter_number(self):
     """ 统计神经网络参数个数.
     """
     print('{} test_get_parameter_number {}'.format('-' * 15, '-' * 15))
     encoder_net = Encoder(vocab_size=100,
                           embed_size=50,
                           num_hiddens=64,
                           num_layers=2,
                           drop_prob=0.5)
     print(encoder_net)
     print(get_parameter_number(
         encoder_net))  # total:52.232 Thousand, trainable:52.232 Thousand
     decoder_net = Decoder(vocab_size=100,
                           embed_size=50,
                           num_hiddens=64,
                           num_layers=2,
                           attention_size=10,
                           drop_prob=0.5)
     print(decoder_net)
     print(get_parameter_number(
         decoder_net))  # total:72.31 Thousand, trainable:72.31 Thousand