def test_train_use_pretrained_embedding(self):
    """Train a TextCNN classifier with an embedding layer initialized from
    pre-trained GloVe vectors and frozen during training.

    Loads a pickled IMDB test split, builds a vocabulary, copies GloVe
    vectors into the embedding weight, trains for 2 epochs, then saves the
    model and vocabulary artifacts under ./data/save/text_classify/textcnn/.
    """
    rule = '-' * 15
    print('{} test_train_use_pretrained_embedding {}'.format(rule, rule))
    # Load the serialized dataset; keep a slice small enough for CPU training.
    dataset = load_serialize_obj('./data/aclImdb/test_data.pkl')
    dataset = dataset[:1000]
    tokenized = get_tokenized_imdb(imdb_data=dataset)
    vocab = get_tokenized_vocab(tokenized)
    vocab_size = len(vocab)
    print('vocab len:{}'.format(vocab_size))  # observed: vocab len:4345
    data_iter = get_imdb_data_iter(dataset, vocab, batch_size=8, shuffle=True)
    print('test_iter len:{}'.format(len(data_iter)))  # observed: test_iter len:125
    # Build the model.
    model = TextCNN(vocab_size=vocab_size, labels_size=2)
    # observed: total:263.152 Thousand, trainable:263.152 Thousand
    print('参数量:{}'.format(get_parameter_number(model)))
    # Initialize the embedding layer from pre-trained GloVe vectors.
    glove_embedding = torchtext.vocab.GloVe(name='6B', dim=50,
                                            cache='./data/torchtext')
    # observed: 400000 words in the GloVe vocabulary
    print("glove_embedding 一共包含%d个词。" % len(glove_embedding.stoi))
    # observed: There are 1004 oov words.
    embed = load_pretrained_embedding(words=vocab.itos,
                                      pretrained_vocab=glove_embedding)
    model.embedding.weight.data.copy_(embed)
    # The embedding is pre-trained, so exclude it from gradient updates.
    model.embedding.weight.requires_grad = False
    # observed: total:263.152 Thousand, trainable:45.902 Thousand
    print('参数量:{}'.format(get_parameter_number(model)))
    print(model)
    # Only optimize parameters that still require gradients.
    optimizer = torch.optim.Adam(
        filter(lambda p: p.requires_grad, model.parameters()))
    loss_func = nn.CrossEntropyLoss()
    # Train; this smoke test reuses the same iterator for train and dev.
    train_net(model, train_iter=data_iter, dev_iter=data_iter, max_epoch=2,
              optimizer=optimizer, loss_func=loss_func)
    save_dir = './data/save/text_classify/textcnn/'
    init_file_path(save_dir)
    # Persist the trained model.
    torch.save(model, f=save_dir + 'model.pkl')
    # Persist the vocabulary (human-readable words plus pickled object).
    save_vocab_words(vocab, file_name=save_dir + 'vocab_words.txt')
    save_serialize_obj(vocab, filename=save_dir + 'vocab.pkl')
def test_train(self):
    """Train an attention-based encoder-decoder on a small fr-en corpus,
    then save both networks and both vocabularies.
    """
    rule = '-' * 15
    print('{} test_train {}'.format(rule, rule))
    # Load the parallel corpus and build source/target vocabularies.
    data_file = './data/nlp/fr-en-small.txt'
    in_vocab, out_vocab, dataset = read_data(data_file, max_seq_len=7)
    # observed: in_vocab len:46, out_vocab len:38
    print('in_vocab len:{}, out_vocab len:{}'.format(
        len(in_vocab), len(out_vocab)))
    # Build the encoder and decoder networks.
    encoder = Encoder(vocab_size=len(in_vocab), embed_size=50,
                      num_hiddens=64, num_layers=2, drop_prob=0.5)
    # observed: total:49.532 Thousand, trainable:49.532 Thousand
    print('encoder_net 参数量:{}'.format(get_parameter_number(encoder)))
    decoder = Decoder(vocab_size=len(out_vocab), embed_size=50,
                      num_hiddens=64, num_layers=2, attention_size=10,
                      drop_prob=0.5)
    # observed: total:65.18 Thousand, trainable:65.18 Thousand
    print('decoder_net 参数量:{}'.format(get_parameter_number(decoder)))
    # Train; observed final log: epoch 100, loss 0.008
    train_machine_translation_net(encoder=encoder, decoder=decoder,
                                  dataset=dataset, out_vocab=out_vocab,
                                  max_epoch=100)
    # Save both networks.
    save_dir = './data/save/machine_translation/attention'
    init_file_path(save_dir)
    torch.save(encoder, f=os.path.join(save_dir, 'encoder_net.pkl'))
    torch.save(decoder, f=os.path.join(save_dir, 'decoder_net.pkl'))
    # Save each vocabulary as a word list plus a pickled object.
    for prefix, vocab in (('in_vocab', in_vocab), ('out_vocab', out_vocab)):
        save_vocab_words(
            vocab, file_name=os.path.join(save_dir, prefix + '_words.txt'))
        save_serialize_obj(
            vocab, filename=os.path.join(save_dir, prefix + '.pkl'))
def test_train(self):
    """Train a BiLSTM sentiment classifier on a small IMDB subset.

    Loads a pickled IMDB test split, builds a vocabulary and data iterator,
    then trains for 5 epochs using the same iterator for train and dev.
    """
    rule = '-' * 15
    print('{} test_train {}'.format(rule, rule))
    # Load the serialized dataset; keep a slice small enough for CPU training.
    dataset = load_serialize_obj('./data/aclImdb/test_data.pkl')
    dataset = dataset[:100]
    tokenized = get_tokenized_imdb(imdb_data=dataset)
    vocab = get_tokenized_vocab(tokenized)
    vocab_size = len(vocab)
    print('vocab len:{}'.format(vocab_size))  # observed: vocab len:45098
    data_iter = get_imdb_data_iter(dataset, vocab, batch_size=8, shuffle=True)
    print('test_iter len:{}'.format(len(data_iter)))  # observed: test_iter len:3218
    # Build the model.
    model = BiLSTM(vocab_size=vocab_size, labels_size=2)
    # observed: total:436.002 Thousand, trainable:436.002 Thousand
    print('参数量:{}'.format(get_parameter_number(model)))
    print(model)
    # Only optimize parameters that require gradients.
    optimizer = torch.optim.Adam(
        filter(lambda p: p.requires_grad, model.parameters()))
    loss_func = nn.CrossEntropyLoss()
    # Train; this smoke test reuses the same iterator for train and dev.
    train_net(model, train_iter=data_iter, dev_iter=data_iter, max_epoch=5,
              optimizer=optimizer, loss_func=loss_func)
def test_get_parameter_number(self):
    """Print the parameter count of a LeNet network."""
    rule = '-' * 15
    print('{} test_get_parameter_number {}'.format(rule, rule))
    model = LeNet()
    print(model)
    # observed: {'total': 44426, 'trainable': 44426}
    print(get_parameter_number(model))
def test_get_parameter_number(self):
    """Print the parameter count of a BiLSTM classifier."""
    rule = '-' * 15
    print('{} test_get_parameter_number {}'.format(rule, rule))
    model = BiLSTM(vocab_size=100000, labels_size=2)
    print(model)
    # observed: {'total': 454802, 'trainable': 454802}
    print(get_parameter_number(model))
def test_get_parameter_number(self):
    """Print the parameter count of a ResNet18 network."""
    rule = '-' * 15
    print('{} test_get_parameter_number {}'.format(rule, rule))
    model = ResNet18()
    print(model)
    # observed: {'total': 11178378, 'trainable': 11178378}
    print(get_parameter_number(model))
def test_get_parameter_number(self):
    """Print the parameter count of a VGG11 network."""
    rule = '-' * 15
    print('{} test_get_parameter_number {}'.format(rule, rule))
    model = VGG11()
    print(model)
    # observed: {'total': 128806154, 'trainable': 128806154}
    print(get_parameter_number(model))
def test_get_parameter_number(self):
    """Print the parameter counts of an attention seq2seq Encoder and Decoder."""
    rule = '-' * 15
    print('{} test_get_parameter_number {}'.format(rule, rule))
    encoder = Encoder(vocab_size=100, embed_size=50, num_hiddens=64,
                      num_layers=2, drop_prob=0.5)
    print(encoder)
    # observed: total:52.232 Thousand, trainable:52.232 Thousand
    print(get_parameter_number(encoder))
    decoder = Decoder(vocab_size=100, embed_size=50, num_hiddens=64,
                      num_layers=2, attention_size=10, drop_prob=0.5)
    print(decoder)
    # observed: total:72.31 Thousand, trainable:72.31 Thousand
    print(get_parameter_number(decoder))