def train(args):
    """Train DeepVO with a multi-scale sequence curriculum.

    Three train loaders iterate in lockstep, each sampling sub-sequences of
    a different length/stride (seq 2/4/8 with interval 1/2/4); batch sizes
    shrink as sequences grow so each batch carries a similar frame budget.

    Args:
        args: parsed CLI namespace; uses lr_base, epoch_max, epoch_test,
            epoch_save and save.
    """
    dir_model, dir_log = pre_create_file_train(model_path, log_path, args)
    writer = SummaryWriter(dir_log)
    model = DeepVO().to(device)
    # NOTE(review): the original body passed an undefined name `loss_func`
    # to run_batch (a NameError unless a module-level global existed).
    # Defined locally as MSE for consistency with the sibling trainers.
    loss_func = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr_base)
    pbar = tqdm(range(args.epoch_max))

    data_set_t_1 = DeepVODataset(seq=2, interval=1, phase='train')
    data_set_t_2 = DeepVODataset(seq=4, interval=2, phase='train')
    data_set_t_3 = DeepVODataset(seq=8, interval=4, phase='train')
    loader_t_1 = DataLoader(data_set_t_1, batch_size=16, shuffle=True)
    loader_t_2 = DataLoader(data_set_t_2, batch_size=8, shuffle=True)
    loader_t_3 = DataLoader(data_set_t_3, batch_size=4, shuffle=True)

    # Nominal steps per epoch, based on loader_t_1; the zip below stops at
    # the shortest of the three loaders, so this is an upper-bound estimate.
    step_per_epoch = int(math.ceil(len(data_set_t_1) / loader_t_1.batch_size))

    for epoch in pbar:
        # periodically evaluate on a full test sequence
        if (epoch + 1) % args.epoch_test == 0:
            run_test()

        loss_list = []
        for step, (sample_t_1, sample_t_2, sample_t_3) in enumerate(
                zip(loader_t_1, loader_t_2, loader_t_3)):
            tic = time()
            step_global = epoch * step_per_epoch + step
            loss = run_batch(sample=[sample_t_1, sample_t_2, sample_t_3],
                             model=model, loss_func=loss_func,
                             optimizer=optimizer)
            loss_list.append(loss)
            # epoch-duration estimate extrapolated from this single batch
            hour_per_epoch = step_per_epoch * ((time() - tic) / 3600)
            if (step + 1) % 5 == 0:
                display_loss_tb(hour_per_epoch, pbar, step, step_per_epoch,
                                optimizer, loss, writer, step_global)

        # checkpoint every args.epoch_save epochs when saving is enabled
        if args.save and (epoch + 1) % args.epoch_save == 0:
            print(f'\nSave model: {dir_model}/model-{epoch+1}.pkl')
            torch.save(model.state_dict(),
                       f'{dir_model}/model-{epoch+1}.pkl')
def main(args):
    """Train a single model, then predict on the test set.

    1. Train until the mean epoch loss stops improving for 5 consecutive
       epochs, checkpointing the best model.
    2. Reload the best checkpoint and write test predictions to submit.csv.
    """
    print('\n\n')
    print('START'.center(70, '='))
    print('Net\t\t\t{:s}\nPhase\t\t\t{:s}\nSentence length\t\t{:d}'.format(
        args.net_name, args.phase, args.sen_len))
    torch.set_default_tensor_type('torch.FloatTensor')
    if args.phase == 'Train':
        model = Net(in_features=300, hidden_size=args.hidden_size,
                    layer_num=args.layer_num, phase='Train')
        if torch.cuda.is_available():
            model = nn.DataParallel(model.cuda(), device_ids=args.gpu)
        print('LOADING DATA '.center(70, '='))
        dir_model_date, dir_log_date = pre_create_file_train(
            dir_model, dir_log, args)
        writer = SummaryWriter(dir_log_date)
        loss_func = nn.BCEWithLogitsLoss()  # loss(input, target)
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr_base)
        data_set = ToxicCommentDataSet(dir_data=dir_data,
                                       sentence_length=args.sen_len,
                                       phase='Train')
        loader = DataLoader(data_set, batch_size=args.batch_size,
                            shuffle=True, num_workers=args.workers)
        step_per_epoch = int(math.ceil(len(data_set) / loader.batch_size))
        print('TRAIN'.center(70, '='))
        # -1 is a sentinel meaning "no best loss recorded yet"
        loss_best = -1
        epoch_best = 0
        epoch_current = 0
        while True:
            adjust_learning_rate(optimizer, epoch_current, args.lr_base,
                                 args.lr_decay_rate, args.epoch_lr_decay)
            loss_list = []
            for step, sample_batch in enumerate(loader):
                step_global = epoch_current * step_per_epoch + step
                tic = time()
                loss, _ = run_batch(sample=sample_batch, model=model,
                                    loss_func=loss_func,
                                    optimizer=optimizer, phase='Train')
                # epoch-duration estimate extrapolated from this one batch
                hour_per_epoch = step_per_epoch * ((time() - tic) / 3600)
                loss_list.append(loss)
                # display result and add to tensor board
                if (step + 1) % 10 == 0:
                    display_loss(hour_per_epoch, epoch_current, args, step,
                                 step_per_epoch, optimizer, loss, loss_list,
                                 writer, step_global)
            loss_mean = np.mean(loss_list)
            epoch_current += 1
            if loss_mean < loss_best or loss_best == -1:
                # new best epoch: checkpoint and reset the patience window
                loss_best = loss_mean
                epoch_best = epoch_current
                torch.save(model.state_dict(),
                           dir_model_date + '/model-best.pkl')
                print('>>>save current best model in {:s}\n'.format(
                    dir_model_date + '/model-best.pkl'))
            else:
                # early stop: 5 consecutive epochs without improvement
                if epoch_current - epoch_best == 5:
                    break
    print('TEST'.center(70, '='))
    model = Net(in_features=300, hidden_size=args.hidden_size,
                layer_num=args.layer_num, phase='Test')
    if torch.cuda.is_available():
        model = nn.DataParallel(model.cuda(), device_ids=args.gpu)
    # checkpoint directory is rebuilt from args.dir_date, so Test can run
    # standalone against a previous training date
    dir_model_date = dir_model + '/' + args.net_name + '/' + args.dir_date
    model.load_state_dict(torch.load(dir_model_date + '/model-best.pkl'))
    print('>>>load best model in {:s}\n'.format(dir_model_date +
                                                '/model-best.pkl'))
    data_set = ToxicCommentDataSet(dir_data=dir_data,
                                   sentence_length=args.sen_len,
                                   phase='Test')
    loader = DataLoader(data_set, batch_size=args.batch_size, shuffle=False,
                        num_workers=args.workers)
    predicts = []  # 153164
    for step, sample_batch in enumerate(tqdm(loader)):
        predict = run_batch(sample=sample_batch, model=model,
                            phase='Test')  # bs x 6
        predicts.extend(predict.cpu().numpy())
    ret = pd.DataFrame(data=predicts, columns=data_set.CLASSES)
    ret['id'] = data_set.test_id
    ret = ret[['id'] + data_set.CLASSES]
    ret.to_csv(dir_model_date + '/submit.csv', index=False)
    print('END'.center(70, '='))
def main(args):
    """K-fold training and geometric-mean ensembling.

    1. Train, saving the best model for every fold.
    2. Run every fold's best model on the test set and combine predictions
       with a geometric mean, sharpened by args.coefficient.
    """
    print('\n\n')
    print('START'.center(70, '='))
    print('Net\t\t\t{:s}\nPhase\t\t\t{:s}\nSentence length\t\t{:d}'.format(
        args.net_name, args.phase, args.sen_len))
    torch.set_default_tensor_type('torch.FloatTensor')
    print('LOADING DATA '.center(70, '='))
    data_set = ToxicComment(dir_data=dir_data, sentence_length=args.sen_len,
                            fold_count=args.fold_count)
    if args.phase == 'Train':
        print('TRAIN'.center(70, '='))
        model = Net(in_features=300, hidden_size=args.hidden_size,
                    layer_num=args.layer_num, phase='Train')
        if torch.cuda.is_available():
            model = nn.DataParallel(model.cuda(), device_ids=args.gpu)
        dir_model_date, dir_log_date = pre_create_file_train(
            dir_model, dir_log, args)
        loss_func = nn.BCEWithLogitsLoss()  # loss(input, target)
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr_base)
        # NOTE(review): the same model instance (and optimizer state) is
        # carried across folds — weights are not re-initialized per fold.
        # Confirm this is intentional; folds are not independent otherwise.
        for fold_id in range(0, args.fold_count):
            print('>>>Fold {:d}\n'.format(fold_id + 1))
            x_t, y_t, x_v, y_v = data_set.get_fold_by_id(fold_id)
            data_fold_train = ToxicCommentDataSet(x_t, data_set.embeddings,
                                                  y_t, phase='Train')
            data_fold_valid = ToxicCommentDataSet(x_v, data_set.embeddings,
                                                  y_v, phase='Valid')
            loader_train = DataLoader(data_fold_train,
                                      batch_size=args.batch_size,
                                      shuffle=True,
                                      num_workers=args.workers)
            loader_valid = DataLoader(data_fold_valid,
                                      batch_size=args.batch_size,
                                      shuffle=False,
                                      num_workers=args.workers)
            get_best_model(args, loader_train, loader_valid, fold_id,
                           dir_model_date, model, loss_func, optimizer)
    print('TEST'.center(70, '='))
    dir_model_date = dir_model + '/' + args.net_name + '/' + args.dir_date
    model = Net(in_features=300, hidden_size=args.hidden_size,
                layer_num=args.layer_num, phase='Test')
    if torch.cuda.is_available():
        model = nn.DataParallel(model.cuda(), device_ids=args.gpu)
    data_test = ToxicCommentDataSet(data_set.x_test, data_set.embeddings,
                                    phase='Test')
    loader_test = DataLoader(data_test, batch_size=args.batch_size,
                             shuffle=False, num_workers=args.workers)
    predicts_list = []
    for fold_id in range(0, args.fold_count):
        print('\n>>>Fold {:d}'.format(fold_id + 1))
        dir_restore = dir_model_date + '/fold{}-best.pkl'.format(fold_id + 1)
        model.load_state_dict(torch.load(dir_restore))
        predicts = []  # 153164
        for step, sample_batch in enumerate(tqdm(loader_test)):
            predict = run_batch(sample=sample_batch, model=model,
                                phase='Test')  # bs x 6
            predicts.extend(predict.cpu().numpy())
        predicts = np.array(predicts).reshape([-1, 6])
        # NOTE(review): saved as predicts-fold{fold_id} (0-based) while the
        # checkpoints use fold{fold_id + 1} (1-based) — numbering is
        # inconsistent; verify before renaming, downstream may depend on it.
        np.savetxt(dir_model_date + '/predicts-fold{}'.format(fold_id),
                   predicts)
        predicts_list.append(predicts)
    # geometric mean across folds: product of fold predictions ...
    predicts_ret = np.ones(predicts_list[0].shape)
    for predicts_fold in predicts_list:
        predicts_ret *= predicts_fold
    # ... taken to the 1/k power, then raised to args.coefficient to
    # sharpen/temper the ensembled probabilities
    predicts_ret **= (1. / len(predicts_list))
    predicts_ret **= args.coefficient
    ret = pd.DataFrame(data=predicts_ret, columns=data_set.CLASSES)
    ret['id'] = data_set.test_id
    ret = ret[['id'] + data_set.CLASSES]
    ret.to_csv(dir_model_date + '/submit.csv', index=False)
    print('END'.center(70, '='))
def main():
    """KITTI pose-estimation entry point.

    Builds the network, restores weights according to args.resume and
    args.phase, then either trains (phase 'Train') with the architecture
    selected by args.net_architecture, or runs run_test over sequences
    0-10.
    """
    torch.set_default_tensor_type('torch.FloatTensor')
    model = Net()
    if torch.cuda.is_available():
        model = nn.DataParallel(model.cuda(), device_ids=args.gpu)

    # Set weights
    print('\n========================================')
    print('Phase: {:s}\nNet architecture: {:s}'.format(
        args.phase, args.net_architecture))
    if args.net_architecture == 'cnn-lstm':
        if args.resume == 'cnn':
            # warm-start from a CNN-only checkpoint: keep only the entries
            # whose keys also exist in this model's state dict
            print('Restore from CNN: {:s}'.format(dir_restore))
            pre_trained_dict = torch.load(dir_restore)
            model_dict = model.state_dict()
            pre_trained_dict = {
                k: v
                for k, v in pre_trained_dict.items() if k in model_dict
            }  # drop checkpoint keys the model does not have
            model_dict.update(pre_trained_dict)  # update the dict
            model.load_state_dict(
                model_dict)  # load updated dict into the model
        elif args.resume == 'lstm' or args.phase == 'Test':
            print('Restore from CNN-LSTM: {:s}'.format(dir_restore))
            model.load_state_dict(torch.load(dir_restore))
        else:
            print('Initialize from scratch')
    else:
        if args.resume == 'Yes' or args.phase == 'Test':
            print('Restore from CNN: {:s}'.format(dir_restore))
            model.load_state_dict(torch.load(dir_restore))
        else:
            print('Initialize from scratch')
    print('========================================')

    # Start training
    if args.phase == 'Train':
        dir_model, dir_log = pre_create_file_train(model_dir, log_dir, args)
        writer = SummaryWriter(dir_log)
        loss_func = nn.MSELoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr_base)
        if args.net_architecture == 'cnn-lstm':
            # multi-scale curriculum: three train loaders with different
            # sub-sequence lengths/strides, iterated in lockstep below
            data_set_t_1 = KITTIDataSet(dir_data, dir_label, img_pairs=2,
                                        start_interval=1, phase='Train')
            data_set_t_2 = KITTIDataSet(dir_data, dir_label, img_pairs=4,
                                        start_interval=2, phase='Train')
            data_set_t_3 = KITTIDataSet(dir_data, dir_label, img_pairs=8,
                                        start_interval=4, phase='Train')
            data_set_v = KITTIDataSet(dir_data, dir_label, img_pairs=4,
                                      start_interval=40, phase='Valid')
            loader_t_1 = DataLoader(data_set_t_1, batch_size=16,
                                    shuffle=True, num_workers=args.workers)
            loader_t_2 = DataLoader(data_set_t_2, batch_size=8,
                                    shuffle=True, num_workers=args.workers)
            loader_t_3 = DataLoader(data_set_t_3, batch_size=4,
                                    shuffle=True, num_workers=args.workers)
            loader_v = DataLoader(data_set_v, batch_size=4, shuffle=False,
                                  num_workers=args.workers)
            step_per_epoch = int(
                math.ceil(len(data_set_t_1) / loader_t_1.batch_size))
            step_val = int(
                math.floor(step_per_epoch / 3))  # validate 3 times per epoch
            for epoch in np.arange(args.epoch_max):
                adjust_learning_rate(optimizer, epoch, args.lr_base,
                                     args.lr_decay_rate,
                                     args.epoch_lr_decay)
                # test a complete sequence and plot trajectory
                if epoch != 0 and epoch % args.epoch_test == 0:
                    run_test(model, seq=9, dir_model=dir_model, epoch=epoch)
                    run_test(model, seq=5, dir_model=dir_model, epoch=epoch)
                loss_list = []  # per-epoch loss history
                loss1_list = []
                loss2_list = []
                for step, (sample_t_1, sample_t_2, sample_t_3) in enumerate(
                        zip(loader_t_1, loader_t_2, loader_t_3)):
                    tic = time()
                    step_global = epoch * step_per_epoch + step
                    loss1, loss2, loss = run_batch_2(
                        sample=[sample_t_1, sample_t_2, sample_t_3],
                        model=model, loss_func=loss_func,
                        optimizer=optimizer)
                    # epoch-duration estimate from this single batch
                    hour_per_epoch = step_per_epoch * ((time() - tic) / 3600)
                    loss_list.append(loss)
                    loss1_list.append(loss1)
                    loss2_list.append(loss2)
                    # display and add to tensor board
                    if (step + 1) % 5 == 0:
                        display_loss_tb(hour_per_epoch, epoch, args, step,
                                        step_per_epoch, optimizer, loss,
                                        loss1, loss2, loss_list, loss1_list,
                                        loss2_list, writer, step_global)
                    if (step + 1) % step_val == 0:
                        batch_v = int(
                            math.ceil(len(data_set_v) /
                                      loader_v.batch_size))
                        loss_v, loss1_v, loss2_v = run_val(
                            model, loss_func, loader_v)
                        display_loss_tb_val(batch_v, loss_v, loss1_v,
                                            loss2_v, args, writer,
                                            step_global)
                # save
                if (epoch + 1) % args.epoch_save == 0:
                    print('\nSaving model: {:s}/model-{:d}.pkl'.format(
                        dir_model, epoch + 1))
                    torch.save(
                        model.state_dict(),
                        (dir_model + '/model-{:d}.pkl'.format(epoch + 1)))
        else:
            # plain CNN path: single train loader, same epoch skeleton as
            # the cnn-lstm branch (display every 10 steps instead of 5)
            data_set_t = KITTIDataSet(dir_data=dir_data,
                                      dir_label=dir_label,
                                      samples=args.samples, phase='Train')
            data_set_v = KITTIDataSet(dir_data=dir_data,
                                      dir_label=dir_label, phase='Valid')
            loader_t = DataLoader(data_set_t, batch_size=args.batch_size,
                                  shuffle=True, num_workers=args.workers)
            loader_v = DataLoader(data_set_v, batch_size=args.batch_size,
                                  shuffle=False, num_workers=args.workers)
            step_per_epoch = int(
                math.floor(len(data_set_t) / loader_t.batch_size))
            step_val = int(
                math.floor(step_per_epoch / 3))  # validate 3 times per epoch
            for epoch in np.arange(args.epoch_max):
                adjust_learning_rate(optimizer, epoch, args.lr_base,
                                     args.lr_decay_rate,
                                     args.epoch_lr_decay)
                # test a complete sequence and plot trajectory
                if epoch != 0 and epoch % args.epoch_test == 0:
                    run_test(model, seq=9, dir_model=dir_model, epoch=epoch)
                    run_test(model, seq=5, dir_model=dir_model, epoch=epoch)
                loss_list = []  # per-epoch loss history
                loss1_list = []
                loss2_list = []
                for step, sample_t in enumerate(loader_t):
                    step_global = epoch * step_per_epoch + step
                    tic = time()
                    loss, loss1, loss2, _ = \
                        run_batch(sample=sample_t, model=model,
                                  loss_func=loss_func, optimizer=optimizer,
                                  phase='Train')
                    # epoch-duration estimate from this single batch
                    hour_per_epoch = step_per_epoch * ((time() - tic) / 3600)
                    loss_list.append(loss)
                    loss1_list.append(loss1)
                    loss2_list.append(loss2)
                    # display and add to tensor board
                    if (step + 1) % 10 == 0:
                        display_loss_tb(hour_per_epoch, epoch, args, step,
                                        step_per_epoch, optimizer, loss,
                                        loss1, loss2, loss_list, loss1_list,
                                        loss2_list, writer, step_global)
                    if (step + 1) % step_val == 0:
                        batch_v = int(
                            math.ceil(len(data_set_v) /
                                      loader_v.batch_size))
                        loss_v, loss1_v, loss2_v = run_val(
                            model, loss_func, loader_v)
                        display_loss_tb_val(batch_v, loss_v, loss1_v,
                                            loss2_v, args, writer,
                                            step_global)
                # save
                if (epoch + 1) % args.epoch_save == 0:
                    print('\nSaving model: {:s}/model-{:d}.pkl'.format(
                        dir_model, epoch + 1))
                    torch.save(
                        model.state_dict(),
                        (dir_model + '/model-{:d}.pkl'.format(epoch + 1)))
    else:
        # Test phase: evaluate the restored model on every KITTI sequence
        dir_time = pre_create_file_test(args)
        for seq in range(11):
            run_test(model, seq=seq, dir_time=dir_time)
def train(args):
    """Train the hierarchical encoder until the mean epoch loss stops
    improving for 5 consecutive epochs, checkpointing the best model.
    """
    print('\n')
    print('Create Hierarchical Encoder Model'.center(100, '='))
    torch.set_default_tensor_type('torch.FloatTensor')
    model = HierarchicalEncoder(vocab_size=args.vocab_size,
                                batch_size=args.batch_size,
                                input_size=args.vec_size,
                                hidden_size=args.hidden_size,
                                layer_num=args.layer_num)
    print(model)
    if torch.cuda.is_available():
        model.cuda()
    loss_func = nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr_base)
    print('Load Data'.center(100, '='))
    # TODO: 1. train the generator; 2. generate (query, answer) pairs;
    #       3. rewrite dataset/discdata.py
    # NOTE(review): the *model* class is instantiated here as the data set
    # — this looks like a placeholder/bug; presumably a Dataset class
    # should be constructed instead. Verify before shipping.
    data_set = HierarchicalEncoder()
    dir_model_date, dir_log_date = pre_create_file_train(
        dir_model, dir_log, args)
    writer = SummaryWriter(dir_log_date)
    print('Prepare data loader')
    loader_train = DataLoader(data_set, batch_size=args.batch_size,
                              shuffle=True, num_workers=4)
    step_per_epoch = data_set.__len__() // loader_train.batch_size
    # -1 is a sentinel meaning "no best loss recorded yet"
    loss_best = -1
    epoch_best = 0
    epoch_current = 0
    print('Start Training'.center(100, '='))
    while True:
        adjust_learning_rate(optimizer, epoch_current, args.lr_base,
                             args.lr_decay_rate, args.epoch_lr_decay)
        loss_list = []
        for step, sample_batch in enumerate(loader_train):
            step_global = epoch_current * step_per_epoch + step
            tic = time()
            loss = run_batch(sample=sample_batch, model=model,
                             optimizer=optimizer, loss_func=loss_func,
                             args=args, phase='Train')
            # epoch-duration estimate extrapolated from this single batch
            hour_per_epoch = step_per_epoch * ((time() - tic) / 3600)
            loss_list.append(loss)
            # display result and add to tensor board (on every step)
            if (step + 1) % 1 == 0:
                display_loss(hour_per_epoch, epoch_current, args, step,
                             step_per_epoch, optimizer, loss, loss_list,
                             writer, step_global)
        loss_mean = np.mean(loss_list)
        epoch_current += 1
        if loss_mean < loss_best or loss_best == -1:
            # new best epoch: checkpoint and reset the patience window
            loss_best = loss_mean
            epoch_best = epoch_current
            torch.save(model.state_dict(),
                       dir_model_date + '/model-best.pkl')
            print('>>>save current best model in {:s}\n'.format(
                dir_model_date + '/model-best.pkl'))
        else:
            # early stop: 5 consecutive epochs without improvement
            if epoch_current - epoch_best == 5:
                break
def main():
    """CNN-only KITTI trainer/tester.

    Restores weights when resuming or testing, trains with per-component
    pose losses logged to TensorBoard, or (phase 'Test') runs run_test over
    sequences 0-10.
    """
    torch.set_default_tensor_type('torch.FloatTensor')
    model = CNN()
    if torch.cuda.is_available():
        model = nn.DataParallel(model.cuda(), device_ids=args.gpu)
    # restore when resuming training or always when testing
    if (args.phase == 'Train'
            and args.resume == 'Yes') or args.phase == 'Test':
        dir_restore = 'model/' + args.net_restore + '/' + args.dir_restore \
            + '/' + args.model_restore + '.pkl'
        print('\nRestore from {:s}'.format(dir_restore))
        model.load_state_dict(torch.load(dir_restore))
    if args.phase == 'Train':
        if args.resume == 'No':
            print('\nInitialize from scratch')
        dir_model, dir_log = pre_create_file_train(args)
        data_set_t = KITTIDataSet(dir_data=dir_data, dir_label=dir_label,
                                  phase='Train')
        loader_t = DataLoader(data_set_t, batch_size=args.batch_size,
                              shuffle=True, num_workers=args.workers)
        # data_set_v = KITTIDataSet(dir_data=dir_data, dir_label=dir_label, phase='Val')
        # loader_v = DataLoader(data_set_v, batch_size=args.batch_size, shuffle=False, num_workers=args.workers)
        loss_func = nn.MSELoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr_base)
        step_per_epoch = int(
            math.floor(len(data_set_t) / loader_t.batch_size))
        # step_val = int(math.floor(step_per_epoch / 3))
        writer = SummaryWriter(dir_log)
        for epoch in np.arange(args.epoch_max):
            adjust_learning_rate(optimizer, epoch, args.lr_base,
                                 gamma=args.lr_decay_rate,
                                 epoch_lr_decay=args.epoch_lr_decay)
            # plot trajectory
            # if epoch % args.epoch_test == 0:
            #     run_test(model, loss_func, seq=9, dir_model=dir_model, epoch=epoch, is_testing=False)
            #     run_test(model, loss_func, seq=5, dir_model=dir_model, epoch=epoch, is_testing=False)
            for step, sample_t in enumerate(loader_t):
                step_global = epoch * step_per_epoch + step
                tic = time()
                loss, loss1, loss2, _, loss_x, loss_y, loss_z, loss_tx, loss_ty, loss_tz = \
                    run_batch(sample=sample_t, model=model,
                              loss_func=loss_func, optimizer=optimizer,
                              phase='Train')
                # epoch-duration estimate from this single batch
                hour_per_epoch = step_per_epoch * ((time() - tic) / 3600)
                # display and add to tensor board
                if (step + 1) % 10 == 0:
                    # total = loss1 + beta * loss2, then the six
                    # per-component rotation/translation losses
                    print(
                        '\n{:.3f} [{:03d}/{:03d}] [{:03d}/{:03d}] lr {:.6f} L {:.4f}={:.4f}+{:d}*{:.4f} '
                        '[{:.4f} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f}]'.format(
                            hour_per_epoch, epoch + 1, args.epoch_max,
                            step + 1, step_per_epoch,
                            optimizer.param_groups[0]['lr'], loss, loss1,
                            args.beta, loss2, loss_x, loss_y, loss_z,
                            loss_tx, loss_ty, loss_tz))
                    writer.add_scalars(
                        './train', {
                            'loss_t': loss,
                            'loss1_t': loss1,
                            'loss2_t': loss2,
                            'loss_x_t': loss_x,
                            'loss_y_t': loss_y,
                            'loss_z_t': loss_z,
                            'loss_tx_t': loss_tx,
                            'loss_ty_t': loss_ty,
                            'loss_tz_t': loss_tz
                        }, step_global)
                # if (step+1) % step_val == 0:
                #     batch_v = int(math.ceil(len(data_set_v)/loader_v.batch_size))
                #     loss_v, loss1_v, loss2_v, loss_x_v, loss_y_v, loss_z_v, loss_tx_v, loss_ty_v, loss_tz_v = \
                #         run_val(model, loss_func, loader_v)
                #     print('\n{:d} batches: L {:.4f}={:.4f}+{:d}*{:.4f} [{:.4f} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f}]'.
                #           format(batch_v, loss_v, loss1_v, args.beta, loss2_v, loss_x_v, loss_y_v, loss_z_v,
                #                  loss_tx_v, loss_ty_v, loss_tz_v))
                #     writer.add_scalars('./train-val',
                #                        {'loss_v': loss_v, 'loss1_v': loss1_v, 'loss2_v': loss2_v,
                #                         'loss_x_v': loss_x_v, 'loss_y_v': loss_y_v, 'loss_z_v': loss_z_v,
                #                         'loss_tx_v': loss_tx_v, 'loss_ty_v': loss_ty_v, 'loss_tz_v': loss_tz_v},
                #                        step_global)
            # save
            if (epoch + 1) % args.epoch_save == 0:
                print('\nSaving model: {:s}/model-{:d}.pkl'.format(
                    dir_model, epoch + 1))
                torch.save(model.state_dict(),
                           (dir_model + '/model-{:d}.pkl'.format(epoch + 1)))
    if args.phase == 'Test':
        dir_time = pre_create_file_test(args)
        loss_func = nn.MSELoss()
        for seq in range(11):
            run_test(model, loss_func, seq=seq, dir_time=dir_time,
                     is_testing=True)