def main():
    # Load the train data.
    train_csv = './mnist_data/train.csv'
    train_x, train_y, val_x, val_y = load_train_csv_dataset(
        train_csv, validation_percent=0.1)

    # Create pytorch dataloaders for train and validation sets.
    train_dataset = MnistDataset(train_x, train_y)
    train_dataloader = DataLoader(train_dataset, batch_size=200,
                                  shuffle=True, num_workers=2)
    val_dataset = MnistDataset(val_x, val_y)
    val_dataloader = DataLoader(val_dataset, batch_size=200,
                                shuffle=False, num_workers=2)

    # Define model, optimizer and loss function.
    model = MnistCNN()
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    loss_func = nn.CrossEntropyLoss()

    # Train our model.
    train_model(model, train_dataloader, loss_func, optimizer,
                epochs=NUM_EPOCHS)
    val_accuracy = eval_model(model, val_dataloader)
    print('Validation set accuracy: {}'.format(val_accuracy))

    # Save model weights for inference.
    torch.save(model.state_dict(), 'trained_model.pt')
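# eval_model is called throughout this section but never defined. A minimal
# sketch of the accuracy-style evaluator this first snippet appears to assume
# (hypothetical; the other snippets call eval_model with different signatures):
import torch

@torch.no_grad()
def eval_model(model, dataloader):
    model.eval()
    correct, total = 0, 0
    for images, labels in dataloader:
        preds = model(images).argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.numel()
    model.train()
    return correct / total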
def main(argv=None):
    FLAGS.output_path = os.path.join(FLAGS.root_dir, 'output', FLAGS.version)
    if FLAGS.mode == 'train':
        from train import train_model
        train_model(FLAGS)
    elif FLAGS.mode == 'drive':
        from drive import drive
        drive(FLAGS)
    elif FLAGS.mode == 'eval':
        from evaluate import eval_model
        eval_model(FLAGS)
    elif FLAGS.mode == 'save':
        from save_model import save_model_with_weights
        save_model_with_weights(FLAGS)
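# The dispatcher above assumes absl-style flags. Hypothetical definitions
# consistent with how FLAGS is used (the original flag module is not shown):
from absl import app, flags

FLAGS = flags.FLAGS
flags.DEFINE_string('root_dir', '.', 'Project root directory.')
flags.DEFINE_string('version', 'v1', 'Experiment version tag.')
flags.DEFINE_enum('mode', 'train', ['train', 'drive', 'eval', 'save'],
                  'Which pipeline stage to run.')

if __name__ == '__main__':
    app.run(main)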
def train():
    logger = logging.getLogger()
    is_dist = dist.is_initialized()

    ## dataset
    dl = data_factory[cfg.dataset].get_data_loader(
        cfg.im_root, cfg.train_im_anns, cfg.ims_per_gpu, cfg.scales,
        cfg.cropsize, cfg.max_iter, mode='train', distributed=is_dist)

    ## model
    net, criteria_pre, criteria_aux = set_model()

    ## optimizer
    optim = set_optimizer(net)

    ## mixed precision training
    scaler = amp.GradScaler()

    ## ddp training
    net = set_model_dist(net)

    ## meters
    time_meter, loss_meter, loss_pre_meter, loss_aux_meters = set_meters()

    ## lr scheduler
    lr_schdr = WarmupPolyLrScheduler(optim, power=0.9,
                                     max_iter=cfg.max_iter,
                                     warmup_iter=cfg.warmup_iters,
                                     warmup_ratio=0.1, warmup='exp',
                                     last_epoch=-1)

    ## train loop
    for it, (im, lb) in enumerate(dl):
        im = im.cuda()
        lb = lb.cuda()
        lb = torch.squeeze(lb, 1)

        optim.zero_grad()
        with amp.autocast(enabled=cfg.use_fp16):
            logits, *logits_aux = net(im)
            loss_pre = criteria_pre(logits, lb)
            loss_aux = [crit(lgt, lb)
                        for crit, lgt in zip(criteria_aux, logits_aux)]
            loss = loss_pre + sum(loss_aux)
        scaler.scale(loss).backward()
        scaler.step(optim)
        scaler.update()
        torch.cuda.synchronize()

        time_meter.update()
        loss_meter.update(loss.item())
        loss_pre_meter.update(loss_pre.item())
        _ = [mter.update(lss.item())
             for mter, lss in zip(loss_aux_meters, loss_aux)]

        ## print training log message
        if (it + 1) % 100 == 0:
            lr = lr_schdr.get_lr()
            lr = sum(lr) / len(lr)
            print_log_msg(it, cfg.max_iter, lr, time_meter, loss_meter,
                          loss_pre_meter, loss_aux_meters)
        lr_schdr.step()

    ## dump the final model and evaluate the result
    save_pth = osp.join(cfg.respth, 'model_final.pth')
    logger.info('\nsave models to {}'.format(save_pth))
    state = net.module.state_dict()
    if dist.get_rank() == 0:
        torch.save(state, save_pth)

    logger.info('\nevaluating the final model')
    torch.cuda.empty_cache()
    heads, mious = eval_model(cfg, net, 2, cfg.im_root, cfg.val_im_anns)
    logger.info(tabulate([mious, ], headers=heads, tablefmt='orgtbl'))
    return
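## set_model_dist is not defined in this snippet. A plausible sketch, assuming
## the script is launched with torchrun so LOCAL_RANK is set in the environment
## (hypothetical; not the original repo's code):
import os
import torch.nn as nn

def set_model_dist(net):
    local_rank = int(os.environ.get('LOCAL_RANK', 0))
    # Wrap for distributed data-parallel training on this process's GPU.
    return nn.parallel.DistributedDataParallel(
        net, device_ids=[local_rank], output_device=local_rank)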
def train(args):
    ## setup cfg and logger
    spec = importlib.util.spec_from_file_location('mod_cfg', args.cfg)
    mod_cfg = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod_cfg)
    cfg = mod_cfg.cfg
    cfg_str = json.dumps(cfg, ensure_ascii=False, indent=2)
    if not osp.exists(cfg.res_pth):
        os.makedirs(cfg.res_pth)
    setup_logger(cfg.res_pth)
    logger = logging.getLogger(__name__)
    logger.info(cfg_str)

    ## modules and losses
    logger.info('creating model and loss module')
    net = DeepLabLargeFOV(3, cfg.n_classes)
    net.train()
    net.cuda()
    if torch.cuda.device_count() > 0:
        net = nn.DataParallel(net)
    n_min = (cfg.crop_size ** 2) * cfg.batchsize // 16
    criteria = OhemCELoss(0.7, n_min)
    criteria.cuda()

    ## dataset
    logger.info('creating dataset and dataloader')
    ds = eval(cfg.dataset)(cfg, mode='train')
    dl = DataLoader(ds,
                    batch_size=cfg.batchsize,
                    shuffle=True,
                    num_workers=cfg.n_workers,
                    drop_last=True)

    ## optimizer
    logger.info('creating optimizer')
    optimizer = Optimizer(params=net.parameters(),
                          warmup_start_lr=cfg.warmup_start_lr,
                          warmup_steps=cfg.warmup_iter,
                          lr0=cfg.start_lr,
                          max_iter=cfg.iter_num,
                          momentum=cfg.momentum,
                          wd=cfg.weight_decay,
                          power=cfg.power)

    ## train loop
    loss_avg = []
    st = time.time()
    diter = iter(dl)
    logger.info('start training')
    for it in range(cfg.iter_num):
        try:
            im, lb = next(diter)
            if not im.size()[0] == cfg.batchsize:
                continue
        except StopIteration:
            diter = iter(dl)
            im, lb = next(diter)
        im = im.cuda()
        lb = lb.cuda()

        # if use_mixup:
        #     lam = np.random.beta(alpha, alpha)
        #     idx = torch.randperm(batchsize)
        #     mix_im = im * lam + (1. - lam) * im[idx, :]
        #     mix_lb = lb[idx, :]
        #     optimizer.zero_grad()
        #     out = net(mix_im)
        #     out = F.interpolate(out, lb.size()[2:], mode='bilinear')  # upsample to original size
        #     lb = torch.squeeze(lb)
        #     mix_lb = torch.squeeze(mix_lb)
        #     loss = lam * Loss(out, lb) + (1. - lam) * Loss(out, mix_lb)
        #     loss.backward()
        #     optimizer.step()
        # else:
        #     optimizer.zero_grad()
        #     out = net(im)
        #     out = F.interpolate(out, lb.size()[2:], mode='bilinear')  # upsample to original size
        #     lb = torch.squeeze(lb)
        #     loss = Loss(out, lb)
        #     loss.backward()
        #     optimizer.step()
        optimizer.zero_grad()
        out = net(im)
        lb = torch.squeeze(lb)
        loss = criteria(out, lb)
        loss.backward()
        optimizer.step()

        loss = loss.detach().cpu().numpy()
        loss_avg.append(loss)

        ## log message
        if it % cfg.log_iter == 0 and it != 0:
            loss_avg = sum(loss_avg) / len(loss_avg)
            ed = time.time()
            t_int = ed - st
            lr = optimizer.get_lr()
            msg = 'iter: {}/{}, loss: {:.4f}'.format(it, cfg.iter_num, loss_avg)
            msg = '{}, lr: {:4f}, time: {:.4f}'.format(msg, lr, t_int)
            logger.info(msg)
            st = ed
            loss_avg = []

    ## dump model
    model_pth = osp.join(cfg.res_pth, 'model_final.pkl')
    net.cpu()
    state_dict = net.module.state_dict() if hasattr(net, 'module') else net.state_dict()
    torch.save(state_dict, model_pth)
    logger.info('training done, model saved to: {}'.format(model_pth))

    ## test after train
    if cfg.test_after_train:
        net.cuda()
        mIOU = eval_model(net, cfg)
        logger.info('iou in whole is: {}'.format(mIOU))
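# OhemCELoss is used above but not defined in this section. A sketch of the
# standard online-hard-example-mining cross-entropy, assuming the positional
# args are (probability threshold, minimum kept pixels); the original class
# may differ in details such as the ignore label.
import math
import torch
import torch.nn as nn

class OhemCELoss(nn.Module):
    def __init__(self, thresh, n_min, ignore_lb=255):
        super().__init__()
        self.thresh = -math.log(thresh)  # loss value corresponding to p = thresh
        self.n_min = n_min
        self.criteria = nn.CrossEntropyLoss(ignore_index=ignore_lb,
                                            reduction='none')

    def forward(self, logits, labels):
        loss = self.criteria(logits, labels).view(-1)
        loss, _ = torch.sort(loss, descending=True)
        # Keep every pixel harder than the threshold, but at least n_min of them.
        if loss[self.n_min] > self.thresh:
            loss = loss[loss > self.thresh]
        else:
            loss = loss[:self.n_min]
        return torch.mean(loss)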
# data[2] = sp_1[2]
# data[3] = sp_1[3]
# print(len(data[0]), len(data[2]), len(data[1]), len(data[3]))
# data = dataset.cv_split(index=1)
# print(len(data[0]), len(data[2]), len(data[1]), len(data[3]))
# data = dataset.cv_split(index=3)
# print(len(data[0]), len(data[2]), len(data[1]), len(data[3]))

# init and run tf graph
g = tf.Graph()
with g.as_default():
    sess = tf.Session()
    with sess.as_default():
        if config['eval']:
            evaluate.eval_model(sess, g, config["load_last_checkpoint"],
                                data, config)
        else:
            train.set_train(sess, config, data,
                            pretrained_embeddings=pretrained_vectors)
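# Note (not from the original script): tf.Graph/tf.Session above is
# TensorFlow 1.x API. Under TensorFlow 2.x this fragment only runs through
# the compatibility layer, e.g.:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()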
def train(args):
    ## setup cfg and logger
    spec = importlib.util.spec_from_file_location('mod_cfg', args.cfg)
    mod_cfg = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod_cfg)
    cfg = mod_cfg.cfg
    cfg_str = json.dumps(cfg, ensure_ascii=False, indent=2)
    if not osp.exists(cfg.res_pth):
        os.makedirs(cfg.res_pth)
    setup_logger(cfg.res_pth)
    logger = logging.getLogger(__name__)
    logger.info(cfg_str)
    device = torch.device('cuda:0')

    ## modules and losses
    logger.info('creating model and loss module')
    net = DeepLabLargeFOV(3, cfg.n_classes)
    net.train()
    net.cuda()
    if torch.cuda.device_count() > 0:
        net = nn.DataParallel(net)
    n_min = (cfg.crop_size ** 2) * cfg.batchsize // 8
    criteria = OhemCELoss(0.7, n_min)
    globloss = GlobLoss(0.7, n_min)
    criteria.cuda()

    # Hook the feature extractor: each MDC_DC branch gets its own storage
    # list. (The original registered a single hook that appended the same
    # output to all four lists, which mixed the branches up; a per-list hook
    # factory avoids that.)
    features_blobs_1, features_blobs_2 = [], []
    features_blobs_3, features_blobs_4 = [], []

    def make_hook(storage):
        def hook_feature(module, input, output):
            storage.append(output.data.cpu().numpy())
        return hook_feature

    net.module.MDC_DC_1[1].register_forward_hook(make_hook(features_blobs_1))
    net.module.MDC_DC_2[1].register_forward_hook(make_hook(features_blobs_2))
    net.module.MDC_DC_3[1].register_forward_hook(make_hook(features_blobs_3))
    net.module.MDC_DC_4[1].register_forward_hook(make_hook(features_blobs_4))

    params = list(net.parameters())
    # classifier weights used to project conv features into class maps
    weight_softmax = np.squeeze(params[-3].data.cpu().numpy())

    ## dataset
    logger.info('creating dataset and dataloader')
    ds = eval(cfg.dataset)(cfg, mode='train')
    dl = DataLoader(ds,
                    batch_size=cfg.batchsize,
                    shuffle=True,
                    num_workers=cfg.n_workers,
                    drop_last=True)

    ## optimizer
    logger.info('creating optimizer')
    optimizer = Optimizer(params=net.parameters(),
                          warmup_start_lr=cfg.warmup_start_lr,
                          warmup_steps=cfg.warmup_iter,
                          lr0=cfg.start_lr,
                          max_iter=cfg.iter_num,
                          momentum=cfg.momentum,
                          wd=cfg.weight_decay,
                          power=cfg.power)

    ## train loop
    loss_avg = []
    st = time.time()
    diter = iter(dl)
    logger.info('start training')
    for it in range(cfg.iter_num):
        if it % 20 == 0:  # was `it / 20 == 0`, which only fires at it == 0
            print('training {}/{}'.format(it, cfg.iter_num))
        try:
            im, lb, clb = next(diter)
            if not im.size()[0] == cfg.batchsize:
                continue
        except StopIteration:
            diter = iter(dl)
            im, lb, clb = next(diter)  # was `im, lb = ...`, dropping clb
        im = im.cuda()
        lb = lb.cuda()  # (16, 1, 321, 321)

        # Clear the hook buffers so index [0] holds this iteration's forward
        # pass (the original always read the first iteration's features).
        for blobs in (features_blobs_1, features_blobs_2,
                      features_blobs_3, features_blobs_4):
            blobs.clear()

        optimizer.zero_grad()
        out, pred_c1, pred_c2, pred_c3, pred_c4 = net(im)  # out: (16, 21, 321, 321), pred: (16, 21)
        lb = torch.squeeze(lb)
        probs, idx = pred_c1.sort(1, True)  # (unused below)
        CAMs_1 = getCams(pred_c1, features_blobs_1[0], weight_softmax)
        CAMs_2 = getCams(pred_c2, features_blobs_2[0], weight_softmax)
        CAMs_3 = getCams(pred_c3, features_blobs_3[0], weight_softmax)
        CAMs_4 = getCams(pred_c4, features_blobs_4[0], weight_softmax)
        # fuse the four CAM branches and take the per-pixel argmax class
        location_map = np.argmax(CAMs_1 + (CAMs_2 + CAMs_3 + CAMs_4) / 3,
                                 axis=1)  # from (16, 21, 321, 321)
        pred_mask = torch.argmax(out, dim=1)
        loss = globloss(out, location_map, pred_mask, clb,
                        pred_c1, pred_c2, pred_c3, pred_c4)
        loss.backward()
        optimizer.step()

        loss = loss.detach().cpu().numpy()
        loss_avg.append(loss)

        ## log message
        if it % cfg.log_iter == 0 and it != 0:
            loss_avg = sum(loss_avg) / len(loss_avg)
            ed = time.time()
            t_int = ed - st
            lr = optimizer.get_lr()
            msg = 'iter: {}/{}, loss: {:.4f}'.format(it, cfg.iter_num, loss_avg)
            msg = '{}, lr: {:4f}, time: {:.4f}'.format(msg, lr, t_int)
            logger.info(msg)
            st = ed
            loss_avg = []

    ## dump model
    model_pth = osp.join(cfg.res_pth, 'model_final.pkl')
    net.cpu()
    state_dict = net.module.state_dict() if hasattr(net, 'module') else net.state_dict()
    torch.save(state_dict, model_pth)
    logger.info('training done, model saved to: {}'.format(model_pth))

    ## test after train
    if cfg.test_after_train:
        net.cuda()
        mIOU = eval_model(net, cfg)
        logger.info('iou in whole is: {}'.format(mIOU))
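# getCams is not defined in this section. A sketch consistent with how it is
# called above, following the classic class-activation-map recipe (per-class
# maps as classifier-weighted sums of conv features); the original may also
# upsample the maps to the 321x321 input resolution.
import numpy as np

def getCams(preds, feature_conv, weight_softmax):
    # feature_conv: (N, C, H, W) conv features; weight_softmax: (K, C)
    cams = np.einsum('kc,nchw->nkhw', weight_softmax, feature_conv)
    # Min-max normalise each map to [0, 1].
    lo = cams.min(axis=(2, 3), keepdims=True)
    hi = cams.max(axis=(2, 3), keepdims=True)
    return (cams - lo) / (hi - lo + 1e-8)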
def train_model(args):
    # Hyper Parameters
    sequence_length = args.seq_len
    input_size = args.input_size
    hidden_size = args.hidden_size
    num_layers = args.num_layers
    num_classes = args.num_classes
    batch_size = args.batch_size
    num_epochs = args.num_epochs
    learning_rate = args.learning_rate
    dropout = args.dropout

    # Create the dataset
    train_dataset = create_dataset('data/train/', timesteps=sequence_length)
    train_loader = dataloader(train_dataset, batch_size=batch_size)
    test_dataset = create_dataset('data/test/', timesteps=sequence_length)
    test_loader = dataloader(test_dataset, batch_size=batch_size)

    # Define model and loss
    rnn = RNN('LSTM', input_size, hidden_size, num_layers, num_classes, dropout)
    criterion = nn.CrossEntropyLoss()
    if args.cuda:  # switch to cuda
        rnn, criterion = rnn.cuda(), criterion.cuda()

    # Adam Optimizer
    optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)

    # Train the Model
    i = 0  # updates
    best_test_acc = 0.0
    for epoch in range(num_epochs):
        # Generate random batches every epoch
        train_loader = dataloader(train_dataset, batch_size)
        for batch_X, batch_y in train_loader:
            # points = pack_padded_sequence(torch.from_numpy(batch_X), batch_seq_lens)
            points = torch.from_numpy(batch_X)
            labels = torch.from_numpy(batch_y)
            if args.cuda:
                points, labels = points.cuda(), labels.cuda()

            # Forward + Backward + Optimize
            optimizer.zero_grad()
            outputs = rnn(points)  # final hidden state
            # outputs = pad_packed_sequence(outputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            print('Epoch [%d/%d], Loss: %.4f'
                  % (epoch + 1, num_epochs, loss.item()))

            if i % 100 == 0:  # every 100 updates, evaluate on test set
                # print("training accuracy = %.4f" % eval_model(rnn, train_loader))
                test_acc = eval_model(rnn, test_loader)
                print("test accuracy = %.4f" % test_acc)
                if test_acc > best_test_acc:
                    print("best test accuracy found")
                    best_test_acc = test_acc
                    torch.save(rnn.state_dict(), 'rnn_best.pkl')
            i += 1
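# Hypothetical sketch of the `dataloader` helper the snippet above assumes:
# it shuffles the dataset and yields (batch_X, batch_y) numpy arrays, which is
# why the training loop calls torch.from_numpy on each batch. It assumes the
# dataset is a (features, labels) pair of arrays; the original may differ.
import numpy as np

def dataloader(dataset, batch_size):
    X, y = dataset
    idx = np.random.permutation(len(y))
    for start in range(0, len(y), batch_size):
        sel = idx[start:start + batch_size]
        yield X[sel].astype(np.float32), y[sel].astype(np.int64)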
# eval_model(model)

if __name__ == "__main__":
    # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # model = model_all_2048_512_5.build_whole_model()
    # PATH = os.path.abspath(os.path.dirname(__file__))
    # path_to_model = os.path.join(PATH, 'pretrained_model')
    # model.load_state_dict(torch.load(os.path.join(path_to_model, '%s.pth' % (model.name))))
    # model.to(device)
    # eval_model(model)

    # ... after training, save your model
    # print([name for finder, name, _ in pkgutil.iter_modules(['model'])])
    # for finder, name, _ in pkgutil.iter_modules(['model']):
    #     print(name)
    #     mod = finder.find_module(name).load_module(name)
    #     model = mod.build_whole_model()
    #     model = start_train(model)
    #     eval_model(model)
    for finder, name, _ in pkgutil.iter_modules(['model']):
        if name == 'model_128_7_35_32_7':
            mod = finder.find_module(name).load_module(name)
            model = mod.build_whole_model()
            model = start_train(model)
            eval_model(model)

########
def train(self, train_exs, test_exs, num_epochs, examples_by_epoch,
          data_dir, save_name, eval_during_training=False):
    trainN, testN = len(train_exs), len(test_exs)
    print('Training on {} examples, testing on {} examples.'.format(
        trainN, testN))
    print("Starting training...")
    ex_count = 0
    train_losses = []
    rps = []
    for epoch in range(1, num_epochs + 1):
        if epoch < 3:
            train_losses = []
        start_time = time.time()
        print('Epoch {}/{} running...'.format(epoch, num_epochs), end='')
        train_loss = 0
        invalid_example = 0
        tot_t = 0
        for z in range(examples_by_epoch):
            db, i, j = train_exs[ex_count % trainN]
            XY = load_steps(db, i, self.params)[j]
            x = numpy.reshape(XY[0], (1, 1, 5000)).astype('float16')
            y = numpy.reshape(XY[1], (1, 5000)).astype('float16')
            ex_count += 1
            if numpy.sum(y) == 0.:
                invalid_example += 1
                continue
            t1 = time.time()
            tmp_loss = self.train_fn(x, y)
            tot_t += time.time() - t1
            train_loss += tmp_loss
            train_losses.append(train_loss / (z + 1 - invalid_example))
        nnn = examples_by_epoch - invalid_example
        print('Done in {:.3f}s! ({:.3f}ms/example)'.format(
            time.time() - start_time, tot_t / nnn * 1000))
        print(" - training loss:\t\t{:.6f}".format(train_loss / nnn))

        if eval_during_training:
            # Eval on examples:
            test_loss = 0
            test_reg_loss = 0
            test_acc = 0
            invalid_example = 0
            for k in range(testN):
                db, i, j = test_exs[k]  # was train_exs[k], which evaluated on training data
                XY = load_steps(db, i, self.params)[j]
                x = numpy.reshape(XY[0], (1, 1, 5000)).astype('float16')
                y = numpy.reshape(XY[1], (1, 5000)).astype('float16')
                if numpy.sum(y) == 0.:
                    invalid_example += 1
                    continue
                tmp_loss, tmp_reg_loss, tmp_acc = self.eval_fn(x, y)
                test_loss += tmp_loss
                test_reg_loss += (tmp_reg_loss - tmp_loss)
                test_acc += tmp_acc
            N = testN - invalid_example
            print(" - test loss:\t\t\t{:.6f} | {:.6f} | {:.6f}".format(
                test_loss / N, test_reg_loss / N,
                (test_loss + test_reg_loss) / N))
            acc = test_acc / N * 100
            print(" - test accuracy:\t\t{:.4f} %".format(acc))
            plt.plot(train_losses, color='r')
            plt.plot(rps, color='g')
            plt.show()
            eval_model(test_exs, self.evaluate, self.params,
                       plot_examples=True, nb=3, nearest_fpr=0.01,
                       threshold=0.98, eval_margin=10)
        save_lasagne_nn_epoch(save_name, self, epoch,
                              train_loss / examples_by_epoch)
    self.trained = True
def __init__(self, conf={}, log_handler=None, model_dir=None):
    """Train a model, and evaluate it."""
    # Ensure the workspace, to save model parameters there
    if not model_dir:
        model_dir = abs_file_path('models/%s' % nows())
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    # Get configurations
    __conf = {}
    __conf.update(_conf())
    __conf.update(conf)
    conf = __conf
    batch_size = conf.get('batch_size')
    num_epochs = conf.get('num_epochs')
    # NOTE: the original snippet used these three values without defining
    # them; they are assumed here to come from the same configuration dict.
    eval_period = conf.get('eval_period')
    eval_period_interval = conf.get('eval_period_interval', 100)
    eval_train_interval = conf.get('eval_train_interval', 1)

    # Get the corpus data
    _corpus = Corpus(conf)
    _embedding = Embedding({
        'pretrained_file': conf.get('pretrained_file'),
        'unknown_token': conf.get('unknown_token')
    })
    _corpus.embed_features(_embedding.vocab,
                           unknown_token=conf.get('unknown_token'),
                           seq_len=conf.get('seq_len'))
    ctx = _corpus.ctx
    train_labels = _corpus.train_labels
    test_labels = _corpus.test_labels
    train_features = _corpus.train_features
    test_features = _corpus.test_features
    train_size = train_features.shape[0]
    print('train_features.shape: %s, %s' % train_features.shape)

    # Get the checkpoint
    model_type = conf.get('model_type', 'cnn')
    checkpoint = conf.get('checkpoint')
    if checkpoint is not None:
        if not isinstance(checkpoint, int):
            checkpoint = int(checkpoint)
        checkpoint = os.path.join(
            model_dir, '%s-%04d.params' % (model_type, checkpoint))
        if not os.path.exists(checkpoint):
            logging.error('Invalid checkpoint: %s' % checkpoint)
            return

    # Initialize the net, and create a trainer
    net = Model(conf,
                (_embedding.embed, _embedding.embed_size,
                 _embedding.vocab_size),
                ctx, params_file=checkpoint)
    trainer = Trainer(net, conf)
    # NOTE: no loss function is defined in the original snippet; softmax
    # cross-entropy is assumed here.
    loss = gluon.loss.SoftmaxCrossEntropyLoss()
    results = []
    batch_count = train_size // batch_size
    print('batch_count: %d' % batch_count)

    # Iterate
    for epoch in range(1, num_epochs + 1):
        result = {'epoch': epoch}
        if eval_period:
            l_sum = 0
            l_n = 0
            accuracy = metric.Accuracy()
        time0 = int(time.time())

        # Train batch by batch
        for i in range(batch_count):
            X = train_features[i * batch_size:(i + 1) * batch_size].as_in_context(ctx).T
            y = train_labels[i * batch_size:(i + 1) * batch_size].as_in_context(ctx).T
            with autograd.record():
                output = net(X)
                l = loss(output, y)
            l.backward()
            trainer.step(batch_size)
            if eval_period:
                l_sum += l.sum().asscalar()
                l_n += l.size
                accuracy.update(preds=nd.argmax(output, axis=1), labels=y)
                if i % eval_period_interval == 0 and i > 0:
                    print('epoch %d, batch %d; train loss %.6f, acc %.4f'
                          % (epoch, i, l_sum / l_n, accuracy.get()[1]))
                    l_sum = 0
                    l_n = 0
                    accuracy = metric.Accuracy()

        # Calculate the training time
        time1 = int(time.time())
        print('epoch %d, time %ds:' % (epoch, time1 - time0))
        result['time'] = time1 - time0

        # Evaluate the model upon the testing dataset
        time0 = int(time.time())
        test_loss, test_acc, test_prf = eval_model(test_features, test_labels,
                                                   net, batch_size)
        time1 = int(time.time())
        print(' [test] loss %.6f, acc %.4f, time %ds'
              % (test_loss, test_acc, time1 - time0))
        result['test'] = {'loss': test_loss, 'acc': test_acc,
                          'time': time1 - time0, 'prf': test_prf}

        if epoch % eval_train_interval == 0:
            # Evaluate the model upon the training dataset
            time0 = int(time.time())
            train_loss, train_acc, train_prf = eval_model(
                train_features, train_labels, net, batch_size)
            time1 = int(time.time())
            print(' [train] loss %.6f, acc %.4f, time %ds'
                  % (train_loss, train_acc, time1 - time0))
            result['train'] = {'loss': train_loss, 'acc': train_acc,
                               'time': time1 - time0, 'prf': train_prf}

        if self.save_params:
            net.save_params(os.path.join(
                model_dir, '%s-%04d.params' % (net.model_type, epoch)))
        results.append(result)
        if log_handler:
            log_handler(result)

    self.conf = conf
    self.corpus = _corpus
    self.embedding = _embedding
    self.ctx = ctx
    self.net = net
    self.model_dir = model_dir
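# Sketch of the batched Gluon evaluator the snippet above calls (hypothetical;
# the original eval_model is not shown). It mirrors the training loop's
# slicing and returns (loss, accuracy, prf); the precision/recall/F1 element
# is left as None here since its computation is not shown.
from mxnet import nd, metric, gluon

def eval_model(features, labels, net, batch_size):
    loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
    acc = metric.Accuracy()
    l_sum, l_n = 0.0, 0
    for i in range(features.shape[0] // batch_size):
        X = features[i * batch_size:(i + 1) * batch_size].T
        y = labels[i * batch_size:(i + 1) * batch_size].T
        output = net(X)
        l = loss_fn(output, y)
        l_sum += l.sum().asscalar()
        l_n += l.size
        acc.update(preds=nd.argmax(output, axis=1), labels=y)
    return l_sum / l_n, acc.get()[1], None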