class LogROCCallback(object):
    """Periodically export per-class ROC curve images to TensorBoard.

    The ROC PNGs must already exist on disk (they are produced by
    'eval_metric.py'); this callback only uploads them as images.

    Parameters
    ----------
    logging_dir : str
        Directory in which the TensorBoard event file is created.
    prefix : str
        Tag prefix used when writing each image.
    roc_path : str
        Directory that holds the 'roc_<class>.png' files.
    class_names : list[str]
        Names of the classes whose ROC images are logged.
    """

    def __init__(self, logging_dir=None, prefix='val', roc_path=None,
                 class_names=None):
        self.prefix = prefix
        self.roc_path = roc_path
        self.class_names = class_names
        try:
            # Lazy import keeps tensorboard an optional dependency.
            from tensorboard import SummaryWriter
            self.summary_writer = SummaryWriter(logging_dir)
        except ImportError:
            logging.error(
                'You can install tensorboard via `pip install tensorboard`.')

    def __call__(self, param):
        """Callback to log ROC graph as an image in TensorBoard."""
        for class_name in self.class_names:
            image_file = os.path.join(self.roc_path,
                                      'roc_' + class_name + '.png')
            # Silently skip classes whose curve was not rendered yet.
            if not os.path.exists(image_file):
                continue
            image_arr = scipy.misc.imread(image_file)
            self.summary_writer.add_image(self.prefix + '_' + class_name,
                                          image_arr)
class LogDistributionsCallback(object):
    """Log weight distributions of selected layers to TensorBoard.

    This function has been deprecated because it consumes too much time.
    The faster way is to use "ParseLogCallback" with a 'iter_monitor' flag.

    Parameters
    ----------
    logging_dir : str
        Directory where the TensorBoard event file is created.
    prefix : str, optional
        Prefix prepended to every histogram tag.
    layers_list : list[str], optional
        List of layers to be tracked; when None nothing is logged.
    """

    def __init__(self, logging_dir, prefix=None, layers_list=None):
        self.prefix = prefix
        self.layers_list = layers_list
        try:
            from tensorboard import SummaryWriter
            self.summary_writer = SummaryWriter(logging_dir)
        except ImportError:
            logging.error(
                'You can install tensorboard via `pip install tensorboard`.')

    def __call__(self, param):
        """Callback to log layers' distributions in TensorBoard."""
        if param.locals is None:
            return
        # Fix: .items() instead of the Python-2-only .iteritems(), so the
        # callback runs on both Python 2 and Python 3.
        for name, value in param.locals['arg_params'].items():
            # TODO - implement layer to choose from..
            if self.layers_list is None:
                continue
            if self.prefix is not None:
                name = '%s-%s' % (self.prefix, name)
            self.summary_writer.add_histogram(name,
                                              value.asnumpy().flatten())
class LogROCCallback(object):
    """Push per-class ROC curve PNGs into a TensorBoard event file.

    Must run after 'eval_metric.py', which is responsible for rendering
    the ROC graphs this callback reads from disk.

    Parameters
    ----------
    logging_dir : str
        Where the TensorBoard event file will be created.
    prefix : str
        Prefix for every image tag.
    roc_path : str
        Directory containing the future ROC images.
    class_names : list[str]
        Class names; one ROC image is expected per class.
    """

    def __init__(self, logging_dir=None, prefix='val', roc_path=None,
                 class_names=None):
        self.prefix = prefix
        self.roc_path = roc_path
        self.class_names = class_names
        try:
            from tensorboard import SummaryWriter
            self.summary_writer = SummaryWriter(logging_dir)
        except ImportError:
            logging.error('You can install tensorboard via `pip install tensorboard`.')

    def __call__(self, param):
        """Upload every per-class ROC PNG that exists on disk."""
        for cls in self.class_names:
            png_file = os.path.join(self.roc_path, 'roc_' + cls + '.png')
            if os.path.exists(png_file):
                graph = scipy.misc.imread(png_file)
                self.summary_writer.add_image(self.prefix + '_' + cls, graph)
class LogDistributionsCallback(object):
    """Log layer weight histograms to TensorBoard.

    This function has been deprecated because it consumes too much time.
    The faster way is to use "ParseLogCallback" with a 'iter_monitor' flag.

    Parameters
    ----------
    logging_dir : str
        Where the TensorBoard event file will be created.
    prefix : str, optional
        Prefix added to each histogram tag.
    layers_list : list[str], optional
        Layers to be tracked; histograms are skipped while this is None.
    """

    def __init__(self, logging_dir, prefix=None, layers_list=None):
        self.prefix = prefix
        self.layers_list = layers_list
        try:
            from tensorboard import SummaryWriter
            self.summary_writer = SummaryWriter(logging_dir)
        except ImportError:
            logging.error('You can install tensorboard via `pip install tensorboard`.')

    def __call__(self, param):
        """Callback to log layers' distributions in TensorBoard."""
        if param.locals is None:
            return
        # Fix: dict.iteritems() only exists on Python 2; .items() is
        # equivalent here and works on Python 3 as well.
        for name, value in param.locals['arg_params'].items():
            # TODO - implement layer to choose from..
            if self.layers_list is None:
                continue
            if self.prefix is not None:
                name = '%s-%s' % (self.prefix, name)
            self.summary_writer.add_histogram(name,
                                              value.asnumpy().flatten())
def __init__(self, logging_dir, prefix=None):
    """Create the callback and open a TensorBoard event writer.

    Parameters
    ----------
    logging_dir : str
        Directory in which the TensorBoard event file is created.
    prefix : str, optional
        Prefix prepended to metric tags (presumably; the enclosing class
        is not visible in this chunk — confirm against its __call__).
    """
    self.prefix = prefix
    try:
        # Lazy import: tensorboard is an optional dependency.
        from tensorboard import SummaryWriter
        self.summary_writer = SummaryWriter(logging_dir)
    except ImportError:
        # NOTE: on ImportError self.summary_writer is never set; later
        # use of it will raise AttributeError.
        logging.error('You can install tensorboard via `pip install tensorboard`.')
class ParseLogCallback(object):
    """Parse a training log file and push values to TensorBoard.

    1. Logs monitored layers' std to TensorBoard (as distributions).
       This makes use of mxnet's "monitor" module: monitored layers are
       printed to a log file, and their values are computed
       **asynchronously**.
    2. Logs training loss to TensorBoard (as a scalar).
    Currently - does not support resume training..

    Parameters
    ----------
    dist_logging_dir : str
        Event-file directory for the histogram writer.
    scalar_logging_dir : str
        Event-file directory for the scalar writer.
    logfile_path : str
        Path of the monitor output log to parse.
    batch_size : int
        Training batch size (stored for reference; not used here).
    iter_monitor : int
        Monitor interval; 0 disables histogram parsing.
    frequent : int
        Interval (in batches) between scalar logging.
    prefix : str
        Prefix prepended to scalar tags.
    """

    def __init__(self, dist_logging_dir=None, scalar_logging_dir=None,
                 logfile_path=None, batch_size=None, iter_monitor=0,
                 frequent=None, prefix='ssd'):
        self.scalar_logging_dir = scalar_logging_dir
        self.dist_logging_dir = dist_logging_dir
        self.logfile_path = logfile_path
        self.batch_size = batch_size
        self.iter_monitor = iter_monitor
        self.frequent = frequent
        self.prefix = prefix
        self.batch = 0      # batches seen so far
        self.line_idx = 0   # log lines already consumed on earlier calls
        try:
            from tensorboard import SummaryWriter
            self.dist_summary_writer = SummaryWriter(dist_logging_dir)
            self.scalar_summary_writer = SummaryWriter(scalar_logging_dir)
        except ImportError:
            logging.error('You can install tensorboard via `pip install tensorboard`.')

    def __call__(self, param):
        """Callback to parse a log file and and add params to TensorBoard."""
        # save distributions from the monitor output log
        if not self.iter_monitor == 0 and self.batch % self.iter_monitor == 0:
            with open(self.logfile_path) as fp:
                # Skip lines parsed on previous invocations.
                for i in range(self.line_idx):
                    # Fix: builtin next(fp) instead of the Python-2-only
                    # fp.next() file method.
                    next(fp)
                for line in fp:
                    if line.startswith('Batch'):
                        line = line.split(' ')
                        line = [x for x in line if x]
                        layer_name = line[2]
                        layer_value = np.array(
                            float(line[3].split('\t')[0])).flatten()
                        if np.isfinite(layer_value):
                            self.dist_summary_writer.add_histogram(
                                layer_name, layer_value)
                    self.line_idx += 1
        # save training loss
        # Fix: guard against frequent=None, which previously crashed the
        # modulo with a TypeError.
        if self.frequent and self.batch % self.frequent == 0:
            if param.eval_metric is None:
                return
            name_value = param.eval_metric.get_name_value()
            for name, value in name_value:
                if self.prefix is not None:
                    name = '%s-%s' % (self.prefix, name)
                self.scalar_summary_writer.add_scalar(
                    name, value, global_step=self.batch)
        self.batch += 1
def __init__(self, logging_dir, score_store=False, prefix=None):
    """Create the callback, its step counter, and a TensorBoard writer.

    Parameters
    ----------
    logging_dir : str
        Directory in which the TensorBoard event file is created.
    score_store : bool, optional
        Flag stored for the enclosing class (its semantics are defined
        outside this chunk — confirm against the class body).
    prefix : str, optional
        Prefix prepended to tags.
    """
    self.prefix = prefix
    self.step = 0
    self.score_store = score_store
    try:
        # Fix: the import was missing here (unlike the sibling callbacks),
        # so a missing tensorboard package raised NameError instead of the
        # ImportError this handler expects.
        from tensorboard import SummaryWriter
        self.summary_writer = SummaryWriter(logging_dir)
    except ImportError:
        logging.error(
            'You can install tensorboard via `pip install tensorboard`.')
def __init__(self, logging_dir=None, prefix='val', roc_path=None, class_names=None):
    """Store ROC-logging configuration and open a TensorBoard writer.

    Parameters
    ----------
    logging_dir : str
        Directory in which the TensorBoard event file is created.
    prefix : str
        Prefix for image tags.
    roc_path : str
        Directory holding the per-class ROC PNG files.
    class_names : list[str]
        Names of the classes to log.
    """
    self.prefix = prefix
    self.roc_path = roc_path
    self.class_names = class_names
    try:
        # Lazy import keeps tensorboard optional.
        from tensorboard import SummaryWriter
        self.summary_writer = SummaryWriter(logging_dir)
    except ImportError:
        logging.error('You can install tensorboard via `pip install tensorboard`.')
class LogMetricsCallback(object):
    """Write evaluation metrics to a TensorBoard event file.

    Works much like `callback.Speedometer`, but emits TensorBoard events
    instead of console output; each metric is logged as a scalar with
    ``param.epoch`` as the global step. See
    https://github.com/dmlc/tensorboard for usage.

    Parameters
    ----------
    logging_dir : str
        TensorBoard event file directory; point
        `tensorboard --logdir=...` at it afterwards.
    prefix : str, optional
        Prefix for each scalar tag; useful to separate train and eval
        curves that share a metric name.
    """

    def __init__(self, logging_dir, prefix=None):
        self.prefix = prefix
        try:
            from tensorboard import SummaryWriter
            self.summary_writer = SummaryWriter(logging_dir)
        except ImportError:
            logging.error(
                'You can install tensorboard via `pip install tensorboard`.')

    def __call__(self, param):
        """Callback to log training speed and metrics in TensorBoard."""
        metric = param.eval_metric
        if metric is None:
            return
        for name, value in metric.get_name_value():
            tag = name if self.prefix is None else '%s-%s' % (self.prefix, name)
            self.summary_writer.add_scalar(tag, value, global_step=param.epoch)
class LogMetricsCallback(object):
    """Log metric name/value pairs to TensorBoard as scalars.

    Behaves like `callback.Speedometer` but writes a TensorBoard event
    file instead of printing. No global step is supplied, so TensorBoard
    assigns steps in write order. See
    https://github.com/dmlc/tensorboard for more usage.

    Parameters
    ----------
    logging_dir : str
        TensorBoard event file directory (use
        `tensorboard --logdir=path/to/logs` to view).
    prefix : str, optional
        Tag prefix, handy for separating train/eval curves of the same
        metric in one plot.
    """

    def __init__(self, logging_dir, prefix=None):
        self.prefix = prefix
        try:
            from tensorboard import SummaryWriter
            self.summary_writer = SummaryWriter(logging_dir)
        except ImportError:
            logging.error('You can install tensorboard via `pip install tensorboard`.')

    def __call__(self, param):
        """Callback to log training speed and metrics in TensorBoard."""
        if param.eval_metric is None:
            return
        for name, value in param.eval_metric.get_name_value():
            tag = '%s-%s' % (self.prefix, name) if self.prefix is not None else name
            self.summary_writer.add_scalar(tag, value)
def main():
    """Train the uNet car-segmentation model end to end.

    Builds the data loader, model, and Adam optimizer, optionally
    resumes from a checkpoint, then runs the epoch loop, checkpointing
    after every epoch and logging to TensorBoard under logs/<Month-day>.
    Relies on module-level `args`, dataset constants, and helpers
    (`train`, `resume`, `save_checkpoint`, `adjust_lr`) defined
    elsewhere in the file.
    """
    # pin_memory only helps when transferring to GPU, hence the gate.
    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
    CarSet = CarDataSet(ROOT, TRAIN, MASK)
    # split train val
    # train_idx, valid_idx = augmented_train_valid_split(CarSet, test_size = 0.15,shuffle = True ,random_seed=args.seed)
    # train_sampler = SubsetRandomSampler(train_idx)
    # val_samper = SubsetRandomSampler(valid_idx)
    train_loader = DataLoader(CarSet,
                              # sampler=train_sampler,
                              shuffle=True,
                              batch_size=args.batch_size,
                              **kwargs)
    # val_loader = DataLoader(CarSet,
    #                         sampler=val_samper,
    #                         batch_size=2,
    #                         **kwargs)
    model = uNet(NUM_CLASS)
    if args.cuda:
        model.cuda()
    optimizer = optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.999))
    writer = SummaryWriter('logs/' + datetime.now().strftime('%B-%d'))
    best_loss = 1e+5
    iters = 0
    # resume training
    if args.resume:
        model, optimizer, args.start_epoch, best_loss, iters = resume(args.resume, model)
    for epoch in range(args.start_epoch, args.epochs):
        adjust_lr(optimizer, epoch, decay=5)
        t1 = time.time()  # NOTE(review): t1 is never used — timing code seems unfinished
        loss, iters = train(epoch, model, optimizer, train_loader, writer, iters)
        is_best = loss < best_loss
        best_loss = min(best_loss, loss)
        state = {
            'epoch': epoch,
            'state_dict': model.state_dict(),
            # NOTE(review): stores the optimizer object itself, not
            # optimizer.state_dict() — confirm `resume` expects this.
            'optimizer': optimizer,
            'loss': best_loss,
            'iters': iters,
        }
        save_checkpoint(state, is_best)
    writer.close()
def add_loss(self, prefix, **losses):
    """Write each named loss as a scalar, lazily creating one
    SummaryWriter (and one log sub-directory) per loss name.

    Parameters
    ----------
    prefix : str
        Tag under which the value is recorded in each writer.
    **losses
        Mapping of loss name to scalar value.
    """
    for name, value in losses.items():
        if name not in self.writers:
            # Fix: local renamed from `dir`, which shadowed the builtin.
            writer_dir = os.path.join(self.log_dir, name)
            self.writers[name] = SummaryWriter(writer_dir)
        # self.next() presumably yields a monotonically increasing global
        # step — confirm against the enclosing class (not in this chunk).
        self.writers[name].add_scalar(prefix, value, self.next())
def __init__(self, logging_dir, prefix=None):
    """Open a TensorBoard event writer for this callback.

    Parameters
    ----------
    logging_dir : str
        Directory in which the TensorBoard event file is created.
    prefix : str, optional
        Tag prefix (used by the enclosing class's __call__, which is not
        visible in this chunk).
    """
    self.prefix = prefix
    try:
        # Import inside the try so a missing tensorboard package is
        # reported instead of crashing module import.
        from tensorboard import SummaryWriter
        self.summary_writer = SummaryWriter(logging_dir)
    except ImportError:
        logging.error('You can install tensorboard via `pip install tensorboard`.')
def main():
    """Train a small autoencoder on (a subset of) the dSprites dataset.

    Configures logging, a TensorBoard writer, the dataset object, the
    chainer Adam optimizer, and delegates the training loop to `Trainer`.
    Hyper-parameters are hard-coded below.
    """
    # parametor
    epoch_num = 60
    batch_size = 128
    ana_freq = 10     # how often (in epochs, presumably) Trainer analyses/logs — confirm in Trainer
    gpu = -1          # -1 selects CPU (chainer convention)
    # set logger
    logging.config.fileConfig('./log/log.conf')
    logger = getLogger(__name__)
    logger.info('file = {}'.format(__file__))
    logger.info('epoch_num = {}'.format(epoch_num))
    logger.info('batch_size = {}'.format(batch_size))
    logger.info('ana_freq = {}'.format(ana_freq))
    logger.info('gpu = {}'.format(gpu))
    # set writer — one run directory per launch, named by timestamp
    writer = SummaryWriter('results/' + datetime.now().strftime('%B%d %H:%M:%S'))
    # read data
    # NOTE(review): hard-coded absolute path; only works on one machine.
    data_obj = dataset.data_dsprites.DspritesDataset(db_path='/Users/yamada/lab/dat/dsprites')
    # data_obj = dataset.data_celeba.CelebADataset(db_path='./dataset/celebA', data_size=1000)
    # data_obj = dataset.data_mnist.MnistDataset()
    data_obj.train_size = 200  # adjust train data size for speed
    data_obj.test_size = 20
    # model and optimizer
    model = ae.AE(data_obj)
    opt = chainer.optimizers.Adam()
    trainer = Trainer(model=model, optimizer=opt, writer=writer, gpu=gpu)
    trainer.fit(data_obj, epoch_num=epoch_num, batch_size=batch_size, ana_freq=ana_freq)
class Logger():
    """Thin convenience wrapper around a TensorBoard SummaryWriter that
    keeps an auto-incrementing step counter per scalar key.

    Parameters
    ----------
    root : str
        Directory for the TensorBoard event file.
    """

    def __init__(self, root):
        self.writer = SummaryWriter(root)
        # Per-key step counters used when no explicit index is supplied.
        self.last_indexes = defaultdict(int)

    def scalar(self, key, value, index=None):
        """Log one scalar under `key` at `index` (or the auto counter).

        NOTE(review): the counter is advanced even when an explicit
        `index` is supplied — confirm this is intended.
        """
        index = index if index is not None else self.last_indexes[key]
        self.last_indexes[key] += 1
        value = to_numeric(value)
        self.writer.add_scalar(key, value, index)

    def from_stats(self, key_value_dictionary, index=None):
        """Log every key/value pair in the dictionary as a scalar."""
        # .items() avoids the per-key lookup of iterating keys only.
        for key, value in key_value_dictionary.items():
            self.scalar(key, value, index)
def __init__(self, logging_dir=None, prefix='val', images_path=None, class_names=None,
             batch_size=None, mean_pixels=None, det_thresh=0.5):
    """Store detection-logging configuration and open a TensorBoard writer.

    Parameters
    ----------
    logging_dir : str
        Directory in which the TensorBoard event file is created.
    prefix : str
        Tag prefix for logged images.
    images_path : str
        Directory for rendered detection images; created if missing
        (os.mkdir — creates a single level only, parent must exist).
    class_names : list[str]
        Class names used when rendering detections.
    batch_size : int
        Batch size, needed to strip padding from evaluation batches.
    mean_pixels : sequence
        Per-channel mean to add back before rendering (presumably BGR/RGB
        means subtracted by the data pipeline — confirm against it).
    det_thresh : float
        Minimum score for a detection to be drawn.
    """
    self.logging_dir = logging_dir
    self.prefix = prefix
    if not os.path.exists(images_path):
        os.mkdir(images_path)
    self.images_path = images_path
    self.class_names = class_names
    self.batch_size = batch_size
    self.mean_pixels = mean_pixels
    self.det_thresh = det_thresh
    try:
        from tensorboard import SummaryWriter
        self.summary_writer = SummaryWriter(logging_dir)
    except ImportError:
        logging.error('You can install tensorboard via `pip install tensorboard`.')
def __init__(self, dist_logging_dir=None, scalar_logging_dir=None,
             logfile_path=None, batch_size=None, iter_monitor=0,
             frequent=None, prefix='ssd'):
    """Set up parsing state and open two TensorBoard writers: one for
    histograms (`dist_logging_dir`) and one for scalars
    (`scalar_logging_dir`).
    """
    # configuration
    self.scalar_logging_dir = scalar_logging_dir
    self.dist_logging_dir = dist_logging_dir
    self.logfile_path = logfile_path
    self.batch_size = batch_size
    self.iter_monitor = iter_monitor
    self.frequent = frequent
    self.prefix = prefix
    # parsing state: batches seen so far / log lines already consumed
    self.batch = 0
    self.line_idx = 0
    try:
        from tensorboard import SummaryWriter
        self.dist_summary_writer = SummaryWriter(dist_logging_dir)
        self.scalar_summary_writer = SummaryWriter(scalar_logging_dir)
    except ImportError:
        logging.error('You can install tensorboard via `pip install tensorboard`.')
def __init__(self, logging_dir=None, prefix='val', roc_path=None, class_names=None):
    """Store ROC-image configuration and open a TensorBoard writer.

    Parameters
    ----------
    logging_dir : str
        Directory in which the TensorBoard event file is created.
    prefix : str
        Prefix for each logged image tag.
    roc_path : str
        Directory containing per-class ROC PNGs.
    class_names : list[str]
        Names of the classes whose ROC curves are logged.
    """
    self.prefix = prefix
    self.roc_path = roc_path
    self.class_names = class_names
    try:
        # tensorboard is optional; failure is logged, not raised.
        from tensorboard import SummaryWriter
        self.summary_writer = SummaryWriter(logging_dir)
    except ImportError:
        logging.error('You can install tensorboard via `pip install tensorboard`.')
def __init__(self, dist_logging_dir=None, scalar_logging_dir=None,
             logfile_path=None, batch_size=None, iter_monitor=0,
             frequent=None, prefix='ssd'):
    """Initialize log-parsing state and two TensorBoard writers.

    Parameters
    ----------
    dist_logging_dir : str
        Event-file directory for the histogram writer.
    scalar_logging_dir : str
        Event-file directory for the scalar writer.
    logfile_path : str
        Path of the monitor output log to parse.
    batch_size : int
        Training batch size (stored; usage lives in the class body,
        which is outside this chunk).
    iter_monitor : int
        Monitor interval; 0 presumably disables histogram parsing.
    frequent : int
        Interval (in batches) between scalar logging.
    prefix : str
        Prefix prepended to scalar tags.
    """
    self.scalar_logging_dir = scalar_logging_dir
    self.dist_logging_dir = dist_logging_dir
    self.logfile_path = logfile_path
    self.batch_size = batch_size
    self.iter_monitor = iter_monitor
    self.frequent = frequent
    self.prefix = prefix
    # parsing state: batch counter and number of log lines already read
    self.batch = 0
    self.line_idx = 0
    try:
        from tensorboard import SummaryWriter
        self.dist_summary_writer = SummaryWriter(dist_logging_dir)
        self.scalar_summary_writer = SummaryWriter(scalar_logging_dir)
    except ImportError:
        logging.error('You can install tensorboard via `pip install tensorboard`.')
class LogMetricsCallback(object):
    """Log (name, value) metric pairs to TensorBoard, using an internal
    counter that advances once per invocation as the global step.

    Parameters
    ----------
    logging_dir : str
        Directory for the TensorBoard event file.
    prefix : str, optional
        Prefix prepended to every scalar tag.
    """

    def __init__(self, logging_dir, prefix=None):
        self.prefix = prefix
        self.itr = 0  # global step, incremented on every call
        try:
            from tensorboard import SummaryWriter
            self.summary_writer = SummaryWriter(logging_dir)
        except ImportError:
            logging.error(
                'You can install tensorboard via `pip install tensorboard`.')

    def __call__(self, name_value):
        """Callback to log training speed and metrics in TensorBoard."""
        if name_value is None:
            return
        for name, value in name_value:
            tag = name if self.prefix is None else '%s-%s' % (self.prefix, name)
            self.summary_writer.add_scalar(tag, value, self.itr)
        self.itr += 1
def main():
    """Set up A3C training on CartPole-v0 and kick off async training.

    Builds the shared policy/value network and RMSpropAsync optimizer,
    writes the computation graph to TensorBoard once using a single
    dummy forward pass, then hands off to `async_train`.
    """
    env_name = 'CartPole-v0'
    env = gym.make(env_name)
    action_space = env.action_space.n
    observation_space = env.observation_space.low.shape
    # set logger
    logging.config.fileConfig('./log/log.conf')
    logger = logging.getLogger(__name__)
    logger.info('START')
    # set network model
    shared_model = A3CFFSoftmaxFFF(observation_space, action_space)
    # set optimizer
    opt = RMSpropAsync(lr=LEARNING_RATE, alpha=0.99, eps=RMSPROP_EPS)
    opt.setup(shared_model)
    # gradient clipping at 40 (common A3C setting)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    writer = SummaryWriter('results/' + datetime.datetime.now().strftime('%B%d %H:%M:%S'))
    # one dummy forward pass so the graph can be serialized
    state = env.reset()
    state = chainer.Variable(np.expand_dims(np.array(state).astype(np.float32), axis=0))
    pi, v = shared_model.get_pi_and_v(state)
    writer.add_graph([pi, v])
    writer.close()
    async_train(env_name, shared_model, opt, phi)
    logger.info('END')
def build_report_manager(opt):
    """Build a ReportMgr, attaching a TensorBoard writer when
    ``opt.tensorboard`` is set.

    The writer's log directory is ``opt.tensorboard_log_dir`` suffixed
    with a launch timestamp, so each run gets its own event file.
    """
    writer = None
    if opt.tensorboard:
        from tensorboard import SummaryWriter
        writer = SummaryWriter(
            opt.tensorboard_log_dir + datetime.now().strftime("/%b-%d_%H-%M-%S"),
            comment="Unmt")

    report_mgr = ReportMgr(opt.report_every, start_time=-1,
                           tensorboard_writer=writer)
    return report_mgr
class LogMetricsCallback(object):
    """Log metrics to TensorBoard, always at a fixed global step.

    Works like `callback.Speedometer` but writes TensorBoard events; the
    step used for every scalar is the `global_step` passed to the
    constructor. See https://github.com/dmlc/tensorboard for usage.

    Parameters
    ----------
    logging_dir : str
        TensorBoard event file directory (launch with
        `tensorboard --logdir=path/to/logs`).
    prefix : str, optional
        Tag prefix, useful for separating train/eval curves of the same
        metric name.
    global_step : int
        Step value recorded with every scalar.
    """

    def __init__(self, logging_dir, prefix=None, global_step=100):
        self.prefix = prefix
        self.global_step = global_step
        try:
            from tensorboard import SummaryWriter
            self.summary_writer = SummaryWriter(logging_dir)
        except ImportError:
            logging.error(
                'You can install tensorboard via `pip install tensorboard`.')

    def __call__(self, param):
        """Callback to log training speed and metrics in TensorBoard."""
        metric = param.eval_metric
        if metric is None:
            return
        for name, value in metric.get_name_value():
            tag = name if self.prefix is None else '%s-%s' % (self.prefix, name)
            self.summary_writer.add_scalar(tag, value,
                                           global_step=self.global_step)
def main():
    """Train a convolutional autoencoder (CAE) on dSprites.

    Sets up logging, a timestamped TensorBoard run directory, the
    dataset, model, and Adam optimizer, then runs `Trainer.fit` with
    salt noise; a Ctrl-C saves the current model state instead of
    losing the run.
    """
    # parametor
    epoch_num = 400
    batch_size = 128
    ana_freq = 5   # analysis/logging frequency used by Trainer — confirm units there
    gpu = -1       # -1 selects CPU (chainer convention)
    # set logger
    logging.config.fileConfig('./log/log.conf')
    logger = getLogger(__name__)
    logger.info('file = {}'.format(__file__))
    logger.info('epoch_num = {}'.format(epoch_num))
    logger.info('batch_size = {}'.format(batch_size))
    logger.info('ana_freq = {}'.format(ana_freq))
    logger.info('gpu = {}'.format(gpu))
    # set writer — one run directory per launch, keyed by timestamp
    current_time = datetime.now().strftime('%B%d %H:%M:%S')
    head = './results/' + current_time
    writer = SummaryWriter(head)
    # read data
    # NOTE(review): hard-coded machine-specific path.
    # data_obj = dataset.data_dsprites.DspritesDataset(db_path='/Users/yamada/lab/dat/dsprites')
    data_obj = dataset.data_dsprites.DspritesDataset(
        db_path='/home/masanori_yamada/lab/dat/dsprites')
    # data_obj = dataset.data_celeba.CelebADataset(db_path='/home/masanori_yamada / lab / dat / celeba / syorizumi', data_size=200000)
    # data_obj = dataset.data_celeba.CelebADataset(db_path='/Users/yamada/lab/dat/celeba/syorizumi', data_size=200)
    # data_obj = dataset.data_mnist.MnistDataset()
    # data_obj.train_size = 10 #00 # adjust train data size for speed
    data_obj.test_size = 16
    # model and optimizer
    model = cae.CAE(data_obj)
    opt = chainer.optimizers.Adam()
    trainer = Trainer(model=model, optimizer=opt, writer=writer, gpu=gpu)
    try:
        trainer.fit(data_obj, epoch_num=epoch_num, batch_size=batch_size,
                    ana_freq=ana_freq, noise_type='salt')
    except KeyboardInterrupt:
        # Preserve progress when the run is interrupted manually.
        trainer.save(head)
def __init__(self, logging_dir=None, prefix='val', images_path=None, class_names=None,
             batch_size=None, mean_pixels=None, det_thresh=0.5):
    """Configure detection-image logging and open a TensorBoard writer.

    Parameters
    ----------
    logging_dir : str
        Directory in which the TensorBoard event file is created.
    prefix : str
        Tag prefix for logged images.
    images_path : str
        Output directory for rendered detection PNGs; created with
        os.mkdir if missing (parent directory must already exist).
    class_names : list[str]
        Class names for rendering.
    batch_size : int
        Batch size, used to strip batch padding.
    mean_pixels : sequence
        Per-channel means added back before rendering (assumed to match
        what the data pipeline subtracted — confirm).
    det_thresh : float
        Minimum detection score to draw.
    """
    self.logging_dir = logging_dir
    self.prefix = prefix
    if not os.path.exists(images_path):
        os.mkdir(images_path)
    self.images_path = images_path
    self.class_names = class_names
    self.batch_size = batch_size
    self.mean_pixels = mean_pixels
    self.det_thresh = det_thresh
    try:
        from tensorboard import SummaryWriter
        self.summary_writer = SummaryWriter(logging_dir)
    except ImportError:
        logging.error('You can install tensorboard via `pip install tensorboard`.')
def main():
    """Entry point: parse CLI args, set up logging/TensorBoard, and build
    the train/val data loaders from the loaded config.

    Populates the module-level globals declared below; `tb_writer`
    becomes a real SummaryWriter only when --log_dir is given, otherwise
    a no-op `Dummy` stand-in is used so call sites need no branching.
    """
    global args, best_acc, tb_writer, logger
    args = parser.parse_args()
    init_log('global', logging.INFO)
    if args.log != "":
        add_file_handler('global', args.log, logging.INFO)
    logger = logging.getLogger('global')
    logger.info("\n" + collect_env_info())
    logger.info(args)
    cfg = load_config(args)
    logger.info("config \n{}".format(json.dumps(cfg, indent=4)))
    if args.log_dir:
        tb_writer = SummaryWriter(args.log_dir)
    else:
        tb_writer = Dummy()
    # build dataset
    train_loader, val_loader = build_data_loader(cfg)
def main(args):
    """Evaluate the best saved VGG checkpoint on the test set.

    Loads 'model_best.pth.tar' from args.logs_dir, reports its stored
    epoch and top-5 recall, then runs the evaluator on the test loader.
    Also tees stdout into a per-run log file.
    """
    mkdir_if_missing(args.logs_dir)
    writer = SummaryWriter(args.logs_dir)  # NOTE(review): writer is created but never used or closed here
    sys.stdout = Logger(osp.join(args.logs_dir, 'test_log.txt'))
    print(args)
    cudnn.benchmark = True
    # create data loaders
    data_dir = args.data_dir
    dataset, num_class, test_loader = \
        get_data(args.dataset, data_dir, args.crop_w, args.crop_h,
                 args.batch_size, args.workers)
    # create model
    model = VGGNet(args.depth, with_bn=True, pretrained=True,
                   num_class=num_class, input_size=(args.crop_w, args.crop_h))
    model = model.cuda()
    # load from checkpoint
    checkpoint = load_checkpoint(osp.join(args.logs_dir, 'model_best.pth.tar'))
    model.load_state_dict(checkpoint['state_dict'])
    args.start_epoch = checkpoint['epoch']
    best_recall5 = checkpoint['best_recall5']
    print("=> get epoch {} best top5 recall {:.1%}".format(args.start_epoch, best_recall5))
    # create trainer
    evaluator = Evaluator(model)
    # test
    print('Test with best model:')
    evaluator.test(test_loader)
import torchvision.transforms as standard_transforms import torchvision.utils as vutils from tensorboard import SummaryWriter from torch import optim from torch.autograd import Variable from torch.utils.data import DataLoader import utils.joint_transforms as joint_transforms import utils.transforms as extended_transforms from datasets import voc from models import * from utils import check_mkdir, evaluate, AverageMeter, CrossEntropyLoss2d ckpt_path = '../../ckpt' exp_name = 'voc-psp_net' writer = SummaryWriter(os.path.join(ckpt_path, 'exp', exp_name)) args = { 'train_batch_size': 1, 'lr': 1e-2 / sqrt(16 / 4), 'lr_decay': 0.9, 'max_iter': 3e4, 'longer_size': 512, 'crop_size': 473, 'stride_rate': 2 / 3., 'weight_decay': 1e-4, 'momentum': 0.9, 'snapshot': '', 'print_freq': 10, 'val_save_to_img_file': True, 'val_img_sample_rate': 0.01, # randomly sample some validation results to display,
from torch.utils.data import DataLoader
import utils.simul_transforms as simul_transforms
import utils.transforms as expanded_transforms
from config import ckpt_path
from datasets.cityscapes import CityScapes
from datasets.cityscapes.config import num_classes, ignored_label
from datasets.cityscapes.utils import colorize_mask
from models import FCN8ResNet
from utils.io import rmrf_mkdir
from utils.loss import CrossEntropyLoss2d
from utils.training import calculate_mean_iu

# NOTE(review): cudnn, SummaryWriter and standard_transforms are used
# below but imported elsewhere in the file (not visible in this chunk).
# Fixed input size (see 'input_size'), so letting cuDNN benchmark
# kernels once is safe and fast.
cudnn.benchmark = True

exp_name = 'fcn8resnet_cityscapes224*448'
# All event files for this experiment live under exp/<exp_name>.
writer = SummaryWriter('exp/' + exp_name)
pil_to_tensor = standard_transforms.ToTensor()
# Best-so-far validation bookkeeping shared across the training loop.
train_record = {'best_val_loss': 1e20, 'corr_mean_iu': 0, 'corr_epoch': -1}

train_args = {
    'batch_size': 16,
    'epoch_num': 800,  # I stop training only when val loss doesn't seem to decrease anymore, so just set a large value.
    'pretrained_lr': 1e-6,  # used for the pretrained layers of model
    'new_lr': 1e-6,  # used for the newly added layers of model
    'weight_decay': 5e-4,
    'snapshot': 'epoch_184_loss_0.8953_mean_iu_0.3923_lr_0.00001000.pth',  # empty string denotes initial training, otherwise it should be a string of snapshot name
    'print_freq': 50,
    'input_size': (224, 448),  # (height, width)
}
def main(args):
    """Full RANet training loop with checkpointing and TensorBoard logs.

    Builds data loaders and the model, optionally resumes, constructs
    the OIM criterion and optimizer, then trains for args.epochs with
    per-epoch validation/testing, checkpointing the best top-1 model,
    and finally re-evaluates the best checkpoint.
    """
    writer = SummaryWriter(args.logs_dir)
    sys.stdout = Logger(osp.join(args.logs_dir, 'train_log.txt'))
    print(args)
    # Seed everything for reproducibility (cudnn.benchmark below can
    # still introduce nondeterminism).
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    cudnn.benchmark = True

    # Create data loaders
    data_dir = osp.join(args.data_dir, args.dataset)
    dataset, num_classes, dim_featx, dim_featy, train_loader, val_loader, test_loader = \
        get_data(args.dataset, data_dir, args.data_type, args.batch_size,
                 args.workers, args.combine_trainval,
                 head_feat_dir=args.head_feat_dir,
                 face_feat_dir=args.face_feat_dir,
                 body_feat_dir=args.body_feat_dir,
                 upperbody_feat_dir=args.upperbody_feat_dir)

    # Create model (4 input branches; feature dim from the data loader)
    model = RANet(4, num_features=dim_featx)
    # model = torch.nn.DataParallel(model).cuda()
    model = model.cuda()

    # load from checkpoint
    if args.resume:
        checkpoint = load_checkpoint(args.resume)
        model.load_state_dict(checkpoint['state_dict'])
        args.start_epoch = checkpoint['epoch']
        best_top1 = checkpoint['best_top1']
        print("=> start epoch {} best top1 {:.1%}".format(
            args.start_epoch, best_top1))
    else:
        best_top1 = 0

    # Criterion
    criterion = OIM4bLoss(dim_featy, num_classes, scalar=args.oim_scalar,
                          momentum=args.oim_momentum)
    criterion.init_lut(train_loader)
    criterion.cuda()

    # Optimizer
    if args.optimizer == 'sgd':
        param_groups = model.parameters()
        optimizer = torch.optim.SGD(param_groups, lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay,
                                    nesterov=True)
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                     weight_decay=args.weight_decay)
    else:
        raise ValueError("Cannot recognize optimizer type:", args.optimizer)

    # Evaluator and Trainer
    evaluator = RAEvaluator(model)
    trainer = RATrainer(model, criterion)

    # Schedule learning rate
    def adjust_lr(epoch):
        # step decay for SGD (x0.1 every 20 epochs); exponential-style
        # decay for Adam after epoch 50.
        if args.optimizer == 'sgd':
            lr = args.lr * (0.1 ** (epoch // 20))
        elif args.optimizer == 'adam':
            # NOTE(review): precedence here is (0.01 ** (epoch - 50)) / 30,
            # which collapses the LR almost immediately after epoch 50 —
            # confirm whether 0.01 ** ((epoch - 50) / 30) was intended.
            lr = args.lr if epoch <= 50 else \
                args.lr * (0.01 ** (epoch - 50) / 30)
        else:
            raise ValueError("Cannot recognize optimizer type:", args.optimizer)
        for g in optimizer.param_groups:
            g['lr'] = lr * g.get('lr_mult', 1)

    # start training (initial evaluation before the first epoch)
    top1 = evaluator.evaluate(val_loader, print_summary=True)
    test_top1 = evaluator.test(test_loader, dataset.gallery, dataset.query, print_summary=True)
    for epoch in range(args.start_epoch, args.epochs):
        adjust_lr(epoch)
        loss, prec = trainer.train(epoch, train_loader, optimizer, print_freq=1)
        writer.add_scalar('Train loss', loss, epoch + 1)
        writer.add_scalar('Train accuracy', prec, epoch + 1)
        top1 = evaluator.evaluate(val_loader, print_summary=False)
        writer.add_scalar('Val accuracy', top1, epoch + 1)
        # NOTE(review): tested in both gallery/query directions; only the
        # second result is kept and logged — confirm this is intended.
        test_top1 = evaluator.test(test_loader, dataset.gallery, dataset.query, print_summary=True)
        test_top1 = evaluator.test(test_loader, dataset.query, dataset.gallery, print_summary=True)
        writer.add_scalar('Test accuracy', test_top1, epoch + 1)
        # model selection is on validation top-1
        is_best = top1 > best_top1
        best_top1 = max(top1, best_top1)
        save_checkpoint({
            'state_dict': model.state_dict(),
            'epoch': epoch + 1,
            'best_top1': best_top1,
        }, is_best, fpath=osp.join(args.logs_dir, 'checkpoint.pth.tar'))
        print('\n * Finished epoch {:3d} top1: {:5.1%} best: {:5.1%}{}\n'.
              format(epoch, top1, best_top1, ' *' if is_best else ''))

    # final test
    print('Test with best model:')
    checkpoint = load_checkpoint(osp.join(args.logs_dir, 'model_best.pth.tar'))
    model.load_state_dict(checkpoint['state_dict'])
    evaluator.test(test_loader, dataset.gallery, dataset.query)
def __init__(self,
             model: Model,
             optimizer: torch.optim.Optimizer,
             iterator: DataIterator,
             train_dataset: Dataset,
             validation_dataset: Optional[Dataset] = None,
             patience: int = 2,
             validation_metric: str = "-loss",
             num_epochs: int = 20,
             serialization_dir: Optional[str] = None,
             cuda_device: int = -1,
             grad_norm: Optional[float] = None,
             grad_clipping: Optional[float] = None,
             learning_rate_scheduler: Optional[PytorchLRScheduler] = None,
             no_tqdm: bool = False) -> None:
    """
    Parameters
    ----------
    model : ``Model``, required.
        An AllenNLP model to be optimized. Pytorch Modules can also be optimized if
        their ``forward`` method returns a dictionary with a "loss" key, containing a
        scalar tensor representing the loss function to be optimized.
    optimizer : ``torch.nn.Optimizer``, required.
        An instance of a Pytorch Optimizer, instantiated with the parameters of the
        model to be optimized.
    iterator : ``DataIterator``, required.
        A method for iterating over a ``Dataset``, yielding padded indexed batches.
    train_dataset : ``Dataset``, required.
        A ``Dataset`` to train on. The dataset should have already been indexed.
    validation_dataset : ``Dataset``, optional, (default = None).
        A ``Dataset`` to evaluate on. The dataset should have already been indexed.
    patience : int, optional (default=2)
        Number of epochs to be patient before early stopping.
    validation_metric : str, optional (default="-loss")
        Validation metric to measure for whether to stop training using patience
        and whether to serialize an ``is_best`` model each epoch. The metric name
        must be prepended with either "+" or "-", which specifies whether the metric
        is an increasing or decreasing function.
    num_epochs : int, optional (default = 20)
        Number of training epochs.
    serialization_dir : str, optional (default=None)
        Path to directory for saving and loading model files. Models will not be saved if
        this parameter is not passed.
    cuda_device : int, optional (default = -1)
        An integer specifying the CUDA device to use. If -1, the CPU is used.
        Multi-gpu training is not currently supported, but will be once the
        Pytorch DataParallel API stabilises.
    grad_norm : float, optional, (default = None).
        If provided, gradient norms will be rescaled to have a maximum of this value.
    grad_clipping : ``float``, optional (default = ``None``).
        If provided, gradients will be clipped `during the backward pass` to have an (absolute)
        maximum of this value. If you are getting ``NaNs`` in your gradients during training
        that are not solved by using ``grad_norm``, you may need this.
    learning_rate_scheduler : PytorchLRScheduler, optional, (default = None)
        A Pytorch learning rate scheduler. The learning rate will be decayed with respect to
        this schedule at the end of each epoch. If you use
        :class:`torch.optim.lr_scheduler.ReduceLROnPlateau`, this will use the
        ``validation_metric`` provided to determine if learning has plateaued.
    no_tqdm : ``bool``, optional (default=False)
        We use ``tqdm`` for logging, which will print a nice progress bar that updates in place
        after every batch. This is nice if you're running training on a local shell, but can cause
        problems with log files from, e.g., a docker image running on kubernetes. If ``no_tqdm``
        is ``True``, we will not use tqdm, and instead log batch statistics using ``logger.info``,
        outputting a line at most every 10 seconds.
    """
    self._model = model
    self._iterator = iterator
    self._optimizer = optimizer
    self._train_dataset = train_dataset
    self._validation_dataset = validation_dataset
    self._patience = patience
    self._num_epochs = num_epochs
    self._serialization_dir = serialization_dir
    self._cuda_device = cuda_device
    self._grad_norm = grad_norm
    self._grad_clipping = grad_clipping
    self._learning_rate_scheduler = learning_rate_scheduler

    # The metric string encodes its direction in the first character.
    increase_or_decrease = validation_metric[0]
    if increase_or_decrease not in ["+", "-"]:
        raise ConfigurationError(
            "Validation metrics must specify whether they should increase "
            "or decrease by pre-pending the metric name with a +/-.")
    self._validation_metric = validation_metric[1:]
    self._validation_metric_decreases = increase_or_decrease == "-"
    self._no_tqdm = no_tqdm

    if self._cuda_device >= 0:
        # NOTE(review): device-specific placement is commented out; the
        # model always lands on the default CUDA device — confirm.
        #self._model = self._model.cuda(self._cuda_device)
        self._model = self._model.cuda()
    self._log_interval = 10  # seconds
    self._summary_interval = 100  # num batches between logging to tensorboard
    self._last_log = 0.0  # time of last logging

    if serialization_dir is not None:
        # Separate event files so train/validation curves can overlay.
        train_log = SummaryWriter(
            os.path.join(serialization_dir, "log", "train"))
        validation_log = SummaryWriter(
            os.path.join(serialization_dir, "log", "validation"))
        self._tensorboard = TensorboardWriter(train_log, validation_log)
    else:
        self._tensorboard = TensorboardWriter()
class LogDetectionsCallback(object):
    """Log predicted detections and ground-truth boxes as images in TensorBoard.

    For every evaluated (un-padded) image, renders the ground-truth boxes in
    red and the detections scoring above ``det_thresh`` in per-class random
    colors, saves the figure under ``images_path`` and writes it to the
    TensorBoard event file.

    Parameters
    ----------
    logging_dir : str
        where the tensorboard event file will be created
    prefix : str
        tag prefix for logged images
    images_path : str
        directory in which rendered images are saved (created if missing)
    class_names : list[str]
        list of class names used to label detections
    batch_size : int
        evaluation batch size, used together with the batch pad to strip
        duplicated padding samples
    mean_pixels : list[float]
        per-channel mean subtracted during preprocessing (added back here)
    det_thresh : float
        minimum score for a detection to be drawn
    """

    def __init__(self, logging_dir=None, prefix='val', images_path=None,
                 class_names=None, batch_size=None, mean_pixels=None,
                 det_thresh=0.5):
        self.logging_dir = logging_dir
        self.prefix = prefix
        # makedirs(..., exist_ok=True) avoids the check-then-create race of
        # the previous exists()/mkdir pair and also creates missing parents.
        os.makedirs(images_path, exist_ok=True)
        self.images_path = images_path
        self.class_names = class_names
        self.batch_size = batch_size
        self.mean_pixels = mean_pixels
        self.det_thresh = det_thresh
        try:
            from tensorboard import SummaryWriter
            self.summary_writer = SummaryWriter(logging_dir)
        except ImportError:
            logging.error(
                'You can install tensorboard via `pip install tensorboard`.')

    def __call__(self, param):
        """Callback to log detections and gt-boxes as an image in TensorBoard."""
        if param.locals is None:
            return
        result = []
        pad = param.locals['eval_batch'].pad
        images = param.locals['eval_batch'].data[0][0:self.batch_size - pad].asnumpy()
        labels = param.locals['eval_batch'].label[0][0:self.batch_size - pad].asnumpy()
        # Strip the padded (duplicated) tail samples from every network output.
        outputs = [out[0:out.shape[0] - pad]
                   for out in param.locals['self'].get_outputs()]
        # 'det' variable can be in different positions depending with train/test symbols
        if len(outputs) > 1:
            det_idx = [idx for idx, f in enumerate(param.locals['self'].output_names)
                       if f.startswith('det')][0]
            detections = outputs[det_idx].asnumpy()
        else:
            detections = outputs[0].asnumpy()
        for i in range(detections.shape[0]):
            det = detections[i, :, :]
            det = det[np.where(det[:, 0] >= 0)[0]]  # drop invalid (-1) rows
            label = labels[i, :, :]
            label = label[np.where(label[:, 0] >= 0)[0]]
            # Undo mean subtraction and convert CHW to HWC, swapping channels
            # for plotting.
            img = images[i, :, :, :] + np.reshape(self.mean_pixels, (3, 1, 1))
            img = img.astype(np.uint8)
            img = img.transpose([1, 2, 0])
            img[:, :, (0, 1, 2)] = img[:, :, (2, 1, 0)]
            plt_path = os.path.join(self.images_path, 'image' + str(i) + '.png')
            self._visualize_detection_and_labels(
                img, det, label=label, classes=self.class_names,
                thresh=self.det_thresh, plt_path=plt_path)
            # save to tensorboard
            # NOTE(review): scipy.misc.imread is deprecated and removed in
            # modern SciPy; switch to imageio.imread when upgrading.
            img_det_graph = scipy.misc.imread(plt_path)
            self.summary_writer.add_image('image' + str(i) + '.png',
                                          img_det_graph)
        return result

    def _visualize_detection_and_labels(self, img, dets, label, classes=None,
                                        thresh=None, plt_path=None):
        """Render one image with its ground-truth boxes and detections.

        Parameters
        ----------
        img : numpy.array
            image in HWC layout (channels already swapped by the caller)
        dets : numpy.array
            ssd detections, numpy.array([[id, score, x1, y1, x2, y2]...])
            each row is one object; coordinates are relative (0..1)
        label : numpy.array
            ground-truth rows [[id, x1, y1, x2, y2]...], relative coordinates
        classes : tuple or list of str
            class names; default changed from a shared mutable ``[]`` to None
        thresh : float
            score threshold below which detections are not drawn
        plt_path : str
            path the rendered figure is saved to
        """
        # Avoid the mutable-default-argument pitfall of the original
        # ``classes=[]`` signature.
        classes = classes if classes is not None else []
        fig = plt.figure()
        plt.imshow(img)
        height = img.shape[0]
        width = img.shape[1]
        colors = dict()
        # Ground-truth boxes are always drawn in a fixed red.
        gt_color = (1.0, 0.0, 0.0)
        for i in range(label.shape[0]):
            cls_id = int(label[i, 0])
            if cls_id >= 0:
                xmin = int(label[i, 1] * width)
                ymin = int(label[i, 2] * height)
                xmax = int(label[i, 3] * width)
                ymax = int(label[i, 4] * height)
                rect = plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
                                     fill=False, edgecolor=gt_color,
                                     linewidth=2)
                plt.gca().add_patch(rect)
                # The label text is always 'gt'; the unused class-name lookup
                # the original performed here was dead code and is removed.
                plt.gca().text(xmin, ymin - 2, 'gt',
                               bbox=dict(facecolor=gt_color, alpha=0.5),
                               fontsize=8, color='white')
        # Detections above the threshold get one random color per class.
        for i in range(dets.shape[0]):
            cls_id = int(dets[i, 0])
            if cls_id >= 0:
                score = dets[i, 1]
                if score > thresh:
                    if cls_id not in colors:
                        colors[cls_id] = (random.random(), random.random(),
                                          random.random())
                    xmin = int(dets[i, 2] * width)
                    ymin = int(dets[i, 3] * height)
                    xmax = int(dets[i, 4] * width)
                    ymax = int(dets[i, 5] * height)
                    rect = plt.Rectangle((xmin, ymin), xmax - xmin,
                                         ymax - ymin, fill=False,
                                         edgecolor=colors[cls_id],
                                         linewidth=3.5)
                    plt.gca().add_patch(rect)
                    class_name = str(cls_id)
                    if classes and len(classes) > cls_id:
                        class_name = classes[cls_id]
                    plt.gca().text(xmin, ymin - 2,
                                   '{:s} {:.3f}'.format(class_name, score),
                                   bbox=dict(facecolor=colors[cls_id],
                                             alpha=0.5),
                                   fontsize=8, color='white')
        plt.savefig(plt_path)
        plt.close(fig)
def do_training(num_epoch, optimizer, kvstore, learning_rate, model_prefix, decay):
    """Train the globally-bound module for ``num_epoch`` epochs.

    Relies on module-level ``args``, ``module``, ``train_iter``, ``val_iter``
    and ``loss_metric`` set up by the surrounding script.  Each epoch runs a
    full training and validation pass, logs accuracy/loss/reconstruction
    error to TensorBoard, saves a checkpoint under ``model_prefix`` and
    anneals the learning rate by ``decay``.
    """
    summary_writer = SummaryWriter(args.tblog_dir)
    lr_scheduler = SimpleLRScheduler(learning_rate)
    module.init_params()
    module.init_optimizer(kvstore=kvstore,
                          optimizer=optimizer,
                          optimizer_params={'lr_scheduler': lr_scheduler})
    for epoch in range(num_epoch):
        train_iter.reset()
        val_iter.reset()
        loss_metric.reset()
        # Training pass: forward/backward and parameter update per batch.
        for batch_idx, batch in enumerate(train_iter):
            module.forward_backward(batch)
            module.update()
            module.update_metric(loss_metric, batch.label)
            loss_metric.get_batch_log(batch_idx)
        train_acc, train_loss, train_recon_err = loss_metric.get_name_value()
        loss_metric.reset()
        # Validation pass: forward only, no parameter updates.
        for batch_idx, batch in enumerate(val_iter):
            module.forward(batch)
            module.update_metric(loss_metric, batch.label)
            loss_metric.get_batch_log(batch_idx)
        val_acc, val_loss, val_recon_err = loss_metric.get_name_value()
        # One scalar per tracked quantity, indexed by epoch.
        for tag, value in (('train_acc', train_acc),
                           ('train_loss', train_loss),
                           ('train_recon_err', train_recon_err),
                           ('val_acc', val_acc),
                           ('val_loss', val_loss),
                           ('val_recon_err', val_recon_err)):
            summary_writer.add_scalar(tag, value, epoch)
        print('Epoch[%d] train acc: %.4f loss: %.6f recon_err: %.6f' % (epoch, train_acc, train_loss, train_recon_err))
        print('Epoch[%d] val acc: %.4f loss: %.6f recon_err: %.6f' % (epoch, val_acc, val_loss, val_recon_err))
        print('SAVE CHECKPOINT')
        module.save_checkpoint(prefix=model_prefix, epoch=epoch)
        # Exponential learning-rate decay for the next epoch (matches the
        # original post-increment schedule).
        lr_scheduler.learning_rate = learning_rate * (decay ** (epoch + 1))
class LogDetectionsCallback(object):
    """Log predicted detections and ground-truth boxes as images in TensorBoard.

    For every evaluated (un-padded) image, renders the ground-truth boxes in
    red and the detections scoring above ``det_thresh`` in per-class random
    colors, saves the figure under ``images_path`` and writes it to the
    TensorBoard event file.

    Parameters
    ----------
    logging_dir : str
        where the tensorboard event file will be created
    prefix : str
        tag prefix for logged images
    images_path : str
        directory in which rendered images are saved (created if missing)
    class_names : list[str]
        list of class names used to label detections
    batch_size : int
        evaluation batch size, used together with the batch pad to strip
        duplicated padding samples
    mean_pixels : list[float]
        per-channel mean subtracted during preprocessing (added back here)
    det_thresh : float
        minimum score for a detection to be drawn
    """

    def __init__(self, logging_dir=None, prefix='val', images_path=None,
                 class_names=None, batch_size=None, mean_pixels=None,
                 det_thresh=0.5):
        self.logging_dir = logging_dir
        self.prefix = prefix
        # makedirs(..., exist_ok=True) avoids the check-then-create race of
        # the previous exists()/mkdir pair and also creates missing parents.
        os.makedirs(images_path, exist_ok=True)
        self.images_path = images_path
        self.class_names = class_names
        self.batch_size = batch_size
        self.mean_pixels = mean_pixels
        self.det_thresh = det_thresh
        try:
            from tensorboard import SummaryWriter
            self.summary_writer = SummaryWriter(logging_dir)
        except ImportError:
            logging.error(
                'You can install tensorboard via `pip install tensorboard`.')

    def __call__(self, param):
        """Callback to log detections and gt-boxes as an image in TensorBoard."""
        if param.locals is None:
            return
        result = []
        pad = param.locals['eval_batch'].pad
        images = param.locals['eval_batch'].data[0][0:self.batch_size - pad].asnumpy()
        labels = param.locals['eval_batch'].label[0][0:self.batch_size - pad].asnumpy()
        # Strip the padded (duplicated) tail samples from every network output.
        outputs = [out[0:out.shape[0] - pad]
                   for out in param.locals['self'].get_outputs()]
        # 'det' variable can be in different positions depending with train/test symbols
        if len(outputs) > 1:
            det_idx = [idx for idx, f in enumerate(param.locals['self'].output_names)
                       if f.startswith('det')][0]
            detections = outputs[det_idx].asnumpy()
        else:
            detections = outputs[0].asnumpy()
        for i in range(detections.shape[0]):
            det = detections[i, :, :]
            det = det[np.where(det[:, 0] >= 0)[0]]  # drop invalid (-1) rows
            label = labels[i, :, :]
            label = label[np.where(label[:, 0] >= 0)[0]]
            # Undo mean subtraction and convert CHW to HWC, swapping channels
            # for plotting.
            img = images[i, :, :, :] + np.reshape(self.mean_pixels, (3, 1, 1))
            img = img.astype(np.uint8)
            img = img.transpose([1, 2, 0])
            img[:, :, (0, 1, 2)] = img[:, :, (2, 1, 0)]
            plt_path = os.path.join(self.images_path, 'image' + str(i) + '.png')
            self._visualize_detection_and_labels(
                img, det, label=label, classes=self.class_names,
                thresh=self.det_thresh, plt_path=plt_path)
            # save to tensorboard
            # NOTE(review): scipy.misc.imread is deprecated and removed in
            # modern SciPy; switch to imageio.imread when upgrading.
            img_det_graph = scipy.misc.imread(plt_path)
            self.summary_writer.add_image('image' + str(i) + '.png',
                                          img_det_graph)
        return result

    def _visualize_detection_and_labels(self, img, dets, label, classes=None,
                                        thresh=None, plt_path=None):
        """Render one image with its ground-truth boxes and detections.

        Parameters
        ----------
        img : numpy.array
            image in HWC layout (channels already swapped by the caller)
        dets : numpy.array
            ssd detections, numpy.array([[id, score, x1, y1, x2, y2]...])
            each row is one object; coordinates are relative (0..1)
        label : numpy.array
            ground-truth rows [[id, x1, y1, x2, y2]...], relative coordinates
        classes : tuple or list of str
            class names; default changed from a shared mutable ``[]`` to None
        thresh : float
            score threshold below which detections are not drawn
        plt_path : str
            path the rendered figure is saved to
        """
        # Avoid the mutable-default-argument pitfall of the original
        # ``classes=[]`` signature.
        classes = classes if classes is not None else []
        fig = plt.figure()
        plt.imshow(img)
        height = img.shape[0]
        width = img.shape[1]
        colors = dict()
        # Ground-truth boxes are always drawn in a fixed red.
        gt_color = (1.0, 0.0, 0.0)
        for i in range(label.shape[0]):
            cls_id = int(label[i, 0])
            if cls_id >= 0:
                xmin = int(label[i, 1] * width)
                ymin = int(label[i, 2] * height)
                xmax = int(label[i, 3] * width)
                ymax = int(label[i, 4] * height)
                rect = plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
                                     fill=False, edgecolor=gt_color,
                                     linewidth=2)
                plt.gca().add_patch(rect)
                # The label text is always 'gt'; the unused class-name lookup
                # the original performed here was dead code and is removed.
                plt.gca().text(xmin, ymin - 2, 'gt',
                               bbox=dict(facecolor=gt_color, alpha=0.5),
                               fontsize=8, color='white')
        # Detections above the threshold get one random color per class.
        for i in range(dets.shape[0]):
            cls_id = int(dets[i, 0])
            if cls_id >= 0:
                score = dets[i, 1]
                if score > thresh:
                    if cls_id not in colors:
                        colors[cls_id] = (random.random(), random.random(),
                                          random.random())
                    xmin = int(dets[i, 2] * width)
                    ymin = int(dets[i, 3] * height)
                    xmax = int(dets[i, 4] * width)
                    ymax = int(dets[i, 5] * height)
                    rect = plt.Rectangle((xmin, ymin), xmax - xmin,
                                         ymax - ymin, fill=False,
                                         edgecolor=colors[cls_id],
                                         linewidth=3.5)
                    plt.gca().add_patch(rect)
                    class_name = str(cls_id)
                    if classes and len(classes) > cls_id:
                        class_name = classes[cls_id]
                    plt.gca().text(xmin, ymin - 2,
                                   '{:s} {:.3f}'.format(class_name, score),
                                   bbox=dict(facecolor=colors[cls_id],
                                             alpha=0.5),
                                   fontsize=8, color='white')
        plt.savefig(plt_path)
        plt.close(fig)
from __future__ import print_function import argparse import os import random import torch import torch.nn as nn import torch.nn.parallel import torch.backends.cudnn as cudnn import torch.optim as optim import torch.utils.data import torchvision.datasets as dset import torchvision.transforms as transforms import torchvision.utils as vutils from torch.autograd import Variable from tensorboard import SummaryWriter writer = SummaryWriter('runs') parser = argparse.ArgumentParser() parser.add_argument('--dataset', required=True, help='cifar10 | lsun | imagenet | folder | lfw ') parser.add_argument('--dataroot', required=True, help='path to dataset') parser.add_argument('--workers', type=int, help='number of data loading workers', default=2) parser.add_argument('--batchSize', type=int, default=64, help='input batch size') parser.add_argument('--imageSize', type=int,
n_minibatches_validation = np.ceil(len(data_validation) / batch_size) # Fix the number of epochs to train for n_epochs = 10 threshold = 0.01 * 10 ** (-21) # ------------------------------------------------------------------------- # SET UP A LOGGER FOR TENSORBOARD VISUALIZATION # ------------------------------------------------------------------------- run_start = datetime.datetime.now() log_name = [run_start, distances, sample_size, initial_lr, threshold] log_name_formatted = '[{:%Y-%m-%d_%H:%M}]-[{}]-[{}]-[lr_{:.1e}]-'\ '[thresh_{:.2e}]'.format(*log_name) writer = SummaryWriter(log_dir='logs/{}'.format(log_name_formatted)) writer.add_text(tag='Description', text_string='(Description missing.)') # ------------------------------------------------------------------------- # TRAIN THE NET FOR THE GIVEN NUMBER OF EPOCHS # ------------------------------------------------------------------------- print('\nStart training: Training on {} examples, validating on {} ' 'examples\n'.format(len(data_train), len(data_validation))) # ------------------------------------------------------------------------- for epoch in range(n_epochs): print('Epoch {}/{}'.format(epoch+1, n_epochs))
from torch.autograd.variable import Variable
import torch.utils.data
import torchvision.datasets as dset
import torchvision.transforms as transforms
import os
from tensorboard import SummaryWriter
from datetime import datetime
from torchvision.utils import make_grid
from torch.nn import functional as F
import numpy as np
import matplotlib.pyplot as plt

# Directory where generated samples are written.
check_root = './generate/draw_noattn'
# NOTE(review): destructive -- wipes ALL previous TensorBoard runs on every
# start; consider removing or guarding this.
os.system('rm -rf ./runs/*')
# One event-file directory per run, named by the start timestamp.
writer = SummaryWriter('./runs/' + datetime.now().strftime('%B%d %H:%M:%S'))
if not os.path.exists(check_root):
    os.mkdir(check_root)

# Hyper-parameters for the DRAW (no attention) model.
batch_size = 512  # picture
#batch_size = 300 # cartoon
seq_len = 20           # number of sequential canvas-drawing steps
img_size = 64
enc_hidden_size = 800  # encoder RNN hidden size
dec_hidden_size = 1600 # decoder RNN hidden size
nz = 100               # latent dimension

# ``draw`` is defined elsewhere in this file/project.
model = draw(seq_len)
model.cuda()
def __init__(self,
             model: Model,
             optimizer: torch.optim.Optimizer,
             iterator: DataIterator,
             train_dataset: Iterable[Instance],
             validation_dataset: Optional[Iterable[Instance]] = None,
             patience: int = 2,
             validation_metric: str = "-loss",
             num_epochs: int = 20,
             serialization_dir: Optional[str] = None,
             num_serialized_models_to_keep: int = None,
             keep_serialized_model_every_num_seconds: int = None,
             model_save_interval: float = None,
             cuda_device: Union[int, List] = -1,
             grad_norm: Optional[float] = None,
             grad_clipping: Optional[float] = None,
             learning_rate_scheduler: Optional[PytorchLRScheduler] = None,
             histogram_interval: int = None) -> None:
    """
    Parameters
    ----------
    model : ``Model``, required.
        An AllenNLP model to be optimized. Pytorch Modules can also be optimized if
        their ``forward`` method returns a dictionary with a "loss" key, containing a
        scalar tensor representing the loss function to be optimized.
    optimizer : ``torch.nn.Optimizer``, required.
        An instance of a Pytorch Optimizer, instantiated with the parameters of the
        model to be optimized.
    iterator : ``DataIterator``, required.
        A method for iterating over a ``Dataset``, yielding padded indexed batches.
    train_dataset : ``Dataset``, required.
        A ``Dataset`` to train on. The dataset should have already been indexed.
    validation_dataset : ``Dataset``, optional, (default = None).
        A ``Dataset`` to evaluate on. The dataset should have already been indexed.
    patience : int, optional (default=2)
        Number of epochs to be patient before early stopping.
    validation_metric : str, optional (default="-loss")
        Validation metric to measure for whether to stop training using patience
        and whether to serialize an ``is_best`` model each epoch. The metric name
        must be prepended with either "+" or "-", which specifies whether the metric
        is an increasing or decreasing function.
    num_epochs : int, optional (default = 20)
        Number of training epochs.
    serialization_dir : str, optional (default=None)
        Path to directory for saving and loading model files. Models will not be saved if
        this parameter is not passed.
    num_serialized_models_to_keep : ``int``, optional (default=None)
        Number of previous model checkpoints to retain.  Default is to keep all checkpoints.
    keep_serialized_model_every_num_seconds : ``int``, optional (default=None)
        If num_serialized_models_to_keep is not None, then occasionally it's useful to
        save models at a given interval in addition to the last
        num_serialized_models_to_keep.  To do so, specify
        keep_serialized_model_every_num_seconds as the number of seconds between
        permanently saved checkpoints.  Only used if num_serialized_models_to_keep
        is not None, otherwise all checkpoints are kept.
    model_save_interval : ``float``, optional (default=None)
        If provided, then serialize models every ``model_save_interval`` seconds
        within single epochs.  In all cases, models are also saved at the end of
        every epoch if ``serialization_dir`` is provided.
    cuda_device : ``int`` or ``list``, optional (default = -1)
        An integer specifying the CUDA device to use. If -1, the CPU is used.
        A list enables (experimental) multi-GPU data-parallel training.
    grad_norm : ``float``, optional, (default = None).
        If provided, gradient norms will be rescaled to have a maximum of this value.
    grad_clipping : ``float``, optional (default = ``None``).
        If provided, gradients will be clipped `during the backward pass` to have an
        (absolute) maximum of this value.  If you are getting ``NaNs`` in your
        gradients during training that are not solved by using ``grad_norm``, you
        may need this.
    learning_rate_scheduler : ``PytorchLRScheduler``, optional, (default = None)
        A Pytorch learning rate scheduler. The learning rate will be decayed with
        respect to this schedule at the end of each epoch. If you use
        :class:`torch.optim.lr_scheduler.ReduceLROnPlateau`, this will use the
        ``validation_metric`` provided to determine if learning has plateaued.
        To support updating the learning rate on every batch, this can optionally
        implement ``step_batch(batch_num)``.
    histogram_interval : ``int``, optional, (default = ``None``)
        If not None, then log histograms to tensorboard every
        ``histogram_interval`` batches (model parameters, update/parameter norm
        ratios, and activations of modules with ``should_log_activations``).
        Histogram logging requires GPU-CPU copies and is typically slow, so log
        infrequently.
    """
    self._model = model
    self._iterator = iterator
    self._optimizer = optimizer
    self._train_data = train_dataset
    self._validation_data = validation_dataset
    self._patience = patience
    self._num_epochs = num_epochs
    self._serialization_dir = serialization_dir
    self._num_serialized_models_to_keep = num_serialized_models_to_keep
    self._keep_serialized_model_every_num_seconds = keep_serialized_model_every_num_seconds
    self._serialized_paths: List[Any] = []
    self._last_permanent_saved_checkpoint_time = time.time()
    self._model_save_interval = model_save_interval
    self._grad_norm = grad_norm
    self._batch_grad_norm = None  # populated during training once gradients exist
    self._grad_clipping = grad_clipping
    self._learning_rate_scheduler = learning_rate_scheduler

    # The metric string is "+name" or "-name"; the sign says whether larger
    # (+) or smaller (-) values are better.
    increase_or_decrease = validation_metric[0]
    if increase_or_decrease not in ["+", "-"]:
        raise ConfigurationError(
            "Validation metrics must specify whether they should increase "
            "or decrease by pre-pending the metric name with a +/-.")
    self._validation_metric = validation_metric[1:]
    self._validation_metric_decreases = increase_or_decrease == "-"

    # Single isinstance with a tuple replaces the original double check.
    if not isinstance(cuda_device, (int, list)):
        raise ConfigurationError(
            "Expected an int or list for cuda_device, got {}".format(
                cuda_device))
    if isinstance(cuda_device, list):
        # This is a warning, so emit it at warning level (the original used
        # logger.info with a redundant "WARNING:" prefix in an f-string that
        # interpolated nothing).
        logger.warning(
            "Multiple GPU support is experimental not recommended for use. "
            "In some cases it may lead to incorrect results or undefined behavior."
        )
        self._multiple_gpu = True
        self._cuda_devices = cuda_device
        # data_parallel will take care of transfering to cuda devices,
        # so the iterator keeps data on CPU.
        self._iterator_device = -1
    else:
        self._multiple_gpu = False
        self._cuda_devices = [cuda_device]
        self._iterator_device = cuda_device
    if self._cuda_devices[0] != -1:
        self._model = self._model.cuda(self._cuda_devices[0])

    self._log_interval = 10  # seconds
    self._summary_interval = 100  # num batches between logging to tensorboard
    self._histogram_interval = histogram_interval
    self._log_histograms_this_batch = False
    self._batch_num_total = 0
    self._last_log = 0.0  # time of last logging

    # Separate train/validation event files so TensorBoard overlays them.
    if serialization_dir is not None:
        train_log = SummaryWriter(
            os.path.join(serialization_dir, "log", "train"))
        validation_log = SummaryWriter(
            os.path.join(serialization_dir, "log", "validation"))
        self._tensorboard = TensorboardWriter(train_log, validation_log)
    else:
        self._tensorboard = TensorboardWriter()
# NOTE(review): the next three lines are the tail of a ``forward`` method
# whose ``def`` line lies outside this chunk (truncated).  F.log_softmax
# without an explicit ``dim`` is deprecated -- TODO confirm the intended
# dimension.
i = F.dropout(i, 0.35)
i = F.log_softmax(i)
return i


def classification_accuracy(out, labels):
    # need the argmax of the network output (translated from "mi servono argmax")
    _, out = torch.max(out, 1)
    # Fraction of predictions that match the labels.
    accuracy = torch.sum(out == labels).float()
    accuracy /= len(out)
    return accuracy


net = Net().cuda()
# One TensorBoard run directory per start, named by timestamp.
writer = SummaryWriter('runs/' + datetime.now().strftime('%B%d %H:%M:%S'))
# Trace the graph by running one dummy forward pass; assumes input shape
# (1, 75, features_size) -- TODO confirm against MyDataset's output.
writer.add_graph(
    net,
    net(Variable(torch.rand(1, 75, features_size), requires_grad=True).cuda()))
loader = DataLoader(MyDataset("data/text_1", input_len=75, output_len=1),
                    batch_size=64,
                    shuffle=True)
# net = net.cuda()
optimizer = Adam(params=net.parameters(), lr=0.001)
# loss
# NLLLoss pairs with the log_softmax output above.
loss = nn.NLLLoss()
batch_number = len(loader)
num_epochs = 500
def do_training(args, module, data_train, data_val, begin_epoch=0):
    """Train the speech-to-text module, checkpointing and logging to TensorBoard.

    Parameters
    ----------
    args : parsed arguments object whose ``config`` holds all train settings
    module : the (already constructed) MXNet module to train
    data_train : training data iterator (provides data/label shapes)
    data_val : validation data iterator
    begin_epoch : int
        epoch to resume from; parameters are only re-initialized when this is
        0 and mode is 'train'
    """
    from distutils.dir_util import mkpath
    from log_util import LogUtil

    log = LogUtil().getlogger()
    mkpath(os.path.dirname(get_checkpoint_path(args)))

    # Pull every knob out of the config once, up front.
    seq_len = args.config.get('arch', 'max_t_count')
    batch_size = args.config.getint('common', 'batch_size')
    save_checkpoint_every_n_epoch = args.config.getint('common', 'save_checkpoint_every_n_epoch')
    save_checkpoint_every_n_batch = args.config.getint('common', 'save_checkpoint_every_n_batch')
    enable_logging_train_metric = args.config.getboolean('train', 'enable_logging_train_metric')
    enable_logging_validation_metric = args.config.getboolean('train', 'enable_logging_validation_metric')

    contexts = parse_contexts(args)
    num_gpu = len(contexts)
    eval_metric = STTMetric(batch_size=batch_size, num_gpu=num_gpu, seq_length=seq_len,
                            is_logging=enable_logging_validation_metric, is_epoch_end=True)
    # tensorboard setting
    loss_metric = STTMetric(batch_size=batch_size, num_gpu=num_gpu, seq_length=seq_len,
                            is_logging=enable_logging_train_metric, is_epoch_end=False)

    optimizer = args.config.get('train', 'optimizer')
    momentum = args.config.getfloat('train', 'momentum')
    learning_rate = args.config.getfloat('train', 'learning_rate')
    learning_rate_annealing = args.config.getfloat('train', 'learning_rate_annealing')

    mode = args.config.get('common', 'mode')
    num_epoch = args.config.getint('train', 'num_epoch')
    clip_gradient = args.config.getfloat('train', 'clip_gradient')
    weight_decay = args.config.getfloat('train', 'weight_decay')
    save_optimizer_states = args.config.getboolean('train', 'save_optimizer_states')
    show_every = args.config.getint('train', 'show_every')

    n_epoch = begin_epoch
    if clip_gradient == 0:
        clip_gradient = None  # 0 in the config means "no clipping"

    module.bind(data_shapes=data_train.provide_data,
                label_shapes=data_train.provide_label,
                for_training=True)
    if begin_epoch == 0 and mode == 'train':
        module.init_params(initializer=get_initializer(args))

    lr_scheduler = SimpleLRScheduler(learning_rate=learning_rate)

    def reset_optimizer(force_init=False):
        # (Re-)initialize the optimizer; 'adam' takes no momentum parameter.
        if optimizer == "sgd":
            module.init_optimizer(kvstore='device',
                                  optimizer=optimizer,
                                  optimizer_params={'lr_scheduler': lr_scheduler,
                                                    'momentum': momentum,
                                                    'clip_gradient': clip_gradient,
                                                    'wd': weight_decay},
                                  force_init=force_init)
        elif optimizer == "adam":
            module.init_optimizer(kvstore='device',
                                  optimizer=optimizer,
                                  optimizer_params={'lr_scheduler': lr_scheduler,
                                                    'clip_gradient': clip_gradient,
                                                    'wd': weight_decay},
                                  force_init=force_init)
        else:
            raise Exception('Supported optimizers are sgd and adam. If you want to implement others define them in train.py')

    if mode == "train":
        reset_optimizer(force_init=True)
    else:
        reset_optimizer(force_init=False)

    # tensorboard setting
    tblog_dir = args.config.get('common', 'tensorboard_log_dir')
    summary_writer = SummaryWriter(tblog_dir)

    while True:
        if n_epoch >= num_epoch:
            break
        loss_metric.reset()
        log.info('---------train---------')
        for nbatch, data_batch in enumerate(data_train):
            module.forward_backward(data_batch)
            module.update()
            # tensorboard setting: only sample the train metric every
            # show_every batches to keep overhead down
            if (nbatch + 1) % show_every == 0:
                module.update_metric(loss_metric, data_batch.label)
            #summary_writer.add_scalar('loss batch', loss_metric.get_batch_loss(), nbatch)
            if (nbatch + 1) % save_checkpoint_every_n_batch == 0:
                log.info('Epoch[%d] Batch[%d] SAVE CHECKPOINT', n_epoch, nbatch)
                module.save_checkpoint(
                    prefix=get_checkpoint_path(args) + "n_epoch" + str(n_epoch) + "n_batch",
                    epoch=(int((nbatch + 1) / save_checkpoint_every_n_batch) - 1),
                    save_optimizer_states=save_optimizer_states)

        # commented for Libri_sample data set to see only train cer
        log.info('---------validation---------')
        data_val.reset()
        eval_metric.reset()
        for nbatch, data_batch in enumerate(data_val):
            # when is_train = False it leads to high cer when batch_norm
            module.forward(data_batch, is_train=True)
            module.update_metric(eval_metric, data_batch.label)

        # tensorboard setting
        val_cer, val_n_label, val_l_dist, _ = eval_metric.get_name_value()
        # FIX: this log message was one string literal broken across lines in
        # the original source; reconstructed here.
        log.info("Epoch[%d] val cer=%f (%d / %d)",
                 n_epoch, val_cer, int(val_n_label - val_l_dist), val_n_label)
        curr_acc = val_cer
        summary_writer.add_scalar('CER validation', val_cer, n_epoch)
        assert curr_acc is not None, 'cannot find Acc_exclude_padding in eval metric'

        data_train.reset()

        # tensorboard setting
        train_cer, train_n_label, train_l_dist, train_ctc_loss = loss_metric.get_name_value()
        summary_writer.add_scalar('loss epoch', train_ctc_loss, n_epoch)
        summary_writer.add_scalar('CER train', train_cer, n_epoch)

        # save checkpoints
        if n_epoch % save_checkpoint_every_n_epoch == 0:
            log.info('Epoch[%d] SAVE CHECKPOINT', n_epoch)
            module.save_checkpoint(prefix=get_checkpoint_path(args),
                                   epoch=n_epoch,
                                   save_optimizer_states=save_optimizer_states)

        n_epoch += 1
        # NOTE(review): this sets a CONSTANT annealed rate
        # (learning_rate / learning_rate_annealing) every epoch rather than
        # compounding the annealing -- confirm whether progressive decay was
        # intended before changing.
        lr_scheduler.learning_rate = learning_rate / learning_rate_annealing

    log.info('FINISH')