def run_epoch(e, network, dataloader, trainer, log_dir, print_name, is_train):
    """Run one pass over `dataloader`, updating `network` when `is_train`.

    Logs the mean CTC loss for the epoch to tensorboard and, on selected
    epochs, an image annotated with the decoded predictions of the first
    batch.  Returns the mean loss as a float.
    """
    cumulative_loss = nd.zeros(1, ctx)
    for batch_idx, (x, y) in enumerate(dataloader):
        x = x.as_in_context(ctx)
        y = y.as_in_context(ctx)
        with autograd.record(train_mode=is_train):
            output = network(x)
            loss_ctc = ctc_loss(output, y)
        if is_train:
            loss_ctc.backward()
            trainer.step(x.shape[0])
        # On the first batch of every SEND_IMAGE_EVERY_N-th epoch, decode the
        # network output and push a preview image to tensorboard.
        if batch_idx == 0 and e % SEND_IMAGE_EVERY_N == 0 and e > 0:
            predictions = output.softmax().topk(axis=2).asnumpy()
            decoded_text = decode(predictions)
            output_image = draw_text_on_image(x.asnumpy(), decoded_text)
            # Clamp pixel values into [0, 1] before logging.
            output_image[output_image < 0] = 0
            output_image[output_image > 1] = 1
            print("{} first decoded text = {}".format(print_name, decoded_text[0]))
            with SummaryWriter(logdir=log_dir, verbose=False, flush_secs=5) as sw:
                sw.add_image('bb_{}_image'.format(print_name), output_image, global_step=e)
        cumulative_loss += loss_ctc.mean()
    epoch_loss = float(cumulative_loss.asscalar()) / len(dataloader)
    with SummaryWriter(logdir=log_dir, verbose=False, flush_secs=5) as sw:
        sw.add_scalar('loss', {print_name: epoch_loss}, global_step=e)
    return epoch_loss
def _init_train(self):
    """Prepare per-device anchors, loss functions, the optimizer and the
    tensorboard writer, creating the backup directory if needed."""
    from mxboard import SummaryWriter
    self.record_step = record_step
    self.loss_name = ['score', 'box', 'class']
    # One copy of the anchor grid on each compute device.
    self.nd_all_anchors = [self.all_anchors.copyto(device) for device in ctx]
    self.get_default_ltrb()
    # Losses used by the different heads.
    self.L1_loss = gluon.loss.L1Loss()
    self.L2_loss = gluon.loss.L2Loss()
    self.LG_loss = gluon.loss.LogisticLoss(label_format='binary')
    self.CE_loss = gluon.loss.SoftmaxCrossEntropyLoss(
        from_logits=False, sparse_label=False)
    self.trainer = gluon.Trainer(
        self.net.collect_params(), 'adam', {'learning_rate': 0.0001})
    self.sw = SummaryWriter(logdir=version + '/logs')
    if not os.path.exists(self.backup_dir):
        os.makedirs(self.backup_dir)
def test_add_multiple_scalars():
    """add_scalar must accept a raw float, a (name, value) tuple, a
    [name, value] list and a dict, and create one event sub-directory per
    named scalar under the tag directory."""
    sw = SummaryWriter(logdir=_LOGDIR)
    sw.add_scalar(tag='test_multiple_scalars', value=np.random.uniform(), global_step=0)
    sw.add_scalar(tag='test_multiple_scalars', value=('scalar1', np.random.uniform()), global_step=0)
    sw.add_scalar(tag='test_multiple_scalars', value=['scalar2', np.random.uniform()], global_step=0)
    sw.add_scalar(tag='test_multiple_scalars',
                  value={'scalar3': np.random.uniform(), 'scalar4': np.random.uniform()},
                  global_step=0)
    # The log dir should hold exactly the tag directory plus one event file.
    entries = os.listdir(_LOGDIR)
    assert len(entries) == 2
    assert 'test_multiple_scalars' in entries
    entries.remove('test_multiple_scalars')
    assert entries[0].startswith(_EVENT_FILE_PREFIX)
    print(entries[0])
    assert file_exists(os.path.join(_LOGDIR, entries[0]))
    named_scalar_dir = os.path.join(_LOGDIR, 'test_multiple_scalars')
    assert dir_exists(named_scalar_dir)
    # scalar1..scalar4 each get their own event sub-directory.
    for idx in range(1, 5):
        sub_dir = os.path.join(named_scalar_dir, 'scalar%d' % idx)
        assert dir_exists(sub_dir)
        sub_entries = os.listdir(sub_dir)
        assert len(sub_entries) == 1
        assert sub_entries[0].startswith(_EVENT_FILE_PREFIX)
def __init__(self, logging_dir, prefix=None):
    """Create a tensorboard SummaryWriter for `logging_dir`.

    :param logging_dir: directory where event files are written
    :param prefix: optional string prepended to metric names when logging
    """
    self.prefix = prefix
    try:
        from mxboard import SummaryWriter
        self.summary_writer = SummaryWriter(logging_dir)
    except ImportError:
        # Best-effort: without mxboard, `summary_writer` is never set and
        # later logging calls would fail with AttributeError.
        logging.error('You can install mxboard via `pip install mxboard`.')
def __init__(self, model, run_id, gpu_idxs=None, tensorboard_logging=False):
    """
    Parameters
    ----------
    model: HybridBlock
    gpu_idxs: None or list of ints
        If None will set context to CPU.
        If list of ints, will set context to given GPUs.
    """
    logging.info("Using Module Learner.")
    model.hybridize()
    logging.info("Hybridized model.")
    # Trace the hybridized Gluon block into a symbolic graph and wrap it in a
    # Module with an explicit softmax output layer.
    data_sym = mx.sym.var('data')
    pre_output = model(data_sym)
    output = mx.sym.SoftmaxOutput(pre_output, name='softmax')
    context = get_context(gpu_idxs)
    self.module = mx.mod.Module(symbol=output, context=context,
                                data_names=['data'], label_names=['softmax_label'])
    self.tensorboard_logging = tensorboard_logging
    if self.tensorboard_logging:
        from mxboard import SummaryWriter
        # Event files go to <repo>/logs/tensorboard/<run_id>.
        current_folder = os.path.dirname(os.path.realpath(__file__))
        tensorboard_folder = os.path.join(current_folder, "..", "logs", "tensorboard")
        summary_filepath = os.path.join(tensorboard_folder, run_id)
        self.writer = SummaryWriter(logdir=summary_filepath)
def __call__(self, mxb_writer: mxboard.SummaryWriter, samples_processed: int, *args, **kwargs):
    """Log a gradient histogram per parameter, at most once every
    `self._freq` processed samples."""
    if samples_processed - self._last_call <= self._freq:
        return
    self._last_call = samples_processed
    for name, param in self._params.items():
        if param.grad_req == 'null':
            continue  # this parameter does not accumulate gradients
        mxb_writer.add_histogram(name, param.grad().asnumpy(),
                                 samples_processed, bins=10)
def __init__( self, net, val_data, train_config: TrainConfig, train_objects: TrainObjects, use_rtpt: bool, ): """ Class for training the neural network. :param net: The NN with loaded parameters that shall be trained. :param val_data: The validation data loaded with gluon DataLoader. :param train_config: An instance of the TrainConfig data class. :param train_objects: Am omstamce pf the TrainObject data class. :param use_rtpt: If True, an RTPT object will be created and modified within this class. """ # Too many instance attributes (29/7) - Too many arguments (24/5) - Too many local variables (25/15) # Too few public methods (1/2) self.tc = train_config self.to = train_objects if self.to.metrics is None: self.to.metrics = {} self._ctx = get_context(train_config.context, train_config.device_id) self._net = net self._graph_exported = False self._val_data = val_data # define a summary writer that logs data and flushes to the file every 5 seconds if self.tc.log_metrics_to_tensorboard: self.sum_writer = SummaryWriter(logdir=self.tc.export_dir + "logs", flush_secs=5, verbose=False) # Define the two loss functions self._softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss( sparse_label=self.tc.sparse_policy_label) self._l2_loss = gluon.loss.L2Loss() if self.tc.optimizer_name != "nag": raise NotImplementedError( "The requested optimizer %s Isn't supported yet." % self.tc.optimizer_name) self._trainer = gluon.Trainer( self._net.collect_params(), "nag", { "learning_rate": self.to.lr_schedule(0), "momentum": self.to.momentum_schedule(0), "wd": self.tc.wd, }, ) # collect parameter names for logging the gradients of parameters in each epoch self._params = self._net.collect_params() self._param_names = self._params.keys() self.ordering = list( range(self.tc.nb_parts) ) # define a list which describes the order of the processed batches self.use_rtpt = use_rtpt self.rtpt = None # Set this later in training function
def train(train_data, val_data, epochs, ctx):
    """Train `net` with triplet semi-hard loss, logging batch and epoch loss
    to tensorboard and periodically evaluating + exporting the model.

    Parameters
    ----------
    train_data, val_data : gluon.data.DataLoader
    epochs : int
        Number of epochs to run.
    ctx : mx.Context or list of mx.Context
        Device(s) to train on; a single context is wrapped in a list.
    """
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': 0.001})
    # Init contrastive loss
    loss_fn = TripletSemiHardLoss()
    global_step = 0
    for epoch in range(epochs):
        train_loss = 0
        num_batch = len(train_data)
        tbar = tqdm(train_data)
        for i, batch in enumerate(tbar):
            batch_loss = 0
            # BUGFIX: use the `ctx` argument (normalised to a list above)
            # instead of the module-level `context`, so the devices chosen by
            # the caller are actually honoured.
            data = mx.gluon.utils.split_and_load(batch[0], ctx_list=ctx,
                                                 batch_axis=0, even_split=False)
            label = mx.gluon.utils.split_and_load(batch[1], ctx_list=ctx,
                                                  batch_axis=0, even_split=False)
            with ag.record():
                losses = []
                for x, y in zip(data, label):
                    embs = net(x)
                    losses.append(loss_fn(embs, y))
            for l in losses:
                l.backward()
                batch_loss += l.mean().asscalar()
            trainer.step(batch_size)
            train_loss += sum([l.sum().asscalar() for l in losses])
            global_step += batch_size
            with SummaryWriter(logdir=log_dir, verbose=False) as sw:
                sw.add_scalar(tag="BatchLoss", value=batch_loss, global_step=global_step)
        train_loss /= batch_size * num_batch
        with SummaryWriter(logdir=log_dir, verbose=False) as sw:
            sw.add_scalar(tag="TrainLoss", value=train_loss, global_step=global_step)
        if epoch % save_period == 0:
            # Test on first device
            print("Test and visualize")
            test(val_data, ctx, epoch)
            net.export("{}/{}".format(save_dir, model_name), epoch=epoch)
def __init__(self, train_fn, args=None, resource=None,
             searcher=None, search_options=None,
             checkpoint='./exp/checkpoint.ag',
             resume=False, num_trials=None,
             time_out=None, max_reward=1.0, time_attr='epoch',
             reward_attr='accuracy',
             visualizer='none', dist_ip_addrs=None):
    """FIFO trial scheduler.

    :param train_fn: an `_autogluon_method`-wrapped training function
    :param args: arguments forwarded to `train_fn` (defaults to `train_fn.args`)
    :param resource: per-trial resources, e.g. ``{'num_cpus': 1, 'num_gpus': 0}``
    :param searcher: a `BaseSearcher` instance or a searcher name ('random', ...)
    :param search_options: kwargs forwarded to the searcher factory
    :param checkpoint: path used to save/restore scheduler state
    :param resume: restore state from `checkpoint` when True
    :param num_trials: maximum number of trials (stop criterion)
    :param time_out: wall-clock limit in seconds (stop criterion)
    :param max_reward: reward threshold that stops the search (stop criterion)
    :param time_attr: reported-result key used as the time axis
    :param reward_attr: reported-result key used as the reward
    :param visualizer: 'tensorboard' or 'mxboard' enables mxboard logging
    :param dist_ip_addrs: worker addresses for distributed scheduling
    """
    super(FIFOScheduler, self).__init__(dist_ip_addrs)
    if resource is None:
        resource = {'num_cpus': 1, 'num_gpus': 0}
    if searcher is None:
        searcher = 'random'  # Default: Random searcher
    if search_options is None:
        search_options = dict()
    assert isinstance(train_fn, _autogluon_method)
    self.train_fn = train_fn
    self.args = args if args else train_fn.args
    self.resource = resource
    if isinstance(searcher, str):
        # Build the searcher by name, passing the function's config space.
        kwargs = search_options.copy()
        kwargs['configspace'] = train_fn.cs
        self.searcher = searcher_factory(searcher, **kwargs)
    else:
        assert isinstance(searcher, BaseSearcher)
        self.searcher = searcher
    # meta data
    self.metadata = {}
    self.metadata['search_space'] = train_fn.kwspaces
    # NOTE(review): `keys` is computed but never used in this method —
    # possibly a leftover or consumed by a subclass; verify before removing.
    keys = copy.deepcopy(list(self.metadata['search_space'].keys()))
    self.metadata['search_strategy'] = searcher
    self.metadata['stop_criterion'] = {
        'time_limits': time_out, 'max_reward': max_reward}
    self.metadata['resources_per_trial'] = resource
    self.num_trials = num_trials
    self.time_out = time_out
    self.max_reward = max_reward
    self._checkpoint = checkpoint
    self._time_attr = time_attr
    self._reward_attr = reward_attr
    self.visualizer = visualizer.lower()
    if self.visualizer == 'tensorboard' or self.visualizer == 'mxboard':
        try_import_mxboard()
        from mxboard import SummaryWriter
        # Event files live next to the checkpoint, under '<checkpoint-stem>/logs'.
        self.mxboard = SummaryWriter(
            logdir=os.path.join(os.path.splitext(checkpoint)[0], 'logs'),
            flush_secs=3,
            verbose=False)
    self.log_lock = mp.Lock()
    self.training_history = OrderedDict()
    self.config_history = OrderedDict()
    if resume:
        if os.path.isfile(checkpoint):
            self.load_state_dict(load(checkpoint))
        else:
            msg = 'checkpoint path {} is not available for resume.'.format(checkpoint)
            logger.exception(msg)
            raise FileExistsError(msg)
def __init__(self, save_path=None, **kwargs):
    """Tensorboard-backed stats writer.

    :param save_path: required log directory; event files are written here
        while the base ``StatsWriter`` saves under its ``save`` sub-directory.
    :raises ValueError: if `save_path` is empty or None
    """
    if not save_path:
        raise ValueError('save_path not specified')
    from mxboard import SummaryWriter
    logdir = save_path
    # Re-point the base writer at a 'save' sub-directory of the log dir.
    save_path = os.path.join(logdir, 'save')
    os.makedirs(logdir, exist_ok=True)
    self.sw = SummaryWriter(logdir=logdir)
    StatsWriter.__init__(self, save_path=save_path, **kwargs)
def __call__(self, mxb_writer: mxboard.SummaryWriter, samples_processed: int, *args, **kwargs):
    """Render model-generated samples as an image, at most once every
    `self._freq` processed samples."""
    if samples_processed - self._last_call <= self._freq:
        return
    self._last_call = samples_processed
    # Draw samples from the model and reshape the flat output into images.
    samples = self._nn.generate(*self._conditioning_variables).asnumpy()
    img = samples.reshape((samples.shape[0], *self._image_shape))
    mxb_writer.add_image('Generated_image', img, samples_processed)
def __init__(self, config, model, criterion, ctx, sample_input):
    """Set up output directories, logging, tensorboard, the optimizer and
    optional resume/finetune checkpoints for training `model` with `criterion`.

    :param config: nested dict of trainer/dataset settings
    :param model: the network to train (must expose `model_name`)
    :param criterion: loss function
    :param ctx: mxnet device context
    :param sample_input: dummy input used to trace the graph for tensorboard
    """
    config['trainer']['output_dir'] = os.path.join(str(pathlib.Path(os.path.abspath(__name__)).parent),
                                                   config['trainer']['output_dir'])
    config['name'] = config['name'] + '_' + model.model_name
    self.save_dir = os.path.join(config['trainer']['output_dir'], config['name'])
    self.checkpoint_dir = os.path.join(self.save_dir, 'checkpoint')
    self.alphabet = config['dataset']['alphabet']
    # Fresh run (no resume/finetune): wipe any previous output directory.
    if config['trainer']['resume_checkpoint'] == '' and config['trainer']['finetune_checkpoint'] == '':
        shutil.rmtree(self.save_dir, ignore_errors=True)
    if not os.path.exists(self.checkpoint_dir):
        os.makedirs(self.checkpoint_dir)
    # Save this experiment's alphabet alongside the saved models.
    save(list(self.alphabet), os.path.join(self.save_dir, 'dict.txt'))
    self.global_step = 0
    self.start_epoch = 0
    self.config = config
    self.model = model
    self.criterion = criterion
    # logger and tensorboard
    self.tensorboard_enable = self.config['trainer']['tensorboard']
    self.epochs = self.config['trainer']['epochs']
    self.display_interval = self.config['trainer']['display_interval']
    if self.tensorboard_enable:
        from mxboard import SummaryWriter
        self.writer = SummaryWriter(self.save_dir, verbose=False)
    self.logger = setup_logger(os.path.join(self.save_dir, 'train.log'))
    self.logger.info(pformat(self.config))
    self.logger.info(self.model)
    # device set
    self.ctx = ctx
    mx.random.seed(2)  # fix the random seed for reproducibility
    self.logger.info('train with mxnet: {} and device: {}'.format(mx.__version__, self.ctx))
    self.metrics = {'val_acc': 0, 'train_loss': float('inf'), 'best_model': ''}
    schedule = self._initialize('lr_scheduler', mx.lr_scheduler)
    optimizer = self._initialize('optimizer', mx.optimizer, lr_scheduler=schedule)
    self.trainer = gluon.Trainer(self.model.collect_params(), optimizer=optimizer)
    # NOTE(review): `_laod_checkpoint` looks like a typo of `_load_checkpoint`
    # — the method must exist elsewhere under this exact name; verify.
    if self.config['trainer']['resume_checkpoint'] != '':
        self._laod_checkpoint(self.config['trainer']['resume_checkpoint'], resume=True)
    elif self.config['trainer']['finetune_checkpoint'] != '':
        self._laod_checkpoint(self.config['trainer']['finetune_checkpoint'], resume=False)
    if self.tensorboard_enable:
        try:
            # add graph
            from mxnet.gluon import utils as gutils
            self.model(sample_input)
            self.writer.add_graph(model)
        except:  # best-effort: graph export failures must not abort training
            self.logger.error(traceback.format_exc())
            self.logger.warn('add graph to tensorboard failed')
def __init__(self, logdir, keys=('val_acc', 'val_loss')):
    """Log named metrics to tensorboard (`logdir`/tb) and to a
    semicolon-separated CSV history file (`logdir`/history.csv).

    Parameters
    ----------
    logdir : str
        Base directory for logs.
    keys : list or tuple of str
        Metric names; also written as the CSV header.

    Raises
    ------
    ValueError
        If `keys` is not a list or tuple.
    """
    # BUGFIX/idiom: the default was a mutable list (shared across all
    # instances); an immutable tuple is safe and behaviourally identical.
    if not isinstance(keys, (list, tuple)):
        raise ValueError("Keys should be a list or a tuple.")
    self.keys = keys
    self.sw = SummaryWriter(logdir=os.path.join(logdir, 'tb'))
    self.csv_path = os.path.join(logdir, 'history.csv')
    # Write the CSV header once, truncating any previous history file.
    with open(self.csv_path, 'w') as f:
        f.write(";".join(keys) + "\n")
def __init__(self, config):
    """Create separate train/validation summary writers under
    <repo>/logs/train and <repo>/logs/val, creating the dirs if missing."""
    self.config = config
    base_dir = os.path.dirname(os.path.dirname(__file__))
    self.train_summary_dir = os.path.join(base_dir, "logs", "train")
    self.validate_summary_dir = os.path.join(base_dir, "logs", "val")
    for summary_dir in (self.train_summary_dir, self.validate_summary_dir):
        if not os.path.exists(summary_dir):
            os.makedirs(summary_dir)
    self.train_summary_writer = SummaryWriter(self.train_summary_dir)
    self.validate_summary_writer = SummaryWriter(self.validate_summary_dir)
def run_epoch(e, network, dataloader, trainer, print_name, update_network, save_network, print_output):
    """Run one epoch of the denoiser; optionally update the network and log a
    sample of decoded predictions to tensorboard.

    Parameters
    ----------
    e : int
        Current epoch number (used for logging cadence).
    network : Block
        Model called as network(x, y).
    trainer : sequence of gluon.Trainer
        Two trainers stepped together each batch.
    update_network, save_network, print_output : bool
        Control backprop, checkpointing (currently disabled) and text logging.

    Returns
    -------
    float
        Mean loss over the epoch.
    """
    total_loss = nd.zeros(1, ctx)
    # BUGFIX: `batch_loss` was accumulated below but never initialised,
    # raising UnboundLocalError on the first batch.
    batch_loss = nd.zeros(1, ctx)
    for i, (x, y) in enumerate(dataloader):
        x = x.as_in_context(ctx)
        y = y.as_in_context(ctx)
        with autograd.record():
            output = network(x, y)
            loss = loss_func(output, y)
        if update_network:
            loss.backward()
            # Two trainers (e.g. encoder/decoder parameters) step together.
            trainer[0].step(y.shape[0])
            trainer[1].step(y.shape[0])
        total_loss += loss.mean()
        batch_loss += loss.mean()
    epoch_loss = float(total_loss.asscalar()) / len(dataloader)
    if print_output and e % print_text_every_n == 0 and e > 0:
        # Render a markdown table of predicted / actual / noisy text for the
        # last batch of the epoch.
        text = "predicted\t| actual\t| noisy \n ---- | ---- | ---- \n"
        for n in range(y.shape[0]):
            out_np = output.asnumpy()[n, :]
            y_np = y.asnumpy()[n, :]
            x_np = x.asnumpy()[n, :]
            out_np_max = np.argmax(out_np, axis=1)
            out_decoded = decode(out_np_max)
            y_decoded = decode(y_np)
            x_decoded = decode(x_np)
            output_text = out_decoded + "\t| " + y_decoded + "\t| " + x_decoded
            text += output_text + "\n"
        with SummaryWriter(logdir="./logs", verbose=False, flush_secs=5) as sw:
            sw.add_text(tag='{}_text'.format(print_name), text=text, global_step=e)
        print("output {}".format(text))
    with SummaryWriter(logdir="./logs", verbose=False, flush_secs=5) as sw:
        sw.add_scalar('loss', {print_name: epoch_loss}, global_step=e)
    return epoch_loss
class TensorboardStatsWriter(StatsWriter):
    """StatsWriter that mirrors every recorded scalar into tensorboard."""

    def __init__(self, save_path=None, **kwargs):
        # `save_path` doubles as the tensorboard log directory; the base
        # writer saves its own files under a 'save' sub-directory.
        if not save_path:
            raise ValueError('save_path not specified')
        from mxboard import SummaryWriter
        logdir = save_path
        base_save_path = os.path.join(logdir, 'save')
        os.makedirs(logdir, exist_ok=True)
        self.sw = SummaryWriter(logdir=logdir)
        StatsWriter.__init__(self, save_path=base_save_path, **kwargs)

    def _write(self, idx, key, value):
        # Forward one (step, tag, value) sample to tensorboard.
        self.sw.add_scalar(key, value, idx)
def __init__(self, config):
    """Store hyper-parameters from `config`, pick the compute context,
    build the model and open a tensorboard writer.

    :param config: namespace-like object carrying batch_size, epochs, N, K,
        input_dims, GPU_COUNT and logdir
    """
    # setting hyper-parameters
    self.config = config
    self.batch_size = config.batch_size
    self.epochs = config.epochs
    # N-way / K-shot few-shot settings — presumably; verify against caller.
    self.N = config.N
    self.K = config.K
    self.input_dims = config.input_dims
    self.GPU_COUNT = config.GPU_COUNT
    self.ctx = setting_ctx(self.GPU_COUNT)
    self.build_model()
    # Suffix distinguishes this run's event files.
    self.writer = SummaryWriter(logdir=self.config.logdir, filename_suffix="_SNAIL")
class TensorboardCallback(object):
    """Log metrics periodically in TensorBoard.

    Works much like `callback.Speedometer`, but writes a TensorBoard event
    file for visualization. For more usage, please refer
    https://github.com/dmlc/tensorboard

    Parameters
    ----------
    logging_dir : str
        TensorBoard event file directory. After that, use
        `tensorboard --logdir=path/to/logs` to launch TensorBoard visualization.
    total_step : int
        Initial value of the internal global-step counter.
    prefix : str
        Prefix for a metric name of `scalar` value, useful to tell apart
        train and eval curves that share the same metric name.
    """

    def __init__(self, logging_dir, total_step=0, prefix=None):
        self.prefix = prefix
        self.step = total_step
        try:
            from mxboard import SummaryWriter
            self.summary_writer = SummaryWriter(logging_dir)
        except ImportError:
            logging.error(
                'You can install tensorboard via `pip install mxboard`.')

    def __call__(self, param, name_value=None):
        """Callback to log training speed and metrics in TensorBoard."""
        # Explicitly supplied values are logged at the current step ...
        if name_value:
            self._add_scalar(name_value)
        if param.eval_metric is None:
            return
        # ... then the step advances and the metric's own values are logged.
        self.step += 1
        metric_values = param.eval_metric.get_name_value()
        if metric_values:
            self._add_scalar(metric_values)

    def _add_scalar(self, name_value):
        for metric_name, metric_value in name_value:
            tag = metric_name if self.prefix is None else '%s-%s' % (self.prefix, metric_name)
            self.summary_writer.add_scalar(tag=tag, value=metric_value,
                                           global_step=self.step)
def __init__(self, args):
    """Build the full PSPNet training pipeline from parsed CLI `args`:
    datasets/dataloaders, data-parallel model + evaluator, criterion,
    poly LR schedule and SGD trainer, plus a tensorboard writer."""
    self.args = args
    self.sw = SummaryWriter(logdir='logs', flush_secs=5)
    # image transform (ImageNet mean/std normalisation)
    input_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([.485, .456, .406], [.229, .224, .225]),
    ])
    # dataset and dataloader
    data_kwargs = {'transform': input_transform, 'base_size': args.base_size,
                   'crop_size': args.crop_size}
    trainset = COCOSemantic(split='train', mode='train', **data_kwargs)
    valset = COCOSemantic(split='val', mode='val', **data_kwargs)
    self.train_data = gluon.data.DataLoader(
        trainset, args.batch_size, shuffle=True, last_batch='rollover',
        num_workers=args.workers)
    self.eval_data = gluon.data.DataLoader(valset, args.test_batch_size,
                                           last_batch='rollover', num_workers=args.workers)
    model = model_zoo.PSPNet(nclass=trainset.NUM_CLASS, backbone='resnet50',
                             aux=True, pretrained_base=True)
    model.cast(args.dtype)
    # Wrap model and evaluator for multi-device (optionally sync-BN) execution.
    self.net = DataParallelModel(model, args.ctx, args.syncbn)
    self.evaluator = DataParallelModel(SegEvalModel(model), args.ctx)
    # resume checkpoint if needed
    if args.resume is not None:
        if os.path.isfile(args.resume):
            model.load_parameters(args.resume, ctx=args.ctx)
        else:
            raise RuntimeError("=> no checkpoint found at '{}'" \
                .format(args.resume))
    # create criterion
    criterion = MixSoftmaxCrossEntropyLoss(args.aux, aux_weight=args.aux_weight)
    self.criterion = DataParallelCriterion(criterion, args.ctx, args.syncbn)
    # optimizer and lr scheduling ('poly' decay over all iterations)
    self.lr_scheduler = LRScheduler(mode='poly', baselr=args.lr,
                                    niters=len(self.train_data),
                                    nepochs=args.epochs)
    kv = mx.kv.create(args.kvstore)
    optimizer_params = {'lr_scheduler': self.lr_scheduler,
                        'wd': args.weight_decay,
                        'momentum': args.momentum}
    if args.dtype == 'float16':
        optimizer_params['multi_precision'] = True
    if args.no_wd:
        # Disable weight decay on BN scale/shift parameters and biases.
        for k, v in self.net.module.collect_params('.*beta|.*gamma|.*bias').items():
            v.wd_mult = 0.0
    self.optimizer = gluon.Trainer(self.net.module.collect_params(), 'sgd',
                                   optimizer_params, kvstore=kv)
    # evaluation metrics
    # NOTE(review): `trainset.num_class` (lowercase) here vs `trainset.NUM_CLASS`
    # above — confirm both attributes exist on the dataset class.
    self.metric = gluoncv.utils.metrics.SegmentationMetric(trainset.num_class)
def test_evaluate(self):
    """End-to-end smoke test: build an STGCN on a small PeMS slice, run
    trainer.evaluate with a tensorboard writer and check logs are produced."""
    from model import hybrid_model
    from model import trainer
    from data_loader.data_utils import data_gen
    import numpy as np
    from mxboard import SummaryWriter
    import os
    import shutil

    ctx = mx.gpu(1)
    num_of_vertices = 897
    batch_size = 50

    PeMS_dataset = data_gen('datasets/PeMSD7_V_897.csv', 24)
    print('>> Loading dataset with Mean: {0:.2f}, STD: {1:.2f}'.format(
        PeMS_dataset.mean, PeMS_dataset.std))

    # First 12 time steps form the input window, the remainder the target.
    test_split = PeMS_dataset['test'].transpose((0, 3, 1, 2))
    test_x, test_y = test_split[:100, :, :12, :], test_split[:100, :, 12:, :]
    test_loader = gluon.data.DataLoader(
        gluon.data.ArrayDataset(nd.array(test_x), nd.array(test_y)),
        batch_size=batch_size, shuffle=False)
    print(test_x.shape, test_y.shape)

    cheb_polys = nd.random_uniform(shape=(num_of_vertices, num_of_vertices * 3))
    blocks = [[1, 32, 64], [64, 32, 128]]
    dummy_input = nd.random_uniform(shape=(batch_size, 1, 12, num_of_vertices), ctx=ctx)
    net = hybrid_model.STGCN(12, 3, 3, blocks, 1.0, num_of_vertices, cheb_polys)
    net.initialize(ctx=ctx)
    net.hybridize()
    net(dummy_input)  # warm-up forward pass to build the cached graph

    # De-normalise the labels back to the original value scale.
    ground_truth = (np.concatenate([y.asnumpy() for x, y in test_loader], axis=0)
                    * PeMS_dataset.std + PeMS_dataset.mean)[:100]

    if os.path.exists('test_logs'):
        shutil.rmtree('test_logs')
    sw = SummaryWriter('test_logs', flush_secs=5)
    trainer.evaluate(net, ctx, ground_truth, test_loader, 12,
                     PeMS_dataset.mean, PeMS_dataset.std, sw, 0)
    self.assertEqual(os.path.exists('test_logs'), True)
    sw.close()
    if os.path.exists('test_logs'):
        shutil.rmtree('test_logs')
class LogMetricsCallback(object):
    """Log metrics periodically in TensorBoard.

    Works much like `callback.Speedometer`, but writes a TensorBoard event
    file for visualization. For more usage, please refer
    https://github.com/dmlc/tensorboard

    Parameters
    ----------
    logging_dir : str
        TensorBoard event file directory. After that, use
        `tensorboard --logdir=path/to/logs` to launch TensorBoard visualization.
    prefix : str
        Prefix for a metric name of `scalar` value, useful to separate train
        and eval curves that share the same metric name.

    Examples
    --------
    >>> # log train and eval metrics under different directories.
    >>> training_log = 'logs/train'
    >>> evaluation_log = 'logs/eval'
    >>> batch_end_callbacks = [mx.contrib.tensorboard.LogMetricsCallback(training_log)]
    >>> eval_end_callbacks = [mx.contrib.tensorboard.LogMetricsCallback(evaluation_log)]
    >>> model.fit(train,
    >>>           batch_end_callback=batch_end_callbacks,
    >>>           eval_end_callback=eval_end_callbacks)
    >>> # Then use `tensorboard --logdir=logs/` to launch TensorBoard visualization.
    """

    def __init__(self, logging_dir, prefix=None):
        self.prefix = prefix
        try:
            from mxboard import SummaryWriter
            self.summary_writer = SummaryWriter(logging_dir)
        except ImportError:
            logging.error('You can install mxboard via `pip install mxboard`.')

    def __call__(self, param):
        """Callback to log training speed and metrics in TensorBoard."""
        if param.eval_metric is None:
            return
        for metric_name, metric_value in param.eval_metric.get_name_value():
            tag = metric_name if self.prefix is None else '%s-%s' % (self.prefix, metric_name)
            self.summary_writer.add_scalar(tag, metric_value, global_step=param.epoch)
class Logger:
    """mxboard-based logger with separate train and validation writers."""

    def __init__(self, config):
        self.config = config
        base_dir = os.path.dirname(os.path.dirname(__file__))
        self.train_summary_dir = os.path.join(base_dir, "logs", "train")
        self.validate_summary_dir = os.path.join(base_dir, "logs", "val")
        for summary_dir in (self.train_summary_dir, self.validate_summary_dir):
            if not os.path.exists(summary_dir):
                os.makedirs(summary_dir)
        self.train_summary_writer = SummaryWriter(self.train_summary_dir)
        self.validate_summary_writer = SummaryWriter(self.validate_summary_dir)

    def _writer_for(self, summarizer):
        # Select the writer matching the requested phase ("train" vs anything else).
        return (self.train_summary_writer if summarizer == "train"
                else self.validate_summary_writer)

    def data_summarize(self, step, summarizer="train", summaries_dict=None):
        """Write a dict of scalar summaries.

        :param step: the step of the summary
        :param summarizer: use the train summary writer or the validate one
        :param summaries_dict: the dict of the summaries values (tag, value)
        """
        summary_writer = self._writer_for(summarizer)
        if summaries_dict is not None:
            for tag, value in summaries_dict.items():
                summary_writer.add_scalar(tag=tag, value=value, global_step=step)
            summary_writer.flush()

    def graph_summary(self, net, summarizer="train"):
        """Log the network graph using a dummy all-ones input tensor sized
        from the config (channels, height, width)."""
        summary_writer = self._writer_for(summarizer)
        input_to_model = mxnet.ndarray.ones(
            shape=(1, self.config['num_channels'], self.config['img_height'],
                   self.config['img_width']),
            dtype='float32')
        summary_writer.add_graph(net, (input_to_model, ))

    def close(self):
        self.train_summary_writer.close()
        self.validate_summary_writer.close()
def _init_train(self):
    """Set up the experiment name, per-device anchors (optionally fp16),
    losses, optimizer and tensorboard writer for a training run."""
    self.exp = datetime.datetime.now().strftime("%m-%dx%H-%M")
    self.exp = self.exp + '_' + self.dataset
    # Effective batch size scales with the number of devices.
    self.batch_size *= len(self.ctx)

    print(global_variable.yellow)
    print('Training Title = {}'.format(self.exp))
    print('Batch Size = {}'.format(self.batch_size))
    print('Record Step = {}'.format(self.record_step))
    print('Step = {}'.format(self.steps))
    print('Area = {}'.format(self.area))

    self.backward_counter = self.train_counter_start
    # One anchor copy per device; cast anchors to fp16 when requested.
    self.nd_all_anchors = [self.all_anchors.copyto(dev) for dev in self.ctx]
    self._get_default_ltrb()
    if self.use_fp16:
        self.nd_all_anchors = self.fp32_2_fp16(self.nd_all_anchors)
        self.all_anchors_ltrb = self.fp32_2_fp16(self.all_anchors_ltrb)

    self.HB_loss = gluon.loss.HuberLoss()
    self.LG_loss = gluon.loss.LogisticLoss(label_format='binary')
    self.CE_loss = gluon.loss.SoftmaxCrossEntropyLoss(
        from_logits=False, sparse_label=False)

    # -------------------- init trainer -------------------- #
    optimizer = mx.optimizer.create(
        'adam', learning_rate=self.learning_rate, multi_precision=self.use_fp16)
    self.trainer = gluon.Trainer(self.net.collect_params(), optimizer=optimizer)

    # -------------------- init tensorboard -------------------- #
    logdir = self.version + '/logs'
    self.sw = SummaryWriter(logdir=logdir, verbose=False)

    if not os.path.exists(self.backup_dir):
        os.makedirs(self.backup_dir)
def __init__(self, rank, size, prefix_dir):
    """Per-worker loss/speed monitor writing to a timestamped tensorboard dir.

    :param rank: this worker's rank
    :param size: total number of workers
    :param prefix_dir: base output directory for the run
    """
    self.batch_size = config.batch_size
    self.rank = rank
    self.size = size
    self.prefix_dir = prefix_dir
    self.frequent = config.frequent
    self.init = False
    self.tic = 0
    self.last_count = 0
    self.loss_metric = MetricNdarray()
    # Group event files by month_day_hour so separate runs stay apart.
    now = time.localtime()
    log_subdir = "%s_%s_%s" % (str(now.tm_mon), str(now.tm_mday), str(now.tm_hour))
    self.summary_writer = SummaryWriter(
        logdir=os.path.join(self.prefix_dir, "log_tensorboard", log_subdir),
        verbose=False)
def test_add_graph_symbol():
    """`_get_nodes_from_symbol` / `_net2pb` must produce the expected GraphDef
    for a minimal convolution symbol, and `add_graph` must write an event file."""
    data = mx.sym.Variable('data')
    conv = mx.sym.Convolution(data, kernel=(2, 2), num_filter=2)
    actual_nodes = _get_nodes_from_symbol(conv)
    # Every conv-related node carries the same serialized hyper-parameter blob.
    param_attr = {'param': AttrValue(
        s='{ kernel : (2, 2) , num_filter : 2 }'.encode(encoding='utf-8'))}
    expected_nodes = [
        NodeDef(name='data', op='null'),
        NodeDef(name='convolution0/convolution0_weight', op='null', attr=param_attr),
        NodeDef(name='convolution0/convolution0_bias', op='null', attr=param_attr),
        NodeDef(name='convolution0/convolution0', op='Convolution',
                input=['data',
                       'convolution0/convolution0_weight',
                       'convolution0/convolution0_bias'],
                attr=param_attr),
    ]
    # check _get_nodes_from_symbol
    for expected_node, node in zip(expected_nodes, actual_nodes):
        assert expected_node == node
    # check _sym2pb
    expected_graph = GraphDef(node=expected_nodes, versions=VersionDef(producer=100))
    graph = _net2pb(conv)
    assert expected_graph == graph
    # check add_graph
    with SummaryWriter(logdir=_LOGDIR) as sw:
        sw.add_graph(conv)
    check_event_file_and_remove_logdir()
def check_add_pr_curve(labels, predictions, num_thresholds):
    """Write a PR-curve summary and verify that an event file was produced.

    Parameters mirror ``SummaryWriter.add_pr_curve``: ground-truth `labels`,
    model `predictions`, and the number of thresholds to sample.
    """
    with SummaryWriter(_LOGDIR) as sw:
        # BUGFIX: the keyword value was misspelled `num_threshodls`, which
        # raised NameError whenever this helper ran.
        sw.add_pr_curve(tag='test_add_pr_curve', labels=labels,
                        predictions=predictions, num_thresholds=num_thresholds)
    check_event_file_and_remove_logdir()
def run_epoch(e, network, dataloader, loss_function, trainer, log_dir, print_name,
              update_cnn, save_cnn, ctx=mx.gpu()):
    """One epoch of box-prediction training/evaluation.

    Logs the mean loss (and periodically a bounding-box visualisation of the
    first batch) to tensorboard; optionally saves the network. Returns the
    mean loss as a float.
    """
    total_loss = nd.zeros(1, ctx)
    for batch_idx, (data, label) in enumerate(dataloader):
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)
        with autograd.record():
            output = network(data)
            loss_i = loss_function(output, label)
        if update_cnn:
            loss_i.backward()
            trainer.step(data.shape[0])
        total_loss += loss_i.mean()
        # On visualisation epochs, render predicted vs. ground-truth boxes
        # for the first batch only.
        if e % send_image_every_n == 0 and e > 0 and batch_idx == 0:
            output_image = draw_box_on_image(output.asnumpy(), label.asnumpy(), data.asnumpy())
    epoch_loss = float(total_loss.asscalar()) / len(dataloader)
    with SummaryWriter(logdir=log_dir, verbose=False, flush_secs=5) as sw:
        sw.add_scalar('loss', {print_name: epoch_loss}, global_step=e)
        if e % send_image_every_n == 0 and e > 0:
            # Clamp pixel values into [0, 1] before logging the image.
            output_image[output_image < 0] = 0
            output_image[output_image > 1] = 1
            sw.add_image('bb_{}_image'.format(print_name), output_image, global_step=e)
    if save_cnn and e % save_every_n == 0 and e > 0:
        network.save_parameters("{}/{}".format(checkpoint_dir, checkpoint_name))
    return epoch_loss
def __init__(self, model_param, vocab_path, mode='train', vocab_tag_path=None,
             encoder_type='rnn', head_attention=False, decoder_cell='lstm', ctx=cpu()):
    """Build the model according to the given mode and parameters.

    :param mode: train|decode|test — what this model instance is used for
    :param vocab_path: path to the vocabulary file
    :param vocab_tag_path: path to the parse-tag vocabulary (optional)
    :param model_param: dict of model hyper-parameters
    """
    # TODO: make the encoder (Parsed | RNN), decoder (Headline | RNN) and the
    # decoder RNN cell (DLSTM | LSTM | GRU) fully selectable — the
    # `head_attention` and `decoder_cell` arguments are currently unused.
    self.vocab_path = vocab_path
    self.vocab = Vocab(vocab_path)
    if vocab_tag_path is not None:
        self.vocab_tag_path = vocab_tag_path
        self.vocab_tag = Vocab(vocab_tag_path)
    self.mode = mode
    self.loss = SoftmaxCrossEntropyLoss()
    self.model_param = model_param
    self.encoder_type = encoder_type
    if encoder_type == 'rnn':
        self.model = Seq2SeqRNN(self.vocab, 'LSTM', model_param['emb_size'],
                                model_param['hidden_size'], self.vocab.size, 60,
                                'Bahdanau', 'two_way', None, None, 0, 1, ctx=ctx)
    elif encoder_type == 'parse':
        self.model = ParseModel(self.vocab, self.vocab_tag, self.model_param, ctx)
    self.model.initialize(ctx=ctx)
    self.ctx = ctx
    self.trainer = Trainer(self.model.collect_params(), 'adam', {'learning_rate': 0.01})
    self.global_step = 0
    self.sw = SummaryWriter('./logs', flush_secs=2)
def plot_mxboard(block, logdir='./logs'):
    """Plot network to visualize internal structures.

    Parameters
    ----------
    block : mxnet.gluon.HybridBlock
        A hybridizable network to be visualized.
    logdir : str
        The directory to save.
    """
    try:
        from mxboard import SummaryWriter
    except ImportError:
        print('mxboard is required. Please install via `pip install mxboard` ' +
              'or refer to https://github.com/awslabs/mxboard.')
        raise
    # Trace the block into a symbol; group multi-output blocks into one symbol.
    input_sym = mx.sym.var('data')
    graph_sym = block(input_sym)
    if isinstance(graph_sym, tuple):
        graph_sym = mx.sym.Group(graph_sym)
    with SummaryWriter(logdir=logdir) as writer:
        writer.add_graph(graph_sym)
    usage = '`tensorboard --logdir={} --host=127.0.0.1 --port=8888`'.format(logdir)
    print('Log saved. Use: {} to visualize it'.format(usage))
def test(val_data, ctx, epoch):
    """Embed the validation set with the current net and log it to mxboard.

    Splits every batch across the given context list, runs it through `net`,
    concatenates all embeddings and labels, and writes them as a TensorBoard
    embedding tagged with the dataset name and epoch.
    """
    embedding = None
    labels = None
    images = None  # no sprite images are logged alongside the embedding
    for batch_data, batch_label in val_data:
        data_shards = gluon.utils.split_and_load(batch_data, ctx_list=ctx, batch_axis=0)
        label_shards = gluon.utils.split_and_load(batch_label, ctx_list=ctx, batch_axis=0)
        batch_out = mx.nd.concat(*[net(shard) for shard in data_shards], dim=0)
        batch_lab = mx.nd.concat(*label_shards, dim=0)
        if embedding is None:
            # First batch seeds the accumulators.
            embedding, labels = batch_out, batch_lab
        else:
            embedding = mx.nd.concat(embedding, batch_out, dim=0)
            labels = mx.nd.concat(labels, batch_lab, dim=0)
    with SummaryWriter(logdir=log_dir) as sw:
        sw.add_embedding(tag='{}_tripletnet_semihard_{}'.format(
            opt.dataset, epoch), embedding=embedding, labels=labels, images=images)
noise = mx.nd.zeros((opt.batchSize, nz, 1, 1)) fixed_noise = mx.ndarray.random.normal(shape=(opt.batchSize, nz, 1, 1)) one = mx.nd.array([1]) mone = one * -1 # setup optimizer if opt.adam: trainerD = Trainer(netD.collect_params(),optimizer='adam',optimizer_params={'learning_rate': opt.lrD,'beta1': opt.beta1,'beta2':0.999}) trainerG = Trainer(netG.collect_params(),optimizer='adam',optimizer_params={'learning_rate': opt.lrG, 'beta1': opt.beta1, 'beta2': 0.999}) else: trainerD = Trainer(netD.collect_params(),optimizer='rmsprop',optimizer_params={'learning_rate': opt.lrD,'gamma1':0.99,'gamma2':0.99,'epsilon':1e-12}) trainerG = Trainer(netG.collect_params(),optimizer='rmsprop', optimizer_params={'learning_rate': opt.lrG,'gamma1':0.99,'gamma2':0.99,'epsilon':1e-14}) print('start training') sw = SummaryWriter(logdir='./logs', flush_secs=5) netD.hybridize() netG.hybridize() gen_iterations = 0 for epoch in range(opt.niter): data_iter = iter(dataloader) i = 0 while i < len(dataloader): ############################ # (1) Update D network ########################### # train the discriminator Diters times if gen_iterations < 25 or gen_iterations % 500 == 0:
def do_training(args, module, data_train, data_val, begin_epoch=0):
    """Run the full train/validate/checkpoint loop for a speech-to-text module.

    :param args: parsed arguments object; all settings are read from args.config
    :param module: an MXNet module, or a sym_gen callable when bucketing is on
    :param data_train: training data iterator
    :param data_val: validation data iterator
    :param begin_epoch: epoch index to resume from (0 = fresh start)
    """
    from distutils.dir_util import mkpath
    from log_util import LogUtil

    log = LogUtil.getInstance().getlogger()
    mkpath(os.path.dirname(get_checkpoint_path(args)))

    #seq_len = args.config.get('arch', 'max_t_count')
    batch_size = args.config.getint('common', 'batch_size')
    save_checkpoint_every_n_epoch = args.config.getint('common', 'save_checkpoint_every_n_epoch')
    save_checkpoint_every_n_batch = args.config.getint('common', 'save_checkpoint_every_n_batch')
    enable_logging_train_metric = args.config.getboolean('train', 'enable_logging_train_metric')
    enable_logging_validation_metric = args.config.getboolean('train', 'enable_logging_validation_metric')

    contexts = parse_contexts(args)
    num_gpu = len(contexts)
    # Two metrics: eval_metric accumulates over a whole validation epoch,
    # loss_metric tracks the running training loss/CER.
    eval_metric = STTMetric(batch_size=batch_size, num_gpu=num_gpu, is_logging=enable_logging_validation_metric,is_epoch_end=True)
    # mxboard setting
    loss_metric = STTMetric(batch_size=batch_size, num_gpu=num_gpu, is_logging=enable_logging_train_metric,is_epoch_end=False)

    optimizer = args.config.get('optimizer', 'optimizer')
    learning_rate = args.config.getfloat('train', 'learning_rate')
    learning_rate_annealing = args.config.getfloat('train', 'learning_rate_annealing')

    mode = args.config.get('common', 'mode')
    num_epoch = args.config.getint('train', 'num_epoch')
    clip_gradient = args.config.getfloat('optimizer', 'clip_gradient')
    weight_decay = args.config.getfloat('optimizer', 'weight_decay')
    save_optimizer_states = args.config.getboolean('train', 'save_optimizer_states')
    show_every = args.config.getint('train', 'show_every')
    optimizer_params_dictionary = json.loads(args.config.get('optimizer', 'optimizer_params_dictionary'))
    kvstore_option = args.config.get('common', 'kvstore_option')
    n_epoch=begin_epoch
    is_bucketing = args.config.getboolean('arch', 'is_bucketing')

    # A configured clip value of 0 means "no gradient clipping".
    if clip_gradient == 0:
        clip_gradient = None
    if is_bucketing and mode == 'load':
        # Resume a bucketing model: rebuild the module, then load parameters
        # from the checkpoint encoded in the configured model file name.
        model_file = args.config.get('common', 'model_file')
        model_name = os.path.splitext(model_file)[0]
        # assumes the model name ends in a 4-digit epoch suffix — TODO confirm
        model_num_epoch = int(model_name[-4:])

        model_path = 'checkpoints/' + str(model_name[:-5])
        symbol, data_names, label_names = module(1600)
        model = STTBucketingModule(
            sym_gen=module,
            default_bucket_key=data_train.default_bucket_key,
            context=contexts)

        data_train.reset()

        model.bind(data_shapes=data_train.provide_data,
                   label_shapes=data_train.provide_label,
                   for_training=True)
        _, arg_params, aux_params = mx.model.load_checkpoint(model_path, model_num_epoch)
        model.set_params(arg_params, aux_params)
        # From here on, the freshly-bound bucketing model replaces the module.
        module = model
    else:
        module.bind(data_shapes=data_train.provide_data,
                    label_shapes=data_train.provide_label,
                    for_training=True)

    if begin_epoch == 0 and mode == 'train':
        module.init_params(initializer=get_initializer(args))

    # Shared scheduler object: end-of-epoch annealing below mutates its
    # learning_rate attribute, which the optimizer reads on every update.
    lr_scheduler = SimpleLRScheduler(learning_rate=learning_rate)

    def reset_optimizer(force_init=False):
        # (Re)build the optimizer with the shared lr_scheduler and any extra
        # options supplied via the config's optimizer_params_dictionary.
        optimizer_params = {'lr_scheduler': lr_scheduler,
                            'clip_gradient': clip_gradient,
                            'wd': weight_decay}
        optimizer_params.update(optimizer_params_dictionary)
        module.init_optimizer(kvstore=kvstore_option,
                              optimizer=optimizer,
                              optimizer_params=optimizer_params,
                              force_init=force_init)

    if mode == "train":
        reset_optimizer(force_init=True)
    else:
        reset_optimizer(force_init=False)
        data_train.reset()
        data_train.is_first_epoch = True

    #mxboard setting
    mxlog_dir = args.config.get('common', 'mxboard_log_dir')
    summary_writer = SummaryWriter(mxlog_dir)

    while True:
        if n_epoch >= num_epoch:
            break
        loss_metric.reset()
        log.info('---------train---------')
        for nbatch, data_batch in enumerate(data_train):
            module.forward_backward(data_batch)
            module.update()
            # mxboard setting
            # Metric is only updated every show_every batches, so the epoch
            # CER/loss below is a sampled estimate, not an exact average.
            if (nbatch + 1) % show_every == 0:
                module.update_metric(loss_metric, data_batch.label)
            #summary_writer.add_scalar('loss batch', loss_metric.get_batch_loss(), nbatch)
            if (nbatch+1) % save_checkpoint_every_n_batch == 0:
                log.info('Epoch[%d] Batch[%d] SAVE CHECKPOINT', n_epoch, nbatch)
                module.save_checkpoint(prefix=get_checkpoint_path(args)+"n_epoch"+str(n_epoch)+"n_batch",
                                       epoch=(int((nbatch+1)/save_checkpoint_every_n_batch)-1),
                                       save_optimizer_states=save_optimizer_states)

        # commented for Libri_sample data set to see only train cer
        log.info('---------validation---------')
        data_val.reset()
        eval_metric.reset()
        for nbatch, data_batch in enumerate(data_val):
            # when is_train = False it leads to high cer when batch_norm
            module.forward(data_batch, is_train=True)
            module.update_metric(eval_metric, data_batch.label)

        # mxboard setting
        val_cer, val_n_label, val_l_dist, _ = eval_metric.get_name_value()
        log.info("Epoch[%d] val cer=%f (%d / %d)", n_epoch, val_cer, int(val_n_label - val_l_dist), val_n_label)
        curr_acc = val_cer
        summary_writer.add_scalar('CER validation', val_cer, n_epoch)
        assert curr_acc is not None, 'cannot find Acc_exclude_padding in eval metric'

        data_train.reset()
        data_train.is_first_epoch = False

        # mxboard setting
        train_cer, train_n_label, train_l_dist, train_ctc_loss = loss_metric.get_name_value()
        summary_writer.add_scalar('loss epoch', train_ctc_loss, n_epoch)
        summary_writer.add_scalar('CER train', train_cer, n_epoch)

        # save checkpoints
        if n_epoch % save_checkpoint_every_n_epoch == 0:
            log.info('Epoch[%d] SAVE CHECKPOINT', n_epoch)
            module.save_checkpoint(prefix=get_checkpoint_path(args), epoch=n_epoch, save_optimizer_states=save_optimizer_states)

        n_epoch += 1

        # Anneal the learning rate once per epoch via the shared scheduler.
        lr_scheduler.learning_rate=learning_rate/learning_rate_annealing

    log.info('FINISH')
def do_training(num_epoch, optimizer, kvstore, learning_rate, model_prefix, decay):
    """Perform CapsNet training"""
    summary_writer = SummaryWriter(args.tblog_dir)
    # Shared scheduler object: the per-epoch decay below mutates its
    # learning_rate attribute in place.
    lr_scheduler = SimpleLRScheduler(learning_rate)
    module.init_params()
    module.init_optimizer(kvstore=kvstore, optimizer=optimizer,
                          optimizer_params={'lr_scheduler': lr_scheduler})

    epoch = 0
    while epoch < num_epoch:
        train_iter.reset()
        val_iter.reset()
        loss_metric.reset()

        # One pass over the training set: forward/backward, update, log.
        for batch_idx, batch in enumerate(train_iter):
            module.forward_backward(batch)
            module.update()
            module.update_metric(loss_metric, batch.label)
            loss_metric.get_batch_log(batch_idx)
        train_acc, train_loss, train_recon_err = loss_metric.get_name_value()

        # One forward-only pass over the validation set.
        loss_metric.reset()
        for batch_idx, batch in enumerate(val_iter):
            module.forward(batch)
            module.update_metric(loss_metric, batch.label)
            loss_metric.get_batch_log(batch_idx)
        val_acc, val_loss, val_recon_err = loss_metric.get_name_value()

        # Log the epoch-level scalars to mxboard.
        for tag, value in (('train_acc', train_acc),
                           ('train_loss', train_loss),
                           ('train_recon_err', train_recon_err),
                           ('val_acc', val_acc),
                           ('val_loss', val_loss),
                           ('val_recon_err', val_recon_err)):
            summary_writer.add_scalar(tag, value, epoch)

        print('Epoch[%d] train acc: %.4f loss: %.6f recon_err: %.6f' % (epoch, train_acc, train_loss, train_recon_err))
        print('Epoch[%d] val acc: %.4f loss: %.6f recon_err: %.6f' % (epoch, val_acc, val_loss, val_recon_err))

        print('SAVE CHECKPOINT')
        module.save_checkpoint(prefix=model_prefix, epoch=epoch)
        epoch += 1
        # Exponential decay of the learning rate for the next epoch.
        lr_scheduler.learning_rate = learning_rate * (decay ** epoch)