def train_step(self, batch):
    """Replace the default train_step function.

    Gradients are accumulated over up to
    ``alg_policy.num_individual_per_iter`` forward/backward passes (only one
    pass, on a randomly sampled path, while ``self.epoch`` is below
    ``alg_policy.warmup``); the gradient norm is then clipped and a single
    optimizer step is applied.

    :param batch: pair ``(input, target)`` passed to the model and the loss
    :return: dict holding the last loss value (``loss``), the last model
        output (``train_batch_output``) and the scheduler's ``lr``
    :rtype: dict
    """
    self.trainer.model.train()
    inputs, target = batch
    self.trainer.optimizer.zero_grad()

    def _to_device(array):
        # Single place that moves a NumPy array to the active accelerator.
        # On a device that is neither GPU nor NPU the host tensor is used,
        # instead of leaving the name unbound as the per-branch code did.
        tensor = torch.from_numpy(array)
        if vega.is_gpu_device():
            return tensor.cuda()
        if vega.is_npu_device():
            return tensor.npu()
        return tensor

    alphas = _to_device(self.alphas)
    for _ in range(self.alg_policy.num_individual_per_iter):
        # Drawn on every iteration, warmup included, so the NumPy random
        # stream advances exactly as before.
        i = np.random.randint(0, self.alg_policy.num_individual, 1)[0]
        warming_up = self.epoch < self.alg_policy.warmup
        if warming_up:
            alpha = _to_device(self.search_alg.random_sample_path())
        else:
            alpha = alphas[i]
        logits = self.trainer.model(inputs, alpha=alpha)
        loss = self.trainer.loss(logits, target)
        loss.backward(retain_graph=True)
        if warming_up:
            # Warmup trains a single random path per step.
            break
    nn.utils.clip_grad_norm_(self.trainer.model.parameters(),
                             self.trainer.config.grad_clip)
    self.trainer.optimizer.step()
    return {
        'loss': loss.item(),
        'train_batch_output': logits,
        'lr': self.trainer.lr_scheduler.get_lr()
    }
def cal_adversial_loss(self, netD, real, fake):
    """Compute the discriminator loss and back-propagate it.

    :param netD: the discriminator D
    :type netD: nn.Module
    :param real: real images
    :type real: tensor
    :param fake: fake images (detached before being scored)
    :type fake: tensor
    :return: discriminator loss, the mean of the real and fake terms
    :rtype: torch.FloatTensor
    """

    def _label_like(value, pred):
        # Constant label expanded to the prediction's shape, placed on the
        # NPU when vega reports one and on CUDA otherwise.
        label = torch.tensor(value).expand_as(pred)
        return label.npu() if vega.is_npu_device() else label.cuda()

    real_score = netD(real)
    self.real_label = _label_like(1.0, real_score)
    real_term = self.criterionGAN(real_score, self.real_label)

    fake_score = netD(fake.detach())
    self.fake_label = _label_like(0.0, fake_score)
    fake_term = self.criterionGAN(fake_score, self.fake_label)

    total = (real_term + fake_term) * 0.5
    total.backward()
    return total
def _init_ms_context(self):
    """Set the MindSpore context mode/target and the dataset sink flag.

    PYNATIVE mode is used only when ``self.config.execute_mode`` exists and
    equals ``"PYNATIVE_MODE"``; otherwise GRAPH mode is used.
    """
    wants_pynative = getattr(self.config, "execute_mode", None) == "PYNATIVE_MODE"
    mode = context.PYNATIVE_MODE if wants_pynative else context.GRAPH_MODE
    if vega.is_npu_device():
        context.set_context(mode=mode,
                            device_target="Ascend",
                            device_id=int(os.environ["DEVICE_ID"]))
    else:
        context.set_context(mode=mode, device_target="CPU")
    self.dataset_sink_mode = bool(vega.is_npu_device())
def find_best_PSNR(HR, SR, crop_size):
    """Calculate PSNR and find best PSNR between HR and SR with pixel offset.

    With ``crop_size == 0`` the PSNR of the full tensors is returned.
    Otherwise SR is cropped by ``crop_size`` on each spatial border and
    compared against every HR window of the same size at offsets
    ``0 .. 2 * crop_size`` in both spatial directions; the largest PSNR wins.

    :param HR: HR image
    :type HR: torch.FloatTensor/torch.cuda.FloatTensor
    :param SR: SR image
    :type SR: torch.FloatTensor/torch.cuda.FloatTensor
    :param crop_size: pixel offset when calculating psnr during evaluation
    :type crop_size: int
    :return: maximum psnr
    :rtype: Float
    """
    if crop_size == 0:
        return 20 * torch.log10(1 / torch.sqrt(torch.mean((HR - SR) ** 2))).cpu().item()
    SR = SR.squeeze()
    HR = HR.squeeze()
    SR_crop = SR[:, crop_size:-crop_size, crop_size:-crop_size]
    if vega.is_npu_device():
        # On NPU the comparison runs on host tensors; transfer once here
        # rather than once per offset inside the double loop.
        HR = HR.cpu()
        SR_crop = SR_crop.cpu()
    n_offsets = 2 * crop_size + 1
    height, width = SR_crop.shape[1], SR_crop.shape[2]
    psnr_grid = torch.zeros((n_offsets, n_offsets))
    for i in range(n_offsets):
        for j in range(n_offsets):
            HR_crop = HR[:, i:i + height, j:j + width]
            psnr = 20 * torch.log10(1 / torch.sqrt(torch.mean((HR_crop - SR_crop) ** 2)))
            psnr_grid[i, j] = psnr.detach().cpu().item()
    return psnr_grid.max().item()
def before_train(self, logs=None):
    """Be called before the train process.

    Measures flops, parameter count and forward latency of the current
    trainer model with a ``1x3x32x32`` probe input, then replaces the
    trainer model with the one returned by ``_generate_init_model`` and,
    on the torch backend, rebuilds the optimizer and lr scheduler for it.

    :param logs: not used by this callback
    """
    self.config = self.trainer.config
    gpu_flag = vega.is_gpu_device()
    # Index 0 when the GPU check returns exactly True; any other return
    # value is stored unchanged.
    self.device = 0 if gpu_flag is True else gpu_flag
    self.base_net_desc = self.trainer.model.desc
    sess_config = None
    if vega.is_torch_backend():
        # Start from a host tensor so the probe input also exists when the
        # device is neither NPU nor GPU (it used to be left unbound there).
        count_input = torch.FloatTensor(1, 3, 32, 32)
        if vega.is_npu_device():
            count_input = count_input.npu()
        elif vega.is_gpu_device():
            count_input = count_input.to(self.device)
    elif vega.is_tf_backend():
        count_input = tf.random.uniform([1, 3, 32, 32], dtype=tf.float32)
        sess_config = self.trainer._init_session_config()
    elif vega.is_ms_backend():
        count_input = mindspore.Tensor(
            np.random.randn(1, 3, 32, 32).astype(np.float32))
    self.flops_count, self.params_count = calc_model_flops_params(
        self.trainer.model, count_input)
    self.latency_count = calc_forward_latency(self.trainer.model,
                                              count_input, sess_config)
    logging.info("after prune model glops=%sM, params=%sK, latency=%sms",
                 self.flops_count * 1e-6, self.params_count * 1e-3,
                 self.latency_count * 1000)
    self.trainer.model = self._generate_init_model()
    if vega.is_torch_backend():
        self.trainer.optimizer = Optimizer()(
            model=self.trainer.model, distributed=self.trainer.distributed)
        self.trainer.lr_scheduler = LrScheduler()(self.trainer.optimizer)
def run_remote_worker(worker_id, worker_path, id, num_workers):
    """Run worker on remote node.

    For each index in ``range(num_workers)`` the saved config is loaded, the
    process environment and working directory are set from it, the worker is
    launched for the active device and its sub-processes are killed.

    :param worker_id: worker identifier, also used in the temp log file name
    :param worker_path: directory holding the worker files and the log
    :param id: identifier forwarded unchanged to ``_load_config`` and the
        launcher functions
    :param num_workers: number of worker indices to run
    :return: always 0
    :rtype: int
    """
    from vega.common.utils import init_log
    init_log(level="info",
             log_file=".temp_{}.log".format(worker_id),
             log_path=worker_path)
    for index in range(num_workers):
        config = _load_config(worker_id, worker_path, id, index)
        env = config["env"]
        if "LD_LIBRARY_PATH" in env and env["LD_LIBRARY_PATH"] is not None:
            os.environ["LD_LIBRARY_PATH"] = env["LD_LIBRARY_PATH"]
        os.environ["PWD"] = env["PWD"]
        os.chdir(os.environ["PWD"])
        vega.set_backend(os.environ['BACKEND_TYPE'].lower(),
                         os.environ["DEVICE_CATEGORY"])
        # Reset per index: without this an unsupported device raised
        # UnboundLocalError (or re-killed the previous index's pids).
        sub_pid_list = []
        if vega.is_gpu_device():
            sub_pid_list = call_in_gpu(config, id, worker_id, worker_path, index)
        elif vega.is_npu_device():
            os.environ["PYTHONPATH"] = env["PYTHONPATH"]
            os.environ["PATH"] = env["PATH"]
            os.environ["ASCEND_OPP_PATH"] = env["ASCEND_OPP_PATH"]
            sub_pid_list = call_in_npu(config, id, worker_id, worker_path, index)
        logging.info("DistributedWorker finished!")
        for sub_pid in sub_pid_list:
            kill_proc_tree(pid=sub_pid)
        logging.info("DistributedWorker subprocess cleaned!")
    return 0
def _init_tf_estimator(self):
    """Init tensorflow estimator.

    Builds the session config and hands it to the GPU or NPU estimator
    initializer; nothing else happens on any other device.
    """
    session_cfg = self._init_session_config()
    if vega.is_gpu_device():
        self._init_gpu_estimator(session_cfg)
        return
    if vega.is_npu_device():
        self._init_npu_estimator(session_cfg)
def _save_checkpoint(self, epoch, best=False):
    """Save model weights.

    One ``<epoch>_net_<name>.pth`` file is written per string entry of
    ``self.model.model_names`` under ``<worker_path>/<epoch>``.

    :param epoch: current epoch
    :type epoch: int
    :param best: when True, also write ``model_<name>.pth`` in the worker path
    :type best: bool
    """
    epoch_dir = os.path.join(self.worker_path, str(epoch))
    FileOps.make_dir(epoch_dir)
    for name in self.model.model_names:
        if not isinstance(name, str):
            continue
        save_path = FileOps.join_path(epoch_dir, '%s_net_%s.pth' % (epoch, name))
        net = getattr(self.model, 'net' + name)
        best_file = FileOps.join_path(self.worker_path,
                                      "model_{}.pth".format(name))
        if vega.is_gpu_device() and torch.cuda.is_available():
            # On GPU the weights are read from the wrapped ``net.module``.
            holder = net.module
        elif vega.is_npu_device():
            holder = net
        else:
            holder = net.cpu()
        torch.save(holder.state_dict(), save_path)
        if best:
            torch.save(holder.state_dict(), best_file)
def _init_model(self):
    """Load model desc from save path and parse to model.

    An existing ``self.trainer.model`` is reused; otherwise the model is
    built by ``ModelZoo.get_model`` from the description. On the torch
    backend the model is moved to the GPU/NPU and wrapped for multi-device
    training when ``General._parallel`` is set and more than one device per
    trainer is configured.

    :return: the prepared model
    """
    trainer = self.trainer
    model = trainer.model
    if trainer.config.is_detection_trainer:
        model_desc = trainer.model_desc or self._get_model_desc()
    else:
        model_desc = self._get_model_desc()
    pretrained_model_file = self._get_pretrained_model_file()
    if not model:
        if not model_desc:
            raise Exception(
                "Failed to Init model, can not get model description.")
        model = ModelZoo.get_model(model_desc, pretrained_model_file,
                                   ModelConfig.head)
    if not model:
        return model
    if hasattr(model, "desc"):
        self.trainer.model_desc = model.desc
    if not vega.is_torch_backend():
        return model

    import torch
    if vega.is_gpu_device():
        model = model.cuda()
        if General._parallel and General.devices_per_trainer > 1:
            model = torch.nn.DataParallel(model)
    elif vega.is_npu_device():
        model = model.npu()
        if General._parallel and General.devices_per_trainer > 1:
            import torch.distributed as dist
            dist.init_process_group(
                backend='hccl',
                world_size=int(os.environ['WORLD_SIZE']),
                rank=int(os.environ['RANK_ID']))
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[int(os.environ['DEVICE_ID'])])
    return model
def input_fn(self):
    """Return the next `batch_size` examples from this data set.

    Delegates to ``self.dataset.input_fn`` when the dataset provides one.
    Otherwise a ``tf.data`` pipeline is built over ``self.data_index``:
    shard (multi-worker training), repeat (training), optional shuffle,
    map and batch; off-NPU the mapping is parallelised and the batches are
    prefetched.

    :return: the dataset's own ``input_fn`` result, or the built dataset
    """
    if hasattr(self.dataset, "input_fn"):
        return self.dataset.input_fn()
    self._get_dataset_info()
    dataset = tf.data.Dataset.from_tensor_slices(
        (self.data_index, self.data_index))
    if self.dataset.mode == 'train' and self.dataset.world_size > 1:
        dataset = dataset.shard(self.dataset.world_size, self.dataset.rank)
    if self.dataset.mode == 'train':
        dataset = dataset.repeat()
    if self.args.shuffle:
        dataset = dataset.shuffle(buffer_size=self._num_examples)
    on_npu = vega.is_npu_device()
    if on_npu:
        # esr cannot adapt to num_parallel_calls on NPU
        dataset = dataset.map(self.data_map_func)
    else:
        dataset = dataset.map(
            self.data_map_func,
            num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.batch(batch_size=self.args.batch_size,
                            drop_remainder=self.args.drop_last)
    if not on_npu:
        # Same AUTOTUNE constant as the map call above; the tf.contrib
        # alias used previously is not available once tf.contrib is removed.
        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset
def set_parameters(self, name, value):
    """Set Parameters.

    :param name: attribute name under which the parameter is registered
    :param value: tensor wrapped in ``nn.Parameter`` (moved to the NPU/GPU
        first when vega reports that device)
    :return: the attribute now stored under ``name``
    """
    if vega.is_npu_device():
        placed = value.npu()
    elif vega.is_gpu_device():
        placed = value.cuda()
    else:
        placed = value
    self.register_parameter(name, nn.Parameter(placed))
    return getattr(self, name)
def _init_tf_estimator(self):
    """Init tensorflow estimator.

    Only acts on the TensorFlow backend: builds the session config and
    passes it to the GPU or NPU estimator initializer.
    """
    if vega.is_tf_backend():
        session_cfg = self._init_session_config()
        if vega.is_gpu_device():
            self._init_gpu_estimator(session_cfg)
        elif vega.is_npu_device():
            self._init_npu_estimator(session_cfg)
def make_batch(self, batch):
    """Prepare batch data for train_step.

    :param batch: pair ``(input, target)``
    :return: the pair, moved to the GPU/NPU unless ``config.prefetcher``
        is set
    """
    inputs, labels = batch
    if self.config.prefetcher:
        # With the prefetcher enabled the batch is passed through untouched.
        return inputs, labels
    if vega.is_gpu_device():
        return inputs.cuda(), labels.cuda()
    if vega.is_npu_device():
        return inputs.npu(), labels.npu()
    return inputs, labels
def _init_loss(self):
    """Init loss function from timm according to type in config.

    :return: instance of the ``timm.loss`` class named by
        ``config.loss.type``, built with the configured ``params`` and moved
        to the GPU/NPU when one is active
    """
    class_name = self.config.loss.type
    init_kwargs = self.config.loss().to_dict()["params"]
    timm_losses = importlib.import_module('timm.loss')
    criterion = getattr(timm_losses, class_name)(**init_kwargs)
    if vega.is_gpu_device():
        return criterion.cuda()
    if vega.is_npu_device():
        return criterion.npu()
    return criterion
def _init_distributed_setting(self):
    """Record world size and rank ids for distributed training.

    Does nothing unless ``self.distributed`` is set. On NPU the system
    initialize op is run in ``self.sess`` first. Horovod supplies the values
    on GPU; the ``get_rank_*`` helpers supply them otherwise.
    """
    if not self.distributed:
        return
    if vega.is_npu_device():
        self.npu_init = npu_ops.initialize_system()
        self.npu_shutdown = npu_ops.shutdown_system()
        self.sess.run(self.npu_init)
    if vega.is_gpu_device():
        self._world_size = hvd.size()
        self._rank_id = hvd.rank()
        self._local_rank_id = hvd.local_rank()
    else:
        self._world_size = get_rank_size()
        self._rank_id = get_rank_id()
        self._local_rank_id = get_local_rank_id()
def exclude_ignore_index(self, logits, labels):
    """Ignore certain index.

    Logits are transposed with permutation ``[0, 2, 3, 1]``. On GPU the
    positions whose label equals ``self.ignore_index`` are dropped from both
    tensors; on NPU they are kept, their labels are zeroed and a float mask
    is returned as the weight.

    :param logits: network output, transposed as described above
    :param labels: label tensor compared against ``self.ignore_index``
    :return: ``(logits, labels, weights)`` where weights is ``1.0`` on GPU
        and the float mask on NPU; ``None`` on any other device
    """
    logits = tf.transpose(logits, [0, 2, 3, 1])
    if vega.is_gpu_device():
        indices = tf.where(tf.not_equal(labels, self.ignore_index))
        labels = tf.cast(tf.gather_nd(labels, indices), tf.int32)
        logits = tf.gather_nd(logits, indices)
        return logits, labels, 1.0
    elif vega.is_npu_device():
        weights = tf.not_equal(labels, self.ignore_index)
        labels = tf.multiply(labels, tf.cast(weights, labels.dtype))
        # tf.cast replaces the deprecated tf.to_float (same float32 cast).
        return logits, labels, tf.cast(weights, tf.float32)
def _calc_workers_num(self):
    """Calculate workers numbers.

    :return: 1 when parallel search is off or the device is neither GPU
        nor NPU; otherwise the total device count divided (floor) by
        ``General.devices_per_trainer``
    :rtype: int
    """
    if not General.parallel_search:
        return 1
    # Default for devices that are neither GPU nor NPU; previously the
    # name was left unbound there and the return raised UnboundLocalError.
    worker_num = 1
    if vega.is_gpu_device():
        import torch
        world_size = General.env.world_size
        devices_per_node = torch.cuda.device_count()
        worker_num = (world_size * devices_per_node) // General.devices_per_trainer
    elif vega.is_npu_device():
        world_devices = int(os.environ['RANK_SIZE'])
        worker_num = world_devices // General.devices_per_trainer
    return worker_num
def make_batch(self, batch):
    """
    Make a batch data for ctr trainer.

    :param batch: a batch data
    :return: batch data, separate input and target, moved to the GPU/NPU
        when vega reports that device
    """
    features, labels = batch
    if vega.is_gpu_device():
        return (features.cuda(), labels.cuda())
    if vega.is_npu_device():
        return (features.npu(), labels.npu())
    return (features, labels)
def forward(self, input):
    """Do an inference on Identity.

    The input is moved to the CPU, quantized per tensor (scale 1.0, zero
    point 0, dtype picked from ``self._quant_type`` by ``self.quant_bit``),
    run through the parent ``forward`` and dequantized; the result is moved
    to the NPU/GPU when one is active.

    :param input: input tensor
    :return: dequantized output tensor
    """
    quantized = torch.quantize_per_tensor(
        input.cpu(), 1.0, 0, self._quant_type[self.quant_bit])
    result = torch.dequantize(super().forward(quantized))
    if vega.is_npu_device():
        return result.npu()
    if vega.is_gpu_device():
        return result.cuda()
    return result
def adjust_pipeline_config(self, cfg):
    """Adjust pipeline config according.

    Runs the timing helpers on deep copies of ``cfg`` with parallel search
    switched off, then rewrites the pipeline config from the computed worker
    count, ``self.epoch_time`` and ``self.params_dict``.

    :param cfg: pipeline config; ``cfg.general.parallel_search`` is the
        value restored into ``General.parallel_search`` afterwards
    """
    cfg_cp = copy.deepcopy(cfg)
    cfg_tiny = copy.deepcopy(cfg)
    workers_num = self._calc_workers_num()
    General.parallel_search = False
    try:
        self._get_time_params(cfg_cp)
        self._simulate_tiny_pipeline(cfg_tiny)
    finally:
        # Restore the configured flag even when a probe run raises, so the
        # process-wide setting is never left stuck at False.
        General.parallel_search = cfg.general.parallel_search
    self._modify_pipeline_config(workers_num, self.epoch_time,
                                 self.params_dict)
    if vega.is_npu_device():
        os.environ['RANK_TABLE_FILE'] = os.environ['ORIGIN_RANK_TABLE_FILE']
        os.environ['RANK_SIZE'] = os.environ['ORIGIN_RANK_SIZE']
    logging.info('Adjust runtime config successfully.')
def _init_distributed_setting(self):
    """Record world size and rank ids for distributed training.

    Does nothing unless ``self.distributed`` is set. On NPU a session is
    created and the system initialize op is run in it. Horovod supplies the
    values on GPU, the hccl API on NPU.
    """
    if not self.distributed:
        return
    if vega.is_npu_device():
        from npu_bridge.estimator import npu_ops
        self.sess = tf.compat.v1.Session(config=self._init_session_config())
        self.npu_init = npu_ops.initialize_system()
        self.npu_shutdown = npu_ops.shutdown_system()
        self.sess.run(self.npu_init)
    if vega.is_gpu_device():
        import horovod.tensorflow as hvd
        world, rank, local_rank = hvd.size(), hvd.rank(), hvd.local_rank()
    elif vega.is_npu_device():
        from hccl.manage.api import get_local_rank_id, get_rank_id, get_rank_size
        world, rank, local_rank = get_rank_size(), get_rank_id(), get_local_rank_id()
    else:
        return
    self._world_size = world
    self._rank_id = rank
    self._local_rank_id = local_rank
def _init_session_config(self):
    """Build the TensorFlow session config for the active device.

    :return: a ``ConfigProto`` with GPU memory growth enabled on GPU, or
        with remapping disabled and the ``NpuOptimizer`` custom optimizer
        added on NPU; ``None`` on any other device
    """
    import tensorflow as tf
    if vega.is_gpu_device():
        sess_config = tf.compat.v1.ConfigProto()
        sess_config.gpu_options.allow_growth = True
        return sess_config
    elif vega.is_npu_device():
        from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig
        # Same compat.v1 alias as the GPU branch; the bare tf.ConfigProto
        # name is not exposed by TensorFlow 2.x.
        sess_config = tf.compat.v1.ConfigProto()
        rewrite_options = sess_config.graph_options.rewrite_options
        rewrite_options.remapping = RewriterConfig.OFF
        custom_op = rewrite_options.custom_optimizers.add()
        custom_op.name = "NpuOptimizer"
        custom_op.parameter_map["use_off_line"].b = True
        return sess_config
def _generate_init_model(self):
    """Generate init model by loading pretrained model.

    The weights named by ``config.init_model_file`` are loaded into a fresh
    model and ``PruneResnet`` is applied with the channel-node mask and the
    backbone channel mask of the base network description.

    :return: initial model after loading pretrained model
    :rtype: torch.nn.Module
    """
    model_init = self._new_model_init()
    chn_node_mask = self._init_chn_node_mask()
    if vega.is_torch_backend():
        weights_file = self.config.init_model_file + '.pth'

        def _load_and_prune(checkpoint):
            # Shared by every torch device branch below.
            model_init.load_state_dict(checkpoint)
            return PruneResnet(model_init).apply(
                chn_node_mask, self.base_net_desc.backbone.chn_mask)

        if vega.is_gpu_device():
            model = _load_and_prune(torch.load(weights_file))
            model.to(self.device)
        elif vega.is_npu_device():
            device = "npu:{}".format(os.environ.get('DEVICE_ID', 0))
            model = _load_and_prune(
                torch.load(weights_file, map_location=torch.device(device)))
            model.npu()
        else:
            # Host fallback: previously ``model`` was never bound on a
            # device that is neither GPU nor NPU.
            model = _load_and_prune(
                torch.load(weights_file, map_location=torch.device('cpu')))
    elif vega.is_tf_backend():
        model = model_init
        with tf.compat.v1.Session(
                config=self.trainer._init_session_config()) as sess:
            saver = tf.compat.v1.train.import_meta_graph("{}.meta".format(
                self.config.init_model_file))
            saver.restore(sess, self.config.init_model_file)
            all_weight = tf.compat.v1.get_collection(
                tf.compat.v1.GraphKeys.VARIABLES)
            # Variables whose name ends with 'Momentum:0' are left out.
            all_weight = [
                t for t in all_weight if not t.name.endswith('Momentum:0')
            ]
            PruneResnet(all_weight).apply(
                chn_node_mask, self.base_net_desc.backbone.chn_mask)
            save_file = FileOps.join_path(
                self.trainer.get_local_worker_path(), 'prune_model')
            saver.save(sess, save_file)
    elif vega.is_ms_backend():
        parameter_dict = load_checkpoint(self.config.init_model_file)
        load_param_into_net(model_init, parameter_dict)
        model = PruneResnet(model_init).apply(
            chn_node_mask, self.base_net_desc.backbone.chn_mask)
    return model
def _init_model(self):
    """Init network model from timm according to model type in config.

    :return: model created by ``create_model`` from ``config.model_desc``,
        moved to the GPU/NPU when one is active
    """
    desc = self.config.model_desc
    build_kwargs = dict(
        pretrained=desc.pretrained,
        num_classes=desc.num_classes,
        drop_rate=desc.drop,
        drop_path_rate=desc.drop_path,
        global_pool=desc.gp,
        bn_tf=desc.bn_tf,
        bn_momentum=desc.bn_momentum,
        bn_eps=desc.bn_eps,
        checkpoint_path=desc.initial_checkpoint,
    )
    net = create_model(desc.model_name, **build_kwargs)
    if vega.is_gpu_device():
        return net.cuda()
    if vega.is_npu_device():
        return net.npu()
    return net
def _evalGAN(self, model, imgs, epoch, writer):
    """Save images to event file.

    :param model: cyclesr model
    :type model: CycleSRModel class(nn.Module)
    :param imgs: list of selected valid images
    :type imgs: list
    :param epoch: current epoch
    :type epoch: int
    :param writer: record event files to log dir
    :type writer: tensorboardX.SummaryWriter
    """
    model.set_mode('eval')
    place = "npu" if vega.is_npu_device() else "cuda"
    with torch.no_grad():
        for idx, sample in enumerate(imgs):
            if vega.is_npu_device():
                place = "npu"
            else:
                place = "cuda"
            real_X = getattr(sample['X'], place)()
            real_Y = getattr(sample['Y'], place)()
            HR = getattr(sample['HR'], place)()
            fake_Y = model.netG(real_X)  # G(X)
            rec_X = model.netF(fake_Y)  # F(G(X))
            fake_X = model.netF(real_Y)  # F(Y)
            rec_Y = model.netG(fake_X)  # G(F(Y))
            G_SR = model.netSR(fake_Y)  # SR(G(X))
            # Written in this fixed order; only the first item of each
            # batch is logged.
            panels = (
                ("G_SR", G_SR),
                ("HR", HR),
                ("Real_bicubic", real_X),
                ("Fake_unknown", fake_Y),
                ("Real_unknown", real_Y),
                ("Fake_bicubic", fake_X),
                ("Rec_bicubic", rec_X),
                ("Rec_unknown", rec_Y),
            )
            for tag, tensor in panels:
                writer.add_image(tag + str(idx), TensorNorm(tensor[0]), epoch)
def _init_setting(self):
    """Init CUDA setting.

    Seeds the accelerator RNG with ``config.seed`` and records the device in
    ``config.device`` (0 on GPU when the GPU check returns exactly True,
    -1 on CPU).

    :raises ValueError: when the device is none of GPU, NPU or CPU
    """
    if vega.is_gpu_device():
        import torch.cuda
        gpu_flag = vega.is_gpu_device()
        self.config.device = 0 if gpu_flag is True else gpu_flag
        if self.distributed:
            torch.cuda.set_device(self._local_rank_id)
        torch.cuda.manual_seed(self.config.seed)
        return
    if vega.is_npu_device():
        import torch.npu
        npu_name = "npu:{}".format(os.environ.get('DEVICE_ID', 0))
        torch.npu.set_device(npu_name)
        torch.npu.manual_seed(self.config.seed)
        return
    if vega.is_cpu_device():
        self.config.device = -1
        return
    raise ValueError('Set a correct device: cuda or npu.')
def drop_path(x, prob):
    """Drop path operation.

    For ``prob > 0`` the input is modified in place: it is divided by
    ``1 - prob`` and multiplied by a Bernoulli(``1 - prob``) mask of shape
    ``(x.size(0), 1, 1, 1)``, i.e. one keep/drop decision per entry of the
    first dimension.

    :param x: input feature map
    :type x: torch tensor
    :param prob: dropout probability
    :type prob: float
    :return: output feature map after dropout (the same tensor object)
    :rtype: torch tensor
    """
    if prob <= 0.:
        return x
    keep = 1. - prob
    # Allocate the float32 mask on the input's own device. This replaces the
    # legacy torch.cuda/torch.npu FloatTensor constructors and also works
    # for host tensors, where the mask used to be left undefined.
    mask = torch.empty(x.size(0), 1, 1, 1,
                       dtype=torch.float32, device=x.device).bernoulli_(keep)
    x.div_(keep)
    x.mul_(mask)
    return x
def before_train(self, logs=None):
    """Fetch trainer info before train stage.

    Creates the summary board for this step/worker and adds the model graph
    to it for the active backend. A failing PyTorch graph dump is logged as
    a warning and does not stop training.

    :param logs: not used by this callback
    """
    self._fix_path = "_".join([self.trainer.step_name, str(self.trainer.worker_id)])
    self.summary = SummaryBoard(self._archive_root, self._fix_path)
    # add graph only once.
    if vega.is_tf_backend():
        import tensorflow as tf
        datasets = self.trainer.valid_input_fn()
        data_iter = tf.compat.v1.data.make_one_shot_iterator(datasets)
        input_data, _ = data_iter.get_next()
        self.input = input_data[:1]
        graph = self.trainer.graph
        _graph_name_list = [n.name for n in graph.as_graph_def().node]
        if len(_graph_name_list) < 2:
            graph = _fetch_tf_graph(self.trainer.model, self.input)
        self.summary.add_graph(graph=graph, backend="tf")
    elif vega.is_torch_backend():
        model = self.trainer.model
        # Built-in next(): iterators only guarantee __next__, and the
        # Python-2 style ``.next()`` method is not always present.
        input_batch, _ = next(iter(self.trainer.train_loader))
        input_data = input_batch[:1]
        if not self.trainer.config.is_detection_trainer:
            if vega.is_gpu_device():
                input_data = input_data.cuda()
            elif vega.is_npu_device():
                input_data = input_data.npu()
        try:
            self.summary.add_graph(model=model, feed_data=input_data,
                                   backend="torch")
        except Exception as err:
            # Exception rather than BaseException, so KeyboardInterrupt and
            # SystemExit are no longer swallowed here.
            logging.warning("Dump PyTorch model failed! with: \n{}".format(err))
    elif vega.is_ms_backend():
        logging.debug("Don't support mindspore model dump yet.")
    else:
        logging.warning("non-known backend.")
def __call__(self):
    """Call loss cls.

    A class in ``self._cls`` is instantiated with the configured ``params``;
    any other callable is wrapped in ``functools.partial`` with them. On the
    torch backend the result is moved to the GPU/NPU when it supports it.

    :return: the loss object or partial
    :raises Exception: whatever construction or device placement raised,
        after it has been logged
    """
    params = self.map_config.get("params", {})
    logging.debug("Call Loss. name={}, params={}".format(
        self._cls.__name__, params))
    try:
        if params:
            cls_obj = self._cls(**params) if isclass(
                self._cls) else partial(self._cls, **params)
        else:
            cls_obj = self._cls() if isclass(self._cls) else partial(
                self._cls)
        if vega.is_torch_backend():
            # A functools.partial around a plain function has no
            # .cuda()/.npu(); only move objects that provide the method
            # instead of failing with AttributeError.
            if vega.is_gpu_device():
                if callable(getattr(cls_obj, "cuda", None)):
                    cls_obj = cls_obj.cuda()
            elif vega.is_npu_device():
                if callable(getattr(cls_obj, "npu", None)):
                    cls_obj = cls_obj.npu()
        return cls_obj
    except Exception:
        logging.error("Failed to call Loss name={}, params={}".format(
            self._cls.__name__, params))
        raise
def optimize_transmodel(self, input):
    """Optimize translation model.

    Moves the ``'X'`` and ``'Y'`` entries of ``input`` to the NPU (or CUDA
    otherwise), runs the forward pass, then steps both discriminator
    optimizers followed by the generator optimizer.

    :param input: mapping providing the ``'X'`` and ``'Y'`` batches
    """
    mover = "npu" if vega.is_npu_device() else "cuda"
    self.real_X = getattr(input['X'], mover)()
    self.real_Y = getattr(input['Y'], mover)()
    self.batch_size = self.real_X.shape[0]
    self.forward()

    # Following original GAN's optimize order, first, optimize D.
    requires_grad([self.netD_X, self.netD_Y], True)
    d_optimizers = (self.optimizer_D_X, self.optimizer_D_Y)
    for optimizer in d_optimizers:
        optimizer.zero_grad()
    self.update_D()
    for optimizer in d_optimizers:
        optimizer.step()

    # Then optimize G.
    requires_grad([self.netD_X, self.netD_Y], False)
    self.optimizer_G.zero_grad()
    self.update_G()
    self.optimizer_G.step()