def handleTenant_Delete(*args, **kwargs):
    event = kwargs["sender"]
    accountId = event.accountId
    session = kwargs["session"]
    rdclient = ServiceContext().getRdClient()
    rdclient.api_version = 1
    try:
        account = session.query(Tenant).filter(Tenant.id == accountId).one()
        services = account.services
        for svc in services:
            nodes = svc.nodes
            for node in nodes:
                jobs = node.jobs
                for job in jobs:
                    try:
                        rdclient.delete_job(job.jobid)
                    except Exception:
                        # failing to delete the remote job is not fatal here
                        logger.warning("failed to delete job(%s), ignoring" % job.jobid)
                    session.delete(job)
                session.delete(node)
            session.delete(svc)
        session.delete(account)
        session.commit()
        logger.info("deleted account<%s>." % accountId)
    except NoResultFound:
        logger.warning("account(<%s>) has already been deleted, ignoring" % accountId)
    finally:
        del rdclient.api_version
def _get_or_reshape(name, shared_data_arrays, arg_shape, arg_type, context, logger):
    """Internal helper to get a memory block or re-use an existing one by reshaping."""
    if name in shared_data_arrays:
        arg_arr = shared_data_arrays[name]
        if np.prod(arg_arr.shape) >= np.prod(arg_shape):
            # nice, we can directly re-use this data blob
            assert arg_arr.dtype == arg_type
            arg_arr = arg_arr.reshape(arg_shape)
        else:
            logger.warning(
                'bucketing: data "%s" has a shape %s, which is larger than the '
                'already allocated shape %s. Need to re-allocate. Consider setting '
                'default_bucket_key to the bucket taking the largest input for '
                'better memory sharing.' % (name, arg_shape, arg_arr.shape))
            arg_arr = nd.zeros(arg_shape, context, dtype=arg_type)
            # replace the existing shared array because the new one is bigger
            shared_data_arrays[name] = arg_arr
    else:
        arg_arr = nd.zeros(arg_shape, context, dtype=arg_type)
        shared_data_arrays[name] = arg_arr
    return arg_arr
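# Hypothetical usage sketch for _get_or_reshape (not from the original source):
# assumes MXNet and NumPy are installed. The helper above expects `np` (numpy)
# and `nd` (mxnet.ndarray) to be available at module scope, so they are
# imported here; the shapes and logger name are made up for illustration.
import logging
import numpy as np
import mxnet as mx
from mxnet import ndarray as nd

example_logger = logging.getLogger("bucketing_example")
shared = {}

# First call allocates a (4, 8) float32 block on CPU and registers it.
a = _get_or_reshape("data", shared, (4, 8), np.float32, mx.cpu(), example_logger)

# Same element count: the existing block is re-used as a reshaped view.
b = _get_or_reshape("data", shared, (8, 4), np.float32, mx.cpu(), example_logger)

# Larger request: a warning is logged and a bigger block replaces the shared one.
c = _get_or_reshape("data", shared, (16, 8), np.float32, mx.cpu(), example_logger)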
def start_ds_port_forward(self, instance_name='userstore', instance_nb=0):
    # only set up a port-forward when not running in cluster mode
    if not is_cluster_mode():
        ds_pod_name = '%s-%s' % (instance_name, instance_nb)
        ds_local_port = getattr(self, '%s%s_local_port' % (instance_name, instance_nb))
        command = self.helm_cmd + ' --namespace %s port-forward pod/%s %s:8080' % \
            (tests_namespace(), ds_pod_name, ds_local_port)
        ds_popen = cmd.run_cmd_process(command)
        duration = 60
        start_time = time.time()
        while time.time() - start_time < duration:
            soc = socket.socket()
            result = soc.connect_ex(("", ds_local_port))
            soc.close()
            if result != 0:
                logger.warning('Port-forward for pod %s on port %s not ready, waiting 5s...'
                               % (ds_pod_name, ds_local_port))
                time.sleep(5)
            else:
                logger.info('Port-forward for pod %s on port %s is ready'
                            % (ds_pod_name, ds_local_port))
                return ds_popen
        raise Exception('Port-forward for pod %s on port %s not ready after %ss'
                        % (ds_pod_name, ds_local_port, duration))
def handleTenant_Check(*args, **kwargs):
    event = kwargs["sender"]
    accountId = event.accountId
    session = kwargs["session"]
    try:
        account = session.query(Tenant).filter(Tenant.id == accountId).one()
        svclist = account.services
        flag = True
        for svc in svclist:
            flag = flag and svc.isready()
        if not flag:
            account.getSM().trigger("package_activate_timeout", tenant=account)
    except NoResultFound:
        logger.warning("account(<%s>) has already been deleted, ignoring" % accountId)
def init_optimizer(self, kvstore="local", optimizer="sgd",
                   optimizer_params=(("learning_rate", 0.01), ),
                   force_init=False):
    assert self.binded and self.params_initialized
    if self.optimizer_initialized and not force_init:
        logger.warning("optimizer already initialized, ignoring.")
        return
    self._curr_module._preload_opt_states = self._preload_opt_states
    self._curr_module.init_optimizer(kvstore, optimizer, optimizer_params,
                                     force_init=force_init)
    self.optimizer_initialized = True
def flat_and_anneal_lr_scheduler(
    optimizer,
    total_iters,
    warmup_iters=0,
    warmup_factor=0.1,
    warmup_method="linear",
    anneal_point=0.72,
    anneal_method="cosine",
    target_lr_factor=0,
    poly_power=1.0,
    step_gamma=0.1,
    steps=(2 / 3.0, 8 / 9.0),
):
    """Flat-and-anneal LR schedule, adapted from
    https://github.com/fastai/fastai/blob/master/fastai/callbacks/flat_cos_anneal.py

    warmup_initial_lr = warmup_factor * base_lr
    target_lr = base_lr * target_lr_factor
    """
    if warmup_method not in ("constant", "linear"):
        raise ValueError("Only 'constant' or 'linear' warmup_method accepted, "
                         "got {}".format(warmup_method))
    if anneal_method not in ("cosine", "linear", "poly", "exp", "step", "none"):
        raise ValueError(
            "Only 'cosine', 'linear', 'poly', 'exp', 'step' or 'none' anneal_method accepted, "
            "got {}".format(anneal_method))

    if anneal_method == "step":
        if any(_step < warmup_iters / total_iters or _step > 1 for _step in steps):
            raise ValueError(
                "error in steps: {}. warmup_iters: {} total_iters: {}. "
                "steps should be in ({}, 1)".format(steps, warmup_iters, total_iters,
                                                    warmup_iters / total_iters))
        if list(steps) != sorted(steps):
            raise ValueError("steps {} is not in ascending order.".format(steps))
        logger.warning("ignore anneal_point when using step anneal_method")
        anneal_start = steps[0] * total_iters
    else:
        if anneal_point > 1 or anneal_point < 0:
            raise ValueError("anneal_point should be in [0, 1], got {}".format(anneal_point))
        anneal_start = anneal_point * total_iters

    def f(x):
        # x is the iteration index seen by the lr scheduler; return the lr factor.
        # The actual lr is lr_factor * base_lr (warmup starts at warmup_factor * base_lr).
        if x < warmup_iters:
            if warmup_method == "linear":
                alpha = float(x) / warmup_iters
                return warmup_factor * (1 - alpha) + alpha
            elif warmup_method == "constant":
                return warmup_factor
        elif x >= anneal_start:
            if anneal_method == "step":
                # ignore anneal_point and target_lr_factor
                milestones = [_step * total_iters for _step in steps]
                lr_factor = step_gamma**bisect_right(milestones, float(x))
            elif anneal_method == "cosine":
                # slow --> fast --> slow
                lr_factor = target_lr_factor + 0.5 * (1 - target_lr_factor) * (
                    1 + cos(pi * ((float(x) - anneal_start) / (total_iters - anneal_start))))
            elif anneal_method == "linear":
                # (y - m) / (B - x) = (1 - m) / (B - A)
                lr_factor = target_lr_factor + (1 - target_lr_factor) * (
                    total_iters - float(x)) / (total_iters - anneal_start)
            elif anneal_method == "poly":
                # slow --> fast if poly_power < 1
                # fast --> slow if poly_power > 1
                # poly_power == 1.0 is the same as linear
                lr_factor = (target_lr_factor + (1 - target_lr_factor) *
                             ((total_iters - float(x)) /
                              (total_iters - anneal_start))**poly_power)
            elif anneal_method == "exp":
                # fast --> slow
                # do not decay all the way to 0: if target_lr_factor == 0 the lr
                # would be 0 right after anneal_start, so clamp the target factor
                _target_lr_factor = max(target_lr_factor, 5e-3)
                lr_factor = _target_lr_factor**((float(x) - anneal_start) /
                                                (total_iters - anneal_start))
            else:
                lr_factor = 1
            return lr_factor
        else:  # warmup_iters <= x < anneal_start
            return 1

    return torch.optim.lr_scheduler.LambdaLR(optimizer, f)
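# Hypothetical usage sketch for flat_and_anneal_lr_scheduler (not from the
# original source): assumes PyTorch is installed. The scheduler above expects
# `torch`, `cos`/`pi` and `bisect_right` at module scope, so they are imported
# here; the toy model, iteration counts and hyper-parameters are made up.
import torch
from math import cos, pi          # used by the cosine anneal branch above
from bisect import bisect_right   # only needed for anneal_method="step"

model = torch.nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

total_iters = 1000
scheduler = flat_and_anneal_lr_scheduler(
    optimizer,
    total_iters=total_iters,
    warmup_iters=100,        # linear warmup from 0.1 * base_lr up to base_lr
    warmup_factor=0.1,
    anneal_point=0.72,       # stay flat until 72% of training, then anneal
    anneal_method="cosine",
    target_lr_factor=0.0,    # cosine-anneal down to 0
)

for it in range(total_iters):
    # ... forward / backward / optimizer.step() would go here ...
    scheduler.step()         # advance the LambdaLR schedule by one iteration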
def bind(
    self,
    data_shapes,
    label_shapes=None,
    for_training=True,
    inputs_need_grad=False,
    force_rebind=False,
    shared_module=None,
    grad_req="write",
):
    # in case we already initialized params, keep them
    if self.params_initialized:
        arg_params, aux_params = self.get_params()

    # force rebinding is typically used when one wants to switch from
    # the training to the prediction phase.
    if force_rebind:
        self._reset_bind()

    if self.binded:
        logger.warning("Already binded, ignoring bind()")
        return

    assert shared_module is None, "shared_module for MutableModule is not supported"

    self.for_training = for_training
    self.inputs_need_grad = inputs_need_grad
    self.binded = True

    max_shapes_dict = dict()
    if self._max_data_shapes is not None:
        max_shapes_dict.update(dict(self._max_data_shapes[0]))
    if self._max_label_shapes is not None:
        max_shapes_dict.update(dict(self._max_label_shapes[0]))

    max_data_shapes = list()
    for name, shape in data_shapes[0]:
        if name in max_shapes_dict:
            max_data_shapes.append((name, max_shapes_dict[name]))
        else:
            max_data_shapes.append((name, shape))

    max_label_shapes = list()
    if not label_shapes.count(None) == len(label_shapes):
        for name, shape in label_shapes[0]:
            if name in max_shapes_dict:
                max_label_shapes.append((name, max_shapes_dict[name]))
            else:
                max_label_shapes.append((name, shape))

    if len(max_label_shapes) == 0:
        max_label_shapes = None

    module = Module(
        self._symbol,
        self._data_names,
        self._label_names,
        logger=logger,
        context=self._context,
        work_load_list=self._work_load_list,
        fixed_param_names=self._fixed_param_names,
    )
    module.bind(
        [max_data_shapes for _ in range(len(self._context))],
        [max_label_shapes for _ in range(len(self._context))],
        for_training,
        inputs_need_grad,
        force_rebind=False,
        shared_module=None,
    )
    self._curr_module = module

    # copy back saved params, if already initialized
    if self.params_initialized:
        self.set_params(arg_params, aux_params)
def init_optimizer(self, kvstore="local", optimizer="sgd",
                   optimizer_params=(("learning_rate", 0.01), ),
                   force_init=False):
    """Install and initialize optimizers.

    Parameters
    ----------
    kvstore : str or KVStore
        Default `'local'`.
    optimizer : str or Optimizer
        Default `'sgd'`.
    optimizer_params : dict
        Default `(('learning_rate', 0.01),)`. The default value is not a
        dictionary, just to avoid a pylint warning about dangerous default values.
    force_init : bool
        Default `False`, indicating whether to force re-initializing the
        optimizer in case an optimizer is already installed.
    """
    assert self.binded and self.params_initialized

    if self.optimizer_initialized and not force_init:
        logger.warning("optimizer already initialized, ignoring...")
        return

    (kvstore, update_on_kvstore) = _create_kvstore(kvstore, len(self._context),
                                                   self._arg_params)

    batch_size = self._exec_group.batch_size
    if kvstore and "dist" in kvstore.type and "_sync" in kvstore.type:
        batch_size *= kvstore.num_workers
    rescale_grad = 1.0 / batch_size

    if isinstance(optimizer, str):
        idx2name = {}
        if update_on_kvstore:
            idx2name.update(enumerate(self._exec_group.param_names))
        else:
            for k in range(len(self._context)):
                idx2name.update({
                    i * len(self._context) + k: n
                    for i, n in enumerate(self._exec_group.param_names)
                })
        optimizer_params = dict(optimizer_params)
        if "rescale_grad" not in optimizer_params:
            optimizer_params["rescale_grad"] = rescale_grad
        optimizer = opt.create(optimizer,
                               sym=self.symbol,
                               param_idx2name=idx2name,
                               **optimizer_params)
    else:
        assert isinstance(optimizer, opt.Optimizer)
        if optimizer.rescale_grad != rescale_grad:  # pylint: disable=no-member
            warnings.warn(
                "Optimizer created manually outside Module but rescale_grad " +
                "is not normalized to 1.0/batch_size/num_workers (%s vs. %s). " %
                (optimizer.rescale_grad, rescale_grad) + "Is this intended?",
                stacklevel=2,
            )

    self._optimizer = optimizer
    self._kvstore = kvstore
    self._update_on_kvstore = update_on_kvstore
    self._updater = None

    if kvstore:
        # copy initialized local parameters to kvstore
        _initialize_kvstore(
            kvstore=kvstore,
            param_arrays=self._exec_group.param_arrays,
            arg_params=self._arg_params,
            param_names=self._param_names,
            update_on_kvstore=update_on_kvstore,
        )
    if update_on_kvstore:
        kvstore.set_optimizer(self._optimizer)
    else:
        self._updater = opt.get_updater(optimizer)

    self.optimizer_initialized = True

    if self._preload_opt_states is not None:
        self.load_optimizer_states(self._preload_opt_states)
        self._preload_opt_states = None
def bind(
    self,
    data_shapes,
    label_shapes=None,
    for_training=True,
    inputs_need_grad=False,
    force_rebind=False,
    shared_module=None,
    grad_req="write",
):
    """Bind the symbols to construct executors. This is necessary before one
    can perform computation with the module.

    Parameters
    ----------
    data_shapes : list of (str, tuple)
        Typically `data_iter.provide_data`.
    label_shapes : list of (str, tuple)
        Typically `data_iter.provide_label`.
    for_training : bool
        Default is `True`. Whether the executors should be bound for training.
    inputs_need_grad : bool
        Default is `False`. Whether the gradients to the input data need to be
        computed. Typically this is not needed, but it might be when
        implementing composition of modules.
    force_rebind : bool
        Default is `False`. This function does nothing if the executors are
        already binded. But with this set to `True`, the executors will be
        forced to rebind.
    shared_module : Module
        Default is `None`. This is used in bucketing. When not `None`, the
        shared module essentially corresponds to a different bucket -- a module
        with a different symbol but with the same sets of parameters (e.g.
        unrolled RNNs with different lengths).
    """
    # force rebinding is typically used when one wants to switch from
    # the training to the prediction phase.
    if force_rebind:
        self._reset_bind()

    if self.binded:
        logger.warning("Already binded, ignoring bind()")
        return

    self.for_training = for_training
    self.inputs_need_grad = inputs_need_grad
    self.binded = True
    self._grad_req = grad_req

    if not for_training:
        assert not inputs_need_grad
    else:
        pass
        # this is not True, as some modules might not contain a loss function
        # that consumes the labels
        # assert label_shapes is not None

    # self._data_shapes, self._label_shapes = _parse_data_desc(
    #     self.data_names, self.label_names, data_shapes, label_shapes)
    self._data_shapes, self._label_shapes = zip(*[
        _parse_data_desc(self.data_names, self.label_names, data_shape, label_shape)
        for data_shape, label_shape in zip(data_shapes, label_shapes)
    ])
    if self._label_shapes.count(None) == len(self._label_shapes):
        self._label_shapes = None

    if shared_module is not None:
        assert isinstance(shared_module, Module) and \
            shared_module.binded and shared_module.params_initialized
        shared_group = shared_module._exec_group
    else:
        shared_group = None

    self._exec_group = DataParallelExecutorGroup(
        self._symbol,
        self._context,
        self._work_load_list,
        self._data_shapes,
        self._label_shapes,
        self._param_names,
        for_training,
        inputs_need_grad,
        shared_group,
        logger=logger,
        fixed_param_names=self._fixed_param_names,
        grad_req=grad_req,
        state_names=self._state_names,
    )
    # self._total_exec_bytes = self._exec_group._total_exec_bytes
    if shared_module is not None:
        self.params_initialized = True
        self._arg_params = shared_module._arg_params
        self._aux_params = shared_module._aux_params
    elif self.params_initialized:
        # if the parameters are already initialized, we are re-binding,
        # so automatically copy the already initialized params
        self._exec_group.set_params(self._arg_params, self._aux_params)
    else:
        assert self._arg_params is None and self._aux_params is None
        param_arrays = [
            nd.zeros(x[0].shape, dtype=x[0].dtype)
            for x in self._exec_group.param_arrays
        ]
        self._arg_params = {
            name: arr
            for name, arr in zip(self._param_names, param_arrays)
        }
        aux_arrays = [
            nd.zeros(x[0].shape, dtype=x[0].dtype)
            for x in self._exec_group.aux_arrays
        ]
        self._aux_params = {
            name: arr
            for name, arr in zip(self._aux_names, aux_arrays)
        }

    if shared_module is not None and shared_module.optimizer_initialized:
        self.borrow_optimizer(shared_module)
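# Hypothetical end-to-end illustration (not from the original source): the
# bind / init_params / init_optimizer call pattern shown on upstream MXNet's
# mx.mod.Module, which the methods above appear to be adapted from. The toy
# symbol, shapes and hyper-parameters are made up for illustration and are not
# part of the original code.
import mxnet as mx

data = mx.sym.Variable("data")
fc = mx.sym.FullyConnected(data, num_hidden=10, name="fc")
net = mx.sym.SoftmaxOutput(fc, name="softmax")

mod = mx.mod.Module(net, data_names=("data",), label_names=("softmax_label",),
                    context=mx.cpu())

# bind allocates the executors; calling it again without force_rebind=True
# would only log "Already binded, ignoring bind()".
mod.bind(data_shapes=[("data", (32, 128))],
         label_shapes=[("softmax_label", (32,))],
         for_training=True)

mod.init_params(initializer=mx.init.Xavier())

# init_optimizer installs SGD using the (name, value) tuple form of
# optimizer_params seen in the signatures above.
mod.init_optimizer(kvstore="local", optimizer="sgd",
                   optimizer_params=(("learning_rate", 0.01),))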