Example #1
def handleTenant_Delete(*args, **kwargs):
    event = kwargs["sender"]
    accountId = event.accountId
    session = kwargs["session"]
    rdclient = ServiceContext().getRdClient()
    rdclient.api_version = 1
    try:
        account = session.query(Tenant).filter(Tenant.id == accountId).one()
        # delete the whole object graph: jobs -> nodes -> services -> account
        for svc in account.services:
            for node in svc.nodes:
                for job in node.jobs:
                    try:
                        rdclient.delete_job(job.jobid)
                    except Exception:
                        logger.warning("failed to delete job(%s), ignoring" % job.jobid)
                    session.delete(job)
                session.delete(node)
            session.delete(svc)
        session.delete(account)
        session.commit()
        logger.info("deleted account <%s>" % accountId)
    except NoResultFound:
        logger.warning("account(<%s>) has already been deleted, ignoring" % accountId)
    finally:
        del rdclient.api_version
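The handler above deletes the tenant's services, nodes and jobs row by row. The same cascade can instead be declared on the ORM relationships; below is a minimal sketch with hypothetical SQLAlchemy (1.4+) model definitions for the Tenant/Service/Node/Job classes used in the example (the column layout is assumed, not taken from the project):

from sqlalchemy import Column, Integer, ForeignKey
from sqlalchemy.orm import declarative_base, relationship

Base = declarative_base()

class Tenant(Base):
    __tablename__ = "tenant"
    id = Column(Integer, primary_key=True)
    # deleting a Tenant now also deletes its Services (and, transitively, Nodes and Jobs)
    services = relationship("Service", cascade="all, delete-orphan")

class Service(Base):
    __tablename__ = "service"
    id = Column(Integer, primary_key=True)
    tenant_id = Column(Integer, ForeignKey("tenant.id"))
    nodes = relationship("Node", cascade="all, delete-orphan")

class Node(Base):
    __tablename__ = "node"
    id = Column(Integer, primary_key=True)
    service_id = Column(Integer, ForeignKey("service.id"))
    jobs = relationship("Job", cascade="all, delete-orphan")

class Job(Base):
    __tablename__ = "job"
    id = Column(Integer, primary_key=True)
    node_id = Column(Integer, ForeignKey("node.id"))
    jobid = Column(Integer)

# with these relationships, session.delete(account) + session.commit() removes the
# whole subtree in one go; the external rdclient.delete_job() calls still have to
# be issued explicitly before committing.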
Example #2
def _get_or_reshape(name, shared_data_arrays, arg_shape, arg_type,
                    context, logger):
    """Internal helper to get a memory block, or re-use one by reshaping it."""
    if name in shared_data_arrays:
        arg_arr = shared_data_arrays[name]

        if np.prod(arg_arr.shape) >= np.prod(arg_shape):
            # nice, we can directly re-use this data blob
            assert arg_arr.dtype == arg_type
            arg_arr = arg_arr.reshape(arg_shape)
        else:
            logger.warning(
                'bucketing: data "%s" has shape %s, which is larger than the '
                'already allocated shape %s. Need to re-allocate. Consider '
                'putting default_bucket_key to be the bucket taking the '
                'largest input for better memory sharing.'
                % (name, arg_shape, arg_arr.shape))
            arg_arr = nd.zeros(arg_shape, context, dtype=arg_type)

            # replace the existing shared array because the new one is bigger
            shared_data_arrays[name] = arg_arr
    else:
        arg_arr = nd.zeros(arg_shape, context, dtype=arg_type)
        shared_data_arrays[name] = arg_arr

    return arg_arr
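A small usage sketch for the helper above, mirroring the size check in its code; it assumes mxnet and numpy are installed and `_get_or_reshape` is in scope, and the shapes are illustrative:

import logging
import numpy as np
import mxnet as mx

log = logging.getLogger(__name__)
shared_data_arrays = {}

# first request: nothing cached yet, so a new block is allocated
a = _get_or_reshape("data", shared_data_arrays, (4, 8), np.float32, mx.cpu(), log)

# smaller request: the cached block is big enough and is re-used by reshaping it
b = _get_or_reshape("data", shared_data_arrays, (2, 8), np.float32, mx.cpu(), log)

# larger request: triggers the warning above and re-allocates the shared block
c = _get_or_reshape("data", shared_data_arrays, (16, 8), np.float32, mx.cpu(), log)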
Example #3
    def start_ds_port_forward(self, instance_name='userstore', instance_nb=0):
        if not is_cluster_mode():
            ds_pod_name = '%s-%s' % (instance_name, instance_nb)
            # look up the pre-configured local port, e.g. self.userstore0_local_port
            ds_local_port = getattr(self, '%s%s_local_port' %
                                    (instance_name, instance_nb))
            command = self.helm_cmd + ' --namespace %s port-forward pod/%s %s:8080' % \
                      (tests_namespace(), ds_pod_name, ds_local_port)
            ds_popen = cmd.run_cmd_process(command)

            duration = 60
            start_time = time.time()
            while time.time() - start_time < duration:
                soc = socket.socket()
                result = soc.connect_ex(("", ds_local_port))
                soc.close()
                if result != 0:
                    logger.warning(
                        'Port-forward for pod %s on port %s not ready, waiting 5s...'
                        % (ds_pod_name, ds_local_port))
                    time.sleep(5)
                else:
                    logger.info('Port-forward for pod %s on port %s is ready' %
                                (ds_pod_name, ds_local_port))
                    return ds_popen

            raise Exception(
                'Port-forward for pod %s on port %s not ready after %ss' %
                (ds_pod_name, ds_local_port, duration))
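The readiness check in the loop above is a generic pattern; here is a self-contained sketch using only the standard library (the host, port and timeout values are illustrative, not taken from the test suite):

import socket
import time

def wait_for_port(host, port, timeout=60, interval=5):
    """Poll until a TCP port accepts connections or the timeout expires."""
    start = time.time()
    while time.time() - start < timeout:
        sock = socket.socket()
        try:
            # connect_ex returns 0 on success instead of raising
            if sock.connect_ex((host, port)) == 0:
                return True
        finally:
            sock.close()
        time.sleep(interval)
    return False

# example: wait for a local port-forward on 8080
# ready = wait_for_port("localhost", 8080, timeout=60)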
Example #4
def handleTenant_Check(*args, **kwargs):
    event = kwargs["sender"]
    accountId = event.accountId
    session = kwargs["session"]
    try:
        account = session.query(Tenant).filter(Tenant.id == accountId).one()
        # the account is ready only when every one of its services is ready
        ready = all(svc.isready() for svc in account.services)
        if not ready:
            account.getSM().trigger("package_activate_timeout", tenant=account)
    except NoResultFound:
        logger.warning("account(<%s>) has already been deleted, ignoring" % accountId)
Example #5
    def init_optimizer(self,
                       kvstore="local",
                       optimizer="sgd",
                       optimizer_params=(("learning_rate", 0.01), ),
                       force_init=False):
        assert self.binded and self.params_initialized
        if self.optimizer_initialized and not force_init:
            logger.warning("optimizer already initialized, ignoring.")
            return

        self._curr_module._preload_opt_states = self._preload_opt_states
        self._curr_module.init_optimizer(kvstore,
                                         optimizer,
                                         optimizer_params,
                                         force_init=force_init)
        self.optimizer_initialized = True
Example #6
def flat_and_anneal_lr_scheduler(
    optimizer,
    total_iters,
    warmup_iters=0,
    warmup_factor=0.1,
    warmup_method="linear",
    anneal_point=0.72,
    anneal_method="cosine",
    target_lr_factor=0,
    poly_power=1.0,
    step_gamma=0.1,
    steps=[2 / 3.0, 8 / 9.0],
):
    """https://github.com/fastai/fastai/blob/master/fastai/callbacks/flat_cos_a
    nneal.py.

    warmup_initial_lr = warmup_factor * base_lr
    target_lr = base_lr * target_lr_factor
    """
    if warmup_method not in ("constant", "linear"):
        raise ValueError("Only 'constant' or 'linear' warmup_method accepted,"
                         "got {}".format(warmup_method))

    if anneal_method not in ("cosine", "linear", "poly", "exp", "step",
                             "none"):
        raise ValueError(
            "Only 'cosine', 'linear', 'poly', 'exp', 'step' or 'none' anneal_method accepted, "
            "got {}".format(anneal_method))

    if anneal_method == "step":
        if any(_step < warmup_iters / total_iters or _step > 1
               for _step in steps):
            raise ValueError(
                "error in steps: {}. warmup_iters: {} total_iters: {}. "
                "steps should be in ({}, 1)".format(steps, warmup_iters,
                                                    total_iters,
                                                    warmup_iters / total_iters))
        if list(steps) != sorted(steps):
            raise ValueError(
                "steps {} is not in ascending order.".format(steps))
        logger.warning("ignore anneal_point when using step anneal_method")
        anneal_start = steps[0] * total_iters
    else:
        if anneal_point > 1 or anneal_point < 0:
            raise ValueError(
                "anneal_point should be in [0,1], got {}".format(anneal_point))
        anneal_start = anneal_point * total_iters

    def f(x):  # x is the iter in lr scheduler, return the lr_factor
        # during warmup the lr ramps from warmup_factor * base_lr up to base_lr
        if x < warmup_iters:
            if warmup_method == "linear":
                alpha = float(x) / warmup_iters
                return warmup_factor * (1 - alpha) + alpha
            elif warmup_method == "constant":
                return warmup_factor
        elif x >= anneal_start:
            if anneal_method == "step":
                # ignore anneal_point and target_lr_factor
                milestones = [_step * total_iters for _step in steps]
                lr_factor = step_gamma**bisect_right(milestones, float(x))
            elif anneal_method == "cosine":
                # slow --> fast --> slow
                lr_factor = target_lr_factor + 0.5 * (1 - target_lr_factor) * (
                    1 + cos(pi * ((float(x) - anneal_start) /
                                  (total_iters - anneal_start))))
            elif anneal_method == "linear":
                # (y-m) / (B-x) = (1-m) / (B-A)
                lr_factor = target_lr_factor + (1 - target_lr_factor) * (
                    total_iters - float(x)) / (total_iters - anneal_start)
            elif anneal_method == "poly":
                # slow --> fast if poly_power < 1
                # fast --> slow if poly_power > 1
                # when poly_power == 1.0, it is the same with linear
                lr_factor = (target_lr_factor + (1 - target_lr_factor) *
                             ((total_iters - float(x)) /
                              (total_iters - anneal_start))**poly_power)
            elif anneal_method == "exp":
                # fast --> slow
                # do not decay too much, especially if lr_end == 0, lr will be
                # 0 at anneal iter, so we should avoid that
                _target_lr_factor = max(target_lr_factor, 5e-3)
                lr_factor = _target_lr_factor**((float(x) - anneal_start) /
                                                (total_iters - anneal_start))
            else:
                lr_factor = 1
            return lr_factor
        else:  # warmup_iters <= x < anneal_start
            return 1

    return torch.optim.lr_scheduler.LambdaLR(optimizer, f)
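A minimal usage sketch for the scheduler above, assuming PyTorch is installed and `flat_and_anneal_lr_scheduler` is importable; the model, shapes and iteration counts are placeholders:

import torch

# toy model and optimizer; base_lr is what the returned factors multiply
model = torch.nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

total_iters = 10000
scheduler = flat_and_anneal_lr_scheduler(
    optimizer,
    total_iters=total_iters,
    warmup_iters=500,          # linear warmup starting at warmup_factor * base_lr
    anneal_point=0.72,         # stay flat until 72% of training, then anneal
    anneal_method="cosine",
)

for it in range(total_iters):
    # ... forward / backward / optimizer.step() ...
    scheduler.step()           # this LambdaLR expects one step per iteration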
Example #7
    def bind(
        self,
        data_shapes,
        label_shapes=None,
        for_training=True,
        inputs_need_grad=False,
        force_rebind=False,
        shared_module=None,
        grad_req="write",
    ):
        # in case we already initialized params, keep it
        if self.params_initialized:
            arg_params, aux_params = self.get_params()

        # force rebinding is typically used when one wants to switch from
        # the training phase to the prediction phase.
        if force_rebind:
            self._reset_bind()

        if self.binded:
            logger.warning("Already binded, ignoring bind()")
            return

        assert shared_module is None, "shared_module for MutableModule is not supported"

        self.for_training = for_training
        self.inputs_need_grad = inputs_need_grad
        self.binded = True

        max_shapes_dict = dict()
        if self._max_data_shapes is not None:
            max_shapes_dict.update(dict(self._max_data_shapes[0]))
        if self._max_label_shapes is not None:
            max_shapes_dict.update(dict(self._max_label_shapes[0]))

        max_data_shapes = list()
        for name, shape in data_shapes[0]:
            if name in max_shapes_dict:
                max_data_shapes.append((name, max_shapes_dict[name]))
            else:
                max_data_shapes.append((name, shape))

        max_label_shapes = list()
        if label_shapes.count(None) != len(label_shapes):
            for name, shape in label_shapes[0]:
                if name in max_shapes_dict:
                    max_label_shapes.append((name, max_shapes_dict[name]))
                else:
                    max_label_shapes.append((name, shape))

        if len(max_label_shapes) == 0:
            max_label_shapes = None

        module = Module(
            self._symbol,
            self._data_names,
            self._label_names,
            logger=logger,
            context=self._context,
            work_load_list=self._work_load_list,
            fixed_param_names=self._fixed_param_names,
        )
        module.bind(
            [max_data_shapes for _ in range(len(self._context))],
            [max_label_shapes for _ in range(len(self._context))],
            for_training,
            inputs_need_grad,
            force_rebind=False,
            shared_module=None,
        )
        self._curr_module = module

        # copy back saved params, if already initialized
        if self.params_initialized:
            self.set_params(arg_params, aux_params)
Example #8
    def init_optimizer(self,
                       kvstore="local",
                       optimizer="sgd",
                       optimizer_params=(("learning_rate", 0.01), ),
                       force_init=False):
        """Install and initialize optimizers.

        Parameters
        ----------
        kvstore : str or KVStore
            Default `'local'`.
        optimizer : str or Optimizer
            Default `'sgd'`
        optimizer_params : dict
            Default `(('learning_rate', 0.01),)`. The default value is a tuple rather
            than a dict only to avoid pylint's dangerous-default-value warning.
        force_init : bool
            Default `False`, indicating whether we should force re-initializing the
            optimizer in the case an optimizer is already installed.
        """
        assert self.binded and self.params_initialized

        if self.optimizer_initialized and not force_init:
            logger.warning("optimizer already initialized, ignoring...")
            return

        (kvstore, update_on_kvstore) = _create_kvstore(kvstore,
                                                       len(self._context),
                                                       self._arg_params)

        batch_size = self._exec_group.batch_size
        if kvstore and "dist" in kvstore.type and "_sync" in kvstore.type:
            batch_size *= kvstore.num_workers
        rescale_grad = 1.0 / batch_size

        if isinstance(optimizer, str):
            idx2name = {}
            if update_on_kvstore:
                idx2name.update(enumerate(self._exec_group.param_names))
            else:
                for k in range(len(self._context)):
                    idx2name.update({
                        i * len(self._context) + k: n
                        for i, n in enumerate(self._exec_group.param_names)
                    })
            optimizer_params = dict(optimizer_params)
            if "rescale_grad" not in optimizer_params:
                optimizer_params["rescale_grad"] = rescale_grad
            optimizer = opt.create(optimizer,
                                   sym=self.symbol,
                                   param_idx2name=idx2name,
                                   **optimizer_params)
        else:
            assert isinstance(optimizer, opt.Optimizer)
            if optimizer.rescale_grad != rescale_grad:
                # pylint: disable=no-member
                warnings.warn(
                    "Optimizer created manually outside Module but rescale_grad "
                    "is not normalized to 1.0/batch_size/num_workers (%s vs. %s). "
                    "Is this intended?" % (optimizer.rescale_grad, rescale_grad),
                    stacklevel=2,
                )

        self._optimizer = optimizer
        self._kvstore = kvstore
        self._update_on_kvstore = update_on_kvstore
        self._updater = None

        if kvstore:
            # copy initialized local parameters to kvstore
            _initialize_kvstore(
                kvstore=kvstore,
                param_arrays=self._exec_group.param_arrays,
                arg_params=self._arg_params,
                param_names=self._param_names,
                update_on_kvstore=update_on_kvstore,
            )
        if update_on_kvstore:
            kvstore.set_optimizer(self._optimizer)
        else:
            self._updater = opt.get_updater(optimizer)

        self.optimizer_initialized = True

        if self._preload_opt_states is not None:
            self.load_optimizer_states(self._preload_opt_states)
            self._preload_opt_states = None
Example #9
    def bind(
        self,
        data_shapes,
        label_shapes=None,
        for_training=True,
        inputs_need_grad=False,
        force_rebind=False,
        shared_module=None,
        grad_req="write",
    ):
        """Bind the symbols to construct executors. This is necessary before one
        can perform computation with the module.

        Parameters
        ----------
        data_shapes : list of (str, tuple)
            Typically is `data_iter.provide_data`.
        label_shapes : list of (str, tuple)
            Typically is `data_iter.provide_label`.
        for_training : bool
            Default is `True`. Whether the executors should be bound for training.
        inputs_need_grad : bool
            Default is `False`. Whether the gradients to the input data need to be computed.
            Typically this is not needed. But this might be needed when implementing composition
            of modules.
        force_rebind : bool
            Default is `False`. This function does nothing if the executors are already
            bound. With this set to `True`, the executors will be forced to rebind.
        shared_module : Module
            Default is `None`. This is used in bucketing. When not `None`, the shared module
            essentially corresponds to a different bucket -- a module with different symbol
            but with the same sets of parameters (e.g. unrolled RNNs with different lengths).
        """
        # force rebinding is typically used when one wants to switch from
        # the training phase to the prediction phase.
        if force_rebind:
            self._reset_bind()

        if self.binded:
            logger.warning("Already binded, ignoring bind()")
            return

        self.for_training = for_training
        self.inputs_need_grad = inputs_need_grad
        self.binded = True
        self._grad_req = grad_req

        if not for_training:
            assert not inputs_need_grad
        else:
            pass
            # this is not necessarily true, as some modules might not contain a
            # loss function that consumes the labels
            # assert label_shapes is not None

        # self._data_shapes, self._label_shapes = _parse_data_desc(
        #     self.data_names, self.label_names, data_shapes, label_shapes)
        self._data_shapes, self._label_shapes = zip(*[
            _parse_data_desc(self.data_names, self.label_names, data_shape,
                             label_shape)
            for data_shape, label_shape in zip(data_shapes, label_shapes)
        ])
        if self._label_shapes.count(None) == len(self._label_shapes):
            self._label_shapes = None

        if shared_module is not None:
            assert isinstance(
                shared_module, Module
            ) and shared_module.binded and shared_module.params_initialized
            shared_group = shared_module._exec_group
        else:
            shared_group = None

        self._exec_group = DataParallelExecutorGroup(
            self._symbol,
            self._context,
            self._work_load_list,
            self._data_shapes,
            self._label_shapes,
            self._param_names,
            for_training,
            inputs_need_grad,
            shared_group,
            logger=logger,
            fixed_param_names=self._fixed_param_names,
            grad_req=grad_req,
            state_names=self._state_names,
        )
        # self._total_exec_bytes = self._exec_group._total_exec_bytes
        if shared_module is not None:
            self.params_initialized = True
            self._arg_params = shared_module._arg_params
            self._aux_params = shared_module._aux_params
        elif self.params_initialized:
            # if the parameters are already initialized, we are re-binding
            # so automatically copy the already initialized params
            self._exec_group.set_params(self._arg_params, self._aux_params)
        else:
            assert self._arg_params is None and self._aux_params is None
            param_arrays = [
                nd.zeros(x[0].shape, dtype=x[0].dtype)
                for x in self._exec_group.param_arrays
            ]
            self._arg_params = {
                name: arr
                for name, arr in zip(self._param_names, param_arrays)
            }

            aux_arrays = [
                nd.zeros(x[0].shape, dtype=x[0].dtype)
                for x in self._exec_group.aux_arrays
            ]
            self._aux_params = {
                name: arr
                for name, arr in zip(self._aux_names, aux_arrays)
            }

        if shared_module is not None and shared_module.optimizer_initialized:
            self.borrow_optimizer(shared_module)
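For context, the bind/init_optimizer pair shown in these examples mirrors the standard MXNet Module workflow. Below is a hedged sketch against the stock mx.mod.Module API; the shapes and hyper-parameters are illustrative, and the custom module classes above wrap the same calls but may differ in detail:

import mxnet as mx

# a tiny symbolic network
data = mx.sym.Variable("data")
net = mx.sym.FullyConnected(data, num_hidden=10, name="fc1")
net = mx.sym.SoftmaxOutput(net, name="softmax")

mod = mx.mod.Module(symbol=net,
                    data_names=("data",),
                    label_names=("softmax_label",),
                    context=mx.cpu())

# bind allocates executors for the given shapes; binding again is a no-op unless forced
mod.bind(data_shapes=[("data", (32, 100))],
         label_shapes=[("softmax_label", (32,))],
         for_training=True)

mod.init_params(initializer=mx.init.Xavier())
mod.init_optimizer(kvstore="local",
                   optimizer="sgd",
                   optimizer_params=(("learning_rate", 0.01),))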