Beispiel #1
0
def get_best(src: (PATH_TYPE, list),
             *keys,
             with_keys: (str, None) = None,
             with_all=False,
             cmp=lambda x, y: x > y,
             merge=True):
    """
    Scan every record of ``src`` and keep, per key, the best value seen.

    Parameters
    ----------
    src
        Passed to ``loading``, which is iterated record by record.
    keys
        Keys to track. Each value is read from a record with
        ``get_by_key(record, parsed_key=key)``.
    with_keys: str or None
        ``;``-separated keys. When given (and ``with_all`` is false), their
        values are captured from the record that supplied the best value.
    with_all: bool
        When true, the whole winning record is captured instead; this takes
        precedence over ``with_keys``.
    cmp
        Called as ``cmp(candidate, current)``; a truthy result makes the
        candidate the new best. The default keeps the larger value.
    merge: bool
        When true, return ``_merge(best, appendix)``; otherwise return the
        ``(best, appendix)`` pair.

    Returns
    -------
    Either the merged result or a ``(best, appendix)`` tuple. ``appendix``
    is ``None`` when neither ``with_all`` nor ``with_keys`` was requested.
    """
    keys = as_list(keys)
    extra_keys = [] if with_keys is None else with_keys.split(";")

    best = dict.fromkeys(keys)
    appendix = dict.fromkeys(keys)

    for record in loading(src):
        for key in best:
            candidate = get_by_key(record, parsed_key=key)
            # Keep the current best unless the slot is empty or cmp prefers
            # the candidate.
            if best[key] is not None and not cmp(candidate, best[key]):
                continue
            best[key] = candidate
            if with_all:
                appendix[key] = record
            elif extra_keys:
                appendix[key] = {
                    extra: get_by_key(record, extra)
                    for extra in extra_keys
                }

    extras = appendix if with_all or extra_keys else None
    if merge:
        return _merge(best, extras)
    return best, extras
def select_n_most_frequent_students(source: str, target_prefix: str,
                                    ku_dict_path: str, n: (int, list)):
    """
    Write one selection of the most frequent students per requested size.

    ``n`` may be a single value or a list of values; ``None`` in ``n`` means
    select all students. Each selection is written to ``target_prefix``
    with the size appended.
    """
    sizes = as_list(n)
    students = _read(source, ku_dict_path)
    frequency = _frequency(students)
    for size in sizes:
        selected = get_n_most_frequent_students(students, size, frequency)
        target = target_prefix + "%s" % size
        _write(selected, target)
Beispiel #3
0
 def load(data):
     """
     Build a flat list of objects from a ``{device_type: device_ids}`` mapping.

     ``device_ids`` may be a single int, a comma-separated string of ints,
     or any other iterable of ids. Each id is passed to the callable that
     the ``device_type`` string evaluates to.
     """
     contexts = []
     for device_type, device_ids in data.items():
         if isinstance(device_ids, str):
             # "0,1,2" -> 0, 1, 2
             device_ids = map(int, device_ids.split(','))
         elif isinstance(device_ids, int):
             device_ids = as_list(device_ids)
         for device_id in device_ids:
             # NOTE(review): eval() executes whatever expression the key
             # holds; only feed this function trusted configuration.
             factory = eval(device_type)
             contexts.append(factory(device_id))
     return contexts
Beispiel #4
0
 def dump(data):
     """
     Group contexts by device type into a plain ``{device_type: ids}`` dict.

     A device type with several ids maps to a comma-joined string of them;
     a device type with exactly one id maps to that id itself.
     """
     grouped = {}
     for ctx in as_list(data):
         assert isinstance(ctx, Context)
         grouped.setdefault(ctx.device_type, []).append(ctx.device_id)
     return {
         device_type: (",".join(str(device_id) for device_id in ids)
                       if len(ids) > 1 else ids[0])
         for device_type, ids in grouped.items()
     }
Beispiel #5
0
def retry(max_retry=5,
          retry_interval=1,
          retry_errors=None,
          failed_exception=ConnectionError(),
          logger=logger):
    """
    Decorator factory that re-invokes a function when it raises given errors.

    Parameters
    ----------
    max_retry: int
        Maximum number of calls to the wrapped function.
    retry_interval
        Seconds to sleep between two consecutive attempts.
    retry_errors
        Exception class, or collection of classes, that trigger a retry.
        Defaults to ``ConnectionResetError``. Any other exception
        propagates immediately.
    failed_exception
        Raised once every attempt has failed.
    logger
        Logger that receives each caught error at debug level.

    Returns
    -------
    A decorator. The decorated function returns the first successful
    result, or raises ``failed_exception`` after ``max_retry`` failures.
    """
    retry_errors = ConnectionResetError if retry_errors is None else retry_errors
    retry_errors = tuple(as_list(retry_errors))

    def retry_wrapper(f):
        @functools.wraps(f)
        def new_f(*args, **kwargs):
            for attempt in range(max_retry):
                try:
                    return f(*args, **kwargs)
                except retry_errors as e:
                    logger.debug(e)
                    # Only wait when another attempt follows; sleeping after
                    # the last failure would just delay the final error.
                    if attempt + 1 < max_retry:
                        time.sleep(retry_interval)
            raise failed_exception

        return new_f

    return retry_wrapper
Beispiel #6
0
    def __call__(self,
                 tips: str = None,
                 iteration: int = None,
                 train_time: float = None,
                 loss_name_value: dict = None,
                 eval_name_value: dict = None,
                 extra_info: (dict, tuple) = None,
                 dump: bool = True,
                 keep: (set, str) = "data",
                 *args,
                 **kwargs):
        """
        Format one result record, optionally log and persist it, and return it.

        The message has a tab-separated header line (tips, iteration, train
        time, losses, extra info), followed on new lines by the formatted
        evaluation results. The same values are collected into a data dict.

        Parameters
        ----------
        tips: free text placed first in the header line (message only).
        iteration: iteration number, stored under ``self.iteration_name``.
        train_time: training time in seconds, stored as ``train_time``.
        loss_name_value: losses, converted with ``_to_dict``.
        eval_name_value: evaluation metrics, converted with ``_to_dict``.
        extra_info: extra values, converted with ``_to_dict``.
        dump: when true, log the message and append the data as one JSON
            line to the log file, unless the log file is ``False``.
        keep: what to return: ``None`` gives the message; otherwise a
            string or set naming ``"msg"`` and/or ``"data"``.
        kwargs: may override ``logger`` and ``log_f`` for this call.

        Returns
        -------
        The message, the data dict, a ``(msg, data)`` tuple, or ``None``
        when ``keep`` names neither.
        """
        header = []
        record = {}

        if tips is not None:
            header.append("%s" % tips)

        if iteration is not None:
            header.append(self.iteration_fmt.format(iteration))
            record[self.iteration_name] = iteration

        if train_time is not None:
            header.append("Train Time-%.3fs" % train_time)
            record['train_time'] = train_time

        if loss_name_value is not None:
            loss_name_value = _to_dict(loss_name_value)
            assert isinstance(
                loss_name_value, dict
            ), "loss_name_value should be None, dict or tuple, " \
               "now is %s" % type(loss_name_value)
            loss_text, loss_values = self.loss_format(loss_name_value)
            header.append(loss_text)
            record.update(loss_values)

        if extra_info is not None:
            extra_info = _to_dict(extra_info)
            assert isinstance(
                extra_info, dict
            ), "extra_info should be None, dict or tuple, " \
               "now is %s" % type(extra_info)
            header.append(str(extra_info))
            record.update(extra_info)

        # First line: all non-empty header fragments joined by tabs.
        lines = ["\t".join([fragment for fragment in header if fragment])]

        if eval_name_value is not None:
            eval_name_value = _to_dict(eval_name_value)
            assert isinstance(
                eval_name_value, dict
            ), "eval_name_value should be None, dict or tuple, " \
               "now is %s" % type(eval_name_value)
            lines.append(result_format(eval_name_value, col=self.col))
            record.update(eval_name_value)

        text = "\n".join([line for line in lines if line])

        if dump:
            logger = kwargs.get('logger', self.logger)
            logger.info("\n" + text)
            log_f = kwargs.get('log_f', self.log_f)
            if log_f is not False:
                # Persisting is best effort: a failure only raises a warning.
                try:
                    with as_out_io(log_f, "a") as wf:
                        print(json.dumps(record, ensure_ascii=False), file=wf)
                except Exception as e:  # pragma: no cover
                    warnings.warn("Result dumping to file aborted: %s" %
                                  str(e))

        if keep is None:
            return text
        if isinstance(keep, str):
            keep = set(as_list(keep))

        want_msg = "msg" in keep
        want_data = "data" in keep
        if want_msg and want_data:
            return text, record
        if want_msg:
            return text
        if want_data:
            return record
        return None
Beispiel #7
0
def mask_sequence_variable_length(F, data, length, valid_length, time_axis,
                                  merge):
    """
    Mask the steps of each sequence that lie beyond its valid length.

    `Original Code <https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/gluon/rnn/rnn_cell.py#L82>`_

    Parameters
    ----------
    F
        Operator namespace providing ``stack``, ``SequenceMask`` and
        ``split`` (``mxnet.ndarray`` in the examples below).
    data
        A single tensor, or a sequence of per-step tensors which is stacked
        along ``time_axis`` first.
    length
        Number of time steps; used as the number of splits when ``merge``
        is false.
    valid_length
        Valid length of each sequence; must not be None.
    time_axis
        Axis holding the time steps.
    merge: bool
        Whether to return one merged tensor rather than a list of steps.

    Returns
    -------
    masked_sequence: list, ...
        if merge is False, return list of step vector

    Examples
    --------
    >>> import mxnet.ndarray as nd
    >>> import mxnet as mx
    >>> mask_sequence_variable_length(
    ...     nd, mx.nd.ones((2, 4, 3)), 4, nd.array([2, 4]), 1, False
    ... )    # doctest: +NORMALIZE_WHITESPACE
    [
    [[1. 1. 1.]
     [1. 1. 1.]]
    <NDArray 2x3 @cpu(0)>,
    [[1. 1. 1.]
     [1. 1. 1.]]
    <NDArray 2x3 @cpu(0)>,
    [[0. 0. 0.]
     [1. 1. 1.]]
    <NDArray 2x3 @cpu(0)>,
    [[0. 0. 0.]
     [1. 1. 1.]]
    <NDArray 2x3 @cpu(0)>]
    >>> mask_sequence_variable_length(
    ...     nd, mx.nd.ones((2, 4, 3)), 4, nd.array([2, 4]), 1, True
    ... )    # doctest: +NORMALIZE_WHITESPACE
    <BLANKLINE>
    [[[1. 1. 1.]
      [1. 1. 1.]
      [0. 0. 0.]
      [0. 0. 0.]]
    <BLANKLINE>
     [[1. 1. 1.]
      [1. 1. 1.]
      [1. 1. 1.]
      [1. 1. 1.]]]
    <NDArray 2x4x3 @cpu(0)>
    >>> mask_sequence_variable_length(
    ...     nd, [mx.nd.ones((2, 3)), mx.nd.ones((2, 3)), mx.nd.ones((2, 3)), mx.nd.ones((2, 3))],
    ...     4, nd.array([2, 4]), 1, True
    ... )
    <BLANKLINE>
    [[[1. 1. 1.]
      [1. 1. 1.]
      [0. 0. 0.]
      [0. 0. 0.]]
    <BLANKLINE>
     [[1. 1. 1.]
      [1. 1. 1.]
      [1. 1. 1.]
      [1. 1. 1.]]]
    <NDArray 2x4x3 @cpu(0)>
    """
    assert valid_length is not None

    if isinstance(data, tensor_types):
        merged = data
    else:
        # A sequence of step tensors: stack them into one tensor first.
        merged = F.stack(*data, axis=time_axis)

    masked = F.SequenceMask(merged,
                            sequence_length=valid_length,
                            use_sequence_length=True,
                            axis=time_axis)
    if merge:
        return masked

    steps = F.split(masked,
                    num_outputs=length,
                    axis=time_axis,
                    squeeze_axis=True)
    return as_list(steps)
Beispiel #8
0
def format_sequence(length, inputs, layout, merge, in_layout=None):
    """
    Normalise sequence inputs into a merged tensor or a list of steps.

    `Original Code <https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/gluon/rnn/rnn_cell.py#L52>`_

    Parameters
    ----------
    length
        Expected number of time steps, or None to skip the length checks.
    inputs
        A Symbol, an NDArray, or a sequence of per-step Symbols / NDArrays.
    layout
        Target layout string; the positions of ``'T'`` and ``'N'`` give the
        time and batch axes.
    merge
        ``False`` splits a single tensor into a list of steps, ``True``
        stacks a sequence of steps into one tensor; any other value leaves
        the inputs in the form they were given.
    in_layout
        Layout of ``inputs``; defaults to ``layout``.

    Returns
    -------
    inputs, axis, F, batch_size
        The formatted inputs, the time axis of ``layout``, the operator
        module matching the input type (``symbol`` or ``ndarray``) and the
        batch size (left at 0 for symbolic inputs).

    Examples
    --------
    >>> import mxnet.ndarray as nd
    >>> seq = [[[0] * 4, [2] * 4, [4] * 4], [[1] * 4, [3] * 4, [5] * 4]]
    >>> seq1, axis, _, batch_size = format_sequence(3, nd.array(seq), "NTC", False)
    >>> seq1   # doctest: +NORMALIZE_WHITESPACE
    [
    [[0. 0. 0. 0.]
     [1. 1. 1. 1.]]
    <NDArray 2x4 @cpu(0)>,
    [[2. 2. 2. 2.]
     [3. 3. 3. 3.]]
    <NDArray 2x4 @cpu(0)>,
    [[4. 4. 4. 4.]
     [5. 5. 5. 5.]]
    <NDArray 2x4 @cpu(0)>]
    >>> axis
    1
    >>> batch_size
    2
    >>> seq2, _, _, _ = format_sequence(3, nd.array(seq), "NTC", True)
    >>> seq2   # doctest: +NORMALIZE_WHITESPACE
    <BLANKLINE>
    [[[0. 0. 0. 0.]
      [2. 2. 2. 2.]
      [4. 4. 4. 4.]]
    <BLANKLINE>
     [[1. 1. 1. 1.]
      [3. 3. 3. 3.]
      [5. 5. 5. 5.]]]
    <NDArray 2x3x4 @cpu(0)>
    >>> import mxnet.symbol as sym
    >>> seq3, _, _, _ = format_sequence(3, sym.Variable("s", shape=(2, 3, 4)), "NTC", False)
    >>> seq3
    [<Symbol split0>, <Symbol split0>, <Symbol split0>]
    >>> seq4 = [nd.array([[0] * 4, [1] * 4]), nd.array([[2] * 4, [3] * 4]), nd.array([[4] * 4, [5] * 4])]
    >>> seq5, _, _, _ = format_sequence(3, seq4, "NTC", True)
    >>> seq5   # doctest: +NORMALIZE_WHITESPACE
    <BLANKLINE>
    [[[0. 0. 0. 0.]
      [2. 2. 2. 2.]
      [4. 4. 4. 4.]]
    <BLANKLINE>
     [[1. 1. 1. 1.]
      [3. 3. 3. 3.]
      [5. 5. 5. 5.]]]
    <NDArray 2x3x4 @cpu(0)>
    >>> seq6 = [sym.Variable("1", shape=(2, 4)), sym.Variable("2", shape=(2, 4)), sym.Variable("3", shape=(2, 4))]
    >>> seq7, _, _, _ = format_sequence(3, seq6, "NTC", True)
    >>> seq7
    <Symbol stack0>
    """
    assert inputs is not None, \
        "unroll(inputs=None) has been deprecated. " \
        "Please create input variables outside unroll."

    axis = layout.find('T')
    batch_axis = layout.find('N')
    if in_layout is None:
        in_axis = axis
    else:
        in_axis = in_layout.find('T')
    batch_size = 0

    if isinstance(inputs, symbol.Symbol):
        # One symbolic tensor: optionally split it into per-step symbols.
        F = symbol
        if merge is False:
            assert len(inputs.list_outputs()) == 1, \
                "unroll doesn't allow grouped symbol as input. " \
                "Please convert " \
                "to list with list(inputs) first or " \
                "let unroll handle splitting."
            pieces = symbol.split(inputs,
                                  axis=in_axis,
                                  num_outputs=length,
                                  squeeze_axis=1)
            inputs = list(pieces)
    elif isinstance(inputs, ndarray.NDArray):
        # One concrete tensor: optionally split it into per-step arrays.
        F = ndarray
        batch_size = inputs.shape[batch_axis]
        if merge is False:
            n_steps = inputs.shape[in_axis]
            assert length is None or length == n_steps
            pieces = ndarray.split(inputs,
                                   axis=in_axis,
                                   num_outputs=n_steps,
                                   squeeze_axis=1)
            inputs = as_list(pieces)
    else:
        # A sequence of per-step tensors: optionally stack them.
        assert length is None or len(inputs) == length
        first = inputs[0]
        if isinstance(first, symbol.Symbol):
            F = symbol
        else:
            F = ndarray
            batch_size = first.shape[batch_axis]
        if merge is True:
            inputs = F.stack(*inputs, axis=axis)
            in_axis = axis

    # A merged tensor still in the input layout is moved to the target one.
    layout_differs = isinstance(inputs, tensor_types) and axis != in_axis
    if layout_differs:  # pragma: no cover
        # todo: find the test case
        inputs = F.swapaxes(inputs, dim1=axis, dim2=in_axis)

    return inputs, axis, F, batch_size
Beispiel #9
0
def classification_report(y_true,
                          y_pred=None,
                          y_score=None,
                          labels=None,
                          metrics=None,
                          sample_weight=None,
                          average_options=None,
                          multiclass_to_multilabel=False,
                          logger=logging,
                          **kwargs):
    """
    Currently support binary and multiclass classification.

    Parameters
    ----------
    y_true : list, 1d array-like, or label indicator array / sparse matrix
        Ground truth (correct) target values.

    y_pred : list or None, 1d array-like, or label indicator array / sparse matrix
        Estimated targets as returned by a classifier.

    y_score : array or None, shape = [n_samples] or [n_samples, n_classes]
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions
        (as returned by "decision_function" on some classifiers). For binary
        y_true, y_score is supposed to be the score of the class with greater
        label.

    labels : array, shape = [n_labels]
        Optional list of label indices to include in the report.

    metrics: list of str,
        Support: precision, recall, f1, support, accuracy, auc, aupoc.
        When omitted, the metrics are chosen from the inputs: the
        prediction-based ones if ``y_pred`` is given, the score-based ones
        if ``y_score`` is given.

    sample_weight : array-like of shape = [n_samples], optional
        Sample weights.

    average_options: str or list
        default to macro, choices (one or many): "micro", "macro", "samples", "weighted".
        An average for which the underlying report has no row is left out of
        the per-label table instead of raising an error.

    multiclass_to_multilabel: bool
        When true, ``y_true`` is converted with ``multiclass2multilabel`` so
        that auc / aupoc are also reported per label.

    logger
        Object whose ``info`` method receives progress messages.

    kwargs
        ``auc``: dict of extra keyword arguments for ``roc_auc_score``
        (defaults to ``{"multi_class": 'ovo'}``).

    Returns
    -------
    POrderedDict
        Per-label rows and ``"{average}_avg"`` rows map to dicts of metric
        values; ``"accuracy"``, ``"{average}_auc"`` and
        ``"{average}_aupoc"`` map to scalars.

    Examples
    --------
    >>> import numpy as np
    >>> # binary classification
    >>> y_true = np.array([0, 0, 1, 1, 0])
    >>> y_pred = np.array([0, 1, 0, 1, 0])
    >>> classification_report(y_true, y_pred)
               precision    recall        f1  support
    0           0.666667  0.666667  0.666667        3
    1           0.500000  0.500000  0.500000        2
    macro_avg   0.583333  0.583333  0.583333        5
    accuracy: 0.600000
    >>> y_true = np.array([0, 0, 1, 1])
    >>> y_score = np.array([0.1, 0.4, 0.35, 0.8])
    >>> classification_report(y_true, y_score=y_score)    # doctest: +NORMALIZE_WHITESPACE
    macro_auc: 0.750000	macro_aupoc: 0.833333
    >>> y_true = np.array([0, 0, 1, 1])
    >>> y_pred = [0, 0, 0, 1]
    >>> y_score = np.array([0.1, 0.4, 0.35, 0.8])
    >>> classification_report(y_true, y_pred, y_score=y_score)    # doctest: +NORMALIZE_WHITESPACE
               precision  recall        f1  support
    0           0.666667    1.00  0.800000        2
    1           1.000000    0.50  0.666667        2
    macro_avg   0.833333    0.75  0.733333        4
    accuracy: 0.750000	macro_auc: 0.750000	macro_aupoc: 0.833333
    >>> # multiclass classification
    >>> y_true = [0, 1, 2, 2, 2]
    >>> y_pred = [0, 0, 2, 2, 1]
    >>> classification_report(y_true, y_pred)
               precision    recall        f1  support
    0                0.5  1.000000  0.666667        1
    1                0.0  0.000000  0.000000        1
    2                1.0  0.666667  0.800000        3
    macro_avg        0.5  0.555556  0.488889        5
    accuracy: 0.600000
    >>> # multiclass in multilabel
    >>> y_true = np.array([0, 0, 1, 1, 2, 1])
    >>> y_pred = np.array([2, 1, 0, 2, 1, 0])
    >>> y_score = np.array([
    ...    [0.15, 0.4, 0.45],
    ...    [0.1, 0.9, 0.0],
    ...    [0.33333, 0.333333, 0.333333],
    ...    [0.15, 0.4, 0.45],
    ...    [0.1, 0.9, 0.0],
    ...    [0.33333, 0.333333, 0.333333]
    ... ])
    >>> classification_report(
    ...    y_true, y_pred, y_score,
    ...    multiclass_to_multilabel=True,
    ...    metrics=["aupoc"]
    ... )
                  aupoc
    0          0.291667
    1          0.416667
    2          0.166667
    macro_avg  0.291667
    >>> classification_report(
    ...     y_true, y_pred, y_score,
    ...    multiclass_to_multilabel=True,
    ...    metrics=["auc", "aupoc"]
    ... )
                    auc     aupoc
    0          0.250000  0.291667
    1          0.055556  0.416667
    2          0.100000  0.166667
    macro_avg  0.135185  0.291667
    macro_auc: 0.194444
    >>> y_true = np.array([0, 1, 1, 1, 2, 1])
    >>> y_pred = np.array([2, 1, 0, 2, 1, 0])
    >>> y_score = np.array([
    ...    [0.45, 0.4, 0.15],
    ...    [0.1, 0.9, 0.0],
    ...    [0.33333, 0.333333, 0.333333],
    ...    [0.15, 0.4, 0.45],
    ...    [0.1, 0.9, 0.0],
    ...    [0.33333, 0.333333, 0.333333]
    ... ])
    >>> classification_report(
    ...    y_true, y_pred,
    ...    y_score,
    ...    multiclass_to_multilabel=True,
    ... )    # doctest: +NORMALIZE_WHITESPACE
               precision    recall        f1   auc     aupoc  support
    0           0.000000  0.000000  0.000000  1.00  1.000000        1
    1           0.500000  0.250000  0.333333  0.25  0.583333        4
    2           0.000000  0.000000  0.000000  0.10  0.166667        1
    macro_avg   0.166667  0.083333  0.111111  0.45  0.583333        6
    accuracy: 0.166667	macro_auc: 0.437500
    >>> classification_report(
    ...    y_true, y_pred,
    ...    y_score,
    ...    labels=[0, 1],
    ...    multiclass_to_multilabel=True,
    ... )    # doctest: +NORMALIZE_WHITESPACE
               precision  recall        f1   auc     aupoc  support
    0               0.00   0.000  0.000000  1.00  1.000000        1
    1               0.50   0.250  0.333333  0.25  0.583333        4
    macro_avg       0.25   0.125  0.166667  0.45  0.583333        5
    accuracy: 0.166667	macro_auc: 0.437500
    """
    if y_pred is not None:
        check_consistent_length(y_true, y_pred)
    if y_score is not None:
        check_consistent_length(y_true, y_score)

    assert y_pred is not None or y_score is not None

    average_options = set(
        as_list(average_options) if average_options else ["macro"])
    average_label_fmt = "{average}_avg"
    average_metric_fmt = "{average}_{metric}"

    if y_pred is not None:
        _unique_labels = unique_labels(y_true, y_pred)
    else:
        _unique_labels = unique_labels(y_true)

    labels = _unique_labels if labels is None else labels
    labels_set = set(labels)

    if not metrics:
        # Derive the default metric list from the inputs that were supplied.
        if y_pred is not None:
            metrics = [
                "accuracy",
                "precision",
                "recall",
                "f1",
            ]
        else:
            metrics = []
        if y_score is not None:
            metrics += [
                "auc",
                "aupoc",
            ]
        if y_pred is not None:
            metrics += ["support"]
    _metrics = set(metrics)

    ret = OrderedDict()

    # Metrics computed from hard predictions via the underlying report.
    basic_metrics = _metrics & {
        "precision", "recall", "f1", "support", "accuracy"
    }
    if basic_metrics:
        logger.info("evaluate %s" % ",".join(basic_metrics))
        cr_result = cr(y_true,
                       y_pred,
                       labels=labels,
                       sample_weight=sample_weight,
                       output_dict=True)

        # The report does not always carry an accuracy entry; compute it
        # directly in that case.
        if "accuracy" in cr_result:
            acc = cr_result.pop("accuracy")
        else:
            acc = accuracy_score(y_true, y_pred)

        if "accuracy" in _metrics:
            ret["accuracy"] = acc

        for key, value in cr_result.items():
            ret[key] = {}

            for k in basic_metrics:
                # The report names the f1 column "f1-score".
                _k = k if k != "f1" else "f1-score"
                if _k in value:
                    ret[key][k] = value[_k]

        # Rename the requested "<average> avg" rows to "<average>_avg" and
        # drop the rest.
        for average in ["micro", "macro", "samples", "weighted"]:
            _label = average_label_fmt.format(average=average)
            __label = " ".join(_label.split("_"))
            if __label not in ret:
                # The report has no row for this average, so there is
                # nothing to rename or drop.
                continue
            row = ret.pop(__label)
            if average in average_options:
                ret[_label] = row

    if "auc" in _metrics:
        logger.info("evaluate auc")

        assert y_score is not None, "when evaluate auc, y_score is required"

        func = functools.partial(roc_auc_score,
                                 y_score=y_score,
                                 sample_weight=sample_weight,
                                 **kwargs.get("auc", {"multi_class": 'ovo'}))

        if multiclass_to_multilabel:
            _y_true = multiclass2multilabel(y_true)
            auc_score = func(y_true=_y_true, average=None)
            for _label, score in enumerate(auc_score):
                if _label not in labels_set:
                    continue
                # Rows are keyed by str(label), like the report-based rows.
                if str(_label) not in ret:
                    ret[str(_label)] = {}
                ret[str(_label)]["auc"] = score

            for average in average_options:
                auc_score = func(y_true=_y_true, average=average)
                _label = average_label_fmt.format(average=average)
                if _label not in ret:
                    ret[_label] = {}

                ret[_label]["auc"] = auc_score

        for average in average_options:
            auc_score = func(y_true=y_true, average=average)
            _label = average_metric_fmt.format(average=average, metric="auc")
            ret[_label] = auc_score

    if "aupoc" in _metrics:
        logger.info("evaluate aupoc")

        func = functools.partial(average_precision_score,
                                 y_score=y_score,
                                 sample_weight=sample_weight)

        if multiclass_to_multilabel:
            _y_true = multiclass2multilabel(y_true)
            aupoc = func(y_true=_y_true, average=None)
            for _label, score in enumerate(aupoc):
                if _label not in labels_set:
                    continue
                if str(_label) not in ret:
                    ret[str(_label)] = {}
                ret[str(_label)]["aupoc"] = score
            for average in average_options:
                _label = average_label_fmt.format(average=average)
                aupoc = func(y_true=_y_true, average=average)
                if _label not in ret:
                    ret[_label] = {}
                ret[_label]["aupoc"] = aupoc
        # The scalar "<average>_aupoc" entry is only produced for binary
        # targets.
        if len(_unique_labels) == 2:
            for average in average_options:
                aupoc = func(y_true=y_true)
                _label = average_metric_fmt.format(average=average,
                                                   metric="aupoc")
                ret[_label] = aupoc

    logger.info("sorting metrics")
    # Re-emit every row with its columns in the order given by `metrics`.
    _ret = POrderedDict()
    for key in ret:
        if isinstance(ret[key], dict):
            _ret[key] = OrderedDict()
            for k in metrics:
                if k in ret[key]:
                    _ret[key][k] = ret[key][k]
        else:
            _ret[key] = ret[key]

    return _ret