Example #1
    def add(self, *columns):
        for column in as_list(columns):
            if column is None:  # pragma: no cover
                continue

            _match = self._pattern.match(column)

            _exclude_tag = _match.group("exclude_tag")
            _regex_tag = _match.group("regex_tag")
            _pattern_str = _match.group("pattern_str")

            # "[!]" marks an exclude pattern and "$regex:" marks a regex pattern
            # (see the parsing regex in the __init__ of Example #5)
            if not _regex_tag:
                if _exclude_tag:
                    self.exact_exclude_columns.add(_pattern_str)
                else:
                    self.exact_include_columns.add(_pattern_str)
            else:
                if _exclude_tag:
                    self.regex_exclude_columns.append(re.compile(_pattern_str))
                else:
                    self.regex_include_columns.append(re.compile(_pattern_str))
        assert not (
            (self.exact_include_columns or self.regex_include_columns)
            and (self.exact_exclude_columns or self.regex_exclude_columns)
        ), "include mode and exclude mode are exclusive"

        self._mode = "include" if self.exact_include_columns or self.regex_include_columns else "exclude"
Example #2
def _target_names(*files, target_names=None, suffix, prefix=""):
    """

    Examples
    --------
    >>> files = ["x.txt"]
    >>> _target_names(*files, suffix=[".train", ".test"])
    [['x.train.txt', 'x.test.txt']]
    >>> _target_names(*files, suffix=[".train", ".test"], prefix="data/")
    [['data/x.train.txt', 'data/x.test.txt']]
    >>> _target_names(*files, suffix=[".train", ".test"], target_names=[["train.txt", "test.txt"]])
    [['train.txt', 'test.txt']]
    """
    if target_names is None:
        if not prefix:
            return [[
                "%s%s" %
                (PurePath(_file).with_suffix(_suffix), type_from_name(_file))
                for _suffix in suffix
            ] for _file in files]
        else:
            return [[
                "%s%s%s" % (prefix, PurePath(_file).with_suffix(_suffix).name,
                            type_from_name(_file)) for _suffix in suffix
            ] for _file in files]
    else:
        return as_list(target_names)
Example #3
def auto_types(df: pd.DataFrame, excluded: (str, Iterable) = None, verbose=False, pattern_mode=False, **kwargs):
    """
    Only infer the types of object columns

    Parameters
    ----------
    df: pd.DataFrame
    excluded: str or Iterable
        the columns to be excluded from type inference
    verbose: bool
    pattern_mode: bool
        When pattern mode is set as True,
        matching columns will be inferred using regex patterns, which is time-consuming

    Returns
    -------
    df: pd.DataFrame


    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({"a": [1, 2, 3, 4, 5], "b": [0.1, 0.2, 0.3, 0.4, 0.5], "c": ["a", "b", "c", "d", "e"]})
    >>> df = auto_types(df)
    >>> df.dtypes
    a       int64
    b     float64
    c    category
    dtype: object
    >>> df = pd.DataFrame({"a": [1, 2, 3, 4, 5], "b": [0.1, 0.2, 0.3, 0.4, 0.5], "c": ["a", "b", "c", "d", "e"]})
    >>> df = auto_types(df, excluded=["c"])
    >>> df.dtypes
    a      int64
    b    float64
    c     object
    dtype: object
    """
    __log = _get_log_f(verbose=verbose, **kwargs)
    if excluded:
        excluded = set(
            as_list(excluded) if not pattern_mode
            else _filter_columns([e for e in excluded], df.columns)
        )
    else:
        excluded = set()

    if excluded:
        __log("Auto typing: excluded columns: %s" % ", ".join(excluded))

    for column in tqdm(_get_columns_by_dtype(df, "object"), "auto typing", disable=not verbose):
        if column in excluded:
            continue
        numeric_column = pd.to_numeric(df[column].copy(), errors="coerce")
        if numeric_column.count() > 0:
            df[column] = numeric_column
        else:
            df[column] = df[column].astype("category")
    return df
Example #4
    def sample(self, query: (int, str, list), n=1, excluded_key=None, neg=True, *args, **kwargs):
        # the negative (or positive) candidate pool recorded for this query
        candidates = self.df.loc[query][self.neg_field] if neg else self.df.loc[query][self.pos_field]

        if excluded_key is not None:
            candidates = list(set(candidates) - set(as_list(excluded_key)))

        sampled = self.random_state.choice(candidates, min(n, len(candidates)), replace=False).tolist()
        return sampled
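`sample` draws without replacement from the query's candidate pool after removing excluded keys. A minimal self-contained sketch of that core logic, with toy candidates and `numpy.random.default_rng` standing in for `self.random_state` (names and values here are illustrative, not from the original class):

import numpy as np

candidates = ["i1", "i2", "i3", "i4", "i5"]
excluded_key = ["i2"]
n = 3

# drop the excluded keys, then sample without replacement
pool = list(set(candidates) - set(excluded_key))
rng = np.random.default_rng(10)
# never ask for more items than the pool holds
sampled = rng.choice(pool, min(n, len(pool)), replace=False).tolist()
print(sampled)  # e.g. ['i4', 'i1', 'i5'] (order depends on the seed)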
Example #5
    def __init__(self, columns: Iterable = None):
        self._pattern = re.compile(
            r"^(?P<exclude_tag>\[!\])*(?P<regex_tag>\$regex:)*(?P<pattern_str>.*)"
        )
        self.exact_include_columns = set()
        self.exact_exclude_columns = set()
        self.regex_include_columns = []
        self.regex_exclude_columns = []
        self._mode = None
        if columns is not None:
            self.add(*as_list(columns))
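A quick, self-contained sketch of how this tag-parsing regex splits a column spec into its exclude tag, regex tag, and pattern string (the same named groups that drive the branching in the `add` method of Example #1):

import re

pattern = re.compile(
    r"^(?P<exclude_tag>\[!\])*(?P<regex_tag>\$regex:)*(?P<pattern_str>.*)"
)

for spec in ["user_id", "[!]item_id", "$regex:^feat_.*$", "[!]$regex:^tmp_"]:
    m = pattern.match(spec)
    print(m.group("exclude_tag"), m.group("regex_tag"), m.group("pattern_str"))
# None None user_id
# [!] None item_id
# None $regex: ^feat_.*$
# [!] $regex: ^tmp_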
Example #6
def eval_f(_net, test_data, ctx=mx.cpu()):
    k = test_data[1]["k"]
    k = as_list(k) if k is not None else []
    max_k = max(k) if k else None
    top_k_ground_truth = []
    top_k_prediction = []
    ground_truth = []
    prediction = []

    for batch_data in tqdm(test_data[0], "evaluating"):
        ctx_data = split_and_load(ctx, *batch_data, even_split=False)
        for (user, item, label) in ctx_data:
            output = _net(user, item)
            label = label.asnumpy().astype("int")
            pred = output.asnumpy()
            ground_truth.append(label.tolist())
            prediction.append(pred.tolist())
            if max_k:
                # rank items by predicted score (descending) and keep the top max_k
                top_k_indices = np.argsort(pred)[::-1]
                _top_k_indices = top_k_indices[:max_k]
                # zero-pad when fewer than max_k items are available
                padding = [0] * (max_k - len(_top_k_indices)) if len(
                    _top_k_indices) < max_k else []
                top_k_prediction.append(pred[_top_k_indices].tolist() +
                                        padding)
                top_k_ground_truth.append(label[_top_k_indices].tolist() +
                                          padding)

    chained_ground_truth = list(chain(*ground_truth))
    chained_prediction = list(chain(*prediction))
    metrics = {
        # squared=False yields RMSE rather than MSE, matching the "rmse" key
        "rmse": mean_squared_error(chained_ground_truth, chained_prediction, squared=False),
        # sklearn.metrics.mean_absolute_error, matching the "mae" key
        "mae": mean_absolute_error(chained_ground_truth, chained_prediction),
    }

    metrics.update(
        classification_report(
            chained_ground_truth,
            [0 if v < 0.5 else 1
             for v in chained_prediction], chained_prediction))

    if k:
        metrics_k = {"ndcg": {}, "HR": {}}
        for _k in k:
            metrics_k["ndcg"][_k] = ndcg_score(top_k_ground_truth,
                                               top_k_prediction,
                                               k=_k)
            metrics_k["HR"][_k] = _hit_rate(top_k_ground_truth, k=_k)
        metrics.update(metrics_k)
    return metrics
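The top-k bookkeeping above is easy to sanity-check in isolation. A minimal numpy-only sketch of the argsort-and-pad step, with toy pred/label vectors (no MXNet or network involved):

import numpy as np

pred = np.array([0.1, 0.9, 0.4])
label = np.array([0, 1, 1])
max_k = 5  # more than the number of items, so padding kicks in

top_k_indices = np.argsort(pred)[::-1][:max_k]         # -> [1, 2, 0]
padding = [0] * (max_k - len(top_k_indices))           # -> [0, 0]
top_k_pred = pred[top_k_indices].tolist() + padding    # [0.9, 0.4, 0.1, 0, 0]
top_k_truth = label[top_k_indices].tolist() + padding  # [1, 1, 0, 0, 0]
print(top_k_pred, top_k_truth)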
Example #7
def extract_params_combinations(candidates: dict, external=None):
    """
    >>> candidates = {'b': [1, 2], 'c': [0, 3], 'd': '$b'}
    >>> list(extract_params_combinations(candidates))
    [{'b': 1, 'c': 0, 'd': 1}, {'b': 1, 'c': 3, 'd': 1}, {'b': 2, 'c': 0, 'd': 2}, {'b': 2, 'c': 3, 'd': 2}]
    >>> candidates = {'a': [1, 2], 'b': '$c'}
    >>> external = {'c': 3}
    >>> list(extract_params_combinations(candidates, external))
    [{'a': 1, 'b': 3}, {'a': 2, 'b': 3}]
    """
    external = {} if external is None else external
    params_paths, params_values = dict2pv(candidates)
    params_values = [as_list(value) for value in params_values]

    for params in itertools.product(*params_values):
        _params = {}

        for p, v in zip(params_paths, params):
            list2dict(p, v, _params)

        # a string value like "$b" or "$c:..." is a reference: resolve it against
        # the generated params first and then against the external dict
        for p, v in zip(params_paths, params):
            if isinstance(v, str) and v[0] == '$':
                map_key_path = v.lstrip('$').split(":")
                _dict_obj = get_dict_by_path(_params, p[:-1])
                for map_dict in [_params, external]:
                    try:
                        _map_dict_obj = get_dict_by_path(
                            map_dict, map_key_path)
                        _dict_obj[p[-1]] = _map_dict_obj
                        break
                    except KeyError:
                        try:
                            _map_dict_obj = get_dict_by_path(
                                map_dict, p[:-1] + map_key_path)
                            _dict_obj[p[-1]] = _map_dict_obj
                            break
                        except KeyError:
                            pass

                else:
                    # for/else: reaching here means no break fired, i.e. the
                    # reference could not be resolved anywhere
                    raise KeyError(
                        "The mapped key should be in either candidates or external, but cannot find %s"
                        % v)

        yield _params
Example #8
    def implicit_sample(self, query: (int, str, list), n=1, excluded_key=None, fast_mode=False, samples=None,
                        *args, fast_max_try=100, **kwargs):
        # everything already observed for this query is excluded
        exclude = set(self.df.loc[query][self.pos_field]) | set(self.df.loc[query][self.neg_field])
        if excluded_key is not None:
            exclude |= set(as_list(excluded_key))
        if samples is not None:
            exclude |= set(samples)
        if fast_mode is False:
            # exact mode: materialize the full candidate set, then sample without replacement
            candidates = list(set(range(*self.key_range)) - exclude)
            sampled = self.random_state.choice(candidates, min(n, len(candidates)), replace=False).tolist()
            return sampled
        else:
            # fast mode: rejection sampling with a bounded number of tries
            sampled = set()
            try_cnt = 0
            while len(sampled) < n and try_cnt < n + fast_max_try:
                _sample = self.random_state.integers(*self.key_range)
                if _sample not in exclude and _sample not in sampled:
                    sampled.add(_sample)
                try_cnt += 1
            return list(sampled)
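A minimal self-contained sketch of the fast-mode rejection sampling, using `numpy.random.default_rng`, a toy exclude set, and the same try budget (the ranges and ids here are illustrative):

import numpy as np

key_range = (0, 100)  # item-id range, as drawn by random integers
exclude = {3, 7, 42}
n, fast_max_try = 5, 100

rng = np.random.default_rng(10)
sampled, try_cnt = set(), 0
while len(sampled) < n and try_cnt < n + fast_max_try:
    candidate = int(rng.integers(*key_range))
    if candidate not in exclude and candidate not in sampled:
        sampled.add(candidate)
    try_cnt += 1
print(sorted(sampled))  # five ids from [0, 100) avoiding the excluded ones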
Example #9
def as_array(obj):
    if isinstance(obj, np.ndarray):
        return obj
    else:
        return np.asarray(as_list(obj))
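A short usage sketch (assuming `as_list` wraps scalars and other non-list inputs into plain lists, as the other snippets on this page suggest):

import numpy as np

print(as_array(3))             # -> [3]; a scalar is first wrapped by as_list
print(as_array([1, 2, 3]))     # -> [1 2 3]
print(as_array(np.arange(3)))  # an ndarray is returned unchanged, no copy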
Example #10
def ranking_report(y_true,
                   y_pred,
                   k: (int, list) = None,
                   continuous=False,
                   coerce="ignore",
                   pad_pred=-100,
                   metrics=None,
                   bottom=False,
                   verbose=True) -> POrderedDict:
    r"""

    Parameters
    ----------
    y_true: list of list
        the ground-truth relevance labels, one list per query
    y_pred: list of list
        the predicted scores, one list per query
    k: int or list
        the cutoff ranks to evaluate at,
        default to [1, 3, 5, 10] ([3, 5, 10] when continuous is True)
    continuous: bool
        whether y_true holds continuous relevance values instead of binary labels
    coerce: str
        one of "ignore", "abandon", "raise" and "padding",
        controlling what happens when a query has fewer than k items
    pad_pred: float
        the score assigned to padded items when coerce="padding"
    metrics: list of str
        the metrics to report, default to mrr and ndcg, plus auc, map, coverage_error,
        ranking_loss, precision, recall and f1 when continuous is False
    bottom: bool
        whether to also report the bottom-ranking variants, suffixed with "(B)"
    verbose: bool

    Returns
    -------
    ret: POrderedDict

    Examples
    --------
    >>> y_true = [[1, 0, 0], [0, 0, 1]]
    >>> y_pred = [[0.75, 0.5, 1], [1, 0.2, 0.1]]
    >>> ranking_report(y_true, y_pred)  # doctest: +NORMALIZE_WHITESPACE
           ndcg@k  precision@k  recall@k  f1@k  len@k  support@k
    1   1.000000     0.000000       0.0   0.0    1.0          2
    3   0.565465     0.333333       1.0   0.5    3.0          2
    5   0.565465     0.333333       1.0   0.5    3.0          2
    10  0.565465     0.333333       1.0   0.5    3.0          2
    auc: 0.250000	map: 0.416667	mrr: 0.416667	coverage_error: 2.500000	ranking_loss: 0.750000	len: 3.000000
    support: 2
    >>> ranking_report(y_true, y_pred, k=[1, 3, 5])  # doctest: +NORMALIZE_WHITESPACE
           ndcg@k  precision@k  recall@k  f1@k  len@k  support@k
    1   1.000000     0.000000       0.0   0.0    1.0          2
    3   0.565465     0.333333       1.0   0.5    3.0          2
    5   0.565465     0.333333       1.0   0.5    3.0          2
    auc: 0.250000	map: 0.416667	mrr: 0.416667	coverage_error: 2.500000	ranking_loss: 0.750000	len: 3.000000
    support: 2
    >>> ranking_report(y_true, y_pred, bottom=True)  # doctest: +NORMALIZE_WHITESPACE
              ndcg@k  precision@k  recall@k  f1@k  len@k  support@k  ndcg@k(B) \
    1   1.000000     0.000000       0.0   0.0    1.0          2   1.000000
    3   0.565465     0.333333       1.0   0.5    3.0          2   0.806574
    5   0.565465     0.333333       1.0   0.5    3.0          2   0.806574
    10  0.565465     0.333333       1.0   0.5    3.0          2   0.806574
    <BLANKLINE>
        precision@k(B)  recall@k(B)   f1@k(B)  len@k(B)  support@k(B)
    1         0.500000         0.25  0.333333       1.0             2
    3         0.666667         1.00  0.800000       3.0             2
    5         0.666667         1.00  0.800000       3.0             2
    10        0.666667         1.00  0.800000       3.0             2
    auc: 0.250000	map: 0.416667	mrr: 0.416667	coverage_error: 2.500000	ranking_loss: 0.750000	len: 3.000000
    support: 2	map(B): 0.708333	mrr(B): 0.750000
    >>> ranking_report(y_true, y_pred, bottom=True, metrics=["auc"])  # doctest: +NORMALIZE_WHITESPACE
    auc: 0.250000   len: 3.000000	support: 2
    >>> y_true = [[0.9, 0.7, 0.1], [0, 0.5, 1]]
    >>> y_pred = [[0.75, 0.5, 1], [1, 0.2, 0.1]]
    >>> ranking_report(y_true, y_pred, continuous=True)  # doctest: +NORMALIZE_WHITESPACE
          ndcg@k  len@k  support@k
    3   0.675647    3.0          2
    5   0.675647    3.0          2
    10  0.675647    3.0          2
    mrr: 0.750000	len: 3.000000	support: 2
    >>> y_true = [[1, 0], [0, 0, 1]]
    >>> y_pred = [[0.75, 0.5], [1, 0.2, 0.1]]
    >>> ranking_report(y_true, y_pred)  # doctest: +NORMALIZE_WHITESPACE
        ndcg@k  precision@k  recall@k      f1@k  len@k  support@k
    1     1.00     0.500000       0.5  0.500000    1.0          2
    3     0.75     0.416667       1.0  0.583333    2.5          2
    5     0.75     0.416667       1.0  0.583333    2.5          2
    10    0.75     0.416667       1.0  0.583333    2.5          2
    auc: 0.500000	map: 0.666667	mrr: 0.666667	coverage_error: 2.000000	ranking_loss: 0.500000	len: 2.500000
    support: 2
    >>> ranking_report(y_true, y_pred, coerce="abandon")  # doctest: +NORMALIZE_WHITESPACE
       ndcg@k  precision@k  recall@k  f1@k  len@k  support@k
    1     1.0     0.500000       0.5   0.5    1.0          2
    3     0.5     0.333333       1.0   0.5    3.0          1
    auc: 0.500000	map: 0.666667	mrr: 0.666667	coverage_error: 2.000000	ranking_loss: 0.500000	len: 2.500000
    support: 2
    >>> ranking_report(y_true, y_pred, coerce="padding")  # doctest: +NORMALIZE_WHITESPACE
        ndcg@k  precision@k  recall@k      f1@k  len@k  support@k
    1     1.00     0.500000       0.5  0.500000    1.0          2
    3     0.75     0.416667       1.0  0.583333    2.5          2
    5     0.75     0.416667       1.0  0.583333    2.5          2
    10    0.75     0.416667       1.0  0.583333    2.5          2
    auc: 0.500000	map: 0.666667	mrr: 0.666667	coverage_error: 2.000000	ranking_loss: 0.500000	len: 2.500000
    support: 2
    >>> ranking_report(y_true, y_pred, bottom=True)  # doctest: +NORMALIZE_WHITESPACE
        ndcg@k  precision@k  recall@k      f1@k  len@k  support@k  ndcg@k(B)  \
    1     1.00     0.500000       0.5  0.500000    1.0          2   1.000000
    3     0.75     0.416667       1.0  0.583333    2.5          2   0.846713
    5     0.75     0.416667       1.0  0.583333    2.5          2   0.846713
    10    0.75     0.416667       1.0  0.583333    2.5          2   0.846713
    <BLANKLINE>
        precision@k(B)  recall@k(B)   f1@k(B)  len@k(B)  support@k(B)
    1         0.500000          0.5  0.500000       1.0             2
    3         0.583333          1.0  0.733333       2.5             2
    5         0.583333          1.0  0.733333       2.5             2
    10        0.583333          1.0  0.733333       2.5             2
    auc: 0.500000	map: 0.666667	mrr: 0.666667	coverage_error: 2.000000	ranking_loss: 0.500000	len: 2.500000
    support: 2	map(B): 0.791667	mrr(B): 0.750000
    >>> ranking_report(y_true, y_pred, bottom=True, coerce="abandon")  # doctest: +NORMALIZE_WHITESPACE
       ndcg@k  precision@k  recall@k  f1@k  len@k  support@k  ndcg@k(B)  \
    1     1.0     0.500000       0.5   0.5    1.0          2   1.000000
    3     0.5     0.333333       1.0   0.5    3.0          1   0.693426
    <BLANKLINE>
       precision@k(B)  recall@k(B)  f1@k(B)  len@k(B)  support@k(B)
    1        0.500000          0.5      0.5       1.0             2
    3        0.666667          1.0      0.8       3.0             1
    auc: 0.500000	map: 0.666667	mrr: 0.666667	coverage_error: 2.000000	ranking_loss: 0.500000
    len: 2.500000	support: 2	map(B): 0.791667	mrr(B): 0.750000
    >>> ranking_report(y_true, y_pred, bottom=True, coerce="padding")  # doctest: +NORMALIZE_WHITESPACE
        ndcg@k  precision@k  recall@k      f1@k  len@k  support@k  ndcg@k(B)  \
    1     1.00     0.500000       0.5  0.500000    1.0          2   1.000000
    3     0.75     0.416667       1.0  0.583333    2.5          2   0.846713
    5     0.75     0.416667       1.0  0.583333    2.5          2   0.846713
    10    0.75     0.416667       1.0  0.583333    2.5          2   0.846713
    <BLANKLINE>
        precision@k(B)  recall@k(B)   f1@k(B)  len@k(B)  support@k(B)
    1             0.50          0.5  0.500000       1.0             2
    3             0.50          1.0  0.650000       3.0             2
    5             0.30          1.0  0.452381       5.0             2
    10            0.15          1.0  0.257576      10.0             2
    auc: 0.500000	map: 0.666667	mrr: 0.666667	coverage_error: 2.000000	ranking_loss: 0.500000	len: 2.500000
    support: 2	map(B): 0.791667	mrr(B): 0.750000
    """
    import numpy as np
    from collections import OrderedDict
    from sklearn.metrics import (label_ranking_average_precision_score,
                                 ndcg_score, label_ranking_loss,
                                 coverage_error)
    assert coerce in {"ignore", "abandon", "raise", "padding"}
    if metrics is None:
        metrics = ["mrr", "ndcg"]
        if continuous is False:
            metrics.extend([
                "auc", "map", "coverage_error", "ranking_loss", "precision",
                "recall", "f1"
            ])
    metrics = set(metrics)

    if k is not None:
        k = as_list(k)
    else:
        if continuous is True:
            k = [3, 5, 10]
        else:
            k = [1, 3, 5, 10]

    results = {
        "auc": [],
        "map": [],
        "mrr": [],
        "coverage_error": [],
        "ranking_loss": [],
        "len": [],
        "support": [],
    }
    if bottom:
        results.update({
            "map(B)": [],
            "mrr(B)": [],
        })
    k_results = {}
    for _k in k:
        k_results[_k] = {
            "ndcg@k": [],
            "precision@k": [],
            "recall@k": [],
            "f1@k": [],
            "len@k": [],
            "support@k": [],
        }
        if bottom:
            k_results[_k].update({
                "ndcg@k(B)": [],
                "precision@k(B)": [],
                "recall@k(B)": [],
                "f1@k(B)": [],
                "len@k(B)": [],
                "support@k(B)": [],
            })
    suffix = [""]
    if bottom:
        suffix += ["(B)"]

    for label, pred in tqdm(zip(y_true, y_pred),
                            "ranking metrics",
                            disable=not verbose):
        if continuous is False and "map" in metrics:
            results["map"].append(
                label_ranking_average_precision_score([label], [pred]))
            if bottom:
                results["map(B)"].append(
                    label_ranking_average_precision_score(
                        [(1 - np.asarray(label)).tolist()],
                        [(-np.asarray(pred)).tolist()]))

        if len(label) > 1 and continuous is False:
            if "coverage_error" in metrics:
                results["coverage_error"].append(
                    coverage_error([label], [pred]))
            if "ranking_loss" in metrics:
                results["ranking_loss"].append(
                    label_ranking_loss([label], [pred]))

        results["len"].append(len(label))
        results["support"].append(1)
        label_pred = list(
            sorted(zip(label, pred), key=lambda x: x[1], reverse=True))
        sorted_label = list(zip(*label_pred))[0]
        if "auc" in metrics:
            results["auc"].append(ranking_auc(sorted_label))
        if "mrr" in metrics:
            try:
                results["mrr"].append(
                    1 / (np.asarray(sorted_label).nonzero()[0][0] + 1))
            except IndexError:  # pragma: no cover
                pass
            try:
                if bottom:
                    results["mrr(B)"].append(
                        1 /
                        (np.asarray(sorted_label[::-1]).nonzero()[0][0] + 1))
            except IndexError:  # pragma: no cover
                pass

        if metrics & {"ndcg", "precision", "recall", "f1"}:
            for _k in k:
                for _suffix in suffix:
                    if _suffix == "":
                        _label_pred = deepcopy(label_pred)
                        if len(_label_pred) < _k:
                            if coerce == "ignore":
                                pass
                            elif coerce == "abandon":
                                continue
                            elif coerce == "raise":
                                raise ValueError(
                                    "Not enough value: %s vs target %s" %
                                    (len(_label_pred), _k))
                            elif coerce == "padding":  # pragma: no cover
                                _label_pred += [(0, pad_pred)
                                                ] * (_k - len(_label_pred))
                        k_label_pred = _label_pred[:_k]
                        total_label = sum(label)
                    else:
                        inv_label_pred = [(1 - _l, -p)
                                          for _l, p in label_pred][::-1]
                        if len(inv_label_pred) < _k:
                            if coerce == "ignore":
                                pass
                            elif coerce == "abandon":
                                continue
                            elif coerce == "raise":  # pragma: no cover
                                raise ValueError(
                                    "Not enough value: %s vs target %s" %
                                    (len(inv_label_pred), _k))
                            elif coerce == "padding":
                                inv_label_pred += [
                                    (0, pad_pred)
                                ] * (_k - len(inv_label_pred))
                        k_label_pred = inv_label_pred[:_k]
                        total_label = len(label) - sum(label)

                    if not k_label_pred:  # pragma: no cover
                        continue
                    k_label, k_pred = list(zip(*k_label_pred))
                    if "ndcg" in metrics:
                        if len(k_label) == 1 and "ndcg" in metrics:
                            k_results[_k]["ndcg@k%s" % _suffix].append(1)
                        else:
                            k_results[_k]["ndcg@k%s" % _suffix].append(
                                ndcg_score([k_label], [k_pred]))
                    p = sum(k_label) / len(k_label)
                    r = sum(k_label) / total_label if total_label else 0
                    if "precision" in metrics:
                        k_results[_k]["precision@k%s" % _suffix].append(p)
                    if "recall" in metrics:
                        k_results[_k]["recall@k%s" % _suffix].append(r)
                    if "f1" in metrics:
                        k_results[_k]["f1@k%s" %
                                      _suffix].append(2 * p * r /
                                                      (p + r) if p + r else 0)
                    k_results[_k]["len@k%s" % _suffix].append(len(k_label))
                    k_results[_k]["support@k%s" % _suffix].append(1)

    ret = POrderedDict()
    for key, value in results.items():
        if value:
            if key == "support":
                ret[key] = np.sum(value).item()
            else:
                ret[key] = np.mean(value).item()

    if metrics & {"ndcg", "precision", "recall", "f1"}:
        for k, key_value in k_results.items():
            ret[k] = OrderedDict()
            for key, value in key_value.items():
                if value:
                    if key in {"support@k", "support@k(B)"}:
                        ret[k][key] = np.sum(value).item()
                    else:
                        ret[k][key] = np.mean(value).item()
    return ret
Example #11
def loss_dict2tmt_loss(loss_dict,
                       loss2value=lambda x: x,
                       exclude=None,
                       include=None,
                       as_loss=as_tmt_loss):
    """

    Parameters
    ----------
    loss_dict: dict
        a mapping from loss name to loss function
    loss2value
        a transformation applied to a loss result before it is tracked
    exclude: str or Iterable
        the loss names to keep as plain functions
    include: str or Iterable
        when given, only these loss names are wrapped
    as_loss
        the wrapping function, default to as_tmt_loss

    Returns
    -------

    Examples
    --------
    >>> def mse(v):
    ...     return v ** 2
    >>> losses = loss_dict2tmt_loss({"mse": mse, "rmse": lambda x: x})
    >>> losses.keys()
    dict_keys(['mse', 'rmse'])
    >>> ema = EMAValue(losses)
    >>> losses["mse"](2)
    4
    >>> losses["rmse"](2)
    2
    >>> ema.items()
    dict_items([('mse', 4), ('rmse', 2)])
    >>> losses = loss_dict2tmt_loss({"mse": mse, "rmse": lambda x: x}, include="mse")
    >>> losses.keys()
    dict_keys(['mse', 'rmse'])
    >>> ema = EMAValue(losses, auto="ignore")
    >>> losses["mse"](2)
    4
    >>> losses["rmse"](2)
    2
    >>> ema.items()
    dict_items([('mse', 4), ('rmse', nan)])
    >>> losses = loss_dict2tmt_loss({"mse": mse, "rmse": lambda x: x}, exclude="mse")
    >>> losses.keys()
    dict_keys(['mse', 'rmse'])
    >>> ema = EMAValue(losses, auto="ignore")
    >>> losses["mse"](2)
    4
    >>> losses["rmse"](2)
    2
    >>> ema.items()
    dict_items([('mse', nan), ('rmse', 2)])
    """
    exclude = set() if exclude is None else set(as_list(exclude))
    if include is not None:
        include = set(as_list(include))
        return {
            name: as_loss(func, loss2value) if name in include else func
            for name, func in loss_dict.items()
        }
    return {
        name: as_loss(func, loss2value) if name not in exclude else func
        for name, func in loss_dict.items()
    }
Example #12
def regression_report(
    y_true,
    y_pred,
    metrics=None,
    sample_weight=None,
    multioutput="uniform_average",
    average_options=None,
    key_prefix="",
    key_suffix="",
    verbose=True,
):
    """

    Parameters
    ----------
    y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Ground truth (correct) target values.

    y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Estimated target values.

    metrics: list of str,
        Support: evar(explained_variance), mse, rmse, mae, r2

    sample_weight : array-like of shape (n_samples,), optional
        Sample weights.

    multioutput : string in ['raw_values', 'uniform_average', 'variance_weighted'], list
        or array-like of shape (n_outputs)
        Defines aggregating of multiple output values.
        Disabled when verbose is True.
        Array-like value defines weights used to average errors.
        'raw_values' :
            Returns a full set of errors in case of multioutput input.
        'uniform_average' :
            Errors of all outputs are averaged with uniform weight.
            Alias: "macro"
        'variance_weighted':
            Only supported for evar and r2.
            Scores of all outputs are averaged, weighted by the variances of each individual output.
            Alias: "vw"

    average_options: str or list
        default to ["macro", "vw"], choices (one or many): "macro", "vw", or a list of weights

    key_prefix: str
    key_suffix: str
    verbose: bool

    Returns
    -------
    evar: explained variance
    mse: mean squared error
    rmse: root mean squared error
    mae: mean absolute error
    r2: r2 score

    Examples
    ---------
    >>> y_true = [[0.5, 1, 1], [-1, 1, 1], [7, -6, 1]]
    >>> y_pred = [[0, 2, 1], [-1, 2, 1], [8, -5, 1]]
    >>> regression_report(y_true, y_pred)   # doctest: +NORMALIZE_WHITESPACE
                           evar       mse      rmse  mae        r2
    0                  0.967742  0.416667  0.645497  0.5  0.965438
    1                  1.000000  1.000000  1.000000  1.0  0.908163
    2                  1.000000  0.000000  0.000000  0.0  1.000000
    uniform_average    0.989247  0.472222  0.548499  0.5  0.957867
    variance_weighted  0.983051  0.472222  0.548499  0.5  0.938257
    >>> regression_report(y_true, y_pred, verbose=False)   # doctest: +NORMALIZE_WHITESPACE
    evar: 0.989247	mse: 0.472222	rmse: 0.548499	mae: 0.500000	r2: 0.957867
    >>> regression_report(
    ...     y_true, y_pred, multioutput="variance_weighted", verbose=False
    ... )   # doctest: +NORMALIZE_WHITESPACE
    evar: 0.983051	mse: 0.472222	rmse: 0.548499	mae: 0.500000	r2: 0.938257
    >>> regression_report(y_true, y_pred, multioutput=[0.3, 0.6, 0.1], verbose=False)   # doctest: +NORMALIZE_WHITESPACE
    evar: 0.990323	mse: 0.725000	rmse: 0.793649	mae: 0.750000	r2: 0.934529
    >>> regression_report(y_true, y_pred, verbose=True)   # doctest: +NORMALIZE_WHITESPACE
                           evar       mse      rmse  mae        r2
    0                  0.967742  0.416667  0.645497  0.5  0.965438
    1                  1.000000  1.000000  1.000000  1.0  0.908163
    2                  1.000000  0.000000  0.000000  0.0  1.000000
    uniform_average    0.989247  0.472222  0.548499  0.5  0.957867
    variance_weighted  0.983051  0.472222  0.548499  0.5  0.938257
    >>> regression_report(
    ...     y_true, y_pred, verbose=True, average_options=["macro", "vw", [0.3, 0.6, 0.1]]
    ... )   # doctest: +NORMALIZE_WHITESPACE
                           evar       mse      rmse   mae        r2
    0                  0.967742  0.416667  0.645497  0.50  0.965438
    1                  1.000000  1.000000  1.000000  1.00  0.908163
    2                  1.000000  0.000000  0.000000  0.00  1.000000
    uniform_average    0.989247  0.472222  0.548499  0.50  0.957867
    variance_weighted  0.983051  0.472222  0.548499  0.50  0.938257
    weighted           0.990323  0.725000  0.793649  0.75  0.934529
    """
    legal_metrics = ["evar", "rmse", "mse", "mae", "r2"]
    if not metrics:
        metrics = legal_metrics

    _metrics = set(metrics)
    assert not _metrics - set(legal_metrics)

    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    average_options = as_list(average_options) if average_options else [
        "macro", "vw"
    ]

    alias_dict = {
        "macro": "uniform_average",
        "vw": "variance_weighted",
    }

    ret = POrderedDict()

    if len(y_true.shape) > 1 and verbose:
        _ret = regression_report(
            y_true,
            y_pred,
            sample_weight=sample_weight,
            metrics=_metrics,
            multioutput="raw_values",
            key_prefix=key_prefix,
            key_suffix=key_suffix,
            verbose=False,
        )
        for i in range(y_true.shape[1]):
            ret[i] = {}
            for _metric in _ret.keys():
                ret[i][_metric] = _ret[_metric][i]

        for _multioutput in average_options:
            __multioutput = _multioutput if isinstance(
                _multioutput, list) else alias_dict.get(
                    _multioutput, _multioutput)
            _ret = regression_report(y_true,
                                     y_pred,
                                     metrics=_metrics,
                                     sample_weight=sample_weight,
                                     multioutput=__multioutput,
                                     key_prefix=key_prefix,
                                     key_suffix=key_suffix,
                                     verbose=False)
            _name = "weighted" if isinstance(_multioutput,
                                             list) else __multioutput
            ret[_name] = {}
            for _metric in _ret:
                ret[_name][_metric] = _ret[_metric]

    else:
        if "evar" in _metrics:
            ret[key_prefix + "evar" + key_suffix] = explained_variance_score(
                y_true,
                y_pred,
                sample_weight=sample_weight,
                multioutput=multioutput)

        if "mse" in _metrics:
            _multioutput = "uniform_average" if multioutput == "variance_weighted" else multioutput
            ret[key_prefix + "mse" + key_suffix] = mean_squared_error(
                y_true,
                y_pred,
                sample_weight=sample_weight,
                multioutput=_multioutput)
        if "rmse" in _metrics:
            _multioutput = "uniform_average" if multioutput == "variance_weighted" else multioutput
            ret[key_prefix + "rmse" + key_suffix] = mean_squared_error(
                y_true,
                y_pred,
                sample_weight=sample_weight,
                squared=False,
                multioutput=_multioutput)

        if "mae" in _metrics:
            _multioutput = "uniform_average" if multioutput == "variance_weighted" else multioutput
            ret[key_prefix + "mae" + key_suffix] = mean_absolute_error(
                y_true,
                y_pred,
                sample_weight=sample_weight,
                multioutput=_multioutput)

        if "r2" in metrics:
            ret[key_prefix + "r2" + key_suffix] = r2_score(
                y_true,
                y_pred,
                sample_weight=sample_weight,
                multioutput=multioutput)

    return ret
Example #13
def category2codes(
        df: pd.DataFrame, offset: (int, list, dict) = 1, columns: (str, Iterable) = None,
        pattern_mode=True, verbose=False, inplace=True,
        **kwargs):
    """
    Numerically encode the categorical columns

    Parameters
    ----------
    df: pd.DataFrame
    offset: int, list, dict
        usually 0 or 1, default to 1 so that nan (coded as -1 by pandas) maps to a non-negative code
    columns: str or Iterable
        the categorical columns to convert to codes
    pattern_mode: bool
        When pattern mode is set as True,
        matching columns will be inferred using regex patterns, which is time-consuming
    verbose: bool
    inplace: bool

    Returns
    -------
    df: pd.DataFrame

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({"a1": [1, 2, 300], "a2": [0.1, 0.2, 0.3], "b": ["a", "c", "d"]})
    >>> df.dtypes
    a1      int64
    a2    float64
    b      object
    dtype: object
    >>> df = columns_to_category(df, ["a1", "a2", "b"])
    >>> df.dtypes
    a1    category
    a2    category
    b     category
    dtype: object
    >>> category2codes(df, offset=[0, 1, 2], inplace=False)
       a1  a2  b
    0   0   1  2
    1   1   2  3
    2   2   3  4
    >>> category2codes(df, offset=0, columns=["$regex:^a.*$"], pattern_mode=True, inplace=False)
       a1  a2  b
    0   0   0  a
    1   1   1  c
    2   2   2  d
    >>> category2codes(df, offset={"a1": 0, "a2": 1, "b": 0}, inplace=False)
       a1  a2  b
    0   0   1  0
    1   1   2  1
    2   2   3  2
    >>> from collections import defaultdict
    >>> d_offset = defaultdict(int)
    >>> d_offset.update({"a1": 2, "a2": 1})
    >>> category2codes(df, offset=d_offset, inplace=False)
       a1  a2  b
    0   2   1  0
    1   3   2  1
    2   4   3  2
    """
    df = df if inplace else df.copy()
    __log = _get_log_f(verbose=verbose, **kwargs)

    columns = as_list(columns) if columns else df.select_dtypes(include=["category"]).columns.values.tolist()

    if isinstance(offset, (int, list)):
        columns = _filter_columns(columns, df.columns) if pattern_mode is True else columns
        if isinstance(offset, list):
            assert len(columns) == len(offset), "columns[%s] vs offset[%s]" % (
                len(columns), len(offset)
            )
            __log("categorical columns to be coded: %s" % list(zip(columns, offset)))
        else:
            __log("categorical columns to be coded: %s" % ", ".join(columns))
            __log("offset is %s" % offset)
            offset = [offset] * len(columns)
    elif isinstance(offset, dict):
        # assert pattern_mode is True, "set pattern mode as True when offset is dict"
        columns = _filter_columns(columns, df.columns)
        offset = [
            offset[column] for column in columns
        ]
        __log("categorical columns to be coded: %s" % list(zip(columns, offset)))
    else:
        raise TypeError("Cannot handle %s type offset" % type(offset))

    for column, _offset in tqdm(zip(columns, offset), "encoding categorical columns", disable=not verbose):
        df[column] = df[column].cat.codes + _offset

    return df
Example #14
def columns_to_category(df: pd.DataFrame,
                        columns: list, to_codes: bool = False, columns_to_codes: list = None,
                        pattern_mode=True,
                        code_pattern_mode=None,
                        verbose=False,
                        inplace=True,
                        *args, **kwargs):
    """
    Convert the specified columns to the category dtype

    Parameters
    ----------
    df: pd.DataFrame
    columns: list
        the columns to be converted
    to_codes: bool
        whether to also encode the converted columns as integer codes
    columns_to_codes: list
        the columns to encode as codes, default to columns
    pattern_mode: bool
        When pattern mode is set as True,
        matching columns will be inferred using regex patterns, which is time-consuming
    code_pattern_mode: bool or None
        like pattern_mode but applied to columns_to_codes, default to pattern_mode
    verbose: bool
    inplace: bool
    args
    kwargs

    Returns
    -------
    df: pd.DataFrame

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({"a": [1, 2, 300, 4, 5], "b": [0.1, 0.2, 0.3, 0.4, 0.5], "a2": ["a", "b", "c", "d", "e"]})
    >>> df.dtypes
    a       int64
    b     float64
    a2     object
    dtype: object
    >>> df = columns_to_category(df, ["a2"])
    >>> df.dtypes
    a        int64
    b      float64
    a2    category
    dtype: object
    >>> columns_to_category(df, ["a2"], to_codes=True, columns_to_codes=["a2"])
         a    b  a2
    0    1  0.1   1
    1    2  0.2   2
    2  300  0.3   3
    3    4  0.4   4
    4    5  0.5   5
    >>> columns_to_category(df, ["a", "b"], to_codes=True, offset=0, inplace=False)
       a  b  a2
    0  0  0   1
    1  1  1   2
    2  4  2   3
    3  2  3   4
    4  3  4   5
    >>> columns_to_category(df, ["a"], to_codes=True, offset=2, inplace=False)
       a    b  a2
    0  2  0.1   1
    1  3  0.2   2
    2  6  0.3   3
    3  4  0.4   4
    4  5  0.5   5
    """
    df = df if inplace else df.copy()
    __log = _get_log_f(verbose=verbose, **kwargs)

    columns = as_list(columns)
    columns = _filter_columns(columns, df.columns) if pattern_mode else columns

    __log("columns to be categorical: %s" % columns)

    for column in tqdm(columns, "columns to be categorical", disable=not verbose):
        if df[column].dtype.name == "category":
            __log("column[%s] has been categorical, ignored" % column)
            continue
        df[column] = df[column].astype("category")
    if to_codes:
        columns_to_codes = columns if not columns_to_codes else columns_to_codes
        code_pattern_mode = pattern_mode if code_pattern_mode is None else code_pattern_mode
        columns = _filter_columns(columns_to_codes, columns) if code_pattern_mode else as_list(columns_to_codes)
        category2codes(df, columns=columns, verbose=verbose, pattern_mode=False, **kwargs)

    return df
Example #15
def columns_to_datetime(
        df: pd.DataFrame,
        columns: (str, list),
        datetime_format: (str, list, dict) = None,
        pattern_mode=False,
        verbose=False,
        *args, **kwargs):
    """

    Parameters
    ----------
    df
    columns: str or list
        The columns to be interpreted as datetime
    datetime_format: str, list or dict
        the format(s) passed to pd.to_datetime, one per column when a list or dict is given
    pattern_mode: bool
        When pattern mode is set as True,
        matching columns will be inferred using regex patterns, which is time-consuming
    verbose: bool

    Returns
    -------
    df: pd.DataFrame

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({"id": [1, 2, 3], "t": ["1931-09-18", "1949-10-01", "2020-10-12"]})
    >>> df = columns_to_datetime(df, "t")
    >>> df.dtypes
    id             int64
    t     datetime64[ns]
    dtype: object
    >>> df = pd.DataFrame({"t1": ["1931-09-18", "1949-10-01", "unknown"], "t2": ["20201012", "20201013", "unknown"]})
    >>> df = columns_to_datetime(df, ["t1", "t2"])
    >>> df.dtypes
    t1    datetime64[ns]
    t2    datetime64[ns]
    dtype: object
    >>> df
              t1         t2
    0 1931-09-18 2020-10-12
    1 1949-10-01 2020-10-13
    2        NaT        NaT
    >>> df = pd.DataFrame({
    ...     "t1": ["1931-09-18:023358", "1949-10-01:040909"],
    ...     "t2": ["20201012-011203", "20201013-070301"]
    ... })
    >>> df = columns_to_datetime(
    ...     df, ["t1", "t2"],
    ...     datetime_format=["%Y-%m-%d:%H%M%S", "%Y%m%d-%H%M%S"]
    ... )
    >>> df
                       t1                  t2
    0 1931-09-18 02:33:58 2020-10-12 01:12:03
    1 1949-10-01 04:09:09 2020-10-13 07:03:01
    >>> df = pd.DataFrame({
    ...     "t1": ["1931-09-18:023358", "1949-10-01:040909"],
    ...     "t2": ["20201012-011203", "20201013-070301"]
    ... })
    >>> df = columns_to_datetime(
    ...     df, ["t1", "t2"],
    ...     datetime_format={"t1": "%Y-%m-%d:%H%M%S", "t2": "%Y%m%d-%H%M%S"}
    ... )
    >>> df
                       t1                  t2
    0 1931-09-18 02:33:58 2020-10-12 01:12:03
    1 1949-10-01 04:09:09 2020-10-13 07:03:01
    """
    __log = _get_log_f(verbose=verbose, **kwargs)

    columns = as_list(columns)

    if datetime_format is None or isinstance(datetime_format, (str, list)):
        columns = _filter_columns(columns, df.columns) if pattern_mode is True else columns
        if isinstance(datetime_format, list):
            assert len(columns) == len(datetime_format), "columns[%s] vs datetime_format[%s]" % (
                len(columns), len(datetime_format)
            )
            __log("columns to be datetime: %s" % list(zip(columns, datetime_format)))
        else:
            __log("columns to be datetime: %s" % ", ".join(columns))
            __log("datetime format is %s" % datetime_format)
            datetime_format = [datetime_format] * len(columns)
    elif isinstance(datetime_format, dict):
        # assert pattern_mode is True, "set pattern mode as True when datetime_format is dict"
        columns = _filter_columns(columns, df.columns)
        datetime_format = [
            datetime_format[column] for column in columns
        ]
        __log("columns to be datetime: %s" % list(zip(columns, datetime_format)))
    else:
        raise TypeError("Cannot handle %s type datetime_format" % type(datetime_format))

    for column, _format in tqdm(zip(columns, datetime_format), "columns to datetime", disable=not verbose):
        df[column] = pd.to_datetime(df[column], format=_format, errors="coerce", *args, **kwargs)

    return df
Example #16
File: net.py Project: tswsxk/XKT
def begin_states(shapes, prefix, func=mx.nd.zeros):
    # create one initial-state tensor per shape, named "<prefix>begin_state_<i>"
    states = []
    for i, shape in enumerate(as_list(shapes)):
        state = func(name='%sbegin_state_%d' % (prefix, i), shape=shape)
        states.append(state)
    return states
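A minimal usage sketch with hypothetical shapes for a one-layer LSTM; `func` is swapped for a lambda that ignores the generated name, since whether `mx.nd.zeros` accepts a `name` keyword depends on the MXNet version:

import mxnet as mx

# two state tensors (e.g. hidden and cell state), batch_size=4, hidden=8
states = begin_states(
    shapes=[(4, 8), (4, 8)],
    prefix="lstm0_",
    func=lambda name, shape: mx.nd.zeros(shape),  # name is generated but unused here
)
print([s.shape for s in states])  # -> [(4, 8), (4, 8)]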
Example #17
def as_array(array):
    if isinstance(array, nd.NDArray):
        return array
    else:
        return nd.array(as_list(array))
Example #18
    def learn(self, learning_item: (int, str, list)):
        # practicing a concept increases its state by the learner's capacity;
        # max(..., 1) floors the updated value at 1
        for concept in as_list(learning_item):
            self.state[concept] = max(
                self.state[concept] + self.capacity[concept], 1)
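A minimal self-contained sketch of the same update rule, with plain dicts standing in for the learner's `state` and `capacity` (the concept names and values are hypothetical):

from collections import defaultdict

state = defaultdict(float, {"algebra": 0.2})
capacity = defaultdict(float, {"algebra": 0.9, "geometry": 0.3})

for concept in ["algebra", "geometry"]:
    state[concept] = max(state[concept] + capacity[concept], 1)

print(dict(state))  # {'algebra': 1.1, 'geometry': 1}; small updates are floored at 1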
Example #19
def _get_columns_by_dtype(df: pd.DataFrame, dtype: (str, list)):
    dtype = as_list(dtype)
    return df.select_dtypes(include=dtype).columns
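A short usage sketch on a toy frame (`select_dtypes` is standard pandas, so a single dtype or a list both work once wrapped by `as_list`):

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [0.1, 0.2], "c": ["x", "y"]})
print(list(_get_columns_by_dtype(df, "object")))              # -> ['c']
print(list(_get_columns_by_dtype(df, ["int64", "float64"])))  # -> ['a', 'b']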