def add(self, *columns):
    for column in as_list(columns):
        if column is None:  # pragma: no cover
            continue
        _match = self._pattern.match(column)
        _exclude_tag = _match.group("exclude_tag")
        _regex_tag = _match.group("regex_tag")
        _pattern_str = _match.group("pattern_str")
        if not _regex_tag:
            if _exclude_tag:
                self.exact_exclude_columns.add(_pattern_str)
            else:
                self.exact_include_columns.add(_pattern_str)
        else:
            if _exclude_tag:
                self.regex_exclude_columns.append(re.compile(_pattern_str))
            else:
                self.regex_include_columns.append(re.compile(_pattern_str))
    assert not (
        (self.exact_include_columns or self.regex_include_columns)
        and (self.exact_exclude_columns or self.regex_exclude_columns)
    ), "include mode and exclude mode are exclusive"
    self._mode = "include" if self.exact_include_columns or self.regex_include_columns else "exclude"
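def _example_add():
    r"""
    Illustrative usage sketch (helper name hypothetical, not part of the
    original source). `add` is a method, so this harness fakes its host
    object with SimpleNamespace to show the column-spec grammar: a plain
    name is an exact include, a "$regex:" prefix switches to regex matching,
    and a leading "[!]" flips the column into exclude mode.

    >>> from types import SimpleNamespace
    >>> sel = SimpleNamespace(
    ...     _pattern=re.compile(r"^(?P<exclude_tag>\[!\])*(?P<regex_tag>\$regex:)*(?P<pattern_str>.*)"),
    ...     exact_include_columns=set(), exact_exclude_columns=set(),
    ...     regex_include_columns=[], regex_exclude_columns=[], _mode=None)
    >>> add(sel, "user_id", "$regex:^feat_.*$")
    >>> sel._mode
    'include'
    >>> sorted(sel.exact_include_columns)
    ['user_id']
    """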
def _target_names(*files, target_names=None, suffix, prefix=""):
    """
    Examples
    --------
    >>> files = ["x.txt"]
    >>> _target_names(*files, suffix=[".train", ".test"])
    [['x.train.txt', 'x.test.txt']]
    >>> _target_names(*files, suffix=[".train", ".test"], prefix="data/")
    [['data/x.train.txt', 'data/x.test.txt']]
    >>> _target_names(*files, suffix=[".train", ".test"], target_names=[["train.txt", "test.txt"]])
    [['train.txt', 'test.txt']]
    """
    if target_names is None:
        if not prefix:
            return [[
                "%s%s" % (PurePath(_file).with_suffix(_suffix), type_from_name(_file))
                for _suffix in suffix
            ] for _file in files]
        else:
            return [[
                "%s%s%s" % (prefix, PurePath(_file).with_suffix(_suffix).name, type_from_name(_file))
                for _suffix in suffix
            ] for _file in files]
    else:
        return as_list(target_names)
def auto_types(df: pd.DataFrame, excluded: (str, Iterable) = None, verbose=False, pattern_mode=False, **kwargs):
    """
    Infer better dtypes, considering object columns only.

    Parameters
    ----------
    df: pd.DataFrame
    excluded: str or Iterable
        the columns excluded from type inference
    verbose: bool
    pattern_mode: bool
        When pattern_mode is set as True, matching columns will be inferred
        using regex patterns, which is time-consuming.

    Returns
    -------
    df: pd.DataFrame

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({"a": [1, 2, 3, 4, 5], "b": [0.1, 0.2, 0.3, 0.4, 0.5], "c": ["a", "b", "c", "d", "e"]})
    >>> df = auto_types(df)
    >>> df.dtypes
    a       int64
    b     float64
    c    category
    dtype: object
    >>> df = pd.DataFrame({"a": [1, 2, 3, 4, 5], "b": [0.1, 0.2, 0.3, 0.4, 0.5], "c": ["a", "b", "c", "d", "e"]})
    >>> df = auto_types(df, excluded=["c"])
    >>> df.dtypes
    a      int64
    b    float64
    c     object
    dtype: object
    """
    __log = _get_log_f(verbose=verbose, **kwargs)
    if excluded:
        excluded = set(
            as_list(excluded) if not pattern_mode else _filter_columns([e for e in excluded], df.columns)
        )
    else:
        excluded = set()
    if excluded:
        __log("Auto typing: excluded columns: %s" % ", ".join(excluded))
    for column in tqdm(_get_columns_by_dtype(df, "object"), "auto typing", disable=not verbose):
        if column in excluded:
            continue
        # try numeric conversion first; fall back to category when nothing converts
        numeric_column = pd.to_numeric(df[column].copy(), errors="coerce")
        if numeric_column.count() > 0:
            df[column] = numeric_column
        else:
            df[column] = df[column].astype("category")
    return df
def sample(self, query: (int, str, list), n=1, excluded_key=None, neg=True, *args, **kwargs):
    candidates = self.df.loc[query][self.neg_field] if neg else self.df.loc[query][self.pos_field]
    if excluded_key is not None:
        candidates = list(set(candidates) - set(as_list(excluded_key)))
    sampled = self.random_state.choice(candidates, min(n, len(candidates)), replace=False).tolist()
    return sampled
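def _example_sample():
    """
    Illustrative usage sketch (helper name hypothetical, not part of the
    original source). `sample` is a method of an interaction sampler whose
    class is not shown here, so the harness fakes it with SimpleNamespace;
    with neg=True it draws without replacement from the negative field of
    the queried row.

    >>> import pandas as pd
    >>> import numpy as np
    >>> from types import SimpleNamespace
    >>> df = pd.DataFrame({"pos": [[1, 2]], "neg": [[3, 4, 5]]}, index=["q"])
    >>> sampler = SimpleNamespace(df=df, pos_field="pos", neg_field="neg",
    ...                           random_state=np.random.default_rng(10))
    >>> sorted(sample(sampler, "q", n=3))
    [3, 4, 5]
    >>> sample(sampler, "q", n=2, excluded_key=[3, 4])
    [5]
    """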
def __init__(self, columns: Iterable = None):
    # a column spec is: optional "[!]" exclude tag, optional "$regex:" tag, then the pattern
    self._pattern = re.compile(
        r"^(?P<exclude_tag>\[!\])*(?P<regex_tag>\$regex:)*(?P<pattern_str>.*)"
    )
    self.exact_include_columns = set()
    self.exact_exclude_columns = set()
    self.regex_include_columns = []
    self.regex_exclude_columns = []
    self._mode = None
    if columns is not None:
        self.add(*as_list(columns))
def eval_f(_net, test_data, ctx=mx.cpu()):
    # "mae" denotes the *mean* absolute error; import it explicitly
    from sklearn.metrics import mean_absolute_error

    k = test_data[1]["k"]
    k = as_list(k) if k is not None else []
    max_k = max(k) if k else None
    top_k_ground_truth = []
    top_k_prediction = []
    ground_truth = []
    prediction = []
    for batch_data in tqdm(test_data[0], "evaluating"):
        ctx_data = split_and_load(ctx, *batch_data, even_split=False)
        for (user, item, label) in ctx_data:
            pred = _net(user, item)
            label = label.asnumpy().astype("int")
            pred = pred.asnumpy()
            ground_truth.append(label.tolist())
            prediction.append(pred.tolist())
            if max_k:
                # rank predictions, truncate to max_k, zero-pad short lists
                top_k_indices = np.argsort(pred)[::-1]
                _top_k_indices = top_k_indices[:max_k]
                padding = [0] * (max_k - len(_top_k_indices)) if len(_top_k_indices) < max_k else []
                top_k_prediction.append(pred[_top_k_indices].tolist() + padding)
                top_k_ground_truth.append(label[_top_k_indices].tolist() + padding)
    chained_ground_truth = list(chain(*ground_truth))
    chained_prediction = list(chain(*prediction))
    metrics = {
        # mean_squared_error returns the MSE; take the square root for RMSE
        "rmse": np.sqrt(mean_squared_error(chained_ground_truth, chained_prediction)),
        "mae": mean_absolute_error(chained_ground_truth, chained_prediction),
    }
    metrics.update(
        classification_report(
            chained_ground_truth,
            [0 if v < 0.5 else 1 for v in chained_prediction],
            chained_prediction
        )
    )
    if k:
        metrics_k = {"ndcg": {}, "HR": {}}
        for _k in k:
            metrics_k["ndcg"][_k] = ndcg_score(top_k_ground_truth, top_k_prediction, k=_k)
            metrics_k["HR"][_k] = _hit_rate(top_k_ground_truth, k=_k)
        metrics.update(metrics_k)
    return metrics
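def _example_top_k_padding():
    """
    Illustrative sketch (helper name hypothetical, not part of the original
    source) isolating the top-k block of `eval_f` with plain numpy: rank the
    predictions, truncate to max_k, and zero-pad when fewer than max_k items
    exist.

    >>> import numpy as np
    >>> pred = np.array([0.2, 0.9, 0.5]); label = np.array([0, 1, 1]); max_k = 5
    >>> idx = np.argsort(pred)[::-1][:max_k]
    >>> padding = [0] * (max_k - len(idx))
    >>> pred[idx].tolist() + padding
    [0.9, 0.5, 0.2, 0, 0]
    >>> label[idx].tolist() + padding
    [1, 1, 0, 0, 0]
    """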
def extract_params_combinations(candidates: dict, external=None):
    """
    >>> candidates = {'b': [1, 2], 'c': [0, 3], 'd': '$b'}
    >>> list(extract_params_combinations(candidates))
    [{'b': 1, 'c': 0, 'd': 1}, {'b': 1, 'c': 3, 'd': 1}, {'b': 2, 'c': 0, 'd': 2}, {'b': 2, 'c': 3, 'd': 2}]
    >>> candidates = {'a': [1, 2], 'b': '$c'}
    >>> external = {'c': 3}
    >>> list(extract_params_combinations(candidates, external))
    [{'a': 1, 'b': 3}, {'a': 2, 'b': 3}]
    """
    external = {} if external is None else external
    params_paths, params_values = dict2pv(candidates)
    params_values = [as_list(value) for value in params_values]
    for params in itertools.product(*params_values):
        _params = {}
        for p, v in zip(params_paths, params):
            list2dict(p, v, _params)
        for p, v in zip(params_paths, params):
            # a "$key" value is resolved against candidates first, then external
            if isinstance(v, str) and v.startswith('$'):
                map_key_path = v.lstrip('$').split(":")
                _dict_obj = get_dict_by_path(_params, p[:-1])
                for map_dict in [_params, external]:
                    try:
                        _dict_obj[p[-1]] = get_dict_by_path(map_dict, map_key_path)
                        break
                    except KeyError:
                        try:
                            _dict_obj[p[-1]] = get_dict_by_path(map_dict, p[:-1] + map_key_path)
                            break
                        except KeyError:
                            pass
                else:
                    raise KeyError(
                        "The mapped key should be in either candidates or external, "
                        "but cannot find %s" % v)
        yield _params
def implicit_sample(self, query: (int, str, list), n=1, excluded_key=None,
                    fast_mode=False, samples=None, *args, fast_max_try=100, **kwargs):
    exclude = set(self.df.loc[query][self.pos_field]) | set(self.df.loc[query][self.neg_field])
    if excluded_key is not None:
        exclude |= set(as_list(excluded_key))
    if samples is not None:
        exclude |= set(samples)
    if fast_mode is False:
        # exact mode: materialize all remaining candidates, then sample without replacement
        candidates = list(set(range(*self.key_range)) - exclude)
        sampled = self.random_state.choice(candidates, min(n, len(candidates)), replace=False).tolist()
        return sampled
    else:
        # fast mode: rejection sampling, bounded by n + fast_max_try draws
        sampled = set()
        try_cnt = 0
        while len(sampled) < n and try_cnt < n + fast_max_try:
            _sample = self.random_state.integers(*self.key_range)
            if _sample not in exclude and _sample not in sampled:
                sampled.add(_sample)
            try_cnt += 1
        return list(sampled)
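def _example_implicit_fast_mode():
    """
    Illustrative sketch (helper name hypothetical, not part of the original
    source): the fast_mode branch of `implicit_sample` is plain rejection
    sampling over key_range, bounded by n + fast_max_try draws.

    >>> import numpy as np
    >>> rng = np.random.default_rng(10)
    >>> exclude, key_range, n, fast_max_try = {1, 2}, (0, 10), 3, 100
    >>> sampled, try_cnt = set(), 0
    >>> while len(sampled) < n and try_cnt < n + fast_max_try:
    ...     s = int(rng.integers(*key_range))
    ...     if s not in exclude and s not in sampled:
    ...         sampled.add(s)
    ...     try_cnt += 1
    >>> len(sampled), sampled & exclude
    (3, set())
    """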
def as_array(obj):
    if isinstance(obj, np.ndarray):
        return obj
    else:
        return np.asarray(as_list(obj))
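def _example_as_array():
    """
    Illustrative sketch (helper name hypothetical, not part of the original
    source): `as_array` wraps scalars and lists via as_list before converting,
    and passes ndarrays through untouched.

    >>> import numpy as np
    >>> as_array(3)
    array([3])
    >>> as_array([1, 2]).shape
    (2,)
    >>> a = np.arange(3)
    >>> as_array(a) is a
    True
    """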
def ranking_report(y_true, y_pred, k: (int, list) = None, continuous=False,
                   coerce="ignore", pad_pred=-100, metrics=None,
                   bottom=False, verbose=True) -> POrderedDict:
    r"""
    Parameters
    ----------
    y_true: list of list
        the ground-truth relevance labels of each ranking list
    y_pred: list of list
        the predicted scores of each ranking list, aligned with y_true
    k: int or list
        the cutoff(s) for the @k metrics;
        defaults to [1, 3, 5, 10] ([3, 5, 10] when continuous is True)
    continuous: bool
        whether y_true holds continuous relevance instead of binary labels
    coerce: str
        one of {"ignore", "abandon", "raise", "padding"};
        what to do when a ranking list is shorter than k
    pad_pred: int or float
        the prediction score used for padding when coerce is "padding"
    metrics: list of str
        the metrics to be computed; defaults to all applicable metrics
    bottom: bool
        additionally evaluate the bottom of each ranking list (reversed
        order); those metrics are suffixed with "(B)"
    verbose: bool
        whether to show the progress bar

    Returns
    -------

    Examples
    --------
    >>> y_true = [[1, 0, 0], [0, 0, 1]]
    >>> y_pred = [[0.75, 0.5, 1], [1, 0.2, 0.1]]
    >>> ranking_report(y_true, y_pred)  # doctest: +NORMALIZE_WHITESPACE
          ndcg@k  precision@k  recall@k  f1@k  len@k  support@k
    1   1.000000     0.000000       0.0   0.0    1.0          2
    3   0.565465     0.333333       1.0   0.5    3.0          2
    5   0.565465     0.333333       1.0   0.5    3.0          2
    10  0.565465     0.333333       1.0   0.5    3.0          2
    auc: 0.250000 map: 0.416667 mrr: 0.416667 coverage_error: 2.500000 ranking_loss: 0.750000 len: 3.000000 support: 2
    >>> ranking_report(y_true, y_pred, k=[1, 3, 5])  # doctest: +NORMALIZE_WHITESPACE
         ndcg@k  precision@k  recall@k  f1@k  len@k  support@k
    1  1.000000     0.000000       0.0   0.0    1.0          2
    3  0.565465     0.333333       1.0   0.5    3.0          2
    5  0.565465     0.333333       1.0   0.5    3.0          2
    auc: 0.250000 map: 0.416667 mrr: 0.416667 coverage_error: 2.500000 ranking_loss: 0.750000 len: 3.000000 support: 2
    >>> ranking_report(y_true, y_pred, bottom=True)  # doctest: +NORMALIZE_WHITESPACE
          ndcg@k  precision@k  recall@k  f1@k  len@k  support@k  ndcg@k(B)  \
    1   1.000000     0.000000       0.0   0.0    1.0          2   1.000000
    3   0.565465     0.333333       1.0   0.5    3.0          2   0.806574
    5   0.565465     0.333333       1.0   0.5    3.0          2   0.806574
    10  0.565465     0.333333       1.0   0.5    3.0          2   0.806574
    <BLANKLINE>
        precision@k(B)  recall@k(B)   f1@k(B)  len@k(B)  support@k(B)
    1         0.500000         0.25  0.333333       1.0             2
    3         0.666667         1.00  0.800000       3.0             2
    5         0.666667         1.00  0.800000       3.0             2
    10        0.666667         1.00  0.800000       3.0             2
    auc: 0.250000 map: 0.416667 mrr: 0.416667 coverage_error: 2.500000 ranking_loss: 0.750000 len: 3.000000 support: 2 map(B): 0.708333 mrr(B): 0.750000
    >>> ranking_report(y_true, y_pred, bottom=True, metrics=["auc"])  # doctest: +NORMALIZE_WHITESPACE
    auc: 0.250000 len: 3.000000 support: 2
    >>> y_true = [[0.9, 0.7, 0.1], [0, 0.5, 1]]
    >>> y_pred = [[0.75, 0.5, 1], [1, 0.2, 0.1]]
    >>> ranking_report(y_true, y_pred, continuous=True)  # doctest: +NORMALIZE_WHITESPACE
          ndcg@k  len@k  support@k
    3   0.675647    3.0          2
    5   0.675647    3.0          2
    10  0.675647    3.0          2
    mrr: 0.750000 len: 3.000000 support: 2
    >>> y_true = [[1, 0], [0, 0, 1]]
    >>> y_pred = [[0.75, 0.5], [1, 0.2, 0.1]]
    >>> ranking_report(y_true, y_pred)  # doctest: +NORMALIZE_WHITESPACE
        ndcg@k  precision@k  recall@k      f1@k  len@k  support@k
    1     1.00     0.500000       0.5  0.500000    1.0          2
    3     0.75     0.416667       1.0  0.583333    2.5          2
    5     0.75     0.416667       1.0  0.583333    2.5          2
    10    0.75     0.416667       1.0  0.583333    2.5          2
    auc: 0.500000 map: 0.666667 mrr: 0.666667 coverage_error: 2.000000 ranking_loss: 0.500000 len: 2.500000 support: 2
    >>> ranking_report(y_true, y_pred, coerce="abandon")  # doctest: +NORMALIZE_WHITESPACE
       ndcg@k  precision@k  recall@k  f1@k  len@k  support@k
    1     1.0     0.500000       0.5   0.5    1.0          2
    3     0.5     0.333333       1.0   0.5    3.0          1
    auc: 0.500000 map: 0.666667 mrr: 0.666667 coverage_error: 2.000000 ranking_loss: 0.500000 len: 2.500000 support: 2
    >>> ranking_report(y_true, y_pred, coerce="padding")  # doctest: +NORMALIZE_WHITESPACE
        ndcg@k  precision@k  recall@k      f1@k  len@k  support@k
    1     1.00     0.500000       0.5  0.500000    1.0          2
    3     0.75     0.416667       1.0  0.583333    2.5          2
    5     0.75     0.416667       1.0  0.583333    2.5          2
    10    0.75     0.416667       1.0  0.583333    2.5          2
    auc: 0.500000 map: 0.666667 mrr: 0.666667 coverage_error: 2.000000 ranking_loss: 0.500000 len: 2.500000 support: 2
    >>> ranking_report(y_true, y_pred, bottom=True)  # doctest: +NORMALIZE_WHITESPACE
        ndcg@k  precision@k  recall@k      f1@k  len@k  support@k  ndcg@k(B)  \
    1     1.00     0.500000       0.5  0.500000    1.0          2   1.000000
    3     0.75     0.416667       1.0  0.583333    2.5          2   0.846713
    5     0.75     0.416667       1.0  0.583333    2.5          2   0.846713
    10    0.75     0.416667       1.0  0.583333    2.5          2   0.846713
    <BLANKLINE>
        precision@k(B)  recall@k(B)   f1@k(B)  len@k(B)  support@k(B)
    1         0.500000          0.5  0.500000       1.0             2
    3         0.583333          1.0  0.733333       2.5             2
    5         0.583333          1.0  0.733333       2.5             2
    10        0.583333          1.0  0.733333       2.5             2
    auc: 0.500000 map: 0.666667 mrr: 0.666667 coverage_error: 2.000000 ranking_loss: 0.500000 len: 2.500000 support: 2 map(B): 0.791667 mrr(B): 0.750000
    >>> ranking_report(y_true, y_pred, bottom=True, coerce="abandon")  # doctest: +NORMALIZE_WHITESPACE
       ndcg@k  precision@k  recall@k  f1@k  len@k  support@k  ndcg@k(B)  \
    1     1.0     0.500000       0.5   0.5    1.0          2   1.000000
    3     0.5     0.333333       1.0   0.5    3.0          1   0.693426
    <BLANKLINE>
       precision@k(B)  recall@k(B)  f1@k(B)  len@k(B)  support@k(B)
    1        0.500000          0.5      0.5       1.0             2
    3        0.666667          1.0      0.8       3.0             1
    auc: 0.500000 map: 0.666667 mrr: 0.666667 coverage_error: 2.000000 ranking_loss: 0.500000 len: 2.500000 support: 2 map(B): 0.791667 mrr(B): 0.750000
    >>> ranking_report(y_true, y_pred, bottom=True, coerce="padding")  # doctest: +NORMALIZE_WHITESPACE
        ndcg@k  precision@k  recall@k      f1@k  len@k  support@k  ndcg@k(B)  \
    1     1.00     0.500000       0.5  0.500000    1.0          2   1.000000
    3     0.75     0.416667       1.0  0.583333    2.5          2   0.846713
    5     0.75     0.416667       1.0  0.583333    2.5          2   0.846713
    10    0.75     0.416667       1.0  0.583333    2.5          2   0.846713
    <BLANKLINE>
        precision@k(B)  recall@k(B)   f1@k(B)  len@k(B)  support@k(B)
    1             0.50          0.5  0.500000       1.0             2
    3             0.50          1.0  0.650000       3.0             2
    5             0.30          1.0  0.452381       5.0             2
    10            0.15          1.0  0.257576      10.0             2
    auc: 0.500000 map: 0.666667 mrr: 0.666667 coverage_error: 2.000000 ranking_loss: 0.500000 len: 2.500000 support: 2 map(B): 0.791667 mrr(B): 0.750000
    """
    import numpy as np
    from collections import OrderedDict
    from sklearn.metrics import (label_ranking_average_precision_score,
                                 ndcg_score, label_ranking_loss, coverage_error)

    assert coerce in {"ignore", "abandon", "raise", "padding"}
    if metrics is None:
        metrics = ["mrr", "ndcg"]
        if continuous is False:
            metrics.extend([
                "auc", "map", "coverage_error", "ranking_loss",
                "precision", "recall", "f1"
            ])
    metrics = set(metrics)
    if k is not None:
        k = as_list(k)
    else:
        if continuous is True:
            k = [3, 5, 10]
        else:
            k = [1, 3, 5, 10]
    results = {
        "auc": [],
        "map": [],
        "mrr": [],
        "coverage_error": [],
        "ranking_loss": [],
        "len": [],
        "support": [],
    }
    if bottom:
        results.update({
            "map(B)": [],
            "mrr(B)": [],
        })
    k_results = {}
    for _k in k:
        k_results[_k] = {
            "ndcg@k": [],
            "precision@k": [],
            "recall@k": [],
            "f1@k": [],
            "len@k": [],
            "support@k": [],
        }
        if bottom:
            k_results[_k].update({
                "ndcg@k(B)": [],
                "precision@k(B)": [],
                "recall@k(B)": [],
                "f1@k(B)": [],
                "len@k(B)": [],
                "support@k(B)": [],
            })
    suffix = [""]
    if bottom:
        suffix += ["(B)"]

    for label, pred in tqdm(zip(y_true, y_pred), "ranking metrics", disable=not verbose):
        if continuous is False and "map" in metrics:
            results["map"].append(
                label_ranking_average_precision_score([label], [pred]))
            if bottom:
                results["map(B)"].append(
                    label_ranking_average_precision_score(
                        [(1 - np.asarray(label)).tolist()],
                        [(-np.asarray(pred)).tolist()]))
        if len(label) > 1 and continuous is False:
            if "coverage_error" in metrics:
                results["coverage_error"].append(coverage_error([label], [pred]))
            if "ranking_loss" in metrics:
                results["ranking_loss"].append(label_ranking_loss([label], [pred]))
        results["len"].append(len(label))
        results["support"].append(1)
        label_pred = list(sorted(zip(label, pred), key=lambda x: x[1], reverse=True))
        sorted_label = list(zip(*label_pred))[0]
        if "auc" in metrics:
            results["auc"].append(ranking_auc(sorted_label))
        if "mrr" in metrics:
            try:
                results["mrr"].append(
                    1 / (np.asarray(sorted_label).nonzero()[0][0] + 1))
            except IndexError:  # pragma: no cover
                pass
            try:
                if bottom:
                    results["mrr(B)"].append(
                        1 / (np.asarray(sorted_label[::-1]).nonzero()[0][0] + 1))
            except IndexError:  # pragma: no cover
                pass
        if metrics & {"ndcg", "precision", "recall", "f1"}:
            for _k in k:
                for _suffix in suffix:
                    if _suffix == "":
                        _label_pred = deepcopy(label_pred)
                        if len(_label_pred) < _k:
                            if coerce == "ignore":
                                pass
                            elif coerce == "abandon":
                                continue
                            elif coerce == "raise":
                                raise ValueError(
                                    "Not enough value: %s vs target %s"
                                    % (len(_label_pred), _k))
                            elif coerce == "padding":  # pragma: no cover
                                _label_pred += [(0, pad_pred)] * (_k - len(_label_pred))
                        k_label_pred = label_pred[:_k]
                        total_label = sum(label)
                    else:
                        # bottom view: invert labels and negate scores, then reverse
                        inv_label_pred = [(1 - _l, -p) for _l, p in label_pred][::-1]
                        if len(inv_label_pred) < _k:
                            if coerce == "ignore":
                                pass
                            elif coerce == "abandon":
                                continue
                            elif coerce == "raise":  # pragma: no cover
                                raise ValueError(
                                    "Not enough value: %s vs target %s"
                                    % (len(inv_label_pred), _k))
                            elif coerce == "padding":
                                inv_label_pred += [(0, pad_pred)] * (_k - len(inv_label_pred))
                        k_label_pred = inv_label_pred[:_k]
                        total_label = len(label) - sum(label)
                    if not k_label_pred:  # pragma: no cover
                        continue
                    k_label, k_pred = list(zip(*k_label_pred))
                    if "ndcg" in metrics:
                        if len(k_label) == 1:
                            k_results[_k]["ndcg@k%s" % _suffix].append(1)
                        else:
                            k_results[_k]["ndcg@k%s" % _suffix].append(
                                ndcg_score([k_label], [k_pred]))
                    p = sum(k_label) / len(k_label)
                    r = sum(k_label) / total_label if total_label else 0
                    if "precision" in metrics:
                        k_results[_k]["precision@k%s" % _suffix].append(p)
                    if "recall" in metrics:
                        k_results[_k]["recall@k%s" % _suffix].append(r)
                    if "f1" in metrics:
                        k_results[_k]["f1@k%s" % _suffix].append(
                            2 * p * r / (p + r) if p + r else 0)
                    k_results[_k]["len@k%s" % _suffix].append(len(k_label))
                    k_results[_k]["support@k%s" % _suffix].append(1)

    ret = POrderedDict()
    for key, value in results.items():
        if value:
            if key == "support":
                ret[key] = np.sum(value).item()
            else:
                ret[key] = np.mean(value).item()
    if metrics & {"ndcg", "precision", "recall", "f1"}:
        for k, key_value in k_results.items():
            ret[k] = OrderedDict()
            for key, value in key_value.items():
                if value:
                    if key in {"support@k", "support@k(B)"}:
                        ret[k][key] = np.sum(value).item()
                    else:
                        ret[k][key] = np.mean(value).item()
    return ret
def loss_dict2tmt_loss(loss_dict, loss2value=lambda x: x, exclude=None, include=None, as_loss=as_tmt_loss):
    """
    Parameters
    ----------
    loss_dict: dict of str -> callable
    loss2value: callable
        transform a loss object into a scalar value
    exclude: str or Iterable
        the losses NOT to be wrapped
    include: str or Iterable
        when given, only these losses are wrapped
    as_loss: callable

    Returns
    -------

    Examples
    --------
    >>> def mse(v):
    ...     return v ** 2
    >>> losses = loss_dict2tmt_loss({"mse": mse, "rmse": lambda x: x})
    >>> losses.keys()
    dict_keys(['mse', 'rmse'])
    >>> ema = EMAValue(losses)
    >>> losses["mse"](2)
    4
    >>> losses["rmse"](2)
    2
    >>> ema.items()
    dict_items([('mse', 4), ('rmse', 2)])
    >>> losses = loss_dict2tmt_loss({"mse": mse, "rmse": lambda x: x}, include="mse")
    >>> losses.keys()
    dict_keys(['mse', 'rmse'])
    >>> ema = EMAValue(losses, auto="ignore")
    >>> losses["mse"](2)
    4
    >>> losses["rmse"](2)
    2
    >>> ema.items()
    dict_items([('mse', 4), ('rmse', nan)])
    >>> losses = loss_dict2tmt_loss({"mse": mse, "rmse": lambda x: x}, exclude="mse")
    >>> losses.keys()
    dict_keys(['mse', 'rmse'])
    >>> ema = EMAValue(losses, auto="ignore")
    >>> losses["mse"](2)
    4
    >>> losses["rmse"](2)
    2
    >>> ema.items()
    dict_items([('mse', nan), ('rmse', 2)])
    """
    exclude = set() if exclude is None else set(as_list(exclude))
    if include is not None:
        include = set(as_list(include))
        return {
            name: as_loss(func, loss2value) if name in include else func
            for name, func in loss_dict.items()
        }
    return {
        name: as_loss(func, loss2value) if name not in exclude else func
        for name, func in loss_dict.items()
    }
def regression_report(
        y_true, y_pred, metrics=None, sample_weight=None,
        multioutput="uniform_average", average_options=None,
        key_prefix="", key_suffix="", verbose=True):
    """
    Parameters
    ----------
    y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Ground truth (correct) target values.
    y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Estimated target values.
    metrics: list of str
        Supported: evar(explained_variance), mse, rmse, mae, r2
    sample_weight : array-like of shape (n_samples,), optional
        Sample weights.
    multioutput : string in ['raw_values', 'uniform_average', 'variance_weighted'],
            list or array-like of shape (n_outputs)
        Defines how multiple output values are aggregated.
        Disabled when verbose is True.
        Array-like value defines weights used to average errors.

        'raw_values' :
            Returns a full set of errors in case of multioutput input.

        'uniform_average' :
            Errors of all outputs are averaged with uniform weight.
            Alias: "macro"

        'variance_weighted':
            Only supported by evar and r2.
            Scores of all outputs are averaged, weighted by the variances
            of each individual output.
            Alias: "vw"
    average_options: str or list
        defaults to ["macro", "vw"]; choices (one or many): "macro", "vw",
        or an array-like of per-output weights
    key_prefix: str
    key_suffix: str
    verbose: bool

    Returns
    -------
    evar: explained variance
    mse: mean squared error
    rmse: root mean squared error
    mae: mean absolute error
    r2: r2 score

    Examples
    --------
    >>> y_true = [[0.5, 1, 1], [-1, 1, 1], [7, -6, 1]]
    >>> y_pred = [[0, 2, 1], [-1, 2, 1], [8, -5, 1]]
    >>> regression_report(y_true, y_pred)  # doctest: +NORMALIZE_WHITESPACE
                           evar       mse      rmse  mae        r2
    0                  0.967742  0.416667  0.645497  0.5  0.965438
    1                  1.000000  1.000000  1.000000  1.0  0.908163
    2                  1.000000  0.000000  0.000000  0.0  1.000000
    uniform_average    0.989247  0.472222  0.548499  0.5  0.957867
    variance_weighted  0.983051  0.472222  0.548499  0.5  0.938257
    >>> regression_report(y_true, y_pred, verbose=False)  # doctest: +NORMALIZE_WHITESPACE
    evar: 0.989247 mse: 0.472222 rmse: 0.548499 mae: 0.500000 r2: 0.957867
    >>> regression_report(
    ...     y_true, y_pred, multioutput="variance_weighted", verbose=False
    ... )  # doctest: +NORMALIZE_WHITESPACE
    evar: 0.983051 mse: 0.472222 rmse: 0.548499 mae: 0.500000 r2: 0.938257
    >>> regression_report(y_true, y_pred, multioutput=[0.3, 0.6, 0.1], verbose=False)  # doctest: +NORMALIZE_WHITESPACE
    evar: 0.990323 mse: 0.725000 rmse: 0.793649 mae: 0.750000 r2: 0.934529
    >>> regression_report(y_true, y_pred, verbose=True)  # doctest: +NORMALIZE_WHITESPACE
                           evar       mse      rmse  mae        r2
    0                  0.967742  0.416667  0.645497  0.5  0.965438
    1                  1.000000  1.000000  1.000000  1.0  0.908163
    2                  1.000000  0.000000  0.000000  0.0  1.000000
    uniform_average    0.989247  0.472222  0.548499  0.5  0.957867
    variance_weighted  0.983051  0.472222  0.548499  0.5  0.938257
    >>> regression_report(
    ...     y_true, y_pred, verbose=True, average_options=["macro", "vw", [0.3, 0.6, 0.1]]
    ... )  # doctest: +NORMALIZE_WHITESPACE
                           evar       mse      rmse   mae        r2
    0                  0.967742  0.416667  0.645497  0.50  0.965438
    1                  1.000000  1.000000  1.000000  1.00  0.908163
    2                  1.000000  0.000000  0.000000  0.00  1.000000
    uniform_average    0.989247  0.472222  0.548499  0.50  0.957867
    variance_weighted  0.983051  0.472222  0.548499  0.50  0.938257
    weighted           0.990323  0.725000  0.793649  0.75  0.934529
    """
    legal_metrics = ["evar", "rmse", "mse", "mae", "r2"]
    if not metrics:
        metrics = legal_metrics
    _metrics = set(metrics)
    assert not _metrics - set(legal_metrics)
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    average_options = as_list(average_options) if average_options else ["macro", "vw"]
    alias_dict = {
        "macro": "uniform_average",
        "vw": "variance_weighted",
    }
    ret = POrderedDict()
    if len(y_true.shape) > 1 and verbose:
        # verbose multioutput mode: report per-output metrics plus the averages
        _ret = regression_report(
            y_true, y_pred,
            sample_weight=sample_weight,
            metrics=_metrics,
            multioutput="raw_values",
            key_prefix=key_prefix,
            key_suffix=key_suffix,
            verbose=False,
        )
        for i in range(y_true.shape[1]):
            ret[i] = {}
            for _metric in _ret.keys():
                ret[i][_metric] = _ret[_metric][i]
        for _multioutput in average_options:
            __multioutput = _multioutput if isinstance(_multioutput, list) else alias_dict.get(_multioutput, _multioutput)
            _ret = regression_report(y_true, y_pred,
                                     metrics=_metrics,
                                     sample_weight=sample_weight,
                                     multioutput=__multioutput,
                                     key_prefix=key_prefix,
                                     key_suffix=key_suffix,
                                     verbose=False)
            _name = "weighted" if isinstance(_multioutput, list) else __multioutput
            ret[_name] = {}
            for _metric in _ret:
                ret[_name][_metric] = _ret[_metric]
    else:
        if "evar" in _metrics:
            ret[key_prefix + "evar" + key_suffix] = explained_variance_score(
                y_true, y_pred, sample_weight=sample_weight, multioutput=multioutput)
        if "mse" in _metrics:
            _multioutput = "uniform_average" if multioutput == "variance_weighted" else multioutput
            ret[key_prefix + "mse" + key_suffix] = mean_squared_error(
                y_true, y_pred, sample_weight=sample_weight, multioutput=_multioutput)
        if "rmse" in _metrics:
            _multioutput = "uniform_average" if multioutput == "variance_weighted" else multioutput
            ret[key_prefix + "rmse" + key_suffix] = mean_squared_error(
                y_true, y_pred, sample_weight=sample_weight, squared=False, multioutput=_multioutput)
        if "mae" in _metrics:
            _multioutput = "uniform_average" if multioutput == "variance_weighted" else multioutput
            ret[key_prefix + "mae" + key_suffix] = mean_absolute_error(
                y_true, y_pred, sample_weight=sample_weight, multioutput=_multioutput)
        if "r2" in _metrics:
            ret[key_prefix + "r2" + key_suffix] = r2_score(
                y_true, y_pred, sample_weight=sample_weight, multioutput=multioutput)
    return ret
def category2codes(
        df: pd.DataFrame,
        offset: (int, list, dict) = 1,
        columns: (str, Iterable) = None,
        pattern_mode=True,
        verbose=False,
        inplace=True,
        **kwargs):
    """
    Numerically encode the categorical columns.

    Parameters
    ----------
    df: pd.DataFrame
    offset: int, list or dict
        usually 0 or 1; defaults to 1 so that code 0 stays available for
        exceptions or nan values
    columns: str or Iterable
        the categorical columns to be transferred to codes
    pattern_mode: bool
        When pattern_mode is set as True, matching columns will be inferred
        using regex patterns, which is time-consuming.
    verbose: bool
    inplace: bool

    Returns
    -------

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({"a1": [1, 2, 300], "a2": [0.1, 0.2, 0.3], "b": ["a", "c", "d"]})
    >>> df.dtypes
    a1      int64
    a2    float64
    b      object
    dtype: object
    >>> df = columns_to_category(df, ["a1", "a2", "b"])
    >>> df.dtypes
    a1    category
    a2    category
    b     category
    dtype: object
    >>> category2codes(df, offset=[0, 1, 2], inplace=False)
       a1  a2  b
    0   0   1  2
    1   1   2  3
    2   2   3  4
    >>> category2codes(df, offset=0, columns=["$regex:^a.*$"], pattern_mode=True, inplace=False)
       a1  a2  b
    0   0   0  a
    1   1   1  c
    2   2   2  d
    >>> category2codes(df, offset={"a1": 0, "a2": 1, "b": 0}, inplace=False)
       a1  a2  b
    0   0   1  0
    1   1   2  1
    2   2   3  2
    >>> from collections import defaultdict
    >>> d_offset = defaultdict(int)
    >>> d_offset.update({"a1": 2, "a2": 1})
    >>> category2codes(df, offset=d_offset, inplace=False)
       a1  a2  b
    0   2   1  0
    1   3   2  1
    2   4   3  2
    """
    df = df if inplace else df.copy()
    __log = _get_log_f(verbose=verbose, **kwargs)
    columns = as_list(columns) if columns else df.select_dtypes(include=["category"]).columns.values.tolist()
    if isinstance(offset, (int, list)):
        columns = _filter_columns(columns, df.columns) if pattern_mode is True else columns
        if isinstance(offset, list):
            assert len(columns) == len(offset), "columns[%s] vs offset[%s]" % (
                len(columns), len(offset)
            )
            __log("categorical columns to be coded: %s" % list(zip(columns, offset)))
        else:
            __log("categorical columns to be coded: %s" % ", ".join(columns))
            __log("offset is %s" % offset)
            offset = [offset] * len(columns)
    elif isinstance(offset, dict):
        # assert pattern_mode is True, "set pattern mode as True when offset is dict"
        columns = _filter_columns(columns, df.columns)
        offset = [offset[column] for column in columns]
        __log("categorical columns to be coded: %s" % list(zip(columns, offset)))
    else:
        raise TypeError("Cannot handle %s type offset" % type(offset))
    for column, _offset in tqdm(zip(columns, offset), "encoding categorical columns", disable=not verbose):
        df[column] = df[column].cat.codes + _offset
    return df
def columns_to_category(
        df: pd.DataFrame,
        columns: list,
        to_codes: bool = False,
        columns_to_codes: list = None,
        pattern_mode=True,
        code_pattern_mode=None,
        verbose=False,
        inplace=True,
        *args, **kwargs):
    """
    Transfer the specified columns into the category dtype.

    Parameters
    ----------
    df: pd.DataFrame
    columns: list
        the columns to be transferred
    to_codes: bool
        whether to further encode the categorical columns as codes
    columns_to_codes: list
        the columns to be encoded as codes; defaults to ``columns``
    pattern_mode: bool
        When pattern_mode is set as True, matching columns will be inferred
        using regex patterns, which is time-consuming.
    code_pattern_mode: bool or None
        pattern_mode for the coding step; defaults to ``pattern_mode``
    verbose: bool
    inplace: bool
    args
    kwargs

    Returns
    -------

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({"a": [1, 2, 300, 4, 5], "b": [0.1, 0.2, 0.3, 0.4, 0.5], "a2": ["a", "b", "c", "d", "e"]})
    >>> df.dtypes
    a       int64
    b     float64
    a2     object
    dtype: object
    >>> df = columns_to_category(df, ["a2"])
    >>> df.dtypes
    a        int64
    b      float64
    a2    category
    dtype: object
    >>> columns_to_category(df, ["a2"], to_codes=True, columns_to_codes=["a2"])
         a    b  a2
    0    1  0.1   1
    1    2  0.2   2
    2  300  0.3   3
    3    4  0.4   4
    4    5  0.5   5
    >>> columns_to_category(df, ["a", "b"], to_codes=True, offset=0, inplace=False)
       a  b  a2
    0  0  0   1
    1  1  1   2
    2  4  2   3
    3  2  3   4
    4  3  4   5
    >>> columns_to_category(df, ["a"], to_codes=True, offset=2, inplace=False)
       a    b  a2
    0  2  0.1   1
    1  3  0.2   2
    2  6  0.3   3
    3  4  0.4   4
    4  5  0.5   5
    """
    df = df if inplace else df.copy()
    __log = _get_log_f(verbose=verbose, **kwargs)
    columns = as_list(columns)
    columns = _filter_columns(columns, df.columns) if pattern_mode else columns
    __log("columns to be categorical: %s" % columns)
    for column in tqdm(columns, "columns to be categorical", disable=not verbose):
        if df[column].dtype.name == "category":
            __log("column[%s] is already categorical, ignored" % column)
            continue
        df[column] = df[column].astype("category")
    if to_codes:
        columns_to_codes = columns if not columns_to_codes else columns_to_codes
        code_pattern_mode = pattern_mode if code_pattern_mode is None else code_pattern_mode
        columns = _filter_columns(columns_to_codes, columns) if code_pattern_mode else as_list(columns_to_codes)
        category2codes(df, columns=columns, verbose=verbose, pattern_mode=False, **kwargs)
    return df
def columns_to_datetime(
        df: pd.DataFrame,
        columns: (str, list),
        datetime_format: (str, list) = None,
        pattern_mode=False,
        verbose=False,
        *args, **kwargs):
    """
    Parameters
    ----------
    df: pd.DataFrame
    columns: str or list
        The columns to be interpreted as datetime
    datetime_format: str, list or dict
        The format(s) passed to pd.to_datetime, one per column when a list
        or dict is given
    pattern_mode: bool
        When pattern_mode is set as True, matching columns will be inferred
        using regex patterns, which is time-consuming.
    verbose: bool

    Returns
    -------

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({"id": [1, 2, 3], "t": ["1931-09-18", "1949-10-01", "2020-10-12"]})
    >>> df = columns_to_datetime(df, "t")
    >>> df.dtypes
    id             int64
    t     datetime64[ns]
    dtype: object
    >>> df = pd.DataFrame({"t1": ["1931-09-18", "1949-10-01", "unknown"], "t2": ["20201012", "20201013", "unknown"]})
    >>> df = columns_to_datetime(df, ["t1", "t2"])
    >>> df.dtypes
    t1    datetime64[ns]
    t2    datetime64[ns]
    dtype: object
    >>> df
              t1         t2
    0 1931-09-18 2020-10-12
    1 1949-10-01 2020-10-13
    2        NaT        NaT
    >>> df = pd.DataFrame({
    ...     "t1": ["1931-09-18:023358", "1949-10-01:040909"],
    ...     "t2": ["20201012-011203", "20201013-070301"]
    ... })
    >>> df = columns_to_datetime(
    ...     df, ["t1", "t2"],
    ...     datetime_format=["%Y-%m-%d:%H%M%S", "%Y%m%d-%H%M%S"]
    ... )
    >>> df
                       t1                  t2
    0 1931-09-18 02:33:58 2020-10-12 01:12:03
    1 1949-10-01 04:09:09 2020-10-13 07:03:01
    >>> df = pd.DataFrame({
    ...     "t1": ["1931-09-18:023358", "1949-10-01:040909"],
    ...     "t2": ["20201012-011203", "20201013-070301"]
    ... })
    >>> df = columns_to_datetime(
    ...     df, ["t1", "t2"],
    ...     datetime_format={"t1": "%Y-%m-%d:%H%M%S", "t2": "%Y%m%d-%H%M%S"}
    ... )
    >>> df
                       t1                  t2
    0 1931-09-18 02:33:58 2020-10-12 01:12:03
    1 1949-10-01 04:09:09 2020-10-13 07:03:01
    """
    __log = _get_log_f(verbose=verbose, **kwargs)
    columns = as_list(columns)
    if datetime_format is None or isinstance(datetime_format, (str, list)):
        columns = _filter_columns(columns, df.columns) if pattern_mode is True else columns
        if isinstance(datetime_format, list):
            assert len(columns) == len(datetime_format), "columns[%s] vs datetime_format[%s]" % (
                len(columns), len(datetime_format)
            )
            __log("columns to be datetime: %s" % list(zip(columns, datetime_format)))
        else:
            __log("columns to be datetime: %s" % ", ".join(columns))
            __log("datetime format is %s" % datetime_format)
            datetime_format = [datetime_format] * len(columns)
    elif isinstance(datetime_format, dict):
        # assert pattern_mode is True, "set pattern mode as True when datetime_format is dict"
        columns = _filter_columns(columns, df.columns)
        datetime_format = [datetime_format[column] for column in columns]
        __log("columns to be datetime: %s" % list(zip(columns, datetime_format)))
    else:
        raise TypeError("Cannot handle %s type datetime_format" % type(datetime_format))
    # use a fresh loop variable so the datetime_format list is not shadowed
    for column, _format in tqdm(zip(columns, datetime_format), "columns to datetime", disable=not verbose):
        df[column] = pd.to_datetime(df[column], format=_format, errors="coerce", *args, **kwargs)
    return df
def begin_states(shapes, prefix, func=mx.nd.zeros):
    states = []
    for i, shape in enumerate(as_list(shapes)):
        state = func(name='%sbegin_state_%d' % (prefix, i), shape=shape)
        states.append(state)
    return states
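def _example_begin_states():
    """
    Illustrative sketch (helper name hypothetical, not part of the original
    source). The state factory is pluggable via `func`; numpy stands in for
    the default mx.nd.zeros here so the shape-iteration contract is visible
    without an mxnet context.

    >>> import numpy as np
    >>> states = begin_states([(2, 3), (2, 4)], "rnn_",
    ...                       func=lambda name, shape: np.zeros(shape))
    >>> [s.shape for s in states]
    [(2, 3), (2, 4)]
    """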
def as_array(array):
    if isinstance(array, nd.NDArray):
        return array
    else:
        return nd.array(as_list(array))
def learn(self, learning_item: (int, str, list)):
    for concept in as_list(learning_item):
        self.state[concept] = max(self.state[concept] + self.capacity[concept], 1)
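def _example_learn():
    """
    Illustrative usage sketch (helper name hypothetical, not part of the
    original source). `learn` is a method of a learner model whose class is
    not shown here, so the harness fakes it with SimpleNamespace: each
    learned concept's state grows by its capacity, with max(..., 1) keeping
    the result at least 1.

    >>> from types import SimpleNamespace
    >>> learner = SimpleNamespace(state={"c1": 0.2, "c2": 3}, capacity={"c1": 0.3, "c2": 0.5})
    >>> learn(learner, ["c1", "c2"])
    >>> learner.state
    {'c1': 1, 'c2': 3.5}
    """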
def _get_columns_by_dtype(df: pd.DataFrame, dtype: (str, list)):
    dtype = as_list(dtype)
    return df.select_dtypes(include=dtype).columns
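def _example_get_columns_by_dtype():
    """
    Illustrative sketch (helper name hypothetical, not part of the original
    source): select columns by a single dtype or by a list of dtypes.

    >>> import pandas as pd
    >>> df = pd.DataFrame({"a": [1], "b": ["x"], "c": [0.5]})
    >>> list(_get_columns_by_dtype(df, "object"))
    ['b']
    >>> list(_get_columns_by_dtype(df, ["int64", "float64"]))
    ['a', 'c']
    """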