def update(self, x, y_pred, centers, sample_weight=1.0):
    self._minimum_separation = self._find_minimum_separation(centers)

    self._center_centers = {i: stats.Mean() for i in x}
    for i in self._center_centers:
        for j in centers:
            self._center_centers[i].update(centers[j][i], w=sample_weight)
    center_centers = {i: self._center_centers[i].get() for i in self._center_centers}

    beta_t = stats.Mean()
    for i in centers:
        beta_t.update(utils.math.minkowski_distance(centers[i], center_centers, 2))
    self._beta_t = beta_t.get()

    try:
        self._n_points_by_cluster[y_pred] += 1
    except KeyError:
        self._n_points_by_cluster[y_pred] = 1

    self._n_clusters = len(centers)

    return self
def update(self, x, y_pred, centers, sample_weight=1.0):
    if not self._initialized:
        self._center_all_points = {i: stats.Mean() for i in x}
        self._initialized = True

    for i in self._center_all_points:
        self._center_all_points[i].update(x[i], w=sample_weight)
    center_all_points = {i: self._center_all_points[i].get() for i in self._center_all_points}

    self._n_points += 1
    try:
        self._n_points_by_clusters[y_pred] += 1
    except KeyError:
        self._n_points_by_clusters[y_pred] = 1

    for i in centers:
        self._squared_distances[i] = utils.math.minkowski_distance(
            centers[i], center_all_points, 2
        )

    return self
def update(self, x, y_pred, centers, sample_weight=1.0):
    self._furthest_cluster_distance = self._find_furthest_cluster_distance(centers)

    if not self._initialized:
        self._center_all_points = {i: stats.Mean() for i in x}
        self._dim = len(x)
        self._initialized = True

    for i in self._center_all_points:
        self._center_all_points[i].update(x[i], w=sample_weight)
    center_all_points = {i: self._center_all_points[i].get() for i in self._center_all_points}

    distance_point_cluster_center = math.sqrt(
        utils.math.minkowski_distance(centers[y_pred], x, 2)
    )
    distance_point_center = math.sqrt(utils.math.minkowski_distance(center_all_points, x, 2))

    self._ssq_points_cluster_centers += distance_point_cluster_center
    self._ssq_points_center += distance_point_center
    self._n_clusters = len(centers)

    # To trace back
    self.sample_correction = {
        "distance_point_cluster_center": distance_point_cluster_center,
        "distance_point_center": distance_point_center,
    }

    return self
def __init__(
    self,
    optimizer: optim.Optimizer = None,
    loss: optim.losses.Loss = None,
    l2=0.0,
    initializer: optim.initializers.Initializer = None,
    clip_gradient=1e12,
    seed=None,
):
    super().__init__(seed=seed)
    self.optimizer = optim.SGD() if optimizer is None else copy.deepcopy(optimizer)
    self.u_optimizer = optim.SGD() if optimizer is None else copy.deepcopy(optimizer)
    self.i_optimizer = optim.SGD() if optimizer is None else copy.deepcopy(optimizer)
    self.loss = optim.losses.Squared() if loss is None else loss
    self.l2 = l2

    if initializer is None:
        initializer = optim.initializers.Zeros()
    self.initializer = initializer

    self.clip_gradient = clip_gradient
    self.global_mean = stats.Mean()
    self.u_biases: typing.DefaultDict[
        int, optim.initializers.Initializer
    ] = collections.defaultdict(initializer)
    self.i_biases: typing.DefaultDict[
        int, optim.initializers.Initializer
    ] = collections.defaultdict(initializer)
def __init__(
    self,
    n_factors=10,
    bias_optimizer: optim.Optimizer = None,
    latent_optimizer: optim.Optimizer = None,
    loss: optim.losses.Loss = None,
    l2_bias=0.0,
    l2_latent=0.0,
    weight_initializer: optim.initializers.Initializer = None,
    latent_initializer: optim.initializers.Initializer = None,
    clip_gradient=1e12,
    seed: int = None,
):
    self.n_factors = n_factors
    self.u_bias_optimizer = (
        optim.SGD() if bias_optimizer is None else copy.deepcopy(bias_optimizer)
    )
    self.i_bias_optimizer = (
        optim.SGD() if bias_optimizer is None else copy.deepcopy(bias_optimizer)
    )
    self.u_latent_optimizer = (
        optim.SGD() if latent_optimizer is None else copy.deepcopy(latent_optimizer)
    )
    self.i_latent_optimizer = (
        optim.SGD() if latent_optimizer is None else copy.deepcopy(latent_optimizer)
    )
    self.loss = optim.losses.Squared() if loss is None else loss
    self.l2_bias = l2_bias
    self.l2_latent = l2_latent

    if weight_initializer is None:
        weight_initializer = optim.initializers.Zeros()
    self.weight_initializer = weight_initializer

    if latent_initializer is None:
        latent_initializer = optim.initializers.Normal(sigma=0.1, seed=seed)
    self.latent_initializer = latent_initializer

    self.clip_gradient = clip_gradient
    self.seed = seed

    self.global_mean = stats.Mean()
    self.u_biases: typing.DefaultDict[
        int, optim.initializers.Initializer
    ] = collections.defaultdict(weight_initializer)
    self.i_biases: typing.DefaultDict[
        int, optim.initializers.Initializer
    ] = collections.defaultdict(weight_initializer)

    random_latents = functools.partial(self.latent_initializer, shape=self.n_factors)
    self.u_latents: typing.DefaultDict[
        int, optim.initializers.Initializer
    ] = collections.defaultdict(random_latents)
    self.i_latents: typing.DefaultDict[
        int, optim.initializers.Initializer
    ] = collections.defaultdict(random_latents)
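# Hedged sketch (not taken from the original class): the attributes initialised above
# are the ingredients of the standard biased matrix-factorisation prediction rule,
#     y_hat = global_mean + b_u + b_i + <p_u, q_i>,
# where b_u / b_i are the per-user and per-item biases and p_u / q_i the latent vectors.
# The helper name below is hypothetical and only illustrates that rule.
def _biased_mf_prediction(global_mean, u_bias, i_bias, u_latent, i_latent):
    # Bias terms plus the dot product of the user and item latent factors.
    return global_mean + u_bias + i_bias + sum(p * q for p, q in zip(u_latent, i_latent))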
def update(self, x, y_pred, centers, sample_weight=1.0):
    self._minimum_separation = self._find_minimum_separation(centers)

    distance = math.sqrt(utils.math.minkowski_distance(centers[y_pred], x, 2))

    if y_pred in self._avg_cp_by_clusters:
        self._avg_cp_by_clusters[y_pred].update(distance, w=sample_weight)
    else:
        self._avg_cp_by_clusters[y_pred] = stats.Mean()
        self._avg_cp_by_clusters[y_pred].update(distance, w=sample_weight)

    return self
def __init__(
    self,
    x: float,
    y: typing.Union[float, utils.VectorDict],
    weight: float = 1.0,
):
    self.x_stats = stats.Mean()
    self.x_stats.update(x, weight)

    self.y_stats: typing.Union[stats.Var, utils.VectorDict]
    self._update_estimator: typing.Callable[
        [typing.Union[float, utils.VectorDict], float], None
    ]
    self.is_single_target = True

    self._init_estimator(y)
    self._update_estimator(y, weight)
def load_stats():
    for _, obj in inspect.getmembers(importlib.import_module("river.stats"), inspect.isclass):
        try:
            if issubclass(obj, stats.Link):
                yield obj(stats.Shift(1), stats.Mean())
                continue
            sig = inspect.signature(obj)
            yield obj(
                **{
                    param.name: param.default if param.default != param.empty else 1
                    for param in sig.parameters.values()
                }
            )
        except ValueError:
            yield obj()
def eval_relations(self, model, dataset):
    metrics = collections.OrderedDict(
        {f"{metric}": stats.Mean() for metric in ["MRR", "MR", "HITS@1", "HITS@3", "HITS@10"]}
    )

    with torch.no_grad():
        metrics = self.compute_score(
            model=model,
            test_set=self.get_relation_stream(dataset),
            metrics=metrics,
            device=self.device,
        )

    return {f"{name}_relations": round(metric.get(), 4) for name, metric in metrics.items()}
def eval(self, model, dataset):
    """Evaluate selected model with the metrics: MRR, MR, HITS@1, HITS@3, HITS@10."""
    metrics = collections.OrderedDict(
        {metric: stats.Mean() for metric in ["MRR", "MR", "HITS@1", "HITS@3", "HITS@10"]}
    )

    with torch.no_grad():
        for test_set in self.get_entity_stream(dataset):
            metrics = self.compute_score(
                model=model, test_set=test_set, metrics=metrics, device=self.device
            )

    return {name: round(metric.get(), 4) for name, metric in metrics.items()}
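# Hedged illustration (not part of the original evaluator): each metric above is a
# running stats.Mean over per-triple scores. Given the rank of the true entity,
# MRR averages 1/rank, MR averages the rank itself, and HITS@k averages an
# indicator of rank <= k. The ranks below are made up for the example.
from river import stats

running = {name: stats.Mean() for name in ["MRR", "MR", "HITS@1", "HITS@3", "HITS@10"]}
for rank in [1, 4, 2, 10, 7]:  # hypothetical ranks of the true entity
    running["MRR"].update(1.0 / rank)
    running["MR"].update(rank)
    for k in (1, 3, 10):
        running[f"HITS@{k}"].update(1.0 if rank <= k else 0.0)

print({name: round(m.get(), 4) for name, m in running.items()})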
def update(self, x, y_pred, centers, sample_weight=1.0):
    if not self._initialized:
        self._center_all_points = {i: stats.Mean() for i in x}
        self._initialized = True

    for i in self._center_all_points:
        self._center_all_points[i].update(x[i], w=sample_weight)
    center_all_points = {i: self._center_all_points[i].get() for i in self._center_all_points}

    squared_distance_center = utils.math.minkowski_distance(x, center_all_points, 2)
    squared_distance_cluster_center = utils.math.minkowski_distance(x, centers[y_pred], 2)

    self._ssq_point_center += squared_distance_center
    self._ssq_point_cluster_centers += squared_distance_cluster_center

    return self
def _unit_test_params(cls):
    return {"statistic": stats.Mean()}
def __init__(self, seed=None):
    super().__init__(seed=seed)
    self.variance = stats.Var()
    self.mean = stats.Mean()
    self.seed = seed
def __init__(self, regressor: base.Regressor, window_size: int = None):
    self.regressor = regressor
    self.window_size = window_size
    self.mean = (
        stats.Mean() if self.window_size is None else stats.RollingMean(self.window_size)
    )
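# Hedged sketch of why the constructor above switches between stats.Mean and
# stats.RollingMean: the rolling variant only remembers the last window_size
# targets, so it adapts faster when the target level shifts. Values are illustrative.
from river import stats

full, rolling = stats.Mean(), stats.RollingMean(3)
for y in [10, 10, 10, 0, 0, 0]:
    full.update(y)
    rolling.update(y)

print(full.get(), rolling.get())  # 5.0 vs 0.0 after the level shift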
def __init__(self):
    self.x_m = stats.Mean()
    self.g_var = stats.Var()
    self.h_var = stats.Var()
    self.gh_cov = stats.Cov()
import datetime as dt
import itertools

from river import (compose, datasets, evaluate, feature_extraction,
                   linear_model, metrics, preprocessing, stats)

X_y = iter(datasets.Bikes())

# Peek at one sample
# for x, y in X_y:
#     pprint(x)
#     print(f'Number of available bikes: {y}')
#     break


def get_hour(x):
    # Helper assumed from the surrounding tutorial: expose the hour of the
    # 'moment' timestamp so it can be used as an aggregation key.
    x['hour'] = x['moment'].hour
    return x


# Construct a model pipeline
model = compose.Select('clouds', 'humidity', 'pressure', 'temperature', 'wind')

# Extract features (average number of bikes by station and hour)
model += (
    get_hour |
    feature_extraction.TargetAgg(by=['station', 'hour'], how=stats.Mean()) +
    feature_extraction.TargetAgg(by='station', how=stats.EWMean(0.5))  # aggregate features (station and time)
)

model |= preprocessing.StandardScaler()
model |= linear_model.LinearRegression()

evaluate.progressive_val_score(dataset=datasets.Bikes(),
                               model=model,
                               metric=metrics.MAE(),
                               moment='moment',
                               delay=dt.timedelta(minutes=30),
                               print_every=20_000)

for x, y in itertools.islice(X_y, 10000):
    # Loop body not included in the original snippet; a typical continuation
    # would call model.predict_one(x) and model.learn_one(x, y).
    pass
import copy
import functools
import math
import random

import numpy as np
import pytest

from river import stats


@pytest.mark.parametrize(
    "stat",
    [
        pytest.param(stat, id=stat.__class__.__name__)
        for stat in [stats.Mean(), stats.Var(ddof=0), stats.Var(ddof=1)]
    ],
)
def test_add(stat):
    A = copy.deepcopy(stat)
    B = copy.deepcopy(stat)
    C = copy.deepcopy(stat)

    X = [random.random() for _ in range(30)]
    Y = [random.random() for _ in range(30)]
    W = [random.random() for _ in range(30)]

    for x, y, w in zip(X, Y, W):
        A.update(x, w)
        B.update(y, w)
assert isinstance(pickle.loads(pickle.dumps(stat)), stat.__class__)
assert isinstance(copy.deepcopy(stat), stat.__class__)

# Check the statistic has a working __str__ and name method
assert isinstance(str(stat), str)

if isinstance(stat, stats.Univariate):
    assert isinstance(stat.name, str)


@pytest.mark.parametrize(
    'stat, func',
    [
        (stats.Kurtosis(bias=True), sp_stats.kurtosis),
        (stats.Kurtosis(bias=False), functools.partial(sp_stats.kurtosis, bias=False)),
        (stats.Mean(), statistics.mean),
        (stats.Skew(bias=True), sp_stats.skew),
        (stats.Skew(bias=False), functools.partial(sp_stats.skew, bias=False)),
        (stats.Var(ddof=0), np.var),
        (stats.Var(), functools.partial(np.var, ddof=1)),
    ],
)
def test_univariate(stat, func):
    # Shut up
    np.warnings.filterwarnings('ignore')

    X = [random.random() for _ in range(30)]

    for i, x in enumerate(X):
        stat.update(x)
        if i >= 1:
            assert math.isclose(stat.get(), func(X[:i + 1]), abs_tol=1e-10)
if isinstance(stat, stats.Univariate):
    assert isinstance(stat.name, str)


@pytest.mark.parametrize("stat", load_stats(), ids=lambda stat: stat.__class__.__name__)
def test_repr_with_no_updates(stat):
    assert isinstance(repr(stat), str)
    assert isinstance(str(stat), str)


@pytest.mark.parametrize(
    "stat, func",
    [
        (stats.Kurtosis(bias=True), sp_stats.kurtosis),
        (stats.Kurtosis(bias=False), functools.partial(sp_stats.kurtosis, bias=False)),
        (stats.Mean(), statistics.mean),
        (stats.Skew(bias=True), sp_stats.skew),
        (stats.Skew(bias=False), functools.partial(sp_stats.skew, bias=False)),
        (stats.Var(ddof=0), np.var),
        (stats.Var(), functools.partial(np.var, ddof=1)),
    ],
)
def test_univariate(stat, func):
    # Shut up
    np.warnings.filterwarnings("ignore")

    X = [random.random() for _ in range(30)]

    for i, x in enumerate(X):
        stat.update(x)
def _unit_test_params(cls):
    yield {"statistic": stats.Mean()}
def __init__(self, seed=None):
    super().__init__()
    self.variance = stats.Var()
    self.mean = stats.Mean()
    self.seed = seed
    self._rng = random.Random(seed)
def __init__(self):
    self.mean = stats.Mean()
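# Minimal usage sketch of the running mean kept above: update() accepts an
# optional weight and get() returns the current estimate. Numbers are illustrative.
from river import stats

mean = stats.Mean()
for x, w in [(1.0, 1.0), (3.0, 2.0), (5.0, 1.0)]:
    mean.update(x, w=w)

print(mean.get())  # (1*1 + 3*2 + 5*1) / (1 + 2 + 1) = 3.0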
def detail_eval(self, model, dataset, threshold=1.5):
    """Divide input dataset relations into different categories (i.e. ONE-TO-ONE,
    ONE-TO-MANY, MANY-TO-ONE and MANY-TO-MANY) according to the mapping properties
    of relationships.

    Reference:
        1. [Bordes, Antoine, et al. "Translating embeddings for modeling multi-relational data."
           Advances in neural information processing systems. 2013.](http://papers.nips.cc/paper/5071-translating-embeddings-for-modeling-multi-relational-data.pdf)

    """
    mapping_type_relations = self.types_relations(
        model=model, dataset=dataset, threshold=threshold
    )

    mapping_type_relations = {
        self.relations[key]: value for key, value in mapping_type_relations.items()
    }

    types_relations = ["1_1", "1_M", "M_1", "M_M"]

    metrics = collections.OrderedDict(
        {"head-batch": collections.OrderedDict({}), "tail-batch": collections.OrderedDict({})}
    )

    for mode in ["head-batch", "tail-batch"]:
        for type_relation in types_relations:
            metrics[mode][type_relation] = collections.OrderedDict(
                {
                    f"{metric}": stats.Mean()
                    for metric in ["MRR", "MR", "HITS@1", "HITS@3", "HITS@10"]
                }
            )

    with torch.no_grad():
        for test_set in self.get_entity_stream(dataset):
            metrics = self.compute_detailled_score(
                model=model,
                test_set=test_set,
                metrics=metrics,
                types_relations=mapping_type_relations,
                device=self.device,
            )

    for mode in ["head-batch", "tail-batch"]:
        for type_relation in types_relations:
            for metric in ["MRR", "MR", "HITS@1", "HITS@3", "HITS@10"]:
                metrics[mode][type_relation][metric] = round(
                    metrics[mode][type_relation][metric].get(), 4
                )

    results = pd.DataFrame(metrics)

    head = pd.DataFrame(results["head-batch"].values.tolist())
    tail = pd.DataFrame(results["tail-batch"].values.tolist())

    head.columns = pd.MultiIndex.from_product([["head"], head.columns])
    tail.columns = pd.MultiIndex.from_product([["tail"], tail.columns])

    results = pd.concat([head, tail], axis="columns")
    results = results.set_index(pd.Series(["1_1", "1_M", "M_1", "M_M"]))
    results.index.name = "relation"

    # Add the frequency of each type of relation:
    frequency = collections.OrderedDict()
    for type_relation in types_relations:
        frequency[type_relation] = 0

    for _, type_relation in mapping_type_relations.items():
        frequency[type_relation] += 1

    for type_relation in types_relations:
        frequency[type_relation] /= len(mapping_type_relations)

    frequency = pd.DataFrame.from_dict(frequency, orient="index", columns=["frequency"])
    frequency.columns = pd.MultiIndex.from_product([["metadata"], frequency.columns])

    results = pd.concat([results, frequency], axis="columns")

    return results