def stats(self):
    """
    Return a L{StatCounter} object that captures the mean, variance
    and count of the RDD's elements in one operation.
    """
    def redFunc(left_counter, right_counter):
        return left_counter.mergeStats(right_counter)

    return self.mapPartitions(lambda i: [StatCounter(i)]).reduce(redFunc)
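A quick usage sketch of the method above; it assumes a running SparkContext named `sc` (the name is illustrative, not part of the snippet):

rdd = sc.parallelize([1.0, 2.0, 3.0, 4.0])
s = rdd.stats()  # one pass over the data
print(s.count(), s.mean(), s.variance())  # 4 2.5 1.25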
# Assumes module-level imports:
#   import numpy as np
#   from scipy import stats as scistats
def merge(self, frame):
    """
    Add another DataFrame to the accumulated stats for each column.

    Parameters
    ----------
    frame: pandas DataFrame we will update our stats counter with.
    """
    for column_name, _ in self._column_stats.items():
        data_arr = frame[[column_name]].values
        count, min_max_tup, mean, _, _, _ = scistats.describe(data_arr)
        stats_counter = StatCounter()
        stats_counter.n = count
        stats_counter.mu = mean
        stats_counter.m2 = np.sum((data_arr - mean) ** 2)
        stats_counter.minValue, stats_counter.maxValue = min_max_tup
        self._column_stats[column_name] = self._column_stats[
            column_name].mergeStats(stats_counter)
    return self
def test_variance_when_size_zero(self):
    # SPARK-38854: Test case to improve test coverage when
    # StatCounter argument is empty list or None
    arguments = [[], None]

    for arg in arguments:
        stats = StatCounter(arg)
        self.assertTrue(math.isnan(stats.variance()))
        self.assertTrue(math.isnan(stats.sampleVariance()))
        self.assertEqual(stats.count(), 0)
        self.assertTrue(math.isinf(stats.max()))
        self.assertTrue(math.isinf(stats.min()))
        self.assertEqual(stats.mean(), 0.0)
# Assumes module-level imports:
#   import numpy as np
#   from scipy import stats as scistats
def merge(self, frame):
    """
    Add another DataFrame to the accumulated stats for each column.

    Parameters
    ----------
    frame: pandas DataFrame we will update our stats counter with.
    """
    for column_name, counter in self._column_stats.items():
        data_arr = frame[[column_name]].values
        count, min_max_tup, mean, unbiased_var, skew, kurt = \
            scistats.describe(data_arr)
        stats_counter = StatCounter()
        stats_counter.n = count
        stats_counter.mu = mean
        # TODO(juliet): look up the paper they base their streaming stats
        # algorithm on, write docs for the StatCounter class in Spark.
        # The line below will likely need to be modified to match that
        # algorithm.
        stats_counter.m2 = np.sum((data_arr - mean) ** 2)
        stats_counter.minValue, stats_counter.maxValue = min_max_tup
        self._column_stats[column_name] = self._column_stats[
            column_name].mergeStats(stats_counter)
    return self
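The TODO above refers to the parallel update rule (attributed to Chan et al.) that Spark's StatCounter uses to combine two running summaries. A minimal pure-Python sketch of that rule, with n/mu/m2 matching the fields set above:

def merge_stats(n1, mu1, m2_1, n2, mu2, m2_2):
    # Combine two (count, mean, sum of squared deviations) summaries
    # without revisiting the data; assumes n1 + n2 > 0.
    n = n1 + n2
    delta = mu2 - mu1
    mu = mu1 + delta * n2 / n                       # combined mean
    m2 = m2_1 + m2_2 + delta * delta * n1 * n2 / n  # combined sum sq. dev.
    return n, mu, m2

# Merging two summaries of [1.0, 2.0, 3.0, 4.0] reproduces the merged
# variance checked in the merge test below: 10 / 8 == 1.25.
n, mu, m2 = merge_stats(4, 2.5, 5.0, 4, 2.5, 5.0)
assert (n, mu, m2 / n) == (8, 2.5, 1.25)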
class NAStatCounter:
    def __init__(self):
        self.stats = StatCounter()
        self.missing = 0  # count of None values seen

    def add(self, x):
        if x is None:
            self.missing += 1
        else:
            self.stats.merge(x)
        return self

    def mergeStats(self, other):
        self.stats.mergeStats(other.stats)
        self.missing += other.missing
        return self

    def __repr__(self):
        return "stats: {0}, NaN: {1}".format(self.stats, self.missing)
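A brief usage sketch of the class above, with illustrative values; `add` returns self, so calls can be chained:

counter = NAStatCounter()
for x in [1.0, None, 3.0]:
    counter.add(x)
print(counter)  # stats over [1.0, 3.0], NaN: 1

other = NAStatCounter().add(None).add(5.0)
counter.mergeStats(other)
print(counter.missing)  # 2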
def __init__(self, dataframes=(), columns=()):
    """
    Creates a stats counter for the provided data frames, computing
    the stats for all of the columns in columns.

    Parameters
    ----------
    dataframes: list of dataframes, containing the values to compute
        stats on.
    columns: list of strs, list of columns to compute the stats on.
    """
    # Empty tuples as defaults avoid the shared-mutable-default pitfall
    # of `dataframes=[], columns=[]`.
    self._column_stats = dict(
        (column_name, StatCounter()) for column_name in columns)
    for df in dataframes:
        self.merge(df)
def __init__(self, dataframes, columns):
    """
    Creates a stats counter for the provided data frames, computing
    the stats for all of the columns in columns.

    Parameters
    ----------
    dataframes: list of dataframes, containing the values to compute
        stats on.
    columns: list of strs, list of columns to compute the stats on.
    """
    # Python 3: `str` replaces the Python 2 `basestring` check.
    assert not isinstance(columns, str), \
        "columns should be a list of strs, not a str!"
    assert isinstance(columns, list), "columns should be a list!"
    self._columns = columns
    self._counters = dict((column, StatCounter()) for column in columns)
    for df in dataframes:
        self.merge(df)
def test_merge_stats(self):
    stats1 = StatCounter([1.0, 2.0, 3.0, 4.0])
    stats2 = StatCounter([1.0, 2.0, 3.0, 4.0])
    stats = stats1.mergeStats(stats2)
    self.assertEqual(stats.count(), 8)
    self.assertEqual(stats.max(), 4.0)
    self.assertEqual(stats.mean(), 2.5)
    self.assertEqual(stats.min(), 1.0)
    self.assertAlmostEqual(stats.stdev(), 1.118033988749895)
    self.assertAlmostEqual(stats.sampleStdev(), 1.1952286093343936)
    self.assertEqual(stats.sum(), 20.0)
    self.assertAlmostEqual(stats.variance(), 1.25)
    self.assertAlmostEqual(stats.sampleVariance(), 1.4285714285714286)

    execution_statements = [
        StatCounter([1.0, 2.0]).mergeStats(StatCounter(range(1, 301))),
        StatCounter(range(1, 301)).mergeStats(StatCounter([1.0, 2.0])),
    ]
    for stats in execution_statements:
        self.assertEqual(stats.count(), 302)
        self.assertEqual(stats.max(), 300.0)
        self.assertEqual(stats.min(), 1.0)
        self.assertAlmostEqual(stats.mean(), 149.51324503311)
        self.assertAlmostEqual(stats.variance(), 7596.302804701549)
        self.assertAlmostEqual(stats.sampleVariance(), 7621.539691095905)
def stats(d):
    # Fold the timedelta's length in whole seconds (days plus seconds,
    # ignoring microseconds) into a fresh StatCounter.
    s = StatCounter()
    return s.merge((d.days * 24 * 3600) + d.seconds)
import numpy as np
from pyspark.sql.functions import col
from pyspark.mllib.stat import KernelDensity
from pyspark.statcounter import StatCounter

# Assumes module-level helpers: logger, generate_non_cartesian_pairs,
# similarity_function.


def get_lr_curves(
    spark,
    features_df,
    cluster_ids,
    kernel_bandwidth,
    num_pdf_points,
    random_seed=None,
):
    """
    Compute the likelihood ratio curves for clustered clients.

    Work-flow followed in this function is as follows:

    * Access the DataFrame including cluster numbers and features.
    * Load the same similarity function that will be used in the TAAR module.
    * Iterate through each cluster and compute in-cluster similarity.
    * Iterate through each cluster and compute out-cluster similarity.
    * Compute the kernel density estimate (KDE) per similarity score.
    * Linearly down-sample both PDFs to num_pdf_points points.

    :param spark: the SparkSession object.
    :param features_df: the DataFrame containing the user features (e.g.
                        the ones coming from |get_donors|).
    :param cluster_ids: the list of cluster ids (e.g. the one coming
                        from |get_donors|).
    :param kernel_bandwidth: the kernel bandwidth used to estimate the
                             kernel densities.
    :param num_pdf_points: the number of points to sample for the LR-curves.
    :param random_seed: the provided random seed (fixed in tests).
    :return: A list in the following format
        [(idx, (lr-numerator-for-idx, lr-denominator-for-idx)), (...), ...]
    """
    # Instantiate holder RDDs for inter- and intra-cluster scores.
    same_cluster_scores_rdd = spark.sparkContext.emptyRDD()
    different_clusters_scores_rdd = spark.sparkContext.emptyRDD()

    random_split_kwargs = {"seed": random_seed} if random_seed else {}

    for cluster_number in cluster_ids:
        # Pick the features for users belonging to the current cluster.
        current_cluster_df = features_df.where(
            col("prediction") == cluster_number)
        # Pick the features for users belonging to all the other clusters.
        other_clusters_df = features_df.where(
            col("prediction") != cluster_number)

        logger.debug("Computing scores for cluster",
                     extra={"cluster_id": cluster_number})

        # Compare the similarity score between pairs of clients in the
        # same cluster.
        cluster_half_1, cluster_half_2 = current_cluster_df.rdd.randomSplit(
            [0.5, 0.5], **random_split_kwargs)
        pair_rdd = generate_non_cartesian_pairs(cluster_half_1,
                                                cluster_half_2)
        intra_scores_rdd = pair_rdd.map(lambda r: similarity_function(*r))
        same_cluster_scores_rdd = same_cluster_scores_rdd.union(
            intra_scores_rdd)

        # Compare the similarity score between pairs of clients in
        # different clusters.
        pair_rdd = generate_non_cartesian_pairs(current_cluster_df.rdd,
                                                other_clusters_df.rdd)
        inter_scores_rdd = pair_rdd.map(lambda r: similarity_function(*r))
        different_clusters_scores_rdd = different_clusters_scores_rdd.union(
            inter_scores_rdd)

    # Determine a linearly spaced range of observed similarity values.
    all_scores_rdd = same_cluster_scores_rdd.union(
        different_clusters_scores_rdd)
    stats = all_scores_rdd.aggregate(StatCounter(), StatCounter.merge,
                                     StatCounter.mergeStats)
    min_similarity = stats.minValue
    max_similarity = stats.maxValue
    lr_index = np.arange(
        min_similarity,
        max_similarity,
        float(abs(min_similarity - max_similarity)) / num_pdf_points,
    )

    # Kernel density estimate for the inter-cluster comparison scores.
    kd_dc = KernelDensity()
    kd_dc.setSample(different_clusters_scores_rdd)
    kd_dc.setBandwidth(kernel_bandwidth)
    denominator_density = kd_dc.estimate(lr_index)

    # Kernel density estimate for the intra-cluster comparison scores.
    kd_sc = KernelDensity()
    kd_sc.setSample(same_cluster_scores_rdd)
    kd_sc.setBandwidth(kernel_bandwidth)
    numerator_density = kd_sc.estimate(lr_index)

    # Structure this in the correct output format.
    return list(
        zip(lr_index, list(zip(numerator_density, denominator_density))))
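The aggregate call above is the standard way to reduce an RDD of numbers into a single StatCounter. In isolation, and assuming a SparkContext named `sc`, the idiom looks like this:

from pyspark.statcounter import StatCounter

scores = sc.parallelize([0.1, 0.4, 0.35, 0.8])
stats = scores.aggregate(
    StatCounter(),           # zero value: an empty counter per partition
    StatCounter.merge,       # fold a single value into a counter
    StatCounter.mergeStats,  # combine counters across partitions
)
print(stats.minValue, stats.maxValue)  # 0.1 0.8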
def chooseBandwidthList(self):
    # Silverman's rule-of-thumb bandwidth: 1.06 * sigma * n^(-1/5).
    stddev = StatCounter(self.samples).stdev()
    return 1.06 * stddev * math.pow(len(self.samples), -.2)
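A worked instance of the rule above, on illustrative samples:

samples = [1.0, 2.0, 3.0, 4.0]
sigma = StatCounter(samples).stdev()             # sqrt(1.25) ~= 1.118
bandwidth = 1.06 * sigma * len(samples) ** -0.2  # ~= 0.898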
def test_merge(self):
    stats = StatCounter([1.0, 2.0, 3.0, 4.0])
    stats.merge(5.0)
    self.assertEqual(stats.count(), 5)
    self.assertEqual(stats.max(), 5.0)
    self.assertEqual(stats.mean(), 3.0)
    self.assertEqual(stats.min(), 1.0)
    self.assertAlmostEqual(stats.stdev(), 1.414213562373095)
    self.assertAlmostEqual(stats.sampleStdev(), 1.5811388300841898)
    self.assertEqual(stats.sum(), 15.0)
    self.assertAlmostEqual(stats.variance(), 2.0)
    self.assertAlmostEqual(stats.sampleVariance(), 2.5)