def gen_summary(df, output_prefix=""):
    summary = {}
    string_cols = []
    boolean_cols = []
    numeric_cols = []
    other_cols = []
    for field in df.schema.fields:
        if isinstance(field.dataType, T.StringType):
            string_cols.append(field.name)
        elif isinstance(field.dataType, T.BooleanType):
            boolean_cols.append(field.name)
        elif isnumeric(field.dataType):
            numeric_cols.append(field.name)
        else:
            other_cols.append(field.name)

    counts = cardinalities(df, string_cols)
    uniques = likely_unique(counts)
    categoricals = unique_values(df, likely_categoricals(counts))

    for span in [2, 3, 4, 6, 12]:
        thecube = df.cube(
            "Churn",
            F.ceil(df.tenure / span).alias("%d_month_spans" % span),
            "gender",
            "Partner",
            "SeniorCitizen",
            "Contract",
            "PaperlessBilling",
            "PaymentMethod",
            F.ceil(F.log2(F.col("MonthlyCharges")) * 10).alias("log_charges")).count()
        therollup = df.rollup(
            "Churn",
            F.ceil(df.tenure / span).alias("%d_month_spans" % span),
            "SeniorCitizen",
            "Contract",
            "PaperlessBilling",
            "PaymentMethod",
            F.ceil(F.log2(F.col("MonthlyCharges")) * 10).alias("log_charges")).agg(
                F.sum(F.col("TotalCharges")).alias("sum_charges"))
        thecube.write.mode("overwrite").parquet("%scube-%d.parquet" % (output_prefix, span))
        therollup.write.mode("overwrite").parquet("%srollup-%d.parquet" % (output_prefix, span))

    encoding_struct = {
        "categorical": categoricals,
        "numeric": numeric_cols + boolean_cols,
        "unique": uniques
    }

    summary["schema"] = df.schema.jsonValue()
    summary["ecdfs"] = approx_ecdf(df, numeric_cols)
    summary["true_percentage"] = percent_true(df, boolean_cols)
    summary["encoding"] = encoding_struct
    summary["distinct_customers"] = df.select(df.customerID).distinct().count()

    return summary
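# Hedged usage sketch (not part of the original): illustrates the cube()/rollup() calls above
# on a toy frame. Assumes a SparkSession bound to `spark` and the same `F` alias for
# pyspark.sql.functions. cube() aggregates over every combination of the grouping columns
# (all grouping sets), while rollup() aggregates only over hierarchical prefixes,
# e.g. (Churn, Contract), (Churn,), and the grand total.
def _cube_rollup_sketch(spark):
    toy = spark.createDataFrame(
        [("Yes", "Month-to-month", 29.85), ("No", "Two year", 56.95)],
        ["Churn", "Contract", "MonthlyCharges"])
    toy.cube("Churn", "Contract").count().show()
    toy.rollup("Churn", "Contract").agg(F.sum("MonthlyCharges").alias("sum_charges")).show()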
def _simple_entropy(df: pyspark.sql.dataframe.DataFrame, column_name: str) -> float:
    count = df.count()
    testdf = df.select(column_name).groupby(column_name).agg(
        (F.count(column_name) / count).alias("p"))
    result = testdf.groupby().agg(-F.sum(F.col("p") * F.log2("p"))).collect()[0][0]
    if not result:
        return 0.0
    return result
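# Hedged usage sketch (not in the original): Shannon entropy in bits of a single column,
# computed with _simple_entropy above. Assumes a SparkSession bound to `spark`.
def _simple_entropy_example(spark):
    df = spark.createDataFrame([("a",), ("a",), ("b",), ("b",)], ["label"])
    # Two equally likely values -> entropy of 1.0 bit.
    print(_simple_entropy(df, "label"))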
def distributional_coverage(self):
    """Calculate distributional coverage for recommendations across all users.

    The metric definition is based on formula (21) in the following reference:

    :Citation:
        G. Shani and A. Gunawardana, Evaluating Recommendation Systems,
        Recommender Systems Handbook pp. 257-297, 2010.

    Returns:
        float: distributional coverage
    """
    # In reco_df, how many times each col_item is being recommended
    df_itemcnt_reco = self.reco_df.groupBy(self.col_item).count()

    # the number of total recommendations
    count_row_reco = self.reco_df.count()

    df_entropy = df_itemcnt_reco.withColumn(
        "p(i)", F.col("count") / count_row_reco
    ).withColumn(
        "entropy(i)", F.col("p(i)") * F.log2(F.col("p(i)"))
    )

    # distributional coverage
    d_coverage = -df_entropy.agg(F.sum("entropy(i)")).collect()[0][0]

    return d_coverage
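# Hedged worked example (not in the original): distributional coverage above is the Shannon
# entropy of the recommendation distribution, H = -sum_i p(i) * log2 p(i), where p(i) is the
# share of all recommendations that go to item i. For instance, if two items each receive half
# of the recommendations, p(i) = 0.5 for both and H = -(0.5*log2(0.5) + 0.5*log2(0.5)) = 1.0;
# higher values mean recommendations are spread more evenly across the catalogue.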
def historical_item_novelty(self):
    """Calculate novelty for each item. Novelty is computed as the minus logarithm of
    (number of interactions with item / total number of interactions). The definition of
    the metric is based on the following reference using the choice model (eqs. 1 and 6):

    :Citation:
        P. Castells, S. Vargas, and J. Wang, Novelty and diversity metrics for recommender
        systems: choice, discovery and relevance, ECIR 2011

    The novelty of an item can be defined relative to a set of observed events on the set of
    all items. These can be events of user choice (item "is picked" by a random user) or user
    discovery (item "is known" to a random user). The above definition of novelty reflects a
    factor of item popularity. High novelty values correspond to long-tail items in the density
    function that few users have interacted with, and low novelty values correspond to popular
    head items.

    Returns:
        pyspark.sql.dataframe.DataFrame: A dataframe with the following columns:
        col_item, item_novelty.
    """
    if self.df_item_novelty is None:
        n_records = self.train_df.count()
        self.df_item_novelty = (
            self.train_df.groupBy(self.col_item)
            .count()
            .withColumn("item_novelty", -F.log2(F.col("count") / n_records))
            .select(self.col_item, "item_novelty")
            .orderBy(self.col_item)
        )
    return self.df_item_novelty
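# Hedged worked example (not in the original): item novelty above is -log2(count_i / n_records).
# E.g. an item involved in 1 of 1024 interactions gets novelty -log2(1/1024) = 10.0 bits, while
# an item involved in half of all interactions gets -log2(0.5) = 1.0 bit; rarer items score higher.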
def preprocessing(spark_df):
    smart_feature_columns = [column for column in spark_df.columns if 'smart' in column]
    # Trailing 7-day window per drive (model + serial_number), ordered by days since 2017-07-01.
    window_spec_7 = Window.partitionBy('model', 'serial_number').orderBy(
        F.datediff(F.col('dt'), F.lit('2017-07-01'))).rangeBetween(-7, 0)
    prefix_window7 = 'window_7_'
    for smart_col in smart_feature_columns:
        spark_df = spark_df.withColumn(smart_col, F.col(smart_col).cast(DoubleType()))
        if smart_col in ['smart_1_normalized', 'smart_5raw', 'smart_7_normalized', 'smart_194raw',
                         'smart_199raw', 'smart_190raw', 'smart_191raw', 'smart_193raw',
                         'smart_195_normalized', 'smart_195raw']:
            # Range and standard deviation of the attribute over the trailing 7-day window.
            spark_df = spark_df.withColumn(
                prefix_window7 + 'range_' + smart_col,
                F.max(F.col(smart_col)).over(window_spec_7) -
                F.min(F.col(smart_col)).over(window_spec_7))
            spark_df = spark_df.withColumn(
                prefix_window7 + 'std_' + smart_col,
                F.stddev(F.col(smart_col)).over(window_spec_7))
        # if smart_col in ['smart_187raw', 'smart_188raw', 'smart_197raw', 'smart_198raw']:
        #     spark_df = spark_df.withColumn(smart_col, F.when(F.col(smart_col) > 0, 1).otherwise(0))
        # if smart_col in ['smart_187_normalized', 'smart_188_normalized',
        #                  'smart_197_normalized', 'smart_198_normalized']:
        #     spark_df = spark_df.withColumn(smart_col, F.when(F.col(smart_col) < 100, 1).otherwise(0))
        if smart_col in ['smart_4raw', 'smart_5raw', 'smart_191raw',
                         'smart_187raw', 'smart_197raw', 'smart_198raw',
                         'smart_199raw', 'window_7_range_smart_199raw']:
            # log2(x + 1) to compress heavy-tailed raw counters.
            spark_df = spark_df.withColumn(smart_col, F.log2(F.col(smart_col) + F.lit(1.)))
    spark_df = spark_df.withColumn(
        'smart_199raw', F.col('smart_199raw') * F.col('window_7_range_smart_199raw'))
    spark_df = spark_df.withColumn(
        'anomaly_sum',
        F.col('smart_4raw') / 12 + F.col('smart_5raw') / 16 + F.col('smart_191raw') / 18 +
        F.col('smart_198raw') / 18 + F.col('smart_197raw') / 18 + F.col('smart_187raw') / 15)
    return spark_df
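# Hedged usage sketch (not in the original): minimal illustration of the rangeBetween(-7, 0)
# window used above. Assumes a SparkSession bound to `spark`; the column names here are toy
# stand-ins, not the actual Backblaze SMART schema.
def _rolling_range_sketch(spark):
    toy = spark.createDataFrame(
        [("d1", "2017-07-01", 10.0), ("d1", "2017-07-03", 14.0), ("d1", "2017-07-09", 11.0)],
        ["serial_number", "dt", "smart_x"])
    w = Window.partitionBy("serial_number").orderBy(
        F.datediff(F.col("dt"), F.lit("2017-07-01"))).rangeBetween(-7, 0)
    # Max minus min of smart_x over the trailing 7-day window per drive.
    toy.withColumn("window_7_range_smart_x",
                   F.max("smart_x").over(w) - F.min("smart_x").over(w)).show()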
def evaluate(self, req: List[privacy.Auxiliary], N=2, similarity="general",
             mode="best-guess", with_movie=True, tol=15):
    """De-anonymisation evaluator.

    Given a list of Auxiliary requests and a number of sampled customers, evaluate
    de-anonymisation performance. There are two modes:

    - 'best-guess': returns the true positive rate for a fixed threshold.
    - 'entropic': returns the entropy of the probability distribution.
    """
    scoring = self.get_scoring(similarity, with_movie)
    aux = self.generate_auxiliary_data(req, N)
    scores = self.compute_score(aux, similarity, with_movie, tol)
    if mode == "best-guess":
        match = scoring.matching_set(scores, 0.5)
        return 100 * match.filter("custId_1 == custId_2").count() / N
    elif mode == "entropic":
        probas = scoring.output(scores, mode="entropic")
        withEntropy = probas.groupBy("custId_1").agg(
            (-F.sum(F.col("probas") * F.log2(F.col("probas")))).alias("entropy"))
        return withEntropy.groupBy().avg('entropy').collect()
    else:
        raise ValueError("Invalid argument.")
def _entropy_todo(column, df):
    """
    Returns what (columns, as in spark columns) to compute to get the results requested
    by the parameters.

    :param column:
    :type column: str/int
    :param df:
    :type df: DataFrame
    :return: Pyspark columns representing what to compute.
    """
    # group on that column
    todo = df.groupBy(column)
    # count instances of each group
    todo = todo.agg(count("*").alias("_entropy_ci"))
    # ignore nans/null for computing entropy
    todo = todo.filter(~col(column).isNull())
    todo = todo.select(
        sum(col("_entropy_ci") * log2("_entropy_ci")).alias("_sumcilogci"),
        sum("_entropy_ci").alias("_total"))
    todo = todo.select(
        log2(col("_total")) - col("_sumcilogci") / col("_total"))
    return todo
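# Hedged derivation note (not in the original): the final two selects above rely on the identity
#   H = -sum_i (c_i / N) * log2(c_i / N) = log2(N) - (sum_i c_i * log2(c_i)) / N
# where c_i is the count of group i and N = sum_i c_i, which is exactly
# log2(_total) - _sumcilogci / _total as computed in the last select.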
def evaluate_all(self, req: List[privacy.Auxiliary], N=100, similarity="general",
                 mode="best-guess", with_movie=True, tol=15):
    scoring = self.get_scoring(similarity, with_movie)
    aux = self.generate_auxiliary_data(req, N)
    scores = self.compute_score(aux, similarity, with_movie, tol)
    custIds = aux.custId.unique()
    if mode == "best-guess":
        # {aux, custId, score, eccentricity}
        match = scoring.matching_set(scores, 0.0).toPandas().set_index("custId_1")
        return [{
            "id": custId,
            "aux": aux.set_index("custId").loc[custId],
            "matchedId": int(match.loc[custId]["custId_2"]),
            "score": match.loc[custId]["value_1"],
            "eccentricity": match.loc[custId]["eccentricity"],
        } for custId in custIds]
    elif mode == "entropic":
        scores.cache()
        probas = scoring.output(scores, mode="entropic")
        match = scoring.matching_set(scores, 0.0).toPandas().set_index("custId_1")
        withEntropy = probas.groupBy("custId_1").agg(
            (-F.sum(F.col("probas") * F.log2(F.col("probas")))).alias("entropy")
        ).toPandas().set_index("custId_1")
        return [{
            "id": custId,
            "aux": aux.set_index("custId").loc[custId],
            "matchedId": int(match.loc[custId]["custId_2"]),
            "score": match.loc[custId]["value_1"],
            "eccentricity": match.loc[custId]["eccentricity"],
            "entropy": withEntropy.loc[custId],
        } for custId in custIds]
    else:
        raise ValueError("Invalid argument.")
def _weighted_entropy(
    countdf: pyspark.sql.dataframe.DataFrame,
    total_count: int,
    split_columns: Optional[List[str]],
    target_column_name: str,
    weighted: bool = True
) -> float:
    """Entropy of the target column within each group defined by split_columns,
    optionally weighted by each group's share of the total row count."""
    split_columns_plus_target = split_columns[:]
    split_columns_plus_target.append(target_column_name)
    groupdf = countdf.groupby(split_columns_plus_target).agg(
        F.sum("count").alias("group_count"))

    w = Window.partitionBy(split_columns)
    groupdf = groupdf.withColumn(
        "p", F.col("group_count") / F.sum(groupdf["group_count"]).over(w)
    ).withColumn(
        "weight", F.sum(groupdf["group_count"] / total_count).over(w)
    )

    entropydf = groupdf.groupby(split_columns).agg(
        (-F.sum(F.col("p") * F.log2("p"))).alias("entropy"),
        (F.sum(F.col("group_count") / total_count)).alias("weight"))

    if weighted:
        result = entropydf.groupby().agg(
            F.sum(F.col("entropy") * F.col("weight"))).collect()[0][0]
    else:
        result = entropydf.groupby().sum("entropy").collect()[0][0]
    return result
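# Hedged note (not in the original): with weighted=True the function above returns
#   sum_s (n_s / total_count) * H_s,  where  H_s = -sum_v p(v|s) * log2 p(v|s)
# i.e. the split-size-weighted average entropy of the target column across the groups defined
# by split_columns; this is the quantity one would compare against the unsplit entropy when
# estimating the information gain of a split.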
def compile_log2(t, expr, scope, **kwargs):
    op = expr.op()
    src_column = t.translate(op.arg, scope)
    return F.log2(src_column)
def tocolumns(df, expr):
    import pyspark.sql.functions as fcns

    if isinstance(expr, histbook.expr.Const):
        return fcns.lit(expr.value)

    elif isinstance(expr, (histbook.expr.Name, histbook.expr.Predicate)):
        return df[expr.value]

    elif isinstance(expr, histbook.expr.Call):
        if expr.fcn == "abs" or expr.fcn == "fabs":
            return fcns.abs(tocolumns(df, expr.args[0]))
        elif expr.fcn == "max" or expr.fcn == "fmax":
            return fcns.greatest(*[tocolumns(df, x) for x in expr.args])
        elif expr.fcn == "min" or expr.fcn == "fmin":
            return fcns.least(*[tocolumns(df, x) for x in expr.args])
        elif expr.fcn == "arccos":
            return fcns.acos(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arccosh":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "arcsin":
            return fcns.asin(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arcsinh":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "arctan2":
            return fcns.atan2(tocolumns(df, expr.args[0]), tocolumns(df, expr.args[1]))
        elif expr.fcn == "arctan":
            return fcns.atan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arctanh":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "ceil":
            return fcns.ceil(tocolumns(df, expr.args[0]))
        elif expr.fcn == "copysign":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "cos":
            return fcns.cos(tocolumns(df, expr.args[0]))
        elif expr.fcn == "cosh":
            return fcns.cosh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "rad2deg":
            return tocolumns(df, expr.args[0]) * (180.0 / math.pi)
        elif expr.fcn == "erfc":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "erf":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "exp":
            return fcns.exp(tocolumns(df, expr.args[0]))
        elif expr.fcn == "expm1":
            return fcns.expm1(tocolumns(df, expr.args[0]))
        elif expr.fcn == "factorial":
            return fcns.factorial(tocolumns(df, expr.args[0]))
        elif expr.fcn == "floor":
            return fcns.floor(tocolumns(df, expr.args[0]))
        elif expr.fcn == "fmod":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "gamma":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "hypot":
            return fcns.hypot(tocolumns(df, expr.args[0]), tocolumns(df, expr.args[1]))
        elif expr.fcn == "isinf":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "isnan":
            return fcns.isnan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "lgamma":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "log10":
            return fcns.log10(tocolumns(df, expr.args[0]))
        elif expr.fcn == "log1p":
            return fcns.log1p(tocolumns(df, expr.args[0]))
        elif expr.fcn == "log":
            return fcns.log(tocolumns(df, expr.args[0]))
        elif expr.fcn == "pow":
            return fcns.pow(tocolumns(df, expr.args[0]), tocolumns(df, expr.args[1]))
        elif expr.fcn == "deg2rad":
            return tocolumns(df, expr.args[0]) * (math.pi / 180.0)
        elif expr.fcn == "sinh":
            return fcns.sinh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sin":
            return fcns.sin(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sqrt":
            return fcns.sqrt(tocolumns(df, expr.args[0]))
        elif expr.fcn == "tanh":
            return fcns.tanh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "tan":
            return fcns.tan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "trunc":
            raise NotImplementedError(expr.fcn)   # FIXME (fcns.trunc is for dates)
        elif expr.fcn == "xor":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "conjugate":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "exp2":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "heaviside":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "isfinite":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "left_shift" and isinstance(expr.args[1], histbook.expr.Const):
            return fcns.shiftLeft(tocolumns(df, expr.args[0]), expr.args[1].value)
        elif expr.fcn == "log2":
            return fcns.log2(tocolumns(df, expr.args[0]))
        elif expr.fcn == "logaddexp2":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "logaddexp":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "mod" or expr.fcn == "fmod":
            return tocolumns(df, expr.args[0]) % tocolumns(df, expr.args[1])
        elif expr.fcn == "right_shift" and isinstance(expr.args[1], histbook.expr.Const):
            return fcns.shiftRight(tocolumns(df, expr.args[0]), expr.args[1].value)
        elif expr.fcn == "rint":
            return fcns.rint(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sign":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "where":
            return fcns.when(tocolumns(df, expr.args[0]),
                             tocolumns(df, expr.args[1])).otherwise(tocolumns(df, expr.args[2]))
        elif expr.fcn == "numpy.equal":
            return tocolumns(df, expr.args[0]) == tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.not_equal":
            return tocolumns(df, expr.args[0]) != tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.less":
            return tocolumns(df, expr.args[0]) < tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.less_equal":
            return tocolumns(df, expr.args[0]) <= tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.isin":
            return tocolumns(df, expr.args[0]) in tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_not":
            return ~tocolumns(df, expr.args[0])
        elif expr.fcn == "numpy.add":
            return tocolumns(df, expr.args[0]) + tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.subtract":
            return tocolumns(df, expr.args[0]) - tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.multiply":
            return tocolumns(df, expr.args[0]) * tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.true_divide":
            return tocolumns(df, expr.args[0]) / tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_or":
            return tocolumns(df, expr.args[0]) | tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_and":
            return tocolumns(df, expr.args[0]) & tocolumns(df, expr.args[1])
        else:
            raise NotImplementedError(expr.fcn)

    else:
        raise AssertionError(expr)
def udaf(self, data):
    """
    Apply median polish to groupBy keys and return value for each sample within that grouping.

    This is a hacked/workaround user-defined aggregate function (UDAF) that passes the grouped
    data to python to do median polish and return the result back to the dataframe.

    :returns: spark dataframe
    """
    # register the medianpolish as a UDF
    medpol = udf(probe_summarization, ArrayType(ArrayType(StringType())))

    # repartition by our grouping keys
    if self.group_keys not in [['TRANSCRIPT_CLUSTER'], ['PROBESET']]:
        raise Exception("Invalid grouping keys.")
    data = data.withColumnRenamed('NORMALIZED_INTENSITY_VALUE', 'VALUE')
    data = data.repartition(self.repartition_number, self.group_keys)

    # log 2 values
    data = data.withColumn('VALUE', log2(data['VALUE']).alias('VALUE'))

    # group the data while concatenating rest of columns into one value
    # so we can pass it to collect, one value(list) per row and a list of
    # lists for the whole grouping, so that we can give it to our UDF as
    # one item which returns back one item (array of arrays)
    data = data.withColumn(
        'data', concat_ws(',', 'SAMPLE', 'PROBE', 'VALUE')) \
        .groupBy(self.group_keys) \
        .agg(collect_list('data').alias('data')) \
        .withColumn('data', medpol('data'))

    def gen_cols(other_cols):
        """
        Create a list for select(). select() can take one list, or *args.
        Generate the grouping keys as columns and add the other column
        selections to the same list.

        :param other_cols: list of other column selections
        :type other_cols: list
        :returns: single list of columns, expressions, etc. for select()
        """
        cols = [col(s) for s in self.group_keys]
        cols += other_cols
        return cols

    # unpack the first level of nesting vertically, so each array in the
    # array is a new row (per sample)
    data = data.select(
        gen_cols([explode(data['data']).alias("SAMPLEVALUE")]))

    # unpack the final nesting laterally, into two new columns
    data = data.select(
        gen_cols([
            data['SAMPLEVALUE'].getItem(0).alias('SAMPLE'),
            data['SAMPLEVALUE'].getItem(1).alias("VALUE")
        ]))

    data = data.repartition(int(self.num_samples))
    return data