def topn(df, c, by=None, n=3, others=False, index='_idx', result='_res'):
    cnt = f'{c}##cnt'
    cnt_tot = f'{c}##tot'
    cnt_top = f'{c}##top'
    _gcols = [by] if isinstance(by, str) and by else by or []

    s, a = _topn(df, c, _gcols, n)
    r = a.select(*_gcols, c, cnt)

    if others:
        # calculate total topn and total
        o = a.groupby(_gcols).agg(F.sum(cnt).alias(cnt_top))
        t = s.groupby(*_gcols).agg(F.sum(cnt).alias(cnt_tot))

        # corner case: single row
        o = o.join(t, on=_gcols) if _gcols else o.withColumn(
            cnt_tot, F.lit(t.take(1)[0][0]))

        # collect to list
        r = r.union(
            o.select(*_gcols, F.lit(others).alias(c),
                     (F.col(cnt_tot) - F.col(cnt_top)).alias(cnt)))

    r = r.groupby(*_gcols).agg(
        F.map_from_arrays(F.collect_list(F.col(c)),
                          F.collect_list(F.col(cnt))).alias(result))

    # the following forces colname to be nullable
    r = r.withColumn(index, F.udf(lambda x: x, T.StringType())(F.lit(c)))

    return r
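# `_topn` is defined elsewhere in the original module and is not shown here.
# The stand-in below is an assumption for illustration only: it counts values per
# group (`s`) and keeps the n most frequent values per group (`a`), which is the
# shape topn() above appears to expect.
from pyspark.sql import functions as F, Window


def _topn(df, c, gcols, n):
    cnt = f'{c}##cnt'
    # counts per (group, value)
    s = df.groupby(*gcols, c).agg(F.count(F.lit(1)).alias(cnt))
    # keep the n most frequent values within each group
    w = Window.partitionBy(*gcols).orderBy(F.col(cnt).desc())
    a = (s.withColumn('##rank', F.row_number().over(w))
          .filter(F.col('##rank') <= n)
          .drop('##rank'))
    return s, a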
def percentiles(df, c, by=None, p=[10, 25, 50, 75, 90], index='_idx', result='_res'):
    _gcols = [by] if isinstance(by, str) and by else by or []
    ptile = f'{c}##p'

    # percentiles per row
    w = Window.partitionBy(*_gcols).orderBy(c)
    d = df.select(c, *_gcols,
                  F.floor(100 * (F.percent_rank().over(w))).alias(ptile))

    # aggregate
    agg_keys = F.array(*[F.lit(x) for x in p])
    agg_values = F.array(
        *[F.max(F.when(F.col(ptile) < x, F.col(c))) for x in p])
    r = d.groupby(*_gcols).agg(
        F.map_from_arrays(agg_keys, agg_values).alias(result))

    # add colname
    r = r.withColumn(index, F.lit(c))

    return r
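# A minimal usage sketch for percentiles() above, not taken from the original
# source. It assumes an active SparkSession and the same module-level imports the
# function relies on (`pyspark.sql.functions as F` and `Window`).
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sales = spark.createDataFrame(
    [('a', 1.0), ('a', 2.0), ('a', 3.0), ('b', 10.0), ('b', 20.0)],
    ['store', 'amount'])

# one row per store; `_res` maps percentile -> amount, `_idx` holds 'amount'
percentiles(sales, 'amount', by='store').show(truncate=False)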
def scd_analyze(df, merge_on=None, state_col='_state', updated_col='_updated'):
    add_ids = '##add_ids'
    del_ids = '##del_ids'
    upd_ids = '##upd_ids'

    c = set(df.columns).difference({state_col, updated_col})
    colnames = [x for x in df.columns if x in c]

    on = merge_on or colnames
    on = on if isinstance(on, (list, tuple)) else [on]
    on = [c for c in on if c in colnames]

    s = on + [state_col, updated_col]
    cols = [x for x in df.columns if x not in s]

    a = df.filter(f'{state_col} = 0') \
        .groupby(updated_col) \
        .agg(F.collect_set(F.concat(*on)).alias(add_ids)) \
        .select(updated_col, add_ids)

    d = df.filter(f'{state_col} = 1') \
        .groupby(updated_col) \
        .agg(F.collect_set(F.concat(*on)).alias(del_ids)) \
        .select(updated_col, del_ids)

    res = a.join(d, on=updated_col, how='outer')
    res = res.select(
        updated_col,
        F.coalesce(add_ids, F.array([])).alias(add_ids),
        F.coalesce(del_ids, F.array([])).alias(del_ids))

    if cols:
        agg_funcs = [(F.countDistinct(x) - F.lit(1)).alias(x) for x in cols]
        cnt = df.groupby(*on, updated_col).agg(*agg_funcs)

        agg_names = [F.lit(x) for x in cols]
        agg_sums = [F.sum(x) for x in cols]
        cnt = cnt.groupby(updated_col).agg(
            F.map_from_arrays(F.array(*agg_names),
                              F.array(*agg_sums)).alias('changes'))

        res = res.join(cnt, on=updated_col)
    else:
        res = res.withColumn('changes', F.lit(None))

    res = res.select('*', F.array_intersect(add_ids, del_ids).alias(upd_ids))
    res = res.select(
        F.col(updated_col).alias('updated'),
        F.size(upd_ids).alias('upd'),
        F.size(F.array_except(add_ids, upd_ids)).alias('add'),
        F.size(F.array_except(del_ids, upd_ids)).alias('del'),
        'changes')

    return res.orderBy('updated')
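# Hypothetical usage of scd_analyze() above (assumed data, not from the original
# source): a tiny slowly-changing-dimension log where `_state` 0 marks inserts,
# `_state` 1 marks deletes, and `_updated` is the change date.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
scd = spark.createDataFrame(
    [('u1', 'alice', 0, '2021-01-01'),
     ('u2', 'bob',   0, '2021-01-01'),
     ('u2', 'bob',   1, '2021-01-02'),
     ('u2', 'bobby', 0, '2021-01-02')],
    ['id', 'name', '_state', '_updated'])

# one row per `_updated` value with add/del/upd counts and a `changes` map
scd_analyze(scd, merge_on='id').show(truncate=False)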
def sampleVCF(hdfs, spark):
    vcf = spark.sparkContext.textFile(hdfs)
    col_name = vcf.filter(lambda x: x.startswith("#CHROM")).first().split("\t")

    vcf_data = vcf.filter(lambda x: re.match("[^#][^#]", x)) \
        .map(lambda x: x.split("\t")) \
        .toDF(col_name) \
        .withColumn("POS", F.col("POS").cast(IntegerType()))

    sample_name = vcf_data.columns[-1]
    vcf_data = vcf_data.select(F.col("#CHROM"), F.col("POS"),
                               F.col("FORMAT"), F.col(sample_name)) \
        .withColumn("FORMAT", F.array_remove(F.split(F.col("FORMAT"), ":"), "GT")) \
        .withColumn(sample_name, firstremove(F.split(F.col(sample_name), ":"))) \
        .withColumn(sample_name, F.map_from_arrays(F.col("FORMAT"), F.col(sample_name)))

    return vcf_data.select("#CHROM", "POS", sample_name)
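# `firstremove` is not defined in the snippet above. A plausible stand-in (an
# assumption, not the original code) drops the first element (the GT value) of
# the split sample field, so the remaining values line up with FORMAT once "GT"
# has been removed from it.
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType

firstremove = F.udf(lambda arr: arr[1:] if arr else arr,
                    ArrayType(StringType()))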
def generate(
    batch_id,
    n_data,
    public_key_hex_internal,
    public_key_hex_external,
    output,
    n_rows,
    scale,
    partition_size_mb,
):
    shares = (
        spark_session().range(n_rows * n_data)
        .select(
            (F.col("id") % n_rows).alias("row_id"),
            F.when(F.rand() > 0.5, 1).otherwise(0).alias("payload"),
        )
        .groupBy("row_id")
        .agg(F.collect_list("payload").alias("payload"))
        .select(
            F.pandas_udf(
                partial(
                    udf.encode,
                    batch_id,
                    n_data,
                    public_key_hex_internal,
                    public_key_hex_external,
                ),
                returnType="a: binary, b: binary",
            )("payload").alias("shares"))
        # repeat this data `scale` times
        .withColumn("_repeat", F.explode(F.array_repeat(F.lit(0), scale)))
        .drop("_repeat")
        .withColumn("id", F.udf(lambda: str(uuid4()), returnType="string")()))

    # we can make an estimate with just a single row, since the configuration
    # is the same here.
    row = shares.first()
    dataset_estimate_mb = ((len(b64encode(row.shares.a)) + len(str(uuid4()))) *
                           n_rows * scale * 1.0 / 10**6)
    num_partitions = math.ceil(dataset_estimate_mb / partition_size_mb)
    click.echo(f"writing {num_partitions} partitions")

    # try to be efficient without caching by repartitioning
    repartitioned = (
        shares
        .withColumn(
            "shares",
            F.map_from_arrays(F.array(F.lit("a"), F.lit("b")),
                              F.array("shares.a", "shares.b")),
        )
        .repartitionByRange(num_partitions, "id")
        .select("id", F.explode("shares").alias("server_id", "payload")))
    repartitioned.write.partitionBy("server_id").json(output, mode="overwrite")
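# A stripped-down, self-contained sketch of the share-splitting step above.
# The dummy payloads and DataFrame are assumptions standing in for the real
# output of udf.encode; only the map_from_arrays/explode pattern is the point.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
rows = spark.createDataFrame(
    [('row-1', bytearray(b'share-a'), bytearray(b'share-b'))],
    ['id', 'a', 'b'])

# pack both shares into a map keyed by server, then explode to one row per server
per_server = (rows
              .withColumn('shares',
                          F.map_from_arrays(F.array(F.lit('a'), F.lit('b')),
                                            F.array('a', 'b')))
              .select('id', F.explode('shares').alias('server_id', 'payload')))
# yields two rows for row-1: (server_id='a', payload=b'share-a') and ('b', b'share-b')
per_server.show()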
def topn(df, c, by=None, n=3, others=False, index='_idx', result='_res'):
    """
    :param df: input dataframe
    :param c: column to aggregate
    :param by: group by
    :param n: top n results
    :param others: if truthy, it must be of the same type as the column being
        aggregated, e.g. 'others' for strings and -1 for positive numbers
    :param index:
    :param result:
    :return:
    """
    cnt = f'{c}##cnt'
    cnt_tot = f'{c}##tot'
    cnt_top = f'{c}##top'
    _gcols = [by] if isinstance(by, str) and by else by or []

    s, a = _topn(df, c, _gcols, n)
    r = a.select(*_gcols, c, cnt)

    if others:
        # calculate total topn and total
        o = a.groupby(_gcols).agg(F.sum(cnt).alias(cnt_top))
        t = s.groupby(*_gcols).agg(F.sum(cnt).alias(cnt_tot))

        # corner case: single row
        o = o.join(t, on=_gcols) if _gcols else o.withColumn(
            cnt_tot, F.lit(t.take(1)[0][0]))

        o = o.select(*_gcols, F.lit(others).alias(c),
                     (F.col(cnt_tot) - F.col(cnt_top)).alias(cnt))

        # collect to list
        r = r.union(o)

    r = r.groupby(*_gcols).agg(
        F.map_from_arrays(F.collect_list(F.col(c)),
                          F.collect_list(F.col(cnt))).alias(result))

    # the following forces colname to be nullable
    r = r.withColumn(index, F.udf(lambda x: x, T.StringType())(F.lit(c)))

    return r
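# Hypothetical usage of topn() above (assumed data; relies on the `_topn` helper,
# a stand-in for which is sketched after the first topn variant above).
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
events = spark.createDataFrame(
    [('eu', 'it'), ('eu', 'it'), ('eu', 'fr'), ('eu', 'de'), ('us', 'us')],
    ['region', 'country'])

# `_res` maps the top-2 countries per region to their counts; everything else
# is folded into the 'others' entry
topn(events, 'country', by='region', n=2, others='others').show(truncate=False)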
def applyLabels(self, df, labels, date):
    map_keys = array([lit(k) for k in labels.keys()])
    map_values = array([lit(v) for v in labels.values()])
    map_func = map_from_arrays(map_keys, map_values)

    verdict = udf(self.spark_functions.finalVerdict, StringType())

    labelled_df = df.withColumn('src_verdict', map_func.getItem(df.src_ip)) \
                    .withColumn('dest_verdict', map_func.getItem(df.dest_ip))
    labelled_df = labelled_df.withColumn(
        'label', verdict('src_verdict', 'dest_verdict', 'src_ip', 'dest_ip'))

    to_drop = ['src_verdict', 'dest_verdict', 'asn', 'country', 'timestamp']
    labelled_df = labelled_df.drop(*to_drop)

    # Drop IPs that are not labelled
    labelled_df.show()
    return labelled_df
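# A small, self-contained illustration of the lookup pattern used in applyLabels
# (assumed data; not part of the original class). A literal map built with
# map_from_arrays acts as a dictionary lookup via getItem; unmatched IPs come
# back as null.
from pyspark.sql import SparkSession
from pyspark.sql.functions import array, lit, map_from_arrays

spark = SparkSession.builder.getOrCreate()
labels = {'10.0.0.1': 'malicious', '10.0.0.2': 'benign'}

flows = spark.createDataFrame(
    [('10.0.0.1', '10.0.0.2'), ('10.0.0.2', '8.8.8.8')],
    ['src_ip', 'dest_ip'])

lookup = map_from_arrays(array([lit(k) for k in labels]),
                         array([lit(v) for v in labels.values()]))
flows.withColumn('src_verdict', lookup.getItem(flows.src_ip)) \
     .withColumn('dest_verdict', lookup.getItem(flows.dest_ip)) \
     .show()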
def map_from_array(theArray, theDelim):
    def pull_key_val(x, d, kind):
        retval = []
        index = -1
        if kind == "key":
            index = 0
        if kind == "val":
            index = 1
        if index == -1:
            raise ValueError("Bad input")
        for i in x:
            retval.append(i.split(d)[index])
        return retval

    pull_key_udf = f.pandas_udf(
        lambda x: x.apply(pull_key_val, args=(theDelim, "key")),
        ArrayType(StringType()))
    pull_val_udf = f.pandas_udf(
        lambda x: x.apply(pull_key_val, args=(theDelim, "val")),
        ArrayType(StringType()))

    return f.map_from_arrays(pull_key_udf(theArray), pull_val_udf(theArray))
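# Hypothetical usage of map_from_array() above: a column of "key=value" strings
# becomes a proper map column. Assumes a SparkSession and that pandas/pyarrow
# are available, since the function is built on pandas_udf.
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

spark = SparkSession.builder.getOrCreate()
tags = spark.createDataFrame(
    [(['color=red', 'size=L'],), (['color=blue', 'size=S'],)],
    ['tags'])

# tag_map: {color -> red, size -> L} and {color -> blue, size -> S}
tags.select(map_from_array(f.col('tags'), '=').alias('tag_map')).show(truncate=False)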
def generate_integration(
    data_config,
    public_key_hex_internal,
    public_key_hex_external,
    output,
    n_rows,
    n_partitions,
    n_drop_batch,
):
    """Generate test data from a configuration file.

    The data is generated in a deterministic way and fits into memory."""
    spark = spark_session()
    assert n_rows > 0

    config_data = spark.read.json(data_config, multiLine=True)

    def generate_data(batch_id, n_data):
        return dict(
            batch_id=batch_id,
            n_data=n_data,
            payload=[int(x % 3 == 0 or x % 5 == 0) for x in range(n_data)],
        )

    test_data = []
    # by dropping batch_ids, we can test whether the main processing script is
    # robust enough when the data doesn't exist.
    collected_rows = sorted(config_data.collect(),
                            key=lambda x: x["batch_id"],
                            reverse=True)
    for conf in collected_rows[n_drop_batch:]:
        batch_id = conf["batch_id"]
        n_data = conf["n_data"]
        test_data += [generate_data(batch_id, n_data) for _ in range(n_rows - 1)]
        # include invalid data for a batch
        test_data += [generate_data(batch_id, n_data + 1)]
    # include unknown batch id
    test_data += [generate_data("bad-id", 10) for _ in range(n_rows)]

    shares = (
        spark.createDataFrame(test_data)
        .select(
            "batch_id",
            F.udf(udf.encode_single, returnType="a: binary, b: binary")(
                "batch_id",
                "n_data",
                F.lit(public_key_hex_internal),
                F.lit(public_key_hex_external),
                "payload",
            ).alias("shares"),
        )
        .withColumn("id", F.udf(lambda: str(uuid4()), returnType="string")()))

    repartitioned = (
        shares
        .withColumn(
            "shares",
            F.map_from_arrays(F.array(F.lit("a"), F.lit("b")),
                              F.array("shares.a", "shares.b")),
        )
        .repartitionByRange(n_partitions, "batch_id", "id")
        .select("batch_id", "id",
                F.explode("shares").alias("server_id", "payload")))

    repartitioned.write.partitionBy("server_id",
                                    "batch_id").json(output, mode="overwrite")
df2 = spark.sql(sql)
# df2.cache()
# df2.show()

columns = df2.columns
columns.remove("uid")
columns.remove("time_interval")

# merge all metric columns into a single array column
df3 = df2.withColumn('data', f.array(columns))
# df3.show(10, False)

# combine by time period: time_interval is the key, the metric array is the value
df4 = df3.groupBy("uid").agg(
    f.map_from_arrays(f.collect_list("time_interval"),
                      f.collect_list("data")).alias("map"))
# df4.show(10, False)

# udf: accumulate metrics forward across time periods
# def agg(data):
#     f = {}
#     m = data
#     for ci, cn in enumerate(columns):
#         d = {}
#         for i in range(0, 144):
#             if i in m:
#                 num = m[i][ci]
#             else:
#                 num = 0
#             for j in range(0, i):
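# A self-contained toy version of the aggregation above (assumed data; the real
# `sql` query and its metric columns are not shown in the snippet).
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame(
    [('u1', 0, 1.0, 2.0), ('u1', 1, 3.0, 4.0), ('u2', 0, 5.0, 6.0)],
    ['uid', 'time_interval', 'm1', 'm2'])

metric_cols = [c for c in toy.columns if c not in ('uid', 'time_interval')]
toy = toy.withColumn('data', f.array(*metric_cols))

# map: {0 -> [1.0, 2.0], 1 -> [3.0, 4.0]} for u1 and {0 -> [5.0, 6.0]} for u2
toy.groupBy('uid').agg(
    f.map_from_arrays(f.collect_list('time_interval'),
                      f.collect_list('data')).alias('map')).show(truncate=False)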