def get_nunique(self, df, columns=[]):
    """return dict with number of unique entries for given columns

    :param df: input (spark) data frame
    :param columns: columns to select (optional)
    """
    if not columns:
        columns = df.columns
    qdf = df.agg(*(approxCountDistinct(sparkcol(c)).alias(c) for c in columns))
    return qdf.toPandas().T[0].to_dict()
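# A minimal usage sketch (assumed setup): the method above appears to rely on
# `approxCountDistinct` and a `sparkcol` alias for `pyspark.sql.functions.col`;
# the sample DataFrame below is purely illustrative.
from pyspark.sql import SparkSession
from pyspark.sql.functions import approxCountDistinct, col as sparkcol

spark = SparkSession.builder.appName("nunique-sketch").getOrCreate()
df = spark.createDataFrame([(1, "a"), (2, "a"), (3, "b")], ["id", "grp"])
# get_nunique is a method; `self` is passed as None here only for illustration.
print(get_nunique(None, df))            # e.g. {'id': 3, 'grp': 2}
print(get_nunique(None, df, ["grp"]))   # e.g. {'grp': 2}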
def test_aggregator(self):
    df = self.df
    g = df.groupBy()
    self.assertEqual([99, 100], sorted(g.agg({'key': 'max', 'value': 'count'}).collect()[0]))
    self.assertEqual([Row(**{"AVG(key#0)": 49.5})], g.mean().collect())

    from pyspark.sql import functions
    self.assertEqual((0, u'99'),
                     tuple(g.agg(functions.first(df.key), functions.last(df.value)).first()))
    self.assertTrue(95 < g.agg(functions.approxCountDistinct(df.key)).first()[0])
    self.assertEqual(100, g.agg(functions.countDistinct(df.value)).first()[0])
spark = SparkSession\
    .builder\
    .appName("StructuredKafkaWordCount")\
    .getOrCreate()

lines = spark\
    .readStream\
    .format("kafka")\
    .option("kafka.bootstrap.servers", bootstrapServers)\
    .option(subscribeType, topics)\
    .load()\
    .selectExpr("CAST(value AS STRING)")

probes = lines.select(
    split(lines.value, ',')[0].alias('timestamp'),
    split(lines.value, ',')[1].alias('mac'),
    split(lines.value, ',')[2].alias('SSID'),
    split(lines.value, ',')[3].alias('fornecedor'),
    split(lines.value, ',')[4].alias('macId'))

pnl = probes.filter('SSID != "BROADCAST"').select(
    approxCountDistinct('mac', rsd=0.01).alias('count'))

query = pnl\
    .writeStream\
    .outputMode("complete")\
    .foreach(processRow)\
    .start()

query.awaitTermination()
topics = sys.argv[3]

spark = SparkSession\
    .builder\
    .appName("StructuredKafkaWordCount")\
    .getOrCreate()

lines = spark\
    .readStream\
    .format("kafka")\
    .option("kafka.bootstrap.servers", bootstrapServers)\
    .option(subscribeType, topics)\
    .load()\
    .selectExpr("CAST(value AS STRING)")

probes = lines.select(
    split(lines.value, ',')[0].alias('timestamp'),
    split(lines.value, ',')[1].alias('mac'),
    split(lines.value, ',')[2].alias('SSID'),
    split(lines.value, ',')[3].alias('fornecedor'))

probesDir = probes.filter('SSID != "BROADCAST"').select(
    approxCountDistinct('timestamp', rsd=0.01).alias('count'))

query = probesDir\
    .writeStream\
    .outputMode("complete")\
    .foreach(processRow)\
    .start()

query.awaitTermination()
spark = SparkSession\
    .builder\
    .appName("StructuredKafkaWordCount")\
    .getOrCreate()

lines = spark\
    .readStream\
    .format("kafka")\
    .option("kafka.bootstrap.servers", bootstrapServers)\
    .option(subscribeType, topics)\
    .load()\
    .selectExpr("CAST(value AS STRING)")

probes = lines.select(
    split(lines.value, ',')[0].alias('timestamp'),
    split(lines.value, ',')[1].alias('mac'),
    split(lines.value, ',')[2].alias('SSID'),
    split(lines.value, ',')[3].alias('fornecedor'),
    split(lines.value, ',')[4].alias('macId'))

ssid = probes.filter('SSID != "BROADCAST"').select(
    approxCountDistinct('SSID', rsd=0.01).alias('count'))

query = ssid\
    .writeStream\
    .outputMode("complete")\
    .foreach(processRow)\
    .start()

query.awaitTermination()
def number_distinct_values(col):
    return F.approxCountDistinct(col)
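# A minimal usage sketch (assumed setup): `F` is taken to be the usual
# `pyspark.sql.functions` alias; the DataFrame below is illustrative only.
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.appName("distinct-sketch").getOrCreate()
df = spark.createDataFrame([("a",), ("a",), ("b",)], ["letter"])
# approxCountDistinct yields an aggregate Column, so the helper is used inside agg().
df.agg(number_distinct_values("letter").alias("n_letters")).show()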
# Create DataSet representing the stream of input lines from kafka
lines = spark\
    .readStream\
    .format("kafka")\
    .option("kafka.bootstrap.servers", bootstrapServers)\
    .option(subscribeType, topics)\
    .load()\
    .selectExpr("CAST(value AS STRING)")

# Build the data table
dados = lines.select(
    split(lines.value, ', ')[0].alias("Source"),
    split(lines.value, ', ')[1].alias("Time"),
    split(lines.value, ', ')[2].alias("ssid"),
    split(lines.value, ', ')[3].alias("marca"))

# Remove broadcast-type probes
directProbes = dados.filter('ssid != "Wildcard (Broadcast)"')

# Count the number of distinct SSIDs
qtdSsidDif = directProbes.agg(approxCountDistinct('ssid'))

query = qtdSsidDif\
    .writeStream\
    .outputMode('complete')\
    .foreach(processRow)\
    .start()

query.awaitTermination()
# Create DataSet representing the stream of input lines from kafka
lines = spark\
    .readStream\
    .format("kafka")\
    .option("kafka.bootstrap.servers", bootstrapServers)\
    .option(subscribeType, topics)\
    .load()\
    .selectExpr("CAST(value AS STRING)")

# Build the data table
dados = lines.select(
    split(lines.value, ', ')[0].alias("Source"),
    split(lines.value, ', ')[1].alias("Time"),
    split(lines.value, ', ')[2].alias("ssid"),
    split(lines.value, ', ')[3].alias("marca"))

# Remove broadcast-type probes
directProbes = dados.filter('ssid != "Wildcard (Broadcast)"')

# Total number of direct probes
totalDirect = directProbes.agg(approxCountDistinct('Time'))

query = totalDirect\
    .writeStream\
    .outputMode('complete')\
    .format('console')\
    .start()

query.awaitTermination()
lines = spark\
    .readStream\
    .format("kafka")\
    .option("kafka.bootstrap.servers", bootstrapServers)\
    .option(subscribeType, topics)\
    .load()\
    .selectExpr("CAST(value AS STRING)")

probes = lines.select(
    split(lines.value, ',')[0].alias('timestamp'),
    split(lines.value, ',')[1].alias('mac'),
    split(lines.value, ',')[2].alias('SSID'),
    split(lines.value, ',')[3].alias('fornecedores'),
    split(lines.value, ',')[4].alias('macId'))

dispositivos = probes.select(approxCountDistinct('mac', rsd=0.01))

pnl = probes.select('mac', 'SSID', 'macId').filter('SSID != "BROADCAST"').distinct()
pnl1 = pnl.select(approxCountDistinct('macId', rsd=0.01))

probesTot = probes.select(approxCountDistinct('timestamp', rsd=0.01))
probesBroad = probes.filter('SSID == "BROADCAST"').select(
    approxCountDistinct('timestamp', rsd=0.01))
probesDir = probes.filter('SSID != "BROADCAST"').select(
    approxCountDistinct('timestamp', rsd=0.01))

# macs = probes.groupBy('mac').count()
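# A hedged completion sketch: the fragment above defines several aggregates but
# never attaches them to a sink. Following the pattern of the surrounding
# snippets, each aggregate would be started as its own streaming query; the
# console sink chosen here is an assumption, not part of the original code.
query = probesDir\
    .writeStream\
    .outputMode('complete')\
    .format('console')\
    .start()

query.awaitTermination()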
.format("kafka")\ .option("kafka.bootstrap.servers", bootstrapServers)\ .option(subscribeType, topics)\ .load()\ .selectExpr("CAST(value AS STRING)") # Cria a tabela de dados dados = lines.select( split(lines.value, ', ')[0].alias("Source"), split(lines.value, ', ')[1].alias("Time"), split(lines.value, ', ')[2].alias("ssid"), split(lines.value, ', ')[3].alias("marca")) #conta a quantidade de devices diferentes qtdDeviceDif = dados.agg(approxCountDistinct('Source')) query = qtdDeviceDif\ .writeStream\ .outputMode('complete')\ .format('console')\ .start() query.awaitTermination() #conta a quantidade de devices diferentes qtdDeviceDif = dados.agg(approxCountDistinct('Source')) query = qtdDeviceDif\ .writeStream\ .outputMode('complete')\ .foreach(processRow)\
.format("kafka")\ .option("kafka.bootstrap.servers", bootstrapServers)\ .option(subscribeType, topics)\ .load()\ .selectExpr("CAST(value AS STRING)") probes = lines.select( split(lines.value,',')[0].alias('timestamp'), split(lines.value,',')[1].alias('mac'), split(lines.value,',')[2].alias('SSID'), split(lines.value,',')[3].alias('fornecedor'), split(lines.value,',')[4].alias('macId') ) dispositivos = probes.select( approxCountDistinct('mac',rsd = 0.01) ) pnl = probes.filter('SSID != "BROADCAST"').select( 'mac' ).distinct() pnl1 = pnl.select( approxCountDistinct('mac',rsd = 0.01) ) probesTot = probes.select( approxCountDistinct('timestamp',rsd = 0.01) ) probesBroad = probes.filter('SSID == "BROADCAST"').select(
def get_builtin_aggregator_column(agg, ctx):
    try:
        aggregator = ctx.aggregators[agg["aggregator"]]

        try:
            input = ctx.populate_values(
                agg["input"], aggregator["input"], preserve_column_refs=False
            )
        except CortexException as e:
            e.wrap("input")
            raise

        if aggregator["name"] == "approx_count_distinct":
            return F.approxCountDistinct(input["col"], input.get("rsd")).alias(agg["name"])
        if aggregator["name"] == "avg":
            return F.avg(input).alias(agg["name"])
        if aggregator["name"] in {"collect_set_int", "collect_set_float", "collect_set_string"}:
            return F.collect_set(input).alias(agg["name"])
        if aggregator["name"] == "count":
            return F.count(input).alias(agg["name"])
        if aggregator["name"] == "count_distinct":
            return F.countDistinct(*input).alias(agg["name"])
        if aggregator["name"] == "covar_pop":
            return F.covar_pop(input["col1"], input["col2"]).alias(agg["name"])
        if aggregator["name"] == "covar_samp":
            return F.covar_samp(input["col1"], input["col2"]).alias(agg["name"])
        if aggregator["name"] == "kurtosis":
            return F.kurtosis(input).alias(agg["name"])
        if aggregator["name"] in {"max_int", "max_float", "max_string"}:
            return F.max(input).alias(agg["name"])
        if aggregator["name"] == "mean":
            return F.mean(input).alias(agg["name"])
        if aggregator["name"] in {"min_int", "min_float", "min_string"}:
            return F.min(input).alias(agg["name"])
        if aggregator["name"] == "skewness":
            return F.skewness(input).alias(agg["name"])
        if aggregator["name"] == "stddev":
            return F.stddev(input).alias(agg["name"])
        if aggregator["name"] == "stddev_pop":
            return F.stddev_pop(input).alias(agg["name"])
        if aggregator["name"] == "stddev_samp":
            return F.stddev_samp(input).alias(agg["name"])
        if aggregator["name"] in {"sum_int", "sum_float"}:
            return F.sum(input).alias(agg["name"])
        if aggregator["name"] in {"sum_distinct_int", "sum_distinct_float"}:
            return F.sumDistinct(input).alias(agg["name"])
        if aggregator["name"] == "var_pop":
            return F.var_pop(input).alias(agg["name"])
        if aggregator["name"] == "var_samp":
            return F.var_samp(input).alias(agg["name"])
        if aggregator["name"] == "variance":
            return F.variance(input).alias(agg["name"])

        raise ValueError("missing builtin aggregator")  # unexpected
    except CortexException as e:
        e.wrap("aggregate " + agg["name"])
        raise
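# A hedged usage sketch: the shapes of `ctx`, `agg`, and the aggregator config
# are assumed to match what the dispatcher above expects; the field values and
# the DataFrame `df` are illustrative only, not part of the original code.
agg_spec = {
    "name": "unique_users",
    "aggregator": "approx_count_distinct",
    "input": {"col": "user_id"},
}
agg_col = get_builtin_aggregator_column(agg_spec, ctx)
# The returned Column is an aliased aggregate, so it plugs straight into agg().
result = df.agg(agg_col)  # one-row DataFrame with a column named "unique_users"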
.appName("consumidor")\ .getOrCreate() # Create DataSet representing the stream of input lines from kafka lines = spark\ .readStream\ .format("kafka")\ .option("kafka.bootstrap.servers", bootstrapServers)\ .option(subscribeType, topics)\ .load()\ .selectExpr("CAST(value AS STRING)") # Cria a tabela de dados dados = lines.select( split(lines.value, ', ')[0].alias("Source"), split(lines.value, ', ')[1].alias("Time"), split(lines.value, ', ')[2].alias("ssid"), split(lines.value, ', ')[3].alias("marca")) # total de probes direct totalProbes = dados.agg(approxCountDistinct('Time')) query = totalProbes\ .writeStream\ .outputMode('complete')\ .format('console')\ .start() query.awaitTermination()
def number_distinct_values(col):
    # helper to compute the approximate number of distinct values in a column
    return F.approxCountDistinct(col)
spark = SparkSession\
    .builder\
    .appName("consumidor")\
    .getOrCreate()

# Create DataSet representing the stream of input lines from kafka
lines = spark\
    .readStream\
    .format("kafka")\
    .option("kafka.bootstrap.servers", bootstrapServers)\
    .option(subscribeType, topics)\
    .load()\
    .selectExpr("CAST(value AS STRING)")

# Build the data table
dados = lines.select(
    split(lines.value, ', ')[0].alias("Source"),
    split(lines.value, ', ')[1].alias("Time"),
    split(lines.value, ', ')[2].alias("ssid"),
    split(lines.value, ', ')[3].alias("marca"))

# Count the number of devices with PNL >= 1
qtdDeviceComPNL = dados.agg(approxCountDistinct('Source'))

query = qtdDeviceComPNL\
    .writeStream\
    .outputMode('complete')\
    .foreach(processRow)\
    .start()

query.awaitTermination()