def test_window_functions(self):
    df = self.sqlCtx.createDataFrame(
        [(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
    w = Window.partitionBy("value").orderBy("key")
    from pyspark.sql import functions as F
    sel = df.select(
        df.value, df.key,
        F.max("key").over(w.rowsBetween(0, 1)),
        F.min("key").over(w.rowsBetween(0, 1)),
        F.count("key").over(w.rowsBetween(float("-inf"), float("inf"))),
        F.rowNumber().over(w),
        F.rank().over(w),
        F.denseRank().over(w),
        F.ntile(2).over(w),
    )
    rs = sorted(sel.collect())
    expected = [
        ("1", 1, 1, 1, 1, 1, 1, 1, 1),
        ("2", 1, 1, 1, 3, 1, 1, 1, 1),
        ("2", 1, 2, 1, 3, 2, 1, 1, 1),
        ("2", 2, 2, 2, 3, 3, 3, 2, 2),
    ]
    for r, ex in zip(rs, expected):
        self.assertEqual(tuple(r), ex[:len(r)])

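# Note: the test above targets Spark 1.x, where the ranking helpers are named rowNumber()
# and denseRank(). A minimal sketch of the equivalent expressions under the Spark 2.x+ API
# (an assumed environment, not part of the original test suite) follows; the helper name
# and the key/value columns are illustrative.
def window_ranking_spark2_sketch(df):
    from pyspark.sql import functions as F
    from pyspark.sql.window import Window
    w = Window.partitionBy("value").orderBy("key")
    return df.select(
        df.value, df.key,
        F.row_number().over(w),   # replaces F.rowNumber()
        F.rank().over(w),
        F.dense_rank().over(w),   # replaces F.denseRank()
        F.ntile(2).over(w),
    )
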
def do_something_only_once():
    # the command I use to run this script:
    # ~/spark-1.6.1/bin/spark-submit --packages=com.databricks:spark-avro_2.10:2.0.1,com.databricks:spark-csv_2.10:1.4.0 server.py
    global topdis, meta, dic, towo, cluto, doctopdat, maxdate, mindate, lda

    ## Loading of data
    sc = SparkContext(appName='Simple App')  # "local"
    sqlContext = SQLContext(sc)

    # Load metadata avro
    reader = sqlContext.read.format('com.databricks.spark.avro')
    meta = reader.load('data/spark_metadata.avro')

    # Loading topic distributions
    topdisFile = 'data/spark_output.tuples'
    csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    topdis = csvLoader.options(delimiter=',', header='false', inferschema='true').load(topdisFile)
    strip_first_col_int = udf(lambda row: int(row[1:]), IntegerType())
    topdis = topdis.withColumn('C0', strip_first_col_int(topdis['C0']))
    strip_first_col_float = udf(lambda row: float(row[1:]), FloatType())
    topdis = topdis.withColumn('C1', strip_first_col_float(topdis['C1']))
    strip_last_col = udf(lambda row: float(row[:-2]), FloatType())
    topdis = topdis.withColumn('C20', strip_last_col(topdis['C20']))

    # Load dictionary CSV
    dicFile = 'data/spark_dic.csv'
    csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    dic = csvLoader.options(delimiter='\t', header='false', inferschema='true').load(dicFile)
    dic = dic.select(dic['C0'].alias('id'), dic['C1'].alias('word'), dic['C2'].alias('count'))

    # Load LDA CSV, attach a sequential id column and join with the dictionary
    ldaFile = 'data/spark_lda.csv'
    csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    lda = csvLoader.options(delimiter='\t', header='false', inferschema='true').load(ldaFile)
    lda = lda.select(rowNumber().alias('id'), *lda.columns).join(dic, dic.id == lda.id, 'inner').cache()
    # dic = dic.select(dic['C0'].alias('id'), dic['C1'].alias('word'), dic['C2'].alias('count'))

    # # Load clustertopics CSV
    # clutoFile = 'enron_small_clustertopics.csv'
    # csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    # cluto = csvLoader.options(delimiter=',', header='false', inferschema='true').load(clutoFile)

    # # Load topicswords CSV
    # towoFile = 'enron_small_lda_transposed.csv'
    # csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    # towo = csvLoader.options(delimiter=',', header='false', inferschema='true').load(towoFile)

    # Merge topdis (which has the document id) with metadata, based on document id
    metasmall = meta.select('id', unix_timestamp(meta['date'], "yyyy-MM-dd'T'HH:mm:ssX").alias("timestamp"))
    doctopdat = topdis.join(metasmall, metasmall.id == topdis.C0, 'inner').cache()
    maxdate = doctopdat.select(max('timestamp').alias('maxtimestamp')).collect()[0]['maxtimestamp']
    mindate = doctopdat.select(min('timestamp').alias('mintimestamp')).collect()[0]['mintimestamp']

def proportion_samples(sdf, proportions_sdf, count_column='rows_count'):
    '''Sample rows from sdf so that each stratum contains at most the count given in proportions_sdf

    Parameters
    ----------
    sdf : spark DataFrame to sample from
    proportions_sdf : spark DataFrame with counts to sample from sdf
    count_column : column name with counts; the other columns are used as stratifiers

    Returns
    ----------
    sampled : spark DataFrame with a number of rows less than or equal to proportions_sdf for each stratum
    '''
    import pyspark.sql.functions as F
    from pyspark.sql.window import Window

    groupers = [c for c in proportions_sdf.columns if c != count_column]
    # number the rows within each stratum (no explicit ordering, so the selection is arbitrary),
    # then keep at most count_column rows per stratum
    sampled = sdf.join(proportions_sdf, groupers, how='inner') \
        .withColumn('rownum', F.rowNumber().over(Window.partitionBy(groupers))) \
        .filter(F.col('rownum') <= F.col(count_column)) \
        .drop(count_column).drop('rownum')
    return sampled

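# A minimal usage sketch for proportion_samples with hypothetical data; the DataFrames,
# column names, and counts below are illustrative and not taken from the original source.
def proportion_samples_example(sqlContext):
    users = sqlContext.createDataFrame(
        [('US', 'a'), ('US', 'b'), ('US', 'c'), ('DE', 'd'), ('DE', 'e')],
        ['country', 'user_id'])
    # keep at most two 'US' rows and one 'DE' row
    counts = sqlContext.createDataFrame([('US', 2), ('DE', 1)], ['country', 'rows_count'])
    return proportion_samples(users, counts, count_column='rows_count')
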
def main():
    "Main function"
    optmgr = OptionParser()
    opts = optmgr.parser.parse_args()

    config = ConfigParser.ConfigParser()
    config.read(CONFIG_PATH)

    if opts.es:
        esnode = config.get('ElasticSearch', 'node')
        esport = config.get('ElasticSearch', 'port')
        esresource = config.get('ElasticSearch', 'resource')

    # setup spark/sql context to be used for communication with HDFS
    sc = SparkContext(appName="phedex_br")

    # setting spark log level
    logs = opts.logs.upper()
    validateLogLevel(logs)
    sc.setLogLevel(logs)

    # setting up spark sql variables
    sqlContext = HiveContext(sc)
    schema_def = schema()

    # read given file(s) into RDD
    if opts.fname:
        pdf = sqlContext.read.format('com.databricks.spark.csv')\
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(opts.fname, schema=schema_def)
    elif opts.basedir:
        fromdate, todate = defDates(opts.fromdate, opts.todate)
        datedic = generateDateDict(fromdate, todate, opts.interval)
        boundic = generateBoundDict(datedic)
        max_interval = max(datedic.values())
        lastdate = sorted(datedic.keys())[-1]

        files = getFileList(opts.basedir, fromdate, lastdate)
        msg = "Between dates %s and %s found %d directories" % (fromdate, lastdate, len(files))
        print(msg)

        if not files:
            return

        pdf = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                                  .options(treatEmptyValuesAsNulls='true', nullValue='null')
                                  .load(file_path, schema=schema_def)
                        for file_path in files])
    else:
        raise ValueError("File or directory not specified. Specify fname or basedir parameters.")

    # parsing additional data (to given data adding: group name, node kind,
    # acquisition era, data tier, now date)
    groupdic, nodedic = getJoinDic()
    acquisition_era_reg = r"^/[^/]*/([^/^-]*)-[^/]*/[^/]*$"
    data_tier_reg = r"^/[^/]*/[^/^-]*-[^/]*/([^/]*)$"
    node_tier_reg = r"^(.{2})"
    campaign_reg = r"^/[^/]*/([^/]*)-[^/]*/[^/]*$"

    groupf = udf(lambda x: groupdic.get(x, ''), StringType())
    nodef = udf(lambda x: nodedic.get(x, ''), StringType())
    regexudf = udf(lambda x, y: bool(regexp_extract(x, y, 1)), BooleanType())

    ndf = pdf.withColumn("br_user_group", groupf(pdf.br_user_group_id)) \
             .withColumn("node_kind", nodef(pdf.node_id)) \
             .withColumn("now", from_unixtime(pdf.now_sec, "yyyy-MM-dd")) \
             .withColumn("acquisition_era", when(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1) == "",
                         lit("null")).otherwise(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1))) \
             .withColumn("data_tier", when(regexp_extract(pdf.dataset_name, data_tier_reg, 1) == "",
                         lit("null")).otherwise(regexp_extract(pdf.dataset_name, data_tier_reg, 1))) \
             .withColumn("node_tier", when(regexp_extract(pdf.node_name, node_tier_reg, 1) == "",
                         lit("null")).otherwise(regexp_extract(pdf.node_name, node_tier_reg, 1))) \
             .withColumn("campaign", when(regexp_extract(pdf.dataset_name, campaign_reg, 1) == "",
                         lit("null")).otherwise(regexp_extract(pdf.dataset_name, campaign_reg, 1)))

    # print dataframe schema
    if opts.verbose:
        print("pdf data type", type(ndf))
        ndf.printSchema()
        print("ndf total:", ndf.count())
        for row in ndf.head(10):
            print(row)

    # process aggregation parameters
    keys = [key.lower().strip() for key in opts.keys.split(',')]
    results = [result.lower().strip() for result in opts.results.split(',')]
    aggregations = [agg.strip() for agg in opts.aggregations.split(',')]
    order = [orde.strip() for orde in opts.order.split(',')] if opts.order else []
    asc = [asce.strip() for asce in opts.asc.split(',')] if opts.order else []
    filtc = [fil.split(':')[0] for fil in opts.filt.split(',')] if opts.filt else []
    filtv = [fil.split(':')[1] for fil in opts.filt.split(',')] if opts.filt else []
    isavgday = (AVERAGEDAY in aggregations)

    validateAggregationParams(keys, results, aggregations, order, filtc)

    # filtering data by regex
    for index, val in enumerate(filtc):
        ndf = ndf.filter(regexp_extract(getattr(ndf, val), filtv[index], 0) != "")

    # if delta aggregation is used
    if DELTA in aggregations:
        if len(results) != 1:
            raise ValueError("Delta aggregation can have only 1 result field")
        result = results[0]

        # 1 for all dates generate the interval group dictionary
        interval_group = udf(lambda x: datedic[x], IntegerType())
        interval_start = udf(lambda x: boundic[x][0], StringType())
        interval_end = udf(lambda x: boundic[x][1], StringType())

        # 2 group data by block, node, interval and take the last result in the interval
        ndf = ndf.select(ndf.block_name, ndf.node_name, ndf.now, getattr(ndf, result))
        idf = ndf.withColumn("interval_group", interval_group(ndf.now))
        win = Window.partitionBy(idf.block_name, idf.node_name, idf.interval_group).orderBy(idf.now.desc())
        idf = idf.withColumn("row_number", rowNumber().over(win))
        rdf = idf.where((idf.row_number == 1) & (idf.interval_group != 0))\
                 .withColumn(result, when(idf.now == interval_end(idf.interval_group),
                             getattr(idf, result)).otherwise(lit(0)))
        rdf = rdf.select(rdf.block_name, rdf.node_name, rdf.interval_group, getattr(rdf, result))

        # 3 create intervals that do not exist but have a negative delta
        win = Window.partitionBy(idf.block_name, idf.node_name).orderBy(idf.interval_group)
        adf = rdf.withColumn("interval_group_aft", lead(rdf.interval_group, 1, 0).over(win))
        hdf = adf.filter(((adf.interval_group + 1) != adf.interval_group_aft) & (adf.interval_group != max_interval))\
                 .withColumn("interval_group", adf.interval_group + 1)\
                 .withColumn(result, lit(0))\
                 .drop(adf.interval_group_aft)

        # 4 join the data frames
        idf = rdf.unionAll(hdf)

        # 5 join every interval with the previous interval
        win = Window.partitionBy(idf.block_name, idf.node_name).orderBy(idf.interval_group)
        fdf = idf.withColumn("delta", getattr(idf, result) - lag(getattr(idf, result), 1, 0).over(win))

        # 6 calculate delta_plus and delta_minus columns and aggregate by date and node
        ddf = fdf.withColumn("delta_plus", when(fdf.delta > 0, fdf.delta).otherwise(0)) \
                 .withColumn("delta_minus", when(fdf.delta < 0, fdf.delta).otherwise(0))
        aggres = ddf.groupBy(ddf.node_name, ddf.interval_group).agg(sum(ddf.delta_plus).alias("delta_plus"),
                                                                    sum(ddf.delta_minus).alias("delta_minus"))
        aggres = aggres.select(aggres.node_name, interval_end(aggres.interval_group).alias("date"),
                               aggres.delta_plus, aggres.delta_minus)
    else:
        # clean up the unnecessary dataframe and columns
        pdf.unpersist()

        if isavgday:
            datescount = ndf.select(ndf.now).distinct().count()
            aggregations = ["sum" if aggregation == "avg-day" else aggregation
                            for aggregation in aggregations]

        resAgg_dic = zipResultAgg(results, aggregations)
        order, asc = formOrdAsc(order, asc, resAgg_dic)

        # perform aggregation
        if order:
            aggres = ndf.groupBy(keys).agg(resAgg_dic).orderBy(order, ascending=asc)
        else:
            aggres = ndf.groupBy(keys).agg(resAgg_dic)
        ndf.unpersist()

        # if average per day is requested, divide by the number of distinct dates
        if isavgday:
            resfields = [resAgg_dic[result] + "(" + result + ")" for result in results]
            for field in resfields:
                aggres = aggres.withColumn(field, getattr(aggres, field) / datescount)

    # output results
    if opts.fout:
        is_header = str(opts.header).lower()
        fout_header = formFileHeader(opts.fout)

        if opts.collect:
            fout_header = fout_header + ".json"
            aggres = aggres.toJSON().collect()
            with open(fout_header, 'w+') as f:
                f.write('[')
                f.write(",".join(aggres))
                f.write(']')
        else:
            lines = aggres.map(to_csv)
            lines.saveAsTextFile(opts.fout)

    if opts.es:
        validateEsParams(esnode, esport, esresource)
        aggres = aggres.withColumn("origin", lit(opts.esorigin))
        aggres.repartition(1).write.format("org.elasticsearch.spark.sql")\
              .option("es.nodes", esnode)\
              .option("es.port", esport)\
              .option("es.resource", esresource)\
              .save(mode="append")
    else:
        for row in aggres.head(10):
            print(row)

def main():
    cluster_seeds = ['199.60.17.136', '199.60.17.173']
    conf = SparkConf().set('spark.cassandra.connection.host', ','.join(cluster_seeds))
    sc = pyspark_cassandra.CassandraSparkContext(conf=conf)
    sqlContext = HiveContext(sc)

    # Loading yelp_business data
    yelp_business = sc.cassandraTable("bigbang", "yelp_business")\
        .select('business_id', 'city', 'full_address', 'latitude', 'longitude',
                'name', 'open', 'review_count', 'stars', 'state').toDF()
    yelp_business.registerTempTable('business')

    # Loading yelp_user data
    yelp_user = sc.cassandraTable("bigbang", "yelp_user").select('user_id', 'name', 'yelping_since').toDF()
    yelp_user.registerTempTable('user')

    # Loading user_pred_ratings data
    user_pred_ratings = sc.cassandraTable("bigbang", "user_pred_ratings")\
        .select('user_id', 'business_id', 'user_pred_rating').toDF()
    user_pred_ratings.registerTempTable('userPredRatings')

    # Loading checkin data
    business_checkin = sc.cassandraTable("bigbang", "yelp_business_checkin")\
        .select('business_id', 'checkin', 'day', 'hour').toDF()
    business_checkin.registerTempTable('businessCheckin')

    # Joining user, business and ratings to get denormalized data
    user_res_pred_denorm_df = sqlContext.sql(
        """SELECT a.user_id as user_id, b.city as city, a.business_id as business_id,
                  b.full_address as full_address, b.latitude as latitude, b.longitude as longitude,
                  b.name as business_name, b.open as open, b.review_count as review_count,
                  b.stars as stars, b.state as state, c.name as user_name,
                  a.user_pred_rating as user_pred_rating
           from userPredRatings a
           inner join business b on a.business_id = b.business_id
           inner join user c on a.user_id = c.user_id""")

    # user_res_pred_denorm_df.registerTempTable('userResPredDenorm')
    #
    # recommendations_top10_df = sqlContext.sql(
    #     """SELECT user_id, city, business_id, full_address, latitude, longitude, business_name, open,
    #               review_count, stars, state, user_name, user_pred_rating FROM
    #        (SELECT user_id, city, business_id, full_address, latitude, longitude, business_name, open,
    #                review_count, stars, state, user_name, user_pred_rating,
    #                row_number() OVER (PARTITION BY user_id, state ORDER BY user_pred_rating desc) rn
    #         FROM userResPredDenorm) temp
    #        WHERE temp.rn <= 20""")

    # Retrieving top 20 restaurants per user per state using window analytic functions
    windowSpec = Window.partitionBy(user_res_pred_denorm_df['user_id'], user_res_pred_denorm_df['state'])\
        .orderBy(user_res_pred_denorm_df['user_pred_rating'].desc())
    rn_business = (func.rowNumber().over(windowSpec))

    recommendations_ranked_df = user_res_pred_denorm_df.select(
        user_res_pred_denorm_df['user_id'], user_res_pred_denorm_df['city'],
        user_res_pred_denorm_df['business_id'], user_res_pred_denorm_df['full_address'],
        user_res_pred_denorm_df['latitude'], user_res_pred_denorm_df['longitude'],
        user_res_pred_denorm_df['business_name'], user_res_pred_denorm_df['open'],
        user_res_pred_denorm_df['review_count'], user_res_pred_denorm_df['stars'],
        user_res_pred_denorm_df['state'], user_res_pred_denorm_df['user_name'],
        user_res_pred_denorm_df['user_pred_rating'], rn_business.alias("rn"))

    recommendations_top20_df = recommendations_ranked_df.filter(recommendations_ranked_df["rn"] <= 20).select(
        recommendations_ranked_df['user_id'], recommendations_ranked_df['city'],
        recommendations_ranked_df['business_id'], recommendations_ranked_df['full_address'],
        recommendations_ranked_df['latitude'], recommendations_ranked_df['longitude'],
        recommendations_ranked_df['business_name'], recommendations_ranked_df['open'],
        recommendations_ranked_df['review_count'], recommendations_ranked_df['stars'],
        recommendations_ranked_df['state'], recommendations_ranked_df['user_name'],
        recommendations_ranked_df['user_pred_rating'])
    recommendations_top20_df.registerTempTable('recommendations')

    recommendations_top20_rdd = recommendations_top20_df.rdd.map(lambda p: (
        str(uuid.uuid1()), p['user_id'], p['city'], p['business_id'], p['full_address'],
        p['latitude'], p['longitude'], p['business_name'], p['open'], p['review_count'],
        p['stars'], p['state'], p['user_name'], p['user_pred_rating']))

    # Loading the recommendations table in cassandra
    recommendations_top20_rdd.saveToCassandra(
        'bigbang', 'recommendations',
        columns=["uid", "user_id", "business_city", "business_id", "business_full_address",
                 "business_latitude", "business_longitude", "business_name", "business_open",
                 "business_review_count", "business_stars", "business_state", "user_name",
                 "user_pred_rating"],
        parallelism_level=1000)

    # Joining recommendations and checkin data
    user_res_checkin_denorm_df = sqlContext.sql(
        """SELECT a.user_id as user_id, a.city as business_city, a.business_id as business_id,
                  a.full_address as business_full_address, a.latitude as business_latitude,
                  a.longitude as business_longitude, a.business_name as business_name,
                  a.open as business_open, a.review_count as business_review_count,
                  a.stars as business_stars, a.state as business_state, a.user_name as user_name,
                  a.user_pred_rating as user_pred_rating, b.day as day, b.hour as hour,
                  b.checkin as checkin
           from recommendations a
           inner join businessCheckin b on a.business_id = b.business_id""")

    recommendations_checkin_rdd = user_res_checkin_denorm_df.rdd.map(lambda p: (
        str(uuid.uuid1()), p['user_id'], p['business_city'], p['business_id'],
        p['business_full_address'], p['business_latitude'], p['business_longitude'],
        p['business_name'], p['business_open'], p['business_review_count'], p['business_stars'],
        p['business_state'], p['user_name'], p['user_pred_rating'], p['day'], p['hour'],
        p['checkin']))

    # Loading the recommendations_checkin table in cassandra
    recommendations_checkin_rdd.saveToCassandra(
        'bigbang', 'recommendations_checkin',
        columns=["uid", "user_id", "business_city", "business_id", "business_full_address",
                 "business_latitude", "business_longitude", "business_name", "business_open",
                 "business_review_count", "business_stars", "business_state", "user_name",
                 "user_pred_rating", "day", "hour", "checkin"],
        parallelism_level=1000)

# build a dataframe with the columns given in schema; the reason for building a dataframe
# is that VectorAssembler's input must be columns of a dataframe
schema = 'score,gilded,distinguished,controversiality'.split(',')
itemsForDataFrame = textScorePair.map(lambda x: [x[1], x[0][2], x[0][3], x[0][4]])
otherFeaturesDF = sqlContext.createDataFrame(itemsForDataFrame, schema)

# build a dataframe for tf_idf, for the same reason as above
tfidfSchema = StructType([StructField("tf_idf", VectorUDT(), True)])
row = Row("tf_idf")
tfidfDF = tfidf.map(lambda x: row(x)).toDF(tfidfSchema)

# add a row number to the two dataframes in order to perform a join
w = Window().orderBy()
otherFeaturesDF = otherFeaturesDF.withColumn("columnindex", rowNumber().over(w))
tfidfDF = tfidfDF.withColumn("columnindex", rowNumber().over(w))
mergedDF = otherFeaturesDF.join(tfidfDF, otherFeaturesDF.columnindex == tfidfDF.columnindex, 'inner')

# assemble the tf-idf and other features into a single feature vector
assembler = VectorAssembler(
    inputCols=["tf_idf", "gilded", "distinguished", "controversiality"],
    outputCol="features")
mergedDF = assembler.transform(mergedDF)
mergedDF.show()

scoreFeaturesPair = mergedDF.map(lambda x: (x[7], x[0])).repartition(500)
features = scoreFeaturesPair.map(lambda x: x[0])
scores = scoreFeaturesPair.map(lambda x: int(x[1]))

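# The empty Window().orderBy() above numbers rows over a single unordered partition, which
# pulls all rows to one executor. A hedged alternative sketch for attaching a 0-based index
# without a global window, using RDD zipWithIndex (an assumption about acceptable APIs here,
# not the original approach):
def add_row_index(df):
    from pyspark.sql import Row
    # pair each row with its index, then rebuild rows with an extra columnindex field
    return df.rdd.zipWithIndex().map(
        lambda pair: Row(columnindex=pair[1], **pair[0].asDict())).toDF()
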
def main():
    "Main function"
    optmgr = OptionParser()
    opts = optmgr.parser.parse_args()

    # setup spark/sql context to be used for communication with HDFS
    sc = SparkContext(appName="phedex_br")
    if not opts.yarn:
        sc.setLogLevel("ERROR")

    sqlContext = HiveContext(sc)
    schema_def = schema()

    # read given file(s) into RDD
    if opts.fname:
        pdf = sqlContext.read.format('com.databricks.spark.csv')\
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(opts.fname, schema=schema_def)
    elif opts.basedir:
        fromdate, todate = defDates(opts.fromdate, opts.todate)
        files = getFileList(opts.basedir, fromdate, todate)
        msg = "Between dates %s and %s found %d directories" % (fromdate, todate, len(files))
        print(msg)

        if not files:
            return

        pdf = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                                  .options(treatEmptyValuesAsNulls='true', nullValue='null')
                                  .load(file_path, schema=schema_def)
                        for file_path in files])
    else:
        raise ValueError("File or directory not specified. Specify fname or basedir parameters.")

    # parsing additional data (to given data adding: group name, node kind,
    # acquisition era, data tier, now date)
    groupdic, nodedic = getJoinDic()
    acquisition_era_reg = r"^/[^/]*/([^/^-]*)-[^/]*/[^/]*$"
    data_tier_reg = r"^/[^/]*/[^/^-]*-[^/]*/([^/]*)$"
    groupf = udf(lambda x: groupdic[x], StringType())
    nodef = udf(lambda x: nodedic[x], StringType())

    # note: "yyyy" is the calendar year pattern; "YYYY" would be the week-based year
    ndf = pdf.withColumn("br_user_group", groupf(pdf.br_user_group_id)) \
             .withColumn("node_kind", nodef(pdf.node_id)) \
             .withColumn("now", from_unixtime(pdf.now_sec, "yyyy-MM-dd")) \
             .withColumn("acquisition_era", when(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1) == "",
                         lit("null")).otherwise(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1))) \
             .withColumn("data_tier", when(regexp_extract(pdf.dataset_name, data_tier_reg, 1) == "",
                         lit("null")).otherwise(regexp_extract(pdf.dataset_name, data_tier_reg, 1)))

    # print dataframe schema
    if opts.verbose:
        ndf.show()
        print("pdf data type", type(ndf))
        ndf.printSchema()

    # process aggregation parameters
    keys = [key.lower().strip() for key in opts.keys.split(',')]
    results = [result.lower().strip() for result in opts.results.split(',')]
    aggregations = [agg.strip() for agg in opts.aggregations.split(',')]
    order = [orde.strip() for orde in opts.order.split(',')] if opts.order else []
    asc = [asce.strip() for asce in opts.asc.split(',')] if opts.order else []
    filtc, filtv = opts.filt.split(":") if opts.filt else (None, None)

    validateAggregationParams(keys, results, aggregations, order, filtc)

    if filtc and filtv:
        ndf = ndf.filter(getattr(ndf, filtc) == filtv)

    # if delta aggregation is used
    if DELTA in aggregations:
        validateDeltaParam(opts.interval, results)
        result = results[0]

        # 1 for all dates generate the interval group dictionary
        datedic = generateDateDict(fromdate, todate, opts.interval)
        boundic = generateBoundDict(datedic)
        max_interval = max(datedic.values())

        interval_group = udf(lambda x: datedic[x], IntegerType())
        interval_start = udf(lambda x: boundic[x][0], StringType())
        interval_end = udf(lambda x: boundic[x][1], StringType())

        # 2 group data by block, node, interval and take the last result in the interval
        ndf = ndf.select(ndf.block_name, ndf.node_name, ndf.now, getattr(ndf, result))
        idf = ndf.withColumn("interval_group", interval_group(ndf.now))
        win = Window.partitionBy(idf.block_name, idf.node_name, idf.interval_group).orderBy(idf.now.desc())
        idf = idf.withColumn("row_number", rowNumber().over(win))
        rdf = idf.where((idf.row_number == 1) & (idf.interval_group != 0))\
                 .withColumn(result, when(idf.now == interval_end(idf.interval_group),
                             getattr(idf, result)).otherwise(lit(0)))
        rdf = rdf.select(rdf.block_name, rdf.node_name, rdf.interval_group, getattr(rdf, result))
        rdf.cache()

        # 3 create intervals that do not exist but have a negative delta
        win = Window.partitionBy(idf.block_name, idf.node_name).orderBy(idf.interval_group)
        adf = rdf.withColumn("interval_group_aft", lead(rdf.interval_group, 1, 0).over(win))
        hdf = adf.filter(((adf.interval_group + 1) != adf.interval_group_aft) & (adf.interval_group != max_interval))\
                 .withColumn("interval_group", adf.interval_group + 1)\
                 .withColumn(result, lit(0))\
                 .drop(adf.interval_group_aft)

        # 4 join the data frames
        idf = rdf.unionAll(hdf)

        # 5 join every interval with the previous interval
        win = Window.partitionBy(idf.block_name, idf.node_name).orderBy(idf.interval_group)
        fdf = idf.withColumn("delta", getattr(idf, result) - lag(getattr(idf, result), 1, 0).over(win))

        # 6 calculate delta_plus and delta_minus columns and aggregate by date and node
        ddf = fdf.withColumn("delta_plus", when(fdf.delta > 0, fdf.delta).otherwise(0)) \
                 .withColumn("delta_minus", when(fdf.delta < 0, fdf.delta).otherwise(0))
        aggres = ddf.groupBy(ddf.node_name, ddf.interval_group).agg(sum(ddf.delta_plus).alias("delta_plus"),
                                                                    sum(ddf.delta_minus).alias("delta_minus"))
        aggres = aggres.select(aggres.node_name, interval_end(aggres.interval_group).alias("date"),
                               aggres.delta_plus, aggres.delta_minus)
    else:
        resAgg_dic = zipResultAgg(results, aggregations)
        order, asc = formOrdAsc(order, asc, resAgg_dic)

        # perform aggregation
        if order:
            aggres = ndf.groupBy(keys).agg(resAgg_dic).orderBy(order, ascending=asc)
        else:
            aggres = ndf.groupBy(keys).agg(resAgg_dic)

    # output results
    if opts.fout:
        fout_header = formFileHeader(opts.fout)
        if opts.header:
            aggres.write.format('com.databricks.spark.csv').options(header='true').save(fout_header)
        else:
            aggres.write.format('com.databricks.spark.csv').save(fout_header)
    else:
        aggres.show(50)