Example #1
    def test_window_functions(self):
        df = self.sqlCtx.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
        w = Window.partitionBy("value").orderBy("key")
        from pyspark.sql import functions as F

        sel = df.select(
            df.value,
            df.key,
            F.max("key").over(w.rowsBetween(0, 1)),
            F.min("key").over(w.rowsBetween(0, 1)),
            F.count("key").over(w.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)),
            F.row_number().over(w),
            F.rank().over(w),
            F.dense_rank().over(w),
            F.ntile(2).over(w),
        )
        rs = sorted(sel.collect())
        expected = [
            ("1", 1, 1, 1, 1, 1, 1, 1, 1),
            ("2", 1, 1, 1, 3, 1, 1, 1, 1),
            ("2", 1, 2, 1, 3, 2, 1, 1, 1),
            ("2", 2, 2, 2, 3, 3, 3, 2, 2),
        ]
        for r, ex in zip(rs, expected):
            self.assertEqual(tuple(r), ex[: len(r)])
 def train(self, df, featureCols):
     
     ntiles = []
     for col in featureCols:
         w = Window.partitionBy().orderBy(col)
         aux = df.select(F.ntile(self._n).over(w).alias('ntile'),col)
         ntiles.append(list(aux.groupby('ntile').max(col).collect()))
         
     self.ntiles_ = np.array(ntiles)
     self.columns_ = list(map(str, featureCols))
     self._is_trained = True
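
A minimal, self-contained sketch of the F.ntile-over-an-ordered-window pattern the train method above relies on, using hypothetical toy data (the surrounding class, self._n, and np are not shown here).

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame([(float(x),) for x in range(1, 9)], ["feature"])

# Assign each row to one of 4 quantile buckets over a global ordering,
# then take each bucket's maximum as its upper boundary
w = Window.partitionBy().orderBy("feature")
buckets = toy.select(F.ntile(4).over(w).alias("ntile"), "feature")
buckets.groupby("ntile").max("feature").orderBy("ntile").show()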
Example #3
def compute(day):
    # We want days day-30 through day-1
    sums = wikipediadata.where(
            (wikipediadata.day >= day-30) & (wikipediadata.day <= day-1))

    # Test subset
    #sums = sums.where((sums.page == 'Cadillac_Brougham') | ((sums.page == 'Roald_Dahl') & (sums.projectcode == 'fr')))

    # Sum the hourly counts within each day
    sums = sums.groupby('projectcode', 'page', 'day').sum('views')
    # Cache for later use
    sums.cache()

    # Define a window := the previous day
    window_spec =  Window.partitionBy(sums.projectcode, sums.page) \
            .orderBy(sums.day.asc()).rowsBetween(-1, -1)

    # Compute the difference views(d) - views(d-1)
    diffs = sums.withColumn('diff', sums.views - F.sum(sums.views) \
            .over(window_spec))

    # Compute the coefficients to apply to each day
    coefs = pd.DataFrame({'day': range(day-30, day)})
    coefs['coef'] = 1. / (day - coefs.day)

    coefs = hc.createDataFrame(coefs)
    diffs = diffs.join(coefs, 'day')

    # Compute each day's score
    diffs = diffs.withColumn('sub_score', diffs.diff * diffs.coef)

    totals = diffs.groupby('projectcode', 'page').sum('views', 'sub_score')
    # Normalize by the square root of the total views
    totals = totals.withColumn('score',
            totals['SUM(sub_score)'] / F.sqrt(totals['SUM(views)'])) \
            .orderBy(F.desc('score')) \
            .withColumnRenamed('SUM(views)', 'total_views') \
            .limit(10)

    views = sums.select('projectcode', 'page', 'day', 'views') \
           .join(totals.select('projectcode', 'page', 'total_views', 'score'), 
                  (totals.projectcode == sums.projectcode) & (totals.page == sums.page), 'right_outer')

    df = totals.select('projectcode', 'page', 'total_views', 'score').toPandas()
    df2 = views.toPandas()
    df2 = df2.iloc[:, 2:]
    df2 = df2.pivot_table(values='views', columns=['day'], index=['projectcode', 'page'], fill_value=0)
    df = df.merge(df2, left_on=['projectcode', 'page'], right_index=True)
    df.to_csv(filename(day), index=False)
    
    # Clear the cache
    hc.clearCache()
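
The day-over-day diff above uses a window covering only the previous row (rowsBetween(-1, -1)); a minimal sketch of that pattern on hypothetical toy data.

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame(
    [("a", 1, 10), ("a", 2, 13), ("a", 3, 9)], ["page", "day", "views"])

# Window containing only the previous row per page, ordered by day;
# the first day of each page has no previous row, so its diff is null
prev_day = Window.partitionBy("page").orderBy("day").rowsBetween(-1, -1)
toy.withColumn("diff", F.col("views") - F.sum("views").over(prev_day)).show()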
Example #4
def runOtherFunctions(spark, personDf):
    df = spark.createDataFrame([("v1", "v2", "v3")], ["c1", "c2", "c3"]);

    # array
    df.select(df.c1, df.c2, df.c3, array("c1", "c2", "c3").alias("newCol")).show(truncate=False)

    # desc, asc
    personDf.show()
    personDf.sort(functions.desc("age"), functions.asc("name")).show()

    # pyspark 2.1.0 does not support desc_nulls_first, desc_nulls_last, asc_nulls_first, asc_nulls_last

    # split, length (in pyspark a column can be referenced as df["col"] or df.col)
    df2 = spark.createDataFrame([("Splits str around pattern",)], ['value'])
    df2.select(df2.value, split(df2.value, " "), length(df2.value)).show(truncate=False)

    # rownum, rank
    f1 = StructField("date", StringType(), True)
    f2 = StructField("product", StringType(), True)
    f3 = StructField("amount", IntegerType(), True)
    schema = StructType([f1, f2, f3])

    p1 = ("2017-12-25 12:01:00", "note", 1000)
    p2 = ("2017-12-25 12:01:10", "pencil", 3500)
    p3 = ("2017-12-25 12:03:20", "pencil", 23000)
    p4 = ("2017-12-25 12:05:00", "note", 1500)
    p5 = ("2017-12-25 12:05:07", "note", 2000)
    p6 = ("2017-12-25 12:06:25", "note", 1000)
    p7 = ("2017-12-25 12:08:00", "pencil", 500)
    p8 = ("2017-12-25 12:09:45", "note", 30000)

    dd = spark.createDataFrame([p1, p2, p3, p4, p5, p6, p7, p8], schema)
    w1 = Window.partitionBy("product").orderBy("amount")
    w2 = Window.orderBy("amount")
    dd.select(dd.product, dd.amount, functions.row_number().over(w1).alias("rownum"),
              functions.rank().over(w2).alias("rank")).show()
 def sliding_row_window(self):
     return Window.partitionBy('id').orderBy('v').rowsBetween(-2, 1)
 def ordered_window(self):
     return Window.partitionBy('id').orderBy('v')
 def shrinking_range_window(self):
     return Window.partitionBy('id').orderBy('v') \
         .rangeBetween(-3, Window.unboundedFollowing)
 def growing_range_window(self):
     return Window.partitionBy('id').orderBy('v') \
         .rangeBetween(Window.unboundedPreceding, 4)
tgtfilePath = "s3://" + bucket + "/" + enriched_path + vendor + "/Parquet/"

dfjson = sparkSession.read.format("json").option("multiline", "true").option(
    "inferSchema", "true").load(srcfilePath)

data = dfjson.withColumn("data", explode("DATA")).select("data.*")

# dfPT = data.withColumn("createdDatePT",sf.to_timestamp(udf_TZConversion(sf.regexp_replace(data.createdDate,"T"," ").cast("string"),sf.lit("UTC"),sf.lit("US/Pacific")),"yyyy-MM-dd HH:mm:ss"))
dfPT = data.withColumn(
    "createdDatePT",
    sf.from_utc_timestamp(sf.regexp_replace(data.createdDate, "T", " "),
                          "US/Pacific"))

df = dfPT.withColumn("year", sf.split("createdDate", "-")[0]) \
          .withColumn("month", sf.split("createdDate", "-")[1]) \
          .withColumn("day", sf.split(sf.split(sf.split("createdDate", "-")[2], "T")[0], " ")[0])

dfbaseData = df.select(
    [col for col in df.columns if not col.startswith("xmlns")])

#dfbaseData.show(10,False)

dfrankedId = dfbaseData.withColumn("row_num", sf.row_number().over(Window.partitionBy("id").orderBy(sf.asc("updatedAt")))) \
                    .where(sf.col("row_num") == 1) \
                    .select(dfbaseData["*"])

dfrankedId.repartition(sf.col("year"),sf.col("month"),sf.col("day")) \
                   .write.format("parquet") \
                   .partitionBy("year","month","day") \
                   .mode("overwrite") \
                   .save(tgtfilePath)
Example #10
def test_sliding_window(data_gen):
    row_gen = StructGen([['ts', _restricted_ts_gen],['data', data_gen]], nullable=False)
    w = Window.partitionBy(f.window('ts', '5 hour', '1 hour'))
    assert_gpu_and_cpu_are_equal_collect(
            lambda spark : gen_df(spark, row_gen).withColumn('rolling_max', f.max("data").over(w)))
Example #11
def cal_mat_window(sc, sqlContext, dfSC, window):
    windowSpec = Window.partitionBy("symbol").orderBy("date").rangeBetween(-1 * window+1,1)
    mat = func.avg("close").over(windowSpec)
    dfSC = dfSC.select(dfSC.symbol, dfSC.date, dfSC.close, mat )
    print(dfSC.collect())
Example #12
 def sliding_range_window(self):
     return Window.partitionBy('id').orderBy('v').rangeBetween(-2, 4)
Example #13
 def sliding_row_window(self):
     return Window.partitionBy('id').orderBy('v').rowsBetween(-2, 1)
Example #14
 def shrinking_range_window(self):
     return Window.partitionBy('id').orderBy('v') \
         .rangeBetween(-3, Window.unboundedFollowing)
Example #15
 def shrinking_row_window(self):
     return Window.partitionBy('id').orderBy('v').rowsBetween(-2, Window.unboundedFollowing)
Example #16
 def growing_range_window(self):
     return Window.partitionBy('id').orderBy('v') \
         .rangeBetween(Window.unboundedPreceding, 4)
Example #17
 def growing_row_window(self):
     return Window.partitionBy('id').orderBy('v').rowsBetween(Window.unboundedPreceding, 3)
Example #18
                        sf.col('inputData.trackingId').alias('trackingId'),
                        sf.col('sort'),
                        sf.col('filters'),
                        sf.col('inputData.filterInput').alias('filterInput'),
                        sf.col('algo'),
                        sf.col('pid'),
                        sf.col('outputData.rankList.ranks').alias('ranks'))

# display(_searches)

# COMMAND ----------

columnsToPartitionOn = ['vcid', 'flavour', 'checkIn', 'checkOut', 'pax', 'userId', 'email', 'trackingId', 'sort', 'filters', 'filterInput',
                        'algo']
searches = _searches.withColumn('lastSearchTime', sf.lag('currentSearchTime') \
                                                    .over(Window.partitionBy(columnsToPartitionOn) \
                                                    .orderBy('currentSearchTime'))) \
                   .withColumn('isNewSession', is_new_session(sf.col('currentSearchTime'), sf.col('lastSearchTime'), sf.col('pid'))) \
                   .withColumn('sessionId', sf.sum('isNewSession') \
                                              .over(Window.partitionBy(columnsToPartitionOn) \
                                              .orderBy('currentSearchTime'))) \
                   .groupBy(columnsToPartitionOn + ['sessionId']) \
                   .agg(sf.min('searchTime').alias('searchTime'),
                        sf.collect_list('ranks').alias('hotels')) \
                   .withColumn('hotels', flatten(sf.col('hotels')))

display(searches)

# COMMAND ----------

# DBTITLE 1,Details
hotels_e_detail = spark.table('hotels_e.detail')
Example #19
    AVG(peakBytes) peakBytesDaily, 
    AVG(peakLoad1) peakLoad1Daily, AVG(peakLoad2) peakLoad2Daily, AVG(peakLoad3) peakLoad3Daily,  
    AVG(peakLoad4) peakLoad4Daily, AVG(peakLoad5) peakLoad5Daily, AVG(peakLoadSecure) peakLoadSecureDaily
    FROM featureeddf group by ServerIP, SessionStartDay
"""

dailyStatisticdf = spark.sql(sqlStatement);
#lag features
#previous week average
#rolling mean features with 2-days/48-hours lag
rollingLags = [2]
lagColumns = [x for x in dailyStatisticdf.columns if 'Daily' in x] 
windowSize=[7]
for w in windowSize:
    for i in rollingLags:
        wSpec = Window.partitionBy('d_ServerIP').orderBy('d_SessionStartDay').rowsBetween(-i-w, -i-1)
        for j in lagColumns:
            dailyStatisticdf = dailyStatisticdf.withColumn(j+'Lag'+str(i)+'Win'+str(w),F.avg(col(j)).over(wSpec) )
selectColumn =  ['d_ServerIP', 'd_SessionStartDay']
selectColumn.extend([x for x in dailyStatisticdf.columns if 'Lag' in x])
dailyStatisticdf = dailyStatisticdf.select(selectColumn)
dailyStatisticdf = dailyStatisticdf.withColumn("d_key2", concat(dailyStatisticdf.d_ServerIP,lit("_"),dailyStatisticdf.d_SessionStartDay.cast('string')))
featureeddf = featureeddf.withColumn("d_key2", concat(featureeddf.ServerIP,lit("_"),featureeddf.SessionStartDay.cast('string')))
dailyStatisticdf.cache()
# Single column join is much faster than two columns join
featureeddf = featureeddf.join(dailyStatisticdf,  (featureeddf.d_key2 == dailyStatisticdf.d_key2),
                                   'outer' )
featureeddf.show(1)
featureeddf.persist()
featureeddf = featureeddf.select([x for x in featureeddf.columns if 'd_' not in x ])
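
The rowsBetween(-i-w, -i-1) bounds above produce a w-row rolling average whose most recent row sits i+1 rows before the current one; a small self-contained sketch of those bounds, with hypothetical column names.

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame(
    [("srv1", d, 10.0 * d) for d in range(1, 7)], ["ServerIP", "day", "load"])

i, w = 1, 2  # lag of 1 row, rolling window of 2 rows
wSpec = (Window.partitionBy("ServerIP").orderBy("day")
         .rowsBetween(-i - w, -i - 1))  # rows at offsets -3 and -2
toy.withColumn("loadLag%dWin%d" % (i, w), F.avg("load").over(wSpec)).show()
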
priceDF = sqlContext.read.format('csv') \
  .options(header='true', inferSchema='true') \
  .load(INPUT_CSV)

print('File loaded')

priceDF = priceDF.select( \
F.col('code').alias('CODE'), \
F.col('Price').alias('PRICE'), \
F.col('Date').alias('DATE'))
priceDF = priceDF.withColumn('YEAR', F.year(F.to_date('DATE', 'MM-dd-yyyy')))

print('Dataset created')

# Daily return
windowSpec = Window.partitionBy("CODE").orderBy(
    F.col('DATE').asc()).rowsBetween(-1, 0)
priceDF = priceDF.withColumn('AUX', F.sum("price").over(windowSpec))
priceDF = priceDF.withColumn("RETURNS", (F.col("PRICE") -
                                         (F.col("AUX") - F.col("PRICE"))) /
                             (F.col("AUX") - F.col("PRICE"))).drop("AUX")

print('Daily return')

# Cumulative return
precioIniDF = priceDF.sort(F.desc('DATE')).groupBy('YEAR', 'CODE').agg(
    F.last('PRICE').alias('PRICE_START'))
priceDF = precioIniDF.join(priceDF, on=['YEAR', 'CODE'])
priceDF = priceDF.withColumn('CUMULATIVE_RETURNS',
                             ((F.col('PRICE') - F.col('PRICE_START')) /
                              (F.col('PRICE_START'))))
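
Since AUX above is the sum of the current and previous price, AUX - PRICE is simply the previous price; an equivalent formulation of the daily return using F.lag, sketched on hypothetical data.

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame(
    [("AAA", "01-02-2020", 100.0), ("AAA", "01-03-2020", 110.0)],
    ["CODE", "DATE", "PRICE"])

# Daily return = (price - previous price) / previous price
w = Window.partitionBy("CODE").orderBy(F.to_date("DATE", "MM-dd-yyyy"))
prev_price = F.lag("PRICE").over(w)
toy.withColumn("RETURNS", (F.col("PRICE") - prev_price) / prev_price).show()
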
try:
    from pyspark.sql import functions as F, types as T
    from pyspark.sql.window import Window
    import pandas as pd
except ImportError as e:
    print("It wasn't possible import packages. Error:", e)

path = '/home/rafaelrs/Documents/case_stone/data'
relative_read_path = f'{path}/transaction_history/'
relative_write_path = f'{path}/balance/'

balance_schema = T.StructType([
    T.StructField('account_id', T.StringType(), True),
    T.StructField('amount', T.LongType(), True),
    T.StructField('counterparty_account_id', T.StringType(), True),
    T.StructField('inserted_at', T.LongType(), True),
    T.StructField('transaction_id', T.StringType(), True)
])

try:
    balance = spark.read.parquet(relative_read_path, schema=balance_schema)
except Exception as e:
    print("Something went wrong with the transaction history path", e)

by_account_id = Window.partitionBy(['account_id']).orderBy("inserted_at")

balance = balance.withColumn('balance',
                             F.sum(F.col('amount')).over(by_account_id))

balance = balance.withColumn(
    'inserted_at',
    F.to_timestamp(F.from_unixtime(F.col('inserted_at') / 1000)))

balance.write.parquet(relative_write_path)
                    dfbaseData.loanApplicationId, \
                    dfbaseData.clientID, \
                    dfbaseData.mvpClientID, \
                    sf.regexp_replace(dfbaseData.createdDate,"T"," ").cast(TimestampType()).alias("idologyTimestampUTC"), \
                    dfbaseData.createdDatePT.alias("idologyTimestampPT"),\
                    dfrmi.id.alias("rmiId"), \
                    dfrmi.date_created.alias("riskTimestampPT"), \
                    dfrmsNS.score_type.alias("scoreType"),\
                    dfrmi.risk_model_scorex_plus_input,\
                    dfrmi.risk_model_vantage_input,\
                    dfrmi.risk_model_vantage3_input,\
                    dfbaseData.year,\
                    dfbaseData.month,\
                    dfbaseData.day)

dfranked =  dfjoin.withColumn("row_num", sf.row_number().over(Window.partitionBy("loanApplicationId","idologyTimestampPT").orderBy(sf.desc("riskTimestampPT"),sf.desc("scoreType")))) \
                .where(sf.col("row_num") == 1) \
                .select(dfjoin["*"])

dfsplitCol.repartition(sf.col("year"),sf.col("month"),sf.col("day")) \
                   .write.format("parquet") \
                   .partitionBy("year","month","day") \
                   .mode("overwrite") \
                   .save(tgtfilePathResultCodes)

dfbaseData.repartition(sf.col("year"),sf.col("month"),sf.col("day")) \
                   .write.format("parquet") \
                   .partitionBy("year","month","day") \
                   .mode("overwrite") \
                   .save(tgtfilePathBaseData)
 def unbounded_window(self):
     return Window.partitionBy('id') \
         .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
 def ordered_window(self):
     return Window.partitionBy('id').orderBy('v')
Example #25
def calculate_time_window_indicator(df,
                                    column_name,
                                    time_window,
                                    group_column,
                                    filter_criteria,
                                    timestamp_column,
                                    agg_column='sum_column'):
    """
    Function to calculate count aggregation grouped by a column and using provided time window and filter criteria

    :param df: spark dataframe with data to aggregate
    :param column_name: new column name
    :param time_window: time window to aggregate values
    :param group_column: column used to group by
    :param filter_criteria: filter criteria to apply before aggregation
    :param timestamp_column: column with timestamp to use for aggregation
    :param agg_column: column to aggregate; left as 'sum_column' to count rows matching the filter via a temporary 0/1 indicator
    :return df: spark dataframe with aggregated column
    """

    # Time window dic
    dict_time_window = {
        '60M': 60,
        '2H': 2,
        '3H': 3,
        '12H': 12,
        '24H': 24,
        '3D': 3,
        '7D': 7,
        '15D': 15,
        '30D': 30,
        '60D': 60,
        '90D': 90,
        '365D': 365,
    }

    # Check if time window is correct
    if time_window not in dict_time_window:
        print('ERR PARAM: invalid time_window')
        return df

    # Check if column_name is correct
    if column_name in df.columns:
        print('ERR PARAM: invalid column_name, already exists on dataframe')
        return df

    # Check if group_column is correct
    if group_column not in df.columns:
        if not all(item in df.columns for item in group_column):
            print('ERR PARAM: invalid group_column')
            return df

    # Lambdas to convert time units to seconds
    days = lambda i: i * 86400
    hours = lambda i: i * 3600
    minutes = lambda i: i * 60

    # Convert the time window parameter to seconds
    if 'M' in time_window:
        window_parameter = minutes(dict_time_window[time_window])
    elif 'H' in time_window:
        window_parameter = hours(dict_time_window[time_window])
    elif 'D' in time_window:
        window_parameter = days(dict_time_window[time_window])

    # Define time window with partition
    windowSpec = Window.partitionBy(group_column).orderBy(
        F.col(timestamp_column).cast('long')).rangeBetween(
            -window_parameter, 0)

    # Add temp column used for aggregation that meet criteria
    if agg_column == 'sum_column':
        df = df.withColumn(agg_column,
                           F.when((filter_criteria), 1).otherwise(0))
    else:
        df = df.withColumn(
            agg_column,
            F.when((filter_criteria), df[agg_column]).otherwise(0))

    # Calculate rolling on provided window
    df = df.withColumn(column_name, F.sum(agg_column).over(windowSpec))

    # Subtract one when the current row itself matches, since the rolling sum includes it
    if agg_column == 'sum_column':
        df = df.withColumn(
            column_name,
            F.when((df.sum_column == 1) & (df[column_name] > 0),
                   df[column_name] - 1).otherwise(df[column_name]))

    # Drop temporary columns
    df = df.drop(*['sum_column'])

    return df
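
A minimal usage sketch for the function above, assuming it is defined in the current session; the DataFrame, column names, and values are hypothetical, and filter_criteria is passed as an unevaluated Column expression.

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
events = spark.createDataFrame(
    [("u1", "2021-01-01 10:00:00", "declined"),
     ("u1", "2021-01-01 11:30:00", "approved"),
     ("u1", "2021-01-02 09:00:00", "declined")],
    ["user_id", "event_ts", "status"]
).withColumn("event_ts", F.to_timestamp("event_ts"))

# Rolling count of declined events per user over the trailing 24 hours
events = calculate_time_window_indicator(
    events,
    column_name="declined_24h",
    time_window="24H",
    group_column="user_id",
    filter_criteria=(F.col("status") == "declined"),
    timestamp_column="event_ts")
events.show()
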
 def unpartitioned_window(self):
     return Window.partitionBy()
 def growing_row_window(self):
     return Window.partitionBy('id').orderBy('v').rowsBetween(Window.unboundedPreceding, 3)
 def with_window_column(df):
     from pyspark.sql.window import Window
     from pyspark.sql.functions import percent_rank
     windowSpec = Window.partitionBy(df['id']).orderBy(df['forecast'])
     return df.withColumn("r", percent_rank().over(windowSpec))
 def shrinking_row_window(self):
     return Window.partitionBy('id').orderBy('v').rowsBetween(-2, Window.unboundedFollowing)
#reading dataframes from MongoDB
df = spark.read.format("mongo").load()

df.printSchema()

#let's change the data type to a timestamp
df = df.withColumn("tx_time", df.tx_time.cast("timestamp"))

#Here we are calculating a moving average
from pyspark.sql.window import Window
from pyspark.sql import functions as F

movAvg = df.withColumn(
    "movingAverage",
    F.avg("price").over(
        Window.partitionBy("company_symbol").rowsBetween(-1, 1)))
movAvg.show()

#Saving Dataframes to MongoDB
movAvg.write.format("mongo").option("replaceDocument",
                                    "true").mode("append").save()

#Reading Dataframes from the Aggregation Pipeline in MongoDB
pipeline = "[{'$group': {_id:'$company_name', 'maxprice': {$max:'$price'}}},{$sort:{'maxprice':-1}}]"
aggPipelineDF = spark.read.format("mongo").option("pipeline", pipeline).load()
aggPipelineDF.show()

#using SparkSQL with MongoDB
movAvg.createOrReplaceTempView("avgs")

sqlDF = spark.sql("SELECT * FROM avgs WHERE movingAverage > 43.0")
 def unbounded_window(self):
     return Window.partitionBy('id') \
         .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing).orderBy('v')
def chain_delay_feature_engineering(airline_df):
    '''Takes an airline df with created columns CRS_DEP_TIME_UTC, CRS_DEP_MINUS_TWO_FIFTEEN_UTC and returns a new airline df with 5 added columns:
  dep_time_diff_one_flight_before: time between departure of current flight and previous flight (in seconds)
  dep_time_diff_two_flights_before: time between departure of current flight and flight two previous (in seconds)
  delay_one_before: was the flight before delayed? (binary)
  delay_two_before: was the flight two before delayed? (binary)
  PREVIOUS_FLIGHT_DELAYED_FOR_MODELS: if the previous flight departed at least 2 hours 15 minutes prior (8100 seconds), was it delayed? If less than 2:15, was the flight 2 before delayed? (binary)'''

    airline_df.createOrReplaceTempView("airlines_temp_view")

    #Store new df with limited number of ordered columns that we can use to window
    airlines_aircraft_tracking = airline_df[[
        "tail_num", "fl_date", "origin_city_name", "dest_city_name",
        "dep_del15", "crs_dep_time_utc", "crs_dep_minus_two_fifteen_utc",
        "crs_arr_time_utc"
    ]].orderBy("tail_num", "fl_date", "crs_dep_time_utc")
    #This section is related to windowing so that we can pull information from previous flight and flight 2 before current flight. Windowing will only pull for the same tail number
    w = Window.partitionBy("tail_num").orderBy("crs_dep_time_utc")
    diff = col("crs_dep_time_utc").cast("long") - lag("crs_dep_time_utc",
                                                      1).over(w).cast("long")
    diff2 = col("crs_dep_time_utc").cast("long") - lag("crs_dep_time_utc",
                                                       2).over(w).cast("long")
    delay_one_before = lag("dep_del15", 1).over(w)
    delay_two_before = lag("dep_del15", 2).over(w)
    arr_time_one_before = col("crs_dep_time_utc").cast("long") - lag(
        "crs_arr_time_utc", 1).over(w).cast("long")
    arr_time_two_before = col("crs_dep_time_utc").cast("long") - lag(
        "crs_arr_time_utc", 2).over(w).cast("long")
    airlines_aircraft_tracking_diff = airlines_aircraft_tracking.withColumn("dep_time_diff_one_flight_before", diff)\
                                    .withColumn("dep_time_diff_two_flights_before", diff2)\
                                    .withColumn("delay_one_before", delay_one_before)\
                                    .withColumn("delay_two_before", delay_two_before)\
                                    .withColumn("arr_time_one_before", arr_time_one_before)\
                                    .withColumn("arr_time_two_before", arr_time_two_before)

    def chain_delay_analysis(crs_dep_time_utc, dep_time_diff_one_flight_before,
                             dep_time_diff_two_flights_before,
                             delay_one_before, delay_two_before,
                             arr_time_one_before, arr_time_two_before):
        '''Takes info on the flights before (departure time differences and delay flags)
    and returns the previous flight's delay flag if that flight departed at least 2:15 before the current flight.
    If it departed within 2:15, looks at the flight 2 before and returns that one's delay flag. If the scheduled arrival of the previous flight
    is more than 5 hours, or of the flight 2 before more than 7 hours, before the current flight, we mark it as 0'''
        try:
            if dep_time_diff_one_flight_before >= 8100:
                if arr_time_one_before <= 18000:
                    return delay_one_before
                else:
                    return int(0)
            else:
                if arr_time_two_before <= 25200:
                    return delay_two_before
                else:
                    return int(0)
        except TypeError:
            # lag() returns null for the first flight(s) of a tail number
            return int(0)

    chain_delay_analysis_udf = f.udf(chain_delay_analysis)
    airlines_aircraft_tracking_diff_for_join = airlines_aircraft_tracking_diff.withColumn(
        "PREVIOUS_FLIGHT_DELAYED_FOR_MODELS",
        chain_delay_analysis_udf('crs_dep_time_utc',
                                 'dep_time_diff_one_flight_before',
                                 'dep_time_diff_two_flights_before',
                                 'delay_one_before', 'delay_two_before',
                                 'arr_time_one_before', 'arr_time_two_before'))
    airline_df_with_id = airline_df.withColumn("id",
                                               monotonically_increasing_id())

    #Join chain delay analysis back to full airline data
    join_columns = [
        "tail_num", "fl_date", "origin_city_name", "dest_city_name",
        "crs_dep_time_utc"
    ]
    airlines_chain_delays = airline_df_with_id.alias("a").join(airlines_aircraft_tracking_diff_for_join.alias("j"), join_columns, 'left_outer') \
                              .select('a.year', 'a.quarter', 'a.month', 'a.day_of_week', 'a.fl_date', 'a.op_unique_carrier', 'a.tail_num', 'a.origin_airport_id', 'a.origin', 'a.origin_city_name', 'a.dest_airport_id', 'a.dest', 'a.dest_city_name', 'a.crs_dep_time', 'a.dep_time', 'a.dep_delay', 'a.dep_del15', 'a.cancelled', 'a.diverted', 'a.distance', 'a.distance_group', 'a.short_dest_city_name', 'a.short_orig_city_name', 'a.carrier_delay', 'a.weather_delay', 'a.nas_delay', 'a.security_delay', 'a.late_aircraft_delay', 'a.taxi_out', 'a.dest_timezone', 'a.origin_timezone', 'a.truncated_crs_dep_time_utc', 'a.truncated_crs_dep_minus_three_utc', 'a.crs_dep_time_utc', 'a.crs_dep_minus_two_fifteen_utc', 'a.Holiday', 'a.id', 'j.dep_time_diff_one_flight_before', 'j.dep_time_diff_two_flights_before', 'j.delay_one_before', 'j.delay_two_before', 'j.PREVIOUS_FLIGHT_DELAYED_FOR_MODELS')

    #Drop duplicates created during join.
    airlines_chain_delays_no_dups = airlines_chain_delays.dropDuplicates(
        ['id'])

    return airlines_chain_delays_no_dups
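
The heart of the function above is the tail-number window with lag offsets of 1 and 2; a self-contained sketch of that pattern on hypothetical toy data.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lag
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
flights = spark.createDataFrame(
    [("N100", "2020-01-01 08:00:00", 0),
     ("N100", "2020-01-01 12:00:00", 1),
     ("N100", "2020-01-01 18:00:00", 0)],
    ["tail_num", "crs_dep_time_utc", "dep_del15"]
).withColumn("crs_dep_time_utc", col("crs_dep_time_utc").cast("timestamp"))

# Per tail number, ordered by scheduled departure time
w = Window.partitionBy("tail_num").orderBy("crs_dep_time_utc")
flights = (flights
           .withColumn("dep_time_diff_one_flight_before",
                       col("crs_dep_time_utc").cast("long")
                       - lag("crs_dep_time_utc", 1).over(w).cast("long"))
           .withColumn("delay_one_before", lag("dep_del15", 1).over(w)))
flights.show(truncate=False)
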
 def unpartitioned_window(self):
     return Window.partitionBy()
Example #34
# Join returns and volatility
rentaVolatilDF = rentabilidadDF.join(volatilidadDF, on=['YEAR', 'CODE'])

# Final table
TotalDF = TotalDF.join(rentaVolatilDF, on=['YEAR', 'CODE'])

# RANKINGs
auxColumns_cat = [
    'PSDA__EARNINGS_PER_SHARE_DILUTED', 'PSDA__EBITDA_PER_SHARE',
    'PSDA__TOTAL_DEBT_PER_SHARE', 'CSR__NET_MARGIN', 'IS__REVENUE',
    'IS__NET_INCOME', 'IS__COST_OF_GOODS_SOLD', 'BS__TOTAL_EQUITY',
    'IS__EBITDA', 'BS__TOTAL_ASSETS', 'BS__LONG_TERM_DEBT',
    'BS__TOTAL_CURRENT_LIABILITIES'
]

wY = Window.partitionBy('YEAR')
for col in auxColumns_cat:
    TotalDF = TotalDF.withColumn(col + '_RANKING',
                                 F.row_number().over(wY.orderBy(F.col(col))))

# Add the label (next-period return)
TotalDF = TotalDF.withColumn(
    "TARGET",
    F.lead(F.col("RETURNS"),
           1).over(Window.partitionBy(F.col("CODE")).orderBy(F.col("YEAR"))))

# Write the final file (CSV)
TotalDF.repartition('YEAR', 'CODE').write.csv(BUCKET + OUTPUT_FILE,
                                              header=True)
 def sliding_range_window(self):
     return Window.partitionBy('id').orderBy('v').rangeBetween(-2, 4)
    print("OMO's Zip & Borough list,")

    boro_zip_df = nyc_omo_df \
        .select("Boro", nyc_omo_df["Zip"].cast(IntegerType())) \
        .groupBy("Boro") \
        .agg({"Zip": "collect_set"}) \
        .withColumnRenamed("collect_set(Zip)", "ZipList") \
        .withColumn("ZipCount", F.size("ZipList"))

    boro_zip_df \
        .select("Boro", "ZipCount", "ZipList") \
        .show(5)

    # Window functions
    window_spec = Window.partitionBy("OMOCreateDate")
    omo_daily_freq = nyc_omo_df \
        .withColumn("OMODailyFreq", F.count("OMOID").over(window_spec))

    print("# of partitions in window'ed OM dataframe = " + str(omo_daily_freq.count()))
    omo_daily_freq.show(5)

    omo_daily_freq.select("OMOCreateDate", "OMODailyFreq") \
        .distinct() \
        .show(5)

    omo_daily_freq \
        .repartition(5) \
        .write \
        .mode("overwrite") \
        .parquet("s3a://" + app_conf["s3_conf"]["s3_bucket"] + "/nyc_omo_data")
Example #37

# COMMAND ----------

identity_columns = [index_column_name, delta_column_name]
nested_columns = eval(str(dbutils.widgets.get("nested_columns")))
print("Nested Columns:", nested_columns)

# COMMAND ----------

for nested_column in nested_columns:
    max_records_on_array = (newDf.select(
        size(nested_column).alias(nested_column + "len")).groupby().max(
            nested_column +
            "len").collect()[0]["max({})".format(nested_column + "len")])
    nestedwindowSpec = Window.partitionBy("ROW_ID").orderBy(
        monotonically_increasing_id())
    nested_parquet_filepath = (mount + basepath + zone + "/" + contry_name +
                               "/" + source_name + "/" + object_name + "_" +
                               nested_column + "/" + year + "/" + month + "/" +
                               day + "/" + object_name + "_" + nested_column +
                               "_" + filename_timestamp + ".parquet")
    if max_records_on_array > 0:
        result = denormalizer(newDf, nested_column, identity_columns)
        result = result.withColumn("ITEM_ID",
                                   sqlfn.row_number().over(nestedwindowSpec))
        # added to remove any structypes
        result = flattenDataframe(result)
        result.write.format("parquet").save(nested_parquet_filepath)
        newDf = newDf.withColumn(
            nested_column, sha2(newDf[nested_column].cast(StringType()), 512))
    else:
Example #38
 def unbounded_window(self):
     return (Window.partitionBy("id").rowsBetween(
         Window.unboundedPreceding, Window.unboundedFollowing).orderBy("v"))
Example #39
def get_similar_artists(top_artist_df, artist_relation_df,
                        similar_artist_limit):
    """ Get artists similar to top artists.

        Args:
            top_artist_df: Dataframe containing top artists listened to by users
            artist_relation_df: Dataframe containing artists and similar artists.
                                For columns refer to artist_relation_schema in listenbrainz_spark/schema.py.
            similar_artist_limit (int): number of similar artists to calculate

        Returns:
            similar_artist_df (dataframe): Top Z artists similar to top artists where
                                           Z = SIMILAR_ARTISTS_LIMIT.
    """
    condition = [top_artist_df.top_artist_credit_id == artist_relation_df.id_0]

    df1 = top_artist_df.join(artist_relation_df, condition, 'inner') \
                       .select(col('id_0').alias('top_artist_credit_id'),
                               col('name_0').alias('top_artist_name'),
                               col('id_1').alias('similar_artist_credit_id'),
                               col('name_1').alias('similar_artist_name'),
                               'score',
                               'user_name')

    condition = [top_artist_df.top_artist_credit_id == artist_relation_df.id_1]

    df2 = top_artist_df.join(artist_relation_df, condition, 'inner') \
                       .select(col('id_1').alias('top_artist_credit_id'),
                               col('name_1').alias('top_artist_name'),
                               col('id_0').alias('similar_artist_credit_id'),
                               col('name_0').alias('similar_artist_name'),
                               'score',
                               'user_name')

    df = df1.union(df2)

    window = Window.partitionBy('top_artist_credit_id', 'user_name')\
                   .orderBy(col('score').desc())

    similar_artist_df_html = df.withColumn('rank', row_number().over(window)) \
                               .where(col('rank') <= similar_artist_limit)\
                               .select('top_artist_credit_id',
                                       'top_artist_name',
                                       'similar_artist_credit_id',
                                       'similar_artist_name',
                                       'user_name')

    similar_artist_df_html = filter_top_artists_from_similar_artists(
        similar_artist_df_html, top_artist_df)

    # Two or more artists can have the same similar artist(s), leading to non-unique recordings,
    # so we keep only the distinct similar artists.
    similar_artist_df = similar_artist_df_html.select('similar_artist_credit_id',
                                                      'similar_artist_name',
                                                      'user_name') \
                                              .distinct()

    if _is_empty_dataframe(similar_artist_df):
        logger.error('Similar artists not generated.', exc_info=True)
        raise SimilarArtistNotFetchedException(
            'Artists missing from artist relation')

    return similar_artist_df, similar_artist_df_html
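
The per-(top artist, user) ranking above is the standard top-N-per-group window; a minimal self-contained sketch of that pattern on hypothetical toy data.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
pairs = spark.createDataFrame(
    [(1, "user_a", "artist_x", 0.9),
     (1, "user_a", "artist_y", 0.7),
     (1, "user_a", "artist_z", 0.4)],
    ["top_artist_credit_id", "user_name", "similar_artist_name", "score"])

similar_artist_limit = 2
window = Window.partitionBy("top_artist_credit_id", "user_name") \
               .orderBy(col("score").desc())
# Keep only the N highest-scoring similar artists per (top artist, user) pair
pairs.withColumn("rank", row_number().over(window)) \
     .where(col("rank") <= similar_artist_limit) \
     .show()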
Example #40
 def ordered_window(self):
     return Window.partitionBy("id").orderBy("v")
Example #41
train.select(['user_id']).distinct().count()
#5298
val_new.select(['user_id']).distinct().count()
#1751
test_new.select(['user_id']).distinct().count()
#1636
## there are some users in val_add and test_add but not in val_new and test_new, therefore 8774 > 5298 + 1751 + 1636 = 8685

## example of writing x% data
train_add_test.write.parquet("train01.parquet")
val_new.write.parquet("val01.parquet")
test_new.write.parquet("test01.parquet")

# create the true rank list (example of 1% data)
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col

window = Window.partitionBy(val['user_id']).orderBy(val['rating'].desc())
val_true_order = val.select('*', rank().over(window).alias('rank'))
val_true_list = val_true_order.select(
    'user_id',
    'book_id').groupBy('user_id').agg(expr('collect_list(book_id) as books'))
val_true_list.write.parquet("val01_true_list.parquet")

window = Window.partitionBy(test['user_id']).orderBy(test['rating'].desc())
test_true_order = test.select('*', rank().over(window).alias('rank'))
test_true_list = test_true_order.select(
    'user_id',
    'book_id').groupBy('user_id').agg(expr('collect_list(book_id) as books'))
test_true_list.write.parquet("test01_true_list.parquet")
Example #42
 def sliding_row_window(self):
     return Window.partitionBy("id").orderBy("v").rowsBetween(-2, 1)
Example #43
def process_log_data(spark, input_data, output_data):
    """
    Load the log & song datasets from AWS S3 into Spark, build two dimension
    tables (users, time) and a fact table (songplays),
    and write the tables back to S3 in parquet format

    Arguments:
        spark: Spark Session
        input_data: Input data path
        output_data: Output data path
    Return: None  
    """
    # build SQLContext
    sc = spark.sparkContext
    sqlContext = SQLContext(sc)

    # get filepath to log data file
    log_data = os.path.join(input_data, "log_data/*/*/*.json")

    # get filepath to song data file
    song_data = os.path.join(input_data, "song_data/*/*/*/*.json")

    # read log data file
    df = spark.read.format("json").load(log_data)

    # filter by actions for song plays and keep only 'NextSong' event
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table
    # the users table needs special treatment to keep each user's last
    # level value in the log data; a Window is used to capture it
    win1 = Window.partitionBy('userId').orderBy('ts') \
                 .rangeBetween(Window.unboundedPreceding, 0)
    df_users = df.filter(col('userId').isNotNull()) \
                 .select('userId', 'firstName', 'lastName',
                         'gender', 'level', 'ts') \
                 .withColumn('tsSeq', count('ts').over(win1))
    win2 = Window.partitionBy("userId")
    df_users = df_users.withColumn("tsSeqMax", F.max("tsSeq").over(win2))
    users_table =  df_users.filter((df_users.tsSeq == df_users.tsSeqMax)) \
                           .groupBy('userId') \
                           .agg(F.max("firstName"), F.max("lastName"),
                                F.max("gender"), F.max("level")) \
                           .withColumnRenamed('userId', 'user_id') \
                           .withColumnRenamed('max(firstName)', 'first_name') \
                           .withColumnRenamed('max(lastName)', 'last_name') \
                           .withColumnRenamed('max(gender)', 'gender') \
                           .withColumnRenamed('max(level)', 'level')

    # make sure users table has desired schema
    users_schema = StructType([
        StructField('user_id', StringType(), False),
        StructField('first_name', StringType(), False),
        StructField('last_name', StringType(), False),
        StructField('gender', StringType(), True),
        StructField('level', StringType(), False)
    ])
    users_table = sqlContext.createDataFrame(users_table.rdd, users_schema)

    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, 'users'),
                              mode='overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1e3),
                        TimestampType())
    df = df.withColumn("timestamp", get_timestamp(df.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: datetime.fromtimestamp(x / 1e3), DateType())
    df = df.withColumn("ts_date", get_datetime(df.ts))

    # extract columns to create time table
    time_table = df.groupBy('timestamp') \
                .agg({'timestamp':'max'}) \
                .withColumn('hour', F.hour('max(timestamp)')) \
                .withColumn('day', F.dayofmonth('max(timestamp)')) \
                .withColumn('week', F.weekofyear('max(timestamp)')) \
                .withColumn('month', F.month('max(timestamp)')) \
                .withColumn('year', F.year('max(timestamp)')) \
                .withColumn('weekday', F.date_format('max(timestamp)', 'E')) \
                .withColumnRenamed('max(timestamp)', 'start_time') \
                .drop('timestamp') \
                .orderBy(F.asc('start_time'))

    # make sure time table has desired schema
    time_schema = StructType([
        StructField('start_time', TimestampType(), False),
        StructField('hour', IntegerType(), False),
        StructField('day', IntegerType(), False),
        StructField('week', IntegerType(), False),
        StructField('month', IntegerType(), False),
        StructField('year', IntegerType(), False),
        StructField('weekday', StringType(), False)
    ])
    time_table = sqlContext.createDataFrame(time_table.rdd, time_schema)

    # write time table to parquet files partitioned by year and month
    time_table.repartition(col('year'), col('month')) \
              .write.partitionBy('year', 'month') \
              .parquet(os.path.join(output_data, 'time'), mode='overwrite')

    # read in song data to use for songplays table
    song_df = spark.read.format("json").load(song_data)

    t_log = df.alias('t_log')
    t_song = song_df.alias('t_song')

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = t_log.join(t_song,
                                 (col('t_log.song') == col('t_song.title')) &
                                 (col('t_log.artist') == col('t_song.artist_name')),
                                 how='inner') \
                           .select([col('t_log.timestamp'),
                                    col('t_log.userId'),
                                    col('t_log.level'),
                                    col('t_song.song_id'),
                                    col('t_song.artist_id'),
                                    col('t_log.sessionId'),
                                    col('t_log.location'),
                                    col('t_log.userAgent')]) \
                           .withColumnRenamed('timestamp', 'start_time') \
                           .withColumnRenamed('userId', 'user_id') \
                           .withColumnRenamed('sessionId', 'session_id') \
                           .withColumnRenamed('userAgent', 'user_agent') \
                           .orderBy(F.asc('start_time'))

    # make sure songplays has desired schema
    songplays_schema = StructType([
        StructField('start_time', TimestampType(), False),
        StructField('user_id', StringType(), False),
        StructField('level', StringType(), True),
        StructField('song_id', StringType(), False),
        StructField('artist_id', StringType(), False),
        StructField('session_id', LongType(), False),
        StructField('location', StringType(), True),
        StructField('user_agent', StringType(), False)
    ])
    songplays_table = sqlContext.createDataFrame(songplays_table.rdd,
                                                 songplays_schema)

    # write songplays table to parquet files partitioned by year and month
    songplays_table.withColumn('year', F.year('start_time')) \
                   .withColumn('month', F.month('start_time')) \
                   .repartition(col('year'), col('month')) \
                   .write.partitionBy('year', 'month') \
                   .parquet(os.path.join(output_data, 'songplays'),
                            mode='overwrite')
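
The users-table step above keeps each user's most recent level via a running count plus a per-user max; the same "latest row per user" idea can also be sketched with a single descending row_number window (hypothetical toy data).

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
logs = spark.createDataFrame(
    [("u1", "free", 1000), ("u1", "paid", 2000), ("u2", "free", 1500)],
    ["userId", "level", "ts"])

# Rank each user's events by timestamp, newest first, and keep only the latest
w = Window.partitionBy("userId").orderBy(col("ts").desc())
logs.withColumn("rn", row_number().over(w)) \
    .where(col("rn") == 1) \
    .drop("rn") \
    .show()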
Example #44
 def growing_range_window(self):
     return Window.partitionBy("id").orderBy("v").rangeBetween(
         Window.unboundedPreceding, 4)
Example #45
inner_join.createOrReplaceTempView('inner_join')
(training, tune, test) = check.randomSplit([0.6, 0.2, 0.2], seed=0)
training.createOrReplaceTempView('training')
tune.createOrReplaceTempView('tune')
test.createOrReplaceTempView('test')
train_set = spark.sql(
    'Select training.user_id, inner_join.rating, inner_join.book_id from inner_join, training where inner_join.user_id = training.user_id'
)
val_set = spark.sql(
    'Select tune.user_id, inner_join.rating, inner_join.book_id from inner_join, tune where inner_join.user_id = tune.user_id'
)
test_set = spark.sql(
    'Select test.user_id, inner_join.rating, inner_join.book_id from inner_join, test where inner_join.user_id = test.user_id'
)
from pyspark.sql.window import Window
window = Window.partitionBy('user_id').orderBy('book_id')
import pyspark.sql.functions as F
val = (val_set.select("user_id", "book_id", "rating",
                      F.row_number().over(window).alias("row_number")))
test = (test_set.select("user_id", "book_id", "rating",
                        F.row_number().over(window).alias("row_number")))
val.createOrReplaceTempView('val')
test.createOrReplaceTempView('test')
val_data = spark.sql('Select * from val where row_number%2 = 0')
test_final = spark.sql('Select * from test where row_number%2 = 0')
val_train = spark.sql('Select * from val where row_number%2 = 1')
test_train = spark.sql('Select * from test where row_number%2 = 1')
val_train = val_train.drop('row_number')
val_data = val_data.drop('row_number')
test_final = test_final.drop('row_number')
train_set = train_set.unionByName(val_train)
def main():
    "Main function"
    optmgr  = OptionParser()
    opts = optmgr.parser.parse_args()

    # setup spark/sql context to be used for communication with HDFS
    sc = SparkContext(appName="phedex_br")
    if not opts.yarn:
        sc.setLogLevel("ERROR")
    sqlContext = HiveContext(sc)

    schema_def = schema()

    # read given file(s) into RDD
    if opts.fname:
        pdf = sqlContext.read.format('com.databricks.spark.csv')\
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(opts.fname, schema = schema_def)
    elif opts.basedir:
        fromdate, todate = defDates(opts.fromdate, opts.todate)
        files = getFileList(opts.basedir, fromdate, todate)
        msg = "Between dates %s and %s found %d directories" % (fromdate, todate, len(files))
        print(msg)

        if not files:
            return
        pdf = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(file_path, schema = schema_def) \
                        for file_path in files])
    else:
        raise ValueError("File or directory not specified. Specify fname or basedir parameters.")

    # parsing additional data (to given data adding: group name, node kind, acquisition era, data tier, now date)
    groupdic, nodedic = getJoinDic()
    acquisition_era_reg = r"^/[^/]*/([^/^-]*)-[^/]*/[^/]*$"	
    data_tier_reg = r"^/[^/]*/[^/^-]*-[^/]*/([^/]*)$"
    groupf = udf(lambda x: groupdic[x], StringType())
    nodef = udf(lambda x: nodedic[x], StringType())

    ndf = pdf.withColumn("br_user_group", groupf(pdf.br_user_group_id)) \
         .withColumn("node_kind", nodef(pdf.node_id)) \
         .withColumn("now", from_unixtime(pdf.now_sec, "YYYY-MM-dd")) \
         .withColumn("acquisition_era", when(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1) == "",\
                    lit("null")).otherwise(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1))) \
        .withColumn("data_tier", when(regexp_extract(pdf.dataset_name, data_tier_reg, 1) == "",\
                    lit("null")).otherwise(regexp_extract(pdf.dataset_name, data_tier_reg, 1)))

    # show the dataframe and print its schema in verbose mode
    if opts.verbose:
        ndf.show()
        print("pdf data type", type(ndf))
        ndf.printSchema()

    # process aggregation parameters
    keys = [key.lower().strip() for key in opts.keys.split(',')]
    results = [result.lower().strip() for result in opts.results.split(',')]
    aggregations = [agg.strip() for agg in opts.aggregations.split(',')]
    order = [orde.strip() for orde in opts.order.split(',')] if opts.order else []
    asc = [asce.strip() for asce in opts.asc.split(',')] if opts.order else []
    filtc, filtv = opts.filt.split(":") if opts.filt else (None,None)

    validateAggregationParams(keys, results, aggregations, order, filtc)

    if filtc and filtv:
        ndf = ndf.filter(getattr(ndf, filtc) == filtv)

    # if delta aggregation is used
    if DELTA in aggregations:
        validateDeltaParam(opts.interval, results)			
        result = results[0]

        #1 for all dates generate interval group dictionary
        datedic = generateDateDict(fromdate, todate, opts.interval)
        boundic = generateBoundDict(datedic)
        max_interval = max(datedic.values())

        interval_group = udf(lambda x: datedic[x], IntegerType())
        interval_start = udf(lambda x: boundic[x][0], StringType())		
        interval_end = udf(lambda x: boundic[x][1], StringType())

        #2 group data by block, node, interval and last result in the interval
        ndf = ndf.select(ndf.block_name, ndf.node_name, ndf.now, getattr(ndf, result))
        idf = ndf.withColumn("interval_group", interval_group(ndf.now))
        win = Window.partitionBy(idf.block_name, idf.node_name, idf.interval_group).orderBy(idf.now.desc())	
        idf = idf.withColumn("row_number", rowNumber().over(win))
        rdf = idf.where((idf.row_number == 1) & (idf.interval_group != 0))\
                 .withColumn(result, when(idf.now == interval_end(idf.interval_group), getattr(idf, result)).otherwise(lit(0)))
        rdf = rdf.select(rdf.block_name, rdf.node_name, rdf.interval_group, getattr(rdf, result))
        rdf.cache()

        #3 create intervals that not exist but has minus delta
        win = Window.partitionBy(idf.block_name, idf.node_name).orderBy(idf.interval_group)
        adf = rdf.withColumn("interval_group_aft", lead(rdf.interval_group, 1, 0).over(win))
        hdf = adf.filter(((adf.interval_group + 1) != adf.interval_group_aft) & (adf.interval_group != max_interval))\
                 .withColumn("interval_group", adf.interval_group + 1)\
                 .withColumn(result, lit(0))\
                 .drop(adf.interval_group_aft)

        #4 join data frames
        idf = rdf.unionAll(hdf)
		
        #3 join every interval with previous interval
        win = Window.partitionBy(idf.block_name, idf.node_name).orderBy(idf.interval_group)
        fdf = idf.withColumn("delta", getattr(idf, result) - lag(getattr(idf, result), 1, 0).over(win))

        #5 calculate delta_plus and delta_minus columns and aggregate by date and node
        ddf =fdf.withColumn("delta_plus", when(fdf.delta > 0, fdf.delta).otherwise(0)) \
                .withColumn("delta_minus", when(fdf.delta < 0, fdf.delta).otherwise(0))

        aggres = ddf.groupBy(ddf.node_name, ddf.interval_group).agg(sum(ddf.delta_plus).alias("delta_plus"),\
                                                                    sum(ddf.delta_minus).alias("delta_minus"))

        aggres = aggres.select(aggres.node_name, interval_end(aggres.interval_group).alias("date"), aggres.delta_plus, aggres.delta_minus)
		
    else:	
        resAgg_dic = zipResultAgg(results, aggregations)
        order, asc = formOrdAsc(order, asc, resAgg_dic)

        # perform aggregation
        if order:
            aggres = ndf.groupBy(keys).agg(resAgg_dic).orderBy(order, ascending=asc)
        else:
            aggres = ndf.groupBy(keys).agg(resAgg_dic)

    # output results
    if opts.fout:
        fout_header = formFileHeader(opts.fout)
        if opts.header:
            aggres.write.format('com.databricks.spark.csv').options(header = 'true').save(fout_header)
        else:
            aggres.write.format('com.databricks.spark.csv').save(fout_header)
    else:
        aggres.show(50)