def test_mixed_sql_and_udf(self):
        df = self.data
        w = self.unbounded_window
        ow = self.ordered_window
        max_udf = self.pandas_agg_max_udf
        min_udf = self.pandas_agg_min_udf

        result1 = df.withColumn('v_diff', max_udf(df['v']).over(w) - min_udf(df['v']).over(w))
        expected1 = df.withColumn('v_diff', max(df['v']).over(w) - min(df['v']).over(w))

        # Test mixing sql window function and window udf in the same expression
        result2 = df.withColumn('v_diff', max_udf(df['v']).over(w) - min(df['v']).over(w))
        expected2 = expected1

        # Test chaining sql aggregate function and udf
        result3 = df.withColumn('max_v', max_udf(df['v']).over(w)) \
                    .withColumn('min_v', min(df['v']).over(w)) \
                    .withColumn('v_diff', col('max_v') - col('min_v')) \
                    .drop('max_v', 'min_v')
        expected3 = expected1

        # Test mixing sql window function and udf
        result4 = df.withColumn('max_v', max_udf(df['v']).over(w)) \
                    .withColumn('rank', rank().over(ow))
        expected4 = df.withColumn('max_v', max(df['v']).over(w)) \
                      .withColumn('rank', rank().over(ow))

        self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
        self.assertPandasEqual(expected2.toPandas(), result2.toPandas())
        self.assertPandasEqual(expected3.toPandas(), result3.toPandas())
        self.assertPandasEqual(expected4.toPandas(), result4.toPandas())
Example 2
 def gen_report_table(hc,curUnixDay):
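     # Builds per-(entityid, clientmac) visit statistics for the indoor and flow feeds using SQL
     # window functions, then saves the results as the Hive tables df_indoor_fin and df_flow_fin.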
     rows_indoor=sc.textFile("/data/indoor/*/*").map(lambda r: r.split(",")).map(lambda p: Row(clientmac=p[0], entityid=int(p[1]),etime=int(p[2]),ltime=int(p[3]),seconds=int(p[4]),utoday=int(p[5]),ufirstday=int(p[6])))
     HiveContext.createDataFrame(hc,rows_indoor).registerTempTable("df_indoor")
     #ClientMac|etime|ltime|seconds|utoday|ENTITYID|UFIRSTDAY 
     sql="select entityid,clientmac,utoday,UFIRSTDAY,seconds,"
     sql=sql+"count(1) over(partition by entityid,clientmac) as total_cnt,"
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  2505600 preceding) as day_30," # 2505600 is 29 days
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  518400 preceding)  as day_7," #518400 is 6 days
     sql=sql+"count(1) over (partition by entityid,clientmac,UFIRSTDAY order by UFIRSTDAY  range 1 preceding) as pre_mon "
     sql=sql+"from df_indoor order by entityid,clientmac,utoday" 
     df_id_stat=hc.sql(sql)
     df_id_mm=df_id_stat.withColumn("min", func.min("utoday").over(Window.partitionBy("entityid","clientmac"))).withColumn("max", func.max("utoday").over(Window.partitionBy("entityid","clientmac")))
     # df_id_mm holds the min/max utoday per (entityid, clientmac), used to calculate first and last arrival
     df_id_stat_distinct=df_id_stat.drop("seconds").drop("day_30").drop("day_7").drop("utoday").drop("total_cnt").distinct()
     # the distinct df is needed for the lag function to work correctly
     df_id_prepremon=df_id_stat_distinct.withColumn("prepre_mon",func.lag("pre_mon").over(Window.partitionBy("entityid","clientmac").orderBy("entityid","clientmac","UFIRSTDAY"))).drop("pre_mon").na.fill(0)
     
     cond_id = [df_id_mm.clientmac == df_id_prepremon.clientmac, df_id_mm.entityid == df_id_prepremon.entityid, df_id_mm.UFIRSTDAY==df_id_prepremon.UFIRSTDAY]
     df_indoor_fin_tmp=df_id_mm.join(df_id_prepremon, cond_id, 'outer').select(df_id_mm.entityid,df_id_mm.clientmac,df_id_mm.utoday,df_id_mm.UFIRSTDAY,df_id_mm.seconds,df_id_mm.day_30,df_id_mm.day_7,df_id_mm.min,df_id_mm.max,df_id_mm.total_cnt,df_id_prepremon.prepre_mon)
     df_indoor_fin_tmp=df_indoor_fin_tmp.selectExpr("entityid as entityid","clientmac as  clientmac","utoday as utoday","UFIRSTDAY as ufirstday","seconds as secondsbyday","day_30 as indoors30","day_7 as indoors7","min as FirstIndoor","max as LastIndoor","total_cnt as indoors","prepre_mon as indoorsPrevMonth")
     
     #newly added part for indoors7 and indoors30 based on current date
     df_indoor_fin_tmp1= df_indoor_fin_tmp.withColumn("r_day_7", func.when((curUnixDay- df_indoor_fin_tmp.utoday)/86400<7 , 1).otherwise(0))
     df_indoor_fin_tmp2=df_indoor_fin_tmp1.withColumn("r_day_30", func.when((curUnixDay- df_indoor_fin_tmp1.utoday)/86400<30 , 1).otherwise(0))
     df_indoor_fin_tmp3=df_indoor_fin_tmp2.withColumn("r_indoors7",func.sum("r_day_7").over(Window.partitionBy("entityid","clientmac")))
     df_indoor_fin_tmp4=df_indoor_fin_tmp3.withColumn("r_indoors30",func.sum("r_day_30").over(Window.partitionBy("entityid","clientmac")))
     df_indoor_fin=df_indoor_fin_tmp4.drop("r_day_7").drop("r_day_30")
     hc.sql("drop table if exists df_indoor_fin")
     df_indoor_fin.write.saveAsTable("df_indoor_fin")
     
     rows_flow=sc.textFile("/data/flow/*/*").map(lambda r: r.split(",")).map(lambda p: Row(clientmac=p[0], entityid=int(p[1]),etime=int(p[2]),ltime=int(p[3]),utoday=int(p[4]),ufirstday=int(p[5])))
     HiveContext.createDataFrame(hc,rows_flow).registerTempTable("df_flow")
     
     # ClientMac|ENTITYID|UFIRSTDAY|etime|ltime|utoday
     sql="select entityid,clientmac,utoday,UFIRSTDAY,"
     sql=sql+"count(1) over(partition by entityid,clientmac) as total_cnt,"
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  2505600 preceding) as day_30," # 2505600 is 29 days
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  518400 preceding)  as day_7," #518400 is 6 days
     sql=sql+"count(1) over (partition by entityid,clientmac,UFIRSTDAY order by UFIRSTDAY  range 1 preceding) as pre_mon "
     sql=sql+"from df_flow order by entityid,clientmac,utoday" 
     df_fl_stat=hc.sql(sql)
     df_fl_mm=df_fl_stat.withColumn("min", func.min("utoday").over(Window.partitionBy("entityid","clientmac"))).withColumn("max", func.max("utoday").over(Window.partitionBy("entityid","clientmac")))
     # df_fl_mm holds the min/max utoday per (entityid, clientmac), used to calculate first and last arrival
     df_fl_stat_distinct=df_fl_stat.drop("day_30").drop("day_7").drop("utoday").drop("total_cnt").distinct()
     # the distinct df is needed for the lag function to work correctly
     df_fl_prepremon=df_fl_stat_distinct.withColumn("prepre_mon",func.lag("pre_mon").over(Window.partitionBy("entityid","clientmac").orderBy("entityid","clientmac","UFIRSTDAY"))).drop("pre_mon").na.fill(0)
     
     cond_fl = [df_fl_mm.clientmac == df_fl_prepremon.clientmac, df_fl_mm.entityid == df_fl_prepremon.entityid, df_fl_mm.UFIRSTDAY==df_fl_prepremon.UFIRSTDAY]
     df_flow_fin=df_fl_mm.join(df_fl_prepremon, cond_fl, 'outer').select(df_fl_mm.entityid,df_fl_mm.clientmac,df_fl_mm.utoday,df_fl_mm.UFIRSTDAY,df_fl_mm.day_30,df_fl_mm.day_7,df_fl_mm.min,df_fl_mm.max,df_fl_mm.total_cnt,df_fl_prepremon.prepre_mon)
     df_flow_fin=df_flow_fin.selectExpr("entityid as entityid","clientmac as  clientmac","utoday as utoday","UFIRSTDAY as ufirstday","day_30 as visits30","day_7 as visits7","min as FirstVisit","max as LastVisit","total_cnt as visits","prepre_mon as visitsPrevMonth")
     hc.sql("drop table if exists df_flow_fin")
     df_flow_fin.write.saveAsTable("df_flow_fin") 
Example 3
    def test_window_functions(self):
        df = self.sqlCtx.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
        w = Window.partitionBy("value").orderBy("key")
        from pyspark.sql import functions as F

        sel = df.select(
            df.value,
            df.key,
            F.max("key").over(w.rowsBetween(0, 1)),
            F.min("key").over(w.rowsBetween(0, 1)),
            F.count("key").over(w.rowsBetween(float("-inf"), float("inf"))),
            F.rowNumber().over(w),
            F.rank().over(w),
            F.denseRank().over(w),
            F.ntile(2).over(w),
        )
        rs = sorted(sel.collect())
        expected = [
            ("1", 1, 1, 1, 1, 1, 1, 1, 1),
            ("2", 1, 1, 1, 3, 1, 1, 1, 1),
            ("2", 1, 2, 1, 3, 2, 1, 1, 1),
            ("2", 2, 2, 2, 3, 3, 3, 2, 2),
        ]
        for r, ex in zip(rs, expected):
            self.assertEqual(tuple(r), ex[: len(r)])
Example 4
 def getValueFieldValueLists(self, handlerId, keyFields, valueFields):
     df = self.entity.groupBy(keyFields)
     agg = self.options.get("aggregation",self.getDefaultAggregation(handlerId))
     maxRows = int(self.options.get("rowCount","100"))
     numRows = min(maxRows,df.count())
     valueLists = []
     for valueField in valueFields:
         valueDf = None
         if agg == "SUM":
             valueDf = df.agg(F.sum(valueField).alias("agg"))
         elif agg == "AVG":
             valueDf = df.agg(F.avg(valueField).alias("agg"))
         elif agg == "MIN":
             valueDf = df.agg(F.min(valueField).alias("agg"))
         elif agg == "MAX":
             valueDf = df.agg(F.max(valueField).alias("agg"))
         else:
             valueDf = df.agg(F.count(valueField).alias("agg"))
         for keyField in keyFields:
             valueDf = valueDf.sort(F.col(keyField).asc())
         valueDf = valueDf.dropna()
         rows = valueDf.select("agg").take(numRows)
         valueList = []
         for row in rows:
             valueList.append(row["agg"])
         valueLists.append(valueList)
     return valueLists   
Example 5
def reduce_to_ohlc(time, rdd):
    row_rdd = rdd.map(lambda row: row.split(',')) \
                 .filter(lambda row: len(row) == 3) \
                 .map(lambda row: Row(
                       symbol=row[0],
                       tx_time=datetime.strptime(row[2], '%Y-%m-%d %H:%M:%S.%f'),
                       price=float(row[1])
                 ))
    sql_context = get_sql_context_instance(rdd.context)
    data = sql_context.createDataFrame(row_rdd)
    data.cache()
    data.write.format('org.apache.spark.sql.cassandra') \
            .options(table='transactions2', keyspace='stock', cluster='Test Cluster') \
            .mode('append') \
            .save()

    ohlc = data.select('symbol', truncate_min(data.tx_time).alias('batch_time'), 'price', 'tx_time') \
                .orderBy('tx_time') \
                .groupBy('symbol', 'batch_time') \
                .agg(
                   F.first(data.price).alias('open'),
                   F.max(data.price).alias('high'),
                   F.min(data.price).alias('low'),
                   F.last(data.price).alias('close'),
                   F.first(data.tx_time).alias('open_time'),
                   F.last(data.tx_time).alias('close_time')
                )

    existing_ohlc = sql_context.read.format('org.apache.spark.sql.cassandra') \
            .options(table='ohlc_1_min2', keyspace='stock', cluster='Test Cluster') \
            .load() \
            .select('symbol', 'batch_time', 'open', 'open_time', 'high', 'low', 'close', 'close_time')

    merged_ohlc = ohlc.join(existing_ohlc,
                             (ohlc.symbol == existing_ohlc.symbol) &
                             (ohlc.batch_time == existing_ohlc.batch_time),
                             'left'
                           )

    merged_ohlc = merged_ohlc.select(
        ohlc.symbol.alias('symbol'),
        ohlc.batch_time.alias('batch_time'),
        F.when(existing_ohlc.open_time < ohlc.open_time, existing_ohlc.open).otherwise(ohlc.open).alias('open'),
        F.when(existing_ohlc.open_time < ohlc.open_time, existing_ohlc.open_time).otherwise(ohlc.open_time).alias('open_time'),
        F.when(existing_ohlc.close_time > ohlc.close_time, existing_ohlc.close).otherwise(ohlc.close).alias('close'),
        F.when(existing_ohlc.close_time > ohlc.close_time, existing_ohlc.close_time).otherwise(ohlc.close_time).alias('close_time'),
        F.when(existing_ohlc.low < ohlc.low, existing_ohlc.low).otherwise(ohlc.low).alias('low'),
        F.when(existing_ohlc.high > ohlc.high, existing_ohlc.high).otherwise(ohlc.high).alias('high')
    )
    merged_ohlc.write.format('org.apache.spark.sql.cassandra') \
                .options(table='ohlc_1_min2', keyspace='stock', cluster='Test Cluster') \
                .mode('append') \
                .save()
    def test_multiple_udfs(self):
        df = self.data
        w = self.unbounded_window

        result1 = df.withColumn('mean_v', self.pandas_agg_mean_udf(df['v']).over(w)) \
                    .withColumn('max_v', self.pandas_agg_max_udf(df['v']).over(w)) \
                    .withColumn('min_w', self.pandas_agg_min_udf(df['w']).over(w))

        expected1 = df.withColumn('mean_v', mean(df['v']).over(w)) \
                      .withColumn('max_v', max(df['v']).over(w)) \
                      .withColumn('min_w', min(df['w']).over(w))

        self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
def test_timestamp_splitter(test_specs, spark_dataset):
    dfs_rating = spark_dataset.withColumn(DEFAULT_TIMESTAMP_COL, col(DEFAULT_TIMESTAMP_COL).cast("float"))

    splits = spark_timestamp_split(
        dfs_rating, ratio=test_specs["ratio"], col_timestamp=DEFAULT_TIMESTAMP_COL
    )

    assert splits[0].count() / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratio"], test_specs["tolerance"]
    )
    assert splits[1].count() / test_specs["number_of_rows"] == pytest.approx(
        1 - test_specs["ratio"], test_specs["tolerance"]
    )

    max_split0 = splits[0].agg(F.max(DEFAULT_TIMESTAMP_COL)).first()[0]
    min_split1 = splits[1].agg(F.min(DEFAULT_TIMESTAMP_COL)).first()[0]
    assert(max_split0 <= min_split1)

    # Test multi split
    splits = spark_timestamp_split(dfs_rating, ratio=test_specs["ratios"])

    assert splits[0].count() / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][0], test_specs["tolerance"]
    )
    assert splits[1].count() / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][1], test_specs["tolerance"]
    )
    assert splits[2].count() / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][2], test_specs["tolerance"]
    )

    max_split0 = splits[0].agg(F.max(DEFAULT_TIMESTAMP_COL)).first()[0]
    min_split1 = splits[1].agg(F.min(DEFAULT_TIMESTAMP_COL)).first()[0]
    assert(max_split0 <= min_split1)

    max_split1 = splits[1].agg(F.max(DEFAULT_TIMESTAMP_COL)).first()[0]
    min_split2 = splits[2].agg(F.min(DEFAULT_TIMESTAMP_COL)).first()[0]
    assert(max_split1 <= min_split2)
Example 8
    def handleUIOptions(self, displayColName):
        agg = self.options.get("aggregation")
        valFields = self.options.get("valueFields")

        if agg == 'COUNT':
            return self.entity.groupBy(displayColName).agg(F.count(displayColName).alias("agg")).toPandas()
        elif agg == 'SUM':
            return self.entity.groupBy(displayColName).agg(F.sum(valFields).alias("agg")).toPandas()
        elif agg == 'AVG':
            return self.entity.groupBy(displayColName).agg(F.avg(valFields).alias("agg")).toPandas()
        elif agg == 'MIN':
            return self.entity.groupBy(displayColName).agg(F.min(valFields).alias("agg")).toPandas()
        elif agg == 'MAX':
            return self.entity.groupBy(displayColName).agg(F.max(valFields).alias("agg")).toPandas()
        elif agg == 'MEAN':
            return self.entity.groupBy(displayColName).agg(F.mean(valFields).alias("agg")).toPandas()
        else:
            return self.entity.groupBy(displayColName).agg(F.count(displayColName).alias("agg")).toPandas()
Example 9
def do_something_only_once():
    # the command I use to run this script:
    #~/spark-1.6.1/bin/spark-submit --packages=com.databricks:spark-avro_2.10:2.0.1,com.databricks:spark-csv_2.10:1.4.0 server.py
    global topdis, meta, dic, towo, cluto, doctopdat, maxdate, mindate, lda
    ## Loading of data
    sc = SparkContext(appName='Simple App') #"local"
    sqlContext = SQLContext(sc)
    # Load metadata avro
    reader = sqlContext.read.format('com.databricks.spark.avro')
    meta = reader.load('data/spark_metadata.avro')
    # # Loading topic distributions
    topdisFile = 'data/spark_output.tuples'
    csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    topdis = csvLoader.options(delimiter=',',header='false', inferschema='true').load(topdisFile)
    strip_first_col_int = udf(lambda row: int(row[1:]), IntegerType())
    topdis = topdis.withColumn('C0',strip_first_col_int(topdis['C0']))
    strip_first_col_float = udf(lambda row: float(row[1:]), FloatType())
    topdis = topdis.withColumn('C1',strip_first_col_float(topdis['C1']))
    strip_last_col = udf(lambda row: float(row[:-2]), FloatType())
    topdis = topdis.withColumn('C20',strip_last_col(topdis['C20']))
    # # Load dictionary CSV
    dicFile = 'data/spark_dic.csv'
    csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    dic = csvLoader.options(delimiter='\t', header='false', inferschema='true').load(dicFile)
    dic = dic.select(dic['C0'].alias('id'), dic['C1'].alias('word'), dic['C2'].alias('count'))
    ldaFile = 'data/spark_lda.csv'
    csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    lda = csvLoader.options(delimiter='\t', header='false', inferschema='true').load(ldaFile)
    lda = lda.select(rowNumber().alias('id'), lda.columns).join(dic, dic.id == lda.id, 'inner').cache()
    # dic = dic.select(dic['C0'].alias('id'), dic['C1'].alias('word'), dic['C2'].alias('count'))
    # # # Load clustertopics CSV
    # clutoFile = 'enron_small_clustertopics.csv'
    # csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    # cluto = csvLoader.options(delimiter=',', header='false', inferschema='true').load(clutoFile)
    # # # Load topicswords CSV
    # towoFile = 'enron_small_lda_transposed.csv'
    # csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    # towo = csvLoader.options(delimiter=',', header='false', inferschema='true').load(towoFile)
    # # Merge topdis which has document id and with metadata, based on document id
    metasmall = meta.select('id',unix_timestamp(meta['date'],"yyyy-MM-dd'T'HH:mm:ssX").alias("timestamp"))
    doctopdat = topdis.join(metasmall, metasmall.id == topdis.C0,'inner').cache()
    maxdate = doctopdat.select(max('timestamp').alias('maxtimestamp')).collect()[0]['maxtimestamp']
    mindate = doctopdat.select(min('timestamp').alias('mintimestamp')).collect()[0]['mintimestamp']
def _if_later(data1, data2):
    """Helper function to test if records in data1 are earlier than that in data2.
    Returns:
        bool: True or False indicating if data1 is earlier than data2.
    """
    x = (data1.select(DEFAULT_USER_COL, DEFAULT_TIMESTAMP_COL)
         .groupBy(DEFAULT_USER_COL)
         .agg(F.max(DEFAULT_TIMESTAMP_COL).cast('long').alias('max'))
         .collect())
    max_times = {row[DEFAULT_USER_COL]: row['max'] for row in x}

    y = (data2.select(DEFAULT_USER_COL, DEFAULT_TIMESTAMP_COL)
         .groupBy(DEFAULT_USER_COL)
         .agg(F.min(DEFAULT_TIMESTAMP_COL).cast('long').alias('min'))
         .collect())
    min_times = {row[DEFAULT_USER_COL]: row['min'] for row in y}

    result = True
    for user, max_time in max_times.items():
        result = result and min_times[user] >= max_time

    return result
Example 11
    def test_bounded_simple(self):
        from pyspark.sql.functions import mean, max, min, count

        df = self.data
        w1 = self.sliding_row_window
        w2 = self.shrinking_range_window

        plus_one = self.python_plus_one
        count_udf = self.pandas_agg_count_udf
        mean_udf = self.pandas_agg_mean_udf
        max_udf = self.pandas_agg_max_udf
        min_udf = self.pandas_agg_min_udf

        result1 = df.withColumn('mean_v', mean_udf(plus_one(df['v'])).over(w1)) \
            .withColumn('count_v', count_udf(df['v']).over(w2)) \
            .withColumn('max_v',  max_udf(df['v']).over(w2)) \
            .withColumn('min_v', min_udf(df['v']).over(w1))

        expected1 = df.withColumn('mean_v', mean(plus_one(df['v'])).over(w1)) \
            .withColumn('count_v', count(df['v']).over(w2)) \
            .withColumn('max_v', max(df['v']).over(w2)) \
            .withColumn('min_v', min(df['v']).over(w1))

        self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
    #     sqlfuncs.avg('age').alias('age'), sqlfuncs.max('gender').alias('gender'),sqlfuncs.max('date').alias('last_consume'),\
    #     sqlfuncs.min('date').alias('first_consume')  )
    # konsum_user_agg.registerTempTable('user_agg')
    #
    #
    # print(konsum_user_agg.first())
    # konsum_user_agg.write.save('/home/erlenda/data/konsum/a_users_parquet')
    #
    #
    #
    # #reg_late=konsum_user.filter(konsum_user.reg_date<datetime.datetime(2015,11,16,0,0))
    #
    pvs=konsum_user.groupBy('a_virtual','a_user_key','timegroup','device').agg(sqlfuncs.sum(konsum_user.pv).alias("pvs"),\
                                                  sqlfuncs.sum(konsum_user.pv_bet).alias("pvs_bet"),\
                                                  sqlfuncs.max('date').alias('last_consume'),\
                                                  sqlfuncs.min('date').alias('first_consume'),\
                                                  sqlfuncs.sum(konsum_user.visits).alias("visits"))

    pprint(pvs.take(10))
    print()
    #print(pvs.take(100)[55])
    pvs_tot1=pvs.agg(sqlfuncs.sum(pvs.pvs)).first()
    print('Total after basic aggregation',pvs_tot1)

    pvs_mapped=pvs.rdd.map(lambda x:((x.a_user_key,x.a_virtual), (Counter({literal_eval(x.timegroup):x.pvs}),\
        Counter({literal_eval(x.timegroup):1}),\
        x.pvs,\
        x.pvs_bet,\
        Counter({x.device:x.pvs}) ) )  )

    pvs_reduced=pvs_mapped.reduceByKey(lambda a,b:(a[0]+b[0],a[1]+b[1],a[2]+b[2],a[3]+b[3],a[4]+b[4])   )
Example 13
content_size_summary_df = logs_df.describe(['content_size'])
content_size_summary_df.show()

# COMMAND ----------

# MAGIC %md
# MAGIC 
# MAGIC Alternatively, we can use SQL to directly calculate these statistics.  You can explore the many useful functions within the `pyspark.sql.functions` module in the [documentation](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#module-pyspark.sql.functions).
# MAGIC 
# MAGIC After we apply the `.agg()` function, we call `.first()` to extract the first value, which is equivalent to `.take(1)[0]`.

# COMMAND ----------

from pyspark.sql import functions as sqlFunctions
content_size_stats =  (logs_df
                       .agg(sqlFunctions.min(logs_df['content_size']),
                            sqlFunctions.avg(logs_df['content_size']),
                            sqlFunctions.max(logs_df['content_size']))
                       .first())

print('Using SQL functions:')
print('Content Size Avg: {1:,.2f}; Min: {0:.2f}; Max: {2:,.0f}'.format(*content_size_stats))

# COMMAND ----------

# MAGIC %md
# MAGIC ### (3b) Example: HTTP Status Analysis
# MAGIC 
# MAGIC Next, let's look at the status values that appear in the log. We want to know which status values appear in the data and how many times.  We again start with `logs_df`, then group by the `status` column, apply the `.count()` aggregation function, and sort by the `status` column.

# COMMAND ----------
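The cell that performs the status analysis is not included in this snippet; a minimal sketch of what the markdown above describes, assuming `logs_df` has a `status` column, could look like this:

status_count_df = (logs_df
                   .groupBy('status')
                   .count()
                   .sort('status'))
status_count_df.show()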
Example 14
    def all(self, axis: Union[int, str] = 0) -> bool:
        """
        Return whether all elements are True.

        Returns True unless there is at least one element within the series that is
        False or equivalent (e.g. zero or empty)

        Parameters
        ----------
        axis : {0 or 'index'}, default 0
            Indicate which axis or axes should be reduced.

            * 0 / 'index' : reduce the index, return a Series whose index is the
              original column labels.

        Examples
        --------
        >>> ks.Series([True, True]).all()
        True

        >>> ks.Series([True, False]).all()
        False

        >>> ks.Series([0, 1]).all()
        False

        >>> ks.Series([1, 2, 3]).all()
        True

        >>> ks.Series([True, True, None]).all()
        True

        >>> ks.Series([True, False, None]).all()
        False

        >>> ks.Series([]).all()
        True

        >>> ks.Series([np.nan]).all()
        True

        >>> df = ks.Series([True, False, None]).rename("a").to_frame()
        >>> df.set_index("a").index.all()
        False
        """
        axis = validate_axis(axis)
        if axis != 0:
            raise ValueError('axis should be either 0 or "index" currently.')

        sdf = self._internal._sdf.select(self._scol)
        col = scol_for(sdf, sdf.columns[0])

        # Note that we're ignoring `None`s here for now.
        # any and every were added as of Spark 3.0
        # ret = sdf.select(F.expr("every(CAST(`%s` AS BOOLEAN))" % sdf.columns[0])).collect()[0][0]
        # Here we use min as its alternative:
        ret = sdf.select(F.min(F.coalesce(col.cast('boolean'),
                                          F.lit(True)))).collect()[0][0]
        if ret is None:
            return True
        else:
            return ret
Example 15
 def min(scol):
     return F.when(
         F.row_number().over(self._window) >= self._min_periods,
         F.min(scol).over(self._window)).otherwise(F.lit(None))
Example 16
def test_end_to_end(spark, tmp_path, root, num_partitions):
    server_a_keys = json.loads((root / "server_a_keys.json").read_text())
    server_b_keys = json.loads((root / "server_b_keys.json").read_text())
    shared_seed = json.loads((root / "shared_seed.json").read_text())["shared_seed"]

    batch_id = "test_batch"
    n_data = 7
    n_rows = 20

    params_a = [
        batch_id.encode(),
        n_data,
        "A",
        server_a_keys["private_key"].encode(),
        b64decode(shared_seed),
        server_a_keys["public_key"].encode(),
        server_b_keys["public_key"].encode(),
    ]
    params_b = [
        batch_id.encode(),
        n_data,
        "B",
        server_b_keys["private_key"].encode(),
        b64decode(shared_seed),
        server_b_keys["public_key"].encode(),
        server_a_keys["public_key"].encode(),
    ]

    def show(df):
        df.show()
        return df

    def explain(df):
        df.explain()
        return df

    shares = (
        spark.createDataFrame(
            [Row(payload=[int(i % 3 == 0 or i % 5 == 0) for i in range(n_data)])]
            * n_rows
        )
        .select(
            F.pandas_udf(
                partial(
                    udf.encode,
                    batch_id.encode(),
                    n_data,
                    server_a_keys["public_key"].encode(),
                    server_b_keys["public_key"].encode(),
                ),
                returnType="a: binary, b: binary",
            )("payload").alias("shares")
        )
        .withColumn("id", F.udf(lambda: str(uuid4()), "string")())
    )
    shares.cache()
    shares.show()

    verify1_a = shares.select(
        "id",
        F.pandas_udf(partial(udf.verify1, *params_a), returnType="binary")(
            "shares.a"
        ).alias("verify1_a"),
    )

    verify1_b = shares.select(
        "id",
        F.pandas_udf(partial(udf.verify1, *params_b), returnType="binary")(
            "shares.b"
        ).alias("verify1_b"),
    )

    verify2_a = (
        shares.join(verify1_a, on="id")
        .join(verify1_b, on="id")
        .select(
            "id",
            F.pandas_udf(partial(udf.verify2, *params_a), returnType="binary")(
                "shares.a", "verify1_a", "verify1_b"
            ).alias("verify2_a"),
        )
    )

    verify2_b = (
        shares.join(verify1_a, on="id")
        .join(verify1_b, on="id")
        .select(
            "id",
            F.pandas_udf(partial(udf.verify2, *params_b), returnType="binary")(
                "shares.b", "verify1_b", "verify1_a"
            ).alias("verify2_b"),
        )
    )

    aggregate_a = (
        shares.join(verify2_a, on="id")
        .join(verify2_b, on="id")
        .select(
            F.col("shares.a").alias("shares"),
            F.col("verify2_a").alias("internal"),
            F.col("verify2_b").alias("external"),
        )
        # this only works if partition < 4GB
        .groupBy()
        .applyInPandas(
            lambda pdf: udf.aggregate(*params_a, pdf),
            schema="payload binary, error int, total int",
        )
        .groupBy()
        .applyInPandas(
            lambda pdf: udf.total_share(*params_a, pdf),
            schema="payload binary, error int, total int",
        )
    )

    aggregate_b = explain(
        show(
            shares.join(verify2_a, on="id")
            .join(verify2_b, on="id")
            .repartitionByRange(num_partitions, "id")
            .select(
                F.col("shares.b").alias("shares"),
                F.col("verify2_b").alias("internal"),
                F.col("verify2_a").alias("external"),
            )
            .withColumn("pid", F.spark_partition_id())
        )
        .groupBy("pid")
        .applyInPandas(
            lambda pdf: udf.aggregate(*params_b, pdf),
            schema="payload binary, error int, total int",
        )
        .groupBy()
        .applyInPandas(
            lambda pdf: udf.total_share(*params_b, pdf),
            schema="payload binary, error int, total int",
        )
    )

    def test_total_shares(aggregate):
        print(aggregate)
        assert len(aggregate.payload) > 0
        assert aggregate.error == 0
        assert aggregate.total == n_rows
        return True

    assert test_total_shares(aggregate_a.first())
    assert test_total_shares(aggregate_b.first())

    published = show(
        aggregate_a.withColumn("server", F.lit("internal"))
        .union(aggregate_b.withColumn("server", F.lit("external")))
        .groupBy()
        .pivot("server", ["internal", "external"])
        .agg(*[F.min(x).alias(x) for x in aggregate_a.columns])
    ).select(
        F.pandas_udf(partial(udf.publish, *params_a), returnType="array<int>")(
            "internal_payload", "external_payload"
        ).alias("payload"),
        F.col("internal_error").alias("error"),
        F.col("internal_total").alias("total"),
    )
    published.cache()
    assert published.count() == 1
    row = published.first()
    assert row.error == 0
    assert row.total == n_rows
    assert row.payload == [
        int(i % 3 == 0 or i % 5 == 0) * n_rows for i in range(n_data)
    ]
Example 17
#load stores ... I'm lazy and use string ids
stores_rdd=sc.textFile(base_path+"stores.txt").map(lambda x:x.split('\t')) # id | name
stores_fields=[
	StructField('id',StringType(),False), # name,type,nullable
	StructField('name',StringType(),False), 
]
stores=sqlCtx.createDataFrame(stores_rdd,StructType(stores_fields))

products_rdd=sc.textFile(base_path+"products.txt").map(lambda x:x.split('\t')) # id | name | category
products_fields=[
	StructField('id',StringType(),False), # name,type,nullable
	StructField('name',StringType(),False), 
	StructField('category',StringType(),True), 
]
products=sqlCtx.createDataFrame(products_rdd,StructType(products_fields))

sqlCtx.registerDataFrameAsTable(sales,"sales")
sqlCtx.registerDataFrameAsTable(stores,"stores")
sqlCtx.registerDataFrameAsTable(products,"products")

# can do SQL, including joins and GroupBy !!
sqlCtx.sql("SELECT * FROM sales sa join stores st on sa.store=st.id").show()
sqlCtx.sql("SELECT * FROM sales s join products p on s.product=p.id").show()

# .explain
# .agg
from pyspark.sql import functions as funcs

sales.groupBy('day').agg(funcs.min('store').alias('MinStore'),funcs.max('quantity').alias('MaxQty')).show()
# A slightly different way to generate the two random columns
df = sqlContext.range(0, 10).withColumn('uniform', rand(seed=10)).withColumn('normal', randn(seed=27))
#df.describe().show()
display(df.describe())


# COMMAND ----------

#df.describe('uniform', 'normal').show()
display(df.describe('uniform', 'normal'))

# COMMAND ----------

from pyspark.sql.functions import mean, min, max
#df.select([mean('uniform'), min('uniform'), max('uniform')]).show()
display(df.select([mean('uniform'), min('uniform'), max('uniform')]))

# COMMAND ----------

# MAGIC %md ### Sample covariance and correlation
# MAGIC 
# MAGIC Covariance is a measure of how two variables change with respect to each other. A positive number would mean that there is a tendency that as one variable increases, the other increases as well. A negative number would mean that as one variable increases, the other variable has a tendency to decrease. The sample covariance of two columns of a DataFrame can be calculated as follows:

# COMMAND ----------

from pyspark.sql.functions import rand
df = sqlContext.range(0, 10).withColumn('rand1', rand(seed=10)).withColumn('rand2', rand(seed=27))


# COMMAND ----------
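The covariance call itself is cut off in this snippet; a minimal sketch using the `df` defined above (`DataFrame.stat.cov` and `DataFrame.stat.corr` are the standard DataFrame API methods):

print(df.stat.cov('rand1', 'rand2'))   # sample covariance of the two random columns
print(df.stat.corr('rand1', 'rand2'))  # Pearson correlation, for comparison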
Example 19
  .show()
'''
Query d)
The users with the oldest and the most recent
creation dates, respectively
'''
'''
Required in order to use the to_date function
'''
from pyspark.sql.functions import *

df.select("*")\
  .where((to_date(df.CreationDate) ==
    df.select(
      min(
        to_date("CreationDate"))\
        .alias("min"))\
        .collect()[0].min) | (
        to_date(df.CreationDate) ==
          df.select(
            max(to_date("CreationDate"))\
            .alias("max"))\
            .collect()[0].max))\
  .orderBy(to_date("CreationDate"))\
  .show()
''' Comparing dates down to the millisecond '''
'''
Oldest user
'''
df.sort("CreationDate", ascending=False)\
  .limit(1)\
Example 20
def getAndInsertStationLimits(pStationId):
    stationLimits = daily[daily.station_id == pStationId].groupBy('station_id').agg(
        max('max_temp'), avg('max_temp'), min('max_temp'),
        max('max_pressure'), avg('max_pressure'), min('max_pressure'),
        max('min_pressure'), avg('min_pressure'), min('min_pressure'),
        max('med_temp'), avg('med_temp'), min('med_temp'),
        max('min_temp'), avg('min_temp'), min('min_temp'),
        max('precip'), avg('precip'), min('precip'),
        max('wind_med_vel'), avg('wind_med_vel'), min('wind_med_vel'),
        max('wind_streak'), avg('wind_streak'), min('wind_streak'))

    stationLimitsRenamed = stationLimits.select("max(max_temp)", "avg(max_temp)", "min(max_temp)", "max(max_pressure)",
                                                "avg(max_pressure)"
                                                , "min(max_pressure)", "max(med_temp)", "avg(med_temp)",
                                                "min(med_temp)", "max(min_temp)", "avg(min_temp)", "min(min_temp)",
                                                "max(precip)", "avg(precip)", "min(precip)", "max(wind_med_vel)",
                                                "avg(wind_med_vel)", "min(wind_med_vel)", "max(wind_streak)",
                                                "avg(wind_streak)", "min(wind_streak)", "max(min_pressure)",
                                                "avg(min_pressure)", "min(min_pressure)").withColumnRenamed(
        "max(max_temp)", "value1").withColumnRenamed(
        "avg(max_temp)", "value2").withColumnRenamed(
        "min(max_temp)", "value3").withColumnRenamed(
        "max(max_pressure)", "value4").withColumnRenamed(
        "avg(max_pressure)", "value5").withColumnRenamed(
        "min(max_pressure)", "value6").withColumnRenamed(
        "max(med_temp)", "value7").withColumnRenamed(
        "avg(med_temp)", "value8").withColumnRenamed(
        "min(med_temp)", "value9").withColumnRenamed(
        "max(min_temp)", "value10").withColumnRenamed(
        "avg(min_temp)", "value11").withColumnRenamed(
        "min(min_temp)", "value12").withColumnRenamed(
        "max(precip)", "value13").withColumnRenamed(
        "avg(precip)", "value14").withColumnRenamed(
        "min(precip)", "value15").withColumnRenamed(
        "max(wind_med_vel)", "value16").withColumnRenamed(
        "avg(wind_med_vel)", "value17").withColumnRenamed(
        "min(wind_med_vel)", "value18").withColumnRenamed(
        "max(wind_streak)", "value19").withColumnRenamed(
        "avg(wind_streak)", "value20").withColumnRenamed(
        "min(wind_streak)", "value21").withColumnRenamed(
        "max(min_pressure)", "value22").withColumnRenamed(
        "avg(min_pressure)", "value23").withColumnRenamed(
        "min(min_pressure)", "value24").collect()

    maxMaxTemp = stationLimitsRenamed[0].value1
    avgMaxTemp = stationLimitsRenamed[0].value2
    minMaxTemp = stationLimitsRenamed[0].value3
    maxMaxPressure = stationLimitsRenamed[0].value4
    avgMaxPressure = stationLimitsRenamed[0].value5
    minMaxPressure = stationLimitsRenamed[0].value6
    maxMedTemp = stationLimitsRenamed[0].value7
    avgMedTemp = stationLimitsRenamed[0].value8
    minMedTemp = stationLimitsRenamed[0].value9
    maxMinTemp = stationLimitsRenamed[0].value10
    avgMinTemp = stationLimitsRenamed[0].value11
    minMinTemp = stationLimitsRenamed[0].value12
    maxPrecip = stationLimitsRenamed[0].value13
    avgPrecip = stationLimitsRenamed[0].value14
    minPrecip = stationLimitsRenamed[0].value15
    maxWindMedVel = stationLimitsRenamed[0].value16
    avgWindMedVel = stationLimitsRenamed[0].value17
    minWindMedVel = stationLimitsRenamed[0].value18
    maxWindStreak = stationLimitsRenamed[0].value19
    avgWindStreak = stationLimitsRenamed[0].value20
    minWindStreak = stationLimitsRenamed[0].value21
    maxMinPressure = stationLimitsRenamed[0].value22
    avgMinPressure = stationLimitsRenamed[0].value23
    minMinPressure = stationLimitsRenamed[0].value24

    session.execute("INSERT INTO Station_limits (station_id,\"maxMaxTemp\",\"avgMaxTemp\",\"minMaxTemp\",\"maxMaxPressure\",\
        \"avgMaxPressure\",\"minMaxPressure\",\"maxMedTemp\",\"avgMedTemp\",\"minMedTemp\",\"maxMinTemp\",\"avgMinTemp\",\"minMinTemp\",\"maxPrecip\",\
        \"avgPrecip\",\"minPrecip\",\"maxWindMedVel\",\"avgWindMedVel\",\"minWindMedVel\",\"maxWindStreak\",\"avgWindStreak\",\"minWindStreak\",\"maxMinPressure\",\"avgMinPressure\",\"minMinPressure\") \
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
                    , [str(pStationId), maxMaxTemp, avgMaxTemp, minMaxTemp,
                       maxMaxPressure, avgMaxPressure, minMaxPressure,
                       maxMedTemp, avgMedTemp, minMedTemp,
                       maxMinTemp, avgMinTemp, minMinTemp,
                       maxPrecip, avgPrecip, minPrecip,
                       maxWindMedVel, avgWindMedVel, minWindMedVel,
                       maxWindStreak, avgWindStreak, minWindStreak,
                       maxMinPressure, avgMinPressure, minMinPressure])
Example 21
# MAGIC
# MAGIC **agg(\*exprs)**
# MAGIC
# MAGIC Aggregate on the entire DataFrame without groups.
# MAGIC
# MAGIC Execute the following two cells for a couple examples of the `agg` operation:

# COMMAND ----------

products.agg({"BasePrice": "max"}).show(1)

# COMMAND ----------

from pyspark.sql import functions as F

products.agg(F.min(products.BasePrice)).show(1)

# COMMAND ----------

# MAGIC %md
# MAGIC #### Where
# MAGIC
# MAGIC **where(**condition**)**
# MAGIC
# MAGIC Filters rows using the given condition.
# MAGIC
# MAGIC `where()` is an alias for `filter()`.
# MAGIC
# MAGIC Execute the following cell:

# COMMAND ----------
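The referenced cell does not appear in this snippet; a minimal sketch, assuming the `products` DataFrame used above has a numeric `BasePrice` column:

products.where(products.BasePrice > 100.0).show(5)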
Example 22
# In[67]:

sqlContext.sql("SELECT stringLengthString('test')").collect()

# In[68]:

df.agg({"age": "max"}).collect()

# In[69]:

from pyspark.sql import functions as F

# In[70]:

df.agg(F.min(df.age)).collect()

# In[71]:

from pyspark.sql.functions import *
df_as1 = df.alias("df_as1")
df_as2 = df.alias("df_as2")
joined_df = df_as1.join(df_as2,
                        col("df_as1.name") == col("df_as2.name"), 'inner')
joined_df.select("df_as1.name", "df_as2.name", "df_as2.age").collect()

# In[72]:

df.coalesce(1).rdd.getNumPartitions()

# In[73]:
Example 23
    def process_data(self):
        ##############################################################################
        # DECLARE VARIABLES
        ##############################################################################
        dt_range = self.study_dates("2020-07-30")
        dt = dt_range
        s1_bucket_name = 'b6-8f-fc-09-0f-db-50-3f-gpsdata'
        s1_initial_bucket_depth = 'cuebiq/daily-feed/US/'
        s1_bucket_output = 'cuebiq/daily-feed-reduced/US/'
        s2_bucket_name = 'b6-8f-fc-09-0f-db-50-3f-gpsdata'
        s2_initial_bucket_depth = 'cuebiq/daily-feed-reduced/US/'
        s2_bucket_output = 'cuebiq/processed-data/US/micro-clusters/'
        anchor_dist = 430
        time_thresh = 28800
        part_num = 9

        gps_schema = StructType([
            StructField("utc_timestamp", IntegerType(), True),
            StructField("device_id", StringType(), True),
            StructField("os", IntegerType(), True),
            StructField("latitude", FloatType(), True),
            StructField("longitude", FloatType(), True),
            StructField("accuracy", IntegerType(), True),
            StructField("tz_offset", IntegerType(), True)
        ])

        s2_gps_schema = StructType([
            StructField("utc_timestamp", IntegerType(), True),
            StructField("device_id", StringType(), True),
            StructField("os", IntegerType(), True),
            StructField("latitude", FloatType(), True),
            StructField("longitude", FloatType(), True),
            StructField("accuracy", IntegerType(), True),
            StructField("tz_offset", IntegerType(), True),
            StructField("row_number", IntegerType(), True)
        ])

        ##############################################################################
        # WINDOWS
        ##############################################################################
        w = Window().partitionBy('device_id').orderBy('utc_timestamp')
        l = Window().partitionBy('device_id',
                                 'lin_grp').orderBy('utc_timestamp')
        w2 = Window().partitionBy('device_id').orderBy('row_number')

        ##############################################################################
        # BEGIN DAILY ITERATION
        ##############################################################################

        print("Reading in files for {}".format(str(dt['study_dt'])[:10]))
        print("s3://{}/{}[{}|{}|{}]/*.gz".format(s1_bucket_name,
                                                 s1_initial_bucket_depth,
                                                 dt['s3_before'],
                                                 dt['s3_study_dt'],
                                                 dt['s3_after']))
        print("")

        #################################################################################################
        # START STEP 1
        #################################################################################################
        df1 = dataFrameReader \
            .options(header = 'false', delimiter = '\t', codec = 'gzip') \
            .schema(gps_schema) \
            .format("csv") \
            .load("/opt/spark/sample_data/daily-feed/US/2020729*/*.csv.gz")
        #.load("s3://" + s1_bucket_name + "/" + s1_initial_bucket_depth +  dt['s3_before'] +"/*.gz") # the day before

        df2 = dataFrameReader \
            .options(header = 'false', delimiter = '\t', codec = 'gzip') \
            .schema(gps_schema) \
            .format("csv") \
            .load("/opt/spark/sample_data/daily-feed/US/2020730*/*.csv.gz")
        #.load("s3://" + s1_bucket_name + "/" + s1_initial_bucket_depth +  dt['s3_study_dt'] +"/*.gz") # actual study date

        df3 = dataFrameReader \
            .options(header = 'false', delimiter = '\t', codec = 'gzip') \
            .schema(gps_schema) \
            .format("csv") \
            .load("/opt/spark/sample_data/daily-feed/US/2020731*/*.csv.gz")
        #.load("s3://" + s1_bucket_name + "/" + s1_initial_bucket_depth +  dt['s3_after'] +"/*.gz") # the day after

        # Union data from three inputs into 1 dataframe
        df = df1.union(df2).union(df3) \
            .repartition(part_num, 'device_id')

        del df1
        del df2
        del df3

        ##############################################################################
        # FILTER INITIAL JUNK RECORDS
        # Removes duplicated records (based on time and id), poor accuracy, bad coordinates, and timestamps outside of study range
        ##############################################################################
        df = df.na.drop(subset=['latitude','longitude','tz_offset','accuracy']) \
                    .filter(((df['accuracy'] >= 5) & (df['accuracy'] <= 65)) \
                            & ((~(df['latitude'] == 0)) | ~(df['longitude'] == 0)) \
                            & (df['utc_timestamp'] + df['tz_offset']) \
                                    .between(dt['utc_study_dt'], dt['utc_after'])) \
                    .dropDuplicates(['utc_timestamp','device_id'])

        ##############################################################################
        # EXCESSIVE SPEED REMOVAL
        ##############################################################################
        df = df.withColumn('dist_to',distance(df['latitude'], df['longitude'], lead(df['latitude'],1).over(w), \
                            lead(df['longitude'],1).over(w))) \
            .withColumn('sec_to', (lead(df['utc_timestamp'], 1).over(w) - df['utc_timestamp'])) \
            .withColumn('speed_to', rate_of_speed(col('dist_to'), col('sec_to'),'hour')) \
            .withColumn('dist_from', lag(col('dist_to'), 1).over(w)) \
            .withColumn('sec_from', lag(col('sec_to'), 1).over(w)) \
            .withColumn('speed_from', lag(col('speed_to'), 1).over(w)) \
            .filter(((col('dist_to').isNull()) | (col('dist_from').isNull())) \
                        | ((((col('speed_from') + col('speed_to')) / 2) <= 90) | ((col('dist_to') >= 150) | (col('dist_from') >= 150))) \
                        & ((col('speed_from') < 600) & (col('speed_to') < 600)) \
                        & ((col('speed_from') < 20) | (col('speed_to') < 20))) \
            .select('utc_timestamp', 'device_id', 'os', 'latitude', 'longitude', 'accuracy', 'tz_offset')

        ##############################################################################
        # LINEAR TRAVEL PING REMOVAL
        # Break pings out into groups of 4 to measure the linear distance
        ##############################################################################
        #Assign a record number and linear grouping and lead distance
        df = df.withColumn('RecordNum',row_number().over(w)) \
            .withColumn('lin_grp', py.ceil(row_number().over(w) / 4)) \
            .withColumn('dist_to', distance(df['latitude'], df['longitude'], \
                lead(df['latitude'],1).over(l), lead(df['longitude'],1).over(l),'meters'))

        # Create aggregated table for linear groupings
        expr = [py.min(col('utc_timestamp')).alias('min_utc_timestamp'),py.max(col('utc_timestamp')).alias('max_utc_timestamp'), \
            py.count(col('utc_timestamp')).alias('cnt'),py.sum(col('dist_to')).alias('sum_dist'),py.min(col('dist_to')).alias('min_dist')]

        dfl_grp = df.groupBy('device_id', 'lin_grp').agg(*expr)

        dfl_grp.createOrReplaceTempView('dfl_grp')
        df.createOrReplaceTempView('dfl')

        # Grab just the first and last records in each linear grouping and append aggregated info
        dfls = spark.sql(
            "SELECT a.utc_timestamp, a.device_id, a.os, a.latitude, a.longitude, a.accuracy, a.tz_offset, \
                    a.lin_grp, b.sum_dist, b.min_dist, b.cnt \
                    FROM dfl as a INNER JOIN dfl_grp as b \
                    ON a.device_id = b.device_id \
                    AND a.lin_grp = b.lin_grp \
                    AND a.utc_timestamp = b.min_utc_timestamp \
                    UNION ALL \
                    SELECT a.utc_timestamp, a.device_id, a.os, a.latitude, a.longitude, a.accuracy, a.tz_offset, \
                    a.lin_grp, b.sum_dist, b.min_dist, b.cnt \
                    FROM dfl as a INNER JOIN dfl_grp as b \
                    ON a.device_id = b.device_id \
                    AND a.lin_grp = b.lin_grp \
                    AND a.utc_timestamp = b.max_utc_timestamp")

        # Measure the distance between first and last in each linear grouping and compare to sum distance of all points
        # Only keep groups that meet criteria for being straight-line
        df_j = dfls.withColumn('strt_dist', distance(dfls['latitude'],dfls['longitude'], \
                    lead(dfls['latitude'],1).over(l), \
                    lead(dfls['longitude'],1).over(l), 'meters')) \
                .withColumn('lin',col('strt_dist') / dfls['sum_dist']) \
                .na.drop(subset=['strt_dist']) \
                .filter((dfls['min_dist'] > 0)  \
                    & (col('strt_dist').between(150, 2000)) \
                    & (dfls['cnt'] == 4) \
                    & (col('lin') >= .99825)) \
                .select('device_id','lin_grp', 'lin')

        # Outer join main dataframe to linears groups to filter non-linear pings
        df = df.join(df_j, ['device_id','lin_grp'], how='left_outer') \
            .filter(col('lin').isNull()) \
            .select('utc_timestamp','device_id', 'os', 'latitude', 'longitude', 'accuracy', 'tz_offset')

        del dfl_grp
        del dfls
        del df_j

        #######################################
        # CHAIN
        # Calculating the dynamic chain threshold to find proximate ping relationships
        #######################################
        df = df.withColumn('chain_dist', ((((df['accuracy'] + lead(df['accuracy'],1).over(w)) - 10) * (230 / 120) + 200))) \
            .withColumn('chain', when((distance(df['latitude'], df['longitude'], \
                            lead(df['latitude'],1).over(w), lead(df['longitude'], 1).over(w),'feet')) <= col('chain_dist'), 1)
                            .when((distance(df['latitude'], df['longitude'], \
                            lag(df['latitude'],1).over(w), lag(df['longitude'], 1).over(w),'feet')) <= lag(col('chain_dist'), 1).over(w), 1)) \
            .filter(col('chain') == 1) \
            .withColumn('row_number', row_number().over(w)) \
            .select('utc_timestamp','device_id', 'os', 'latitude', 'longitude', 'accuracy', 'tz_offset','row_number') \
            .persist()

        df \
            .repartition(100,'device_id').sortWithinPartitions('device_id','row_number') \
            .write \
            .csv(path="/opt/spark/sample_data/daily-feed-reduced/"+dt['s3_study_dt'], mode="append", compression="gzip", sep=",")
        #.csv(path="s3://" + s1_bucket_name + '/' + s1_bucket_output + dt['s3_study_dt'], mode="append", compression="gzip", sep=",")

        ##############################################################################################
        # START STEP 2
        ##############################################################################################

        print('Begin micro-clustering')

        # INITIALIZE ANCHOR TABLE - Create initial anchor start points based on row number = 1 and distance threshold
        self.df_dist = df.withColumn('tz_timestamp', df['utc_timestamp'] + df['tz_offset']) \
                        .withColumn('anchor', when(df['row_number'] == 1, col('tz_timestamp')) \
                                .when(distance(df['latitude'], df['longitude'], \
                                                lag(df['latitude'],1).over(w2),lag(df['longitude'],1).over(w2),'feet') \
                                            >= anchor_dist, col('tz_timestamp')) \
                                .when(col('tz_timestamp') - lag(col('tz_timestamp'),1).over(w2) >= time_thresh, col('tz_timestamp'))) \
                        .select('tz_timestamp','device_id','os','latitude','longitude','accuracy','row_number','anchor') \
                        .repartition(part_num, 'device_id') \
                        .persist()

        print('df_dist starting count = {}'.format(
            self.df_dist.count()))  # Materialize table for caching

        df.unpersist()
        del df

        #####################################################################################################
        # ITERATE THROUGH DATAFRAME ANCHOR PROCESS - iterations are broken out to speed up checkpointing
        # Checkpointing is used to chop off the physical plans of the dataframes that grow with each iteration
        ######################################################################################################
        df_anchor1 = self.anchor_func(3, 3)
        df_anchor2 = self.anchor_func(5, 5)
        df_anchor3 = self.anchor_func(12, 6)
        df_anchor4 = self.anchor_func(20, 5)
        df_anchor5 = self.anchor_func(30, 5)
        df_anchor6 = self.anchor_func(50, 5)
        df_anchor7 = self.anchor_func(80, 5, 1000000)
        df_anchor8 = self.anchor_func(1000, 5, 1000000)

        ##################################################################################################
        # Collect remaining pings to driver for Python analysis
        print('collect remaining pings')
        anchor_list = self.df_dist.rdd.map(lambda row: {'timestamp':row[0], 'device_id':row[1], 'latitude':row[3], \
                                                'longitude':row[4], 'anchor':row[7]}).collect()

        # Sort elements in list by device_id and timestamp
        anchor_list.sort(key=operator.itemgetter('device_id', 'timestamp'))

        # Python analysis on driver of final remaining pings
        print('iterate through remaining pings on driver')
        anchor_dr = []

        for r in anchor_list:
            if r['anchor'] is not None:
                anchor_dr.append(r)

            else:
                if anchor_dr[-1]['device_id'] == r['device_id']:
                    if distance_dr(r['latitude'],r['longitude'], \
                                anchor_dr[-1]['latitude'], \
                                anchor_dr[-1]['longitude'], 'feet') <= anchor_dist \
                                and r['timestamp'] - anchor_dr[-1]['timestamp'] < time_thresh:
                        anchor_dr.append({'timestamp':r['timestamp'], 'device_id':r['device_id'], \
                                        'latitude':anchor_dr[-1]['latitude'], 'longitude':anchor_dr[-1]['longitude'], \
                                        'anchor':anchor_dr[-1]['anchor']})

                    else:
                        r['anchor'] = r['timestamp']
                        anchor_dr.append(r)

        # Condense result table for dataframe distribution
        print('generate driver anchor table')
        new_anchor = []
        for r in anchor_dr:
            new_anchor.append([r['timestamp'], r['device_id'], r['anchor']])

        # Bring driver results back into a distributed dataframe and join results
        print('disperse driver anchor table back to cluster')
        new_anchor_schema = StructType([
            StructField('tz_timestamp', IntegerType(), True),
            StructField('device_id', StringType(), True),
            StructField('anchor', IntegerType(), True)
        ])

        df_anchor_dr = spark.createDataFrame(new_anchor,new_anchor_schema) \
                        .repartition(part_num, 'device_id')

        # Join remaining anchors to main analysis table
        self.df_dist = self.df_dist.select('tz_timestamp','device_id','os','latitude','longitude', \
                                'accuracy','row_number') \
                            .join(df_anchor_dr,['tz_timestamp','device_id'])

        # Union all anchor tables together and sort
        print('finalizing anchor results into central table')
        df_anchors_fnl = df_anchor1.union(df_anchor2).union(df_anchor3).union(df_anchor4).union(df_anchor5) \
                            .union(df_anchor6).union(df_anchor7).union(df_anchor8).union(self.df_dist) \
                            .repartition(part_num,'device_id') \
                            .persist()

        self.df_dist.unpersist()

        #######################################################################################
        # Calculate centroids
        #######################################################################################
        print('start calculating centroids')
        # Get max accuracy value for each micro-cluster and filter clusters with fewer than 2 pings
        df_anchor_grp = df_anchors_fnl.groupBy('device_id','anchor').agg(*[py.max(col('accuracy')).alias('max_accuracy'), \
                                                                        py.count(col('tz_timestamp')).alias('cnt')]) \
                                    .withColumn('max_acc_1', col('max_accuracy') + 1) \
                                    .filter(col('cnt') > 1) \
                                    .select('device_id','anchor','max_acc_1','cnt')

        # Calculate the numerator for each micro-cluster
        df_anchors_fnl = df_anchors_fnl.join(df_anchor_grp, ['device_id','anchor']) \
                                        .withColumn('nom',col('max_acc_1') - col('accuracy'))

        df_denom = df_anchors_fnl.groupBy(
            'device_id', 'anchor').agg(*[py.sum(col('nom')).alias('denom')])


        df_anchors_fnl = df_anchors_fnl.join(df_denom, ['device_id','anchor']) \
                            .withColumn('weight', df_anchors_fnl['nom'] / df_denom['denom']) \
                            .withColumn('lat', df_anchors_fnl['latitude'] * col('weight')) \
                            .withColumn('lon', df_anchors_fnl['longitude'] * col('weight'))


        expr = [py.sum(col('lat')).alias('new_latitude'), py.sum(col('lon')).alias('new_longitude'), \
                    py.avg(col('latitude')).alias('avg_latitude'), py.avg(col('longitude')).alias('avg_longitude'), \
                    py.count(col('tz_timestamp')).alias('cluster_png_cnt'), py.first(col('os')).alias('os'), \
                    py.min(col('tz_timestamp')).alias('start_timestamp'), py.max(col('tz_timestamp')).alias('end_timestamp'), \
                    py.avg(col('accuracy')).alias('avg_accuracy')]

        df_micro = df_anchors_fnl.groupBy('device_id','anchor').agg(*expr) \
                                .withColumn('fnl_lat', (col('new_latitude') * (3/4)) + (col('avg_latitude') * (1/4))) \
                                .withColumn('fnl_lon', (col('new_longitude') * (3/4)) + (col('avg_longitude') * (1/4))) \
                                .withColumn('geohash9', geohash_udf_9(col('fnl_lat'), col('fnl_lon'))) \
                                .withColumn('dwell_seconds', col('end_timestamp') - col('start_timestamp')) \
                                .withColumn('start_tm', py.from_unixtime(col('start_timestamp'))) \
                                .withColumn('end_tm', py.from_unixtime(col('end_timestamp'))) \
                                .filter(col('dwell_seconds') > 1) \
                                .select('device_id','os','start_tm','end_tm', \
                                        'dwell_seconds','cluster_png_cnt', col('fnl_lat').alias('latitude'), \
                                        col('fnl_lon').alias('longitude'), 'geohash9', 'avg_accuracy')


        df_micro \
                .repartition(100,'device_id').sortWithinPartitions('device_id','start_tm') \
                .write \
                .csv(path="/opt/spark/sample_data/processed-data/" + dt['s3_study_dt'], mode="append", compression="gzip", sep=",")
        #.csv(path="s3://" + s2_bucket_name + '/' + s2_bucket_output + dt['s3_study_dt'], mode="append", compression="gzip", sep=",")

        df_anchors_fnl.unpersist()

        return
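
# Illustrative sketch (not part of the original job): the centroid weighting
# above assigns each ping the weight (max_accuracy + 1 - accuracy) / sum(nom),
# so pings with a smaller accuracy radius pull the centroid harder; the final
# coordinates then blend this weighted mean (3/4) with the plain mean (1/4).
# The toy values below are assumptions, purely to show the arithmetic.
pings = [
    {'lat': 40.7128, 'lon': -74.0060, 'accuracy': 5.0},
    {'lat': 40.7130, 'lon': -74.0058, 'accuracy': 50.0},
]
max_acc_1 = max(p['accuracy'] for p in pings) + 1            # 51.0
noms = [max_acc_1 - p['accuracy'] for p in pings]            # [46.0, 1.0]
denom = sum(noms)                                            # 47.0
weights = [n / denom for n in noms]
weighted_lat = sum(w * p['lat'] for w, p in zip(weights, pings))
weighted_lon = sum(w * p['lon'] for w, p in zip(weights, pings))
fnl_lat = weighted_lat * (3 / 4) + sum(p['lat'] for p in pings) / len(pings) * (1 / 4)
fnl_lon = weighted_lon * (3 / 4) + sum(p['lon'] for p in pings) / len(pings) * (1 / 4)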
Esempio n. 24
0
## Question 7 - What is the mean of the Close column ?

### Python
print("The mean of the Close column is :")
data.select(col("Close"))\
    .agg(avg(col("Close")).alias("Mean"))\
    .show()

### SQL
print("The mean of the Close column is :")
spark.sql("""select mean(Close) as Mean from data_SQL""").show()

## Question 8 - What is the max and min of the Volume column ?

### Python
data.select([min("Volume").alias("Minimum_volume"), max("Volume").alias("Maximum_volume")])\
    .show()

### SQL
spark.sql(
    """select min(Volume) as `Minimum_volume`, max(Volume) as `Maximum_volume` from data_SQL"""
).show()

## Question 9 - How many days was the Close lower than 60 dollars ?

### Python
data.filter(col("Close") < 60)\
    .agg(count(col("Date")).alias("DaysNumber"))\
    .show()

### SQL
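# A plausible completion (the original snippet is cut off here); it mirrors the
# Python version above and assumes the same `data_SQL` temp view.
spark.sql("""select count(Date) as DaysNumber from data_SQL where Close < 60""").show()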
Esempio n. 25
0
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

if __name__ == "__main__":
    spark = SparkSession.builder.master("local").appName("pyspark homework").getOrCreate()
    file_path = "hdfs:///dataset/bank-data.csv"
    df = spark.read.csv(path=file_path, header=True, inferSchema=True)

    df.groupBy("sex").agg(F.min("income"), F.max("income"), F.mean("income")).show()

    df.groupBy("region").agg({"income": "mean"}).show()
Esempio n. 26
0
    def doRenderMpld3(self, handlerId, figure, axes, keyFields, keyFieldValues, keyFieldLabels, valueFields, valueFieldValues):
        allNumericCols = self.getNumericalFieldNames()
        if len(allNumericCols) == 0:
            self._addHTML("Unable to find a numerical column in the dataframe")
            return
        
                 
        keyFields = self.options.get("keyFields")
        valueField = self.options.get("valueFields")

        if(keyFields==None and valueField==None):
            keyFields=self.getFirstStringColInfo()
            valueField=self.getFirstNumericalColInfo() 
        else:
            keyFields = keyFields.split(',') 
            valueField = valueField.split(',') 
            if(len(valueField) > 1):
                self._addHTML("You can enter only have one value field for Bar Charts (2-D)"+str(len(valueField)))
                return
            keyFields = keyFields[0]
            valueField=valueField[0]
        
                
        #if(len(valueFields>)):


    
        #init
        fig=figure
        ax=axes
        
        #fig, ax = plt.subplots()   
        #fig = plt.figure()
        

        params = plt.gcf()
        plSize = params.get_size_inches()
        params.set_size_inches( (plSize[0]*2, plSize[1]*2) )


        agg=self.options.get("aggregation")
        groupByCol=self.options.get("groupByCol")
        
        if (agg=="None" or agg==None):
            colLabel = keyFields
            y = self.entity.select(valueField).toPandas()[valueField].dropna().tolist()
            x_intv = np.arange(len(y))
            labels =  self.entity.select(keyFields).toPandas()[keyFields].dropna().tolist()
            plt.xticks(x_intv,labels)
            plt.xlabel(keyFields, fontsize=18)
            plt.ylabel(valueField, fontsize=18)
        elif(agg=='AVG'):
            y1=self.entity.groupBy(keyFields).agg(F.avg(valueField).alias("avg")).toPandas().sort_values(by=keyFields)
            y=y1["avg"].dropna().tolist()
            x_intv = np.arange(len(y))
            labels=y1[keyFields].dropna().tolist()
            plt.xticks(x_intv,labels)
            plt.xlabel(keyFields, fontsize=18)
            plt.ylabel("Average "+valueField, fontsize=18)
        elif(agg=='SUM'):
            y1=self.entity.groupBy(keyFields).agg(F.sum(valueField).alias("sum")).toPandas().sort_values(by=keyFields)
            y=y1["sum"].dropna().tolist()
            x_intv = np.arange(len(y))
            labels=y1[keyFields].dropna().tolist()
            plt.xticks(x_intv,labels)
            plt.xlabel(keyFields, fontsize=18)
            plt.ylabel("sum "+valueField, fontsize=18)
        elif(agg=='MAX'):
            y1=self.entity.groupBy(keyFields).agg(F.max(valueField).alias("max")).toPandas().sort_values(by=keyFields)
            y=y1["max"].dropna().tolist()
            x_intv = np.arange(len(y))
            labels=y1[keyFields].dropna().tolist()
            plt.xticks(x_intv,labels)
            plt.xlabel(keyFields, fontsize=18)
            plt.ylabel("max "+valueField, fontsize=18)
        elif(agg=='MIN'):
            y1=self.entity.groupBy(keyFields).agg(F.min(valueField).alias("min")).toPandas().sort_values(by=keyFields)
            y=y1["min"].dropna().tolist()
            x_intv = np.arange(len(y))
            labels=y1[keyFields].dropna().tolist()
            plt.xticks(x_intv,labels)
            plt.xlabel(keyFields, fontsize=18)
            plt.ylabel("min "+valueField, fontsize=18)
        elif(agg=='COUNT'):
            y1=self.entity.groupBy(keyFields).agg(F.count(valueField).alias("count")).toPandas().sort_values(by=keyFields)
            y=y1["count"].dropna().tolist()
            x_intv = np.arange(len(y))
            labels=y1[keyFields].dropna().tolist()
            plt.xticks(x_intv,labels)
            plt.xlabel(keyFields, fontsize=18)
            plt.ylabel("count "+valueField, fontsize=18)

        mpld3.enable_notebook()      
        plt.bar(x_intv,y,color="blue",alpha=0.5)
        ax_fmt = BarChart(labels)
        mpld3.plugins.connect(fig, ax_fmt)
Esempio n. 27
0
iotmsgsRDD = sqlContext.read.json(js)
iotmsgsRDD.registerTempTable("iotmsgsTable")

print("JSON converted to DataFrame of casted floating point numbers")
sqlContext.sql("select distinct cast(payload.data.temperature as float) \
  from iotmsgsTable order by temperature desc").show()

print("DataFrame showing automated 'describe' summary of floating points")
sqlContext.sql("select distinct cast(payload.data.temperature as float) \
  from iotmsgsTable order by temperature desc").describe().show()

print("DataFrame of selected SQL dataframe functions")
temperatureDF = sqlContext.sql("select distinct cast(payload.data.temperature \
  as float) from iotmsgsTable order by temperature desc")
functionsDF = temperatureDF.select([mean('temperature'), min('temperature'), \
  max('temperature')])
print(type(functionsDF))
print(functionsDF)
functionsDF.printSchema()
functionsDF.show()

# Collect a List of Rows of data from the DataFrame
print("Extracted List of Rows of selected SQL dataframe function")
functionsList = temperatureDF.select([mean('temperature'), min('temperature'), \
  max('temperature')]).collect()
print(type(functionsList))
print(functionsList)
print()

# Collect max temperature from Row #1 of the DataFrame
Esempio n. 28
0
sc = SparkContext(conf = conf)
sqlcontext = SQLContext(sc)

# 1. Create a DataFrame with one int column and 10 rows.
df = sqlcontext.range(0, 10)
df.show()

# Generate two other columns using uniform distribution and normal distribution.
df.select("id", rand(seed=10).alias("uniform"), randn(seed=27).alias("normal"))
df.show()

# 2. Summary and Descriptive Statistics
df = sqlcontext.range(0, 10).withColumn('uniform', rand(seed=10)).withColumn('normal', randn(seed=27))
df.describe('uniform', 'normal').show()

df.select([mean('uniform'), min('uniform'), max('uniform')]).show()

# 3. Sample covariance and correlation
# Covariance is a measure of how two variables change with respect to each other. 
# A positive number would mean that there is a tendency that as one variable increases, 
# the other increases as well. 
# A negative number would mean that as one variable increases, 
# the other variable has a tendency to decrease.
df = sqlcontext.range(0, 10).withColumn('rand1', rand(seed=10)).withColumn('rand2', rand(seed=27))
df.stat.cov('rand1', 'rand2')
df.stat.cov('id', 'id')

# Correlation is a normalized measure of covariance that is easier to understand, 
# as it provides quantitative measurements of the statistical dependence between two random variables.
df.stat.corr('rand1', 'rand2')
df.stat.corr('id', 'id')
Esempio n. 29
0
    def _downsample(self, f: str) -> DataFrame:
        """
        Downsample the defined function.

        Parameters
        ----------
        how : string / mapped function
        **kwargs : kw args passed to how function
        """

        # a simple example to illustrate the computation:
        #   dates = [
        #         datetime.datetime(2012, 1, 2),
        #         datetime.datetime(2012, 5, 3),
        #         datetime.datetime(2022, 5, 3),
        #   ]
        #   index = pd.DatetimeIndex(dates)
        #   pdf = pd.DataFrame(np.array([1,2,3]), index=index, columns=['A'])
        #   pdf.resample('3Y').max()
        #                 A
        #   2012-12-31  2.0
        #   2015-12-31  NaN
        #   2018-12-31  NaN
        #   2021-12-31  NaN
        #   2024-12-31  3.0
        #
        # in this case:
        # 1, obtain one origin point to bin all timestamps, we can get one (2009-12-31)
        # from the minimum timestamp (2012-01-02);
        # 2, the default intervals for 'Y' are right-closed, so intervals are:
        # (2009-12-31, 2012-12-31], (2012-12-31, 2015-12-31], (2015-12-31, 2018-12-31], ...
        # 3, bin all timestamps, for example, 2022-05-03 belongs to interval
        # (2021-12-31, 2024-12-31], since the default label is 'right', label it with the right
        # edge 2024-12-31;
        # 4, some intervals may be too large for this downsampling, so we need to pad the dataframe
        # to avoid missing some results, like: 2015-12-31, 2018-12-31 and 2021-12-31;
        # 5, union the binned dataframe and padded dataframe, and apply aggregation 'max' to get
        # the final results;

        # one action to obtain the range, in the future we may cache it in the index.
        ts_min, ts_max = (self._psdf._internal.spark_frame.select(
            F.min(self._resamplekey_scol),
            F.max(self._resamplekey_scol)).toPandas().iloc[0])

        # the logic to obtain an origin point to bin the timestamps is too complex to follow,
        # here just use Pandas' resample on a 1-length series to get it.
        ts_origin = (pd.Series([0],
                               index=[ts_min
                                      ]).resample(rule=self._offset.freqstr,
                                                  closed=self._closed,
                                                  label="left").sum().index[0])
        assert ts_origin <= ts_min

        bin_col_name = "__tmp_resample_bin_col__"
        bin_col_label = verify_temp_column_name(self._psdf, bin_col_name)
        bin_col_field = InternalField(
            dtype=np.dtype("datetime64[ns]"),
            struct_field=StructField(bin_col_name, TimestampType(), True),
        )
        bin_scol = self._bin_time_stamp(
            ts_origin,
            self._resamplekey_scol,
        )

        agg_columns = [
            psser for psser in self._agg_columns
            if (isinstance(psser.spark.data_type, NumericType))
        ]
        assert len(agg_columns) > 0

        # in the binning side, label the timestamps according to the origin and the freq(rule)
        bin_sdf = self._psdf._internal.spark_frame.select(
            F.col(SPARK_DEFAULT_INDEX_NAME),
            bin_scol.alias(bin_col_name),
            *[psser.spark.column for psser in agg_columns],
        )

        # in the padding side, insert necessary points
        # again, directly apply Pandas' resample on a 2-length series to obtain the indices
        pad_sdf = (ps.from_pandas(
            pd.Series([0, 0], index=[ts_min, ts_max]).resample(
                rule=self._offset.freqstr,
                closed=self._closed,
                label=self._label).sum().index)._internal.spark_frame.select(
                    F.col(SPARK_DEFAULT_INDEX_NAME).alias(bin_col_name)).where(
                        (ts_min <= F.col(bin_col_name))
                        & (F.col(bin_col_name) <= ts_max)))

        # union the above two spark dataframes.
        sdf = bin_sdf.unionByName(
            pad_sdf,
            allowMissingColumns=True).where(~F.isnull(F.col(bin_col_name)))

        internal = InternalFrame(
            spark_frame=sdf,
            index_spark_columns=[scol_for(sdf, SPARK_DEFAULT_INDEX_NAME)],
            data_spark_columns=[F.col(bin_col_name)] + [
                scol_for(sdf, psser._internal.data_spark_column_names[0])
                for psser in agg_columns
            ],
            column_labels=[bin_col_label] +
            [psser._column_label for psser in agg_columns],
            data_fields=[bin_col_field] + [
                psser._internal.data_fields[0].copy(nullable=True)
                for psser in agg_columns
            ],
            column_label_names=self._psdf._internal.column_label_names,
        )
        psdf: DataFrame = DataFrame(internal)

        groupby = psdf.groupby(psdf._psser_for(bin_col_label), dropna=False)
        downsampled = getattr(groupby, f)()
        downsampled.index.name = None

        return downsampled
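
# A minimal usage sketch (assuming a Spark build whose pyspark.pandas exposes
# DataFrame.resample): it reproduces the pandas example from the comment above
# on a pandas-on-Spark frame, which routes through the _downsample logic.
import datetime

import numpy as np
import pandas as pd
import pyspark.pandas as ps

dates = [
    datetime.datetime(2012, 1, 2),
    datetime.datetime(2012, 5, 3),
    datetime.datetime(2022, 5, 3),
]
pdf = pd.DataFrame(np.array([1, 2, 3]), index=pd.DatetimeIndex(dates), columns=['A'])
psdf = ps.from_pandas(pdf)
psdf.resample('3Y').max().sort_index()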
Esempio n. 30
0
    def get_bins(self, column_name, num_bins=10, split_points=None):
        """
        Finds the number of items in each bin. Only one of the params num_bins or split_points needs to be supplied.

        :param column_name: column to be binned
        :param num_bins:    number of bins to create
        :param split_points:    list of boundary points [a, b, c, ...] such that
                                all values in the range [a, b) are assigned to the first bucket,
                                [b, c) to the second, and so on
        :return:
        """
        if column_name not in self._numeric_columns:
            raise BIException.column_does_not_exist(column_name)

        # min/max of the column are needed both to replace the +/-Inf end points
        # of the splits and for the single-valued-column case further below
        min_max = self._data_frame.agg(
            FN.min(column_name).alias('min'),
            FN.max(column_name).alias('max')).collect()
        min_value = min_max[0]['min']
        max_value = min_max[0]['max']
        if split_points is None:
            quantile_discretizer = QuantileDiscretizer(numBuckets=num_bins,
                                                       inputCol=column_name,
                                                       outputCol='buckets',
                                                       relativeError=0.01)
            bucketizer = quantile_discretizer.fit(self._data_frame)
            # splits have these values [-Inf, Q1, Median, Q3, Inf]
            splits = bucketizer.getSplits()
        else:
            splits = split_points
        # replace the infinite end points so every value falls in a finite bin
        splits[0] = min_value - 0.1
        splits[-1] = max_value + 0.1
        # cast column_name to double type if needed, otherwise Bucketizer does not work
        column_df = None
        if self._column_data_types.get(column_name) != DoubleType:
            column_df = self._data_frame.select(
                FN.col(column_name).cast('double').alias('values'))
        else:
            column_df = self._data_frame.select(
                FN.col(column_name).alias('values'))

        bucketizer = Bucketizer(inputCol='values', outputCol='bins')
        bucketizer.setSplits(splits)
        if min_value == max_value:
            histogram = Histogram(column_name, self._num_rows)
            bin_number = 0
            start_value = min_value - 0.5
            end_value = max_value + 0.5
            histogram.add_bin(bin_number, start_value, end_value,
                              self._num_rows)
        else:
            buckets_and_counts = bucketizer.transform(column_df).groupBy(
                'bins').agg({
                    '*': 'count'
                }).collect()
            histogram = Histogram(column_name, self._num_rows)
            for row in buckets_and_counts:
                bin_number = int(row[0])
                start_value = splits[bin_number]
                end_value = splits[bin_number + 1]
                histogram.add_bin(
                    bin_number, start_value, end_value,
                    float(row[1]) * 100.0 / (end_value - start_value))

        return histogram
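
# Standalone sketch of the same bucketize-then-count idea used above; the
# `spark` session and the generated example column are assumptions, not part
# of the original class.
from pyspark.ml.feature import Bucketizer, QuantileDiscretizer
from pyspark.sql import functions as FN

values_df = spark.range(0, 100).select(FN.col('id').cast('double').alias('values'))
splits = QuantileDiscretizer(numBuckets=10, inputCol='values', outputCol='buckets',
                             relativeError=0.01).fit(values_df).getSplits()
binned = Bucketizer(inputCol='values', outputCol='bins', splits=splits).transform(values_df)
binned.groupBy('bins').count().orderBy('bins').show()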
Esempio n. 31
0
    # the new column objects in a `dict` of the form {'new_col_name':
    # new_col_object}, and add all the columns to `df_agg` in a loop at the end.
    #
    # We name the new columns with a suffix like `_2d` indicating that the
    # column is an aggregation of days up to 2 days back in time
    #
    # These summary columns are generally some aggregation function of the
    # corresponding `_0d` summary, over the new sliding window:
    # `f.some_func(f.col(`feat_some_func_0d`)).over(window).
    # We show a few simple examples, feel free to add your own!
    feat = settings['feature'] + '_'  # convenience
    dd = '_{}d'.format(days)  # column name suffix indicating window size
    new_cols = {
        # min = min of daily mins
        feat + 'min' + dd:
        f.min(f.col(feat + 'min_0d')).over(window),

        # max = max of daily maxes
        feat + 'max' + dd:
        f.max(f.col(feat + 'max_0d')).over(window),

        # sum = sum of daily sums
        feat + 'sum' + dd:
        f.sum(f.col(feat + 'sum_0d')).over(window),

        # count = sum of daily counts
        feat + 'count' + dd:
        f.sum(f.col(feat + 'count_0d')).over(window),

        # A few more complicated examples:
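        # (illustrative addition, not in the original snippet)
        # mean = total of daily sums divided by total of daily counts
        feat + 'mean' + dd:
        f.sum(f.col(feat + 'sum_0d')).over(window)
        / f.sum(f.col(feat + 'count_0d')).over(window),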
Esempio n. 32
0
                       'float'), 2).alias('Close'),
                   result_desc['Volume'].cast('int').alias('Volume')
                   ).show()

hv_ratio = walmartdf.withColumn(
    "HV ratio", walmartdf['High'] / walmartdf['Volume'])

hv_ratio.select('HV Ratio').show()

# finding highest value date
walmartdf.orderBy(walmartdf['High'].desc()).head(1)[0][0]


walmartdf.agg(mean(walmartdf['Close'])).show()

walmartdf.select(max(walmartdf['Volume']), min('Volume')).show()

walmartdf.filter(walmartdf['Close'] < 60).count()

(walmartdf.filter(walmartdf['High'] > 80).count(
) / walmartdf.agg(count(walmartdf['Date'])).head(1)[0][0]) * 100

newdf = walmartdf.withColumn("year", year(walmartdf['Date']))

newdf.groupby("year").max().select('year', 'max(High)').show()

newdf2 = walmartdf.withColumn("month", month(walmartdf['Date']))

newdf2.groupBy("month").mean().select(
    'month', 'avg(Close)').orderBy('month').show()
Esempio n. 33
0
def group_by_day(df,
                 *,
                 spark_session,
                 group_keys,
                 from_date_field,
                 to_date_field,
                 join_partitions=100,
                 count_partitions=3,
                 min_cutoff_date=None,
                 max_cutoff_date=datetime.date.today(),
                 drop_date_fields=True):
    """Count entries per day grouped by the keys in group_keys.

    Args:
        df (DataFrame): Incoming spark dataframe.

    Kwargs:
        spark_session (SparkSession): Spark session used to build date
        dataframe.
        group_keys (list): List of strings with keys to group by.
        from_date_field (str): From data field name.
        to_date_field (str): To data field name.
        join_partitions (int): Number of partitions for the join with dates.
        count_partitions (int): Number of partitions after the count.
        min_cutoff_date (datetime.date): Drop all data before this date.
        max_cutoff_date (datetime.date): Drop all data after this date,
        can often be today.
        drop_date_fields (bool): If True drop original from_date and to_date
        fields from result.
    Returns:
       Spark DataFrame with grouped counts per key combination per day.

    Example input:
    DataFrame
    +-------------+---------+-----------+-----------+---------+--------+
    |customer_id  |prod_code|shop_code  |channel    |from_date| to_date|
    +-------------+---------+-----------+-----------+---------+--------+
    |         1234|      ABC|      SHOP1|          A| 20160606|20160610|
    |         3456|      ABC|      SHOP1|          A| 20160607|20160611|
    +-------------+---------+-----------+-----------+---------+--------+
    group_keys=['prod_code', 'shop_code', 'channel']
    from_date_field='from_date'
    to_date_field='to_date'
    drop_date_fields=True

    Example output:
    DataFrame
    +--------+-----------+------------+---------+-----+
    | datenum|channel    |shop_code   |prod_code|count|
    +--------+-----------+------------+---------+-----+
    |20160606|          A|       SHOP1|      ABC|    1|
    |20160607|          A|       SHOP1|      ABC|    2|
    +--------+-----------+------------+---------+-----+
    """
    df_with_dates = df\
        .withColumn('date_from', F.col(from_date_field))\
        .withColumn('date_to', F.col(to_date_field))

    dates_range = df_with_dates.agg(F.min(F.col("date_from")),
                                    F.max(F.col("date_to"))).collect()

    all_dates = all_dates_between(dates_range[0][0], dates_range[0][1],
                                  min_cutoff_date, max_cutoff_date)
    dates_df = spark_session.createDataFrame(all_dates, T.DateType())
    between = dates_df['value'].between(df_with_dates['date_from'],
                                        df_with_dates['date_to'])
    result = df_with_dates.repartition(join_partitions)\
                          .join(dates_df, on=between)

    datenum_type = F.date_format(F.col('datenum'), 'yyyyMMdd').cast("int")  # 'yyyy' = calendar year ('YYYY' is week-based)
    result = result\
        .withColumn('datenum', result['value'])\
        .withColumn('datenum', datenum_type)\
        .drop('value')\
        .drop('date_from')\
        .drop('date_to')

    if drop_date_fields:
        result = result.drop(from_date_field).drop(to_date_field)

    new_group_keys = group_keys[:]
    new_group_keys.append('datenum')
    return result.groupBy(new_group_keys)\
                 .count().repartition(count_partitions)
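
# Hypothetical usage, mirroring the docstring's example; the input DataFrame
# `df` and the `spark` session are assumptions.
daily_counts = group_by_day(
    df,
    spark_session=spark,
    group_keys=['prod_code', 'shop_code', 'channel'],
    from_date_field='from_date',
    to_date_field='to_date',
)
daily_counts.show()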
Esempio n. 34
0
def lift_splitted(sqc,
                  query,
                  target='true_target',
                  proba='target_proba',
                  split_by={'model_name', 'business_dt'},
                  cost=None,
                  n_buckets=100):
    """
    Calculate lift function over splits. Requires sqlContext
    Input:
        sqc - sqlContext of spark session
        query - query Dataframe with true_target, target_proba and split columns
        split_by - list of columns to calculate lift independently
        target - binary actual values to predict
        cost - optional, charges
        proba - probabilities of target = 1
        n_buckets - number of buckets to bin lift function
    Output:
        pdf - pandas DataFrame with cumulative lift and coverage
    """
    import pandas as pd
    import pyspark.sql.functions as F

    sqc.sql(query).registerTempTable("tmp_lift_splitted")
    # generate ntiles
    if cost is not None:
        sql = """select {sb}, CAST({target} AS INT) {target}, {proba}, {cost},
                 NTILE({nb}) OVER (PARTITION BY {sb} ORDER BY {proba} DESC) as tile
                 FROM tmp_lift_splitted t
              """.format(sb=split_by,
                         target=target,
                         proba=proba,
                         cost=cost,
                         nb=n_buckets)
    else:
        sql = """select {sb}, CAST({target} AS INT) {target}, {proba},
                 NTILE({nb}) OVER (PARTITION BY {sb} ORDER BY {proba} DESC) as tile
                 FROM tmp_lift_splitted t
              """.format(sb=split_by,
                         target=target,
                         proba=proba,
                         cost=cost,
                         nb=n_buckets)

    sdf = sqc.sql(sql)
    if cost is not None:
        pdf = sdf.groupby(split_by.union({'tile'})).agg(
            F.sum(F.col(target)).alias("target_sum"),
            F.count(F.col(target)).alias("target_cnt"),
            F.min(F.col(proba)).alias("target_proba_min"),
            F.max(F.col(proba)).alias("target_proba_max"),
            F.sum(F.col(cost)).alias("cost_sum"),
            F.sum(F.col(cost).isNotNull().cast('integer')).alias(
                "cost_cnt")).toPandas()
        pdf['cost_sum'] = pdf['cost_sum'].astype(float)
    else:
        pdf = sdf.groupby(split_by.union({'tile'})).agg(
            F.sum(F.col(target)).alias("target_sum"),
            F.count(F.col(target)).alias("target_cnt"),
            F.min(F.col(proba)).alias("target_proba_min"),
            F.max(F.col(proba)).alias("target_proba_max")).toPandas()

    if 'business_dt' in split_by:
        pdf['business_dt'] = pd.to_datetime(pdf['business_dt'],
                                            errors='coerce')

    pdf = pdf.sort_values('tile')
    grouped = pdf.groupby(split_by, as_index=False)
    pdf['target_cum_sum'] = grouped.target_sum.cumsum()
    pdf['target_cum_cnt'] = grouped.target_cnt.cumsum()

    if cost is not None:
        pdf['charge_cum_sum'] = grouped.cost_sum.cumsum()
        pdf['charge_cum_cnt'] = grouped.cost_cnt.cumsum()

    pdf = pdf.sort_values(split_by).set_index(split_by)
    pdf['response_cum'] = pdf.target_cum_sum / pdf.target_cum_cnt
    pdf['target_sum_base'] = pdf.loc[pdf.tile == n_buckets, 'target_cum_sum']
    pdf['target_cnt_base'] = pdf.loc[pdf.tile == n_buckets, 'target_cum_cnt']
    pdf['lift'] = pdf['response_cum'] / (pdf['target_sum_base'] /
                                         pdf['target_cnt_base'])
    pdf['coverage'] = pdf['target_cum_sum'] / pdf['target_sum_base']

    if cost is not None:
        pdf['charge_average'] = pdf.cost_sum / pdf.cost_cnt
        pdf['charge_gain'] = pdf.charge_cum_sum / pdf.loc[pdf.tile == n_buckets, 'charge_cum_sum'] \
            .loc[pdf.index]

    return pdf
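
# Hypothetical usage (the query text and the sqlContext handle are assumptions):
# compute cumulative lift per model and date over 100 score buckets. The query
# must return the split, target and probability columns referenced above.
lift_pdf = lift_splitted(
    sqlContext,
    "select model_name, business_dt, true_target, target_proba from scored_table",
    split_by={'model_name', 'business_dt'},
    n_buckets=100,
)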
# |     ZJV03Y00|
# |     ZDEB33GK|
# |     Z302T6CW|
# +-------------+
# only showing top 5 rows

spark.sql("""SELECT
           model,
           min(capacity_bytes / pow(1024, 3)) min_GB,
           max(capacity_bytes/ pow(1024, 3)) max_GB
        FROM backblaze_stats_2019
        GROUP BY 1
        ORDER BY 3 DESC""").show(5)

backblaze_2019.groupby(F.col("model")).agg(
    F.min(F.col("capacity_bytes") / F.pow(F.lit(1024), 3)).alias("min_GB"),
    F.max(F.col("capacity_bytes") / F.pow(F.lit(1024), 3)).alias("max_GB"),
).orderBy(F.col("max_GB"), ascending=False).show(5)

# --------------------+--------------------+-------+
# |               model|              min_GB| max_GB|
# +--------------------+--------------------+-------+
# | TOSHIBA MG07ACA14TA|             13039.0|13039.0|
# |       ST12000NM0007|-9.31322574615478...|11176.0|
# |HGST HUH721212ALN604|             11176.0|11176.0|
# |       ST10000NM0086|              9314.0| 9314.0|
# |HGST HUH721010ALE600|              9314.0| 9314.0|
# +--------------------+--------------------+-------+
# only showing top 5 rows

spark.sql("""SELECT
Esempio n. 36
0
def create_bin():
    lat_long_rdd = reading_file()
    min_long = lat_long_rdd.agg(F.min(lat_long_rdd.Longi)).collect()
    f.write(str(min_long))
Esempio n. 37
0
hitsCount = diffDF.select("URL", "session_id")\
    .groupBy("session_id")\
    .agg(F.count("URL"),F.countDistinct("URL"))

# Saving output of hitsCount dataframe to disk
hitsCount.coalesce(1).write.mode("overwrite").csv(
    "hdfs:///root/pydev/dataset/hit_count", header='true')

# Solution for Question 2
# Generating Avg session time as "avg(SESSION_TIME) group by client_ip", where
# SESSION_TIME is calculated as difference between max and min of event time (rTS) grouped by client_ip, SESSION_ID


sessionTimeBySessionID = diffDF.select("rTS", "client_ip", "session_id")\
                               .groupBy("client_ip", "session_id")\
                               .agg(F.max("rTS") - F.min("rTS"))\
                               .withColumnRenamed("(max(rTS) - min(rTS))", "SESSION_TIME")

avgSessionTimeByUser = sessionTimeBySessionID.select("client_ip", "SESSION_TIME")\
                                             .groupBy("client_ip")\
                                             .agg(F.avg("SESSION_TIME"))

# Saving output of avgSessionTimeByUser dataframe to disk
avgSessionTimeByUser.coalesce(1).write.mode("overwrite").csv(
    "hdfs:///root/pydev/dataset/AvgSession_Time", header='true')

# Question 4
# Most engaged users, i.e. the IP with the longest session time
# Fetching IP with max(SESSION_TIME) within entire WebLog dataset

windowSpec = Window\
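    .orderBy(F.desc("SESSION_TIME"))

# Hypothetical completion (the original snippet is truncated above): rank the
# per-session times with the window just defined and keep the top row, i.e.
# the client_ip with the longest single session, as the Question 4 comment
# describes. The name mostEngagedUser is an assumption.
mostEngagedUser = sessionTimeBySessionID\
    .withColumn("rank", F.row_number().over(windowSpec))\
    .filter(F.col("rank") == 1)\
    .select("client_ip", "SESSION_TIME")

mostEngagedUser.show(1)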
Esempio n. 38
0
def fn(col):
    if 'window' in kwargs:
        window = kwargs['window']
        return F.min(col).over(window)
    else:
        return F.min(col)
Esempio n. 39
0
    StructField("TransID", IntegerType()),
    StructField("CustID", IntegerType()),
    StructField("TransTotal", FloatType()),
    StructField("TransNumItems", IntegerType()),
    StructField("TransDesc", StringType())
])

purchases = spark.read.csv(path="hdfs://localhost:9000/user/Purchases.txt",
                           schema=p_schema)
purchases.createOrReplaceTempView("Purchases")

T1 = spark.sql("SELECT * FROM Purchases WHERE TransTotal <= 600")
T1.write.json(path="hdfs://localhost:9000/user/T1", mode="overwrite")

T2 = T1.groupby("TransNumItems").agg(
    F.min("TransTotal"), F.max("TransTotal"),
    F.expr("percentile_approx(TransTotal,0.5)").alias("median(TransTotal)"))
T2.write.json(path="hdfs://localhost:9000/user/T2", mode="overwrite")

T3 = T1.join(customers, T1.CustID == customers.ID, "inner")
T3 = T3.filter((T3.Age <= 25) & (T3.Age >= 18)).groupby("CustID")
T3 = T3.agg(
    F.first("Age").alias("Age"),
    F.sum("TransNumItems").alias("TotalItems"),
    F.sum("TransTotal").alias("TotalSpent"))
T3.write.json(path="hdfs://localhost:9000/user/T3", mode="overwrite")

rows = T3.collect()
hits = []
n = len(rows)
for i in range(n):
Esempio n. 40
0
print "schema :", auctionDf.schema

print "Describe() :\n", auctionDf.describe().show()
print "explain() :\n", auctionDf.explain(extended=True)

print "\nAction Data Analysis..............."

print "\nauctionid Distinct Count :", auctionDf.select(
    "auctionid").distinct().count()
print "itemtype Distinct Count :", auctionDf.select(
    "itemtype").distinct().count()
print "itemtype Distinct Values :", auctionDf.select(
    "itemtype").distinct().show()

print "\nGroupby :", auctionDf.groupBy("itemtype", "auctionid").count().agg(
    func.max("count"), func.min("count"), func.avg("count")).show()
print "Groupby :", auctionDf.groupBy("itemtype",
                                     "auctionid").agg(func.max("bid"),
                                                      func.min("bid"),
                                                      func.avg("bid")).show()

print "\nFilter :", auctionDf.filter(auctionDf.price > 200).count()
xboxes = sqlc.sql(
    "SELECT itemtype,auctionid,bid,bidtime,bidder,bidderrate,openbid,price FROM auctions WHERE itemtype = 'xbox'"
)
print "SQL xboxes:\n", xboxes.head(5)
print "SQL xboxes describe:\n", xboxes.describe().show()
sc.stop()

tnow = datetime.datetime.now()
print("\n\nEnd of Program...Spark : %s" % tnow)
Esempio n. 41
0
    def reconstruct(self, df_trajectory_bucketed, df_type='pandas'):
        # Assert Implemented Methods
        assert df_type in {
            'pandas', 'spark'
        }, 'reconstruct@<TrajectoryReconstructorLinear>: df_type = "{}" is not implemented!'.format(
            df_type)
        # Reconstruct
        if (df_type == 'pandas'):
            # Sort and Copy
            df_result = df_trajectory_bucketed.copy().sort_values(
                ['id_user', 'id_timestamp'])
            df_result.reset_index(drop=True, inplace=True)
            # ID TimeStamp Bounds
            id_timestamp_min = df_result['id_timestamp'].min()
            id_timestamp_max = df_result['id_timestamp'].max()
            # New Data
            data = []
            _n = len(df_result)
            for i, (id_user, id_timestamp, lat, lng) in enumerate(
                    zip(df_result['id_user'], df_result['id_timestamp'],
                        df_result['lat'], df_result['lng'])):
                # First Row and Missing TimeStamps
                if (i == 0 and id_timestamp > id_timestamp_min):
                    data.extend([[id_user, t, lat, lng]
                                 for t in range(id_timestamp_min, id_timestamp)
                                 ])
                # Get Next Row if Possible
                if (i < (_n - 1)):
                    # Next
                    id_user_next = df_result['id_user'][i + 1]
                    id_timestamp_next = df_result['id_timestamp'][i + 1]
                    lat_next = df_result['lat'][i + 1]
                    lng_next = df_result['lng'][i + 1]
                    # Linear Interpolation
                    if (id_user == id_user_next
                            and (id_timestamp + 1) < id_timestamp_next):
                        # Line Slopes
                        slope_lat = (lat_next - lat) / (id_timestamp_next -
                                                        id_timestamp)
                        slope_lng = (lng_next - lng) / (id_timestamp_next -
                                                        id_timestamp)
                        # Update
                        for t in range(id_timestamp + 1, id_timestamp_next):
                            dt = (t - id_timestamp)
                            data.append([
                                id_user, t, lat + dt * slope_lat,
                                lng + dt * slope_lng
                            ])
                    elif (id_user != id_user_next):
                        # New User Encountered
                        data.extend([[
                            id_user, t, lat, lng
                        ] for t in range(id_timestamp, id_timestamp_max + 1)])
                        data.extend([[
                            id_user_next, t, lat_next, lng_next
                        ] for t in range(id_timestamp_min, id_timestamp_next)])
                else:
                    # Last Row and Missing Following TimeStamps
                    if (id_timestamp < id_timestamp_max):
                        data.extend(
                            [[id_user, t, lat, lng]
                             for t in range((id_timestamp +
                                             1), (id_timestamp_max + 1))])
            # Append New Data
            df_result = pd.concat([
                df_result,
                pd.DataFrame(data,
                             columns=['id_user', 'id_timestamp', 'lat', 'lng'])
            ]).sort_values(['id_user', 'id_timestamp']).reset_index(drop=True)
        elif (df_type == 'spark'):
            # ID TimeStamp Bounds
            row = df_trajectory_bucketed.agg(
                sql_functions.min(sql_functions.col("id_timestamp")).alias(
                    "id_timestamp_min"),
                sql_functions.max(sql_functions.col("id_timestamp")).alias(
                    "id_timestamp_max")).head()
            id_timestamp_min, id_timestamp_max = row['id_timestamp_min'], row[
                'id_timestamp_max']

            # Define Imputer Function
            def linear_interpolator(x_arr):
                # Result
                result = []
                # Sort by TimeStamp
                x_arr = sorted(x_arr, key=lambda x: x[0])

                # First TimeStamp
                id_timestamp_first, lat_first, lng_first = x_arr[0]
                for id_timestamp in range(id_timestamp_min,
                                          id_timestamp_first):
                    result.append((id_timestamp, lat_first, lng_first))

                # Linear Reconstruction
                for data_now, data_next in zip(x_arr[:-1], x_arr[1:]):
                    # Extract Info
                    id_timestamp_now, lat_now, lng_now = data_now
                    id_timestamp_next, lat_next, lng_next = data_next
                    # Add Now
                    result.append((id_timestamp_now, lat_now, lng_now))
                    # Skip Linear Interpolation
                    assert id_timestamp_next > id_timestamp_now, 'Sorting has gone wrong!'
                    if ((id_timestamp_next - id_timestamp_now) == 1):
                        continue
                    # Linear Interpolation
                    ## Slopes
                    dt = id_timestamp_next - id_timestamp_now
                    slope_lat = (lat_next - lat_now) / dt
                    slope_lng = (lng_next - lng_now) / dt
                    ## Add Inner Points
                    for i in range(1, id_timestamp_next - id_timestamp_now):
                        id_timestamp = id_timestamp_now + i
                        result.append((id_timestamp, lat_now + i * slope_lat,
                                       lng_now + i * slope_lng))

                # Last TimeStamp
                id_timestamp_last, lat_last, lng_last = x_arr[-1]
                for id_timestamp in range(id_timestamp_last,
                                          id_timestamp_max + 1):
                    result.append((id_timestamp, lat_last, lng_last))

                # Return
                return (result)

            udf_linear_interpolator = sql_functions.udf(
                linear_interpolator,
                sql_types.ArrayType(SCHEMA_DATA_POINT, False))
            # Aggregate, Apply UDF & Explode
            ## Aggregate Data Points
            df_result = df_trajectory_bucketed.withColumn(
                "data_point",
                udf_gather_data_point(
                    sql_functions.col("id_timestamp"),
                    sql_functions.col("lat"),
                    sql_functions.col("lng"),
                )).groupby("id_user").agg(
                    sql_functions.collect_list(sql_functions.col(
                        "data_point")).alias("data_point_list"))
            ## Apply Linear Interpolation
            df_result = df_result.select(
                "id_user",
                udf_linear_interpolator(sql_functions.col(
                    "data_point_list")).alias("data_point_list"))
            ## Explode to Comply with DataCatalogue
            df_result = df_result.withColumn(
                "data_point",
                sql_functions.explode(
                    sql_functions.col("data_point_list"))).select(
                        "id_user", "data_point.id_timestamp", "data_point.lat",
                        "data_point.lng")
        # Return
        return (df_result)
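
# Tiny illustration of the interpolation arithmetic used above (values are
# made up): between (t=0, lat=10.0) and (t=3, lat=16.0) the slope is 2.0 per
# time step, so the filled points are t=1 -> 12.0 and t=2 -> 14.0.
t0, lat0, t1, lat1 = 0, 10.0, 3, 16.0
slope = (lat1 - lat0) / (t1 - t0)
filled = [(t, lat0 + (t - t0) * slope) for t in range(t0 + 1, t1)]
assert filled == [(1, 12.0), (2, 14.0)]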
Esempio n. 42
0
logs_df.count()

# count by different code type
logs_df.groupBy("code").count().show()
# rank by counts
from pyspark.sql.functions import asc, desc
logs_df.groupBy('code').count().orderBy(desc('count')).show()

# calculate average size of different code
logs_df.groupBy("code").avg("bytes").show()
# more calculation by code - average, min, max
import pyspark.sql.functions as F
logs_df.groupBy("code").agg(
logs_df.code,
F.avg(logs_df.bytes),
F.min(logs_df.bytes),
F.max(logs_df.bytes)
).show()


# homework
# 1
yelp_df.select("cool").agg({"cool" : "mean"}).collect()
# 2
import pyspark.sql.functions as F
yelp_df.filter('review_count >= 10').groupBy("stars").agg(yelp_df.stars, F.avg(yelp_df.cool)).show()
# 3
yelp_df.filter((yelp_df.review_count >= 10) & (yelp_df.open == 'True')).groupBy("stars").agg(yelp_df.stars, F.avg(yelp_df.cool)).show()
# 4
from pyspark.sql.functions import asc, desc
yelp_df.filter((yelp_df.review_count >= 10) & (yelp_df.open == 'True')).groupBy('state').count().orderBy(desc('count')).show()
sqlCtx = SQLContext(sc)

lines = sc.parallelize(["m1,d1,1", "m1,d2,2", "m2,d1,1", "m2,d2,2"])

record = lines.map(lambda line: line.split(",")).map(
    lambda columns: Row(machine=columns[0], domain=columns[1], request=columns[2]))

recordSchema = sqlCtx.createDataFrame(record)

recordSchema.groupBy().agg({"*": "count"}).show()

recordSchema.groupBy("machine", recordSchema["domain"]).agg(
    {"domain": "max", "request": "min"}).show()

recordSchema.groupBy("machine", recordSchema.domain).agg(functions.count("*"), functions.max(
    recordSchema.request), functions.min(recordSchema["request"]), functions.sum(recordSchema["request"]), functions.avg(recordSchema["request"])).show()

recordSchema.select(recordSchema.machine, recordSchema.request.cast(
    "int")).groupBy("machine").count().show()

recordSchema.select(recordSchema.machine, recordSchema.request.cast(
    "int").alias("request")).groupBy("machine").max("request").show()

recordSchema.select(recordSchema.machine, recordSchema.request.cast(
    "int").alias("request")).groupBy("machine").min("request").show()

recordSchema.select(recordSchema.machine, recordSchema.request.cast(
    "int").alias("request")).groupBy("machine").sum("request").show()

recordSchema.select(recordSchema.machine, recordSchema.request.cast(
    "int").alias("request")).groupBy("machine").avg("request").show()
Esempio n. 44
0
trunc_df = yelp_df.filter("review_count>=10 and open = 'True'").groupBy("state").count()

trunc_df.orderBy(desc("count")).collect()

###################

# Source of the sample data: /usr/lib/hue/apps/search/examples/collections/solr_configs_log_analytics_demo/index_data.csv
logs_df = sqlCtx.load(source="com.databricks.spark.csv",header = 'true',inferSchema = 'true',path ='index_data_http.csv')
sc._jsc.hadoopConfiguration().set('textinputformat.record.delimiter','\r\n')
from pyspark.sql.functions import asc, desc
logs_df.groupBy("code").count().orderBy(desc("count")).show()
logs_df.groupBy("code").avg("bytes").show()
import pyspark.sql.functions as F
logs_df.groupBy("code").agg(logs_df.code,F.avg(logs_df.bytes),F.min(logs_df.bytes),F.max(logs_df.bytes)).show()

###########################################
yelp_df = sqlCtx.load(source='com.databricks.spark.csv',header = 'true',inferSchema = 'true',path ='index_data.csv')
yelp_df.registerTempTable("yelp")
filtered_yelp = sqlCtx.sql("SELECT * FROM yelp WHERE useful >= 1")
filtered_yelp.count()

sqlCtx.sql("SELECT MAX(useful) AS max_useful FROM yelp").collect()
useful_perc_data.join(yelp_df,yelp_df.id == useful_perc_data.uid,"inner").select(useful_perc_data.uid, "useful_perc", "review_count")
useful_perc_data.registerTempTable("useful_perc_data")

sqlCtx.sql(
"""SELECT useful_perc_data.uid, useful_perc,
review_count
FROM useful_perc_data
Esempio n. 45
0
#set time variables for date filtering
time = datetime.datetime.now()
epochtime = int(time.strftime("%s"))
start_time = epochtime - 86400
compare_time = datetime.datetime.fromtimestamp(start_time)

#create a dataframe from the raw metrics
rawmetrics = sqlContext.read.format("org.apache.spark.sql.cassandra").options(table="raw_metrics", keyspace="metrics").load()

#filter metrics to those in last 24 hours
last_day = rawmetrics.where(rawmetrics.metric_time > compare_time)

#aggregates
averages = last_day.groupby('device_id').agg(func.avg('metric_value').alias('metric_avg'))
maximums = last_day.groupby('device_id').agg(func.max('metric_value').alias('metric_max'))
minimums = last_day.groupby('device_id').agg(func.min('metric_value').alias('metric_min'))

#rename id columns for uniqueness
averages_a = averages.withColumnRenamed("device_id", "id")
maximums_a = maximums.withColumnRenamed("device_id", "maxid")
minimums_a = minimums.withColumnRenamed("device_id", "minid")

#join the tables above
temp = averages_a.join(maximums_a, averages_a.id == maximums_a.maxid)
aggs = temp.join(minimums, temp.id == minimums.device_id).select('id','metric_min','metric_max','metric_avg')

#add columns to format for cassandra
addday = aggs.withColumn("metric_day", lit(time))
addname = addday.withColumn("metric_name",lit("KWH"))
inserts = addname.withColumnRenamed("id","device_id")
# COMMAND ----------

from pyspark.sql.functions import approx_count_distinct
df.select(approx_count_distinct("StockCode", 0.1)).show() # 3364


# COMMAND ----------

from pyspark.sql.functions import first, last
df.select(first("StockCode"), last("StockCode")).show()


# COMMAND ----------

from pyspark.sql.functions import min, max
df.select(min("Quantity"), max("Quantity")).show()


# COMMAND ----------

from pyspark.sql.functions import sum
df.select(sum("Quantity")).show() # 5176450


# COMMAND ----------

from pyspark.sql.functions import sumDistinct
df.select(sumDistinct("Quantity")).show() # 29310


# COMMAND ----------
FROM
    %s
WHERE
    dt >= '%s'
AND dt <= '%s'
''' % (spa_utils.rename('app.app_pa_festival_features', params), update_start, update_end))

#df_time = sql('''select * from app.app_pa_festival_features  where dt between '2018-01-20' and '2018-04-20' ''')

df_time = df_time.withColumnRenamed('dt', 'date')
df_time.cache()

# Find the first and last dates that have records for each cid3 (third-level category)
df_sales_dtcid3_start_end_date = df_sales_dtcid3\
    .groupby('item_third_cate_cd')\
    .agg(F.min('date').alias('start_date'),
         F.max('date').alias('end_date'))

# Fill in the date features across each cid3's date span
df_cid3_duration_time = df_sales_dtcid3_start_end_date\
    .join(df_time,
          (df_time['date'] >= df_sales_dtcid3_start_end_date['start_date']) &
          (df_time['date'] <= df_sales_dtcid3_start_end_date['end_date']),
          'left')\
    .drop('start_date', 'end_date')
df_cid3_duration_time.cache()

df_complete =  df_cid3_duration_time\
    .join(df_sales_dtcid3,
          ['date', 'item_third_cate_cd'], 'left')\
    .fillna(0)
Esempio n. 48
0
File: queries.py Progetto: jiep/ABD
Query d)
The users with the oldest and the most recent
creation dates, respectively
'''


'''
Required in order to use the to_date function
'''
from pyspark.sql.functions import *

df.select("*")\
  .where((to_date(df.CreationDate) ==
    df.select(
      min(
        to_date("CreationDate"))\
        .alias("min"))\
        .collect()[0].min) | (
        to_date(df.CreationDate) ==
          df.select(
            max(to_date("CreationDate"))\
            .alias("max"))\
            .collect()[0].max))\
  .orderBy(to_date("CreationDate"))\
  .show()

''' Comparing dates down to the millisecond '''

'''
Oldest user
'''
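# Hypothetical completion (the snippet is cut off here): order by the full
# CreationDate value and keep the first row to get the oldest user.
df.orderBy(df.CreationDate.asc()).limit(1).show()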
Esempio n. 49
0
    test_df = test_df.select(*(all_cols + ['Id', 'Date'])).cache()

    # Build vocabulary of categorical columns.
    vocab = build_vocabulary(train_df.select(*categorical_cols)
                                     .unionAll(test_df.select(*categorical_cols)).cache(),
                             categorical_cols)

    # Cast continuous columns to float & lookup categorical columns.
    train_df = cast_columns(train_df, continuous_cols + ['Sales'])
    train_df = lookup_columns(train_df, vocab)
    test_df = cast_columns(test_df, continuous_cols)
    test_df = lookup_columns(test_df, vocab)

    # Split into training & validation.
    # Test set is in 2015, use the same period in 2014 from the training set as a validation set.
    test_min_date = test_df.agg(F.min(test_df.Date)).collect()[0][0]
    test_max_date = test_df.agg(F.max(test_df.Date)).collect()[0][0]
    a_year = datetime.timedelta(365)
    val_df = train_df.filter((test_min_date - a_year <= train_df.Date) & (train_df.Date < test_max_date - a_year))
    train_df = train_df.filter((train_df.Date < test_min_date - a_year) | (train_df.Date >= test_max_date - a_year))

    # Determine max Sales number.
    max_sales = train_df.agg(F.max(train_df.Sales)).collect()[0][0]

    print('===================================')
    print('Data frame with transformed columns')
    print('===================================')
    train_df.show()

    print('================')
    print('Data frame sizes')
Esempio n. 50
0
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

spark = SparkSession.builder.appName("MostObscureSuperheroes").getOrCreate()

schema = StructType([ \
                     StructField("id", IntegerType(), True), \
                     StructField("name", StringType(), True)])

names = spark.read.schema(schema).option(
    "sep", " ").csv("file:///SparkCourse/Marvel-names.txt")

lines = spark.read.text("file:///SparkCourse/Marvel-graph.txt")

# Small tweak vs. what's shown in the video: we trim whitespace from each line as this
# could throw the counts off by one.
connections = lines.withColumn("id", func.split(func.trim(func.col("value")), " ")[0]) \
    .withColumn("connections", func.size(func.split(func.trim(func.col("value")), " ")) - 1) \
    .groupBy("id").agg(func.sum("connections").alias("connections"))

minConnectionCount = connections.agg(func.min("connections")).first()[0]

minConnections = connections.filter(
    func.col("connections") == minConnectionCount)

minConnectionsWithNames = minConnections.join(names, "id")

print("The following characters have only " + str(minConnectionCount) +
      " connection(s):")

minConnectionsWithNames.select("name").show()
Esempio n. 51
0
def fn(col):
    return ~(F.min(col))
#import SQLContext and pyspark SQL functions

from pyspark.sql import SQLContext, Row
import pyspark.sql.functions as func
sqlContext = SQLContext(sc)

inputRDD = sc.textFile("/user/pravat/auctiondata.csv").map(lambda l: l.split(","))
auctions = inputRDD.map(lambda p:Row(auctionid=p[0], bid=float(p[1]), bidtime=float(p[2]), bidder=p[3], bidrate=int(p[4]), openbid=float(p[5]), price=float(p[6]), itemtype=p[7], dtl=int(p[8])))

# Infer the schema, and register the DataFrame as a table.
auctiondf = sqlContext.createDataFrame(auctions)
auctiondf.registerTempTable("auctions")

auctiondf.show()

auctiondf.printSchema()

totbids = auctiondf.count()
print totbids

totalauctions = auctiondf.select("auctionid").distinct().count()
print totalauctions

itemtypes = auctiondf.select("itemtype").distinct().count()
print itemtypes
auctiondf.groupBy("itemtype","auctionid").count().show()
auctiondf.groupBy("itemtype","auctionid").count().agg(func.min("count"), func.max("count"), func.avg("count")).show()
auctiondf.groupBy("itemtype", "auctionid").agg(func.min("bid"), func.max("bid"), func.avg("bid")).show()
auctiondf.filter(auctiondf.price>200).count()
xboxes = sqlContext.sql("SELECT auctionid, itemtype,bid,price,openbid FROM auctions WHERE itemtype = 'xbox'").show()
    (2, "Michael Armbrust", 1, [250, 100])])\
  .toDF("id", "name", "graduate_program", "spark_status")

person.stat.crosstab("id", "name").show()

# COMMAND ----------

#Get an overview of the summary statistics
df.describe().show()

#Get specific summary statistics of numeric variables
df.select("quantity").summary("min", "max").show()

#Alternative
from pyspark.sql.functions import min, max
df.select(min("quantity"), max("quantity")).show()

#Get a specific statistic
spark.sql("select max(quantity) from df").show()

from pyspark.sql.functions import max
df.select(max("quantity")).show()

# COMMAND ----------

#Calculate the mean quantity sold and the number of customers
spark.sql("select avg(quantity), count(distinct(customerid)) from df").show()

from pyspark.sql.functions import avg, expr
df.select(avg("quantity"), expr("count(distinct(customerid))")).show()
df.selectExpr("avg(quantity)", "count(distinct(customerid))").show()
from pyspark.sql import functions as F

#Creating data frame from list
data = [('John', 'Smith', 47),('Jane', 'Smith', 22), ('Frank', 'Jones', 28)]
schema = ['fname', 'lname', 'age']
df = sqlContext.createDataFrame(data, schema)
df

#Retrieving contents of data frame
df.printSchema()
df.show()
df.first()
df.count()

#Adding columns
df = df.withColumn('salary', F.lit(0))
df.show()
df.withColumn('salary2', df['age'] * 100).show()

#Filtering and subsetting 
df.filter(df['age'] > 30).select('fname','age').show()
df.select(F.max('age').alias('max-age')).show()

#Grouped aggregations
df.groupBy('lname').max('age').show()
df.groupBy('lname').agg(F.avg('age').alias('avg-age'), F.min('age'), F.max('age')).show()


Esempio n. 55
0
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

spark = SparkSession.builder.appName("MostPopularSuperhero").getOrCreate()

schema = StructType([ \
                     StructField("id", IntegerType(), True), \
                     StructField("name", StringType(), True)])

names = spark.read.schema(schema).option("sep", " ").csv("file:///sparkcourse/Marvel+names")

lines = spark.read.text("file:///sparkcourse/Marvel+graph")

connections = lines.withColumn("id", func.split(func.col("value"), " ")[0]) \
    .withColumn("connections", func.size(func.split(func.col("value"), " ")) - 1) \
    .groupBy("id").agg(func.sum("connections").alias("connections"))
    
minconnect = connections.agg(func.min("connections")).first()[0]

minconnections = connections.filter(func.col("connections") == minconnect)

minconn_names = minconnections.join(names, "id")

minconn_names.select("name").show(minconn_names.count())

content_size_summary_df = logs_df.describe(['content_size'])
content_size_summary_df.show()

# COMMAND ----------

# MAGIC %md
# MAGIC 
# MAGIC Alternatively, we can use SQL to directly calculate these statistics.  You can explore the many useful functions within the `pyspark.sql.functions` module in the [documentation](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#module-pyspark.sql.functions).
# MAGIC 
# MAGIC After we apply the `.agg()` function, we call `.first()` to extract the first value, which is equivalent to `.take(1)[0]`.

# COMMAND ----------

from pyspark.sql import functions as sqlFunctions
contentSizeStats =  (logs_df
                     .agg(sqlFunctions.min(logs_df['content_size']),
                          sqlFunctions.avg(logs_df['content_size']),
                          sqlFunctions.max(logs_df['content_size']))
                     .first())

print 'Using SQL functions:'
print 'Content Size Avg: {1:,.2f}; Min: {0:.2f}; Max: {2:,.0f}'.format(*contentSizeStats)

# COMMAND ----------

# MAGIC %md
# MAGIC ### (3b) Example: HTTP Status Analysis
# MAGIC 
# MAGIC Next, let's look at the status values that appear in the log. We want to know which status values appear in the data and how many times.  We again start with `logs_df`, then group by the `status` column, apply the `.count()` aggregation function, and sort by the `status` column.

# COMMAND ----------
    StructField("longitude", FloatType(), True),
    StructField("time", StringType(), True),
    StructField("summary", StringType(), True),
    StructField("precipIntensity", FloatType(), True),
    StructField("precipProbability", FloatType(), True),
    StructField("temperature", FloatType(), True),
    StructField("apparentTemperature", FloatType(), True),
    StructField("humidity", FloatType(), True),
    StructField("windSpeed", FloatType(), True),
    StructField("visibility", FloatType(), True),
    StructField("pressure", FloatType(), True),
]
schema = StructType(fields)

# Apply the schema to the RDD.
weather_df = sqlContext.createDataFrame(weather_csv, schema)
print "#######"
print weather_df.printSchema()
print weather_df.show()
print "#######"

# Do some simple aggregations
weather_agg = weather_df.groupBy("date", "city").agg(
    F.min(weather_df.temperature), F.max(weather_df.temperature),
    F.min(weather_df.pressure), F.max(weather_df.pressure))
weather_agg.show()

# Save the new dataframes with schemas
weather_df.write.save(
    "hdfs://hadoop:9000/weather_data_schema", mode="overwrite")