Example No. 1
    def test_window_functions_cumulative_sum(self):
        df = self.spark.createDataFrame([("one", 1), ("two", 2)], ["key", "value"])
        from pyspark.sql import functions as F

        # Test cumulative sum
        sel = df.select(
            df.key,
            F.sum(df.value).over(Window.rowsBetween(Window.unboundedPreceding, 0)))
        rs = sorted(sel.collect())
        expected = [("one", 1), ("two", 3)]
        for r, ex in zip(rs, expected):
            self.assertEqual(tuple(r), ex[:len(r)])

        # Test boundary values less than JVM's Long.MinValue and make sure we don't overflow
        sel = df.select(
            df.key,
            F.sum(df.value).over(Window.rowsBetween(Window.unboundedPreceding - 1, 0)))
        rs = sorted(sel.collect())
        expected = [("one", 1), ("two", 3)]
        for r, ex in zip(rs, expected):
            self.assertEqual(tuple(r), ex[:len(r)])

        # Test boundary values greater than JVM's Long.MaxValue and make sure we don't overflow
        frame_end = Window.unboundedFollowing + 1
        sel = df.select(
            df.key,
            F.sum(df.value).over(Window.rowsBetween(Window.currentRow, frame_end)))
        rs = sorted(sel.collect())
        expected = [("one", 3), ("two", 2)]
        for r, ex in zip(rs, expected):
            self.assertEqual(tuple(r), ex[:len(r)])
Example No. 2
    def test_groupedData(self):
        from pyspark.sql import DataFrame
        from pyspark.sql.functions import sum, pandas_udf, PandasUDFType
        from ts.flint import TimeSeriesGroupedData

        price = self.price()

        assert(type(price.groupBy('time')) is TimeSeriesGroupedData)
        assert(type(price.groupby('time')) is TimeSeriesGroupedData)

        result1 = price.groupBy('time').agg(sum(price['price'])).sort('time').toPandas()
        expected1 = DataFrame.groupBy(price, 'time').agg(sum(price['price'])).sort('time').toPandas()
        assert_same(result1, expected1)

        result2 = price.groupBy('time').pivot('id').sum('price').toPandas()
        expected2 = DataFrame.groupBy(price, 'time').pivot('id').sum('price').toPandas()
        assert_same(result2, expected2)

        @pandas_udf(price.schema, PandasUDFType.GROUPED_MAP)
        def foo(df):
            return df
        result3 = price.groupby('time').apply(foo).toPandas()
        expected3 = DataFrame.groupBy(price, 'time').apply(foo).toPandas()
        assert_same(result3, expected3)

        result4 = price.groupby('time').count().toPandas()
        expected4 = DataFrame.groupBy(price, 'time').count().toPandas()
        assert_same(result4, expected4)

        result5 = price.groupby('time').mean('price').toPandas()
        expected5 = DataFrame.groupBy(price, 'time').mean('price').toPandas()
        assert_same(result5, expected5)
Example No. 3
    def test_mixed_sql(self):
        """
        Test mixing group aggregate pandas UDF with sql expression.
        """
        df = self.data
        sum_udf = self.pandas_agg_sum_udf

        # Mix group aggregate pandas UDF with sql expression
        result1 = (df.groupby('id')
                   .agg(sum_udf(df.v) + 1)
                   .sort('id'))
        expected1 = (df.groupby('id')
                     .agg(sum(df.v) + 1)
                     .sort('id'))

        # Mix group aggregate pandas UDF with sql expression (order swapped)
        result2 = (df.groupby('id')
                     .agg(sum_udf(df.v + 1))
                     .sort('id'))

        expected2 = (df.groupby('id')
                       .agg(sum(df.v + 1))
                       .sort('id'))

        # Wrap group aggregate pandas UDF with two sql expressions
        result3 = (df.groupby('id')
                   .agg(sum_udf(df.v + 1) + 2)
                   .sort('id'))
        expected3 = (df.groupby('id')
                     .agg(sum(df.v + 1) + 2)
                     .sort('id'))

        self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
        self.assertPandasEqual(expected2.toPandas(), result2.toPandas())
        self.assertPandasEqual(expected3.toPandas(), result3.toPandas())
Example No. 4
    def test_nondeterministic_vectorized_udf_in_aggregate(self):
        df = self.spark.range(10)
        random_udf = self.nondeterministic_vectorized_udf

        with QuietTest(self.sc):
            with self.assertRaisesRegexp(AnalysisException, 'nondeterministic'):
                df.groupby(df.id).agg(sum(random_udf(df.id))).collect()
            with self.assertRaisesRegexp(AnalysisException, 'nondeterministic'):
                df.agg(sum(random_udf(df.id))).collect()
Example No. 5
    def test_nondeterministic_udf_in_aggregate(self):
        from pyspark.sql.functions import udf, sum
        import random
        udf_random_col = udf(lambda: int(100 * random.random()), 'int').asNondeterministic()
        df = self.spark.range(10)

        with QuietTest(self.sc):
            with self.assertRaisesRegexp(AnalysisException, "nondeterministic"):
                df.groupby('id').agg(sum(udf_random_col())).collect()
            with self.assertRaisesRegexp(AnalysisException, "nondeterministic"):
                df.agg(sum(udf_random_col())).collect()
Example No. 6
 def gen_report_table(hc,curUnixDay):
     rows_indoor=sc.textFile("/data/indoor/*/*").map(lambda r: r.split(",")).map(lambda p: Row(clientmac=p[0], entityid=int(p[1]),etime=int(p[2]),ltime=int(p[3]),seconds=int(p[4]),utoday=int(p[5]),ufirstday=int(p[6])))
     HiveContext.createDataFrame(hc,rows_indoor).registerTempTable("df_indoor")
     #ClientMac|etime|ltime|seconds|utoday|ENTITYID|UFIRSTDAY 
     sql="select entityid,clientmac,utoday,UFIRSTDAY,seconds,"
     sql=sql+"count(1) over(partition by entityid,clientmac) as total_cnt,"
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  2505600 preceding) as day_30," # 2505600 is 29 days
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  518400 preceding)  as day_7," #518400 is 6 days
     sql=sql+"count(1) over (partition by entityid,clientmac,UFIRSTDAY order by UFIRSTDAY  range 1 preceding) as pre_mon "
     sql=sql+"from df_indoor order by entityid,clientmac,utoday" 
     df_id_stat=hc.sql(sql)
     df_id_mm=df_id_stat.withColumn("min", func.min("utoday").over(Window.partitionBy("entityid","clientmac"))).withColumn("max", func.max("utoday").over(Window.partitionBy("entityid","clientmac")))
     # df_id_mm is the min/max df, used to calculate first arrival and last arrival
     df_id_stat_distinct=df_id_stat.drop("seconds").drop("day_30").drop("day_7").drop("utoday").drop("total_cnt").distinct()
     #distinct df is for lag function to work
     df_id_prepremon=df_id_stat_distinct.withColumn("prepre_mon",func.lag("pre_mon").over(Window.partitionBy("entityid","clientmac").orderBy("entityid","clientmac","UFIRSTDAY"))).drop("pre_mon").na.fill(0)
     
     cond_id = [df_id_mm.clientmac == df_id_prepremon.clientmac, df_id_mm.entityid == df_id_prepremon.entityid, df_id_mm.UFIRSTDAY==df_id_prepremon.UFIRSTDAY]
     df_indoor_fin_tmp=df_id_mm.join(df_id_prepremon, cond_id, 'outer').select(df_id_mm.entityid,df_id_mm.clientmac,df_id_mm.utoday,df_id_mm.UFIRSTDAY,df_id_mm.seconds,df_id_mm.day_30,df_id_mm.day_7,df_id_mm.min,df_id_mm.max,df_id_mm.total_cnt,df_id_prepremon.prepre_mon)
     df_indoor_fin_tmp=df_indoor_fin_tmp.selectExpr("entityid as entityid","clientmac as  clientmac","utoday as utoday","UFIRSTDAY as ufirstday","seconds as secondsbyday","day_30 as indoors30","day_7 as indoors7","min as FirstIndoor","max as LastIndoor","total_cnt as indoors","prepre_mon as indoorsPrevMonth")
     
     #newly added part for indoors7 and indoors30 based on current date
     df_indoor_fin_tmp1= df_indoor_fin_tmp.withColumn("r_day_7", func.when((curUnixDay- df_indoor_fin_tmp.utoday)/86400<7 , 1).otherwise(0))
     df_indoor_fin_tmp2=df_indoor_fin_tmp1.withColumn("r_day_30", func.when((curUnixDay- df_indoor_fin_tmp1.utoday)/86400<30 , 1).otherwise(0))
     df_indoor_fin_tmp3=df_indoor_fin_tmp2.withColumn("r_indoors7",func.sum("r_day_7").over(Window.partitionBy("entityid","clientmac")))
     df_indoor_fin_tmp4=df_indoor_fin_tmp3.withColumn("r_indoors30",func.sum("r_day_30").over(Window.partitionBy("entityid","clientmac")))
     df_indoor_fin=df_indoor_fin_tmp4.drop("r_day_7").drop("r_day_30")
     hc.sql("drop table if exists df_indoor_fin")
     df_indoor_fin.write.saveAsTable("df_indoor_fin")
     
     rows_flow=sc.textFile("/data/flow/*/*").map(lambda r: r.split(",")).map(lambda p: Row(clientmac=p[0], entityid=int(p[1]),etime=int(p[2]),ltime=int(p[3]),utoday=int(p[4]),ufirstday=int(p[5])))
     HiveContext.createDataFrame(hc,rows_flow).registerTempTable("df_flow")
     
     # ClientMac|ENTITYID|UFIRSTDAY|etime|ltime|utoday
     sql="select entityid,clientmac,utoday,UFIRSTDAY,"
     sql=sql+"count(1) over(partition by entityid,clientmac) as total_cnt,"
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  2505600 preceding) as day_30," # 2505600 is 29 days
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  518400 preceding)  as day_7," #518400 is 6 days
     sql=sql+"count(1) over (partition by entityid,clientmac,UFIRSTDAY order by UFIRSTDAY  range 1 preceding) as pre_mon "
     sql=sql+"from df_flow order by entityid,clientmac,utoday" 
     df_fl_stat=hc.sql(sql)
     df_fl_mm=df_fl_stat.withColumn("min", func.min("utoday").over(Window.partitionBy("entityid","clientmac"))).withColumn("max", func.max("utoday").over(Window.partitionBy("entityid","clientmac")))
     # df_fl_mm is the min/max df, used to calculate first arrival and last arrival
     df_fl_stat_distinct=df_fl_stat.drop("day_30").drop("day_7").drop("utoday").drop("total_cnt").distinct()
     #distinct df is for lag function to work
     df_fl_prepremon=df_fl_stat_distinct.withColumn("prepre_mon",func.lag("pre_mon").over(Window.partitionBy("entityid","clientmac").orderBy("entityid","clientmac","UFIRSTDAY"))).drop("pre_mon").na.fill(0)
     
     cond_fl = [df_fl_mm.clientmac == df_fl_prepremon.clientmac, df_fl_mm.entityid == df_fl_prepremon.entityid, df_fl_mm.UFIRSTDAY==df_fl_prepremon.UFIRSTDAY]
     df_flow_fin=df_fl_mm.join(df_fl_prepremon, cond_fl, 'outer').select(df_fl_mm.entityid,df_fl_mm.clientmac,df_fl_mm.utoday,df_fl_mm.UFIRSTDAY,df_fl_mm.day_30,df_fl_mm.day_7,df_fl_mm.min,df_fl_mm.max,df_fl_mm.total_cnt,df_fl_prepremon.prepre_mon)
     df_flow_fin=df_flow_fin.selectExpr("entityid as entityid","clientmac as  clientmac","utoday as utoday","UFIRSTDAY as ufirstday","day_30 as visits30","day_7 as visits7","min as FirstVisit","max as LastVisit","total_cnt as visits","prepre_mon as visitsPrevMonth")
     hc.sql("drop table if exists df_flow_fin")
     df_flow_fin.write.saveAsTable("df_flow_fin") 
Example No. 7
def run_benchmarks(base_path):
    print("=========================================================================================")
    print("Loading data for: ")
    print(base_path)
    print("=========================================================================================")

    start=time.time()
    df=hive_context.read.format("com.databricks.spark.csv").option("header", "true").option("inferSchema", "true").load(base_path)
    #print(df)
    #print(df.printSchema())
    print(df.count())
    df.cache()

    print("Time taken for groupBy on DataFrame column C followed by sum aggregate: ")
    start_task=time.time()
    df_groupby_C=df.groupBy('C').agg(F.sum(df.id))
    print(df_groupby_C.count())
    end_task=time.time()
    end=time.time()

    x=[base_path, end-start, end_task-start_task]
    print("=========================================================================================")
    print("OUTPUT")
    print(x)
    print("=========================================================================================")
    return x
Example No. 8
 def getValueFieldValueLists(self, handlerId, keyFields, valueFields):
     df = self.entity.groupBy(keyFields)
     agg = self.options.get("aggregation",self.getDefaultAggregation(handlerId))
     maxRows = int(self.options.get("rowCount","100"))
     numRows = min(maxRows,df.count())
     valueLists = []
     for valueField in valueFields:
         valueDf = None
         if agg == "SUM":
             valueDf = df.agg(F.sum(valueField).alias("agg"))
         elif agg == "AVG":
             valueDf = df.agg(F.avg(valueField).alias("agg"))
         elif agg == "MIN":
             valueDf = df.agg(F.min(valueField).alias("agg"))
         elif agg == "MAX":
             valueDf = df.agg(F.max(valueField).alias("agg"))
         else:
             valueDf = df.agg(F.count(valueField).alias("agg"))
         for keyField in keyFields:
             valueDf = valueDf.sort(F.col(keyField).asc())
         valueDf = valueDf.dropna()
         rows = valueDf.select("agg").take(numRows)
         valueList = []
         for row in rows:
             valueList.append(row["agg"])
         valueLists.append(valueList)
     return valueLists   
Example No. 9
    def test_multiple_udfs(self):
        """
        Test multiple group aggregate pandas UDFs in one agg function.
        """
        from pyspark.sql.functions import sum, mean

        df = self.data
        mean_udf = self.pandas_agg_mean_udf
        sum_udf = self.pandas_agg_sum_udf
        weighted_mean_udf = self.pandas_agg_weighted_mean_udf

        result1 = (df.groupBy('id')
                   .agg(mean_udf(df.v),
                        sum_udf(df.v),
                        weighted_mean_udf(df.v, df.w))
                   .sort('id')
                   .toPandas())
        expected1 = (df.groupBy('id')
                     .agg(mean(df.v),
                          sum(df.v),
                          mean(df.v).alias('weighted_mean(v, w)'))
                     .sort('id')
                     .toPandas())

        self.assertPandasEqual(expected1, result1)
Example No. 10
def sum_aggregations(category, hours=None):
    actual_suffix = ''
    new_suffix = '_%s' % category
    if hours:
        actual_suffix = '_%s' % category
        new_suffix += '_%sh' % hours

    return [func.sum(column + actual_suffix).alias(column + new_suffix) for column in ['Pickup_Count', 'Dropoff_Count']]
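
# A hypothetical usage sketch of sum_aggregations; the SparkSession, sample data,
# the 'Geohash' grouping column, and the 'yellow' category below are illustrative
# assumptions, not part of the original example.
from pyspark.sql import SparkSession
from pyspark.sql import functions as func

spark = SparkSession.builder.getOrCreate()
sample = spark.createDataFrame(
    [('abc', 3, 1, 5, 2)],
    ['Geohash', 'Pickup_Count', 'Dropoff_Count', 'Pickup_Count_yellow', 'Dropoff_Count_yellow'])
# Without hours: sums the bare columns and aliases them with the category suffix.
# With hours: sums the already-suffixed columns and appends an '_<hours>h' suffix.
sample.groupBy('Geohash').agg(
    *(sum_aggregations('yellow') + sum_aggregations('yellow', hours=4))).show()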
Example No. 11
    def test_retain_group_columns(self):
        with self.sql_conf({"spark.sql.retainGroupColumns": False}):
            df = self.data
            sum_udf = self.pandas_agg_sum_udf

            result1 = df.groupby(df.id).agg(sum_udf(df.v))
            expected1 = df.groupby(df.id).agg(sum(df.v))
            self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
Example No. 12
def formatItens(firstTime):
    # format itinerary data
    global itens
    itens = itens.withColumn("ORIGIN_AIRPORT_ID",toInt("ORIGIN_AIRPORT_ID"))
    itens = itens.withColumn("DEST_AIRPORT_ID",toInt("DEST_AIRPORT_ID"))
    itens = itens.withColumn("MARKET_MILES_FLOWN",toKm("MARKET_MILES_FLOWN"))
    itens = itens.withColumn("PASSENGERS",toInt("PASSENGERS"))
    if firstTime:
        aggArg = sum("PASSENGERS").alias("PASSENGERS"),mean("MARKET_MILES_FLOWN").alias("MARKET_KMS_FLOWN")
        itens = itens.groupBy("ORIGIN_AIRPORT_ID","DEST_AIRPORT_ID").agg(*aggArg).cache()
Example No. 13
    def runBPwithGraphFrames(cls, g, numIter):
        """Run Belief Propagation using GraphFrame.

        This implementation of BP shows how to use GraphFrame's aggregateMessages method.
        """
        # choose colors for vertices for BP scheduling
        colorG = cls._colorGraph(g)
        numColors = colorG.vertices.select('color').distinct().count()

        # TODO: handle vertices without any edges

        # initialize vertex beliefs at 0.0
        gx = GraphFrame(colorG.vertices.withColumn('belief', sqlfunctions.lit(0.0)), colorG.edges)

        # run BP for numIter iterations
        for iter_ in range(numIter):
            # for each color, have that color receive messages from neighbors
            for color in range(numColors):
                # Send messages to vertices of the current color.
                # We may send to source or destination since edges are treated as undirected.
                msgForSrc = sqlfunctions.when(
                    AM.src['color'] == color,
                    AM.edge['b'] * AM.dst['belief'])
                msgForDst = sqlfunctions.when(
                    AM.dst['color'] == color,
                    AM.edge['b'] * AM.src['belief'])
                # numerically stable sigmoid
                logistic = sqlfunctions.udf(cls._sigmoid, returnType=types.DoubleType())
                aggregates = gx.aggregateMessages(
                    sqlfunctions.sum(AM.msg).alias("aggMess"),
                    sendToSrc=msgForSrc,
                    sendToDst=msgForDst)
                v = gx.vertices
                # receive messages and update beliefs for vertices of the current color
                newBeliefCol = sqlfunctions.when(
                    (v['color'] == color) & (aggregates['aggMess'].isNotNull()),
                    logistic(aggregates['aggMess'] + v['a'])
                ).otherwise(v['belief'])  # keep old beliefs for other colors
                newVertices = (v
                    .join(aggregates, on=(v['id'] == aggregates['id']), how='left_outer')
                    .drop(aggregates['id'])  # drop duplicate ID column (from outer join)
                    .withColumn('newBelief', newBeliefCol)  # compute new beliefs
                    .drop('aggMess')  # drop messages
                    .drop('belief')  # drop old beliefs
                    .withColumnRenamed('newBelief', 'belief')
                )
                # cache new vertices using workaround for SPARK-1334
                cachedNewVertices = AM.getCachedDataFrame(newVertices)
                gx = GraphFrame(cachedNewVertices, gx.edges)

        # Drop the "color" column from vertices
        return GraphFrame(gx.vertices.drop('color'), gx.edges)
Example No. 14
def compute(day):
    # We want days day-30 through day-1
    sums = wikipediadata.where(
            (wikipediadata.day >= day-30) & (wikipediadata.day <= day-1))

    # Test subset
    #sums = sums.where((sums.page == 'Cadillac_Brougham') | ((sums.page == 'Roald_Dahl') & (sums.projectcode == 'fr')))

    # Sum the hourly views within each day
    sums = sums.groupby('projectcode', 'page', 'day').sum('views')
    # Cache for later
    sums.cache()

    # define a window := the previous day
    window_spec =  Window.partitionBy(sums.projectcode, sums.page) \
            .orderBy(sums.day.asc()).rowsBetween(-1, -1)

    # compute the difference views(d) - views(d-1)
    diffs = sums.withColumn('diff', sums.views - F.sum(sums.views) \
            .over(window_spec))

    # compute the coefficient to apply to each day
    coefs = pd.DataFrame({'day': range(day-30, day)})
    coefs['coef'] = 1. / (day - coefs.day)

    coefs = hc.createDataFrame(coefs)
    diffs = diffs.join(coefs, 'day')

    # compute each day's score
    diffs = diffs.withColumn('sub_score', diffs.diff * diffs.coef)

    totals = diffs.groupby('projectcode', 'page').sum('views', 'sub_score')
    # normalize by the square root of the sum of views
    totals = totals.withColumn('score',
            totals['SUM(sub_score)'] / F.sqrt(totals['SUM(views)'])) \
            .orderBy(F.desc('score')) \
            .withColumnRenamed('SUM(views)', 'total_views') \
            .limit(10)

    views = sums.select('projectcode', 'page', 'day', 'views') \
           .join(totals.select('projectcode', 'page', 'total_views', 'score'), 
                  (totals.projectcode == sums.projectcode) & (totals.page == sums.page), 'right_outer')

    df = totals.select('projectcode', 'page', 'total_views', 'score').toPandas()
    df2 = views.toPandas()
    df2 = df2.iloc[:, 2:]
    df2 = df2.pivot_table(values='views', columns=['day'], index=['projectcode', 'page'], fill_value=0)
    df = df.merge(df2, left_on=['projectcode', 'page'], right_index=True)
    df.to_csv(filename(day), index=False)
    
    # clear the cache
    hc.clearCache()
Example No. 15
def makeMapping(firstTime):
    global routes
    grpString = "ORIGIN_AIRPORT_ID","ORIGIN_CITY_NAME","ORIGIN","DEST_AIRPORT_ID","DEST_CITY_NAME","DEST","UNIQUE_CARRIER_NAME"
    if firstTime:
        routes = routes.groupBy(*grpString).agg(sum("PASSENGERS").alias("PASSENGERS"),sum("DEPARTURES_PERFORMED").alias("DEPARTURES_PERFORMED"),mean("RAMP_TO_RAMP").alias("RAMP_TO_RAMP"))
    for i in routes.collect():
        if not dictAir.get("Airport{}".format(i[0])):
            initNode(i[0],(i[1],i[2]),i[8])
        if not dictAir.get("Airport{}".format(i[3])):
            initNode(i[3],(i[4],i[5]),0)
        if (i[9]!=9876543.21):
            tripTime =i[9]
            getApt(i[0])['depts'] += i[8]
            sourceCNX = getApt(i[0])['cnx']
            sourceCNX.append((int(i[3]),tripTime,i[6]))
Example No. 16
    def test_udf_with_aggregate_function(self):
        df = self.spark.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
        from pyspark.sql.functions import udf, col, sum
        from pyspark.sql.types import BooleanType

        my_filter = udf(lambda a: a == 1, BooleanType())
        sel = df.select(col("key")).distinct().filter(my_filter(col("key")))
        self.assertEqual(sel.collect(), [Row(key=1)])

        my_copy = udf(lambda x: x, IntegerType())
        my_add = udf(lambda a, b: int(a + b), IntegerType())
        my_strlen = udf(lambda x: len(x), IntegerType())
        sel = df.groupBy(my_copy(col("key")).alias("k"))\
            .agg(sum(my_strlen(col("value"))).alias("s"))\
            .select(my_add(col("k"), col("s")).alias("t"))
        self.assertEqual(sel.collect(), [Row(t=4), Row(t=3)])
Example No. 17
    def handleUIOptions(self, displayColName):
        agg = self.options.get("aggregation")
        valFields = self.options.get("valueFields")

        if agg == 'COUNT':
            return self.entity.groupBy(displayColName).agg(F.count(displayColName).alias("agg")).toPandas()
        elif agg == 'SUM':
            return self.entity.groupBy(displayColName).agg(F.sum(valFields).alias("agg")).toPandas()
        elif agg == 'AVG':
            return self.entity.groupBy(displayColName).agg(F.avg(valFields).alias("agg")).toPandas()
        elif agg == 'MIN':
            return self.entity.groupBy(displayColName).agg(F.min(valFields).alias("agg")).toPandas()
        elif agg == 'MAX':
            return self.entity.groupBy(displayColName).agg(F.max(valFields).alias("agg")).toPandas()
        elif agg == 'MEAN':
            return self.entity.groupBy(displayColName).agg(F.mean(valFields).alias("agg")).toPandas()
        else:
            return self.entity.groupBy(displayColName).agg(F.count(displayColName).alias("agg")).toPandas()
Example No. 18
 def test_aggregate_messages(self):
     g = self._graph("friends")
     # For each user, sum the ages of the adjacent users,
     # plus 1 for the src's sum if the edge is "friend".
     sendToSrc = (
         AM.dst['age'] +
         sqlfunctions.when(
             AM.edge['relationship'] == 'friend',
             sqlfunctions.lit(1)
         ).otherwise(0))
     sendToDst = AM.src['age']
     agg = g.aggregateMessages(
         sqlfunctions.sum(AM.msg).alias('summedAges'),
         sendToSrc=sendToSrc,
         sendToDst=sendToDst)
     # Run the aggregation again providing SQL expressions as String instead.
     agg2 = g.aggregateMessages(
         "sum(MSG) AS `summedAges`",
         sendToSrc="(dst['age'] + CASE WHEN (edge['relationship'] = 'friend') THEN 1 ELSE 0 END)",
         sendToDst="src['age']")
     # Convert agg and agg2 to a mapping from id to the aggregated message.
     aggMap = {id_: s for id_, s in agg.select('id', 'summedAges').collect()}
     agg2Map = {id_: s for id_, s in agg2.select('id', 'summedAges').collect()}
     # Compute the truth via brute force.
     user2age = {id_: age for id_, age in g.vertices.select('id', 'age').collect()}
     trueAgg = {}
     for src, dst, rel in g.edges.select("src", "dst", "relationship").collect():
         trueAgg[src] = trueAgg.get(src, 0) + user2age[dst] + (1 if rel == 'friend' else 0)
         trueAgg[dst] = trueAgg.get(dst, 0) + user2age[src]
     # Compare if the agg mappings match the brute force mapping
     self.assertEqual(aggMap, trueAgg)
     self.assertEqual(agg2Map, trueAgg)
     # Check that TypeError is raised for messages of the wrong type
     with self.assertRaises(TypeError):
         g.aggregateMessages(
             "sum(MSG) AS `summedAges`",
             sendToSrc=object(),
             sendToDst="src['age']")
     with self.assertRaises(TypeError):
         g.aggregateMessages(
             "sum(MSG) AS `summedAges`",
             sendToSrc=AM.dst['age'],
             sendToDst=object())
Example No. 19
    def test_wrong_args(self):
        df = self.data

        with QuietTest(self.sc):
            with self.assertRaisesRegexp(ValueError, 'Invalid udf'):
                df.groupby('id').apply(lambda x: x)
            with self.assertRaisesRegexp(ValueError, 'Invalid udf'):
                df.groupby('id').apply(udf(lambda x: x, DoubleType()))
            with self.assertRaisesRegexp(ValueError, 'Invalid udf'):
                df.groupby('id').apply(sum(df.v))
            with self.assertRaisesRegexp(ValueError, 'Invalid udf'):
                df.groupby('id').apply(df.v + 1)
            with self.assertRaisesRegexp(ValueError, 'Invalid function'):
                df.groupby('id').apply(
                    pandas_udf(lambda: 1, StructType([StructField("d", DoubleType())])))
            with self.assertRaisesRegexp(ValueError, 'Invalid udf'):
                df.groupby('id').apply(pandas_udf(lambda x, y: x, DoubleType()))
            with self.assertRaisesRegexp(ValueError, 'Invalid udf.*GROUPED_MAP'):
                df.groupby('id').apply(
                    pandas_udf(lambda x, y: x, DoubleType(), PandasUDFType.SCALAR))
Example No. 20
    def test_complex_groupby(self):
        from pyspark.sql.functions import sum

        df = self.data
        sum_udf = self.pandas_agg_sum_udf
        plus_one = self.python_plus_one
        plus_two = self.pandas_scalar_plus_two

        # groupby one expression
        result1 = df.groupby(df.v % 2).agg(sum_udf(df.v))
        expected1 = df.groupby(df.v % 2).agg(sum(df.v))

        # empty groupby
        result2 = df.groupby().agg(sum_udf(df.v))
        expected2 = df.groupby().agg(sum(df.v))

        # groupby one column and one sql expression
        result3 = df.groupby(df.id, df.v % 2).agg(sum_udf(df.v)).orderBy(df.id, df.v % 2)
        expected3 = df.groupby(df.id, df.v % 2).agg(sum(df.v)).orderBy(df.id, df.v % 2)

        # groupby one python UDF
        result4 = df.groupby(plus_one(df.id)).agg(sum_udf(df.v))
        expected4 = df.groupby(plus_one(df.id)).agg(sum(df.v))

        # groupby one scalar pandas UDF
        result5 = df.groupby(plus_two(df.id)).agg(sum_udf(df.v))
        expected5 = df.groupby(plus_two(df.id)).agg(sum(df.v))

        # groupby one expression and one python UDF
        result6 = df.groupby(df.v % 2, plus_one(df.id)).agg(sum_udf(df.v))
        expected6 = df.groupby(df.v % 2, plus_one(df.id)).agg(sum(df.v))

        # groupby one expression and one scalar pandas UDF
        result7 = df.groupby(df.v % 2, plus_two(df.id)).agg(sum_udf(df.v)).sort('sum(v)')
        expected7 = df.groupby(df.v % 2, plus_two(df.id)).agg(sum(df.v)).sort('sum(v)')

        self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
        self.assertPandasEqual(expected2.toPandas(), result2.toPandas())
        self.assertPandasEqual(expected3.toPandas(), result3.toPandas())
        self.assertPandasEqual(expected4.toPandas(), result4.toPandas())
        self.assertPandasEqual(expected5.toPandas(), result5.toPandas())
        self.assertPandasEqual(expected6.toPandas(), result6.toPandas())
        self.assertPandasEqual(expected7.toPandas(), result7.toPandas())
Example No. 21
    def test_smvTimePanelAgg_with_Week(self):
        df = self.createDF("k:Integer; ts:String; v:Double",
                 "1,20120301,1.5;" +
                 "1,20120304,4.5;" +
                 "1,20120308,7.5;" +
                 "1,20120309,2.45"
             ).withColumn("ts", col('ts').smvStrToTimestamp("yyyyMMdd"))

        import smv.panel as p

        res = df.smvGroupBy('k').smvTimePanelAgg(
            'ts', p.Week(2012, 3, 1), p.Week(2012, 3, 10)
        )(
            sum('v').alias('v')
        )

        expect = self.createDF("k: Integer;smvTime: String;v: Double",
            """1,W20120305,9.95;
                1,W20120227,6.0""")

        self.should_be_same(res, expect)
Example No. 22
    def test_smvTimePanelAgg(self):
        df = self.createDF("k:Integer; ts:String; v:Double",
            """1,20120101,1.5;
                1,20120301,4.5;
                1,20120701,7.5;
                1,20120501,2.45"""
            ).withColumn("ts", col('ts').smvStrToTimestamp("yyyyMMdd"))

        import smv.panel as p

        res = df.smvGroupBy('k').smvTimePanelAgg(
            'ts', p.Quarter(2012,1), p.Quarter(2012,2)
        )(
            sum('v').alias('v')
        )

        expect = self.createDF("k: Integer;smvTime: String;v: Double",
                """1,Q201201,6.0;
                    1,Q201202,2.45""")

        self.should_be_same(expect, res)
Example No. 23
    def test_complex_expressions(self):
        df = self.data
        plus_one = self.python_plus_one
        plus_two = self.pandas_scalar_plus_two
        sum_udf = self.pandas_agg_sum_udf

        # Test complex expressions with sql expression, python UDF and
        # group aggregate pandas UDF
        result1 = (df.withColumn('v1', plus_one(df.v))
                   .withColumn('v2', df.v + 2)
                   .groupby(df.id, df.v % 2)
                   .agg(sum_udf(col('v')),
                        sum_udf(col('v1') + 3),
                        sum_udf(col('v2')) + 5,
                        plus_one(sum_udf(col('v1'))),
                        sum_udf(plus_one(col('v2'))))
                   .sort('id')
                   .toPandas())

        expected1 = (df.withColumn('v1', df.v + 1)
                     .withColumn('v2', df.v + 2)
                     .groupby(df.id, df.v % 2)
                     .agg(sum(col('v')),
                          sum(col('v1') + 3),
                          sum(col('v2')) + 5,
                          plus_one(sum(col('v1'))),
                          sum(plus_one(col('v2'))))
                     .sort('id')
                     .toPandas())

        # Test complex expressions with sql expression, scalar pandas UDF and
        # group aggregate pandas UDF
        result2 = (df.withColumn('v1', plus_one(df.v))
                   .withColumn('v2', df.v + 2)
                   .groupby(df.id, df.v % 2)
                   .agg(sum_udf(col('v')),
                        sum_udf(col('v1') + 3),
                        sum_udf(col('v2')) + 5,
                        plus_two(sum_udf(col('v1'))),
                        sum_udf(plus_two(col('v2'))))
                   .sort('id')
                   .toPandas())

        expected2 = (df.withColumn('v1', df.v + 1)
                     .withColumn('v2', df.v + 2)
                     .groupby(df.id, df.v % 2)
                     .agg(sum(col('v')),
                          sum(col('v1') + 3),
                          sum(col('v2')) + 5,
                          plus_two(sum(col('v1'))),
                          sum(plus_two(col('v2'))))
                     .sort('id')
                     .toPandas())

        # Test sequential groupby aggregate
        result3 = (df.groupby('id')
                   .agg(sum_udf(df.v).alias('v'))
                   .groupby('id')
                   .agg(sum_udf(col('v')))
                   .sort('id')
                   .toPandas())

        expected3 = (df.groupby('id')
                     .agg(sum(df.v).alias('v'))
                     .groupby('id')
                     .agg(sum(col('v')))
                     .sort('id')
                     .toPandas())

        self.assertPandasEqual(expected1, result1)
        self.assertPandasEqual(expected2, result2)
        self.assertPandasEqual(expected3, result3)
Example No. 24
 def run(self, i):
     df = i[_DEP_NAME_]
     return df.groupBy(F.col("ST")).agg(F.sum(F.col("EMP")).alias("EMP"))
Example No. 25
    hbase_rdd = sc.newAPIHadoopRDD(HBaseUtil.inputFormatClass,
                                   HBaseUtil.keyClass,
                                   HBaseUtil.valueClass,
                                   keyConverter=HBaseUtil.keyConv,
                                   valueConverter=HBaseUtil.valueConv,
                                   conf=HBaseUtil.conf)
    values = hbase_rdd.values()
    init_rdd = values.flatMap(lambda x: x.split("\n")).map(lambda x: json.loads(x)) \
        .map(lambda x: dp.dict_del(x))
    data_frame = sqlContext.read.json(init_rdd)
    # data_frame.show()
    # data_frame.printSchema()

    result = data_frame.groupBy('qualifier').agg(
        F.min(data_frame.value), F.max(data_frame.value),
        F.avg(data_frame.value), F.sum(data_frame.value),
        F.count(data_frame.value)).collect()
    for x in result:
        print(x)

    # valid_rdd = init_rdd.filter(
    #     lambda x: x.get('A1')[0:4] == '0110' and int(x.get('A6')) == 0).cache()
    # roll_filter_rdd = valid_rdd.filter(lambda x: not x.get('A2') is None).map(lambda x: float(x.get('A2'))).cache()
    # roll_mean = roll_filter_rdd.mean()
    # roll_count = roll_filter_rdd.count()
    # roll_max = roll_filter_rdd.max()
    # roll_min = roll_filter_rdd.min()
    #
    # pitch_filter_rdd = valid_rdd.filter(lambda x: not x.get('A3') is None).map(lambda x: float(x.get('A3'))).cache()
    # pitch_mean = pitch_filter_rdd.mean()
    # pitch_count = pitch_filter_rdd.count()
Example No. 26
    .getOrCreate()

#part 1 question 1
df = spark.read.csv(r"C:\Users\pallavi\PycharmProjects\BDP_ICP10\survey.csv",
                    header=True)
df.createOrReplaceTempView("Survey")

#part 1 question 2
df.write.option("header", "true").csv("spark_survey3.csv")

#part 1 question 3
print(df.dropDuplicates().count())
df.groupBy(df.columns)\
.count()\
.where(f.col('count') > 1)\
.select(f.sum('count'))\
.show()

#part 1 question 4
spark.sql(
    "select * from Survey where Gender = 'Male' or Gender = 'M' or Gender='male'"
).createTempView("Table_Male")
spark.sql("select * from Survey where Gender = 'Female' or Gender = 'female'"
          ).createTempView("Table_Female")
spark.sql(
    "select * from Table_Male union select * from Table_Female order by Country "
).show(50)

#part 1 question 5
spark.sql("select treatment,count(*) as count from Survey group by treatment"
          ).show()
Example No. 27
    def test_complex_expressions(self):
        df = self.data
        plus_one = self.python_plus_one
        plus_two = self.pandas_scalar_plus_two
        sum_udf = self.pandas_agg_sum_udf

        # Test complex expressions with sql expression, python UDF and
        # group aggregate pandas UDF
        result1 = (df.withColumn('v1', plus_one(df.v))
                   .withColumn('v2', df.v + 2)
                   .groupby(df.id, df.v % 2)
                   .agg(sum_udf(col('v')),
                        sum_udf(col('v1') + 3),
                        sum_udf(col('v2')) + 5,
                        plus_one(sum_udf(col('v1'))),
                        sum_udf(plus_one(col('v2'))))
                   .sort(['id', '(v % 2)'])
                   .toPandas().sort_values(by=['id', '(v % 2)']))

        expected1 = (df.withColumn('v1', df.v + 1)
                     .withColumn('v2', df.v + 2)
                     .groupby(df.id, df.v % 2)
                     .agg(sum(col('v')),
                          sum(col('v1') + 3),
                          sum(col('v2')) + 5,
                          plus_one(sum(col('v1'))),
                          sum(plus_one(col('v2'))))
                     .sort(['id', '(v % 2)'])
                     .toPandas().sort_values(by=['id', '(v % 2)']))

        # Test complex expressions with sql expression, scalar pandas UDF and
        # group aggregate pandas UDF
        result2 = (df.withColumn('v1', plus_one(df.v))
                   .withColumn('v2', df.v + 2)
                   .groupby(df.id, df.v % 2)
                   .agg(sum_udf(col('v')),
                        sum_udf(col('v1') + 3),
                        sum_udf(col('v2')) + 5,
                        plus_two(sum_udf(col('v1'))),
                        sum_udf(plus_two(col('v2'))))
                   .sort(['id', '(v % 2)'])
                   .toPandas().sort_values(by=['id', '(v % 2)']))

        expected2 = (df.withColumn('v1', df.v + 1)
                     .withColumn('v2', df.v + 2)
                     .groupby(df.id, df.v % 2)
                     .agg(sum(col('v')),
                          sum(col('v1') + 3),
                          sum(col('v2')) + 5,
                          plus_two(sum(col('v1'))),
                          sum(plus_two(col('v2'))))
                     .sort(['id', '(v % 2)'])
                     .toPandas().sort_values(by=['id', '(v % 2)']))

        # Test sequential groupby aggregate
        result3 = (df.groupby('id')
                   .agg(sum_udf(df.v).alias('v'))
                   .groupby('id')
                   .agg(sum_udf(col('v')))
                   .sort('id')
                   .toPandas())

        expected3 = (df.groupby('id')
                     .agg(sum(df.v).alias('v'))
                     .groupby('id')
                     .agg(sum(col('v')))
                     .sort('id')
                     .toPandas())

        assert_frame_equal(expected1, result1)
        assert_frame_equal(expected2, result2)
        assert_frame_equal(expected3, result3)
Example No. 28
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext

my_spark = SparkSession.builder.getOrCreate()

df = my_spark.read.format("com.mongodb.spark.sql.DefaultSource").option(
    "uri",
    "mongodb://*****:*****@ec2-54-210-44-189.compute-1.amazonaws.com/test.reviews"
).load()
df1 = my_spark.read.format("com.mongodb.spark.sql.DefaultSource").option(
    "uri",
    "mongodb://*****:*****@ec2-54-210-44-189.compute-1.amazonaws.com/test.metadata"
).load()

df3 = df.groupBy("asin").agg(
    func.sum("overall").alias("item_sum"),
    func.count(func.lit(1)).alias("item_counts"))
df3 = df3.filter(df3["item_counts"] >= 100)
df3 = df3.withColumn("item_avg",
                     func.col("item_sum") /
                     func.col("item_counts")).drop('item_sum')

#select from metadata
df4 = df1.select("asin", "categories", "title")
df5 = df4.join(df3, "asin", "inner").drop("asin")

df6 = df5.select("categories", "title", "item_counts", "item_avg",
                 func.explode_outer("categories"))
df7 = df6.drop("categories").withColumnRenamed("col", "categories")
df9 = df7.select("categories", "title", "item_counts",
                 "item_avg").sort("categories")
Example No. 29
    #master_regex_one_df = master_regex_one_df.repartition(160)
    mid_time1 = time.time()
    print("Number of partitions of master_regex_one_df: {}".format(
        master_regex_one_df.rdd.getNumPartitions()))
    master_shrunken_df = master_regex_one_df.where(
        'regexes_diff_bool == 1 or core_diff_bool == 1')

    print("--- %s seconds ---" % (time.time() - mid_time1))

    print(
        'these two should be the same: the sums of regexes_diff_bool and core_diff_bool:'
    )
    # TEST - these two should be the same (if not 0):
    print("master:")
    master_regex_one_df.groupBy("year", "month").agg(
        f.sum("regexes_diff_bool").alias("num_revs_with_regex_diff"),
        f.sum("core_diff_bool").alias("num_revs_with_core_diff")).orderBy(
            master_regex_one_df.year, master_regex_one_df.month).show(n=30)
    print("filtered:")
    master_shrunken_df.groupBy("year", "month").agg(
        f.sum("regexes_diff_bool").alias("num_revs_with_regex_diff"),
        f.sum("core_diff_bool").alias("num_revs_with_core_diff")).orderBy(
            master_shrunken_df.year, master_shrunken_df.month).show(n=30)
    print('\n\n\n')

    # MASTER
    #TODO See if we can export the master_regex_one_df file actually
    #print("Preview master_regex_one_df: ")
    #master_regex_one_df.orderBy(master_regex_one_df.articleid.asc_nulls_first(), master_regex_one_df.year, master_regex_one_df.month, master_regex_one_df.date_time).show(n=10)

    #out_filepath_master = "{}/{}_master_{}.tsv".format(args.output_directory,args.output_filename,datetime.utcnow().strftime("%Y-%m-%d_%H-%M-%S"))
Example No. 30
# MAGIC - Explore the data to see whether we can use it to produce energy consumption forecasts, and if yes, write code to prepare the data to fit a model on
# MAGIC 
# MAGIC We can start by calculating energy consumption at daily and monthly levels.
# MAGIC We can do that by doing separate aggregations and saving the datasets separately. Alternatively, we can do roll-up aggregations, where we calculate hierarchical subtotals
# MAGIC (from left to right).

# COMMAND ----------

consumption_df = (consumption_df.selectExpr('to_date(DateTime) AS Date', '*')
                 .selectExpr('year(date) AS Year', 'month(Date) as Month', '*')
                 )

# COMMAND ----------

consumption_rollup_df = (consumption_df.rollup(['Year', 'Month', 'Date', 'LCLid']) # pay attention to the ordering of cols, as aggregations goes left-to-right
                      .agg(f.sum('KWH_half_hour').alias('KWH'),
                          f.countDistinct('LCLid').alias('n_households')
                          )
                      .orderBy(['Year', 'Month', 'Date', 'LCLid'])
).cache()

# COMMAND ----------

# triggering the computation
display(consumption_rollup_df)

# COMMAND ----------

# we cached the object, so this will be instantaneous:
display(consumption_rollup_df)
    StructField("name", StringType(), True)
])

names = spark.read.schema(schema).option("sep", " ").csv("./Marvel-Names.txt")

lines = spark.read.text("./Marvel-Graph.txt")

connections = (
    # the first column of each row is the id of a superhero we want to check
    lines.withColumn("id",
                     func.split(func.col("value"), " ")[0])
    # - 1 means exclude the superhero him/herself
    .withColumn("connections",
                func.size(func.split(func.col("value"), " ")) -
                1).groupBy("id").agg(
                    func.sum("connections").alias("connections")))

minConnectionCount = connections.agg(func.min("connections")).first()[0]
# list all heroes who have the fewest connections
mostObscureHeroes = connections.filter(
    func.col("connections") == minConnectionCount)

mostObscureHeroNames = names.join(mostObscureHeroes, "id", "inner")

print(
    f"The following characters have only {minConnectionCount} connection{ '' if minConnectionCount <=1 else 's' }"
)

mostObscureHeroNames.select("name").show()

# Stop the session
Example No. 32
    target, F.log1p(F.col(target))))

fitted = gbt.fit(X_train)

yhat = (fitted.transform(X_test).withColumn(
    "prediction", F.expm1(F.col("prediction"))).withColumn(
        target, F.expm1(F.col(target))).withColumn(
            'fiability', 1 - F.abs(F.col(target) - F.col("prediction")) /
            F.col(target)).withColumn(
                'fiability',
                F.when(F.col("fiability") < 0,
                       0).otherwise(F.col("fiability"))))

print(
    yhat.select(
        F.sum(F.col(target) * F.col("fiability")) /
        F.sum(F.col(target))).show())

eval_ = RegressionEvaluator(labelCol=target,
                            predictionCol="prediction",
                            metricName="rmse")

rmse = eval_.evaluate(yhat)

print('rmse is %.2f' % rmse)

mae = eval_.evaluate(yhat, {eval_.metricName: "mae"})
print('mae is %.2f' % mae)

r2 = eval_.evaluate(yhat, {eval_.metricName: "r2"})
print('r2 is %.2f' % r2)
Example No. 33
    df.createTempView('orders_VW')
    spark.sql('select * from orders_VW order by order_status desc').show()
 
 

 
 #--------------------------------------------
 #-------------------AGGREGATE----------------
 #--------------------------------------------
    # list all available functions; spark-shell can be used for this to print all available APIs
    # spark-shell
    # org.apache.spark.sql.functions.
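        # A quick sketch of how to list the available functions from within PySpark itself:
        import pyspark.sql.functions as F
        print([name for name in dir(F) if not name.startswith('_')])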
    
    
        from pyspark.sql.functions import sum, round  # check if required
        df.select(round(sum('order_price'), 2)).show()
    
    
#GROUPBY
        df.groupBy('Order_id').sum('order_price').show()
    
        df.groupBy('Order_id').agg(sum('order_price')).show()
        df.groupBy('Order_id').agg(sum('order_price').alias('sum_price')).show()  # alias not allowed without agg
    
    
        df.groupBy('Order_id').count().show()
        
        df.groupBy("department","state").sum("salary","bonus")
    
    # to have an alias, every function inside agg requires a column name
        df_orderItems.groupBy('Order_id').agg(count('order_id').alias('price_count')).show() 
Example No. 34
# COMMAND ----------

# TEST
from test_helper import Test
Test.assertEquals(sizedFirst[0], len(sizedFirst[1]), 'incorrect implementation for sized')

# COMMAND ----------

# MAGIC %md
# MAGIC Next, you'll need to aggregate the counts.  You can do this using `func.sum` in either a `.select` or `.agg` method call on the `DataFrame`.  Make sure to give your `Column` the alias `numberOfWords`.  There are some examples in [Python](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.GroupedData.agg) and [Scala](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.DataFrame) in the APIs.

# COMMAND ----------

# ANSWER
numberOfWords = sized.agg(func.sum('size').alias('numberOfWords'))

wordCount = numberOfWords.first()[0]
print(wordCount)
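
# The markdown cell above notes that `func.sum` works in either a `.select` or an
# `.agg` call; this is the equivalent `.select` form (a sketch using the same
# `sized` DataFrame):
numberOfWordsViaSelect = sized.select(func.sum('size').alias('numberOfWords'))
print(numberOfWordsViaSelect.first()[0])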

# COMMAND ----------

# TEST
Test.assertEquals(wordCount, 1903220, 'incorrect word count')

# COMMAND ----------

# MAGIC %md
# MAGIC Next, we'll compute the word count using `select`, the function `func.explode()`, and then taking a `count()` on the `DataFrame`.  Make sure to name the column returned by the `explode` function 'word'.

# COMMAND ----------
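
# A minimal sketch of the select/explode/count approach just described; the toy
# DataFrame and the use of `sqlContext` here are illustrative assumptions.
toyDF = sqlContext.createDataFrame([(['to', 'be', 'or'],), (['not', 'to', 'be'],)], ['words'])
explodedDF = toyDF.select(func.explode(func.col('words')).alias('word'))
print(explodedDF.count())  # one row per word -> 6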
Example No. 35
#   - daily_usage_aggs
# as health:
#   - as_health_aggs
# as sessions:
#   - as_session_aggs
# as clicks:
#   - as_clicks_aggs
# as snippets:
#   - as_snippets_aggs = get_as_snippets_aggs(message_id=None)

# agg fields: main summary usage ------------------

days_used = F.countDistinct(F.col('activity_dt')).alias('days_used')

active_hours = F.sum(
    F.coalesce(F.col('active_ticks'), F.lit(0)) * F.lit(5) /
    F.lit(3600)).alias('active_hours')
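
# active_ticks is assumed to count 5-second activity intervals, so the
# ticks * 5 / 3600 expression above converts the total to hours, with
# coalesce(..., 0) treating missing values as zero activity.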

uris = F.sum(
    F.coalesce(F.col('scalar_parent_browser_engagement_total_uri_count'),
               F.lit(0))).alias('uris')

tabs_opened = F.sum(
    F.coalesce(F.col('scalar_parent_browser_engagement_tab_open_event_count'),
               F.lit(0))).alias('tabs_opened')

windows_opened = F.sum(
    F.coalesce(
        F.col('scalar_parent_browser_engagement_window_open_event_count'),
        F.lit(0))).alias('windows_opened')
Example No. 36
    # data_c = spark.read.json('channel_base/part*', schema = channel_sch)
    
    convertTime = functions.udf(timeToFrame)

    data_s = spark.read.json('stream_info.json', schema = stream_sch)
    data_c = spark.read.json('channel_info.json', schema = channel_sch).cache()
    
    data_s = data_s.withColumn('time_frame', convertTime(data_s.created_at)).cache()

    data_s.createOrReplaceTempView('data_s')
    data_c.createOrReplaceTempView('data_c')

    game_count_by_time = data_s.groupBy('time_frame', 'game_name').count()
    game_count_by_time = game_count_by_time.orderBy(game_count_by_time['count'].desc())
    
    view_count_by_time = data_s.groupBy('time_frame', 'game_name').agg(functions.sum('viewers').alias('total_view'))
    view_count_by_time = view_count_by_time.orderBy(view_count_by_time['total_view'].desc())
    
    game_count_by_time.coalesce(1).write.json('game_count_by_time', mode = 'overwrite')
    view_count_by_time.coalesce(1).write.json('view_count_by_time', mode = 'overwrite')


    view_num_by_game = data_c.groupby(data_c['game']).agg(functions.sum(data_c['views']),functions.sum(data_c['followers']))

    view_num_by_streamer = data_c\
        .select('stream_id','channel_id','game','name','views','followers','created_at','updated_at','partner')\
        .orderBy(functions.desc('views'),'game')

    print(view_num_by_streamer.show(5))

    viewcount_by_game = view_num_by_game\
Example No. 37
df_vuelos_retraso = df_vuelos_retraso.join(df_pais, df_pais["cod_pais"] == df_vuelos_retraso["origen"], 'inner')
df_vuelos_retraso = df_vuelos_retraso.select("vuelo", f.col("pais").alias("origen"), "destino","dias_retraso")

df_vuelos_retraso = df_vuelos_retraso.join(df_pais, df_pais["cod_pais"] == df_vuelos_retraso["destino"], 'inner')
df_vuelos_retraso = df_vuelos_retraso.select("vuelo", "origen", f.col("pais").alias("destino"),"dias_retraso")

print("Retraso Vuelos")
print(df_vuelos_retraso.show())

#df_top_dia_retrasos.write.mode("overwrite").saveAsTable("top_dia_retrasos")
df_retraso_acumulado = df_join.join(df_vuelos, "vuelo", 'inner')
df_retraso_acumulado = df_retraso_acumulado.groupBy('origen', 'dia').sum('dias_retraso')
df_retraso_acumulado = df_retraso_acumulado.select("origen","dia",f.col("sum(dias_retraso)").alias("retraso"))
window = w.Window.partitionBy(f.col("origen")).orderBy(f.col("retraso"))
df_retraso_acumulado = df_retraso_acumulado.withColumn("retraso_acumulado", f.sum("retraso").over(window))

df2 = df_retraso_acumulado.repartition(4)

##testing
"""
pais_vip = ["Peru", "España", "Mexico"]
udf_pais_vip = f.udf(lambda x : "VIP" if x in pais_vip else "NO VIP")

df = df_salidas.select(f.col("pais"),f.col("top_salidas"), udf_pais_vip(f.col("pais")).alias("vip"))

columns = ["Seqno","Name"]
data = [("1", "john jones"),
    ("2", "tracey smith"),
    ("3", "amy sanders")]
df = spark.createDataFrame(data=data,schema=columns)
Example No. 38
duration_udf = udf(lambda start, end: getDuration(start, end))
length_udf = udf(lambda length: convertLength(length))
count_active_udf = udf(
    lambda duration, length: countForActive(duration, length))
percentage_udf = udf(
    lambda numActive, numTotal: percentageActive(numActive, numTotal))

content_map = spark.read.csv('../content_mapping.csv',
                             header='true').select('title', 'length')
df = spark.read.parquet(
    './DLT_03_users_device_logs_shifted/*').dropDuplicates().na.drop()
df = df.join(content_map, [df.item_name == content_map.title])
df = df.withColumn('duration', duration_udf(df['start_time'], df['end_time']))
df = df.withColumn('length', length_udf(df['length']))
df = df.withColumn('counts_for_active',
                   count_active_udf(df['duration'], df['length']))
df = df.groupBy('device_id',
                'month').agg(F.sum('counts_for_active').alias('count'))
active = df.filter(df['count'] >= 20)
non_active = df.filter(df['count'] < 20)
active = active.groupBy('month').agg(
    F.count('device_id').alias('active')).orderBy('month')
non_active = non_active.groupBy('month').agg(
    F.count('device_id').alias('user_counts')).orderBy('month').selectExpr(
        'month as month2', 'user_counts as non_active')
all_users = active.join(
    non_active,
    [active.month == non_active.month2]).drop('month2').orderBy('month')

all_users.coalesce(1).write.parquet('DTL_04_users_by_month')
Example No. 39
 def test_select_aggregate_dont_preserve_order(self):
     from pyspark.sql.functions import sum
     self.shared_test_partition_preserving(lambda df: df.select(sum('forecast')), False)
Example No. 40
def count_not_null(c):
    return sum(col(c).isNotNull().cast("integer")).alias(c)
Example No. 41
 def count_null(col_name):
     """ Build up a list of column expressions, one per column. """
     return sum(col(col_name).isNull().cast("integer")).alias(col_name)
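
 # A hypothetical usage sketch: count the nulls in every column in a single pass.
 # The imports, SparkSession, and toy DataFrame below are illustrative assumptions.
 from pyspark.sql import SparkSession
 from pyspark.sql.functions import col, sum
 spark = SparkSession.builder.getOrCreate()
 toy = spark.createDataFrame([(1, None), (None, 'b')], ['x', 'y'])
 toy.agg(*[count_null(c) for c in toy.columns]).show()  # expect x: 1, y: 1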
Example No. 42
  .master("local[4]")\
  .appName("NASA Kennedy Space Center WWW server")\
  .config("spark.sql.execution.arrow.enabled", "true")\
  .config("spark.memory.fraction", 0.8)\
  .config("spark.executor.memory", "1g")\
  .config("spark.driver.memory", "1g")\
  .getOrCreate()

nasaSchema = StructType([StructField("host",StringType(),True),\
      StructField("timestamp",StringType(),True),\
      StructField("requisicao",StringType(),True),\
      StructField("code_http",IntegerType(),True),\
      StructField("total_bytes",StringType(),True)])

nasa = spark.createDataFrame(df_nasa, schema=nasaSchema)

print(nasa.select('host').distinct().count())

print(nasa.filter(f.col('code_http') == 404).count())

nasa.groupBy("requisicao").agg(f.count('requisicao').alias('qtd')).sort(
    f.desc(qtd)).limit(5).show()

nasa = nasa.withColumn('data', nasa['timestamp'].substr(1, 10))
nasa.show()

nasa.filter(f.col('code_http') == 404).groupBy('data').agg(
    f.count('code_http')).show()

nasa.filter(f.col('code_http') == 404).groupBy('data').agg(
    f.sum('total_bytes')).show()
Example No. 43
def trioAnalysis(nMax):
    routed = itens.map(lambda x: (x[0],x[1],(parseResults(closestRoute(x[0],x[1],0,0),"trios")),x[2],x[3]))
    routed = routed.map(lambda x: (x[0],x[1],x[2],(x[4]-tripDistance(x[2]))*x[3]))
    print("Calculating optimal routes... (this could take a minute, but let's see you try to give directions to 5.8 million people!)")
    schemaString = ['ORIGIN_AIRPORT_ID','Dest','Trip','kmSaved']
    kmSavedDF=sqlc.createDataFrame(routed,schemaString).groupBy("ORIGIN_AIRPORT_ID").agg(sum("kmSaved").alias("kmSaved"))
    print("Looking up airport names...")
    kmSavedDF=kmSavedDF.map(lambda x: Row(Name=getName(x[0]),kmPerDept=(x[1]/getApt(x[0])['depts']),totalDepts=getApt(x[0])['depts'],totalKm = x[1]))
    kmSavedDF=sqlc.createDataFrame(kmSavedDF,['Name','kmPerDept','totalDepts','totalKm'])
    print("Saving data...")
    cwd = os.getcwd()
    kmSavedDF.write.parquet(cwd+"/FlightData/FlightOptResults.parquet")
    routes.write.parquet(cwd+"/FlightData/FlightOptRoutes.parquet")
    itens.write.parquet(cwd+"/FlightData/FlightOptItens.parquet")
    return kmSavedDF
Example No. 44
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "trades") \
        .option("startingOffsets", "earliest") \
        .load()

    value_df = kafka_df.select(
        from_json(col("value").cast("string"), stock_schema).alias("value"))

    trade_df = value_df.select("value.*") \
        .withColumn("CreatedTime", to_timestamp(col("CreatedTime"), "yyyy-MM-dd HH:mm:ss")) \
        .withColumn("Buy", expr("case when Type == 'BUY' then Amount else 0 end")) \
        .withColumn("Sell", expr("case when Type == 'SELL' then Amount else 0 end"))

    window_agg_df = trade_df \
        .withWatermark("CreatedTime", "30 minute") \
        .groupBy(window(col("CreatedTime"), "15 minute")) \
        .agg(sum("Buy").alias("TotalBuy"),
             sum("Sell").alias("TotalSell"))

    output_df = window_agg_df.select("window.start", "window.end", "TotalBuy",
                                     "TotalSell")

    window_query = output_df.writeStream \
        .format("console") \
        .outputMode("update") \
        .option("checkpointLocation", "chk-point-dir") \
        .trigger(processingTime="30 second") \
        .start()

    logger.info("Waiting for Query")
    window_query.awaitTermination()
Example No. 45
    def test_mixed_udfs(self):
        """
        Test mixing group aggregate pandas UDF with python UDF and scalar pandas UDF.
        """
        df = self.data
        plus_one = self.python_plus_one
        plus_two = self.pandas_scalar_plus_two
        sum_udf = self.pandas_agg_sum_udf

        # Mix group aggregate pandas UDF and python UDF
        result1 = (df.groupby('id')
                   .agg(plus_one(sum_udf(df.v)))
                   .sort('id'))
        expected1 = (df.groupby('id')
                     .agg(plus_one(sum(df.v)))
                     .sort('id'))

        # Mix group aggregate pandas UDF and python UDF (order swapped)
        result2 = (df.groupby('id')
                   .agg(sum_udf(plus_one(df.v)))
                   .sort('id'))
        expected2 = (df.groupby('id')
                     .agg(sum(plus_one(df.v)))
                     .sort('id'))

        # Mix group aggregate pandas UDF and scalar pandas UDF
        result3 = (df.groupby('id')
                   .agg(sum_udf(plus_two(df.v)))
                   .sort('id'))
        expected3 = (df.groupby('id')
                     .agg(sum(plus_two(df.v)))
                     .sort('id'))

        # Mix group aggregate pandas UDF and scalar pandas UDF (order swapped)
        result4 = (df.groupby('id')
                   .agg(plus_two(sum_udf(df.v)))
                   .sort('id'))
        expected4 = (df.groupby('id')
                     .agg(plus_two(sum(df.v)))
                     .sort('id'))

        # Wrap group aggregate pandas UDF with two python UDFs and use python UDF in groupby
        result5 = (df.groupby(plus_one(df.id))
                   .agg(plus_one(sum_udf(plus_one(df.v))))
                   .sort('plus_one(id)'))
        expected5 = (df.groupby(plus_one(df.id))
                     .agg(plus_one(sum(plus_one(df.v))))
                     .sort('plus_one(id)'))

        # Wrap group aggregate pandas UDF with two scalar pandas UDFs and use scalar pandas UDF in
        # groupby
        result6 = (df.groupby(plus_two(df.id))
                   .agg(plus_two(sum_udf(plus_two(df.v))))
                   .sort('plus_two(id)'))
        expected6 = (df.groupby(plus_two(df.id))
                     .agg(plus_two(sum(plus_two(df.v))))
                     .sort('plus_two(id)'))

        self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
        self.assertPandasEqual(expected2.toPandas(), result2.toPandas())
        self.assertPandasEqual(expected3.toPandas(), result3.toPandas())
        self.assertPandasEqual(expected4.toPandas(), result4.toPandas())
        self.assertPandasEqual(expected5.toPandas(), result5.toPandas())
        self.assertPandasEqual(expected6.toPandas(), result6.toPandas())
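The pandas_agg_sum_udf fixture used above is not shown in this excerpt; a grouped-aggregate pandas UDF with that behaviour could be declared roughly as follows (a sketch assuming Spark 2.4+ with pyarrow available).

from pyspark.sql.functions import pandas_udf, PandasUDFType

@pandas_udf("double", PandasUDFType.GROUPED_AGG)
def pandas_agg_sum_udf(v):
    # v arrives as a pandas Series holding one group's values; return a scalar
    return v.sum()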
Ejemplo n.º 46
0
# +---+--------------------+
# | id|                name|
# +---+--------------------+
# |  1|24-HOUR MAN/EMMANUEL|
# |  2|3-D MAN/CHARLES CHAN|
# |  3|    4-D MAN/MERCURIO|
# |  4|             8-BALL/|
# |  5|                   A|
# +---+--------------------+

lines = spark.read.text('datasets/Marvel-graph.txt')

connections = lines.withColumn('id', F.split(F.col('value'), ' ')[0])\
    .withColumn('connections', F.size(F.split(F.col('value'), ' ')) - 1)\
    .groupby('id')\
    .agg(F.sum('connections').alias('connections'))\
    .sort('connections', ascending=True)

# We get the first row from an ordered ranking on #connections
least_popular = connections.first()

least_popular_conn = connections.filter(F.col('connections') == least_popular.connections)

least_popular_conn.show()

joined = least_popular_conn.join(hero_names, 'id')

joined.show()
# +----+-----------+--------------------+
# |  id|connections|                name|
# +----+-----------+--------------------+
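The split/size trick above can be reproduced end-to-end on a couple of in-memory graph lines; the space-separated format (hero id followed by its neighbours) comes from the snippet, while the sample values are invented.

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[*]").getOrCreate()

# Each line: a hero id followed by the ids it is connected to (made-up sample)
toy_lines = spark.createDataFrame([("1 2 3 4",), ("2 1",)], ["value"])

toy_connections = toy_lines.withColumn("id", F.split(F.col("value"), " ")[0]) \
    .withColumn("connections", F.size(F.split(F.col("value"), " ")) - 1) \
    .groupby("id") \
    .agg(F.sum("connections").alias("connections")) \
    .sort("connections", ascending=True)

toy_connections.show()  # least-connected ids come first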
Ejemplo n.º 47
0
# Pick out the specific columns of interest
statColumn = ['CUSAUNT', 'CARNO', 'STDNO', 'TDATE', 'QTY', 'MILE']
pDf = df.select(statColumn)
# Split the date column into separate year, month, and day columns
tDf = (pDf.withColumn('TDATEYEAR', pDf['TDATE'].substr(1, 4)).withColumn(
    'TDATEMONTH',
    pDf['TDATE'].substr(5, 2)).withColumn('TDATEDAY',
                                          pDf['TDATE'].substr(7, 2)))
# Drop the no-longer-needed column
tDf = tDf.drop('TDATE')
#
groupColumn = [
    'CUSAUNT', 'CARNO', 'STDNO', 'TDATEYEAR', 'TDATEMONTH', 'TDATEDAY'
]
sDf = (tDf.groupBy(groupColumn).agg(
    sum(tDf.QTY.cast('float')).alias('sQty'),
    sum(tDf.MILE.cast('float')).alias('sMile'),
    count(tDf.QTY.cast('float')).alias('cTimes')).orderBy(groupColumn))
#
# Output path
outputPath = "/home/cpc/data/resultData"
# Output file name
outputFile = "cusauntCarnoYMDsQtysMilecTimes"
# Full output path and file name
outputFull = outputPath + "/" + outputFile
#
sDf.toJSON().coalesce(1).saveAsTextFile(outputFull)

#
# Total annual/monthly sales volume by oil product (gasoline/diesel)
#
    # Subtract startTime from endTime to get lifeTime of each revision
    tScoring = perf_counter()
    dfScore = dfScore.withColumn(
        "LiveSeconds",
        func.when(
            func.isnull(dfScore.EpochTimestampEnd -
                        dfScore.EpochTimestampStart),
            dateOfDataDumpInEpochSeconds -
            dfScore.EpochTimestampStart).otherwise(
                dfScore.EpochTimestampEnd - dfScore.EpochTimestampStart))
    #    dfScore.show()
    #    print("Table with Lifetime Calcs ^^^")

    # Score each contributor based on the average and total life of their revisions
    contributors = Window.partitionBy("cleanContributor")
    outDf = dfScore.withColumn("Score-Sum", func.sum("LiveSeconds").over(contributors)) \
              .withColumn("Score-Avg", func.avg("LiveSeconds").over(contributors)) \
              .withColumn("Score-Count", func.count("LiveSeconds").over(contributors)) \
              .select("cleanContributor","Score-Sum","Score-Avg","Score-Count","isRegistered").distinct().orderBy("cleanContributor","Score-Sum","Score-Avg","Score-Count",ascending=False)
    #    numContributors = outDf.count()
    tScoringEnd = perf_counter()
    #    outDf.show(numContributors,truncate=False)
    #    outDf.show()
    #    print("Total number contributors: ",numContributors)

    # Write output to Postgresql database
    dbTableName = "ContributorScores"
    tWriteDB = perf_counter()
    url = 'jdbc:postgresql://10.0.0.11:5442/cluster_output'
    postgresUser = os.environ['POSTGRES_USER']
    postgresPass = os.environ['POSTGRES_PASS']
spark = SparkSession.builder.master(
    "spark://ec2-34-206-0-125.compute-1.amazonaws.com:7077").appName(
        "amazon-insights").config("spark.executor.memory",
                                  "6gb").getOrCreate()
sqlContext = SQLContext(spark.sparkContext)
departments = []
s3 = boto3.client('s3')
response = s3.list_objects_v2(Bucket='amazonreviewsinsight', Delimiter='/')
obj = response.get('CommonPrefixes')
for obj in response.get('CommonPrefixes'):
    department = str(obj.get('Prefix')).replace("product_category=", "")
    departments.append(department)

reviews = sqlContext.read.parquet(
    's3a://amazonreviewsinsight/product_category=Electronics/part-00000-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet'
)
reviews = sqlContext.read.parquet(
    's3a://amazonreviewsinsight/product_category=' + departments[0])
reviews = reviews.filter(reviews.marketplace == 'US')
reviews = reviews.drop('market_place', 'product_id', 'customer_id',
                       'review_id', 'product_parent', 'vine',
                       'review_headline')
reviews = reviews.groupby('product_title', 'review_date').agg(
    f.mean('star_rating').alias('avg_star_rating_daily'),
    f.count('product_title').alias('no_of_purchases'),
    f.sum('helpful_votes').alias('helpful_votes_in_day'),
    f.sum('total_votes').alias('total_votes_in_day'),
    f.collect_list('review_body').alias("daily_text_review"))
reviews = reviews.withColumn(
    "reviews_no_punc",
    lower(trim(regexp_replace('daily_reviews', '[^A-Za-z0-9 ]+', ''))))
Ejemplo n.º 50
0
    def get_aggregared_sum(self, df, dimension_columns):
        grouped_data = {}
        for col in dimension_columns:
            agg_df = df.groupBy(col).agg(FN.sum(col).alias("count")).toPandas()
            grouped_data[col] = dict(list(zip(agg_df[col], agg_df["count"])))
        return grouped_data
Ejemplo n.º 51
0
).option('mode', 'DROPMALFORMED').load(
    "file:///media/alessandro/storage/big_data-primoProgetto/dataset/X1_historical_stocks.csv"
)

stock_prices = stock_prices.select('ticker', 'close', 'volume',
                                   year("date").alias('year'))

stocks = stocks.select('ticker', 'sector')

joined = stock_prices.join(stocks, on='ticker')

filtered = joined.filter((joined.year <= '2018') & (joined.year >= '2004')
                         & (joined.sector != 'N/A'))

intermediate1 = filtered.groupBy('sector', 'year').agg(
    F.sum(filtered.volume).alias('volCompl'),
    F.mean(filtered.close).alias('avg_volume'))

intermediate1 = intermediate1.sort(F.desc('sector'), F.desc('year'))

intermediate2 = filtered.groupBy('sector', 'year').agg(
    F.sum(filtered.close).alias('actualQuote'))

intermediate2 = intermediate2.sort(F.desc('sector'), F.desc('year'))

intermediate3 = intermediate2.withColumn(
    'previousQuote',
    F.lead('actualQuote').over(
        Window.partitionBy('sector').orderBy(F.desc('sector'),
                                             F.desc('year'))))
Ejemplo n.º 52
0
    def __call__(self, df, c, by=None, index='_idx', result='_res'):
        return dataframe.percentiles(df, c, by, self.p, index, result)


class typeof:
    def __call__(self, df, c, by=None, index='_idx', result='_res'):
        _gcols = [by] if isinstance(by, str) and by else by or []
        t = df.select(c).schema.fields[0].dataType.simpleString()
        return df.select(c, *_gcols).groupby(*_gcols).agg(
            F.lit(c).alias(index),
            F.lit(t).alias(result))


df_functions = (typeof, topn, topn_count, topn_values, percentiles)

null = lambda c: F.sum(c.isNull().cast('int'))
nan = lambda c: F.sum(F.isnan(c).cast('int'))
integer = lambda c: F.coalesce(F.sum((F.rint(c) == c).cast('int')), F.lit(0))
boolean = lambda c: F.coalesce(
    F.sum((c.cast('boolean') == F.rint(c)).cast('int')), F.lit(0))
zero = lambda c: F.sum((c == 0).cast('int'))
empty = lambda c: F.sum((F.length(c) == 0).cast('int'))
pos = lambda c: F.sum((c > 0).cast('int'))
neg = lambda c: F.sum((c < 0).cast('int'))
distinct = lambda c: F.countDistinct(c)

one = lambda c: F.first(c, False).cast(T.StringType())
count = F.count

sum = F.sum
sum_pos = lambda c: F.sum(F.when(c > 0, c))
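Because each helper above returns a Column expression, they compose directly inside a single agg call; a minimal usage sketch (the toy DataFrame and column name are assumptions, and the lambdas above are expected to be in scope):

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[*]").getOrCreate()
toy = spark.createDataFrame([(1.0,), (0.0,), (-2.0,), (None,)], ["x"])

# All counters are evaluated in one pass over the data
toy.agg(null(F.col("x")).alias("nulls"),
        zero(F.col("x")).alias("zeros"),
        pos(F.col("x")).alias("positives"),
        neg(F.col("x")).alias("negatives")).show()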
Ejemplo n.º 53
0
def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    # date_now = datetime.now()
    # preday = date_now + timedelta(days=-1)
    # d1 = preday.strftime("%Y%m%d")
    # print("d1 =", d1)
    #
    # now = datetime.now()  # current date and time
    # year = now.strftime("%Y%m%d")
    # print("year:", year)

    dyf_mapping_lo_student_history = glueContext.create_dynamic_frame.from_catalog(
        database="nvn_knowledge",
        table_name="mapping_lo_student_history"
    )
    print('Count:', dyf_mapping_lo_student_history.count())
    # # Filter the previous day's records, and keep records whose scores are != 0
    # dyf_mapping_lo_student_history = Filter.apply(frame=dyf_mapping_lo_student_history, f=lambda x: x['date_id'] is not None)
    dyf_mapping_lo_student_history = Filter.apply(frame=dyf_mapping_lo_student_history,
                                                  f=lambda x: x['date_id'] is not None and
                                                              (x['knowledge'] != 0 or x['comprehension'] != 0 or x[
                                                                  'application'] != 0 or x['analysis'] != 0 or x[
                                                                   'synthesis'] != 0 or x['evaluation'] != 0))
    if dyf_mapping_lo_student_history.count() > 0:
        print('START JOB---------------')
        df_mapping_lo_student_history = dyf_mapping_lo_student_history.toDF()
        df_mapping_lo_student_history = df_mapping_lo_student_history.groupby('date_id', 'student_id',
                                                                              'learning_object_id').agg(
            f.sum("knowledge").alias("knowledge"),
            f.sum("comprehension").alias("comprehension"), f.sum("application").alias("application"),
            f.sum("analysis").alias("analysis"), f.sum("synthesis").alias("synthesis"),
            f.sum("evaluation").alias("evaluation"))
        df_mapping_lo_student_history.printSchema()
        df_mapping_lo_student_history.show()
        print('END JOB---------------')

        dyf_mapping_lo_student_used = DynamicFrame.fromDF(df_mapping_lo_student_history, glueContext,
                                                          "dyf_student_lo_init")
        # print('COUNT:', dyf_student_lo_init.count())
        # dyf_student_lo_init.printSchema()
        # dyf_student_lo_init.show()

        dyf_mapping_lo_student_used = ApplyMapping.apply(frame=dyf_mapping_lo_student_used,
                                                         mappings=[("student_id", "long", "student_id", "long"),
                                                                   ("learning_object_id", "long", "learning_object_id",
                                                                    "long"),
                                                                   ("date_id", "int", "date_id", "long"),
                                                                   ("knowledge", 'long', 'knowledge', 'long'),
                                                                   ("comprehension", 'long', 'comprehension', 'long'),
                                                                   ("application", 'long', 'application', 'long'),
                                                                   ("analysis", 'long', 'analysis', 'long'),
                                                                   ("synthesis", 'long', 'synthesis', 'long'),
                                                                   ("evaluation", 'long', 'evaluation', 'long')])
        dyf_mapping_lo_student_used = ResolveChoice.apply(frame=dyf_mapping_lo_student_used, choice="make_cols",
                                                          transformation_ctx="resolvechoice2")
        dyf_mapping_lo_student_used = DropNullFields.apply(frame=dyf_mapping_lo_student_used,
                                                           transformation_ctx="dyf_mapping_lo_student_used")
        datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dyf_mapping_lo_student_used,
                                                                   catalog_connection="glue_redshift",
                                                                   connection_options={
                                                                       "dbtable": "mapping_lo_student_used",
                                                                       "database": "dts_odin",
                                                                       "postactions": """ call proc_insert_tbhv();
                                                                       INSERT INTO mapping_lo_student_history SELECT * FROM mapping_lo_student_used;
                                                                       DROP TABLE IF EXISTS mapping_lo_student_used """
                                                                   },
                                                                   redshift_tmp_dir="s3n://dts-odin/temp1/dyf_student_lo_init",
                                                                   transformation_ctx="datasink5")
Ejemplo n.º 54
0
def AggFunctions(df, cols):
    # Rolling statistics per (latitude, longitude) location, ordered by date.
    # For every input column, add mean/max/min/cumulative-sum/stddev columns
    # over trailing windows of 7, 14, 30, 60, 90, 180 and 365 rows.
    org_window = Window.partitionBy(F.col("latitude"),
                                    F.col("longitude")).orderBy(
                                        F.col("date").asc())
    periods = [(7, "7_days"), (14, "14_days"), (30, "30_days"),
               (60, "60_days"), (90, "90_days"), (180, "6_months"),
               (365, "1_year")]
    stats = [("mean", F.avg), ("max", F.max), ("min", F.min),
             ("cumulative", F.sum), ("std", F.stddev)]

    for n_rows, label in periods:
        windowSpec = org_window.rowsBetween(-n_rows, 0)
        for col in cols:
            for stat_name, stat_fn in stats:
                df = df.withColumn("{}_{}_{}".format(col, stat_name, label),
                                   stat_fn(F.col(col)).over(windowSpec))
    return df
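A small usage sketch for AggFunctions; the latitude/longitude/date schema follows the window definition above, while the sample rows and the precipitation column are assumptions.

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.master("local[*]").getOrCreate()

toy_weather = spark.createDataFrame(
    [(59.9, 10.7, "2020-01-01", 1.5),
     (59.9, 10.7, "2020-01-02", 3.0),
     (59.9, 10.7, "2020-01-03", 0.5)],
    ["latitude", "longitude", "date", "precipitation"])

enriched = AggFunctions(toy_weather, ["precipitation"])
enriched.select("date", "precipitation",
                "precipitation_mean_7_days",
                "precipitation_std_7_days").show()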
Ejemplo n.º 55
0
        , (F.log10(t_num / words_df.df) * words_tf.tf).alias("tf_idf"))

    # 6. cache the TF-IDF data frame for further use
    tokensWithTfIdf.cache()
    return tokensWithTfIdf


def search_words(query, N, TFIDF, df):

    # 1. split the query into words
    query_lst = set(query.lower().split())
    # 2. count the number of words in the query
    q_n = len(query_lst)
    # 3. look up the query words in TFIDF and aggregate per document
    #    to summarize tf_idf and count how many query words appear in each document
    search = TFIDF.filter(TFIDF.token.isin(query_lst)).groupby(TFIDF._id)\
        .agg(F.sum("tf_idf").alias("sum_tf_idf"), F.count("tf_idf").alias('freq'))

    # 4. the score is sum_tf_idf multiplied by the frequency of query words in the document
    #    and divided by the number of words in the query;
    #    finally, order the results by score (highest first) and keep the top N
    search = search.select((search._id).alias("id"), (search.sum_tf_idf * search.freq / q_n).alias('scores'))\
        .orderBy("scores", ascending=False).limit(N)

    # 5. join the search output with the original data frame to fetch text_entry
    #    and select _id, the score rounded to 3 decimals, and text_entry as the search result
    search = search.join(df, df._id == search.id).select(search.id, F.bround(search.scores, 3), "text_entry").orderBy("scores", ascending=False)
    return search.collect()


def print_result(query, result):
Ejemplo n.º 56
0
    def test_mixed_udfs(self):
        """
        Test mixing group aggregate pandas UDF with python UDF and scalar pandas UDF.
        """
        df = self.data
        plus_one = self.python_plus_one
        plus_two = self.pandas_scalar_plus_two
        sum_udf = self.pandas_agg_sum_udf

        # Mix group aggregate pandas UDF and python UDF
        result1 = (df.groupby('id')
                   .agg(plus_one(sum_udf(df.v)))
                   .sort('id'))
        expected1 = (df.groupby('id')
                     .agg(plus_one(sum(df.v)))
                     .sort('id'))

        # Mix group aggregate pandas UDF and python UDF (order swapped)
        result2 = (df.groupby('id')
                   .agg(sum_udf(plus_one(df.v)))
                   .sort('id'))
        expected2 = (df.groupby('id')
                     .agg(sum(plus_one(df.v)))
                     .sort('id'))

        # Mix group aggregate pandas UDF and scalar pandas UDF
        result3 = (df.groupby('id')
                   .agg(sum_udf(plus_two(df.v)))
                   .sort('id'))
        expected3 = (df.groupby('id')
                     .agg(sum(plus_two(df.v)))
                     .sort('id'))

        # Mix group aggregate pandas UDF and scalar pandas UDF (order swapped)
        result4 = (df.groupby('id')
                   .agg(plus_two(sum_udf(df.v)))
                   .sort('id'))
        expected4 = (df.groupby('id')
                     .agg(plus_two(sum(df.v)))
                     .sort('id'))

        # Wrap group aggregate pandas UDF with two python UDFs and use python UDF in groupby
        result5 = (df.groupby(plus_one(df.id))
                   .agg(plus_one(sum_udf(plus_one(df.v))))
                   .sort('plus_one(id)'))
        expected5 = (df.groupby(plus_one(df.id))
                     .agg(plus_one(sum(plus_one(df.v))))
                     .sort('plus_one(id)'))

        # Wrap group aggregate pandas UDF with two scalar pandas UDFs and use scalar pandas UDF in
        # groupby
        result6 = (df.groupby(plus_two(df.id))
                   .agg(plus_two(sum_udf(plus_two(df.v))))
                   .sort('plus_two(id)'))
        expected6 = (df.groupby(plus_two(df.id))
                     .agg(plus_two(sum(plus_two(df.v))))
                     .sort('plus_two(id)'))

        assert_frame_equal(expected1.toPandas(), result1.toPandas())
        assert_frame_equal(expected2.toPandas(), result2.toPandas())
        assert_frame_equal(expected3.toPandas(), result3.toPandas())
        assert_frame_equal(expected4.toPandas(), result4.toPandas())
        assert_frame_equal(expected5.toPandas(), result5.toPandas())
        assert_frame_equal(expected6.toPandas(), result6.toPandas())
    timegroup_pvs=Vectors.sparse(maxInd,[(intervalIndDict[(weekday,hour)],pageviews)])
    timegroup_visit=Vectors.sparse(maxInd,[(intervalIndDict[(weekday,hour)],1.)])

    return Row(browser=browser,a_user_key=a_user_key,age=age,\
               day=day,hour=hour,date=date,weekday=weekday,pv=pageviews,\
               pv_nh=pageview_nothome,pv_bet=pageview_betalt,referrer=referrer,\
               device=device,gender=gender,days_since_registration=days_since_registration,\
               reg_date=reg_date,timegroup_pvs=timegroup_pvs,timegroup_visit=timegroup_visit,\
               a_virtual=a_virtual)


if __name__ == "__main__":

    print(intervalIndDict)
    conf=SparkConf().setAppName('konsumprofiler').setMaster("local[8]").set('spark.app.id','200')

    sc=SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    period=2. #2 weeks of data

    konsum=sc.textFile('/home/erlenda/data/konsum/amedia_mainsite_20151124-20151207_15145.tsv').map(parseEntry)
    konsum_reg_user=konsum.filter(lambda x:(x.a_user_key!='NAN') and (x.a_user_key!='') )
    konsum_user=sqlContext.createDataFrame(konsum_reg_user)
    pprint(konsum_user.take(5))
    tt=konsum_user.groupBy('a_user_key').agg(sqlfuncs.sum('timegroup_pvs'))

    

    pprint(tt.take(5))
Ejemplo n.º 58
0
numPartitions = 32

rdd1 = RandomRDDs.normalVectorRDD(spark, nRow, nCol, numPartitions, seed)
seed = 3
rdd2 = RandomRDDs.normalVectorRDD(spark, nRow, nCol, numPartitions, seed)
sc = spark.sparkContext

# convert each tuple in the rdd to a row
randomNumberRdd1 = rdd1.map(
    lambda x: Row(A=float(x[0]), B=float(x[1]), C=float(x[2]), D=float(x[3])))
randomNumberRdd2 = rdd2.map(
    lambda x: Row(E=float(x[0]), F=float(x[1]), G=float(x[2]), H=float(x[3])))

# create dataframe from rdd
schemaRandomNumberDF1 = spark.createDataFrame(randomNumberRdd1)
schemaRandomNumberDF2 = spark.createDataFrame(randomNumberRdd2)

# cache the dataframe
#schemaRandomNumberDF.cache()

cross_df = schemaRandomNumberDF1.crossJoin(schemaRandomNumberDF2)

# cache the dataframe
cross_df.cache()

# aggregate
results = cross_df.groupBy("A").agg(func.max("B"), func.sum("C"))
results.show(n=100)
print "----------Count in cross-join--------------- {0}".format(
    cross_df.count())
Ejemplo n.º 59
0
def count_null(col_name):
  return sum(col(col_name).isNull().cast('integer')).alias(col_name)
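A typical way to use count_null is to map it over every column of a DataFrame inside one select; the toy DataFrame below is made up, and the imports assume count_null relies on pyspark.sql.functions.sum and col.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum

spark = SparkSession.builder.master("local[*]").getOrCreate()
toy = spark.createDataFrame([(1, None), (None, "b"), (3, "c")], ["a", "b"])

# One aggregated row holding the null count of every column
toy.select([count_null(c) for c in toy.columns]).show()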
Ejemplo n.º 60
0
df.show()
"""# Aggregate each <window_minutes> window to compute:
- average number of people detected
- average group size
- average velocity
"""

# seconds = window_minutes * 60
window_str = '{} minutes'.format(window_minutes)
agg_df = (
    df.groupBy(
        window('timestamp',
               windowDuration=window_str,
               slideDuration=window_str)).agg(
                   F.sum('num_people'), F.sum('num_groups'),
                   F.sum('sum_velocities'), F.sum('num_velocities'),
                   avg('num_people'), collect_list('x_centers'),
                   collect_list('y_centers')).withColumn(
                       'avg_group_size',
                       avg_udf(struct('sum(num_people)',
                                      'sum(num_groups)'))).withColumn(
                                          'avg_velocity',
                                          avg_udf(
                                              struct('sum(sum_velocities)',
                                                     'sum(num_velocities)'))).
    withColumnRenamed(
        'avg(num_people)',
        'avg_num_people').withColumn(
            'x_centers', flattenUdf('collect_list(x_centers)')).withColumn(
                'y_centers', flattenUdf('collect_list(y_centers)')).drop(