Code Example #1
 def data(self):
     from pyspark.sql.functions import array, explode, col, lit
     return self.spark.range(10).toDF('id') \
         .withColumn("vs", array([lit(i * 1.0) + col('id') for i in range(20, 30)])) \
         .withColumn("v", explode(col('vs'))) \
         .drop('vs') \
         .withColumn('w', lit(1.0))
Code Example #2
    def test_basic(self):
        df = self.data
        weighted_mean_udf = self.pandas_agg_weighted_mean_udf

        # Groupby one column and aggregate one UDF with literal
        result1 = df.groupby('id').agg(weighted_mean_udf(df.v, lit(1.0))).sort('id')
        expected1 = df.groupby('id').agg(mean(df.v).alias('weighted_mean(v, 1.0)')).sort('id')
        self.assertPandasEqual(expected1.toPandas(), result1.toPandas())

        # Groupby one expression and aggregate one UDF with literal
        result2 = df.groupby((col('id') + 1)).agg(weighted_mean_udf(df.v, lit(1.0)))\
            .sort(df.id + 1)
        expected2 = df.groupby((col('id') + 1))\
            .agg(mean(df.v).alias('weighted_mean(v, 1.0)')).sort(df.id + 1)
        self.assertPandasEqual(expected2.toPandas(), result2.toPandas())

        # Groupby one column and aggregate one UDF without literal
        result3 = df.groupby('id').agg(weighted_mean_udf(df.v, df.w)).sort('id')
        expected3 = df.groupby('id').agg(mean(df.v).alias('weighted_mean(v, w)')).sort('id')
        self.assertPandasEqual(expected3.toPandas(), result3.toPandas())

        # Groupby one expression and aggregate one UDF without literal
        result4 = df.groupby((col('id') + 1).alias('id'))\
            .agg(weighted_mean_udf(df.v, df.w))\
            .sort('id')
        expected4 = df.groupby((col('id') + 1).alias('id'))\
            .agg(mean(df.v).alias('weighted_mean(v, w)'))\
            .sort('id')
        self.assertPandasEqual(expected4.toPandas(), result4.toPandas())
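
The weighted-mean UDF used above comes from the test fixture self.pandas_agg_weighted_mean_udf, which is defined elsewhere in the suite; a minimal, hypothetical sketch of such a grouped-aggregate pandas UDF (assuming PySpark 2.4+ with PyArrow available) could look like the following.

import numpy as np
from pyspark.sql.functions import pandas_udf, PandasUDFType

# Hypothetical stand-in for the pandas_agg_weighted_mean_udf fixture referenced above.
@pandas_udf('double', PandasUDFType.GROUPED_AGG)
def weighted_mean_udf(v, w):
    # v and w arrive as pandas Series holding one group's values and weights
    return np.average(v, weights=w)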
Code Example #3
File: local_fare.py  Project: jgran/TaxiPredict
def make_prediction(event, df):
    (event_timestamp, event_dayofweek, pickup_lat, pickup_lon,
     dropoff_lat, dropoff_lon, event_passenger_count) = event[:7]
    udf_diff_timeofday = udf(utils.diff_timeofday, IntegerType())
    udf_shortest_distance = udf(utils.shortest_distance, FloatType())
    df = df.withColumn("diff_timeofday",
                       udf_diff_timeofday(df.pickup, lit(event_timestamp))).filter("`diff_timeofday` < 30")
    df = df.withColumn("event_sum_distance",
        udf_shortest_distance(df.pick_lat, df.pick_lon, lit(pickup_lat), lit(pickup_lon))
        + udf_shortest_distance(df.drop_lat, df.drop_lon, lit(dropoff_lat), lit(dropoff_lon))
    ).filter("`event_sum_distance` < 2")
    df = df.sort('event_sum_distance')
    if df.count() < 10:
        return [0,0]
    a = pd.DataFrame(df.take(50))
    a.columns = df.columns

    speed_array = a.as_matrix(["avg_speed"])
    dist_sf_array = a.as_matrix(["dist_sf"])
    distance_array = a["trip_distance"].tolist()
    fare_array = a["total_notip"].tolist()
    time_array = a["trip_time_in_secs"].tolist()

    #set initial parameter values
    x0 = [0.5, 0.5, 3.0, 3.0]
    bnds = ((0.25, 0.75), (0.25, 0.75), (0.1,20), (0,10))
    
    #perform the fit
    res = optimize.minimize(func_to_optimize, x0, args=(distance_array, time_array, fare_array), method='TNC', bounds=bnds)
    grid_dist = utils.grid_distance(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)

    #get the predictions
    time_pred = utils.time_prediction(speed_array.mean(), grid_dist, dist_sf_array.mean())
    fare_pred = utils.fare_prediction(res.x[0], grid_dist, dist_sf_array.mean(), res.x[1], res.x[2], res.x[3])
    if res.success:
        return [fare_pred, time_pred]
    else:
        return [0,0]
Code Example #4
File: test_udf.py  Project: drewrobb/spark
    def test_udf_with_decorator(self):
        from pyspark.sql.functions import lit, udf
        from pyspark.sql.types import IntegerType, DoubleType

        @udf(IntegerType())
        def add_one(x):
            if x is not None:
                return x + 1

        @udf(returnType=DoubleType())
        def add_two(x):
            if x is not None:
                return float(x + 2)

        @udf
        def to_upper(x):
            if x is not None:
                return x.upper()

        @udf()
        def to_lower(x):
            if x is not None:
                return x.lower()

        @udf
        def substr(x, start, end):
            if x is not None:
                return x[start:end]

        @udf("long")
        def trunc(x):
            return int(x)

        @udf(returnType="double")
        def as_double(x):
            return float(x)

        df = (
            self.spark
                .createDataFrame(
                    [(1, "Foo", "foobar", 3.0)], ("one", "Foo", "foobar", "float"))
                .select(
                    add_one("one"), add_two("one"),
                    to_upper("Foo"), to_lower("Foo"),
                    substr("foobar", lit(0), lit(3)),
                    trunc("float"), as_double("one")))

        self.assertListEqual(
            [tpe for _, tpe in df.dtypes],
            ["int", "double", "string", "string", "string", "bigint", "double"]
        )

        self.assertListEqual(
            list(df.first()),
            [2, 3.0, "FOO", "foo", "foo", 3, 1.0]
        )
Code Example #5
File: groupby.py  Project: zhang01GA/koalas
    def _reduce_for_stat_function(self, sfun, only_numeric):
        groupkeys = self._groupkeys
        groupkey_cols = [s._scol.alias('__index_level_{}__'.format(i))
                         for i, s in enumerate(groupkeys)]
        sdf = self._kdf._sdf

        data_columns = []
        if len(self._agg_columns) > 0:
            stat_exprs = []
            for ks in self._agg_columns:
                spark_type = ks.spark_type
                # TODO: we should have a function that takes dataframes and converts the numeric
                # types. Converting the NaNs is used in a few places, it should be in utils.
                # Specially handle floating point types because Spark's count treats NaN as a
                # valid value, whereas pandas' count doesn't include NaN.
                if isinstance(spark_type, DoubleType) or isinstance(spark_type, FloatType):
                    stat_exprs.append(sfun(F.nanvl(ks._scol, F.lit(None))).alias(ks.name))
                    data_columns.append(ks.name)
                elif isinstance(spark_type, NumericType) or not only_numeric:
                    stat_exprs.append(sfun(ks._scol).alias(ks.name))
                    data_columns.append(ks.name)
            sdf = sdf.groupby(*groupkey_cols).agg(*stat_exprs)
        else:
            sdf = sdf.select(*groupkey_cols).distinct()
        sdf = sdf.sort(*groupkey_cols)
        metadata = Metadata(data_columns=data_columns,
                            index_map=[('__index_level_{}__'.format(i), s.name)
                                       for i, s in enumerate(groupkeys)])
        return DataFrame(sdf, metadata)
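
The comment above points out that Spark's count() treats NaN as a valid value while pandas' count() skips it; a small standalone sketch of the same nanvl trick (assuming an existing local SparkSession named spark) illustrates the difference.

import pyspark.sql.functions as F

# three rows: a normal value, a NaN and a NULL
df = spark.createDataFrame([(1.0,), (float('nan'),), (None,)], ['v'])

df.select(
    F.count('v').alias('plain_count'),                        # 2: NaN is counted, NULL is not
    F.count(F.nanvl('v', F.lit(None))).alias('pandas_like'),  # 1: NaN is first turned into NULL
).show()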
Code Example #6
def Dijkstra(graph, start, end=None):
    # dist = sqlContext.createDataFrame(sc.emptyRDD(), StructType([]))
    field = [StructField("weight", FloatType(), True)]
    schema = StructType(field)

    dist = sqlContext.createDataFrame(sc.emptyRDD(), schema)
    # prev = sqlContext.createDataFrame(sc.emptyRDD(), StructType([]))
    prev = sqlContext.createDataFrame(sc.emptyRDD(), schema)

    queue = sqlContext.createDataFrame(sc.emptyRDD(), StructType([]))
    queue = queue.withColumn("weight", lit(0))

    # DataFrame.show() only prints and returns None, so iterate over collected rows instead
    for row in queue.collect():
        v = row.weight
        dist_length = dist.filter(dist.weight == v)
        queue_length = queue.filter(queue.weight == v)
        dist_length = queue_length
        if dist_length == end:
            break

        for edge in graph.edges.filter("src = " + str(v)).collect():
            w = edge.dst
            vw_length = dist_length + graph.edges.filter(
                "src = " + str(v) + " and dst = " + str(w))
            curr_w = queue.filter("weight = " + str(w))
            if dist.filter("weight = " + str(w)).count() > 0:
                print("Dijkstra: found better path to already-final vertex")
            elif curr_w.count() == 0 or vw_length < curr_w:
                queue = queue.replace(queue.filter("weight = " + str(w)), vw_length)
                prev = prev.replace(prev.filter("weight = " + str(w)), v)
    return dist
Code Example #7
    def test_udf(self):
        from ts.flint import udf
        import pyspark.sql.functions as F
        from pyspark.sql.types import LongType

        vol = self.vol()

        @udf(LongType())
        def foo(v, w):
            return v*2

        result1 = vol.withColumn("volume", foo(vol['volume'], F.lit(42))).toPandas()
        result2 = vol.withColumn("volume", udf(lambda v, w: v*2, LongType())(vol['volume'], F.lit(42))).toPandas()

        expected_pdf1 = make_pdf([
            (1000, 7, 200,),
            (1000, 3, 400,),
            (1050, 3, 600,),
            (1050, 7, 800,),
            (1100, 3, 1000,),
            (1100, 7, 1200,),
            (1150, 3, 1400,),
            (1150, 7, 1600,),
            (1200, 3, 1800,),
            (1200, 7, 2000,),
            (1250, 3, 2200,),
            (1250, 7, 2400,)
        ], ['time', 'id', 'volume'])

        assert_same(result1, expected_pdf1)
        assert_same(result2, expected_pdf1)
Code Example #8
File: classes.py  Project: larsbkrogvig/strava-spark
    def derive_schema(self):
        '''
        Loads all data in self.path and derives the schema, saves with pickle to "schema.p"
        '''

        df = self.hiveContext.read.format('com.databricks.spark.xml') \
                    .options(rowTag='trkpt') \
                    .load(self.path+'gpx/*')

        df = df.withColumn('athlete',lit(None).cast(StringType())) \
               .withColumn('activity_type',lit(None).cast(StringType()))

        df.printSchema()
        pickle.dump(df.schema, open("schema.p", "wb"))

        pass
Code Example #9
File: kinesisStream.py  Project: ibnipun10/Projects
def processRdd(rdd):
	
	try:
		print 'processRDD'
		# convert the rdd to a dataframe
		
		printOnConsole('Started Processing the streams')

		#desiredCol = ['c-ip','cs-uri-stem','c-user-agent','customer-id','x-ec_custom-1']
		if rdd.count() > 0:
			df = pycsv.csvToDataFrame(sqlContext, rdd, columns=COLUMNS, colTypes=COLUMN_TYPES)
			#df = df.select(desiredCol)
			
			#startTime
			endTime = getCurrentTimeStamp()
			startTime = endTime - SPARK_STREAM_BATCH
			
			endTime = getDateTimeFormat(endTime)
			startTime = getDateTimeFormat(startTime)
			df = df.withColumn(COL_STARTTIME, lit(startTime))
			
			#endTime
			df = df.withColumn(COL_ENDTIME, lit(endTime))

			df.registerTempTable("tempTable")
			query = ('select' + 
					' startTime,' +  																				#startTime
					' endTime,' +  																					#endTime				
					' \'\' as ' +  COL_CUSTOMERID +  ',' +															#customerid				
					' setProjectId(`projectid`) as ' +  COL_PROJECTID + ',' +														#projectid					 	
					' \'\' as ' +  COL_FONTTYPE +  ',' + 															#FontType
					' \'\' as ' +  COL_FONTID +  ',' + 																#FontId
					' getDomainName(`referrer`) as ' +  COL_DOMAINNAME +  ',' + 											#DomainName
					' getBrowser(`useragent`) as ' + COL_USERAGENT +  ',' + 										#UserAgent
					' setIpaddress(`ip`) as ' +  COL_IPADDRESS + 																	#customer ipaddress   
					' from tempTable')

			df = sqlContext.sql(query)
			
			type =  PAGEVIEW_TYPE | PAGEVIEWGEO_TYPE
			processForTable(df, type)
		else:
			printOnConsole('Nothing to process')
	
	except Exception, ex:
		printOnConsole('There was an error...')
		print ex			
Code Example #10
File: etl.py  Project: faizana/rbc_challenge
def process_writeable_df(joined_df, date_format="yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"):
    """
    Prepares the dataframe for writing to mongo
    :param joined_df:
    :param date_format:
    :return:
    """
    df_with_parsed_dates = parse_dates(joined_df, date_format)
    df_with_id = df_with_parsed_dates.withColumn("id", f.concat(f.col('account_id'), f.lit("_"), f.col("unix_ts")))
    return df_with_id.na.drop()
Code Example #11
    def test_datasource_with_udf(self):
        # Same as SQLTests.test_datasource_with_udf, but with Pandas UDF
        # This needs to a separate test because Arrow dependency is optional
        import pandas as pd
        import numpy as np
        from pyspark.sql.functions import pandas_udf, lit, col

        path = tempfile.mkdtemp()
        shutil.rmtree(path)

        try:
            self.spark.range(1).write.mode("overwrite").format('csv').save(path)
            filesource_df = self.spark.read.option('inferSchema', True).csv(path).toDF('i')
            datasource_df = self.spark.read \
                .format("org.apache.spark.sql.sources.SimpleScanSource") \
                .option('from', 0).option('to', 1).load().toDF('i')
            datasource_v2_df = self.spark.read \
                .format("org.apache.spark.sql.sources.v2.SimpleDataSourceV2") \
                .load().toDF('i', 'j')

            c1 = pandas_udf(lambda x: x + 1, 'int')(lit(1))
            c2 = pandas_udf(lambda x: x + 1, 'int')(col('i'))

            f1 = pandas_udf(lambda x: pd.Series(np.repeat(False, len(x))), 'boolean')(lit(1))
            f2 = pandas_udf(lambda x: pd.Series(np.repeat(False, len(x))), 'boolean')(col('i'))

            for df in [filesource_df, datasource_df, datasource_v2_df]:
                result = df.withColumn('c', c1)
                expected = df.withColumn('c', lit(2))
                self.assertEquals(expected.collect(), result.collect())

            for df in [filesource_df, datasource_df, datasource_v2_df]:
                result = df.withColumn('c', c2)
                expected = df.withColumn('c', col('i') + 1)
                self.assertEquals(expected.collect(), result.collect())

            for df in [filesource_df, datasource_df, datasource_v2_df]:
                for f in [f1, f2]:
                    result = df.filter(f)
                    self.assertEquals(0, result.count())
        finally:
            shutil.rmtree(path)
Code Example #12
File: test_functions.py  Project: Brett-A/spark
 def test_string_functions(self):
     from pyspark.sql.functions import col, lit
     df = self.spark.createDataFrame([['nick']], schema=['name'])
     self.assertRaisesRegexp(
         TypeError,
         "must be the same type",
         lambda: df.select(col('name').substr(0, lit(1))))
     if sys.version_info.major == 2:
         self.assertRaises(
             TypeError,
             lambda: df.select(col('name').substr(long(0), long(1))))
Code Example #13
    def runBPwithGraphFrames(cls, g, numIter):
        """Run Belief Propagation using GraphFrame.

        This implementation of BP shows how to use GraphFrame's aggregateMessages method.
        """
        # choose colors for vertices for BP scheduling
        colorG = cls._colorGraph(g)
        numColors = colorG.vertices.select('color').distinct().count()

        # TODO: handle vertices without any edges

        # initialize vertex beliefs at 0.0
        gx = GraphFrame(colorG.vertices.withColumn('belief', sqlfunctions.lit(0.0)), colorG.edges)

        # run BP for numIter iterations
        for iter_ in range(numIter):
            # for each color, have that color receive messages from neighbors
            for color in range(numColors):
                # Send messages to vertices of the current color.
                # We may send to source or destination since edges are treated as undirected.
                msgForSrc = sqlfunctions.when(
                    AM.src['color'] == color,
                    AM.edge['b'] * AM.dst['belief'])
                msgForDst = sqlfunctions.when(
                    AM.dst['color'] == color,
                    AM.edge['b'] * AM.src['belief'])
                # numerically stable sigmoid
                logistic = sqlfunctions.udf(cls._sigmoid, returnType=types.DoubleType())
                aggregates = gx.aggregateMessages(
                    sqlfunctions.sum(AM.msg).alias("aggMess"),
                    sendToSrc=msgForSrc,
                    sendToDst=msgForDst)
                v = gx.vertices
                # receive messages and update beliefs for vertices of the current color
                newBeliefCol = sqlfunctions.when(
                    (v['color'] == color) & (aggregates['aggMess'].isNotNull()),
                    logistic(aggregates['aggMess'] + v['a'])
                ).otherwise(v['belief'])  # keep old beliefs for other colors
                newVertices = (v
                    .join(aggregates, on=(v['id'] == aggregates['id']), how='left_outer')
                    .drop(aggregates['id'])  # drop duplicate ID column (from outer join)
                    .withColumn('newBelief', newBeliefCol)  # compute new beliefs
                    .drop('aggMess')  # drop messages
                    .drop('belief')  # drop old beliefs
                    .withColumnRenamed('newBelief', 'belief')
                )
                # cache new vertices using workaround for SPARK-1334
                cachedNewVertices = AM.getCachedDataFrame(newVertices)
                gx = GraphFrame(cachedNewVertices, gx.edges)

        # Drop the "color" column from vertices
        return GraphFrame(gx.vertices.drop('color'), gx.edges)
Code Example #14
    def test_vectorized_udf_struct_with_empty_partition(self):
        df = self.spark.createDataFrame(self.sc.parallelize([Row(id=1)], 2))\
            .withColumn('name', lit('John Doe'))

        @pandas_udf("first string, last string")
        def split_expand(n):
            return n.str.split(expand=True)

        result = df.select(split_expand('name')).collect()
        self.assertEqual(1, len(result))
        row = result[0]
        self.assertEqual('John', row[0]['first'])
        self.assertEqual('Doe', row[0]['last'])
Code Example #15
File: test_udf.py  Project: drewrobb/spark
    def test_datasource_with_udf(self):
        from pyspark.sql.functions import udf, lit, col

        path = tempfile.mkdtemp()
        shutil.rmtree(path)

        try:
            self.spark.range(1).write.mode("overwrite").format('csv').save(path)
            filesource_df = self.spark.read.option('inferSchema', True).csv(path).toDF('i')
            datasource_df = self.spark.read \
                .format("org.apache.spark.sql.sources.SimpleScanSource") \
                .option('from', 0).option('to', 1).load().toDF('i')
            datasource_v2_df = self.spark.read \
                .format("org.apache.spark.sql.sources.v2.SimpleDataSourceV2") \
                .load().toDF('i', 'j')

            c1 = udf(lambda x: x + 1, 'int')(lit(1))
            c2 = udf(lambda x: x + 1, 'int')(col('i'))

            f1 = udf(lambda x: False, 'boolean')(lit(1))
            f2 = udf(lambda x: False, 'boolean')(col('i'))

            for df in [filesource_df, datasource_df, datasource_v2_df]:
                result = df.withColumn('c', c1)
                expected = df.withColumn('c', lit(2))
                self.assertEquals(expected.collect(), result.collect())

            for df in [filesource_df, datasource_df, datasource_v2_df]:
                result = df.withColumn('c', c2)
                expected = df.withColumn('c', col('i') + 1)
                self.assertEquals(expected.collect(), result.collect())

            for df in [filesource_df, datasource_df, datasource_v2_df]:
                for f in [f1, f2]:
                    result = df.filter(f)
                    self.assertEquals(0, result.count())
        finally:
            shutil.rmtree(path)
Code Example #16
File: test_summarizer.py  Project: mattomatic/flint
    def test_summary_weighted_covariance(self):
        import pyspark.sql.functions as F
        from ts.flint import summarizers

        price = self.price()
        forecast = self.forecast()

        expected_pdf = make_pdf([
            (0, -1.96590909091,)
        ], ["time", "price_forecast_weight_weightedCovariance"])

        joined = price.leftJoin(forecast, key="id").withColumn('weight', F.lit(2.0))
        result = joined.summarize(summarizers.weighted_covariance("price", "forecast", "weight")).toPandas()
        pdt.assert_frame_equal(result, expected_pdf)
Code Example #17
File: sampling_improved.py  Project: USF-ML2/SKYNET-
def labelRDDs(driver, path, sc):
    sqlContext = SQLContext(sc)

    target = str(driver) + '.json'
    driver_pool = list(all_drivers)
    driver_pool.remove(target)

    sample_drivers = np.random.choice(driver_pool, K, replace=False)
    sample_drivers_paths = [path + i for i in sample_drivers]
    sampled = sqlContext.read.json(sample_drivers_paths)
    orig = sqlContext.read.json(path + target)
    samples = sampled.sample(False, .0055)
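    # label trips sampled from other drivers as 0 and the target driver's trips as 1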
    samples = samples.withColumn('label', lit(0))
    orig = orig.withColumn('label', lit(1))
    rawdata = samples.unionAll(orig)
    rawdata = rawdata.select(rawdata['driver'],
                             rawdata['trip'],
                             rawdata['x'],
                             rawdata['y'],
                             rawdata['step'],
                             rawdata['label'])
    rawRDD = rawdata.rdd
    return rawRDD.map(maketups)
Code Example #18
File: classes.py  Project: larsbkrogvig/strava-spark
    def _load_dataset(self):
        '''
        Loads strava activities from source to DataFrame self.df
        '''

        # Get athlete list if not already set
        if not self.athletes:
            self._get_athlete_directories()

        # Initialize empty dataset
        self.df = self.hiveContext.createDataFrame(
            self.sc.emptyRDD(),
            self.schema
        )

        for athlete in self.athletes:
            for activity_type in self.activity_types:
        
                # Check that there are files of that type (or else .load fails)
                if self._activities_exist(athlete, activity_type):

                    # Read data
                    dfadd = self.hiveContext.read.format('com.databricks.spark.xml') \
                                    .options(rowTag='trkpt', treatEmptyValuesAsNulls=False) \
                                    .schema(self.schema) \
                                    .load(self.path+'gpx/%s/*%s.gpx' % (athlete, activity_type))
                
                    dfadd = dfadd.withColumn('athlete', lit(athlete)) \
                                 .withColumn('activity_type', lit(activity_type))
                
                    self.df = self.df.unionAll(dfadd)

        if self.filter_bug_inducing_rows:
            self.df = self.df.filter(self.df['extensions.gpxtpx:TrackPointExtension.#VALUE'].isNull())

        pass
Code Example #19
File: ch_spark.py  Project: akamlani/cooperhewitt
 def _minimize_query(self):
     # From the temporal table, we need to reduce the multiple candidate locations to the appropriate sample timestamp
     tb_samples = self.hive_cxt.sql("""
         SELECT *
         FROM (
             SELECT *,
             MIN(delta)   OVER ( PARTITION BY refers_to_object_id, created) AS min_delta,
             row_number() OVER ( PARTITION BY refers_to_object_id, created) AS ranks
             FROM samplestemporal st
             ORDER BY refers_to_object_id
         ) query
         where query.ranks = 1
     """)
     tb_samples = tb_samples.withColumn("meta_store", lit(1))
     tb_samples.registerTempTable('minimizedsamples')
     self.hive_cxt.cacheTable('minimizedsamples')
     return tb_samples
Code Example #20
File: test_functions.py  Project: apache/spark
    def test_string_functions(self):
        from pyspark.sql import functions
        from pyspark.sql.functions import col, lit, _string_functions
        df = self.spark.createDataFrame([['nick']], schema=['name'])
        self.assertRaisesRegexp(
            TypeError,
            "must be the same type",
            lambda: df.select(col('name').substr(0, lit(1))))
        if sys.version_info.major == 2:
            self.assertRaises(
                TypeError,
                lambda: df.select(col('name').substr(long(0), long(1))))

        for name in _string_functions.keys():
            self.assertEqual(
                df.select(getattr(functions, name)("name")).first()[0],
                df.select(getattr(functions, name)(col("name"))).first()[0])
Code Example #21
File: tests.py  Project: mengxr/graphframes
 def test_aggregate_messages(self):
     g = self._graph("friends")
     # For each user, sum the ages of the adjacent users,
     # plus 1 for the src's sum if the edge is "friend".
     sendToSrc = (
         AM.dst['age'] +
         sqlfunctions.when(
             AM.edge['relationship'] == 'friend',
             sqlfunctions.lit(1)
         ).otherwise(0))
     sendToDst = AM.src['age']
     agg = g.aggregateMessages(
         sqlfunctions.sum(AM.msg).alias('summedAges'),
         sendToSrc=sendToSrc,
         sendToDst=sendToDst)
     # Run the aggregation again providing SQL expressions as String instead.
     agg2 = g.aggregateMessages(
         "sum(MSG) AS `summedAges`",
         sendToSrc="(dst['age'] + CASE WHEN (edge['relationship'] = 'friend') THEN 1 ELSE 0 END)",
         sendToDst="src['age']")
     # Convert agg and agg2 to a mapping from id to the aggregated message.
     aggMap = {id_: s for id_, s in agg.select('id', 'summedAges').collect()}
     agg2Map = {id_: s for id_, s in agg2.select('id', 'summedAges').collect()}
     # Compute the truth via brute force.
     user2age = {id_: age for id_, age in g.vertices.select('id', 'age').collect()}
     trueAgg = {}
     for src, dst, rel in g.edges.select("src", "dst", "relationship").collect():
         trueAgg[src] = trueAgg.get(src, 0) + user2age[dst] + (1 if rel == 'friend' else 0)
         trueAgg[dst] = trueAgg.get(dst, 0) + user2age[src]
     # Compare if the agg mappings match the brute force mapping
     self.assertEqual(aggMap, trueAgg)
     self.assertEqual(agg2Map, trueAgg)
     # Check that TypeError is raised for messages of the wrong type
     with self.assertRaises(TypeError):
         g.aggregateMessages(
             "sum(MSG) AS `summedAges`",
             sendToSrc=object(),
             sendToDst="src['age']")
     with self.assertRaises(TypeError):
         g.aggregateMessages(
             "sum(MSG) AS `summedAges`",
             sendToSrc=dst['age'],
             sendToDst=object())
Code Example #22
File: test_summarizer.py  Project: mattomatic/flint
    def test_summary_weighted_correlation(self):
        import pyspark.sql.functions as F
        from ts.flint import summarizers

        price = self.price()
        forecast = self.forecast()

        joined = price.leftJoin(forecast, key="id").withColumn('weight', F.lit(1.0)).withColumn('weight2', F.lit(42.0))
        result = joined.summarize(summarizers.weighted_correlation("price", "forecast", "weight")).toPandas()
        result2 = joined.summarize(summarizers.weighted_correlation("price", "forecast", "weight2")).toPandas()
        expected = joined.summarize(summarizers.correlation("price", "forecast")).toPandas()

        assert(np.isclose(
            result['price_forecast_weight_weightedCorrelation'][0],
            expected['price_forecast_correlation'][0]))

        assert(np.isclose(
            result2['price_forecast_weight2_weightedCorrelation'][0],
            expected['price_forecast_correlation'][0]))
Code Example #23
def featurizeData(raw, gap, vocabFile, featFile):
    feats = raw.dropDuplicates(['cluster', 'series', 'date'])\
            .withColumn('day', datediff(col('date'), lit('1970-01-01')))\
            .na.drop(subset=['day'])\
            .rdd.groupBy(lambda r: r.cluster)\
            .flatMap(lambda c: clusterFeatures(c, gap))\
            .toDF()

    feats.cache()
    cv = CountVectorizer(inputCol='raw', outputCol='features', minDF=4.0)
    interner = cv.fit(feats)      # alternate possibility: grab features only from label==1 edges
    full = interner.transform(feats)
    # combiner = VectorAssembler(inputCols=realCols + ['categorial'], outputCol='features')
    # # I don't think a Pipeline will work here since we need to get the interner.vocabulary
    # full = combiner.transform(interner.transform(feats)).drop('categorial')

    full.write.parquet(featFile)
    np.savetxt(vocabFile, np.array(interner.vocabulary), fmt='%s')
    feats.unpersist()
Code Example #24
File: testColumnHelper.py  Project: TresAmigosSD/SMV
    def test_smvArrayFlatten(self):
        df = self.createDF('a:String;b:String;c:String', ',,;1,2,;2,3,4')
        df1 = df.select(F.array(
            F.array(F.lit(None), F.col('a')),
            F.array(F.col('a'), F.col('b'), F.col('c'))
        ).alias('aa'))

        res1 = df1.select(F.col('aa').smvArrayFlatten(StringType()).alias('a'))\
            .select(SF.smvArrayCat('|', F.col('a')).alias('k'))

        exp = self.createDF("k: String",
        """||||;
            |1|1|2|;
            |2|2|3|4""")

        res2 = df1.select(F.col('aa').smvArrayFlatten(df1).alias('a'))\
            .select(SF.smvArrayCat('|', F.col('a')).alias('k'))

        self.should_be_same(res1, exp)
        self.should_be_same(res2, exp)
Code Example #25
File: test_streaming.py  Project: CodingCat/spark
 def test_stream_save_options(self):
     df = self.spark.readStream.format('text').load('python/test_support/sql/streaming') \
         .withColumn('id', lit(1))
     for q in self.spark._wrapped.streams.active:
         q.stop()
     tmpPath = tempfile.mkdtemp()
     shutil.rmtree(tmpPath)
     self.assertTrue(df.isStreaming)
     out = os.path.join(tmpPath, 'out')
     chk = os.path.join(tmpPath, 'chk')
     q = df.writeStream.option('checkpointLocation', chk).queryName('this_query') \
         .format('parquet').partitionBy('id').outputMode('append').option('path', out).start()
     try:
         self.assertEqual(q.name, 'this_query')
         self.assertTrue(q.isActive)
         q.processAllAvailable()
         output_files = []
         for _, _, files in os.walk(out):
             output_files.extend([f for f in files if not f.startswith('.')])
         self.assertTrue(len(output_files) > 0)
         self.assertTrue(len(os.listdir(chk)) > 0)
     finally:
         q.stop()
         shutil.rmtree(tmpPath)
Code Example #26
def process_immigration_data(spark, input_data, output_data, cit_res,
                             port_state_code, port_city, mode, addr, visa):
    """
    ETL process for i94_{mon}16_sub.sas7bdat datasets

    Parameters:
    spark (SparkSession) : Spark Session
    input_data (str) : path of input data
    output_data (str) : path of directory where output data will be stored
    cit_res (dict) : Mapping for i94cit and i94res columns' values
    port_state_code (dict) : Mapping for i94port column's values -state part-
    port_city (dict) : Mapping for i94port column's values -city part-
    mode (dict) : Mapping for i94mode column's values
    addr (dict) : Mapping for i94addr column's values
    visa (dict) : Mapping for i94visa column's values

    """

    # read data from each dataset and combine them into a single DataFrame
    df = spark.read.format('com.github.saurfang.sas.spark').load(input_data[0])
    for path in input_data[1:]:
        df = df.union(
            spark.read.format('com.github.saurfang.sas.spark').load(path))

    # drop columns that won't be used
    cols_to_drop = [
        '_c0', 'cicid', 'count', 'visapost', 'occup', 'entdepa', 'entdepd',
        'entdepu', 'matflag', 'insnum'
    ]
    df = df.drop(*cols_to_drop)

    # replace invalid state codes with '99'
    df = df.withColumn(
        'i94addr',
        when(~df['i94addr'].isin(*(addr.keys())),
             '99').otherwise(df['i94addr']))

    # (mapping dictionary, column where mapping is applied, new column name)
    maps = [(cit_res, 'i94cit', 'i94cit'), (cit_res, 'i94res', 'i94res'),
            (port_state_code, 'i94port', 'i94port_state'),
            (port_city, 'i94port', 'i94port_city'),
            (mode, 'i94mode', 'i94mode'), (addr, 'i94addr', 'state'),
            (visa, 'i94visa', 'i94visa')]

    # use mappings to replace codes in 'i94cit', 'i94res', 'i94port', 'i94mode',
    # 'i94addr' and 'i94visa' columns with their values
    for map_dic, from_col, col_name in maps:
        mapping_expr = create_map([lit(x) for x in chain(*map_dic.items())])
        df = df.withColumn(col_name, mapping_expr.getItem(col(from_col)))

    df.createOrReplaceTempView('immigration')

    # transform gender column values: 'M' is replaced with 'Male' and 'F' with 'Female'
    # if the transportation mode is 'Land', 'Sea', 'Not reported' or NULL and
    # flight number or airline is not NULL: transportation mode is changed to 'Air'
    df = spark.sql("""
        SELECT CAST(i94yr AS INT) AS arrival_year,
               CAST(i94mon AS INT) AS arrival_month,
               DATE_ADD('1960-01-01', arrdate) AS arrival_date,
               DATE_ADD('1960-01-01', depdate) AS departure_date,

               i94port_city AS port_city,
               i94port_state AS port_state_code,

               i94cit AS origin_country,
               i94res AS residence_country,
               CAST(biryear AS INT) AS birth_year,
               CAST(i94bir AS INT) AS age,
               CASE
                    WHEN gender = 'M' THEN 'Male'
                    WHEN gender = 'F' THEN 'Female'
               END AS gender,

               CAST(admnum AS INT) AS admission_num,
               TO_DATE(dtadfile, 'yyyyMMdd') AS admission_date,
               TO_DATE(dtaddto, 'MMddyyyy') AS admitted_until,
               i94visa visa_category,
               visatype AS visa_type,

               state,
               i94addr AS state_code,

               CASE
                    WHEN (i94mode = 'Land' AND ((fltno IS NOT NULL) OR (airline IS NOT NULL))) THEN 'Air'
                    WHEN (i94mode = 'Sea' AND ((fltno IS NOT NULL) OR (airline IS NOT NULL))) THEN 'Air'
                    WHEN (i94mode = 'Not reported' AND ((fltno IS NOT NULL) OR (airline IS NOT NULL))) THEN 'Air'
                    WHEN (i94mode IS NULL AND ((fltno IS NOT NULL) OR (airline IS NOT NULL))) THEN 'Air'
                    ELSE i94mode
               END AS transportation_mode,
               airline,
               fltno AS flight_num
          FROM immigration
    """)

    df = df.distinct()

    # data_quality_check(df)

    # save DataFrame as .parquet in output_data/us_immigration directory
    print('Saving us_immigration table to {}'.format(output_data))
    df.write.parquet(os.path.join(output_data, 'us_immigration'),
                     'overwrite',
                     partitionBy=['arrival_month', 'port_state_code'])
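
The mapping loop above builds a literal map column from a plain Python dict; an isolated sketch of that create_map/getItem pattern (assuming an existing SparkSession named spark and hypothetical i94mode codes) is shown below.

from itertools import chain
from pyspark.sql.functions import create_map, lit, col

mode = {'1': 'Air', '2': 'Sea', '9': 'Not reported'}  # hypothetical code-to-value mapping
df = spark.createDataFrame([('1',), ('2',), ('9',)], ['i94mode'])

# chain(*mode.items()) flattens the dict into key1, value1, key2, value2, ...
mapping_expr = create_map([lit(x) for x in chain(*mode.items())])
df.withColumn('i94mode', mapping_expr.getItem(col('i94mode'))).show()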
Code Example #27
storesMaxDate = RFM0.groupby(['Store']).agg(sf.max('Date'))
display(storesMaxDate)

# COMMAND ----------

storesMaxDate.printSchema()

# COMMAND ----------

# MAGIC %md
# MAGIC Obtaining the difference between the global maximum date and the maximum date of each store:

# COMMAND ----------

r = (RFM0.join(storesMaxDate, on='Store').withColumn(
    'Recency', datediff(to_date(lit(maxDate[0])), col('max(Date)'))))

display(r)
#recency.count()

# COMMAND ----------

# MAGIC %md
# MAGIC ### Frequency
# MAGIC
# MAGIC The frequency of purchases that occurred in the given period is obtained. In this case, it is observed how all the stores had sales every day.

# COMMAND ----------

f = (RFM0.groupby(['Store']).count().withColumn('Frequency', col('count')))
Code Example #28
notebook = os.path.basename(getNotebookPath())

input_data_path = "/dbfs/mnt/" + environment + "/automl_rev_region_forecast/inputs/"
output_data_path = "/dbfs/mnt/" + environment + "/automl_rev_region_forecast/outputs/"
blobstore_datadir = "revregionforecast_data/"

bi_config_parameter_filepath = "/mnt/{}/automl_rev_region_forecast/config/{}".format(
    environment_name, filename)

try:
    # read JSON file
    df_bi_config_parameters = (spark.read.format("json").option(
        "multiline", "true").load(bi_config_parameter_filepath))
    df_bi_config_parameters = df_bi_config_parameters.filter(
        df_bi_config_parameters.SystemName == lit(system_name))
    display(df_bi_config_parameters)

    # adding audit fields
    # df_bi_config_parameters = df_bi_config_parameters.withColumn("IBICreatedBy",lit(ibi_created_by))
    # df_bi_config_parameters = df_bi_config_parameters.withColumn("IBIUpdatedBy",lit(ibi_updated_by))
    # df_bi_config_parameters = df_bi_config_parameters.withColumn("IBICreatedDate",lit(ibi_created_date).cast(TimestampType()))
    # df_bi_config_parameters = df_bi_config_parameters.withColumn("IBIUpdatedDate",lit(ibi_updated_date).cast(TimestampType()))
    # df_bi_config_parameters = df_bi_config_parameters.withColumn('ID', row_number().over(Window.orderBy('EnvironmentName','SystemName')))

    # initializing config parameter values

    # subscription_id = "db61fd47-db56-45e3-844f-1b1f5c47990a" #you should be owner or contributor
    if environment in {"prod"}:
        subscription_id = dbutils.secrets.get(scope="kv-bi-prod-01-secrets",
                                              key="subscription-id")
Code Example #29
def process_log_data(spark, input_data, output_data, song_df):
    """
    Description: This function processes log-data files from the given input path,
    transforms the data from JSON files into users, time and songplays Spark tables,
    and writes these tables to the given output path as parquet tables.

    Arguments:
        spark: SparkSession object. 
        input_data: Path to the input JSON files. 
        output_data: Path to the output directory that stores output parquet tables.
        song_df: Song data dataframe.

    Returns:
        None. 
    """
    # get filepath to log data file
    log_data = input_data + 'log-data/2018/11'

    # define schema for log data file
    log_schema = t.StructType([
        t.StructField("artist", t.StringType(), True),
        t.StructField("auth", t.StringType(), True),
        t.StructField("firstName", t.StringType(), True),
        t.StructField("gender", t.StringType(), True),
        t.StructField("itemInSession", t.IntegerType(), True),
        t.StructField("lastName", t.StringType(), True),
        t.StructField("length", t.DecimalType(12, 7), True),
        t.StructField("level", t.StringType(), True),
        t.StructField("location", t.StringType(), True),
        t.StructField("method", t.StringType(), True),
        t.StructField("page", t.StringType(), True),
        t.StructField("registration", t.DecimalType(16, 2), True),
        t.StructField("sessionId", t.IntegerType(), True),
        t.StructField("song", t.StringType(), True),
        t.StructField("status", t.IntegerType(), True),
        t.StructField("ts", t.LongType(), True),
        t.StructField("userAgent", t.StringType(), True),
        t.StructField("userId", t.StringType(), True)
    ])

    # read log data file using schema
    df = spark \
        .read \
        .format("json") \
        .schema(log_schema) \
        .load(log_data)

    # filter by actions for song plays
    df = df \
        .filter('page = "NextSong"')

    # group by userId for unique users
    users_list = df \
        .groupBy('userId') \
        .agg(f.max('ts').alias('ts'))

    # extract columns to create users table
    users_table = df \
        .join(users_list, ['userId', 'ts'], 'inner') \
        .select([df.userId.cast(t.IntegerType()).alias('user_id'), col('firstName').alias('first_name'), col('lastName').alias('last_name'), 'gender', 'level']) \
        .dropDuplicates()

    # write users table to parquet files
    users_output = output_data + 'users'

    users_table \
        .write \
        .option("path", users_output) \
        .saveAsTable('users', format='parquet')

    # create timestamp column from original timestamp column
    df = df \
        .withColumn('timestamp', f.from_utc_timestamp((df.ts/1000.0).cast('timestamp'), 'UTC'))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda ts: datetime.fromtimestamp(ts / 1000.0),
                       t.TimestampType())
    df = df.withColumn('datetime', get_datetime('ts'))

    # extract columns to create time table
    time_table = df \
        .select([col('datetime').alias('start_time'), dayofmonth(col('datetime')).alias('day'), weekofyear(col('datetime')).alias('week'), month(col('datetime')).alias('month'), year(col('datetime')).alias('year'), dayofweek(col('datetime')).alias('weekday')]) \
        .dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_output = output_data + 'time'

    time_table \
        .write \
        .partitionBy('year', 'month') \
        .option("path", time_output) \
        .saveAsTable('time', format='parquet')

    # join and extract columns from song and log datasets to create songplays table
    cond = [
        df.artist == song_df.artist_name, df.song == song_df.title,
        df.length == song_df.duration
    ]
    songplays_df = df.join(song_df, cond, 'left')

    songplays_df = songplays_df \
        .select(df.datetime.alias('start_time'), df.userId.alias('user_id'), df.level.alias('level'), song_df.song_id.alias('song_id'), song_df.artist_id.alias('artist_id'), df.sessionId.alias('session_id'), df.location.alias('location'), df.userAgent.alias('user_agent'), year(df.datetime).alias('year'), month(df.datetime).alias('month'))
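    # Ordering the window by a constant literal gives an arbitrary but valid global order,
    # which is enough for row_number() to generate a surrogate songplay_id.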
    w = Window().orderBy(f.lit('A'))
    songplays_table = songplays_df.withColumn('songplay_id',
                                              f.row_number().over(w))

    # write songplays table to parquet files partitioned by year and month
    songplays_output = output_data + 'songplays'

    songplays_table \
        .select(['songplay_id', 'start_time', 'user_id', 'level', 'song_id', 'artist_id', 'session_id', 'location', 'user_agent', 'year', 'month'])\
        .write \
        .partitionBy('year', 'month') \
        .option("path", songplays_output) \
        .saveAsTable('songplays', format='parquet')
Code Example #30
 def test_different_schemas(self):
     right = self.data2.withColumn('v3', lit('a'))
     self._test_merge(self.data1, right,
                      'id long, k int, v int, v2 int, v3 string')
Code Example #31
File: test_sampling.py  Project: kdhingra307/ncm
def test_sampling_general_approach(spark_context, hive_context):
    """Generate a dataframe and see if sampling it has same general shape"""

    # Create wikis with different looking long tail distributions
    wikis = [
        ("foowiki", 1500, -1),
        ("barwiki", 700, -1),
        # This has a very flat long tail, with most data points being the same
        ("bazwiki", 700, -2),
    ]
    # use all combinations of lowercase letters as our set of test queries. This is 26^2,
    # or just shy of 700 queries.
    queries = [
        "%s%s" % pair for pair in itertools.product(string.ascii_lowercase,
                                                    string.ascii_lowercase)
    ]
    rows = []
    for (wiki, a, k) in wikis:
        # create sessions for each query with a long tail distribution
        for (x, q) in enumerate(queries):
            # approximate a long tail distribution using ax^k + b
            # x + 1 needed because enumerate starts at 0. b is set to 10 to test the
            # min sessions per query limit
            num_sessions = max(1, min(100, int(a * math.pow(x + 1, k)) + 10))
            for j in range(0, num_sessions):
                session_id = "%s_%s_%s" % (wiki, q, str(j))
                rows.append((wiki, q, x, session_id, list(range(3))))

    df = (spark_context.parallelize(rows).toDF(
        ['wikiid', 'query', 'norm_query_id', 'session_id', 'hit_page_ids']))

    samples_per_wiki = 1000
    # Using a constant seed ensures deterministic testing. Because this code
    # actually relies on the law of large numbers, and we do not have large
    # numbers here, many seeds probably fail.
    hit_page_id_counts, df_sampled = mjolnir.sampling.sample(
        df, samples_per_wiki=samples_per_wiki, seed=12345)
    sampled = (df_sampled.select(
        'wikiid', 'query',
        F.explode('hit_page_ids').alias(
            'hit_page_id')).drop_duplicates().groupBy('wikiid').agg(
                F.count(F.lit(1)).alias('num_samples')).collect())

    total_samples_desired = len(wikis) * samples_per_wiki
    total_samples = sum([r.num_samples for r in sampled])
    assert abs(total_samples -
               total_samples_desired) / float(total_samples_desired) < 0.05
    # Test each wiki also meets the constraint
    for (wiki, _, _) in wikis:
        # ratio of rows
        sampled_num_rows = sum(
            [r.num_samples for r in sampled if r.wikiid == wiki])
        # TODO: Why 0.10? It works with our seed...
        assert abs(sampled_num_rows -
                   samples_per_wiki) / float(samples_per_wiki) <= 0.10

    # assert correlation between sessions per query
    orig_grouped = (df.groupBy('wikiid', 'norm_query_id').agg(
        F.countDistinct('session_id').alias('num_sessions')).collect())
    sampled_grouped = (df_sampled.groupBy('wikiid', 'norm_query_id').agg(
        F.countDistinct('session_id').alias('num_sessions')).collect())

    for (wiki, _, _) in wikis:
        orig = sorted(
            [r.num_sessions for r in orig_grouped if r.wikiid == wiki])
        sampled = sorted(
            [r.num_sessions for r in sampled_grouped if r.wikiid == wiki])
        # interpolate sampled into the same length as orig
        sampled_interp = np.interp(range(len(orig)),
                                   np.linspace(1, len(orig), len(sampled)),
                                   sampled)
        # corrcoef allows comparing N data sets, returning a covariance matrix.
        # take 0,1 to get corr(orig, sampled_interp)
        corr = np.corrcoef(orig, sampled_interp)[0, 1]
        # Is .8 reasonable? Sometimes this fails when using something stricter
        # like .95
        assert corr > .8, wiki
Code Example #32
color_new_2019  = dfcolor_2019.exceptAll(dfcolor_2018).withColumnRenamed('advertiser','new advertiser')
starplus_lost_2019 = dfstarplus_2018.exceptAll(dfstarplus_2019).withColumnRenamed('advertiser','lost advertiser')
starplus_new_2019  = dfstarplus_2019.exceptAll(dfstarplus_2018).withColumnRenamed('advertiser','new advertiser')
starplus_lost_2019.show()
starplus_new_2019.show()
color_lost_2019.show()
color_new_2019.show()

# COMMAND ----------

from pyspark.sql.types import StructField, StructType, StringType
from pyspark.sql.dataframe import DataFrame
LostFoundSchema = StructType([StructField('channel',StringType(),True),
                              StructField('advertiser_lost',StringType(),True),
                              StructField('advertiser_found',StringType(),True)])

# COMMAND ----------

from pyspark.sql.functions import lit
df1 = starplus_lost_2019.join(starplus_new_2019,'channel','leftouter')
df2 = color_lost_2019.join(color_new_2019,'channel','inner')
dfLostFound = df1.union(df2).withColumn('year',lit(2019))

# COMMAND ----------

dfLostFound.select('year','channel','lost advertiser','new advertiser').show()

# COMMAND ----------


Code Example #33
def main():
    ho_chi_minh_timezone = pytz.timezone('Asia/Ho_Chi_Minh')
    today = datetime.now(ho_chi_minh_timezone)
    today_second = long(today.strftime("%s"))
    print('today_id: ', today_second)

    start_date_id = 20200101
    end_date_id = 20200305
    print('start_date_id: ', start_date_id)
    print('end_date_id: ', end_date_id)
    #
    start_year_month_id, end_year_month_id = get_year_month_id_from_date(start_date_id, end_date_id)
    start_year_week_id, end_year_week_id = get_year_week_id_from_date(start_date_id, end_date_id)
    #
    print('start_year_month_id: ', start_year_month_id)
    print('end_year_month_id: ', end_year_month_id)

    print('start_year_week_id: ', start_year_week_id)
    print('end_year_week_id: ', end_year_week_id)

    print('start_year_week_id: ', start_year_week_id)
    print('end_year_week_id: ', end_year_week_id)

    # ------------------------------------------------------------------------------------------------------------------#
    df_student_package_status_by_date = get_student_package_adivsor_level(start_date_id, end_date_id)
    df_student_package_status_by_date.cache()

    df_student_learning_and_duration_by_date = get_total_student_lerning_and_duration_by_date(glueContext,
                                                                              start_year_month_id,
                                                                              end_year_month_id)

    df_student_package_status_by_date_learning = df_student_package_status_by_date\
        .join(df_student_learning_and_duration_by_date,
              on=['contact_id', 'date_id'],
              how='left')

    df_student_package_status_by_date_learning = df_student_package_status_by_date_learning.na.fill({
        'total_learning_ls_sc_lt_le2': 0L,
        'total_learning_ls_sc_lt_le2_success': 0L,

        'total_learning_ls_sc_lt': 0L,
        'total_learning_ls_sc_lt_success': 0L,

        'total_learning_ls_success': 0L,
        'total_learning_sc_success': 0L,
        'total_learning_lt_success': 0L,

        'total_duration_ls_sc_lt': 0L,

        'total_learning_le2': 0L,
        'total_learning_le2_success': 0L,

        'total_learning_voxy_success': 0L,
        'total_learning_native_talk_success': 0L,
        'total_learning_home_work_success': 0L,
        'total_learning_ncsbasic_success': 0L,

        'total_duration_le2': 0L,
        'total_duration_voxy': 0L,
        'total_duration_native_talk': 0L,
        'total_duration_home_work': 0L,
        'total_duration_ncsbasic': 0L
    })

    df_student_package_status_by_date_learning.cache()

    print('df_student_package_status_by_date_learning')
    df_student_package_status_by_date_learning.printSchema()
    df_student_package_status_by_date_learning.show(3)

    if is_dev:
        dyf_student_package_status_by_date_learning = DynamicFrame \
            .fromDF(df_student_package_status_by_date_learning, glueContext, 'dyf_student_package_status_by_date_learning')
        atasink4 = glueContext.write_dynamic_frame \
            .from_jdbc_conf(frame=dyf_student_package_status_by_date_learning,
                            catalog_connection="glue_redshift",
                            connection_options={
                                "dbtable": "dev.df_student_package_status_by_date_learning",
                                "database": "student_native_report"
                            },
                            redshift_tmp_dir="s3://dts-odin/temp/nvn/knowledge/student/df_student_package_status_by_date_learning",
                            transformation_ctx="datasink4")


    #-------------- save to bc200_fact

    df_student_package_status_by_date_learning = df_student_package_status_by_date_learning \
        .select('date_id', 'package_id', 'student_level_id', 'contact_id', 'advisor_id',
            'is_activated',

            f.when(df_student_package_status_by_date_learning['total_learning_ls_sc_lt_le2'] > 0L, 1L)
            .otherwise(0L).alias('is_ls_sc_lt_le2'),
            f.when(df_student_package_status_by_date_learning['total_learning_ls_sc_lt_le2_success'] > 0L, 1L)
            .otherwise(0L).alias('is_ls_sc_lt_le2_success'),

            f.when(df_student_package_status_by_date_learning['total_learning_ls_sc_lt'] > 0L, 1L)
            .otherwise(0L).alias('is_ls_sc_lt'),
            f.when(df_student_package_status_by_date_learning['total_learning_ls_sc_lt_success'] > 0L, 1L)
            .otherwise(0L).alias('is_ls_sc_lt_success'),
            f.when(df_student_package_status_by_date_learning['total_learning_ls_success'] > 0L, 1L)
            .otherwise(0L).alias('is_ls_success'),
            f.when(df_student_package_status_by_date_learning['total_learning_sc_success'] > 0L, 1L)
            .otherwise(0L).alias('is_sc_success'),
            f.when(df_student_package_status_by_date_learning['total_learning_lt_success'] > 0L, 1L)
            .otherwise(0L).alias('is_lt_success'),

            f.when(df_student_package_status_by_date_learning['total_learning_le2'] > 0L, 1L)
            .otherwise(0L).alias('is_le2'),
            f.when(df_student_package_status_by_date_learning['total_learning_le2_success'] > 0L, 1L)
            .otherwise(0L).alias('is_le2_success'),
            f.when(df_student_package_status_by_date_learning['total_learning_voxy_success'] > 0L, 1L)
            .otherwise(0L).alias('is_voxy_success'),
            f.when(df_student_package_status_by_date_learning['total_learning_native_talk_success'] > 0L, 1L)
            .otherwise(0L).alias('is_native_talk_success'),
            f.when(df_student_package_status_by_date_learning['total_learning_home_work_success'] > 0L, 1L)
            .otherwise(0L).alias('is_home_work_success'),
            f.when(df_student_package_status_by_date_learning['total_learning_ncsbasic_success'] > 0L, 1L)
            .otherwise(0L).alias('is_ncsbasic_success'),

            'total_learning_ls_sc_lt_le2',
            'total_learning_ls_sc_lt_le2_success',

            'total_learning_ls_sc_lt',
            'total_learning_ls_sc_lt_success',
            'total_learning_ls_success',
            'total_learning_sc_success',
            'total_learning_lt_success',

            'total_duration_ls_sc_lt',

            'total_learning_le2',
            'total_learning_le2_success',
            'total_learning_voxy_success',
            'total_learning_native_talk_success',
            'total_learning_home_work_success',
            'total_learning_ncsbasic_success',

            'total_duration_le2',
            'total_duration_voxy',
            'total_duration_native_talk',
            'total_duration_home_work',
            'total_duration_ncsbasic'
        )

    df_student_package_status_group_week = df_student_package_status_by_date_learning \
        .groupBy('date_id', 'package_id', 'student_level_id', 'advisor_id') \
        .agg(f.count('contact_id').alias('total_student'),
             f.sum('is_activated').alias('total_student_active'),

             f.sum('is_ls_sc_lt_le2').alias('total_student_ls_sc_lt_le2'),
             f.sum('is_ls_sc_lt_le2_success').alias('total_student_ls_sc_lt_le2_success'),

             f.sum('total_learning_ls_sc_lt_le2').alias('total_learning_ls_sc_lt_le2'),
             f.sum('total_learning_ls_sc_lt_le2_success').alias('total_learning_ls_sc_lt_le2_success'),

             f.sum('is_ls_sc_lt').alias('total_student_ls_sc_lt'),
             f.sum('is_ls_sc_lt_success').alias('total_student_ls_sc_lt_success'),
             f.sum('is_ls_success').alias('total_student_ls_success'),
             f.sum('is_sc_success').alias('total_student_sc_success'),
             f.sum('is_lt_success').alias('total_student_lt_success'),

             f.sum('total_learning_ls_sc_lt').alias('total_learning_ls_sc_lt'),
             f.sum('total_learning_ls_sc_lt_success').alias('total_learning_ls_sc_lt_success'),
             f.sum('total_learning_ls_success').alias('total_learning_ls_success'),
             f.sum('total_learning_sc_success').alias('total_learning_sc_success'),
             f.sum('total_learning_lt_success').alias('total_learning_lt_success'),

             f.sum('total_duration_ls_sc_lt').alias('total_duration_ls_sc_lt'),

             f.sum('is_le2').alias('total_student_le2'),
             f.sum('is_le2_success').alias('total_student_le2_success'),
             f.sum('is_voxy_success').alias('total_student_voxy_success'),
             f.sum('is_native_talk_success').alias('total_student_native_talk_success'),
             f.sum('is_home_work_success').alias('total_student_home_work_success'),
             f.sum('is_ncsbasic_success').alias('total_student_ncsbasic_success'),

             f.sum('total_learning_le2').alias('total_learning_le2'),
             f.sum('total_learning_le2_success').alias('total_learning_le2_success'),
             f.sum('total_learning_voxy_success').alias('total_learning_voxy_success'),
             f.sum('total_learning_native_talk_success').alias('total_learning_native_talk_success'),
             f.sum('total_learning_home_work_success').alias('total_learning_home_work_success'),
             f.sum('total_learning_ncsbasic_success').alias('total_learning_ncsbasic_success'),

             f.sum('total_duration_le2').alias('total_duration_le2'),
             f.sum('total_duration_voxy').alias('total_duration_voxy'),
             f.sum('total_duration_native_talk').alias('total_duration_native_talk'),
             f.sum('total_duration_home_work').alias('total_duration_home_work'),
             f.sum('total_duration_ncsbasic').alias('total_duration_ncsbasic')
             ) \
        .withColumn('period_id', f.lit(DAILY_PERIOD_ID)) \
        .withColumn('report_role_id', f.lit(REPORT_ROLE_MANAGER_ID))

    # display(df_student_package_status_group_week, "df_student_package_status_group_week")

    dyf_student_package_status_group_week = DynamicFrame.fromDF(df_student_package_status_group_week,
                                                                glueContext,
                                                                'dyf_student_package_status_group_week')

    apply_output = ApplyMapping \
        .apply(frame=dyf_student_package_status_group_week,
               mappings=[("report_role_id", "long", "report_role_id", "long"),
                         ("period_id", "long", "period_id", "long"),
                         ("date_id", "long", "time_id", "long"),

                         ("package_id", "long", "package_id", "long"),
                         ("student_level_id", "long", "student_level_id", "long"),
                         ("advisor_id", "long", "advisor_id", "long"),

                         ("total_student", "long", "total_student", "long"),
                         ("total_student_active", "long", "total_student_active", "long"),

                         ("total_student_ls_sc_lt_le2", "long", "total_student_ls_sc_lt_le2", "long"),
                         ("total_student_ls_sc_lt_le2_success", "long", "total_student_ls_sc_lt_le2_success", "long"),
                         ("total_learning_ls_sc_lt_le2", "long", "total_learning_ls_sc_lt_le2", "long"),
                         ("total_learning_ls_sc_lt_le2_success", "long", "total_learning_ls_sc_lt_le2_success", "long"),

                         ("total_student_ls_sc_lt", "long", "total_student_ls_sc_lt", "long"),
                         ("total_student_ls_sc_lt_success", "long", "total_student_ls_sc_lt_success", "long"),
                         ("total_student_ls_success", "long", "total_student_ls_success", "long"),
                         ("total_student_sc_success", "long", "total_student_sc_success", "long"),
                         ("total_student_lt_success", "long", "total_student_lt_success", "long"),

                         ("total_learning_ls_sc_lt", "long", "total_learning_ls_sc_lt", "long"),
                         ("total_learning_ls_sc_lt_success", "long", "total_learning_ls_sc_lt_success", "long"),
                         ("total_learning_ls_success", "long", "total_learning_ls_success", "long"),
                         ("total_learning_sc_success", "long", "total_learning_sc_success", "long"),
                         ("total_learning_lt_success", "long", "total_learning_lt_success", "long"),

                         ("total_duration_ls_sc_lt", "long", "total_duration_ls_sc_lt", "long"),

                         ("total_student_le2", "long", "total_student_le2", "long"),
                         ("total_student_le2_success", "long", "total_student_le2_success", "long"),
                         ("total_student_voxy_success", "long", "total_student_voxy_success", "long"),
                         ("total_student_native_talk_success", "long", "total_student_native_talk_success", "long"),
                         ("total_student_home_work_success", "long", "total_student_home_work_success", "long"),
                         ("total_student_ncsbasic_success", "long", "total_student_ncsbasic_success", "long"),

                         ("total_learning_le2", "long", "total_learning_le2", "long"),
                         ("total_learning_le2_success", "long", "total_learning_le2_success", "long"),
                         ("total_learning_voxy__success", "long", "total_learning_voxy__success", "long"),
                         ("total_learning_native_talk_success", "long", "total_learning_native_talk_success", "long"),
                         ("total_learning_home_work_success", "long", "total_learning_home_work_success", "long"),
                         ("total_learning_ncsbasic_success", "long", "total_learning_ncsbasic_success", "long"),

                         ("total_duration_le2", "long", "total_duration_le2", "long"),
                         ("total_duration_voxy", "long", "total_duration_voxy", "long"),
                         ("total_duration_native_talk", "long", "total_duration_native_talk", "long"),
                         ("total_duration_home_work", "long", "total_duration_home_work", "long"),
                         ("total_duration_ncsbasic", "long", "total_duration_ncsbasic", "long")
                         ])

    dfy_output = ResolveChoice.apply(frame=apply_output, choice="make_cols", transformation_ctx="resolvechoice2")

    display(dfy_output, "dfy_output")

    # save_data_to_redshift(
    #     glueContext,
    #     dfy_output,
    #     'student_native_report',
    #     'bc200.bc200_fact_v2_1',
    #     "s3n://dts-odin/temp/bc200/bc200_fact_v2_1",
    #     "datasink4")

    preactions = "DELETE from bc200.bc200_fact_v2_1 WHERE period_id = " + str(DAILY_PERIOD_ID) + " and time_id >= " + str(start_date_id)
    glueContext.write_dynamic_frame.from_jdbc_conf(frame=dfy_output,
                                                   catalog_connection="glue_redshift",
                                                   connection_options={
                                                       "preactions": preactions,
                                                       "dbtable": "bc200.bc200_fact_v2_1",
                                                       "database": "student_native_report"
                                                   },
                                                   redshift_tmp_dir="s3n://dts-odin/temp/bc200/bc200_fact_v2",
                                                   transformation_ctx="datasink4")




    #-------------------------------------------------------

    df_student_package_status_by_date_learning.unpersist()
    df_student_package_status_by_date.unpersist()
コード例 #34
0
## @inputs: [frame = applymapping1]
resolvechoice2 = ResolveChoice.apply(frame=applymapping1,
                                     choice="make_struct",
                                     transformation_ctx="resolvechoice2")
## @type: DropNullFields
## @args: [transformation_ctx = "dropnullfields3"]
## @return: dropnullfields3
## @inputs: [frame = resolvechoice2]
dropnullfields3 = DropNullFields.apply(frame=resolvechoice2,
                                       transformation_ctx="dropnullfields3")
## @type: DataSink
## @args: [connection_type = "s3", connection_options = {"path": "s3://go-lambda-bucket/Taxi_Data"}, format = "parquet", transformation_ctx = "datasink4"]
## @return: datasink4
## @inputs: [frame = dropnullfields3]
##----------------------------------
# Convert to a Spark DataFrame (lit and DynamicFrame are needed for the steps below)
from awsglue.dynamicframe import DynamicFrame
from pyspark.sql.functions import lit

customDF = datasource0.toDF()

#add a new column for "type"
customDF = customDF.withColumn("type", lit('yellow'))

# Convert back to a DynamicFrame for further processing.
customDynamicFrame = DynamicFrame.fromDF(customDF, glueContext, "customDF_df")
##----------------------------------
datasink4 = glueContext.write_dynamic_frame.from_options(
    frame=customDynamicFrame,
    connection_type="s3",
    connection_options={"path": "s3://go-lambda-bucket"},
    format="parquet",
    transformation_ctx="datasink4")
job.commit()
コード例 #35
0
ファイル: word_count.py プロジェクト: sachu/sparklab
# MAGIC 
# MAGIC Let's create a new DataFrame from `wordsDF` by performing an operation that adds an 's' to each word.  To do this, we'll call the [`select` DataFrame function](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.select) and pass in a column that has the recipe for adding an 's' to our existing column.  To generate this `Column` object you should use the [`concat` function](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.concat) found in the [`pyspark.sql.functions` module](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#module-pyspark.sql.functions).  Note that `concat` takes in two or more string columns and returns a single string column.  In order to pass in a constant or literal value like 's', you'll need to wrap that value with the [`lit` column function](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.lit).
# MAGIC 
# MAGIC Please replace `<FILL IN>` with your solution.  After you have created `pluralDF` you can run the next cell which contains two tests.  If your implementation is correct it will print `1 test passed` for each test.
# MAGIC 
# MAGIC This is the general form that exercises will take.  Exercises will include an explanation of what is expected, followed by code cells where one cell will have one or more `<FILL IN>` sections.  The cell that needs to be modified will have `# TODO: Replace <FILL IN> with appropriate code` on its first line.  Once the `<FILL IN>` sections are updated and the code is run, the test cell can then be run to verify the correctness of your solution.  The last code cell before the next markdown section will contain the tests.
# MAGIC 
# MAGIC > Note:
# MAGIC > Make sure that the resulting DataFrame has one column which is named 'word'.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import lit, concat

pluralDF = wordsDF.select(concat(wordsDF.word, lit('s')).alias('word'))
pluralDF.show()

# COMMAND ----------

# Load in the testing code and check to see if your answer is correct
# If incorrect it will report back '1 test failed' for each failed test
# Make sure to rerun any cell you change before trying the test again
from databricks_test_helper import Test
# TEST Using DataFrame functions to add an 's' (1b)
Test.assertEquals(pluralDF.first()[0], 'cats', 'incorrect result: you need to add an s')
Test.assertEquals(pluralDF.columns, ['word'], "there should be one column named 'word'")

# COMMAND ----------

# MAGIC %md
コード例 #36
0
ファイル: modules.py プロジェクト: jacobdr/SMV
 def dqm(self):
     return SmvDQM().add(
         DQMRule(col("b") < 0.4 , "b_lt_03")).add(
         DQMFix(col("a") < 1, lit(1).alias("a"), "a_lt_1_fix")).add(
         FailTotalRuleCountPolicy(2)).add(
         FailTotalFixCountPolicy(1))
コード例 #37
0
# COMMAND ----------

# MAGIC %md ### use `Pandas`

# COMMAND ----------

bts_data_pdf = bts_data_df.toPandas()
bts_data_pdf

# COMMAND ----------

import pyspark.sql.functions as F
bts_data_df = bts_data_df.withColumn(
    "city_agg",
    F.when(F.col("city") > 0, "Prague").otherwise(F.lit("Brno")))

# COMMAND ----------

display(bts_data_df.groupby("city_agg").count())

# COMMAND ----------

# MAGIC %sql
# MAGIC select cgi_ecgi, count(1) as row_cnt
# MAGIC from probe_data
# MAGIC group by cgi_ecgi
# MAGIC order by row_cnt DESC
# MAGIC limit 10

# COMMAND ----------
コード例 #38
0
ファイル: compiler.py プロジェクト: zhuohuwu0603/ibis
def compile_null_if(t, expr, scope, **kwargs):
    op = expr.op()
    col = t.translate(op.arg, scope)
    nullif_col = t.translate(op.null_if_expr, scope)
    return F.when(col == nullif_col, F.lit(None)).otherwise(col)
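The expression returned above matches SQL's NULLIF. A minimal standalone sketch of the same pattern in plain PySpark follows; the DataFrame and its columns `a` and `b` are hypothetical and not part of the ibis compiler code.

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, 1), (2, 3)], ["a", "b"])

# NULLIF(a, b): yield NULL where a == b, otherwise keep a
df.select(
    F.when(F.col("a") == F.col("b"), F.lit(None)).otherwise(F.col("a")).alias("nullif_a_b")
).show()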
コード例 #39
0
ファイル: compiler.py プロジェクト: zhuohuwu0603/ibis
def compile_null_literal(t, expr, scope):
    return F.lit(None)
コード例 #40
0
from datetime import datetime

import pyspark.sql.functions as sf
from pyspark.sql.functions import udf, col, datediff, current_date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType

customSchema2 = StructType([StructField("name", StringType(), True), StructField("country", StringType(), True), \
    StructField("area_code", IntegerType(), True), StructField("code", StringType(), True)])

df1 = sqlcontext.read \
    .format('com.databricks.spark.csv') \
    .options(header='true') \
    .load("file:///C:/Users/Administrator/Downloads/data1/airport.csv", schema = customSchema2)
#df1.show()
newjoindf = df.join(df1, df.origin == df1.code)


sqlcontext.registerDataFrameAsTable(newjoindf, "jointable")
sqlquery= "SELECT code as Airport, year, month , avg(CAST(arr_delay AS BIGINT)) AS avgdelay FROM jointable GROUP BY code,year,month ORDER BY avgdelay DESC LIMIT 5"
sqlquery1= "SELECT code as Airport, year, month , avg(CAST(arr_delay AS BIGINT)) AS avgdelay FROM jointable GROUP BY code,year,month ORDER BY avgdelay LIMIT 5"
top5airport = sqlcontext.sql(sqlquery).show()
last5airport = sqlcontext.sql(sqlquery1).show()

concatedate = newjoindf.withColumn('date',sf.concat(sf.col('year'),sf.lit('/'), sf.col('month'),sf.lit('/'), sf.col('day')))
func = udf(lambda x: datetime.strptime(x, '%Y/%m/%d'), DateType())
date = concatedate.withColumn('date', func(col('date')))
sqlcontext.registerDataFrameAsTable(date, "newtable")

sqlquery2= "SELECT DISTINCT code AS airport,date FROM newtable WHERE arr_delay <> 'NA' ORDER BY date DESC"


#"SELECT DISTINCT code AS airport, date  FROM newtable WHERE arr_delay IS NOT NULL ORDER BY date DESC "
newdf = sqlcontext.sql(sqlquery2)
newdf.show(100)
last7days = newdf.where(datediff(current_date(), col("date")) < 7)

コード例 #41
0
# import findspark
# findspark.init()
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()
# spark

# import matplotlib.pyplot as plt
import numpy as np

from pyspark.sql.functions import lit
import time

row = spark.read.csv("/content/kc_house_data.csv", inferSchema=True, header=True)
# add intercept column
row = row.withColumn("intercept", lit(1))
print("Correlation price, bedrooms:", row.stat.corr('price', 'bedrooms'))
print("Correlation price, bathrooms:", row.stat.corr('price', 'bathrooms'))
print("Correlation price, sqft_living:", row.stat.corr('price', 'sqft_living'))
print("Correlation price, sqft_lot:", row.stat.corr('price', 'sqft_lot'))
print("Correlation price, floors:", row.stat.corr('price', 'floors'))
print("Correlation price, waterfront:", row.stat.corr('price', 'waterfront'))
print("Correlation price, view:", row.stat.corr('price', 'view'))
print("Correlation price, condition:", row.stat.corr('price', 'condition'))
print("Correlation price, grade:", row.stat.corr('price', 'grade'))
print("Correlation price, sqft_above:", row.stat.corr('price', 'sqft_above'))
print("Correlation price, sqft_basement:", row.stat.corr('price', 'sqft_basement'))
print("Correlation price, yr_built:", row.stat.corr('price', 'yr_built'))
print("Correlation price, yr_renovated:", row.stat.corr('price', 'yr_renovated'))
print("Correlation price, zipcode:", row.stat.corr('price', 'zipcode'))
print("Correlation price, lat:", row.stat.corr('price', 'lat'))
コード例 #42
0
cj_df = hc.sql(cj_query)
cj_lt = cj_df.schema.names
cj_lt[:] = [s.replace('cj_', 'c_') for s in cj_lt]
cj_df = cj_df.toDF(*cj_lt)

cei_df = hc.sql(cei_query)
cei_lt = cei_df.schema.names
cei_lt[:] = [s.replace('cei_', 'c_') for s in cei_lt]
cei_df = cei_df.toDF(*cei_lt)

all_columns = set(cei_df.schema.names).union(set(cj_df.schema.names))

from pyspark.sql.functions import lit

for column0 in all_columns:
    if column0 not in cj_df.columns:
        cj_df = cj_df.withColumn(column0, lit('CNP'))
        print("column missing in cj:" + column0)
    if column0 not in cei_df.columns:
        cei_df = cei_df.withColumn(column0, lit('CNP'))
        print("column missing in cei:" + column0)
    # print(column0)

print(set(cj_df.columns).difference(set(cei_df.columns)))

cei_df = cei_df[sorted(cei_df.schema.names)]
cj_df = cj_df[sorted(cj_df.schema.names)]

output_df = cj_df.union(cei_df)
コード例 #43
0
output_path = '/user/soyel/pyspark-cicd-template/output/user_pageviews'

# Extract
inc_df: DataFrame = spark.read.csv(path=page_views_path,
                                   header=True,
                                   schema=page_views)
prev_df: DataFrame = spark.read.table(tableName=user_pageviews_tab)

# Transform
inc_df: DataFrame = (inc_df.groupBy('email').count().select([
    'email',
    col('count').alias('page_view'),
    current_date().alias('last_active')
]))

df_transformed: DataFrame = (inc_df.join(
    prev_df, inc_df.email == prev_df.email, 'full').select([
        coalesce(prev_df.email, inc_df.email).alias('email'),
        (coalesce(prev_df.page_view, lit(0)) +
         coalesce(inc_df.page_view, lit(0))).alias('page_view'),
        coalesce(prev_df.created_date,
                 inc_df.last_active).cast('date').alias('created_date'),
        coalesce(inc_df.last_active,
                 prev_df.last_active).cast('date').alias('last_active')
    ]))

# Load
df_transformed.write.save(path=output_path, mode='overwrite')

spark.stop()
コード例 #44
0
    def classify_prediction_requests(rdd):

        from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
        from pyspark.sql.types import StructType, StructField

        prediction_request_schema = StructType([
            StructField("Carrier", StringType(), True),
            StructField("DayOfMonth", IntegerType(), True),
            StructField("DayOfWeek", IntegerType(), True),
            StructField("DayOfYear", IntegerType(), True),
            StructField("DepDelay", DoubleType(), True),
            StructField("Dest", StringType(), True),
            StructField("Distance", DoubleType(), True),
            StructField("FlightDate", DateType(), True),
            StructField("FlightNum", StringType(), True),
            StructField("Origin", StringType(), True),
            StructField("Timestamp", TimestampType(), True),
            StructField("UUID", StringType(), True),
        ])

        prediction_requests_df = spark.createDataFrame(
            rdd, schema=prediction_request_schema)
        prediction_requests_df.show()

        #
        # Add a Route variable to replace FlightNum
        #

        from pyspark.sql.functions import lit, concat
        prediction_requests_with_route = prediction_requests_df.withColumn(
            'Route',
            concat(prediction_requests_df.Origin, lit('-'),
                   prediction_requests_df.Dest))
        prediction_requests_with_route.show(6)

        # Vectorize string fields with the corresponding pipeline for that column
        # Turn category fields into categoric feature vectors, then drop intermediate fields
        for column in ["Carrier", "Origin", "Dest", "Route"]:
            string_indexer_model = string_indexer_models[column]
            prediction_requests_with_route = string_indexer_model.transform(
                prediction_requests_with_route)

        # Vectorize numeric columns: DepDelay, Distance and index columns
        final_vectorized_features = vector_assembler.transform(
            prediction_requests_with_route)

        # Inspect the vectors
        final_vectorized_features.show()

        # Drop the individual index columns
        index_columns = [
            "Carrier_index", "Origin_index", "Dest_index", "Route_index"
        ]
        for column in index_columns:
            final_vectorized_features = final_vectorized_features.drop(column)

        # Inspect the finalized features
        final_vectorized_features.show()

        # Make the prediction
        predictions = rfc.transform(final_vectorized_features)

        # Drop the features vector and prediction metadata to give the original fields
        predictions = predictions.drop("Features_vec")
        final_predictions = predictions.drop("indices").drop("values").drop(
            "rawPrediction").drop("probability")

        # Inspect the output
        final_predictions.show()

        # Store to Mongo
        if final_predictions.count() > 0:
            final_predictions.rdd.map(lambda x: x.asDict()).saveToMongoDB(
                "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response"
            )
コード例 #45
0
ファイル: test_serde.py プロジェクト: apache/spark
 def test_datetime_at_epoch(self):
     epoch = datetime.datetime.fromtimestamp(0)
     df = self.spark.createDataFrame([Row(date=epoch)])
     first = df.select('date', lit(epoch).alias('lit_date')).first()
     self.assertEqual(first['date'], epoch)
     self.assertEqual(first['lit_date'], epoch)
コード例 #46
0
ファイル: resample.py プロジェクト: ofunkey/tempo
def __appendAggKey(tsdf, freq=None):
    """
    :param tsdf: TSDF object as input
    :param freq: frequency at which to upsample
    :return: return a TSDF with a new aggregate key (called agg_key)
    """
    df = tsdf.df
    checkAllowableFreq(freq)

    # compute timestamp columns
    sec_col = f.second(f.col(tsdf.ts_col))
    min_col = f.minute(f.col(tsdf.ts_col))
    hour_col = f.hour(f.col(tsdf.ts_col))

    if (freq == SEC):
        #agg_key = f.concat(f.col(tsdf.ts_col).cast("date"), f.lpad(hour_col, 2, '0'), f.lpad(min_col, 2, '0'), f.lpad(sec_col, 2, '0'))
        agg_key = f.concat(
            f.col(tsdf.ts_col).cast("date"), f.lit(" "),
            f.lpad(hour_col, 2, '0'), f.lit(':'), f.lpad(min_col, 2, '0'),
            f.lit(':'), f.lpad(sec_col, 2, '0')).cast("timestamp")
    elif (freq == MIN):
        #agg_key = f.concat(f.col(tsdf.ts_col).cast("date"), f.lpad(hour_col, 2, '0'), f.lpad(min_col, 2, '0'))
        agg_key = f.concat(
            f.col(tsdf.ts_col).cast("date"), f.lit(' '),
            f.lpad(hour_col, 2, '0'), f.lit(':'), f.lpad(min_col, 2, '0'),
            f.lit(':'), f.lit('00')).cast("timestamp")
    elif (freq == HR):
        #agg_key = f.concat(f.col(tsdf.ts_col).cast("date"), f.lpad(hour_col, 2, '0'))
        agg_key = f.concat(
            f.col(tsdf.ts_col).cast("date"), f.lit(' '),
            f.lpad(hour_col, 2, '0'), f.lit(':'), f.lit('00'), f.lit(':'),
            f.lit('00')).cast("timestamp")

    df = df.withColumn("agg_key", agg_key)
    return tempo.tsdf.TSDF(df, tsdf.ts_col, partition_cols=tsdf.partitionCols)
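On Spark 2.3+ the same second/minute/hour truncation can usually be expressed with date_trunc. A rough sketch follows; it is not part of the tempo library, and `df` and `event_time` are hypothetical stand-ins for tsdf.df and tsdf.ts_col.

# Rough equivalent of the agg_key construction above using date_trunc (Spark 2.3+).
import pyspark.sql.functions as f

df_sec = df.withColumn("agg_key", f.date_trunc("second", f.col("event_time")))
df_min = df.withColumn("agg_key", f.date_trunc("minute", f.col("event_time")))
df_hr = df.withColumn("agg_key", f.date_trunc("hour", f.col("event_time")))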
コード例 #47
0
ファイル: sparkStreaming.py プロジェクト: wsw2008new/tfm
def dist(long_x, lat_x, long_y, lat_y):
    return acos(
        sin(radians(lat_x)) * sin(radians(lat_y)) + 
        cos(radians(lat_x)) * cos(radians(lat_y)) * 
            cos(radians(long_x) - radians(long_y))
    ) * lit(6371.0)
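A usage sketch for the helper above; the DataFrame and its coordinate column names are hypothetical, and the arguments follow the (long_x, lat_x, long_y, lat_y) order of the signature.

# Hypothetical usage: great-circle distance in km between pickup and dropoff points.
from pyspark.sql.functions import col

df = df.withColumn(
    "trip_km",
    dist(col("pickup_lon"), col("pickup_lat"), col("dropoff_lon"), col("dropoff_lat")))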
コード例 #48
0
                    StructField("VERTN", StringType(), True),
                    StructField("VBEWA", StringType(), True),
                    StructField("KBLNR", StringType(), True),
                    StructField("KBLPOS", ShortType(), True),
                    StructField("GRANT_NBR", StringType(), True),
                    StructField("GMVKZ", StringType(), True),
                    StructField("SRTYPE", StringType(), True),
                    StructField("LOTKZ", StringType(), True),
                    StructField("ZINKZ", StringType(), True),
                    StructField("FKBER", StringType(), True),
                    StructField("INTRENO", StringType(), True),
                    StructField("PPRCT", StringType(), True),
                    StructField("BUZID", StringType(), True),
                    StructField("AUGGJ", ShortType(), True),
                    StructField("HKTID", StringType(), True),
                    StructField("BUDGET_PD", StringType(), True),
                    StructField("KONTT", StringType(), True),
                    StructField("KONTL", StringType(), True),
                    StructField("UEBGDAT", StringType(), True),
                    StructField("VNAME", StringType(), True),
                    StructField("EGRUP", StringType(), True),
                    StructField("BTYPE", StringType(), True),
                    StructField("PROPMANO", StringType(), True),
                    StructField("INWARDNO_HD", StringType(), True),
                    StructField("INWARDDT_HD", StringType(), True)])
df1 = spark.read.load(file_path, format="csv", sep="¬", header="true", schema=schema1)
df1 = df1.withColumn("LOAD_DATE", lit(sys.argv[6]))

df1.write.insertInto(table_full, overwrite=False)

コード例 #49
0
def count_not_null(c, nan_as_null=False):
    pred = F.col(c).isNotNull() & (~isnan(c) if nan_as_null else F.lit(True))
    return F.sum(pred.cast("integer")).alias(c)
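A possible way to apply the helper across all columns of a hypothetical DataFrame `df`; the bare isnan import mirrors the function body above.

# Hypothetical usage: non-null (and optionally non-NaN) counts for every column of df.
import pyspark.sql.functions as F
from pyspark.sql.functions import isnan

df.agg(*[count_not_null(c) for c in df.columns]).show()
df.agg(*[count_not_null(c, nan_as_null=True) for c in df.columns]).show()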
コード例 #50
0
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import array, udf, unix_timestamp
from pyspark.sql.types import TimestampType
import pyspark.sql.functions as F
import numpy as np
import sys

#spark = SparkSession.builder.appName("wind_pi_eda").enableHiveSupport().getOrCreate()
df = spark.sql("select * from analytical_ds.wind_turbine_combined_data")


split_col = F.split(df['tur_date'], '-')
df = df.withColumn('month', split_col.getItem(1))
df = df.withColumn('date', split_col.getItem(0))
split_col_2 = F.split(df['tur_time'], ':')
df = df.withColumn('hour', split_col_2.getItem(0))
df = df.withColumn('minute', split_col_2.getItem(1))
df = df.withColumn('datetime_index', F.concat(F.col('date'),F.lit('-'), F.col('month'),F.lit('-2017 '), F.col('hour'),F.lit(':'),F.col('minute'),F.lit(':00')))

col_names = df.drop('ext_curtailment_ind_avg','int_derate_ind_avg','consumption_counter_sample','row_num','operating_state','state_fault').columns[3:-5]


for i in range(0,len(col_names)):
	df2 = df.select(unix_timestamp('datetime_index', "dd-MMM-yyyy HH:mm:ss") .cast(TimestampType()).alias("timestamp"),'turbine_id',col_names[i])
	df2 = df2.withColumn(col_names[i], F.last(col_names[i], True).over(Window.partitionBy('turbine_id').orderBy('timestamp').rowsBetween(-sys.maxsize, 0)))
	df2 = df2.withColumn('lag1',F.lag(df2[col_names[i]]).over(Window.partitionBy("turbine_id").orderBy("timestamp")))
	df2 = df2.withColumn('lag2',F.lag(df2[col_names[i]],2).over(Window.partitionBy("turbine_id").orderBy("timestamp")))
	df2 = df2.withColumn('lag3',F.lag(df2[col_names[i]],3).over(Window.partitionBy("turbine_id").orderBy("timestamp")))
	df2 = df2.withColumn('lag4',F.lag(df2[col_names[i]],4).over(Window.partitionBy("turbine_id").orderBy("timestamp")))
	df2 = df2.withColumn('lag5',F.lag(df2[col_names[i]],5).over(Window.partitionBy("turbine_id").orderBy("timestamp")))
	df2 = df2.withColumn('lag6',F.lag(df2[col_names[i]],6).over(Window.partitionBy("turbine_id").orderBy("timestamp")))
	df2 = df2.withColumn('lag7',F.lag(df2[col_names[i]],7).over(Window.partitionBy("turbine_id").orderBy("timestamp")))
	df2 = df2.withColumn('lag8',F.lag(df2[col_names[i]],8).over(Window.partitionBy("turbine_id").orderBy("timestamp")))
コード例 #51
0
ファイル: matrix_ops.py プロジェクト: sciosci/datarank
def vector_plus_vector(v1, v2):
    """Compute vector v1 + v2 where some components of v1 may not appear in v2"""
    return v1.selectExpr('i', 'value as v1').join(
        v2.selectExpr('i', 'value as v2'), on='i', how='left'). \
        select('i', (fn.col('v1') + fn.coalesce(fn.col('v2'), fn.lit(0))).alias('value'))
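A usage sketch, assuming an active SparkSession named `spark`; the (i, value) column names follow the function above, the sample rows are made up.

# Hypothetical usage: v1 and v2 are sparse vectors stored as (i, value) rows.
v1 = spark.createDataFrame([(0, 1.0), (1, 2.0), (2, 3.0)], ["i", "value"])
v2 = spark.createDataFrame([(1, 10.0)], ["i", "value"])

vector_plus_vector(v1, v2).show()
# component i=1 becomes 12.0; components missing from v2 keep their v1 value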
コード例 #52
0
 def data2(self):
     return self.spark.range(10).toDF('id') \
         .withColumn("ks", array([lit(i) for i in range(20, 30)])) \
         .withColumn("k", explode(col('ks'))) \
         .withColumn("v2", col('k') * 100) \
         .drop('ks')
コード例 #53
0
def target_encoder(training_frame,
                   test_frame,
                   x,
                   y,
                   lambda_=0.15,
                   threshold=150,
                   test=False,
                   valid_frame=None,
                   frame_type='h2o',
                   id_col=None):
    """ Applies simple target encoding to categorical variables.

    :param training_frame: Training frame which to create target means and to be encoded.
    :param test_frame: Test frame to be encoded using information from training frame.
    :param x: Name of input variable to be encoded.
    :param y: Name of target variable to use for encoding.
    :param lambda_: Balance between level mean and overall mean for small groups.
    :param threshold: Number below which a level is considered small enough to be shrunken.
    :param test: Whether or not to print the row_val_dict for testing purposes.
    :param valid_frame: Optional validation frame to encode as well.
    :param frame_type: The type of frame being used. Accepted: ['h2o','pandas','spark']
    :param id_col: The name of the id column (spark dataframes only). Conserves memory by returning only two columns (id, x_Tencode).
    :return: Tuple of encoded columns from the train (and optional valid) and test sets, as frames of the requested type.

    """

    encode_name = x + '_Tencode'

    if frame_type == 'spark':
        # x_column_type = training_frame.select(x).dtypes.flatMap(list)[1]

        # To get the average out of the df, convert it to an RDD, flatMap it,
        # and take the first (and only) value from the returned list.
        overall_mean = training_frame.agg({y: 'avg'}).rdd.flatMap(list).first()
        overall_mean_train = overall_mean

        #ALTERNATIVE way to do the same thing with sql functions
        # from pyspark.sql.functions import col, avg
        # overall_mean = training_frame.agg(avg(col(y))).rdd.flatMap(list).first()

        def find_shrunken_averages(tuple_input):
            """
            Reduce function to return the proper average for a given level.

            :return: A tuple of (level, adjusted_mean || overall_mean)
            """
            # The categorical level.
            level = tuple_input[0]
            # The labels list (the y variable) from a map function.
            labels = tuple_input[1]
            # The total number of level occurrences in the frame (i.e. the count).
            level_n = len(labels)
            level_mean = sum(labels) / level_n

            # If the level occurs often enough, keep its mean;
            # otherwise blend the level mean with the overall mean.
            if level_n >= threshold:
                return (level, level_mean)
            else:
                return(level, ((1 - lambda_) * level_mean) +\
                                      (lambda_ * overall_mean) )

        # This article shows why one has to use a map-groupByKey-map rather than a map-reduce order:
        # to collect all of a key's values into one reducer you have to do a groupByKey.
        # https://databricks.gitbooks.io/databricks-spark-knowledge-base/content/best_practices/prefer_reducebykey_over_groupbykey.html
        levels_average_list_train = training_frame.select(
            x, y).rdd.map(lambda i: (i[0], i[1])).groupByKey().map(
                find_shrunken_averages).collect()
        levels_average_list_valid = None
        overall_mean_valid = None
        if valid_frame:
            #update overall_mean to valid frames mean
            overall_mean_valid = valid_frame.agg({
                y: 'avg'
            }).rdd.flatMap(list).first()
            overall_mean = overall_mean_valid
            levels_average_list_valid = valid_frame.select(
                x, y).rdd.map(lambda i: (i[0], i[1])).groupByKey().map(
                    find_shrunken_averages).collect()
        # print(levels_average_list_train)

        from pyspark.sql.functions import lit  #creates a literal value
        # create new frames with a new column
        new_training_frame, new_test_frame, new_valid_frame = None, None, None
        if id_col != None:
            #filter out other columns to save memory if id_col specified
            new_training_frame = training_frame.select(id_col, x).withColumn(
                encode_name, lit(overall_mean_train))
            if valid_frame:
                new_valid_frame = valid_frame.select(id_col, x).withColumn(
                    encode_name, lit(overall_mean_valid))
                new_test_frame = test_frame.select(id_col, x).withColumn(
                    encode_name, lit(overall_mean_valid))
            else:
                new_test_frame = test_frame.select(id_col, x).withColumn(
                    encode_name, lit(overall_mean_train))
        else:
            new_training_frame = training_frame.withColumn(
                encode_name, lit(overall_mean_train))
            if valid_frame:
                new_valid_frame = valid_frame.withColumn(
                    encode_name, lit(overall_mean_valid))
                new_test_frame = test_frame.withColumn(encode_name,
                                                       lit(overall_mean_valid))
            else:
                new_test_frame = test_frame.withColumn(encode_name,
                                                       lit(overall_mean_train))

        #Replace the values in the dataframes with new encoded values
        from pyspark.sql.functions import when
        for k, v in levels_average_list_train:
            new_training_frame = new_training_frame.withColumn(
                encode_name,
                when(new_training_frame[x] == k,
                     v).otherwise(new_training_frame[encode_name]))
            if not valid_frame:
                new_test_frame = new_test_frame.withColumn(
                    encode_name,
                    when(new_test_frame[x] == k,
                         v).otherwise(new_test_frame[encode_name]))
        # if we have a validation frame, encode the test-frame levels using the
        # averages computed on the validation frame instead of the training frame
        if valid_frame:
            for k, v in levels_average_list_valid:
                new_valid_frame = new_valid_frame.withColumn(
                    encode_name,
                    when(new_valid_frame[x] == k,
                         v).otherwise(new_valid_frame[encode_name]))
                new_test_frame = new_test_frame.withColumn(
                    encode_name,
                    when(new_test_frame[x] == k,
                         v).otherwise(new_test_frame[encode_name]))
        if id_col != None:
            # remove the original x as it's already in the original dfs
            if valid_frame:
                return new_training_frame.drop(x), new_valid_frame.drop(
                    x), new_test_frame.drop(x)
            else:
                return new_training_frame.drop(x), new_test_frame.drop(x)
        else:
            if valid_frame:
                return new_training_frame, new_valid_frame, new_test_frame
            else:
                return new_training_frame, new_test_frame

    else:
        import h2o
        import pandas as pd
        import numpy as np

        trdf, vdf, tss = None, None, None
        if frame_type == 'h2o':
            # convert to pandas
            trdf = training_frame.as_data_frame().loc[:, [x, y]]  # df
            vdf = valid_frame.as_data_frame().loc[:, [x, y]]  # df
            tss = test_frame.as_data_frame().loc[:, x]  # series
        elif frame_type == 'pandas':
            trdf = training_frame.loc[:, [x, y]]  # df
            vdf = valid_frame.loc[:, [x, y]]  # df
            tss = test_frame.loc[:, x]  # series

        # create dictionary of level:encode val

        overall_mean_train = trdf[y].mean()
        overall_mean_valid = vdf[y].mean()
        row_val_dict_train = {}
        row_val_dict_valid = {}

        for level in trdf[x].unique():
            level_df = trdf[trdf[x] == level][y]
            level_n = level_df.shape[0]
            level_mean = level_df.mean()
            if level_n >= threshold:
                row_val_dict_train[level] = level_mean
            else:
                row_val_dict_train[level] = ((1 - lambda_) * level_mean) +\
                                      (lambda_ * overall_mean_train)
        for level in vdf[x].unique():
            level_df = vdf[vdf[x] == level][y]
            level_n = level_df.shape[0]
            level_mean = level_df.mean()
            if level_n >= threshold:
                row_val_dict_valid[level] = level_mean
            else:
                row_val_dict_valid[level] = ((1 - lambda_) * level_mean) +\
                                      (lambda_ * overall_mean_valid)

        row_val_dict_train[
            np.nan] = overall_mean_train  # handle missing values
        row_val_dict_valid[
            np.nan] = overall_mean_valid  # handle missing values

        if test:
            print(row_val_dict_train)
            print(row_val_dict_valid)

        # apply the transform to training data
        trdf[encode_name] = trdf[x].apply(lambda i: row_val_dict_train[i])
        vdf[encode_name] = vdf[x].apply(lambda i: row_val_dict_valid[i])

        # apply the transform to test data
        tsdf = pd.DataFrame(columns=[x, encode_name])
        tsdf[x] = tss
        if valid_frame:
            tsdf.loc[:,
                     encode_name] = overall_mean_valid  # handle previously unseen values
        else:
            tsdf.loc[:,
                     encode_name] = overall_mean_train  # handle previously unseen values
        # handle values that are seen in tsdf but not row_val_dict
        for i, col_i in enumerate(tsdf[x]):
            try:
                row_val_dict_train[col_i]
            except:
                # a value that appeared in tsdf isn't in the row_val_dict so just
                # make it the overall_mean
                row_val_dict_train[col_i] = overall_mean_train

        if valid_frame:
            for i, col_i in enumerate(vdf[x]):
                try:
                    row_val_dict_valid[col_i]
                except:
                    # a value that appeared in vdf isn't in the row_val_dict so just
                    # make it the overall_mean
                    row_val_dict_valid[col_i] = overall_mean_valid
            tsdf[encode_name] = tsdf[x].apply(lambda i: row_val_dict_valid[i])
        else:
            tsdf[encode_name] = tsdf[x].apply(lambda i: row_val_dict_train[i])

        if frame_type == 'h2o':
            # convert back to H2O
            trdf = h2o.H2OFrame(trdf[encode_name].values)
            trdf.columns = [encode_name]
            if valid_frame:
                vdf = h2o.H2OFrame(vdf[encode_name].values)
                vdf.columns = [encode_name]

            tsdf = h2o.H2OFrame(tsdf[encode_name].values)
            tsdf.columns = [encode_name]
            if valid_frame:
                return (trdf, vdf, tsdf)
            else:
                return (trdf, tsdf)
        else:  #pandas
            #just return pandas
            if valid_frame:
                return (trdf, vdf, tsdf)
            else:
                return (trdf, tsdf)
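A usage sketch for the Spark path of the function above; the frame and column names (train_df, test_df, 'city', 'label', 'row_id') are stand-ins, not part of the original code.

# Hypothetical usage of target_encoder() on Spark DataFrames.
train_enc, test_enc = target_encoder(train_df, test_df,
                                     x='city', y='label',
                                     frame_type='spark', id_col='row_id')
# With id_col set, each returned frame holds just (row_id, city_Tencode) and can
# be joined back onto the original frames on row_id.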
コード例 #54
0
def lookupDimensionKey(df):
    return df.withColumn("DimensionSKey", lit(1))
コード例 #55
0
ファイル: release.py プロジェクト: tylerwx51/python_mozetl
def with_effective_version(dataframe, effective_version, join_key):
    """Calculate the effective version of Firefox in the wild given the date
    and channel.

    For example, if the date is 2017-11-14 and the channel is 'Release', then
    the effective Firefox version is 57.0.0, since this is the Firefox version
    that would be available for installation from official sources. This is
    used to determine the version a profile was acquired on.

    :param dataframe:           A dataframe containing a date and channel col.
    :param effective_version:   A table mapping dates to application versions.
    :param join_key:            Column name generate version number on.
    :returns:                   A dataframe with a calculated "start_version" column
    """

    in_columns = {"channel"}
    out_columns = set(dataframe.columns) | {"start_version"}
    assert in_columns <= set(dataframe.columns), "must contain channel"

    # Firefox releases follow a train model. Each channel is a major revision
    # ahead from the upstream channel. Nightly corresponds to the build on
    # mozilla-central, and is always the head of the train. For example, if
    # the release channel is "55", then beta will be "56".
    #
    # This logic will be affected by the decommissioning of aurora.
    version_offset = (F.when(
        F.col("channel").startswith("beta"), F.lit(1)).otherwise(
            F.when(F.col("channel").startswith("aurora"), F.lit(2)).otherwise(
                F.when(F.col("channel").startswith("nightly"),
                       F.lit(3)).otherwise(F.lit(0)))))

    # Original effective version column name
    ev_version = effective_version.columns[1]

    # Column aliases in the joined table
    version = F.col(ev_version)
    date = F.col(join_key)

    joined_df = (
        dataframe
        # Rename the date field to join against the left table
        .join(effective_version.toDF(join_key, ev_version), join_key, "left")
        # Major version number e.g. "57"
        .withColumn("_major", F.split(version, "\.").getItem(0).cast("int"))
        # Offset into the release train
        .withColumn("_offset", version_offset)
    )

    # Build up operations to get the effective start version of a particular
    # channel and date.

    # There will be null values for the version if the date is not in
    # the right table. This sets the start_version to one of two values.
    fill_outer_range = (F.when(date.isNull() | (date < "2015-01-01"),
                               F.lit("older")).otherwise(F.lit("newer")))

    calculate_channel_version = (F.when(
        F.col("channel").startswith("release"), version).otherwise(
            F.concat(F.col("_major") + F.col("_offset"), F.lit(".0"))))

    start_version = (F.when(
        version.isNull(),
        fill_outer_range).otherwise(calculate_channel_version))

    return (joined_df.withColumn("start_version", start_version).fillna(
        "unknown", ["start_version"]).select(*out_columns))
コード例 #56
0
print('Generating watercraft connections')
with timer():
    print('Reading tables')
    with timer():
        embarcacao = spark.table('bases.lc_embarcacao')
        pessoa = spark.table('bases.pessoa_fisica')
        empresa = spark.table('bases.lc_cnpj')

        # Merge persons with watercrafts
        pessoa_embarcacao = pessoa.filter('num_cpf is not null').\
            withColumnRenamed('uuid', 'start_node').\
            join(embarcacao, pessoa.num_cpf == embarcacao.cpf_cnpj).\
            select(['start_node', 'uuid']).\
            withColumnRenamed('uuid', 'end_node').\
            withColumn('label', lit('PROPRIETARIO').cast('string')).\
            withColumn('uuid', uuidshaudf())

        empresa_embarcacao = empresa.filter('num_cnpj is not null').\
            withColumnRenamed('uuid', 'start_node').\
            join(embarcacao, empresa.num_cnpj == embarcacao.cpf_cnpj).\
            select(['start_node', 'uuid']).\
            withColumnRenamed('uuid', 'end_node').\
            withColumn('label', lit('PROPRIETARIO').cast('string')).\
            withColumn('uuid', uuidshaudf())

        pessoa_embarcacao.write.mode("overwrite").saveAsTable(
            "dadossinapse.pessoa_embarcacao_ope")
        empresa_embarcacao.write.mode("overwrite").saveAsTable(
            "dadossinapse.empresa_embarcacao_ope")
コード例 #57
0
            else:
                
                payload['data']=pub_data[key]['data']
                pushToRedis(payload)
                    
    # creating publisher-related data from daily data
    pub_dfp_df=sqlContext.sql('select publisher_id,id5 as user_identifier, collect_list(device_finger_print) as dfps , collect_list(c_dfp) as dfp_counts from (select publisher_id ,id5, device_finger_print ,count(device_finger_print) as c_dfp  from testtable where id5!="" group by publisher_id,id5,device_finger_print)  group by publisher_id,id5')
    pub_ip_df=sqlContext.sql('select publisher_id,id5 as user_identifier, collect_list(ip) as ips , collect_list(c_ip) as ip_counts from (select publisher_id ,id5, ip ,count(ip) as c_ip  from testtable where id5!="" group by publisher_id,id5,ip) group by publisher_id,id5')
    pub_session_df=sqlContext.sql('select publisher_id,id5 as user_identifier, collect_list(sessionId) as sessions , collect_list(c_session) as session_counts from (select publisher_id ,id5, sessionId ,count(sessionId) as c_session  from testtable where id5!="" group by publisher_id,id5,sessionId) group by publisher_id,id5')
    pub_df=pub_dfp_df.join(pub_ip_df,['publisher_id','user_identifier'],'outer')
    pub_df=pub_df.join(pub_session_df,['publisher_id','user_identifier'],'outer')
    pub_daily_df=sqlContext.read.json(daily_fetch_path)

    if(len(pub_daily_df.columns)==0):
        for dtype in pub_df.dtypes:
            pub_daily_df=pub_daily_df.withColumn(dtype[0],lit(None).cast(dtype[1]))
            
    pub_daily_df=pub_daily_df.select(pub_df.columns)
    pub_df=pub_df.union(pub_daily_df)


    pub_df.write.mode('append').json("{}".format(save_path),"overwrite")
    pub_df.cache()
    pubs=pub_df.collect()
    sendPubDataToRedis(pubs)
    data={"job_id":fetch_job_id,"file_name":save_file_name,"type":file_type}
    #creating acknowledgement request
    url = str(config['baseAPIUrl'])+'/'+str(config['version'])+'/preProcessing/acknowledgePredictionFileJob'
    # Create your header as required
コード例 #58
0
 def run(self, i):
     df = i[inputdata.EmploymentByStateLink]
     return df.smvSelectPlus(
         (col("EMP") > lit(1000000)).alias("cat_high_emp"))
コード例 #59
0
# COMMAND ----------

bad_content_size_df = base_df.filter(~ base_df['value'].rlike(r'\d+$'))
bad_content_size_df.count()

# COMMAND ----------

# MAGIC %md
# MAGIC That's it! The count matches the number of rows in `bad_rows_df` exactly.
# MAGIC 
# MAGIC Let's take a look at some of the bad column values. Since it's possible that the rows end in extra white space, we'll tack a marker character onto the end of each line, to make it easier to see trailing white space.

# COMMAND ----------

from pyspark.sql.functions import lit, concat
bad_content_size_df.select(concat(bad_content_size_df['value'], lit('*'))).show(truncate=False)

# COMMAND ----------

# MAGIC %md
# MAGIC Ah. The bad rows correspond to error results, where no content was sent back and the server emitted a "`-`" for the `content_size` field. Since we don't want to discard those rows from our analysis, let's map them to 0.

# COMMAND ----------

# MAGIC %md
# MAGIC ### (2d) Fix the rows with null content\_size
# MAGIC 
# MAGIC The easiest solution is to replace the null values in `split_df` with 0. The DataFrame API provides a set of functions and fields specifically designed for working with null values, among them:
# MAGIC 
# MAGIC * [fillna()](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.fillna), which fills null values with specified non-null values.
# MAGIC * [na](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.na), which returns a [DataFrameNaFunctions](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrameNaFunctions) object with many functions for operating on null columns.
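A minimal sketch of that fix, using the `split_df` and `content_size` names from the notebook text above; this is an illustrative cell, not the notebook's own solution.

# Replace null content_size values with 0 (frame and column names assumed from the notebook).
cleaned_df = split_df.na.fill({'content_size': 0})
cleaned_df.filter(cleaned_df['content_size'].isNull()).count()  # expect 0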
コード例 #60
0
def timestamp_provider():
    return lit(timestamp)