def data(self):
    """Build the fixture DataFrame with columns ``id``, ``v`` and ``w``.

    Ids 0..9 are each expanded into ten rows whose ``v`` is ``id + 20.0``
    through ``id + 29.0``; ``w`` is the constant 1.0 on every row.
    """
    from pyspark.sql.functions import array, explode, col, lit

    shifted_ids = [lit(shift * 1.0) + col('id') for shift in range(20, 30)]
    base = self.spark.range(10).toDF('id')
    # Pack the ten shifted values into an array, then emit one row per element.
    exploded = base.withColumn("vs", array(shifted_ids)).withColumn("v", explode(col('vs')))
    return exploded.drop('vs').withColumn('w', lit(1.0))
def test_basic(self):
    """Compare the weighted-mean aggregate UDF with the built-in ``mean``.

    Four variants are covered: grouping by a column or by an expression,
    with the weight given as a literal or as the ``w`` column.  The expected
    frame is computed with ``mean(df.v)`` aliased to the UDF's column name.
    """
    df = self.data
    weighted_mean_udf = self.pandas_agg_weighted_mean_udf

    def assert_same(actual, wanted):
        self.assertPandasEqual(wanted.toPandas(), actual.toPandas())

    literal_alias = 'weighted_mean(v, 1.0)'
    column_alias = 'weighted_mean(v, w)'

    # Groupby one column and aggregate one UDF with literal
    by_column = df.groupby('id')
    result1 = by_column.agg(weighted_mean_udf(df.v, lit(1.0))).sort('id')
    expected1 = df.groupby('id').agg(mean(df.v).alias(literal_alias)).sort('id')
    assert_same(result1, expected1)

    # Groupby one expression and aggregate one UDF with literal
    result2 = df.groupby((col('id') + 1)) \
        .agg(weighted_mean_udf(df.v, lit(1.0))) \
        .sort(df.id + 1)
    expected2 = df.groupby((col('id') + 1)) \
        .agg(mean(df.v).alias(literal_alias)) \
        .sort(df.id + 1)
    assert_same(result2, expected2)

    # Groupby one column and aggregate one UDF without literal
    result3 = df.groupby('id').agg(weighted_mean_udf(df.v, df.w)).sort('id')
    expected3 = df.groupby('id').agg(mean(df.v).alias(column_alias)).sort('id')
    assert_same(result3, expected3)

    # Groupby one expression and aggregate one UDF without literal
    result4 = df.groupby((col('id') + 1).alias('id')) \
        .agg(weighted_mean_udf(df.v, df.w)) \
        .sort('id')
    expected4 = df.groupby((col('id') + 1).alias('id')) \
        .agg(mean(df.v).alias(column_alias)) \
        .sort('id')
    assert_same(result4, expected4)
def make_prediction(event, df):
    """Predict fare and trip time for an event from similar historical trips.

    :param event: indexable record; element 0 is the event timestamp and
        elements 2-5 are pickup latitude/longitude and drop-off
        latitude/longitude.  Elements 1 and 6 are not used here.
    :param df: Spark DataFrame of historical trips.  The columns read are
        ``pickup``, ``pick_lat``, ``pick_lon``, ``drop_lat``, ``drop_lon``,
        ``avg_speed``, ``dist_sf``, ``trip_distance``, ``total_notip`` and
        ``trip_time_in_secs``.
    :return: ``[fare_pred, time_pred]``, or ``[0, 0]`` when fewer than 10
        comparable trips are found or the optimiser reports failure.
    """
    event_timestamp = event[0]
    pickup_lat, pickup_lon, dropoff_lat, dropoff_lon = event[2], event[3], event[4], event[5]

    udf_diff_timeofday = udf(utils.diff_timeofday, IntegerType())
    udf_shortest_distance = udf(utils.shortest_distance, FloatType())

    # Keep trips whose time-of-day difference to the event is below 30.
    df = df.withColumn(
        "diff_timeofday",
        udf_diff_timeofday(df.pickup, lit(event_timestamp))
    ).filter("`diff_timeofday` < 30")
    # Keep trips whose summed pickup + drop-off distance to the event is below 2.
    df = df.withColumn(
        "event_sum_distance",
        udf_shortest_distance(df.pick_lat, df.pick_lon, lit(pickup_lat), lit(pickup_lon))
        + udf_shortest_distance(df.drop_lat, df.drop_lon, lit(dropoff_lat), lit(dropoff_lon))
    ).filter("`event_sum_distance` < 2")
    df = df.sort('event_sum_distance')

    # One Spark action instead of count() followed by take(): at most 50 rows
    # are needed, and "fewer than 10 rows overall" is visible from the sample.
    rows = df.take(50)
    if len(rows) < 10:
        return [0, 0]

    a = pd.DataFrame(rows)
    a.columns = df.columns
    # DataFrame.as_matrix() no longer exists in current pandas; .values gives
    # the same (n, 1) array.
    speed_array = a[["avg_speed"]].values
    dist_sf_array = a[["dist_sf"]].values
    distance_array = a["trip_distance"].tolist()
    fare_array = a["total_notip"].tolist()
    time_array = a["trip_time_in_secs"].tolist()

    # set initial parameter values
    x0 = [0.5, 0.5, 3.0, 3.0]
    bnds = ((0.25, 0.75), (0.25, 0.75), (0.1, 20), (0, 10))

    # perform the fit
    res = optimize.minimize(func_to_optimize, x0,
                            args=(distance_array, time_array, fare_array),
                            method='TNC', bounds=bnds)
    if not res.success:
        return [0, 0]

    grid_dist = utils.grid_distance(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)

    # get the predictions
    mean_dist_sf = dist_sf_array.mean()
    time_pred = utils.time_prediction(speed_array.mean(), grid_dist, mean_dist_sf)
    fare_pred = utils.fare_prediction(res.x[0], grid_dist, mean_dist_sf,
                                      res.x[1], res.x[2], res.x[3])
    return [fare_pred, time_pred]
def test_udf_with_decorator(self):
    """Exercise every supported spelling of the ``@udf`` decorator.

    Covers a positional type, ``returnType=`` keyword, bare ``@udf``,
    ``@udf()`` and string type names, then checks both the resulting column
    types and the values of the single output row.
    """
    from pyspark.sql.functions import lit, udf
    from pyspark.sql.types import IntegerType, DoubleType

    @udf(IntegerType())
    def add_one(x):
        return None if x is None else x + 1

    @udf(returnType=DoubleType())
    def add_two(x):
        return None if x is None else float(x + 2)

    @udf
    def to_upper(x):
        return None if x is None else x.upper()

    @udf()
    def to_lower(x):
        return None if x is None else x.lower()

    @udf
    def substr(x, start, end):
        return None if x is None else x[start:end]

    @udf("long")
    def trunc(x):
        return int(x)

    @udf(returnType="double")
    def as_double(x):
        return float(x)

    source = self.spark.createDataFrame(
        [(1, "Foo", "foobar", 3.0)],
        ("one", "Foo", "foobar", "float"))
    df = source.select(
        add_one("one"),
        add_two("one"),
        to_upper("Foo"),
        to_lower("Foo"),
        substr("foobar", lit(0), lit(3)),
        trunc("float"),
        as_double("one"))

    actual_types = [tpe for _, tpe in df.dtypes]
    self.assertListEqual(
        actual_types,
        ["int", "double", "string", "string", "string", "bigint", "double"]
    )

    actual_values = list(df.first())
    self.assertListEqual(
        actual_values,
        [2, 3.0, "FOO", "foo", "foo", 3, 1.0]
    )
def _reduce_for_stat_function(self, sfun, only_numeric):
    """Apply the aggregate ``sfun`` to every eligible column, per group.

    :param sfun: callable taking a Spark column and returning an aggregate
        column expression.
    :param only_numeric: when true, non-numeric aggregation columns are
        left out of the result.
    :return: DataFrame sorted by the group keys, which become the index
        levels ``__index_level_<i>__``.
    """
    groupkeys = self._groupkeys
    groupkey_cols = [key._scol.alias('__index_level_{}__'.format(pos))
                     for pos, key in enumerate(groupkeys)]
    sdf = self._kdf._sdf

    data_columns = []
    if len(self._agg_columns) > 0:
        stat_exprs = []
        for ks in self._agg_columns:
            spark_type = ks.spark_type
            # TODO: we should have a function that takes dataframes and converts the numeric
            # types. Converting the NaNs is used in a few places, it should be in utils.
            if isinstance(spark_type, (DoubleType, FloatType)):
                # Spark's count treats NaN as a valid value while pandas' count
                # does not, so NaN is mapped to null before aggregating.
                operand = F.nanvl(ks._scol, F.lit(None))
            elif isinstance(spark_type, NumericType) or not only_numeric:
                operand = ks._scol
            else:
                continue
            stat_exprs.append(sfun(operand).alias(ks.name))
            data_columns.append(ks.name)
        sdf = sdf.groupby(*groupkey_cols).agg(*stat_exprs)
    else:
        # Nothing to aggregate: the result is just the distinct group keys.
        sdf = sdf.select(*groupkey_cols).distinct()

    sdf = sdf.sort(*groupkey_cols)
    index_map = [('__index_level_{}__'.format(pos), key.name)
                 for pos, key in enumerate(groupkeys)]
    metadata = Metadata(data_columns=data_columns, index_map=index_map)
    return DataFrame(sdf, metadata)
def Dijkstra(graph, start, end = None):
    """Sketch of Dijkstra's shortest-path search over ``graph.edges``.

    Builds empty ``dist``/``prev`` DataFrames with a single nullable float
    ``weight`` column and a ``queue`` DataFrame, loops over the queue and the
    outgoing edges of each vertex, and returns ``dist``.

    NOTE(review): ``start`` is never read, ``dist`` is never updated after it
    is created, and ``prev`` is not returned -- as written this does not look
    like a working implementation; confirm intent before relying on it.
    NOTE(review): Python 2 only (``print`` statement below).
    """
    # dist = sqlContext.createDataFrame(sc.emptyRDD(), StructType([]))
    field = [StructField("weight", FloatType(), True)]
    schema = StructType(field)
    dist = sqlContext.createDataFrame(sc.emptyRDD(), schema)
    # prev = sqlContext.createDataFrame(sc.emptyRDD(), StructType([]))
    prev = sqlContext.createDataFrame(sc.emptyRDD(), schema)
    queue = sqlContext.createDataFrame(sc.emptyRDD(), StructType([]))
    queue = queue.withColumn("weight", lit(0))
    # NOTE(review): DataFrame.show() presumably prints and returns None, in
    # which case this loop cannot iterate -- confirm.
    for v in queue.show():
        dist_length = dist.filter(dist.weight == v)
        queue_length = queue.filter(queue.weight == v)
        # The value filtered from ``dist`` above is overwritten immediately.
        dist_length = queue_length
        if dist_length == end:
            break
        # NOTE(review): iterates a DataFrame object directly; presumably rows
        # were meant to be collected first -- verify.
        for w in graph.edges.filter("src = " + str(v)):
            # NOTE(review): no space before "and", so the filter text reads
            # "src = <v>and dst = <w>" -- looks unintended.
            vw_length = dist_length + graph.edges.filter("src = " + str(v) + "and dst = " + str(w))
            curr_w = queue.filter("weight = " + str(w))
            # NOTE(review): the comparisons against "" below compare a
            # DataFrame with a string.
            if dist.filter("weight = " + str(w)) != "":
                print "Dijkstra: found better path to already-final vertex"
            elif curr_w != "" or vw_length < curr_w:
                queue = queue.replace(queue.filter("weight = " + str(w)), vw_length)
                prev = prev.replace(prev.filter("weight = " + str(w)), v)
    return(dist)
def test_udf(self):
    """Flint ``udf`` works both as a decorator and as a plain wrapper.

    Both forms double the ``volume`` column (the second argument, a literal
    42, is ignored) and must produce the same expected frame.
    """
    from ts.flint import udf
    import pyspark.sql.functions as F
    from pyspark.sql.types import LongType

    vol = self.vol()

    @udf(LongType())
    def foo(v, w):
        return v*2

    decorated = vol.withColumn("volume", foo(vol['volume'], F.lit(42)))
    result1 = decorated.toPandas()

    inline_udf = udf(lambda v, w: v*2, LongType())
    wrapped = vol.withColumn("volume", inline_udf(vol['volume'], F.lit(42)))
    result2 = wrapped.toPandas()

    expected_rows = [
        (1000, 7, 200,),
        (1000, 3, 400,),
        (1050, 3, 600,),
        (1050, 7, 800,),
        (1100, 3, 1000,),
        (1100, 7, 1200,),
        (1150, 3, 1400,),
        (1150, 7, 1600,),
        (1200, 3, 1800,),
        (1200, 7, 2000,),
        (1250, 3, 2200,),
        (1250, 7, 2400,)
    ]
    expected_pdf1 = make_pdf(expected_rows, ['time', 'id', 'volume'])

    for actual in (result1, result2):
        assert_same(actual, expected_pdf1)
def derive_schema(self):
    '''
    Loads all data in self.path and derives the schema, saves with pickle to "schema.p"

    Every file matching ``<self.path>gpx/*`` is read with the spark-xml
    reader using ``trkpt`` as the row tag.  Two null string columns,
    ``athlete`` and ``activity_type``, are appended before the schema is
    printed and pickled into the current working directory.
    '''
    df = self.hiveContext.read.format('com.databricks.spark.xml') \
        .options(rowTag='trkpt') \
        .load(self.path+'gpx/*')

    df = df.withColumn('athlete', lit(None).cast(StringType())) \
        .withColumn('activity_type', lit(None).cast(StringType()))

    df.printSchema()
    # Context manager so the file is flushed and closed even if pickling fails.
    with open("schema.p", "wb") as schema_file:
        pickle.dump(df.schema, schema_file)
def processRdd(rdd):
    """Turn one streamed batch of CSV log lines into page-view records.

    A non-empty ``rdd`` is parsed into a DataFrame, stamped with the batch
    start/end time, registered as ``tempTable``, projected through the SQL
    helper functions and handed to ``processForTable`` for the page-view and
    page-view-geo tables.  Any exception is reported on the console and
    swallowed, so the caller never sees a failure from this function.
    """
    try:
        print('processRDD')
        # convert to a dataframe from rdd
        printOnConsole('Started Processing the streams')
        #desiredCol = ['c-ip','cs-uri-stem','c-user-agent','customer-id','x-ec_custom-1']
        if rdd.count() > 0:
            df = pycsv.csvToDataFrame(sqlContext, rdd, columns=COLUMNS, colTypes=COLUMN_TYPES)
            #df = df.select(desiredCol)

            # The batch window ends now and spans SPARK_STREAM_BATCH.
            endTime = getCurrentTimeStamp()
            startTime = endTime - SPARK_STREAM_BATCH
            endTime = getDateTimeFormat(endTime)
            startTime = getDateTimeFormat(startTime)

            df = df.withColumn(COL_STARTTIME, lit(startTime))
            df = df.withColumn(COL_ENDTIME, lit(endTime))
            df.registerTempTable("tempTable")

            query = ('select' +
                     ' startTime,' +                                              # startTime
                     ' endTime,' +                                                # endTime
                     ' \'\' as ' + COL_CUSTOMERID + ',' +                         # customerid
                     ' setProjectId(`projectid`) as ' + COL_PROJECTID + ',' +     # projectid
                     ' \'\' as ' + COL_FONTTYPE + ',' +                           # FontType
                     ' \'\' as ' + COL_FONTID + ',' +                             # FontId
                     ' getDomainName(`referrer`) as ' + COL_DOMAINNAME + ',' +    # DomainName
                     ' getBrowser(`useragent`) as ' + COL_USERAGENT + ',' +       # UserAgent
                     ' setIpaddress(`ip`) as ' + COL_IPADDRESS +                  # customer ipaddress
                     ' from tempTable')
            df = sqlContext.sql(query)

            # Named table_types rather than type, which would shadow the builtin.
            table_types = PAGEVIEW_TYPE | PAGEVIEWGEO_TYPE
            processForTable(df, table_types)
        else:
            printOnConsole('Nothing to process')
    except Exception as ex:
        # Deliberately broad: a bad batch is reported, not raised.
        printOnConsole('There was an error...')
        print(ex)
def process_writeable_df(joined_df, date_format="yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"):
    """
    Prepares the dataframe for writing to mongo.

    Dates are parsed with ``parse_dates``, an ``id`` column is built as
    ``<account_id>_<unix_ts>``, and every row containing a null is dropped.

    :param joined_df: input DataFrame handed to ``parse_dates``
    :param date_format: date pattern forwarded to ``parse_dates``
    :return: DataFrame with the extra ``id`` column and no null-bearing rows
    """
    parsed = parse_dates(joined_df, date_format)
    record_id = f.concat(f.col('account_id'), f.lit("_"), f.col("unix_ts"))
    with_id = parsed.withColumn("id", record_id)
    return with_id.na.drop()
def test_datasource_with_udf(self):
    """Scalar pandas UDFs must work on file, v1 and v2 data sources.

    Each source is checked with a UDF over a literal and over a column, both
    as a projected column and as an always-false filter predicate.
    """
    # Same as SQLTests.test_datasource_with_udf, but with Pandas UDF
    # This needs to be a separate test because Arrow dependency is optional
    import pandas as pd
    import numpy as np
    from pyspark.sql.functions import pandas_udf, lit, col

    path = tempfile.mkdtemp()
    # Only the unique path is wanted; the directory is removed so the write
    # below starts from a non-existent location.
    shutil.rmtree(path)

    try:
        self.spark.range(1).write.mode("overwrite").format('csv').save(path)
        filesource_df = self.spark.read.option('inferSchema', True).csv(path).toDF('i')
        datasource_df = self.spark.read \
            .format("org.apache.spark.sql.sources.SimpleScanSource") \
            .option('from', 0).option('to', 1).load().toDF('i')
        datasource_v2_df = self.spark.read \
            .format("org.apache.spark.sql.sources.v2.SimpleDataSourceV2") \
            .load().toDF('i', 'j')

        c1 = pandas_udf(lambda x: x + 1, 'int')(lit(1))
        c2 = pandas_udf(lambda x: x + 1, 'int')(col('i'))

        f1 = pandas_udf(lambda x: pd.Series(np.repeat(False, len(x))), 'boolean')(lit(1))
        f2 = pandas_udf(lambda x: pd.Series(np.repeat(False, len(x))), 'boolean')(col('i'))

        # assertEqual is used throughout: the assertEquals alias is deprecated
        # and no longer exists in recent Python versions.
        for df in [filesource_df, datasource_df, datasource_v2_df]:
            result = df.withColumn('c', c1)
            expected = df.withColumn('c', lit(2))
            self.assertEqual(expected.collect(), result.collect())

        for df in [filesource_df, datasource_df, datasource_v2_df]:
            result = df.withColumn('c', c2)
            expected = df.withColumn('c', col('i') + 1)
            self.assertEqual(expected.collect(), result.collect())

        for df in [filesource_df, datasource_df, datasource_v2_df]:
            for f in [f1, f2]:
                result = df.filter(f)
                self.assertEqual(0, result.count())
    finally:
        shutil.rmtree(path)
def test_string_functions(self):
    """``Column.substr`` must reject mismatched or unsupported argument types."""
    from pyspark.sql.functions import col, lit

    df = self.spark.createDataFrame([['nick']], schema=['name'])
    name = col('name')

    # Mixing a plain int with a Column is a type error.
    with self.assertRaisesRegexp(TypeError, "must be the same type"):
        df.select(name.substr(0, lit(1)))

    # ``long`` only exists on Python 2, hence the version guard.
    if sys.version_info.major == 2:
        with self.assertRaises(TypeError):
            df.select(name.substr(long(0), long(1)))
def runBPwithGraphFrames(cls, g, numIter):
    """Run Belief Propagation using GraphFrame.

    This implementation of BP shows how to use GraphFrame's aggregateMessages method.

    :param g: input graph, passed to ``cls._colorGraph``.  The code below reads
        vertex columns ``id`` and ``a`` and edge column ``b``.
    :param numIter: number of full sweeps; each sweep updates every colour once.
    :return: GraphFrame with the original edges and vertices carrying a
        ``belief`` column; the scheduling ``color`` column is dropped.
    """
    # choose colors for vertices for BP scheduling
    colorG = cls._colorGraph(g)
    numColors = colorG.vertices.select('color').distinct().count()

    # TODO: handle vertices without any edges

    # initialize vertex beliefs at 0.0
    gx = GraphFrame(colorG.vertices.withColumn('belief', sqlfunctions.lit(0.0)), colorG.edges)

    # numerically stable sigmoid; does not depend on the loop variables, so it
    # is built once instead of once per colour per iteration
    logistic = sqlfunctions.udf(cls._sigmoid, returnType=types.DoubleType())

    # run BP for numIter iterations
    for _ in range(numIter):
        # for each color, have that color receive messages from neighbors
        for color in range(numColors):
            # Send messages to vertices of the current color.
            # We may send to source or destination since edges are treated as undirected.
            msgForSrc = sqlfunctions.when(
                AM.src['color'] == color,
                AM.edge['b'] * AM.dst['belief'])
            msgForDst = sqlfunctions.when(
                AM.dst['color'] == color,
                AM.edge['b'] * AM.src['belief'])
            aggregates = gx.aggregateMessages(
                sqlfunctions.sum(AM.msg).alias("aggMess"),
                sendToSrc=msgForSrc,
                sendToDst=msgForDst)
            v = gx.vertices
            # receive messages and update beliefs for vertices of the current color
            newBeliefCol = sqlfunctions.when(
                (v['color'] == color) & (aggregates['aggMess'].isNotNull()),
                logistic(aggregates['aggMess'] + v['a'])
            ).otherwise(v['belief'])  # keep old beliefs for other colors
            newVertices = (v
                           .join(aggregates, on=(v['id'] == aggregates['id']), how='left_outer')
                           .drop(aggregates['id'])  # drop duplicate ID column (from outer join)
                           .withColumn('newBelief', newBeliefCol)  # compute new beliefs
                           .drop('aggMess')  # drop messages
                           .drop('belief')  # drop old beliefs
                           .withColumnRenamed('newBelief', 'belief')
                           )
            # cache new vertices using workaround for SPARK-1334
            cachedNewVertices = AM.getCachedDataFrame(newVertices)
            gx = GraphFrame(cachedNewVertices, gx.edges)

    # Drop the "color" column from vertices
    return GraphFrame(gx.vertices.drop('color'), gx.edges)
def test_vectorized_udf_struct_with_empty_partition(self):
    """A struct-returning pandas UDF must cope with an empty partition.

    A single row is spread over two partitions, so one of them holds no data.
    """
    single_row_rdd = self.sc.parallelize([Row(id=1)], 2)
    df = self.spark.createDataFrame(single_row_rdd).withColumn('name', lit('John Doe'))

    @pandas_udf("first string, last string")
    def split_expand(n):
        return n.str.split(expand=True)

    result = df.select(split_expand('name')).collect()
    self.assertEqual(1, len(result))

    name_parts = result[0][0]
    self.assertEqual('John', name_parts['first'])
    self.assertEqual('Doe', name_parts['last'])
def test_datasource_with_udf(self):
    """Python UDFs must work on file, v1 and v2 data sources.

    Each source is checked with a UDF over a literal and over a column, both
    as a projected column and as an always-false filter predicate.
    """
    from pyspark.sql.functions import udf, lit, col

    path = tempfile.mkdtemp()
    # Only the unique path is wanted; the directory is removed so the write
    # below starts from a non-existent location.
    shutil.rmtree(path)

    try:
        self.spark.range(1).write.mode("overwrite").format('csv').save(path)
        filesource_df = self.spark.read.option('inferSchema', True).csv(path).toDF('i')
        datasource_df = self.spark.read \
            .format("org.apache.spark.sql.sources.SimpleScanSource") \
            .option('from', 0).option('to', 1).load().toDF('i')
        datasource_v2_df = self.spark.read \
            .format("org.apache.spark.sql.sources.v2.SimpleDataSourceV2") \
            .load().toDF('i', 'j')

        c1 = udf(lambda x: x + 1, 'int')(lit(1))
        c2 = udf(lambda x: x + 1, 'int')(col('i'))

        f1 = udf(lambda x: False, 'boolean')(lit(1))
        f2 = udf(lambda x: False, 'boolean')(col('i'))

        # assertEqual is used throughout: the assertEquals alias is deprecated
        # and no longer exists in recent Python versions.
        for df in [filesource_df, datasource_df, datasource_v2_df]:
            result = df.withColumn('c', c1)
            expected = df.withColumn('c', lit(2))
            self.assertEqual(expected.collect(), result.collect())

        for df in [filesource_df, datasource_df, datasource_v2_df]:
            result = df.withColumn('c', c2)
            expected = df.withColumn('c', col('i') + 1)
            self.assertEqual(expected.collect(), result.collect())

        for df in [filesource_df, datasource_df, datasource_v2_df]:
            for f in [f1, f2]:
                result = df.filter(f)
                self.assertEqual(0, result.count())
    finally:
        shutil.rmtree(path)
def test_summary_weighted_covariance(self):
    """Weighted covariance of price vs. forecast with a constant weight of 2.0."""
    import pyspark.sql.functions as F
    from ts.flint import summarizers

    price = self.price()
    forecast = self.forecast()

    expected_columns = ["time", "price_forecast_weight_weightedCovariance"]
    expected_pdf = make_pdf([(0, -1.96590909091,)], expected_columns)

    joined = price.leftJoin(forecast, key="id").withColumn('weight', F.lit(2.0))
    summarizer = summarizers.weighted_covariance("price", "forecast", "weight")
    result = joined.summarize(summarizer).toPandas()

    pdt.assert_frame_equal(result, expected_pdf)
def labelRDDs(driver, path, sc):
    """Build a labelled RDD for one driver against randomly sampled others.

    The target driver's trips get label 1.  ``K`` other drivers are drawn
    without replacement from ``all_drivers``; a 0.55% sample (without
    replacement) of their rows gets label 0.  Both sets are unioned, reduced
    to the feature columns and mapped through ``maketups``.

    :param driver: driver identifier; ``<driver>.json`` must be in ``all_drivers``
    :param path: directory prefix (including trailing separator) of the JSON files
    :param sc: SparkContext used to create the SQLContext
    :return: RDD of ``maketups`` results
    """
    sqlContext = SQLContext(sc)
    target = str(driver) + '.json'

    # Every driver except the target is a candidate negative example.
    others = list(all_drivers)
    others.remove(target)
    chosen = np.random.choice(others, K, replace=False)
    chosen_paths = [path + name for name in chosen]

    negatives_full = sqlContext.read.json(chosen_paths)
    positives = sqlContext.read.json(path + target)

    negatives = negatives_full.sample(False, .0055).withColumn('label', lit(0))
    positives = positives.withColumn('label', lit(1))

    rawdata = negatives.unionAll(positives)
    wanted = ('driver', 'trip', 'x', 'y', 'step', 'label')
    rawdata = rawdata.select(*[rawdata[name] for name in wanted])
    return rawdata.rdd.map(maketups)
def _load_dataset(self):
    '''
    Loads strava activities from source to DataFrame self.df

    Starts from an empty DataFrame with self.schema and unions in the track
    points of every (athlete, activity type) pair that has files, tagging
    each row with its athlete and activity type.
    '''
    # Get athlete list if not already set
    if not self.athletes:
        self._get_athlete_directories()

    # Initialize empty dataset
    empty_rdd = self.sc.emptyRDD()
    self.df = self.hiveContext.createDataFrame(empty_rdd, self.schema)

    for athlete in self.athletes:
        for activity_type in self.activity_types:
            # Skip combinations without files (or else .load fails)
            if not self._activities_exist(athlete, activity_type):
                continue

            # Read data
            source = self.path+'gpx/%s/*%s.gpx' % (athlete, activity_type)
            reader = self.hiveContext.read.format('com.databricks.spark.xml') \
                .options(rowTag='trkpt', treatEmptyValuesAsNulls=False) \
                .schema(self.schema)
            dfadd = reader.load(source)

            dfadd = dfadd.withColumn('athlete', lit(athlete))
            dfadd = dfadd.withColumn('activity_type', lit(activity_type))

            self.df = self.df.unionAll(dfadd)

    if self.filter_bug_inducing_rows:
        # Keep only rows where the extension's #VALUE field is null.
        bug_marker = self.df['extensions.gpxtpx:TrackPointExtension.#VALUE']
        self.df = self.df.filter(bug_marker.isNull())
def _minimize_query(self):
    """Reduce ``samplestemporal`` to one row per (object, created) pair.

    From the temporal table, the multiple locations are minimized to the
    appropriate sample timestamp: rows are numbered inside each
    ``(refers_to_object_id, created)`` partition and only row 1 is kept.
    The result gets a constant ``meta_store`` column of 1, is registered as
    the temp table ``minimizedsamples`` and that table is cached.

    :return: the reduced DataFrame
    """
    query = (
        " SELECT * FROM ( "
        "SELECT *, "
        "MIN(delta) OVER ( PARTITION BY refers_to_object_id, created) AS min_delta, "
        "row_number() OVER ( PARTITION BY refers_to_object_id, created) AS ranks "
        "FROM samplestemporal st "
        "ORDER BY refers_to_object_id "
        ") query where query.ranks = 1 "
    )
    minimized = self.hive_cxt.sql(query).withColumn("meta_store", lit(1))

    table_name = 'minimizedsamples'
    minimized.registerTempTable(table_name)
    self.hive_cxt.cacheTable(table_name)
    return minimized
def test_string_functions(self):
    """Check ``substr`` argument validation and name/Column equivalence.

    Every function listed in ``_string_functions`` must give the same result
    whether the column is passed by name or as a ``Column``.
    """
    from pyspark.sql import functions
    from pyspark.sql.functions import col, lit, _string_functions

    df = self.spark.createDataFrame([['nick']], schema=['name'])

    # Mixing a plain int with a Column is a type error.
    with self.assertRaisesRegexp(TypeError, "must be the same type"):
        df.select(col('name').substr(0, lit(1)))

    # ``long`` only exists on Python 2, hence the version guard.
    if sys.version_info.major == 2:
        with self.assertRaises(TypeError):
            df.select(col('name').substr(long(0), long(1)))

    for name in _string_functions:
        string_fn = getattr(functions, name)
        by_name = df.select(string_fn("name")).first()[0]
        by_column = df.select(string_fn(col("name"))).first()[0]
        self.assertEqual(by_name, by_column)
def test_aggregate_messages(self):
    """aggregateMessages gives the same sums for Column and SQL-string messages.

    The result is compared with a brute-force computation over the collected
    edges, and non-Column / non-string messages must raise ``TypeError``.
    """
    g = self._graph("friends")
    # For each user, sum the ages of the adjacent users,
    # plus 1 for the src's sum if the edge is "friend".
    sendToSrc = (
        AM.dst['age'] +
        sqlfunctions.when(
            AM.edge['relationship'] == 'friend',
            sqlfunctions.lit(1)
        ).otherwise(0))
    sendToDst = AM.src['age']
    agg = g.aggregateMessages(
        sqlfunctions.sum(AM.msg).alias('summedAges'),
        sendToSrc=sendToSrc,
        sendToDst=sendToDst)
    # Run the aggregation again providing SQL expressions as String instead.
    agg2 = g.aggregateMessages(
        "sum(MSG) AS `summedAges`",
        sendToSrc="(dst['age'] + CASE WHEN (edge['relationship'] = 'friend') THEN 1 ELSE 0 END)",
        sendToDst="src['age']")
    # Convert agg and agg2 to a mapping from id to the aggregated message.
    aggMap = {id_: s for id_, s in agg.select('id', 'summedAges').collect()}
    agg2Map = {id_: s for id_, s in agg2.select('id', 'summedAges').collect()}
    # Compute the truth via brute force.
    user2age = {id_: age for id_, age in g.vertices.select('id', 'age').collect()}
    trueAgg = {}
    for src, dst, rel in g.edges.select("src", "dst", "relationship").collect():
        trueAgg[src] = trueAgg.get(src, 0) + user2age[dst] + (1 if rel == 'friend' else 0)
        trueAgg[dst] = trueAgg.get(dst, 0) + user2age[src]
    # Compare if the agg mappings match the brute force mapping
    self.assertEqual(aggMap, trueAgg)
    self.assertEqual(agg2Map, trueAgg)
    # Check that TypeError is raised with messages of wrong type
    with self.assertRaises(TypeError):
        g.aggregateMessages(
            "sum(MSG) AS `summedAges`",
            sendToSrc=object(),
            sendToDst="src['age']")
    with self.assertRaises(TypeError):
        # sendToSrc is a valid SQL string so that the TypeError can only come
        # from the invalid sendToDst.  Indexing the leftover loop variable
        # ``dst`` here would raise before aggregateMessages is even called.
        g.aggregateMessages(
            "sum(MSG) AS `summedAges`",
            sendToSrc="dst['age']",
            sendToDst=object())
def test_summary_weighted_correlation(self):
    """With a constant weight, weighted correlation equals plain correlation.

    Checked for two different constants (1.0 and 42.0).
    """
    import pyspark.sql.functions as F
    from ts.flint import summarizers

    price = self.price()
    forecast = self.forecast()

    joined = price.leftJoin(forecast, key="id") \
        .withColumn('weight', F.lit(1.0)) \
        .withColumn('weight2', F.lit(42.0))

    unit_weighted = summarizers.weighted_correlation("price", "forecast", "weight")
    result = joined.summarize(unit_weighted).toPandas()
    const_weighted = summarizers.weighted_correlation("price", "forecast", "weight2")
    result2 = joined.summarize(const_weighted).toPandas()
    expected = joined.summarize(summarizers.correlation("price", "forecast")).toPandas()

    assert np.isclose(
        result['price_forecast_weight_weightedCorrelation'][0],
        expected['price_forecast_correlation'][0])

    assert np.isclose(
        result2['price_forecast_weight2_weightedCorrelation'][0],
        expected['price_forecast_correlation'][0])
def featurizeData(raw, gap, vocabFile, featFile):
    """Featurize clustered records and persist features plus vocabulary.

    :param raw: DataFrame with at least ``cluster``, ``series`` and ``date``
        columns; duplicates on those three columns are dropped first.
    :param gap: forwarded unchanged to ``clusterFeatures``.
    :param vocabFile: path that receives the CountVectorizer vocabulary,
        one entry per line.
    :param featFile: path the featurized DataFrame is written to as Parquet.
    """
    # 'day' is the number of days since 1970-01-01; rows where it cannot be
    # computed are dropped before grouping by cluster.
    feats = raw.dropDuplicates(['cluster', 'series', 'date'])\
        .withColumn('day', datediff(col('date'), lit('1970-01-01')))\
        .na.drop(subset=['day'])\
        .rdd.groupBy(lambda r: r.cluster)\
        .flatMap(lambda c: clusterFeatures(c, gap))\
        .toDF()

    # feats is used twice (fit and transform), so it is cached; the finally
    # clause releases the cache even when fitting or writing fails.
    feats.cache()
    try:
        cv = CountVectorizer(inputCol='raw', outputCol='features', minDF=4.0)
        interner = cv.fit(feats)      # alternate possibility: grab features only from label==1 edges
        full = interner.transform(feats)
        # combiner = VectorAssembler(inputCols=realCols + ['categorial'], outputCol='features')
        # # I don't think a Pipeline will work here since we need to get the interner.vocabulary
        # full = combiner.transform(interner.transform(feats)).drop('categorial')

        full.write.parquet(featFile)
        np.savetxt(vocabFile, np.array(interner.vocabulary), fmt='%s')
    finally:
        feats.unpersist()
def test_smvArrayFlatten(self):
    """``smvArrayFlatten`` accepts either an element type or a DataFrame.

    Both call forms must flatten the nested array column to the same result.
    """
    df = self.createDF('a:String;b:String;c:String', ',,;1,2,;2,3,4')

    # Array of two arrays: [null, a] and [a, b, c].
    head = F.array(F.lit(None), F.col('a'))
    tail = F.array(F.col('a'), F.col('b'), F.col('c'))
    df1 = df.select(F.array(head, tail).alias('aa'))

    def joined(flattened):
        return flattened.select(SF.smvArrayCat('|', F.col('a')).alias('k'))

    res1 = joined(df1.select(F.col('aa').smvArrayFlatten(StringType()).alias('a')))

    exp = self.createDF("k: String", """||||; |1|1|2|; |2|2|3|4""")

    res2 = joined(df1.select(F.col('aa').smvArrayFlatten(df1).alias('a')))

    self.should_be_same(res1, exp)
    self.should_be_same(res2, exp)
def test_stream_save_options(self):
    """A streaming write honours checkpoint, query name, format and path.

    The stream is written as Parquet partitioned by ``id``; afterwards both
    the output and the checkpoint directory must contain files.
    """
    df = self.spark.readStream.format('text').load('python/test_support/sql/streaming') \
        .withColumn('id', lit(1))

    # Stop whatever streaming queries are still active.
    for active_query in self.spark._wrapped.streams.active:
        active_query.stop()

    tmpPath = tempfile.mkdtemp()
    shutil.rmtree(tmpPath)
    self.assertTrue(df.isStreaming)

    out = os.path.join(tmpPath, 'out')
    chk = os.path.join(tmpPath, 'chk')
    writer = df.writeStream.option('checkpointLocation', chk).queryName('this_query') \
        .format('parquet').partitionBy('id').outputMode('append').option('path', out)
    q = writer.start()
    try:
        self.assertEqual(q.name, 'this_query')
        self.assertTrue(q.isActive)
        q.processAllAvailable()

        # Hidden files (names starting with '.') are not counted.
        output_files = [
            name
            for _, _, files in os.walk(out)
            for name in files
            if not name.startswith('.')
        ]
        self.assertTrue(len(output_files) > 0)
        self.assertTrue(len(os.listdir(chk)) > 0)
    finally:
        q.stop()
        shutil.rmtree(tmpPath)
def process_immigration_data(spark, input_data, output_data, cit_res,
                             port_state_code, port_city, mode, addr, visa):
    """
    ETL process for the i94_{mon}16_sub.sas7bdat datasets.

    All input files are read and unioned, unused columns are dropped, coded
    columns are replaced by their mapped values, the result is reshaped with
    a SQL query, de-duplicated and written as Parquet to
    ``<output_data>/us_immigration`` partitioned by arrival month and port
    state code (existing output is overwritten).

    Parameters:
        spark (SparkSession) : Spark Session
        input_data (list of str) : paths of the input datasets (at least one)
        output_data (str) : path of directory where output data will be stored
        cit_res (dict) : Mapping for i94cit and i94res columns' values
        port_state_code (dict) : Mapping for i94port column's values -state part-
        port_city (dict) : Mapping for i94port column's values -city part-
        mode (dict) : Mapping for i94mode column's values
        addr (dict) : Mapping for i94addr column's values
        visa (dict) : Mapping for i94visa column's values
    """
    sas_format = 'com.github.saurfang.sas.spark'

    # read data from each dataset and combine them into a single DataFrame
    df = spark.read.format(sas_format).load(input_data[0])
    for extra_path in input_data[1:]:
        df = df.union(spark.read.format(sas_format).load(extra_path))

    # drop columns that won't be used
    unused_columns = (
        '_c0', 'cicid', 'count', 'visapost', 'occup', 'entdepa', 'entdepd',
        'entdepu', 'matflag', 'insnum'
    )
    df = df.drop(*unused_columns)

    # state codes that are not keys of `addr` become '99'
    is_known_state = df['i94addr'].isin(*(addr.keys()))
    df = df.withColumn(
        'i94addr',
        when(~is_known_state, '99').otherwise(df['i94addr']))

    # (mapping dictionary, column where mapping is applied, new column name)
    maps = [
        (cit_res, 'i94cit', 'i94cit'),
        (cit_res, 'i94res', 'i94res'),
        (port_state_code, 'i94port', 'i94port_state'),
        (port_city, 'i94port', 'i94port_city'),
        (mode, 'i94mode', 'i94mode'),
        (addr, 'i94addr', 'state'),
        (visa, 'i94visa', 'i94visa'),
    ]

    # use mappings to replace codes in 'i94cit', 'i94res', 'i94port', 'i94mode',
    # 'i94addr' and 'i94visa' columns with their values
    for code_to_value, source_column, target_column in maps:
        # create_map takes a flat key, value, key, value, ... sequence.
        flat_pairs = [lit(item) for item in chain.from_iterable(code_to_value.items())]
        lookup = create_map(flat_pairs)
        df = df.withColumn(target_column, lookup.getItem(col(source_column)))

    df.createOrReplaceTempView('immigration')

    # transform gender column values: 'M' is replaced with 'Male' and 'F'
    # with 'Female'
    # if the transportation mode is 'Land', 'Sea', 'Not reported' or NULL and
    # flight number or airline is not NULL: transportation mode is changed to 'Air'
    query = (
        " SELECT "
        "CAST(i94yr AS INT) AS arrival_year, "
        "CAST(i94mon AS INT) AS arrival_month, "
        "DATE_ADD('1960-01-01', arrdate) AS arrival_date, "
        "DATE_ADD('1960-01-01', depdate) AS departure_date, "
        "i94port_city AS port_city, "
        "i94port_state AS port_state_code, "
        "i94cit AS origin_country, "
        "i94res AS residence_country, "
        "CAST(biryear AS INT) AS birth_year, "
        "CAST(i94bir AS INT) AS age, "
        "CASE WHEN gender = 'M' THEN 'Male' WHEN gender = 'F' THEN 'Female' END AS gender, "
        "CAST(admnum AS INT) AS admission_num, "
        "TO_DATE(dtadfile, 'yyyyMMdd') AS admission_date, "
        "TO_DATE(dtaddto, 'MMddyyyy') AS admitted_until, "
        "i94visa visa_category, "
        "visatype AS visa_type, "
        "state, "
        "i94addr AS state_code, "
        "CASE "
        "WHEN (i94mode = 'Land' AND ((fltno IS NOT NULL) OR (airline IS NOT NULL))) THEN 'Air' "
        "WHEN (i94mode = 'Sea' AND ((fltno IS NOT NULL) OR (airline IS NOT NULL))) THEN 'Air' "
        "WHEN (i94mode = 'Not reported' AND ((fltno IS NOT NULL) OR (airline IS NOT NULL))) THEN 'Air' "
        "WHEN (i94mode IS NULL AND ((fltno IS NOT NULL) OR (airline IS NOT NULL))) THEN 'Air' "
        "ELSE i94mode END AS transportation_mode, "
        "airline, "
        "fltno AS flight_num "
        "FROM immigration "
    )
    df = spark.sql(query).distinct()

    # data_quality_check(df)

    # save DataFrame as .parquet in output_data/us_immigration directory
    print('Saving us_immigration table to {}'.format(output_data))
    target_dir = os.path.join(output_data, 'us_immigration')
    df.write.parquet(target_dir, mode='overwrite',
                     partitionBy=['arrival_month', 'port_state_code'])
# Latest 'Date' per store. The aggregate column keeps Spark's default name,
# 'max(Date)', which the recency cell below refers to.
storesMaxDate = RFM0.groupby(['Store']).agg(sf.max('Date'))
display(storesMaxDate)

# COMMAND ----------

storesMaxDate.printSchema()

# COMMAND ----------

# MAGIC %md
# MAGIC Obtaining the difference between the global maximum date and the maximum date of each store:

# COMMAND ----------

# Recency = datediff(maxDate[0], store's latest date), attached to every RFM0 row.
# NOTE(review): maxDate is defined outside this cell; presumably its first
# element is the global maximum date -- confirm.
r = (RFM0.join(storesMaxDate, on='Store').withColumn(
    'Recency', datediff(to_date(lit(maxDate[0])), col('max(Date)'))))
display(r)
#recency.count()

# COMMAND ----------

# MAGIC %md
# MAGIC ### Frequency
# MAGIC
# MAGIC The frequency of purchases that occurred in the given period is obtained. In this case, it is observed how all the stores had sales every day.

# COMMAND ----------

# Number of RFM0 rows per store, copied into 'Frequency' (the 'count' column is kept too).
f = (RFM0.groupby(['Store']).count().withColumn('Frequency', col('count')))
notebook = os.path.basename(getNotebookPath()) input_data_path = "/dbfs/mnt/" + environment + "/automl_rev_region_forecast/inputs/" output_data_path = "/dbfs/mnt/" + environment + "/automl_rev_region_forecast/outputs/" blobstore_datadir = "revregionforecast_data/" bi_config_parameter_filepath = "/mnt/{}/automl_rev_region_forecast/config/{}".format( environment_name, filename) try: # read JSON file df_bi_config_parameters = (spark.read.format("json").option( "multiline", "true").load(bi_config_parameter_filepath)) df_bi_config_parameters = df_bi_config_parameters.filter( df_bi_config_parameters.SystemName == lit(system_name)) display(df_bi_config_parameters) # adding audit fields # df_bi_config_parameters = df_bi_config_parameters.withColumn("IBICreatedBy",lit(ibi_created_by)) # df_bi_config_parameters = df_bi_config_parameters.withColumn("IBIUpdatedBy",lit(ibi_updated_by)) # df_bi_config_parameters = df_bi_config_parameters.withColumn("IBICreatedDate",lit(ibi_created_date).cast(TimestampType())) # df_bi_config_parameters = df_bi_config_parameters.withColumn("IBIUpdatedDate",lit(ibi_updated_date).cast(TimestampType())) # df_bi_config_parameters = df_bi_config_parameters.withColumn('ID', row_number().over(Window.orderBy('EnvironmentName','SystemName'))) # initializing config parameter values # subscription_id = "db61fd47-db56-45e3-844f-1b1f5c47990a" #you should be owner or contributor if environment in {"prod"}: subscription_id = dbutils.secrets.get(scope="kv-bi-prod-01-secrets", key="subscription-id")
def process_log_data(spark, input_data, output_data, song_df):
    """
        Description: Reads the log-data JSON files below the input path, keeps
        the "NextSong" events and derives the users, time and songplays tables,
        each saved as a parquet-backed table below the output path.

        Arguments:
            spark: SparkSession object.
            input_data: Path prefix of the input JSON files.
            output_data: Path prefix of the directory that stores the output parquet tables.
            song_df: Song data dataframe (artist_name, title, duration, song_id, artist_id are read).

        Returns:
            None.
    """
    # get filepath to log data file
    log_data = input_data + 'log-data/2018/11'

    # define schema for log data file: (column name, type), every field nullable
    log_fields = [
        ("artist", t.StringType()),
        ("auth", t.StringType()),
        ("firstName", t.StringType()),
        ("gender", t.StringType()),
        ("itemInSession", t.IntegerType()),
        ("lastName", t.StringType()),
        ("length", t.DecimalType(12, 7)),
        ("level", t.StringType()),
        ("location", t.StringType()),
        ("method", t.StringType()),
        ("page", t.StringType()),
        ("registration", t.DecimalType(16, 2)),
        ("sessionId", t.IntegerType()),
        ("song", t.StringType()),
        ("status", t.IntegerType()),
        ("ts", t.LongType()),
        ("userAgent", t.StringType()),
        ("userId", t.StringType()),
    ]
    log_schema = t.StructType(
        [t.StructField(name, dtype, True) for name, dtype in log_fields])

    # read log data file using schema
    df = spark.read.format("json").schema(log_schema).load(log_data)

    # filter by actions for song plays
    df = df.filter('page = "NextSong"')

    # latest event timestamp per user, used to pick each user's most recent row
    users_list = df.groupBy('userId').agg(f.max('ts').alias('ts'))

    # extract columns to create users table
    user_columns = [
        df.userId.cast(t.IntegerType()).alias('user_id'),
        col('firstName').alias('first_name'),
        col('lastName').alias('last_name'),
        'gender',
        'level',
    ]
    users_table = df \
        .join(users_list, ['userId', 'ts'], 'inner') \
        .select(user_columns) \
        .dropDuplicates()

    # write users table to parquet files
    users_output = output_data + 'users'
    users_table.write \
        .option("path", users_output) \
        .saveAsTable('users', format='parquet')

    # create timestamp column from original timestamp column (milliseconds -> seconds)
    df = df.withColumn(
        'timestamp',
        f.from_utc_timestamp((df.ts/1000.0).cast('timestamp'), 'UTC'))

    # create datetime column from original timestamp column
    def millis_to_datetime(ts):
        return datetime.fromtimestamp(ts / 1000.0)

    get_datetime = udf(millis_to_datetime, t.TimestampType())
    df = df.withColumn('datetime', get_datetime('ts'))

    # extract columns to create time table
    event_time = col('datetime')
    time_columns = [
        event_time.alias('start_time'),
        dayofmonth(col('datetime')).alias('day'),
        weekofyear(col('datetime')).alias('week'),
        month(col('datetime')).alias('month'),
        year(col('datetime')).alias('year'),
        dayofweek(col('datetime')).alias('weekday'),
    ]
    time_table = df.select(time_columns).dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_output = output_data + 'time'
    time_table.write \
        .partitionBy('year', 'month') \
        .option("path", time_output) \
        .saveAsTable('time', format='parquet')

    # join and extract columns from song and log datasets to create songplays table
    cond = [
        df.artist == song_df.artist_name,
        df.song == song_df.title,
        df.length == song_df.duration
    ]
    songplays_df = df.join(song_df, cond, 'left').select(
        df.datetime.alias('start_time'),
        df.userId.alias('user_id'),
        df.level.alias('level'),
        song_df.song_id.alias('song_id'),
        song_df.artist_id.alias('artist_id'),
        df.sessionId.alias('session_id'),
        df.location.alias('location'),
        df.userAgent.alias('user_agent'),
        year(df.datetime).alias('year'),
        month(df.datetime).alias('month'))

    # songplay_id is a row number over a window ordered by a constant
    w = Window().orderBy(f.lit('A'))
    songplays_table = songplays_df.withColumn('songplay_id', f.row_number().over(w))

    # write songplays table to parquet files partitioned by year and month
    songplays_output = output_data + 'songplays'
    songplay_columns = [
        'songplay_id', 'start_time', 'user_id', 'level', 'song_id',
        'artist_id', 'session_id', 'location', 'user_agent', 'year', 'month'
    ]
    songplays_table.select(songplay_columns).write \
        .partitionBy('year', 'month') \
        .option("path", songplays_output) \
        .saveAsTable('songplays', format='parquet')
def test_different_schemas(self):
    """Run the shared merge check when the right frame has an extra column."""
    # The right-hand frame gains a literal string column 'v3' that the
    # left-hand frame does not have.
    right_with_extra = self.data2.withColumn('v3', lit('a'))
    expected_schema = 'id long, k int, v int, v2 int, v3 string'
    self._test_merge(self.data1, right_with_extra, expected_schema)
def test_sampling_general_approach(spark_context, hive_context):
    """Check that sampling a synthetic frame keeps its general shape.

    Builds session rows for three wikis whose sessions-per-query follow
    different long-tail curves, samples them with a fixed seed and asserts
    that (a) the number of sampled rows is close to the requested amount,
    overall and per wiki, and (b) the sessions-per-query distribution of
    the sample correlates with the original one.
    """
    # (wiki, a, k): wikis with different looking long tail distributions.
    wiki_specs = [
        ("foowiki", 1500, -1),
        ("barwiki", 700, -1),
        # This has a very flat long tail, with most data points being the same
        ("bazwiki", 700, -2),
    ]
    # Use all combinations of lowercase letters as our set of test queries.
    # This is 26^2, or just shy of 700 queries.
    letters = string.ascii_lowercase
    queries = ["%s%s" % pair for pair in itertools.product(letters, letters)]

    rows = []
    for wiki, a, k in wiki_specs:
        # Create sessions for each query with a long tail distribution.
        for x, q in enumerate(queries):
            # Approximate a long tail distribution using ax^k + b.
            # x + 1 is needed because enumerate starts at 0. b is set to 10
            # to test the min sessions per query limit.
            num_sessions = max(1, min(100, int(a * math.pow(x + 1, k)) + 10))
            rows.extend(
                (wiki, q, x, "%s_%s_%s" % (wiki, q, str(j)), list(range(3)))
                for j in range(0, num_sessions))

    column_names = ['wikiid', 'query', 'norm_query_id', 'session_id', 'hit_page_ids']
    df = spark_context.parallelize(rows).toDF(column_names)

    samples_per_wiki = 1000
    # Using a constant seed ensures deterministic testing. Because this code
    # actually relies on the law of large numbers, and we do not have large
    # numbers here, many seeds probably fail.
    hit_page_id_counts, df_sampled = mjolnir.sampling.sample(
        df, samples_per_wiki=samples_per_wiki, seed=12345)

    per_wiki_samples = (
        df_sampled
        .select('wikiid', 'query', F.explode('hit_page_ids').alias('hit_page_id'))
        .drop_duplicates()
        .groupBy('wikiid')
        .agg(F.count(F.lit(1)).alias('num_samples'))
        .collect())

    total_samples_desired = len(wiki_specs) * samples_per_wiki
    total_samples = sum(r.num_samples for r in per_wiki_samples)
    assert abs(total_samples - total_samples_desired) / float(total_samples_desired) < 0.05

    # Test each wiki also meets the constraint
    for wiki, _, _ in wiki_specs:
        # ratio of rows
        sampled_num_rows = sum(
            r.num_samples for r in per_wiki_samples if r.wikiid == wiki)
        # TODO: Why 0.10? It works with our seed...
        assert abs(sampled_num_rows - samples_per_wiki) / float(samples_per_wiki) <= 0.10

    # Assert correlation between sessions per query
    def sessions_per_query(frame):
        # Distinct session count for every (wiki, normalized query) pair.
        return (frame
                .groupBy('wikiid', 'norm_query_id')
                .agg(F.countDistinct('session_id').alias('num_sessions'))
                .collect())

    orig_grouped = sessions_per_query(df)
    sampled_grouped = sessions_per_query(df_sampled)

    for wiki, _, _ in wiki_specs:
        orig = sorted(
            r.num_sessions for r in orig_grouped if r.wikiid == wiki)
        sampled_sessions = sorted(
            r.num_sessions for r in sampled_grouped if r.wikiid == wiki)
        # Interpolate sampled into the same length as orig
        sampled_interp = np.interp(
            range(len(orig)),
            np.linspace(1, len(orig), len(sampled_sessions)),
            sampled_sessions)
        # corrcoef allows comparing N data sets, returning a covariance
        # matrix. Take 0,1 to get corr(orig, sampled_interp)
        corr = np.corrcoef(orig, sampled_interp)[0, 1]
        # Is .8 reasonable? Sometimes this fails when using something
        # stricter like .95
        assert corr > .8, wiki
# Advertisers present in one year but not the other, per channel.
# exceptAll keeps duplicates, so repeated rows are not collapsed.
color_new_2019 = dfcolor_2019.exceptAll(dfcolor_2018).withColumnRenamed('advertiser', 'new advertiser')
starplus_lost_2019 = dfstarplus_2018.exceptAll(dfstarplus_2019).withColumnRenamed('advertiser', 'lost advertiser')
starplus_new_2019 = dfstarplus_2019.exceptAll(dfstarplus_2018).withColumnRenamed('advertiser', 'new advertiser')
starplus_lost_2019.show()
starplus_new_2019.show()
color_lost_2019.show()
color_new_2019.show()

# COMMAND ----------

from pyspark.sql.types import StructField, StructType, StringType
# NOTE(review): the original line read
# `from pyspark.sql.dataframe import LostFoundSchema = StructType(...)`,
# which is a syntax error; the imported name was missing. `DataFrame` is
# the presumed intent -- confirm.
from pyspark.sql.dataframe import DataFrame

LostFoundSchema = StructType([StructField('channel', StringType(), True),
                              StructField('advertiser_lost', StringType(), True),
                              StructField('advertiser_found', StringType(), True)])

# COMMAND ----------

from pyspark.sql.functions import lit

# NOTE(review): the two channels are joined differently (left outer vs.
# inner) -- confirm this asymmetry is intended.
df1 = starplus_lost_2019.join(starplus_new_2019, 'channel', 'leftouter')
df2 = color_lost_2019.join(color_new_2019, 'channel', 'inner')
dfLostFound = df1.union(df2).withColumn('year', lit(2019))

# COMMAND ----------

dfLostFound.select('year', 'channel', 'lost advertiser', 'new advertiser').show()

# COMMAND ----------
def main(): ho_chi_minh_timezone = pytz.timezone('Asia/Ho_Chi_Minh') today = datetime.now(ho_chi_minh_timezone) today_second = long(today.strftime("%s")) print('today_id: ', today_second) start_date_id = 20200101 end_date_id = 20200305 print('start_date_id: ', start_date_id) print('end_date_id: ', end_date_id) # start_year_month_id, end_year_month_id = get_year_month_id_from_date(start_date_id, end_date_id) start_year_week_id, end_year_week_id = get_year_week_id_from_date(start_date_id, end_date_id) # print('start_year_month_id: ', start_year_month_id) print('end_year_month_id: ', end_year_month_id) print('start_year_week_id: ', start_year_week_id) print('end_year_week_id: ', end_year_week_id) print('start_year_week_id: ', start_year_week_id) print('end_year_week_id: ', end_year_week_id) # ------------------------------------------------------------------------------------------------------------------# df_student_package_status_by_date = get_student_package_adivsor_level(start_date_id, end_date_id) df_student_package_status_by_date.cache() df_student_learning_and_duration_by_date = get_total_student_lerning_and_duration_by_date(glueContext, start_year_month_id, end_year_month_id) df_student_package_status_by_date_learning = df_student_package_status_by_date\ .join(df_student_learning_and_duration_by_date, on=['contact_id', 'date_id'], how='left') df_student_package_status_by_date_learning = df_student_package_status_by_date_learning.na.fill({ 'total_learning_ls_sc_lt_le2': 0L, 'total_learning_ls_sc_lt_le2_success': 0L, 'total_learning_ls_sc_lt': 0L, 'total_learning_ls_sc_lt_success': 0L, 'total_learning_ls_success': 0L, 'total_learning_sc_success': 0L, 'total_learning_lt_success': 0L, 'total_duration_ls_sc_lt': 0L, 'total_learning_le2': 0L, 'total_learning_le2_success': 0L, 'total_learning_voxy_success': 0L, 'total_learning_native_talk_success': 0L, 'total_learning_home_work_success': 0L, 'total_learning_ncsbasic_success': 0L, 'total_duration_le2': 0L, 
'total_duration_voxy': 0L, 'total_duration_native_talk': 0L, 'total_duration_home_work': 0L, 'total_duration_ncsbasic': 0L }) df_student_package_status_by_date_learning.cache() print('df_student_package_status_by_date_learning') df_student_package_status_by_date_learning.printSchema() df_student_package_status_by_date_learning.show(3) if is_dev: dyf_student_package_status_by_date_learning = DynamicFrame \ .fromDF(df_student_package_status_by_date_learning, glueContext, 'dyf_student_package_status_by_date_learning') atasink4 = glueContext.write_dynamic_frame \ .from_jdbc_conf(frame=dyf_student_package_status_by_date_learning, catalog_connection="glue_redshift", connection_options={ "dbtable": "dev.df_student_package_status_by_date_learning", "database": "student_native_report" }, redshift_tmp_dir="s3://dts-odin/temp/nvn/knowledge/student/df_student_package_status_by_date_learning", transformation_ctx="datasink4") #-------------- save to bc200_fact df_student_package_status_by_date_learning = df_student_package_status_by_date_learning \ .select('date_id', 'package_id', 'student_level_id', 'contact_id', 'advisor_id', 'is_activated', f.when(df_student_package_status_by_date_learning['total_learning_ls_sc_lt_le2'] > 0L, 1L) .otherwise(0L).alias('is_ls_sc_lt_le2'), f.when(df_student_package_status_by_date_learning['total_learning_ls_sc_lt_le2_success'] > 0L, 1L) .otherwise(0L).alias('is_ls_sc_lt_le2_success'), f.when(df_student_package_status_by_date_learning['total_learning_ls_sc_lt'] > 0L, 1L) .otherwise(0L).alias('is_ls_sc_lt'), f.when(df_student_package_status_by_date_learning['total_learning_ls_sc_lt_success'] > 0L, 1L) .otherwise(0L).alias('is_ls_sc_lt_success'), f.when(df_student_package_status_by_date_learning['total_learning_ls_success'] > 0L, 1L) .otherwise(0L).alias('is_ls_success'), f.when(df_student_package_status_by_date_learning['total_learning_sc_success'] > 0L, 1L) .otherwise(0L).alias('is_sc_success'), 
f.when(df_student_package_status_by_date_learning['total_learning_lt_success'] > 0L, 1L) .otherwise(0L).alias('is_lt_success'), f.when(df_student_package_status_by_date_learning['total_learning_le2'] > 0L, 1L) .otherwise(0L).alias('is_le2'), f.when(df_student_package_status_by_date_learning['total_learning_le2_success'] > 0L, 1L) .otherwise(0L).alias('is_le2_success'), f.when(df_student_package_status_by_date_learning['total_learning_voxy_success'] > 0L, 1L) .otherwise(0L).alias('is_voxy_success'), f.when(df_student_package_status_by_date_learning['total_learning_native_talk_success'] > 0L, 1L) .otherwise(0L).alias('is_native_talk_success'), f.when(df_student_package_status_by_date_learning['total_learning_home_work_success'] > 0L, 1L) .otherwise(0L).alias('is_home_work_success'), f.when(df_student_package_status_by_date_learning['total_learning_ncsbasic_success'] > 0L, 1L) .otherwise(0L).alias('is_ncsbasic_success'), 'total_learning_ls_sc_lt_le2', 'total_learning_ls_sc_lt_le2_success', 'total_learning_ls_sc_lt', 'total_learning_ls_sc_lt_success', 'total_learning_ls_success', 'total_learning_sc_success', 'total_learning_lt_success', 'total_duration_ls_sc_lt', 'total_learning_le2', 'total_learning_le2_success', 'total_learning_voxy_success', 'total_learning_native_talk_success', 'total_learning_home_work_success', 'total_learning_ncsbasic_success', 'total_duration_le2', 'total_duration_voxy', 'total_duration_native_talk', 'total_duration_home_work', 'total_duration_ncsbasic' ) df_student_package_status_group_week = df_student_package_status_by_date_learning \ .groupBy('date_id', 'package_id', 'student_level_id', 'advisor_id') \ .agg(f.count('contact_id').alias('total_student'), f.sum('is_activated').alias('total_student_active'), f.sum('is_ls_sc_lt_le2').alias('total_student_ls_sc_lt_le2'), f.sum('is_ls_sc_lt_le2_success').alias('total_student_ls_sc_lt_le2_success'), f.sum('total_learning_ls_sc_lt_le2').alias('total_learning_ls_sc_lt_le2'), 
f.sum('total_learning_ls_sc_lt_le2_success').alias('total_learning_ls_sc_lt_le2_success'), f.sum('is_ls_sc_lt').alias('total_student_ls_sc_lt'), f.sum('is_ls_sc_lt_success').alias('total_student_ls_sc_lt_success'), f.sum('is_ls_success').alias('total_student_ls_success'), f.sum('is_sc_success').alias('total_student_sc_success'), f.sum('is_lt_success').alias('total_student_lt_success'), f.sum('total_learning_ls_sc_lt').alias('total_learning_ls_sc_lt'), f.sum('total_learning_ls_sc_lt').alias('total_learning_ls_sc_lt_success'), f.sum('total_learning_ls_success').alias('total_learning_ls_success'), f.sum('total_learning_sc_success').alias('total_learning_sc_success'), f.sum('total_learning_lt_success').alias('total_learning_lt_success'), f.sum('total_duration_ls_sc_lt').alias('total_duration_ls_sc_lt'), f.sum('is_le2').alias('total_student_le2'), f.sum('is_le2_success').alias('total_student_le2_success'), f.sum('is_voxy_success').alias('total_student_voxy_success'), f.sum('is_native_talk_success').alias('total_student_native_talk_success'), f.sum('is_home_work_success').alias('total_student_home_work_success'), f.sum('is_ncsbasic_success').alias('total_student_ncsbasic_success'), f.sum('total_learning_le2').alias('total_learning_le2'), f.sum('total_learning_le2_success').alias('total_learning_le2_success'), f.sum('total_learning_voxy_success').alias('total_learning_voxy__success'), f.sum('total_learning_native_talk_success').alias('total_learning_native_talk_success'), f.sum('total_learning_home_work_success').alias('total_learning_home_work_success'), f.sum('total_learning_ncsbasic_success').alias('total_learning_ncsbasic_success'), f.sum('total_duration_le2').alias('total_duration_le2'), f.sum('total_duration_voxy').alias('total_duration_voxy'), f.sum('total_duration_native_talk').alias('total_duration_native_talk'), f.sum('total_duration_home_work').alias('total_duration_home_work'), f.sum('total_duration_ncsbasic').alias('total_duration_ncsbasic') ) \ 
.withColumn('period_id', f.lit(DAILY_PERIOD_ID)) \ .withColumn('report_role_id', f.lit(REPORT_ROLE_MANAGER_ID)) # display(df_student_package_status_group_week, "df_student_package_status_group_week") dyf_student_package_status_group_week = DynamicFrame.fromDF(df_student_package_status_group_week, glueContext, 'dyf_student_package_status_group_week') apply_ouput = ApplyMapping \ .apply(frame=dyf_student_package_status_group_week, mappings=[("report_role_id", "long", "report_role_id", "long"), ("period_id", "long", "period_id", "long"), ("date_id", "long", "time_id", "long"), ("package_id", "long", "package_id", "long"), ("student_level_id", "long", "student_level_id", "long"), ("advisor_id", "long", "advisor_id", "long"), ("total_student", "long", "total_student", "long"), ("total_student_active", "long", "total_student_active", "long"), ("total_student_ls_sc_lt_le2", "long", "total_student_ls_sc_lt_le2", "long"), ("total_student_ls_sc_lt_le2_success", "long", "total_student_ls_sc_lt_le2_success", "long"), ("total_learning_ls_sc_lt_le2", "long", "total_learning_ls_sc_lt_le2", "long"), ("total_learning_ls_sc_lt_le2_success", "long", "total_learning_ls_sc_lt_le2_success", "long"), ("total_student_ls_sc_lt", "long", "total_student_ls_sc_lt", "long"), ("total_student_ls_sc_lt_success", "long", "total_student_ls_sc_lt_success", "long"), ("total_student_ls_success", "long", "total_student_ls_success", "long"), ("total_student_sc_success", "long", "total_student_sc_success", "long"), ("total_student_lt_success", "long", "total_student_lt_success", "long"), ("total_learning_ls_sc_lt", "long", "total_learning_ls_sc_lt", "long"), ("total_learning_ls_sc_lt_success", "long", "total_learning_ls_sc_lt_success", "long"), ("total_learning_ls_success", "long", "total_learning_ls_success", "long"), ("total_learning_sc_success", "long", "total_learning_sc_success", "long"), ("total_learning_lt_success", "long", "total_learning_lt_success", "long"), ("total_duration_ls_sc_lt", "long", 
"total_duration_ls_sc_lt", "long"), ("total_student_le2", "long", "total_student_le2", "long"), ("total_student_le2_success", "long", "total_student_le2_success", "long"), ("total_student_voxy_success", "long", "total_student_voxy_success", "long"), ("total_student_native_talk_success", "long", "total_student_native_talk_success", "long"), ("total_student_home_work_success", "long", "total_student_home_work_success", "long"), ("total_student_ncsbasic_success", "long", "total_student_ncsbasic_success", "long"), ("total_learning_le2", "long", "total_learning_le2", "long"), ("total_learning_le2_success", "long", "total_learning_le2_success", "long"), ("total_learning_voxy__success", "long", "total_learning_voxy__success", "long"), ("total_learning_native_talk_success", "long", "total_learning_native_talk_success", "long"), ("total_learning_home_work_success", "long", "total_learning_home_work_success", "long"), ("total_learning_ncsbasic_success", "long", "total_learning_ncsbasic_success", "long"), ("total_duration_le2", "long", "total_duration_le2", "long"), ("total_duration_voxy", "long", "total_duration_voxy", "long"), ("total_duration_native_talk", "long", "total_duration_native_talk", "long"), ("total_duration_home_work", "long", "total_duration_home_work", "long"), ("total_duration_ncsbasic", "long", "total_duration_ncsbasic", "long") ]) dfy_output = ResolveChoice.apply(frame=apply_ouput, choice="make_cols", transformation_ctx="resolvechoice2") display(dfy_output, "dfy_output") # save_data_to_redshift( # glueContext, # dfy_output, # 'student_native_report', # 'bc200.bc200_fact_v2_1', # "s3n://dts-odin/temp/bc200/bc200_fact_v2_1", # "datasink4") preactions = "DELETE from bc200.bc200_fact_v2_1 WHERE period_id = " + str(DAILY_PERIOD_ID) + " and time_id >= " + str(start_date_id) glueContext.write_dynamic_frame.from_jdbc_conf(frame=dfy_output, catalog_connection="glue_redshift", connection_options={ "preactions": preactions, "dbtable": "bc200.bc200_fact_v2_1", 
"database": "student_native_report" }, redshift_tmp_dir="s3n://dts-odin/temp/bc200/bc200_fact_v2", transformation_ctx="datasink4") #------------------------------------------------------- df_student_package_status_by_date_learning.unpersist() df_student_package_status_by_date.unpersist()
## @inputs: [frame = applymapping1] resolvechoice2 = ResolveChoice.apply(frame=applymapping1, choice="make_struct", transformation_ctx="resolvechoice2") ## @type: DropNullFields ## @args: [transformation_ctx = "dropnullfields3"] ## @return: dropnullfields3 ## @inputs: [frame = resolvechoice2] dropnullfields3 = DropNullFields.apply(frame=resolvechoice2, transformation_ctx="dropnullfields3") ## @type: DataSink ## @args: [connection_type = "s3", connection_options = {"path": "s3://go-lambda-bucket/Taxi_Data"}, format = "parquet", transformation_ctx = "datasink4"] ## @return: datasink4 ## @inputs: [frame = dropnullfields3] ##---------------------------------- #convert to a Spark DataFrame... customDF = datasource0.toDF() #add a new column for "type" customDF = customDF.withColumn("type", lit('yellow')) # Convert back to a DynamicFrame for further processing. customDynamicFrame = DynamicFrame.fromDF(customDF, glueContext, "customDF_df") ##---------------------------------- datasink4 = glueContext.write_dynamic_frame.from_options( frame=customDynamicFrame, connection_type="s3", connection_options={"path": "s3://go-lambda-bucket"}, format="parquet", transformation_ctx="datasink4") job.commit()
# MAGIC
# MAGIC Let's create a new DataFrame from `wordsDF` by performing an operation that adds an 's' to each word. To do this, we'll call the [`select` DataFrame function](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.select) and pass in a column that has the recipe for adding an 's' to our existing column. To generate this `Column` object you should use the [`concat` function](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.concat) found in the [`pyspark.sql.functions` module](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#module-pyspark.sql.functions). Note that `concat` takes in two or more string columns and returns a single string column. In order to pass in a constant or literal value like 's', you'll need to wrap that value with the [`lit` column function](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.lit).
# MAGIC
# MAGIC Please replace `<FILL IN>` with your solution. After you have created `pluralDF` you can run the next cell which contains two tests. If your implementation is correct it will print `1 test passed` for each test.
# MAGIC
# MAGIC This is the general form that exercises will take. Exercises will include an explanation of what is expected, followed by code cells where one cell will have one or more `<FILL IN>` sections. The cell that needs to be modified will have `# TODO: Replace <FILL IN> with appropriate code` on its first line. Once the `<FILL IN>` sections are updated and the code is run, the test cell can then be run to verify the correctness of your solution. The last code cell before the next markdown section will contain the tests.
# MAGIC
# MAGIC > Note:
# MAGIC > Make sure that the resulting DataFrame has one column which is named 'word'.
# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import lit, concat

# Column recipe: the existing word followed by the literal 's'. The alias
# keeps the single output column named 'word'.
pluralized_word = concat(wordsDF.word, lit('s')).alias('word')
pluralDF = wordsDF.select(pluralized_word)
pluralDF.show()

# COMMAND ----------

# Load in the testing code and check to see if your answer is correct
# If incorrect it will report back '1 test failed' for each failed test
# Make sure to rerun any cell you change before trying the test again
from databricks_test_helper import Test
# TEST Using DataFrame functions to add an 's' (1b)
Test.assertEquals(pluralDF.first()[0], 'cats', 'incorrect result: you need to add an s')
Test.assertEquals(pluralDF.columns, ['word'], "there should be one column named 'word'")

# COMMAND ----------

# MAGIC %md
def dqm(self):
    """Return the data-quality setup: one rule, one fix and two fail policies."""
    quality = SmvDQM()
    # Rule on column b (threshold 0.4, registered under the name "b_lt_03").
    quality = quality.add(DQMRule(col("b") < 0.4, "b_lt_03"))
    # Fix: where a < 1, replace a with the literal 1.
    quality = quality.add(DQMFix(col("a") < 1, lit(1).alias("a"), "a_lt_1_fix"))
    # Policies on the total rule count (2) and the total fix count (1).
    quality = quality.add(FailTotalRuleCountPolicy(2))
    quality = quality.add(FailTotalFixCountPolicy(1))
    return quality
# COMMAND ----------

# MAGIC %md ### use `Pandas`

# COMMAND ----------

# Collect the Spark frame into a pandas frame and display it in the cell.
bts_data_pdf = bts_data_df.toPandas()
bts_data_pdf

# COMMAND ----------

import pyspark.sql.functions as F

# Rows with city > 0 are labelled "Prague"; everything else falls through
# to "Brno".
is_prague = F.col("city") > 0
city_label = F.when(is_prague, "Prague").otherwise(F.lit("Brno"))
bts_data_df = bts_data_df.withColumn("city_agg", city_label)

# COMMAND ----------

display(bts_data_df.groupby("city_agg").count())

# COMMAND ----------

# MAGIC %sql
# MAGIC select cgi_ecgi, count(1) as row_cnt
# MAGIC from probe_data
# MAGIC group by cgi_ecgi
# MAGIC order by row_cnt DESC
# MAGIC limit 10

# COMMAND ----------
def compile_null_if(t, expr, scope, **kwargs):
    """Translate a null-if expression into a Spark column.

    The result is null wherever the translated argument equals the
    translated ``null_if_expr``, and the argument itself otherwise.
    ``kwargs`` is accepted but not used here.
    """
    operation = expr.op()
    value_col = t.translate(operation.arg, scope)
    sentinel_col = t.translate(operation.null_if_expr, scope)
    matches_sentinel = value_col == sentinel_col
    return F.when(matches_sentinel, F.lit(None)).otherwise(value_col)
def compile_null_literal(t, expr, scope, **kwargs):
    """Translate a null literal into a Spark null literal column.

    None of the arguments are inspected. ``**kwargs`` is accepted (and
    ignored) so the signature matches ``compile_null_if`` and extra
    keyword arguments passed by a caller do not raise ``TypeError``.
    """
    return F.lit(None)
# Airport lookup table: name, country, area code and airport code.
customSchema2 = StructType([
    StructField("name", StringType(), True),
    StructField("country", StringType(), True),
    StructField("area_code", IntegerType(), True),
    StructField("code", StringType(), True),
])

df1 = sqlcontext.read \
    .format('com.databricks.spark.csv') \
    .options(header='true') \
    .load("file:///C:/Users/Administrator/Downloads/data1/airport.csv", schema=customSchema2)
#df1.show()

# Attach airport details to every flight through its origin airport code.
newjoindf = df.join(df1, df.origin == df1.code)
sqlcontext.registerDataFrameAsTable(newjoindf, "jointable")

sqlquery = "SELECT code as Airport, year, month , avg(CAST(arr_delay AS BIGINT)) AS avgdelay FROM jointable GROUP BY code,year,month ORDER BY avgdelay DESC LIMIT 5"
sqlquery1 = "SELECT code as Airport, year, month , avg(CAST(arr_delay AS BIGINT)) AS avgdelay FROM jointable GROUP BY code,year,month ORDER BY avgdelay LIMIT 5"

# DataFrame.show() prints and returns None, so keep the DataFrame in the
# variable and call show() separately.
top5airport = sqlcontext.sql(sqlquery)
top5airport.show()
last5airport = sqlcontext.sql(sqlquery1)
last5airport.show()

# Build a 'year/month/day' string and parse it into a real date column.
concatedate = newjoindf.withColumn('date', sf.concat(sf.col('year'), sf.lit('/'), sf.col('month'), sf.lit('/'), sf.col('day')))
func = udf(lambda x: datetime.strptime(x, '%Y/%m/%d'), DateType())
date = concatedate.withColumn('date', func(col('date')))
sqlcontext.registerDataFrameAsTable(date, "newtable")

sqlquery2 = "SELECT DISTINCT code AS airport,date FROM newtable WHERE arr_delay <> 'NA' ORDER BY date DESC"
#"SELECT DISTINCT code AS airport, date FROM newtable WHERE arr_delay IS NOT NULL ORDER BY date DESC "

# Fixed: `newdf` used to be bound to the None returned by show(100), which
# made the `.where(...)` below fail with AttributeError.
newdf = sqlcontext.sql(sqlquery2)
newdf.show(100)
last7days = newdf.where(datediff(current_date(), col("date")) < 7)
# import findspark
# findspark.init()
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()
# spark

# import matplotlib.pyplot as plt
import numpy as np
from pyspark.sql.functions import lit
import time

row = spark.read.csv("/content/kc_house_data.csv", inferSchema=True, header=True)

# add intercept column
row = row.withColumn("intercept", lit(1))

# Print the correlation of `price` with each feature, in this order.
# One loop replaces fifteen copy-pasted print statements; the printed
# output is unchanged.
feature_columns = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade', 'sqft_above',
    'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat',
]
for feature in feature_columns:
    print("Correlation price, %s:" % feature, row.stat.corr('price', feature))
# Load both sources and normalise their column prefixes to 'c_' so that
# equivalent columns share a name. The renamed lists are built as new
# lists instead of rewriting the schema's own `names` list in place.
cj_df = hc.sql(cj_query)
cj_lt = [s.replace('cj_', 'c_') for s in cj_df.schema.names]
cj_df = cj_df.toDF(*cj_lt)

cei_df = hc.sql(cei_query)
cei_lt = [s.replace('cei_', 'c_') for s in cei_df.schema.names]
cei_df = cei_df.toDF(*cei_lt)

all_columns = set(cei_df.schema.names).union(set(cj_df.schema.names))

from pyspark.sql.functions import lit

# Give each frame every column of the other one; columns that a frame
# lacks are filled with the placeholder 'CNP'.
for column0 in all_columns:
    if column0 not in cj_df.columns:
        cj_df = cj_df.withColumn(column0, lit('CNP'))
        print("column missing in cj:" + column0)
    if column0 not in cei_df.columns:
        cei_df = cei_df.withColumn(column0, lit('CNP'))
        print("column missing in cei:" + column0)
    # print(column0)

# Sanity check: after padding, no column should exist only in cj.
# Fixed: the original accessed `.show` on the resulting set, which raises
# AttributeError because sets have no such attribute.
print(set(cj_df.columns).difference(set(cei_df.columns)))

# union() matches columns by position, so put both frames in the same
# (sorted) column order first.
cei_df = cei_df[sorted(cei_df.schema.names)]
cj_df = cj_df[sorted(cj_df.schema.names)]
output_df = cj_df.union(cei_df)
output_path = '/user/soyel/pyspark-cicd-template/output/user_pageviews'

# Extract: the incremental page-view file and the previously stored totals.
inc_df: DataFrame = spark.read.csv(path=page_views_path, header=True, schema=page_views)
prev_df: DataFrame = spark.read.table(tableName=user_pageviews_tab)

# Transform: one row per email with its page-view count for this run,
# stamped with today's date as the last activity.
views_per_email = inc_df.groupBy('email').count()
inc_df = views_per_email.select([
    'email',
    col('count').alias('page_view'),
    current_date().alias('last_active'),
])

# Full outer join keeps users that appear on only one side; coalesce picks
# whichever side has a value.
same_user = inc_df.email == prev_df.email
merged_email = coalesce(prev_df.email, inc_df.email).alias('email')
merged_views = (coalesce(prev_df.page_view, lit(0))
                + coalesce(inc_df.page_view, lit(0))).alias('page_view')
# created_date: the stored value when present, else this run's date.
merged_created = coalesce(prev_df.created_date, inc_df.last_active).cast('date').alias('created_date')
# last_active: this run's date when present, else the stored value.
merged_last_active = coalesce(inc_df.last_active, prev_df.last_active).cast('date').alias('last_active')

df_transformed: DataFrame = inc_df.join(prev_df, same_user, 'full').select([
    merged_email,
    merged_views,
    merged_created,
    merged_last_active,
])

# Load
df_transformed.write.save(path=output_path, mode='overwrite')

spark.stop()
def classify_prediction_requests(rdd):
    """Score one RDD of prediction requests and store the results in MongoDB.

    The RDD is turned into a DataFrame with a fixed schema, a ``Route``
    column is derived from origin and destination, the string columns are
    indexed and all features assembled with the module-level models, the
    classifier is applied, and the predictions (minus feature and
    prediction metadata columns) are written to MongoDB when there is at
    least one row.
    """
    from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField

    # (field name, Spark type) in schema order; every field is nullable.
    request_fields = [
        ("Carrier", StringType()),
        ("DayOfMonth", IntegerType()),
        ("DayOfWeek", IntegerType()),
        ("DayOfYear", IntegerType()),
        ("DepDelay", DoubleType()),
        ("Dest", StringType()),
        ("Distance", DoubleType()),
        ("FlightDate", DateType()),
        ("FlightNum", StringType()),
        ("Origin", StringType()),
        ("Timestamp", TimestampType()),
        ("UUID", StringType()),
    ]
    prediction_request_schema = StructType(
        [StructField(name, dtype, True) for name, dtype in request_fields])

    prediction_requests_df = spark.createDataFrame(
        rdd, schema=prediction_request_schema)
    prediction_requests_df.show()

    #
    # Add a Route variable to replace FlightNum
    #
    from pyspark.sql.functions import lit, concat

    route = concat(prediction_requests_df.Origin, lit('-'), prediction_requests_df.Dest)
    prediction_requests_with_route = prediction_requests_df.withColumn('Route', route)
    prediction_requests_with_route.show(6)

    # Vectorize string fields with the corresponding pipeline for that
    # column: turn category fields into categoric feature vectors.
    for categorical in ["Carrier", "Origin", "Dest", "Route"]:
        indexer = string_indexer_models[categorical]
        prediction_requests_with_route = indexer.transform(
            prediction_requests_with_route)

    # Vectorize numeric columns: DepDelay, Distance and index columns
    final_vectorized_features = vector_assembler.transform(
        prediction_requests_with_route)

    # Inspect the vectors
    final_vectorized_features.show()

    # Drop the individual index columns
    for index_column in ["Carrier_index", "Origin_index", "Dest_index", "Route_index"]:
        final_vectorized_features = final_vectorized_features.drop(index_column)

    # Inspect the finalized features
    final_vectorized_features.show()

    # Make the prediction
    predictions = rfc.transform(final_vectorized_features)

    # Drop the features vector and prediction metadata to give the
    # original fields
    final_predictions = predictions.drop("Features_vec")
    for metadata_column in ["indices", "values", "rawPrediction", "probability"]:
        final_predictions = final_predictions.drop(metadata_column)

    # Inspect the output
    final_predictions.show()

    # Store to Mongo
    if final_predictions.count() > 0:
        documents = final_predictions.rdd.map(lambda x: x.asDict())
        documents.saveToMongoDB(
            "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response"
        )
def test_datetime_at_epoch(self):
    """A datetime at timestamp 0 round-trips both as row data and as a literal."""
    epoch = datetime.datetime.fromtimestamp(0)
    frame = self.spark.createDataFrame([Row(date=epoch)])
    fetched = frame.select('date', lit(epoch).alias('lit_date')).first()
    # Both the stored value and the literal must come back unchanged.
    for field in ('date', 'lit_date'):
        self.assertEqual(fetched[field], epoch)
def __appendAggKey(tsdf, freq=None):
    """
    Add an ``agg_key`` timestamp column truncated to the given frequency.

    The key is rebuilt as "<date> HH:MM:SS" from the timestamp column, with
    the parts finer than ``freq`` set to '00', and cast back to timestamp.

    :param tsdf: TSDF object as input
    :param freq: frequency at which to upsample
    :return: return a TSDF with a new aggregate key (called agg_key)
    """
    df = tsdf.df
    checkAllowableFreq(freq)

    ts = f.col(tsdf.ts_col)

    def two_digits(part):
        # Left-pad a time component to two characters, e.g. 7 -> '07'.
        return f.lpad(part, 2, '0')

    # Pick the (hour, minute, second) pieces of the key for this frequency.
    if freq == SEC:
        time_parts = (two_digits(f.hour(ts)), two_digits(f.minute(ts)), two_digits(f.second(ts)))
    elif freq == MIN:
        time_parts = (two_digits(f.hour(ts)), two_digits(f.minute(ts)), f.lit('00'))
    elif freq == HR:
        time_parts = (two_digits(f.hour(ts)), f.lit('00'), f.lit('00'))

    hh, mm, ss = time_parts
    agg_key = f.concat(
        ts.cast("date"), f.lit(' '),
        hh, f.lit(':'), mm, f.lit(':'), ss,
    ).cast("timestamp")

    df = df.withColumn("agg_key", agg_key)
    return tempo.tsdf.TSDF(df, tsdf.ts_col, partition_cols=tsdf.partitionCols)
def dist(long_x, lat_x, long_y, lat_y):
    """Return a column with the great-circle distance between two points.

    Uses the spherical law of cosines with a sphere radius of 6371.0, so
    the result is in the unit of that radius (kilometres for Earth).
    Coordinates are passed through ``radians``, i.e. they are in degrees.

    :param long_x: longitude of the first point
    :param lat_x: latitude of the first point
    :param long_y: longitude of the second point
    :param lat_y: latitude of the second point
    :return: distance column

    For identical or nearly identical points, floating-point rounding can
    push the cosine slightly outside [-1, 1], where ``acos`` yields NaN.
    The cosine is therefore clamped to that range first. Null and NaN
    inputs still propagate as null / NaN.
    """
    from pyspark.sql.functions import isnan, when

    cos_angle = (
        sin(radians(lat_x)) * sin(radians(lat_y))
        + cos(radians(lat_x)) * cos(radians(lat_y))
        * cos(radians(long_x) - radians(long_y))
    )
    # Spark orders NaN above every number, so NaN is excluded explicitly
    # from the upper clamp. A null cosine makes both conditions null and
    # falls through to `otherwise`.
    clamped = (
        when(~isnan(cos_angle) & (cos_angle > 1.0), lit(1.0))
        .when(cos_angle < -1.0, lit(-1.0))
        .otherwise(cos_angle)
    )
    return acos(clamped) * lit(6371.0)
StructField("VERTN", StringType(), True), StructField("VBEWA", StringType(), True), StructField("KBLNR", StringType(), True), StructField("KBLPOS", ShortType(), True), StructField("GRANT_NBR", StringType(), True), StructField("GMVKZ", StringType(), True), StructField("SRTYPE", StringType(), True), StructField("LOTKZ", StringType(), True), StructField("ZINKZ", StringType(), True), StructField("FKBER", StringType(), True), StructField("INTRENO", StringType(), True), StructField("PPRCT", StringType(), True), StructField("BUZID", StringType(), True), StructField("AUGGJ", ShortType(), True), StructField("HKTID", StringType(), True), StructField("BUDGET_PD", StringType(), True), StructField("KONTT", StringType(), True), StructField("KONTL", StringType(), True), StructField("UEBGDAT", StringType(), True), StructField("VNAME", StringType(), True), StructField("EGRUP", StringType(), True), StructField("BTYPE", StringType(), True), StructField("PROPMANO", StringType(), True), StructField("INWARDNO_HD", StringType(), True), StructField("INWARDDT_HD", StringType(), True)]) df1 = spark.read.load(file_path, format="csv", sep="¬", header="true", schema=schema1) df1 = df1.withColumn("LOAD_DATE", lit(sys.argv[6])) df1.write.insertInto(table_full, overwrite=False)
def count_not_null(c, nan_as_null=False):
    """Build an aggregate column counting the non-null values of column ``c``.

    :param c: name of the column to count
    :param nan_as_null: when true, NaN values are left out of the count too
    :return: aggregate column aliased to ``c``
    """
    if nan_as_null:
        not_nan = ~isnan(c)
    else:
        # Neutral element for `&`: leaves the null check unchanged.
        not_nan = F.lit(True)
    is_counted = F.col(c).isNotNull() & not_nan
    # Summing the 0/1 cast counts the rows where the predicate holds.
    return F.sum(is_counted.cast("integer")).alias(c)
from pyspark.sql import SparkSession
from pyspark.sql.functions import array,udf
import numpy as np
import sys
#spark = SparkSession.builder.appName("wind_pi_eda").enableHiveSupport().getOrCreate()

df = spark.sql("select * from analytical_ds.wind_turbine_combined_data")

# tur_date is split on '-': item 0 becomes 'date', item 1 becomes 'month'.
split_col = F.split(df['tur_date'], '-')
df = df.withColumn('month', split_col.getItem(1))
df = df.withColumn('date', split_col.getItem(0))

# tur_time is split on ':': item 0 becomes 'hour', item 1 becomes 'minute'.
split_col_2 = F.split(df['tur_time'], ':')
df = df.withColumn('hour', split_col_2.getItem(0))
df = df.withColumn('minute', split_col_2.getItem(1))

# Rebuild a "<date>-<month>-2017 <hour>:<minute>:00" string; the year is
# hard-coded to 2017.
df = df.withColumn(
    'datetime_index',
    F.concat(F.col('date'), F.lit('-'), F.col('month'), F.lit('-2017 '),
             F.col('hour'), F.lit(':'), F.col('minute'), F.lit(':00')))

# Measurement columns: everything left after dropping the indicator/state
# columns, minus the first three and last five columns.
col_names = df.drop('ext_curtailment_ind_avg', 'int_derate_ind_avg',
                    'consumption_counter_sample', 'row_num',
                    'operating_state', 'state_fault').columns[3:-5]

# Both window specs are loop-invariant, so build them once.
turbine_window = Window.partitionBy('turbine_id').orderBy('timestamp')
# Frame from -sys.maxsize up to the current row, used to forward-fill.
fill_window = turbine_window.rowsBetween(-sys.maxsize, 0)

for col_name in col_names:
    df2 = df.select(
        unix_timestamp('datetime_index', "dd-MMM-yyyy HH:mm:ss")
        .cast(TimestampType()).alias("timestamp"),
        'turbine_id',
        col_name)
    # Replace each value by the last non-null value seen so far per turbine.
    df2 = df2.withColumn(col_name, F.last(col_name, True).over(fill_window))
    # Add lag1 .. lag8: the value 1 to 8 rows earlier for the same turbine.
    for lag_steps in range(1, 9):
        df2 = df2.withColumn(
            'lag%d' % lag_steps,
            F.lag(df2[col_name], lag_steps).over(turbine_window))
def vector_plus_vector(v1, v2):
    """Return v1 + v2 for sparse vectors stored as (i, value) DataFrames.

    The result keeps exactly the indices of v1 (left join on ``i``); a
    component missing from v2 contributes 0 to the sum.
    """
    left = v1.selectExpr('i', 'value as v1')
    right = v2.selectExpr('i', 'value as v2')
    paired = left.join(right, on='i', how='left')
    component_sum = fn.col('v1') + fn.coalesce(fn.col('v2'), fn.lit(0))
    return paired.select('i', component_sum.alias('value'))
def data2(self):
    """Build a DataFrame with columns ``id``, ``k`` and ``v2``.

    Each ``id`` in 0..9 is paired with every ``k`` in 20..29, and ``v2`` is
    ``k * 100``.
    """
    key_literals = [lit(k) for k in range(20, 30)]
    ids = self.spark.range(10).toDF('id')
    with_keys = ids.withColumn("ks", array(key_literals))
    exploded = with_keys.withColumn("k", explode(col('ks')))
    with_values = exploded.withColumn("v2", col('k') * 100)
    return with_values.drop('ks')
def target_encoder(training_frame,
                   test_frame,
                   x,
                   y,
                   lambda_=0.15,
                   threshold=150,
                   test=False,
                   valid_frame=None,
                   frame_type='h2o',
                   id_col=None):
    """
    Applies simple target encoding to categorical variables.

    Every level of ``x`` is replaced by the mean of ``y`` for that level. For
    levels with fewer than ``threshold`` rows the level mean is shrunk towards
    the overall mean: ``(1 - lambda_) * level_mean + lambda_ * overall_mean``.
    Levels that were never seen (and missing values) get the overall mean.
    When a validation frame is given, the validation and test frames are
    encoded with the validation frame's statistics; otherwise the test frame
    is encoded with the training frame's statistics.

    :param training_frame: Training frame which to create target means and to be encoded.
    :param test_frame: Test frame to be encoded using information from training frame.
    :param x: Name of input variable to be encoded.
    :param y: Name of target variable to use for encoding.
    :param lambda_: Balance between level mean and overall mean for small groups.
    :param threshold: Number below which a level is considered small enough to be shrunken.
    :param test: Whether or not to print the row_val_dict for testing purposes
        (h2o / pandas frames only).
    :param valid_frame: To also combine features on a validation frame include this (optional)
    :param frame_type: The type of frame being used. Accepted: ['h2o','pandas','spark']
    :param id_col: The name of the id column for spark dataframes only. Will conserve memory
        and only return 2 columns in dfs(id,x_Tencode)
    :return: ``(train, test)`` or, when ``valid_frame`` is given,
        ``(train, valid, test)``. The frames are of the same kind as
        ``frame_type``: for 'h2o' single-column H2OFrames holding the encoded
        variable, for 'pandas' DataFrames with the encoded column appended
        (``[x, y, x_Tencode]`` for train/valid, ``[x, x_Tencode]`` for test),
        for 'spark' DataFrames with the encoded column appended.
    :raises ValueError: if ``frame_type`` is not one of the accepted values.
    """
    encode_name = x + '_Tencode'

    if frame_type == 'spark':
        return _target_encode_spark(training_frame, test_frame, x, y, lambda_,
                                    threshold, valid_frame, id_col,
                                    encode_name)
    if frame_type in ('h2o', 'pandas'):
        return _target_encode_pandas(training_frame, test_frame, x, y,
                                     lambda_, threshold, test, valid_frame,
                                     frame_type, encode_name)
    raise ValueError("frame_type must be one of 'h2o', 'pandas' or 'spark', "
                     "got %r" % (frame_type,))


def _target_encode_spark(training_frame, test_frame, x, y, lambda_, threshold,
                         valid_frame, id_col, encode_name):
    """Target-encode Spark DataFrames; see ``target_encoder`` for the contract."""
    from pyspark.sql.functions import lit, when

    has_valid = valid_frame is not None

    def overall_average(frame):
        # The aggregate yields a single row with a single value: flatten it
        # and take that value.
        return frame.agg({y: 'avg'}).rdd.flatMap(list).first()

    def shrunken_level_averages(frame, overall_mean):
        def find_shrunken_averages(tuple_input):
            """Map (level, labels) to (level, adjusted mean)."""
            level, labels = tuple_input[0], tuple_input[1]
            level_n = len(labels)
            level_mean = sum(labels) / level_n
            if level_n >= threshold:
                return (level, level_mean)
            return (level,
                    ((1 - lambda_) * level_mean) + (lambda_ * overall_mean))

        # All labels of a level are needed at once to compute its mean and
        # count, hence map -> groupByKey -> map.
        return (frame.select(x, y).rdd
                .map(lambda i: (i[0], i[1]))
                .groupByKey()
                .map(find_shrunken_averages)
                .collect())

    def with_default(frame, default):
        # When an id column is given, keep only (id, x) to save memory.
        if id_col is not None:
            frame = frame.select(id_col, x)
        return frame.withColumn(encode_name, lit(default))

    def apply_levels(frame, levels):
        for level, value in levels:
            frame = frame.withColumn(
                encode_name,
                when(frame[x] == level, value).otherwise(frame[encode_name]))
        return frame

    overall_mean_train = overall_average(training_frame)
    levels_train = shrunken_level_averages(training_frame, overall_mean_train)

    new_training_frame = apply_levels(
        with_default(training_frame, overall_mean_train), levels_train)

    if has_valid:
        # With a validation frame, both the validation and the test frame are
        # encoded with the validation frame's statistics.
        overall_mean_valid = overall_average(valid_frame)
        levels_valid = shrunken_level_averages(valid_frame, overall_mean_valid)
        new_valid_frame = apply_levels(
            with_default(valid_frame, overall_mean_valid), levels_valid)
        new_test_frame = apply_levels(
            with_default(test_frame, overall_mean_valid), levels_valid)
        result = (new_training_frame, new_valid_frame, new_test_frame)
    else:
        new_test_frame = apply_levels(
            with_default(test_frame, overall_mean_train), levels_train)
        result = (new_training_frame, new_test_frame)

    if id_col is not None:
        # x is already available in the caller's original frames.
        result = tuple(frame.drop(x) for frame in result)
    return result


def _shrunken_level_means(frame, x, y, lambda_, threshold):
    """Return ``{level: encoded value}`` for column ``x`` of a pandas frame."""
    overall_mean = frame[y].mean()
    grouped = frame.groupby(x)[y]
    level_means = grouped.mean()
    # size() counts rows per level, including rows whose target is missing.
    level_sizes = grouped.size()
    shrunken = ((1 - lambda_) * level_means) + (lambda_ * overall_mean)
    return level_means.where(level_sizes >= threshold, shrunken).to_dict()


def _encode_levels(values, level_map, default):
    """Map a pandas Series through ``level_map``; unmapped values get ``default``."""
    # Unseen levels and missing values map to NaN and are then filled with
    # the overall mean.
    return values.map(level_map).astype(float).fillna(default)


def _target_encode_pandas(training_frame, test_frame, x, y, lambda_,
                          threshold, test, valid_frame, frame_type,
                          encode_name):
    """Target-encode pandas or H2O frames; see ``target_encoder``."""
    import pandas as pd

    has_valid = valid_frame is not None
    is_h2o = frame_type == 'h2o'

    def as_pandas(frame):
        return frame.as_data_frame() if is_h2o else frame

    # Work on copies so the caller's frames are never modified.
    trdf = as_pandas(training_frame).loc[:, [x, y]].copy()
    vdf = as_pandas(valid_frame).loc[:, [x, y]].copy() if has_valid else None
    tss = as_pandas(test_frame).loc[:, x]

    overall_mean_train = trdf[y].mean()
    row_val_dict_train = _shrunken_level_means(trdf, x, y, lambda_, threshold)
    overall_mean_valid = None
    row_val_dict_valid = None
    if has_valid:
        overall_mean_valid = vdf[y].mean()
        row_val_dict_valid = _shrunken_level_means(vdf, x, y, lambda_,
                                                   threshold)

    if test:
        print(row_val_dict_train)
        if has_valid:
            print(row_val_dict_valid)

    trdf[encode_name] = _encode_levels(trdf[x], row_val_dict_train,
                                       overall_mean_train)

    tsdf = pd.DataFrame({x: tss})
    if has_valid:
        vdf[encode_name] = _encode_levels(vdf[x], row_val_dict_valid,
                                          overall_mean_valid)
        tsdf[encode_name] = _encode_levels(tsdf[x], row_val_dict_valid,
                                           overall_mean_valid)
    else:
        tsdf[encode_name] = _encode_levels(tsdf[x], row_val_dict_train,
                                           overall_mean_train)

    if is_h2o:
        # Imported here so plain pandas users do not need h2o installed.
        import h2o

        def to_h2o(frame):
            converted = h2o.H2OFrame(frame[encode_name].values)
            converted.columns = [encode_name]
            return converted

        trdf = to_h2o(trdf)
        if has_valid:
            vdf = to_h2o(vdf)
        tsdf = to_h2o(tsdf)

    if has_valid:
        return (trdf, vdf, tsdf)
    return (trdf, tsdf)
def lookupDimensionKey(df):
    """Return ``df`` with a ``DimensionSKey`` column holding the literal 1."""
    constant_key = lit(1)
    return df.withColumn("DimensionSKey", constant_key)
def with_effective_version(dataframe, effective_version, join_key):
    """Calculate the effective version of Firefox in the wild given the date
    and channel.

    For example, if the date is 2017-11-14 and the channel is 'Release', then
    the effective Firefox version is 57.0.0, since this is the Firefox version
    that would be available for installation from official sources. This is
    used to determine the version a profile was acquired on.

    :param dataframe: A dataframe containing a date and channel col.
    :param effective_version: A table mapping dates to application versions.
        Its first column is renamed to ``join_key`` for the join and its
        second column is read as the version.
    :param join_key: Column name generate version number on.
    :returns: A dataframe with a calculated "start_version" column. The input
        columns keep their original order and "start_version" comes last
        (unless the input already had a column of that name).
    """
    in_columns = {"channel"}
    assert in_columns <= set(dataframe.columns), "must contain channel"

    # Build the output column list in a deterministic order; selecting from a
    # set would return the columns in arbitrary order.
    out_columns = list(dataframe.columns)
    if "start_version" not in out_columns:
        out_columns.append("start_version")

    # Firefox releases follow a train model. Each channel is a major revision
    # ahead from the upstream channel. Nightly corresponds to the build on
    # mozilla-central, and is always the head of the train. For example, if
    # the release channel is "55", then beta will be "56".
    #
    # This logic will be affected by the decommissioning of aurora.
    channel = F.col("channel")
    version_offset = (
        F.when(channel.startswith("beta"), F.lit(1))
        .when(channel.startswith("aurora"), F.lit(2))
        .when(channel.startswith("nightly"), F.lit(3))
        .otherwise(F.lit(0))
    )

    # Original effective version column name
    ev_version = effective_version.columns[1]

    # Column aliases in the joined table
    version = F.col(ev_version)
    date = F.col(join_key)

    joined_df = (
        dataframe
        # Rename the date field to join against the left table
        .join(effective_version.toDF(join_key, ev_version), join_key, "left")
        # Major version number e.g. "57"
        .withColumn("_major", F.split(version, r"\.").getItem(0).cast("int"))
        # Offset into the release train
        .withColumn("_offset", version_offset)
    )

    # Build up operations to get the effective start version of a particular
    # channel and date.

    # There will be null values for the version if the date is not in
    # the right table. This sets the start_version to one of two values.
    fill_outer_range = (
        F.when(date.isNull() | (date < "2015-01-01"), F.lit("older"))
        .otherwise(F.lit("newer"))
    )

    # Release keeps the looked-up version; every other channel gets
    # "<major + offset>.0".
    calculate_channel_version = (
        F.when(channel.startswith("release"), version)
        .otherwise(F.concat(F.col("_major") + F.col("_offset"), F.lit(".0")))
    )

    start_version = (
        F.when(version.isNull(), fill_outer_range)
        .otherwise(calculate_channel_version)
    )

    return (
        joined_df
        .withColumn("start_version", start_version)
        .fillna("unknown", ["start_version"])
        .select(*out_columns)
    )
# Build the owner -> watercraft edge tables (person and company owners) and
# save them as tables.
# NOTE(review): the nesting of the two timer() blocks below is assumed --
# confirm against the intended layout.
print('Generating watercraft connections')
with timer():
    print('Reading tables')
    with timer():
        embarcacao = spark.table('bases.lc_embarcacao')
        pessoa = spark.table('bases.pessoa_fisica')
        empresa = spark.table('bases.lc_cnpj')

    # Merge persons with watercrafts
    # The person's uuid becomes start_node, the joined watercraft's uuid
    # becomes end_node, and each edge gets the label 'PROPRIETARIO'.
    # uuidshaudf() presumably generates the edge's own uuid -- verify.
    pessoa_embarcacao = pessoa.filter('num_cpf is not null').\
        withColumnRenamed('uuid', 'start_node').\
        join(embarcacao, pessoa.num_cpf == embarcacao.cpf_cnpj).\
        select(['start_node', 'uuid']).\
        withColumnRenamed('uuid', 'end_node').\
        withColumn('label', lit('PROPRIETARIO').cast('string')).\
        withColumn('uuid', uuidshaudf())

    # Same edge construction for companies, matched on num_cnpj.
    empresa_embarcacao = empresa.filter('num_cnpj is not null').\
        withColumnRenamed('uuid', 'start_node').\
        join(embarcacao, empresa.num_cnpj == embarcacao.cpf_cnpj).\
        select(['start_node', 'uuid']).\
        withColumnRenamed('uuid', 'end_node').\
        withColumn('label', lit('PROPRIETARIO').cast('string')).\
        withColumn('uuid', uuidshaudf())

    # Overwrite both output tables with the freshly built edges.
    pessoa_embarcacao.write.mode("overwrite").saveAsTable(
        "dadossinapse.pessoa_embarcacao_ope")
    empresa_embarcacao.write.mode("overwrite").saveAsTable(
        "dadossinapse.empresa_embarcacao_ope")
else: payload['data']=pub_data[key]['data'] pushToRedis(payload) # cerating publisher realted data from daily data pub_dfp_df=sqlContext.sql('select publisher_id,id5 as user_identifier, collect_list(device_finger_print) as dfps , collect_list(c_dfp) as dfp_counts from (select publisher_id ,id5, device_finger_print ,count(device_finger_print) as c_dfp from testtable where id5!="" group by publisher_id,id5,device_finger_print) group by publisher_id,id5') pub_ip_df=sqlContext.sql('select publisher_id,id5 as user_identifier, collect_list(ip) as ips , collect_list(c_ip) as ip_counts from (select publisher_id ,id5, ip ,count(ip) as c_ip from testtable where id5!="" group by publisher_id,id5,ip) group by publisher_id,id5') pub_session_df=sqlContext.sql('select publisher_id,id5 as user_identifier, collect_list(sessionId) as sessions , collect_list(c_session) as session_counts from (select publisher_id ,id5, sessionId ,count(sessionId) as c_session from testtable where id5!="" group by publisher_id,id5,sessionId) group by publisher_id,id5') pub_df=pub_dfp_df.join(pub_ip_df,['publisher_id','user_identifier'],'outer') pub_df=pub_df.join(pub_session_df,['publisher_id','user_identifier'],'outer') pub_daily_df=sqlContext.read.json(daily_fetch_path) if(len(pub_daily_df.columns)==0): for dtype in pub_df.dtypes: pub_daily_df=pub_daily_df.withColumn(dtype[0],lit(None).cast(dtype[1])) pub_daily_df=pub_daily_df.select(pub_df.columns) pub_df=pub_df.union(pub_daily_df) pub_daily_df=pub_daily_df.select(pub_df.columns) pub_df=pub_df.union(pub_daily_df) pub_df.write.mode('append').json("{}".format(save_path),"overwrite") pub_df.cache() pubs=pub_df.collect() sendPubDataToRedis(pubs) data={"job_id":fetch_job_id,"file_name":save_file_name,"type":file_type} #creating acknowledgement request url = str(config['baseAPIUrl'])+'/'+str(config['version'])+'/preProcessing/acknowledgePredictionFileJob' # Create your header as required
def run(self, i):
    """Add a boolean ``cat_high_emp`` column that is true when EMP > 1000000."""
    employment = i[inputdata.EmploymentByStateLink]
    is_high_employment = col("EMP") > lit(1000000)
    return employment.smvSelectPlus(is_high_employment.alias("cat_high_emp"))
# COMMAND ----------

# Keep only the rows whose 'value' does not end in one or more digits.
ends_with_digits = base_df['value'].rlike(r'\d+$')
bad_content_size_df = base_df.filter(~ends_with_digits)
bad_content_size_df.count()

# COMMAND ----------

# MAGIC %md
# MAGIC That's it! The count matches the number of rows in `bad_rows_df` exactly.
# MAGIC
# MAGIC Let's take a look at some of the bad column values. Since it's possible that the rows end in extra white space, we'll tack a marker character onto the end of each line, to make it easier to see trailing white space.

# COMMAND ----------

from pyspark.sql.functions import lit, concat

# Append '*' to every value so trailing white space is visible.
marked_value = concat(bad_content_size_df['value'], lit('*'))
bad_content_size_df.select(marked_value).show(truncate=False)

# COMMAND ----------

# MAGIC %md
# MAGIC Ah. The bad rows correspond to error results, where no content was sent back and the server emitted a "`-`" for the `content_size` field. Since we don't want to discard those rows from our analysis, let's map them to 0.

# COMMAND ----------

# MAGIC %md
# MAGIC ### (2d) Fix the rows with null content\_size
# MAGIC
# MAGIC The easiest solution is to replace the null values in `split_df` with 0. The DataFrame API provides a set of functions and fields specifically designed for working with null values, among them:
# MAGIC
# MAGIC * [fillna()](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.fillna), which fills null values with specified non-null values.
# MAGIC * [na](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.na), which returns a [DataFrameNaFunctions](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrameNaFunctions) object with many functions for operating on null columns.
def timestamp_provider():
    """Return the ``timestamp`` value from the surrounding scope wrapped in ``lit``."""
    timestamp_literal = lit(timestamp)
    return timestamp_literal