def test_vectorized_udf_complex(self):
    df = self.spark.range(10).select(
        col('id').cast('int').alias('a'),
        col('id').cast('int').alias('b'),
        col('id').cast('double').alias('c'))
    add = pandas_udf(lambda x, y: x + y, IntegerType())
    power2 = pandas_udf(lambda x: 2 ** x, IntegerType())
    mul = pandas_udf(lambda x, y: x * y, DoubleType())
    res = df.select(add(col('a'), col('b')), power2(col('a')), mul(col('b'), col('c')))
    expected = df.select(expr('a + b'), expr('power(2, a)'), expr('b * c'))
    self.assertEquals(expected.collect(), res.collect())
def test_expr(self):
    from pyspark.sql import functions
    row = Row(a="length string", b=75)
    df = self.sqlCtx.createDataFrame([row])
    result = df.select(functions.expr("length(a)")).collect()[0].asDict()
    self.assertEqual(13, result["'length(a)"])
def test_register_vectorized_udf_basic(self):
    df = self.spark.range(10).select(
        col('id').cast('int').alias('a'),
        col('id').cast('int').alias('b'))
    original_add = pandas_udf(lambda x, y: x + y, IntegerType())
    self.assertEqual(original_add.deterministic, True)
    self.assertEqual(original_add.evalType, PythonEvalType.SQL_SCALAR_PANDAS_UDF)
    new_add = self.spark.catalog.registerFunction("add1", original_add)
    res1 = df.select(new_add(col('a'), col('b')))
    res2 = self.spark.sql(
        "SELECT add1(t.a, t.b) FROM (SELECT id as a, id as b FROM range(10)) t")
    expected = df.select(expr('a + b'))
    self.assertEquals(expected.collect(), res1.collect())
    self.assertEquals(expected.collect(), res2.collect())
def _calculate_metrics(self):
    """Calculate ranking metrics."""
    self._items_for_user_pred = self.rating_pred

    self._items_for_user_true = (
        self.rating_true
        .groupBy(self.col_user)
        .agg(expr("collect_list(" + self.col_item + ") as ground_truth"))
        .select(self.col_user, "ground_truth")
    )

    self._items_for_user_all = self._items_for_user_pred.join(
        self._items_for_user_true, on=self.col_user
    ).drop(self.col_user)

    return RankingMetrics(self._items_for_user_all.rdd)
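# Hedged usage sketch for the RankingMetrics object returned above (not part of the original
# class): RankingMetrics consumes an RDD of (predicted_items, ground_truth_items) pairs, so
# this assumes self.rating_pred already holds one list of recommended items per user. The
# method name and the `k` parameter below are made up for illustration.
def _report_ranking_metrics(self, k=10):
    metrics = self._calculate_metrics()
    return {
        "precision@k": metrics.precisionAt(k),
        "ndcg@k": metrics.ndcgAt(k),
        "map": metrics.meanAveragePrecision,
    }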
def distance_bw_parking_spots(data):
    if not isinstance(data, DataFrame):
        raise ValueError('Type passed to distance_bw_parking_spots() should be DataFrame.')

    # Haversine distance between origin and destination coordinates
    data = data.\
        withColumn('LONGITUDE_ORIGINE_rad', F.expr('radians(LONGITUDE_ORIGINE)')).\
        withColumn('LATITUDE_ORIGINE_rad', F.expr('radians(LATITUDE_ORIGINE)')).\
        withColumn('LONGITUDE_DESTINATION_rad', F.expr('radians(LONGITUDE_DESTINATION)')).\
        withColumn('LATITUDE_DESTINATION_rad', F.expr('radians(LATITUDE_DESTINATION)')).\
        withColumn('Diff_long', F.expr('(LONGITUDE_DESTINATION_rad-LONGITUDE_ORIGINE_rad)/2')).\
        withColumn('Diff_lat', F.expr('(LATITUDE_DESTINATION_rad-LATITUDE_ORIGINE_rad)/2')).\
        withColumn('LATITUDE_DESTINATION_cos', F.expr('cos(LATITUDE_DESTINATION_rad)')).\
        withColumn('LATITUDE_ORIGINE_cos', F.expr('cos(LATITUDE_ORIGINE_rad)')).\
        withColumn('Diff_long', F.expr('sin(Diff_long)')).\
        withColumn('Diff_lat', F.expr('sin(Diff_lat)')).\
        withColumn('A', F.expr('Diff_lat*Diff_lat + LATITUDE_DESTINATION_cos * LATITUDE_ORIGINE_cos * Diff_long * Diff_long')).\
        withColumn('One_minus_A', F.expr('1-A')).\
        withColumn('C', F.expr('2 * atan2( sqrt(A), sqrt(One_minus_A))')).\
        withColumn('Distance_km', F.expr('6373.0*C'))

    # cols_needed = ['DATE_ORIGINE', 'LONGITUDE_ORIGINE', 'LATITUDE_ORIGINE', 'Distance(Km)', 'MOTIF_REMORQUAGE']
    df_final = data.select('DATE_ORIGINE', 'LONGITUDE_ORIGINE', 'LATITUDE_ORIGINE',
                           'Distance_km', 'MOTIF_REMORQUAGE')

    try:
        assert df_final.count() == 250077
    except AssertionError:
        logging.error('Final count does not match before removing NA. Saving to file anyways...')

    df_final = df_final.na.drop()

    try:
        assert df_final.count() == 248476
    except AssertionError:
        logging.error('Final count does not match after removing NA. Saving to file anyways...')

    return df_final
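# A minimal alternative sketch (not from the original module): the same haversine distance
# can be computed in a single SQL expression, assuming the same four coordinate columns
# exist on the input DataFrame. The function name is hypothetical.
def distance_km_single_expr(data):
    return data.withColumn(
        'Distance_km',
        F.expr("""
            2 * 6373.0 * asin(sqrt(
                pow(sin((radians(LATITUDE_DESTINATION) - radians(LATITUDE_ORIGINE)) / 2), 2) +
                cos(radians(LATITUDE_ORIGINE)) * cos(radians(LATITUDE_DESTINATION)) *
                pow(sin((radians(LONGITUDE_DESTINATION) - radians(LONGITUDE_ORIGINE)) / 2), 2)
            ))
        """))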
def main(spark):
    # Format the VIIRS dataset
    viirsDf = spark.read \
        .format("csv") \
        .option("header", True) \
        .option("inferSchema", True) \
        .load("/tmp/{}".format(viirs_file))

    viirsDf2 = viirsDf \
        .withColumn("acq_time_min", F.expr("acq_time % 100")) \
        .withColumn("acq_time_hr", F.expr("int(acq_time / 100)")) \
        .withColumn("acq_time2", F.unix_timestamp(F.col("acq_date"))) \
        .withColumn("acq_time3", F.expr("acq_time2 + acq_time_min * 60 + acq_time_hr * 3600")) \
        .withColumn("acq_datetime", F.from_unixtime(F.col("acq_time3"))) \
        .drop("acq_date", "acq_time", "acq_time_min", "acq_time_hr", "acq_time2", "acq_time3") \
        .withColumnRenamed("confidence", "confidence_level") \
        .withColumn("brightness", F.lit(None)) \
        .withColumn("bright_t31", F.lit(None))

    viirsDf2.show()
    viirsDf2.printSchema()

    # This piece of code shows the repartition by confidence level, so you
    # can compare when you convert the confidence as a % to a level for the
    # MODIS dataset.
    df = viirsDf2.groupBy("confidence_level").count()
    count = viirsDf2.count()
    df = df.withColumn("%", F.round(F.expr("100 / {} * count".format(count)), 2))
    df.show()

    # Format the MODIS dataset
    low = 40
    high = 100

    modisDf = spark.read.format("csv") \
        .option("header", True) \
        .option("inferSchema", True) \
        .load("/tmp/{}".format(modis_file)) \
        .withColumn("acq_time_min", F.expr("acq_time % 100")) \
        .withColumn("acq_time_hr", F.expr("int(acq_time / 100)")) \
        .withColumn("acq_time2", F.unix_timestamp(F.col("acq_date"))) \
        .withColumn("acq_time3", F.expr("acq_time2 + acq_time_min * 60 + acq_time_hr * 3600")) \
        .withColumn("acq_datetime", F.from_unixtime(F.col("acq_time3"))) \
        .drop("acq_date", "acq_time", "acq_time_min", "acq_time_hr", "acq_time2", "acq_time3") \
        .withColumn("confidence_level",
                    F.when(F.col("confidence") <= F.lit(low), "low")
                    .when((F.col("confidence") > F.lit(low)) & (F.col("confidence") < F.lit(high)), "nominal")
                    .when(F.isnull(F.col("confidence")), "high")
                    .otherwise(F.col("confidence"))) \
        .drop("confidence") \
        .withColumn("bright_ti4", F.lit(None)) \
        .withColumn("bright_ti5", F.lit(None))

    modisDf.show()
    modisDf.printSchema()

    # This piece of code shows the repartition by confidence level, so you
    # can compare when you convert the confidence as a % to a level for the
    # MODIS dataset.
    df = modisDf.groupBy("confidence_level").count()
    count = modisDf.count()
    df = df.withColumn("%", F.round(F.expr("100 / {} * count".format(count)), 2))
    df.show()

    wildfireDf = viirsDf2.unionByName(modisDf)
    wildfireDf.show()
    wildfireDf.printSchema()

    logging.info("# of partitions: {}".format(
        wildfireDf.rdd.getNumPartitions()))

    wildfireDf.write.format("parquet") \
        .mode("overwrite") \
        .save("/tmp/fires_parquet")

    outputDf = wildfireDf.filter("confidence_level = 'high'") \
        .repartition(1)
    outputDf.write.format("csv") \
        .option("header", True) \
        .mode("overwrite") \
        .save("/tmp/high_confidence_fires_csv")
# COMMAND ----------

motifs = stationGraph.find("(a)-[ab]->(b); (b)-[bc]->(c); (c)-[ca]->(a)")


# COMMAND ----------

from pyspark.sql.functions import expr

motifs.selectExpr("*",
    "to_timestamp(ab.`Start Date`, 'MM/dd/yyyy HH:mm') as abStart",
    "to_timestamp(bc.`Start Date`, 'MM/dd/yyyy HH:mm') as bcStart",
    "to_timestamp(ca.`Start Date`, 'MM/dd/yyyy HH:mm') as caStart")\
  .where("ca.`Bike #` = bc.`Bike #`").where("ab.`Bike #` = bc.`Bike #`")\
  .where("a.id != b.id").where("b.id != c.id")\
  .where("abStart < bcStart").where("bcStart < caStart")\
  .orderBy(expr("cast(caStart as long) - cast(abStart as long)"))\
  .selectExpr("a.id", "b.id", "c.id", "ab.`Start Date`", "ca.`End Date`")\
  .limit(1).show(1, False)


# COMMAND ----------

from pyspark.sql.functions import desc

ranks = stationGraph.pageRank(resetProbability=0.15, maxIter=10)
ranks.vertices.orderBy(desc("pagerank")).select("id", "pagerank").show(10)


# COMMAND ----------

inDeg = stationGraph.inDegrees
inDeg.orderBy(desc("inDegree")).show(5, False)
def basic_rec_val(spark, dirname, rank, regParam, k, random_seed):
    val_set = spark.read.parquet(f'{dirname}/val.parquet')

    print(f'Validating on model with rank = {rank} and regParam = {regParam} trained using {dirname} data ...')
    # load corresponding trained model
    model = ALSModel.load(f'{dirname}/{rank}_{regParam}_model')

    # computing RMSE on validation set
    predictions = model.transform(val_set)
    evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')
    rmse = evaluator.evaluate(predictions)
    print(f'rmse: {rmse}')

    print(f'Constructing top {k} books recommended per user ...')
    val_users = val_set.select('user_id').distinct()

    start_time = time.time()
    perUserPredictedTopKItemsDF = model.recommendForUserSubset(val_users, k)

    myudf = udf(extract_item, ArrayType(IntegerType()))
    perUserPredictedTopKItemsDF = perUserPredictedTopKItemsDF.withColumn(
        'predictions', myudf(perUserPredictedTopKItemsDF['recommendations'])).drop('recommendations')

    print('Constructing actual books per user ...')
    perUserActualItemsDF = val_set.filter(column('rating') >= 3.0).groupBy('user_id').agg(
        expr('collect_list(book_id) as book_ids'))

    print('Constructing Ranking Metrics ...')
    perUserItemsRDD = perUserPredictedTopKItemsDF.join(
        perUserActualItemsDF, 'user_id').rdd.map(lambda row: (row[1], row[2]))
    rankingMetrics = RankingMetrics(perUserItemsRDD)

    precisionAtK = rankingMetrics.precisionAt(k)
    mAP = rankingMetrics.meanAveragePrecision
    end_time = time.time()
    time_delta = str(datetime.timedelta(seconds=end_time - start_time))

    print(f'p@{k}: {precisionAtK}')
    print(f'mAP: {mAP}')
    print(f'run time: {time_delta}')
print("Recommendations: ------------------------------") user_recs = best_model.recommendForAllUsers(500) print(user_recs.count()) prediction_val = best_model.transform(df_validation) print(" Predictions for validation dataset: ------------------------------") prediction_val.show() prediction_test = best_model.transform(df_test) print(" Predictions for test dataset: ------------------------------") prediction_test.show() actual_val = df_validation.groupBy("user_id").agg(expr("collect_set(book_id) as books")) pred_val = user_recs.select('user_id','recommendations.book_id') output_val =pred_val.join(actual_val,['user_id']).select('book_id','books') metrics_val = RankingMetrics(output_val.rdd) result_val = metrics_val.meanAveragePrecision result_val2 = metrics_val.precisionAt(20) print("Mean average precision for validation dataset: " + str(result_val)) print("Precision @ 20 for validation dataset: " + str(result_val2)) rmse_val = evaluator.evaluate(prediction_val) print("RMSE for validation dataset=" + str(rmse_val)) actual_test = df_test.groupBy("user_id").agg(expr("collect_set(book_id) as books")) pred_test = user_recs.select('user_id','recommendations.book_id') output_test =pred_test.join(actual_test,['user_id']).select('book_id','books')
])
df = spark.read.format("json").schema(myManualSchema)\
  .load("/data/flight-data/json/2015-summary.json")


# COMMAND ----------

from pyspark.sql.functions import col, column
col("someColumnName")
column("someColumnName")


# COMMAND ----------

from pyspark.sql.functions import expr
expr("(((someCol + 5) * 200) - 6) < otherCol")


# COMMAND ----------

from pyspark.sql import Row
myRow = Row("Hello", None, 1, False)


# COMMAND ----------

myRow[0]
myRow[2]


# COMMAND ----------
def prepare_df( df: pyspark.sql.DataFrame, store_csv: pyspark.sql.DataFrame, store_states_csv: pyspark.sql.DataFrame, state_names_csv: pyspark.sql.DataFrame, google_trend_csv: pyspark.sql.DataFrame, weather_csv: pyspark.sql.DataFrame, ) -> pyspark.sql.DataFrame: num_rows = df.count() # expand dates df = expand_date(df) # create new columns in the DataFrame by filtering out special events(promo/holiday where sales was zero or store was closed). df = (df.withColumn("Open", df.Open != "0").withColumn( "Promo", df.Promo != "0").withColumn("StateHoliday", df.StateHoliday != "0").withColumn( "SchoolHoliday", df.SchoolHoliday != "0")) # merge store information store = store_csv.join(store_states_csv, "Store") df = df.join(store, "Store") # merge Google Trend information google_trend_all = prepare_google_trend(google_trend_csv) df = df.join(google_trend_all, ["State", "Year", "Week"]).select(df["*"], google_trend_all.trend) # merge in Google Trend for whole Germany google_trend_de = google_trend_all[google_trend_all.file == "Rossmann_DE"].withColumnRenamed( "trend", "trend_de") df = df.join(google_trend_de, ["Year", "Week"]).select(df["*"], google_trend_de.trend_de) # merge weather weather = weather_csv.join(state_names_csv, weather_csv.file == state_names_csv.StateName) df = df.join(weather, ["State", "Date"]) # fix null values df = (df.withColumn( "CompetitionOpenSinceYear", F.coalesce(df.CompetitionOpenSinceYear, F.lit(1900)), ).withColumn( "CompetitionOpenSinceMonth", F.coalesce(df.CompetitionOpenSinceMonth, F.lit(1)), ).withColumn("Promo2SinceYear", F.coalesce(df.Promo2SinceYear, F.lit(1900))).withColumn( "Promo2SinceWeek", F.coalesce(df.Promo2SinceWeek, F.lit(1)))) # days and months since the competition has been open, cap it to 2 years df = df.withColumn( "CompetitionOpenSince", F.to_date( F.format_string("%s-%s-15", df.CompetitionOpenSinceYear, df.CompetitionOpenSinceMonth)), ) df = df.withColumn( "CompetitionDaysOpen", F.when( df.CompetitionOpenSinceYear > 1900, F.greatest( F.lit(0), F.least(F.lit(360 * 2), F.datediff(df.Date, df.CompetitionOpenSince)), ), ).otherwise(0), ) df = df.withColumn("CompetitionMonthsOpen", (df.CompetitionDaysOpen / 30).cast(T.IntegerType())) # days and weeks of promotion, cap it to 25 weeks df = df.withColumn( "Promo2Since", F.expr( 'date_add(format_string("%s-01-01", Promo2SinceYear), (cast(Promo2SinceWeek as int) - 1) * 7)' ), ) df = df.withColumn( "Promo2Days", F.when( df.Promo2SinceYear > 1900, F.greatest( F.lit(0), F.least(F.lit(25 * 7), F.datediff(df.Date, df.Promo2Since))), ).otherwise(0), ) df = df.withColumn("Promo2Weeks", (df.Promo2Days / 7).cast(T.IntegerType())) # ensure that no row was lost through inner joins assert num_rows == df.count(), "lost rows in joins" return df
# Databricks notebook source
from pyspark.sql.functions import expr, pow, col, round, bround, lit, corr

# create the DataFrame
df = spark.read.format("csv")\
    .option("header", "true")\
    .option("inferSchema", "true")\
    .load("/databricks-datasets/definitive-guide/data/retail-data/by-day/2010-12-01.csv")
df.createOrReplaceGlobalTempView("dfTable")

# pow
fabricatedQuantity = pow(col("Quantity") * col("UnitPrice"), 2) + 5
df.select(expr("CustomerId"), fabricatedQuantity.alias("RealQuantity")).show(2)
# both columns hold numeric data, so the expression can be computed

# rounding down
df.select(round(lit("1.6")), bround(lit("1.6"))).show(2)

# correlation between the two columns
df.stat.corr("Quantity", "UnitPrice")
df.select(corr("Quantity", "UnitPrice")).show(2)

# describe (mean, standard deviation, min, max, count)
# the statistics schema may change, so use it only for inspection
df.describe().show(6)

# statFunctions package
# accessed through stat.
ColName = "UnitPrice"
quantiledProbs = [0.5]
relError = 0.05
df.stat.approxQuantile("UnitPrice", quantiledProbs, relError)
# COMMAND ----------

wrongJoinExpression = person["name"] == graduateProgram["school"]


# COMMAND ----------

joinType = "inner"


# COMMAND ----------

gradProgram2 = graduateProgram.union(spark.createDataFrame([
    (0, "Masters", "Duplicated Row", "Duplicated School")]))

gradProgram2.createOrReplaceTempView("gradProgram2")


# COMMAND ----------

from pyspark.sql.functions import expr

person.withColumnRenamed("id", "personId")\
  .join(sparkStatus, expr("array_contains(spark_status, id)")).show()


# COMMAND ----------
# COMMAND ----------

from pyspark.sql.functions import sumDistinct
df.select(sumDistinct("Quantity")).show() # 29310


# COMMAND ----------

from pyspark.sql.functions import sum, count, avg, expr

df.select(
    count("Quantity").alias("total_transactions"),
    sum("Quantity").alias("total_purchases"),
    avg("Quantity").alias("avg_purchases"),
    expr("mean(Quantity)").alias("mean_purchases"))\
  .selectExpr(
    "total_purchases/total_transactions",
    "avg_purchases",
    "mean_purchases").show()


# COMMAND ----------

from pyspark.sql.functions import var_pop, stddev_pop
from pyspark.sql.functions import var_samp, stddev_samp
df.select(var_pop("Quantity"), var_samp("Quantity"),
          stddev_pop("Quantity"), stddev_samp("Quantity")).show()


# COMMAND ----------
def aggregate(self, func_or_funcs, *args, **kwargs): """Aggregate using one or more operations over the specified axis. Parameters ---------- func : dict a dict mapping from column name (string) to aggregate functions (string). Returns ------- Series or DataFrame The return can be: * Series : when DataFrame.agg is called with a single function * DataFrame : when DataFrame.agg is called with several functions Return Series or DataFrame. Notes ----- `agg` is an alias for `aggregate`. Use the alias. Examples -------- >>> df = ks.DataFrame({'A': [1, 1, 2, 2], ... 'B': [1, 2, 3, 4], ... 'C': [0.362, 0.227, 1.267, -0.562]}, ... columns=['A', 'B', 'C']) >>> df A B C 0 1 1 0.362 1 1 2 0.227 2 2 3 1.267 3 2 4 -0.562 Different aggregations per column >>> aggregated = df.groupby('A').agg({'B': 'min', 'C': 'sum'}) >>> aggregated[['B', 'C']] # doctest: +NORMALIZE_WHITESPACE B C A 1 1 0.589 2 3 0.705 """ if not isinstance(func_or_funcs, dict) or \ not all(isinstance(key, str) and isinstance(value, str) for key, value in func_or_funcs.items()): raise ValueError("aggs must be a dict mapping from column name (string) to aggregate " "functions (string).") sdf = self._kdf._sdf groupkeys = self._groupkeys groupkey_cols = [s._scol.alias('__index_level_{}__'.format(i)) for i, s in enumerate(groupkeys)] reordered = [F.expr('{1}({0}) as {0}'.format(key, value)) for key, value in func_or_funcs.items()] sdf = sdf.groupby(*groupkey_cols).agg(*reordered) metadata = Metadata(data_columns=[key for key, _ in func_or_funcs.items()], index_map=[('__index_level_{}__'.format(i), s.name) for i, s in enumerate(groupkeys)]) return DataFrame(sdf, metadata)
kafka_source_df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "invoice-items") \
    .option("startingOffsets", "earliest") \
    .load()

avroSchema = open('schema/invoice-items', mode='r').read()

value_df = kafka_source_df.select(from_avro(col("value"), avroSchema).alias("value"))

rewards_df = value_df.filter("value.CustomerType == 'PRIME'") \
    .groupBy("value.CustomerCardNo") \
    .agg(sum("value.TotalValue").alias("TotalPurchase"),
         sum(expr("value.TotalValue * 0.2").cast("integer")).alias("AggregatedRewards"))

kafka_target_df = rewards_df.select(expr("CustomerCardNo as key"),
                                    to_json(struct("TotalPurchase", "AggregatedRewards")).alias("value"))

# kafka_target_df.show(truncate=False)

rewards_writer_query = kafka_target_df \
    .writeStream \
    .queryName("Rewards Writer") \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("topic", "customer-rewards") \
    .outputMode("update") \
    .option("checkpointLocation", "chk-point-dir") \
    .start()
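# Hedged debugging sketch (not part of the original pipeline): a streaming DataFrame cannot
# be inspected with .show(), which is why the line above is commented out. One option is to
# attach a temporary console sink to the same aggregate; the query name and checkpoint
# directory below are made-up placeholders.
debug_query = kafka_target_df \
    .writeStream \
    .queryName("Rewards Console Debug") \
    .format("console") \
    .outputMode("update") \
    .option("truncate", "false") \
    .option("checkpointLocation", "chk-point-dir-console") \
    .start()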
predictions = model.transform(test) window = Window.partitionBy(predictions['user_id']).orderBy( predictions['prediction'].desc()) test_pred_order = predictions.select( '*', rank().over(window).alias('rank')).filter(col('rank') <= 500) evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction") rmse = evaluator.evaluate(test_pred_order) # evaluate the model by computing the MAP on the validation data test_pred_list = test_pred_order.select( 'user_id', 'book_id').groupBy('user_id').agg(expr('collect_list(book_id) as books')) test_RDD = test_pred_list.join(test_true_list, 'user_id').rdd.map(lambda row: (row[1], row[2])) rankingMetrics = RankingMetrics(test_RDD) current_map = rankingMetrics.meanAveragePrecision print( '\nThe best baseline model select by RMSE = {} has {} latent factors and ' 'regularization = {} with maxIter = {} MAP = {}'.format( rmse, current_rank, reg, iteration, current_map)) """ # evaluate read model train_new = train.withColumn('rating',when(train.is_read == 0,float('nan')).otherwise(train.rating)) train_read = train_new.na.drop() train_unread = train.subtract(train_read)
def test_udf_plus1(self):
    scala('''val plus1 = udf { x: Int => x + 1 }; spark.udf.register("plus1", plus1)''')
    from pyspark.sql.functions import expr
    res = self.spark.createDataFrame(range(10), "int").select(expr('plus1(value) value')).collect()
    res = [x.value for x in res]
    self.assertListEqual(res, list(range(1, 11)))
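# Hedged companion sketch: the same pattern works for UDFs registered from Python; once a
# function is registered with spark.udf.register, expr()/selectExpr can call it by name in
# SQL text. The session variable `spark` and the name "plus1_py" are assumptions here.
from pyspark.sql.types import IntegerType

spark.udf.register("plus1_py", lambda x: x + 1, IntegerType())
df = spark.createDataFrame(range(10), "int")
df.selectExpr("plus1_py(value) AS value_plus_one").show()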
.format("memory").outputMode("complete")\ .start() # COMMAND ---------- from time import sleep for x in range(5): spark.sql("SELECT * FROM activity_counts").show() sleep(1) # COMMAND ---------- from pyspark.sql.functions import expr simpleTransform = streaming.withColumn("stairs", expr("gt like '%stairs%'"))\ .where("stairs")\ .where("gt is not null")\ .select("gt", "model", "arrival_time", "creation_time")\ .writeStream\ .queryName("simple_transform")\ .format("memory")\ .outputMode("append")\ .start() # COMMAND ---------- deviceModelStats = streaming.cube("gt", "model").avg()\ .drop("avg(Arrival_time)")\ .drop("avg(Creation_Time)")\
StructField("ORIGIN_COUNTRY_NAME", StringType(), True), StructField("count", LongType(), False, metadata={"hello": "world"})]) myRow = Row("Hello", None, 1) myDf = spark.createDataFrame([myRow],myManualSchema) myDf.show() # Select and SelectExpr # Selecting single column df.select("DEST_COUNTRY_NAME").show(2) # Selecting multiple column df.select("DEST_COUNTRY_NAME","ORIGIN_COUNTRY_NAME").show(2) df.select( expr("DEST_COUNTRY_NAME"), col("DEST_COUNTRY_NAME"), column("DEST_COUNTRY_NAME") ).show(2) # Using Alias -- select DEST_COUNTRY_NAME as destination from table df.select(expr("DEST_COUNTRY_NAME As destination")).show(2) df.select(expr ("DEST_COUNTRY_NAME As destination").alias("DEST_COUNTRY_NAME")).show(2) df.selectExpr("DEST_COUNTRY_NAME As destination","DEST_COUNTRY_NAME").show(2) # SelectExpr Example -- Comparing the column value return boolean df.selectExpr( "*","(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME ) as withinCountry" ).show(2) df.selectExpr(
training_df, validation_df, testing_df = review.randomSplit([0.6, 0.3, 0.1], seed=0) [training_df.count(), validation_df.count(), testing_df.count()] lambda_par = 0.02 alpha_par = 0.3 en_lr = LogisticRegression().\ setLabelCol('sentiment').\ setFeaturesCol('tfidf').\ setRegParam(lambda_par).\ setMaxIter(100).\ setElasticNetParam(alpha_par) en_lr_pipeline = Pipeline(stages=[idf_pipeline, en_lr]).fit(review) en_lr_pipeline.transform(review).select( fn.avg(fn.expr('float(prediction = sentiment)'))).show() en_weights = en_lr_pipeline.stages[-1].coefficients.toArray() en_coeffs_df = pd.DataFrame({'word': vocabulary, 'weight': en_weights}) #en_coeffs_df.sort_values('weight').head(15) #en_coeffs_df.sort_values('weight', ascending=False).head(15) #en_coeffs_df.query('weight == 0.0').shape en_coeffs_df.query('weight == 0.0').shape[0] / en_coeffs_df.shape[0] from pyspark.ml.tuning import ParamGridBuilder en_lr_estimator = Pipeline(stages=[idf_pipeline, en_lr]) grid = ParamGridBuilder().\ addGrid(en_lr.regParam, [0., 0.01, 0.02]).\
def get_aggregations(year, month, in_category, all=False, flipsign=True): """ This function builds the aggregations as needed for that year and month. It also produces histograms in a specific category. :param year: input year :param month: input month :param in_category: input category for histogram :param all: for histogram only past 10 months :param flipsign: flipsign is used for reversing the bars. required for only fewer categories like pay. :return: returns aggregations and histograms """ # getting the main master file path all_transactions_path = config["target"]["all_master"] # reading the category flags category_flags_path = config["lookup"]["category_flags"] # reading all the descriptions desc_flags_path = config["lookup"]["description_flags"] # extracting date values as needed start_date = str(year) + month + '01' _, end_day = calendar.monthrange(year, int(month)) end_date = str(year) + month + str(end_day) # reading in the latest all master file latest_file = read_latest_file_from_hdfs(spark, all_transactions_path, match_filename='20') all_transactions = spark.read.csv(latest_file, sep=',', header=True) # if all = true then it is for histogram or it is only monthly expenditure if not all: # filtering that respective months transations monthly_transactions = all_transactions.filter( f"trndt between {start_date} and {end_date}") category_flags = spark.read.csv(category_flags_path, sep=',', header=True) desc_flags = spark.read.csv(desc_flags_path, sep=',', header=True) desc_only_flags = desc_flags.select('DESCRIPTIONS') Flag_df = [str(i.DESCRIPTIONS) for i in desc_only_flags.collect()] # converting it to pandas dataframe pandas_monthly_transactions = monthly_transactions.toPandas() # rewwrite the new descriptions to the transactions for i in Flag_df: pandas_monthly_transactions.loc[ pandas_monthly_transactions['Description'].str. 
contains(i, case=False), 'new_Description'] = i headers = [ "Transaction_date", "Description", "Amount", "trndt", "act_type", "new_Description" ] schema = StructType( [StructField(col, StringType()) for col in headers]) # converting back to spark dataframe transactions = spark.createDataFrame(pandas_monthly_transactions, schema=schema) t = transactions.alias('t') f = desc_flags.alias('f') # flags the transactions transactions = t.join(f, t.new_Description == f.DESCRIPTIONS, "left_outer").drop("DESCRIPTIONS") print('No of null amounts are', transactions.filter("Amount is null").count()) transactions = transactions.filter("Amount is not null") # replaces commas in the amount field transactions = transactions.withColumn("Amount", replace_comma("Amount")) transactions.show(20, False) # gives the number of transactions per account type in a month print('number of transactions per account type in a month') transactions.groupby('act_type').agg( F.count("act_type").alias("total_act_type_transactions")).show() # gives the latest transaction date per each account print('latest transaction date per each account') transactions.groupby('act_type').agg( F.max("trndt").alias("max_trndt")).sort("max_trndt").show() # gives the new transaction list print('brand new transactions') null_flags = transactions.filter("FLAG is null") null_flags.orderBy(F.asc("trndt")).show(100, False) print(f'checking for transactions under category {in_category}') filter_flags = transactions.filter(f"FLAG='{in_category}'") filter_flags.orderBy(F.asc("trndt")).show(100, False) grouped_df = transactions.groupby('FLAG').agg( F.sum("Amount").alias("total_amt")) print('grouped_df per category') grouped_df = grouped_df.join(category_flags, "FLAG", "left_outer")\ .orderBy(F.desc("total_amt")) grouped_df = grouped_df.filter("FLAG<>'PAY'") grouped_df.show(200, False) y = grouped_df.select("total_amt").rdd.map( lambda row: row[0]).collect() print(y) print('incoming', sum(i for i in y if i > 0)) print('outgoing', sum(i for i in y if i < 0)) # all_transactions = all_transactions.withColumn("year_month", F.substring(F.col("trndt"), 1, 6)) # all_transactions.groupBy("year_month","") # all_transactions.show(200, False) else: category_flags = spark.read.csv(category_flags_path, sep=',', header=True) desc_flags = spark.read.csv(desc_flags_path, sep=',', header=True) category_flags.show(100, False) desc_only_flags = desc_flags.select('DESCRIPTIONS') Flag_df = [str(i.DESCRIPTIONS) for i in desc_only_flags.collect()] pandas_monthly_transactions = all_transactions.toPandas() for i in Flag_df: pandas_monthly_transactions.loc[ pandas_monthly_transactions['Description'].str. 
contains(i, case=False), 'new_Description'] = i headers = [ "Transaction_date", "Description", "Amount", "trndt", "act_type", "new_Description" ] schema = StructType( [StructField(col, StringType()) for col in headers]) transactions = spark.createDataFrame(pandas_monthly_transactions, schema=schema) t = transactions.alias('t') f = desc_flags.alias('f') transactions = t.join(f, t.new_Description == f.DESCRIPTIONS, "left_outer").drop("DESCRIPTIONS") print('No of null amounts are', transactions.filter("Amount is null").count()) transactions = transactions.filter("Amount is not null") transactions = transactions.withColumn("Amount", replace_comma("Amount")) transactions.show(20, False) transactions.groupby('act_type').agg( F.count("act_type").alias("total_act_type_transactions")).show() transactions.groupby('act_type').agg( F.max("trndt").alias("max_trndt")).sort("max_trndt").show() transactions = transactions.withColumn( "trn_month", F.expr("concat(substr(trndt, 3, 2),'-',substr(trndt, 5, 2))")) # F.concat(F.substring(F.col("trndt"), 3, 4), "-", F.substring(F.col("trndt"), 4, 5))) null_flags = transactions.filter("FLAG is null") null_flags.orderBy(F.asc("trndt")).show(100, False) filter_flags = transactions.filter(f"FLAG='{in_category}'") # filter_flags = transactions.filter("act_type='bofacredit'") filter_flags.orderBy(F.asc("trndt")).show(100, False) # filter_flags_2 = transactions.filter("FLAG='GR'") # # filter_flags = transactions.filter("act_type='bofacredit'") # filter_flags_2.orderBy(F.asc("trndt")).show(100, False) grouped_df = transactions.groupby('FLAG', 'trn_month').agg( F.sum("Amount").alias("total_amt")) print('grouped_df') grouped_df = grouped_df.join(category_flags, "FLAG", "left_outer") \ .orderBy(F.desc("trn_month")) grouped_df.show(200, False) grouped_df = grouped_df.filter(f"FLAG='{in_category}'").orderBy( F.asc("trn_month")) grouped_df = grouped_df.filter("FLAG<>'PAY'") print_category = grouped_df.select("CATEGORY").filter( f"FLAG='{in_category}'").distinct().rdd.map( lambda row: row[0]).collect() grouped_df.orderBy(F.desc("trn_month")).show() if flipsign: grouped_df = grouped_df.withColumn("total_amt", flip_sign("total_amt")) # plotting the histograms x = grouped_df.select("trn_month").rdd.map( lambda row: row[0]).collect() y = grouped_df.select("total_amt").rdd.map( lambda row: row[0]).collect() x = x[-10:] y = y[-10:] avg = Average(y) print(x) print(y) print(avg) plt.bar(x, y, align='center') plt.ylabel(f'{print_category}') plt.xlabel('[months]') plt.title(f'avg in 10 months is {avg}') for i in range(len(y)): plt.hlines(y[i], 0, x[i]) # Here you are drawing the horizontal lines plt.show() # all_transactions = all_transactions.withColumn("year_month", F.substring(F.col("trndt"), 1, 6)) # all_transactions.groupBy("year_month","") # all_transactions.show(200, False) pass
StructField("TotalValue", DoubleType()) ]))), ]) kafka_df = spark.readStream \ .format("kafka") \ .option("kafka.bootstrap.servers", "localhost:9092") \ .option("subscribe", "invoices") \ .option("startingOffsets", "earliest") \ .load() value_df = kafka_df.select( from_json(col("value").cast("string"), schema).alias("value")) notification_df = value_df.select("value.InvoiceNumber", "value.CustomerCardNo", "value.TotalAmount") \ .withColumn("EarnedLoyaltyPoints", expr("TotalAmount * 0.2")) # kafka_target_df = notification_df.selectExpr("InvoiceNumber as key", "to_json(struct(*)) as value") kafka_target_df = notification_df.selectExpr( "InvoiceNumber as key", """to_json(named_struct( 'CustomerCardNo', CustomerCardNo, 'TotalAmount', TotalAmount, 'EarnedLoyaltyPoints', TotalAmount * 0.2)) as value""" ) ''' notification_writer_query = kafkaTarget_df.writeStream \ .format("console") \ .outputMode("append") \ .option("truncate", "false") \ .option("checkpointLocation", "chk-point-dir") \
df.select(sum("Quantity")).show() # 5176450 # COMMAND ---------- from pyspark.sql.functions import sumDistinct df.select(sumDistinct("Quantity")).show() # 29310 # COMMAND ---------- from pyspark.sql.functions import sum, count, avg, expr df.select( count("Quantity").alias("total_transactions"), sum("Quantity").alias("total_purchases"), avg("Quantity").alias("avg_purchases"), expr("mean(Quantity)").alias("mean_purchases"))\ .selectExpr( "total_purchases/total_transactions", "avg_purchases", "mean_purchases").show() # COMMAND ---------- from pyspark.sql.functions import var_pop, stddev_pop from pyspark.sql.functions import var_samp, stddev_samp df.select(var_pop("Quantity"), var_samp("Quantity"), stddev_pop("Quantity"), stddev_samp("Quantity")).show() # COMMAND ---------- from pyspark.sql.functions import skewness, kurtosis
def build_attribute_matrix( spark, sub=0, book_df='hdfs:/user/yw2115/goodreads_books.json.gz', author_df='hdfs:/user/yw2115/goodreads_book_authors.json.gz', genre_df='hdfs:/user/yw2115/gooreads_book_genres_initial.json.gz', records_path="hdfs:/user/xc1511/onepct_int_001.parquet"): ####Create Attribute Matrix for Genres#### ''' 10 categories: children| comics, graphic| fantasy, paranormal| fiction| history, historical fiction, biography, mystery, thriller, crime| non-fiction| poetry| romance| young-adult ''' book_df = spark.read.json('hdfs:/user/yw2115/goodreads_books.json.gz') author_df = spark.read.json( 'hdfs:/user/yw2115/goodreads_book_authors.json.gz') genre_df = spark.read.json( 'hdfs:/user/yw2115/gooreads_book_genres_initial.json.gz') genre_at = genre_df.select('book_id',f.expr('genres.children'),f.expr('genres.`comics, graphic`'),\ f.expr('genres.`fantasy, paranormal`'),f.expr('genres.fiction'), \ f.expr('genres.`history, historical fiction, biography`'), f.expr('genres.`mystery, thriller, crime`'),\ f.expr('genres.`non-fiction`'),f.expr('genres.poetry'),f.expr('genres.romance'),f.expr('genres.`young-adult`')) #change col names new_col = [ 'book_id', 'g1', 'g2', 'g3', 'g4', 'g5', 'g6', 'g7', 'g8', 'g9', 'g10' ] genre_at = genre_at.toDF(*new_col) #genre_at.show(3) #0/1 Encoding #change Null value to 0 (meaning the book is not in this genre) # and other int to 1 (meaning the book in this genre) for i in range(1, len(new_col)): col_name = new_col[i] genre_at = genre_at.withColumn( col_name, when(genre_at[col_name].isNotNull(), 1).otherwise(0)) #genre_at.show(10) #subsample 1% data if sub == 0.01: records_pq = spark.read.parquet(records_path) records_pq.createOrReplaceTempView('records_pq') book_pq = spark.sql('SELECT DISTINCT book_id FROM records_pq') book_pq.createOrReplaceTempView('book_pq') book_df.createOrReplaceTempView('book_df') genre_at.createOrReplaceTempView('genre_at') genre_at = spark.sql('SELECT genre_at.* FROM genre_at JOIN book_pq ON \ genre_at.book_id = book_pq.book_id') book_df = spark.sql('SELECT book_df.* FROM book_df JOIN book_pq ON \ book_df.book_id = book_pq.book_id') ####Add Author Rating as Additional Attribute#### #Select the first author (there are books with more than 1 author, first author is the main author) book_df = book_df.select('book_id', f.expr('authors[0]').alias('a')) #Add author_id book_df = book_df.select('book_id', f.expr('a.author_id')) #Join book_df and author_df book_df.createOrReplaceTempView('book_df') author_df.createOrReplaceTempView('author_df') author_at = spark.sql('SELECT book_df.book_id, book_df.author_id,\ author_df.average_rating FROM book_df JOIN author_df ON \ book_df.author_id=author_df.author_id') #author_at.show(10) ####Join The Two Matrix to Get Book Attribute Matrix#### genre_at.createOrReplaceTempView('genre_at') author_at.createOrReplaceTempView('author_at') book_at = spark.sql('SELECT genre_at.book_id, genre_at.g1, genre_at.g2,\ genre_at.g3, genre_at.g4, genre_at.g5, genre_at.g6, genre_at.g7, genre_at.g8, \ genre_at.g9, genre_at.g10, author_at.average_rating AS author_rating \ FROM genre_at JOIN author_at ON genre_at.book_id=author_at.book_id') book_at = book_at.withColumn('author_rating', book_at['author_rating'].cast('float')) #return the I*N attribute matrix for book #I is number of items (books) #N = 11 is number of attribute features of the books #add a features col vecAssembler = VectorAssembler(inputCols=[ 'g1', 'g2', 'g3', 'g4', 'g5', 'g6', 'g7', 'g8', 'g9', 'g10', 'author_rating' ], outputCol="features") book_at = 
vecAssembler.transform(book_at) #note here 'features' is a SparseVector type due to spark memory default #book_at.show(3) return book_at
def main(spark, log_comp=False, drop_low=False, drop_thr=0): ''' Parameters ---------- spark : SparkSession object train_path : string, path to the training parquet file to load val_path : string, path to the validation parquet file to load test_path : string, path to the validation parquet file to load ''' ## Load in datasets train_path = 'hdfs:/user/bm106/pub/project/cf_train.parquet' val_path = 'hdfs:/user/bm106/pub/project/cf_validation.parquet' test_path = 'hdfs:/user/bm106/pub/project/cf_test.parquet' train = spark.read.parquet(train_path) val = spark.read.parquet(val_path) test = spark.read.parquet(test_path) ## Downsample the data # Pick out user list in training set user_train = set(row['user_id'] for row in train.select('user_id').distinct().collect()) # Pick out user list in validation set user_val = set(row['user_id'] for row in val.select('user_id').distinct().collect()) # Get the previous 1M users user_prev = list(user_train - user_val) # Random sampling to get 20% k = int(0.2 * len(user_prev)) user_prev_filtered = random.sample(user_prev, k) train = train.where(train.user_id.isin(user_prev_filtered + list(user_val))) ## Create StringIndexer indexer_user = StringIndexer(inputCol="user_id", outputCol="user_id_indexed", handleInvalid='skip') indexer_user_model = indexer_user.fit(train) indexer_track = StringIndexer(inputCol="track_id", outputCol="track_id_indexed", handleInvalid='skip') indexer_track_model = indexer_track.fit(train) train = indexer_user_model.transform(train) train = indexer_track_model.transform(train) val = indexer_user_model.transform(val) val = indexer_track_model.transform(val) test = indexer_user_model.transform(test) test = indexer_track_model.transform(test) ## ALS model rank_ = [5, 10, 20] regParam_ = [0.1, 1, 10] alpha_ = [1, 5, 10] param_grid = it.product(rank_, regParam_, alpha_) ## Pick out users from validation set user_id = val.select('user_id_indexed').distinct() true_label = val.select('user_id_indexed', 'track_id_indexed')\ .groupBy('user_id_indexed')\ .agg(expr('collect_list(track_id_indexed) as true_item')) ## Log-Compression ## count -> log(1+count) if log_comp == True: train = train.select('*', F.log1p('count').alias('count_log1p')) val = val.select('*', F.log1p('count').alias('count_log1p')) rateCol = "count_log1p" else: rateCol = "count" ## Drop interactions that have counts lower than specified threhold if drop_low == True: train = train.filter(train['count'] > drop_thr) val = val.filter(val['count'] > drop_thr) for i in param_grid: print('Start Training for {}'.format(i)) als = ALS(rank = i[0], maxIter=10, regParam=i[1], userCol="user_id_indexed", itemCol="track_id_indexed", ratingCol=rateCol, implicitPrefs=True, \ alpha=i[2], nonnegative=True, coldStartStrategy="drop") model = als.fit(train) print('Finish Training for {}'.format(i)) # Make top 500 recommendations for users in validation test res = model.recommendForUserSubset(user_id, 500) pred_label = res.select('user_id_indexed', 'recommendations.track_id_indexed') pred_true_rdd = pred_label.join(F.broadcast(true_label), 'user_id_indexed', 'inner') \ .rdd \ .map(lambda row: (row[1], row[2])) print('Start Evaluating for {}'.format(i)) metrics = RankingMetrics(pred_true_rdd) map_ = metrics.meanAveragePrecision ndcg = metrics.ndcgAt(500) mpa = metrics.precisionAt(500) print(i, 'map score: ', map_, 'ndcg score: ', ndcg, 'map score: ', mpa) pass
def prepare_df(df): num_rows = df.count() # Expand dates. df = expand_date(df) df = df \ .withColumn('Open', df.Open != '0') \ .withColumn('Promo', df.Promo != '0') \ .withColumn('StateHoliday', df.StateHoliday != '0') \ .withColumn('SchoolHoliday', df.SchoolHoliday != '0') # Merge in store information. store = store_csv.join(store_states_csv, 'Store') df = df.join(store, 'Store') # Merge in Google Trend information. google_trend_all = prepare_google_trend() df = df.join(google_trend_all, ['State', 'Year', 'Week']).select(df['*'], google_trend_all.trend) # Merge in Google Trend for whole Germany. google_trend_de = google_trend_all[google_trend_all.file == 'Rossmann_DE'] df = df.join(google_trend_de, ['Year', 'Week']).select( df['*'], google_trend_all.trend.alias('trend_de')) # Merge in weather. weather = weather_csv.join(state_names_csv, weather_csv.file == state_names_csv.StateName) df = df.join(weather, ['State', 'Date']) # Fix null values. df = df \ .withColumn('CompetitionOpenSinceYear', F.coalesce(df.CompetitionOpenSinceYear, F.lit(1900))) \ .withColumn('CompetitionOpenSinceMonth', F.coalesce(df.CompetitionOpenSinceMonth, F.lit(1))) \ .withColumn('Promo2SinceYear', F.coalesce(df.Promo2SinceYear, F.lit(1900))) \ .withColumn('Promo2SinceWeek', F.coalesce(df.Promo2SinceWeek, F.lit(1))) # Days & months competition was open, cap to 2 years. df = df.withColumn( 'CompetitionOpenSince', F.to_date( F.format_string('%s-%s-15', df.CompetitionOpenSinceYear, df.CompetitionOpenSinceMonth))) df = df.withColumn( 'CompetitionDaysOpen', F.when( df.CompetitionOpenSinceYear > 1900, F.greatest( F.lit(0), F.least(F.lit(360 * 2), F.datediff(df.Date, df.CompetitionOpenSince)))).otherwise(0)) df = df.withColumn('CompetitionMonthsOpen', (df.CompetitionDaysOpen / 30).cast(T.IntegerType())) # Days & weeks of promotion, cap to 25 weeks. df = df.withColumn( 'Promo2Since', F.expr( 'date_add(format_string("%s-01-01", Promo2SinceYear), (Promo2SinceWeek - 1) * 7)' )) df = df.withColumn( 'Promo2Days', F.when( df.Promo2SinceYear > 1900, F.greatest( F.lit(0), F.least(F.lit(25 * 7), F.datediff(df.Date, df.Promo2Since)))).otherwise(0)) df = df.withColumn('Promo2Weeks', (df.Promo2Days / 7).cast(T.IntegerType())) # Check that we did not lose any rows through inner joins. assert num_rows == df.count(), 'lost rows in joins' return df
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, avg, col
import pyspark.sql.functions as SQLFunctions
import os
import time

os.environ["PYSPARK_PYTHON"] = '/usr/bin/python3'
spark = SparkSession.builder.getOrCreate()

# Load data from a CSV
filePath = "/home/varun/PycharmProjects/BatchProcessingAirQuality/weatherAUS.csv"
df = spark.read.format("CSV").option("inferSchema", True).option("header", True).load(filePath)
df = df.withColumn("Date", expr("to_date(Date)"))
print(df.show(5))
# time.sleep(10)

# Average rainfall overall
avgRain = df.filter(SQLFunctions.col('Date') >= '2008-12-01').select(
    SQLFunctions.round(avg('Rainfall'), 2).alias("Avg. Rainfall")).show()
# time.sleep(10)

# Min and Max Temperatures where MaxTemp >= 10
Temp = df.filter(SQLFunctions.col('MaxTemp') >= '10').select(
    'Date', 'MinTemp', 'MaxTemp').dropDuplicates(subset=['MaxTemp']).show(5)
# time.sleep(10)

# Average Temperature of the day where Wind Direction is North
meanCols = [col('MaxTemp'), col('MinTemp')]
avgCol = sum(x for x in meanCols) / len(meanCols)
avgTempOfDay = df.filter(SQLFunctions.col('WindGustDir') == 'N').select(
FROM tC
""").show()

# %% [markdown]
# ## DataFrames and Spark SQL Common Relational Operators

# %%
delays_path = os.path.join(DATA_DIRECTORY, "flights", "departuredelays.csv")
airports_path = os.path.join(DATA_DIRECTORY, "flights", "airport-codes-na.txt")

airports = spark.read.options(header='true', inferSchema='true', sep='\t').csv(airports_path)
airports.createOrReplaceTempView("airports_na")

delays = spark.read.options(header='true').csv(delays_path)
delays = (delays.withColumn("delay", F.expr("CAST(delay as INT) as delay")).withColumn(
    "distance", F.expr("CAST(distance as INT) as distance")))
delays.createOrReplaceTempView("departureDelays")

# Create temporary small table
foo = delays.where(
    F.expr("""
        origin == 'SEA' AND
        destination == 'SFO' AND
        date like '01010%' AND
        delay > 0
    """))
foo.createOrReplaceTempView("foo")
gq_revised = gq_revised.filter((gq_revised.MAFID != "MAFID"))

# Clean hu data
hu_revised = pp10_hu_edited.select("_c0", "_c1", "_c2", "_c21", "_c22")
hu_revised = hu_revised.withColumnRenamed(
    "_c0", "COLBLKST").withColumnRenamed("_c1", "LCO").withColumnRenamed(
        "_c2", "MAFID").withColumnRenamed("_c21", "EDIT_SEQ").withColumnRenamed(
            "_c22", "FINAL_POP")
hu_revised = hu_revised.filter((hu_revised.MAFID != "MAFID"))

# Clean gq file by removing extra digits not needed
gq_revised = gq_revised.withColumn(
    "mafid_temp", sf.expr("substring(MAFID, 1, length(MAFID)-2)"))
gq_revised = gq_revised.withColumn(
    "edit_seq2", sf.expr("substring(EDIT_SEQ, 1, length(EDIT_SEQ)-2)"))
gq_revised = gq_revised.drop("MAFID", "GQTYPE", "PEGQTYPE", "FGQTYPE",
                             "PP_GQ_MEDIAN_AGE", "EDIT_SEQ")
gq_revised = gq_revised.withColumnRenamed("mafid_temp", "MAFID").withColumnRenamed(
    "edit_seq2", "EDIT_SEQ")

# Perform union of gq and hu
gq_hu_union = hu_revised.union(gq_revised)

# Read and clean ops file
op_revised = spark.read.csv(
    "s3://uscb-decennial-ite-das/2010/cef/pp10_op.csv")
op_revised = op_revised.select("_c0", "_c16", "_c17", "_c55", "_c68")
def as_of_join( entity_df: DataFrame, entity_event_timestamp_column: str, feature_table_df: DataFrame, feature_table: FeatureTable, ) -> DataFrame: """Perform an as of join between entity and feature table, given a maximum age tolerance. Join conditions: 1. Entity primary key(s) value matches. 2. Feature event timestamp is the closest match possible to the entity event timestamp, but must not be more recent than the entity event timestamp, and the difference must not be greater than max_age, unless max_age is not specified. 3. If more than one feature table rows satisfy condition 1 and 2, feature row with the most recent created timestamp will be chosen. 4. If none of the above conditions are satisfied, the feature rows will have null values. Args: entity_df (DataFrame): Spark dataframe representing the entities, to be joined with the feature tables. entity_event_timestamp_column (str): Column name in entity_df which represents event timestamp. feature_table_df (Dataframe): Spark dataframe representing the feature table. feature_table (FeatureTable): Feature table specification, which provide information on how the join should be performed, such as the entity primary keys and max age. Returns: DataFrame: Join result, which contains all the original columns from entity_df, as well as all the features specified in feature_table, where the feature columns will be prefixed with feature table name. Example: >>> entity_df.show() +------+-------------------+ |entity| event_timestamp| +------+-------------------+ | 1001|2020-09-02 00:00:00| +------+-------------------+ >>> feature_table_1_df.show() +------+-------+-------------------+-------------------+ |entity|feature| event_timestamp| created_timestamp| +------+-------+-------------------+-------------------+ | 10| 200|2020-09-01 00:00:00|2020-09-02 00:00:00| +------+-------+-------------------+-------------------+ | 10| 400|2020-09-01 00:00:00|2020-09-01 00:00:00| +------+-------+-------------------+-------------------+ >>> feature_table_1.max_age None >>> feature_table_1.name 'table1' >>> df = as_of_join(entity_df, "event_timestamp", feature_table_1_df, feature_table_1) >>> df.show() +------+-------------------+---------------+ |entity| event_timestamp|table1__feature| +------+-------------------+---------------+ | 1001|2020-09-02 00:00:00| 200| +------+-------------------+---------------+ >>> feature_table_2.df.show() +------+-------+-------------------+-------------------+ |entity|feature| event_timestamp| created_timestamp| +------+-------+-------------------+-------------------+ | 10| 200|2020-09-01 00:00:00|2020-09-02 00:00:00| +------+-------+-------------------+-------------------+ | 10| 400|2020-09-01 00:00:00|2020-09-01 00:00:00| +------+-------+-------------------+-------------------+ >>> feature_table_2.max_age 43200 >>> feature_table_2.name 'table2' >>> df = as_of_join(entity_df, "event_timestamp", feature_table_2_df, feature_table_2) >>> df.show() +------+-------------------+---------------+ |entity| event_timestamp|table2__feature| +------+-------------------+---------------+ | 1001|2020-09-02 00:00:00| null| +------+-------------------+---------------+ """ entity_with_id = entity_df.withColumn("_row_nr", monotonically_increasing_id()) feature_event_timestamp_column_with_prefix = ( f"{feature_table.name}__{EVENT_TIMESTAMP_ALIAS}") feature_created_timestamp_column_with_prefix = ( f"{feature_table.name}__{CREATED_TIMESTAMP_ALIAS}") projection = [ col(col_name).alias(f"{feature_table.name}__{col_name}") for col_name in 
feature_table_df.columns ] aliased_feature_table_df = feature_table_df.select(projection) join_cond = ( entity_with_id[entity_event_timestamp_column] >= aliased_feature_table_df[feature_event_timestamp_column_with_prefix]) if feature_table.max_age: join_cond = join_cond & ( aliased_feature_table_df[feature_event_timestamp_column_with_prefix] >= entity_with_id[entity_event_timestamp_column] - expr(f"INTERVAL {feature_table.max_age} seconds")) for key in feature_table.entity_names: join_cond = join_cond & ( entity_with_id[key] == aliased_feature_table_df[f"{feature_table.name}__{key}"]) conditional_join = entity_with_id.join(aliased_feature_table_df, join_cond, "leftOuter") for key in feature_table.entity_names: conditional_join = conditional_join.drop( aliased_feature_table_df[f"{feature_table.name}__{key}"]) window = Window.partitionBy( "_row_nr", *feature_table.entity_names).orderBy( col(feature_event_timestamp_column_with_prefix).desc(), col(feature_created_timestamp_column_with_prefix).desc(), ) filter_most_recent_feature_timestamp = conditional_join.withColumn( "_rank", row_number().over(window)).filter(col("_rank") == 1) return filter_most_recent_feature_timestamp.select(entity_df.columns + [ f"{feature_table.name}__{feature}" for feature in feature_table.feature_names ])
print(test_op)

#%%
"""
check if col and expr result in the same output
"""
bankDf = (spark.read.option('header', 'true').csv(dataset_folder + '/bank.csv'))
print(bankDf.show(5))

#%%
from pyspark.sql.functions import expr, col

data1 = expr('(((balance + 5)* 100) > loan)')
print(data1)

data2 = (((col('balance') + 5) * 100) > col('loan'))
print(data2)

print(bankDf.columns)

#%%
"""
Create a dataframe by creating Row objects
"""
from pyspark.sql import Row
from pyspark.sql.types import StructField, StructType, StringType, LongType

myManualSchema = StructType([
    StructField("some", StringType(), True),
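# Hedged follow-up sketch: a quick way to see that expr() and the col()-based arithmetic
# build the same predicate is to compare the plans of two equivalent filters. This assumes
# bankDf really exposes `balance` and `loan` columns, as in the snippet above.
filtered_via_expr = bankDf.where(data1)
filtered_via_col = bankDf.where(data2)
filtered_via_expr.explain()
filtered_via_col.explain()
# The two physical plans should show the same filter condition.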
def run_transactions(spark, config, cycle_date, account_type): """ This function writes the monthly csv files to its respective master :param spark: :param config: it has all the source and target file paths :param cycle_date: YYYYMM: cycle year and month that is being executed :param account_type: type of account that is being executed :return: writes the monthly transaction to its master csv file """ if account_type == 'citi': # extracts year from yyyymm cycle_year = '{}'.format(cycle_date[0:4]) # extracts month from yyyymm cycle_month_num = '{}'.format(cycle_date[4:6]) # gets the MON name from the month number cycle_month = calendar.month_name[int(cycle_month_num)] previous_year = int(cycle_year) - 1 # source path of citi csv files src_path = config["source"][ "citi"] + 'tabula-' + cycle_year + ' ' + cycle_month + '*.csv' # master path of citi data master_path = config["target"]["citi_master"] + cycle_date # source headers of citi headers = ["transdate", "Posting_date", "Description", "Amount"] src_df = spark.read.csv(src_path, sep=',') new_df = src_df.toDF(*headers).filter('Amount is not null') # handling Januray month transactions where year is missing in the trn date. if cycle_month_num == '01': new_df = new_df.withColumn( "transdate", F.expr( f"case when transdate like '%Dec%' then concat(transdate,'-',{previous_year})" f" else concat(transdate,'-',{cycle_year}) end")) else: new_df = new_df.withColumn( "transdate", F.expr(f"concat(transdate,'-',{cycle_year})")) src_df = convert_date_format(new_df, 'transdate', '%d-%b-%Y', '%Y%m%d') # writing it to master new_df = write_src_csv_to_master(src_df, master_path, account_type) elif account_type == 'discover': # src path of discover csv statements src_path = config["source"][ "discover"] + 'Discover-Statement-' + cycle_date + '*.csv' # master file path of discover master_path = config["target"]["discover_master"] + cycle_date src_df = spark.read.csv(src_path, header=True, sep=',') columns = [c.replace('. 
', '_') for c in src_df.columns] src_df = src_df.toDF(*columns) # convert date format as needed src_df = convert_date_format(src_df, 'Trans_Date', '%m/%d/%Y', '%Y%m%d') new_df = write_src_csv_to_master(src_df, master_path, account_type) elif account_type == 'bofachk': # date formats as needed cycle_date_input = '{}-{}'.format(cycle_date[0:4], cycle_date[4:6]) trg_cycle_date = '{}{}'.format(cycle_date[0:4], cycle_date[4:6]) # src path for bofa chk files src_path = config["source"][ "bofa_checking"] + 'tabula-eStmt_' + cycle_date_input + '*.csv' # master path for bofa chk master_path = config["target"]["bofa_chk_master"] + trg_cycle_date src_df = spark.read.csv(src_path, header=True, sep=',').filter('Amount is not null') src_df = convert_date_format(src_df, 'Date', '%m/%d/%y', '%Y%m%d') new_df = write_src_csv_to_master(src_df, master_path, account_type) elif account_type == 'bofacredit': # extracting dates as needed cycle_year = '{}'.format(cycle_date[0:4]) cycle_month = '{}'.format(cycle_date[4:6]) previous_year = int(cycle_year) - 1 # cycle_date_input = '{}-{}'.format(cycle_date[0:4], cycle_date[4:6]) # trg_cycle_date = '{}{}'.format(cycle_date[0:4], cycle_date[4:6]) # # src = config["source"]["bofa_credit"] + 'eStmt_' + cycle_date_input + '-15.pdf' # master = config["target"]["bofa_cc_master"] + trg_cycle_date # # new_df = write_bofa_cc_to_master(spark, src, cycle_date) # # new_df = convert_date_format(new_df, 'Transaction_date', '%m/%d/%Y', '%Y%m%d') # # new_df = new_df.withColumn('act_type', F.lit(account_type)) # # new_df.coalesce(1).write.format("csv").mode("overwrite").save(master, header="true") headers = ["Transaction_date", "Posting_date", "Description", "Amount"] cycle_date_input = '{}-{}'.format(cycle_date[0:4], cycle_date[4:6]) trg_cycle_date = '{}{}'.format(cycle_date[0:4], cycle_date[4:6]) src_path = config["source"][ "bofa_credit"] + 'tabula-eStmt_' + cycle_date_input + '*.csv' master_path = config["target"]["bofa_cc_master"] + trg_cycle_date src_df = spark.read.csv(src_path, header=False, sep=',') src_df = src_df.toDF(*headers) # handling Januray month transactions where year is missing in the trn date. if cycle_month == '01': src_df = src_df.withColumn( "Transaction_date", F.expr( f"case when Transaction_date like '%Dec%' then concat(Transaction_date,'-',{previous_year})" f" else concat(Transaction_date,'-',{cycle_year}) end")) else: src_df = src_df.withColumn( "Transaction_date", F.expr(f"concat(Transaction_date,'-',{cycle_year})")) src_df = src_df.filter('AMOUNT is not null') src_df = convert_date_format(src_df, 'Transaction_date', '%d-%b-%Y', '%Y%m%d') new_df = write_src_csv_to_master(src_df, master_path, account_type) elif account_type == 'chase': # extracting dates as needed cycle_year = cycle_date[0:4] cycle_month = cycle_date[4:6] previous_year = int(cycle_year) - 1 trg_cycle_date = '{}{}'.format(cycle_date[0:4], cycle_date[4:6]) src_path = config["source"]["chase"] master = config["target"]["chase_master"] + trg_cycle_date headers = ["date", "description", "amount", "balance"] # gets the file with the match filename src_file_name = get_file_starting_with(spark, src_path, match_filename=cycle_date) src_path = os.path.join(src_path, src_file_name) src_df = spark.read.csv(src_path, header=True, sep=',').filter('AMOUNT is not null') new_df = src_df.toDF(*headers) #new_df = write_chase_chk_pdf_to_master(spark, src, cycle_date) # handling Januray month transactions where year is missing in the trn date. 
if cycle_month == '01': new_df = new_df.withColumn( "date", F.expr( f"case when date like '%Dec%' then concat(date,'-',{previous_year})" f" else concat(date,'-',{cycle_year}) end")) else: new_df = new_df.withColumn( "date", F.expr(f"concat(date,'-',{cycle_year})")) new_df = convert_date_format(new_df, 'DATE', '%d-%b-%Y', '%Y%m%d') new_df = new_df.withColumn('act_type', F.lit(account_type)) new_df.coalesce(1).write.format("csv").mode("overwrite").save( master, header="true") else: print( 'Please put in the right account_type: from citi discover bofachk bofacredit chase' ) new_df.show(200, False) pass
# this creates a temporary streaming view based on the streaming dataframe
# it can later be queried with spark.sql, we will cover that in the next section
vehicleCheckinStreamingDF.withColumn("value", from_json("value", vehicleCheckinSchema))\
    .select(col('value.*')) \
    .createOrReplaceTempView("VehicleCheckin")

# Using spark.sql we can select any valid select statement from the spark view
vehicleCheckinSelectStarDF = spark.sql(
    "select reservationId, locationName, truckNumber as checkinTruckNumber, status from VehicleCheckin"
)

# Join the vehicle status and vehicle checkin dataframes on the truck number fields
checkinStatusDF = vehicleStatusSelectStarDF.join(
    vehicleCheckinSelectStarDF,
    expr("""
        statusTruckNumber = checkinTruckNumber
    """))

# this takes the stream and "sinks" it to the console as it is updated one message at a time:
# +-----------------+------------+-------------+---------------+-------------+------------+------------------+------+
# |statusTruckNumber| destination|milesFromShop|odometerReading|reservationId|locationName|checkinTruckNumber|status|
# +-----------------+------------+-------------+---------------+-------------+------------+------------------+------+
# |             1445|Pennsylvania|          447|         297465|1602364379489|    Michigan|              1445|    In|
# |             1445|     Colardo|          439|         298038|1602364379489|    Michigan|              1445|    In|
# |             1445|    Maryland|          439|         298094|1602364379489|    Michigan|              1445|    In|
# |             1445|       Texas|          439|         298185|1602364379489|    Michigan|              1445|    In|
# |             1445|    Maryland|          439|         298234|1602364379489|    Michigan|              1445|    In|
# |             1445|      Nevada|          438|         298288|1602364379489|    Michigan|              1445|    In|
# |             1445|   Louisiana|          438|         298369|1602364379489|    Michigan|              1445|    In|
# |             1445|       Texas|          438|         298420|1602364379489|    Michigan|              1445|    In|
# |             1445|       Texas|          436|         298471|1602364379489|    Michigan|              1445|    In|
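# Hedged sketch of the console sink the comment above describes (names are placeholders,
# not from the original exercise): a stream-stream inner join like checkinStatusDF can be
# written to the console in append mode for inspection.
checkinStatusQuery = checkinStatusDF.writeStream \
    .outputMode("append") \
    .format("console") \
    .start()
checkinStatusQuery.awaitTermination()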
lmm_udf = fx.pandas_udf(lmm, returnType=DoubleType())

# COMMAND ----------

# DBTITLE 1,Prepare the input DataFrame
"""
Read in 1000genomes phase 3 chr 22 and split multiallelic sites to biallelic.
Add the phenotypes by cross joining with the genomic DataFrame.
The input to the lmm is the genotype represented as the number of alt alleles (0, 1, or 2).
In this example, we remove all sites where some samples are missing (as represented by -1).
"""
df = glow.transform(
        "split_multiallelics",
        spark.read.format("vcf").load("/databricks-datasets/genomics/1kg-vcfs/*chr22*.vcf.gz")
    ) \
    .crossJoin(spark.read.format("parquet").load("/databricks-datasets/genomics/1000G/phenotypes.normalized/")) \
    .withColumn('genotype_states', fx.expr("genotype_states(genotypes)")) \
    .where(~fx.array_contains(fx.col('genotype_states'), -1))

# COMMAND ----------

# DBTITLE 1,Run the UDF and display results
by_pvalue = df.limit(1000).select("contigName", "start", "names",
                                  lmm_udf(df['genotype_states'], df['values']).alias("pValue"))\
    .na.drop(subset=["pValue"])\
    .orderBy("pValue", ascending=True)

display(by_pvalue)
]).toDF("id", "name", "graduate_program", "spark_status")) graduateProgram = (spark.createDataFrame([ (0, "Masters", "School of Information", "UC Berkeley"), (2, "Masters", "EECS", "UC Berkeley"), (1, "Ph.D.", "EECS", "UC Berkeley") ]).toDF("id", "degree", "department", "school")) sparkStatus = (spark.createDataFrame([(500, "Vice President"), (250, "PMC Member"), (100, "Contributor") ]).toDF("id", "status")) person.createOrReplaceTempView("person") graduateProgram.createOrReplaceTempView("graduateProgram") sparkStatus.createOrReplaceTempView("sparkStatus") #joinExpression=person.graduate_program == graduateProgram.id joinExpression = expr("graduate_program = id") person.withColumnRenamed("id", "personId").join(graduateProgram, joinExpression, "inner").show() #person.join(graduateProgram,joinExpression,"inner").explain(extended=True) #person.join(graduateProgram,joinExpression,"outer").show() # ( # person.withColumnRenamed("id","personId") # .join(sparkStatus,expr("array_contains(spark_status,id)")).show() # ) spark.stop()
def main(argv): mem_bytes = os.sysconf("SC_PAGE_SIZE") * os.sysconf( "SC_PHYS_PAGES") # e.g. 4015976448 mem_gib = int((mem_bytes / (1024.0**3)) * 0.9) tar_jar = os.path.join(find_runfiles(), "__main__/galvasr2/spark/tar_spark_datasource.jar") spark = (pyspark.sql.SparkSession.builder.master( f"local[{os.cpu_count() - 1}]").config( "spark.eventLog.enabled", "true").config("spark.eventLog.dir", "/spark-events").config( "spark.sql.execution.arrow.pyspark.enabled", "true").config( "spark.driver.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true", ).config( "spark.executor.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true", ).config("spark.driver.memory", f"{mem_gib}g").config( "spark.history.fs.logDirectory", "/spark-events").config( "spark.sql.execution.arrow.maxRecordsPerBatch", "1").config("spark.jars", tar_jar).config( "spark.local.dir", "/mnt/disks/spark-scratch/").getOrCreate()) spark.sparkContext.setLogLevel("INFO") # "ALL" for very verbose logging logging.getLogger("py4j").setLevel(logging.ERROR) catalogue_df = load_audio_id_text_id_mapping(spark, FLAGS.input_catalogue) _, licenseurl_df = load_audio_and_text_dfs(spark, FLAGS.input_catalogue) licenseurl_df = licenseurl_df.select( [F.col("identifier"), F.col("text_document_id"), F.col("licenseurl")]) # Kaldi's wav.scp format does not support space characters in the key field of a wav.scp file # We write the transcript to a file called "{kaldi_normalized_uttid}.ctm", so we also need to change all instances of "/" to "_" catalogue_df = catalogue_df.withColumn( "kaldi_normalized_uttid", F.concat_ws( "-", F.translate(catalogue_df.identifier, " /", "__"), F.translate(catalogue_df.audio_document_id, " /", "__"), ), ) # key_int_mapping = os.path.join(FLAGS.work_dir, "key_int_mapping_csv") if not FLAGS.work_dir.startswith("gs://"): os.makedirs(FLAGS.work_dir, exist_ok=True) wav_scp = os.path.join(FLAGS.work_dir, "wav.scp") ctm_out_dir = os.path.join(FLAGS.work_dir, "decoder_ctm_dir") if FLAGS.stage <= 0: catalogue_df = catalogue_df.cache() # catalogue_df.write.mode("overwrite").format("csv").options(header="true").save(key_int_mapping) training_sample_rows = catalogue_df.collect() catalogue_df.unpersist() with TemporaryMountDirectory( mount_cmd=[ "gcsfuse", "--implicit-dirs", FLAGS.input_gcs_bucket.lstrip("gs://"), ], unmount_cmd=["fusermount", "-u"], ) as temp_dir_name: posix_wav_scp = re.sub(r"^{0}".format(FLAGS.input_gcs_bucket), temp_dir_name, wav_scp) create_wav_scp(posix_wav_scp, training_sample_rows, FLAGS.input_dir, ctm_out_dir) # /development/lingvo-source/output_ctm_dir/ # nvprof --analysis-metrics -o decoder-analysis.nvprof \ # We want only the best path, so we set lattice-beam to 0.1 # --main-q-capacity=35000 \ # Can get 266x RTF with this configuration. Keep it? 
# bath size of 100 and num channels of 100 works just fine if FLAGS.stage <= 1: if not FLAGS.work_dir.startswith("gs://"): os.makedirs(ctm_out_dir, exist_ok=True) with TemporaryMountDirectory( mount_cmd=[ "gcsfuse", "--implicit-dirs", FLAGS.input_gcs_bucket.lstrip("gs://"), ], unmount_cmd=["fusermount", "-u"], ) as temp_dir_name: posix_ctm_out_dir = re.sub(r"^{0}".format(FLAGS.input_gcs_bucket), temp_dir_name, ctm_out_dir) posix_wav_scp = re.sub(r"^{0}".format(FLAGS.input_gcs_bucket), temp_dir_name, wav_scp) posix_work_dir = re.sub(r"^{0}".format(FLAGS.input_gcs_bucket), temp_dir_name, FLAGS.work_dir) num_gpus = 4 posix_wav_scp_shards = split_wav_scp(posix_wav_scp, posix_work_dir, num_gpus) executor = ThreadPoolExecutor(max_workers=num_gpus) def run_gpu(posix_wav_scp_shard, gpu_number): cmd = f"""\ /opt/kaldi/src/cudadecoderbin/batched-wav-nnet3-cuda3 \ --frame-subsampling-factor=3 \ --config=/opt/kaldi/egs/aspire/s5/exp/tdnn_7b_chain_online/conf/online.conf \ --max-active=7000 \ --beam=15.0 \ --lattice-beam=0.1 \ --acoustic-scale=1.0 \ --cuda-decoder-copy-threads=2 \ --cuda-worker-threads={os.cpu_count() // num_gpus} \ --segmentation=true \ --cuda-use-tensor-cores=true \ --max-batch-size=150 \ --num-channels=250 \ --lattice-postprocessor-rxfilename=/development/lingvo-source/lattice_postprocess.conf \ --word-symbol-table=/opt/kaldi/egs/aspire/s5/exp/tdnn_7b_chain_online/graph_pp/words.txt \ /opt/kaldi/egs/aspire/s5/exp/chain/tdnn_7b/final.mdl \ /opt/kaldi/egs/aspire/s5/exp/tdnn_7b_chain_online/graph_pp/HCLG.fst \ scp,p:{posix_wav_scp_shard} \ {posix_ctm_out_dir} """ env = deepcopy(os.environ) env["CUDA_VISIBLE_DEVICES"] = f"{gpu_number}" subprocess.check_call(shlex.split(cmd), env=env) for i, shard in enumerate(posix_wav_scp_shards): executor.submit(run_gpu, shard, i) executor.shutdown(wait=True) alignments_dir = os.path.join(FLAGS.alignments_work_dir, "alignments_json_jul_28") if FLAGS.stage <= 2: # TODO: Add options to DSAlign here dsalign_args = dsalign_main.parse_args( ["--output-wer", "--output-cer"]) # , "--output-sws", "--output-levenshtein"]) alphabet_normalized_path = ( "/development/lingvo-source/galvasr2/align/spark/alphabet2.txt") align_udf = prepare_align_udf(dsalign_args, alphabet_normalized_path, 15_000, 3_000) ctm_df = (spark.read.format("binaryFile").option( "pathGlobFilter", "*.ctm").load(ctm_out_dir)) ctm_df = ctm_df.withColumn( "kaldi_normalized_uttid", F.regexp_replace( F.reverse(F.split(ctm_df.path, "/"))[0], r"[.]ctm$", ""), ) ctm_df = ctm_df.withColumn("ctm_content", fix_text_udf(F.col("content"))).drop( "path", "length", "modificationTime", "content") ctm_df = ctm_df.join(catalogue_df, "kaldi_normalized_uttid") downsampled_catalogue_df = ctm_df.drop("ctm_content") training_sample_rows = downsampled_catalogue_df.collect() transcripts_df = load_transcripts(spark, FLAGS.input_gcs_path, training_sample_rows) transcripts_df = transcripts_df.withColumn( "transcript", normalize_english_text_udf(transcripts_df.transcript)) ctm_df = ctm_df.join(transcripts_df, ["identifier", "text_document_id"]) ctm_df = ctm_df.repartition(960) # alignments_df = ctm_df.select(align_udf(F.concat(ctm_df.identifier, F.lit("/"), ctm_df.text_document_id), # F.concat(ctm_df.identifier, F.lit("/"), ctm_df.audio_document_id), # ctm_df.transcript, ctm_df.ctm_content)) alignments_df = ctm_df.withColumn( "alignments", align_udf( F.concat(ctm_df.identifier, F.lit("/"), ctm_df.text_document_id), F.concat(ctm_df.identifier, F.lit("/"), ctm_df.audio_document_id), ctm_df.transcript, ctm_df.ctm_content, ), 
).drop("ctm_content") print("GALVEZ:schema") alignments_df.printSchema() sys.stdout.flush() alignments_df.write.mode("overwrite").format("json").save( alignments_dir) manifest_dir = os.path.join(FLAGS.work_dir, "dataset_manifest") tars_dir = os.path.join(FLAGS.work_dir, "dataset_tars") if FLAGS.stage <= 3: duplicate_data_path = "gs://the-peoples-speech-west-europe/forced-aligner/data_deduplication/data_deduplication_v2_lines.json" duplicates_df = spark.read.format("json").load(duplicate_data_path) alignments_df = spark.read.json(alignments_dir) alignments_df = alignments_df.join( duplicates_df, on=(alignments_df.identifier == duplicates_df.identifier) & (alignments_df.text_document_id == duplicates_df.text_document_id), how="anti", ) if FLAGS.license_filter == "": pass else: if FLAGS.license_filter == "Not CC-BY-SA": filtered_licenseurl_df = licenseurl_df.filter( ~is_cc_by_sa(F.col("licenseurl"))) elif FLAGS.license_filter == "CC-BY-SA": filtered_licenseurl_df = licenseurl_df.filter( is_cc_by_sa(F.col("licenseurl"))) else: raise Exception("Unknown license_filter provided.") filtered_licenseurl_df = filtered_licenseurl_df.drop("licenseurl") alignments_df = alignments_df.join( filtered_licenseurl_df, on=(alignments_df.identifier == filtered_licenseurl_df.identifier) & (alignments_df.text_document_id == filtered_licenseurl_df.text_document_id), how="inner", ) alignments_df = alignments_df.drop( filtered_licenseurl_df.identifier).drop( filtered_licenseurl_df.text_document_id) # We would like the number of partitions to be some large multiple # of the number of executors. Not every audio file is the same # length, so this helps with load balancing. alignments_df = alignments_df.withColumn( "duration_ms", F.expr( "transform(arrays_zip(alignments.end_ms, alignments.start_ms), x -> x.end_ms - x.start_ms)" ), ) alignments_df = alignments_df.withColumn( "alignments", F.arrays_zip( alignments_df.alignments.cer, alignments_df.alignments.end_ms, alignments_df.alignments.label, alignments_df.alignments.start_ms, alignments_df.alignments.wer, alignments_df.duration_ms, ).cast( T.ArrayType( T.StructType([ T.StructField("cer", T.FloatType()), T.StructField("end_ms", T.LongType()), T.StructField("label", T.StringType()), T.StructField("start_ms", T.LongType()), T.StructField("wer", T.FloatType()), T.StructField("duration_ms", T.LongType()), ]))), ) alignments_df = alignments_df.drop("duration_ms") alignments_df = alignments_df.withColumn( "alignments", F.filter( alignments_df.alignments, # Need to select this filter such that total number of # hours is 31,400 lambda alignment: (alignment.duration_ms < FLAGS.max_duration_ms) & (alignment.duration_ms >= FLAGS.min_duration_ms) & (alignment.cer < FLAGS.max_cer) & (alignment.cer >= FLAGS.min_cer), ), ) alignments_df = alignments_df.withColumn( "alignments", F.struct( alignments_df.alignments.cer, alignments_df.alignments.end_ms, alignments_df.alignments.label, alignments_df.alignments.start_ms, alignments_df.alignments.wer, alignments_df.alignments.duration_ms, ).cast( T.StructType([ T.StructField("cer", T.ArrayType(T.FloatType())), T.StructField("end_ms", T.ArrayType(T.LongType())), T.StructField("label", T.ArrayType(T.StringType())), T.StructField("start_ms", T.ArrayType(T.LongType())), T.StructField("wer", T.ArrayType(T.FloatType())), T.StructField("duration_ms", T.ArrayType(T.LongType())), ])), ) alignments_df = alignments_df.repartition(960) abc = alignments_df.select( F.sum( F.expr( "aggregate(alignments.duration_ms, 0L, (x, acc) -> acc + x)" )) / 1000.0 / 
                              60.0 / 60.0).collect()
        print("GALVEZ:total number of hours=", abc)
        sys.stdout.flush()

        alignments_df = alignments_df.select(
            alignments_df.identifier,
            alignments_df.audio_document_id,
            alignments_df.text_document_id,
            alignments_df.alignments,
        )

        alignments_df = F.broadcast(alignments_df)

        audio_paths = F.concat(
            F.lit(FLAGS.input_gcs_path),
            F.lit("/"),
            F.col("identifier"),
            F.lit("/"),
            F.col("audio_document_id"),
        )
        rows = alignments_df.select(audio_paths).collect()
        paths = [row[0] for row in rows]  # [:1]  # GALVEZ: WARNING test!
        # print(f"number of paths = {len(paths)}")
        audio_df = (spark.read.format("binaryFile").load(paths).drop(
            "modificationTime", "length"))

        alignments_audio_df = alignments_df.join(audio_df,
                                                 audio_paths == audio_df.path)
        # from IPython import embed; embed()

        # Remove "/" so that, if someone untars the tar files, everything will be dumped into one directory.
        # Remove "." because it has special meaning in webdataset format.
        # Remove " " because kaldi keys may not contain " " (this is not strictly necessary, but convenient).
        name = F.concat(F.col("identifier"), F.lit("/"),
                        F.col("audio_document_id"))
        # name = F.regexp_replace(name, r"/", "_SLASH_")
        name = F.regexp_replace(name, r"\.", "_DOT_")
        name = F.regexp_replace(name, r" ", "_SPACE_")
        # glob.glob("**/*.flac")

        # Sanity check: each full name must fit within typical path-length limits,
        # and each path component within typical filename-length limits.
        name_rows = alignments_audio_df.select(name.alias("name")).collect()
        for row in name_rows:
            assert len(row["name"]) < 4096
            for chunk in row["name"].split("/"):
                assert len(chunk) < 256

        # name = F.regexp_replace(F.concat(F.col("identifier"),
        #                                  F.lit("-"),
        #                                  F.col("audio_document_id")),
        #                         r"(\.|/)",
        #                         "_"
        #                         )

        # The name of each thing in the tar file. May not exceed 100 characters in length
        # substr indexes from 1!
        # name = name.substr(
        #     F.length(name) - F.least(F.length(name), F.lit(88)) + 1,
        #     F.least(F.length(name), F.lit(88))
        # )

        alignments_audio_df = alignments_audio_df.withColumn(
            "aligned_chunks",
            create_audio_segments_udf(
                alignments_audio_df.content,
                F.lit("mp3"),
                name,
                alignments_audio_df.alignments.start_ms,
                alignments_audio_df.alignments.end_ms,
                F.lit("flac"),
            ),
        )
        a = alignments_audio_df.select(
            F.explode(
                F.arrays_zip("aligned_chunks.audio_name",
                             "aligned_chunks.audio"))).select("col.0", "col.1")
        a.write.mode("overwrite").format("tar").save(tars_dir)

        output_df = alignments_audio_df.select(
            alignments_audio_df.identifier,
            alignments_audio_df.audio_document_id,
            alignments_audio_df.text_document_id,
            F.struct(
                alignments_audio_df.alignments.label.alias("label"),
                create_audio_segment_names_udf(
                    # Is F.size right here?
name, F.size(alignments_audio_df.alignments.start_ms), F.lit("flac"), ).alias("name"), alignments_audio_df.alignments.duration_ms.alias( "duration_ms"), ).alias("training_data"), ) output_df = output_df.coalesce(960) # coalesce(1) seems to make the create_audio_segments_udf function run serially output_df.write.mode("overwrite").json(manifest_dir) repartitioned_tars_dir = os.path.join(FLAGS.work_dir, "repartitioned_dataset_tars") tmp_tars_dir = os.path.join(FLAGS.work_dir, "repartitioned_dataset_tmp_dir") if FLAGS.stage <= 4: tars_df = spark.read.format("tar").load(tars_dir) # .limit(100) number_of_rows = tars_df.count() spark2 = spark.newSession() spark2.conf.set( "spark.sql.execution.rangeExchange.sampleSizePerPartition", number_of_rows) spark2.conf.set("spark.sql.files.minPartitionNum", FLAGS.number_of_shards) # tars_df = spark2.read.format("tar").load(tars_dir)#.limit(100) # print("GALVEZ:", tars_df.select(F.col("key")).collect()) # import sys; sys.exit() tars_df = spark2.read.format("tar").load(tars_dir) # .limit(100) tars_df = tars_df.repartitionByRange(FLAGS.number_of_shards, F.col("key")) # # May need to write this out to GCS, and then delete it, to prevent different behavior between runs. # # tars_df = tars_df.persist() tars_df.write.mode("overwrite").format("tar").save(tmp_tars_dir) tars_df = spark2.read.format("tar").load( tmp_tars_dir) # .repartitionByRange() # coalesce(1024) # counts_df = ( # tars_df.withColumn("partitionId", F.spark_partition_id()) # .groupBy("partitionId") # .count() # ) # num_rows_to_keep = counts_df.select(F.min(F.col("count"))).collect()[0][0] # # Consider doing this in java # def drop_final_rows(rows): # for _ in range(num_rows_to_keep): # yield next(rows) # for _ in rows: # pass # return # print("GALVEZ:before=", tars_df.rdd.getNumPartitions()) # # , preservesPartitioning=True # tars_df = spark2.createDataFrame( # tars_df.rdd.mapPartitions(drop_final_rows), schema=tars_df.schema # ) # print("GALVEZ:after=", tars_df.rdd.getNumPartitions()) # import sys # sys.stdout.flush() # # Don't actually write this out right now. It doesn't benefit us unless we are doing nemo training in a specific mode. 
# tars_df.write.mode("overwrite").format("tar").save(repartitioned_tars_dir) # manifest_df = spark2.read.json(manifest_dir) # number_of_utterances = manifest_df.select(F.explode(F.col("training_data.name"))).count() # print(f"GALVEZ:number_of_utterances={number_of_utterances}") # utterances_per_shard = number_of_utterances // FLAGS.number_of_shards # repartition_tar_files(os.path.join(tars_dir, "*.tar"), repartitioned_tars_dir, utterances_per_shard) nemo_manifest_dir = os.path.join(FLAGS.work_dir, "dataset_manifest_nemo") nemo_single_manifest_dir = os.path.join(FLAGS.work_dir, "dataset_manifest_nemo_single") if FLAGS.stage <= 5: json_df = spark.read.format("json").load(manifest_dir) nemo_df = json_df.select( F.explode( F.arrays_zip( F.col("training_data.name").alias("audio_filepath"), F.col("training_data.label").alias("text"), F.col("training_data.duration_ms").alias("duration_ms"), ))) nemo_df = nemo_df.select( F.col("col.name").alias("audio_filepath"), F.col("col.label").alias("text"), (F.col("col.duration_ms").cast(T.DoubleType()) / 1000.0).alias("duration"), F.lit(-1).alias("shard_id"), ) if False: tars_df = spark.read.format("tar").load(repartitioned_tars_dir) tars_df = tars_df.select(tars_df.key) nemo_df = F.broadcast(nemo_df) nemo_df = nemo_df.join( tars_df, F.col("audio_filepath") == F.col("key")).drop(F.col("key")) # TODO: Join against tar files that have been made to contain the # same number of files to filter out removed files nemo_df.write.mode("overwrite").format("json").save(nemo_manifest_dir) nemo_single_df = spark.read.format("json").load(nemo_manifest_dir) nemo_single_df.coalesce(1).write.mode("overwrite").format("json").save( nemo_single_manifest_dir) single_manifest_dir = os.path.join(FLAGS.work_dir, "dataset_manifest_single") single_tar_dir = os.path.join(FLAGS.work_dir, "dataset_tars_single") # Create single tar file and single json file if FLAGS.stage <= 6: json_df = spark.read.format("json").load(manifest_dir) json_df.coalesce(1).write.format("json").mode("overwrite").save( single_manifest_dir) tars_df = spark.read.format("tar").load(tmp_tars_dir) tars_df.coalesce(1).write.format("tar").mode("overwrite").save( single_tar_dir)
    spark.read.format("json")
    .load("data/flight_data/json/2015-summary.json")
)

df.printSchema()
df.createOrReplaceTempView("dfTable")

df.select("DEST_COUNTRY_NAME").show(2)
spark.sql("""
SELECT DEST_COUNTRY_NAME
FROM dfTable
LIMIT 2
""").show()

df.select("DEST_COUNTRY_NAME", "ORIGIN_COUNTRY_NAME").show(2)
df.select(expr("DEST_COUNTRY_NAME as destination")).show(2)
df.select(expr("DEST_COUNTRY_NAME as destination").alias("DEST")).show(2)
df.selectExpr("DEST_COUNTRY_NAME as newColumn", "DEST_COUNTRY_NAME").show(4)
df.selectExpr(
    "*",
    "(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) AS withinCountry"
).show(5, truncate=False)
df.selectExpr(
    "AVG(count)",
    "COUNT(DISTINCT(DEST_COUNTRY_NAME))"
).show()
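# For comparison, a minimal sketch using the dfTable view registered above: the selectExpr
# aggregation is equivalent to issuing the same expressions through spark.sql.
spark.sql("""
SELECT AVG(count), COUNT(DISTINCT(DEST_COUNTRY_NAME))
FROM dfTable
""").show()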
# COMMAND ----------

from pyspark.sql.functions import instr

DOTCodeFilter = col("StockCode") == "DOT"
priceFilter = col("UnitPrice") > 600
descripFilter = instr(col("Description"), "POSTAGE") >= 1
df.withColumn("isExpensive", DOTCodeFilter & (priceFilter | descripFilter))\
    .where("isExpensive")\
    .select("unitPrice", "isExpensive").show(5)

# COMMAND ----------

from pyspark.sql.functions import expr

df.withColumn("isExpensive", expr("NOT UnitPrice <= 250"))\
    .where("isExpensive")\
    .select("Description", "UnitPrice").show(5)

# COMMAND ----------

from pyspark.sql.functions import expr, pow

fabricatedQuantity = pow(col("Quantity") * col("UnitPrice"), 2) + 5
df.select(expr("CustomerId"), fabricatedQuantity.alias("realQuantity")).show(2)

# COMMAND ----------

df.selectExpr(
    "CustomerId",
graduateProgram.join(person, joinExpression, joinType).show()
# Be careful when cross joining large datasets: a cross join can cause an explosion in the number of rows in the result DataFrame.

# COMMAND ----------

# When performing joins, there are some specific challenges and some common questions that arise.
# The rest of the notebook answers these common questions and then explains, at a high level, how Spark performs joins.

# COMMAND ----------

# Even though this might seem like a challenge, it's actually not. Any expression is a valid join expression, as long as it returns a Boolean.
# For example: joining the id from the person DataFrame against the spark_status array from the sparkStatus DataFrame.
from pyspark.sql.functions import expr

# Because spark_status is an array column, we use an expression to check whether spark_status contains the id value.
person.withColumnRenamed("id", "personId")\
    .join(sparkStatus, expr("array_contains(spark_status, id)")).show()

# COMMAND ----------

# One of the tricky things that comes up in joins is dealing with duplicate column names in the result DataFrame.
# In a DataFrame, each column has a unique ID within Spark's SQL engine, Catalyst. This unique ID is purely internal and not something you can reference directly.
# That makes it quite difficult to refer to a specific column when you have a DataFrame with duplicate column names.

# COMMAND ----------

# This can occur in two distinct situations (see the sketch after this cell):
#   - The join expression you specify does not remove one key from one of the input DataFrames, and the keys have the same column name.
#   - Two columns on which you are not performing the join have the same name.

# COMMAND ----------
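# A minimal sketch of two common ways to handle duplicate column names, using the person and
# graduateProgram DataFrames created above. The gradProgramDupe and gradProgramRenamed names are
# introduced here purely for illustration; gradProgramDupe recreates the "same column name on both
# sides" situation.
gradProgramDupe = graduateProgram.withColumnRenamed("id", "graduate_program")
joinExpr = person["graduate_program"] == gradProgramDupe["graduate_program"]

# Approach 1: perform the join, then drop one of the duplicate columns by referencing it
# through the DataFrame it came from.
person.join(gradProgramDupe, joinExpr)\
    .drop(gradProgramDupe["graduate_program"])\
    .show()

# Approach 2: rename the offending column on one side before the join so no ambiguity arises.
gradProgramRenamed = graduateProgram.withColumnRenamed("id", "gradProgramId")
person.join(gradProgramRenamed, expr("graduate_program = gradProgramId")).show()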