def test_repartition_df(self, input_df):
    """Repartitioning by timestamp should spread a single-partition frame
    across the desired number of partitions (200)."""
    output_df = repartition_df(dataframe=input_df, partition_by=["timestamp"])

    def partition_count(df):
        # Number of distinct physical partition ids present in the frame.
        return df.select(spark_partition_id()).distinct().count()

    # Only one partition id, meaning the input data is not partitioned...
    assert partition_count(input_df) == 1
    # ...while the result lands on the desired number of partitions.
    assert partition_count(output_df) == 200
def test__repartition_df(self, spark_session, spark_context):
    """HistoricalFeatureStoreWriter._create_partitions should spread a
    single-partition frame over 200 partitions.

    Fix: the original built each date through a pointless immediately-invoked
    lambda and recomputed the constant ``(end - start).total_seconds()`` on
    every iteration; both are replaced by a plain expression with the span
    hoisted out of the loop. Generated values are identical.
    """
    # arrange: 10k random ISO dates between 1970-01-01 and 2020-12-31
    start = datetime.datetime(year=1970, month=1, day=1)
    end = datetime.datetime(year=2020, month=12, day=31)
    span_seconds = int((end - start).total_seconds())
    random_dates = [
        (start + datetime.timedelta(
            seconds=random.randint(0, span_seconds)  # noqa: S311
        )).date().isoformat()
        for _ in range(10000)
    ]
    data = [{"timestamp": date} for date in random_dates]
    # parallelize(..., 1) forces all rows into a single input partition.
    input_df = spark_session.read.json(
        spark_context.parallelize(data, 1), schema="timestamp timestamp"
    )
    writer = HistoricalFeatureStoreWriter()

    # act
    result_df = writer._create_partitions(input_df)

    # assert
    # Only one partition id, meaning data is not partitioned
    assert input_df.select(spark_partition_id()).distinct().count() == 1
    # Desired number of partitions
    assert result_df.select(spark_partition_id()).distinct().count() == 200
def main(spark: SparkSession, inputfile: str, output_dir: str):
    """Read a parquet flights dataset, repartition it to 5 partitions, and
    write it out partitioned by carrier/origin — once plainly and once with a
    cap on records per file."""
    logger.info(spark.version)
    logger.info(inputfile)
    logger.info(output_dir)

    flights_df = spark.read.format("parquet").load(inputfile)

    # Write DataFrame in parquet format
    logger.info("flights_df : Number of Partitions : "
                + str(flights_df.rdd.getNumPartitions()))
    flights_df.groupBy(spark_partition_id()).count().show()
    # flights_df.write.mode('overwrite').parquet(f"{output_dir}org/")

    flights_partition_df = flights_df.repartition(5)
    logger.info("flights_partition_df : Number of Partitions : "
                + str(flights_partition_df.rdd.getNumPartitions()))
    flights_partition_df.groupBy(spark_partition_id()).count().show()

    # Write DataFrame with Partitions
    (flights_partition_df.write
        .mode('overwrite')
        .partitionBy("OP_CARRIER", "ORIGIN")
        .parquet(f"{output_dir}part_data/"))

    # Write DataFrame with Partitions and control file sizes
    (flights_partition_df.write
        .mode('overwrite')
        .partitionBy("OP_CARRIER", "ORIGIN")
        .option("maxRecordsPerFile", 10000)
        .parquet(f"{output_dir}part_size_data/"))

    logger.info("done")
def attach_default_index(sdf, default_index_type=None):
    """
    This method attaches a default index to Spark DataFrame. Spark does not
    have the index notion, so a corresponding column has to be generated.
    There are several types of default index that can be configured by
    `compute.default_index_type`.

    :param sdf: Spark DataFrame to attach the index column to.
    :param default_index_type: one of "sequence", "distributed-sequence" or
        "distributed"; falls back to the `compute.default_index_type` option
        when None.
    :return: a new Spark DataFrame with the index column prepended.
    :raises ValueError: if ``default_index_type`` is not a recognised value.
    """
    if default_index_type is None:
        default_index_type = get_option("compute.default_index_type")
    if default_index_type == "sequence":
        # Global 0-based row number: the window has no partition spec, so
        # the ordering is over the whole dataset.
        sequential_index = F.row_number().over(
            Window.orderBy(NATURAL_ORDER_COLUMN_NAME)) - 1
        scols = [scol_for(sdf, column) for column in sdf.columns]
        return sdf.select(sequential_index.alias(SPARK_INDEX_NAME_FORMAT(0)), *scols)
    elif default_index_type == "distributed-sequence":
        # 1. Calculates counts per each partition ID. `counts` here is, for instance,
        # {
        #    1: 83,
        #    6: 83,
        #    3: 83,
        #    ...
        # }
        counts = map(lambda x: (x["key"], x["count"]),
                     sdf.groupby(F.spark_partition_id().alias("key")).count().collect())

        # 2. Calculates cumulative sum in an order of partition id.
        #    Note that it does not matter if partition id guarantees its order or not.
        #    We just need a one-by-one sequential id.

        # sort by partition key.
        sorted_counts = sorted(counts, key=lambda x: x[0])
        # get cumulative sum in an order of partition key.
        cumulative_counts = accumulate(map(lambda count: count[1], sorted_counts))
        # zip it with partition key. `sums` maps partition id -> cumulative
        # row count *including* that partition; the start of the range is
        # derived in `default_index` by subtracting the batch length.
        sums = dict(zip(map(lambda count: count[0], sorted_counts), cumulative_counts))

        # Output schema: index column prepended to the input schema.
        return_schema = StructType(
            [StructField(SPARK_INDEX_NAME_FORMAT(0), LongType())] + list(sdf.schema))
        columns = [f.name for f in return_schema]

        # 3. Group by partition id and assign each range.
        def default_index(pdf):
            # All rows in `pdf` share one partition id (grouped on it below),
            # so the first element identifies the partition.
            current_partition_max = sums[pdf["__spark_partition_id"].iloc[0]]
            offset = len(pdf)
            pdf[SPARK_INDEX_NAME_FORMAT(0)] = list(range(
                current_partition_max - offset, current_partition_max))
            return pdf[columns]

        grouped_map_func = pandas_udf(return_schema, PandasUDFType.GROUPED_MAP)(default_index)

        sdf = sdf.withColumn("__spark_partition_id", F.spark_partition_id())
        return sdf.groupBy("__spark_partition_id").apply(grouped_map_func)
    elif default_index_type == "distributed":
        # Monotonically increasing (but not consecutive) ids; fully
        # distributed, no shuffle.
        scols = [scol_for(sdf, column) for column in sdf.columns]
        return sdf.select(
            F.monotonically_increasing_id().alias(SPARK_INDEX_NAME_FORMAT(0)), *scols)
    else:
        raise ValueError("'compute.default_index_type' should be one of 'sequence',"
                         " 'distributed-sequence' and 'distributed'")
def test_repartition_sort_df_processors_partitions(self, input_df):
    """repartition_sort_df should honour an explicit num_partitions."""
    expected_partitions = 50
    result_df = repartition_sort_df(
        dataframe=input_df,
        partition_by=["timestamp"],
        order_by=["timestamp"],
        num_partitions=expected_partitions,
    )
    # Only one partition id, meaning data is not partitioned
    assert input_df.select(spark_partition_id()).distinct().count() == 1
    # Desired number of partitions
    assert (result_df.select(spark_partition_id()).distinct().count()
            == expected_partitions)
def encode_shares(
    batch_id,
    n_data,
    public_key_hex_internal,
    public_key_hex_external,
    input,
    output_a,
    output_b,
):
    """Encode each input row into a pair of binary shares and write share
    "a" to ``output_a`` and share "b" to ``output_b`` as base64 JSON.

    Rows are batched by physical partition id and handed to ``udf.encode``;
    every output row gets a fresh UUID ``id`` used for range-repartitioning
    before the write.
    """
    click.echo("Running encode shares")
    spark = spark_session()
    # Group by spark_partition_id() so udf.encode receives one pandas
    # DataFrame per physical partition rather than the whole dataset.
    shares = (spark.read.json(input).withColumn(
        "pid", F.spark_partition_id()).groupBy("pid").applyInPandas(
            lambda pdf: udf.encode(batch_id, n_data, public_key_hex_internal,
                                   public_key_hex_external, pdf),
            schema="a: binary, b: binary",
        ).withColumn("id", F.udf(lambda: str(uuid4()), returnType="string")()))
    shares.cache()
    row = shares.first()
    # Estimate the output size from a single sample row to pick how many
    # partitions to write.
    # NOTE(review): `n_rows`, `scale` and `partition_size_mb` are not defined
    # in this function — presumably module-level constants; verify against
    # the enclosing module. `row.shares.a` also looks suspect given the
    # applyInPandas schema declares top-level columns `a`/`b` (expected
    # `row.a`?) — confirm.
    dataset_estimate_mb = ((len(b64encode(row.shares.a)) + len(str(uuid4()))) *
                           n_rows * scale * 1.0 / 10**6)
    num_partitions = math.ceil(dataset_estimate_mb / partition_size_mb)
    click.echo(f"writing {num_partitions} partitions")
    # Range-partition on the random UUID to spread rows evenly.
    repartitioned = shares.repartitionByRange(num_partitions, "id").cache()
    repartitioned.select("id", F.base64("a").alias("payload")).write.json(
        output_a, mode="overwrite")
    repartitioned.select("id", F.base64("b").alias("payload")).write.json(
        output_b, mode="overwrite")
def partition_iterator(sdf):
    """Yield one local row-iterator per physical partition of ``sdf``.

    The frame is tagged with its partition id and cached so each of the
    per-partition filters reuses the same materialised data instead of
    recomputing the full lineage once per partition.

    Fix: the cached frame was never unpersisted, leaving it pinned in
    executor memory after iteration; it is now released in a ``finally``
    block, which also runs when the generator is closed early.

    :param sdf: input Spark DataFrame.
    :yields: a ``toLocalIterator()`` over the rows of each partition, with
        the temporary ``partition`` column dropped.
    """
    import pyspark.sql.functions as F

    sdf_part = sdf.withColumn('partition', F.spark_partition_id())
    sdf_part.cache()
    try:
        for part in range(sdf.rdd.getNumPartitions()):
            yield sdf_part.where(F.col('partition') == part).drop(
                'partition').rdd.toLocalIterator()
    finally:
        # Release the cache once the generator is exhausted or closed.
        sdf_part.unpersist()
def _launch_analysis(self, ds, df, udf, columns):
    """Apply ``udf`` over ``columns``, reduce the per-partition histograms,
    and return ``(ds, aggregated pandas DataFrame)``."""
    # Roughly one reducer bucket per 20 input partitions (at least 1).
    n_buckets = (df.rdd.getNumPartitions() // 20) + 1
    reduced = (
        df.select(udf(*columns).alias('histos'))
        .withColumn('hpid', fn.spark_partition_id() % n_buckets)
        .repartition(n_buckets, 'hpid')
        .groupBy('hpid')
        .apply(reduce_histos)
        .groupBy()
        .agg(agg_histos('histos'))
        .toPandas()
    )
    return ds, reduced
def attach_distributed_sequence_column(sdf, column_name):
    """
    This method attaches a Spark column that has a sequence in a distributed
    manner. This is equivalent to the column assigned when the default index
    type is 'distributed-sequence'.

    >>> sdf = ks.DataFrame(['a', 'b', 'c']).to_spark()
    >>> sdf = _InternalFrame.attach_distributed_sequence_column(sdf, column_name="sequence")
    >>> sdf.sort("sequence").show()  # doctest: +NORMALIZE_WHITESPACE
    +--------+---+
    |sequence|  0|
    +--------+---+
    |       0|  a|
    |       1|  b|
    |       2|  c|
    +--------+---+
    """
    scols = [scol_for(sdf, column) for column in sdf.columns]

    # 1. Calculates counts per each partition ID. `counts` here is, for instance,
    #     {
    #      1: 83,
    #      6: 83,
    #      3: 83,
    #      ...
    #     }
    sdf = sdf.withColumn("__spark_partition_id", F.spark_partition_id())
    counts = map(
        lambda x: (x["key"], x["count"]),
        sdf.groupby(sdf["__spark_partition_id"].alias("key")).count().collect(),
    )

    # 2. Calculates cumulative sum in an order of partition id.
    #    Note that it does not matter if partition id guarantees its order or not.
    #    We just need a one-by-one sequential id.

    # sort by partition key.
    sorted_counts = sorted(counts, key=lambda x: x[0])
    # get cumulative sum in an order of partition key.
    # The leading 0 makes each entry the *exclusive* prefix sum — the global
    # offset at which a partition's ids start.
    cumulative_counts = [0] + list(accumulate(map(lambda count: count[1], sorted_counts)))
    # zip it with partition key.
    sums = dict(zip(map(lambda count: count[0], sorted_counts), cumulative_counts))

    # 3. Attach offset for each partition.
    @pandas_udf(LongType(), PandasUDFType.SCALAR)
    def offset(id):
        # Each batch comes from a single partition, so inspecting the first
        # element is enough to identify it.
        current_partition_offset = sums[id.iloc[0]]
        return pd.Series(current_partition_offset).repeat(len(id))

    sdf = sdf.withColumn("__offset__", offset("__spark_partition_id"))

    # 4. Calculate row_number in each partition.
    w = Window.partitionBy("__spark_partition_id").orderBy(F.monotonically_increasing_id())
    row_number = F.row_number().over(w)
    sdf = sdf.withColumn("__row_number__", row_number)

    # 5. Calculate the index: partition offset + (1-based row number) - 1.
    return sdf.select(F.expr("__offset__ + __row_number__ - 1").alias(column_name), *scols)
def test_hash_repartition_exact(gen, num_parts):
    """Hash repartitioning must place rows identically on GPU and CPU:
    partition id, hash, and pmod bucket are all collected and compared."""
    data_gen, part_on = gen

    def build(spark):
        df = gen_df(spark, data_gen, length=1024)
        df = df.repartition(num_parts, *part_on)
        df = df.withColumn('id', f.spark_partition_id())
        df = df.withColumn('hashed', f.hash(*part_on))
        return df.selectExpr('*', 'pmod(hashed, {})'.format(num_parts))

    assert_gpu_and_cpu_are_equal_collect(build)
def _is_monotonic(self, order):
    """Return True if the series is monotonic in the given direction.

    ``order`` must be "increasing" or "decreasing". The check is layered:
    monotonicity *within* each partition is computed locally, and then each
    partition's min/max is compared against its neighbour's over a small
    window (one row per partition). NULL comparisons are coalesced to True
    so they do not break the conjunction.
    """
    assert order in ("increasing", "decreasing")

    sdf = self._internal.spark_frame
    sdf = (
        sdf.select(
            F.spark_partition_id().alias(
                "__partition_id"
            ),  # Make sure we use the same partition id in the whole job.
            F.col(NATURAL_ORDER_COLUMN_NAME),
            self.spark_column.alias("__origin"),
        )
        .select(
            F.col("__partition_id"),
            F.col("__origin"),
            self._is_locally_monotonic_spark_column(order).alias(
                "__comparison_within_partition"
            ),
        )
        .groupby(F.col("__partition_id"))
        .agg(
            # Keep each partition's extremes so neighbouring partitions can
            # be compared, plus whether the partition is monotonic inside.
            F.min(F.col("__origin")).alias("__partition_min"),
            F.max(F.col("__origin")).alias("__partition_max"),
            F.min(F.coalesce(F.col("__comparison_within_partition"), F.lit(True))).alias(
                "__comparison_within_partition"
            ),
        )
    )

    # Now we're windowing the aggregation results without partition specification.
    # The number of rows here will be as the same of partitions, which is expected
    # to be small.
    window = Window.orderBy(F.col("__partition_id")).rowsBetween(-1, -1)
    if order == "increasing":
        # Each partition's min must not fall below the previous partition's max.
        comparison_col = F.col("__partition_min") >= F.lag(F.col("__partition_max"), 1).over(
            window
        )
    else:
        comparison_col = F.col("__partition_min") <= F.lag(F.col("__partition_max"), 1).over(
            window
        )

    sdf = sdf.select(
        comparison_col.alias("__comparison_between_partitions"),
        F.col("__comparison_within_partition"),
    )

    ret = sdf.select(
        F.min(F.coalesce(F.col("__comparison_between_partitions"), F.lit(True)))
        & F.min(F.coalesce(F.col("__comparison_within_partition"), F.lit(True)))
    ).collect()[0][0]
    if ret is None:
        # min() over zero rows yields NULL — an empty frame counts as monotonic.
        return True
    else:
        return ret
def test_total_share(spark, root, args):
    """End-to-end check of ``udf.aggregate`` + ``udf.total_share``.

    Joins raw shares with both verification payloads, aggregates each of two
    forced partitions separately (totals asserted to be {2, 3}), then folds
    the partial aggregates into a single error-free total of 5.
    """
    raw = spark.read.json(str(root / "server_a" / "raw"))
    internal = spark.read.json(
        str(root / "server_a" / "intermediate" / "internal" / "verify2")
    )
    external = spark.read.json(
        str(root / "server_a" / "intermediate" / "external" / "verify2")
    )
    aggregates = (
        raw.select("id", F.unbase64("payload").alias("shares"))
        .join(internal.select("id", F.unbase64("payload").alias("internal")), on="id")
        .join(external.select("id", F.unbase64("payload").alias("external")), on="id")
        # Force exactly two partitions so there are two partial aggregates.
        .repartition(2)
        .withColumn("pid", F.spark_partition_id())
        .groupBy("pid")
        .applyInPandas(
            lambda pdf: udf.aggregate(
                args.batch_id,
                args.n_data,
                args.server_id,
                args.private_key_hex,
                args.shared_secret,
                args.public_key_hex_internal,
                args.public_key_hex_external,
                pdf,
            ),
            schema="payload binary, error int, total int",
        )
    )
    aggregates.show()
    rows = aggregates.collect()
    # One aggregate row per partition, no errors.
    assert len(rows) == 2
    assert {2, 3} == set(r.total for r in rows)
    assert all(r.error == 0 for r in rows)

    # Fold the two partial aggregates into a single total share.
    total_share = aggregates.groupBy().applyInPandas(
        lambda pdf: udf.total_share(
            args.batch_id,
            args.n_data,
            args.server_id,
            args.private_key_hex,
            args.shared_secret,
            args.public_key_hex_internal,
            args.public_key_hex_external,
            pdf,
        ),
        schema="payload binary, error int, total int",
    )
    total_share.show()
    rows = total_share.collect()
    assert len(rows) == 1
    assert len(rows[0].payload) > 0
    assert rows[0].total == 5
    assert rows[0].error == 0
def _launch_analysis(self, ds, df, udf, columns):
    """Apply ``udf`` to ``columns`` and reduce the resulting histograms down
    to one aggregated pandas DataFrame, returned alongside ``ds``."""
    histo_map_parts = (df.rdd.getNumPartitions() // 20) + 1
    histos = df.select(udf(*columns).alias("histos"))
    # Bucket rows by partition id so each reducer handles ~20 partitions.
    bucketed = histos.withColumn(
        "hpid", fn.spark_partition_id() % histo_map_parts)
    bucketed = bucketed.repartition(histo_map_parts, "hpid")
    partial = bucketed.groupBy("hpid").apply(reduce_histos)
    final = partial.groupBy().agg(agg_histos("histos"))
    return (ds, final.toPandas())
def __show_partitions_count(self, cardo_context, result):
    """Log per-partition row counts for every dataframe-typed payload.

    :type result: list of CardoDataFrame
    """
    log = cardo_context.logger
    for dataframe_index, cardo_dataframe in enumerate(result):
        log.debug(
            'showing partitions of dataframe #{}'.format(dataframe_index))
        if cardo_dataframe.payload_type != 'dataframe':
            # Non-dataframe payloads (e.g. RDDs) are not inspected.
            log.debug(
                'Cannot show partition status for {}'.format(
                    cardo_dataframe.payload_type))
            continue
        counts = cardo_dataframe.dataframe.groupBy(
            F.spark_partition_id()).count().collect()
        for row in counts:
            log.debug('partition id #{}: {}'.format(row[0], row[1]))
def aggregate(
    batch_id,
    n_data,
    server_id,
    private_key_hex,
    shared_secret,
    public_key_hex_internal,
    public_key_hex_external,
    input,
    input_internal,
    input_external,
    output,
):
    """Generate an aggregate share from a batch of verified SNIPs."""
    click.echo("Running aggregate")
    spark = spark_session()
    shares = spark.read.json(input)
    internal = spark.read.json(input_internal)
    external = spark.read.json(input_external)
    # Positional arguments shared by udf.aggregate and udf.total_share.
    args = [
        batch_id,
        n_data,
        server_id,
        private_key_hex,
        b64decode(shared_secret),
        public_key_hex_internal,
        public_key_hex_external,
    ]
    # Join the shares with both verification payloads by id, aggregate once
    # per physical partition, fold the partial aggregates into a single row,
    # then base64-encode the payload and write it out.
    (shares.join(internal.withColumnRenamed("payload", "internal"), on="id").join(
        external.withColumnRenamed("payload", "external"), on="id").select(
            F.unbase64("payload").alias("shares"),
            F.unbase64("internal").alias("internal"),
            F.unbase64("external").alias("external"),
            F.spark_partition_id().alias("pid"),
        ).groupBy("pid").applyInPandas(
            lambda pdf: udf.aggregate(*args, pdf),
            schema="payload: binary, error: int, total: int",
        ).groupBy().applyInPandas(
            lambda pdf: udf.total_share(*args, pdf),
            schema="payload: binary, error: int, total: int",
        ).withColumn("payload", F.base64("payload"))).write.json(
            output, mode="overwrite")
def tf_serving_with_broadcasted_model(df, model_base_path=None, model_version=None,
                                      model_full_path=None, signature_def_key=None):
    """Run batch TensorFlow inference over ``df`` using a broadcast graph.

    The model is loaded once on the driver, its exported GraphDef is
    broadcast to the executors, and a GROUPED_MAP pandas UDF rebuilds a
    predictor per batch to score it. Rows are grouped by
    ``spark_partition_id()`` so each physical partition is scored as one
    pandas DataFrame.

    :return: a DataFrame with the original columns plus one column per
        model fetch tensor.
    """
    import pyspark.sql.functions as F
    import pyspark.sql.types as T

    model = load_model(model_base_path, model_version, model_full_path,
                       signature_def_key)
    tf_output_schema = fetch_tensors_spark_schema(model.fetch_tensors)
    # Output schema = input fields followed by the fetch-tensor fields.
    output_schema = T.StructType(df.schema.fields + tf_output_schema.fields)
    graph_def, feed_names, fetch_names, extra_ops = GraphDefPredictor.export_model(
        model)
    # Broadcast the serialized graph once rather than shipping it per task.
    graph_def_serialized_bc = df.rdd.context.broadcast(graph_def)

    def func(pandas_df):
        """
        Batch inference on a pandas dataframe
        """
        predictor_model = GraphDefPredictor(
            graph_def_serialized_bc.value, feed_names, fetch_names, extra_ops)
        return pandas_model_inference(predictor_model, pandas_df,
                                      output_schema.fieldNames())

    inference = F.pandas_udf(func, output_schema, F.PandasUDFType.GROUPED_MAP)
    return df.groupby(F.spark_partition_id()).apply(inference)
from pyspark.sql import SparkSession
from lib.logger import Log4j
from pyspark.sql.functions import spark_partition_id

# Script: load a movie CSV, inspect partitioning before/after a repartition
# to 5, then write the frame out through helper writers.
if __name__ == "__main__":
    # NOTE(review): get_spark_app_config / load_movie_csv_df / write_movie_df /
    # write_movie_json_df are not imported above — presumably defined
    # elsewhere in this module; confirm.
    conf = get_spark_app_config()
    spark = SparkSession.builder \
        .config(conf=conf) \
        .getOrCreate()
    logger = Log4j(spark)

    logger.info("Starting the pyspark application")

    logger.info("Reading data from csv files")
    movieCsvDf = load_movie_csv_df(spark)
    movieCsvDf.show(5)
    logger.info("Csv Schema: " + movieCsvDf.schema.simpleString())

    logger.info("Writing avro data to output path")
    # Show how rows are spread across partitions before and after repartition.
    logger.info("Number of partitions before :" + str(movieCsvDf.rdd.getNumPartitions()))
    movieCsvDf.groupBy(spark_partition_id()).count().show()
    partitionDf = movieCsvDf.repartition(5)
    logger.info("Number of partitions after :" + str(partitionDf.rdd.getNumPartitions()))
    partitionDf.groupBy(spark_partition_id()).count().show()

    write_movie_df(partitionDf)
    write_movie_json_df(partitionDf)

    logger.info("Completing the pyspark application")
def to_redshift(
    self,
    dataframe: DataFrame,
    path: str,
    connection: Any,
    schema: str,
    table: str,
    iam_role: str,
    diststyle: str = "AUTO",
    distkey: Optional[str] = None,
    sortstyle: str = "COMPOUND",
    sortkey: Optional[str] = None,
    min_num_partitions: int = 200,
    mode: str = "append",
) -> None:
    """
    Load Spark Dataframe as a Table on Amazon Redshift

    :param dataframe: Spark Dataframe
    :param path: S3 path to write temporary files (E.g. s3://BUCKET_NAME/ANY_NAME/)
    :param connection: A PEP 249 compatible connection (Can be generated with Redshift.generate_connection())
    :param schema: The Redshift Schema for the table
    :param table: The name of the desired Redshift table
    :param iam_role: AWS IAM role with the related permissions
    :param diststyle: Redshift distribution styles. Must be in ["AUTO", "EVEN", "ALL", "KEY"] (https://docs.aws.amazon.com/redshift/latest/dg/t_Distributing_data.html)
    :param distkey: Specifies a column name or positional number for the distribution key
    :param sortstyle: Sorting can be "COMPOUND" or "INTERLEAVED" (https://docs.aws.amazon.com/redshift/latest/dg/t_Sorting_data.html)
    :param sortkey: List of columns to be sorted
    :param min_num_partitions: Minimal number of partitions
    :param mode: append or overwrite
    :return: None
    """
    logger.debug(f"Minimum number of partitions : {min_num_partitions}")
    if path[-1] != "/":
        path += "/"
    # Clear any leftover temporary files from a previous run.
    self._session.s3.delete_objects(path=path, procs_io_bound=self._procs_io_bound)
    spark: SparkSession = self._session.spark_session
    casts: Dict[str, str] = Spark._extract_casts(dataframe.dtypes)
    dataframe = Spark.date2timestamp(dataframe)
    dataframe.cache()
    num_rows: int = dataframe.count()
    logger.info(f"Number of rows: {num_rows}")
    num_partitions: int
    if num_rows < MIN_NUMBER_OF_ROWS_TO_DISTRIBUTE:
        # Small dataset: a single file is enough.
        num_partitions = 1
    else:
        # Pick the smallest multiple of the Redshift slice count that is
        # >= min_num_partitions, so files spread evenly over slices.
        num_slices: int = self._session.redshift.get_number_of_slices(
            redshift_conn=connection)
        logger.debug(f"Number of slices on Redshift: {num_slices}")
        num_partitions = num_slices
        while num_partitions < min_num_partitions:
            num_partitions += num_slices
    logger.debug(f"Number of partitions calculated: {num_partitions}")
    spark.conf.set("spark.sql.execution.arrow.enabled", "true")
    session_primitives = self._session.primitives
    par_col_name: str = "aws_data_wrangler_internal_partition_id"

    @pandas_udf(returnType="objects_paths string",
                functionType=PandasUDFType.GROUPED_MAP)
    def write(pandas_dataframe: pd.DataFrame) -> pd.DataFrame:
        # Exporting ARROW_PRE_0_15_IPC_FORMAT environment variable for
        # a temporary workaround while waiting for Apache Arrow updates
        # https://stackoverflow.com/questions/58273063/pandasudf-and-pyarrow-0-15-0
        os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = "1"
        del pandas_dataframe[par_col_name]
        paths: List[str] = session_primitives.session.pandas.to_parquet(
            dataframe=pandas_dataframe,
            path=path,
            preserve_index=False,
            mode="append",
            procs_cpu_bound=1,
            procs_io_bound=1,
            cast_columns=casts)
        return pd.DataFrame.from_dict({"objects_paths": paths})

    # Repartition, tag each row with its partition id, and write one parquet
    # batch per partition via the grouped-map UDF above.
    df_objects_paths: DataFrame = dataframe.repartition(
        numPartitions=num_partitions)  # type: ignore
    df_objects_paths: DataFrame = df_objects_paths.withColumn(
        par_col_name, spark_partition_id())  # type: ignore
    df_objects_paths: DataFrame = df_objects_paths.groupby(
        par_col_name).apply(write)  # type: ignore
    objects_paths: List[str] = list(
        df_objects_paths.toPandas()["objects_paths"])
    dataframe.unpersist()
    num_files_returned: int = len(objects_paths)
    if num_files_returned != num_partitions:
        # Every partition must have produced exactly one file.
        raise MissingBatchDetected(
            f"{num_files_returned} files returned. {num_partitions} expected."
        )
    logger.debug(f"List of objects returned: {objects_paths}")
    logger.debug(
        f"Number of objects returned from UDF: {num_files_returned}")
    # Build a manifest and COPY everything into Redshift, then clean up S3.
    manifest_path: str = f"{path}manifest.json"
    self._session.redshift.write_load_manifest(
        manifest_path=manifest_path,
        objects_paths=objects_paths,
        procs_io_bound=self._procs_io_bound)
    self._session.redshift.load_table(dataframe=dataframe,
                                      dataframe_type="spark",
                                      manifest_path=manifest_path,
                                      schema_name=schema,
                                      table_name=table,
                                      redshift_conn=connection,
                                      preserve_index=False,
                                      num_files=num_partitions,
                                      iam_role=iam_role,
                                      diststyle=diststyle,
                                      distkey=distkey,
                                      sortstyle=sortstyle,
                                      sortkey=sortkey,
                                      mode=mode,
                                      cast_columns=casts)
    self._session.s3.delete_objects(path=path,
                                    procs_io_bound=self._procs_io_bound)
# Row-count sanity checks on the business dataframe.
tot_Cnt =bizDF.count()
print("Total No of Rows: ", tot_Cnt)
unq_Cnt =bizDF.drop_duplicates().count()
print("Unique No of Rows: ", unq_Cnt)

# COMMAND ----------

# MAGIC %md ##### Check for Data Skewness

# COMMAND ----------

from pyspark.sql.functions import spark_partition_id

# get no of partitions
implictPart = bizDF.rdd.getNumPartitions()
print("Implict no of partitions:", implictPart)

# get each partition size
# NOTE(review): `partitions` is computed here but never displayed or
# collected in this cell — presumably shown elsewhere; confirm.
partitions =bizDF.withColumn("Partition_id", spark_partition_id()).groupBy("Partition_id").count().orderBy("Partition_id")

# COMMAND ----------

# MAGIC %md ##### Convert Json to Parquet

# COMMAND ----------

# MAGIC %md Since json is storage heavy and we are converting the raw data to parquet

# COMMAND ----------

# Write out as snappy-compressed parquet.
outPath = "/mnt/preprocess_business"
bizDF.write.parquet(path=outPath, mode="overwrite",compression="snappy")
if __name__ == "__main__": spark = SparkSession \ .builder \ .master("local[3]") \ .appName("SparkSchemaDemo") \ .getOrCreate() logger = Log4j(spark) flightTimeParquetDF = spark.read \ .format("parquet") \ .load("dataSource/flight*.parquet") logger.info("Num Partitions before: " + str(flightTimeParquetDF.rdd.getNumPartitions())) flightTimeParquetDF.groupBy(spark_partition_id()).count().show() partitionedDF = flightTimeParquetDF.repartition(5) logger.info("Num Partitions after: " + str(partitionedDF.rdd.getNumPartitions())) partitionedDF.groupBy(spark_partition_id()).count().show() partitionedDF.write \ .format("avro") \ .mode("overwrite") \ .option("path", "dataSink/avro/") \ .save() flightTimeParquetDF.write \ .format("json") \ .mode("overwrite") \
def test_part_id():
    """spark_partition_id() must produce the same values on GPU and CPU."""
    def build(spark):
        df = unary_op_df(spark, short_gen, num_slices=8)
        return df.select(f.col('a'), f.spark_partition_id())

    assert_gpu_and_cpu_are_equal_collect(build)
def main():
    """Export GNAF address points and admin boundaries to gzipped parquet.

    Steps: dump GNAF tables from Postgres to a single CSV, start a
    Spark/Sedona session, load the CSV, add point geometries, then write
    parquet for the points plus a set of boundary tables.
    """
    start_time = datetime.now()

    # copy gnaf tables to CSV
    pg_conn = psycopg2.connect(local_pg_connect_string)
    pg_cur = pg_conn.cursor()

    sql = """COPY ( SELECT longitude, latitude, gnaf_pid, state FROM gnaf_202008.{} ) TO STDOUT WITH CSV"""
    # sql = """COPY (
    #              SELECT gnaf_pid, street_locality_pid, locality_pid, alias_principal, primary_secondary, building_name,
    #                  lot_number, flat_number, level_number, number_first, number_last, street_name, street_type,
    #                  street_suffix, address, locality_name, postcode, state, locality_postcode, confidence,
    #                  legal_parcel_id, mb_2011_code, mb_2016_code, latitude, longitude, geocode_type, reliability
    #              FROM gnaf_202008.{}
    #          ) TO STDOUT WITH CSV"""

    # address principals
    with open(os.path.join(output_path, "gnaf_light.csv"), 'w') as csv_file:
        pg_cur.copy_expert(sql.format("address_principals"), csv_file)
        # pg_cur.copy_expert(sql.format("address_principals") + " HEADER", csv_file)

    # address aliases — appended ('a') to the same CSV as the principals.
    with open(os.path.join(output_path, "gnaf_light.csv"), 'a') as csv_file:
        pg_cur.copy_expert(sql.format("address_aliases"), csv_file)

    pg_cur.close()
    pg_conn.close()

    logger.info("\t - GNAF points exported to CSV: {}".format(datetime.now() - start_time))
    start_time = datetime.now()

    # upload Sedona (geospark) JARs
    upload_jars()

    spark = (SparkSession.builder.master("local[*]").appName("query").config(
        "spark.sql.session.timeZone", "UTC").config("spark.sql.debug.maxToStringFields", 100).config(
            "spark.serializer", KryoSerializer.getName).config(
                "spark.kryo.registrator", GeoSparkKryoRegistrator.getName).config(
                    "spark.cores.max", cpu_count()).config("spark.sql.adaptive.enabled",
                                                           "true").config("spark.driver.memory", "8g").getOrCreate())

    # Register Apache Sedona (geospark) UDTs and UDFs
    GeoSparkRegistrator.registerAll(spark)

    logger.info("\t - PySpark {} session initiated: {}".format(
        spark.sparkContext.version, datetime.now() - start_time))
    start_time = datetime.now()

    # load gnaf points
    df = spark.read \
        .option("header", True) \
        .option("inferSchema", True) \
        .csv(input_file_name)
    # df.printSchema()
    # df.show()

    # # manually assign field types (not needed here as inferSchema works)
    # df2 = (df
    #        .withColumn("confidence", df.confidence.cast(t.ShortType()))
    #        .withColumn("mb_2011_code", df.mb_2011_code.cast(t.LongType()))
    #        .withColumn("mb_2016_code", df.mb_2016_code.cast(t.LongType()))
    #        .withColumn("reliability", df.reliability.cast(t.ShortType()))
    #        .withColumn("longitude", df.longitude.cast(t.DoubleType()))
    #        .withColumn("latitude", df.latitude.cast(t.DoubleType()))
    #        )
    # # df2.printSchema()
    # # df2.show()

    # add point geometries and partition by longitude into 400-500k row partitions
    gnaf_df = df.withColumn("geom", f.expr("ST_Point(longitude, latitude)"))
    # .withColumnRenamed("gnaf_pid", "id")
    # .withColumn("partition_id", (f.percent_rank().over(Window.partitionBy().orderBy("longitude")) * f.lit(100.0))
    #             .cast(t.ShortType())) \
    # .repartitionByRange(100, "partition_id") \

    # gnaf_df.printSchema()

    # check partition counts
    gnaf_df.groupBy(f.spark_partition_id()).count().show()

    # write gnaf to gzipped parquet
    export_to_parquet(gnaf_df, "gnaf")

    # export PG boundary tables to parquet
    export_bdys(spark, "commonwealth_electorates", "ce_pid")
    export_bdys(spark, "local_government_areas", "lga_pid")
    export_bdys(spark, "local_government_wards", "ward_pid")
    export_bdys(spark, "state_lower_house_electorates", "se_lower_pid")
    export_bdys(spark, "state_upper_house_electorates", "se_upper_pid")

    # cleanup
    spark.stop()

    logger.info(
        "\t - GNAF and boundaries exported to gzipped parquet files: {}".
        format(datetime.now() - start_time))
def _attach_distributed_sequence_column(sdf, column_name):
    """
    Attach a 0-based, gap-free sequence column computed in a distributed
    manner (per-partition counts + offsets instead of a global sort).

    >>> sdf = ks.DataFrame(['a', 'b', 'c']).to_spark()
    >>> sdf = InternalFrame._attach_distributed_sequence_column(sdf, column_name="sequence")
    >>> sdf.sort("sequence").show()  # doctest: +NORMALIZE_WHITESPACE
    +--------+---+
    |sequence|  0|
    +--------+---+
    |       0|  a|
    |       1|  b|
    |       2|  c|
    +--------+---+
    """
    scols = [scol_for(sdf, column) for column in sdf.columns]

    # Reserve temporary column names that do not clash with existing columns.
    spark_partition_column = verify_temp_column_name(
        sdf, "__spark_partition_id__")
    offset_column = verify_temp_column_name(sdf, "__offset__")
    row_number_column = verify_temp_column_name(sdf, "__row_number__")

    # 1. Calculates counts per each partition ID. `counts` here is, for instance,
    #     {
    #      1: 83,
    #      6: 83,
    #      3: 83,
    #      ...
    #     }
    sdf = sdf.withColumn(spark_partition_column, F.spark_partition_id())

    # Checkpoint the DataFrame to fix the partition ID.
    sdf = sdf.localCheckpoint(eager=False)

    counts = map(
        lambda x: (x["key"], x["count"]),
        sdf.groupby(
            sdf[spark_partition_column].alias("key")).count().collect(),
    )

    # 2. Calculates cumulative sum in an order of partition id.
    #    Note that it does not matter if partition id guarantees its order or not.
    #    We just need a one-by-one sequential id.

    # sort by partition key.
    sorted_counts = sorted(counts, key=lambda x: x[0])
    # get cumulative sum in an order of partition key.
    # The leading 0 makes each value an *exclusive* prefix sum: the global
    # offset at which that partition's ids start.
    cumulative_counts = [0] + list(
        accumulate(map(lambda count: count[1], sorted_counts)))
    # zip it with partition key.
    sums = dict(
        zip(map(lambda count: count[0], sorted_counts), cumulative_counts))

    # 3. Attach offset for each partition.
    @pandas_udf(LongType(), PandasUDFType.SCALAR)
    def offset(id):
        # Each batch comes from a single partition, so one lookup suffices.
        current_partition_offset = sums[id.iloc[0]]
        return pd.Series(current_partition_offset).repeat(len(id))

    sdf = sdf.withColumn(offset_column, offset(spark_partition_column))

    # 4. Calculate row_number in each partition.
    w = Window.partitionBy(spark_partition_column).orderBy(
        F.monotonically_increasing_id())
    row_number = F.row_number().over(w)
    sdf = sdf.withColumn(row_number_column, row_number)

    # 5. Calculate the index: partition offset + (1-based row number) - 1.
    return sdf.select((sdf[offset_column] + sdf[row_number_column] - 1).alias(column_name),
                      *scols)
def show_partition_id(df, col):
    """Print ``col`` alongside each row's physical partition id.

    Displays at most 1000 rows; long cell values are truncated.
    """
    with_pid = df.select(col, spark_partition_id().alias("partition_id"))
    return with_pid.show(1000, True)
def main():
    """Drive the XML-to-parquet pipeline: read XPath mappings, build a Spark
    SQL query from them, load the multi-line XML, repartition for skew, run
    the query and write the result as parquet."""
    # Read SparkDriver.properties as ConfigParser() object
    appconfig = ConfigParser()
    appconfig.read(filenames='SparkDriver.properties')
    print(f'Properties file sections: {appconfig.sections()}')
    sanityChecks(config_file=appconfig)

    # Create Spark Session object
    spark = getSparkSessionObject(appconfig=appconfig)

    # Read XPATH Mappings given in Csv
    xpaths_mapping_df: DataFrame = readFromSource(
        spark=spark,
        opt={
            'location': f'{str(appconfig["Xml"]["XpathMappingsCsvFilePath"]).strip()}',
            'filetype': 'csv',
            'header': True,
            'inferSchema': True
        })

    # Using mapper module build spark sql queries from Xpath Mappings Csv
    spark_sql_query = buildQueriesFromXpath(df=xpaths_mapping_df)
    # You can also use other way by creating External Table building DDL using function buildDdlFromXpath
    # spark_sql_ddl = buildDdlFromXpath(appconfig=appconfig, df=xpaths_mapping_df)

    # Read actual huge multi-line XML file as XmlInputFormat determine row tag, eliminate new lines so that every
    # start and end tag comes in one line in a Dataframe
    xml_df: DataFrame = mapXmlAsHadoopFile(
        location=str(appconfig['Xml']['FileLocation']))

    # Determine revised partitions for Dataframe as XML data is skew
    total_records = xml_df.count()
    total_paritions = xml_df.rdd.getNumPartitions()
    total_records_per_partition = xml_df.groupBy(
        spark_partition_id()).count().select('count').collect()
    total_executors = int(
        spark.conf.get("spark.executor.instances", default="12").strip())
    total_cores = int(
        spark.conf.get("spark.executor.cores", default="3").strip())
    # One partition per available core across the cluster.
    total_paritions_revised = total_cores * total_executors
    print(f"total_records = {total_records}")
    print(f"total_paritions = {total_paritions}")
    print(f"total_records_per_partition = {total_records_per_partition}")
    print(f"total_executors = {total_executors}")
    print(f"total_cores = {total_cores}")
    print(f"total_paritions_revised = {total_paritions_revised}")
    xml_df = xml_df.repartition(total_paritions_revised)

    # Execute the query in spark sql
    writedf: DataFrame = spark.sql(spark_sql_query['query'])
    # You can also use ddl and execute as spark SQL
    # spark.sql(spark_sql_ddl['ddl'])
    # spark.sql('LOAD DATA INPATH f"{str(appconfig['Xml']['FileLocation'])}" INTO xmltable')

    # Write data
    writedf.write.mode('overwrite').parquet(
        path=str(appconfig['Xml']['TargetWritePath']))
'uniq_key', trim( regexp_replace( regexp_replace( upper( concat_ws('\x00', coalesce('LNAME', lit('')), coalesce('Address', lit('')))), r'[^\x00\s\w]+', ''), r'\s+', ' ')))) # tweak the number of repartitioning N based on realy data size N = 5 # use dense_rank to calculate the in-partition idx w1 = Window.partitionBy('partition_id').orderBy('uniq_key') df1 = df.repartition(N, 'uniq_key') \ .withColumn('partition_id', spark_partition_id()) \ .withColumn('idx', dense_rank().over(w1)) # get number of unique rows (based on Address+LNAME) which is max_idx # and then grab the running SUM of this cnt -> rcnt # partition_id: spark partition id # idx: calculated in-partition id # cnt: number of unique ids in the same partition fmax('idx') # rcnt: starting_id for a partition(something like a running count): coalesce(fsum('cnt').over(w1),lit(0)) # w1: WindowSpec to calculate the above rcnt w2 = Window.partitionBy().orderBy('partition_id').rowsBetween( Window.unboundedPreceding, -1) df2 = df1.groupby('partition_id') \ .agg(fmax('idx').alias('cnt')) \ .withColumn('rcnt', coalesce(fsum('cnt').over(w2),lit(0)))
def apply_batch(self, func, args=(), **kwds):
    """
    Apply a function that takes pandas DataFrame and outputs pandas DataFrame.
    The pandas DataFrame given to the function is of a batch used internally.

    See also `Transform and apply a function
    <https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html>`_.

    .. note:: the `func` is unable to access to the whole input frame. Koalas
        internally splits the input series into multiple batches and calls `func` with each
        batch multiple times. Therefore, operations such as global aggregations are impossible.
        See the example below.

        >>> # This case does not return the length of whole frame but of the batch internally
        ... # used.
        ... def length(pdf) -> ks.DataFrame[int]:
        ...     return pd.DataFrame([len(pdf)])
        ...
        >>> df = ks.DataFrame({'A': range(1000)})
        >>> df.koalas.apply_batch(length)  # doctest: +SKIP
            c0
        0   83
        1   83
        2   83
        ...
        10  83
        11  83

    .. note:: this API executes the function once to infer the type which is
        potentially expensive, for instance, when the dataset is created
        after aggregations or sorting.

        To avoid this, specify return type in ``func``, for instance, as below:

        >>> def plus_one(x) -> ks.DataFrame[float, float]:
        ...     return x + 1

        If the return type is specified, the output column names become
        `c0, c1, c2 ... cn`. These names are positionally mapped to the returned
        DataFrame in ``func``.

        To specify the column names, you can assign them in a pandas friendly style as below:

        >>> def plus_one(x) -> ks.DataFrame["a": float, "b": float]:
        ...     return x + 1

        >>> pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]})
        >>> def plus_one(x) -> ks.DataFrame[zip(pdf.dtypes, pdf.columns)]:
        ...     return x + 1

    Parameters
    ----------
    func : function
        Function to apply to each pandas frame.
    args : tuple
        Positional arguments to pass to `func` in addition to the
        array/series.
    **kwds
        Additional keyword arguments to pass as keywords arguments to
        `func`.

    Returns
    -------
    DataFrame

    See Also
    --------
    DataFrame.apply: For row/columnwise operations.
    DataFrame.applymap: For elementwise operations.
    DataFrame.aggregate: Only perform aggregating type operations.
    DataFrame.transform: Only perform transforming type operations.
    Series.koalas.transform_batch: transform the search as each pandas chunks.

    Examples
    --------
    >>> df = ks.DataFrame([(1, 2), (3, 4), (5, 6)], columns=['A', 'B'])
    >>> df
       A  B
    0  1  2
    1  3  4
    2  5  6

    >>> def query_func(pdf) -> ks.DataFrame[int, int]:
    ...     return pdf.query('A == 1')
    >>> df.koalas.apply_batch(query_func)
       c0  c1
    0   1   2

    >>> def query_func(pdf) -> ks.DataFrame["A": int, "B": int]:
    ...     return pdf.query('A == 1')
    >>> df.koalas.apply_batch(query_func)
       A  B
    0  1  2

    You can also omit the type hints so Koalas infers the return schema as below:

    >>> df.koalas.apply_batch(lambda pdf: pdf.query('A == 1'))
       A  B
    0  1  2

    You can also specify extra arguments.

    >>> def calculation(pdf, y, z) -> ks.DataFrame[int, int]:
    ...     return pdf ** y + z
    >>> df.koalas.apply_batch(calculation, args=(10,), z=20)
            c0        c1
    0       21      1044
    1    59069   1048596
    2  9765645  60466196

    You can also use ``np.ufunc`` and built-in functions as input.

    >>> df.koalas.apply_batch(np.add, args=(10,))
        A   B
    0  11  12
    1  13  14
    2  15  16

    >>> (df * -1).koalas.apply_batch(abs)
       A  B
    0  1  2
    1  3  4
    2  5  6
    """
    # TODO: codes here partially duplicate `DataFrame.apply`. Can we deduplicate?

    from databricks.koalas.groupby import GroupBy
    from databricks.koalas.frame import DataFrame
    from databricks import koalas as ks

    # Built-ins / np.ufuncs are callable but not FunctionType; wrap them in a
    # plain lambda so inspect.getfullargspec below can introspect annotations.
    if not isinstance(func, types.FunctionType):
        assert callable(
            func), "the first argument should be a callable function."
        f = func
        func = lambda *args, **kwargs: f(*args, **kwargs)

    spec = inspect.getfullargspec(func)
    return_sig = spec.annotations.get("return", None)
    # No return annotation -> we must run `func` on a sample to infer the schema.
    should_infer_schema = return_sig is None
    # mapInPandas only exists on Spark >= 3.0; otherwise fall back to group-map.
    should_use_map_in_pandas = LooseVersion(pyspark.__version__) >= "3.0"

    # Bind the user-supplied extra args/kwargs so downstream code only needs a
    # single-argument callable.
    original_func = func
    func = lambda o: original_func(o, *args, **kwds)

    self_applied = DataFrame(self._kdf._internal.resolved_copy)

    if should_infer_schema:
        # Here we execute with the first 1000 to get the return type.
        # If the records were less than 1000, it uses pandas API directly for a shortcut.
        limit = ks.get_option("compute.shortcut_limit")
        pdf = self_applied.head(limit + 1)._to_internal_pandas()
        applied = func(pdf)
        if not isinstance(applied, pd.DataFrame):
            raise ValueError(
                "The given function should return a frame; however, "
                "the return type was %s." % type(applied))
        kdf = ks.DataFrame(applied)
        if len(pdf) <= limit:
            # Whole frame fit in the sample: the pandas result is already exact.
            return kdf

        return_schema = kdf._internal.to_internal_spark_frame.schema
        if should_use_map_in_pandas:
            output_func = GroupBy._make_pandas_df_builder_func(
                self_applied, func, return_schema, retain_index=True)
            sdf = self_applied._internal.to_internal_spark_frame.mapInPandas(
                lambda iterator: map(output_func, iterator),
                schema=return_schema)
        else:
            # Pre-3.0: emulate batching by grouping on the Spark partition id.
            sdf = GroupBy._spark_group_map_apply(
                self_applied,
                func, (F.spark_partition_id(), ),
                return_schema,
                retain_index=True)

        # If schema is inferred, we can restore indexes too.
        internal = kdf._internal.with_new_sdf(sdf)
    else:
        return_type = infer_return_type(original_func)
        return_schema = return_type.tpe
        is_return_dataframe = isinstance(return_type, DataFrameType)
        if not is_return_dataframe:
            raise TypeError(
                "The given function should specify a frame as its type "
                "hints; however, the return type was %s." % return_sig)
        if should_use_map_in_pandas:
            output_func = GroupBy._make_pandas_df_builder_func(
                self_applied, func, return_schema, retain_index=False)
            sdf = self_applied._internal.to_internal_spark_frame.mapInPandas(
                lambda iterator: map(output_func, iterator),
                schema=return_schema)
        else:
            # Pre-3.0: emulate batching by grouping on the Spark partition id.
            sdf = GroupBy._spark_group_map_apply(
                self_applied,
                func, (F.spark_partition_id(), ),
                return_schema,
                retain_index=False)

        # Otherwise, it loses index.
        internal = InternalFrame(spark_frame=sdf, index_map=None)
    return DataFrame(internal)
# --------------------------------------------------------------------------------------------------
# Load the dummy CSV with an explicit schema (no inference pass needed).
schema = StructType([
    StructField("id", IntegerType()),
    StructField("type", StringType()),
    StructField("quantity", IntegerType())
])
df = spark.read.option('header', 'true').csv(data_dir + 'dummy', schema=schema)

# --------------------------------------------------------------------------------------------------
# Partition
# TODO: foreachPartition
# TODO: sortWithinPartitions
df.rdd.getNumPartitions()
# Show which Spark partition each row lives in.
df.select('*', F.spark_partition_id().alias("pid")).show()

df.where(df.type == 'apple').show()  # where = filter

# Select
df.select('*', 'id').show()
# NOTE(fix): use the `F` alias consistently for pyspark.sql.functions
# (the snippet previously mixed lowercase `f.` with `F.`), and correct the
# alias typo 'test_foramt' -> 'test_format'.
df.select(F.concat(df.id, F.lit('-'), df.type).alias('s')).show()
df.select(F.format_string('%d-%s', df.id, df.type).alias('test_format')).show()
df.select(df.id.cast(StringType())).printSchema()
# Bucket quantity into High / Medium / Low; `.name()` is an alias for `.alias()`.
df.select(
    "*",
    F.when(df.quantity > 50, 'High').when(df.quantity < 30,
                                          'Low').otherwise('Medium').name('Q')).show()

# Filter
print("Total No of Rows: ", tot_Cnt)
unq_Cnt = userDF.drop_duplicates().count()
print("Unique No of Rows: ", unq_Cnt)

# COMMAND ----------

# MAGIC %md ##### Check for Data Skewness

# COMMAND ----------

from pyspark.sql.functions import spark_partition_id

# get no of partitions assigned by Spark on read
implictPart = userDF.rdd.getNumPartitions()
print("Implict no of partitions:", implictPart)

# get each partition size: rows per Spark partition, ordered by partition id
partitions = userDF.withColumn("Partition_id", spark_partition_id()).groupBy(
    "Partition_id").count().orderBy("Partition_id")
# partitions = userDF.withColumn("Partition_id", spark_partition_id())
# distPartitions = partitions.select("partition_id").distinct()
display(partitions)

# COMMAND ----------

# MAGIC %md Here we can see the data are skewed, resulting in smaller partition sizes.

# COMMAND ----------

# MAGIC %md ##### Repartitioning the data to avoid data skewness

# COMMAND ----------
def attach_default_index(sdf):
    """
    This method attaches a default index to Spark DataFrame. Spark does not have the index
    notion so corresponding column should be generated. There are three types of default
    index that can be controlled by `DEFAULT_INDEX` environment variable.

    - one-by-one: It implements an one-by-one sequence by Window function without
        specifying partition. Therefore, it ends up with whole partition in single node.
        This index type should be avoided when the data is large. This is default.

    - distributed-one-by-one: It implements an one-by-one sequence by group-by and
        group-map approach. It still generates a one-by-one sequential index globally.
        If the default index must be an one-by-one sequence in a large dataset, this
        index has to be used.
        Note that if more data are added to the data source after creating this index,
        then it does not guarantee the sequential index.

    - distributed: It implements a monotonically increasing sequence simply by using
        Spark's `monotonically_increasing_id` function. If the index does not have to be
        a one-by-one sequence, this index should be used. Performance-wise, this index
        almost does not have any penalty comparing to other index types. Note that we
        cannot use this type of index for combining two dataframes because it is not
        guaranteed to have the same indexes in two dataframes.
    """
    default_index_type = os.environ.get("DEFAULT_INDEX", "one-by-one")
    if default_index_type == "one-by-one":
        # Global row_number over a single (unpartitioned) window: correct but
        # funnels all data through one node.
        sequential_index = F.row_number().over(
            Window.orderBy(F.monotonically_increasing_id().asc())) - 1
        scols = [scol_for(sdf, column) for column in sdf.columns]
        return sdf.select(sequential_index.alias("__index_level_0__"), *scols)
    elif default_index_type == "distributed-one-by-one":
        # 1. Calculates counts per each partition ID. `counts` here is, for instance,
        #     {
        #         1: 83,
        #         6: 83,
        #         3: 83,
        #         ...
        #     }
        counts = map(
            lambda x: (x["key"], x["count"]),
            sdf.groupby(
                F.spark_partition_id().alias("key")).count().collect())

        # 2. Calculates cumulative sum in an order of partition id.
        #    Note that it does not matter if partition id guarantees its order or not.
        #    We just need a one-by-one sequential id.

        # sort by partition key.
        sorted_counts = sorted(counts, key=lambda x: x[0])
        # get cumulative sum in an order of partition key.
        cumulative_counts = accumulate(
            map(lambda count: count[1], sorted_counts))
        # zip it with partition key.
        sums = dict(
            zip(map(lambda count: count[0], sorted_counts),
                cumulative_counts))

        # 3. Group by partition id and assign each range.
        def default_index(pdf):
            # Each pandas group holds exactly one partition id, so the first
            # row's id identifies this group's slot in `sums`.
            current_partition_max = sums[
                pdf["__spark_partition_id"].iloc[0]]
            offset = len(pdf)
            pdf["__index_level_0__"] = list(
                range(current_partition_max - offset, current_partition_max))
            return pdf.drop(columns=["__spark_partition_id"])

        return_schema = StructType(
            [StructField("__index_level_0__", LongType())] + list(sdf.schema))
        grouped_map_func = pandas_udf(
            return_schema, PandasUDFType.GROUPED_MAP)(default_index)

        sdf = sdf.withColumn("__spark_partition_id", F.spark_partition_id())
        return sdf.groupBy("__spark_partition_id").apply(grouped_map_func)
    elif default_index_type == "distributed":
        # Cheap but non-sequential: ids are monotonically increasing per
        # partition, not contiguous across partitions.
        scols = [scol_for(sdf, column) for column in sdf.columns]
        return sdf.select(
            F.monotonically_increasing_id().alias("__index_level_0__"),
            *scols)
    else:
        raise ValueError(
            "'DEFAULT_INDEX' environment variable should be one of 'one-by-one',"
            " 'distributed-one-by-one' and 'distributed'")