def create_dataframe(df_metadata: DataFrameMetadata):
    """Materialize an empty parquet dataset for *df_metadata*.

    Creates an empty DataFrame with the metadata's PySpark schema and
    writes it (overwriting any existing data) to the metadata's file URL,
    inside a petastorm ``materialize_dataset`` context so the petastorm
    schema is recorded alongside the parquet output.

    Args:
        df_metadata: metadata object providing ``file_url`` and a
            ``schema`` with ``petastorm_schema`` / ``pyspark_schema``.
    """
    spark = Session().get_session()
    spark_context = Session().get_context()

    # An empty RDD gives us a zero-row DataFrame with the right schema.
    empty_rdd = spark_context.emptyRDD()

    # Fixed: removed leftover debug print of the file URL.
    # Use petastorm to create the dataframe
    with materialize_dataset(spark,
                             df_metadata.file_url,
                             df_metadata.schema.petastorm_schema):
        spark.createDataFrame(empty_rdd,
                              df_metadata.schema.pyspark_schema) \
            .coalesce(1) \
            .write \
            .mode('overwrite') \
            .parquet(df_metadata.file_url)
def append_rows(df_metadata: DataFrameMetadata, rows):
    """Append *rows* to the parquet dataset described by *df_metadata*.

    Each row dict is converted to a Spark row via petastorm's
    ``dict_to_spark_row`` and appended to the dataset at the metadata's
    file URL, inside a ``materialize_dataset`` context.

    Args:
        df_metadata: metadata providing ``file_url`` and a ``schema``
            with ``petastorm_schema`` / ``pyspark_schema``.
        rows: iterable of row dicts matching the petastorm schema.
    """
    session = Session()
    spark = session.get_session()
    spark_context = session.get_context()

    petastorm_schema = df_metadata.schema.petastorm_schema

    # Use petastorm to append the rows
    with materialize_dataset(spark, df_metadata.file_url, petastorm_schema):
        # Distribute the row dicts and convert each into a Spark row.
        converted = spark_context.parallelize(rows).map(
            lambda row: dict_to_spark_row(petastorm_schema, row))

        dataframe = spark.createDataFrame(
            converted, df_metadata.schema.pyspark_schema)
        dataframe.coalesce(1) \
            .write \
            .mode('append') \
            .parquet(df_metadata.file_url)
def append_rows(df_metadata: DataFrameMetadata, rows):
    """Append *rows* to the parquet dataset described by *df_metadata*.

    Builds a DataFrame from *rows* using the metadata's PySpark schema
    and appends it to the dataset at the metadata's file URL, inside a
    petastorm ``materialize_dataset`` context.

    NOTE(review): this redefines ``append_rows`` and shadows the earlier
    definition in this module — confirm which variant is intended and
    remove the dead one.

    Args:
        df_metadata: metadata exposing ``get_dataframe_file_url()``,
            ``get_dataframe_pyspark_schema()`` and
            ``get_dataframe_petastorm_schema()``.
        rows: data accepted by ``spark.createDataFrame`` (e.g. a list of
            Row objects or tuples matching the schema).
    """
    spark = Session().get_session()

    # Fixed: the original built a DataFrame, extracted its .rdd, then
    # rebuilt a second DataFrame from that RDD with the same schema —
    # a redundant round-trip. Build the DataFrame once and write it.
    rows_df = spark.createDataFrame(
        rows, df_metadata.get_dataframe_pyspark_schema())

    # Use petastorm to append the rows
    with materialize_dataset(spark,
                             df_metadata.get_dataframe_file_url(),
                             df_metadata.get_dataframe_petastorm_schema()):
        rows_df.coalesce(1) \
            .write \
            .mode('append') \
            .parquet(df_metadata.get_dataframe_file_url())