def upload_as_parquet(df) -> Dataset:
    """Persist *df* as a parquet-backed table under a random temporary name.

    A 10-character random suffix is appended to ``tmp_parquet_`` to form the
    table name. The write uses ``errorifexists`` mode, so it raises if that
    name is already taken.

    Returns:
        Dataset: wraps the URL of the saved parquet table.
    """
    table_name = f"tmp_parquet_{rand_string(length=10)}"

    # Chained writer call: fail rather than overwrite on a name collision.
    writer = df.write.mode("errorifexists").format("parquet")
    writer.saveAsTable(table_name)

    parquet_url = get_table_url(table_name)
    logger.info(f"Saved parquet to {parquet_url}")
    return Dataset(parquet_url=parquet_url)
def upload_as_parquet(df) -> Dataset:
    """Persist *df* as a parquet-backed table under a random temporary name.

    Tries up to ``MAX_UPLOAD_PARQUET_TRIES`` random names of the form
    ``tmp_parquet_<suffix>`` until one does not already exist in the catalog,
    then writes the DataFrame there.

    Returns:
        Dataset: wraps the URL of the saved parquet table.

    Raises:
        RuntimeError: if no unused table name is found within the try budget.
            (RuntimeError subclasses Exception, so existing broad handlers
            still catch it.)
    """
    sqlCtx = get_spark_session()

    # for/else: the else branch runs only if no candidate name was free.
    for _ in range(MAX_UPLOAD_PARQUET_TRIES):
        suffix = rand_string(length=UPLOAD_PARQUET_TMP_SUFFIX_LEN)
        rand_name = f"tmp_parquet_{suffix}"
        # NOTE(review): reaches into the private JVM handle; the public
        # spark.catalog.tableExists() is available on Spark >= 3.3 — confirm
        # the project's Spark version before switching.
        if not sqlCtx.catalog._jcatalog.tableExists(rand_name):
            break
    else:
        raise RuntimeError(
            f"Failed to find name after {MAX_UPLOAD_PARQUET_TRIES} tries."
        )

    # errorifexists guards against a race where the name was taken between
    # the existence check above and this write.
    df.write.mode("errorifexists").format("parquet").saveAsTable(rand_name)
    parquet_url = get_table_url(rand_name)
    logger.info(f"Saved parquet to {parquet_url}")
    return Dataset(parquet_url=parquet_url)