def _write_dataframe_to_s3(config, logger, df: pyspark.sql.DataFrame, df_name: str) -> None:
    """
    Upload a PySpark DataFrame to Amazon S3 as a CSV object.

    The DataFrame is collected into a Pandas DataFrame, serialised to CSV in
    an in-memory text buffer (comma separated, index omitted) and stored under
    the key ``<df_name>.csv`` in the bucket named by
    ``config['S3']['BUCKET_NAME']``. A message is emitted through
    ``logger.warn`` before and after the upload.

    :param config: Mapping providing ``['AWS']['AWS_ACCESS_KEY_ID']``,
        ``['AWS']['AWS_SECRET_ACCESS_KEY']`` and ``['S3']['BUCKET_NAME']``.
    :param logger: Logger object exposing a ``warn`` method.
    :param df: The DataFrame to write out.
    :param df_name: Name used in the log messages and as the stem of the
        S3 object key.
    """
    logger.warn(f'About to write dataframe: {df_name} as CSV to S3')

    # Collect the Spark data into a single Pandas frame.
    pandas_frame = df.toPandas()

    # Build the S3 resource handle from the configured credentials.
    aws_section = config['AWS']
    s3_resource = boto3.resource(
        's3',
        aws_access_key_id=aws_section['AWS_ACCESS_KEY_ID'],
        aws_secret_access_key=aws_section['AWS_SECRET_ACCESS_KEY'],
    )

    # Render the frame as CSV text into an in-memory buffer (nothing is
    # written to local disk).
    csv_buffer = StringIO()
    pandas_frame.to_csv(csv_buffer, sep=',', index=False)

    # Upload the rendered CSV text as the body of the target object.
    bucket_name = config['S3']['BUCKET_NAME']
    object_key = f'{df_name}.csv'
    target_object = s3_resource.Object(bucket_name, object_key)
    target_object.put(Body=csv_buffer.getvalue())

    logger.warn(f'Finished writing dataframe: {df_name} as CSV to S3')
def assert_test_dfs_equal(expected_df: pyspark.sql.DataFrame,
                          generated_df: pyspark.sql.DataFrame) -> None:
    """
    Used to compare two dataframes (typically, in a unit test).

    Better than the direct df1.equals(df2) method, as this function allows
    for tolerances in the floating point columns, and is also more
    descriptive with which parts of the two dataframes are in disagreement.

    Both dataframes are converted to Pandas and compared column by column:

    * columns whose Pandas dtype is ``object`` must be exactly equal;
    * every other column must have nulls in the same rows on both sides,
      and its non-null values must agree according to ``np.allclose``
      (called with NumPy's default tolerances).

    :param expected_df: First dataframe to compare
    :param generated_df: Second dataframe to compare
    :raises Exception: If either dataframe has more than 10000 rows.
    :raises AssertionError: If the row counts, the column names or the
        contents of any column differ.
    :raises ValueError: Re-raised (after being logged) if ``np.allclose``
        cannot compare a column.
    """
    row_limit = 10000

    e_count = expected_df.count()
    g_count = generated_df.count()

    # Both frames are collected to the driver via toPandas(), so refuse
    # anything large before doing so.
    if (e_count > row_limit) or (g_count > row_limit):
        raise Exception(
            f"One or both of the dataframes passed has too many rows (>{row_limit}). "
            f"Please limit your test sizes to be lower than this number.")

    assert e_count == g_count, "The dataframes have a different number of rows."

    expected_pdf = expected_df.toPandas()
    generated_pdf = generated_df.toPandas()

    assert list(expected_pdf.columns) == list(generated_pdf.columns), \
        "The two dataframes have different columns."

    for col in expected_pdf.columns:
        error_msg = f"The columns with name: `{col}` were not equal."

        if expected_pdf[col].dtype.type == np.object_:
            assert expected_pdf[[col]].equals(generated_pdf[[col]]), error_msg
            continue

        expected_col = expected_pdf[col]
        generated_col = generated_pdf[col]

        # Numpy will not equate nulls on both sides, so they are handled
        # separately: the nulls must sit in the same rows of both frames.
        # The masks are computed per column and the frames themselves are
        # left untouched, so a null in one column never removes that row
        # from the comparison of the remaining columns.
        expected_present = expected_col.notnull().values
        generated_present = generated_col.notnull().values
        assert np.array_equal(expected_present, generated_present), error_msg

        try:
            is_close = np.allclose(expected_col.values[expected_present],
                                   generated_col.values[generated_present])
        except ValueError:
            logging.error(
                f"Problem encountered while equating column '{col}'.")
            raise

        assert is_close, error_msg