def test_write_without_set_schema(dapi_dataset_mock):
    # dapi_dataset_mock is assumed to come from a fixture patching the Dataiku
    # dataset API (see the conftest sketch after test_write below)
    dataiku_source = DataikuSource()
    dataset_name = "fixtures"
    s3_dir = "s3://birgittatestbucket/sourcetests"
    fixtures_mock = MagicMock()
    catalog = Catalog()
    catalog.add_field('fooint', description='Foo int', example=39)
    schema = Schema([['fooint', 'bigint']], catalog)
    dataframe.write(fixtures_mock,
                    dataset_name,
                    prefix=s3_dir,
                    schema=schema,
                    skip_cast=True,
                    set_schema=False,
                    dataframe_source=dataiku_source)
    # With set_schema=False the Dataiku dataset schema must not be updated
    dapi_dataset_mock.set_schema.assert_not_called()

def test_equal(spark_session, fixtures, expected):
    s3_source = S3Source(format='parquet')
    dataset_name = "fixtures"
    s3_dir = "s3://birgittatestbucket/sourcetests"
    fixtures_mock = MagicMock()
    fixtures_mock.write.format().mode().save.return_value = None
    dataframe.write(fixtures_mock,
                    dataset_name,
                    prefix=s3_dir,
                    dataframe_source=s3_source)
    spark_session_mock = MagicMock()
    spark_session_mock.read.format().load.return_value = fixtures
    out_df = dataframe.get(spark_session_mock,
                           dataset_name,
                           prefix=s3_dir,
                           dataframe_source=s3_source)
    assert not dfdiff.diff(out_df, expected)

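# spark_session, fixtures and expected above are pytest fixtures. A minimal
# sketch of how they could be defined, assuming birgitta provides a local
# Spark session helper (the fixture names and example rows below are
# illustrative assumptions, not taken from this file):
import pytest
from birgitta import spark as bspark


@pytest.fixture(scope="session")
def spark_session():
    # Assumption: birgitta exposes a local test session helper
    return bspark.local_session()


@pytest.fixture()
def fixtures(spark_session):
    return spark_session.createDataFrame([(39,)], ['fooint'])


@pytest.fixture()
def expected(fixtures):
    # test_equal round-trips the same data, so expected mirrors the input
    return fixtures
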
def test_write(dapi_dataset_mock):
    # dapi_dataset_mock is assumed to be a fixture wired so that the patched
    # Dataiku project object returns it, i.e.
    # project_obj_mock.get_dataset.return_value = dapi_dataset_mock
    dataiku_source = DataikuSource()
    dataset_name = "fixtures"
    s3_dir = "s3://birgittatestbucket/sourcetests"
    fixtures_mock = MagicMock()
    catalog = Catalog()
    catalog.add_field('fooint', description='Foo int', example=39)
    schema = Schema([['fooint', 'bigint']], catalog)
    dataframe.write(fixtures_mock,
                    dataset_name,
                    prefix=s3_dir,
                    schema=schema,
                    skip_cast=True,
                    dataframe_source=dataiku_source)
    # Writing through the Dataiku source should set the converted schema once
    dataiku_schema = dkuschema.to_dataiku(schema)
    dapi_dataset_mock.set_schema.assert_called_once_with(dataiku_schema)

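# The two Dataiku tests above receive a dapi_dataset_mock argument without
# defining it, so it must be supplied by a fixture. A minimal conftest sketch,
# assuming DataikuSource resolves its dataset handle through a patchable
# dataiku client (the patch target below is an assumption about birgitta's
# internals, not confirmed by this file):
import pytest
from unittest import mock
from unittest.mock import MagicMock


@pytest.fixture()
def dapi_dataset_mock():
    dataset_mock = MagicMock()
    project_obj_mock = MagicMock()
    project_obj_mock.get_dataset.return_value = dataset_mock
    client_mock = MagicMock()
    client_mock.get_project.return_value = project_obj_mock
    # Hypothetical patch target; point it at wherever DataikuSource imports dataiku
    with mock.patch('birgitta.dataframesource.sources.dataiku.dataiku') as dataiku_mock:
        dataiku_mock.api_client.return_value = client_mock
        yield dataset_mock
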
def assert_write(dataframe_source,
                 spark_session,
                 df,
                 expected_df,  # noqa F401
                 dataset_name,
                 schema,
                 error_pattern=None):
    ds_name = "schema_assert_write_" + dataset_name
    if not error_pattern:
        dataframe.write(df,
                        ds_name,
                        schema=schema,
                        dataframe_source=dataframe_source)
    else:
        with pytest.raises(SchemaError) as e_info:
            dataframe.write(df,
                            ds_name,
                            schema=schema,
                            dataframe_source=dataframe_source)
        cleaned_err_val = str(e_info.value).replace("\n", " ").replace("'", "")
        assert re.match(error_pattern, cleaned_err_val)

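# Example use of assert_write (all names and the regex are illustrative):
# verify that writing a dataframe against a non-matching schema raises a
# SchemaError whose cleaned message matches error_pattern.
def test_write_schema_mismatch(spark_session, local_source, fixture_df,
                               mismatch_schema):
    # local_source, fixture_df and mismatch_schema are hypothetical fixtures
    assert_write(local_source,
                 spark_session,
                 fixture_df,
                 fixture_df,
                 "mismatch",
                 mismatch_schema,
                 error_pattern=r".*mismatch.*")
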
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import warnings

from birgitta import spark as bspark
from birgitta.dataframe import dataframe
from newsltd_etl.projects.chronicle.datasets.contract_data import dataset as ds_contract_data  # noqa 501
from newsltd_etl.projects.chronicle.datasets.contracts import dataset as ds_contracts  # noqa 501
from pyspark.sql import functions as F

warnings.filterwarnings('ignore')  # suppress python warnings

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
spark_session = bspark.session()

contract_data_df = dataframe.get(spark_session, ds_contract_data.name)
with_flag = contract_data_df.withColumn('current_flag', F.lit(1))
to_output_df = with_flag.select(
    F.col('customerid').alias('customer_id'),
    F.concat(F.lit('G47'), F.col('cellphone')).alias('phone'),
    F.col('accountid').alias('chronicle_account_id'),
    F.col('groupid').alias('group_account_id'),
    F.col('priceplan_code'),
    F.col('startdate_yyyymmdd').cast('date').alias('start_date'),
    F.col('enddate_yyyymmdd').cast('date').alias('end_date'),
    F.col('current_flag'),
    F.col('status').alias('client_status_code'))

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write recipe outputs
dataframe.write(to_output_df, ds_contracts.name, schema=ds_contracts.schema)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Imports (assumed to mirror the sibling recipe above; the exact dataset
# module paths for ds_contracts and ds_filtered_contracts are project-specific
# and not shown in this excerpt)
from birgitta import spark as bspark
from birgitta.dataframe import dataframe
from pyspark.sql import functions as F

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Get or create the Spark context and set up the SQL context
spark_session = bspark.session()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
contracts = dataframe.get(spark_session,
                          ds_contracts.name,
                          cast_binary_to_str=True)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: MARKDOWN
# ## Convert timestamps to dates

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
contractsf = contracts.filter(contracts.brand_code != 44) \
    .withColumnRenamed("contract_prod_code", "product_code") \
    .withColumn("start_date", F.col("start_date").cast('date')) \
    .withColumn("end_date", F.col("end_date").cast('date'))

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: MARKDOWN
# ## Add product category

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Add constant "product_category" column
contractsf = contractsf.withColumn("product_category", F.lit("contract"))

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: MARKDOWN
# ## Write to output data

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
dataframe.write(contractsf,
                ds_filtered_contracts.name,
                schema=ds_filtered_contracts.schema)

# Adjust end_date to be equal to today, if it is after today
end_when = f.when(f.col("end_date") > today_date, today_date) \
    .otherwise(f.col("end_date"))
contracts = contracts.withColumn('end_date_adj', end_when)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: MARKDOWN
# # Join with date_dimension, thereby splitting into daily granularity
# This splits each contract period into one row per day.

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
t_dim_days = datedim.select(f.col('datetimestamp_parsed').alias("datestamp"),
                            "day_in_week") \
    .withColumn("datestamp", f.col("datestamp").cast('date'))
join_conditions = [
    (contracts.start_date_adj <= t_dim_days.datestamp) &
    (contracts.end_date_adj >= t_dim_days.datestamp)
]
contracts_daily = contracts.join(
    t_dim_days,
    join_conditions,
    'left').drop('start_date_adj').drop('end_date_adj')
# Debug contracts_daily with our utility functions
dfdbg.profile(contracts_daily, "contracts_name")

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write recipe outputs
dataframe.write(contracts_daily,
                ds_daily_contract_states.name,
                schema=ds_daily_contract_states.schema)
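
# Illustration of the join semantics above (not part of the recipe): the range
# join matches every datestamp between start_date_adj and end_date_adj
# inclusive, so a single contract row spanning three days becomes three rows
# in contracts_daily:
#
#   start_date_adj  end_date_adj        datestamp
#   2020-01-01      2020-01-03    ->    2020-01-01
#                                       2020-01-02
#                                       2020-01-03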