Example #1
def test_write_without_set_schema():
    dataiku_source = DataikuSource()
    dataset_name = "fixtures"
    s3_dir = "s3://birgittatestbucket/sourcetests"
    fixtures_mock = MagicMock()
    catalog = Catalog()
    catalog.add_field('fooint', description='Foo int', example=39)
    schema = Schema([['fooint', 'bigint']], catalog)
    dataframe.write(fixtures_mock,
                    dataset_name,
                    prefix=s3_dir,
                    schema=schema,
                    skip_cast=True,
                    set_schema=False,
                    dataframe_source=dataiku_source)
    # dapi_dataset_mock is assumed to be a module-level mock of the Dataiku
    # dataset API, set up outside this snippet
    dapi_dataset_mock.set_schema.assert_not_called()
Example #2
def test_equal(spark_session, fixtures, expected):
    s3_source = S3Source(format='parquet')
    dataset_name = "fixtures"
    s3_dir = "s3://birgittatestbucket/sourcetests"
    fixtures_mock = MagicMock()
    fixtures_mock.write.format().mode().save.return_value = None
    dataframe.write(fixtures_mock,
                    dataset_name,
                    prefix=s3_dir,
                    dataframe_source=s3_source)
    spark_session_mock = MagicMock()
    spark_session_mock.read.format().load.return_value = fixtures
    out_df = dataframe.get(spark_session_mock,
                           dataset_name,
                           prefix=s3_dir,
                           dataframe_source=s3_source)
    assert not dfdiff.diff(out_df, expected)
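The fixtures and expected arguments of test_equal are pytest fixtures that are not shown on this page. A minimal sketch of what they could look like, assuming a spark_session fixture and a single bigint column (names and data are purely illustrative):

import pytest


@pytest.fixture()
def fixtures(spark_session):
    # One-row dataframe matching the 'fooint' field used in the other examples
    return spark_session.createDataFrame([(39,)], ['fooint'])


@pytest.fixture()
def expected(spark_session):
    # Identical content, so dfdiff.diff() should report no difference
    return spark_session.createDataFrame([(39,)], ['fooint'])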
Example #3
def test_write():
    # dapi_dataset_mock is assumed to be set up at module level, e.g.:
    # dapi_dataset_mock = mock.MagicMock()
    # project_obj_mock.get_dataset.return_value = dapi_dataset_mock
    dataiku_source = DataikuSource()
    dataset_name = "fixtures"
    s3_dir = "s3://birgittatestbucket/sourcetests"
    fixtures_mock = MagicMock()
    catalog = Catalog()
    catalog.add_field('fooint', description='Foo int', example=39)
    schema = Schema([['fooint', 'bigint']], catalog)
    dataframe.write(fixtures_mock,
                    dataset_name,
                    prefix=s3_dir,
                    schema=schema,
                    skip_cast=True,
                    dataframe_source=dataiku_source)
    dataiku_schema = dkuschema.to_dataiku(schema)
    dapi_dataset_mock.set_schema.assert_called_once_with(dataiku_schema)
Example #4
def assert_write(
        dataframe_source,
        spark_session,
        df,
        expected_df,  # noqa F401
        dataset_name,
        schema,
        error_pattern=None):
    ds_name = "schema_assert_write_" + dataset_name
    if not error_pattern:
        dataframe.write(df,
                        ds_name,
                        schema=schema,
                        dataframe_source=dataframe_source)
    else:
        with pytest.raises(SchemaError) as e_info:
            dataframe.write(df,
                            ds_name,
                            schema=schema,
                            dataframe_source=dataframe_source)
        cleaned_err_val = str(e_info.value).replace("\n", " ").replace("'", "")
        assert re.match(error_pattern, cleaned_err_val)
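For reference, a hedged sketch of how a test might call the assert_write helper above, assuming spark_session and dataframe_source pytest fixtures exist (both are assumptions here; the field, data and dataset name are illustrative):

def test_schema_write_ok(spark_session, dataframe_source):
    catalog = Catalog()
    catalog.add_field('fooint', description='Foo int', example=39)
    schema = Schema([['fooint', 'bigint']], catalog)
    df = spark_session.createDataFrame([(39,)], ['fooint'])
    # No error_pattern: the helper simply expects the write to succeed
    assert_write(dataframe_source,
                 spark_session,
                 df,
                 df,  # expected_df is accepted but unused by the helper
                 'ok',
                 schema)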
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import warnings

from birgitta import spark as bspark
from birgitta.dataframe import dataframe
from newsltd_etl.projects.chronicle.datasets.contract_data import dataset as ds_contract_data  # noqa 501
from newsltd_etl.projects.chronicle.datasets.contracts import dataset as ds_contracts  # noqa 501
from pyspark.sql import functions as F
warnings.filterwarnings('ignore')  # suppress python warnings

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
spark_session = bspark.session()
contract_data_df = dataframe.get(spark_session, ds_contract_data.name)

with_flag = contract_data_df.withColumn('current_flag', F.lit(1))

to_output_df = with_flag.select(
    F.col('customerid').alias('customer_id'),
    F.concat(F.lit('G47'), F.col('cellphone')).alias('phone'),
    F.col('accountid').alias('chronicle_account_id'),
    F.col('groupid').alias('group_account_id'), F.col('priceplan_code'),
    F.col('startdate_yyyymmdd').cast('date').alias('start_date'),
    F.col('enddate_yyyymmdd').cast('date').alias('end_date'),
    F.col('current_flag'),
    F.col('status').alias('client_status_code'))

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write recipe outputs
dataframe.write(to_output_df, ds_contracts.name, schema=ds_contracts.schema)
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Get or create the Spark session (sets up SparkContext and SQLContext)
spark_session = bspark.session()
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
contracts = dataframe.get(spark_session,
                          ds_contracts.name,
                          cast_binary_to_str=True)
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: MARKDOWN
# ## Convert timestamps to dates

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE

contractsf = contracts.filter(contracts.brand_code != 44) \
        .withColumnRenamed("contract_prod_code", "product_code") \
        .withColumn("start_date", F.col("start_date").cast('date')) \
        .withColumn("end_date", F.col("end_date").cast('date'))

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: MARKDOWN
# ## Add product category

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
## Add "product_category"
contractsf = contractsf.withColumn("product_category", F.lit("contract"))
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: MARKDOWN
# ## Write to output data

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# ds_filtered_contracts is a dataset module assumed to be imported in a cell
# that is not shown here
dataframe.write(contractsf,
                ds_filtered_contracts.name,
                schema=ds_filtered_contracts.schema)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Adjust end_date to be equal to today, if it is after today.
# today_date is assumed to be defined in an earlier cell that is not shown here.
end_when = F.when(F.col("end_date") > today_date,
                  today_date) \
            .otherwise(F.col("end_date"))
contracts = contracts.withColumn('end_date_adj', end_when)

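# -------------------------------------------------------------------------------- NOTEBOOK-CELL: MARKDOWN
# The join below also references contracts.start_date_adj, which is not
# computed in the cells shown here. A minimal sketch of one plausible
# adjustment, mirroring the end_date clipping above and assuming a lower
# bound earliest_date defined in an earlier cell (both the name and the
# clipping rule are assumptions, not the original recipe):

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Hypothetical: clip start_date so it never falls before earliest_date
start_when = F.when(F.col("start_date") < earliest_date,
                    earliest_date) \
              .otherwise(F.col("start_date"))
contracts = contracts.withColumn('start_date_adj', start_when)
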
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: MARKDOWN
# # Join with date_dimension, thereby splitting into daily granularity
# Each contract row becomes one row per day between start_date_adj and end_date_adj.

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# datedim is assumed to be a date dimension dataframe loaded in an earlier
# cell that is not shown here
t_dim_days = datedim.select(F.col('datetimestamp_parsed').alias("datestamp"),
                            "day_in_week") \
               .withColumn("datestamp", F.col("datestamp").cast('date'))

join_conditions = [(contracts.start_date_adj <= t_dim_days.datestamp)
                   & (contracts.end_date_adj >= t_dim_days.datestamp)]

contracts_daily = contracts.join(
    t_dim_days, join_conditions,
    'left').drop('start_date_adj').drop('end_date_adj')

# Debug contracts_daily with our utility functions (the dfdbg import is not
# shown in this snippet)
dfdbg.profile(contracts_daily, "contracts_name")

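# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# If the dfdbg utility is unavailable, plain-Spark sanity checks are a rough
# substitute (a sketch; printSchema, count and show are standard pyspark
# DataFrame methods)
contracts_daily.printSchema()
print("contracts_daily rows:", contracts_daily.count())
contracts_daily.show(5, truncate=False)
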
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write recipe outputs (the ds_daily_contract_states dataset module is
# assumed to be imported in a cell not shown here)
dataframe.write(contracts_daily,
                ds_daily_contract_states.name,
                schema=ds_daily_contract_states.schema)