def check_compatibility(df, old_df, format_output, partition_col=None):
    """
    Check if `df` and `old_df` have compatible schemas.

    Only the check for `parquet` has been implemented, due to field ordering

    :param df: A Spark dataframe, the most recent one
    :param old_df: A Spark dataframe, the oldest
    :param str format_output: Which output format the data will be written to
    :param str partition_col: The way old data is partitioned
    :return: Whether the schemas are equal and compatible
    :rtype: tuple[bool, bool]
    """
    schema_equal = schema.are_schemas_equal(df, old_df, partition_col=partition_col)

    # Identical schemas are trivially compatible — nothing more to check.
    if schema_equal:
        return True, True

    # Schema evolution is only supported for parquet output.
    if format_output != 'parquet':
        raise NotImplementedError("Only `parquet` schema evolution is supported")

    # Compare the JSON representations, ignoring the partition column that
    # only exists in the previously written data.
    schema_compatible = schema.are_schemas_compatible(
        df.schema.jsonValue(),
        old_df.schema.jsonValue(),
        remove_from_old=partition_col,
    )
    return schema_equal, schema_compatible
# Example #2
def test_compare_fields_added_not_nullable_field(sample_df):
    """Adding a non-nullable field to the new schema must be incompatible."""
    serialized = sample_df.schema.json()
    old_schema = json.loads(serialized)
    new_schema = json.loads(serialized)
    extra_field = {'metadata': {}, 'name': 'new_field',
                   'nullable': False, 'type': 'string'}
    new_schema['fields'].append(extra_field)
    assert not schema.are_schemas_compatible(new_schema, old_schema)
# Example #3
def test_compare_fields_remove_from_old(sample_df):
    """A field only present in the old schema is ignored via `remove_from_old`."""
    serialized = sample_df.schema.json()
    old_schema = json.loads(serialized)
    new_schema = json.loads(serialized)
    partition_field = {'metadata': {}, 'name': 'dt',
                       'nullable': False, 'type': 'string'}
    old_schema['fields'].append(partition_field)
    assert schema.are_schemas_compatible(new_schema, old_schema, remove_from_old='dt')
# Example #4
def test_compare_fields_different_complex_field_type(sample_df):
    """Changing a complex field's container type makes the schemas incompatible."""
    serialized = sample_df.schema.json()
    old_schema = json.loads(serialized)
    new_schema = json.loads(serialized)
    # Mutate the first field's nested type descriptor (e.g. struct -> array).
    new_schema['fields'][0]['type']['type'] = 'array'
    assert not schema.are_schemas_compatible(new_schema, old_schema)