Example #1
def split_types(context, raw_df: SparkDF):
    expected_df = context.resources.pyspark.spark_session.createDataFrame([], HN_ACTION_SCHEMA)
    # Schema validation
    yield ExpectationResult(
        success=set(raw_df.schema) == set(expected_df.schema),
        label="hn_data_schema_check",
        description="Check if the source schema is expected",
        metadata={
            "Expected data schema": expected_df._jdf.schema().treeString(),  # type: ignore # pylint: disable=protected-access
            "Actual data schema": raw_df._jdf.schema().treeString(),  # type: ignore # pylint: disable=protected-access
        },
    )

    # Split data based on the values in the 'type' column
    type_values = raw_df.select("type").distinct().rdd.flatMap(lambda x: x).collect()
    comment_df = raw_df.where(raw_df["type"] == "comment")
    story_df = raw_df.where(raw_df["type"] == "story")

    yield ExpectationResult(
        success=comment_df.count() > 0 and story_df.count() > 0,
        label="hn_data_split_types_check",
        description="Expect the hacker news data has at least 1 'comment' entry and at least 1 'story' entry.",
        metadata={
            "number of raw rows": raw_df.count(),
            "number of comment rows": comment_df.count(),
            "number of story rows": story_df.count(),
            "Unique values in the 'type' column": (", ").join(type_values),
        },
    )

    yield Output(comment_df, "comments")
    yield Output(story_df, "stories")
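Example #1 yields to two named outputs ("comments" and "stories"), so the solid has to declare both output definitions. A minimal sketch of how such a declaration might look with the legacy solid API (the decorator below is an assumption, not part of the source):

from dagster import OutputDefinition, solid

@solid(
    required_resource_keys={"pyspark"},
    output_defs=[
        OutputDefinition(name="comments"),
        OutputDefinition(name="stories"),
    ],
)
def split_types(context, raw_df):
    ...  # body as in Example #1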
Example #2
def check_users_and_groups_one_fails_one_succeeds(_context):
    yield ExpectationResult(
        success=True,
        label="user_expectations",
        description="Battery of expectations for user",
        metadata={
            "table_summary": {
                "columns": {
                    "name": {"nulls": 0, "empty": 0, "values": 123, "average_length": 3.394893},
                    "time_created": {"nulls": 1, "empty": 2, "values": 120, "average": 1231283},
                }
            },
        },
    )

    yield ExpectationResult(
        success=False,
        label="groups_expectations",
        description="Battery of expectations for groups",
        metadata={
            "table_summary": {
                "columns": {
                    "name": {"nulls": 1, "empty": 0, "values": 122, "average_length": 3.394893},
                    "time_created": {"nulls": 1, "empty": 2, "values": 120, "average": 1231283},
                }
            }
        },
    )
Example #3
def check_users_and_groups_one_fails_one_succeeds(_context):
    yield ExpectationResult(
        success=True,
        label='user_expectations',
        description='Battery of expectations for user',
        metadata_entries=[
            EventMetadataEntry.json(
                label='table_summary',
                data={
                    'columns': {
                        'name': {'nulls': 0, 'empty': 0, 'values': 123, 'average_length': 3.394893},
                        'time_created': {'nulls': 1, 'empty': 2, 'values': 120, 'average': 1231283},
                    }
                },
            )
        ],
    )

    yield ExpectationResult(
        success=False,
        label='groups_expectations',
        description='Battery of expectations for groups',
        metadata_entries=[
            EventMetadataEntry.json(
                label='table_summary',
                data={
                    'columns': {
                        'name': {'nulls': 1, 'empty': 0, 'values': 122, 'average_length': 3.394893},
                        'time_created': {'nulls': 1, 'empty': 2, 'values': 120, 'average': 1231283},
                    }
                },
            )
        ],
    )
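Examples #2 and #3 are the same solid written against two generations of the API: newer Dagster versions accept a plain `metadata` dict and coerce its values, while older versions require explicit `metadata_entries` built from `EventMetadataEntry` constructors. A hedged sketch of the same payload with explicit metadata values, assuming a recent Dagster release:

from dagster import ExpectationResult, MetadataValue

# inside an op/solid body:
yield ExpectationResult(
    success=True,
    label="user_expectations",
    description="Battery of expectations for user",
    metadata={
        "table_summary": MetadataValue.json(
            {"columns": {"name": {"nulls": 0, "empty": 0, "values": 123}}}
        )
    },
)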
Example #4
def join_q2_data(
    context,
    april_data: DataFrame,
    may_data: DataFrame,
    june_data: DataFrame,
    master_cord_data: DataFrame,
) -> DataFrame:

    dfs = {"april": april_data, "may": may_data, "june": june_data}

    missing_things = []

    for required_column in ["DestAirportSeqID", "OriginAirportSeqID"]:
        for month, df in dfs.items():
            if required_column not in df.columns:
                missing_things.append({"month": month, "missing_column": required_column})

    yield ExpectationResult(
        success=not bool(missing_things),
        label="airport_ids_present",
        description="Sequence IDs present in incoming monthly flight data.",
        metadata_entries=[
            EventMetadataEntry.json(label="metadata", data={"missing_columns": missing_things})
        ],
    )

    yield ExpectationResult(
        success=set(april_data.columns) == set(may_data.columns) == set(june_data.columns),
        label="flight_data_same_shape",
        metadata_entries=[
            EventMetadataEntry.json(label="metadata", data={"columns": april_data.columns})
        ],
    )

    q2_data = april_data.union(may_data).union(june_data)
    sampled_q2_data = q2_data.sample(
        withReplacement=False, fraction=context.solid_config["subsample_pct"] / 100.0
    )
    sampled_q2_data.createOrReplaceTempView("q2_data")

    dest_prefixed_master_cord_data = do_prefix_column_names(master_cord_data, "DEST_")
    dest_prefixed_master_cord_data.createOrReplaceTempView("dest_cord_data")

    origin_prefixed_master_cord_data = do_prefix_column_names(master_cord_data, "ORIGIN_")
    origin_prefixed_master_cord_data.createOrReplaceTempView("origin_cord_data")

    full_data = context.resources.pyspark.spark_session.sql(
        """
        SELECT * FROM origin_cord_data
        LEFT JOIN (
            SELECT * FROM q2_data
            LEFT JOIN dest_cord_data ON
            q2_data.DestAirportSeqID = dest_cord_data.DEST_AIRPORT_SEQ_ID
        ) q2_dest_data
        ON origin_cord_data.ORIGIN_AIRPORT_SEQ_ID = q2_dest_data.OriginAirportSeqID
        """
    )

    yield Output(rename_spark_dataframe_columns(full_data, lambda c: c.lower()))
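`rename_spark_dataframe_columns` and `do_prefix_column_names` are helpers defined elsewhere in the project; a plausible sketch of what they do, reconstructed from how they are called (assumed implementations):

def rename_spark_dataframe_columns(data_frame, fn):
    # Rebuild the DataFrame with every column name passed through fn.
    return data_frame.toDF(*[fn(c) for c in data_frame.columns])

def do_prefix_column_names(df, prefix):
    return rename_spark_dataframe_columns(df, lambda c: prefix + c)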
Example #5
def join_q2_data(
    context,
    april_data: DataFrame,
    may_data: DataFrame,
    june_data: DataFrame,
    master_cord_data: DataFrame,
) -> DataFrame:

    dfs = {'april': april_data, 'may': may_data, 'june': june_data}

    missing_things = []

    for required_column in ['DestAirportSeqID', 'OriginAirportSeqID']:
        for month, df in dfs.items():
            if required_column not in df.columns:
                missing_things.append({'month': month, 'missing_column': required_column})

    yield ExpectationResult(
        success=not bool(missing_things),
        label='airport_ids_present',
        description='Sequence IDs present in incoming monthly flight data.',
        metadata_entries=[
            EventMetadataEntry.json(label='metadata', data={'missing_columns': missing_things})
        ],
    )

    yield ExpectationResult(
        success=set(april_data.columns) == set(may_data.columns) == set(june_data.columns),
        label='flight_data_same_shape',
        metadata_entries=[
            EventMetadataEntry.json(label='metadata', data={'columns': april_data.columns})
        ],
    )

    q2_data = april_data.union(may_data).union(june_data)
    sampled_q2_data = q2_data.sample(
        withReplacement=False, fraction=context.solid_config['subsample_pct'] / 100.0
    )
    sampled_q2_data.createOrReplaceTempView('q2_data')

    dest_prefixed_master_cord_data = do_prefix_column_names(master_cord_data, 'DEST_')
    dest_prefixed_master_cord_data.createOrReplaceTempView('dest_cord_data')

    origin_prefixed_master_cord_data = do_prefix_column_names(master_cord_data, 'ORIGIN_')
    origin_prefixed_master_cord_data.createOrReplaceTempView('origin_cord_data')

    full_data = context.resources.spark.sql(
        '''
        SELECT * FROM origin_cord_data
        LEFT JOIN (
            SELECT * FROM q2_data
            LEFT JOIN dest_cord_data ON
            q2_data.DestAirportSeqID = dest_cord_data.DEST_AIRPORT_SEQ_ID
        ) q2_dest_data
        ON origin_cord_data.ORIGIN_AIRPORT_SEQ_ID = q2_dest_data.OriginAirportSeqID
        '''
    )

    yield Output(rename_spark_dataframe_columns(full_data, lambda c: c.lower()))
Example #6
def materialization_and_expectation(_context):
    yield Materialization.file(path='/path/to/foo', description='This is a table.')
    yield Materialization.file(path='/path/to/bar')
    yield ExpectationResult(success=True, label='row_count', description='passed')
    yield ExpectationResult(True)
    yield Output(True)
Example #7
def materialization_and_expectation(_context):
    yield Materialization(
        label='all_types',
        description='a materialization with all metadata types',
        metadata_entries=[
            EventMetadataEntry.text('text is cool', 'text'),
            EventMetadataEntry.url('https://bigty.pe/neato', 'url'),
            EventMetadataEntry.fspath('/tmp/awesome', 'path'),
            EventMetadataEntry.json({'is_dope': True}, 'json'),
        ],
    )
    yield ExpectationResult(success=True, label='row_count', description='passed')
    yield ExpectationResult(True)
    yield Output(True)
Example #8
def materialization_and_expectation(_context):
    yield AssetMaterialization(
        asset_key="all_types",
        description="a materialization with all metadata types",
        metadata_entries=[
            EventMetadataEntry.text("text is cool", "text"),
            EventMetadataEntry.url("https://bigty.pe/neato", "url"),
            EventMetadataEntry.fspath("/tmp/awesome", "path"),
            EventMetadataEntry.json({"is_dope": True}, "json"),
        ],
    )
    yield ExpectationResult(success=True, label="row_count", description="passed")
    yield ExpectationResult(True)
    yield Output(True)
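Examples #6 through #8 trace the same solid across API revisions (Materialization.file, then Materialization with metadata_entries, then AssetMaterialization). A minimal sketch of executing such a solid in isolation with the legacy test helper, assuming a pre-1.0 Dagster release where execute_solid is still available:

from dagster import execute_solid

# Runs the solid alone; events (materializations, expectations) are recorded,
# and the final Output(True) becomes the default "result" output.
result = execute_solid(materialization_and_expectation)
assert result.success
assert result.output_value() is True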
Example #9
def cache_file_from_s3(context, bucket_data):
    target_key = context.solid_config.get('file_key',
                                          bucket_data['key'].split('/')[-1])

    file_cache = context.resources.file_cache

    file_handle = file_cache.get_file_handle(target_key)

    if file_cache.overwrite or not file_cache.has_file_object(target_key):

        with get_temp_file_name() as tmp_file:
            context.resources.s3.session.download_file(
                Bucket=bucket_data['bucket'],
                Key=bucket_data['key'],
                Filename=tmp_file)

            context.log.info('File downloaded to {}'.format(tmp_file))

            with open(tmp_file, 'rb') as tmp_file_object:
                file_cache.write_file_object(target_key, tmp_file_object)
                context.log.info('File handle written at: {}'.format(
                    file_handle.path_desc))
    else:
        context.log.info('File {} already present in cache'.format(
            file_handle.path_desc))

    yield ExpectationResult(
        success=file_cache.has_file_object(target_key),
        label='file_handle_exists',
        metadata_entries=[
            EventMetadataEntry.path(path=file_handle.path_desc,
                                    label=target_key)
        ],
    )
    yield Output(file_handle)
Example #10
def _expect_all_resources_in_result(
        resources: ResourcesDataFrame,
        result: UtilizationDataFrame) -> ExpectationResult:
    input_ids = set(resources.resource_id)
    output_ids = set(result.resource_id)
    missing_ids = input_ids - output_ids
    missing_count = len(missing_ids)

    entries = [
        EventMetadataEntry.json(
            {
                'input': len(input_ids),
                'output': len(output_ids),
                'missing': missing_count
            },
            label='Summary Counts')
    ]
    if missing_count > 0:
        entries.append(
            EventMetadataEntry.json({'ids': list(missing_ids)},
                                    label='Missing Resources'))

    return ExpectationResult(
        success=(missing_count == 0),
        label='Found All Resources',
        description='Check if all the input resource ids were found in the Azure Monitor Logs workspace.',
        metadata_entries=entries,
    )
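Because `_expect_all_resources_in_result` returns (rather than yields) its ExpectationResult, the calling solid decides when to emit it. A hedged usage sketch (the solid name and wiring below are assumptions):

from dagster import Output, solid

@solid
def check_utilization(context, resources, result):
    # Emit the expectation event, then pass the data through unchanged.
    yield _expect_all_resources_in_result(resources, result)
    yield Output(result)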
Example #11
def cache_file_from_s3(context, s3_coordinate: S3Coordinate) -> FileHandle:
    target_key = context.solid_config.get("file_key", s3_coordinate["key"].split("/")[-1])

    file_cache = context.resources.file_cache

    target_file_handle = file_cache.get_file_handle(target_key)

    if file_cache.overwrite or not file_cache.has_file_object(target_key):
        with get_temp_file_name() as tmp_file:
            context.resources.s3.download_file(
                Bucket=s3_coordinate["bucket"], Key=s3_coordinate["key"], Filename=tmp_file
            )

            context.log.info("File downloaded to {}".format(tmp_file))

            with open(tmp_file, "rb") as tmp_file_object:
                file_cache.write_file_object(target_key, tmp_file_object)
                context.log.info("File handle written at: {}".format(target_file_handle.path_desc))
    else:
        context.log.info("File {} already present in cache".format(target_file_handle.path_desc))

    yield ExpectationResult(
        success=file_cache.has_file_object(target_key),
        label="file_handle_exists",
        metadata_entries=[MetadataEntry.path(path=target_file_handle.path_desc, label=target_key)],
    )
    yield Output(target_file_handle)
Example #12
def logs_events(context):
    context.log_event(AssetMaterialization("first"))
    context.log_event(Materialization("second"))
    context.log_event(ExpectationResult(success=True))
    context.log_event(AssetObservation("fourth"))
    yield AssetMaterialization("fifth")
    yield Output("blah")
Example #13
def emit_failed_expectation(_context):
    yield ExpectationResult(
        success=False,
        name='always_false',
        message='Failure',
        result_metadata={'reason': 'Relentless pessimism.'},
    )
Example #14
def emit_successful_expectation(_context):
    yield ExpectationResult(
        success=True,
        name='always_true',
        message='Successful',
        result_metadata={'reason': 'Just because.'},
    )
Example #15
def ge_validation_solid(context, pandas_df):
    data_context = context.resources.ge_data_context
    suite = data_context.get_expectation_suite(suite_name)
    batch_kwargs = {
        "dataset": pandas_df,
        "datasource": datasource_name,
    }
    batch = data_context.get_batch(batch_kwargs, suite)
    run_id = {
        "run_name": datasource_name + " run",
        "run_time": datetime.datetime.utcnow(),
    }
    results = data_context.run_validation_operator(
        "action_list_operator", assets_to_validate=[batch], run_id=run_id
    )
    res = convert_to_json_serializable(results.list_validation_results())[0]
    nmeta = EventMetadataEntry.json(
        {
            'overall': res['statistics'],
            'individual': res['results'],
        },
        'constraint-metadata',
    )
    yield ExpectationResult(success=res["success"], metadata_entries=[nmeta])
    yield Output(res)
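`suite_name` and `datasource_name` are free variables here: the solid body closes over arguments of a factory function, roughly as sketched below (a guess at the enclosing factory, in the style of dagster-ge's ge_validation_solid_factory):

from dagster import solid

def ge_validation_solid_factory(name, datasource_name, suite_name):
    # suite_name and datasource_name become the free variables used above.
    @solid(name=name, required_resource_keys={"ge_data_context"})
    def ge_validation_solid(context, pandas_df):
        ...  # body as in Example #15
    return ge_validation_solid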
Example #16
def many_materializations_and_passing_expectations(_context):
    tables = [
        "users",
        "groups",
        "events",
        "friends",
        "pages",
        "fans",
        "event_admins",
        "group_admins",
    ]

    for table in tables:
        yield AssetMaterialization(
            asset_key="table_info",
            metadata_entries=[
                EventMetadataEntry.path(label="table_path",
                                        path="/path/to/{}.raw".format(table))
            ],
        )
        yield ExpectationResult(
            success=True,
            label="{table}.row_count".format(table=table),
            description="Row count passed for {table}".format(table=table),
        )
Example #17
def ge_validation_solid(context, dataset):
    data_context = context.resources.ge_data_context
    if validation_operator_name is not None:
        validation_operator = validation_operator_name
    else:
        data_context.add_validation_operator(
            "ephemeral_validation",
            {"class_name": "ActionListValidationOperator", "action_list": []},
        )
        validation_operator = "ephemeral_validation"
    suite = data_context.get_expectation_suite(suite_name)
    final_batch_kwargs = batch_kwargs or {"dataset": dataset}
    if batch_kwargs and "datasource" in batch_kwargs:
        context.log.warning(
            "`datasource` field of `batch_kwargs` will be ignored; use the `datasource_name` "
            "parameter of the solid factory instead."
        )
    final_batch_kwargs["datasource"] = datasource_name
    batch = data_context.get_batch(final_batch_kwargs, suite)
    run_id = {
        "run_name": datasource_name + " run",
        "run_time": datetime.datetime.utcnow(),
    }
    results = data_context.run_validation_operator(
        validation_operator, assets_to_validate=[batch], run_id=run_id
    )
    res = convert_to_json_serializable(results.list_validation_results())[0]
    md_str = render_multiple_validation_result_pages_markdown(
        validation_operator_result=results, run_info_at_end=True
    )
    meta_stats = EventMetadataEntry.md(md_str=md_str, label="Expectation Results")
    yield ExpectationResult(success=res["success"], metadata_entries=[meta_stats])
    yield Output(res)
Example #18
def emit_failed_expectation(_context):
    yield ExpectationResult(
        success=False,
        label="always_false",
        description="Failure",
        metadata={"data": {"reason": "Relentless pessimism."}},
    )
Example #19
    def _ge_validation_fn(context, dataset):
        data_context = context.resources.ge_data_context
        validator_kwargs = {
            "datasource_name": datasource_name,
            "data_connector_name": data_connector_name,
            "data_asset_name": datasource_name or data_asset_name,
            "runtime_parameters": {
                runtime_method_type: dataset
            },
            "batch_identifiers": batch_identifiers,
            "expectation_suite_name": suite_name,
            **extra_kwargs,
        }
        validator = data_context.get_validator(**validator_kwargs)

        run_id = {
            "run_name": datasource_name + " run",
            "run_time": datetime.datetime.utcnow(),
        }
        results = validator.validate(run_id=run_id)

        validation_results_page_renderer = ValidationResultsPageRenderer(
            run_info_at_end=True)
        rendered_document_content_list = validation_results_page_renderer.render(
            validation_results=results)
        md_str = "".join(
            DefaultMarkdownPageView().render(rendered_document_content_list))

        meta_stats = MetadataEntry("Expectation Results",
                                   value=MetadataValue.md(md_str))
        yield ExpectationResult(
            success=bool(results["success"]),
            metadata_entries=[meta_stats],
        )
        yield Output(results.to_json_dict())
Example #20
def expect_column_to_be_integers(
    data_frame: LessSimpleDataFrame, column_name: str
) -> ExpectationResult:
    bad_values = []
    for idx in range(len(data_frame)):
        line = data_frame[idx]
        if not isinstance(line[column_name], int):
            bad_values.append((idx, str(line[column_name])))
    return ExpectationResult(
        success=(not bad_values),
        label='col_{column_name}_is_int'.format(column_name=column_name),
        description=(
            'Check whether type of column {column_name} in '
            'LessSimpleDataFrame is int'
        ).format(column_name=column_name),
        metadata_entries=[
            EventMetadataEntry.json(
                {'index': idx, 'bad_value': value},
                'bad_value',
                'Bad value in column {column_name}'.format(
                    column_name=column_name
                ),
            )
            for (idx, value) in bad_values
        ],
    )
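Like Example #10, this helper returns its ExpectationResult, so a solid typically yields it before the data itself. A minimal usage sketch (the solid and the 'calories' column name are assumptions):

from dagster import Output, solid

@solid
def validate_calories(_, data_frame: LessSimpleDataFrame):
    # Emit the expectation first, then hand the frame downstream.
    yield expect_column_to_be_integers(data_frame, 'calories')
    yield Output(data_frame)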
Example #21
def emit_successful_expectation(_context):
    yield ExpectationResult(
        success=True,
        label="always_true",
        description="Successful",
        metadata={"data": {"reason": "Just because."}},
    )
Example #22
def many_materializations_and_passing_expectations(_context):
    tables = [
        'users',
        'groups',
        'events',
        'friends',
        'pages',
        'fans',
        'event_admins',
        'group_admins',
    ]

    for table in tables:
        yield AssetMaterialization(
            asset_key='table_info',
            metadata_entries=[
                EventMetadataEntry.path(label='table_path',
                                        path='/path/to/{}.raw'.format(table))
            ],
        )
        yield ExpectationResult(
            success=True,
            label='{table}.row_count'.format(table=table),
            description='Row count passed for {table}'.format(table=table),
        )
Example #23
def create_expectation_result(label, ge_evr):
    check.dict_param(ge_evr, 'ge_evr', key_type=str)
    check.param_invariant('success' in ge_evr, 'ge_evr')
    return ExpectationResult(
        success=ge_evr['success'],
        label=label,
        metadata_entries=[EventMetadataEntry.json(ge_evr, label='evr')],
    )
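A brief usage sketch: pass it a Great Expectations validation result dict from inside a solid (the evr value below is a made-up minimal example):

# inside a solid body (hypothetical values):
evr = {'success': True, 'statistics': {}, 'results': []}
yield create_expectation_result('ge_suite', evr)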
Example #24
def cache_properies_from_rest_api(
    context, properties: PropertyDataFrame, target_key: String
) -> FileHandle:

    property_list = []
    date = datetime.today().strftime('%y%m%d')
    date_time = datetime.now().strftime("%y%m%d_%H%M%S")
    for p in properties:

        # Is it possible to do a range instead of each separately?
        json_prop = requests.get(context.solid_config['immo24_api_en'] + p['id']).json()

        # add metadata on whether it is a flat, house, detached house, etc.
        json_prop['propertyDetails']['propertyType'] = p['propertyType']
        json_prop['propertyDetails']['isBuyRent'] = p['rentOrBuy']

        # add metadata from search
        json_prop['propertyDetails']['propertyId'] = p['id']
        json_prop['propertyDetails']['searchCity'] = p['city']
        json_prop['propertyDetails']['searchRadius'] = p['radius']
        json_prop['propertyDetails']['searchDate'] = date
        json_prop['propertyDetails']['searchDateTime'] = date_time

        property_list.append(json_prop)

    filename = (
        property_list[0]['propertyDetails']['searchDate']
        + '_'
        + property_list[0]['propertyDetails']['searchCity']
        + '_'
        + property_list[0]['propertyDetails']['isBuyRent']
        + '_'
        + str(property_list[0]['propertyDetails']['searchRadius'])
        + '_'
        + property_list[0]['propertyDetails']['propertyType']
        + '.gz'
    )
    target_key = target_key + '/' + filename

    # caching to file
    file_cache = context.resources.file_cache
    target_file_handle = file_cache.get_file_handle(target_key)

    if file_cache.overwrite or not file_cache.has_file_object(target_key):
        json_zip_writer(property_list, target_key)
        context.log.info("File handle written at: {}".format(target_file_handle.path_desc))
    else:
        context.log.info("File {} already present in cache".format(target_file_handle.path_desc))

    yield ExpectationResult(
        success=file_cache.has_file_object(target_key),
        label="file_handle_exists",
        metadata_entries=[
            EventMetadataEntry.path(path=target_file_handle.path_desc, label=target_key)
        ],
    )
    yield Output(target_file_handle)
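`json_zip_writer` is project-specific; a plausible sketch, assuming it dumps the property list as gzip-compressed JSON (an assumed implementation, not from the source):

import gzip
import json

def json_zip_writer(json_list, target_key):
    # Write the scraped properties as one gzip-compressed JSON document.
    with gzip.open(target_key, 'wt', encoding='utf-8') as f:
        json.dump(json_list, f)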
Example #25
def emit_failed_expectation(_context):
    yield ExpectationResult(
        success=False,
        label="always_false",
        description="Failure",
        metadata_entries=[
            EventMetadataEntry.json(label="data", data={"reason": "Relentless pessimism."})
        ],
    )
Example #26
def emit_successful_expectation(_context):
    yield ExpectationResult(
        success=True,
        label="always_true",
        description="Successful",
        metadata_entries=[
            EventMetadataEntry.json(label="data", data={"reason": "Just because."})
        ],
    )
Example #27
def emit_failed_expectation(_context):
    yield ExpectationResult(
        success=False,
        label='always_false',
        description='Failure',
        metadata_entries=[
            EventMetadataEntry.json(label='data', data={'reason': 'Relentless pessimism.'})
        ],
    )
Example #28
def emit_successful_expectation(_context):
    yield ExpectationResult(
        success=True,
        label='always_true',
        description='Successful',
        metadata_entries=[
            EventMetadataEntry.json(label='data', data={'reason': 'Just because.'})
        ],
    )
Example #29
def emit_events_solid(input_num):
    a_num = input_num + 1
    yield ExpectationResult(
        success=a_num > 0, label="positive", description="A num must be positive"
    )
    yield AssetMaterialization(
        asset_key="persisted_string", description="Let us pretend we persisted the string somewhere"
    )
    yield Output(value=a_num, output_name="a_num")
Example #30
def emit_events_solid(_, input_num):
    a_num = input_num + 1
    a_string = 'foo'
    yield ExpectationResult(
        success=a_num > 0, label='positive', description='A num must be positive'
    )
    yield Materialization(
        label='persisted_string', description='Let us pretend we persisted the string somewhere'
    )
    yield Output(value=a_num, output_name='a_num')
    yield Output(value=a_string, output_name='a_string')
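Example #30 emits two named outputs, so the solid declaration has to define both. A minimal sketch with the legacy API (the decorator is an assumption, not shown in the source):

from dagster import Output, OutputDefinition, solid

@solid(
    output_defs=[
        OutputDefinition(int, name='a_num'),
        OutputDefinition(str, name='a_string'),
    ]
)
def emit_events_solid(_, input_num):
    ...  # body as in Example #30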