def split_types(context, raw_df: SparkDF):
    expected_df = context.resources.pyspark.spark_session.createDataFrame([], HN_ACTION_SCHEMA)

    # Schema validation
    yield ExpectationResult(
        success=set(raw_df.schema) == set(expected_df.schema),
        label="hn_data_schema_check",
        description="Check if the source schema is expected",
        metadata={
            "Expected data schema": expected_df._jdf.schema().treeString(),  # type: ignore  # pylint: disable=protected-access
            "Actual data schema": raw_df._jdf.schema().treeString(),  # type: ignore  # pylint: disable=protected-access
        },
    )

    # Split data based on the values in the 'type' column
    type_values = raw_df.select("type").distinct().rdd.flatMap(lambda x: x).collect()
    comment_df = raw_df.where(raw_df["type"] == "comment")
    story_df = raw_df.where(raw_df["type"] == "story")

    yield ExpectationResult(
        success=comment_df.count() > 0 and story_df.count() > 0,
        label="hn_data_split_types_check",
        description="Expect the Hacker News data to have at least one 'comment' entry and at least one 'story' entry.",
        metadata={
            "number of raw rows": raw_df.count(),
            "number of comment rows": comment_df.count(),
            "number of story rows": story_df.count(),
            "Unique values in the 'type' column": ", ".join(type_values),
        },
    )

    yield Output(comment_df, "comments")
    yield Output(story_df, "stories")

def check_users_and_groups_one_fails_one_succeeds(_context): yield ExpectationResult( success=True, label="user_expectations", description="Battery of expectations for user", metadata={ "table_summary": { "columns": { "name": {"nulls": 0, "empty": 0, "values": 123, "average_length": 3.394893}, "time_created": {"nulls": 1, "empty": 2, "values": 120, "average": 1231283}, } }, }, ) yield ExpectationResult( success=False, label="groups_expectations", description="Battery of expectations for groups", metadata={ "table_summary": { "columns": { "name": {"nulls": 1, "empty": 0, "values": 122, "average_length": 3.394893}, "time_created": {"nulls": 1, "empty": 2, "values": 120, "average": 1231283}, } } }, )
def check_users_and_groups_one_fails_one_succeeds(_context):
    yield ExpectationResult(
        success=True,
        label='user_expectations',
        description='Battery of expectations for user',
        metadata_entries=[
            EventMetadataEntry.json(
                label='table_summary',
                data={
                    'columns': {
                        'name': {'nulls': 0, 'empty': 0, 'values': 123, 'average_length': 3.394893},
                        'time_created': {'nulls': 1, 'empty': 2, 'values': 120, 'average': 1231283},
                    }
                },
            )
        ],
    )

    yield ExpectationResult(
        success=False,
        label='groups_expectations',
        description='Battery of expectations for groups',
        metadata_entries=[
            EventMetadataEntry.json(
                label='table_summary',
                data={
                    'columns': {
                        'name': {'nulls': 1, 'empty': 0, 'values': 122, 'average_length': 3.394893},
                        'time_created': {'nulls': 1, 'empty': 2, 'values': 120, 'average': 1231283},
                    }
                },
            )
        ],
    )

def join_q2_data( context, april_data: DataFrame, may_data: DataFrame, june_data: DataFrame, master_cord_data: DataFrame, ) -> DataFrame: dfs = {"april": april_data, "may": may_data, "june": june_data} missing_things = [] for required_column in ["DestAirportSeqID", "OriginAirportSeqID"]: for month, df in dfs.items(): if required_column not in df.columns: missing_things.append({"month": month, "missing_column": required_column}) yield ExpectationResult( success=not bool(missing_things), label="airport_ids_present", description="Sequence IDs present in incoming monthly flight data.", metadata_entries=[ EventMetadataEntry.json(label="metadata", data={"missing_columns": missing_things}) ], ) yield ExpectationResult( success=set(april_data.columns) == set(may_data.columns) == set(june_data.columns), label="flight_data_same_shape", metadata_entries=[ EventMetadataEntry.json(label="metadata", data={"columns": april_data.columns}) ], ) q2_data = april_data.union(may_data).union(june_data) sampled_q2_data = q2_data.sample( withReplacement=False, fraction=context.solid_config["subsample_pct"] / 100.0 ) sampled_q2_data.createOrReplaceTempView("q2_data") dest_prefixed_master_cord_data = do_prefix_column_names(master_cord_data, "DEST_") dest_prefixed_master_cord_data.createOrReplaceTempView("dest_cord_data") origin_prefixed_master_cord_data = do_prefix_column_names(master_cord_data, "ORIGIN_") origin_prefixed_master_cord_data.createOrReplaceTempView("origin_cord_data") full_data = context.resources.pyspark.spark_session.sql( """ SELECT * FROM origin_cord_data LEFT JOIN ( SELECT * FROM q2_data LEFT JOIN dest_cord_data ON q2_data.DestAirportSeqID = dest_cord_data.DEST_AIRPORT_SEQ_ID ) q2_dest_data ON origin_cord_data.ORIGIN_AIRPORT_SEQ_ID = q2_dest_data.OriginAirportSeqID """ ) yield Output(rename_spark_dataframe_columns(full_data, lambda c: c.lower()))
def join_q2_data(
    context,
    april_data: DataFrame,
    may_data: DataFrame,
    june_data: DataFrame,
    master_cord_data: DataFrame,
) -> DataFrame:
    dfs = {'april': april_data, 'may': may_data, 'june': june_data}

    missing_things = []

    for required_column in ['DestAirportSeqID', 'OriginAirportSeqID']:
        for month, df in dfs.items():
            if required_column not in df.columns:
                missing_things.append({'month': month, 'missing_column': required_column})

    yield ExpectationResult(
        success=not bool(missing_things),
        label='airport_ids_present',
        description='Sequence IDs present in incoming monthly flight data.',
        metadata_entries=[
            EventMetadataEntry.json(label='metadata', data={'missing_columns': missing_things})
        ],
    )

    yield ExpectationResult(
        success=set(april_data.columns) == set(may_data.columns) == set(june_data.columns),
        label='flight_data_same_shape',
        metadata_entries=[
            EventMetadataEntry.json(label='metadata', data={'columns': april_data.columns})
        ],
    )

    q2_data = april_data.union(may_data).union(june_data)
    sampled_q2_data = q2_data.sample(
        withReplacement=False, fraction=context.solid_config['subsample_pct'] / 100.0
    )
    sampled_q2_data.createOrReplaceTempView('q2_data')

    dest_prefixed_master_cord_data = do_prefix_column_names(master_cord_data, 'DEST_')
    dest_prefixed_master_cord_data.createOrReplaceTempView('dest_cord_data')

    origin_prefixed_master_cord_data = do_prefix_column_names(master_cord_data, 'ORIGIN_')
    origin_prefixed_master_cord_data.createOrReplaceTempView('origin_cord_data')

    full_data = context.resources.spark.sql(
        '''
        SELECT
            *
        FROM origin_cord_data
        LEFT JOIN (
            SELECT
                *
            FROM q2_data
            LEFT JOIN dest_cord_data ON
                q2_data.DestAirportSeqID = dest_cord_data.DEST_AIRPORT_SEQ_ID
        ) q2_dest_data ON origin_cord_data.ORIGIN_AIRPORT_SEQ_ID = q2_dest_data.OriginAirportSeqID
        '''
    )

    yield Output(rename_spark_dataframe_columns(full_data, lambda c: c.lower()))

def materialization_and_expectation(_context):
    yield Materialization.file(path='/path/to/foo', description='This is a table.')
    yield Materialization.file(path='/path/to/bar')
    yield ExpectationResult(success=True, label='row_count', description='passed')
    yield ExpectationResult(True)
    yield Output(True)

def materialization_and_expectation(_context):
    yield Materialization(
        label='all_types',
        description='a materialization with all metadata types',
        metadata_entries=[
            EventMetadataEntry.text('text is cool', 'text'),
            EventMetadataEntry.url('https://bigty.pe/neato', 'url'),
            EventMetadataEntry.fspath('/tmp/awesome', 'path'),
            EventMetadataEntry.json({'is_dope': True}, 'json'),
        ],
    )
    yield ExpectationResult(success=True, label='row_count', description='passed')
    yield ExpectationResult(True)
    yield Output(True)

def materialization_and_expectation(_context):
    yield AssetMaterialization(
        asset_key="all_types",
        description="a materialization with all metadata types",
        metadata_entries=[
            EventMetadataEntry.text("text is cool", "text"),
            EventMetadataEntry.url("https://bigty.pe/neato", "url"),
            EventMetadataEntry.fspath("/tmp/awesome", "path"),
            EventMetadataEntry.json({"is_dope": True}, "json"),
        ],
    )
    yield ExpectationResult(success=True, label="row_count", description="passed")
    yield ExpectationResult(True)
    yield Output(True)

def cache_file_from_s3(context, bucket_data):
    target_key = context.solid_config.get('file_key', bucket_data['key'].split('/')[-1])

    file_cache = context.resources.file_cache
    file_handle = file_cache.get_file_handle(target_key)

    if file_cache.overwrite or not file_cache.has_file_object(target_key):
        with get_temp_file_name() as tmp_file:
            context.resources.s3.session.download_file(
                Bucket=bucket_data['bucket'], Key=bucket_data['key'], Filename=tmp_file
            )

            context.log.info('File downloaded to {}'.format(tmp_file))

            with open(tmp_file, 'rb') as tmp_file_object:
                file_cache.write_file_object(target_key, tmp_file_object)
                context.log.info('File handle written at : {}'.format(file_handle.path_desc))
    else:
        context.log.info('File {} already present in cache'.format(file_handle.path_desc))

    yield ExpectationResult(
        success=file_cache.has_file_object(target_key),
        label='file_handle_exists',
        metadata_entries=[
            EventMetadataEntry.path(path=file_handle.path_desc, label=target_key)
        ],
    )
    yield Output(file_handle)

def _expect_all_resources_in_result(
    resources: ResourcesDataFrame, result: UtilizationDataFrame
) -> ExpectationResult:
    input_ids = set(resources.resource_id)
    output_ids = set(result.resource_id)
    missing_ids = input_ids - output_ids
    missing_count = len(missing_ids)

    entries = [
        EventMetadataEntry.json(
            {'input': len(input_ids), 'output': len(output_ids), 'missing': missing_count},
            label='Summary Counts',
        )
    ]
    if missing_count > 0:
        entries.append(
            EventMetadataEntry.json({'ids': list(missing_ids)}, label='Missing Resources')
        )

    return ExpectationResult(
        success=(missing_count == 0),
        label='Found All Resources',
        description='Check if all the input resource ids were found in the Azure Monitor Logs workspace.',
        metadata_entries=entries,
    )

def cache_file_from_s3(context, s3_coordinate: S3Coordinate) -> FileHandle:
    target_key = context.solid_config.get("file_key", s3_coordinate["key"].split("/")[-1])

    file_cache = context.resources.file_cache
    target_file_handle = file_cache.get_file_handle(target_key)

    if file_cache.overwrite or not file_cache.has_file_object(target_key):
        with get_temp_file_name() as tmp_file:
            context.resources.s3.download_file(
                Bucket=s3_coordinate["bucket"], Key=s3_coordinate["key"], Filename=tmp_file
            )

            context.log.info("File downloaded to {}".format(tmp_file))

            with open(tmp_file, "rb") as tmp_file_object:
                file_cache.write_file_object(target_key, tmp_file_object)
                context.log.info("File handle written at : {}".format(target_file_handle.path_desc))
    else:
        context.log.info("File {} already present in cache".format(target_file_handle.path_desc))

    yield ExpectationResult(
        success=file_cache.has_file_object(target_key),
        label="file_handle_exists",
        metadata_entries=[MetadataEntry.path(path=target_file_handle.path_desc, label=target_key)],
    )
    yield Output(target_file_handle)

def logs_events(context):
    context.log_event(AssetMaterialization("first"))
    context.log_event(Materialization("second"))
    context.log_event(ExpectationResult(success=True))
    context.log_event(AssetObservation("fourth"))
    yield AssetMaterialization("fifth")
    yield Output("blah")

def emit_failed_expectation(_context):
    yield ExpectationResult(
        success=False,
        name='always_false',
        message='Failure',
        result_metadata={'reason': 'Relentless pessimism.'},
    )

def emit_successful_expectation(_context):
    yield ExpectationResult(
        success=True,
        name='always_true',
        message='Successful',
        result_metadata={'reason': 'Just because.'},
    )

def ge_validation_solid(context, pandas_df):
    data_context = context.resources.ge_data_context
    suite = data_context.get_expectation_suite(suite_name)
    batch_kwargs = {
        "dataset": pandas_df,
        "datasource": datasource_name,
    }
    batch = data_context.get_batch(batch_kwargs, suite)
    run_id = {
        "run_name": datasource_name + " run",
        "run_time": datetime.datetime.utcnow(),
    }
    results = data_context.run_validation_operator(
        "action_list_operator", assets_to_validate=[batch], run_id=run_id
    )
    res = convert_to_json_serializable(results.list_validation_results())[0]
    nmeta = EventMetadataEntry.json(
        {
            'overall': res['statistics'],
            'individual': res['results'],
        },
        'constraint-metadata',
    )
    yield ExpectationResult(success=res["success"], metadata_entries=[nmeta])
    yield Output(res)

def many_materializations_and_passing_expectations(_context): tables = [ "users", "groups", "events", "friends", "pages", "fans", "event_admins", "group_admins", ] for table in tables: yield AssetMaterialization( asset_key="table_info", metadata_entries=[ EventMetadataEntry.path(label="table_path", path="/path/to/{}.raw".format(table)) ], ) yield ExpectationResult( success=True, label="{table}.row_count".format(table=table), description="Row count passed for {table}".format(table=table), )
def ge_validation_solid(context, dataset):
    data_context = context.resources.ge_data_context
    if validation_operator_name is not None:
        validation_operator = validation_operator_name
    else:
        data_context.add_validation_operator(
            "ephemeral_validation",
            {"class_name": "ActionListValidationOperator", "action_list": []},
        )
        validation_operator = "ephemeral_validation"
    suite = data_context.get_expectation_suite(suite_name)
    final_batch_kwargs = batch_kwargs or {"dataset": dataset}
    if "datasource" in batch_kwargs:
        context.log.warning(
            "`datasource` field of `batch_kwargs` will be ignored; use the `datasource_name` "
            "parameter of the solid factory instead."
        )
    final_batch_kwargs["datasource"] = datasource_name
    batch = data_context.get_batch(final_batch_kwargs, suite)
    run_id = {
        "run_name": datasource_name + " run",
        "run_time": datetime.datetime.utcnow(),
    }
    results = data_context.run_validation_operator(
        validation_operator, assets_to_validate=[batch], run_id=run_id
    )
    res = convert_to_json_serializable(results.list_validation_results())[0]
    md_str = render_multiple_validation_result_pages_markdown(
        validation_operator_result=results,
        run_info_at_end=True,
    )
    meta_stats = EventMetadataEntry.md(md_str=md_str, label="Expectation Results")
    yield ExpectationResult(
        success=res["success"],
        metadata_entries=[meta_stats],
    )
    yield Output(res)

def emit_failed_expectation(_context): yield ExpectationResult( success=False, label="always_false", description="Failure", metadata={"data": {"reason": "Relentless pessimism."}}, )
def _ge_validation_fn(context, dataset):
    data_context = context.resources.ge_data_context

    validator_kwargs = {
        "datasource_name": datasource_name,
        "data_connector_name": data_connector_name,
        "data_asset_name": datasource_name or data_asset_name,
        "runtime_parameters": {runtime_method_type: dataset},
        "batch_identifiers": batch_identifiers,
        "expectation_suite_name": suite_name,
        **extra_kwargs,
    }
    validator = data_context.get_validator(**validator_kwargs)

    run_id = {
        "run_name": datasource_name + " run",
        "run_time": datetime.datetime.utcnow(),
    }
    results = validator.validate(run_id=run_id)

    validation_results_page_renderer = ValidationResultsPageRenderer(run_info_at_end=True)
    rendered_document_content_list = validation_results_page_renderer.render(
        validation_results=results
    )
    md_str = "".join(DefaultMarkdownPageView().render(rendered_document_content_list))

    meta_stats = MetadataEntry("Expectation Results", value=MetadataValue.md(md_str))
    yield ExpectationResult(
        success=bool(results["success"]),
        metadata_entries=[meta_stats],
    )
    yield Output(results.to_json_dict())

def expect_column_to_be_integers(
    data_frame: LessSimpleDataFrame, column_name: str
) -> ExpectationResult:
    bad_values = []
    for idx in range(len(data_frame)):
        line = data_frame[idx]
        if not isinstance(line[column_name], int):
            bad_values.append((idx, str(line[column_name])))
    return ExpectationResult(
        success=(not bad_values),
        label='col_{column_name}_is_int'.format(column_name=column_name),
        description=(
            'Check whether type of column {column_name} in '
            'LessSimpleDataFrame is int'
        ).format(column_name=column_name),
        metadata_entries=[
            EventMetadataEntry.json(
                {'index': idx, 'bad_value': value},
                'bad_value',
                'Bad value in column {column_name}'.format(column_name=column_name),
            )
            for (idx, value) in bad_values
        ],
    )

def emit_successful_expectation(_context): yield ExpectationResult( success=True, label="always_true", description="Successful", metadata={"data": {"reason": "Just because."}}, )
def many_materializations_and_passing_expectations(_context):
    tables = [
        'users',
        'groups',
        'events',
        'friends',
        'pages',
        'fans',
        'event_admins',
        'group_admins',
    ]

    for table in tables:
        yield AssetMaterialization(
            asset_key='table_info',
            metadata_entries=[
                EventMetadataEntry.path(label='table_path', path='/path/to/{}.raw'.format(table))
            ],
        )
        yield ExpectationResult(
            success=True,
            label='{table}.row_count'.format(table=table),
            description='Row count passed for {table}'.format(table=table),
        )

def create_expectation_result(label, ge_evr):
    check.dict_param(ge_evr, 'ge_evr', key_type=str)
    check.param_invariant('success' in ge_evr, 'ge_evr')
    return ExpectationResult(
        success=ge_evr['success'],
        label=label,
        metadata_entries=[EventMetadataEntry.json(ge_evr, label='evr')],
    )

def cache_properies_from_rest_api(
    context, properties: PropertyDataFrame, target_key: String
) -> FileHandle:
    property_list = []
    date = datetime.today().strftime('%y%m%d')
    date_time = datetime.now().strftime("%y%m%d_%H%M%S")

    for p in properties:
        # Is it possible to do a range query instead of fetching each property separately?
        json_prop = requests.get(context.solid_config['immo24_api_en'] + p['id']).json()

        # add metadata: flat, house, detached-house, etc.
        json_prop['propertyDetails']['propertyType'] = p['propertyType']
        json_prop['propertyDetails']['isBuyRent'] = p['rentOrBuy']
        # add metadata from search
        json_prop['propertyDetails']['propertyId'] = p['id']
        json_prop['propertyDetails']['searchCity'] = p['city']
        json_prop['propertyDetails']['searchRadius'] = p['radius']
        json_prop['propertyDetails']['searchDate'] = date
        json_prop['propertyDetails']['searchDateTime'] = date_time

        property_list.append(json_prop)

    filename = (
        property_list[0]['propertyDetails']['searchDate']
        + '_'
        + property_list[0]['propertyDetails']['searchCity']
        + '_'
        + property_list[0]['propertyDetails']['isBuyRent']
        + '_'
        + str(property_list[0]['propertyDetails']['searchRadius'])
        + '_'
        + property_list[0]['propertyDetails']['propertyType']
        + '.gz'
    )
    target_key = target_key + '/' + filename

    # cache to file
    file_cache = context.resources.file_cache
    target_file_handle = file_cache.get_file_handle(target_key)

    if file_cache.overwrite or not file_cache.has_file_object(target_key):
        json_zip_writer(property_list, target_key)
        context.log.info("File handle written at : {}".format(target_file_handle.path_desc))
    else:
        context.log.info("File {} already present in cache".format(target_file_handle.path_desc))

    yield ExpectationResult(
        success=file_cache.has_file_object(target_key),
        label="file_handle_exists",
        metadata_entries=[
            EventMetadataEntry.path(path=target_file_handle.path_desc, label=target_key)
        ],
    )
    yield Output(target_file_handle)

def emit_failed_expectation(_context): yield ExpectationResult( success=False, label="always_false", description="Failure", metadata_entries=[ EventMetadataEntry.json(label="data", data={"reason": "Relentless pessimism."}) ], )
def emit_successful_expectation(_context): yield ExpectationResult( success=True, label="always_true", description="Successful", metadata_entries=[ EventMetadataEntry.json(label="data", data={"reason": "Just because."}) ], )
def emit_failed_expectation(_context):
    yield ExpectationResult(
        success=False,
        label='always_false',
        description='Failure',
        metadata_entries=[
            EventMetadataEntry.json(label='data', data={'reason': 'Relentless pessimism.'})
        ],
    )

def emit_successful_expectation(_context):
    yield ExpectationResult(
        success=True,
        label='always_true',
        description='Successful',
        metadata_entries=[
            EventMetadataEntry.json(label='data', data={'reason': 'Just because.'})
        ],
    )

def emit_events_solid(input_num):
    a_num = input_num + 1
    yield ExpectationResult(
        success=a_num > 0, label="positive", description="A num must be positive"
    )
    yield AssetMaterialization(
        asset_key="persisted_string", description="Let us pretend we persisted the string somewhere"
    )
    yield Output(value=a_num, output_name="a_num")

def emit_events_solid(_, input_num):
    a_num = input_num + 1
    a_string = 'foo'
    yield ExpectationResult(
        success=a_num > 0, label='positive', description='A num must be positive'
    )
    yield Materialization(
        label='persisted_string', description='Let us pretend we persisted the string somewhere'
    )
    yield Output(value=a_num, output_name='a_num')
    yield Output(value=a_string, output_name='a_string')