def test_column_mapping_has_extra_column(self):
    """An unexpected extra column must fail the ordered column-list check."""
    validator = ColumnMappingValidator(TEST_SCHEMA)
    mapping_data = {
        INTERNAL_COLUMN_NAME_COLUMN_NAME: [
            "internal_column_name1",
            "internal_column_name2",
        ],
        GII_FIELD_NAME_COLUMN_NAME: [
            "external_column_name1",
            "external_column_name2",
        ],
        "random_column_name": [
            "why is this here",
            "what is this supposed to be",
        ],
    }
    failures = validator.validate(pd.DataFrame(data=mapping_data))
    self.assertTrue(failures)
    expected_failures = {
        "expect_table_columns_to_match_ordered_list": {
            "expected_list": [
                "Internal Column Name",
                "Mission Impact Field Name",
            ],
            "failed_vals": ["random_column_name"],
        }
    }
    self.assertEqual(expected_failures, failures)
def test_column_mapping_is_missing_column(self):
    """Omitting the GII field-name column must fail the column-list check."""
    validator = ColumnMappingValidator(TEST_SCHEMA)
    incomplete_frame = pd.DataFrame(
        data={
            INTERNAL_COLUMN_NAME_COLUMN_NAME: [
                "internal_column_name1",
                "internal_column_name2",
            ]
        })
    failures = validator.validate(incomplete_frame)
    self.assertTrue(failures)
    expected_failures = {
        "expect_table_columns_to_match_ordered_list": {
            "expected_list": [
                "Internal Column Name",
                "Mission Impact Field Name",
            ],
            # The missing column shows up as None in the failed values.
            "failed_vals": [None],
        }
    }
    self.assertEqual(expected_failures, failures)
def test_external_column_names_are_invalid(self):
    """An external name outside the schema's set must be reported."""
    validator = ColumnMappingValidator(TEST_SCHEMA)
    mapping_frame = pd.DataFrame(
        data={
            INTERNAL_COLUMN_NAME_COLUMN_NAME: [
                "internal_column_name1",
                "internal_column_name2",
            ],
            GII_FIELD_NAME_COLUMN_NAME: [
                "external_column_name1",
                "external_column_name_not_real",
            ],
        })
    failures = validator.validate(mapping_frame)
    self.assertTrue(failures)
    expected_failures = {
        "expect_column_values_to_be_in_set": {
            "column_name": "Mission Impact Field Name",
            "failed_vals": ["external_column_name_not_real"],
        }
    }
    self.assertEqual(expected_failures, failures)
def test_internal_column_names_are_mapped_multiple_times(self):
    """A duplicated internal name must fail the uniqueness check."""
    validator = ColumnMappingValidator(TEST_SCHEMA)
    mapping_frame = pd.DataFrame(
        data={
            INTERNAL_COLUMN_NAME_COLUMN_NAME: [
                "internal_column_name1",
                "internal_column_name1",
            ],
            GII_FIELD_NAME_COLUMN_NAME: [
                "external_column_name1",
                "external_column_name2",
            ],
        })
    failures = validator.validate(mapping_frame)
    self.assertTrue(failures)
    expected_failures = {
        "expect_column_values_to_be_unique": {
            "column_name": "Internal Column Name",
            "failed_vals": ["internal_column_name1"],
        }
    }
    self.assertEqual(expected_failures, failures)
def test_external_column_names_are_valid(self):
    """A fully valid mapping must produce no validation failures."""
    validator = ColumnMappingValidator(TEST_SCHEMA)
    valid_mapping = {
        INTERNAL_COLUMN_NAME_COLUMN_NAME: [
            "internal_column_name1",
            "internal_column_name2",
        ],
        GII_FIELD_NAME_COLUMN_NAME: [
            "external_column_name1",
            "external_column_name2",
        ],
    }
    failures = validator.validate(pd.DataFrame(data=valid_mapping))
    self.assertFalse(failures)
def test_column_mapping_has_correct_columns(self):
    """A mapping with exactly the expected columns passes validation."""
    validator = ColumnMappingValidator(TEST_SCHEMA)
    well_formed_mapping = {
        INTERNAL_COLUMN_NAME_COLUMN_NAME: [
            "internal_column_name1",
            "internal_column_name2",
        ],
        GII_FIELD_NAME_COLUMN_NAME: [
            "external_column_name1",
            "external_column_name2",
        ],
    }
    failures = validator.validate(pd.DataFrame(data=well_formed_mapping))
    # An empty failure dict means the column mappings are valid.
    self.assertFalse(failures)
def simple_pipeline(
    member_id: str,
    row_format: bool,
    multiple_val_delimiter: str,
    data: Dict[str, pd.DataFrame],
    schema: Schema,
    column_mapping: pd.DataFrame,
    source_field_mappings: FieldMappings,
):
    """Simple pipeline to transform Mission Impact data to prepare it for
    upload to the Gateway system.

    Note that log changes are not noops; logs are captured and sent directly
    to users of the web endpoint, as well as saved by the Airflow server, so
    readability of all logs of level INFO and above is critical.

    Parameters
    ----------
    member_id : str
        The organization's Member ID.
    row_format : bool
        Whether the data is organized using the Mission Impact Row Format
        (if this is false, it implies that the data is organized using the
        Mission Impact Column Format).
    multiple_val_delimiter : str
        The separator for multiple values in the dataset.
    data : Dict[str, pd.DataFrame]
        Dictionary of {dataset name -> dataset}.
    schema: Schema
        Mission Impact Table Schema
    column_mapping : pd.DataFrame
        Column Mapping.
    source_field_mappings : FieldMappings
        Field Mappings.

    Returns
    -------
    type
        Returns the transformed dataset and any resolved field mappings.
    """
    # Accumulates the pipeline's outputs. On any validation failure it
    # instead carries the failure details (EMAIL_METADATA_KEY) and the id of
    # the failure-notification email task to run (FAILURE_EMAIL_TASK_ID_KEY).
    return_val = {}

    # Validate Table Schema
    # NOTE(review): the return value is unused, so validate_schema presumably
    # raises on an invalid schema -- confirm against the table_schema module.
    table_schema.validate_schema(schema)

    # Validate Column Mappings. Each validation stage below uses the same
    # early-exit pattern: log the problem (logs are user-visible -- see the
    # docstring), stash email metadata, and return without transforming data.
    validation_failures = ColumnMappingValidator(
        schema, row_format).validate(column_mapping)
    if validation_failures:
        # The "<br>" is presumably an HTML line break for the email rendered
        # from these logs -- confirm with the email module.
        logging.error(
            "The pipeline could not finish, because some of your column "
            "mappings are not valid. Please review the following names in "
            "your Column Mappings Google sheet:<br>")
        logging.error(email.format_validation_failures(validation_failures))
        return_val[EMAIL_METADATA_KEY] = validation_failures
        return_val[
            FAILURE_EMAIL_TASK_ID_KEY] = SEND_COLUMN_MAPPING_INVALID_EMAIL_TASK_ID
        return return_val

    # Once validated, the mapping DataFrame is rebound as a plain dict for
    # the transformers below.
    column_mapping = ColumnMappingLoader.convert_column_mapping_dataframe_to_dict(
        column_mapping)

    # Validate Field Mappings (one failure dict per source mapping).
    validation_failures: Dict[str, Dict] = FieldMappingValidator(
        schema).validate_multiple(source_field_mappings)
    if validation_failures:
        logging.error("Field mappings are not valid!")
        for _, validation_failure in validation_failures.items():
            logging.error(email.format_validation_failures(validation_failure))
        return_val[EMAIL_METADATA_KEY] = validation_failures
        return_val[
            FAILURE_EMAIL_TASK_ID_KEY] = SEND_FIELD_MAPPING_INVALID_EMAIL_TASK_ID
        return return_val

    # Validate Data Shape (one failure dict per dataset).
    validation_failures = DatasetShapeValidator(
        schema, column_mapping, row_format).validate_multiple_dataset_shape(data)
    if validation_failures:
        logging.error("Dataset shape is not valid!")
        for _, validation_failure in validation_failures.items():
            logging.error(email.format_validation_failures(validation_failure))
        return_val[EMAIL_METADATA_KEY] = validation_failures
        return_val[
            FAILURE_EMAIL_TASK_ID_KEY] = SEND_DATA_SHAPE_INVALID_EMAIL_TASK_ID
        return return_val

    # Shape Data
    shape_transformer: DatasetShapeTransformer = DatasetShapeTransformer(
        member_id, schema, column_mapping, row_format, multiple_val_delimiter)
    # TODO: Move concatentation of multiple datasets into DatasetShapeTransformer
    # Combine all of the datasets into one
    combined_shaped_dataset: pd.DataFrame = pd.concat(
        [
            shape_transformer.transform_dataset_shape(df) for df in data.values()
        ],
        ignore_index=True,
        sort=True,
    )
    # Downstream steps expect blanks rather than NaN for missing values.
    combined_shaped_dataset = combined_shaped_dataset.fillna("")

    # Generate Field mappings from what actually appears in the data, then
    # merge them with the source mappings; existing source mappings win
    # (overwrite=False) and unapproved source-only mappings are dropped.
    generated_field_mappings: FieldMappings = FieldMappingGenerator(
        schema).generate_mappings_from_dataset(combined_shaped_dataset)

    # Resolve Field Mappings
    resolved_field_mappings: FieldMappings = FieldMappingResolver.resolve_mappings(
        generated_field_mappings,
        source_field_mappings,
        overwrite=False,
        remove_unapproved_source_mappings=True,
    )
    # Resolved mappings are returned even on the approval failure below, so
    # the caller can surface them to the user.
    return_val[FIELD_MAPPINGS_RETURN_KEY] = resolved_field_mappings

    # Validate Field Mapping Approvals
    validation_failures: Dict[str, Dict] = FieldMappingApprovalValidator(
    ).validate_multiple(resolved_field_mappings)
    if validation_failures:
        logging.error(
            'The pipeline could not finish, because some of your field mappings do not have approved values. Most likely, your data has new responses, which require new mappings. Go to your Field Mappings Google sheet, and approve the new mappings by toggling "No" to "Yes" on the following fields:<br>'
        )
        logging.error(
            email.format_unapproved_mappings(resolved_field_mappings))
        # No need to store email metadata, since resolved field mappings are used to generate
        # field mapping approval needed email.
        return_val[
            FAILURE_EMAIL_TASK_ID_KEY] = SEND_FIELD_MAPPING_APPROVAL_EMAIL_TASK_ID
        return return_val

    # Process Data
    transformed_dataset, invalid_values, dropped_rows = DataProcessor(
        resolved_field_mappings, schema).process(combined_shaped_dataset)
    final_shaped_dataset = GatewayDatasetShapeTransformer(
        schema).transform_dataset_shape(transformed_dataset)

    # Store number of rows in processed data, plus dropped data info.
    # Logged at WARNING so it is visible to users even on success.
    logging.warning("<br>" + email.format_successful_upload(
        final_shaped_dataset.shape[0], dropped_rows, invalid_values))
    return_val[EMAIL_METADATA_KEY] = {
        NUM_ROWS_TO_UPLOAD_KEY: final_shaped_dataset.shape[0],
        DROPPED_ROWS_KEY: dropped_rows,
        DROPPED_VALUES_KEY: invalid_values,
    }
    return_val[DATASET_RETURN_KEY] = final_shaped_dataset
    return return_val