def test_convert_one_organisation_metadata_file_to_organisation_lookup():
    s3_manager = Mock()
    transfer_classifier_io = TransferClassifierIO(s3_data_manager=s3_manager)
    s3_manager.read_json.return_value = _ORGANISATION_METADATA_DICT_FIRST_MONTH

    actual_metadatas = transfer_classifier_io.read_ods_metadata_files(s3_uris=[_S3_URI])
    actual_organisation_lookup = actual_metadatas.get_lookup(_DATE_ANCHOR_YEAR_MONTH)

    expected_first_month_practices = [
        PracticeDetails(ods_code="ABC", name="A Practice", asids=["123"])
    ]
    expected_first_month_ccgs = [CcgDetails(ods_code="XYZ", name="A CCG", practices=["ABC"])]
    expected_organisation_lookup = OrganisationLookup(
        expected_first_month_practices, expected_first_month_ccgs
    )

    assert actual_organisation_lookup.practice_ods_code_from_asid(
        "123"
    ) == expected_organisation_lookup.practice_ods_code_from_asid("123")

    s3_manager.read_json.assert_called_once_with(_S3_URI)
def test_read_spine_messages_reads_multiple_messages():
    csv_rows = [build_spine_item(guid=f"guid{i}") for i in range(10)]
    mock_s3_conn = MockS3(
        objects=[
            MockS3Object(
                bucket="test_bucket",
                key="data/1.csv.gz",
                contents=_spine_csv_gz(csv_rows[:4]),
            ),
            MockS3Object(
                bucket="test_bucket",
                key="data/2.csv.gz",
                contents=_spine_csv_gz(csv_rows[4:]),
            ),
        ]
    )
    io = TransferClassifierIO(s3_data_manager=S3DataManager(mock_s3_conn))

    expected_guids = [f"guid{i}" for i in range(10)]

    actual_messages = io.read_spine_messages(
        ["s3://test_bucket/data/1.csv.gz", "s3://test_bucket/data/2.csv.gz"]
    )
    actual_guids = [message.guid for message in actual_messages]

    assert actual_guids == expected_guids
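# A minimal sketch of what the _spine_csv_gz helper used above is assumed to
# do: serialise rows as CSV and gzip-compress them, producing the .csv.gz
# bytes that read_spine_messages consumes. The name, the dict-per-row input,
# and the local imports are illustrative assumptions, not the suite's actual
# helper.
def _spine_csv_gz_sketch(rows):
    import csv
    import gzip
    from io import BytesIO, TextIOWrapper

    buffer = BytesIO()
    with gzip.GzipFile(fileobj=buffer, mode="wb") as gz:
        with TextIOWrapper(gz, encoding="utf-8") as text_stream:
            # Assumes each row is a dict keyed by column name, with a header row.
            writer = csv.DictWriter(text_stream, fieldnames=list(rows[0].keys()))
            writer.writeheader()
            writer.writerows(rows)
    return buffer.getvalue()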
def test_write_transfers_correctly_writes_all_fields():
    mock_s3 = MockS3()
    s3_data_manager = S3DataManager(mock_s3)
    io = TransferClassifierIO(s3_data_manager)

    transfer = Transfer(
        conversation_id="1234",
        sla_duration=timedelta(days=1),
        requesting_practice=Practice(asid="123", supplier="Supplier A", ods_code="A12"),
        sending_practice=Practice(asid="456", supplier="Supplier B", ods_code="B12"),
        sender_error_codes=[1, None],
        final_error_codes=[None, 32],
        intermediate_error_codes=[],
        outcome=TransferOutcome(
            status=TransferStatus.PROCESS_FAILURE,
            failure_reason=TransferFailureReason.FINAL_ERROR,
        ),
        date_requested=datetime(year=2021, month=3, day=5),
        date_completed=None,
        last_sender_message_timestamp=None,
    )

    io.write_transfers(
        transfers=[transfer],
        s3_uri="s3://a_bucket/some_data.parquet",
        metadata=_SOME_METADATA,
    )

    expected_table = {
        "conversation_id": ["1234"],
        "sla_duration": [86400],  # timedelta(days=1) serialised as seconds
        "requesting_practice_asid": ["123"],
        "requesting_practice_ods_code": ["A12"],
        "sending_practice_asid": ["456"],
        "sending_practice_ods_code": ["B12"],
        "requesting_supplier": ["Supplier A"],
        "sending_supplier": ["Supplier B"],
        "sender_error_codes": [[1, None]],
        "final_error_codes": [[None, 32]],
        "intermediate_error_codes": [[]],
        "status": ["Process failure"],
        "failure_reason": ["Final error"],
        "date_requested": [datetime(year=2021, month=3, day=5)],
        "date_completed": [None],
        "last_sender_message_timestamp": [None],
    }

    actual_table = mock_s3.object("a_bucket", "some_data.parquet").read_parquet().to_pydict()

    assert actual_table == expected_table
def test_convert_two_organisation_metadata_files_to_organisation_lookup_mapping():
    s3_manager = Mock()
    transfer_classifier_io = TransferClassifierIO(s3_data_manager=s3_manager)
    s3_manager.read_json.side_effect = [
        _ORGANISATION_METADATA_DICT_FIRST_MONTH,
        _ORGANISATION_METADATA_DICT_ADDITIONAL_MONTH,
    ]

    actual_metadatas = transfer_classifier_io.read_ods_metadata_files(
        s3_uris=[_S3_URI, _S3_URI_ADDITIONAL_MONTH]
    )

    expected_first_month_practices = [
        PracticeDetails(ods_code="ABC", name="A Practice", asids=["123"])
    ]
    expected_first_month_ccgs = [CcgDetails(ods_code="XYZ", name="A CCG", practices=["ABC"])]
    expected_first_organisation_lookup = OrganisationLookup(
        expected_first_month_practices, expected_first_month_ccgs
    )
    actual_first_organisation_lookup = actual_metadatas.get_lookup(_DATE_ANCHOR_YEAR_MONTH)

    assert actual_first_organisation_lookup.practice_ods_code_from_asid(
        "123"
    ) == expected_first_organisation_lookup.practice_ods_code_from_asid("123")

    expected_second_month_practices = [
        PracticeDetails(ods_code="A12345", name="GP Practice", asids=["123456789123"])
    ]
    expected_second_month_ccgs = [CcgDetails(ods_code="22A", name="CCG", practices=["A12345"])]
    expected_second_organisation_lookup = OrganisationLookup(
        expected_second_month_practices, expected_second_month_ccgs
    )
    actual_second_organisation_lookup = actual_metadatas.get_lookup(
        _DATE_ANCHOR_ADDITIONAL_YEAR_MONTH
    )

    assert actual_second_organisation_lookup.practice_ods_code_from_asid(
        "123456789123"
    ) == expected_second_organisation_lookup.practice_ods_code_from_asid("123456789123")

    expected_s3_manager_read_json_calls = [call(_S3_URI), call(_S3_URI_ADDITIONAL_MONTH)]
    s3_manager.read_json.assert_has_calls(expected_s3_manager_read_json_calls)
def test_write_transfers_writes_metadata():
    mock_s3 = MockS3()
    s3_data_manager = S3DataManager(mock_s3)
    metadata = {a_string(): a_string()}

    io = TransferClassifierIO(s3_data_manager)
    io.write_transfers(
        transfers=[build_transfer()],
        s3_uri="s3://a_bucket/some_data.parquet",
        metadata=metadata,
    )

    actual_metadata = mock_s3.object("a_bucket", "some_data.parquet").get_metadata()

    assert actual_metadata == metadata
def test_read_spine_messages_reads_single_message_correctly():
    csv_row = build_spine_item(
        time="2019-12-31T23:37:55.334+0000",
        conversation_id="abc",
        guid="message_a",
        interaction_id="an_interaction_id",
        message_sender="sender_x",
        message_recipient="recipient_y",
        message_ref="NotProvided",
        jdi_event="NONE",
        raw="",
        from_system="SupplierA",
        to_system="Unknown",
    )
    mock_s3_conn = MockS3(
        objects=[
            MockS3Object(
                bucket="test_bucket",
                key="data/1.csv.gz",
                contents=_spine_csv_gz([csv_row]),
            )
        ]
    )
    io = TransferClassifierIO(s3_data_manager=S3DataManager(mock_s3_conn))

    expected_spine_message = Message(
        time=datetime(2019, 12, 31, 23, 37, 55, 334000, tzutc()),
        conversation_id="abc",
        guid="message_a",
        interaction_id="an_interaction_id",
        from_party_asid="sender_x",
        to_party_asid="recipient_y",
        message_ref=None,
        error_code=None,
        from_system="SupplierA",
        to_system="Unknown",
    )

    actual = io.read_spine_messages(["s3://test_bucket/data/1.csv.gz"])

    assert list(actual) == [expected_spine_message]
def test_write_transfers_correctly_writes_multiple_rows():
    mock_s3 = MockS3()
    s3_data_manager = S3DataManager(mock_s3)
    io = TransferClassifierIO(s3_data_manager)

    transfers = [
        build_transfer(conversation_id="a"),
        build_transfer(conversation_id="b"),
        build_transfer(conversation_id="c"),
    ]

    io.write_transfers(
        transfers=transfers,
        s3_uri="s3://a_bucket/multi_row.parquet",
        metadata=_SOME_METADATA,
    )

    expected_conversation_ids = ["a", "b", "c"]
    actual_conversation_ids = (
        mock_s3.object("a_bucket", "multi_row.parquet")
        .read_parquet()
        .to_pydict()
        .get("conversation_id")
    )

    assert actual_conversation_ids == expected_conversation_ids
class TransferClassifier:
    def __init__(self, config: TransferClassifierConfig):
        s3 = boto3.resource("s3", endpoint_url=config.s3_endpoint_url)
        s3_manager = S3DataManager(s3)

        self._reporting_window = ReportingWindow(
            config.start_datetime, config.end_datetime, config.conversation_cutoff
        )
        self._config = config

        self._uris = TransferClassifierS3UriResolver(
            gp2gp_spine_bucket=config.input_spine_data_bucket,
            transfers_bucket=config.output_transfer_data_bucket,
            ods_metadata_bucket=config.input_ods_metadata_bucket,
        )

        self._io = TransferClassifierIO(s3_manager)

    def _read_spine_messages(self) -> Iterator[Message]:
        input_paths = self._uris.spine_messages(self._reporting_window)
        return self._io.read_spine_messages(input_paths)

    def _read_ods_metadata(self) -> OrganisationMetadataMonthly:
        input_paths = self._uris.ods_metadata(self._reporting_window)
        return self._io.read_ods_metadata_files(input_paths)

    def _write_transfers(
        self,
        transfers: Iterator[Transfer],
        daily_start_datetime: datetime,
        cutoff: timedelta,
        metadata: Dict[str, str],
    ):
        output_path = self._uris.gp2gp_transfers(
            daily_start_datetime=daily_start_datetime, cutoff=cutoff
        )
        self._io.write_transfers(transfers, output_path, metadata)

    def _construct_json_log_date_range_info(self) -> dict:
        reporting_window_dates = self._reporting_window.get_dates()
        reporting_window_overflow_dates = self._reporting_window.get_overflow_dates()
        return {
            "config_start_datetime": convert_to_datetime_string(self._config.start_datetime),
            "config_end_datetime": convert_to_datetime_string(self._config.end_datetime),
            "conversation_cutoff": str(self._config.conversation_cutoff),
            "reporting_window_dates": convert_to_datetimes_string(reporting_window_dates),
            "reporting_window_overflow_dates": convert_to_datetimes_string(
                reporting_window_overflow_dates
            ),
        }

    def run(self):
        transfer_observability_probe = TransferObservabilityProbe(logger=module_logger)

        log_date_range_info = self._construct_json_log_date_range_info()
        logger.info(
            "Attempting to classify conversations for a date range",
            extra={
                "event": "ATTEMPTING_CLASSIFY_CONVERSATIONS_FOR_A_DATE_RANGE",
                **log_date_range_info,
            },
        )

        spine_messages = self._read_spine_messages()
        ods_metadata_monthly = self._read_ods_metadata()

        transfer_service = TransferService(
            message_stream=spine_messages,
            cutoff=self._config.conversation_cutoff,
            observability_probe=transfer_observability_probe,
        )

        conversations = transfer_service.group_into_conversations()
        gp2gp_conversations = transfer_service.parse_conversations_into_gp2gp_conversations(
            conversations
        )

        # Each day in the reporting window is classified and written separately,
        # using the ODS metadata for that day's month.
        for daily_start_datetime in self._reporting_window.get_dates():
            metadata = {
                "cutoff-days": str(self._config.conversation_cutoff.days),
                "build-tag": self._config.build_tag,
                "start-datetime": convert_to_datetime_string(daily_start_datetime),
                "end-datetime": convert_to_datetime_string(
                    daily_start_datetime + timedelta(days=1)
                ),
                "ods-metadata-month": f"{daily_start_datetime.year}-{daily_start_datetime.month}",
            }

            conversations_started_in_reporting_window = filter_conversations_by_day(
                gp2gp_conversations, daily_start_datetime
            )

            organisation_lookup = ods_metadata_monthly.get_lookup(
                (daily_start_datetime.year, daily_start_datetime.month)
            )
            transfers = transfer_service.convert_to_transfers(
                conversations_started_in_reporting_window,
                organisation_lookup=organisation_lookup,
            )

            self._write_transfers(
                transfers=transfers,
                daily_start_datetime=daily_start_datetime,
                cutoff=self._config.conversation_cutoff,
                metadata=metadata,
            )

        logger.info(
            "Successfully classified conversations for a date range",
            extra={
                "event": "CLASSIFIED_CONVERSATIONS_FOR_A_DATE_RANGE",
                **log_date_range_info,
            },
        )