def test_update_iterable(self):
    """Test that update_iterable applies matching updates to every item sharing the same key value."""
    source_dicts = [
        {"id": "A", "message": "hello"},
        {"id": "B", "message": "hello"},
        {"id": "A", "message": "hi"}
    ]
    data = [TracedData(d, Metadata("test_user", "data_generator", time.time())) for d in source_dicts]

    update_dicts = [
        {"id": "A", "gender": "male"},
        {"id": "B", "gender": "female", "age": 20}
    ]
    updates = [TracedData(d, Metadata("test_user", "data_generator", time.time())) for d in update_dicts]

    TracedData.update_iterable("test_user", "id", data, updates, "demographics")

    # Both items with id "A" should have received the same update.
    expected = [
        {"id": "A", "message": "hello", "gender": "male"},
        {"id": "B", "message": "hello", "gender": "female", "age": 20},
        {"id": "A", "message": "hi", "gender": "male"}
    ]
    for actual_td, expected_dict in zip(data, expected):
        self.assertDictEqual(dict(actual_td.items()), expected_dict)
def combine_raw_datasets(user, messages_datasets, surveys_datasets):
    """
    Concatenates the messages datasets into one list, then annotates each item with survey responses.

    :param user: Identifier of the user running this program, for TracedData Metadata.
    :param messages_datasets: Iterables of message TracedData to concatenate.
    :param surveys_datasets: Iterables of survey TracedData to apply to the messages, keyed on "avf_phone_id".
    :return: The combined, survey-annotated messages.
    """
    combined = []
    for dataset in messages_datasets:
        combined.extend(dataset)

    for surveys in surveys_datasets:
        TracedData.update_iterable(user, "avf_phone_id", combined, surveys, "survey_responses")

    return combined
def combine_raw_datasets(user, shows_datasets, survey_datasets):
    """
    Flattens the show datasets into one list, then applies each survey dataset to that list.

    :param user: Identifier of the user running this program, for TracedData Metadata.
    :param shows_datasets: Iterables of show TracedData to flatten.
    :param survey_datasets: Iterables of survey TracedData to apply, keyed on "avf_phone_id".
    :return: The combined, survey-annotated show data.
    """
    data = [td for show_dataset in shows_datasets for td in show_dataset]

    for survey in survey_datasets:
        TracedData.update_iterable(user, "avf_phone_id", data, survey, "survey_responses")

    return data
def test__sha_with_prev(self):
    """Test hashing the same data dict both with a previous sha and with no previous sha (None)."""
    data = {"phone": "+441632000001", "age": 20}

    # Hash chained onto an existing sha.
    prev_sha = "9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08"
    self.assertEqual(
        TracedData._sha_with_prev(data, prev_sha),
        "7e7f3e31168dd8587dac8a58858b17d7644c21400b91ae000f3fcb0f6f8017d4"
    )

    # Hash with no previous sha.
    self.assertEqual(
        TracedData._sha_with_prev(data, None),
        "5e106f6389b42724efb754067be30d67473ce7f443464c565a8e4d57e62d1fd3"
    )
def test__traced_repr(self):
    """Test that nested TracedData values are replaced by their sha, while plain values pass through."""
    demog_td = self.generate_demog_1_td()

    replaced = TracedData._replace_traced_with_sha({"phone": "+441632000001", "demog_1": demog_td})
    self.assertDictEqual(replaced, {"phone": "+441632000001", "demog_1": demog_td._sha})
def convert_facebook_comments_to_traced_data(user, dataset_name, raw_comments, facebook_uuid_table):
    """
    Converts raw Facebook comments to TracedData, de-identifying commenter ids via the uuid table.

    :param user: Identifier of the user running this program, for TracedData Metadata.
    :param dataset_name: Name used to prefix every comment field key in the output.
    :param raw_comments: Raw comment dicts; each must contain "from" -> "id" and "created_time".
                         Note: each comment's "created_time" is normalised to an ISO string in place.
    :param facebook_uuid_table: Table used to map Facebook user ids to AVF uuids.
    :return: list of TracedData
    """
    log.info(f"Converting {len(raw_comments)} Facebook comments to TracedData...")

    commenter_ids = {comment["from"]["id"] for comment in raw_comments}
    facebook_to_uuid_lut = facebook_uuid_table.data_to_uuid_batch(commenter_ids)

    # Use a placeholder avf facebook id for now, to make the individuals file work until we know if we'll be able
    # to see Facebook user ids or not.
    traced_comments = []
    for comment in raw_comments:
        # Normalise the timestamp to ISO format and check it parses as a valid UTC ISO string.
        comment["created_time"] = isoparse(comment["created_time"]).isoformat()
        validators.validate_utc_iso_string(comment["created_time"])

        comment_dict = {"avf_facebook_id": facebook_to_uuid_lut[comment["from"]["id"]]}
        for key, value in comment.items():
            comment_dict[f"{dataset_name}.{key}"] = value

        traced_comments.append(
            TracedData(comment_dict, Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())))

    log.info(f"Converted {len(traced_comments)} Facebook comments to TracedData")
    return traced_comments
def test_fold_groups(self):
    """Test that fold_groups reduces each group of TracedData items to a single folded item."""
    data = [TracedData({"x": c}, Metadata("test_user", Metadata.get_call_location(), i))
            for i, c in enumerate(["a", "b", "c", "d", "e"])]

    groups = [
        [data[0]],
        [data[1], data[2], data[3]],
        [data[4]]
    ]

    def fold_fn(left, right):
        # Work on copies so the inputs are not mutated.
        left = left.copy()
        right = right.copy()

        combined = {"x": "{}{}".format(left["x"], right["x"])}
        left.append_data(combined, Metadata("test_user", Metadata.get_call_location(), 10))
        right.append_data(combined, Metadata("test_user", Metadata.get_call_location(), 11))

        folded = left
        left.append_traced_data("folded_with", right, Metadata("test_user", Metadata.get_call_location(), 12))

        return folded

    folded_data = FoldTracedData.fold_groups(groups, fold_fn)

    # Singleton groups pass through unchanged; the middle group folds "b", "c", "d" together.
    self.assertDictEqual(dict(folded_data[0].items()), {"x": "a"})
    self.assertDictEqual(dict(folded_data[1].items()), {"x": "bcd"})
    self.assertDictEqual(dict(folded_data[2].items()), {"x": "e"})
def fetch_from_recovery_csv(user, google_cloud_credentials_file_path, raw_data_dir, phone_number_uuid_table,
                            recovery_csv_source):
    """
    Downloads recovered message CSVs from Google Cloud Storage, converts each row to TracedData, and caches the
    results as JSONL files under raw_data_dir (one file per flow, skipped if already present on disk).

    :param user: Identifier of the user running this program, for TracedData Metadata.
    :param google_cloud_credentials_file_path: Path to credentials used to download the CSV blobs.
    :param raw_data_dir: Directory to write the cached '<flow_name>.jsonl' files to.
    :param phone_number_uuid_table: Phone number <-> uuid table.
                                    NOTE(review): unused in this function body — possibly kept for interface
                                    consistency with sibling fetchers; confirm before removing.
    :param recovery_csv_source: Source object providing activation_flow_urls and survey_flow_urls to download.
    """
    log.info("Fetching data from a Recovery CSV...")
    for blob_url in recovery_csv_source.activation_flow_urls + recovery_csv_source.survey_flow_urls:
        flow_name = blob_url.split('/')[-1].split('.')[
            0]  # Takes the name between the last '/' and the '.csv' ending
        traced_runs_output_path = f"{raw_data_dir}/{flow_name}.jsonl"
        if os.path.exists(traced_runs_output_path):
            # Cached file already exists for this flow; don't re-download.
            log.info(
                f"File '{traced_runs_output_path}' for blob '{blob_url}' already exists; skipping download"
            )
            continue

        log.info(f"Downloading recovered data from '{blob_url}'...")
        raw_csv_string = StringIO(
            google_cloud_utils.download_blob_to_string(
                google_cloud_credentials_file_path, blob_url))
        raw_data = list(csv.DictReader(raw_csv_string))
        log.info(f"Downloaded {len(raw_data)} recovered messages")

        log.info("Converting the recovered messages to TracedData...")
        traced_runs = []
        for i, row in enumerate(raw_data):
            raw_date = row["ReceivedOn"]
            # The CSVs contain dates with and without a seconds component; pick the matching format by length.
            if len(raw_date) == len("dd/mm/YYYY HH:MM"):
                parsed_raw_date = datetime.strptime(raw_date, "%d/%m/%Y %H:%M")
            else:
                parsed_raw_date = datetime.strptime(raw_date, "%d/%m/%Y %H:%M:%S")
            # Raw dates are naive local times; localise to Africa/Mogadishu before converting to ISO.
            localized_date = pytz.timezone("Africa/Mogadishu").localize(
                parsed_raw_date)

            # Require senders to already be de-identified into AVF's uuid format.
            assert row["Sender"].startswith("avf-phone-uuid-"), \
                f"The 'Sender' column for '{blob_url} contains an item that has not been de-identified " \
                f"into Africa's Voices Foundation's de-identification format. This may be done with de_identify_csv.py."

            d = {
                "avf_phone_id": row["Sender"],
                "message": row["Message"],
                "received_on": localized_date.isoformat(),
                # Derive a stable run id from the full row contents.
                "run_id": SHAUtils.sha_dict(row)
            }

            traced_runs.append(
                TracedData(
                    d,
                    Metadata(user, Metadata.get_call_location(),
                             TimeUtils.utc_now_as_iso_string())))
        log.info("Converted the recovered messages to TracedData")

        log.info(
            f"Exporting {len(traced_runs)} TracedData items to {traced_runs_output_path}..."
        )
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
        with open(traced_runs_output_path, "w") as f:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(
                traced_runs, f)
        log.info(f"Exported TracedData")
def generate_test_data():
    """Return example TracedData items, with gender data appended to item 1 and age data traced onto item 2."""
    test_data = list(generate_traced_data_iterable())

    test_data[1].append_data(
        {"Gender": "f", "Gender_Coded": "Female"},
        Metadata("test_user", "gender_coder", 10)
    )

    age_td = TracedData({"age": 4}, Metadata("test_user", "age_generator", 11))
    test_data[2].append_traced_data("Age_Data", age_td, Metadata("test_user", "age_merger", 12))

    return test_data
def import_csv_to_traced_data_iterable(user, f):
    """
    Loads a CSV into new TracedData objects.

    :param user: Identifier of user running this program.
    :type user: str
    :param f: File to import from.
    :type f: file-like
    :return: TracedData objects imported from the provided file.
    :rtype: generator of TracedData
    """
    reader = csv.DictReader(f)
    for row in reader:
        metadata = Metadata(user, Metadata.get_call_location(), time.time())
        yield TracedData(dict(row), metadata)
def import_jsonl_to_traced_data_iterable(f):
    """
    Imports a JSONL file to TracedData objects.

    Note that each line of the JSONL file must be a serialized representation of a TracedData object, in the
    format as produced by TracedDataJsonIO.export_traced_data_iterable_to_jsonl.

    :param f: File to import JSONL from.
    :type f: file-like
    :return: TracedData objects deserialized from the JSONL file.
    :rtype: list of TracedData
    """
    # Despite the "_iterable" name, this is eager: the whole file is read and a fully-materialised
    # list is returned (the original behavior; the previous docstring incorrectly claimed a generator).
    return [TracedData.deserialize(json.loads(line)) for line in f]
def test_join_iterables(self):
    """Test TracedData.join_iterables: conflicting values fail, a clean join merges on the key,
    and duplicate join keys fail."""
    data_1 = [
        TracedData(
            {"id": "A", "gender": "male", "age": 55},
            Metadata("test_user", Metadata.get_call_location(), time.time())
        ),
        TracedData(
            {"id": "B", "age": 19},
            Metadata("test_user", Metadata.get_call_location(), time.time())
        )
    ]

    data_2 = [
        TracedData(
            {"id": "C", "country": "Somalia"},
            Metadata("test_user", Metadata.get_call_location(), time.time())
        ),
        TracedData(
            {"id": "A", "country": "Kenya", "gender": "female"},
            Metadata("test_user", Metadata.get_call_location(), time.time())
        )
    ]

    # Joining should fail because item with id 'A' has conflicting genders
    self.assertRaises(AssertionError,
                      lambda: TracedData.join_iterables("test_user", "id", data_1, data_2, "data_2"))

    # Fix the gender conflict problem, and test that the join now works as expected.
    data_2[1].append_data({"gender": "male"}, Metadata("test_user", Metadata.get_call_location(), time.time()))
    merged = TracedData.join_iterables("test_user", "id", data_1, data_2, "data_2")

    merged_dicts = [dict(td.items()) for td in merged]
    expected_dicts = [
        {"id": "B", "age": 19},
        {"id": "C", "country": "Somalia"},
        {"id": "A", "gender": "male", "age": 55, "country": "Kenya"}
    ]
    self.assertEqual(len(merged_dicts), len(expected_dicts))
    for merged, expected in zip(merged_dicts, expected_dicts):
        self.assertDictEqual(merged, expected)

    # Modify data_1 to include multiple TracedData objects with the same join key, and ensure joining then fails.
    data_1[0].append_data({"id": "B"}, Metadata("test_user", Metadata.get_call_location(), time.time()))
    self.assertRaises(AssertionError,
                      lambda: TracedData.join_iterables("test_user", "id", data_1, data_2, "data_2"))
def test_fold_traced_data(self):
    """Test folding two TracedData objects field-by-field with a mix of fold strategies,
    checking the inputs are left unchanged and the folded output has the expected values."""
    td_1_dict = {
        "equal_1": 4, "equal_2": "xyz",
        "concat": "abc",
        "matrix_1": Codes.MATRIX_0, "matrix_2": Codes.MATRIX_0,
        "bool_1": Codes.FALSE, "bool_2": Codes.TRUE,
        "yes_no_1": Codes.YES, "yes_no_2": Codes.YES,
        # Keys "other_1"/"other_2" deliberately have no fold strategy assigned below.
        "other_1": "other 1", "other_2": "other 2"
    }

    td_2_dict = {
        "equal_1": 4, "equal_2": "xyz",
        "concat": "def",
        "matrix_1": Codes.MATRIX_1, "matrix_2": Codes.MATRIX_0,
        "bool_1": Codes.TRUE, "bool_2": Codes.TRUE,
        "yes_no_1": Codes.YES, "yes_no_2": Codes.NO,
        "other_1": "other",
    }

    td_1 = TracedData(td_1_dict, Metadata("test_user", Metadata.get_call_location(), 0))
    td_2 = TracedData(td_2_dict, Metadata("test_user", Metadata.get_call_location(), 1))

    fold_strategies = {
        "equal_1": FoldStrategies.assert_equal,
        "equal_2": FoldStrategies.assert_equal,
        "concat": FoldStrategies.concatenate,
        "bool_1": FoldStrategies.boolean_or,
        "bool_2": FoldStrategies.boolean_or,
        "matrix_1": FoldStrategies.matrix,
        "matrix_2": FoldStrategies.matrix,
        "yes_no_1": FoldStrategies.yes_no_amb,
        "yes_no_2": FoldStrategies.yes_no_amb
    }
    folded_td = FoldTracedData.fold_traced_data("test_user", td_1, td_2, fold_strategies)

    # Test input tds unchanged
    self.assertDictEqual(dict(td_1.items()), td_1_dict)
    self.assertDictEqual(dict(td_2.items()), td_2_dict)

    # Test folded td has expected values
    self.assertDictEqual(
        dict(folded_td.items()),
        {
            "equal_1": 4, "equal_2": "xyz",
            "concat": "abc;def",
            "matrix_1": Codes.MATRIX_1, "matrix_2": Codes.MATRIX_0,
            "bool_1": Codes.TRUE, "bool_2": Codes.TRUE,
            "yes_no_1": Codes.YES, "yes_no_2": Codes.AMBIVALENT
        }
    )
echo_mobile_start_date = session.datetime_to_echo_mobile_datetime(user_start_date) echo_mobile_end_date = session.datetime_to_echo_mobile_datetime(user_end_date) report = session.messages_report( echo_mobile_start_date.strftime("%Y-%m-%d"), echo_mobile_end_date.strftime("%Y-%m-%d"), direction=MessageDirection.Incoming) finally: # Delete the background task we made when generating the report session.delete_session_background_tasks() # Parse the downloaded report into a list of TracedData objects, de-identifying in the process. messages = [] for row in csv.DictReader(StringIO(report)): row["avf_phone_id"] = phone_uuids.add_phone(row["Phone"]) del row["Phone"] messages.append(TracedData(dict(row), Metadata(user, Metadata.get_call_location(), time.time()))) # Convert times to ISO for td in messages: td.append_data( {"Date": session.echo_mobile_date_to_iso(td["Date"])}, Metadata(user, Metadata.get_call_location(), time.time()) ) # Filter out messages sent outwith the desired time range. messages = list(filter(lambda td: echo_mobile_start_date <= isoparse(td["Date"]) < echo_mobile_end_date, messages)) # Add a unique id to each message for td in messages: td.append_data( {"avf_message_id": message_uuids.add_message(
def test_export_import_one_single_coded_scheme(self):
    """End-to-end test of exporting messages to a Coda 2 file and importing labels back,
    for a single-coded scheme, including conflict detection on export."""
    file_path = path.join(self.test_dir, "coda_2_test.json")

    # Build raw input data
    message_dicts = [
        {"gender_raw": "woman", "gender_sent_on": "2018-11-01T07:13:04+03:00"},
        {"gender_raw": "", "gender_sent_on": "2018-11-01T07:17:04+03:00"},
        {"gender_raw": "hiya", "gender_sent_on": "2018-11-01T07:19:04+05:00"},
        {},
        {"gender_raw": "boy", "gender_sent_on": "2018-11-02T19:00:29+03:00"},
        {"gender_raw": "man", "gender_sent_on": "2018-11-02T19:00:29+03:00"},
    ]
    messages = [TracedData(d, Metadata("test_user", Metadata.get_call_location(), i))
                for i, d in enumerate(message_dicts)]

    # Add message ids
    TracedDataCodaV2IO.compute_message_ids("test_user", messages, "gender_raw", "gender_coda_id")

    # Load gender scheme
    with open("tests/traced_data/resources/coda_2_gender_scheme.json") as f:
        gender_scheme = CodeScheme.from_firebase_map(json.load(f))

    # Apply the English gender cleaner (time/location are mocked so the export is deterministic)
    with mock.patch("core_data_modules.util.TimeUtils.utc_now_as_iso_string") as time_mock, \
            mock.patch("core_data_modules.traced_data.Metadata.get_function_location") as location_mock:
        time_mock.return_value = "2018-11-02T15:00:07+00:00"
        location_mock.return_value = "english.DemographicCleaner.clean_gender"

        CleaningUtils.apply_cleaner_to_traced_data_iterable(
            "test_user", messages, "gender_raw", "gender_coded",
            english.DemographicCleaner.clean_gender, gender_scheme
        )

    # Export to a Coda 2 messages file
    with open(file_path, "w") as f:
        TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
            messages, "gender_raw", "gender_sent_on", "gender_coda_id", {"gender_coded": gender_scheme}, f)
    self.assertTrue(filecmp.cmp(file_path, "tests/traced_data/resources/coda_2_export_expected_one_scheme.json"))

    # Test importing with no file available
    imported_messages = []
    for td in messages:
        imported_messages.append(td.copy())
    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
        "test_user", imported_messages, "gender_coda_id", {"gender_coded": gender_scheme})

    # Deliberately testing the read can be done twice
    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
        "test_user", imported_messages, "gender_coda_id", {"gender_coded": gender_scheme})

    na_id = gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id
    nr_id = gender_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

    # Set TRUE_MISSING codes
    for td in imported_messages:
        na_label = CleaningUtils.make_label_from_cleaner_code(
            gender_scheme, gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
            "test_export_traced_data_iterable_to_coda_2", date_time_utc="2018-11-02T10:00:00+00:00"
        )
        if td.get("gender_raw", "") == "":
            td.append_data({"gender_coded": na_label.to_dict()},
                           Metadata("test_user", Metadata.get_call_location(), time.time()))

    imported_code_ids = [td["gender_coded"]["CodeID"] for td in imported_messages]
    self.assertListEqual(imported_code_ids, [nr_id, na_id, nr_id, na_id, nr_id, nr_id])

    # Test importing from the test file
    imported_messages = []
    for td in messages:
        imported_messages.append(td.copy())
    with open("tests/traced_data/resources/coda_2_import_test_one_scheme.json", "r") as f:
        TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
            "test_user", imported_messages, "gender_coda_id", {"gender_coded": gender_scheme}, f)

    # Set TRUE_MISSING codes
    for td in imported_messages:
        na_label = CleaningUtils.make_label_from_cleaner_code(
            gender_scheme, gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
            "test_export_traced_data_iterable_to_coda_2", date_time_utc="2018-11-02T10:00:00+00:00"
        )
        if td.get("gender_raw", "") == "":
            td.append_data({"gender_coded": na_label.to_dict()},
                           Metadata("test_user", Metadata.get_call_location(), time.time()))

    imported_code_ids = [td["gender_coded"]["CodeID"] for td in imported_messages]
    expected_code_ids = [
        gender_scheme.get_code_with_match_value("female").code_id,  # Manually approved auto-code
        gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id,  # Empty raw message
        gender_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id,  # Manually assigned code which isn't checked
        gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id,  # No raw message
        gender_scheme.get_code_with_control_code(Codes.NOT_CODED).code_id,  # Manually Not Coded
        gender_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id,  # Manually un-coded
    ]
    self.assertListEqual(imported_code_ids, expected_code_ids)

    # Add an element with the same raw text but a conflicting label
    messages.append(TracedData({
        "gender_raw": "woman", "gender_sent_on": "2018-11-01T07:13:04+03:00",
        "gender_coded": CleaningUtils.make_label_from_cleaner_code(
            gender_scheme, gender_scheme.get_code_with_match_value("male"),
            "make_location_label", date_time_utc="2018-11-03T13:40:50Z").to_dict()
    }, Metadata("test_user", Metadata.get_call_location(), time.time())))
    TracedDataCodaV2IO.compute_message_ids("test_user", messages, "gender_raw", "gender_coda_id")

    # Exporting should now fail, because two messages share an id but carry different labels.
    with open(file_path, "w") as f:
        try:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                messages, "gender_raw", "gender_sent_on", "gender_coda_id", {"gender_coded": gender_scheme}, f)
        except AssertionError as e:
            assert str(e) == "Messages with the same id " \
                             "(cf2e5bff1ef03dcd20d1a0b18ef7d89fc80a3554434165753672f6f40fde1d25) have different " \
                             "labels for coded_key 'gender_coded'"
            return
        self.fail("Exporting data with conflicting labels did not fail")
somali.DemographicCleaner.clean_yes_no, "Cholera_Vaccination (Text) - wt_practice": somali.DemographicCleaner.clean_yes_no, "Trustworthy_Advisors (Text) - wt_practice": None } # Load data from JSON file with open(demog_1_input_path, "r") as f: demog_1_data = TracedDataJsonIO.import_json_to_traced_data_iterable(f) with open(demog_2_input_path, "r") as f: demog_2_data = TracedDataJsonIO.import_json_to_traced_data_iterable(f) with open(practice_input_path, "r") as f: practice_data = TracedDataJsonIO.import_json_to_traced_data_iterable(f) # Join the survey data on "avf_phone_id" demog_data = TracedData.join_iterables(user, "avf_phone_id", demog_1_data, demog_2_data, "wt_demog_2") all_survey_data = TracedData.join_iterables(user, "avf_phone_id", demog_data, practice_data, "wt_practice") # Clean the survey responses for td in all_survey_data: for key, cleaner in cleaning_plan.items(): if cleaner is not None and key in td: td.append_data({"{}_clean".format(key): cleaner(td[key])}, Metadata(user, Metadata.get_call_location(), time.time())) # Mark missing entries in the raw data as true missing for td in all_survey_data: for key in cleaning_plan:
print("Number of mislocated messages found: {}".format( len(mislocated_messages))) # Sort by ascending order of modification date. mislocated_messages = list(mislocated_messages) mislocated_messages.reverse() # Convert the mislocated messages to de-identified TracedData traced_messages = [] for message in mislocated_messages: traced_messages.append( TracedData( { "avf_phone_id": phone_uuids.add_phone(message.urn), "id": message.id, "text": message.text, "created_on": message.created_on.isoformat(), "sent_on": message.sent_on.isoformat(), "modified_on": message.modified_on.isoformat() }, Metadata(user, Metadata.get_call_location(), time.time()))) # Make the traced messages look like week 4 runs def format_label(parameter): """ Creates a week 4 key for the given parameter. >>> format_label("Text") 'S06E04_Cholera_Recurrency (Text) - wt_s06e04_activation' """ category_title = "S06E04_Cholera_Recurrency" flow_name = "wt_s06e04_activation"
def generate_appended_traced_data():
    """Build a message TracedData object with two cleaned demographic TracedData objects appended to it."""
    message_td = TracedData({"phone": "+441632000001", "message": "Hello AVF!"},
                            Metadata("test_user", "run_fetcher", 0))
    message_td.append_data({"message": "hello avf"}, Metadata("test_user", "message_cleaner", 1))

    demog_1_td = TracedData({"phone": "+441632000001", "gender": "woman", "age": "twenty"},
                            Metadata("test_user", "run_fetcher", 2))
    demog_1_td.append_data({"gender": "female", "age": 20}, Metadata("test_user", "demog_cleaner", 3))

    demog_2_td = TracedData({"phone": "+441632000001", "country": "Kenyan citizen"},
                            Metadata("test_user", "run_fetcher", 4))
    demog_2_td.append_data({"country": "Kenya"}, Metadata("test_user", "demog_cleaner", 5))

    message_td.append_traced_data("demog_1", demog_1_td, Metadata("test_user", "demog_1_append", 6))
    message_td.append_traced_data("demog_2", demog_2_td, Metadata("test_user", "demog_2_append", 7))

    return message_td
def generate_message_td():
    """Return a TracedData object for a single message, with a cleaned version of the message appended."""
    td = TracedData({"phone": "+441632000001", "message": "Hello AVF!"},
                    Metadata("test_user", "run_fetcher", time.time()))
    td.append_data({"message": "hello avf"}, Metadata("test_user", "message_cleaner", time.time()))
    return td
def test_export_import_one_multi_coded_scheme(self):
    """End-to-end test of exporting messages to a Coda 2 file and importing labels back,
    for a multi-coded scheme, including re-use of an un-rewound file pointer failing."""
    file_path = path.join(self.test_dir, "coda_2_test.json")

    # Build raw input data
    message_dicts = [
        {"msg_raw": "food", "msg_sent_on": "2018-11-01T07:13:04+03:00"},
        {"msg_raw": "", "msg_sent_on": "2018-11-01T07:17:04+03:00"},
        {"msg_raw": "food + water", "msg_sent_on": "2018-11-01T07:19:04+05:00"},
        {},
        {"msg_raw": "water", "msg_sent_on": "2018-11-02T19:00:29+03:00"},
        {"msg_raw": "abcd", "msg_sent_on": "2018-11-02T20:30:45+03:00"}
    ]
    messages = [TracedData(d, Metadata("test_user", Metadata.get_call_location(), i))
                for i, d in enumerate(message_dicts)]

    # Add message ids
    TracedDataCodaV2IO.compute_message_ids("test_user", messages, "msg_raw", "msg_coda_id")

    # Load msg scheme
    with open("tests/traced_data/resources/coda_2_msg_scheme.json") as f:
        msg_scheme = CodeScheme.from_firebase_map(json.load(f))

    # Export to a Coda 2 messages file
    with open(file_path, "w") as f:
        TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
            messages, "msg_raw", "msg_sent_on", "msg_coda_id", {"msg_coded": msg_scheme}, f)
    self.assertTrue(filecmp.cmp(file_path, "tests/traced_data/resources/coda_2_export_expected_multi_coded.json"))

    # Test importing with no file available
    imported_messages = []
    for td in messages:
        imported_messages.append(td.copy())
    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
        "test_user", imported_messages, "msg_coda_id", {"msg_coded": msg_scheme})

    # Deliberately testing the read can be done twice
    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
        "test_user", imported_messages, "msg_coda_id", {"msg_coded": msg_scheme})

    na_id = msg_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id
    nr_id = msg_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

    # Set TRUE_MISSING codes
    for td in imported_messages:
        na_label = CleaningUtils.make_label_from_cleaner_code(
            msg_scheme, msg_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
            "test_export_traced_data_iterable_to_coda_2", date_time_utc="2018-11-02T10:00:00+00:00"
        )
        if td.get("msg_raw", "") == "":
            td.append_data({"msg_coded": [na_label.to_dict()]},
                           Metadata("test_user", Metadata.get_call_location(), time.time()))

    for td in imported_messages:
        self.assertEqual(len(td["msg_coded"]), 1)
    imported_code_ids = [td["msg_coded"][0]["CodeID"] for td in imported_messages]
    self.assertListEqual(imported_code_ids, [nr_id, na_id, nr_id, na_id, nr_id, nr_id])

    # Test importing from the test file
    imported_messages = []
    for td in messages:
        imported_messages.append(td.copy())
    with open("tests/traced_data/resources/coda_2_import_test_multi_coded.json", "r") as f:
        TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
            "test_user", imported_messages, "msg_coda_id", {"msg_coded": msg_scheme}, f)

        # Test that reading the same file-pointer twice without moving it back to the start of the file fails
        try:
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                "test_user", imported_messages, "msg_coda_id", {"msg_coded": msg_scheme}, f)
            self.fail("Re-using the same file pointer didn't raise an assertion error")
        except AssertionError as e:
            self.assertEqual(str(e), "File-pointer not at byte 0. "
                                     "Should you have used e.g. `f.seek(0)` before calling this method?")

    # Set TRUE_MISSING codes
    for td in imported_messages:
        na_label = CleaningUtils.make_label_from_cleaner_code(
            msg_scheme, msg_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
            "test_export_traced_data_iterable_to_coda_2", date_time_utc="2018-11-02T10:00:00+00:00"
        )
        if td.get("msg_raw", "") == "":
            td.append_data({"msg_coded": [na_label.to_dict()]},
                           Metadata("test_user", Metadata.get_call_location(), time.time()))

    imported_code_ids = []
    for td in imported_messages:
        imported_code_ids.append([code["CodeID"] for code in td["msg_coded"]])
    expected_code_ids = [
        [msg_scheme.get_code_with_match_value("food").code_id],
        [msg_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id],
        [msg_scheme.get_code_with_match_value("food").code_id, msg_scheme.get_code_with_match_value("water").code_id],
        [msg_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id],
        [msg_scheme.get_code_with_match_value("water").code_id],
        [msg_scheme.get_code_with_control_code(Codes.NOT_CODED).code_id]
    ]
    # Multi-coded fields are order-insensitive, so compare as sets of equal size.
    for x, y in zip(imported_code_ids, expected_code_ids):
        self.assertEqual(len(x), len(y))
        self.assertSetEqual(set(x), set(y))
def convert_runs_to_traced_data(user, raw_runs, raw_contacts, phone_uuids, test_contacts=None):
    """
    Converts raw data fetched from Rapid Pro to TracedData.

    :param user: Identifier of the user running this program, for TracedData Metadata.
    :type user: str
    :param raw_runs: Raw run objects to convert to TracedData.
    :type raw_runs: list of temba_client.v2.types.Run
    :param raw_contacts: Raw contact objects to use when converting to TracedData.
    :type raw_contacts: list of temba_client.v2.types.Contact
    :param phone_uuids: Phone number <-> UUID table.
    :type phone_uuids: id_infrastructure.firestore_uuid_table.FirestoreUuidTable
    :param test_contacts: Rapid Pro contact UUIDs of test contacts.
                          Runs from any of those test contacts will be tagged with {'test_run': True}
    :type test_contacts: list of str | None
    :return: Raw data fetched from Rapid Pro converted to TracedData.
    :rtype: list of TracedData
    """
    if test_contacts is None:
        test_contacts = []

    log.info(f"Converting {len(raw_runs)} raw runs to TracedData...")

    contacts_lut = {c.uuid: c for c in raw_contacts}

    # First pass: filter out runs we can't process, and collect the phone numbers to de-identify in one batch.
    runs_with_uuids = []
    phone_numbers = []
    for run in raw_runs:
        if run.contact.uuid not in contacts_lut:
            # Sometimes contact uuids which appear in `runs` do not appear in `contact_runs`.
            # I have only observed this happen for contacts which were created very recently.
            # This test skips the run in this case; it should be included next time this script is executed.
            log.warning(f"Run found with Rapid Pro Contact UUID '{run.contact.uuid}', "
                        f"but this id is not present in the downloaded contacts")
            continue

        contact_urns = contacts_lut[run.contact.uuid].urns
        if len(contact_urns) == 0:
            log.warning(f"Ignoring contact with no urn. URNs: {contact_urns} "
                        f"(Rapid Pro Contact UUID: {run.contact.uuid})")
            continue

        phone_numbers.append(PhoneCleaner.normalise_phone(contact_urns[0]))
        runs_with_uuids.append(run)

    # De-identify all the phone numbers in a single batch request.
    phone_to_uuid_lut = phone_uuids.data_to_uuid_batch(phone_numbers)

    # Second pass: build one TracedData object per run.
    traced_runs = []
    for run in runs_with_uuids:
        contact_urns = contacts_lut[run.contact.uuid].urns

        run_dict = {
            "avf_phone_id": phone_to_uuid_lut[PhoneCleaner.normalise_phone(contact_urns[0])],
            f"run_id - {run.flow.name}": run.id
        }

        for category, response in run.values.items():
            run_dict[category.title() + " (Category) - " + run.flow.name] = response.category
            run_dict[category.title() + " (Value) - " + run.flow.name] = response.value
            # Convert from "input" to "text" here to match terminology in Rapid Pro's Excel exports.
            run_dict[category.title() + " (Text) - " + run.flow.name] = response.input
            run_dict[category.title() + " (Name) - " + run.flow.name] = response.name
            run_dict[category.title() + " (Time) - " + run.flow.name] = response.time.isoformat()
            run_dict[category.title() + " (Run ID) - " + run.flow.name] = run.id

        if run.contact.uuid in test_contacts:
            run_dict["test_run"] = True
        else:
            assert len(contact_urns) == 1, \
                f"A non-test contact has multiple URNs (Rapid Pro Contact UUID: {run.contact.uuid})"

        run_dict[f"run_created_on - {run.flow.name}"] = run.created_on.isoformat()
        run_dict[f"run_modified_on - {run.flow.name}"] = run.modified_on.isoformat()
        run_dict[f"run_exited_on - {run.flow.name}"] = None if run.exited_on is None else run.exited_on.isoformat()
        run_dict[f"run_exit_type - {run.flow.name}"] = run.exit_type

        traced_runs.append(
            TracedData(run_dict, Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())))

    log.info(f"Converted {len(traced_runs)} raw runs to TracedData")
    return traced_runs
) parser.add_argument( "json_output_path", metavar="json-output-path", help="Path to a JSON file to write processed messages to") args = parser.parse_args() user = args.user json_input_path = args.json_input_path survey_input_path = args.survey_input_path json_output_path = args.json_output_path # Load messages with open(json_input_path, "r") as f: messages = TracedDataJsonIO.import_json_to_traced_data_iterable(f) # Load surveys with open(survey_input_path, "r") as f: surveys = TracedDataJsonIO.import_json_to_traced_data_iterable(f) # Add survey data to the messages TracedData.update_iterable(user, "avf_phone_id", messages, surveys, "survey_responses") # Write json output IOUtils.ensure_dirs_exist_for_file(json_output_path) with open(json_output_path, "w") as f: TracedDataJsonIO.export_traced_data_iterable_to_json(messages, f, pretty_print=True)
bossaso_output_path = args.bossaso_output_path baidoa_output_path = args.baidoa_output_path # Load the phone number <-> uuid table log.info( f"Loading the phone number <-> uuid table from file '{phone_number_uuid_table_path}'..." ) with open(phone_number_uuid_table_path, "r") as f: phone_number_uuid_table = PhoneNumberUuidTable.load(f) log.info(f"Loaded {len(phone_number_uuid_table.numbers())} contacts") # Load the ADSS traced data log.info(f"Loading ADSS traced data from file '{traced_data_path}'...") with open(traced_data_path, "r") as f: # Manually deserialise the traced data because ADSS used an older serialiser data = [TracedData.deserialize(d) for d in json.load(f)] log.info(f"Loaded {len(data)} traced data objects") # Search the TracedData for the bossaso/baidoa contacts bossaso_uuids = set() baidoa_uuids = set() log.info("Searching for participants from Bossaso or Baidoa") for td in data: if td["district_coded"] == "STOP": continue if td["district_coded"][ "CodeID"] == CodeSchemes.SOMALIA_DISTRICT.get_code_with_match_value( "bossaso").code_id: bossaso_uuids.add(td["uid"]) elif td["district_coded"][
1: "wt_s06e1_activation", 2: "wt_s06e2_activation", 3: "wt_s06e03_activation", 4: "wt_s06e04_activation", 5: "wt_s06e05_activation" } # Produce output columns for each input message all_messages = [] all_show_keys = {1: set(), 2: set(), 3: set(), 4: set(), 5: set()} trustworthy_advisors_keys = set() outbreak_keys = set() trustworthy_advisors_raw_key = "Trustworthy_Advisors (Text) - wt_practice" for show_number, show_name in shows.items(): show_messages = load_show(show_name) TracedData.update_iterable(user, "avf_phone_id", show_messages, surveys, "surveys") for td in show_messages: AnalysisKeys.set_analysis_keys(user, show_number, td) AnalysisKeys.set_matrix_analysis_keys(user, all_show_keys[show_number], show_number, td) AnalysisKeys.set_matrix_keys( user, td, trustworthy_advisors_keys, "{}_coded".format(trustworthy_advisors_raw_key), "trustworthy_advisors_clean") AnalysisKeys.set_matrix_keys( user, td, outbreak_keys, "{}_outbreak_coded".format(trustworthy_advisors_raw_key),
def make_traced_data(dicts, start_time=0):
    """Wrap each dict in a TracedData object, with timestamps increasing by one from `start_time`."""
    traced = []
    for offset, d in enumerate(dicts):
        traced.append(TracedData(d, Metadata("test_user", Metadata.get_call_location(), offset + start_time)))
    return traced
def generate_demog_1_td():
    """Return a demographics TracedData object with cleaned gender and age appended."""
    td = TracedData({"phone": "+441632000001", "gender": "woman", "age": "twenty"},
                    Metadata("test_user", "run_fetcher", time.time()))
    td.append_data({"gender": "female", "age": 20}, Metadata("test_user", "demog_cleaner", time.time()))
    return td
def generate_demog_2_td():
    """Return a demographics TracedData object with a cleaned country appended."""
    td = TracedData({"phone": "+441632000001", "country": "Kenyan citizen"},
                    Metadata("test_user", "run_fetcher", time.time()))
    td.append_data({"country": "Kenya"}, Metadata("test_user", "demog_cleaner", time.time()))
    return td
with open(demog_surveys_input_path, "r") as f: surveys = TracedDataJsonIO.import_json_to_traced_data_iterable(f) # Filter out people who haven't answered the fgd_cc consent question fgd_cc_consent_key = "Response_1 (Category) - wt_fgd_cc" fgd_cc_data = [td for td in fgd_cc_data if fgd_cc_consent_key in td] # Filter out people that we have exported in the past prev_contacts = {td["Phone Number"] for td in prev_exports} fgd_cc_data = [ td for td in fgd_cc_data if "+{}".format( phone_uuids.get_phone(td["avf_phone_id"])) not in prev_contacts ] # Apply the demog surveys to the fgd_cc data TracedData.update_iterable(user, "avf_phone_id", fgd_cc_data, surveys, "surveys") # Annotate fgd_cc_data with whether or not the respondent's is from Mogadishu mogadishu_districts = [ "mogadishu", "mogadisho", # TODO: Remove need for this by correcting Coda file "boondheere", "cabdiasis", "daynile", "dharkenley", "heliwa", "hodan", "hawl wadaag", "karaan", "shangaani", "shibis",
def generate_test_data():
    """Returns a new TracedData object with example id, phone, and gender fields"""
    return TracedData(
        {"id": "0", "phone": "+441632000001", "gender": "man"},
        Metadata("test_user", "run_fetcher", time.time())
    )
def generate_traced_data_iterable():
    """Yield example TracedData objects, each with a distinct URN and a raw gender response."""
    responses = ["female", "m", "WoMaN", "27", "female"]
    for index, text in enumerate(responses):
        yield TracedData({"URN": "+001234500000" + str(index), "Gender": text},
                         Metadata("test_user", "data_generator", index))