Example 1
    def test_update_iterable(self):
        data_dicts = [
            {"id": "A", "message": "hello"},
            {"id": "B", "message": "hello"},
            {"id": "A", "message": "hi"}
        ]
        data = [
            TracedData(d, Metadata("test_user", "data_generator", time.time()))
            for d in data_dicts
        ]

        updates_dicts = [
            {"id": "A", "gender": "male"},
            {"id": "B", "gender": "female", "age": 20}
        ]
        updates = [
            TracedData(d, Metadata("test_user", "data_generator", time.time()))
            for d in updates_dicts
        ]

        TracedData.update_iterable("test_user", "id", data, updates, "demographics")

        expected_dicts = [
            {"id": "A", "message": "hello", "gender": "male"},
            {"id": "B", "message": "hello", "gender": "female", "age": 20},
            {"id": "A", "message": "hi", "gender": "male"}
        ]

        for td, expected_dict in zip(data, expected_dicts):
            self.assertDictEqual(dict(td.items()), expected_dict)
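A brief reading of the semantics this test pins down (inferred from the expected output above, not a claim about library internals):

    # update_iterable matches each TracedData in `data` to a TracedData in
    # `updates` via the shared "id" key, then appends that update's fields;
    # hence both rows with id "A" gain {"gender": "male"} above.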
Example 2
    def combine_raw_datasets(user, messages_datasets, surveys_datasets):
        data = []

        for messages_dataset in messages_datasets:
            data.extend(messages_dataset)

        for surveys_dataset in surveys_datasets:
            TracedData.update_iterable(user, "avf_phone_id", data, surveys_dataset, "survey_responses")

        return data
Example 3
    def combine_raw_datasets(user, shows_datasets, survey_datasets):
        data = []

        for show_dataset in shows_datasets:
            data.extend(show_dataset)

        for survey in survey_datasets:
            TracedData.update_iterable(user, "avf_phone_id", data, survey,
                                       "survey_responses")

        return data
Example 4
    def test__sha_with_prev(self):
        self.assertEqual(
            TracedData._sha_with_prev(
                {"phone": "+441632000001", "age": 20},
                "9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08"
            ),
            "7e7f3e31168dd8587dac8a58858b17d7644c21400b91ae000f3fcb0f6f8017d4"
        )

        self.assertEqual(
            TracedData._sha_with_prev({"phone": "+441632000001", "age": 20}, None),
            "5e106f6389b42724efb754067be30d67473ce7f443464c565a8e4d57e62d1fd3"
        )
Example 5
    def test__traced_repr(self):
        demog_1_td = self.generate_demog_1_td()

        self.assertDictEqual(
            TracedData._replace_traced_with_sha({"phone": "+441632000001", "demog_1": demog_1_td}),
            {"phone": "+441632000001", "demog_1": demog_1_td._sha}
        )
Example 6
def convert_facebook_comments_to_traced_data(user, dataset_name, raw_comments, facebook_uuid_table):
    log.info(f"Converting {len(raw_comments)} Facebook comments to TracedData...")

    facebook_uuids = {comment["from"]["id"] for comment in raw_comments}
    facebook_to_uuid_lut = facebook_uuid_table.data_to_uuid_batch(facebook_uuids)

    traced_comments = []
    # Use a placeholder avf facebook id for now, to make the individuals file work until we know if we'll be able
    # to see Facebook user ids or not.
    for comment in raw_comments:
        comment["created_time"] = isoparse(comment["created_time"]).isoformat()
        validators.validate_utc_iso_string(comment["created_time"])

        comment_dict = {
            "avf_facebook_id": facebook_to_uuid_lut[comment["from"]["id"]]
        }
        for k, v in comment.items():
            comment_dict[f"{dataset_name}.{k}"] = v

        traced_comments.append(
            TracedData(comment_dict,
                       Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())))

    log.info(f"Converted {len(traced_comments)} Facebook comments to TracedData")

    return traced_comments
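For illustration, the shape of one traced comment the loop above produces, assuming a hypothetical dataset_name of "s01e01" (all values invented):

# Illustrative output row (hypothetical values), assuming dataset_name="s01e01":
# {
#     "avf_facebook_id": "<uuid from facebook_to_uuid_lut>",
#     "s01e01.from": {"id": "123..."},
#     "s01e01.message": "hello",
#     "s01e01.created_time": "2020-01-01T12:00:00+00:00"
# }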
Example 7
    def test_fold_groups(self):
        data = [TracedData({"x": c}, Metadata("test_user", Metadata.get_call_location(), i))
                for i, c in enumerate(["a", "b", "c", "d", "e"])]

        groups = [
            [data[0]],
            [data[1], data[2], data[3]],
            [data[4]]
        ]

        def fold_fn(td_1, td_2):
            td_1 = td_1.copy()
            td_2 = td_2.copy()

            folded_dict = {"x": "{}{}".format(td_1["x"], td_2["x"])}

            td_1.append_data(folded_dict, Metadata("test_user", Metadata.get_call_location(), 10))
            td_2.append_data(folded_dict, Metadata("test_user", Metadata.get_call_location(), 11))

            folded = td_1
            td_1.append_traced_data("folded_with", td_2, Metadata("test_user", Metadata.get_call_location(), 12))

            return folded

        folded_data = FoldTracedData.fold_groups(groups, fold_fn)

        self.assertDictEqual(dict(folded_data[0].items()), {"x": "a"})
        self.assertDictEqual(dict(folded_data[1].items()), {"x": "bcd"})
        self.assertDictEqual(dict(folded_data[2].items()), {"x": "e"})
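As the assertions suggest, fold_groups reduces each group pairwise with fold_fn:

        # The middle group folds as ("b", "c") -> "bc", then ("bc", "d") -> "bcd",
        # matching the {"x": "bcd"} assertion; single-item groups fold to themselves.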
Example 8
def fetch_from_recovery_csv(user, google_cloud_credentials_file_path,
                            raw_data_dir, phone_number_uuid_table,
                            recovery_csv_source):
    log.info("Fetching data from a Recovery CSV...")
    for blob_url in recovery_csv_source.activation_flow_urls + recovery_csv_source.survey_flow_urls:
        flow_name = blob_url.split('/')[-1].split('.')[0]  # Takes the name between the last '/' and the '.csv' ending
        traced_runs_output_path = f"{raw_data_dir}/{flow_name}.jsonl"
        if os.path.exists(traced_runs_output_path):
            log.info(
                f"File '{traced_runs_output_path}' for blob '{blob_url}' already exists; skipping download"
            )
            continue

        log.info(f"Downloading recovered data from '{blob_url}'...")
        raw_csv_string = StringIO(
            google_cloud_utils.download_blob_to_string(
                google_cloud_credentials_file_path, blob_url))
        raw_data = list(csv.DictReader(raw_csv_string))
        log.info(f"Downloaded {len(raw_data)} recovered messages")

        log.info("Converting the recovered messages to TracedData...")
        traced_runs = []
        for i, row in enumerate(raw_data):
            raw_date = row["ReceivedOn"]
            if len(raw_date) == len("dd/mm/YYYY HH:MM"):
                parsed_raw_date = datetime.strptime(raw_date, "%d/%m/%Y %H:%M")
            else:
                parsed_raw_date = datetime.strptime(raw_date,
                                                    "%d/%m/%Y %H:%M:%S")
            localized_date = pytz.timezone("Africa/Mogadishu").localize(
                parsed_raw_date)

            assert row["Sender"].startswith("avf-phone-uuid-"), \
                f"The 'Sender' column for '{blob_url} contains an item that has not been de-identified " \
                f"into Africa's Voices Foundation's de-identification format. This may be done with de_identify_csv.py."

            d = {
                "avf_phone_id": row["Sender"],
                "message": row["Message"],
                "received_on": localized_date.isoformat(),
                "run_id": SHAUtils.sha_dict(row)
            }

            traced_runs.append(
                TracedData(
                    d,
                    Metadata(user, Metadata.get_call_location(),
                             TimeUtils.utc_now_as_iso_string())))
        log.info("Converted the recovered messages to TracedData")

        log.info(
            f"Exporting {len(traced_runs)} TracedData items to {traced_runs_output_path}..."
        )
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
        with open(traced_runs_output_path, "w") as f:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(
                traced_runs, f)
        log.info(f"Exported TracedData")
Example 9
    def generate_test_data():
        test_data = list(generate_traced_data_iterable())

        test_data[1].append_data({"Gender": "f", "Gender_Coded": "Female"},
                                 Metadata("test_user", "gender_coder", 10))

        test_data[2].append_traced_data("Age_Data",
                                        TracedData({"age": 4}, Metadata("test_user", "age_generator", 11)),
                                        Metadata("test_user", "age_merger", 12))

        return test_data
Example 10
    def import_csv_to_traced_data_iterable(user, f):
        """
        Loads a CSV into new TracedData objects.

        :param user: Identifier of user running this program.
        :type user: str
        :param f: File to import from.
        :type f: file-like
        :return: TracedData objects imported from the provided file.
        :rtype: generator of TracedData
        """
        for row in csv.DictReader(f):
            yield TracedData(
                dict(row),
                Metadata(user, Metadata.get_call_location(), time.time()))
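A minimal usage sketch of the importer above; the TracedDataCSVIO class name and the file name are assumptions for illustration:

    # Hypothetical usage; TracedDataCSVIO and "contacts.csv" are assumed names.
    with open("contacts.csv", "r") as f:
        for td in TracedDataCSVIO.import_csv_to_traced_data_iterable("test_user", f):
            print(dict(td.items()))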
Example 11
    def import_jsonl_to_traced_data_iterable(f):
        """
        Imports a JSONL file to TracedData objects.

        Note that the JSONL file must be a serialized representation of TracedData objects in the format
        as produced by TracedDataJsonIO.export_traced_data_iterable_to_jsonl.

        :param f: File to import JSONL from.
        :type f: file-like
        :return: TracedData objects deserialized from the JSONL file.
        :rtype: list of TracedData
        """
        data = []
        for line in f:
            data.append(TracedData.deserialize(json.loads(line)))
        return data
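And a JSONL counterpart; assuming, consistently with the export call used elsewhere in these examples, that this method lives on TracedDataJsonIO:

    # Usage sketch; the file name is illustrative.
    with open("traced_data.jsonl", "r") as f:
        data = TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)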
Example 12
    def test_join_iterables(self):
        data_1 = [
            TracedData(
                {"id": "A", "gender": "male", "age": 55},
                Metadata("test_user", Metadata.get_call_location(), time.time())
            ),
            TracedData(
                {"id": "B", "age": 19},
                Metadata("test_user", Metadata.get_call_location(), time.time())
            )
        ]

        data_2 = [
            TracedData(
                {"id": "C", "country": "Somalia"},
                Metadata("test_user", Metadata.get_call_location(), time.time())
            ),
            TracedData(
                {"id": "A", "country": "Kenya", "gender": "female"},
                Metadata("test_user", Metadata.get_call_location(), time.time())
            )
        ]

        # Joining should fail because the item with id 'A' has conflicting genders
        self.assertRaises(AssertionError, lambda: TracedData.join_iterables("test_user", "id", data_1, data_2, "data_2"))

        # Fix the gender conflict problem, and test that the join now works as expected.
        data_2[1].append_data({"gender": "male"}, Metadata("test_user", Metadata.get_call_location(), time.time()))
        merged = TracedData.join_iterables("test_user", "id", data_1, data_2, "data_2")

        merged_dicts = [dict(td.items()) for td in merged]
        expected_dicts = [
            {"id": "B", "age": 19},
            {"id": "C", "country": "Somalia"},
            {"id": "A", "gender": "male", "age": 55, "country": "Kenya"}
        ]
        
        self.assertEqual(len(merged_dicts), len(expected_dicts))

        for merged_dict, expected_dict in zip(merged_dicts, expected_dicts):
            self.assertDictEqual(merged_dict, expected_dict)

        # Modify data_1 to include multiple TracedData objects with the same join key, and ensure joining then fails.
        data_1[0].append_data({"id": "B"}, Metadata("test_user", Metadata.get_call_location(), time.time()))
        self.assertRaises(AssertionError, lambda: TracedData.join_iterables("test_user", "id", data_1, data_2, "data_2"))
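To summarise the behaviour this test pins down (a reading of the assertions, not of the implementation):

        # join_iterables performs an outer join on "id": items whose id appears in
        # only one iterable pass through unchanged, items sharing an id are merged,
        # a conflicting value for a shared key raises AssertionError, and so does
        # a duplicated join key within a single input iterable.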
Example 13
    def test_fold_traced_data(self):
        td_1_dict = {
            "equal_1": 4, "equal_2": "xyz",
            "concat": "abc",
            "matrix_1": Codes.MATRIX_0, "matrix_2": Codes.MATRIX_0,
            "bool_1": Codes.FALSE, "bool_2": Codes.TRUE,
            "yes_no_1": Codes.YES, "yes_no_2": Codes.YES,
            "other_1": "other 1", "other_2": "other 2"
        }

        td_2_dict = {
            "equal_1": 4, "equal_2": "xyz",
            "concat": "def",
            "matrix_1": Codes.MATRIX_1, "matrix_2": Codes.MATRIX_0,
            "bool_1": Codes.TRUE, "bool_2": Codes.TRUE,
            "yes_no_1": Codes.YES, "yes_no_2": Codes.NO,
            "other_1": "other"
        }

        td_1 = TracedData(td_1_dict, Metadata("test_user", Metadata.get_call_location(), 0))
        td_2 = TracedData(td_2_dict, Metadata("test_user", Metadata.get_call_location(), 1))

        fold_strategies = {
            "equal_1": FoldStrategies.assert_equal,
            "equal_2": FoldStrategies.assert_equal,
            "concat": FoldStrategies.concatenate,
            "bool_1": FoldStrategies.boolean_or,
            "bool_2": FoldStrategies.boolean_or,
            "matrix_1": FoldStrategies.matrix,
            "matrix_2": FoldStrategies.matrix,
            "yes_no_1": FoldStrategies.yes_no_amb,
            "yes_no_2": FoldStrategies.yes_no_amb
        }
        folded_td = FoldTracedData.fold_traced_data("test_user", td_1, td_2, fold_strategies)

        # Test input tds unchanged
        self.assertDictEqual(dict(td_1.items()), td_1_dict)
        self.assertDictEqual(dict(td_2.items()), td_2_dict)
        
        # Test folded td has expected values
        self.assertDictEqual(
            dict(folded_td.items()),
            {
                "equal_1": 4, "equal_2": "xyz",
                "concat": "abc;def",
                "matrix_1": Codes.MATRIX_1, "matrix_2": Codes.MATRIX_0,
                "bool_1": Codes.TRUE, "bool_2": Codes.TRUE,
                "yes_no_1": Codes.YES, "yes_no_2": Codes.AMBIVALENT
            }
        )
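The expected dict pins down each fold strategy; a summary of the pairwise results asserted above:

        # assert_equal(4, 4) -> 4; concatenate("abc", "def") -> "abc;def";
        # boolean_or(FALSE, TRUE) -> TRUE; matrix(MATRIX_0, MATRIX_1) -> MATRIX_1;
        # yes_no_amb(YES, YES) -> YES; yes_no_amb(YES, NO) -> AMBIVALENT;
        # "other_1" and "other_2" have no strategy, so they are absent from the fold.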
Example 14
        echo_mobile_start_date = session.datetime_to_echo_mobile_datetime(user_start_date)
        echo_mobile_end_date = session.datetime_to_echo_mobile_datetime(user_end_date)

        report = session.messages_report(
            echo_mobile_start_date.strftime("%Y-%m-%d"), echo_mobile_end_date.strftime("%Y-%m-%d"),
            direction=MessageDirection.Incoming)
    finally:
        # Delete the background task we made when generating the report
        session.delete_session_background_tasks()

    # Parse the downloaded report into a list of TracedData objects, de-identifying in the process.
    messages = []
    for row in csv.DictReader(StringIO(report)):
        row["avf_phone_id"] = phone_uuids.add_phone(row["Phone"])
        del row["Phone"]
        messages.append(TracedData(dict(row), Metadata(user, Metadata.get_call_location(), time.time())))

    # Convert times to ISO
    for td in messages:
        td.append_data(
            {"Date": session.echo_mobile_date_to_iso(td["Date"])},
            Metadata(user, Metadata.get_call_location(), time.time())
        )

    # Filter out messages sent outwith the desired time range.
    messages = list(filter(lambda td: echo_mobile_start_date <= isoparse(td["Date"]) < echo_mobile_end_date, messages))

    # Add a unique id to each message
    for td in messages:
        td.append_data(
            {"avf_message_id": message_uuids.add_message(
Example 15
    def test_export_import_one_single_coded_scheme(self):
        file_path = path.join(self.test_dir, "coda_2_test.json")

        # Build raw input data
        message_dicts = [
            {"gender_raw": "woman", "gender_sent_on": "2018-11-01T07:13:04+03:00"},
            {"gender_raw": "", "gender_sent_on": "2018-11-01T07:17:04+03:00"},
            {"gender_raw": "hiya", "gender_sent_on": "2018-11-01T07:19:04+05:00"},
            {},
            {"gender_raw": "boy", "gender_sent_on": "2018-11-02T19:00:29+03:00"},
            {"gender_raw": "man", "gender_sent_on": "2018-11-02T19:00:29+03:00"},
        ]
        messages = [TracedData(d, Metadata("test_user", Metadata.get_call_location(), i))
                    for i, d in enumerate(message_dicts)]

        # Add message ids
        TracedDataCodaV2IO.compute_message_ids("test_user", messages, "gender_raw", "gender_coda_id")

        # Load gender scheme
        with open("tests/traced_data/resources/coda_2_gender_scheme.json") as f:
            gender_scheme = CodeScheme.from_firebase_map(json.load(f))

        # Apply the English gender cleaner
        with mock.patch("core_data_modules.util.TimeUtils.utc_now_as_iso_string") as time_mock, \
                mock.patch("core_data_modules.traced_data.Metadata.get_function_location") as location_mock:
            time_mock.return_value = "2018-11-02T15:00:07+00:00"
            location_mock.return_value = "english.DemographicCleaner.clean_gender"

            CleaningUtils.apply_cleaner_to_traced_data_iterable(
                "test_user", messages, "gender_raw", "gender_coded",
                english.DemographicCleaner.clean_gender, gender_scheme
            )

        # Export to a Coda 2 messages file
        with open(file_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                messages, "gender_raw", "gender_sent_on", "gender_coda_id", {"gender_coded": gender_scheme}, f)

        self.assertTrue(filecmp.cmp(file_path, "tests/traced_data/resources/coda_2_export_expected_one_scheme.json"))

        # Test importing with no file available
        imported_messages = []
        for td in messages:
            imported_messages.append(td.copy())
        TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
            "test_user", imported_messages, "gender_coda_id", {"gender_coded": gender_scheme})
        # Deliberately test that the read can be done twice
        TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
            "test_user", imported_messages, "gender_coda_id", {"gender_coded": gender_scheme})

        na_id = gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id
        nr_id = gender_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

        # Set TRUE_MISSING codes
        for td in imported_messages:
            na_label = CleaningUtils.make_label_from_cleaner_code(
                gender_scheme,
                gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                "test_export_traced_data_iterable_to_coda_2",
                date_time_utc="2018-11-02T10:00:00+00:00"
            )
            if td.get("gender_raw", "") == "":
                td.append_data({"gender_coded": na_label.to_dict()},
                               Metadata("test_user", Metadata.get_call_location(), time.time()))

        imported_code_ids = [td["gender_coded"]["CodeID"] for td in imported_messages]

        self.assertListEqual(imported_code_ids, [nr_id, na_id, nr_id, na_id, nr_id, nr_id])

        # Test importing from the test file
        imported_messages = []
        for td in messages:
            imported_messages.append(td.copy())
        with open("tests/traced_data/resources/coda_2_import_test_one_scheme.json", "r") as f:
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                "test_user", imported_messages, "gender_coda_id", {"gender_coded": gender_scheme}, f)

        # Set TRUE_MISSING codes
        for td in imported_messages:
            na_label = CleaningUtils.make_label_from_cleaner_code(
                gender_scheme,
                gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                "test_export_traced_data_iterable_to_coda_2",
                date_time_utc="2018-11-02T10:00:00+00:00"
            )
            if td.get("gender_raw", "") == "":
                td.append_data({"gender_coded": na_label.to_dict()},
                               Metadata("test_user", Metadata.get_call_location(), time.time()))

        imported_code_ids = [td["gender_coded"]["CodeID"] for td in imported_messages]

        expected_code_ids = [
            gender_scheme.get_code_with_match_value("female").code_id,  # Manually approved auto-code
            gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id,  # Empty raw message
            gender_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id,  # Manually assigned code which isn't checked
            gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id,  # No raw message
            gender_scheme.get_code_with_control_code(Codes.NOT_CODED).code_id,  # Manually Not Coded
            gender_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id,  # Manually un-coded
        ]
        self.assertListEqual(imported_code_ids, expected_code_ids)

        # Add an element with the same raw text but a conflicting label
        messages.append(TracedData({
            "gender_raw": "woman", "gender_sent_on": "2018-11-01T07:13:04+03:00",
            "gender_coded": CleaningUtils.make_label_from_cleaner_code(
                gender_scheme, gender_scheme.get_code_with_match_value("male"), "make_location_label",
                date_time_utc="2018-11-03T13:40:50Z").to_dict()
        }, Metadata("test_user", Metadata.get_call_location(), time.time())))
        TracedDataCodaV2IO.compute_message_ids("test_user", messages, "gender_raw", "gender_coda_id")

        with open(file_path, "w") as f:
            try:
                TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                    messages, "gender_raw", "gender_sent_on", "gender_coda_id", {"gender_coded": gender_scheme}, f)
            except AssertionError as e:
                assert str(e) == "Messages with the same id " \
                                 "(cf2e5bff1ef03dcd20d1a0b18ef7d89fc80a3554434165753672f6f40fde1d25) have different " \
                                 "labels for coded_key 'gender_coded'"
                return
            self.fail("Exporting data with conflicting labels did not fail")
Example 16
        somali.DemographicCleaner.clean_yes_no,
        "Cholera_Vaccination (Text) - wt_practice":
        somali.DemographicCleaner.clean_yes_no,
        "Trustworthy_Advisors (Text) - wt_practice": None
    }

    # Load data from JSON file
    with open(demog_1_input_path, "r") as f:
        demog_1_data = TracedDataJsonIO.import_json_to_traced_data_iterable(f)
    with open(demog_2_input_path, "r") as f:
        demog_2_data = TracedDataJsonIO.import_json_to_traced_data_iterable(f)
    with open(practice_input_path, "r") as f:
        practice_data = TracedDataJsonIO.import_json_to_traced_data_iterable(f)

    # Join the survey data on "avf_phone_id"
    demog_data = TracedData.join_iterables(user, "avf_phone_id", demog_1_data,
                                           demog_2_data, "wt_demog_2")
    all_survey_data = TracedData.join_iterables(user, "avf_phone_id",
                                                demog_data, practice_data,
                                                "wt_practice")

    # Clean the survey responses
    for td in all_survey_data:
        for key, cleaner in cleaning_plan.items():
            if cleaner is not None and key in td:
                td.append_data({"{}_clean".format(key): cleaner(td[key])},
                               Metadata(user, Metadata.get_call_location(),
                                        time.time()))

    # Mark missing entries in the raw data as true missing
    for td in all_survey_data:
        for key in cleaning_plan:
    print("Number of mislocated messages found: {}".format(
        len(mislocated_messages)))

    # The messages were fetched in descending order of modification date, so
    # reverse them to sort into ascending order.
    mislocated_messages = list(mislocated_messages)
    mislocated_messages.reverse()

    # Convert the mislocated messages to de-identified TracedData
    traced_messages = []
    for message in mislocated_messages:
        traced_messages.append(
            TracedData(
                {
                    "avf_phone_id": phone_uuids.add_phone(message.urn),
                    "id": message.id,
                    "text": message.text,
                    "created_on": message.created_on.isoformat(),
                    "sent_on": message.sent_on.isoformat(),
                    "modified_on": message.modified_on.isoformat()
                }, Metadata(user, Metadata.get_call_location(), time.time())))

    # Make the traced messages look like week 4 runs
    def format_label(parameter):
        """
        Creates a week 4 key for the given parameter.

        >>> format_label("Text")
        'S06E04_Cholera_Recurrency (Text) - wt_s06e04_activation'
        """
        category_title = "S06E04_Cholera_Recurrency"
        flow_name = "wt_s06e04_activation"
Example 18
def generate_appended_traced_data():
    message_data = {"phone": "+441632000001", "message": "Hello AVF!"}
    message_td = TracedData(message_data, Metadata("test_user", "run_fetcher", 0))
    message_td.append_data({"message": "hello avf"}, Metadata("test_user", "message_cleaner", 1))

    demog_1_data = {"phone": "+441632000001", "gender": "woman", "age": "twenty"}
    demog_1_td = TracedData(demog_1_data, Metadata("test_user", "run_fetcher", 2))
    demog_1_td.append_data({"gender": "female", "age": 20}, Metadata("test_user", "demog_cleaner", 3))

    demog_2_data = {"phone": "+441632000001", "country": "Kenyan citizen"}
    demog_2_td = TracedData(demog_2_data, Metadata("test_user", "run_fetcher", 4))
    demog_2_td.append_data({"country": "Kenya"}, Metadata("test_user", "demog_cleaner", 5))

    message_td.append_traced_data("demog_1", demog_1_td, Metadata("test_user", "demog_1_append", 6))
    message_td.append_traced_data("demog_2", demog_2_td, Metadata("test_user", "demog_2_append", 7))

    return message_td
Example 19
    def generate_message_td():
        message_data = {"phone": "+441632000001", "message": "Hello AVF!"}
        message_td = TracedData(message_data, Metadata("test_user", "run_fetcher", time.time()))
        message_td.append_data({"message": "hello avf"}, Metadata("test_user", "message_cleaner", time.time()))

        return message_td
Example 20
    def test_export_import_one_multi_coded_scheme(self):
        file_path = path.join(self.test_dir, "coda_2_test.json")

        # Build raw input data
        message_dicts = [
            {"msg_raw": "food", "msg_sent_on": "2018-11-01T07:13:04+03:00"},
            {"msg_raw": "", "msg_sent_on": "2018-11-01T07:17:04+03:00"},
            {"msg_raw": "food + water", "msg_sent_on": "2018-11-01T07:19:04+05:00"},
            {},
            {"msg_raw": "water", "msg_sent_on": "2018-11-02T19:00:29+03:00"},
            {"msg_raw": "abcd", "msg_sent_on": "2018-11-02T20:30:45+03:00"}
        ]
        messages = [TracedData(d, Metadata("test_user", Metadata.get_call_location(), i))
                    for i, d in enumerate(message_dicts)]

        # Add message ids
        TracedDataCodaV2IO.compute_message_ids("test_user", messages, "msg_raw", "msg_coda_id")

        # Load message scheme
        with open("tests/traced_data/resources/coda_2_msg_scheme.json") as f:
            msg_scheme = CodeScheme.from_firebase_map(json.load(f))

        # Export to a Coda 2 messages file
        with open(file_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                messages, "msg_raw", "msg_sent_on", "msg_coda_id", {"msg_coded": msg_scheme}, f)

        self.assertTrue(filecmp.cmp(file_path, "tests/traced_data/resources/coda_2_export_expected_multi_coded.json"))

        # Test importing with no file available
        imported_messages = []
        for td in messages:
            imported_messages.append(td.copy())
        TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
            "test_user", imported_messages, "msg_coda_id", {"msg_coded": msg_scheme})
        # Deliberately test that the read can be done twice
        TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
            "test_user", imported_messages, "msg_coda_id", {"msg_coded": msg_scheme})

        na_id = msg_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id
        nr_id = msg_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

        # Set TRUE_MISSING codes
        for td in imported_messages:
            na_label = CleaningUtils.make_label_from_cleaner_code(
                msg_scheme,
                msg_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                "test_export_traced_data_iterable_to_coda_2",
                date_time_utc="2018-11-02T10:00:00+00:00"
            )
            if td.get("msg_raw", "") == "":
                td.append_data({"msg_coded": [na_label.to_dict()]},
                               Metadata("test_user", Metadata.get_call_location(), time.time()))

        for td in imported_messages:
            self.assertEqual(len(td["msg_coded"]), 1)
        imported_code_ids = [td["msg_coded"][0]["CodeID"] for td in imported_messages]
        self.assertListEqual(imported_code_ids, [nr_id, na_id, nr_id, na_id, nr_id, nr_id])

        # Test importing from the test file
        imported_messages = []
        for td in messages:
            imported_messages.append(td.copy())
        with open("tests/traced_data/resources/coda_2_import_test_multi_coded.json", "r") as f:
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                "test_user", imported_messages, "msg_coda_id", {"msg_coded": msg_scheme}, f)

            # Test that reading the same file-pointer twice without moving it back to the start of the file fails
            try:
                TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                    "test_user", imported_messages, "msg_coda_id", {"msg_coded": msg_scheme}, f)
                self.fail("Re-using the same file pointer didn't raise an assertion error")
            except AssertionError as e:
                self.assertEqual(str(e), "File-pointer not at byte 0. "
                                         "Should you have used e.g. `f.seek(0)` before calling this method?")

        # Set TRUE_MISSING codes
        for td in imported_messages:
            na_label = CleaningUtils.make_label_from_cleaner_code(
                msg_scheme,
                msg_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                "test_export_traced_data_iterable_to_coda_2",
                date_time_utc="2018-11-02T10:00:00+00:00"
            )
            if td.get("msg_raw", "") == "":
                td.append_data({"msg_coded": [na_label.to_dict()]},
                               Metadata("test_user", Metadata.get_call_location(), time.time()))

        imported_code_ids = []
        for td in imported_messages:
            imported_code_ids.append([code["CodeID"] for code in td["msg_coded"]])

        expected_code_ids = [
            [msg_scheme.get_code_with_match_value("food").code_id],
            [msg_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id],
            [msg_scheme.get_code_with_match_value("food").code_id, msg_scheme.get_code_with_match_value("water").code_id],
            [msg_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id],
            [msg_scheme.get_code_with_match_value("water").code_id],
            [msg_scheme.get_code_with_control_code(Codes.NOT_CODED).code_id]
        ]

        for x, y in zip(imported_code_ids, expected_code_ids):
            self.assertEqual(len(x), len(y))
            self.assertSetEqual(set(x), set(y))
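A note on the multi-coded shape this test relies on: each "msg_coded" value is a list of label dicts, one per assigned code.

        # e.g. the "food + water" message carries two labels, so its entry in
        # imported_code_ids is the pair of the "food" and "water" code ids;
        # the loop above asserts set equality, not label order.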
Example 21
    def convert_runs_to_traced_data(user, raw_runs, raw_contacts, phone_uuids, test_contacts=None):
        """
        Converts raw data fetched from Rapid Pro to TracedData.

        :param user: Identifier of the user running this program, for TracedData Metadata.
        :type user: str
        :param raw_runs: Raw run objects to convert to TracedData.
        :type raw_runs: list of temba_client.v2.types.Run
        :param raw_contacts: Raw contact objects to use when converting to TracedData.
        :type raw_contacts: list of temba_client.v2.types.Contact
        :param phone_uuids: Phone number <-> UUID table.
        :type phone_uuids: id_infrastructure.firestore_uuid_table.FirestoreUuidTable
        :param test_contacts: Rapid Pro contact UUIDs of test contacts.
                              Runs from any of those test contacts will be tagged with {'test_run': True}
        :type test_contacts: list of str | None
        :return: Raw data fetched from Rapid Pro converted to TracedData.
        :rtype: list of TracedData
        """
        if test_contacts is None:
            test_contacts = []

        log.info(f"Converting {len(raw_runs)} raw runs to TracedData...")

        contacts_lut = {c.uuid: c for c in raw_contacts}

        runs_with_uuids = []
        phone_numbers = []
        for run in raw_runs:
            if run.contact.uuid not in contacts_lut:
                # Sometimes contact uuids which appear in `runs` do not appear in the downloaded contacts.
                # I have only observed this happening for contacts which were created very recently.
                # This check skips the run in that case; the run should be included the next time this script is executed.
                log.warning(f"Run found with Rapid Pro Contact UUID '{run.contact.uuid}', "
                            f"but this id is not present in the downloaded contacts")
                continue

            contact_urns = contacts_lut[run.contact.uuid].urns
            if len(contact_urns) == 0:
                log.warning(f"Ignoring contact with no urn. URNs: {contact_urns} "
                            f"(Rapid Pro Contact UUID: {run.contact.uuid})")
                continue

            phone_numbers.append(PhoneCleaner.normalise_phone(contact_urns[0]))
            runs_with_uuids.append(run)

        phone_to_uuid_lut = phone_uuids.data_to_uuid_batch(phone_numbers)

        traced_runs = []
        for run in runs_with_uuids:
            contact_urns = contacts_lut[run.contact.uuid].urns
            run_dict = {
                "avf_phone_id": phone_to_uuid_lut[PhoneCleaner.normalise_phone(contact_urns[0])],
                f"run_id - {run.flow.name}": run.id
            }

            for category, response in run.values.items():
                run_dict[category.title() + " (Category) - " + run.flow.name] = response.category
                run_dict[category.title() + " (Value) - " + run.flow.name] = response.value
                # Convert from "input" to "text" here to match terminology in Rapid Pro's Excel exports.
                run_dict[category.title() + " (Text) - " + run.flow.name] = response.input
                run_dict[category.title() + " (Name) - " + run.flow.name] = response.name
                run_dict[category.title() + " (Time) - " + run.flow.name] = response.time.isoformat()
                run_dict[category.title() + " (Run ID) - " + run.flow.name] = run.id

            if run.contact.uuid in test_contacts:
                run_dict["test_run"] = True
            else:
                assert len(contact_urns) == 1, \
                    f"A non-test contact has multiple URNs (Rapid Pro Contact UUID: {run.contact.uuid})"

            run_dict[f"run_created_on - {run.flow.name}"] = run.created_on.isoformat()
            run_dict[f"run_modified_on - {run.flow.name}"] = run.modified_on.isoformat()
            run_dict[f"run_exited_on - {run.flow.name}"] = None if run.exited_on is None else run.exited_on.isoformat()
            run_dict[f"run_exit_type - {run.flow.name}"] = run.exit_type

            traced_runs.append(
                TracedData(run_dict, Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())))

        log.info(f"Converted {len(traced_runs)} raw runs to TracedData")

        return traced_runs
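For reference, the column naming the loop above produces for a hypothetical flow named "demog_survey" with a result category "Gender" (illustrative only):

        # "avf_phone_id", "run_id - demog_survey",
        # "Gender (Category) - demog_survey", "Gender (Value) - demog_survey",
        # "Gender (Text) - demog_survey", "Gender (Name) - demog_survey",
        # "Gender (Time) - demog_survey", "Gender (Run ID) - demog_survey",
        # plus run_created_on / run_modified_on / run_exited_on / run_exit_type columns.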
Example 22
    )
    parser.add_argument(
        "json_output_path",
        metavar="json-output-path",
        help="Path to a JSON file to write processed messages to")

    args = parser.parse_args()
    user = args.user
    json_input_path = args.json_input_path
    survey_input_path = args.survey_input_path
    json_output_path = args.json_output_path

    # Load messages
    with open(json_input_path, "r") as f:
        messages = TracedDataJsonIO.import_json_to_traced_data_iterable(f)

    # Load surveys
    with open(survey_input_path, "r") as f:
        surveys = TracedDataJsonIO.import_json_to_traced_data_iterable(f)

    # Add survey data to the messages
    TracedData.update_iterable(user, "avf_phone_id", messages, surveys,
                               "survey_responses")

    # Write json output
    IOUtils.ensure_dirs_exist_for_file(json_output_path)
    with open(json_output_path, "w") as f:
        TracedDataJsonIO.export_traced_data_iterable_to_json(messages,
                                                             f,
                                                             pretty_print=True)
Example 23
    bossaso_output_path = args.bossaso_output_path
    baidoa_output_path = args.baidoa_output_path

    # Load the phone number <-> uuid table
    log.info(
        f"Loading the phone number <-> uuid table from file '{phone_number_uuid_table_path}'..."
    )
    with open(phone_number_uuid_table_path, "r") as f:
        phone_number_uuid_table = PhoneNumberUuidTable.load(f)
    log.info(f"Loaded {len(phone_number_uuid_table.numbers())} contacts")

    # Load the ADSS traced data
    log.info(f"Loading ADSS traced data from file '{traced_data_path}'...")
    with open(traced_data_path, "r") as f:
        # Manually deserialise the traced data because ADSS used an older serialiser
        data = [TracedData.deserialize(d) for d in json.load(f)]
    log.info(f"Loaded {len(data)} traced data objects")

    # Search the TracedData for the bossaso/baidoa contacts
    bossaso_uuids = set()
    baidoa_uuids = set()
    log.info("Searching for participants from Bossaso or Baidoa")
    for td in data:
        if td["district_coded"] == "STOP":
            continue

        if td["district_coded"][
                "CodeID"] == CodeSchemes.SOMALIA_DISTRICT.get_code_with_match_value(
                    "bossaso").code_id:
            bossaso_uuids.add(td["uid"])
        elif td["district_coded"][
        1: "wt_s06e1_activation",
        2: "wt_s06e2_activation",
        3: "wt_s06e03_activation",
        4: "wt_s06e04_activation",
        5: "wt_s06e05_activation"
    }

    # Produce output columns for each input message
    all_messages = []
    all_show_keys = {1: set(), 2: set(), 3: set(), 4: set(), 5: set()}
    trustworthy_advisors_keys = set()
    outbreak_keys = set()
    trustworthy_advisors_raw_key = "Trustworthy_Advisors (Text) - wt_practice"
    for show_number, show_name in shows.items():
        show_messages = load_show(show_name)
        TracedData.update_iterable(user, "avf_phone_id", show_messages,
                                   surveys, "surveys")

        for td in show_messages:
            AnalysisKeys.set_analysis_keys(user, show_number, td)
            AnalysisKeys.set_matrix_analysis_keys(user,
                                                  all_show_keys[show_number],
                                                  show_number, td)

            AnalysisKeys.set_matrix_keys(
                user, td, trustworthy_advisors_keys,
                "{}_coded".format(trustworthy_advisors_raw_key),
                "trustworthy_advisors_clean")

            AnalysisKeys.set_matrix_keys(
                user, td, outbreak_keys,
                "{}_outbreak_coded".format(trustworthy_advisors_raw_key),
Example 25
def make_traced_data(dicts, start_time=0):
    return [TracedData(d, Metadata("test_user", Metadata.get_call_location(), i + start_time))
            for i, d in enumerate(dicts)]
Example 26
    def generate_demog_1_td():
        demog_1_data = {"phone": "+441632000001", "gender": "woman", "age": "twenty"}
        demog_1_td = TracedData(demog_1_data, Metadata("test_user", "run_fetcher", time.time()))
        demog_1_td.append_data({"gender": "female", "age": 20}, Metadata("test_user", "demog_cleaner", time.time()))

        return demog_1_td
Example 27
    def generate_demog_2_td():
        demog_2_data = {"phone": "+441632000001", "country": "Kenyan citizen"}
        demog_2_td = TracedData(demog_2_data, Metadata("test_user", "run_fetcher", time.time()))
        demog_2_td.append_data({"country": "Kenya"}, Metadata("test_user", "demog_cleaner", time.time()))

        return demog_2_td
Example 28
    with open(demog_surveys_input_path, "r") as f:
        surveys = TracedDataJsonIO.import_json_to_traced_data_iterable(f)

    # Filter out people who haven't answered the fgd_cc consent question
    fgd_cc_consent_key = "Response_1 (Category) - wt_fgd_cc"
    fgd_cc_data = [td for td in fgd_cc_data if fgd_cc_consent_key in td]

    # Filter out people that we have exported in the past
    prev_contacts = {td["Phone Number"] for td in prev_exports}
    fgd_cc_data = [
        td for td in fgd_cc_data if "+{}".format(
            phone_uuids.get_phone(td["avf_phone_id"])) not in prev_contacts
    ]

    # Apply the demog surveys to the fgd_cc data
    TracedData.update_iterable(user, "avf_phone_id", fgd_cc_data, surveys,
                               "surveys")

    # Annotate fgd_cc_data with whether or not the respondent is from Mogadishu
    mogadishu_districts = [
        "mogadishu",
        "mogadisho",  # TODO: Remove need for this by correcting Coda file
        "boondheere",
        "cabdiasis",
        "daynile",
        "dharkenley",
        "heliwa",
        "hodan",
        "hawl wadaag",
        "karaan",
        "shangaani",
        "shibis",
Example 29
def generate_test_data():
    """Returns a new TracedData object with example id, phone, and gender fields"""
    data = {"id": "0", "phone": "+441632000001", "gender": "man"}
    return TracedData(data, Metadata("test_user", "run_fetcher", time.time()))
Example 30
def generate_traced_data_iterable():
    for i, text in enumerate(["female", "m", "WoMaN", "27", "female"]):
        d = {"URN": "+001234500000" + str(i), "Gender": text}
        yield TracedData(d, Metadata("test_user", "data_generator", i))