Python CleaningUtils.apply_cleaner_to_traced_data_iterable Exemples, core_data_modules.cleaners.cleaning_utils.CleaningUtils.apply_cleaner_to_traced_data_iterable Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : auto_code.py Projet : AfricasVoices/Project-UNICEF-COVID19-SOM

 def run_cleaners(cls, user, data):
     for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
         for cc in plan.coding_configurations:
             if cc.cleaner is not None:
                 CleaningUtils.apply_cleaner_to_traced_data_iterable(
                     user, data, plan.raw_field, cc.coded_field, cc.cleaner,
                     cc.code_scheme)

Exemple #2

0

Afficher le fichier

    def auto_code_surveys(cls, user, data, coda_output_dir):
        # Auto-code surveys
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            for cc in plan.coding_configurations:
                if cc.cleaner is not None:
                    CleaningUtils.apply_cleaner_to_traced_data_iterable(
                        user, data, plan.raw_field, cc.coded_field, cc.cleaner,
                        cc.code_scheme)

        # Output single-scheme answers to coda for manual verification + coding
        IOUtils.ensure_dirs_exist(coda_output_dir)
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field,
                                                   plan.id_field)

            coda_output_path = path.join(coda_output_dir, plan.coda_filename)
            with open(coda_output_path, "w") as f:
                TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                    data, plan.raw_field, plan.time_field, plan.id_field, {
                        cc.coded_field: cc.code_scheme
                        for cc in plan.coding_configurations
                    }, f)

        # Note: no need to handle location in any special way on this project because it is not being auto-coded

        return data

Exemple #3

0

Afficher le fichier

    def auto_code_surveys(cls, user, data, phone_uuid_table, coda_output_dir):
        # Label missing data
        for td in data:
            missing_dict = dict()
            for plan in PipelineConfiguration.DEMOGS_CODING_PLANS:
                if td.get(plan.raw_field, "") == "":
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                        Metadata.get_call_location()
                    )
                    missing_dict[plan.coded_field] = na_label.to_dict()
            td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

        # Auto-code remaining data
        for plan in PipelineConfiguration.DEMOGS_CODING_PLANS:
            if plan.cleaner is not None:
                CleaningUtils.apply_cleaner_to_traced_data_iterable(user, data, plan.raw_field, plan.coded_field,
                                                                    plan.cleaner, plan.code_scheme)

        # Output survey answers to coda for manual verification + coding
        IOUtils.ensure_dirs_exist(coda_output_dir)
        for plan in PipelineConfiguration.DEMOGS_CODING_PLANS:
            TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)
            coda_output_path = path.join(coda_output_dir, plan.coda_filename)
            with open(coda_output_path, 'w') as f:
                TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                    data, plan.raw_field, plan.time_field, plan.id_field, {plan.coded_field: plan.code_scheme}, f
                )
        print("Coda demogs files successfully exported")

        return data

Exemple #4

0

Afficher le fichier

Fichier : auto_code_surveys.py Projet : AfricasVoices/Project-CAPYEI

    def auto_code_surveys(cls, user, data, icr_output_dir, coda_output_dir):
        # Auto-code surveys
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.cleaner is not None:
                CleaningUtils.apply_cleaner_to_traced_data_iterable(
                    user, data, plan.raw_field, plan.coded_field, plan.cleaner,
                    plan.code_scheme)

        # Output single-scheme answers to coda for manual verification + coding
        IOUtils.ensure_dirs_exist(coda_output_dir)
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:

            TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field,
                                                   plan.id_field)

            coda_output_path = path.join(coda_output_dir, plan.coda_filename)
            with open(coda_output_path, "w") as f:
                TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                    data, plan.raw_field, plan.time_field, plan.id_field,
                    {plan.coded_field: plan.code_scheme}, f)

        # Output messages for ICR
        IOUtils.ensure_dirs_exist(icr_output_dir)
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            rqa_messages = []
            for td in data:
                if plan.raw_field in td:
                    rqa_messages.append(td)

            icr_messages = ICRTools.generate_sample_for_icr(
                rqa_messages, cls.ICR_MESSAGES_COUNT,
                random.Random(cls.ICR_SEED))

            icr_output_path = path.join(icr_output_dir, plan.icr_filename)
            with open(icr_output_path, "w") as f:
                TracedDataCSVIO.export_traced_data_iterable_to_csv(
                    icr_messages,
                    f,
                    headers=[plan.run_id_field, plan.raw_field])

        return data

Exemple #5

0

Afficher le fichier

Fichier : auto_code_surveys.py Projet : AfricasVoices/Project-UNDP-RCO

    def auto_code_surveys(cls, user, data, pipeline_configuration,
                          coda_output_dir):
        # Auto-code surveys
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            for cc in plan.coding_configurations:
                if cc.cleaner is not None:
                    CleaningUtils.apply_cleaner_to_traced_data_iterable(
                        user, data, plan.raw_field, cc.coded_field, cc.cleaner,
                        cc.code_scheme)

        # Remove survey data sent after the project finished
        log.info(
            "Hiding survey messages sent after the end of the project. These will not be exported in "
            "production/analysis files")
        out_of_range_count = 0
        for td in data:
            for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
                # TODO: Come up with a better solution here e.g. separate DEMOG/SURVEY lists
                if plan.raw_field in ["have_voice_raw", "suggestions_raw"]:
                    continue

                if plan.time_field in td and isoparse(
                        td[plan.time_field]
                ) > pipeline_configuration.project_end_date:
                    out_of_range_count += 1
                    td.hide_keys({plan.raw_field, plan.time_field},
                                 Metadata(user, Metadata.get_call_location(),
                                          time.time()))
        log.info(
            f"Hid {out_of_range_count} survey messages sent after the end of the project"
        )

        # For any locations where the cleaners assigned a code to a sub district, set the district code to NC
        # (this is because only one column should have a value set in Coda)
        for td in data:
            if "mogadishu_sub_district_coded" in td:
                mogadishu_code_id = td["mogadishu_sub_district_coded"][
                    "CodeID"]
                if CodeSchemes.MOGADISHU_SUB_DISTRICT.get_code_with_id(
                        mogadishu_code_id).code_type == "Normal":
                    nc_label = CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.MOGADISHU_SUB_DISTRICT,
                        CodeSchemes.MOGADISHU_SUB_DISTRICT.
                        get_code_with_control_code(Codes.NOT_CODED),
                        Metadata.get_call_location(),
                    )
                    td.append_data({"district_coded": nc_label.to_dict()},
                                   Metadata(user, Metadata.get_call_location(),
                                            time.time()))

        # Output survey responses to coda for manual verification + coding
        IOUtils.ensure_dirs_exist(coda_output_dir)
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field,
                                                   plan.id_field)

            coda_output_path = path.join(coda_output_dir, plan.coda_filename)
            with open(coda_output_path, "w") as f:
                TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                    data, plan.raw_field, plan.time_field, plan.id_field, {
                        cc.coded_field: cc.code_scheme
                        for cc in plan.coding_configurations
                    }, f)

        return data

Exemple #6

0

Afficher le fichier

    def auto_code_surveys(cls, user, data, phone_uuid_table, coda_output_dir):
        # Label missing data
        for td in data:
            missing_dict = dict()
            for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
                if td.get(plan.raw_field, "") == "":
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.code_scheme,
                        plan.code_scheme.get_code_with_control_code(
                            Codes.TRUE_MISSING), Metadata.get_call_location())
                    missing_dict[plan.coded_field] = na_label.to_dict()
            td.append_data(
                missing_dict,
                Metadata(user, Metadata.get_call_location(), time.time()))

        # Auto-code remaining data
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.cleaner is not None:
                CleaningUtils.apply_cleaner_to_traced_data_iterable(
                    user, data, plan.raw_field, plan.coded_field, plan.cleaner,
                    plan.code_scheme)

        # For any locations where the cleaners assigned a code to a sub district, set the district code to NC
        # (this is because only one column should have a value set in Coda)
        for td in data:
            if "mogadishu_sub_district_coded" in td:
                mogadishu_code_id = td["mogadishu_sub_district_coded"][
                    "CodeID"]
                if CodeSchemes.MOGADISHU_SUB_DISTRICT.get_code_with_id(
                        mogadishu_code_id).code_type == "Normal":
                    nc_label = CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.MOGADISHU_SUB_DISTRICT,
                        CodeSchemes.MOGADISHU_SUB_DISTRICT.
                        get_code_with_control_code(Codes.NOT_CODED),
                        Metadata.get_call_location(),
                    )
                    td.append_data({"district_coded": nc_label.to_dict()},
                                   Metadata(user, Metadata.get_call_location(),
                                            time.time()))

        # Set operator from phone number
        for td in data:
            operator_clean = PhoneCleaner.clean_operator(
                phone_uuid_table.get_phone(td["uid"]))
            if operator_clean == Codes.NOT_CODED:
                label = CleaningUtils.make_label_from_cleaner_code(
                    CodeSchemes.OPERATOR,
                    CodeSchemes.OPERATOR.get_code_with_control_code(
                        Codes.NOT_CODED), Metadata.get_call_location())
            else:
                label = CleaningUtils.make_label_from_cleaner_code(
                    CodeSchemes.OPERATOR,
                    CodeSchemes.OPERATOR.get_code_with_match_value(
                        operator_clean), Metadata.get_call_location())
            td.append_data({"operator_coded": label.to_dict()},
                           Metadata(user, Metadata.get_call_location(),
                                    time.time()))

        # Output single-scheme answers to coda for manual verification + coding
        IOUtils.ensure_dirs_exist(coda_output_dir)
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.raw_field == "mogadishu_sub_district_raw":
                continue

            TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field,
                                                   plan.id_field)

            coda_output_path = path.join(coda_output_dir, plan.coda_filename)
            with open(coda_output_path, "w") as f:
                TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                    data, plan.raw_field, plan.time_field, plan.id_field,
                    {plan.coded_field: plan.code_scheme}, f)

        # Output location scheme to coda for manual verification + coding
        output_path = path.join(coda_output_dir, "location.json")
        TracedDataCodaV2IO.compute_message_ids(
            user, data, "mogadishu_sub_district_raw",
            "mogadishu_sub_district_raw_id")
        with open(output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, "mogadishu_sub_district_raw",
                "mogadishu_sub_district_time", "mogadishu_sub_district_raw_id",
                {
                    "mogadishu_sub_district_coded":
                    CodeSchemes.MOGADISHU_SUB_DISTRICT,
                    "district_coded": CodeSchemes.DISTRICT,
                    "region_coded": CodeSchemes.REGION,
                    "state_coded": CodeSchemes.STATE,
                    "zone_coded": CodeSchemes.ZONE
                }, f)

        return data

Exemple #7

0

Afficher le fichier

    def apply_manual_codes(cls, user, data, coda_input_dir):
        # Merge manually coded data into the cleaned dataset
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.coda_filename is None:
                continue

            coda_input_path = path.join(coda_input_dir, plan.coda_filename)

            for cc in plan.coding_configurations:
                if not cc.requires_manual_verification:
                    continue

                f = None
                try:
                    if path.exists(coda_input_path):
                        f = open(coda_input_path, "r")

                    if cc.coding_mode == CodingModes.SINGLE:
                        TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                            user, data, plan.id_field,
                            {cc.coded_field: cc.code_scheme}, f)
                    else:
                        TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                            user, data, plan.id_field,
                            {cc.coded_field: cc.code_scheme}, f)
                finally:
                    if f is not None:
                        f.close()

            if PipelineConfiguration.WS_CORRECT_DATASET_SCHEME is not None:
                f = None
                try:
                    if path.exists(coda_input_path):
                        f = open(coda_input_path, "r")

                    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                        user, data, plan.id_field, {
                            f"{plan.raw_field}_correct_dataset":
                            PipelineConfiguration.WS_CORRECT_DATASET_SCHEME
                        }, f)
                finally:
                    if f is not None:
                        f.close()

        # Label data for which there is no response as TRUE_MISSING.
        # Label data for which the response is the empty string as NOT_CODED.
        for td in data:
            missing_dict = dict()
            for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
                for cc in plan.coding_configurations:
                    raw_field = cc.raw_field if cc.raw_field is not None else plan.raw_field
                    if raw_field not in td:
                        na_label = CleaningUtils.make_label_from_cleaner_code(
                            cc.code_scheme,
                            cc.code_scheme.get_code_with_control_code(
                                Codes.TRUE_MISSING),
                            Metadata.get_call_location()).to_dict()
                        missing_dict[
                            cc.
                            coded_field] = na_label if cc.coding_mode == CodingModes.SINGLE else [
                                na_label
                            ]
                for cc in plan.coding_configurations:
                    raw_field = cc.raw_field if cc.raw_field is not None else plan.raw_field
                    if td.get(raw_field) == "":
                        nc_label = CleaningUtils.make_label_from_cleaner_code(
                            cc.code_scheme,
                            cc.code_scheme.get_code_with_control_code(
                                Codes.NOT_CODED),
                            Metadata.get_call_location()).to_dict()
                        missing_dict[
                            cc.
                            coded_field] = nc_label if cc.coding_mode == CodingModes.SINGLE else [
                                nc_label
                            ]
            td.append_data(
                missing_dict,
                Metadata(user, Metadata.get_call_location(), time.time()))

        # Mark data that is noise as Codes.NOT_CODED
        for td in data:
            if td.get("noise", False):
                nc_dict = dict()
                for plan in PipelineConfiguration.RQA_CODING_PLANS:
                    for cc in plan.coding_configurations:
                        if cc.coded_field not in td:
                            nc_label = CleaningUtils.make_label_from_cleaner_code(
                                cc.code_scheme,
                                cc.code_scheme.get_code_with_control_code(
                                    Codes.NOT_CODED),
                                Metadata.get_call_location()).to_dict()
                            nc_dict[
                                cc.
                                coded_field] = nc_label if cc.coding_mode == CodingModes.SINGLE else [
                                    nc_label
                                ]
                td.append_data(
                    nc_dict,
                    Metadata(user, Metadata.get_call_location(), time.time()))

        # Run the cleaners that don't require manual verification again, this time setting "checked" to True
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            for cc in plan.coding_configurations:
                if cc.cleaner is not None and not cc.requires_manual_verification:
                    raw_field = cc.raw_field if cc.raw_field is not None else plan.raw_field
                    CleaningUtils.apply_cleaner_to_traced_data_iterable(
                        user,
                        data,
                        raw_field,
                        cc.coded_field,
                        cc.cleaner,
                        cc.code_scheme,
                        set_checked=True)

        # Run code imputation functions
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.code_imputation_function is not None:
                plan.code_imputation_function(user, data,
                                              plan.coding_configurations)

        cls._impute_coding_error_codes(user, data)

        return data

Exemple #8

0

Afficher le fichier

Fichier : test_io.py Projet : AfricasVoices/CoreDataModules

    def test_export_import_one_single_coded_scheme(self):
        file_path = path.join(self.test_dir, "coda_2_test.json")

        # Build raw input data
        message_dicts = [
            {"gender_raw": "woman", "gender_sent_on": "2018-11-01T07:13:04+03:00"},
            {"gender_raw": "", "gender_sent_on": "2018-11-01T07:17:04+03:00"},
            {"gender_raw": "hiya", "gender_sent_on": "2018-11-01T07:19:04+05:00"},
            {},
            {"gender_raw": "boy", "gender_sent_on": "2018-11-02T19:00:29+03:00"},
            {"gender_raw": "man", "gender_sent_on": "2018-11-02T19:00:29+03:00"},
        ]
        messages = [TracedData(d, Metadata("test_user", Metadata.get_call_location(), i))
                    for i, d in enumerate(message_dicts)]

        # Add message ids
        TracedDataCodaV2IO.compute_message_ids("test_user", messages, "gender_raw", "gender_coda_id")

        # Load gender scheme
        with open("tests/traced_data/resources/coda_2_gender_scheme.json") as f:
            gender_scheme = CodeScheme.from_firebase_map(json.load(f))

        # Apply the English gender cleaner
        with mock.patch("core_data_modules.util.TimeUtils.utc_now_as_iso_string") as time_mock, \
                mock.patch("core_data_modules.traced_data.Metadata.get_function_location") as location_mock:
            time_mock.return_value = "2018-11-02T15:00:07+00:00"
            location_mock.return_value = "english.DemographicCleaner.clean_gender"

            CleaningUtils.apply_cleaner_to_traced_data_iterable(
                "test_user", messages, "gender_raw", "gender_coded",
                english.DemographicCleaner.clean_gender, gender_scheme
            )

        # Export to a Coda 2 messages file
        with open(file_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                messages, "gender_raw", "gender_sent_on", "gender_coda_id", {"gender_coded": gender_scheme}, f)

        self.assertTrue(filecmp.cmp(file_path, "tests/traced_data/resources/coda_2_export_expected_one_scheme.json"))

        # Test importing with no file available
        imported_messages = []
        for td in messages:
            imported_messages.append(td.copy())
        TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
            "test_user", imported_messages, "gender_coda_id", {"gender_coded": gender_scheme})
        # Deliberately testing the read can be done twice
        TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
            "test_user", imported_messages, "gender_coda_id", {"gender_coded": gender_scheme})

        na_id = gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id
        nr_id = gender_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

        # Set TRUE_MISSING codes
        for td in imported_messages:
            na_label = CleaningUtils.make_label_from_cleaner_code(
                gender_scheme,
                gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                "test_export_traced_data_iterable_to_coda_2",
                date_time_utc="2018-11-02T10:00:00+00:00"
            )
            if td.get("gender_raw", "") == "":
                td.append_data({"gender_coded": na_label.to_dict()},
                               Metadata("test_user", Metadata.get_call_location(), time.time()))

        imported_code_ids = [td["gender_coded"]["CodeID"] for td in imported_messages]

        self.assertListEqual(imported_code_ids, [nr_id, na_id, nr_id, na_id, nr_id, nr_id])

        # Test importing from the test file
        imported_messages = []
        for td in messages:
            imported_messages.append(td.copy())
        with open("tests/traced_data/resources/coda_2_import_test_one_scheme.json", "r") as f:
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                "test_user", imported_messages, "gender_coda_id", {"gender_coded": gender_scheme}, f)

        # Set TRUE_MISSING codes
        for td in imported_messages:
            na_label = CleaningUtils.make_label_from_cleaner_code(
                gender_scheme,
                gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                "test_export_traced_data_iterable_to_coda_2",
                date_time_utc="2018-11-02T10:00:00+00:00"
            )
            if td.get("gender_raw", "") == "":
                td.append_data({"gender_coded": na_label.to_dict()},
                               Metadata("test_user", Metadata.get_call_location(), time.time()))

        imported_code_ids = [td["gender_coded"]["CodeID"] for td in imported_messages]

        expected_code_ids = [
            gender_scheme.get_code_with_match_value("female").code_id,  # Manually approved auto-code
            gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id,  # Empty raw message
            gender_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id,  # Manually assigned code which isn't checked
            gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id,  # No raw message
            gender_scheme.get_code_with_control_code(Codes.NOT_CODED).code_id,  # Manually Not Coded
            gender_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id,  # Manually un-coded
        ]
        self.assertListEqual(imported_code_ids, expected_code_ids)

        # Add an element with the same raw text but a conflicting
        messages.append(TracedData({
            "gender_raw": "woman", "gender_sent_on": "2018-11-01T07:13:04+03:00",
            "gender_coded": CleaningUtils.make_label_from_cleaner_code(
                gender_scheme, gender_scheme.get_code_with_match_value("male"), "make_location_label",
                date_time_utc="2018-11-03T13:40:50Z").to_dict()
        }, Metadata("test_user", Metadata.get_call_location(), time.time())))
        TracedDataCodaV2IO.compute_message_ids("test_user", messages, "gender_raw", "gender_coda_id")

        with open(file_path, "w") as f:
            try:
                TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                    messages, "gender_raw", "gender_sent_on", "gender_coda_id", {"gender_coded": gender_scheme}, f)
            except AssertionError as e:
                assert str(e) == "Messages with the same id " \
                                 "(cf2e5bff1ef03dcd20d1a0b18ef7d89fc80a3554434165753672f6f40fde1d25) have different " \
                                 "labels for coded_key 'gender_coded'"
                return
            self.fail("Exporting data with conflicting labels did not fail")