Code Example #1
    def _impute_coding_error_codes(user, data):
        for td in data:
            coding_error_dict = dict()
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                if f"{plan.coded_field}_WS_correct_dataset" in td:
                    if td[f"{plan.coded_field}_WS_correct_dataset"]["CodeID"] == \
                            CodeSchemes.WS_CORRECT_DATASET.get_code_with_control_code(Codes.CODING_ERROR).code_id:
                        coding_error_dict[plan.coded_field] = [
                            CleaningUtils.make_label_from_cleaner_code(
                                plan.code_scheme,
                                plan.code_scheme.get_code_with_control_code(
                                    Codes.CODING_ERROR),
                                Metadata.get_call_location()).to_dict()
                        ]
                        if plan.binary_code_scheme is not None:
                            coding_error_dict[plan.binary_coded_field] = \
                                CleaningUtils.make_label_from_cleaner_code(
                                    plan.binary_code_scheme,
                                    plan.binary_code_scheme.get_code_with_control_code(Codes.CODING_ERROR),
                                    Metadata.get_call_location()
                                ).to_dict()

            for plan in PipelineConfiguration.DEMOGS_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
                if f"{plan.coded_field}_WS_correct_dataset" in td:
                    if td[f"{plan.coded_field}_WS_correct_dataset"]["CodeID"] == \
                            CodeSchemes.WS_CORRECT_DATASET.get_code_with_control_code(Codes.CODING_ERROR).code_id:
                        coding_error_dict[plan.coded_field] = \
                            CleaningUtils.make_label_from_cleaner_code(
                                plan.code_scheme,
                                plan.code_scheme.get_code_with_control_code(Codes.CODING_ERROR),
                                Metadata.get_call_location()
                            ).to_dict()

            td.append_data(
                coding_error_dict,
                Metadata(user, Metadata.get_call_location(),
                         TimeUtils.utc_now_as_iso_string()))
Code Example #2
    def export_to_csv(user, data, pipeline_configuration, raw_data_dir, csv_path, export_keys, consent_withdrawn_key):
        # Convert codes to their string/matrix values
        for td in data:
            analysis_dict = dict()
            for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
                for cc in plan.coding_configurations:
                    if cc.analysis_file_key is None:
                        continue

                    if cc.coding_mode == CodingModes.SINGLE:
                        analysis_dict[cc.analysis_file_key] = \
                            cc.code_scheme.get_code_with_code_id(td[cc.coded_field]["CodeID"]).string_value
                    else:
                        assert cc.coding_mode == CodingModes.MULTIPLE
                        show_matrix_keys = []
                        for code in cc.code_scheme.codes:
                            show_matrix_keys.append(f"{cc.analysis_file_key}{code.string_value}")

                        for label in td[cc.coded_field]:
                            code_string_value = cc.code_scheme.get_code_with_code_id(label["CodeID"]).string_value
                            analysis_dict[f"{cc.analysis_file_key}{code_string_value}"] = Codes.MATRIX_1

                        for key in show_matrix_keys:
                            if key not in analysis_dict:
                                analysis_dict[key] = Codes.MATRIX_0
            td.append_data(analysis_dict,
                           Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

        # Tag listening group participants
        ListeningGroups.tag_listening_groups_participants(user, data, pipeline_configuration, raw_data_dir)

        # Hide data from participants who opted out
        ConsentUtils.set_stopped(user, data, consent_withdrawn_key, additional_keys=export_keys)

        with open(csv_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(data, f, headers=export_keys)
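For MULTIPLE-coded fields, the loop above one-hot encodes labels into "matrix" columns: every code in the scheme gets a column set to Codes.MATRIX_1 if that code was applied, otherwise Codes.MATRIX_0. A self-contained sketch of that translation, with plain strings standing in for the scheme and label objects:

    MATRIX_0, MATRIX_1 = "0", "1"

    def to_matrix_columns(analysis_file_key, scheme_string_values, applied_string_values):
        """One-hot encode the applied codes over all the codes in a scheme."""
        columns = {f"{analysis_file_key}{v}": MATRIX_1 for v in applied_string_values}
        for v in scheme_string_values:
            columns.setdefault(f"{analysis_file_key}{v}", MATRIX_0)
        return columns

    print(to_matrix_columns("reason_", ["water", "food", "NC"], ["water"]))
    # {'reason_water': '1', 'reason_food': '0', 'reason_NC': '0'}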
Code Example #3
    def set_show_ids(cls, user, data, show_id_map):
        """
        Sets a show_id for each message, using the presence of Rapid Pro value keys to determine which show each message
        belongs to.

        :param user: Identifier of the user running this program, for TracedData Metadata.
        :type user: str
        :param data: TracedData objects to set the show ids of.
        :type data: iterable of TracedData
        :param show_id_map: Dictionary of Rapid Pro value key to show id.
        :type show_id_map: dict of str -> int
        """
        for td in data:
            show_dict = dict()

            for message_key, show_id in show_id_map.items():
                if message_key in td:
                    show_dict["rqa_message"] = td[message_key]
                    show_dict["show_id"] = show_id

            td.append_data(
                show_dict,
                Metadata(user, Metadata.get_call_location(),
                         TimeUtils.utc_now_as_iso_string()))
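A sketch of the matching logic in isolation. Note that the loop does not break on the first match, so if a TracedData object somehow contained value keys for several shows, the last entry in show_id_map would win:

    def pick_show(td, show_id_map):
        """Mirror the loop above: the last matching value key wins."""
        show_dict = {}
        for message_key, show_id in show_id_map.items():
            if message_key in td:
                show_dict["rqa_message"] = td[message_key]
                show_dict["show_id"] = show_id
        return show_dict

    # Hypothetical Rapid Pro value keys:
    show_id_map = {"S01E01 (Text) - activation": 1, "S01E02 (Text) - activation": 2}
    assert pick_show({"S01E02 (Text) - activation": "my answer"}, show_id_map) == \
        {"rqa_message": "my answer", "show_id": 2}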
Code Example #4
    def convert_facebook_comments_to_traced_data(user, dataset_name,
                                                 raw_comments,
                                                 facebook_uuid_table):
        log.info(
            f"Converting {len(raw_comments)} Facebook comments to TracedData..."
        )

        facebook_uuids = {comment["from"]["id"] for comment in raw_comments}
        facebook_to_uuid_lut = facebook_uuid_table.data_to_uuid_batch(
            facebook_uuids)

        traced_comments = []
        # Use a placeholder avf facebook id for now, to make the individuals file work until we know if we'll be able
        # to see Facebook user ids or not.
        for comment in raw_comments:
            comment["created_time"] = isoparse(
                comment["created_time"]).isoformat()
            validators.validate_utc_iso_string(comment["created_time"])

            comment_dict = {
                "avf_facebook_id": facebook_to_uuid_lut[comment["from"]["id"]]
            }
            for k, v in comment.items():
                comment_dict[f"{dataset_name}.{k}"] = v

            traced_comments.append(
                TracedData(
                    comment_dict,
                    Metadata(user, Metadata.get_call_location(),
                             TimeUtils.utc_now_as_iso_string())))

        log.info(
            f"Converted {len(traced_comments)} Facebook comments to TracedData"
        )

        return traced_comments
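The converter only relies on a few fields of each raw comment dict. A minimal input that would satisfy it might look like this (all values illustrative):

    raw_comments = [{
        "id": "12345_67890",
        "from": {"id": "facebook-user-1", "name": "A. Commenter"},
        "message": "my comment text",
        "created_time": "2020-01-01T12:00:00+0000",  # re-parsed by isoparse, then re-serialised
    }]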
Code Example #5
    def convert_runs_to_traced_data(user, raw_runs, raw_contacts, phone_uuids, test_contacts=None):
        """
        Converts raw data fetched from Rapid Pro to TracedData.

        :param user: Identifier of the user running this program, for TracedData Metadata.
        :type user: str
        :param raw_runs: Raw run objects to convert to TracedData.
        :type raw_runs: list of temba_client.v2.types.Run
        :param raw_contacts: Raw contact objects to use when converting to TracedData.
        :type raw_contacts: list of temba_client.v2.types.Contact
        :param phone_uuids: Phone number <-> UUID table.
        :type phone_uuids: id_infrastructure.firestore_uuid_table.FirestoreUuidTable
        :param test_contacts: Rapid Pro contact UUIDs of test contacts.
                              Runs from any of those test contacts will be tagged with {'test_run': True}
        :type test_contacts: list of str | None
        :return: Raw data fetched from Rapid Pro converted to TracedData.
        :rtype: list of TracedData
        """
        if test_contacts is None:
            test_contacts = []

        log.info(f"Converting {len(raw_runs)} raw runs to TracedData...")

        contacts_lut = {c.uuid: c for c in raw_contacts}

        runs_with_uuids = []
        phone_numbers = []
        for run in raw_runs:
            if run.contact.uuid not in contacts_lut:
                # Sometimes contact uuids which appear in `runs` do not appear in the downloaded `contacts`.
                # I have only observed this happen for contacts which were created very recently.
                # This check skips the run in this case; the run should be included the next time this script is executed.
                log.warning(f"Run found with Rapid Pro Contact UUID '{run.contact.uuid}', "
                            f"but this id is not present in the downloaded contacts")
                continue

            contact_urns = contacts_lut[run.contact.uuid].urns
            if len(contact_urns) == 0:
                log.warning(f"Ignoring contact with no urn. URNs: {contact_urns} "
                            f"(Rapid Pro Contact UUID: {run.contact.uuid})")
                continue

            phone_numbers.append(PhoneCleaner.normalise_phone(contact_urns[0]))
            runs_with_uuids.append(run)

        phone_to_uuid_lut = phone_uuids.data_to_uuid_batch(phone_numbers)

        traced_runs = []
        for run in runs_with_uuids:
            contact_urns = contacts_lut[run.contact.uuid].urns
            run_dict = {
                "avf_phone_id": phone_to_uuid_lut[PhoneCleaner.normalise_phone(contact_urns[0])],
                f"run_id - {run.flow.name}": run.id
            }

            for category, response in run.values.items():
                run_dict[category.title() + " (Category) - " + run.flow.name] = response.category
                run_dict[category.title() + " (Value) - " + run.flow.name] = response.value
                # Convert from "input" to "text" here to match terminology in Rapid Pro's Excel exports.
                run_dict[category.title() + " (Text) - " + run.flow.name] = response.input
                run_dict[category.title() + " (Name) - " + run.flow.name] = response.name
                run_dict[category.title() + " (Time) - " + run.flow.name] = response.time.isoformat()
                run_dict[category.title() + " (Run ID) - " + run.flow.name] = run.id

            if run.contact.uuid in test_contacts:
                run_dict["test_run"] = True
            else:
                assert len(contact_urns) == 1, \
                    f"A non-test contact has multiple URNs (Rapid Pro Contact UUID: {run.contact.uuid})"

            run_dict[f"run_created_on - {run.flow.name}"] = run.created_on.isoformat()
            run_dict[f"run_modified_on - {run.flow.name}"] = run.modified_on.isoformat()
            run_dict[f"run_exited_on - {run.flow.name}"] = None if run.exited_on is None else run.exited_on.isoformat()
            run_dict[f"run_exit_type - {run.flow.name}"] = run.exit_type

            traced_runs.append(
                TracedData(run_dict, Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())))

        log.info(f"Converted {len(traced_runs)} raw runs to TracedData")

        return traced_runs
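For each flow result, the conversion emits a family of keys that mirror the terminology in Rapid Pro's Excel exports. A run in a flow named "csap_demog" with a single result keyed "gender" would contribute something like the following (values illustrative):

    run_dict = {
        "avf_phone_id": "avf-phone-uuid-0001",
        "run_id - csap_demog": 12345,
        "Gender (Category) - csap_demog": "Female",
        "Gender (Value) - csap_demog": "female",
        "Gender (Text) - csap_demog": "i am a woman",
        "Gender (Name) - csap_demog": "Gender",
        "Gender (Time) - csap_demog": "2020-01-01T12:00:00+00:00",
        "Gender (Run ID) - csap_demog": 12345,
    }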
Code Example #6
    def generate(user, data, csv_by_message_output_path,
                 csv_by_individual_output_path):
        # The serializer is currently overflowing the default recursion limit
        # TODO: Investigate/address the cause of this.
        sys.setrecursionlimit(15000)

        consent_withdrawn_key = "consent_withdrawn"
        for td in data:
            td.append_data({consent_withdrawn_key: Codes.FALSE},
                           Metadata(user, Metadata.get_call_location(),
                                    time.time()))

        # Set the list of raw/coded keys which will be exported
        survey_keys = []
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.analysis_file_key is not None and plan.analysis_file_key not in survey_keys:
                survey_keys.append(plan.analysis_file_key)
            if plan.raw_field not in survey_keys:
                survey_keys.append(plan.raw_field)

        # Convert survey codes to their string values
        for td in data:
            td.append_data(
                {
                    plan.analysis_file_key: plan.code_scheme.get_code_with_id(
                        td[plan.coded_field]["CodeID"]).string_value
                    for plan in PipelineConfiguration.SURVEY_CODING_PLANS
                    if plan.analysis_file_key is not None
                }, Metadata(user, Metadata.get_call_location(), time.time()))

        # Convert RQA binary codes to their string values
        for td in data:
            td.append_data(
                {
                    plan.binary_analysis_file_key:
                    plan.binary_code_scheme.get_code_with_id(
                        td[plan.binary_coded_field]["CodeID"]).string_value
                    for plan in PipelineConfiguration.RQA_CODING_PLANS
                    if plan.binary_code_scheme is not None
                }, Metadata(user, Metadata.get_call_location(), time.time()))

        # Translate the RQA reason codes to matrix values
        matrix_keys = []

        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            show_matrix_keys = list()
            for code in plan.code_scheme.codes:
                show_matrix_keys.append(
                    f"{plan.analysis_file_key}{code.string_value}")

            AnalysisKeys.set_matrix_keys(user, data, show_matrix_keys,
                                         plan.code_scheme, plan.coded_field,
                                         plan.analysis_file_key)

            matrix_keys.extend(show_matrix_keys)

        binary_keys = [
            plan.binary_analysis_file_key
            for plan in PipelineConfiguration.RQA_CODING_PLANS
            if plan.binary_analysis_file_key is not None
        ]

        equal_keys = ["uid"]
        equal_keys.extend(survey_keys)
        concat_keys = [
            plan.raw_field for plan in PipelineConfiguration.RQA_CODING_PLANS
        ]
        bool_keys = [
            consent_withdrawn_key,

            # "sms_ad",
            # "radio_promo",
            # "radio_show",
            # "non_logical_time",
            # "radio_participation_s02e01",
            # "radio_participation_s02e02",
            # "radio_participation_s02e03",
            # "radio_participation_s02e04",
            # "radio_participation_s02e05",
            # "radio_participation_s02e06",
        ]

        # Export to CSV
        export_keys = ["uid"]
        export_keys.extend(bool_keys)
        export_keys.extend(matrix_keys)
        export_keys.extend(binary_keys)
        export_keys.extend(concat_keys)
        export_keys.extend(survey_keys)

        # Set consent withdrawn based on presence of data coded as "stop"
        ConsentUtils.determine_consent_withdrawn(
            user, data, PipelineConfiguration.SURVEY_CODING_PLANS,
            consent_withdrawn_key)

        # Set consent withdrawn based on stop codes from radio question answers
        for td in data:
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                if td[f"{plan.analysis_file_key}{Codes.STOP}"] == Codes.MATRIX_1:
                    td.append_data({consent_withdrawn_key: Codes.TRUE},
                                   Metadata(user, Metadata.get_call_location(),
                                            time.time()))

                if plan.binary_code_scheme is not None:
                    if td[plan.binary_coded_field]["CodeID"] == \
                            plan.binary_code_scheme.get_code_with_control_code(Codes.STOP).code_id:
                        td.append_data({consent_withdrawn_key: Codes.TRUE},
                                       Metadata(user,
                                                Metadata.get_call_location(),
                                                time.time()))

        # Fold data to have one respondent per row
        to_be_folded = []
        for td in data:
            to_be_folded.append(td.copy())

        folded_data = FoldTracedData.fold_iterable_of_traced_data(
            user,
            data,
            fold_id_fn=lambda td: td["uid"],
            equal_keys=equal_keys,
            concat_keys=concat_keys,
            matrix_keys=matrix_keys,
            bool_keys=bool_keys,
            binary_keys=binary_keys)

        # Fix-up _NA and _NC keys, which are currently being set incorrectly by
        # FoldTracedData.fold_iterable_of_traced_data when there are multiple radio shows
        # TODO: Update FoldTracedData to handle NA and NC correctly under multiple radio shows
        for td in folded_data:
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                if td.get(plan.raw_field, "") != "":
                    td.append_data(
                        {
                            f"{plan.analysis_file_key}{Codes.TRUE_MISSING}":
                            Codes.MATRIX_0
                        },
                        Metadata(user, Metadata.get_call_location(),
                                 TimeUtils.utc_now_as_iso_string()))

                contains_non_nc_key = False
                for key in matrix_keys:
                    if key.startswith(plan.analysis_file_key) and not key.endswith(Codes.NOT_CODED) \
                            and td.get(key) == Codes.MATRIX_1:
                        contains_non_nc_key = True
                if not contains_non_nc_key:
                    td.append_data(
                        {
                            f"{plan.analysis_file_key}{Codes.NOT_CODED}":
                            Codes.MATRIX_1
                        },
                        Metadata(user, Metadata.get_call_location(),
                                 TimeUtils.utc_now_as_iso_string()))

        # Process consent
        ConsentUtils.set_stopped(user,
                                 data,
                                 consent_withdrawn_key,
                                 additional_keys=export_keys)
        ConsentUtils.set_stopped(user,
                                 folded_data,
                                 consent_withdrawn_key,
                                 additional_keys=export_keys)

        # Output to CSV with one message per row
        with open(csv_by_message_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                data, f, headers=export_keys)

        with open(csv_by_individual_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                folded_data, f, headers=export_keys)

        return data
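Folding is what turns the message-per-row dataset into one respondent per row: all rows sharing a uid are merged, with a per-key merge strategy. A self-contained sketch of the core idea on plain dicts (this illustrates the concept, not the FoldTracedData API):

    def fold_rows(rows, equal_keys, concat_keys, matrix_keys):
        """Merge rows sharing a 'uid': assert-equal, concatenate, or matrix-OR per key."""
        folded = {}
        for row in rows:
            acc = folded.setdefault(row["uid"], {"uid": row["uid"]})
            for k in equal_keys:
                assert acc.setdefault(k, row[k]) == row[k]
            for k in concat_keys:
                acc[k] = ";".join(filter(None, [acc.get(k), row.get(k)]))
            for k in matrix_keys:
                acc[k] = "1" if "1" in (acc.get(k, "0"), row.get(k, "0")) else "0"
        return list(folded.values())

    rows = [
        {"uid": "u1", "gender": "female", "s01e01_raw": "water", "s01e01_water": "1"},
        {"uid": "u1", "gender": "female", "s01e01_raw": "food", "s01e01_water": "0"},
    ]
    print(fold_rows(rows, ["gender"], ["s01e01_raw"], ["s01e01_water"]))
    # [{'uid': 'u1', 'gender': 'female', 's01e01_raw': 'water;food', 's01e01_water': '1'}]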
Code Example #7
def impute_yes_no_reasons_codes(user, data, coding_configurations):
    # Synchronise the control codes between the binary and reasons schemes:
    # Some RQA datasets have a binary scheme, which is always labelled, and a reasons scheme, which is only labelled
    # if there is an additional reason given. Importing those two schemes separately above caused the labels in
    # each scheme to go out of sync with each other, e.g. reasons can be NR when the binary *was* reviewed.
    # This block updates the reasons scheme in cases where only a binary label was set, by assigning the
    # label 'NC' if the binary label was set to a normal code, otherwise to be the same control code as the binary.
    binary_configuration = coding_configurations[0]
    reasons_configuration = coding_configurations[1]

    # TODO: Switch to using CodingModes.SINGLE/MULTIPLE once configuration is being set in configuration json
    #       rather than in pipeline_configuration.py
    assert binary_configuration.coding_mode == "SINGLE"
    assert reasons_configuration.coding_mode == "MULTIPLE"

    for td in data:
        binary_label = td[binary_configuration.coded_field]
        binary_code = binary_configuration.code_scheme.get_code_with_id(
            binary_label["CodeID"])

        binary_label_present = \
            binary_label["CodeID"] != binary_configuration.code_scheme.get_code_with_control_code(
                Codes.NOT_REVIEWED).code_id

        reasons_label_present = \
            len(td[reasons_configuration.coded_field]) > 1 or \
            td[reasons_configuration.coded_field][0][
                "CodeID"] != reasons_configuration.code_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

        if binary_label_present and not reasons_label_present:
            if binary_code.code_type == "Control":
                control_code = binary_code.control_code
                reasons_code = reasons_configuration.code_scheme.get_code_with_control_code(
                    control_code)

                reasons_label = CleaningUtils.make_label_from_cleaner_code(
                    reasons_configuration.code_scheme,
                    reasons_code,
                    Metadata.get_call_location(),
                    origin_name="Pipeline Code Synchronisation")

                td.append_data(
                    {
                        reasons_configuration.coded_field:
                        [reasons_label.to_dict()]
                    },
                    Metadata(user, Metadata.get_call_location(),
                             TimeUtils.utc_now_as_iso_string()))
            else:
                assert binary_code.code_type == "Normal"

                nc_label = CleaningUtils.make_label_from_cleaner_code(
                    reasons_configuration.code_scheme,
                    reasons_configuration.code_scheme.
                    get_code_with_control_code(Codes.NOT_CODED),
                    Metadata.get_call_location(),
                    origin_name="Pipeline Code Synchronisation")
                td.append_data(
                    {reasons_configuration.coded_field: [nc_label.to_dict()]},
                    Metadata(user, Metadata.get_call_location(),
                             TimeUtils.utc_now_as_iso_string()))
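The synchronisation rule reduces to a two-way decision on the binary code's type: control codes propagate to the reasons scheme unchanged, while a normal binary code (a plain yes/no answer with no extra reason) implies NOT_CODED. A hedged distillation, with code types and control codes as plain strings:

    def reasons_code_for_unlabelled(binary_code_type, binary_control_code):
        """Control codes carry over; Normal codes mean 'no further reason given'."""
        if binary_code_type == "Control":
            return binary_control_code  # e.g. STOP stays STOP in the reasons scheme
        assert binary_code_type == "Normal"
        return "NC"                     # NOT_CODED

    assert reasons_code_for_unlabelled("Control", "STOP") == "STOP"
    assert reasons_code_for_unlabelled("Normal", None) == "NC"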
Code Example #8
File: io.py Project: AfricasVoices/CoreDataModules
    def import_coda_2_to_traced_data_iterable_multi_coded(
            cls, user, data, message_id_key, scheme_key_map, f=None):
        """
        Codes keys in an iterable of TracedData objects by using the codes from a Coda 2 messages JSON file.

        Data which has not been checked in the Coda file is coded as NOT_REVIEWED
        (irrespective of whether there was an automatic code there before).
        
        Only the 'primary' schemes should be passed in. Schemes that have been duplicated using the duplicate_scheme
        tool in CodaV2/data_tools will be detected as being associated with the primary scheme automatically.

        TODO: Data which has been assigned a code under one scheme but none of the others needs to be coded as NC, not NR
        TODO: Or, do this in Coda so as to remove ambiguity from the perspective of the RAs?

        :param user: Identifier of user running this program.
        :type user: str
        :param data: TracedData objects to be coded using the Coda file.
        :type data: iterable of TracedData
        :param message_id_key: Key in TracedData objects of the message ids.
        :type message_id_key: str
        :param scheme_key_map: Dictionary of (key in TracedData objects to assign labels to) ->
                            (Scheme in the Coda messages file to retrieve the labels from)
        :type scheme_key_map: dict of str -> iterable of Scheme
        :param f: Coda data file to import codes from, or None. If None, assigns NOT_REVIEWED codes to everything.
        :type f: file-like | None
        """
        if f is None:
            f = cls._make_empty_file()

        # Build a lookup table of MessageID -> SchemeID -> Labels
        coda_dataset = cls._dataset_lut_from_messages_file(
            f, scheme_key_map.values())

        # Filter out TracedData objects that do not contain a message id key
        data = [td for td in data if message_id_key in td]

        # Apply the labels from Coda to each TracedData item in data
        for td in data:
            for coded_key, scheme in scheme_key_map.items():
                # Get labels for this (message id, scheme id) from the look-up table
                labels = coda_dataset.get(td[message_id_key],
                                          dict()).get(scheme.scheme_id, [])

                # Get the currently assigned list of labels for this multi-coded scheme,
                # and construct a look-up table of scheme id -> label
                td_labels = td.get(coded_key, [])
                td_labels_lut = {
                    label["SchemeID"]: Label.from_dict(label)
                    for label in td_labels
                }

                for label in reversed(labels):
                    # Update the relevant label in this traced data's list of labels with the new label,
                    # and append the whole new list to the traced data.
                    td_labels_lut[label.scheme_id] = label

                    td_labels = list(td_labels_lut.values())
                    td.append_data(
                        {coded_key: [label.to_dict() for label in td_labels]},
                        Metadata(user, Metadata.get_call_location(),
                                 TimeUtils.utc_now_as_iso_string()))

                # Delete any labels that are SPECIAL-MANUALLY_UNCODED
                for scheme_id, label in list(td_labels_lut.items()):
                    if label.code_id == "SPECIAL-MANUALLY_UNCODED":
                        del td_labels_lut[scheme_id]
                        td_labels = list(td_labels_lut.values())
                        td.append_data(
                            {
                                coded_key:
                                [label.to_dict() for label in td_labels]
                            },
                            Metadata(user, Metadata.get_call_location(),
                                     time.time()))

                # If none of the assigned labels have been checked, set a code for NOT_REVIEWED
                checked_codes_count = 0
                labels = td.get(coded_key)
                if labels is not None:
                    for label in labels:
                        if label["Checked"]:
                            checked_codes_count += 1

                if checked_codes_count == 0:
                    nr_label = CleaningUtils.make_label_from_cleaner_code(
                        scheme,
                        scheme.get_code_with_control_code(Codes.NOT_REVIEWED),
                        Metadata.get_call_location())

                    td.append_data({coded_key: [nr_label.to_dict()]},
                                   Metadata(user, Metadata.get_call_location(),
                                            time.time()))

                # Normalise the scheme ids of all the imported labels
                labels = [Label.from_dict(d) for d in td[coded_key]]
                for label in labels:
                    assert label.scheme_id.startswith(scheme.scheme_id)
                    label.scheme_id = scheme.scheme_id

                # De-duplicate the imported labels by selecting the first label with each code id.
                # This is required in cases where the same label was applied to this message under different columns
                # of the same code scheme, and is possible now that we have normalised the scheme ids.
                unique_labels_by_code_id = []
                seen_code_ids = set()
                for label in labels:
                    if label.code_id not in seen_code_ids:
                        unique_labels_by_code_id.append(label)
                        seen_code_ids.add(label.code_id)

                td.append_data(
                    {
                        coded_key: [
                            label.to_dict()
                            for label in unique_labels_by_code_id
                        ]
                    }, Metadata(user, Metadata.get_call_location(),
                                time.time()))
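The de-duplication step at the end keeps only the first label seen for each code id, which matters once labels from duplicated schemes have been normalised to the primary scheme id. The same idiom in isolation:

    def dedupe_by_code_id(labels):
        """Keep the first label for each CodeID, preserving order."""
        seen, unique = set(), []
        for label in labels:
            if label["CodeID"] not in seen:
                unique.append(label)
                seen.add(label["CodeID"])
        return unique

    labels = [{"CodeID": "a", "SchemeID": "s-1"}, {"CodeID": "a", "SchemeID": "s-1"}]
    assert dedupe_by_code_id(labels) == [{"CodeID": "a", "SchemeID": "s-1"}]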
Code Example #9
File: io.py Project: AfricasVoices/CoreDataModules
    def import_coda_2_to_traced_data_iterable(cls,
                                              user,
                                              data,
                                              message_id_key,
                                              scheme_key_map,
                                              f=None):
        """
        Codes keys in an iterable of TracedData objects by using the codes from a Coda 2 messages JSON file.

        Data which has not been checked in the Coda file is coded as NOT_REVIEWED
        (irrespective of whether there was an automatic code there before).

        TODO: Data which has been assigned a code under one scheme but none of the others needs to be coded as NC, not NR
        TODO: Or, do this in Coda so as to remove ambiguity from the perspective of the RAs?

        :param user: Identifier of user running this program.
        :type user: str
        :param data: TracedData objects to be coded using the Coda file.
        :type data: iterable of TracedData
        :param message_id_key: Key in TracedData objects of the message ids.
        :type message_id_key: str
        :param scheme_key_map: Dictionary of (key in TracedData objects to assign labels to) ->
                               (Scheme in the Coda messages file to retrieve the labels from)
        :type scheme_key_map: dict of str -> Scheme
        :param f: Coda data file to import codes from, or None.
        :type f: file-like | None
        """
        if f is None:
            f = cls._make_empty_file()

        # Build a lookup table of MessageID -> SchemeID -> Labels
        coda_dataset = cls._dataset_lut_from_messages_file(
            f, scheme_key_map.values())

        # Filter out TracedData objects that do not contain a message id key
        data = [td for td in data if message_id_key in td]

        # Apply the labels from Coda to each TracedData item in data
        for td in data:
            for key_of_coded, scheme in scheme_key_map.items():
                # Get labels for this (message id, scheme id) from the look-up table
                labels = coda_dataset.get(td[message_id_key],
                                          dict()).get(scheme.scheme_id, [])
                if labels is not None:
                    # Append each label that was assigned to this message for this scheme to the TracedData.
                    for label in reversed(labels):
                        td.append_data({key_of_coded: label.to_dict()},
                                       Metadata(
                                           user, Metadata.get_call_location(),
                                           TimeUtils.utc_now_as_iso_string()))

                # If this td still has no label after importing from the Coda file, or the label is a non-missing label
                # that hasn't been checked in the Coda UI, set a code for NOT_REVIEWED
                if key_of_coded not in td or not td[key_of_coded]["Checked"]:
                    nr_label = CleaningUtils.make_label_from_cleaner_code(
                        scheme,
                        scheme.get_code_with_control_code(Codes.NOT_REVIEWED),
                        Metadata.get_call_location())
                    td.append_data({key_of_coded: nr_label.to_dict()},
                                   Metadata(user, Metadata.get_call_location(),
                                            time.time()))
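Both importers share the same fallback rule: a message that received no label from the Coda file, or whose imported label was never checked in the Coda UI, is coded NOT_REVIEWED. As a predicate on plain dicts:

    def needs_not_reviewed(td, coded_key):
        """True when no label was imported or the imported label is unchecked."""
        return coded_key not in td or not td[coded_key]["Checked"]

    assert needs_not_reviewed({}, "gender_coded")
    assert needs_not_reviewed({"gender_coded": {"Checked": False}}, "gender_coded")
    assert not needs_not_reviewed({"gender_coded": {"Checked": True}}, "gender_coded")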
Code Example #10
    def generate(user, data, csv_by_message_output_path,
                 csv_by_individual_output_path):
        # The serializer is currently overflowing the default recursion limit
        # TODO: Investigate/address the cause of this.
        sys.setrecursionlimit(15000)

        consent_withdrawn_key = "consent_withdrawn"
        for td in data:
            td.append_data({consent_withdrawn_key: Codes.FALSE},
                           Metadata(user, Metadata.get_call_location(),
                                    time.time()))

        # Set the list of keys to be exported and how they are to be handled when folding
        export_keys = ["uid", consent_withdrawn_key]
        bool_keys = [
            consent_withdrawn_key

            # "sms_ad",
            # "radio_promo",
            # "radio_show",
            # "non_logical_time",
            # "radio_participation_s02e01",
            # "radio_participation_s02e02",
            # "radio_participation_s02e03",
            # "radio_participation_s02e04",
            # "radio_participation_s02e05",
            # "radio_participation_s02e06",
        ]
        equal_keys = ["uid"]
        concat_keys = []
        matrix_keys = []
        binary_keys = []
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            for cc in plan.coding_configurations:
                if cc.analysis_file_key is None:
                    continue

                if cc.coding_mode == CodingModes.SINGLE:
                    export_keys.append(cc.analysis_file_key)

                    if cc.folding_mode == FoldingModes.ASSERT_EQUAL:
                        equal_keys.append(cc.analysis_file_key)
                    elif cc.folding_mode == FoldingModes.YES_NO_AMB:
                        binary_keys.append(cc.analysis_file_key)
                    else:
                        assert False, f"Incompatible folding_mode {cc.folding_mode}"
                else:
                    assert cc.folding_mode == FoldingModes.MATRIX
                    for code in cc.code_scheme.codes:
                        export_keys.append(
                            f"{cc.analysis_file_key}{code.string_value}")
                        matrix_keys.append(
                            f"{cc.analysis_file_key}{code.string_value}")

            export_keys.append(plan.raw_field)
            if plan.raw_field_folding_mode == FoldingModes.CONCATENATE:
                concat_keys.append(plan.raw_field)
            elif plan.raw_field_folding_mode == FoldingModes.ASSERT_EQUAL:
                equal_keys.append(plan.raw_field)
            else:
                assert False, f"Incompatible raw_field_folding_mode {plan.raw_field_folding_mode}"

        # Convert codes to their string/matrix values
        for td in data:
            analysis_dict = dict()
            for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
                for cc in plan.coding_configurations:
                    if cc.analysis_file_key is None:
                        continue

                    if cc.coding_mode == CodingModes.SINGLE:
                        analysis_dict[cc.analysis_file_key] = \
                            cc.code_scheme.get_code_with_id(td[cc.coded_field]["CodeID"]).string_value
                    else:
                        assert cc.coding_mode == CodingModes.MULTIPLE
                        show_matrix_keys = []
                        for code in cc.code_scheme.codes:
                            show_matrix_keys.append(
                                f"{cc.analysis_file_key}{code.string_value}")

                        for label in td.get(cc.coded_field, []):
                            code_string_value = cc.code_scheme.get_code_with_id(
                                label['CodeID']).string_value
                            analysis_dict[
                                f"{cc.analysis_file_key}{code_string_value}"] = Codes.MATRIX_1

                        for key in show_matrix_keys:
                            if key not in analysis_dict:
                                analysis_dict[key] = Codes.MATRIX_0
            td.append_data(
                analysis_dict,
                Metadata(user, Metadata.get_call_location(),
                         TimeUtils.utc_now_as_iso_string()))

        # Set consent withdrawn based on presence of data coded as "stop"
        ConsentUtils.determine_consent_withdrawn(
            user, data, PipelineConfiguration.RQA_CODING_PLANS +
            PipelineConfiguration.SURVEY_CODING_PLANS, consent_withdrawn_key)

        # Fold data to have one respondent per row
        to_be_folded = []
        for td in data:
            to_be_folded.append(td.copy())

        folded_data = FoldTracedData.fold_iterable_of_traced_data(
            user,
            data,
            fold_id_fn=lambda td: td["uid"],
            equal_keys=equal_keys,
            concat_keys=concat_keys,
            matrix_keys=matrix_keys,
            bool_keys=bool_keys,
            binary_keys=binary_keys)

        # Fix-up _NA and _NC keys, which are currently being set incorrectly by
        # FoldTracedData.fold_iterable_of_traced_data when there are multiple radio shows
        # TODO: Update FoldTracedData to handle NA and NC correctly under multiple radio shows
        for td in folded_data:
            for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
                for cc in plan.coding_configurations:
                    if cc.analysis_file_key is None:
                        continue

                    if cc.coding_mode == CodingModes.MULTIPLE:
                        if td.get(plan.raw_field, "") != "":
                            td.append_data(
                                {
                                    f"{cc.analysis_file_key}{Codes.TRUE_MISSING}":
                                    Codes.MATRIX_0
                                },
                                Metadata(user, Metadata.get_call_location(),
                                         TimeUtils.utc_now_as_iso_string()))

                        contains_non_nc_key = False
                        for key in matrix_keys:
                            if key.startswith(cc.analysis_file_key) and not key.endswith(Codes.NOT_CODED) \
                                    and td.get(key) == Codes.MATRIX_1:
                                contains_non_nc_key = True
                        if not contains_non_nc_key:
                            td.append_data(
                                {
                                    f"{cc.analysis_file_key}{Codes.NOT_CODED}":
                                    Codes.MATRIX_1
                                },
                                Metadata(user, Metadata.get_call_location(),
                                         TimeUtils.utc_now_as_iso_string()))

        # Process consent
        ConsentUtils.set_stopped(user,
                                 data,
                                 consent_withdrawn_key,
                                 additional_keys=export_keys)
        ConsentUtils.set_stopped(user,
                                 folded_data,
                                 consent_withdrawn_key,
                                 additional_keys=export_keys)

        # Output to CSV with one message per row
        with open(csv_by_message_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                data, f, headers=export_keys)

        with open(csv_by_individual_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                folded_data, f, headers=export_keys)

        return data, folded_data
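The export-key setup at the top of generate is effectively a dispatch from each coding configuration's coding/folding mode to the key list that the fold step later consumes. A compact, self-contained restatement (modes as plain strings):

    def route_analysis_key(key, coding_mode, folding_mode, lists):
        """Append key to the folding list implied by its coding and folding modes."""
        if coding_mode == "SINGLE":
            if folding_mode == "ASSERT_EQUAL":
                lists["equal"].append(key)
            elif folding_mode == "YES_NO_AMB":
                lists["binary"].append(key)
            else:
                raise AssertionError(f"Incompatible folding_mode {folding_mode}")
        else:
            assert folding_mode == "MATRIX"
            lists["matrix"].append(key)

    lists = {"equal": [], "binary": [], "matrix": []}
    route_analysis_key("gender", "SINGLE", "ASSERT_EQUAL", lists)
    assert lists["equal"] == ["gender"]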
Code Example #11
File: code_merge.py Project: lukechurch/CodaV2
    messages = [Message.from_firebase_map(d) for d in json.load(f)]
log.info(f"Loaded {len(messages)} messages")

log.info(f"Performing merge ({code_ids_to_merge} -> '{merged_code_id}')...")
merged_count = 0  # A count of the number of labels that were remapped to the merged value, for sense-check logging
for msg in messages:
    processed_scheme_ids = set()
    for label in list(msg.labels):
        # Skip labels that are not the latest assignment under each scheme
        if label.scheme_id in processed_scheme_ids:
            continue
        processed_scheme_ids.add(label.scheme_id)

        if label.code_id in code_ids_to_merge:
            msg.labels.insert(
                0,
                Label(label.scheme_id,
                      merged_code_id,
                      TimeUtils.utc_now_as_iso_string(),
                      Origin(Metadata.get_call_location(), "Auto Code-Merge",
                             "External"),
                      checked=label.checked))
            merged_count += 1
log.info(f"Merged {merged_count} labels to '{merged_code_id}'")

log.info(
    f"Exporting code-merged Coda messages to '{messages_output_file_path}'...")
with open(messages_output_file_path, "w") as f:
    json.dump([msg.to_firebase_map() for msg in messages], f, indent=2)
log.info("Done")
Code Example #12
        if project.flow_definitions_upload_url_prefix is None:
            log.info(
                f"Not archiving flow definitions for project {project.project_name} because its "
                f"'flow_definitions_upload_url_prefix' is unspecified.")
            continue

        log.info(
            f"Archiving the latest flow definitions for project {project.project_name}..."
        )

        log.info(
            "Downloading the Rapid Pro token file and initialising the Rapid Pro client..."
        )
        rapid_pro_token = google_cloud_utils.download_blob_to_string(
            google_cloud_credentials_file_path,
            project.rapid_pro_token_url).strip()
        rapid_pro = RapidProClient(project.rapid_pro_domain, rapid_pro_token)

        log.info("Downloading all the flow definitions for this instance...")
        flow_ids = rapid_pro.get_all_flow_ids()
        flow_definitions_request_timestamp = TimeUtils.utc_now_as_iso_string()
        flow_definitions = rapid_pro.get_flow_definitions_for_flow_ids(
            flow_ids)

        log.info("Uploading the flow definitions to a cloud bucket...")
        upload_url = f"{project.flow_definitions_upload_url_prefix}{flow_definitions_request_timestamp}.json"
        flow_definitions_json = json.dumps(flow_definitions.serialize())
        google_cloud_utils.upload_string_to_blob(
            google_cloud_credentials_file_path, upload_url,
            flow_definitions_json)
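Because the upload object name is the URL prefix plus the request timestamp, successive runs archive to distinct blobs rather than overwriting each other. For example (prefix hypothetical):

    flow_definitions_upload_url_prefix = "gs://avf-project-datasets/flow-definitions/"
    flow_definitions_request_timestamp = "2020-06-01T09:30:00+00:00"
    upload_url = f"{flow_definitions_upload_url_prefix}{flow_definitions_request_timestamp}.json"
    # gs://avf-project-datasets/flow-definitions/2020-06-01T09:30:00+00:00.json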
Code Example #13
    def apply_manual_codes(cls, user, data, coda_input_dir):
        # Merge manually coded radio show files into the cleaned dataset
        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            rqa_messages = [td for td in data if plan.raw_field in td]
            coda_input_path = path.join(coda_input_dir, plan.coda_filename)

            f = None
            try:
                if path.exists(coda_input_path):
                    f = open(coda_input_path, "r")
                TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                    user, rqa_messages, plan.id_field,
                    {plan.coded_field: plan.code_scheme}, f)

                if plan.binary_code_scheme is not None:
                    if f is not None:
                        f.seek(0)
                    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                        user, rqa_messages, plan.id_field,
                        {plan.binary_coded_field: plan.binary_code_scheme}, f)
            finally:
                if f is not None:
                    f.close()

        # Label the RQA for which there is no response yet as TRUE MISSING
        for td in data:
            missing_dict = dict()
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                if plan.raw_field not in td:
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.code_scheme,
                        plan.code_scheme.get_code_with_control_code(
                            Codes.TRUE_MISSING), Metadata.get_call_location())
                    missing_dict[plan.coded_field] = [na_label.to_dict()]

                    if plan.binary_code_scheme is not None:
                        na_label = CleaningUtils.make_label_from_cleaner_code(
                            plan.binary_code_scheme,
                            plan.binary_code_scheme.get_code_with_control_code(
                                Codes.TRUE_MISSING),
                            Metadata.get_call_location())
                        missing_dict[
                            plan.binary_coded_field] = na_label.to_dict()
                elif td[plan.raw_field] == "":
                    nc_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.code_scheme,
                        plan.code_scheme.get_code_with_control_code(
                            Codes.NOT_CODED), Metadata.get_call_location())
                    missing_dict[plan.coded_field] = [nc_label.to_dict()]

                    # When a binary scheme exists, label it as NOT_CODED too
                    if plan.binary_code_scheme is not None:
                        nc_label = CleaningUtils.make_label_from_cleaner_code(
                            plan.binary_code_scheme,
                            plan.binary_code_scheme.get_code_with_control_code(
                                Codes.NOT_CODED), Metadata.get_call_location())
                        missing_dict[plan.binary_coded_field] = [
                            nc_label.to_dict()
                        ]
            td.append_data(
                missing_dict,
                Metadata(user, Metadata.get_call_location(), time.time()))

        # Synchronise the control codes between the binary and reasons schemes:
        # Some RQA datasets have a binary scheme, which is always labelled, and a reasons scheme, which is only labelled
        # if there is an additional reason given. Importing those two schemes separately above caused the labels in
        # each scheme to go out of sync with each other, e.g. reasons can be NR when the binary *was* reviewed.
        # This block updates the reasons scheme in cases where only a binary label was set, by assigning the
        # label 'NC' if the binary label was set to a normal code, otherwise to be the same control code as the binary.
        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            rqa_messages = [td for td in data if plan.raw_field in td]
            if plan.binary_code_scheme is not None:
                for td in rqa_messages:
                    binary_label = td[plan.binary_coded_field]
                    binary_code = plan.binary_code_scheme.get_code_with_id(
                        binary_label["CodeID"])

                    binary_label_present = \
                        binary_label["CodeID"] != \
                        plan.binary_code_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

                    reasons_label_present = \
                        len(td[plan.coded_field]) > 1 or \
                        td[plan.coded_field][0]["CodeID"] != \
                        plan.code_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

                    if binary_label_present and not reasons_label_present:
                        if binary_code.code_type == "Control":
                            control_code = binary_code.control_code
                            reasons_code = plan.code_scheme.get_code_with_control_code(
                                control_code)

                            reasons_label = CleaningUtils.make_label_from_cleaner_code(
                                plan.code_scheme,
                                reasons_code,
                                Metadata.get_call_location(),
                                origin_name="Pipeline Code Synchronisation")

                            td.append_data(
                                {plan.coded_field: [reasons_label.to_dict()]},
                                Metadata(user, Metadata.get_call_location(),
                                         TimeUtils.utc_now_as_iso_string()))
                        else:
                            assert binary_code.code_type == "Normal"

                            nc_label = CleaningUtils.make_label_from_cleaner_code(
                                plan.code_scheme,
                                plan.code_scheme.get_code_with_control_code(
                                    Codes.NOT_CODED),
                                Metadata.get_call_location(),
                                origin_name="Pipeline Code Synchronisation")
                            td.append_data(
                                {plan.coded_field: [nc_label.to_dict()]},
                                Metadata(user, Metadata.get_call_location(),
                                         TimeUtils.utc_now_as_iso_string()))

        # Merge manually coded demog and follow-up survey files into the cleaned dataset
        # The recursion depth limit is currently being exceeded
        # TODO: Investigate/address the cause of this.
        sys.setrecursionlimit(10000)
        for plan in PipelineConfiguration.DEMOGS_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
            f = None
            try:
                coda_input_path = path.join(coda_input_dir, plan.coda_filename)
                if path.exists(coda_input_path):
                    f = open(coda_input_path, "r")
                TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                    user, data, plan.id_field,
                    {plan.coded_field: plan.code_scheme}, f)
            finally:
                if f is not None:
                    f.close()

        # Not everyone will have answered all of the demographic and follow-up survey flows.
        # Label demographic and follow-up survey questions which had no responses as TRUE_MISSING.
        # Label data which is just the empty string as NOT_CODED.
        for td in data:
            missing_dict = dict()
            for plan in PipelineConfiguration.DEMOGS_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
                if plan.raw_field not in td:
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.code_scheme,
                        plan.code_scheme.get_code_with_control_code(
                            Codes.TRUE_MISSING), Metadata.get_call_location())
                    missing_dict[plan.coded_field] = na_label.to_dict()
                elif td[plan.raw_field] == "":
                    nc_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.code_scheme,
                        plan.code_scheme.get_code_with_control_code(
                            Codes.NOT_CODED), Metadata.get_call_location())
                    missing_dict[plan.coded_field] = nc_label.to_dict()
            td.append_data(
                missing_dict,
                Metadata(user, Metadata.get_call_location(), time.time()))

        # Set county/constituency from the coded constituency field.
        cls._impute_location_codes(user, data)

        # Set coding error codes using the coding error field
        cls._impute_coding_error_codes(user, data)

        return data
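Both labeling passes in apply_manual_codes use the same missing-value rule: an absent raw field means the question was never answered (TRUE_MISSING), while a present-but-empty raw field means there was a response with no codable content (NOT_CODED). As a tiny self-contained classifier:

    def missing_class(td, raw_field):
        """'NA' for absent responses, 'NC' for empty ones, None when a real answer exists."""
        if raw_field not in td:
            return "NA"  # TRUE_MISSING
        if td[raw_field] == "":
            return "NC"  # NOT_CODED
        return None

    assert missing_class({}, "gender_raw") == "NA"
    assert missing_class({"gender_raw": ""}, "gender_raw") == "NC"
    assert missing_class({"gender_raw": "female"}, "gender_raw") is None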
Code Example #14
    def remap_radio_shows(cls, user, data, coda_input_dir):
        """
        Remaps radio shows which were in the wrong flow, and therefore have the wrong key/values set, to have the
        key/values they would have had if they had been received by the correct flow.

        :param user: Identifier of the user running this program, for TracedData Metadata.
        :type user: str
        :param data: TracedData objects to move the radio show messages in.
        :type data: iterable of TracedData
        :param coda_input_dir: Directory to read coded coda files from.
        :type coda_input_dir: str
        """
        # TODO: Convert the show remapping code here into reusable functions for each case that they handle.
        #       Note that ultimately we probably don't want to handle the 'WS' show remapping here,
        #       because we get that for free when we implement 'WS' handling properly.

        # Build a map of raw week 3 messages to wrong scheme data
        message_to_s01e02_dict = cls._build_message_to_s01e02_dict(
            user, data, coda_input_dir)

        for td in data:
            mapped_dict = dict()

            if cls.WEEK_3_TIME_KEY in td:
                # Redirect any week 3 messages coded as s01e02 in the WS - Correct Dataset scheme to week 2
                # Also, fake the timestamp of redirected week 3 messages to make it look like they arrived on the day
                # before the incorrect sms ad was sent, i.e. the last day of week 2.
                # This is super yucky, but works because (a) timestamps are never exported, and (b) this date
                # is being set to non_logical anyway in channels.py.
                if message_to_s01e02_dict.get(td["rqa_message"], False):
                    mapped_dict["show_id"] = 2
                    mapped_dict["sent_on"] = "2018-12-15T00:00:00+03:00"

            td.append_data(
                mapped_dict,
                Metadata(user, Metadata.get_call_location(),
                         TimeUtils.utc_now_as_iso_string()))

        # Redirect any week 4 messages which were in the week 3 flow due to a late flow change-over.
        cls._remap_radio_show_by_time_range(user,
                                            data,
                                            cls.WEEK_3_TIME_KEY,
                                            4,
                                            range_start=isoparse(
                                                cls.WEEK_4_START))

        # Redirect any week 2 messages which were in the week 4 flow, due to undelivered messages being delivered
        # in two bursts after the end of the radio shows.
        cls._remap_radio_show_by_time_range(
            user,
            data,
            cls.WEEK_4_TIME_KEY,
            2,
            range_start=isoparse(cls.THURSDAY_BURST_START),
            range_end=isoparse(cls.THURSDAY_BURST_END),
            time_to_adjust_to=isoparse(cls.THURSDAY_CORRECTION_TIME))

        cls._remap_radio_show_by_time_range(
            user,
            data,
            cls.WEEK_4_TIME_KEY,
            2,
            range_start=isoparse(cls.FRIDAY_BURST_START),
            range_end=isoparse(cls.FRIDAY_BURST_END),
            time_to_adjust_to=isoparse(cls.FRIDAY_CORRECTION_TIME))
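_remap_radio_show_by_time_range is not shown here, but from its call sites it presumably remaps a message when its timestamp falls inside an optional half-open window. A hedged sketch of that predicate:

    from dateutil.parser import isoparse

    def in_remap_window(sent_on, range_start=None, range_end=None):
        """True if sent_on falls within [range_start, range_end); either bound may be None."""
        t = isoparse(sent_on)
        if range_start is not None and t < range_start:
            return False
        if range_end is not None and t >= range_end:
            return False
        return True

    assert in_remap_window("2018-12-20T10:00:00+03:00",
                           range_start=isoparse("2018-12-20T00:00:00+03:00"))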
Code Example #15
def fetch_from_recovery_csv(user, google_cloud_credentials_file_path,
                            raw_data_dir, phone_number_uuid_table,
                            recovery_csv_source):
    log.info("Fetching data from a recovery CSV...")
    for blob_url in recovery_csv_source.activation_flow_urls + recovery_csv_source.survey_flow_urls:
        # Take the name between the last '/' and the '.csv' ending
        flow_name = blob_url.split('/')[-1].split('.')[0]
        traced_runs_output_path = f"{raw_data_dir}/{flow_name}.jsonl"
        if os.path.exists(traced_runs_output_path):
            log.info(
                f"File '{traced_runs_output_path}' for blob '{blob_url}' already exists; skipping download"
            )
            continue

        log.info(f"Downloading recovered data from '{blob_url}'...")
        raw_csv_string = StringIO(
            google_cloud_utils.download_blob_to_string(
                google_cloud_credentials_file_path, blob_url))
        raw_data = list(csv.DictReader(raw_csv_string))
        log.info(f"Downloaded {len(raw_data)} recovered messages")

        log.info("Converting the recovered messages to TracedData...")
        traced_runs = []
        for i, row in enumerate(raw_data):
            raw_date = row["ReceivedOn"]
            if len(raw_date) == len("dd/mm/YYYY HH:MM"):
                parsed_raw_date = datetime.strptime(raw_date, "%d/%m/%Y %H:%M")
            else:
                parsed_raw_date = datetime.strptime(raw_date,
                                                    "%d/%m/%Y %H:%M:%S")
            localized_date = pytz.timezone("Africa/Mogadishu").localize(
                parsed_raw_date)

            assert row["Sender"].startswith("avf-phone-uuid-"), \
                f"The 'Sender' column for '{blob_url} contains an item that has not been de-identified " \
                f"into Africa's Voices Foundation's de-identification format. This may be done with de_identify_csv.py."

            d = {
                "avf_phone_id": row["Sender"],
                "message": row["Message"],
                "received_on": localized_date.isoformat(),
                "run_id": SHAUtils.sha_dict(row)
            }

            traced_runs.append(
                TracedData(
                    d,
                    Metadata(user, Metadata.get_call_location(),
                             TimeUtils.utc_now_as_iso_string())))
        log.info("Converted the recovered messages to TracedData")

        if blob_url in recovery_csv_source.activation_flow_urls:
            label_somalia_operator(user, traced_runs, phone_number_uuid_table)

        log.info(
            f"Exporting {len(traced_runs)} TracedData items to {traced_runs_output_path}..."
        )
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
        with open(traced_runs_output_path, "w") as f:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(
                traced_runs, f)
        log.info(f"Exported TracedData")
Code example #16
    def apply_manual_codes(cls, user, data, coda_input_dir):
        # Merge manually coded radio show files into the cleaned dataset
        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            rqa_messages = [td for td in data if plan.raw_field in td]
            coda_input_path = path.join(coda_input_dir, plan.coda_filename)
            print(coda_input_path)

            f = None
            try:
                if path.exists(coda_input_path):
                    f = open(coda_input_path, "r")
                TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                    user, rqa_messages, plan.id_field, {plan.coded_field: plan.code_scheme}, f)

                if plan.binary_code_scheme is not None:
                    if f is not None:
                        f.seek(0)
                    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                        user, rqa_messages, plan.id_field, {plan.binary_coded_field: plan.binary_code_scheme}, f)
            finally:
                if f is not None:
                    f.close()

        # At this point, the TracedData objects still contain messages for at most one week each.
        # Label the weeks for which there is no response as TRUE_MISSING.
        for td in data:
            missing_dict = dict()
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                if plan.raw_field not in td:
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                        Metadata.get_call_location()
                    )
                    missing_dict[plan.coded_field] = [na_label.to_dict()]

                    if plan.binary_code_scheme is not None:
                        na_label = CleaningUtils.make_label_from_cleaner_code(
                            plan.binary_code_scheme, plan.binary_code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                            Metadata.get_call_location()
                        )
                        missing_dict[plan.binary_coded_field] = na_label.to_dict()

            td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

        # Synchronise the control codes between the binary and reasons schemes:
        # Some RQA datasets have a binary scheme, which is always labelled, and a reasons scheme, which is only labelled
        # if there is an additional reason given. Importing those two schemes separately above caused the labels in
        # each scheme to go out of sync with each other, e.g. reasons can be NR when the binary *was* reviewed.
        # This block updates the reasons scheme in cases where only a binary label was set, by assigning the
        # label 'NC' if the binary label was set to a normal code, otherwise to be the same control code as the binary.
        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            rqa_messages = [td for td in data if plan.raw_field in td]
            if plan.binary_code_scheme is not None:
                for td in rqa_messages:
                    binary_label = td[plan.binary_coded_field]
                    binary_code = plan.binary_code_scheme.get_code_with_id(binary_label["CodeID"])

                    binary_label_present = \
                        binary_label["CodeID"] != \
                        plan.binary_code_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

                    reasons_label_present = \
                        len(td[plan.coded_field]) > 1 or \
                        td[plan.coded_field][0]["CodeID"] != \
                        plan.code_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

                    if binary_label_present and not reasons_label_present:
                        if binary_code.code_type == "Control":
                            control_code = binary_code.control_code
                            reasons_code = plan.code_scheme.get_code_with_control_code(control_code)

                            reasons_label = CleaningUtils.make_label_from_cleaner_code(
                                plan.code_scheme, reasons_code,
                                Metadata.get_call_location(), origin_name="Pipeline Code Synchronisation")

                            td.append_data(
                                {plan.coded_field: [reasons_label.to_dict()]},
                                Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())
                            )
                        else:
                            assert binary_code.code_type == "Normal"

                            nc_label = CleaningUtils.make_label_from_cleaner_code(
                                plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                                Metadata.get_call_location(), origin_name="Pipeline Code Synchronisation"
                            )
                            td.append_data(
                                {plan.coded_field: [nc_label.to_dict()]},
                                Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())
                            )

        # Not everyone will have answered all of the demographic flows.
        # Label demographic questions which had no responses as TRUE_MISSING.
        for td in data:
            missing_dict = dict()
            for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
                if td.get(plan.raw_field, "") == "":
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                        Metadata.get_call_location()
                    )
                    missing_dict[plan.coded_field] = na_label.to_dict()
            td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

        return data
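
The synchronisation rule in the block above can be distilled into a small decision function. This is a sketch for illustration only: the function name is hypothetical, and `Codes` is assumed to come from core_data_modules as in the rest of the pipeline.

from core_data_modules.cleaners import Codes  # assumed import path


def reasons_control_code_for(binary_code):
    # A control code on the binary label (e.g. STOP) propagates unchanged
    # to the reasons scheme.
    if binary_code.code_type == "Control":
        return binary_code.control_code
    # A normal binary answer with no reason given maps to 'not coded' (NC).
    assert binary_code.code_type == "Normal"
    return Codes.NOT_CODED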
Code example #17
    def move_wrong_scheme_messages(user, data, coda_input_dir):
        log.info("Importing manually coded Coda files to '_WS' fields...")
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.coda_filename is None:
                continue

            for td in data:
                if plan.raw_field in td:
                    td.append_data(
                        {f"{plan.id_field}_WS": plan.message_id_fn(td)},
                        Metadata(user, Metadata.get_call_location(),
                                 TimeUtils.utc_now_as_iso_string()))

            with open(f"{coda_input_dir}/{plan.coda_filename}") as f:
                TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                    user, data, f"{plan.id_field}_WS", {
                        f"{plan.raw_field}_WS_correct_dataset":
                        PipelineConfiguration.WS_CORRECT_DATASET_SCHEME
                    }, f)

            for cc in plan.coding_configurations:
                with open(f"{coda_input_dir}/{plan.coda_filename}") as f:
                    if cc.coding_mode == CodingModes.SINGLE:
                        TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                            user, data, plan.id_field + "_WS",
                            {f"{cc.coded_field}_WS": cc.code_scheme}, f)
                    else:
                        assert cc.coding_mode == CodingModes.MULTIPLE
                        TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                            user, data, f"{plan.id_field}_WS",
                            {f"{cc.coded_field}_WS": cc.code_scheme}, f)

        log.info("Checking for WS Coding Errors...")
        # Check for coding errors
        for td in data:
            for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
                rqa_codes = []
                for cc in plan.coding_configurations:
                    if cc.coding_mode == CodingModes.SINGLE:
                        if f"{cc.coded_field}_WS" in td:
                            label = td[f"{cc.coded_field}_WS"]
                            rqa_codes.append(
                                cc.code_scheme.get_code_with_code_id(
                                    label["CodeID"]))
                    else:
                        assert cc.coding_mode == CodingModes.MULTIPLE
                        for label in td.get(f"{cc.coded_field}_WS", []):
                            rqa_codes.append(
                                cc.code_scheme.get_code_with_code_id(
                                    label["CodeID"]))

                has_ws_code_in_code_scheme = False
                for code in rqa_codes:
                    if code.control_code == Codes.WRONG_SCHEME:
                        has_ws_code_in_code_scheme = True

                has_ws_code_in_ws_scheme = False
                if f"{plan.raw_field}_WS_correct_dataset" in td:
                    ws_code = PipelineConfiguration.WS_CORRECT_DATASET_SCHEME.get_code_with_code_id(
                        td[f"{plan.raw_field}_WS_correct_dataset"]["CodeID"])
                    has_ws_code_in_ws_scheme = ws_code.code_type == "Normal" or ws_code.control_code == Codes.NOT_CODED

                if has_ws_code_in_code_scheme != has_ws_code_in_ws_scheme:
                    log.warning(
                        f"Coding Error: {plan.raw_field}: {td[plan.raw_field]}"
                    )
                    coding_error_dict = {
                        f"{plan.raw_field}_WS_correct_dataset":
                        CleaningUtils.make_label_from_cleaner_code(
                            PipelineConfiguration.WS_CORRECT_DATASET_SCHEME,
                            PipelineConfiguration.WS_CORRECT_DATASET_SCHEME.
                            get_code_with_control_code(Codes.CODING_ERROR),
                            Metadata.get_call_location(),
                        ).to_dict()
                    }
                    td.append_data(
                        coding_error_dict,
                        Metadata(user, Metadata.get_call_location(),
                                 time.time()))

        # Construct a map from WS normal code id to the raw field that code indicates a requested move to.
        ws_code_to_raw_field_map = dict()
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.ws_code is not None:
                ws_code_to_raw_field_map[plan.ws_code.code_id] = plan.raw_field

        # Group the TracedData by uid.
        data_grouped_by_uid = dict()
        for td in data:
            uid = td["uid"]
            if uid not in data_grouped_by_uid:
                data_grouped_by_uid[uid] = []
            data_grouped_by_uid[uid].append(td)

        # Perform the WS correction for each uid.
        log.info("Performing WS correction...")
        corrected_data = []  # List of TracedData with the WS data moved.
        # 'WS - Correct Dataset' codes with no matching code id in any coding plan
        # for this project, with a count of the occurrences.
        unknown_target_code_counts = dict()
        for group in data_grouped_by_uid.values():
            # Find all the surveys data being moved.
            # (Note: we only need to check one td in this group because all the demographics are the same)
            td = group[0]
            survey_moves = dict()  # of source_field -> target_field
            for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
                if plan.raw_field not in td or plan.coda_filename is None:
                    continue
                ws_code = PipelineConfiguration.WS_CORRECT_DATASET_SCHEME.get_code_with_code_id(
                    td[f"{plan.raw_field}_WS_correct_dataset"]["CodeID"])
                if ws_code.code_type == "Normal" or ws_code.control_code == Codes.NOT_CODED:
                    if ws_code.code_id in ws_code_to_raw_field_map:
                        survey_moves[plan.raw_field] = ws_code_to_raw_field_map[ws_code.code_id]
                    else:
                        code_key = (ws_code.code_id, ws_code.display_text)
                        if code_key not in unknown_target_code_counts:
                            unknown_target_code_counts[code_key] = 0
                        unknown_target_code_counts[code_key] += 1
                        survey_moves[plan.raw_field] = None

            # Find all the RQA data being moved.
            rqa_moves = dict()  # of (index in group, source_field) -> target_field
            for i, td in enumerate(group):
                for plan in PipelineConfiguration.RQA_CODING_PLANS:
                    if plan.raw_field not in td or plan.coda_filename is None:
                        continue
                    ws_code = PipelineConfiguration.WS_CORRECT_DATASET_SCHEME.get_code_with_code_id(
                        td[f"{plan.raw_field}_WS_correct_dataset"]["CodeID"])
                    if ws_code.code_type == "Normal" or ws_code.control_code == Codes.NOT_CODED:
                        if ws_code.code_id in ws_code_to_raw_field_map:
                            rqa_moves[(i, plan.raw_field)] = ws_code_to_raw_field_map[ws_code.code_id]
                        else:
                            code_key = (ws_code.code_id, ws_code.display_text)
                            if code_key not in unknown_target_code_counts:
                                unknown_target_code_counts[code_key] = 0
                            unknown_target_code_counts[code_key] += 1
                            rqa_moves[(i, plan.raw_field)] = None

            # Build a dictionary of the survey fields that haven't been moved, and cleared fields for those which have.
            survey_updates = dict()  # of raw_field -> updated value
            for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
                if plan.coda_filename is None:
                    continue

                if plan.raw_field in survey_moves.keys():
                    # Data is moving
                    survey_updates[plan.raw_field] = []
                elif plan.raw_field in td:
                    # Data is not moving
                    survey_updates[plan.raw_field] = [
                        _WSUpdate(td[plan.raw_field], td[plan.time_field],
                                  plan.raw_field, td)
                    ]

            # Build a list of the rqa fields that haven't been moved.
            rqa_updates = []  # of (raw_field, _WSUpdate)
            for i, td in enumerate(group):
                for plan in PipelineConfiguration.RQA_CODING_PLANS:
                    if plan.coda_filename is None:
                        continue

                    if plan.raw_field in td:
                        if (i, plan.raw_field) in rqa_moves.keys():
                            # Data is moving
                            pass
                        else:
                            # Data is not moving
                            rqa_updates.append(
                                (plan.raw_field,
                                 _WSUpdate(td[plan.raw_field],
                                           td[plan.time_field], plan.raw_field,
                                           td)))

            # Add data moving from survey fields to the relevant survey_/rqa_updates
            raw_survey_fields = {plan.raw_field for plan in PipelineConfiguration.SURVEY_CODING_PLANS}
            raw_rqa_fields = {plan.raw_field for plan in PipelineConfiguration.RQA_CODING_PLANS}
            for plan in PipelineConfiguration.SURVEY_CODING_PLANS + PipelineConfiguration.RQA_CODING_PLANS:
                if plan.raw_field not in survey_moves:
                    continue

                target_field = survey_moves[plan.raw_field]
                if target_field is None:
                    continue

                update = _WSUpdate(td[plan.raw_field], td[plan.time_field],
                                   plan.raw_field, td)
                if target_field in raw_survey_fields:
                    survey_updates[target_field] = survey_updates.get(
                        target_field, []) + [update]
                else:
                    assert target_field in raw_rqa_fields, f"Raw field '{target_field}' not in any coding plan"
                    rqa_updates.append((target_field, update))

            # Add data moving from RQA fields to the relevant survey_/rqa_updates
            for (i, source_field), target_field in rqa_moves.items():
                if target_field is None:
                    continue

                for plan in PipelineConfiguration.SURVEY_CODING_PLANS + PipelineConfiguration.RQA_CODING_PLANS:
                    if plan.raw_field == source_field:
                        _td = group[i]
                        update = _WSUpdate(_td[plan.raw_field],
                                           _td[plan.time_field],
                                           plan.raw_field, _td)
                        if target_field in raw_survey_fields:
                            survey_updates[target_field] = survey_updates.get(
                                target_field, []) + [update]
                        else:
                            assert target_field in raw_rqa_fields, f"Raw field '{target_field}' not in any coding plan"
                            rqa_updates.append((target_field, update))

            # Re-format the survey updates to a form suitable for use by the rest of the pipeline
            flattened_survey_updates = {}
            for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
                if plan.raw_field in survey_updates:
                    plan_updates = survey_updates[plan.raw_field]

                    if len(plan_updates) > 0:
                        flattened_survey_updates[plan.raw_field] = \
                            "; ".join([u.message for u in plan_updates])
                        flattened_survey_updates[plan.time_field] = \
                            sorted([u.timestamp for u in plan_updates])[0]
                        flattened_survey_updates[f"{plan.raw_field}_source"] = \
                            "; ".join([u.source_field for u in plan_updates])
                    else:
                        flattened_survey_updates[plan.raw_field] = None
                        flattened_survey_updates[plan.time_field] = None
                        flattened_survey_updates[f"{plan.raw_field}_source"] = None

            # For each RQA message, create a copy of its source td, append the updated TracedData, and add this to
            # the list of TracedData to be returned
            raw_field_to_rqa_plan_map = {
                plan.raw_field: plan
                for plan in PipelineConfiguration.RQA_CODING_PLANS
            }
            for target_field, update in rqa_updates:
                corrected_td = update.source_td.copy()

                # Hide the survey keys currently in the TracedData which have had data moved away.
                corrected_td.hide_keys(
                    {k for k, v in flattened_survey_updates.items() if v is None}.intersection(corrected_td.keys()),
                    Metadata(user, Metadata.get_call_location(), time.time()))

                # Update with the corrected survey data
                corrected_td.append_data(
                    {k: v for k, v in flattened_survey_updates.items() if v is not None},
                    Metadata(user, Metadata.get_call_location(), time.time()))

                # Hide all the RQA fields (they will be added back, in turn, in the next step).
                corrected_td.hide_keys(
                    {plan.raw_field for plan in PipelineConfiguration.RQA_CODING_PLANS}.intersection(corrected_td.keys()),
                    Metadata(user, Metadata.get_call_location(), time.time()))
                corrected_td.hide_keys(
                    {plan.time_field for plan in PipelineConfiguration.RQA_CODING_PLANS}.intersection(corrected_td.keys()),
                    Metadata(user, Metadata.get_call_location(), time.time()))

                target_coding_plan = raw_field_to_rqa_plan_map[target_field]

                rqa_dict = {
                    target_field: update.message,
                    target_coding_plan.time_field: update.timestamp,
                    f"{target_field}_source": update.source_field
                }

                corrected_td.append_data(
                    rqa_dict,
                    Metadata(user, Metadata.get_call_location(), time.time()))
                corrected_data.append(corrected_td)

        if len(unknown_target_code_counts) > 0:
            log.warning("Found the following 'WS - Correct Dataset' CodeIDs with no matching coding plan:")
            for (code_id, display_text), count in unknown_target_code_counts.items():
                log.warning(f"  '{code_id}' (DisplayText '{display_text}') ({count} occurrences)")

        return corrected_data
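
`_WSUpdate` is referenced throughout this example but not defined in the excerpt. From the way it is used (`u.message`, `u.timestamp`, `u.source_field`, `update.source_td`) it is a simple container; here is a minimal sketch consistent with that usage, though the real definition may differ.

class _WSUpdate(object):
    # Bundles a message being moved with its timestamp, the raw field it
    # came from, and the TracedData item it originated in.
    def __init__(self, message, timestamp, source_field, source_td):
        self.message = message
        self.timestamp = timestamp
        self.source_field = source_field
        self.source_td = source_td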
Code example #18
def fetch_from_rapid_pro(user, google_cloud_credentials_file_path, raw_data_dir, phone_number_uuid_table,
                         rapid_pro_source):
    log.info("Fetching data from Rapid Pro...")
    log.info("Downloading Rapid Pro access token...")
    rapid_pro_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, rapid_pro_source.token_file_url).strip()

    rapid_pro = RapidProClient(rapid_pro_source.domain, rapid_pro_token)

    # Load the previous export of contacts if it exists, otherwise fetch all contacts from Rapid Pro.
    raw_contacts_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_raw.json"
    contacts_log_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_log.jsonl"
    try:
        log.info(f"Loading raw contacts from file '{raw_contacts_path}'...")
        with open(raw_contacts_path) as raw_contacts_file:
            raw_contacts = [Contact.deserialize(contact_json) for contact_json in json.load(raw_contacts_file)]
        log.info(f"Loaded {len(raw_contacts)} contacts")
    except FileNotFoundError:
        log.info(f"File '{raw_contacts_path}' not found, will fetch all contacts from the Rapid Pro server")
        with open(contacts_log_path, "a") as contacts_log_file:
            raw_contacts = rapid_pro.get_raw_contacts(raw_export_log_file=contacts_log_file)

    # Download all the runs for each of the radio shows
    for flow in rapid_pro_source.activation_flow_names + rapid_pro_source.survey_flow_names:
        runs_log_path = f"{raw_data_dir}/{flow}_log.jsonl"
        raw_runs_path = f"{raw_data_dir}/{flow}_raw.json"
        traced_runs_output_path = f"{raw_data_dir}/{flow}.jsonl"
        log.info(f"Exporting flow '{flow}' to '{traced_runs_output_path}'...")

        flow_id = rapid_pro.get_flow_id(flow)

        # Load the previous export of runs for this flow, and update them with the newest runs.
        # If there is no previous export for this flow, fetch all the runs from Rapid Pro.
        with open(runs_log_path, "a") as raw_runs_log_file:
            try:
                log.info(f"Loading raw runs from file '{raw_runs_path}'...")
                with open(raw_runs_path) as raw_runs_file:
                    raw_runs = [Run.deserialize(run_json) for run_json in json.load(raw_runs_file)]
                log.info(f"Loaded {len(raw_runs)} runs")
                raw_runs = rapid_pro.update_raw_runs_with_latest_modified(
                    flow_id, raw_runs, raw_export_log_file=raw_runs_log_file, ignore_archives=True)
            except FileNotFoundError:
                log.info(f"File '{raw_runs_path}' not found, will fetch all runs from the Rapid Pro server for flow '{flow}'")
                raw_runs = rapid_pro.get_raw_runs_for_flow_id(flow_id, raw_export_log_file=raw_runs_log_file)

        # Fetch the latest contacts from Rapid Pro.
        with open(contacts_log_path, "a") as raw_contacts_log_file:
            raw_contacts = rapid_pro.update_raw_contacts_with_latest_modified(raw_contacts,
                                                                              raw_export_log_file=raw_contacts_log_file)

        # Convert the runs to TracedData.
        traced_runs = rapid_pro.convert_runs_to_traced_data(
            user, raw_runs, raw_contacts, phone_number_uuid_table, rapid_pro_source.test_contact_uuids)

        if flow in rapid_pro_source.activation_flow_names:
            # Append the Rapid Pro source name to each run.
            # Only do this for activation flows because this is the only place where this is interesting.
            # Also, demogs may come from either instance, which causes problems downstream.
            for td in traced_runs:
                td.append_data({
                    "source_raw": rapid_pro_source.source_name,
                    "source_coded": CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.SOURCE, CodeSchemes.SOURCE.get_code_with_match_value(rapid_pro_source.source_name),
                        Metadata.get_call_location()
                    ).to_dict()
                }, Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

        log.info(f"Saving {len(raw_runs)} raw runs to {raw_runs_path}...")
        with open(raw_runs_path, "w") as raw_runs_file:
            json.dump([run.serialize() for run in raw_runs], raw_runs_file)
        log.info(f"Saved {len(raw_runs)} raw runs")

        log.info(f"Saving {len(traced_runs)} traced runs to {traced_runs_output_path}...")
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
        with open(traced_runs_output_path, "w") as traced_runs_output_file:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(traced_runs, traced_runs_output_file)
        log.info(f"Saved {len(traced_runs)} traced runs")

    log.info(f"Saving {len(raw_contacts)} raw contacts to file '{raw_contacts_path}'...")
    with open(raw_contacts_path, "w") as raw_contacts_file:
        json.dump([contact.serialize() for contact in raw_contacts], raw_contacts_file)
    log.info(f"Saved {len(raw_contacts)} contacts")