def label_somalia_operator(user, traced_runs, phone_number_uuid_table):
    """
    Appends a raw operator prefix and a coded operator label to each traced run.

    The phone number for each run is recovered from its "avf_phone_id" via a single batched lookup on
    `phone_number_uuid_table`. Each run is then updated in place with the keys "operator_raw" and
    "operator_coded".

    :param user: Identifier of the user running this program, recorded in the appended Metadata.
    :param traced_runs: Collection of objects supporting `td["avf_phone_id"]` and `td.append_data(...)`.
                        It is iterated twice.
    :param phone_number_uuid_table: Object providing `uuid_to_data_batch(uuids)`, whose result is indexed
                                    by "avf_phone_id" values.
    """
    phone_ids = set()
    for run in traced_runs:
        phone_ids.add(run["avf_phone_id"])
    phones_by_uuid = phone_number_uuid_table.uuid_to_data_batch(phone_ids)

    for run in traced_runs:
        # Only the first five characters are kept; per the original author's note these are the
        # country code 252 followed by the next two digits.
        prefix = phones_by_uuid[run["avf_phone_id"]][:5]
        cleaned = PhoneCleaner.clean_operator(prefix)

        # Pick the code first, then build the label once for both outcomes.
        scheme = CodeSchemes.SOMALIA_OPERATOR
        if cleaned == Codes.NOT_CODED:
            code = scheme.get_code_with_control_code(Codes.NOT_CODED)
        else:
            code = scheme.get_code_with_match_value(cleaned)
        label = CleaningUtils.make_label_from_cleaner_code(scheme, code, Metadata.get_call_location())

        appended = {
            "operator_raw": prefix,
            "operator_coded": label.to_dict()
        }
        run.append_data(
            appended,
            Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())
        )
Exemple #2
0
    def clean_operator(urn):
        """
        Determines the operator for a URN of the form "<scheme>:<path>".

        "tel:" URNs are passed, unmodified, to `PhoneCleaner.clean_operator`; "deleted:" URNs give
        `Codes.DELETED`; for every other URN the text before the first colon is returned.

        >>> URNCleaner.clean_operator("tel:+25261123123")
        'hormud'
        >>> URNCleaner.clean_operator("tel:+2547123123")
        'kenyan telephone'
        >>> URNCleaner.clean_operator("telegram:123456")
        'telegram'
        >>> URNCleaner.clean_operator("+25261123123")  # (Not a valid urn)
        Traceback (most recent call last):
          File "<stdin>", line 1, in <module>
        ValueError

        :param urn: URN to determine the operator of.
        :type urn: str
        :return: URN operator.
        :rtype: str
        :raises ValueError: If `urn` contains no ":" separator.
        """
        if ":" not in urn:
            raise ValueError

        # Everything before the first colon, e.g. 'tel', 'deleted', 'telegram', 'twitter'.
        scheme = urn.partition(":")[0]

        if scheme == "tel":
            # The operator name comes from the phone number; the full urn is handed over as-is.
            return PhoneCleaner.clean_operator(urn)
        if scheme == "deleted":
            return Codes.DELETED
        # Any other channel type is its own operator name.
        return scheme
Exemple #3
0
    def convert_runs_to_traced_data(user, raw_runs, raw_contacts, phone_uuids, test_contacts=None):
        """
        Converts raw data fetched from Rapid Pro to TracedData.

        Runs are skipped, with a warning, when their contact is absent from `raw_contacts` or has no URNs.
        Every remaining run is identified by the UUID which `phone_uuids` returns for the normalised form of
        the contact's first URN.

        :param user: Identifier of the user running this program, for TracedData Metadata.
        :type user: str
        :param raw_runs: Raw run objects to convert to TracedData.
        :type raw_runs: list of temba_client.v2.types.Run
        :param raw_contacts: Raw contact objects to use when converting to TracedData.
        :type raw_contacts: list of temba_client.v2.types.Contact
        :param phone_uuids: Phone number <-> UUID table.
        :type phone_uuids: id_infrastructure.firestore_uuid_table.FirestoreUuidTable
        :param test_contacts: Rapid Pro contact UUIDs of test contacts.
                              Runs from any of those test contacts will be tagged with {'test_run': True}
        :type test_contacts: list of str | None
        :return: Raw data fetched from Rapid Pro converted to TracedData.
        :rtype: list of TracedData
        :raises AssertionError: If a contact which is not a test contact has more than one URN.
        """
        # Membership is tested once per run, so build a set once instead of scanning a list every time.
        test_contact_uuids = frozenset() if test_contacts is None else frozenset(test_contacts)

        log.info(f"Converting {len(raw_runs)} raw runs to TracedData...")

        contacts_lut = {c.uuid: c for c in raw_contacts}

        # Each usable run is kept together with its contact's URNs and the normalised first URN, so that
        # neither has to be looked up or recomputed after the batched UUID lookup below.
        usable_runs = []
        for run in raw_runs:
            if run.contact.uuid not in contacts_lut:
                # Sometimes contact uuids which appear in `runs` do not appear in `contact_runs`.
                # This has only been observed for contacts which were created very recently.
                # The run is skipped in this case; it should be included next time this script is executed.
                log.warning(f"Run found with Rapid Pro Contact UUID '{run.contact.uuid}', "
                            f"but this id is not present in the downloaded contacts")
                continue

            contact_urns = contacts_lut[run.contact.uuid].urns
            if len(contact_urns) == 0:
                log.warning(f"Ignoring contact with no urn. URNs: {contact_urns} "
                            f"(Rapid Pro Contact UUID: {run.contact.uuid})")
                continue

            usable_runs.append((run, contact_urns, PhoneCleaner.normalise_phone(contact_urns[0])))

        # One call for all runs; duplicates are passed through exactly as they were collected.
        phone_to_uuid_lut = phone_uuids.data_to_uuid_batch([phone for _, _, phone in usable_runs])

        traced_runs = []
        for run, contact_urns, phone in usable_runs:
            flow_name = run.flow.name
            run_dict = {
                "avf_phone_id": phone_to_uuid_lut[phone],
                f"run_id - {flow_name}": run.id
            }

            for category, response in run.values.items():
                prefix = category.title()
                suffix = " - " + flow_name
                run_dict[prefix + " (Category)" + suffix] = response.category
                run_dict[prefix + " (Value)" + suffix] = response.value
                # Convert from "input" to "text" here to match terminology in Rapid Pro's Excel exports.
                run_dict[prefix + " (Text)" + suffix] = response.input
                run_dict[prefix + " (Name)" + suffix] = response.name
                run_dict[prefix + " (Time)" + suffix] = response.time.isoformat()
                run_dict[prefix + " (Run ID)" + suffix] = run.id

            if run.contact.uuid in test_contact_uuids:
                run_dict["test_run"] = True
            else:
                # NOTE: kept as an assert so callers still see AssertionError; be aware that asserts are
                # removed when Python runs with -O.
                assert len(contact_urns) == 1, \
                    f"A non-test contact has multiple URNs (Rapid Pro Contact UUID: {run.contact.uuid})"

            run_dict[f"run_created_on - {flow_name}"] = run.created_on.isoformat()
            run_dict[f"run_modified_on - {flow_name}"] = run.modified_on.isoformat()
            run_dict[f"run_exited_on - {flow_name}"] = None if run.exited_on is None else run.exited_on.isoformat()
            run_dict[f"run_exit_type - {flow_name}"] = run.exit_type

            traced_runs.append(
                TracedData(run_dict, Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())))

        log.info(f"Converted {len(traced_runs)} raw runs to TracedData")

        return traced_runs
Exemple #4
0
    # Use the time frame given in `args` when it is truthy; otherwise `time_frame` must already have been
    # bound by code that is not visible in this fragment.
    if args.time_frame:
        time_frame = args.time_frame

    # Load the serialised messages from JSON and deserialise each entry into a Message.
    with open(raw_messages_input_file_path, mode="r") as f:
        log.info(f"Loading messages from {raw_messages_input_file_path}...")
        # NOTE(review): `input` shadows the built-in function of the same name in this scope.
        input = json.load(f)
        messages = [Message.deserialize(val) for val in input]
        log.info(f"Loaded {len(messages)} messages")

    # Filter messages based on the target operator and target direction of the message.
    log.info(f"Filtering messages based on {target_operator} and "
             f"message direction as '{target_message_direction}' from {len(messages)} total messages ")
    filtered_messages = []
    for msg in messages:
        if msg.urn.startswith("tel:"):
            # For telephone URNs, the operator is derived from the part after the "tel:" prefix.
            operator = PhoneCleaner.clean_operator(msg.urn.split(":")[1])
        else:
            # For any other URN, the text before the first colon is used as the operator.
            operator = msg.urn.split(":")[0]
        if operator == target_operator and msg.direction == target_message_direction:
            # NOTE(review): `msg_direction` is not read again anywhere in the visible code.
            msg_direction = msg.direction
            filtered_messages.append(msg)
    log.info(f"returning {len(filtered_messages)} messages")

    # Build a timedelta from the hour, minute and second attributes of `time_frame`.
    time_interval = timedelta(hours=time_frame.hour,
                              minutes=time_frame.minute, seconds=time_frame.second)

    date_time_bounds = date_time_range(start_date, end_date, time_interval)

    # Compute the number of messages between two firebase time bounds, i.e. `PreviousMessageTimestamp` and
    # `NextMessageTimestamp`, to get the number of messages in each firebase period and relate
    # each quantity with the operator and the message direction.
    # NOTE(review): the code this comment describes is not visible here. The statements that follow
    # initialise a Firestore UUID table and de-identify a CSV column instead, so this looks like the
    # start of a separate snippet.
    phone_number_uuid_table = FirestoreUuidTable(
        pipeline_configuration.phone_number_uuid_table.table_name,
        firestore_uuid_table_credentials,
        "avf-phone-uuid-"
    )
    log.info("Initialised the Firestore UUID table")

    log.info(f"Loading csv from '{csv_input_path}'...")
    # The 'utf-8-sig' codec drops a leading byte-order mark, if present, while decoding.
    with open(csv_input_path, "r", encoding='utf-8-sig') as f:
        raw_data = list(csv.DictReader(f))
    log.info(f"Loaded {len(raw_data)} rows")

    # Normalise in place so that the lookup keys below match the values sent to the UUID table.
    log.info(f"Normalising phone numbers in column '{column_to_de_identify}'...")
    for row in raw_data:
        row[column_to_de_identify] = PhoneCleaner.normalise_phone(row[column_to_de_identify])

    log.info(f"De-identifying column '{column_to_de_identify}'...")
    phone_numbers = [row[column_to_de_identify] for row in raw_data]

    # One batched lookup for every row, then replace each normalised number with its UUID.
    phone_to_uuid_lut = phone_number_uuid_table.data_to_uuid_batch(phone_numbers)
    for row in raw_data:
        row[column_to_de_identify] = phone_to_uuid_lut[row[column_to_de_identify]]

    log.info(f"Exporting {len(raw_data)} de-identified rows to {de_identified_csv_output_path}...")
    # NOTE(review): the csv module documentation recommends opening output files with newline="";
    # without it, extra blank lines can appear on platforms that translate "\n" on write.
    with open(de_identified_csv_output_path, "w") as f:
        # NOTE(review): `raw_data[0]` raises IndexError when the input CSV has no data rows.
        writer = csv.DictWriter(f, fieldnames=raw_data[0].keys())
        writer.writeheader()

        for row in raw_data:
            writer.writerow(row)
Exemple #6
0
    def auto_code_surveys(cls, user, data, phone_uuid_table, coda_output_dir):
        """
        Auto-codes survey data in place and writes Coda files for manual verification and coding.

        The steps run in this order:
          1. For every plan in `PipelineConfiguration.SURVEY_CODING_PLANS` whose raw field is absent or
             equal to "", a TRUE_MISSING label is appended under the plan's coded field.
          2. Every plan which has a cleaner is applied to the whole of `data`.
          3. Where "mogadishu_sub_district_coded" holds a code of type "Normal", "district_coded" is set to
             a NOT_CODED label, because only one column should have a value set in Coda.
          4. "operator_coded" is set from the phone number which `phone_uuid_table` returns for `td["uid"]`.
          5. One Coda file per plan (except the "mogadishu_sub_district_raw" plan) plus a combined
             "location.json" are written into `coda_output_dir`.

        :param cls: Unused in this body.
        :param user: Identifier of the user running this program, passed to Metadata and to
                     message id computation.
        :param data: Collection of TracedData-like objects. It is iterated several times and updated
                     in place through `append_data`.
        :param phone_uuid_table: Object providing `get_phone(uid)`.
        :param coda_output_dir: Directory to write the Coda files to; created if it does not exist.
        :return: The same `data` object which was passed in.
        """
        coding_plans = PipelineConfiguration.SURVEY_CODING_PLANS

        # Label missing data
        for td in data:
            true_missing_labels = {}
            for plan in coding_plans:
                raw_value = td.get(plan.raw_field, "")
                if raw_value == "":
                    label = CleaningUtils.make_label_from_cleaner_code(
                        plan.code_scheme,
                        plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                        Metadata.get_call_location()
                    )
                    true_missing_labels[plan.coded_field] = label.to_dict()
            # Appended even when empty, exactly once per item.
            td.append_data(true_missing_labels, Metadata(user, Metadata.get_call_location(), time.time()))

        # Auto-code remaining data
        for plan in coding_plans:
            if plan.cleaner is None:
                continue
            CleaningUtils.apply_cleaner_to_traced_data_iterable(
                user, data, plan.raw_field, plan.coded_field, plan.cleaner, plan.code_scheme
            )

        # For any locations where the cleaners assigned a code to a sub district, set the district code to NC
        # (this is because only one column should have a value set in Coda)
        for td in data:
            if "mogadishu_sub_district_coded" not in td:
                continue

            sub_district_scheme = CodeSchemes.MOGADISHU_SUB_DISTRICT
            assigned_code_id = td["mogadishu_sub_district_coded"]["CodeID"]
            if sub_district_scheme.get_code_with_id(assigned_code_id).code_type == "Normal":
                not_coded_label = CleaningUtils.make_label_from_cleaner_code(
                    sub_district_scheme,
                    sub_district_scheme.get_code_with_control_code(Codes.NOT_CODED),
                    Metadata.get_call_location()
                )
                td.append_data(
                    {"district_coded": not_coded_label.to_dict()},
                    Metadata(user, Metadata.get_call_location(), time.time())
                )

        # Set operator from phone number
        for td in data:
            operator = PhoneCleaner.clean_operator(phone_uuid_table.get_phone(td["uid"]))

            # Choose the code first so that the label is built in a single place.
            operator_scheme = CodeSchemes.OPERATOR
            if operator == Codes.NOT_CODED:
                operator_code = operator_scheme.get_code_with_control_code(Codes.NOT_CODED)
            else:
                operator_code = operator_scheme.get_code_with_match_value(operator)
            operator_label = CleaningUtils.make_label_from_cleaner_code(
                operator_scheme, operator_code, Metadata.get_call_location()
            )

            td.append_data(
                {"operator_coded": operator_label.to_dict()},
                Metadata(user, Metadata.get_call_location(), time.time())
            )

        # Output single-scheme answers to coda for manual verification + coding
        IOUtils.ensure_dirs_exist(coda_output_dir)
        for plan in coding_plans:
            # The sub-district plan is exported with the combined location scheme further down.
            if plan.raw_field == "mogadishu_sub_district_raw":
                continue

            TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

            plan_schemes = {plan.coded_field: plan.code_scheme}
            with open(path.join(coda_output_dir, plan.coda_filename), "w") as f:
                TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                    data, plan.raw_field, plan.time_field, plan.id_field, plan_schemes, f
                )

        # Output location scheme to coda for manual verification + coding
        location_output_path = path.join(coda_output_dir, "location.json")
        TracedDataCodaV2IO.compute_message_ids(
            user, data, "mogadishu_sub_district_raw", "mogadishu_sub_district_raw_id"
        )
        location_schemes = {
            "mogadishu_sub_district_coded": CodeSchemes.MOGADISHU_SUB_DISTRICT,
            "district_coded": CodeSchemes.DISTRICT,
            "region_coded": CodeSchemes.REGION,
            "state_coded": CodeSchemes.STATE,
            "zone_coded": CodeSchemes.ZONE
        }
        with open(location_output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, "mogadishu_sub_district_raw", "mogadishu_sub_district_time",
                "mogadishu_sub_district_raw_id", location_schemes, f
            )

        return data
        # NOTE(review): as pasted, these statements follow a `return data` at the same indentation and so
        # would never execute; they appear to belong to a different function whose header is not shown.

        # Refresh the raw contacts, appending the raw export to the contacts log file.
        with open(contacts_log_path, "a") as raw_contacts_log_file:
            raw_contacts = rapid_pro.update_raw_contacts_with_latest_modified(
                raw_contacts, raw_export_log_file=raw_contacts_log_file)

        # Convert the runs to TracedData.
        traced_runs = rapid_pro.convert_runs_to_traced_data(
            user, raw_runs, raw_contacts, phone_number_uuid_table,
            pipeline_configuration.rapid_pro_test_contact_uuids)

        # Set the operator codes for each message.
        # Only runs from flows listed in `activation_flow_names` are labelled.
        if flow in pipeline_configuration.activation_flow_names:
            # Look up the phone number of every run with one batched call.
            uuids = {td["avf_phone_id"] for td in traced_runs}
            uuid_to_phone_lut = phone_number_uuid_table.uuid_to_data_batch(
                uuids)
            for td in traced_runs:
                # The complete looked-up value is passed to the cleaner here.
                operator_code = PhoneCleaner.clean_operator(
                    uuid_to_phone_lut[td["avf_phone_id"]])
                if operator_code == Codes.NOT_CODED:
                    # NOT_CODED is a control code, so it is resolved via the control-code lookup.
                    operator_label = CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.SOMALIA_OPERATOR,
                        CodeSchemes.SOMALIA_OPERATOR.
                        get_code_with_control_code(Codes.NOT_CODED),
                        Metadata.get_call_location())
                else:
                    # Any other cleaner output is resolved by match value.
                    operator_label = CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.SOMALIA_OPERATOR,
                        CodeSchemes.SOMALIA_OPERATOR.get_code_with_match_value(
                            operator_code), Metadata.get_call_location())
                td.append_data({"operator_coded": operator_label.to_dict()},
                               Metadata(user, Metadata.get_call_location(),
                                        TimeUtils.utc_now_as_iso_string()))