def label_somalia_operator(user, traced_runs, phone_number_uuid_table):
    """
    Appends the raw operator prefix and a coded Somalia-operator label to each traced run.

    For every run, the phone number is looked up from its "avf_phone_id", the first five characters are
    taken as the raw operator value and passed to `PhoneCleaner.clean_operator`. The cleaner's result is then
    converted to a label in `CodeSchemes.SOMALIA_OPERATOR`.

    :param user: Identifier of the user running this program, used for the appended Metadata.
    :param traced_runs: Runs to label. Iterated twice, indexed by "avf_phone_id", and updated in place via
                        `append_data`.
    :param phone_number_uuid_table: Table providing `uuid_to_data_batch`, used to map the runs' "avf_phone_id"
                                    values back to phone numbers.
    """
    # Resolve every phone number in one batch call instead of one lookup per run.
    phone_ids = {run["avf_phone_id"] for run in traced_runs}
    phone_by_id = phone_number_uuid_table.uuid_to_data_batch(phone_ids)

    scheme = CodeSchemes.SOMALIA_OPERATOR
    for run in traced_runs:
        # The first 5 characters are the country code 252 followed by the next two digits.
        prefix = phone_by_id[run["avf_phone_id"]][:5]
        cleaned = PhoneCleaner.clean_operator(prefix)

        # The cleaner signals "unknown" with Codes.NOT_CODED, which maps to the scheme's control code;
        # anything else is looked up in the scheme by match value.
        if cleaned == Codes.NOT_CODED:
            scheme_code = scheme.get_code_with_control_code(Codes.NOT_CODED)
        else:
            scheme_code = scheme.get_code_with_match_value(cleaned)
        label = CleaningUtils.make_label_from_cleaner_code(scheme, scheme_code, Metadata.get_call_location())

        run.append_data(
            {"operator_raw": prefix, "operator_coded": label.to_dict()},
            Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())
        )
def clean_operator(urn):
    """
    Returns the operator code for the given urn.

    Telephone urns ("tel:...") are passed, unmodified, to `PhoneCleaner.clean_operator`.
    Deleted urns ("deleted:...") return `Codes.DELETED`.
    Any other urn returns its scheme, i.e. the text before the first ":".

    >>> URNCleaner.clean_operator("tel:+25261123123")
    'hormud'
    >>> URNCleaner.clean_operator("tel:+2547123123")
    'kenyan telephone'
    >>> URNCleaner.clean_operator("telegram:123456")
    'telegram'
    >>> URNCleaner.clean_operator("+25261123123")  # (Not a valid urn)
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
    ValueError

    :param urn: URN to determine the operator of.
    :type urn: str
    :return: URN operator.
    :rtype: str
    :raises ValueError: If the urn does not contain a ":".
    """
    scheme, separator, _ = urn.partition(":")

    # No separator at all means there is no scheme to dispatch on.
    if not separator:
        raise ValueError

    if scheme == "tel":
        # Set the operator name from the phone number
        return PhoneCleaner.clean_operator(urn)
    if scheme == "deleted":
        return Codes.DELETED

    # Set the operator name from the channel type e.g. 'telegram', 'twitter'
    return scheme
def convert_runs_to_traced_data(user, raw_runs, raw_contacts, phone_uuids, test_contacts=None):
    """
    Builds one TracedData object for each usable raw run fetched from Rapid Pro.

    A run is skipped, with a warning, if its contact is not present in `raw_contacts` or if that contact has
    no URNs. Every other run is identified by the UUID that `phone_uuids` returns for the normalised form of
    the contact's first URN.

    :param user: Identifier of the user running this program, for TracedData Metadata.
    :type user: str
    :param raw_runs: Raw run objects to convert to TracedData.
    :type raw_runs: list of temba_client.v2.types.Run
    :param raw_contacts: Raw contact objects to use when converting to TracedData.
    :type raw_contacts: list of temba_client.v2.types.Contact
    :param phone_uuids: Phone number <-> UUID table.
    :type phone_uuids: id_infrastructure.firestore_uuid_table.FirestoreUuidTable
    :param test_contacts: Rapid Pro contact UUIDs of test contacts.
                          Runs from any of those test contacts will be tagged with {'test_run': True}
    :type test_contacts: list of str | None
    :return: Raw data fetched from Rapid Pro converted to TracedData.
    :rtype: list of TracedData
    """
    if test_contacts is None:
        test_contacts = []

    log.info(f"Converting {len(raw_runs)} raw runs to TracedData...")

    contact_by_uuid = {contact.uuid: contact for contact in raw_contacts}

    # Pass 1: keep only the runs that can be resolved to a contact with at least one URN, collecting the
    # normalised first URN of each so that all UUID lookups can be made in a single batch call.
    usable_runs = []
    normalised_phones = []
    for run in raw_runs:
        contact_uuid = run.contact.uuid

        if contact_uuid not in contact_by_uuid:
            # Contact uuids seen on runs are sometimes absent from the downloaded contacts. The original
            # author only observed this for very recently created contacts. The run is skipped here and
            # is expected to be picked up the next time this script is executed.
            log.warning(f"Run found with Rapid Pro Contact UUID '{contact_uuid}', "
                        f"but this id is not present in the downloaded contacts")
            continue

        urns = contact_by_uuid[contact_uuid].urns
        if len(urns) == 0:
            log.warning(f"Ignoring contact with no urn. URNs: {urns} "
                        f"(Rapid Pro Contact UUID: {contact_uuid})")
            continue

        usable_runs.append(run)
        normalised_phones.append(PhoneCleaner.normalise_phone(urns[0]))

    uuid_by_phone = phone_uuids.data_to_uuid_batch(normalised_phones)

    # Pass 2: flatten each remaining run into a dict and wrap it in TracedData.
    traced_runs = []
    for run, phone in zip(usable_runs, normalised_phones):
        contact_uuid = run.contact.uuid
        flow_name = run.flow.name
        urns = contact_by_uuid[contact_uuid].urns

        run_dict = {
            "avf_phone_id": uuid_by_phone[phone],
            f"run_id - {flow_name}": run.id
        }

        for category, response in run.values.items():
            prefix = category.title()
            per_value_fields = (
                ("Category", response.category),
                ("Value", response.value),
                # Convert from "input" to "text" here to match terminology in Rapid Pro's Excel exports.
                ("Text", response.input),
                ("Name", response.name),
                ("Time", response.time.isoformat()),
                ("Run ID", run.id),
            )
            for field_name, field_value in per_value_fields:
                run_dict[f"{prefix} ({field_name}) - {flow_name}"] = field_value

        if contact_uuid in test_contacts:
            run_dict["test_run"] = True
        else:
            # Only the first URN is used above, so any extra URN on a real contact would be silently ignored.
            assert len(urns) == 1, \
                f"A non-test contact has multiple URNs (Rapid Pro Contact UUID: {contact_uuid})"

        exited_on = run.exited_on
        run_dict[f"run_created_on - {flow_name}"] = run.created_on.isoformat()
        run_dict[f"run_modified_on - {flow_name}"] = run.modified_on.isoformat()
        run_dict[f"run_exited_on - {flow_name}"] = None if exited_on is None else exited_on.isoformat()
        run_dict[f"run_exit_type - {flow_name}"] = run.exit_type

        run_metadata = Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())
        traced_runs.append(TracedData(run_dict, run_metadata))

    log.info(f"Converted {len(traced_runs)} raw runs to TracedData")

    return traced_runs
# Use the time frame from the command line when one was given.
# NOTE(review): when args.time_frame is falsy, time_frame must already be bound by code above this
# fragment, because it is read unconditionally below - confirm.
if args.time_frame:
    time_frame = args.time_frame

# Load the serialized messages from the JSON input file.
with open(raw_messages_input_file_path, mode="r") as f:
    log.info(f"Loading messages from {raw_messages_input_file_path}...")
    # NOTE(review): this name shadows the builtin `input` for the rest of the enclosing scope.
    input = json.load(f)
    messages = [Message.deserialize(val) for val in input]
    log.info(f"Loaded {len(messages)} messages")

# Filter messages based on the target operator and target direction of the message
log.info(f"Filtering messages based on {target_operator} and "
         f"message direction as '{target_message_direction}' from {len(messages)} total messages ")
filtered_messages = []
for msg in messages:
    if msg.urn.startswith("tel:"):
        # Telephone urns: derive the operator from the part after "tel:".
        operator = PhoneCleaner.clean_operator(msg.urn.split(":")[1])
    else:
        # Other urns: the operator is the urn scheme, i.e. the text before the first ":".
        operator = msg.urn.split(":")[0]
    if operator == target_operator and msg.direction == target_message_direction:
        # NOTE(review): msg_direction is not read anywhere in the visible code - presumably it is used
        # further down; confirm before removing.
        msg_direction = msg.direction
        filtered_messages.append(msg)
log.info(f"returning {len(filtered_messages)} messages")

# Build the step between consecutive time bounds from the hour, minute and second of time_frame.
time_interval = timedelta(hours=time_frame.hour, minutes=time_frame.minute, seconds=time_frame.second)
date_time_bounds = date_time_range(start_date, end_date, time_interval)

# Compute the number of messages between two firebase time bounds i.e. `PreviousMessageTimestamp` and
# `NextMessageTimestamp` to get the number of messages in each firebase period and relate
# each quantity with the operator and the message direction.
# Table used to exchange phone numbers for de-identified UUIDs.
phone_number_uuid_table = FirestoreUuidTable(
    pipeline_configuration.phone_number_uuid_table.table_name,
    firestore_uuid_table_credentials,
    "avf-phone-uuid-"
)
log.info("Initialised the Firestore UUID table")

log.info(f"Loading csv from '{csv_input_path}'...")
# newline="" lets the csv module do its own newline handling, as its documentation requires.
with open(csv_input_path, "r", encoding='utf-8-sig', newline="") as f:
    reader = csv.DictReader(f)
    raw_data = list(reader)
    # Remember the parsed header so it can still be exported when the file has no data rows.
    # This is None if the file is completely empty.
    header = reader.fieldnames
log.info(f"Loaded {len(raw_data)} rows")

log.info(f"Normalising phone numbers in column '{column_to_de_identify}'...")
for row in raw_data:
    row[column_to_de_identify] = PhoneCleaner.normalise_phone(row[column_to_de_identify])

log.info(f"De-identifying column '{column_to_de_identify}'...")
# Look up all the UUIDs in a single batch call, then replace each normalised phone number in place.
phone_numbers = [row[column_to_de_identify] for row in raw_data]
phone_to_uuid_lut = phone_number_uuid_table.data_to_uuid_batch(phone_numbers)
for row in raw_data:
    row[column_to_de_identify] = phone_to_uuid_lut[row[column_to_de_identify]]

log.info(f"Exporting {len(raw_data)} de-identified rows to {de_identified_csv_output_path}...")
# Indexing raw_data[0] unconditionally would raise IndexError when the input has no data rows,
# so fall back to the header captured while reading.
fieldnames = list(raw_data[0].keys()) if raw_data else list(header or [])
# newline="" prevents newline translation from adding blank rows on platforms that write "\r\n".
# The explicit encoding keeps the export independent of the locale's default encoding.
with open(de_identified_csv_output_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    if fieldnames:
        writer.writeheader()
    writer.writerows(raw_data)
def auto_code_surveys(cls, user, data, phone_uuid_table, coda_output_dir):
    """
    Auto-codes the survey fields of `data` in place and exports the results as Coda V2 files.

    The steps run in this order:
     1. Every coding plan whose raw field is absent or "" on a TracedData object is labelled TRUE_MISSING.
     2. Every coding plan which has a cleaner is applied to the whole of `data`.
     3. Wherever the Mogadishu sub-district was given a "Normal" code, "district_coded" is set to NOT_CODED.
     4. "operator_coded" is set from the phone number that `phone_uuid_table` returns for each "uid".
     5. One Coda file per coding plan is written to `coda_output_dir`, except for the plan whose raw field
        is "mogadishu_sub_district_raw". That plan is exported with all five location schemes to
        "location.json".

    :param cls: Not referenced in the body.
    :param user: Identifier of the user running this program, used for the appended Metadata.
    :param data: TracedData objects to code. Iterated several times, so must be re-iterable.
    :param phone_uuid_table: Table providing `get_phone`, used to look up the phone number for a "uid".
    :param coda_output_dir: Directory to write the Coda files to. Created if it does not exist.
    :return: `data`, the same object that was passed in.
    """
    coding_plans = PipelineConfiguration.SURVEY_CODING_PLANS

    # Step 1: label missing data.
    for td in data:
        true_missing_labels = {}
        for plan in coding_plans:
            if td.get(plan.raw_field, "") != "":
                continue
            true_missing_label = CleaningUtils.make_label_from_cleaner_code(
                plan.code_scheme,
                plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                Metadata.get_call_location()
            )
            true_missing_labels[plan.coded_field] = true_missing_label.to_dict()
        td.append_data(true_missing_labels, Metadata(user, Metadata.get_call_location(), time.time()))

    # Step 2: auto-code the remaining data with each plan's cleaner, where it has one.
    for plan in coding_plans:
        if plan.cleaner is None:
            continue
        CleaningUtils.apply_cleaner_to_traced_data_iterable(
            user, data, plan.raw_field, plan.coded_field, plan.cleaner, plan.code_scheme
        )

    # Step 3: for any locations where the cleaners assigned a code to a sub district, set the district code
    # to NC (this is because only one column should have a value set in Coda).
    sub_district_scheme = CodeSchemes.MOGADISHU_SUB_DISTRICT
    for td in data:
        if "mogadishu_sub_district_coded" not in td:
            continue
        sub_district_code_id = td["mogadishu_sub_district_coded"]["CodeID"]
        if sub_district_scheme.get_code_with_id(sub_district_code_id).code_type != "Normal":
            continue
        # NOTE(review): this label is stored under "district_coded" but is built from the sub-district
        # scheme, whereas the export below pairs "district_coded" with CodeSchemes.DISTRICT. Confirm
        # whether that is intended.
        not_coded_label = CleaningUtils.make_label_from_cleaner_code(
            sub_district_scheme,
            sub_district_scheme.get_code_with_control_code(Codes.NOT_CODED),
            Metadata.get_call_location(),
        )
        td.append_data(
            {"district_coded": not_coded_label.to_dict()},
            Metadata(user, Metadata.get_call_location(), time.time())
        )

    # Step 4: set operator from phone number.
    operator_scheme = CodeSchemes.OPERATOR
    for td in data:
        phone_number = phone_uuid_table.get_phone(td["uid"])
        cleaned_operator = PhoneCleaner.clean_operator(phone_number)
        if cleaned_operator == Codes.NOT_CODED:
            operator_scheme_code = operator_scheme.get_code_with_control_code(Codes.NOT_CODED)
        else:
            operator_scheme_code = operator_scheme.get_code_with_match_value(cleaned_operator)
        operator_label = CleaningUtils.make_label_from_cleaner_code(
            operator_scheme, operator_scheme_code, Metadata.get_call_location()
        )
        td.append_data(
            {"operator_coded": operator_label.to_dict()},
            Metadata(user, Metadata.get_call_location(), time.time())
        )

    # Step 5a: output single-scheme answers to coda for manual verification + coding.
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in coding_plans:
        # The location plan is exported separately below, together with the other location schemes.
        if plan.raw_field == "mogadishu_sub_district_raw":
            continue

        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

        with open(path.join(coda_output_dir, plan.coda_filename), "w") as coda_file:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, plan.time_field, plan.id_field,
                {plan.coded_field: plan.code_scheme}, coda_file
            )

    # Step 5b: output location scheme to coda for manual verification + coding.
    location_schemes = {
        "mogadishu_sub_district_coded": CodeSchemes.MOGADISHU_SUB_DISTRICT,
        "district_coded": CodeSchemes.DISTRICT,
        "region_coded": CodeSchemes.REGION,
        "state_coded": CodeSchemes.STATE,
        "zone_coded": CodeSchemes.ZONE
    }
    location_output_path = path.join(coda_output_dir, "location.json")
    TracedDataCodaV2IO.compute_message_ids(
        user, data, "mogadishu_sub_district_raw", "mogadishu_sub_district_raw_id"
    )
    with open(location_output_path, "w") as location_file:
        TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
            data, "mogadishu_sub_district_raw", "mogadishu_sub_district_time",
            "mogadishu_sub_district_raw_id", location_schemes, location_file
        )

    return data
# Update raw_contacts, passing the contacts log (opened in append mode) as the raw export log.
with open(contacts_log_path, "a") as raw_contacts_log_file:
    raw_contacts = rapid_pro.update_raw_contacts_with_latest_modified(
        raw_contacts,
        raw_export_log_file=raw_contacts_log_file
    )

# Convert the runs to TracedData.
traced_runs = rapid_pro.convert_runs_to_traced_data(
    user,
    raw_runs,
    raw_contacts,
    phone_number_uuid_table,
    pipeline_configuration.rapid_pro_test_contact_uuids
)

# Set the operator codes for each message. Only runs of activation flows are labelled.
if flow in pipeline_configuration.activation_flow_names:
    # Resolve all the phone numbers with a single batch lookup.
    uuids = {td["avf_phone_id"] for td in traced_runs}
    uuid_to_phone_lut = phone_number_uuid_table.uuid_to_data_batch(uuids)

    for td in traced_runs:
        phone_number = uuid_to_phone_lut[td["avf_phone_id"]]
        operator_code = PhoneCleaner.clean_operator(phone_number)

        # Codes.NOT_CODED from the cleaner maps to the scheme's control code; any other value is looked
        # up in the scheme by match value.
        if operator_code == Codes.NOT_CODED:
            operator_scheme_code = CodeSchemes.SOMALIA_OPERATOR.get_code_with_control_code(Codes.NOT_CODED)
        else:
            operator_scheme_code = CodeSchemes.SOMALIA_OPERATOR.get_code_with_match_value(operator_code)
        operator_label = CleaningUtils.make_label_from_cleaner_code(
            CodeSchemes.SOMALIA_OPERATOR,
            operator_scheme_code,
            Metadata.get_call_location()
        )

        td.append_data(
            {"operator_coded": operator_label.to_dict()},
            Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())
        )