def _impute_coding_error_codes(user, data):
    for td in data:
        coding_error_dict = dict()
        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            if f"{plan.coded_field}_WS_correct_dataset" in td:
                if td[f"{plan.coded_field}_WS_correct_dataset"]["CodeID"] == \
                        CodeSchemes.WS_CORRECT_DATASET.get_code_with_control_code(Codes.CODING_ERROR).code_id:
                    coding_error_dict[plan.coded_field] = [
                        CleaningUtils.make_label_from_cleaner_code(
                            plan.code_scheme,
                            plan.code_scheme.get_code_with_control_code(Codes.CODING_ERROR),
                            Metadata.get_call_location()
                        ).to_dict()
                    ]

                    if plan.binary_code_scheme is not None:
                        coding_error_dict[plan.binary_coded_field] = \
                            CleaningUtils.make_label_from_cleaner_code(
                                plan.binary_code_scheme,
                                plan.binary_code_scheme.get_code_with_control_code(Codes.CODING_ERROR),
                                Metadata.get_call_location()
                            ).to_dict()

        for plan in PipelineConfiguration.DEMOGS_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
            if f"{plan.coded_field}_WS_correct_dataset" in td:
                if td[f"{plan.coded_field}_WS_correct_dataset"]["CodeID"] == \
                        CodeSchemes.WS_CORRECT_DATASET.get_code_with_control_code(Codes.CODING_ERROR).code_id:
                    coding_error_dict[plan.coded_field] = \
                        CleaningUtils.make_label_from_cleaner_code(
                            plan.code_scheme,
                            plan.code_scheme.get_code_with_control_code(Codes.CODING_ERROR),
                            Metadata.get_call_location()
                        ).to_dict()

        td.append_data(
            coding_error_dict,
            Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))
def export_to_csv(user, data, pipeline_configuration, raw_data_dir, csv_path, export_keys, consent_withdrawn_key):
    # Convert codes to their string/matrix values
    for td in data:
        analysis_dict = dict()
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            for cc in plan.coding_configurations:
                if cc.analysis_file_key is None:
                    continue

                if cc.coding_mode == CodingModes.SINGLE:
                    analysis_dict[cc.analysis_file_key] = \
                        cc.code_scheme.get_code_with_code_id(td[cc.coded_field]["CodeID"]).string_value
                else:
                    assert cc.coding_mode == CodingModes.MULTIPLE
                    show_matrix_keys = []
                    for code in cc.code_scheme.codes:
                        show_matrix_keys.append(f"{cc.analysis_file_key}{code.string_value}")

                    for label in td[cc.coded_field]:
                        code_string_value = cc.code_scheme.get_code_with_code_id(label["CodeID"]).string_value
                        analysis_dict[f"{cc.analysis_file_key}{code_string_value}"] = Codes.MATRIX_1

                    for key in show_matrix_keys:
                        if key not in analysis_dict:
                            analysis_dict[key] = Codes.MATRIX_0
        td.append_data(analysis_dict,
                       Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

    # Tag listening group participants
    ListeningGroups.tag_listening_groups_participants(user, data, pipeline_configuration, raw_data_dir)

    # Hide data from participants who opted out
    ConsentUtils.set_stopped(user, data, consent_withdrawn_key, additional_keys=export_keys)

    with open(csv_path, "w") as f:
        TracedDataCSVIO.export_traced_data_iterable_to_csv(data, f, headers=export_keys)
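# A self-contained sketch of the matrix translation performed in export_to_csv above, using plain
# dicts in place of TracedData, coding configurations, and code schemes. The analysis key, code
# string values, and the "1"/"0" matrix literals are illustrative stand-ins only.
def _matrix_translation_sketch(analysis_file_key, scheme_string_values, labelled_string_values):
    # One column per code in the scheme; codes that were labelled become "1", everything else "0".
    row = {f"{analysis_file_key}{v}": "0" for v in scheme_string_values}
    for v in labelled_string_values:
        row[f"{analysis_file_key}{v}"] = "1"
    return row

assert _matrix_translation_sketch("rqa_s01e01_", ["water", "health", "NC"], ["water"]) == \
    {"rqa_s01e01_water": "1", "rqa_s01e01_health": "0", "rqa_s01e01_NC": "0"}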
def set_show_ids(cls, user, data, show_id_map):
    """
    Sets a show_id for each message, using the presence of Rapid Pro value keys to determine which show
    each message belongs to.

    :param user: Identifier of the user running this program, for TracedData Metadata.
    :type user: str
    :param data: TracedData objects to set the show ids of.
    :type data: iterable of TracedData
    :param show_id_map: Dictionary of Rapid Pro value key to show id.
    :type show_id_map: dict of str -> int
    """
    for td in data:
        show_dict = dict()

        for message_key, show_id in show_id_map.items():
            if message_key in td:
                show_dict["rqa_message"] = td[message_key]
                show_dict["show_id"] = show_id

        td.append_data(show_dict,
                       Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))
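# A minimal sketch of the show_id_map expected by set_show_ids. The Rapid Pro value keys and show ids
# below are hypothetical examples, not keys from any real flow.
example_show_id_map = {
    "Rqa_S01E01 (Value) - example_s01e01_activation": 1,
    "Rqa_S01E02 (Value) - example_s01e02_activation": 2,
}
example_message = {"Rqa_S01E02 (Value) - example_s01e02_activation": "water is our biggest problem"}

# The same presence test used in set_show_ids: the value key found in the message decides its show.
for message_key, show_id in example_show_id_map.items():
    if message_key in example_message:
        print(f"show_id={show_id}, rqa_message={example_message[message_key]!r}")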
def convert_facebook_comments_to_traced_data(user, dataset_name, raw_comments, facebook_uuid_table):
    log.info(f"Converting {len(raw_comments)} Facebook comments to TracedData...")

    facebook_uuids = {comment["from"]["id"] for comment in raw_comments}
    facebook_to_uuid_lut = facebook_uuid_table.data_to_uuid_batch(facebook_uuids)

    traced_comments = []
    # Use a placeholder avf facebook id for now, to make the individuals file work until we know if we'll be able
    # to see Facebook user ids or not.
    for comment in raw_comments:
        comment["created_time"] = isoparse(comment["created_time"]).isoformat()
        validators.validate_utc_iso_string(comment["created_time"])

        comment_dict = {
            "avf_facebook_id": facebook_to_uuid_lut[comment["from"]["id"]]
        }
        for k, v in comment.items():
            comment_dict[f"{dataset_name}.{k}"] = v

        traced_comments.append(
            TracedData(comment_dict,
                       Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())))

    log.info(f"Converted {len(traced_comments)} Facebook comments to TracedData")

    return traced_comments
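# Quick check of the created_time normalisation above: dateutil's isoparse accepts a compact "+0000"
# style offset and re-serialises it as strict ISO 8601. The sample timestamp is made up.
from dateutil.parser import isoparse

print(isoparse("2020-01-05T12:34:56+0000").isoformat())  # -> 2020-01-05T12:34:56+00:00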
def convert_runs_to_traced_data(user, raw_runs, raw_contacts, phone_uuids, test_contacts=None):
    """
    Converts raw data fetched from Rapid Pro to TracedData.

    :param user: Identifier of the user running this program, for TracedData Metadata.
    :type user: str
    :param raw_runs: Raw run objects to convert to TracedData.
    :type raw_runs: list of temba_client.v2.types.Run
    :param raw_contacts: Raw contact objects to use when converting to TracedData.
    :type raw_contacts: list of temba_client.v2.types.Contact
    :param phone_uuids: Phone number <-> UUID table.
    :type phone_uuids: id_infrastructure.firestore_uuid_table.FirestoreUuidTable
    :param test_contacts: Rapid Pro contact UUIDs of test contacts.
                          Runs from any of those test contacts will be tagged with {'test_run': True}
    :type test_contacts: list of str | None
    :return: Raw data fetched from Rapid Pro converted to TracedData.
    :rtype: list of TracedData
    """
    if test_contacts is None:
        test_contacts = []

    log.info(f"Converting {len(raw_runs)} raw runs to TracedData...")

    contacts_lut = {c.uuid: c for c in raw_contacts}

    runs_with_uuids = []
    phone_numbers = []
    for run in raw_runs:
        if run.contact.uuid not in contacts_lut:
            # Sometimes contact uuids which appear in `runs` do not appear in `contact_runs`.
            # I have only observed this happen for contacts which were created very recently.
            # This test skips the run in this case; it should be included next time this script is executed.
            log.warning(f"Run found with Rapid Pro Contact UUID '{run.contact.uuid}', "
                        f"but this id is not present in the downloaded contacts")
            continue

        contact_urns = contacts_lut[run.contact.uuid].urns
        if len(contact_urns) == 0:
            log.warning(f"Ignoring contact with no urn. URNs: {contact_urns} "
                        f"(Rapid Pro Contact UUID: {run.contact.uuid})")
            continue

        phone_numbers.append(PhoneCleaner.normalise_phone(contact_urns[0]))
        runs_with_uuids.append(run)

    phone_to_uuid_lut = phone_uuids.data_to_uuid_batch(phone_numbers)

    traced_runs = []
    for run in runs_with_uuids:
        contact_urns = contacts_lut[run.contact.uuid].urns

        run_dict = {
            "avf_phone_id": phone_to_uuid_lut[PhoneCleaner.normalise_phone(contact_urns[0])],
            f"run_id - {run.flow.name}": run.id
        }

        for category, response in run.values.items():
            run_dict[category.title() + " (Category) - " + run.flow.name] = response.category
            run_dict[category.title() + " (Value) - " + run.flow.name] = response.value
            # Convert from "input" to "text" here to match terminology in Rapid Pro's Excel exports.
            run_dict[category.title() + " (Text) - " + run.flow.name] = response.input
            run_dict[category.title() + " (Name) - " + run.flow.name] = response.name
            run_dict[category.title() + " (Time) - " + run.flow.name] = response.time.isoformat()
            run_dict[category.title() + " (Run ID) - " + run.flow.name] = run.id

        if run.contact.uuid in test_contacts:
            run_dict["test_run"] = True
        else:
            assert len(contact_urns) == 1, \
                f"A non-test contact has multiple URNs (Rapid Pro Contact UUID: {run.contact.uuid})"

        run_dict[f"run_created_on - {run.flow.name}"] = run.created_on.isoformat()
        run_dict[f"run_modified_on - {run.flow.name}"] = run.modified_on.isoformat()
        run_dict[f"run_exited_on - {run.flow.name}"] = None if run.exited_on is None else run.exited_on.isoformat()
        run_dict[f"run_exit_type - {run.flow.name}"] = run.exit_type

        traced_runs.append(
            TracedData(run_dict,
                       Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())))

    log.info(f"Converted {len(traced_runs)} raw runs to TracedData")

    return traced_runs
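# A small sketch of the column names that convert_runs_to_traced_data builds for each flow result,
# using plain strings. The flow name and result name below are hypothetical.
example_flow_name = "example_s01e01_activation"
example_result_name = "rqa_s01e01"
for suffix in ["Category", "Value", "Text", "Name", "Time", "Run ID"]:
    print(example_result_name.title() + f" ({suffix}) - " + example_flow_name)
# e.g. "Rqa_S01E01 (Value) - example_s01e01_activation"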
def generate(user, data, csv_by_message_output_path, csv_by_individual_output_path):
    # Serializer is currently overflowing
    # TODO: Investigate/address the cause of this.
    sys.setrecursionlimit(15000)

    consent_withdrawn_key = "consent_withdrawn"
    for td in data:
        td.append_data({consent_withdrawn_key: Codes.FALSE},
                       Metadata(user, Metadata.get_call_location(), time.time()))

    # Set the list of raw/coded survey keys to export
    survey_keys = []
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.analysis_file_key is not None and plan.analysis_file_key not in survey_keys:
            survey_keys.append(plan.analysis_file_key)
        if plan.raw_field not in survey_keys:
            survey_keys.append(plan.raw_field)

    # Convert survey codes to their string values
    for td in data:
        td.append_data(
            {plan.analysis_file_key: plan.code_scheme.get_code_with_id(td[plan.coded_field]["CodeID"]).string_value
             for plan in PipelineConfiguration.SURVEY_CODING_PLANS if plan.analysis_file_key is not None},
            Metadata(user, Metadata.get_call_location(), time.time()))

    # Convert RQA binary codes to their string values
    for td in data:
        td.append_data(
            {plan.binary_analysis_file_key:
                 plan.binary_code_scheme.get_code_with_id(td[plan.binary_coded_field]["CodeID"]).string_value
             for plan in PipelineConfiguration.RQA_CODING_PLANS if plan.binary_code_scheme is not None},
            Metadata(user, Metadata.get_call_location(), time.time()))

    # Translate the RQA reason codes to matrix values
    matrix_keys = []
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        show_matrix_keys = list()
        for code in plan.code_scheme.codes:
            show_matrix_keys.append(f"{plan.analysis_file_key}{code.string_value}")

        AnalysisKeys.set_matrix_keys(
            user, data, show_matrix_keys, plan.code_scheme, plan.coded_field, plan.analysis_file_key)

        matrix_keys.extend(show_matrix_keys)

    binary_keys = [plan.binary_analysis_file_key
                   for plan in PipelineConfiguration.RQA_CODING_PLANS
                   if plan.binary_analysis_file_key is not None]

    equal_keys = ["uid"]
    equal_keys.extend(survey_keys)
    concat_keys = [plan.raw_field for plan in PipelineConfiguration.RQA_CODING_PLANS]
    bool_keys = [
        consent_withdrawn_key,

        # "sms_ad",
        # "radio_promo",
        # "radio_show",
        # "non_logical_time",
        # "radio_participation_s02e01",
        # "radio_participation_s02e02",
        # "radio_participation_s02e03",
        # "radio_participation_s02e04",
        # "radio_participation_s02e05",
        # "radio_participation_s02e06",
    ]

    # Export to CSV
    export_keys = ["uid"]
    export_keys.extend(bool_keys)
    export_keys.extend(matrix_keys)
    export_keys.extend(binary_keys)
    export_keys.extend(concat_keys)
    export_keys.extend(survey_keys)

    # Set consent withdrawn based on presence of data coded as "stop"
    ConsentUtils.determine_consent_withdrawn(
        user, data, PipelineConfiguration.SURVEY_CODING_PLANS, consent_withdrawn_key)

    # Set consent withdrawn based on stop codes from radio question answers
    for td in data:
        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            if td[f"{plan.analysis_file_key}{Codes.STOP}"] == Codes.MATRIX_1:
                td.append_data({consent_withdrawn_key: Codes.TRUE},
                               Metadata(user, Metadata.get_call_location(), time.time()))
            if plan.binary_code_scheme is not None:
                if td[plan.binary_coded_field]["CodeID"] == \
                        plan.binary_code_scheme.get_code_with_control_code(Codes.STOP).code_id:
                    td.append_data({consent_withdrawn_key: Codes.TRUE},
                                   Metadata(user, Metadata.get_call_location(), time.time()))

    # Fold data to have one respondent per row
    to_be_folded = []
    for td in data:
        to_be_folded.append(td.copy())

    folded_data = FoldTracedData.fold_iterable_of_traced_data(
        user, data, fold_id_fn=lambda td: td["uid"],
        equal_keys=equal_keys, concat_keys=concat_keys, matrix_keys=matrix_keys, bool_keys=bool_keys,
        binary_keys=binary_keys)

    # Fix-up _NA and _NC keys, which are currently being set incorrectly by
    # FoldTracedData.fold_iterable_of_traced_data when there are multiple radio shows
    # TODO: Update FoldTracedData to handle NA and NC correctly under multiple radio shows
    for td in folded_data:
        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            if td.get(plan.raw_field, "") != "":
                td.append_data(
                    {f"{plan.analysis_file_key}{Codes.TRUE_MISSING}": Codes.MATRIX_0},
                    Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

            contains_non_nc_key = False
            for key in matrix_keys:
                if key.startswith(plan.analysis_file_key) and not key.endswith(Codes.NOT_CODED) \
                        and td.get(key) == Codes.MATRIX_1:
                    contains_non_nc_key = True
            if not contains_non_nc_key:
                td.append_data(
                    {f"{plan.analysis_file_key}{Codes.NOT_CODED}": Codes.MATRIX_1},
                    Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

    # Process consent
    ConsentUtils.set_stopped(user, data, consent_withdrawn_key, additional_keys=export_keys)
    ConsentUtils.set_stopped(user, folded_data, consent_withdrawn_key, additional_keys=export_keys)

    # Output to CSV with one message per row
    with open(csv_by_message_output_path, "w") as f:
        TracedDataCSVIO.export_traced_data_iterable_to_csv(data, f, headers=export_keys)

    # Output to CSV with one respondent per row
    with open(csv_by_individual_output_path, "w") as f:
        TracedDataCSVIO.export_traced_data_iterable_to_csv(folded_data, f, headers=export_keys)

    return data
def impute_yes_no_reasons_codes(user, data, coding_configurations):
    # Synchronise the control codes between the binary and reasons schemes:
    # Some RQA datasets have a binary scheme, which is always labelled, and a reasons scheme, which is only labelled
    # if there is an additional reason given. Importing those two schemes separately above caused the labels in
    # each scheme to go out of sync with each other, e.g. reasons can be NR when the binary *was* reviewed.
    # This block updates the reasons scheme in cases where only a binary label was set, by assigning the
    # label 'NC' if the binary label was set to a normal code, otherwise to be the same control code as the binary.
    binary_configuration = coding_configurations[0]
    reasons_configuration = coding_configurations[1]

    # TODO: Switch to using CodingModes.SINGLE/MULTIPLE once configuration is being set in configuration json
    #       rather than in pipeline_configuration.py
    assert binary_configuration.coding_mode == "SINGLE"
    assert reasons_configuration.coding_mode == "MULTIPLE"

    for td in data:
        binary_label = td[binary_configuration.coded_field]
        binary_code = binary_configuration.code_scheme.get_code_with_id(binary_label["CodeID"])

        binary_label_present = \
            binary_label["CodeID"] != \
            binary_configuration.code_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

        reasons_label_present = \
            len(td[reasons_configuration.coded_field]) > 1 or \
            td[reasons_configuration.coded_field][0]["CodeID"] != \
            reasons_configuration.code_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

        if binary_label_present and not reasons_label_present:
            if binary_code.code_type == "Control":
                control_code = binary_code.control_code
                reasons_code = reasons_configuration.code_scheme.get_code_with_control_code(control_code)

                reasons_label = CleaningUtils.make_label_from_cleaner_code(
                    reasons_configuration.code_scheme, reasons_code,
                    Metadata.get_call_location(), origin_name="Pipeline Code Synchronisation")

                td.append_data(
                    {reasons_configuration.coded_field: [reasons_label.to_dict()]},
                    Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))
            else:
                assert binary_code.code_type == "Normal"

                nc_label = CleaningUtils.make_label_from_cleaner_code(
                    reasons_configuration.code_scheme,
                    reasons_configuration.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                    Metadata.get_call_location(), origin_name="Pipeline Code Synchronisation")

                td.append_data(
                    {reasons_configuration.coded_field: [nc_label.to_dict()]},
                    Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))
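# A self-contained sketch of the synchronisation rule above, using plain strings in place of real
# code scheme objects. The code values ("STOP", "yes", "NC") are illustrative only.
def _impute_reasons_sketch(binary_code_type, binary_code_value):
    # Mirrors the rule: control codes are copied across to the reasons scheme; normal codes become 'NC'.
    if binary_code_type == "Control":
        return binary_code_value   # e.g. a binary "STOP" makes the reasons label "STOP" too
    assert binary_code_type == "Normal"
    return "NC"                    # e.g. a binary "yes" with no reason given becomes reasons "NC"

assert _impute_reasons_sketch("Control", "STOP") == "STOP"
assert _impute_reasons_sketch("Normal", "yes") == "NC"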
def import_coda_2_to_traced_data_iterable_multi_coded(cls, user, data, message_id_key, scheme_key_map, f=None):
    """
    Codes keys in an iterable of TracedData objects by using the codes from a Coda 2 messages JSON file.

    Data which has not been checked in the Coda file is coded using the provided nr_label
    (irrespective of whether there was an automatic code there before).

    Only the 'primary' schemes should be passed in. Schemes that have been duplicated using the
    duplicate_scheme tool in CodaV2/data_tools will be detected as being associated with the primary scheme
    automatically.

    TODO: Data which has been assigned a code under one scheme but none of the others needs to be coded as NC
          not NR
    TODO: Or, do this in Coda so as to remove ambiguity from the perspective of the RAs?

    :param user: Identifier of user running this program.
    :type user: str
    :param data: TracedData objects to be coded using the Coda file.
    :type data: iterable of TracedData
    :param message_id_key: Key in TracedData objects of the message ids.
    :type message_id_key: str
    :param scheme_key_map: Dictionary of (key in TracedData objects to assign labels to) ->
                           (Scheme in the Coda messages file to retrieve the labels from)
    :type scheme_key_map: dict of str -> iterable of Scheme
    :param f: Coda data file to import codes from, or None. If None, assigns NOT_REVIEWED codes to everything.
    :type f: file-like | None
    """
    if f is None:
        f = cls._make_empty_file()

    # Build a lookup table of MessageID -> SchemeID -> Labels
    coda_dataset = cls._dataset_lut_from_messages_file(f, scheme_key_map.values())

    # Filter out TracedData objects that do not contain a message id key
    data = [td for td in data if message_id_key in td]

    # Apply the labels from Coda to each TracedData item in data
    for td in data:
        for coded_key, scheme in scheme_key_map.items():
            # Get labels for this (message id, scheme id) from the look-up table
            labels = coda_dataset.get(td[message_id_key], dict()).get(scheme.scheme_id, [])

            # Get the currently assigned list of labels for this multi-coded scheme,
            # and construct a look-up table of scheme id -> label
            td_labels = td.get(coded_key, [])
            td_labels_lut = {label["SchemeID"]: Label.from_dict(label) for label in td_labels}

            for label in reversed(labels):
                # Update the relevant label in this traced data's list of labels with the new label,
                # and append the whole new list to the traced data.
                td_labels_lut[label.scheme_id] = label

                td_labels = list(td_labels_lut.values())
                td.append_data(
                    {coded_key: [label.to_dict() for label in td_labels]},
                    Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

            # Delete any labels that are SPECIAL-MANUALLY_UNCODED
            for scheme_id, label in list(td_labels_lut.items()):
                if label.code_id == "SPECIAL-MANUALLY_UNCODED":
                    del td_labels_lut[scheme_id]
                    td_labels = list(td_labels_lut.values())
                    td.append_data(
                        {coded_key: [label.to_dict() for label in td_labels]},
                        Metadata(user, Metadata.get_call_location(), time.time()))

            # If no manual labels have been set and are checked, set a code for NOT_REVIEWED
            checked_codes_count = 0
            labels = td.get(coded_key)
            if labels is not None:
                for label in labels:
                    if label["Checked"]:
                        checked_codes_count += 1

            if checked_codes_count == 0:
                nr_label = CleaningUtils.make_label_from_cleaner_code(
                    scheme, scheme.get_code_with_control_code(Codes.NOT_REVIEWED),
                    Metadata.get_call_location())

                td.append_data(
                    {coded_key: [nr_label.to_dict()]},
                    Metadata(user, Metadata.get_call_location(), time.time()))

            # Normalise the scheme ids of all the imported labels
            labels = [Label.from_dict(d) for d in td[coded_key]]
            for label in labels:
                assert label.scheme_id.startswith(scheme.scheme_id)
                label.scheme_id = scheme.scheme_id

            # De-duplicate the imported labels by selecting the first label with each code id.
            # This is required in cases where the same label was applied to this message under different columns
            # of the same code scheme, and is possible now that we have normalised the scheme ids.
            unique_labels_by_code_id = []
            seen_code_ids = set()
            for label in labels:
                if label.code_id not in seen_code_ids:
                    unique_labels_by_code_id.append(label)
                    seen_code_ids.add(label.code_id)

            td.append_data(
                {coded_key: [label.to_dict() for label in unique_labels_by_code_id]},
                Metadata(user, Metadata.get_call_location(), time.time()))
def import_coda_2_to_traced_data_iterable(cls, user, data, message_id_key, scheme_key_map, f=None):
    """
    Codes keys in an iterable of TracedData objects by using the codes from a Coda 2 messages JSON file.

    Data which has not been checked in the Coda file is coded using the provided nr_label
    (irrespective of whether there was an automatic code there before).

    TODO: Data which has been assigned a code under one scheme but none of the others needs to be coded as NC
          not NR
    TODO: Or, do this in Coda so as to remove ambiguity from the perspective of the RAs?

    :param user: Identifier of user running this program.
    :type user: str
    :param data: TracedData objects to be coded using the Coda file.
    :type data: iterable of TracedData
    :param message_id_key: Key in TracedData objects of the message ids.
    :type message_id_key: str
    :param scheme_key_map: Dictionary of (key in TracedData objects to assign labels to) ->
                           (Scheme in the Coda messages file to retrieve the labels from)
    :type scheme_key_map: dict of str -> Scheme
    :param f: Coda data file to import codes from, or None.
    :type f: file-like | None
    """
    if f is None:
        f = cls._make_empty_file()

    # Build a lookup table of MessageID -> SchemeID -> Labels
    coda_dataset = cls._dataset_lut_from_messages_file(f, scheme_key_map.values())

    # Filter out TracedData objects that do not contain a message id key
    data = [td for td in data if message_id_key in td]

    # Apply the labels from Coda to each TracedData item in data
    for td in data:
        for key_of_coded, scheme in scheme_key_map.items():
            # Get labels for this (message id, scheme id) from the look-up table
            labels = coda_dataset.get(td[message_id_key], dict()).get(scheme.scheme_id, [])

            if labels is not None:
                # Append each label that was assigned to this message for this scheme to the TracedData.
                for label in reversed(labels):
                    td.append_data(
                        {key_of_coded: label.to_dict()},
                        Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

            # If this td still has no label after importing from the Coda file, or the label is a non-missing label
            # that hasn't been checked in the Coda UI, set a code for NOT_REVIEWED
            if key_of_coded not in td or not td[key_of_coded]["Checked"]:
                nr_label = CleaningUtils.make_label_from_cleaner_code(
                    scheme, scheme.get_code_with_control_code(Codes.NOT_REVIEWED),
                    Metadata.get_call_location())

                td.append_data(
                    {key_of_coded: nr_label.to_dict()},
                    Metadata(user, Metadata.get_call_location(), time.time()))
def generate(user, data, csv_by_message_output_path, csv_by_individual_output_path):
    # Serializer is currently overflowing
    # TODO: Investigate/address the cause of this.
    sys.setrecursionlimit(15000)

    consent_withdrawn_key = "consent_withdrawn"
    for td in data:
        td.append_data({consent_withdrawn_key: Codes.FALSE},
                       Metadata(user, Metadata.get_call_location(), time.time()))

    # Set the list of keys to be exported and how they are to be handled when folding
    export_keys = ["uid", consent_withdrawn_key]
    bool_keys = [
        consent_withdrawn_key

        # "sms_ad",
        # "radio_promo",
        # "radio_show",
        # "non_logical_time",
        # "radio_participation_s02e01",
        # "radio_participation_s02e02",
        # "radio_participation_s02e03",
        # "radio_participation_s02e04",
        # "radio_participation_s02e05",
        # "radio_participation_s02e06",
    ]
    equal_keys = ["uid"]
    concat_keys = []
    matrix_keys = []
    binary_keys = []

    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
        for cc in plan.coding_configurations:
            if cc.analysis_file_key is None:
                continue

            if cc.coding_mode == CodingModes.SINGLE:
                export_keys.append(cc.analysis_file_key)

                if cc.folding_mode == FoldingModes.ASSERT_EQUAL:
                    equal_keys.append(cc.analysis_file_key)
                elif cc.folding_mode == FoldingModes.YES_NO_AMB:
                    binary_keys.append(cc.analysis_file_key)
                else:
                    assert False, f"Incompatible folding_mode {cc.folding_mode}"
            else:
                assert cc.folding_mode == FoldingModes.MATRIX
                for code in cc.code_scheme.codes:
                    export_keys.append(f"{cc.analysis_file_key}{code.string_value}")
                    matrix_keys.append(f"{cc.analysis_file_key}{code.string_value}")

        export_keys.append(plan.raw_field)
        if plan.raw_field_folding_mode == FoldingModes.CONCATENATE:
            concat_keys.append(plan.raw_field)
        elif plan.raw_field_folding_mode == FoldingModes.ASSERT_EQUAL:
            equal_keys.append(plan.raw_field)
        else:
            assert False, f"Incompatible raw_field_folding_mode {plan.raw_field_folding_mode}"

    # Convert codes to their string/matrix values
    for td in data:
        analysis_dict = dict()
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            for cc in plan.coding_configurations:
                if cc.analysis_file_key is None:
                    continue

                if cc.coding_mode == CodingModes.SINGLE:
                    analysis_dict[cc.analysis_file_key] = \
                        cc.code_scheme.get_code_with_id(td[cc.coded_field]["CodeID"]).string_value
                else:
                    assert cc.coding_mode == CodingModes.MULTIPLE
                    show_matrix_keys = []
                    for code in cc.code_scheme.codes:
                        show_matrix_keys.append(f"{cc.analysis_file_key}{code.string_value}")

                    for label in td.get(cc.coded_field, []):
                        code_string_value = cc.code_scheme.get_code_with_id(label["CodeID"]).string_value
                        analysis_dict[f"{cc.analysis_file_key}{code_string_value}"] = Codes.MATRIX_1

                    for key in show_matrix_keys:
                        if key not in analysis_dict:
                            analysis_dict[key] = Codes.MATRIX_0
        td.append_data(analysis_dict,
                       Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

    # Set consent withdrawn based on presence of data coded as "stop"
    ConsentUtils.determine_consent_withdrawn(
        user, data, PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS,
        consent_withdrawn_key)

    # Fold data to have one respondent per row
    to_be_folded = []
    for td in data:
        to_be_folded.append(td.copy())

    folded_data = FoldTracedData.fold_iterable_of_traced_data(
        user, data, fold_id_fn=lambda td: td["uid"],
        equal_keys=equal_keys, concat_keys=concat_keys, matrix_keys=matrix_keys, bool_keys=bool_keys,
        binary_keys=binary_keys)

    # Fix-up _NA and _NC keys, which are currently being set incorrectly by
    # FoldTracedData.fold_iterable_of_traced_data when there are multiple radio shows
    # TODO: Update FoldTracedData to handle NA and NC correctly under multiple radio shows
    for td in folded_data:
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            for cc in plan.coding_configurations:
                if cc.analysis_file_key is None:
                    continue

                if cc.coding_mode == CodingModes.MULTIPLE:
                    if td.get(plan.raw_field, "") != "":
                        td.append_data(
                            {f"{cc.analysis_file_key}{Codes.TRUE_MISSING}": Codes.MATRIX_0},
                            Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

                    contains_non_nc_key = False
                    for key in matrix_keys:
                        if key.startswith(cc.analysis_file_key) and not key.endswith(Codes.NOT_CODED) \
                                and td.get(key) == Codes.MATRIX_1:
                            contains_non_nc_key = True
                    if not contains_non_nc_key:
                        td.append_data(
                            {f"{cc.analysis_file_key}{Codes.NOT_CODED}": Codes.MATRIX_1},
                            Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

    # Process consent
    ConsentUtils.set_stopped(user, data, consent_withdrawn_key, additional_keys=export_keys)
    ConsentUtils.set_stopped(user, folded_data, consent_withdrawn_key, additional_keys=export_keys)

    # Output to CSV with one message per row
    with open(csv_by_message_output_path, "w") as f:
        TracedDataCSVIO.export_traced_data_iterable_to_csv(data, f, headers=export_keys)

    # Output to CSV with one respondent per row
    with open(csv_by_individual_output_path, "w") as f:
        TracedDataCSVIO.export_traced_data_iterable_to_csv(folded_data, f, headers=export_keys)

    return data, folded_data
messages = [Message.from_firebase_map(d) for d in json.load(f)]
log.info(f"Loaded {len(messages)} messages")

log.info(f"Performing merge ({code_ids_to_merge} -> '{merged_code_id}')...")
merged_count = 0  # A count of the number of labels that were remapped to the merged value, for sense-check logging
for msg in messages:
    processed_scheme_ids = set()
    for label in list(msg.labels):
        # Skip labels that are not the latest assignment under each scheme
        if label.scheme_id in processed_scheme_ids:
            continue
        processed_scheme_ids.add(label.scheme_id)

        if label.code_id in code_ids_to_merge:
            msg.labels.insert(
                0,
                Label(label.scheme_id, merged_code_id, TimeUtils.utc_now_as_iso_string(),
                      Origin(Metadata.get_call_location(), "Auto Code-Merge", "External"),
                      checked=label.checked))
            merged_count += 1
log.info(f"Merged {merged_count} labels to '{merged_code_id}'")

log.info(f"Exporting code-merged Coda messages to '{messages_output_file_path}'...")
with open(messages_output_file_path, "w") as f:
    json.dump([msg.to_firebase_map() for msg in messages], f, indent=2)
log.info("Done")
if project.flow_definitions_upload_url_prefix is None:
    log.info(f"Not archiving flow definitions for project {project.project_name} because its "
             f"'flow_definitions_upload_url_prefix' is unspecified.")
    continue

log.info(f"Archiving the latest flow definitions for project {project.project_name}...")

log.info("Downloading the Rapid Pro token file and initialising the Rapid Pro client...")
rapid_pro_token = google_cloud_utils.download_blob_to_string(
    google_cloud_credentials_file_path, project.rapid_pro_token_url).strip()
rapid_pro = RapidProClient(project.rapid_pro_domain, rapid_pro_token)

log.info("Downloading all the flow definitions for this instance...")
flow_ids = rapid_pro.get_all_flow_ids()
flow_definitions_request_timestamp = TimeUtils.utc_now_as_iso_string()
flow_definitions = rapid_pro.get_flow_definitions_for_flow_ids(flow_ids)

log.info("Uploading the flow definitions to a cloud bucket...")
upload_url = f"{project.flow_definitions_upload_url_prefix}{flow_definitions_request_timestamp}.json"
flow_definitions_json = json.dumps(flow_definitions.serialize())
google_cloud_utils.upload_string_to_blob(google_cloud_credentials_file_path, upload_url, flow_definitions_json)
def apply_manual_codes(cls, user, data, coda_input_dir):
    # Merge manually coded radio show files into the cleaned dataset
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        rqa_messages = [td for td in data if plan.raw_field in td]
        coda_input_path = path.join(coda_input_dir, plan.coda_filename)

        f = None
        try:
            if path.exists(coda_input_path):
                f = open(coda_input_path, "r")
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                user, rqa_messages, plan.id_field, {plan.coded_field: plan.code_scheme}, f)

            if plan.binary_code_scheme is not None:
                if f is not None:
                    f.seek(0)
                TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                    user, rqa_messages, plan.id_field, {plan.binary_coded_field: plan.binary_code_scheme}, f)
        finally:
            if f is not None:
                f.close()

    # Label the RQAs for which there is no response yet as TRUE_MISSING
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            if plan.raw_field not in td:
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                    Metadata.get_call_location())
                missing_dict[plan.coded_field] = [na_label.to_dict()]

                if plan.binary_code_scheme is not None:
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.binary_code_scheme,
                        plan.binary_code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                        Metadata.get_call_location())
                    missing_dict[plan.binary_coded_field] = na_label.to_dict()
            elif td[plan.raw_field] == "":
                nc_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                    Metadata.get_call_location())
                missing_dict[plan.coded_field] = [nc_label.to_dict()]
            elif plan.binary_code_scheme is not None and td[plan.raw_field] == "":
                nc_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.binary_code_scheme,
                    plan.binary_code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                    Metadata.get_call_location())
                missing_dict[plan.binary_coded_field] = [nc_label.to_dict()]

        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Synchronise the control codes between the binary and reasons schemes:
    # Some RQA datasets have a binary scheme, which is always labelled, and a reasons scheme, which is only labelled
    # if there is an additional reason given. Importing those two schemes separately above caused the labels in
    # each scheme to go out of sync with each other, e.g. reasons can be NR when the binary *was* reviewed.
    # This block updates the reasons scheme in cases where only a binary label was set, by assigning the
    # label 'NC' if the binary label was set to a normal code, otherwise to be the same control code as the binary.
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        rqa_messages = [td for td in data if plan.raw_field in td]
        if plan.binary_code_scheme is not None:
            for td in rqa_messages:
                binary_label = td[plan.binary_coded_field]
                binary_code = plan.binary_code_scheme.get_code_with_id(binary_label["CodeID"])

                binary_label_present = binary_label["CodeID"] != \
                    plan.binary_code_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

                reasons_label_present = len(td[plan.coded_field]) > 1 or td[plan.coded_field][0]["CodeID"] != \
                    plan.code_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

                if binary_label_present and not reasons_label_present:
                    if binary_code.code_type == "Control":
                        control_code = binary_code.control_code
                        reasons_code = plan.code_scheme.get_code_with_control_code(control_code)

                        reasons_label = CleaningUtils.make_label_from_cleaner_code(
                            plan.code_scheme, reasons_code,
                            Metadata.get_call_location(), origin_name="Pipeline Code Synchronisation")

                        td.append_data(
                            {plan.coded_field: [reasons_label.to_dict()]},
                            Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))
                    else:
                        assert binary_code.code_type == "Normal"

                        nc_label = CleaningUtils.make_label_from_cleaner_code(
                            plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                            Metadata.get_call_location(), origin_name="Pipeline Code Synchronisation")

                        td.append_data(
                            {plan.coded_field: [nc_label.to_dict()]},
                            Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

    # Merge manually coded demog and follow-up survey files into the cleaned dataset
    # Recursion depth is currently being exceeded
    # TODO: Investigate/address the cause of this.
    sys.setrecursionlimit(10000)
    for plan in PipelineConfiguration.DEMOGS_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
        f = None
        try:
            coda_input_path = path.join(coda_input_dir, plan.coda_filename)
            if path.exists(coda_input_path):
                f = open(coda_input_path, "r")
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                user, data, plan.id_field, {plan.coded_field: plan.code_scheme}, f)
        finally:
            if f is not None:
                f.close()

    # Not everyone will have answered all of the demographic and follow-up survey flows.
    # Label demographic and follow-up survey questions which had no responses as TRUE_MISSING.
    # Label data which is just the empty string as NOT_CODED.
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.DEMOGS_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
            if plan.raw_field not in td:
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                    Metadata.get_call_location())
                missing_dict[plan.coded_field] = na_label.to_dict()
            elif td[plan.raw_field] == "":
                nc_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                    Metadata.get_call_location())
                missing_dict[plan.coded_field] = nc_label.to_dict()
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Set county/constituency from the coded constituency field.
    cls._impute_location_codes(user, data)

    # Set coding error codes using the coding error field
    cls._impute_coding_error_codes(user, data)

    return data
def remap_radio_shows(cls, user, data, coda_input_dir):
    """
    Remaps radio shows which were in the wrong flow, and therefore have the wrong key/values set, to have the
    key/values they would have had if they had been received by the correct flow.

    :param user: Identifier of the user running this program, for TracedData Metadata.
    :type user: str
    :param data: TracedData objects to move the radio show messages in.
    :type data: iterable of TracedData
    :param coda_input_dir: Directory to read coded coda files from.
    :type coda_input_dir: str
    """
    # TODO: Convert the show remapping code here into reusable functions for each case that they handle.
    #       Note that ultimately we probably don't want to handle the 'WS' show remapping here,
    #       because we get that for free when we implement 'WS' handling properly.

    # Build a map of raw week 3 messages to wrong scheme data
    message_to_s01e02_dict = cls._build_message_to_s01e02_dict(user, data, coda_input_dir)

    for td in data:
        mapped_dict = dict()

        if cls.WEEK_3_TIME_KEY in td:
            # Redirect any week 3 messages coded as s01e02 in the WS - Correct Dataset scheme to week 2.
            # Also, fake the timestamp of redirected week 3 messages to make it look like they arrived on the day
            # before the incorrect sms ad was sent, i.e. the last day of week 2.
            # This is super yucky, but works because (a) timestamps are never exported, and (b) this date
            # is being set to non_logical anyway in channels.py.
            if message_to_s01e02_dict.get(td["rqa_message"], False):
                mapped_dict["show_id"] = 2
                mapped_dict["sent_on"] = "2018-12-15T00:00:00+03:00"

        td.append_data(mapped_dict,
                       Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

    # Redirect any week 4 messages which were in the week 3 flow due to a late flow change-over.
    cls._remap_radio_show_by_time_range(user, data, cls.WEEK_3_TIME_KEY, 4,
                                        range_start=isoparse(cls.WEEK_4_START))

    # Redirect any week 2 messages which were in the week 4 flow, due to undelivered messages being delivered
    # in two bursts after the end of the radio shows.
    cls._remap_radio_show_by_time_range(user, data, cls.WEEK_4_TIME_KEY, 2,
                                        range_start=isoparse(cls.THURSDAY_BURST_START),
                                        range_end=isoparse(cls.THURSDAY_BURST_END),
                                        time_to_adjust_to=isoparse(cls.THURSDAY_CORRECTION_TIME))
    cls._remap_radio_show_by_time_range(user, data, cls.WEEK_4_TIME_KEY, 2,
                                        range_start=isoparse(cls.FRIDAY_BURST_START),
                                        range_end=isoparse(cls.FRIDAY_BURST_END),
                                        time_to_adjust_to=isoparse(cls.FRIDAY_CORRECTION_TIME))
def fetch_from_recovery_csv(user, google_cloud_credentials_file_path, raw_data_dir, phone_number_uuid_table,
                            recovery_csv_source):
    log.info("Fetching data from a recovery CSV...")
    for blob_url in recovery_csv_source.activation_flow_urls + recovery_csv_source.survey_flow_urls:
        flow_name = blob_url.split('/')[-1].split('.')[0]  # Takes the name between the last '/' and the '.csv' ending
        traced_runs_output_path = f"{raw_data_dir}/{flow_name}.jsonl"
        if os.path.exists(traced_runs_output_path):
            log.info(f"File '{traced_runs_output_path}' for blob '{blob_url}' already exists; skipping download")
            continue

        log.info(f"Downloading recovered data from '{blob_url}'...")
        raw_csv_string = StringIO(google_cloud_utils.download_blob_to_string(
            google_cloud_credentials_file_path, blob_url))
        raw_data = list(csv.DictReader(raw_csv_string))
        log.info(f"Downloaded {len(raw_data)} recovered messages")

        log.info("Converting the recovered messages to TracedData...")
        traced_runs = []
        for i, row in enumerate(raw_data):
            raw_date = row["ReceivedOn"]
            if len(raw_date) == len("dd/mm/YYYY HH:MM"):
                parsed_raw_date = datetime.strptime(raw_date, "%d/%m/%Y %H:%M")
            else:
                parsed_raw_date = datetime.strptime(raw_date, "%d/%m/%Y %H:%M:%S")
            localized_date = pytz.timezone("Africa/Mogadishu").localize(parsed_raw_date)

            assert row["Sender"].startswith("avf-phone-uuid-"), \
                f"The 'Sender' column for '{blob_url}' contains an item that has not been de-identified " \
                f"into Africa's Voices Foundation's de-identification format. This may be done with de_identify_csv.py."

            d = {
                "avf_phone_id": row["Sender"],
                "message": row["Message"],
                "received_on": localized_date.isoformat(),
                "run_id": SHAUtils.sha_dict(row)
            }

            traced_runs.append(
                TracedData(d, Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())))
        log.info("Converted the recovered messages to TracedData")

        if blob_url in recovery_csv_source.activation_flow_urls:
            label_somalia_operator(user, traced_runs, phone_number_uuid_table)

        log.info(f"Exporting {len(traced_runs)} TracedData items to {traced_runs_output_path}...")
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
        with open(traced_runs_output_path, "w") as f:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(traced_runs, f)
        log.info("Exported TracedData")
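# A quick, self-contained check of the two "ReceivedOn" formats handled above. The sample timestamps
# are made up; the format strings and the Africa/Mogadishu localisation mirror the code.
from datetime import datetime
import pytz

for sample in ["03/05/2020 14:30", "03/05/2020 14:30:59"]:
    fmt = "%d/%m/%Y %H:%M" if len(sample) == len("dd/mm/YYYY HH:MM") else "%d/%m/%Y %H:%M:%S"
    parsed = pytz.timezone("Africa/Mogadishu").localize(datetime.strptime(sample, fmt))
    print(parsed.isoformat())  # e.g. 2020-05-03T14:30:00+03:00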
def apply_manual_codes(cls, user, data, coda_input_dir):
    # Merge manually coded radio show files into the cleaned dataset
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        rqa_messages = [td for td in data if plan.raw_field in td]
        coda_input_path = path.join(coda_input_dir, plan.coda_filename)
        print(coda_input_path)

        f = None
        try:
            if path.exists(coda_input_path):
                f = open(coda_input_path, "r")
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                user, rqa_messages, plan.id_field, {plan.coded_field: plan.code_scheme}, f)

            if plan.binary_code_scheme is not None:
                if f is not None:
                    f.seek(0)
                TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                    user, rqa_messages, plan.id_field, {plan.binary_coded_field: plan.binary_code_scheme}, f)
        finally:
            if f is not None:
                f.close()

    # At this point, the TracedData objects still contain messages for at most one week each.
    # Label the weeks for which there is no response as TRUE_MISSING.
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.raw_field not in td:
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                    Metadata.get_call_location())
                missing_dict[plan.coded_field] = [na_label.to_dict()]

                if plan.binary_code_scheme is not None:
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.binary_code_scheme,
                        plan.binary_code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                        Metadata.get_call_location())
                    missing_dict[plan.binary_coded_field] = na_label.to_dict()

        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Synchronise the control codes between the binary and reasons schemes:
    # Some RQA datasets have a binary scheme, which is always labelled, and a reasons scheme, which is only labelled
    # if there is an additional reason given. Importing those two schemes separately above caused the labels in
    # each scheme to go out of sync with each other, e.g. reasons can be NR when the binary *was* reviewed.
    # This block updates the reasons scheme in cases where only a binary label was set, by assigning the
    # label 'NC' if the binary label was set to a normal code, otherwise to be the same control code as the binary.
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        rqa_messages = [td for td in data if plan.raw_field in td]
        if plan.binary_code_scheme is not None:
            for td in rqa_messages:
                binary_label = td[plan.binary_coded_field]
                binary_code = plan.binary_code_scheme.get_code_with_id(binary_label["CodeID"])

                binary_label_present = binary_label["CodeID"] != \
                    plan.binary_code_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

                reasons_label_present = len(td[plan.coded_field]) > 1 or td[plan.coded_field][0]["CodeID"] != \
                    plan.code_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

                if binary_label_present and not reasons_label_present:
                    if binary_code.code_type == "Control":
                        control_code = binary_code.control_code
                        reasons_code = plan.code_scheme.get_code_with_control_code(control_code)

                        reasons_label = CleaningUtils.make_label_from_cleaner_code(
                            plan.code_scheme, reasons_code,
                            Metadata.get_call_location(), origin_name="Pipeline Code Synchronisation")

                        td.append_data(
                            {plan.coded_field: [reasons_label.to_dict()]},
                            Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))
                    else:
                        assert binary_code.code_type == "Normal"

                        nc_label = CleaningUtils.make_label_from_cleaner_code(
                            plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                            Metadata.get_call_location(), origin_name="Pipeline Code Synchronisation")

                        td.append_data(
                            {plan.coded_field: [nc_label.to_dict()]},
                            Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

    # Not everyone will have answered all of the demographic flows.
    # Label demographic questions which had no responses as TRUE_MISSING.
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if td.get(plan.raw_field, "") == "":
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                    Metadata.get_call_location())
                missing_dict[plan.coded_field] = na_label.to_dict()
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    return data
def move_wrong_scheme_messages(user, data, coda_input_dir):
    log.info("Importing manually coded Coda files to '_WS' fields...")
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.coda_filename is None:
            continue

        for td in data:
            if plan.raw_field in td:
                td.append_data({f"{plan.id_field}_WS": plan.message_id_fn(td)},
                               Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

        with open(f"{coda_input_dir}/{plan.coda_filename}") as f:
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                user, data, f"{plan.id_field}_WS",
                {f"{plan.raw_field}_WS_correct_dataset": PipelineConfiguration.WS_CORRECT_DATASET_SCHEME}, f)

        for cc in plan.coding_configurations:
            with open(f"{coda_input_dir}/{plan.coda_filename}") as f:
                if cc.coding_mode == CodingModes.SINGLE:
                    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                        user, data, plan.id_field + "_WS",
                        {f"{cc.coded_field}_WS": cc.code_scheme}, f)
                else:
                    assert cc.coding_mode == CodingModes.MULTIPLE
                    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                        user, data, f"{plan.id_field}_WS",
                        {f"{cc.coded_field}_WS": cc.code_scheme}, f)

    log.info("Checking for WS Coding Errors...")
    # Check for coding errors
    for td in data:
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            rqa_codes = []
            for cc in plan.coding_configurations:
                if cc.coding_mode == CodingModes.SINGLE:
                    if f"{cc.coded_field}_WS" in td:
                        label = td[f"{cc.coded_field}_WS"]
                        rqa_codes.append(cc.code_scheme.get_code_with_code_id(label["CodeID"]))
                else:
                    assert cc.coding_mode == CodingModes.MULTIPLE
                    for label in td.get(f"{cc.coded_field}_WS", []):
                        rqa_codes.append(cc.code_scheme.get_code_with_code_id(label["CodeID"]))

            has_ws_code_in_code_scheme = False
            for code in rqa_codes:
                if code.control_code == Codes.WRONG_SCHEME:
                    has_ws_code_in_code_scheme = True

            has_ws_code_in_ws_scheme = False
            if f"{plan.raw_field}_WS_correct_dataset" in td:
                ws_code = PipelineConfiguration.WS_CORRECT_DATASET_SCHEME.get_code_with_code_id(
                    td[f"{plan.raw_field}_WS_correct_dataset"]["CodeID"])
                has_ws_code_in_ws_scheme = ws_code.code_type == "Normal" or ws_code.control_code == Codes.NOT_CODED

            if has_ws_code_in_code_scheme != has_ws_code_in_ws_scheme:
                log.warning(f"Coding Error: {plan.raw_field}: {td[plan.raw_field]}")
                coding_error_dict = {
                    f"{plan.raw_field}_WS_correct_dataset":
                        CleaningUtils.make_label_from_cleaner_code(
                            PipelineConfiguration.WS_CORRECT_DATASET_SCHEME,
                            PipelineConfiguration.WS_CORRECT_DATASET_SCHEME.get_code_with_control_code(
                                Codes.CODING_ERROR),
                            Metadata.get_call_location(),
                        ).to_dict()
                }
                td.append_data(coding_error_dict,
                               Metadata(user, Metadata.get_call_location(), time.time()))

    # Construct a map from WS normal code id to the raw field that code indicates a requested move to.
    ws_code_to_raw_field_map = dict()
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.ws_code is not None:
            ws_code_to_raw_field_map[plan.ws_code.code_id] = plan.raw_field

    # Group the TracedData by uid.
    data_grouped_by_uid = dict()
    for td in data:
        uid = td["uid"]
        if uid not in data_grouped_by_uid:
            data_grouped_by_uid[uid] = []
        data_grouped_by_uid[uid].append(td)

    # Perform the WS correction for each uid.
    log.info("Performing WS correction...")
    corrected_data = []  # List of TracedData with the WS data moved.
    # 'WS - Correct Dataset' codes with no matching code id in any coding plan for this project,
    # with a count of the occurrences
    unknown_target_code_counts = dict()
    for group in data_grouped_by_uid.values():
        # Find all the surveys data being moved.
        # (Note: we only need to check one td in this group because all the demographics are the same)
        td = group[0]
        survey_moves = dict()  # of source_field -> target_field
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.raw_field not in td or plan.coda_filename is None:
                continue

            ws_code = PipelineConfiguration.WS_CORRECT_DATASET_SCHEME.get_code_with_code_id(
                td[f"{plan.raw_field}_WS_correct_dataset"]["CodeID"])
            if ws_code.code_type == "Normal" or ws_code.control_code == Codes.NOT_CODED:
                if ws_code.code_id in ws_code_to_raw_field_map:
                    survey_moves[plan.raw_field] = ws_code_to_raw_field_map[ws_code.code_id]
                else:
                    if (ws_code.code_id, ws_code.display_text) not in unknown_target_code_counts:
                        unknown_target_code_counts[(ws_code.code_id, ws_code.display_text)] = 0
                    unknown_target_code_counts[(ws_code.code_id, ws_code.display_text)] += 1
                    survey_moves[plan.raw_field] = None

        # Find all the RQA data being moved.
        rqa_moves = dict()  # of (index in group, source_field) -> target_field
        for i, td in enumerate(group):
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                if plan.raw_field not in td or plan.coda_filename is None:
                    continue

                ws_code = PipelineConfiguration.WS_CORRECT_DATASET_SCHEME.get_code_with_code_id(
                    td[f"{plan.raw_field}_WS_correct_dataset"]["CodeID"])
                if ws_code.code_type == "Normal" or ws_code.control_code == Codes.NOT_CODED:
                    if ws_code.code_id in ws_code_to_raw_field_map:
                        rqa_moves[(i, plan.raw_field)] = ws_code_to_raw_field_map[ws_code.code_id]
                    else:
                        if (ws_code.code_id, ws_code.display_text) not in unknown_target_code_counts:
                            unknown_target_code_counts[(ws_code.code_id, ws_code.display_text)] = 0
                        unknown_target_code_counts[(ws_code.code_id, ws_code.display_text)] += 1
                        rqa_moves[(i, plan.raw_field)] = None

        # Build a dictionary of the survey fields that haven't been moved, and cleared fields for those which have.
        survey_updates = dict()  # of raw_field -> updated value
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.coda_filename is None:
                continue

            if plan.raw_field in survey_moves.keys():
                # Data is moving
                survey_updates[plan.raw_field] = []
            elif plan.raw_field in td:
                # Data is not moving
                survey_updates[plan.raw_field] = [
                    _WSUpdate(td[plan.raw_field], td[plan.time_field], plan.raw_field, td)]

        # Build a list of the rqa fields that haven't been moved.
        rqa_updates = []  # of (raw_field, _WSUpdate)
        for i, td in enumerate(group):
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                if plan.coda_filename is None:
                    continue

                if plan.raw_field in td:
                    if (i, plan.raw_field) in rqa_moves.keys():
                        # Data is moving
                        pass
                    else:
                        # Data is not moving
                        rqa_updates.append(
                            (plan.raw_field,
                             _WSUpdate(td[plan.raw_field], td[plan.time_field], plan.raw_field, td)))

        # Add data moving from survey fields to the relevant survey_/rqa_updates
        raw_survey_fields = {plan.raw_field for plan in PipelineConfiguration.SURVEY_CODING_PLANS}
        raw_rqa_fields = {plan.raw_field for plan in PipelineConfiguration.RQA_CODING_PLANS}
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS + PipelineConfiguration.RQA_CODING_PLANS:
            if plan.raw_field not in survey_moves:
                continue

            target_field = survey_moves[plan.raw_field]
            if target_field is None:
                continue

            update = _WSUpdate(td[plan.raw_field], td[plan.time_field], plan.raw_field, td)
            if target_field in raw_survey_fields:
                survey_updates[target_field] = survey_updates.get(target_field, []) + [update]
            else:
                assert target_field in raw_rqa_fields, f"Raw field '{target_field}' not in any coding plan"
                rqa_updates.append((target_field, update))

        # Add data moving from RQA fields to the relevant survey_/rqa_updates
        for (i, source_field), target_field in rqa_moves.items():
            if target_field is None:
                continue

            for plan in PipelineConfiguration.SURVEY_CODING_PLANS + PipelineConfiguration.RQA_CODING_PLANS:
                if plan.raw_field == source_field:
                    _td = group[i]
                    update = _WSUpdate(_td[plan.raw_field], _td[plan.time_field], plan.raw_field, td)
                    if target_field in raw_survey_fields:
                        survey_updates[target_field] = survey_updates.get(target_field, []) + [update]
                    else:
                        assert target_field in raw_rqa_fields, f"Raw field '{target_field}' not in any coding plan"
                        rqa_updates.append((target_field, update))

        # Re-format the survey updates to a form suitable for use by the rest of the pipeline
        flattened_survey_updates = {}
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.raw_field in survey_updates:
                plan_updates = survey_updates[plan.raw_field]

                if len(plan_updates) > 0:
                    flattened_survey_updates[plan.raw_field] = "; ".join([u.message for u in plan_updates])
                    flattened_survey_updates[plan.time_field] = sorted([u.timestamp for u in plan_updates])[0]
                    flattened_survey_updates[f"{plan.raw_field}_source"] = \
                        "; ".join([u.source_field for u in plan_updates])
                else:
                    flattened_survey_updates[plan.raw_field] = None
                    flattened_survey_updates[plan.time_field] = None
                    flattened_survey_updates[f"{plan.raw_field}_source"] = None

        # For each RQA message, create a copy of its source td, append the updated TracedData, and add this to
        # the list of TracedData to be returned
        raw_field_to_rqa_plan_map = {plan.raw_field: plan for plan in PipelineConfiguration.RQA_CODING_PLANS}
        for target_field, update in rqa_updates:
            corrected_td = update.source_td.copy()

            # Hide the survey keys currently in the TracedData which have had data moved away.
            corrected_td.hide_keys({k for k, v in flattened_survey_updates.items() if v is None}
                                   .intersection(corrected_td.keys()),
                                   Metadata(user, Metadata.get_call_location(), time.time()))

            # Update with the corrected survey data
            corrected_td.append_data({k: v for k, v in flattened_survey_updates.items() if v is not None},
                                     Metadata(user, Metadata.get_call_location(), time.time()))

            # Hide all the RQA fields (they will be added back, in turn, in the next step).
            corrected_td.hide_keys({plan.raw_field for plan in PipelineConfiguration.RQA_CODING_PLANS}
                                   .intersection(corrected_td.keys()),
                                   Metadata(user, Metadata.get_call_location(), time.time()))
            corrected_td.hide_keys({plan.time_field for plan in PipelineConfiguration.RQA_CODING_PLANS}
                                   .intersection(corrected_td.keys()),
                                   Metadata(user, Metadata.get_call_location(), time.time()))

            target_coding_plan = raw_field_to_rqa_plan_map[target_field]
            rqa_dict = {
                target_field: update.message,
                target_coding_plan.time_field: update.timestamp,
                f"{target_field}_source": update.source_field
            }
            corrected_td.append_data(rqa_dict, Metadata(user, Metadata.get_call_location(), time.time()))

            corrected_data.append(corrected_td)

    if len(unknown_target_code_counts) > 0:
        log.warning("Found the following 'WS - Correct Dataset' CodeIDs with no matching coding plan:")
        for (code_id, display_text), count in unknown_target_code_counts.items():
            log.warning(f"  '{code_id}' (DisplayText '{display_text}') ({count} occurrences)")

    return corrected_data
def fetch_from_rapid_pro(user, google_cloud_credentials_file_path, raw_data_dir, phone_number_uuid_table,
                         rapid_pro_source):
    log.info("Fetching data from Rapid Pro...")

    log.info("Downloading Rapid Pro access token...")
    rapid_pro_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, rapid_pro_source.token_file_url).strip()
    rapid_pro = RapidProClient(rapid_pro_source.domain, rapid_pro_token)

    # Load the previous export of contacts if it exists, otherwise fetch all contacts from Rapid Pro.
    raw_contacts_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_raw.json"
    contacts_log_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_log.jsonl"
    try:
        log.info(f"Loading raw contacts from file '{raw_contacts_path}'...")
        with open(raw_contacts_path) as raw_contacts_file:
            raw_contacts = [Contact.deserialize(contact_json) for contact_json in json.load(raw_contacts_file)]
        log.info(f"Loaded {len(raw_contacts)} contacts")
    except FileNotFoundError:
        log.info(f"File '{raw_contacts_path}' not found, will fetch all contacts from the Rapid Pro server")
        with open(contacts_log_path, "a") as contacts_log_file:
            raw_contacts = rapid_pro.get_raw_contacts(raw_export_log_file=contacts_log_file)

    # Download all the runs for each of the radio shows
    for flow in rapid_pro_source.activation_flow_names + rapid_pro_source.survey_flow_names:
        runs_log_path = f"{raw_data_dir}/{flow}_log.jsonl"
        raw_runs_path = f"{raw_data_dir}/{flow}_raw.json"
        traced_runs_output_path = f"{raw_data_dir}/{flow}.jsonl"
        log.info(f"Exporting flow '{flow}' to '{traced_runs_output_path}'...")

        flow_id = rapid_pro.get_flow_id(flow)

        # Load the previous export of runs for this flow, and update them with the newest runs.
        # If there is no previous export for this flow, fetch all the runs from Rapid Pro.
        with open(runs_log_path, "a") as raw_runs_log_file:
            try:
                log.info(f"Loading raw runs from file '{raw_runs_path}'...")
                with open(raw_runs_path) as raw_runs_file:
                    raw_runs = [Run.deserialize(run_json) for run_json in json.load(raw_runs_file)]
                log.info(f"Loaded {len(raw_runs)} runs")
                raw_runs = rapid_pro.update_raw_runs_with_latest_modified(
                    flow_id, raw_runs, raw_export_log_file=raw_runs_log_file, ignore_archives=True)
            except FileNotFoundError:
                log.info(f"File '{raw_runs_path}' not found, will fetch all runs from the Rapid Pro server "
                         f"for flow '{flow}'")
                raw_runs = rapid_pro.get_raw_runs_for_flow_id(flow_id, raw_export_log_file=raw_runs_log_file)

        # Fetch the latest contacts from Rapid Pro.
        with open(contacts_log_path, "a") as raw_contacts_log_file:
            raw_contacts = rapid_pro.update_raw_contacts_with_latest_modified(
                raw_contacts, raw_export_log_file=raw_contacts_log_file)

        # Convert the runs to TracedData.
        traced_runs = rapid_pro.convert_runs_to_traced_data(
            user, raw_runs, raw_contacts, phone_number_uuid_table, rapid_pro_source.test_contact_uuids)

        if flow in rapid_pro_source.activation_flow_names:
            # Append the Rapid Pro source name to each run.
            # Only do this for activation flows because this is the only place where this is interesting.
            # Also, demogs may come from either instance, which causes problems downstream.
            for td in traced_runs:
                td.append_data({
                    "source_raw": rapid_pro_source.source_name,
                    "source_coded": CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.SOURCE,
                        CodeSchemes.SOURCE.get_code_with_match_value(rapid_pro_source.source_name),
                        Metadata.get_call_location()
                    ).to_dict()
                }, Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

        log.info(f"Saving {len(raw_runs)} raw runs to {raw_runs_path}...")
        with open(raw_runs_path, "w") as raw_runs_file:
            json.dump([run.serialize() for run in raw_runs], raw_runs_file)
        log.info(f"Saved {len(raw_runs)} raw runs")

        log.info(f"Saving {len(traced_runs)} traced runs to {traced_runs_output_path}...")
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
        with open(traced_runs_output_path, "w") as traced_runs_output_file:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(traced_runs, traced_runs_output_file)
        log.info(f"Saved {len(traced_runs)} traced runs")

    log.info(f"Saving {len(raw_contacts)} raw contacts to file '{raw_contacts_path}'...")
    with open(raw_contacts_path, "w") as raw_contacts_file:
        json.dump([contact.serialize() for contact in raw_contacts], raw_contacts_file)
    log.info(f"Saved {len(raw_contacts)} contacts")