def auto_code_surveys(cls, user, data, coda_output_dir):
    # Auto-code surveys
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        for cc in plan.coding_configurations:
            if cc.cleaner is not None:
                CleaningUtils.apply_cleaner_to_traced_data_iterable(
                    user, data, plan.raw_field, cc.coded_field, cc.cleaner, cc.code_scheme)

    # Output single-scheme answers to coda for manual verification + coding
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

        coda_output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(coda_output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, plan.time_field, plan.id_field,
                {cc.coded_field: cc.code_scheme for cc in plan.coding_configurations}, f)

    # Note: no need to handle location in any special way on this project because it is not being auto-coded

    return data
def test_ensure_dirs_exist(self):
    IOUtils.ensure_dirs_exist(path.join(self.test_dir, "a/b/c"))
    self.assertTrue(path.exists(path.join(self.test_dir, "a/b/c")))

    IOUtils.ensure_dirs_exist(path.join(self.test_dir, "a/b/d"))
    self.assertTrue(path.exists(path.join(self.test_dir, "a/b/c")))
    self.assertTrue(path.exists(path.join(self.test_dir, "a/b/d")))
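# A minimal sketch of what IOUtils.ensure_dirs_exist presumably does, assuming it
# simply wraps os.makedirs (the real CoreDataModules implementation may differ):
import os

def ensure_dirs_exist(dir_path):
    # Create dir_path and any missing parent directories; repeated calls with the
    # same or overlapping paths (as in the test above) are safe no-ops.
    os.makedirs(dir_path, exist_ok=True)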
def auto_code_surveys(cls, user, data, phone_uuid_table, coda_output_dir):
    # Label missing data
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.DEMOGS_CODING_PLANS:
            if td.get(plan.raw_field, "") == "":
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme,
                    plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                    Metadata.get_call_location()
                )
                missing_dict[plan.coded_field] = na_label.to_dict()
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Auto-code remaining data
    for plan in PipelineConfiguration.DEMOGS_CODING_PLANS:
        if plan.cleaner is not None:
            CleaningUtils.apply_cleaner_to_traced_data_iterable(user, data, plan.raw_field, plan.coded_field,
                                                                plan.cleaner, plan.code_scheme)

    # Output survey answers to coda for manual verification + coding
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.DEMOGS_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

        coda_output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(coda_output_path, 'w') as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, plan.time_field, plan.id_field,
                {plan.coded_field: plan.code_scheme}, f
            )
    print("Coda demogs files successfully exported")

    return data
def fetch_from_recovery_csv(user, google_cloud_credentials_file_path, raw_data_dir, phone_number_uuid_table,
                            recovery_csv_source):
    log.info("Fetching data from a Recovery CSV...")
    for blob_url in recovery_csv_source.activation_flow_urls + recovery_csv_source.survey_flow_urls:
        flow_name = blob_url.split('/')[-1].split('.')[0]  # Takes the name between the last '/' and the '.csv' ending
        traced_runs_output_path = f"{raw_data_dir}/{flow_name}.jsonl"
        if os.path.exists(traced_runs_output_path):
            log.info(f"File '{traced_runs_output_path}' for blob '{blob_url}' already exists; skipping download")
            continue

        log.info(f"Downloading recovered data from '{blob_url}'...")
        raw_csv_string = StringIO(google_cloud_utils.download_blob_to_string(
            google_cloud_credentials_file_path, blob_url))
        raw_data = list(csv.DictReader(raw_csv_string))
        log.info(f"Downloaded {len(raw_data)} recovered messages")

        log.info("Converting the recovered messages to TracedData...")
        traced_runs = []
        for i, row in enumerate(raw_data):
            raw_date = row["ReceivedOn"]
            if len(raw_date) == len("dd/mm/YYYY HH:MM"):
                parsed_raw_date = datetime.strptime(raw_date, "%d/%m/%Y %H:%M")
            else:
                parsed_raw_date = datetime.strptime(raw_date, "%d/%m/%Y %H:%M:%S")
            localized_date = pytz.timezone("Africa/Mogadishu").localize(parsed_raw_date)

            assert row["Sender"].startswith("avf-phone-uuid-"), \
                f"The 'Sender' column for '{blob_url}' contains an item that has not been de-identified " \
                f"into Africa's Voices Foundation's de-identification format. This may be done with de_identify_csv.py."

            d = {
                "avf_phone_id": row["Sender"],
                "message": row["Message"],
                "received_on": localized_date.isoformat(),
                "run_id": SHAUtils.sha_dict(row)
            }

            traced_runs.append(
                TracedData(d, Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())))
        log.info("Converted the recovered messages to TracedData")

        log.info(f"Exporting {len(traced_runs)} TracedData items to {traced_runs_output_path}...")
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
        with open(traced_runs_output_path, "w") as f:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(traced_runs, f)
        log.info(f"Exported TracedData")
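# Standalone illustration of the "ReceivedOn" parsing above, with hypothetical values:
# a 16-character timestamp has no seconds component, a 19-character one does, which is
# why comparing against len("dd/mm/YYYY HH:MM") is enough to pick the format.
from datetime import datetime

for raw_date in ["03/01/2020 14:25", "03/01/2020 14:25:41"]:
    fmt = "%d/%m/%Y %H:%M" if len(raw_date) == len("dd/mm/YYYY HH:MM") else "%d/%m/%Y %H:%M:%S"
    print(datetime.strptime(raw_date, fmt).isoformat())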
def auto_code_show_messages(cls, user, data, icr_output_dir, coda_output_dir):
    # Filter out test messages sent by AVF
    if not PipelineConfiguration.DEV_MODE:
        data = MessageFilters.filter_test_messages(data)

    # Filter for runs which don't contain a response to any week's question
    data = MessageFilters.filter_empty_messages(data, cls.RQA_KEYS)

    # Filter out runs sent outwith the project start and end dates
    data = MessageFilters.filter_time_range(data, cls.SENT_ON_KEY, PipelineConfiguration.PROJECT_START_DATE,
                                            PipelineConfiguration.PROJECT_END_DATE)

    # Label each message with channel keys
    Channels.set_channel_keys(user, data, cls.SENT_ON_KEY)

    # Output RQA and follow-up survey messages to Coda
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

        output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, cls.SENT_ON_KEY, plan.id_field, {}, f)

    # Output RQA and follow-up messages for ICR
    IOUtils.ensure_dirs_exist(icr_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
        rqa_and_follow_up_messages = []
        # This test works because the only codes which have been applied at this point are TRUE_MISSING.
        # If any other coding is done above, this test will need to change.
        for td in data:
            if plan.raw_field in td:
                rqa_and_follow_up_messages.append(td)

        icr_messages = ICRTools.generate_sample_for_icr(
            rqa_and_follow_up_messages, cls.ICR_MESSAGES_COUNT, random.Random(cls.ICR_SEED))

        icr_output_path = path.join(icr_output_dir, plan.icr_filename)
        with open(icr_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                icr_messages, f, headers=[plan.run_id_field, plan.raw_field])

    return data
def export_coda(cls, user, data, coda_output_dir):
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.coda_filename is None:
            continue

        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

        coda_output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(coda_output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, plan.time_field, plan.id_field,
                {cc.coded_field: cc.code_scheme for cc in plan.coding_configurations}, f)
def export_icr(cls, data, icr_output_dir):
    # Output messages for ICR
    IOUtils.ensure_dirs_exist(icr_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        rqa_messages = []
        for td in data:
            if plan.raw_field in td:
                rqa_messages.append(td)

        icr_messages = ICRTools.generate_sample_for_icr(
            rqa_messages, cls.ICR_MESSAGES_COUNT, random.Random(cls.ICR_SEED))

        icr_output_path = path.join(icr_output_dir, plan.icr_filename)
        with open(icr_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                icr_messages, f, headers=[plan.run_id_field, plan.raw_field])
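# For context, ICRTools.generate_sample_for_icr is passed a seeded random.Random so the
# ICR sample is reproducible across pipeline runs. A minimal sketch of that idea follows;
# sample_for_icr is a hypothetical stand-in, not the CoreDataModules implementation.
import random

def sample_for_icr(messages, sample_size, rng):
    # The same seed always yields the same subset of messages.
    return rng.sample(messages, min(sample_size, len(messages)))

messages = [f"message {i}" for i in range(100)]  # stand-in data
assert sample_for_icr(messages, 10, random.Random(0)) == sample_for_icr(messages, 10, random.Random(0))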
def export_coda(cls, user, data, coda_output_dir):
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.coda_filename is None:
            continue

        for td in data:
            if plan.raw_field in td:
                td.append_data({plan.id_field: plan.message_id_fn(td)},
                               Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

        coda_output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(coda_output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, plan.time_field, plan.id_field,
                {cc.coded_field: cc.code_scheme for cc in plan.coding_configurations}, f)
def auto_code_surveys(cls, user, data, icr_output_dir, coda_output_dir):
    # Auto-code surveys
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.cleaner is not None:
            CleaningUtils.apply_cleaner_to_traced_data_iterable(
                user, data, plan.raw_field, plan.coded_field, plan.cleaner, plan.code_scheme)

    # Output single-scheme answers to coda for manual verification + coding
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

        coda_output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(coda_output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, plan.time_field, plan.id_field,
                {plan.coded_field: plan.code_scheme}, f)

    # Output messages for ICR
    IOUtils.ensure_dirs_exist(icr_output_dir)
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        rqa_messages = []
        for td in data:
            if plan.raw_field in td:
                rqa_messages.append(td)

        icr_messages = ICRTools.generate_sample_for_icr(
            rqa_messages, cls.ICR_MESSAGES_COUNT, random.Random(cls.ICR_SEED))

        icr_output_path = path.join(icr_output_dir, plan.icr_filename)
        with open(icr_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                icr_messages, f, headers=[plan.run_id_field, plan.raw_field])

    return data
def test_ensure_dirs_exist_for_file(self):
    IOUtils.ensure_dirs_exist_for_file(path.join(self.test_dir, "x/y/test.txt"))
    self.assertTrue(path.exists(path.join(self.test_dir, "x/y")))
    self.assertFalse(path.exists(path.join(self.test_dir, "x/y/test.txt")))

    # Test method doesn't fail if no parent directories provided
    IOUtils.ensure_dirs_exist_for_file(path.join(self.test_dir, "test.txt"))
    IOUtils.ensure_dirs_exist_for_file("test.txt")
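# A matching sketch for IOUtils.ensure_dirs_exist_for_file, again assuming a thin
# os.makedirs wrapper rather than the actual CoreDataModules source:
import os

def ensure_dirs_exist_for_file(file_path):
    # Create the parent directories of file_path, but never the file itself.
    # If file_path has no directory component (e.g. "test.txt"), there is nothing
    # to create, which is why the test above expects those calls not to fail.
    parent_dir = os.path.dirname(file_path)
    if parent_dir != "":
        os.makedirs(parent_dir, exist_ok=True)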
data = ProductionFile.generate(data, production_csv_output_path)

log.info("Auto Coding Surveys...")
data = AutoCodeSurveys.auto_code_surveys(user, data, phone_number_uuid_table, coded_dir_path)

log.info("Applying Manual Codes from Coda...")
data = ApplyManualCodes.apply_manual_codes(user, data, prev_coded_dir_path)

log.info("Generating Analysis CSVs...")
data = AnalysisFile.generate(user, data, csv_by_message_output_path, csv_by_individual_output_path)

log.info("Writing TracedData to file...")
IOUtils.ensure_dirs_exist_for_file(json_output_path)
with open(json_output_path, "w") as f:
    TracedDataJsonIO.export_traced_data_iterable_to_json(data, f, pretty_print=True)

# Upload to Google Drive, if requested.
# Note: This should happen as late as possible in order to reduce the risk of the remainder of the pipeline failing
# after a Drive upload has occurred. Failures could result in inconsistent outputs or outputs with no
# traced data log.
if pipeline_configuration.drive_upload is not None:
    log.info("Uploading CSVs to Google Drive...")

    production_csv_drive_dir = os.path.dirname(pipeline_configuration.drive_upload.production_upload_path)
    production_csv_drive_file_name = os.path.basename(
user = args.user
phone_uuid_path = args.phone_uuid_table_path
demog_dataset_path = args.demog_dataset_path
json_output_path = args.json_output_path

with open(phone_uuid_path, "r") as f:
    phone_uuids = PhoneNumberUuidTable.load(f)

with open(demog_dataset_path, "r") as f:
    traced_demog = TracedDataCSVIO.import_csv_to_traced_data_iterable(user, f)
    traced_demog = list(traced_demog)

for td in traced_demog:
    uuid_dict = {"avf_phone_id": phone_uuids.add_phone(td["final_phone"])}
    td.append_data(uuid_dict, Metadata(user, Metadata.get_call_location(), time.time()))

# Write the UUIDs out to a file
with open(phone_uuid_path, "w") as f:
    phone_uuids.dump(f)

# Output TracedData to JSON.
IOUtils.ensure_dirs_exist_for_file(json_output_path)
with open(json_output_path, "w") as f:
    TracedDataJsonIO.export_traced_data_iterable_to_json(traced_demog, f, pretty_print=True)
parser.add_argument("individuals_json_input_path", metavar="individuals-json-input-path", help="Path to a JSONL file to read the TracedData of the messages data from") parser.add_argument("output_dir", metavar="output-dir", help="Directory to write the output graphs to") args = parser.parse_args() user = args.user google_cloud_credentials_file_path = args.google_cloud_credentials_file_path pipeline_configuration_file_path = args.pipeline_configuration_file_path messages_json_input_path = args.messages_json_input_path individuals_json_input_path = args.individuals_json_input_path output_dir = args.output_dir IOUtils.ensure_dirs_exist(output_dir) log.info("Loading Pipeline Configuration File...") with open(pipeline_configuration_file_path) as f: pipeline_configuration = PipelineConfiguration.from_configuration_file(f) Logger.set_project_name(pipeline_configuration.pipeline_name) log.debug(f"Pipeline name is {pipeline_configuration.pipeline_name}") if pipeline_configuration.drive_upload is not None: log.info(f"Downloading Google Drive service account credentials...") credentials_info = json.loads(google_cloud_utils.download_blob_to_string( google_cloud_credentials_file_path, pipeline_configuration.drive_upload.drive_credentials_file_url)) drive_client_wrapper.init_client_from_info(credentials_info) # Read the messages dataset log.info(f"Loading the messages dataset from {messages_json_input_path}...")
def auto_code_surveys(cls, user, data, pipeline_configuration, coda_output_dir):
    # Auto-code surveys
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        for cc in plan.coding_configurations:
            if cc.cleaner is not None:
                CleaningUtils.apply_cleaner_to_traced_data_iterable(
                    user, data, plan.raw_field, cc.coded_field, cc.cleaner, cc.code_scheme)

    # Remove survey data sent after the project finished
    log.info("Hiding survey messages sent after the end of the project. These will not be exported in "
             "production/analysis files")
    out_of_range_count = 0
    for td in data:
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            # TODO: Come up with a better solution here e.g. separate DEMOG/SURVEY lists
            if plan.raw_field in ["have_voice_raw", "suggestions_raw"]:
                continue
            if plan.time_field in td and isoparse(td[plan.time_field]) > pipeline_configuration.project_end_date:
                out_of_range_count += 1
                td.hide_keys({plan.raw_field, plan.time_field},
                             Metadata(user, Metadata.get_call_location(), time.time()))
    log.info(f"Hid {out_of_range_count} survey messages sent after the end of the project")

    # For any locations where the cleaners assigned a code to a sub district, set the district code to NC
    # (this is because only one column should have a value set in Coda)
    for td in data:
        if "mogadishu_sub_district_coded" in td:
            mogadishu_code_id = td["mogadishu_sub_district_coded"]["CodeID"]
            if CodeSchemes.MOGADISHU_SUB_DISTRICT.get_code_with_id(mogadishu_code_id).code_type == "Normal":
                nc_label = CleaningUtils.make_label_from_cleaner_code(
                    CodeSchemes.MOGADISHU_SUB_DISTRICT,
                    CodeSchemes.MOGADISHU_SUB_DISTRICT.get_code_with_control_code(Codes.NOT_CODED),
                    Metadata.get_call_location(),
                )
                td.append_data({"district_coded": nc_label.to_dict()},
                               Metadata(user, Metadata.get_call_location(), time.time()))

    # Output survey responses to coda for manual verification + coding
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

        coda_output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(coda_output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, plan.time_field, plan.id_field,
                {cc.coded_field: cc.code_scheme for cc in plan.coding_configurations}, f)

    return data
def auto_code_surveys(cls, user, data, phone_uuid_table, coda_output_dir):
    # Label missing data
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if td.get(plan.raw_field, "") == "":
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                    Metadata.get_call_location())
                missing_dict[plan.coded_field] = na_label.to_dict()
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Auto-code remaining data
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.cleaner is not None:
            CleaningUtils.apply_cleaner_to_traced_data_iterable(
                user, data, plan.raw_field, plan.coded_field, plan.cleaner, plan.code_scheme)

    # For any locations where the cleaners assigned a code to a sub district, set the district code to NC
    # (this is because only one column should have a value set in Coda)
    for td in data:
        if "mogadishu_sub_district_coded" in td:
            mogadishu_code_id = td["mogadishu_sub_district_coded"]["CodeID"]
            if CodeSchemes.MOGADISHU_SUB_DISTRICT.get_code_with_id(mogadishu_code_id).code_type == "Normal":
                nc_label = CleaningUtils.make_label_from_cleaner_code(
                    CodeSchemes.MOGADISHU_SUB_DISTRICT,
                    CodeSchemes.MOGADISHU_SUB_DISTRICT.get_code_with_control_code(Codes.NOT_CODED),
                    Metadata.get_call_location(),
                )
                td.append_data({"district_coded": nc_label.to_dict()},
                               Metadata(user, Metadata.get_call_location(), time.time()))

    # Set operator from phone number
    for td in data:
        operator_clean = PhoneCleaner.clean_operator(phone_uuid_table.get_phone(td["uid"]))
        if operator_clean == Codes.NOT_CODED:
            label = CleaningUtils.make_label_from_cleaner_code(
                CodeSchemes.OPERATOR, CodeSchemes.OPERATOR.get_code_with_control_code(Codes.NOT_CODED),
                Metadata.get_call_location())
        else:
            label = CleaningUtils.make_label_from_cleaner_code(
                CodeSchemes.OPERATOR, CodeSchemes.OPERATOR.get_code_with_match_value(operator_clean),
                Metadata.get_call_location())
        td.append_data({"operator_coded": label.to_dict()},
                       Metadata(user, Metadata.get_call_location(), time.time()))

    # Output single-scheme answers to coda for manual verification + coding
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.raw_field == "mogadishu_sub_district_raw":
            continue

        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

        coda_output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(coda_output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, plan.time_field, plan.id_field,
                {plan.coded_field: plan.code_scheme}, f)

    # Output location scheme to coda for manual verification + coding
    output_path = path.join(coda_output_dir, "location.json")
    TracedDataCodaV2IO.compute_message_ids(user, data, "mogadishu_sub_district_raw", "mogadishu_sub_district_raw_id")
    with open(output_path, "w") as f:
        TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
            data, "mogadishu_sub_district_raw", "mogadishu_sub_district_time", "mogadishu_sub_district_raw_id",
            {
                "mogadishu_sub_district_coded": CodeSchemes.MOGADISHU_SUB_DISTRICT,
                "district_coded": CodeSchemes.DISTRICT,
                "region_coded": CodeSchemes.REGION,
                "state_coded": CodeSchemes.STATE,
                "zone_coded": CodeSchemes.ZONE
            }, f)

    return data
def fetch_from_rapid_pro(user, google_cloud_credentials_file_path, raw_data_dir, phone_number_uuid_table,
                         rapid_pro_source):
    log.info("Fetching data from Rapid Pro...")
    log.info("Downloading Rapid Pro access token...")
    rapid_pro_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, rapid_pro_source.token_file_url).strip()
    rapid_pro = RapidProClient(rapid_pro_source.domain, rapid_pro_token)

    # Load the previous export of contacts if it exists, otherwise fetch all contacts from Rapid Pro.
    raw_contacts_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_raw.json"
    contacts_log_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_log.jsonl"
    try:
        log.info(f"Loading raw contacts from file '{raw_contacts_path}'...")
        with open(raw_contacts_path) as raw_contacts_file:
            raw_contacts = [Contact.deserialize(contact_json) for contact_json in json.load(raw_contacts_file)]
        log.info(f"Loaded {len(raw_contacts)} contacts")
    except FileNotFoundError:
        log.info(f"File '{raw_contacts_path}' not found, will fetch all contacts from the Rapid Pro server")
        with open(contacts_log_path, "a") as contacts_log_file:
            raw_contacts = rapid_pro.get_raw_contacts(raw_export_log_file=contacts_log_file)

    # Download all the runs for each of the radio shows
    for flow in rapid_pro_source.activation_flow_names + rapid_pro_source.survey_flow_names:
        runs_log_path = f"{raw_data_dir}/{flow}_log.jsonl"
        raw_runs_path = f"{raw_data_dir}/{flow}_raw.json"
        traced_runs_output_path = f"{raw_data_dir}/{flow}.jsonl"
        log.info(f"Exporting flow '{flow}' to '{traced_runs_output_path}'...")

        flow_id = rapid_pro.get_flow_id(flow)

        # Load the previous export of runs for this flow, and update them with the newest runs.
        # If there is no previous export for this flow, fetch all the runs from Rapid Pro.
        with open(runs_log_path, "a") as raw_runs_log_file:
            try:
                log.info(f"Loading raw runs from file '{raw_runs_path}'...")
                with open(raw_runs_path) as raw_runs_file:
                    raw_runs = [Run.deserialize(run_json) for run_json in json.load(raw_runs_file)]
                log.info(f"Loaded {len(raw_runs)} runs")
                raw_runs = rapid_pro.update_raw_runs_with_latest_modified(
                    flow_id, raw_runs, raw_export_log_file=raw_runs_log_file, ignore_archives=True)
            except FileNotFoundError:
                log.info(f"File '{raw_runs_path}' not found, will fetch all runs from the Rapid Pro server "
                         f"for flow '{flow}'")
                raw_runs = rapid_pro.get_raw_runs_for_flow_id(flow_id, raw_export_log_file=raw_runs_log_file)

        # Fetch the latest contacts from Rapid Pro.
        with open(contacts_log_path, "a") as raw_contacts_log_file:
            raw_contacts = rapid_pro.update_raw_contacts_with_latest_modified(
                raw_contacts, raw_export_log_file=raw_contacts_log_file)

        # Convert the runs to TracedData.
        traced_runs = rapid_pro.convert_runs_to_traced_data(
            user, raw_runs, raw_contacts, phone_number_uuid_table, rapid_pro_source.test_contact_uuids)

        if flow in rapid_pro_source.activation_flow_names:
            # Append the Rapid Pro source name to each run.
            # Only do this for activation flows because this is the only place where this is interesting.
            # Also, demogs may come from either instance, which causes problems downstream.
            for td in traced_runs:
                td.append_data({
                    "source_raw": rapid_pro_source.source_name,
                    "source_coded": CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.SOURCE,
                        CodeSchemes.SOURCE.get_code_with_match_value(rapid_pro_source.source_name),
                        Metadata.get_call_location()
                    ).to_dict()
                }, Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

        log.info(f"Saving {len(raw_runs)} raw runs to {raw_runs_path}...")
        with open(raw_runs_path, "w") as raw_runs_file:
            json.dump([run.serialize() for run in raw_runs], raw_runs_file)
        log.info(f"Saved {len(raw_runs)} raw runs")

        log.info(f"Saving {len(traced_runs)} traced runs to {traced_runs_output_path}...")
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
        with open(traced_runs_output_path, "w") as traced_runs_output_file:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(traced_runs, traced_runs_output_file)
        log.info(f"Saved {len(traced_runs)} traced runs")

    log.info(f"Saving {len(raw_contacts)} raw contacts to file '{raw_contacts_path}'...")
    with open(raw_contacts_path, "w") as raw_contacts_file:
        json.dump([contact.serialize() for contact in raw_contacts], raw_contacts_file)
    log.info(f"Saved {len(raw_contacts)} contacts")
    google_cloud_credentials_file_path, workspace_2_credentials_url).strip()
workspace_2 = RapidProClient(workspace_2_domain, workspace_2_token)
workspace_2_name = workspace_2.get_workspace_name()
log.info(f"Done. workspace 2 is called {workspace_2_name}")

# Download the data from Rapid Pro
log.info("Downloading contact fields...")
log.info(f"Downloading all fields from {workspace_1_name}...")
workspace_1_fields = workspace_1.get_fields()
log.info(f"Downloading all fields from {workspace_2_name}...")
workspace_2_fields = workspace_2.get_fields()

# Synchronise the contacts
log.info("Downloading contacts...")
IOUtils.ensure_dirs_exist(raw_data_log_directory)
log.info(f"Downloading all contacts from {workspace_1_name}...")
with open(f"{raw_data_log_directory}/{workspace_1_name}_raw_contacts.json", "w") as f:
    workspace_1_contacts = workspace_1.get_raw_contacts(raw_export_log_file=f)
log.info(f"Downloading all contacts from {workspace_2_name}...")
with open(f"{raw_data_log_directory}/{workspace_2_name}_raw_contacts.json", "w") as f:
    workspace_2_contacts = workspace_2.get_raw_contacts(raw_export_log_file=f)

# If in dry_run mode, dereference workspace_1 and workspace_2 as an added safety. This prevents accidental
# writes to either workspace.
if dry_run:
    workspace_1 = None
    td.append_data({eat_key: eat_time}, Metadata(user, Metadata.get_call_location(), time.time()))

    if START_TIME <= utc_time <= END_TIME:
        inside_time_window.append(td)
    else:
        print("Dropping: {}".format(utc_time))

print("{}:{} Dropped as outside time/Total".format(
    len(show_messages) - len(inside_time_window), len(show_messages)))
show_messages = inside_time_window

# Output messages to a CSV file
IOUtils.ensure_dirs_exist_for_file(csv_output_path)
run_id_key = "{} (Run ID) - {}".format(variable_name, flow_name)
raw_text_key = "{} (Text) - {}".format(variable_name, flow_name)
with open(csv_output_path, "w") as f:
    TracedDataCSVIO.export_traced_data_iterable_to_csv(
        show_messages, f, headers=["avf_phone_id", run_id_key, raw_text_key])

# Output messages to Coda
IOUtils.ensure_dirs_exist_for_file(coda_output_path)
if os.path.exists(prev_coda_path):
    # TODO: Modifying this line once the coding frame has been developed to include lots of Nones feels a bit
    # TODO: cumbersome. We could instead modify export_traced_data_iterable_to_coda to support a prev_f argument.
    # TODO: Modify by adding code scheme keys once they are ready
    scheme_keys = {
def export_participation_maps(individuals, consent_withdrawn_field, theme_configurations,
                              admin_region_configuration, mapper, file_prefix, export_by_theme=True):
    """
    Computes and exports a map showing participation by administrative region.

    Optionally exports maps showing the participation broken down by theme.

    :param individuals: Individuals to export participation maps for.
    :type individuals: iterable of core_data_modules.traced_data.TracedData
    :param consent_withdrawn_field: Field in each individuals object which records if consent is withdrawn.
    :type consent_withdrawn_field: str
    :param theme_configurations: Configuration for the theme datasets.
    :type theme_configurations: iterable of core_data_modules.analysis.AnalysisConfiguration
    :param admin_region_configuration: Configuration for the administrative region labels, used to count the
                                       engagement by admin region for each map.
    :type admin_region_configuration: core_data_modules.analysis.AnalysisConfiguration
    :param mapper: A function which, given participation frequencies and a file name to export to, renders a map of
                   those frequencies to disk. For standard maps, see the mapper functions provided in
                   `core_data_modules.analysis.mapping`.
    :type mapper: func of (dict of str -> int, str) -> void
    :param file_prefix: The prefix of the path to write the files to, e.g. "/data/maps/mogadishu_"
    :type file_prefix: str
    :param export_by_theme: Whether to export a map of participation for each theme.
    :type export_by_theme: bool
    """
    IOUtils.ensure_dirs_exist_for_file(file_prefix)

    # Export a map showing the total participations
    log.info(f"Exporting map to '{file_prefix}total_participants.png'...")
    region_distributions = theme_distributions.compute_theme_distributions(
        individuals, consent_withdrawn_field, [admin_region_configuration], []
    )[admin_region_configuration.dataset_name]

    total_frequencies = dict()
    for region_code in _normal_codes(admin_region_configuration.code_scheme.codes):
        total_frequencies[region_code.string_value] = \
            region_distributions[region_code.string_value]["Total Participants"]
    mapper(total_frequencies, f"{file_prefix}total_participants.png")

    if not export_by_theme:
        return

    # For each theme_configuration, export:
    #  1. A map showing the totals for individuals relevant to that episode.
    #  2. A map showing the totals for each theme.
    distributions = theme_distributions.compute_theme_distributions(
        individuals, consent_withdrawn_field, theme_configurations, [admin_region_configuration]
    )

    for config in theme_configurations:
        map_index = 1
        log.info(f"Exporting map to '{file_prefix}{config.dataset_name}_{map_index}_total_relevant.png'...")
        config_total_frequencies = dict()
        for region_code in _normal_codes(admin_region_configuration.code_scheme.codes):
            config_total_frequencies[region_code.string_value] = distributions[config.dataset_name][
                "Total Relevant Participants"][f"{admin_region_configuration.dataset_name}:{region_code.string_value}"]
        mapper(config_total_frequencies, f"{file_prefix}{config.dataset_name}_{map_index}_total_relevant.png")

        for theme in _normal_codes(config.code_scheme.codes):
            map_index += 1
            log.info(f"Exporting map to '{file_prefix}{config.dataset_name}_{map_index}_{theme.string_value}.png'...")
            theme_frequencies = dict()
            for region_code in _normal_codes(admin_region_configuration.code_scheme.codes):
                theme_frequencies[region_code.string_value] = distributions[config.dataset_name][theme.string_value][
                    f"{admin_region_configuration.dataset_name}:{region_code.string_value}"]
            mapper(theme_frequencies, f"{file_prefix}{config.dataset_name}_{map_index}_{theme.string_value}.png")
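# A minimal usage sketch for the mapper parameter, grounded in the docstring's
# signature of (dict of str -> int, str) -> void. debug_mapper is hypothetical;
# real pipelines would pass one of the functions in core_data_modules.analysis.mapping.
def debug_mapper(frequencies, file_path):
    # Write region -> participant-count pairs to a text file instead of rendering a map image.
    with open(file_path, "w") as f:
        for region, count in sorted(frequencies.items()):
            f.write(f"{region}: {count}\n")

# export_participation_maps(individuals, "consent_withdrawn", theme_configurations,
#                           admin_region_configuration, debug_mapper, "/data/maps/mogadishu_")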
def auto_code_show_messages(cls, user, data, pipeline_configuration, icr_output_dir, coda_output_dir):
    # Filter out test messages sent by AVF.
    if pipeline_configuration.filter_test_messages:
        data = MessageFilters.filter_test_messages(data)
    else:
        log.debug("Not filtering out test messages (because the pipeline configuration json key "
                  "'FilterTestMessages' was set to false)")

    # Filter for runs which don't contain a response to any week's question
    data = MessageFilters.filter_empty_messages(
        data, [plan.raw_field for plan in PipelineConfiguration.RQA_CODING_PLANS])

    # Filter out runs sent outwith the project start and end dates
    data = MessageFilters.filter_time_range(
        data, cls.SENT_ON_KEY, pipeline_configuration.project_start_date, pipeline_configuration.project_end_date)

    # Skipping auto-assigning noise, as an experiment on this project.
    # If it turns out we need this, uncomment this block.
    # for td in data:
    #     is_noise = True
    #     for rqa_key in cls.RQA_KEYS:
    #         if rqa_key in td and not somali.DemographicCleaner.is_noise(td[rqa_key], min_length=10):
    #             is_noise = False
    #     td.append_data({cls.NOISE_KEY: is_noise}, Metadata(user, Metadata.get_call_location(), time.time()))

    # TODO: Label each message with channel keys
    # Channels.set_channel_keys(user, data, cls.SENT_ON_KEY,
    #                           pipeline_configuration.project_start_date, pipeline_configuration.project_end_date)

    # Filter for messages which aren't noise (in order to export to Coda and export for ICR)
    not_noise = MessageFilters.filter_noise(data, cls.NOISE_KEY, lambda x: x)

    # Compute the number of RQA messages that were the empty string
    log.debug("Counting the number of empty string messages for each raw radio show field...")
    raw_rqa_fields = []
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        if plan.raw_field not in raw_rqa_fields:
            raw_rqa_fields.append(plan.raw_field)
    cls.log_empty_string_stats(data, raw_rqa_fields)

    # Compute the number of survey messages that were the empty string
    log.debug("Counting the number of empty string messages for each survey field...")
    raw_survey_fields = []
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.raw_field not in raw_survey_fields:
            raw_survey_fields.append(plan.raw_field)
    survey_data = dict()
    for td in data:
        survey_data[td["uid"]] = td
    cls.log_empty_string_stats(survey_data.values(), raw_survey_fields)

    # Output messages which aren't noise to Coda
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, not_noise, plan.raw_field, plan.id_field)

        output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                not_noise, plan.raw_field, cls.SENT_ON_KEY, plan.id_field, {}, f)

    # Output messages for ICR
    IOUtils.ensure_dirs_exist(icr_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        rqa_messages = []
        for td in not_noise:
            if plan.raw_field in td:
                rqa_messages.append(td)

        icr_messages = ICRTools.generate_sample_for_icr(
            rqa_messages, cls.ICR_MESSAGES_COUNT, random.Random(cls.ICR_SEED))

        icr_output_path = path.join(icr_output_dir, plan.icr_filename)
        with open(icr_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                icr_messages, f, headers=[plan.run_id_field, plan.raw_field])

    return data
def fetch_from_facebook(user, google_cloud_credentials_file_path, raw_data_dir, facebook_uuid_table,
                        facebook_source):
    log.info("Fetching data from Facebook...")
    log.info("Downloading Facebook access token...")
    facebook_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, facebook_source.token_file_url).strip()
    facebook = FacebookClient(facebook_token)

    for dataset in facebook_source.datasets:
        log.info(f"Exporting comments for dataset {dataset.name}...")
        raw_comments_output_path = f"{raw_data_dir}/{dataset.name}_raw.json"
        traced_comments_output_path = f"{raw_data_dir}/{dataset.name}.jsonl"

        # Download all the comments on all the posts in this dataset, logging the raw data returned by Facebook.
        raw_comments = []
        for post_id in dataset.post_ids:
            comments_log_path = f"{raw_data_dir}/{post_id}_comments_log.jsonl"
            with open(comments_log_path, "a") as raw_comments_log_file:
                post_comments = facebook.get_all_comments_on_post(
                    post_id, raw_export_log_file=raw_comments_log_file,
                    fields=["from{id}", "parent", "attachments", "created_time", "message"])

            # Download the post and add it as context to all the comments. Adding a reference to the post under
            # which a comment was made enables downstream features such as post-type labelling and comment context
            # in Coda, as well as allowing us to track how many comments were made on each post.
            post = facebook.get_post(post_id, fields=["attachments"])
            for comment in post_comments:
                comment["post"] = post

            raw_comments.extend(post_comments)

        # Facebook only returns a parent if the comment is a reply to another comment.
        # If there is no parent, set one to the empty-dict.
        for comment in raw_comments:
            if "parent" not in comment:
                comment["parent"] = {}

        # Convert the comments to TracedData.
        traced_comments = facebook.convert_facebook_comments_to_traced_data(
            user, dataset.name, raw_comments, facebook_uuid_table)

        # Export to disk.
        log.info(f"Saving {len(raw_comments)} raw comments to {raw_comments_output_path}...")
        IOUtils.ensure_dirs_exist_for_file(raw_comments_output_path)
        with open(raw_comments_output_path, "w") as raw_comments_output_file:
            json.dump(raw_comments, raw_comments_output_file)
        log.info(f"Saved {len(raw_comments)} raw comments")

        log.info(f"Saving {len(traced_comments)} traced comments to {traced_comments_output_path}...")
        IOUtils.ensure_dirs_exist_for_file(traced_comments_output_path)
        with open(traced_comments_output_path, "w") as traced_comments_output_file:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(traced_comments, traced_comments_output_file)
        log.info(f"Saved {len(traced_comments)} traced comments")
data = ProductionFile.generate(data, production_csv_output_path)

if pipeline_run_mode == "all-stages":
    log.info("Running post labelling pipeline stages...")
    log.info("Applying Manual Codes from Coda...")
    data = ApplyManualCodes.apply_manual_codes(user, data, prev_coded_dir_path)

    log.info("Generating Analysis CSVs...")
    messages_data, individuals_data = AnalysisFile.generate(
        user, data, csv_by_message_output_path, csv_by_individual_output_path)

    log.info("Writing messages TracedData to file...")
    IOUtils.ensure_dirs_exist_for_file(messages_json_output_path)
    with open(messages_json_output_path, "w") as f:
        TracedDataJsonIO.export_traced_data_iterable_to_jsonl(messages_data, f)

    log.info("Writing individuals TracedData to file...")
    IOUtils.ensure_dirs_exist_for_file(individuals_json_output_path)
    with open(individuals_json_output_path, "w") as f:
        TracedDataJsonIO.export_traced_data_iterable_to_jsonl(individuals_data, f)
else:
    assert pipeline_run_mode == "auto-code-only", "pipeline run mode must be either auto-code-only or all-stages"
    log.info("Writing Auto-Coding TracedData to file...")
    IOUtils.ensure_dirs_exist_for_file(auto_coding_json_output_path)
    with open(auto_coding_json_output_path, "w") as f:
        TracedDataJsonIO.export_traced_data_iterable_to_jsonl(data, f)
help="Path to a JSONL file to read the TracedData of the messages data from") parser.add_argument("individuals_json_input_path", metavar="individuals-json-input-path", help="Path to a JSONL file to read the TracedData of the messages data from") parser.add_argument("automated_analysis_output_dir", metavar="automated-analysis-output-dir", help="Directory to write the automated analysis outputs to") args = parser.parse_args() user = args.user pipeline_configuration_file_path = args.pipeline_configuration_file_path messages_json_input_path = args.messages_json_input_path individuals_json_input_path = args.individuals_json_input_path automated_analysis_output_dir = args.automated_analysis_output_dir IOUtils.ensure_dirs_exist(automated_analysis_output_dir) IOUtils.ensure_dirs_exist(f"{automated_analysis_output_dir}/maps/counties") IOUtils.ensure_dirs_exist(f"{automated_analysis_output_dir}/maps/constituencies") IOUtils.ensure_dirs_exist(f"{automated_analysis_output_dir}/graphs") log.info("Loading Pipeline Configuration File...") with open(pipeline_configuration_file_path) as f: pipeline_configuration = PipelineConfiguration.from_configuration_file(f) Logger.set_project_name(pipeline_configuration.pipeline_name) log.debug(f"Pipeline name is {pipeline_configuration.pipeline_name}") sys.setrecursionlimit(30000) # Read the messages dataset log.info(f"Loading the messages dataset from {messages_json_input_path}...") with open(messages_json_input_path) as f: messages = TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)
            raw_runs = rapid_pro.update_raw_runs_with_latest_modified(
                flow_id, raw_runs, raw_export_log_file=raw_runs_log_file)
        except FileNotFoundError:
            log.info(f"File '{raw_runs_path}' not found, will fetch all runs from the Rapid Pro server "
                     f"for flow '{flow}'")
            raw_runs = rapid_pro.get_raw_runs_for_flow_id(flow_id, raw_export_log_file=raw_runs_log_file)

    # Fetch the latest contacts from Rapid Pro.
    with open(contacts_log_path, "a") as raw_contacts_log_file:
        raw_contacts = rapid_pro.update_raw_contacts_with_latest_modified(
            raw_contacts, raw_export_log_file=raw_contacts_log_file)

    # Convert the runs to TracedData.
    traced_runs = rapid_pro.convert_runs_to_traced_data(
        user, raw_runs, raw_contacts, phone_number_uuid_table, pipeline_configuration.rapid_pro_test_contact_uuids)

    log.info(f"Saving {len(raw_runs)} raw runs to {raw_runs_path}...")
    with open(raw_runs_path, "w") as raw_runs_file:
        json.dump([run.serialize() for run in raw_runs], raw_runs_file)
    log.info(f"Saved {len(raw_runs)} raw runs")

    log.info(f"Saving {len(traced_runs)} traced runs to {traced_runs_output_path}...")
    IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
    with open(traced_runs_output_path, "w") as traced_runs_output_file:
        TracedDataJsonIO.export_traced_data_iterable_to_json(
            traced_runs, traced_runs_output_file, pretty_print=True)
    log.info(f"Saved {len(traced_runs)} traced runs")

log.info(f"Saving {len(raw_contacts)} raw contacts to file '{raw_contacts_path}'...")
with open(raw_contacts_path, "w") as raw_contacts_file:
    json.dump([contact.serialize() for contact in raw_contacts], raw_contacts_file)
log.info(f"Saved {len(raw_contacts)} contacts")
)
parser.add_argument("output_dir", metavar="output-dir",
                    help="Directory to write the analysis outputs to")

args = parser.parse_args()

user = args.user
google_cloud_credentials_file_path = args.google_cloud_credentials_file_path
pipeline_configuration_file_path = args.pipeline_configuration_file_path
messages_json_input_path = args.messages_json_input_path
individuals_json_input_path = args.individuals_json_input_path
output_dir = args.output_dir

IOUtils.ensure_dirs_exist(output_dir)
IOUtils.ensure_dirs_exist(f"{output_dir}/maps/regions")
IOUtils.ensure_dirs_exist(f"{output_dir}/maps/districts")
IOUtils.ensure_dirs_exist(f"{output_dir}/graphs")

log.info("Loading Pipeline Configuration File...")
with open(pipeline_configuration_file_path) as f:
    pipeline_configuration = PipelineConfiguration.from_configuration_file(f)
Logger.set_project_name(pipeline_configuration.pipeline_name)
log.debug(f"Pipeline name is {pipeline_configuration.pipeline_name}")

if pipeline_configuration.drive_upload is not None:
    log.info(f"Downloading Google Drive service account credentials...")
    credentials_info = json.loads(
        google_cloud_utils.download_blob_to_string(
)
parser.add_argument("automated_analysis_output_dir", metavar="automated-analysis-output-dir",
                    help="Directory to write the automated analysis outputs to")

args = parser.parse_args()

user = args.user
pipeline_configuration_file_path = args.pipeline_configuration_file_path
messages_json_input_path = args.messages_json_input_path
individuals_json_input_path = args.individuals_json_input_path
automated_analysis_output_dir = args.automated_analysis_output_dir

IOUtils.ensure_dirs_exist(automated_analysis_output_dir)
IOUtils.ensure_dirs_exist(f"{automated_analysis_output_dir}/graphs")

log.info("Loading Pipeline Configuration File...")
with open(pipeline_configuration_file_path) as f:
    pipeline_configuration = PipelineConfiguration.from_configuration_file(f)
Logger.set_project_name(pipeline_configuration.pipeline_name)
log.debug(f"Pipeline name is {pipeline_configuration.pipeline_name}")

sys.setrecursionlimit(30000)

# Read the messages dataset
log.info(f"Loading the messages dataset from {messages_json_input_path}...")
with open(messages_json_input_path) as f:
    messages = TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)
)
parser.add_argument("automated_analysis_output_dir", metavar="automated-analysis-output-dir",
                    help="Directory to write the automated analysis outputs to")

args = parser.parse_args()

user = args.user
pipeline_configuration_file_path = args.pipeline_configuration_file_path
messages_json_input_path = args.messages_json_input_path
individuals_json_input_path = args.individuals_json_input_path
automated_analysis_output_dir = args.automated_analysis_output_dir

IOUtils.ensure_dirs_exist(automated_analysis_output_dir)
IOUtils.ensure_dirs_exist(f"{automated_analysis_output_dir}/maps/regions")
IOUtils.ensure_dirs_exist(f"{automated_analysis_output_dir}/maps/districts")
IOUtils.ensure_dirs_exist(f"{automated_analysis_output_dir}/maps/mogadishu")
IOUtils.ensure_dirs_exist(f"{automated_analysis_output_dir}/graphs")

log.info("Loading Pipeline Configuration File...")
with open(pipeline_configuration_file_path) as f:
    pipeline_configuration = PipelineConfiguration.from_configuration_file(f)
Logger.set_project_name(pipeline_configuration.pipeline_name)
log.debug(f"Pipeline name is {pipeline_configuration.pipeline_name}")

sys.setrecursionlimit(30000)
def auto_code_show_messages(cls, user, data, icr_output_dir, coda_output_dir):
    # Filter out test messages sent by AVF.
    if not PipelineConfiguration.DEV_MODE:
        data = MessageFilters.filter_test_messages(data)

    # Filter for runs which don't contain a response to any week's question
    data = MessageFilters.filter_empty_messages(data, cls.RQA_KEYS)

    # Filter out runs sent outwith the project start and end dates
    data = MessageFilters.filter_time_range(data, cls.SENT_ON_KEY, cls.PROJECT_START_DATE, cls.PROJECT_END_DATE)

    # Tag messages which are noise as being noise
    for td in data:
        is_noise = True
        for rqa_key in cls.RQA_KEYS:
            if rqa_key in td and not somali.DemographicCleaner.is_noise(td[rqa_key], min_length=10):
                is_noise = False
        td.append_data({cls.NOISE_KEY: is_noise}, Metadata(user, Metadata.get_call_location(), time.time()))

    # Label missing data
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            if plan.raw_field not in td:
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                    Metadata.get_call_location())
                missing_dict[plan.coded_field] = [na_label.to_dict()]

                if plan.binary_code_scheme is not None:
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.binary_code_scheme,
                        plan.binary_code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                        Metadata.get_call_location())
                    missing_dict[plan.binary_coded_field] = na_label.to_dict()
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Label each message with channel keys
    Channels.set_channel_keys(user, data, cls.SENT_ON_KEY)

    # Filter for messages which aren't noise (in order to export to Coda and export for ICR)
    not_noise = MessageFilters.filter_noise(data, cls.NOISE_KEY, lambda x: x)

    # Output messages which aren't noise to Coda
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, not_noise, plan.raw_field, plan.id_field)

        output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                not_noise, plan.raw_field, cls.SENT_ON_KEY, plan.id_field, {}, f)

    # Output messages for ICR
    IOUtils.ensure_dirs_exist(icr_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        rqa_messages = []
        for td in not_noise:
            # This test works because the only codes which have been applied at this point are TRUE_MISSING.
            # If any other coding is done above, this test will need to change.
            if plan.coded_field not in td:
                rqa_messages.append(td)
            else:
                assert len(td[plan.coded_field]) == 1
                assert td[plan.coded_field][0]["CodeID"] == \
                    plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id

        icr_messages = ICRTools.generate_sample_for_icr(
            rqa_messages, cls.ICR_MESSAGES_COUNT, random.Random(cls.ICR_SEED))

        icr_output_path = path.join(icr_output_dir, plan.icr_filename)
        with open(icr_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                icr_messages, f, headers=[plan.run_id_field, plan.raw_field])

    return data
def fetch_from_rapid_pro(user, google_cloud_credentials_file_path, raw_data_dir, phone_number_uuid_table,
                         rapid_pro_source):
    log.info("Fetching data from Rapid Pro...")
    log.info("Downloading Rapid Pro access token...")
    rapid_pro_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, rapid_pro_source.token_file_url).strip()
    rapid_pro = RapidProClient(rapid_pro_source.domain, rapid_pro_token)

    # Load the previous export of contacts if it exists, otherwise fetch all contacts from Rapid Pro.
    raw_contacts_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_raw.json"
    contacts_log_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_log.jsonl"
    try:
        log.info(f"Loading raw contacts from file '{raw_contacts_path}'...")
        with open(raw_contacts_path) as raw_contacts_file:
            raw_contacts = [Contact.deserialize(contact_json) for contact_json in json.load(raw_contacts_file)]
        log.info(f"Loaded {len(raw_contacts)} contacts")
    except FileNotFoundError:
        log.info(f"File '{raw_contacts_path}' not found, will fetch all contacts from the Rapid Pro server")
        with open(contacts_log_path, "a") as contacts_log_file:
            raw_contacts = rapid_pro.get_raw_contacts(raw_export_log_file=contacts_log_file)

    # Download all the runs for each of the radio shows
    for flow in rapid_pro_source.activation_flow_names + rapid_pro_source.survey_flow_names:
        runs_log_path = f"{raw_data_dir}/{flow}_log.jsonl"
        raw_runs_path = f"{raw_data_dir}/{flow}_raw.json"
        traced_runs_output_path = f"{raw_data_dir}/{flow}.jsonl"
        log.info(f"Exporting flow '{flow}' to '{traced_runs_output_path}'...")

        flow_id = rapid_pro.get_flow_id(flow)

        # Load the previous export of runs for this flow, and update them with the newest runs.
        # If there is no previous export for this flow, fetch all the runs from Rapid Pro.
        with open(runs_log_path, "a") as raw_runs_log_file:
            try:
                log.info(f"Loading raw runs from file '{raw_runs_path}'...")
                with open(raw_runs_path) as raw_runs_file:
                    raw_runs = [Run.deserialize(run_json) for run_json in json.load(raw_runs_file)]
                log.info(f"Loaded {len(raw_runs)} runs")
                raw_runs = rapid_pro.update_raw_runs_with_latest_modified(
                    flow_id, raw_runs, raw_export_log_file=raw_runs_log_file, ignore_archives=True)
            except FileNotFoundError:
                log.info(f"File '{raw_runs_path}' not found, will fetch all runs from the Rapid Pro server "
                         f"for flow '{flow}'")
                raw_runs = rapid_pro.get_raw_runs_for_flow_id(flow_id, raw_export_log_file=raw_runs_log_file)

        # Fetch the latest contacts from Rapid Pro.
        with open(contacts_log_path, "a") as raw_contacts_log_file:
            raw_contacts = rapid_pro.update_raw_contacts_with_latest_modified(
                raw_contacts, raw_export_log_file=raw_contacts_log_file)

        # Convert the runs to TracedData.
        traced_runs = rapid_pro.convert_runs_to_traced_data(
            user, raw_runs, raw_contacts, phone_number_uuid_table, rapid_pro_source.test_contact_uuids)

        log.info(f"Saving {len(raw_runs)} raw runs to {raw_runs_path}...")
        with open(raw_runs_path, "w") as raw_runs_file:
            json.dump([run.serialize() for run in raw_runs], raw_runs_file)
        log.info(f"Saved {len(raw_runs)} raw runs")

        log.info(f"Saving {len(traced_runs)} traced runs to {traced_runs_output_path}...")
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
        with open(traced_runs_output_path, "w") as traced_runs_output_file:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(traced_runs, traced_runs_output_file)
        log.info(f"Saved {len(traced_runs)} traced runs")

    log.info(f"Saving {len(raw_contacts)} raw contacts to file '{raw_contacts_path}'...")
    with open(raw_contacts_path, "w") as raw_contacts_file:
        json.dump([contact.serialize() for contact in raw_contacts], raw_contacts_file)
    log.info(f"Saved {len(raw_contacts)} contacts")
parser.add_argument("individuals_json_input_path", metavar="individuals-json-input-path", help="Path to a JSONL file to read the TracedData of the messages data from") parser.add_argument("output_dir", metavar="output-dir", help="Directory to write the analysis outputs to") args = parser.parse_args() user = args.user google_cloud_credentials_file_path = args.google_cloud_credentials_file_path pipeline_configuration_file_path = args.pipeline_configuration_file_path messages_json_input_path = args.messages_json_input_path individuals_json_input_path = args.individuals_json_input_path output_dir = args.output_dir IOUtils.ensure_dirs_exist(output_dir) IOUtils.ensure_dirs_exist(f"{output_dir}/maps") IOUtils.ensure_dirs_exist(f"{output_dir}/graphs") log.info("Loading Pipeline Configuration File...") with open(pipeline_configuration_file_path) as f: pipeline_configuration = PipelineConfiguration.from_configuration_file(f) Logger.set_project_name(pipeline_configuration.pipeline_name) log.debug(f"Pipeline name is {pipeline_configuration.pipeline_name}") if pipeline_configuration.drive_upload is not None: log.info(f"Downloading Google Drive service account credentials...") credentials_info = json.loads(google_cloud_utils.download_blob_to_string( google_cloud_credentials_file_path, pipeline_configuration.drive_upload.drive_credentials_file_url)) drive_client_wrapper.init_client_from_info(credentials_info)