def fetch_from_recovery_csv(user, google_cloud_credentials_file_path, raw_data_dir, phone_number_uuid_table,
                            recovery_csv_source):
    log.info("Fetching data from a Recovery CSV...")
    for blob_url in recovery_csv_source.activation_flow_urls + recovery_csv_source.survey_flow_urls:
        flow_name = blob_url.split('/')[-1].split('.')[0]  # Takes the name between the last '/' and the '.csv' ending
        traced_runs_output_path = f"{raw_data_dir}/{flow_name}.jsonl"
        if os.path.exists(traced_runs_output_path):
            log.info(f"File '{traced_runs_output_path}' for blob '{blob_url}' already exists; skipping download")
            continue

        log.info(f"Downloading recovered data from '{blob_url}'...")
        raw_csv_string = StringIO(google_cloud_utils.download_blob_to_string(
            google_cloud_credentials_file_path, blob_url))
        raw_data = list(csv.DictReader(raw_csv_string))
        log.info(f"Downloaded {len(raw_data)} recovered messages")

        log.info("Converting the recovered messages to TracedData...")
        traced_runs = []
        for i, row in enumerate(raw_data):
            raw_date = row["ReceivedOn"]
            if len(raw_date) == len("dd/mm/YYYY HH:MM"):
                parsed_raw_date = datetime.strptime(raw_date, "%d/%m/%Y %H:%M")
            else:
                parsed_raw_date = datetime.strptime(raw_date, "%d/%m/%Y %H:%M:%S")
            localized_date = pytz.timezone("Africa/Mogadishu").localize(parsed_raw_date)

            assert row["Sender"].startswith("avf-phone-uuid-"), \
                f"The 'Sender' column for '{blob_url}' contains an item that has not been de-identified " \
                f"into Africa's Voices Foundation's de-identification format. This may be done with de_identify_csv.py."

            d = {
                "avf_phone_id": row["Sender"],
                "message": row["Message"],
                "received_on": localized_date.isoformat(),
                "run_id": SHAUtils.sha_dict(row)
            }

            traced_runs.append(
                TracedData(d, Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())))
        log.info("Converted the recovered messages to TracedData")

        log.info(f"Exporting {len(traced_runs)} TracedData items to {traced_runs_output_path}...")
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
        with open(traced_runs_output_path, "w") as f:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(traced_runs, f)
        log.info(f"Exported TracedData")
def test_round_trip(self):
    expected = self.generate_test_data()

    temp_file = tempfile.NamedTemporaryFile()
    with open(temp_file.name, "w") as f:
        TracedDataJsonIO.export_traced_data_iterable_to_jsonl(expected, f)

    with open(temp_file.name, "r") as f:
        imported = list(TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f))

    self.assertEqual(len(expected), len(imported))
    for x, y in zip(expected, imported):
        x_attributes = {k: getattr(x, k) for k in dir(x)
                        if not k.startswith("__") and not callable(getattr(x, k)) and k != "_cache"}
        y_attributes = {k: getattr(y, k) for k in dir(y)
                        if not k.startswith("__") and not callable(getattr(y, k)) and k != "_cache"}
        self.assertDictEqual(x_attributes, y_attributes)
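# A minimal sketch of the JSONL round trip exercised by the test above, assuming the names used
# elsewhere in these scripts (TracedData, Metadata, TimeUtils, TracedDataJsonIO) are importable.
# The output path "round_trip_example.jsonl" and the example field values are hypothetical.
data = [
    TracedData({"avf_phone_id": "avf-phone-uuid-0", "message": "hello"},
               Metadata("test_user", Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))
]
with open("round_trip_example.jsonl", "w") as f:
    TracedDataJsonIO.export_traced_data_iterable_to_jsonl(data, f)
with open("round_trip_example.jsonl", "r") as f:
    reimported = list(TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f))
assert len(reimported) == len(data)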
def test_import_json_to_traced_data_iterable(self):
    file_path = "tests/traced_data/resources/json_export_expected.json"
    expected = self.generate_test_data()

    with open(file_path, "r") as f:
        imported = list(TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f))

    self.assertListEqual(expected, imported)
def test_export_traced_data_iterable_to_jsonl(self):
    file_path = path.join(self.test_dir, "json_test.json")

    # Test exporting wrong data type
    data = self.generate_test_data()
    with open(file_path, "w") as f:
        try:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(data[0], f)
            self.fail("Exporting the wrong data type did not raise an assertion error")
        except AssertionError as e:
            self.assertEqual(str(e), _td_type_error_string)

    # Test normal export
    data = self.generate_test_data()
    with open(file_path, "w") as f:
        TracedDataJsonIO.export_traced_data_iterable_to_jsonl(data, f)
    self.assertTrue(filecmp.cmp(file_path, "tests/traced_data/resources/json_export_expected.json"))
def load_show(show_name):
    show_path = path.join(messages_input_path, "{}.json".format(show_name))
    if not path.exists(show_path):
        print("Warning: No show found with file name '{}.json'".format(show_name))
        return []
    with open(show_path, "r") as f:
        return list(TracedDataJsonIO.import_json_to_traced_data_iterable(f))
def load_datasets(flow_names):
    datasets = []
    for i, flow_name in enumerate(flow_names):
        raw_flow_path = f"{raw_data_dir}/{flow_name}.jsonl"
        log.info(f"Loading {i + 1}/{len(flow_names)}: {raw_flow_path}...")
        with open(raw_flow_path, "r") as f:
            runs = TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)
        log.info(f"Loaded {len(runs)} runs")
        datasets.append(runs)
    return datasets
def load_survey_dict(file_path):
    """
    Loads a survey from a TracedData JSON file into a dict indexed by avf_phone_id

    :param file_path: Path to survey file to load
    :type file_path: str
    :return: Dictionary mapping contact id ('avf_phone_id') to the survey TracedData for that contact.
    :rtype: dict of str -> TracedData
    """
    with open(file_path, "r") as f:
        return {td["avf_phone_id"]: td for td in TracedDataJsonIO.import_json_to_traced_data_iterable(f)}
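# A minimal usage sketch for load_survey_dict, assuming a hypothetical survey file path ("survey.json")
# and that `messages`, `user`, `Metadata`, and `time` are already in scope, as in the other scripts here.
# The join via td.append_data(...) mirrors the append_data/Metadata pattern used throughout; the
# "Gender (Text) - wt_demog_1" key is only illustrative.
surveys_by_contact = load_survey_dict("survey.json")
for td in messages:
    survey_td = surveys_by_contact.get(td["avf_phone_id"])
    if survey_td is not None and "Gender (Text) - wt_demog_1" in survey_td:
        td.append_data(
            {"Gender (Text) - wt_demog_1": survey_td["Gender (Text) - wt_demog_1"]},
            Metadata(user, Metadata.get_call_location(), time.time())
        )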
def test_flush_history_from_traced_data_iterable(self):
    history_file_path = path.join(self.test_dir, "flush_test_history.jsonl")

    data = self.generate_test_data()
    data_0_sha = data[0].get_sha()
    self.assertEqual(len(data[1].get_history("Gender")), 2)

    TracedDataJsonIO.flush_history_from_traced_data_iterable("test_user", data, history_file_path)

    self.assertTrue(filecmp.cmp(history_file_path, "tests/traced_data/resources/flush_history_expected_history.jsonl"))
    self.assertEqual(len(data[1].get_history("Gender")), 1)
    self.assertEqual(data[0]["_PrevTracedDataSHA"], data_0_sha)

    # Test the remaining data can be round-tripped
    latest_file_path = path.join(self.test_dir, "flush_test_latest.jsonl")
    with open(latest_file_path, "w") as f:
        TracedDataJsonIO.export_traced_data_iterable_to_jsonl(data, f)
    with open(latest_file_path, "r") as f:
        TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)
parser.add_argument("demogs_traced_json_output_path", help="Path to updted survey TraceData JSON") args = parser.parse_args() user = args.user messages_input_path = args.messages_input_path surveys_input_path = args.surveys_input_path demogs_input_path = args.demogs_input_path messages_traced_json_output_path = args.messages_traced_json_output_path surveys_traced_json_output_path = args.surveys_traced_json_output_path demogs_traced_json_output_path = args.demogs_traced_json_output_path # Load messages for filename in os.listdir(messages_input_path): with open(os.path.join(messages_input_path, filename)) as f: messages = TracedDataJsonIO.import_json_to_traced_data_iterable(f) # Create a set of all the keys appearing in the data keys = {key for message in messages for key in message.keys()} keys = list(keys) keys.remove("avf_phone_id") # Add group name to each key group_name = filename.split("_with_id.json")[0] create_unique_keys(messages, keys, group_name) # Output updated td-s message_output_path = os.path.join( messages_traced_json_output_path, "{}_updated_keys.json".format(group_name)) with open(message_output_path, "w") as f:
user = args.user
phone_uuid_path = args.phone_uuid_table_path
demog_dataset_path = args.demog_dataset_path
json_output_path = args.json_output_path

with open(phone_uuid_path, "r") as f:
    phone_uuids = PhoneNumberUuidTable.load(f)

with open(demog_dataset_path, "r") as f:
    traced_demog = TracedDataCSVIO.import_csv_to_traced_data_iterable(user, f)
    traced_demog = list(traced_demog)

for td in traced_demog:
    uuid_dict = {"avf_phone_id": phone_uuids.add_phone(td["final_phone"])}
    td.append_data(uuid_dict, Metadata(user, Metadata.get_call_location(), time.time()))

# Write the UUIDs out to a file
with open(phone_uuid_path, "w") as f:
    phone_uuids.dump(f)

# Output TracedData to JSON.
IOUtils.ensure_dirs_exist(json_output_path)
with open(json_output_path, "w") as f:
    TracedDataJsonIO.export_traced_data_iterable_to_json(traced_demog, f, pretty_print=True)
metavar="json-output-path", help="Path to a JSON file to write processed messages to") args = parser.parse_args() user = args.user messages_input_path = args.messages_input_path survey_input_path = args.survey_input_path demog_input_path = args.demog_input_path json_output_path = args.json_output_path # Load messages messages_datasets = [] for filename in os.listdir(messages_input_path): with open(os.path.join(messages_input_path, filename)) as f: messages_datasets.append( TracedDataJsonIO.import_json_to_traced_data_iterable(f)) # Load followup surveys survey_datasets = [] for filename in os.listdir(survey_input_path): with open(os.path.join(survey_input_path, filename)) as f: survey_datasets.append( TracedDataJsonIO.import_json_to_traced_data_iterable(f)) # Load demogs print("Loading Demographics...") with open(demog_input_path, "r") as f: demographics = TracedDataJsonIO.import_json_to_traced_data_iterable(f) # Add survey data to the messages print("Combining Datasets...")
coda_output_path = args.coda_output_path
icr_output_path = args.icr_output_path
csv_output_path = args.csv_output_path

ICR_MESSAGES_COUNT = 200  # Number of messages to export in the ICR file

# Convert date/time of messages to EAT
utc_key = "{} (Time) - {}".format(variable_name, flow_name)
eat_key = "{} (Time EAT) - {}".format(variable_name, flow_name)
inside_time_window = []

START_TIME = isoparse("2018-10-18T00+03:00")
END_TIME = isoparse("2018-10-27T00+03:00")

# Load data from JSON file
with open(json_input_path, "r") as f:
    show_messages = TracedDataJsonIO.import_json_to_traced_data_iterable(f)

# Filter out test messages sent by AVF.
show_messages = [td for td in show_messages if not td.get("test_run", False)]

# Filter for runs which contain a response to this week's question.
show_message_key = "{} (Text) - {}".format(variable_name, flow_name)
show_messages = [td for td in show_messages if show_message_key in td]

# Filter out messages sent outside the project run period
for td in show_messages:
    utc_time = isoparse(td[utc_key])
    eat_time = utc_time.astimezone(pytz.timezone("Africa/Nairobi")).isoformat()
fixup_table = json.load(open(fixup_table_path, 'r'))

# print ("Loading code schemes")
code_scheme_paths_list = [
    os.path.join(code_schemes_in_folder, f) for f in os.listdir(code_schemes_in_folder)
    if os.path.isfile(os.path.join(code_schemes_in_folder, f))
]
code_schemes = {}
for code_scheme_path in code_scheme_paths_list:
    scheme = json.load(open(code_scheme_path, 'r'))
    code_schemes[scheme["SchemeID"]] = scheme

# print ("Loading demog data")
demog_td = TracedDataJsonIO.import_json_to_traced_data_iterable(
    open(os.path.join(demogs_in_folder, "Demog_survey_with_id.json"), 'r'))

# print ("Remapping demogs")
for msg in demog_td:
    for demog_map in demog_maps:
        id = msg[demog_map["MessageId"]]
        dataset_in_principle = demog_map["Coda-Dataset"]
        remap(id, dataset_in_principle, fixup_table, code_schemes)

# print ("Remapping messages")
for message_map in message_maps:
    # print ("Loading message_map: {}".format(message_map["FileName"]))
    messages_td = TracedDataJsonIO.import_json_to_traced_data_iterable(
        open(os.path.join(messages_in_folder, message_map["FileName"]), 'r'))
"json_output", metavar="json-output", help= "Path to write results of merging to, as a serialised TracedData JSON file", nargs=1) args = parser.parse_args() user = args.user[0] input_path_messages = args.input_messages[0] group = args.group[0] input_path_adverts = args.input_adverts[0] json_output_path = args.json_output[0] # Load data from JSON file with open(input_path_messages, "r") as f: messages_data = TracedDataJsonIO.import_json_to_traced_data_iterable(f) # Load data from JSON file with open(input_path_adverts, "r") as f: adverts_data = TracedDataJsonIO.import_json_to_traced_data_iterable(f) # Map "QUESTION_R" => "Message" for td in adverts_data: assert "QUESTION_R" in td.keys() td.append_data( { "Message": td["QUESTION_R"], "Date": td["start_date"], "Group": group }, Metadata(user, Metadata.get_call_location(), time.time()))
drive_client_wrapper.init_client_from_info(credentials_info)

# Load phone number <-> UUID table
print("Loading Phone Number <-> UUID Table...")
with open(phone_number_uuid_table_path, "r") as f:
    phone_number_uuid_table = PhoneNumberUuidTable.load(f)

# Load messages
messages_datasets = []
for i, activation_flow_name in enumerate(pipeline_configuration.activation_flow_names):
    raw_activation_path = f"{raw_data_dir}/{activation_flow_name}.jsonl"
    log.info(f"Loading {raw_activation_path}...")
    with open(raw_activation_path, "r") as f:
        messages = TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)
        log.debug(f"Loaded {len(messages)} messages")
    messages_datasets.append(messages)

# Load surveys
survey_datasets = []
for i, survey_flow_name in enumerate(pipeline_configuration.survey_flow_names):
    raw_survey_path = f"{raw_data_dir}/{survey_flow_name}.jsonl"
    log.info(f"Loading {raw_survey_path}...")
    with open(raw_survey_path, "r") as f:
        messages = TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)
        log.debug(f"Loaded {len(messages)} messages")
    survey_datasets.append(messages)

# Add survey data to the messages
message_paths = [s02e01_input_path]

# Load the pipeline configuration file
print("Loading Pipeline Configuration File...")
with open(pipeline_configuration_file_path) as f:
    pipeline_configuration = PipelineConfiguration.from_configuration_file(f)

# Load phone number <-> UUID table
print("Loading Phone Number <-> UUID Table...")
with open(phone_number_uuid_table_path, "r") as f:
    phone_number_uuid_table = PhoneNumberUuidTable.load(f)

# Load demographics
print("Loading Demographics 1/1...")
with open(demog_input_path, "r") as f:
    demographics = TracedDataJsonIO.import_json_to_traced_data_iterable(f)
print(f"Loaded {len(demographics)} contacts")

# Load messages
messages_datasets = []
for i, path in enumerate(message_paths):
    print("Loading Episode {}/{}...".format(i + 1, len(message_paths)))
    with open(path, "r") as f:
        messages_datasets.append(TracedDataJsonIO.import_json_to_traced_data_iterable(f))

# Add survey data to the messages
print("Combining Datasets...")

import argparse
import os
import random
def fetch_from_rapid_pro(user, google_cloud_credentials_file_path, raw_data_dir, phone_number_uuid_table,
                         rapid_pro_source):
    log.info("Fetching data from Rapid Pro...")
    log.info("Downloading Rapid Pro access token...")
    rapid_pro_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, rapid_pro_source.token_file_url).strip()
    rapid_pro = RapidProClient(rapid_pro_source.domain, rapid_pro_token)

    # Load the previous export of contacts if it exists, otherwise fetch all contacts from Rapid Pro.
    raw_contacts_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_raw.json"
    contacts_log_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_log.jsonl"
    try:
        log.info(f"Loading raw contacts from file '{raw_contacts_path}'...")
        with open(raw_contacts_path) as raw_contacts_file:
            raw_contacts = [Contact.deserialize(contact_json) for contact_json in json.load(raw_contacts_file)]
        log.info(f"Loaded {len(raw_contacts)} contacts")
    except FileNotFoundError:
        log.info(f"File '{raw_contacts_path}' not found, will fetch all contacts from the Rapid Pro server")
        with open(contacts_log_path, "a") as contacts_log_file:
            raw_contacts = rapid_pro.get_raw_contacts(raw_export_log_file=contacts_log_file)

    # Download all the runs for each of the radio shows
    for flow in rapid_pro_source.activation_flow_names + rapid_pro_source.survey_flow_names:
        runs_log_path = f"{raw_data_dir}/{flow}_log.jsonl"
        raw_runs_path = f"{raw_data_dir}/{flow}_raw.json"
        traced_runs_output_path = f"{raw_data_dir}/{flow}.jsonl"
        log.info(f"Exporting flow '{flow}' to '{traced_runs_output_path}'...")

        flow_id = rapid_pro.get_flow_id(flow)

        # Load the previous export of runs for this flow, and update them with the newest runs.
        # If there is no previous export for this flow, fetch all the runs from Rapid Pro.
        with open(runs_log_path, "a") as raw_runs_log_file:
            try:
                log.info(f"Loading raw runs from file '{raw_runs_path}'...")
                with open(raw_runs_path) as raw_runs_file:
                    raw_runs = [Run.deserialize(run_json) for run_json in json.load(raw_runs_file)]
                log.info(f"Loaded {len(raw_runs)} runs")
                raw_runs = rapid_pro.update_raw_runs_with_latest_modified(
                    flow_id, raw_runs, raw_export_log_file=raw_runs_log_file, ignore_archives=True)
            except FileNotFoundError:
                log.info(f"File '{raw_runs_path}' not found, will fetch all runs from the Rapid Pro server for flow '{flow}'")
                raw_runs = rapid_pro.get_raw_runs_for_flow_id(flow_id, raw_export_log_file=raw_runs_log_file)

        # Fetch the latest contacts from Rapid Pro.
        with open(contacts_log_path, "a") as raw_contacts_log_file:
            raw_contacts = rapid_pro.update_raw_contacts_with_latest_modified(
                raw_contacts, raw_export_log_file=raw_contacts_log_file)

        # Convert the runs to TracedData.
        traced_runs = rapid_pro.convert_runs_to_traced_data(
            user, raw_runs, raw_contacts, phone_number_uuid_table, rapid_pro_source.test_contact_uuids)

        if flow in rapid_pro_source.activation_flow_names:
            # Append the Rapid Pro source name to each run.
            # Only do this for activation flows because this is the only place where this is interesting.
            # Also, demogs may come from either instance, which causes problems downstream.
            for td in traced_runs:
                td.append_data({
                    "source_raw": rapid_pro_source.source_name,
                    "source_coded": CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.SOURCE,
                        CodeSchemes.SOURCE.get_code_with_match_value(rapid_pro_source.source_name),
                        Metadata.get_call_location()
                    ).to_dict()
                }, Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

        log.info(f"Saving {len(raw_runs)} raw runs to {raw_runs_path}...")
        with open(raw_runs_path, "w") as raw_runs_file:
            json.dump([run.serialize() for run in raw_runs], raw_runs_file)
        log.info(f"Saved {len(raw_runs)} raw runs")

        log.info(f"Saving {len(traced_runs)} traced runs to {traced_runs_output_path}...")
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
        with open(traced_runs_output_path, "w") as traced_runs_output_file:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(traced_runs, traced_runs_output_file)
        log.info(f"Saved {len(traced_runs)} traced runs")

    log.info(f"Saving {len(raw_contacts)} raw contacts to file '{raw_contacts_path}'...")
    with open(raw_contacts_path, "w") as raw_contacts_file:
        json.dump([contact.serialize() for contact in raw_contacts], raw_contacts_file)
    log.info(f"Saved {len(raw_contacts)} contacts")
def fetch_from_facebook(user, google_cloud_credentials_file_path, raw_data_dir, facebook_uuid_table,
                        facebook_source):
    log.info("Fetching data from Facebook...")
    log.info("Downloading Facebook access token...")
    facebook_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, facebook_source.token_file_url).strip()
    facebook = FacebookClient(facebook_token)

    for dataset in facebook_source.datasets:
        log.info(f"Exporting comments for dataset {dataset.name}...")
        raw_comments_output_path = f"{raw_data_dir}/{dataset.name}_raw.json"
        traced_comments_output_path = f"{raw_data_dir}/{dataset.name}.jsonl"

        # Download all the comments on all the posts in this dataset, logging the raw data returned by Facebook.
        raw_comments = []
        for post_id in dataset.post_ids:
            comments_log_path = f"{raw_data_dir}/{post_id}_comments_log.jsonl"
            with open(comments_log_path, "a") as raw_comments_log_file:
                post_comments = facebook.get_all_comments_on_post(
                    post_id, raw_export_log_file=raw_comments_log_file,
                    fields=["from{id}", "parent", "attachments", "created_time", "message"])

            # Download the post and add it as context to all the comments. Adding a reference to the post under
            # which a comment was made enables downstream features such as post-type labelling and comment context
            # in Coda, as well as allowing us to track how many comments were made on each post.
            post = facebook.get_post(post_id, fields=["attachments"])
            for comment in post_comments:
                comment["post"] = post

            raw_comments.extend(post_comments)

        # Facebook only returns a parent if the comment is a reply to another comment.
        # If there is no parent, set one to the empty-dict.
        for comment in raw_comments:
            if "parent" not in comment:
                comment["parent"] = {}

        # Convert the comments to TracedData.
        traced_comments = facebook.convert_facebook_comments_to_traced_data(
            user, dataset.name, raw_comments, facebook_uuid_table)

        # Export to disk.
        log.info(f"Saving {len(raw_comments)} raw comments to {raw_comments_output_path}...")
        IOUtils.ensure_dirs_exist_for_file(raw_comments_output_path)
        with open(raw_comments_output_path, "w") as raw_comments_output_file:
            json.dump(raw_comments, raw_comments_output_file)
        log.info(f"Saved {len(raw_comments)} raw comments")

        log.info(f"Saving {len(traced_comments)} traced comments to {traced_comments_output_path}...")
        IOUtils.ensure_dirs_exist_for_file(traced_comments_output_path)
        with open(traced_comments_output_path, "w") as traced_comments_output_file:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(traced_comments, traced_comments_output_file)
        log.info(f"Saved {len(traced_comments)} traced comments")
json_output_path = args.json_output_path
fgd_csv_output_path = args.fgd_csv_output_path
cc_csv_output_path = args.cc_csv_output_path
prev_exports_path = args.prev_exports_path

MINIMUM_AGE = 18
TOTAL_CC_CONTACTS = 160
TOTAL_FGD_CONTACTS = 100

# Load phone uuid table
with open(phone_uuid_table_path, "r") as f:
    phone_uuids = PhoneNumberUuidTable.load(f)

# Load FGD/CC survey responses
with open(fgd_cc_input_path, "r") as f:
    fgd_cc_data = TracedDataJsonIO.import_json_to_traced_data_iterable(f)

# Load the previous export
prev_exports = []
if prev_exports_path is not None:
    with open(prev_exports_path, "r") as f:
        prev_exports = list(TracedDataCSVIO.import_csv_to_traced_data_iterable(user, f))

# Load coded demog surveys
with open(demog_surveys_input_path, "r") as f:
    surveys = TracedDataJsonIO.import_json_to_traced_data_iterable(f)

# Filter out people who haven't answered the fgd_cc consent question
fgd_cc_consent_key = "Response_1 (Category) - wt_fgd_cc"
fgd_cc_data = [td for td in fgd_cc_data if fgd_cc_consent_key in td]
if pipeline_configuration.drive_upload is not None:
    log.info(f"Downloading Google Drive service account credentials...")
    credentials_info = json.loads(
        google_cloud_utils.download_blob_to_string(
            google_cloud_credentials_file_path,
            pipeline_configuration.drive_upload.drive_credentials_file_url))
    drive_client_wrapper.init_client_from_info(credentials_info)

# Load messages
messages_datasets = []
for i, activation_flow_name in enumerate(pipeline_configuration.activation_flow_names):
    raw_activation_path = f"{raw_data_dir}/{activation_flow_name}.json"
    log.info(f"Loading {raw_activation_path}...")
    with open(raw_activation_path, "r") as f:
        messages = TracedDataJsonIO.import_json_to_traced_data_iterable(f)
        log.info(f"Loaded {len(messages)} messages")
    messages_datasets.append(messages)

log.info("Loading surveys datasets:")
surveys_datasets = []
for i, survey_flow_name in enumerate(pipeline_configuration.survey_flow_names):
    raw_survey_path = f"{raw_data_dir}/{survey_flow_name}.json"
    log.info(f"Loading {raw_survey_path}...")
    with open(raw_survey_path, "r") as f:
        contacts = TracedDataJsonIO.import_json_to_traced_data_iterable(f)
        log.info(f"Loaded {len(contacts)} contacts")
    surveys_datasets.append(contacts)

# Add survey data to the messages
if pipeline_run_mode == "all-stages": log.info("Running post labelling pipeline stages...") log.info("Applying Manual Codes from Coda...") data = ApplyManualCodes.apply_manual_codes(user, data, prev_coded_dir_path) log.info("Generating Analysis CSVs...") messages_data, individuals_data = AnalysisFile.generate( user, data, csv_by_message_output_path, csv_by_individual_output_path) log.info("Writing messages TracedData to file...") IOUtils.ensure_dirs_exist_for_file(messages_json_output_path) with open(messages_json_output_path, "w") as f: TracedDataJsonIO.export_traced_data_iterable_to_jsonl( messages_data, f) log.info("Writing individuals TracedData to file...") IOUtils.ensure_dirs_exist_for_file(individuals_json_output_path) with open(individuals_json_output_path, "w") as f: TracedDataJsonIO.export_traced_data_iterable_to_jsonl( individuals_data, f) else: assert pipeline_run_mode == "auto-code-only", "pipeline run mode must be either auto-code-only or all-stages" log.info("Writing Auto-Coding TracedData to file...") IOUtils.ensure_dirs_exist_for_file(auto_coding_json_output_path) with open(auto_coding_json_output_path, "w") as f: TracedDataJsonIO.export_traced_data_iterable_to_jsonl(data, f) log.info("Python script complete")
"Path to a coded survey JSON file, containing a list of serialized TracedData objects" ) parser.add_argument( "csv_output_path", metavar="csv-output-path", help="Path to a CSV file to write the summarised stats to") args = parser.parse_args() user = args.user messages_input_path = args.messages_input_path survey_input_path = args.survey_input_path csv_output_path = args.csv_output_path # Load surveys with open(survey_input_path, "r") as f: surveys = TracedDataJsonIO.import_json_to_traced_data_iterable(f) def load_show(show_name): show_path = path.join(messages_input_path, "{}.json".format(show_name)) if not path.exists(show_path): print("Warning: No show found with file name '{}.json'".format( show_name)) return [] with open(show_path, "r") as f: return list( TracedDataJsonIO.import_json_to_traced_data_iterable(f)) survey_keys = { "District (Text) - wt_demog_1": surveys, "Gender (Text) - wt_demog_1": surveys, "Urban_Rural (Text) - wt_demog_1": surveys,
# Convert times to ISO
for td in messages:
    td.append_data(
        {"Date": session.echo_mobile_date_to_iso(td["Date"])},
        Metadata(user, Metadata.get_call_location(), time.time())
    )

# Filter out messages sent outside the desired time range.
messages = list(filter(lambda td: echo_mobile_start_date <= isoparse(td["Date"]) < echo_mobile_end_date, messages))

# Add a unique id to each message
for td in messages:
    td.append_data(
        {"avf_message_id": message_uuids.add_message(
            EchoMobileSession.normalise_message(td, "avf_phone_id", "Date", "Message"))},
        Metadata(user, Metadata.get_call_location(), time.time())
    )

# Write the UUIDs out to a file
with open(phone_uuid_path, "w") as f:
    phone_uuids.dump(f)
with open(message_uuid_path, "w") as f:
    message_uuids.dump(f)

# Write the parsed messages to a json file
if os.path.dirname(json_output_path) != "" and not os.path.exists(os.path.dirname(json_output_path)):
    os.makedirs(os.path.dirname(json_output_path))
with open(json_output_path, "w") as f:
    TracedDataJsonIO.export_traced_data_iterable_to_json(messages, f, pretty_print=True)
with open(contacts_log_path, "a") as raw_contacts_log_file: raw_contacts = rapid_pro.update_raw_contacts_with_latest_modified( raw_contacts, raw_export_log_file=raw_contacts_log_file) # Convert the runs to TracedData. traced_runs = rapid_pro.convert_runs_to_traced_data( user, raw_runs, raw_contacts, phone_number_uuid_table, pipeline_configuration.rapid_pro_test_contact_uuids) log.info(f"Saving {len(raw_runs)} raw runs to {raw_runs_path}...") with open(raw_runs_path, "w") as raw_runs_file: json.dump([run.serialize() for run in raw_runs], raw_runs_file) log.info(f"Saved {len(raw_runs)} raw runs") log.info( f"Saving {len(traced_runs)} traced runs to {traced_runs_output_path}..." ) IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path) with open(traced_runs_output_path, "w") as traced_runs_output_file: TracedDataJsonIO.export_traced_data_iterable_to_jsonl( traced_runs, traced_runs_output_file) log.info(f"Saved {len(traced_runs)} traced runs") log.info( f"Saving {len(raw_contacts)} raw contacts to file '{raw_contacts_path}'..." ) with open(raw_contacts_path, "w") as raw_contacts_file: json.dump([contact.serialize() for contact in raw_contacts], raw_contacts_file) log.info(f"Saved {len(raw_contacts)} contacts")
    google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path,
        pipeline_configuration.phone_number_uuid_table.firebase_credentials_file_url))
phone_number_uuid_table = FirestoreUuidTable(
    pipeline_configuration.phone_number_uuid_table.table_name,
    firestore_uuid_table_credentials,
    "avf-phone-uuid-")
log.info("Initialised the Firestore UUID table")

uuids = set()
skipped_nr = 0
for path in traced_data_paths:
    # Load the traced data
    log.info(f"Loading previous traced data from file '{path}'...")
    with open(path) as f:
        data = TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)
    log.info(f"Loaded {len(data)} traced data objects")

    for td in data:
        if td["consent_withdrawn"] == Codes.TRUE:
            continue
        uuids.add(td["uid"])
log.info(f"Loaded {len(uuids)} uuids from TracedData (skipped {skipped_nr} items with an NR property)")

if exclusion_list_file_path is not None:
    # Load the exclusion list
    log.info(f"Loading the exclusion list from {exclusion_list_file_path}...")
"Path to input file to concatenate, containing a list of TracedData objects as JSON", nargs=1) parser.add_argument("json_output", metavar="json-output", help="Path to write results of cleaning to", nargs=1) args = parser.parse_args() user = args.user[0] json_input_path_1 = args.json_input_1[0] json_input_path_2 = args.json_input_2[0] json_output_path = args.json_output[0] # Load data from JSON file with open(json_input_path_1, "r") as f: input_data_1 = TracedDataJsonIO.import_json_to_traced_data_iterable(f) with open(json_input_path_2, "r") as f: input_data_2 = TracedDataJsonIO.import_json_to_traced_data_iterable(f) # Concatenate files output_data = list(input_data_1) output_data.extend(input_data_2) # Write json output if os.path.dirname(json_output_path) is not "" and not os.path.exists( os.path.dirname(json_output_path)): os.makedirs(os.path.dirname(json_output_path)) with open(json_output_path, "w") as f: TracedDataJsonIO.export_traced_data_iterable_to_json(output_data, f, pretty_print=True)
f"{automated_analysis_output_dir}/maps/mogadishu") IOUtils.ensure_dirs_exist(f"{automated_analysis_output_dir}/graphs") log.info("Loading Pipeline Configuration File...") with open(pipeline_configuration_file_path) as f: pipeline_configuration = PipelineConfiguration.from_configuration_file( f) Logger.set_project_name(pipeline_configuration.pipeline_name) log.debug(f"Pipeline name is {pipeline_configuration.pipeline_name}") sys.setrecursionlimit(30000) # Read the messages dataset log.info( f"Loading the messages dataset from {messages_json_input_path}...") with open(messages_json_input_path) as f: messages = TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f) for i in range(len(messages)): messages[i] = dict(messages[i].items()) log.info(f"Loaded {len(messages)} messages") # Read the individuals dataset log.info( f"Loading the individuals dataset from {individuals_json_input_path}..." ) with open(individuals_json_input_path) as f: individuals = TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f) for i in range(len(individuals)): individuals[i] = dict(individuals[i].items()) log.info(f"Loaded {len(individuals)} individuals") # Compute the number of messages, individuals, and relevant messages per episode and overall.
if pipeline_configuration.drive_upload is not None:
    log.info(f"Downloading Google Drive service account credentials...")
    credentials_info = json.loads(
        google_cloud_utils.download_blob_to_string(
            google_cloud_credentials_file_path,
            pipeline_configuration.drive_upload.drive_credentials_file_url))
    drive_client_wrapper.init_client_from_info(credentials_info)

# Load messages
messages_datasets = []
for i, activation_flow_name in enumerate(pipeline_configuration.activation_flow_names):
    raw_activation_path = f"{raw_data_dir}/{activation_flow_name}.jsonl"
    log.info(f"Loading {raw_activation_path}...")
    with open(raw_activation_path, "r") as f:
        messages = TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)
        log.info(f"Loaded {len(messages)} messages")
    messages_datasets.append(messages)

log.info("Loading surveys datasets:")
surveys_datasets = []
for i, survey_flow_name in enumerate(pipeline_configuration.survey_flow_names):
    raw_survey_path = f"{raw_data_dir}/{survey_flow_name}.jsonl"
    log.info(f"Loading {raw_survey_path}...")
    with open(raw_survey_path, "r") as f:
        contacts = TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)
        log.info(f"Loaded {len(contacts)} contacts")
    surveys_datasets.append(contacts)

# Add survey data to the messages
def fetch_from_rapid_pro(user, google_cloud_credentials_file_path, raw_data_dir, phone_number_uuid_table,
                         rapid_pro_source):
    log.info("Fetching data from Rapid Pro...")
    log.info("Downloading Rapid Pro access token...")
    rapid_pro_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, rapid_pro_source.token_file_url).strip()
    rapid_pro = RapidProClient(rapid_pro_source.domain, rapid_pro_token)

    # Load the previous export of contacts if it exists, otherwise fetch all contacts from Rapid Pro.
    raw_contacts_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_raw.json"
    contacts_log_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_log.jsonl"
    try:
        log.info(f"Loading raw contacts from file '{raw_contacts_path}'...")
        with open(raw_contacts_path) as raw_contacts_file:
            raw_contacts = [Contact.deserialize(contact_json) for contact_json in json.load(raw_contacts_file)]
        log.info(f"Loaded {len(raw_contacts)} contacts")
    except FileNotFoundError:
        log.info(f"File '{raw_contacts_path}' not found, will fetch all contacts from the Rapid Pro server")
        with open(contacts_log_path, "a") as contacts_log_file:
            raw_contacts = rapid_pro.get_raw_contacts(raw_export_log_file=contacts_log_file)

    # Download all the runs for each of the radio shows
    for flow in rapid_pro_source.activation_flow_names + rapid_pro_source.survey_flow_names:
        runs_log_path = f"{raw_data_dir}/{flow}_log.jsonl"
        raw_runs_path = f"{raw_data_dir}/{flow}_raw.json"
        traced_runs_output_path = f"{raw_data_dir}/{flow}.jsonl"
        log.info(f"Exporting flow '{flow}' to '{traced_runs_output_path}'...")

        flow_id = rapid_pro.get_flow_id(flow)

        # Load the previous export of runs for this flow, and update them with the newest runs.
        # If there is no previous export for this flow, fetch all the runs from Rapid Pro.
        with open(runs_log_path, "a") as raw_runs_log_file:
            try:
                log.info(f"Loading raw runs from file '{raw_runs_path}'...")
                with open(raw_runs_path) as raw_runs_file:
                    raw_runs = [Run.deserialize(run_json) for run_json in json.load(raw_runs_file)]
                log.info(f"Loaded {len(raw_runs)} runs")
                raw_runs = rapid_pro.update_raw_runs_with_latest_modified(
                    flow_id, raw_runs, raw_export_log_file=raw_runs_log_file, ignore_archives=True)
            except FileNotFoundError:
                log.info(f"File '{raw_runs_path}' not found, will fetch all runs from the Rapid Pro server for flow '{flow}'")
                raw_runs = rapid_pro.get_raw_runs_for_flow_id(flow_id, raw_export_log_file=raw_runs_log_file)

        # Fetch the latest contacts from Rapid Pro.
        with open(contacts_log_path, "a") as raw_contacts_log_file:
            raw_contacts = rapid_pro.update_raw_contacts_with_latest_modified(
                raw_contacts, raw_export_log_file=raw_contacts_log_file)

        # Convert the runs to TracedData.
        traced_runs = rapid_pro.convert_runs_to_traced_data(
            user, raw_runs, raw_contacts, phone_number_uuid_table, rapid_pro_source.test_contact_uuids)

        log.info(f"Saving {len(raw_runs)} raw runs to {raw_runs_path}...")
        with open(raw_runs_path, "w") as raw_runs_file:
            json.dump([run.serialize() for run in raw_runs], raw_runs_file)
        log.info(f"Saved {len(raw_runs)} raw runs")

        log.info(f"Saving {len(traced_runs)} traced runs to {traced_runs_output_path}...")
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
        with open(traced_runs_output_path, "w") as traced_runs_output_file:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(traced_runs, traced_runs_output_file)
        log.info(f"Saved {len(traced_runs)} traced runs")

    log.info(f"Saving {len(raw_contacts)} raw contacts to file '{raw_contacts_path}'...")
    with open(raw_contacts_path, "w") as raw_contacts_file:
        json.dump([contact.serialize() for contact in raw_contacts], raw_contacts_file)
    log.info(f"Saved {len(raw_contacts)} contacts")
            raw_runs = rapid_pro.update_raw_runs_with_latest_modified(
                flow_id, raw_runs, raw_export_log_file=raw_runs_log_file)
        except FileNotFoundError:
            log.info(f"File '{raw_runs_path}' not found, will fetch all runs from the Rapid Pro server for flow '{flow}'")
            raw_runs = rapid_pro.get_raw_runs_for_flow_id(flow_id, raw_export_log_file=raw_runs_log_file)

    # Fetch the latest contacts from Rapid Pro.
    with open(contacts_log_path, "a") as raw_contacts_log_file:
        raw_contacts = rapid_pro.update_raw_contacts_with_latest_modified(
            raw_contacts, raw_export_log_file=raw_contacts_log_file)

    # Convert the runs to TracedData.
    traced_runs = rapid_pro.convert_runs_to_traced_data(
        user, raw_runs, raw_contacts, phone_number_uuid_table,
        pipeline_configuration.rapid_pro_test_contact_uuids)

    log.info(f"Saving {len(raw_runs)} raw runs to {raw_runs_path}...")
    with open(raw_runs_path, "w") as raw_runs_file:
        json.dump([run.serialize() for run in raw_runs], raw_runs_file)
    log.info(f"Saved {len(raw_runs)} raw runs")

    log.info(f"Saving {len(traced_runs)} traced runs to {traced_runs_output_path}...")
    IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
    with open(traced_runs_output_path, "w") as traced_runs_output_file:
        TracedDataJsonIO.export_traced_data_iterable_to_json(traced_runs, traced_runs_output_file, pretty_print=True)
    log.info(f"Saved {len(traced_runs)} traced runs")

log.info(f"Saving {len(raw_contacts)} raw contacts to file '{raw_contacts_path}'...")
with open(raw_contacts_path, "w") as raw_contacts_file:
    json.dump([contact.serialize() for contact in raw_contacts], raw_contacts_file)
log.info(f"Saved {len(raw_contacts)} contacts")