def fetch_from_recovery_csv(user, google_cloud_credentials_file_path, raw_data_dir, phone_number_uuid_table,
                            recovery_csv_source):
    log.info("Fetching data from a Recovery CSV...")
    for blob_url in recovery_csv_source.activation_flow_urls + recovery_csv_source.survey_flow_urls:
        flow_name = blob_url.split('/')[-1].split('.')[0]  # Takes the name between the last '/' and the '.csv' ending
        traced_runs_output_path = f"{raw_data_dir}/{flow_name}.jsonl"
        if os.path.exists(traced_runs_output_path):
            log.info(f"File '{traced_runs_output_path}' for blob '{blob_url}' already exists; skipping download")
            continue

        log.info(f"Downloading recovered data from '{blob_url}'...")
        raw_csv_string = StringIO(google_cloud_utils.download_blob_to_string(
            google_cloud_credentials_file_path, blob_url))
        raw_data = list(csv.DictReader(raw_csv_string))
        log.info(f"Downloaded {len(raw_data)} recovered messages")

        log.info("Converting the recovered messages to TracedData...")
        traced_runs = []
        for i, row in enumerate(raw_data):
            raw_date = row["ReceivedOn"]
            if len(raw_date) == len("dd/mm/YYYY HH:MM"):
                parsed_raw_date = datetime.strptime(raw_date, "%d/%m/%Y %H:%M")
            else:
                parsed_raw_date = datetime.strptime(raw_date, "%d/%m/%Y %H:%M:%S")
            localized_date = pytz.timezone("Africa/Mogadishu").localize(parsed_raw_date)

            assert row["Sender"].startswith("avf-phone-uuid-"), \
                f"The 'Sender' column for '{blob_url}' contains an item that has not been de-identified " \
                f"into Africa's Voices Foundation's de-identification format. This may be done with de_identify_csv.py."

            d = {
                "avf_phone_id": row["Sender"],
                "message": row["Message"],
                "received_on": localized_date.isoformat(),
                "run_id": SHAUtils.sha_dict(row)
            }

            traced_runs.append(
                TracedData(d, Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())))
        log.info("Converted the recovered messages to TracedData")

        log.info(f"Exporting {len(traced_runs)} TracedData items to {traced_runs_output_path}...")
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
        with open(traced_runs_output_path, "w") as f:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(traced_runs, f)
        log.info("Exported TracedData")
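# A minimal usage sketch for fetch_from_recovery_csv, assuming only the two attributes the
# function actually reads (activation_flow_urls and survey_flow_urls). SimpleNamespace stands in
# for the real recovery-CSV source configuration class; the credentials path and gs:// URL are
# illustrative placeholders. phone_number_uuid_table is passed as None here because the function
# body above does not use it.
from types import SimpleNamespace

example_recovery_source = SimpleNamespace(
    activation_flow_urls=["gs://example-bucket/recovered/radio_show_1.csv"],  # placeholder blob URL
    survey_flow_urls=[]
)
fetch_from_recovery_csv("example_user", "path/to/credentials.json", "data/raw",
                        None, example_recovery_source)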
def test_round_trip(self):
    expected = self.generate_test_data()

    temp_file = tempfile.NamedTemporaryFile()
    with open(temp_file.name, "w") as f:
        TracedDataJsonIO.export_traced_data_iterable_to_jsonl(expected, f)
    with open(temp_file.name, "r") as f:
        imported = list(TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f))

    self.assertEqual(len(expected), len(imported))
    for x, y in zip(expected, imported):
        x_attributes = {k: getattr(x, k) for k in dir(x)
                        if not k.startswith("__") and not callable(getattr(x, k)) and k != "_cache"}
        y_attributes = {k: getattr(y, k) for k in dir(y)
                        if not k.startswith("__") and not callable(getattr(y, k)) and k != "_cache"}
        self.assertDictEqual(x_attributes, y_attributes)
def test_export_traced_data_iterable_to_jsonl(self):
    file_path = path.join(self.test_dir, "json_test.json")

    # Test exporting wrong data type
    data = self.generate_test_data()
    with open(file_path, "w") as f:
        try:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(data[0], f)
            self.fail("Exporting the wrong data type did not raise an assertion error")
        except AssertionError as e:
            self.assertEqual(str(e), _td_type_error_string)

    # Test normal export
    data = self.generate_test_data()
    with open(file_path, "w") as f:
        TracedDataJsonIO.export_traced_data_iterable_to_jsonl(data, f)
    self.assertTrue(filecmp.cmp(file_path, "tests/traced_data/resources/json_export_expected.json"))
def test_flush_history_from_traced_data_iterable(self):
    history_file_path = path.join(self.test_dir, "flush_test_history.jsonl")
    data = self.generate_test_data()
    data_0_sha = data[0].get_sha()

    self.assertEqual(len(data[1].get_history("Gender")), 2)

    TracedDataJsonIO.flush_history_from_traced_data_iterable("test_user", data, history_file_path)
    self.assertTrue(filecmp.cmp(history_file_path,
                                "tests/traced_data/resources/flush_history_expected_history.jsonl"))
    self.assertEqual(len(data[1].get_history("Gender")), 1)
    self.assertEqual(data[0]["_PrevTracedDataSHA"], data_0_sha)

    # Test the remaining data can be round-tripped
    latest_file_path = path.join(self.test_dir, "flush_test_latest.jsonl")
    with open(latest_file_path, "w") as f:
        TracedDataJsonIO.export_traced_data_iterable_to_jsonl(data, f)
    with open(latest_file_path, "r") as f:
        TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)
def fetch_from_rapid_pro(user, google_cloud_credentials_file_path, raw_data_dir, phone_number_uuid_table,
                         rapid_pro_source):
    log.info("Fetching data from Rapid Pro...")
    log.info("Downloading Rapid Pro access token...")
    rapid_pro_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, rapid_pro_source.token_file_url).strip()
    rapid_pro = RapidProClient(rapid_pro_source.domain, rapid_pro_token)

    # Load the previous export of contacts if it exists, otherwise fetch all contacts from Rapid Pro.
    raw_contacts_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_raw.json"
    contacts_log_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_log.jsonl"
    try:
        log.info(f"Loading raw contacts from file '{raw_contacts_path}'...")
        with open(raw_contacts_path) as raw_contacts_file:
            raw_contacts = [Contact.deserialize(contact_json) for contact_json in json.load(raw_contacts_file)]
        log.info(f"Loaded {len(raw_contacts)} contacts")
    except FileNotFoundError:
        log.info(f"File '{raw_contacts_path}' not found, will fetch all contacts from the Rapid Pro server")
        with open(contacts_log_path, "a") as contacts_log_file:
            raw_contacts = rapid_pro.get_raw_contacts(raw_export_log_file=contacts_log_file)

    # Download all the runs for each of the radio shows
    for flow in rapid_pro_source.activation_flow_names + rapid_pro_source.survey_flow_names:
        runs_log_path = f"{raw_data_dir}/{flow}_log.jsonl"
        raw_runs_path = f"{raw_data_dir}/{flow}_raw.json"
        traced_runs_output_path = f"{raw_data_dir}/{flow}.jsonl"
        log.info(f"Exporting flow '{flow}' to '{traced_runs_output_path}'...")

        flow_id = rapid_pro.get_flow_id(flow)

        # Load the previous export of runs for this flow, and update them with the newest runs.
        # If there is no previous export for this flow, fetch all the runs from Rapid Pro.
        with open(runs_log_path, "a") as raw_runs_log_file:
            try:
                log.info(f"Loading raw runs from file '{raw_runs_path}'...")
                with open(raw_runs_path) as raw_runs_file:
                    raw_runs = [Run.deserialize(run_json) for run_json in json.load(raw_runs_file)]
                log.info(f"Loaded {len(raw_runs)} runs")
                raw_runs = rapid_pro.update_raw_runs_with_latest_modified(
                    flow_id, raw_runs, raw_export_log_file=raw_runs_log_file, ignore_archives=True)
            except FileNotFoundError:
                log.info(f"File '{raw_runs_path}' not found, will fetch all runs from the Rapid Pro server "
                         f"for flow '{flow}'")
                raw_runs = rapid_pro.get_raw_runs_for_flow_id(flow_id, raw_export_log_file=raw_runs_log_file)

        # Fetch the latest contacts from Rapid Pro.
        with open(contacts_log_path, "a") as raw_contacts_log_file:
            raw_contacts = rapid_pro.update_raw_contacts_with_latest_modified(
                raw_contacts, raw_export_log_file=raw_contacts_log_file)

        # Convert the runs to TracedData.
        traced_runs = rapid_pro.convert_runs_to_traced_data(
            user, raw_runs, raw_contacts, phone_number_uuid_table, rapid_pro_source.test_contact_uuids)

        log.info(f"Saving {len(raw_runs)} raw runs to {raw_runs_path}...")
        with open(raw_runs_path, "w") as raw_runs_file:
            json.dump([run.serialize() for run in raw_runs], raw_runs_file)
        log.info(f"Saved {len(raw_runs)} raw runs")

        log.info(f"Saving {len(traced_runs)} traced runs to {traced_runs_output_path}...")
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
        with open(traced_runs_output_path, "w") as traced_runs_output_file:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(traced_runs, traced_runs_output_file)
        log.info(f"Saved {len(traced_runs)} traced runs")

        log.info(f"Saving {len(raw_contacts)} raw contacts to file '{raw_contacts_path}'...")
        with open(raw_contacts_path, "w") as raw_contacts_file:
            json.dump([contact.serialize() for contact in raw_contacts], raw_contacts_file)
        log.info(f"Saved {len(raw_contacts)} contacts")
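# A hypothetical invocation of fetch_from_rapid_pro, sketched from the attributes the function
# accesses above (domain, token_file_url, contacts_file_name, activation_flow_names,
# survey_flow_names, test_contact_uuids). The real pipelines build this source object from a
# configuration file; every value below is a placeholder, and the uuid table is stubbed out.
from types import SimpleNamespace

example_rapid_pro_source = SimpleNamespace(
    domain="textit.in",                                        # placeholder Rapid Pro server
    token_file_url="gs://example-bucket/rapid-pro-token.txt",  # placeholder token blob
    contacts_file_name="example_project_contacts",
    activation_flow_names=["example_activation_flow"],
    survey_flow_names=["example_demographics_flow"],
    test_contact_uuids=[]
)
phone_number_uuid_table = None  # placeholder; the real pipelines pass an initialised de-identification uuid table
fetch_from_rapid_pro("example_user", "path/to/credentials.json", "data/raw",
                     phone_number_uuid_table, example_rapid_pro_source)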
if pipeline_run_mode == "all-stages":
    log.info("Running post labelling pipeline stages...")
    log.info("Applying Manual Codes from Coda...")
    data = ApplyManualCodes.apply_manual_codes(user, data, prev_coded_dir_path)

    log.info("Generating Analysis CSVs...")
    messages_data, individuals_data = AnalysisFile.generate(
        user, data, csv_by_message_output_path, csv_by_individual_output_path)

    log.info("Writing messages TracedData to file...")
    IOUtils.ensure_dirs_exist_for_file(messages_json_output_path)
    with open(messages_json_output_path, "w") as f:
        TracedDataJsonIO.export_traced_data_iterable_to_jsonl(messages_data, f)

    log.info("Writing individuals TracedData to file...")
    IOUtils.ensure_dirs_exist_for_file(individuals_json_output_path)
    with open(individuals_json_output_path, "w") as f:
        TracedDataJsonIO.export_traced_data_iterable_to_jsonl(individuals_data, f)
else:
    assert pipeline_run_mode == "auto-code-only", "pipeline run mode must be either auto-code-only or all-stages"
    log.info("Writing Auto-Coding TracedData to file...")
    IOUtils.ensure_dirs_exist_for_file(auto_coding_json_output_path)
    with open(auto_coding_json_output_path, "w") as f:
        TracedDataJsonIO.export_traced_data_iterable_to_jsonl(data, f)

log.info("Python script complete")
print("Exporting advert CSV...") advert_phone_numbers = AdvertPhoneNumbers.generate( data, phone_number_uuid_table, advert_phone_numbers_csv_output_path) print("Filtering out RQA Messages labelled as Noise_Other_Project...") data = FilterNOP.filter_rqa_noise_other_project(data) print("Generating Analysis CSVs...") data = AnalysisFile.generate(user, data, csv_by_message_output_path, csv_by_individual_output_path) print("Writing TracedData to file...") IOUtils.ensure_dirs_exist_for_file(json_output_path) with open(json_output_path, "w") as f: TracedDataJsonIO.export_traced_data_iterable_to_jsonl(data, f) # Upload to Google Drive, if requested. # Note: This should happen as late as possible in order to reduce the risk of the remainder of the pipeline failing # after a Drive upload has occurred. Failures could result in inconsistent outputs or outputs with no # traced data log. if pipeline_configuration.drive_upload is not None: print("Uploading CSVs to Google Drive...") production_csv_drive_dir = os.path.dirname( pipeline_configuration.drive_upload.production_upload_path) production_csv_drive_file_name = os.path.basename( pipeline_configuration.drive_upload.production_upload_path) drive_client_wrapper.update_or_create( production_csv_output_path, production_csv_drive_dir,
def fetch_from_rapid_pro(user, google_cloud_credentials_file_path, raw_data_dir, phone_number_uuid_table,
                         rapid_pro_source):
    log.info("Fetching data from Rapid Pro...")
    log.info("Downloading Rapid Pro access token...")
    rapid_pro_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, rapid_pro_source.token_file_url).strip()
    rapid_pro = RapidProClient(rapid_pro_source.domain, rapid_pro_token)

    # Load the previous export of contacts if it exists, otherwise fetch all contacts from Rapid Pro.
    raw_contacts_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_raw.json"
    contacts_log_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_log.jsonl"
    try:
        log.info(f"Loading raw contacts from file '{raw_contacts_path}'...")
        with open(raw_contacts_path) as raw_contacts_file:
            raw_contacts = [Contact.deserialize(contact_json) for contact_json in json.load(raw_contacts_file)]
        log.info(f"Loaded {len(raw_contacts)} contacts")
    except FileNotFoundError:
        log.info(f"File '{raw_contacts_path}' not found, will fetch all contacts from the Rapid Pro server")
        with open(contacts_log_path, "a") as contacts_log_file:
            raw_contacts = rapid_pro.get_raw_contacts(raw_export_log_file=contacts_log_file)

    # Download all the runs for each of the radio shows
    for flow in rapid_pro_source.activation_flow_names + rapid_pro_source.survey_flow_names:
        runs_log_path = f"{raw_data_dir}/{flow}_log.jsonl"
        raw_runs_path = f"{raw_data_dir}/{flow}_raw.json"
        traced_runs_output_path = f"{raw_data_dir}/{flow}.jsonl"
        log.info(f"Exporting flow '{flow}' to '{traced_runs_output_path}'...")

        flow_id = rapid_pro.get_flow_id(flow)

        # Load the previous export of runs for this flow, and update them with the newest runs.
        # If there is no previous export for this flow, fetch all the runs from Rapid Pro.
        with open(runs_log_path, "a") as raw_runs_log_file:
            try:
                log.info(f"Loading raw runs from file '{raw_runs_path}'...")
                with open(raw_runs_path) as raw_runs_file:
                    raw_runs = [Run.deserialize(run_json) for run_json in json.load(raw_runs_file)]
                log.info(f"Loaded {len(raw_runs)} runs")
                raw_runs = rapid_pro.update_raw_runs_with_latest_modified(
                    flow_id, raw_runs, raw_export_log_file=raw_runs_log_file, ignore_archives=True)
            except FileNotFoundError:
                log.info(f"File '{raw_runs_path}' not found, will fetch all runs from the Rapid Pro server "
                         f"for flow '{flow}'")
                raw_runs = rapid_pro.get_raw_runs_for_flow_id(flow_id, raw_export_log_file=raw_runs_log_file)

        # Fetch the latest contacts from Rapid Pro.
        with open(contacts_log_path, "a") as raw_contacts_log_file:
            raw_contacts = rapid_pro.update_raw_contacts_with_latest_modified(
                raw_contacts, raw_export_log_file=raw_contacts_log_file)

        # Convert the runs to TracedData.
        traced_runs = rapid_pro.convert_runs_to_traced_data(
            user, raw_runs, raw_contacts, phone_number_uuid_table, rapid_pro_source.test_contact_uuids)

        if flow in rapid_pro_source.activation_flow_names:
            # Append the Rapid Pro source name to each run.
            # Only do this for activation flows because this is the only place where this is interesting.
            # Also, demogs may come from either instance, which causes problems downstream.
            for td in traced_runs:
                td.append_data({
                    "source_raw": rapid_pro_source.source_name,
                    "source_coded": CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.SOURCE,
                        CodeSchemes.SOURCE.get_code_with_match_value(rapid_pro_source.source_name),
                        Metadata.get_call_location()
                    ).to_dict()
                }, Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

        log.info(f"Saving {len(raw_runs)} raw runs to {raw_runs_path}...")
        with open(raw_runs_path, "w") as raw_runs_file:
            json.dump([run.serialize() for run in raw_runs], raw_runs_file)
        log.info(f"Saved {len(raw_runs)} raw runs")

        log.info(f"Saving {len(traced_runs)} traced runs to {traced_runs_output_path}...")
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
        with open(traced_runs_output_path, "w") as traced_runs_output_file:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(traced_runs, traced_runs_output_file)
        log.info(f"Saved {len(traced_runs)} traced runs")

        log.info(f"Saving {len(raw_contacts)} raw contacts to file '{raw_contacts_path}'...")
        with open(raw_contacts_path, "w") as raw_contacts_file:
            json.dump([contact.serialize() for contact in raw_contacts], raw_contacts_file)
        log.info(f"Saved {len(raw_contacts)} contacts")
def fetch_from_facebook(user, google_cloud_credentials_file_path, raw_data_dir, facebook_uuid_table,
                        facebook_source):
    log.info("Fetching data from Facebook...")
    log.info("Downloading Facebook access token...")
    facebook_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, facebook_source.token_file_url).strip()
    facebook = FacebookClient(facebook_token)

    for dataset in facebook_source.datasets:
        log.info(f"Exporting comments for dataset {dataset.name}...")
        raw_comments_output_path = f"{raw_data_dir}/{dataset.name}_raw.json"
        traced_comments_output_path = f"{raw_data_dir}/{dataset.name}.jsonl"

        # Download all the comments on all the posts in this dataset, logging the raw data returned by Facebook.
        raw_comments = []
        for post_id in dataset.post_ids:
            comments_log_path = f"{raw_data_dir}/{post_id}_comments_log.jsonl"
            with open(comments_log_path, "a") as raw_comments_log_file:
                post_comments = facebook.get_all_comments_on_post(
                    post_id, raw_export_log_file=raw_comments_log_file,
                    fields=["from{id}", "parent", "attachments", "created_time", "message"])

            # Download the post and add it as context to all the comments. Adding a reference to the post under
            # which a comment was made enables downstream features such as post-type labelling and comment context
            # in Coda, as well as allowing us to track how many comments were made on each post.
            post = facebook.get_post(post_id, fields=["attachments"])
            for comment in post_comments:
                comment["post"] = post

            raw_comments.extend(post_comments)

        # Facebook only returns a parent if the comment is a reply to another comment.
        # If there is no parent, set one to the empty-dict.
        for comment in raw_comments:
            if "parent" not in comment:
                comment["parent"] = {}

        # Convert the comments to TracedData.
        traced_comments = facebook.convert_facebook_comments_to_traced_data(
            user, dataset.name, raw_comments, facebook_uuid_table)

        # Export to disk.
        log.info(f"Saving {len(raw_comments)} raw comments to {raw_comments_output_path}...")
        IOUtils.ensure_dirs_exist_for_file(raw_comments_output_path)
        with open(raw_comments_output_path, "w") as raw_comments_output_file:
            json.dump(raw_comments, raw_comments_output_file)
        log.info(f"Saved {len(raw_comments)} raw comments")

        log.info(f"Saving {len(traced_comments)} traced comments to {traced_comments_output_path}...")
        IOUtils.ensure_dirs_exist_for_file(traced_comments_output_path)
        with open(traced_comments_output_path, "w") as traced_comments_output_file:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(traced_comments, traced_comments_output_file)
        log.info(f"Saved {len(traced_comments)} traced comments")
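# A hypothetical invocation of fetch_from_facebook. The source object mirrors the attributes the
# function reads above (token_file_url and datasets, where each dataset has a name and post_ids);
# all identifiers below are placeholders, and the uuid table is stubbed out where the real
# pipelines would pass an initialised de-identification table.
from types import SimpleNamespace

example_facebook_source = SimpleNamespace(
    token_file_url="gs://example-bucket/facebook-token.txt",  # placeholder token blob
    datasets=[
        SimpleNamespace(name="example_episode_1", post_ids=["123456789_987654321"])  # placeholder post id
    ]
)
facebook_uuid_table = None  # placeholder; substitute a real de-identification uuid table
fetch_from_facebook("example_user", "path/to/credentials.json", "data/raw",
                    facebook_uuid_table, example_facebook_source)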
with open(contacts_log_path, "a") as raw_contacts_log_file:
    raw_contacts = rapid_pro.update_raw_contacts_with_latest_modified(
        raw_contacts, raw_export_log_file=raw_contacts_log_file)

# Convert the runs to TracedData.
traced_runs = rapid_pro.convert_runs_to_traced_data(
    user, raw_runs, raw_contacts, phone_number_uuid_table, pipeline_configuration.rapid_pro_test_contact_uuids)

log.info(f"Saving {len(raw_runs)} raw runs to {raw_runs_path}...")
with open(raw_runs_path, "w") as raw_runs_file:
    json.dump([run.serialize() for run in raw_runs], raw_runs_file)
log.info(f"Saved {len(raw_runs)} raw runs")

log.info(f"Saving {len(traced_runs)} traced runs to {traced_runs_output_path}...")
IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
with open(traced_runs_output_path, "w") as traced_runs_output_file:
    TracedDataJsonIO.export_traced_data_iterable_to_jsonl(traced_runs, traced_runs_output_file)
log.info(f"Saved {len(traced_runs)} traced runs")

log.info(f"Saving {len(raw_contacts)} raw contacts to file '{raw_contacts_path}'...")
with open(raw_contacts_path, "w") as raw_contacts_file:
    json.dump([contact.serialize() for contact in raw_contacts], raw_contacts_file)
log.info(f"Saved {len(raw_contacts)} contacts")