# Unpack the command-line arguments this part of the script needs.
google_cloud_credentials_file_path = args.google_cloud_credentials_file_path
pipeline_configuration_file_path = args.pipeline_configuration_file_path
avf_uuid_file_path = args.avf_uuid_file_path

# Parse the pipeline configuration and name the logger after the pipeline.
log.info("Loading Pipeline Configuration File...")
with open(pipeline_configuration_file_path) as configuration_file:
    pipeline_configuration = PipelineConfiguration.from_configuration_file(configuration_file)
pipeline_name = pipeline_configuration.pipeline_name
Logger.set_project_name(pipeline_name)
log.debug(f"Pipeline name is {pipeline_configuration.pipeline_name}")

# The token blob is stripped of surrounding whitespace before being handed to the client.
log.info("Downloading Rapid Pro access token...")
rapid_pro_token_blob = google_cloud_utils.download_blob_to_string(
    google_cloud_credentials_file_path, rapid_pro_token_url)
rapid_pro_token = rapid_pro_token_blob.strip()
rapid_pro = RapidProClient(rapid_pro_domain, rapid_pro_token)

# The Firestore credentials are downloaded as a JSON string and parsed before use.
log.info("Downloading Firestore UUID Table credentials...")
uuid_table_configuration = pipeline_configuration.phone_number_uuid_table
firestore_credentials_blob = google_cloud_utils.download_blob_to_string(
    google_cloud_credentials_file_path,
    uuid_table_configuration.firebase_credentials_file_url
)
firestore_uuid_table_credentials = json.loads(firestore_credentials_blob)
phone_number_uuid_table = FirestoreUuidTable(
    uuid_table_configuration.table_name,
    firestore_uuid_table_credentials,
    "avf-phone-uuid-"
)
log.info("Initialised the Firestore UUID table")

# Read the JSON file of uuids that are safe to send to.
log.info("Loading the uuids that are safe to send to")
with open(avf_uuid_file_path) as safe_uuids_file:
    safe_uuids = json.load(safe_uuids_file)
def fetch_from_rapid_pro(user, google_cloud_credentials_file_path, raw_data_dir, phone_number_uuid_table,
                         rapid_pro_source):
    """
    Exports the contacts and flow runs of one Rapid Pro source to files under `raw_data_dir`.

    Previous exports found in `raw_data_dir` are loaded and updated with the latest modifications from the
    server; where no previous export exists, everything is fetched from the server instead.

    For every flow in `rapid_pro_source.activation_flow_names + rapid_pro_source.survey_flow_names` this:
     - appends to '<flow>_log.jsonl', which is handed to the client as its `raw_export_log_file`,
     - rewrites '<flow>_raw.json' with the serialized raw runs,
     - rewrites '<flow>.jsonl' with the runs converted to TracedData.
    Contacts are logged to '<contacts_file_name>_log.jsonl' and saved to '<contacts_file_name>_raw.json'.

    :param user: Passed through to `RapidProClient.convert_runs_to_traced_data`.
    :param google_cloud_credentials_file_path: Path to the Google Cloud credentials file, used to download the
                                               Rapid Pro access token.
    :param raw_data_dir: Directory holding previous exports, and to write the new exports to.
    :param phone_number_uuid_table: UUID table passed through to `RapidProClient.convert_runs_to_traced_data`.
    :param rapid_pro_source: Source configuration. This function reads its `token_file_url`, `domain`,
                             `contacts_file_name`, `activation_flow_names`, `survey_flow_names` and
                             `test_contact_uuids` attributes.
    """
    log.info("Fetching data from Rapid Pro...")

    log.info("Downloading Rapid Pro access token...")
    rapid_pro_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, rapid_pro_source.token_file_url).strip()

    rapid_pro = RapidProClient(rapid_pro_source.domain, rapid_pro_token)

    # Load the previous export of contacts if it exists, otherwise fetch all contacts from Rapid Pro.
    raw_contacts_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_raw.json"
    contacts_log_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_log.jsonl"

    # Make sure the output directory exists before anything is opened for writing. Opening the log in append
    # mode creates a missing file but not a missing directory, so on a fresh `raw_data_dir` the first-run
    # branch below would otherwise fail.
    # NOTE(review): relies on IOUtils.ensure_dirs_exist_for_file creating missing parent directories.
    IOUtils.ensure_dirs_exist_for_file(contacts_log_path)

    try:
        log.info(f"Loading raw contacts from file '{raw_contacts_path}'...")
        with open(raw_contacts_path) as raw_contacts_file:
            raw_contacts = [
                Contact.deserialize(contact_json) for contact_json in json.load(raw_contacts_file)
            ]
        log.info(f"Loaded {len(raw_contacts)} contacts")
    except FileNotFoundError:
        log.info(
            f"File '{raw_contacts_path}' not found, will fetch all contacts from the Rapid Pro server"
        )
        with open(contacts_log_path, "a") as contacts_log_file:
            raw_contacts = rapid_pro.get_raw_contacts(raw_export_log_file=contacts_log_file)

    # Download all the runs for each of the radio shows
    for flow in rapid_pro_source.activation_flow_names + rapid_pro_source.survey_flow_names:
        runs_log_path = f"{raw_data_dir}/{flow}_log.jsonl"
        raw_runs_path = f"{raw_data_dir}/{flow}_raw.json"
        traced_runs_output_path = f"{raw_data_dir}/{flow}.jsonl"
        log.info(f"Exporting flow '{flow}' to '{traced_runs_output_path}'...")

        flow_id = rapid_pro.get_flow_id(flow)

        # All three per-flow files share one parent directory, so ensure it exists before the first of them
        # (the runs log) is opened, rather than just before the last one is written.
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)

        # Load the previous export of runs for this flow, and update them with the newest runs.
        # If there is no previous export for this flow, fetch all the runs from Rapid Pro.
        with open(runs_log_path, "a") as raw_runs_log_file:
            try:
                log.info(f"Loading raw runs from file '{raw_runs_path}'...")
                with open(raw_runs_path) as raw_runs_file:
                    raw_runs = [Run.deserialize(run_json) for run_json in json.load(raw_runs_file)]
                log.info(f"Loaded {len(raw_runs)} runs")
                raw_runs = rapid_pro.update_raw_runs_with_latest_modified(
                    flow_id, raw_runs, raw_export_log_file=raw_runs_log_file, ignore_archives=True)
            except FileNotFoundError:
                log.info(
                    f"File '{raw_runs_path}' not found, will fetch all runs from the Rapid Pro server for flow '{flow}'"
                )
                raw_runs = rapid_pro.get_raw_runs_for_flow_id(
                    flow_id, raw_export_log_file=raw_runs_log_file)

        # Fetch the latest contacts from Rapid Pro.
        with open(contacts_log_path, "a") as raw_contacts_log_file:
            raw_contacts = rapid_pro.update_raw_contacts_with_latest_modified(
                raw_contacts, raw_export_log_file=raw_contacts_log_file)

        # Convert the runs to TracedData.
        traced_runs = rapid_pro.convert_runs_to_traced_data(
            user, raw_runs, raw_contacts, phone_number_uuid_table, rapid_pro_source.test_contact_uuids)

        log.info(f"Saving {len(raw_runs)} raw runs to {raw_runs_path}...")
        with open(raw_runs_path, "w") as raw_runs_file:
            json.dump([run.serialize() for run in raw_runs], raw_runs_file)
        log.info(f"Saved {len(raw_runs)} raw runs")

        log.info(
            f"Saving {len(traced_runs)} traced runs to {traced_runs_output_path}..."
        )
        with open(traced_runs_output_path, "w") as traced_runs_output_file:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(traced_runs, traced_runs_output_file)
        log.info(f"Saved {len(traced_runs)} traced runs")

    # The contacts were refreshed while exporting each flow; persist the final state once at the end.
    log.info(
        f"Saving {len(raw_contacts)} raw contacts to file '{raw_contacts_path}'..."
    )
    with open(raw_contacts_path, "w") as raw_contacts_file:
        json.dump([contact.serialize() for contact in raw_contacts], raw_contacts_file)
    log.info(f"Saved {len(raw_contacts)} contacts")
if project.flow_definitions_upload_url_prefix is None: log.info( f"Not archiving flow definitions for project {project.project_name} because its " f"'flow_definitions_upload_url_prefix' is unspecified.") continue log.info( f"Archiving the latest flow definitions for project {project.project_name}..." ) log.info( "Downloading the Rapid Pro token file and initialising the Rapid Pro client..." ) rapid_pro_token = google_cloud_utils.download_blob_to_string( google_cloud_credentials_file_path, project.rapid_pro_token_url).strip() rapid_pro = RapidProClient(project.rapid_pro_domain, rapid_pro_token) log.info("Downloading all the flow definitions for this instance...") flow_ids = rapid_pro.get_all_flow_ids() flow_definitions_request_timestamp = TimeUtils.utc_now_as_iso_string() flow_definitions = rapid_pro.get_flow_definitions_for_flow_ids( flow_ids) log.info("Uploading the flow definitions to a cloud bucket...") upload_url = f"{project.flow_definitions_upload_url_prefix}{flow_definitions_request_timestamp}.json" flow_definitions_json = json.dumps(flow_definitions.serialize()) google_cloud_utils.upload_string_to_blob( google_cloud_credentials_file_path, upload_url, flow_definitions_json)
help="GS URLs of a text file containing the authorisation token for the Rapid Pro server") parser.add_argument("output_file_path", metavar="output-file-path", help="Output CSV file to write the phone numbers to") args = parser.parse_args() google_cloud_credentials_file_path = args.google_cloud_credentials_file_path rapid_pro_domain = args.rapid_pro_domain rapid_pro_token_file_url = args.rapid_pro_token_file_url output_file_path = args.output_file_path log.info("Downloading the Rapid Pro access token...") rapid_pro_token = google_cloud_utils.download_blob_to_string( google_cloud_credentials_file_path, rapid_pro_token_file_url).strip() rapid_pro = RapidProClient(rapid_pro_domain, rapid_pro_token) all_messages = rapid_pro.get_raw_messages() inbound_messages = [msg for msg in all_messages if msg.direction == "in"] inbound_phone_numbers = set() for msg in inbound_messages: if msg.urn.startswith("tel:"): phone_number = msg.urn.split(":")[1] inbound_phone_numbers.add(phone_number) else: log.warning(f"Skipped non-telephone URN type {msg.urn.split(':')[0]}") log.warning(f"Exporting {len(inbound_phone_numbers)} inbound phone numbers to {output_file_path}...") with open(output_file_path, "w") as f: writer = csv.DictWriter(f, fieldnames=["URN:Tel", "Name"])
"GS URL to the organisation access token file for authenticating to the target instance" ) args = parser.parse_args() google_cloud_credentials_file_path = args.google_cloud_credentials_file_path source_domain = args.source_domain source_credentials_url = args.source_credentials_url target_domain = args.target_domain target_credentials_url = args.target_credentials_url # Initialise the source/target instances log.info("Downloading the source instance access token...") source_token = google_cloud_utils.download_blob_to_string( google_cloud_credentials_file_path, source_credentials_url).strip() source_instance = RapidProClient(source_domain, source_token) log.info("Downloading the target instance access token...") target_token = google_cloud_utils.download_blob_to_string( google_cloud_credentials_file_path, target_credentials_url).strip() target_instance = RapidProClient(target_domain, target_token) # For each contact field in the source instance, create a matching contact field in the target instance if it # does not already exist log.info("Copying contact fields...") source_fields = source_instance.get_fields() target_field_keys = {f.key for f in target_instance.get_fields()} for field in source_fields: if field.key not in target_field_keys: target_instance.create_field(field.label) log.info("Contact fields copied")
args = parser.parse_args() google_cloud_credentials_file_path = args.google_cloud_credentials_file_path force_update = args.force instance_1_domain = args.instance_1_domain instance_1_credentials_url = args.instance_1_credentials_url instance_2_domain = args.instance_2_domain instance_2_credentials_url = args.instance_2_credentials_url # Initialise the two instances log.info("Downloading the access token for instance 1...") instance_1_token = google_cloud_utils.download_blob_to_string( google_cloud_credentials_file_path, instance_1_credentials_url).strip() instance_1 = RapidProClient(instance_1_domain, instance_1_token) log.info("Downloading the target instance access token...") instance_2_token = google_cloud_utils.download_blob_to_string( google_cloud_credentials_file_path, instance_2_credentials_url).strip() instance_2 = RapidProClient(instance_2_domain, instance_2_token) # Synchronise the contact fields log.info("Synchronising contact fields...") instance_1_fields = instance_1.get_fields() instance_2_fields = instance_2.get_fields() for field in instance_1_fields: if field.key not in {f.key for f in instance_2_fields}: instance_2.create_field(field.label) for field in instance_2_fields:
def fetch_from_rapid_pro(user, google_cloud_credentials_file_path, raw_data_dir, phone_number_uuid_table,
                         rapid_pro_source):
    """
    Exports the contacts and flow runs of one Rapid Pro source to files under `raw_data_dir`.

    Previous exports found in `raw_data_dir` are loaded and updated with the latest modifications from the
    server; where no previous export exists, everything is fetched from the server instead.

    For every flow in `rapid_pro_source.activation_flow_names + rapid_pro_source.survey_flow_names` this:
     - appends to '<flow>_log.jsonl', which is handed to the client as its `raw_export_log_file`,
     - rewrites '<flow>_raw.json' with the serialized raw runs,
     - rewrites '<flow>.jsonl' with the runs converted to TracedData.
    Runs of activation flows additionally have 'source_raw' and 'source_coded' keys appended, derived from
    `rapid_pro_source.source_name`.
    Contacts are logged to '<contacts_file_name>_log.jsonl' and saved to '<contacts_file_name>_raw.json'.

    :param user: Passed through to `RapidProClient.convert_runs_to_traced_data`, and used in the Metadata of
                 the appended source keys.
    :param google_cloud_credentials_file_path: Path to the Google Cloud credentials file, used to download the
                                               Rapid Pro access token.
    :param raw_data_dir: Directory holding previous exports, and to write the new exports to.
    :param phone_number_uuid_table: UUID table passed through to `RapidProClient.convert_runs_to_traced_data`.
    :param rapid_pro_source: Source configuration. This function reads its `token_file_url`, `domain`,
                             `contacts_file_name`, `activation_flow_names`, `survey_flow_names`,
                             `test_contact_uuids` and `source_name` attributes.
    """
    log.info("Fetching data from Rapid Pro...")

    log.info("Downloading Rapid Pro access token...")
    rapid_pro_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, rapid_pro_source.token_file_url).strip()

    rapid_pro = RapidProClient(rapid_pro_source.domain, rapid_pro_token)

    # Load the previous export of contacts if it exists, otherwise fetch all contacts from Rapid Pro.
    raw_contacts_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_raw.json"
    contacts_log_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_log.jsonl"

    # Make sure the output directory exists before anything is opened for writing. Opening the log in append
    # mode creates a missing file but not a missing directory, so on a fresh `raw_data_dir` the first-run
    # branch below would otherwise fail.
    # NOTE(review): relies on IOUtils.ensure_dirs_exist_for_file creating missing parent directories.
    IOUtils.ensure_dirs_exist_for_file(contacts_log_path)

    try:
        log.info(f"Loading raw contacts from file '{raw_contacts_path}'...")
        with open(raw_contacts_path) as raw_contacts_file:
            raw_contacts = [Contact.deserialize(contact_json) for contact_json in json.load(raw_contacts_file)]
        log.info(f"Loaded {len(raw_contacts)} contacts")
    except FileNotFoundError:
        log.info(f"File '{raw_contacts_path}' not found, will fetch all contacts from the Rapid Pro server")
        with open(contacts_log_path, "a") as contacts_log_file:
            raw_contacts = rapid_pro.get_raw_contacts(raw_export_log_file=contacts_log_file)

    # Download all the runs for each of the radio shows
    for flow in rapid_pro_source.activation_flow_names + rapid_pro_source.survey_flow_names:
        runs_log_path = f"{raw_data_dir}/{flow}_log.jsonl"
        raw_runs_path = f"{raw_data_dir}/{flow}_raw.json"
        traced_runs_output_path = f"{raw_data_dir}/{flow}.jsonl"
        log.info(f"Exporting flow '{flow}' to '{traced_runs_output_path}'...")

        flow_id = rapid_pro.get_flow_id(flow)

        # All three per-flow files share one parent directory, so ensure it exists before the first of them
        # (the runs log) is opened, rather than just before the last one is written.
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)

        # Load the previous export of runs for this flow, and update them with the newest runs.
        # If there is no previous export for this flow, fetch all the runs from Rapid Pro.
        with open(runs_log_path, "a") as raw_runs_log_file:
            try:
                log.info(f"Loading raw runs from file '{raw_runs_path}'...")
                with open(raw_runs_path) as raw_runs_file:
                    raw_runs = [Run.deserialize(run_json) for run_json in json.load(raw_runs_file)]
                log.info(f"Loaded {len(raw_runs)} runs")
                raw_runs = rapid_pro.update_raw_runs_with_latest_modified(
                    flow_id, raw_runs, raw_export_log_file=raw_runs_log_file, ignore_archives=True)
            except FileNotFoundError:
                log.info(f"File '{raw_runs_path}' not found, will fetch all runs from the Rapid Pro server for flow '{flow}'")
                raw_runs = rapid_pro.get_raw_runs_for_flow_id(flow_id, raw_export_log_file=raw_runs_log_file)

        # Fetch the latest contacts from Rapid Pro.
        with open(contacts_log_path, "a") as raw_contacts_log_file:
            raw_contacts = rapid_pro.update_raw_contacts_with_latest_modified(
                raw_contacts, raw_export_log_file=raw_contacts_log_file)

        # Convert the runs to TracedData.
        traced_runs = rapid_pro.convert_runs_to_traced_data(
            user, raw_runs, raw_contacts, phone_number_uuid_table, rapid_pro_source.test_contact_uuids)

        if flow in rapid_pro_source.activation_flow_names:
            # Append the Rapid Pro source name to each run.
            # Only do this for activation flows because this is the only place where this is interesting.
            # Also, demogs may come from either instance, which causes problems downstream.
            for td in traced_runs:
                td.append_data({
                    "source_raw": rapid_pro_source.source_name,
                    "source_coded": CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.SOURCE, CodeSchemes.SOURCE.get_code_with_match_value(rapid_pro_source.source_name),
                        Metadata.get_call_location()
                    ).to_dict()
                }, Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

        log.info(f"Saving {len(raw_runs)} raw runs to {raw_runs_path}...")
        with open(raw_runs_path, "w") as raw_runs_file:
            json.dump([run.serialize() for run in raw_runs], raw_runs_file)
        log.info(f"Saved {len(raw_runs)} raw runs")

        log.info(f"Saving {len(traced_runs)} traced runs to {traced_runs_output_path}...")
        with open(traced_runs_output_path, "w") as traced_runs_output_file:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(traced_runs, traced_runs_output_file)
        log.info(f"Saved {len(traced_runs)} traced runs")

    # The contacts were refreshed while exporting each flow; persist the final state once at the end.
    log.info(f"Saving {len(raw_contacts)} raw contacts to file '{raw_contacts_path}'...")
    with open(raw_contacts_path, "w") as raw_contacts_file:
        json.dump([contact.serialize() for contact in raw_contacts], raw_contacts_file)
    log.info(f"Saved {len(raw_contacts)} contacts")
# Unpack the command-line arguments this part of the script needs.
google_cloud_credentials_file_path = args.google_cloud_credentials_file_path
workspace_1_domain = args.workspace_1_domain
workspace_1_credentials_url = args.workspace_1_credentials_url
workspace_2_domain = args.workspace_2_domain
workspace_2_credentials_url = args.workspace_2_credentials_url
raw_data_log_directory = args.raw_data_log_directory

if dry_run:
    log.info("Performing a dry-run")

# Build a client for workspace 1 and ask the server what the workspace is called.
log.info("Downloading the access token for workspace 1...")
workspace_1_token_blob = google_cloud_utils.download_blob_to_string(
    google_cloud_credentials_file_path, workspace_1_credentials_url)
workspace_1_token = workspace_1_token_blob.strip()
workspace_1 = RapidProClient(workspace_1_domain, workspace_1_token)
workspace_1_name = workspace_1.get_workspace_name()
log.info(f"Done. workspace 1 is called {workspace_1_name}")

# Same again for workspace 2.
log.info("Downloading the access token for workspace 2...")
workspace_2_token_blob = google_cloud_utils.download_blob_to_string(
    google_cloud_credentials_file_path, workspace_2_credentials_url)
workspace_2_token = workspace_2_token_blob.strip()
workspace_2 = RapidProClient(workspace_2_domain, workspace_2_token)
workspace_2_name = workspace_2.get_workspace_name()
log.info(f"Done. workspace 2 is called {workspace_2_name}")

# Pull the contact fields, starting with workspace 1.
log.info("Downloading contact fields...")
log.info(f"Downloading all fields from {workspace_1_name}...")
workspace_1_fields = workspace_1.get_fields()
google_cloud_credentials_file_path, pipeline_configuration.rapid_pro_token_file_url).strip() log.info("Downloading Firestore UUID Table credentials...") firestore_uuid_table_credentials = json.loads(google_cloud_utils.download_blob_to_string( google_cloud_credentials_file_path, pipeline_configuration.phone_number_uuid_table.firebase_credentials_file_url )) phone_number_uuid_table = FirestoreUuidTable( pipeline_configuration.phone_number_uuid_table.table_name, firestore_uuid_table_credentials, "avf-phone-uuid-" ) log.info("Initialised the Firestore UUID table") rapid_pro = RapidProClient(pipeline_configuration.rapid_pro_domain, rapid_pro_token) # Load the previous export of contacts if it exists, otherwise fetch all contacts from Rapid Pro. raw_contacts_path = f"{raw_data_dir}/contacts_raw.json" contacts_log_path = f"{raw_data_dir}/contacts_log.jsonl" try: log.info(f"Loading raw contacts from file '{raw_contacts_path}'...") with open(raw_contacts_path) as raw_contacts_file: raw_contacts = [Contact.deserialize(contact_json) for contact_json in json.load(raw_contacts_file)] log.info(f"Loaded {len(raw_contacts)} contacts") except FileNotFoundError: log.info(f"File '{raw_contacts_path}' not found, will fetch all contacts from the Rapid Pro server") with open(contacts_log_path, "a") as contacts_log_file: raw_contacts = rapid_pro.get_raw_contacts(raw_export_log_file=contacts_log_file) # Download all the runs for each of the radio shows