def main(user, google_cloud_credentials_file_path, pipeline_configuration_file_path, raw_data_dir):
    # Read the settings from the configuration file
    log.info("Loading Pipeline Configuration File...")
    with open(pipeline_configuration_file_path) as f:
        pipeline_configuration = PipelineConfiguration.from_configuration_file(f)

    log.info("Downloading Firestore UUID Table credentials...")
    firestore_uuid_table_credentials = json.loads(google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path,
        pipeline_configuration.phone_number_uuid_table.firebase_credentials_file_url
    ))

    phone_number_uuid_table = FirestoreUuidTable(
        pipeline_configuration.phone_number_uuid_table.table_name,
        firestore_uuid_table_credentials,
        "avf-phone-uuid-"
    )
    log.info("Initialised the Firestore UUID table")

    log.info(f"Fetching data from {len(pipeline_configuration.raw_data_sources)} sources...")
    for i, raw_data_source in enumerate(pipeline_configuration.raw_data_sources):
        log.info(f"Fetching from source {i + 1}/{len(pipeline_configuration.raw_data_sources)}...")
        if isinstance(raw_data_source, RapidProSource):
            fetch_from_rapid_pro(user, google_cloud_credentials_file_path, raw_data_dir,
                                 phone_number_uuid_table, raw_data_source)
        elif isinstance(raw_data_source, GCloudBucketSource):
            fetch_from_gcloud_bucket(google_cloud_credentials_file_path, raw_data_dir, raw_data_source)
        elif isinstance(raw_data_source, ShaqadoonCSVSource):
            fetch_from_shaqadoon_csv(user, google_cloud_credentials_file_path, raw_data_dir,
                                     phone_number_uuid_table, raw_data_source)
        else:
            assert False, f"Unknown raw_data_source type {type(raw_data_source)}"
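# A minimal sketch of how main() might be wired to the command line, assuming the argparse
# pattern used by the other entry-point scripts in this repository; the parser description
# and argument help strings here are illustrative.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Fetches all the raw data for this pipeline")

    parser.add_argument("user", help="Identifier of the user launching this program")
    parser.add_argument("google_cloud_credentials_file_path", metavar="google-cloud-credentials-file-path",
                        help="Path to a Google Cloud service account credentials file")
    parser.add_argument("pipeline_configuration_file_path", metavar="pipeline-configuration-file-path",
                        help="Path to the pipeline configuration json file")
    parser.add_argument("raw_data_dir", metavar="raw-data-dir",
                        help="Path to a directory to save the raw data to")

    args = parser.parse_args()
    main(args.user, args.google_cloud_credentials_file_path,
         args.pipeline_configuration_file_path, args.raw_data_dir)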
def fetch_from_recovery_csv(user, google_cloud_credentials_file_path, raw_data_dir,
                            phone_number_uuid_table, recovery_csv_source):
    log.info("Fetching data from a Recovery CSV...")
    for blob_url in recovery_csv_source.activation_flow_urls + recovery_csv_source.survey_flow_urls:
        flow_name = blob_url.split('/')[-1].split('.')[0]  # Takes the name between the last '/' and the '.csv' ending
        traced_runs_output_path = f"{raw_data_dir}/{flow_name}.jsonl"
        if os.path.exists(traced_runs_output_path):
            log.info(f"File '{traced_runs_output_path}' for blob '{blob_url}' already exists; skipping download")
            continue

        log.info(f"Downloading recovered data from '{blob_url}'...")
        raw_csv_string = StringIO(google_cloud_utils.download_blob_to_string(
            google_cloud_credentials_file_path, blob_url))
        raw_data = list(csv.DictReader(raw_csv_string))
        log.info(f"Downloaded {len(raw_data)} recovered messages")

        log.info("Converting the recovered messages to TracedData...")
        traced_runs = []
        for row in raw_data:
            raw_date = row["ReceivedOn"]
            if len(raw_date) == len("dd/mm/YYYY HH:MM"):
                parsed_raw_date = datetime.strptime(raw_date, "%d/%m/%Y %H:%M")
            else:
                parsed_raw_date = datetime.strptime(raw_date, "%d/%m/%Y %H:%M:%S")
            localized_date = pytz.timezone("Africa/Mogadishu").localize(parsed_raw_date)

            assert row["Sender"].startswith("avf-phone-uuid-"), \
                f"The 'Sender' column for '{blob_url}' contains an item that has not been de-identified " \
                f"into Africa's Voices Foundation's de-identification format. This may be done with de_identify_csv.py."

            d = {
                "avf_phone_id": row["Sender"],
                "message": row["Message"],
                "received_on": localized_date.isoformat(),
                "run_id": SHAUtils.sha_dict(row)
            }

            traced_runs.append(
                TracedData(d, Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))
            )
        log.info("Converted the recovered messages to TracedData")

        log.info(f"Exporting {len(traced_runs)} TracedData items to {traced_runs_output_path}...")
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
        with open(traced_runs_output_path, "w") as f:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(traced_runs, f)
        log.info("Exported TracedData")
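# A quick, hedged illustration of the date handling above. The sample values are made up;
# only the column names ('Sender', 'Message', 'ReceivedOn'), the 'avf-phone-uuid-' prefix,
# and the two date formats come from fetch_from_recovery_csv itself. 'ReceivedOn' strings
# may or may not include seconds, so the parser branches on string length.
example_row = {"Sender": "avf-phone-uuid-0123", "Message": "hello", "ReceivedOn": "01/02/2019 14:30"}
assert len(example_row["ReceivedOn"]) == len("dd/mm/YYYY HH:MM")
parsed = datetime.strptime(example_row["ReceivedOn"], "%d/%m/%Y %H:%M")
print(pytz.timezone("Africa/Mogadishu").localize(parsed).isoformat())  # 2019-02-01T14:30:00+03:00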
def main(user, google_cloud_credentials_file_path, pipeline_configuration_file_path, raw_data_dir):
    # Read the settings from the configuration file
    log.info("Loading Pipeline Configuration File...")
    with open(pipeline_configuration_file_path) as f:
        pipeline_configuration = PipelineConfiguration.from_configuration_file(f)
    Logger.set_project_name(pipeline_configuration.pipeline_name)
    log.debug(f"Pipeline name is {pipeline_configuration.pipeline_name}")

    log.info("Downloading Firestore UUID Table credentials...")
    firestore_uuid_table_credentials = json.loads(google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path,
        pipeline_configuration.uuid_table.firebase_credentials_file_url
    ))

    uuid_table = FirestoreUuidTable(
        pipeline_configuration.uuid_table.table_name,
        firestore_uuid_table_credentials,
        pipeline_configuration.uuid_table.uuid_prefix
    )
    log.info("Initialised the Firestore UUID table")

    log.info(f"Fetching data from {len(pipeline_configuration.raw_data_sources)} sources...")
    for i, raw_data_source in enumerate(pipeline_configuration.raw_data_sources):
        log.info(f"Fetching from source {i + 1}/{len(pipeline_configuration.raw_data_sources)}...")
        if isinstance(raw_data_source, RapidProSource):
            fetch_from_rapid_pro(user, google_cloud_credentials_file_path, raw_data_dir,
                                 uuid_table, raw_data_source)
        elif isinstance(raw_data_source, GCloudBucketSource):
            fetch_from_gcloud_bucket(google_cloud_credentials_file_path, raw_data_dir, raw_data_source)
        elif isinstance(raw_data_source, RecoveryCSVSource):
            fetch_from_recovery_csv(user, google_cloud_credentials_file_path, raw_data_dir,
                                    uuid_table, raw_data_source)
        elif isinstance(raw_data_source, FacebookSource):
            fetch_from_facebook(user, google_cloud_credentials_file_path, raw_data_dir,
                                uuid_table, raw_data_source)
        else:
            assert False, f"Unknown raw_data_source type {type(raw_data_source)}"
def fetch_from_rapid_pro(user, google_cloud_credentials_file_path, raw_data_dir,
                         phone_number_uuid_table, rapid_pro_source):
    log.info("Fetching data from Rapid Pro...")
    log.info("Downloading Rapid Pro access token...")
    rapid_pro_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, rapid_pro_source.token_file_url).strip()
    rapid_pro = RapidProClient(rapid_pro_source.domain, rapid_pro_token)

    # Load the previous export of contacts if it exists, otherwise fetch all contacts from Rapid Pro.
    raw_contacts_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_raw.json"
    contacts_log_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_log.jsonl"
    try:
        log.info(f"Loading raw contacts from file '{raw_contacts_path}'...")
        with open(raw_contacts_path) as raw_contacts_file:
            raw_contacts = [Contact.deserialize(contact_json) for contact_json in json.load(raw_contacts_file)]
        log.info(f"Loaded {len(raw_contacts)} contacts")
    except FileNotFoundError:
        log.info(f"File '{raw_contacts_path}' not found, will fetch all contacts from the Rapid Pro server")
        with open(contacts_log_path, "a") as contacts_log_file:
            raw_contacts = rapid_pro.get_raw_contacts(raw_export_log_file=contacts_log_file)

    # Download all the runs for each of the radio shows
    for flow in rapid_pro_source.activation_flow_names + rapid_pro_source.survey_flow_names:
        runs_log_path = f"{raw_data_dir}/{flow}_log.jsonl"
        raw_runs_path = f"{raw_data_dir}/{flow}_raw.json"
        traced_runs_output_path = f"{raw_data_dir}/{flow}.jsonl"
        log.info(f"Exporting flow '{flow}' to '{traced_runs_output_path}'...")

        flow_id = rapid_pro.get_flow_id(flow)

        # Load the previous export of runs for this flow, and update them with the newest runs.
        # If there is no previous export for this flow, fetch all the runs from Rapid Pro.
        with open(runs_log_path, "a") as raw_runs_log_file:
            try:
                log.info(f"Loading raw runs from file '{raw_runs_path}'...")
                with open(raw_runs_path) as raw_runs_file:
                    raw_runs = [Run.deserialize(run_json) for run_json in json.load(raw_runs_file)]
                log.info(f"Loaded {len(raw_runs)} runs")
                raw_runs = rapid_pro.update_raw_runs_with_latest_modified(
                    flow_id, raw_runs, raw_export_log_file=raw_runs_log_file, ignore_archives=True)
            except FileNotFoundError:
                log.info(f"File '{raw_runs_path}' not found, will fetch all runs from the Rapid Pro server "
                         f"for flow '{flow}'")
                raw_runs = rapid_pro.get_raw_runs_for_flow_id(flow_id, raw_export_log_file=raw_runs_log_file)

        # Fetch the latest contacts from Rapid Pro.
        with open(contacts_log_path, "a") as raw_contacts_log_file:
            raw_contacts = rapid_pro.update_raw_contacts_with_latest_modified(
                raw_contacts, raw_export_log_file=raw_contacts_log_file)

        # Convert the runs to TracedData.
        traced_runs = rapid_pro.convert_runs_to_traced_data(
            user, raw_runs, raw_contacts, phone_number_uuid_table, rapid_pro_source.test_contact_uuids)

        log.info(f"Saving {len(raw_runs)} raw runs to {raw_runs_path}...")
        with open(raw_runs_path, "w") as raw_runs_file:
            json.dump([run.serialize() for run in raw_runs], raw_runs_file)
        log.info(f"Saved {len(raw_runs)} raw runs")

        log.info(f"Saving {len(traced_runs)} traced runs to {traced_runs_output_path}...")
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
        with open(traced_runs_output_path, "w") as traced_runs_output_file:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(traced_runs, traced_runs_output_file)
        log.info(f"Saved {len(traced_runs)} traced runs")

        log.info(f"Saving {len(raw_contacts)} raw contacts to file '{raw_contacts_path}'...")
        with open(raw_contacts_path, "w") as raw_contacts_file:
            json.dump([contact.serialize() for contact in raw_contacts], raw_contacts_file)
        log.info(f"Saved {len(raw_contacts)} contacts")
google_cloud_credentials_file_path = args.google_cloud_credentials_file_path
pipeline_configuration_file_path = args.pipeline_configuration_file_path
traced_data_paths = args.traced_data_paths
csv_output_file_path = args.csv_output_file_path

log.info("Loading Pipeline Configuration File...")
with open(pipeline_configuration_file_path) as f:
    pipeline_configuration = PipelineConfiguration.from_configuration_file(f)
Logger.set_project_name(pipeline_configuration.pipeline_name)
log.debug(f"Pipeline name is {pipeline_configuration.pipeline_name}")

log.info("Downloading Firestore UUID Table credentials...")
firestore_uuid_table_credentials = json.loads(google_cloud_utils.download_blob_to_string(
    google_cloud_credentials_file_path,
    pipeline_configuration.phone_number_uuid_table.firebase_credentials_file_url
))

phone_number_uuid_table = FirestoreUuidTable(
    pipeline_configuration.phone_number_uuid_table.table_name,
    firestore_uuid_table_credentials,
    "avf-phone-uuid-"
)
log.info("Initialised the Firestore UUID table")

uuids = set()
skipped_nr = 0
for path in traced_data_paths:
    # Load the traced data
    log.info(f"Loading previous traced data from file '{path}'...")
    with open(path) as f:
        data = TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)
    log.info(f"Loaded {len(data)} traced data objects")
memory_profile_file_path = args.memory_profile_file_path
data_archive_file_path = args.data_archive_file_path

log.info("Loading Pipeline Configuration File...")
with open(pipeline_configuration_file_path) as f:
    pipeline_configuration = PipelineConfiguration.from_configuration_file(f)
Logger.set_project_name(pipeline_configuration.pipeline_name)
log.debug(f"Pipeline name is {pipeline_configuration.pipeline_name}")

# Upload to Google Drive, if requested.
if pipeline_configuration.drive_upload is not None:
    log.info("Downloading Google Drive service account credentials...")
    credentials_info = json.loads(google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path,
        pipeline_configuration.drive_upload.drive_credentials_file_url
    ))
    drive_client_wrapper.init_client_from_info(credentials_info)

    log.info("Uploading CSVs to Google Drive...")
    production_csv_drive_dir = os.path.dirname(pipeline_configuration.drive_upload.production_upload_path)
    production_csv_drive_file_name = os.path.basename(pipeline_configuration.drive_upload.production_upload_path)
    drive_client_wrapper.update_or_create(
        production_csv_input_path, production_csv_drive_dir,
        target_file_name=production_csv_drive_file_name,
        target_folder_is_shared_with_me=True
    )
"credentials bucket") parser.add_argument("rapid_pro_domain", help="URL of the Rapid Pro server to download data from") parser.add_argument("rapid_pro_token_file_url", metavar="rapid-pro-token-file-url", help="GS URLs of a text file containing the authorisation token for the Rapid Pro server") parser.add_argument("output_file_path", metavar="output-file-path", help="Output CSV file to write the phone numbers to") args = parser.parse_args() google_cloud_credentials_file_path = args.google_cloud_credentials_file_path rapid_pro_domain = args.rapid_pro_domain rapid_pro_token_file_url = args.rapid_pro_token_file_url output_file_path = args.output_file_path log.info("Downloading the Rapid Pro access token...") rapid_pro_token = google_cloud_utils.download_blob_to_string( google_cloud_credentials_file_path, rapid_pro_token_file_url).strip() rapid_pro = RapidProClient(rapid_pro_domain, rapid_pro_token) all_messages = rapid_pro.get_raw_messages() inbound_messages = [msg for msg in all_messages if msg.direction == "in"] inbound_phone_numbers = set() for msg in inbound_messages: if msg.urn.startswith("tel:"): phone_number = msg.urn.split(":")[1] inbound_phone_numbers.add(phone_number) else: log.warning(f"Skipped non-telephone URN type {msg.urn.split(':')[0]}") log.warning(f"Exporting {len(inbound_phone_numbers)} inbound phone numbers to {output_file_path}...")
"credentials bucket") parser.add_argument( "firestore_credentials_url", metavar="firestore-credentials-url", help= "GS URL to the credentials file to use to access the Firestore instance containing " "the operations statistics") args = parser.parse_args() google_cloud_credentials_file_path = args.google_cloud_credentials_file_path firestore_credentials_url = args.firestore_credentials_url log.info("Initialising the Firestore client...") firestore_credentials = json.loads( google_cloud_utils.download_blob_to_string( google_cloud_credentials_file_path, firestore_credentials_url)) firestore_wrapper = FirestoreWrapper(firestore_credentials) log.info("Loading the active project details...") active_projects = firestore_wrapper.get_active_projects() log.info(f"Loaded the details for {len(active_projects)} active projects") for project in active_projects: if project.flow_definitions_upload_url_prefix is None: log.info( f"Not archiving flow definitions for project {project.project_name} because its " f"'flow_definitions_upload_url_prefix' is unspecified.") continue log.info( f"Archiving the latest flow definitions for project {project.project_name}..."
metavar="target-credentials-url", help= "GS URL to the organisation access token file for authenticating to the target instance" ) args = parser.parse_args() google_cloud_credentials_file_path = args.google_cloud_credentials_file_path source_domain = args.source_domain source_credentials_url = args.source_credentials_url target_domain = args.target_domain target_credentials_url = args.target_credentials_url # Initialise the source/target instances log.info("Downloading the source instance access token...") source_token = google_cloud_utils.download_blob_to_string( google_cloud_credentials_file_path, source_credentials_url).strip() source_instance = RapidProClient(source_domain, source_token) log.info("Downloading the target instance access token...") target_token = google_cloud_utils.download_blob_to_string( google_cloud_credentials_file_path, target_credentials_url).strip() target_instance = RapidProClient(target_domain, target_token) # For each contact field in the source instance, create a matching contact field in the target instance if it # does not already exist log.info("Copying contact fields...") source_fields = source_instance.get_fields() target_field_keys = {f.key for f in target_instance.get_fields()} for field in source_fields: if field.key not in target_field_keys: target_instance.create_field(field.label)
)

args = parser.parse_args()
google_cloud_credentials_file_path = args.google_cloud_credentials_file_path
force_update = args.force
instance_1_domain = args.instance_1_domain
instance_1_credentials_url = args.instance_1_credentials_url
instance_2_domain = args.instance_2_domain
instance_2_credentials_url = args.instance_2_credentials_url

# Initialise the two instances
log.info("Downloading the access token for instance 1...")
instance_1_token = google_cloud_utils.download_blob_to_string(
    google_cloud_credentials_file_path, instance_1_credentials_url).strip()
instance_1 = RapidProClient(instance_1_domain, instance_1_token)

log.info("Downloading the access token for instance 2...")
instance_2_token = google_cloud_utils.download_blob_to_string(
    google_cloud_credentials_file_path, instance_2_credentials_url).strip()
instance_2 = RapidProClient(instance_2_domain, instance_2_token)

# Synchronise the contact fields
log.info("Synchronising contact fields...")
instance_1_fields = instance_1.get_fields()
instance_2_fields = instance_2.get_fields()
for field in instance_1_fields:
    if field.key not in {f.key for f in instance_2_fields}:
        # The original snippet is cut off here; this completion is assumed from the
        # copy-fields pattern in the source/target script above.
        instance_2.create_field(field.label)
def fetch_from_rapid_pro(user, google_cloud_credentials_file_path, raw_data_dir,
                         phone_number_uuid_table, rapid_pro_source):
    log.info("Fetching data from Rapid Pro...")
    log.info("Downloading Rapid Pro access token...")
    rapid_pro_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, rapid_pro_source.token_file_url).strip()
    rapid_pro = RapidProClient(rapid_pro_source.domain, rapid_pro_token)

    # Load the previous export of contacts if it exists, otherwise fetch all contacts from Rapid Pro.
    raw_contacts_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_raw.json"
    contacts_log_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_log.jsonl"
    try:
        log.info(f"Loading raw contacts from file '{raw_contacts_path}'...")
        with open(raw_contacts_path) as raw_contacts_file:
            raw_contacts = [Contact.deserialize(contact_json) for contact_json in json.load(raw_contacts_file)]
        log.info(f"Loaded {len(raw_contacts)} contacts")
    except FileNotFoundError:
        log.info(f"File '{raw_contacts_path}' not found, will fetch all contacts from the Rapid Pro server")
        with open(contacts_log_path, "a") as contacts_log_file:
            raw_contacts = rapid_pro.get_raw_contacts(raw_export_log_file=contacts_log_file)

    # Download all the runs for each of the radio shows
    for flow in rapid_pro_source.activation_flow_names + rapid_pro_source.survey_flow_names:
        runs_log_path = f"{raw_data_dir}/{flow}_log.jsonl"
        raw_runs_path = f"{raw_data_dir}/{flow}_raw.json"
        traced_runs_output_path = f"{raw_data_dir}/{flow}.jsonl"
        log.info(f"Exporting flow '{flow}' to '{traced_runs_output_path}'...")

        flow_id = rapid_pro.get_flow_id(flow)

        # Load the previous export of runs for this flow, and update them with the newest runs.
        # If there is no previous export for this flow, fetch all the runs from Rapid Pro.
        with open(runs_log_path, "a") as raw_runs_log_file:
            try:
                log.info(f"Loading raw runs from file '{raw_runs_path}'...")
                with open(raw_runs_path) as raw_runs_file:
                    raw_runs = [Run.deserialize(run_json) for run_json in json.load(raw_runs_file)]
                log.info(f"Loaded {len(raw_runs)} runs")
                raw_runs = rapid_pro.update_raw_runs_with_latest_modified(
                    flow_id, raw_runs, raw_export_log_file=raw_runs_log_file, ignore_archives=True)
            except FileNotFoundError:
                log.info(f"File '{raw_runs_path}' not found, will fetch all runs from the Rapid Pro server "
                         f"for flow '{flow}'")
                raw_runs = rapid_pro.get_raw_runs_for_flow_id(flow_id, raw_export_log_file=raw_runs_log_file)

        # Fetch the latest contacts from Rapid Pro.
        with open(contacts_log_path, "a") as raw_contacts_log_file:
            raw_contacts = rapid_pro.update_raw_contacts_with_latest_modified(
                raw_contacts, raw_export_log_file=raw_contacts_log_file)

        # Convert the runs to TracedData.
        traced_runs = rapid_pro.convert_runs_to_traced_data(
            user, raw_runs, raw_contacts, phone_number_uuid_table, rapid_pro_source.test_contact_uuids)

        if flow in rapid_pro_source.activation_flow_names:
            # Append the Rapid Pro source name to each run.
            # Only do this for activation flows because this is the only place where this is interesting.
            # Also, demogs may come from either instance, which causes problems downstream.
            for td in traced_runs:
                td.append_data({
                    "source_raw": rapid_pro_source.source_name,
                    "source_coded": CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.SOURCE,
                        CodeSchemes.SOURCE.get_code_with_match_value(rapid_pro_source.source_name),
                        Metadata.get_call_location()
                    ).to_dict()
                }, Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

        log.info(f"Saving {len(raw_runs)} raw runs to {raw_runs_path}...")
        with open(raw_runs_path, "w") as raw_runs_file:
            json.dump([run.serialize() for run in raw_runs], raw_runs_file)
        log.info(f"Saved {len(raw_runs)} raw runs")

        log.info(f"Saving {len(traced_runs)} traced runs to {traced_runs_output_path}...")
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
        with open(traced_runs_output_path, "w") as traced_runs_output_file:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(traced_runs, traced_runs_output_file)
        log.info(f"Saved {len(traced_runs)} traced runs")

        log.info(f"Saving {len(raw_contacts)} raw contacts to file '{raw_contacts_path}'...")
        with open(raw_contacts_path, "w") as raw_contacts_file:
            json.dump([contact.serialize() for contact in raw_contacts], raw_contacts_file)
        log.info(f"Saved {len(raw_contacts)} contacts")
def fetch_from_facebook(user, google_cloud_credentials_file_path, raw_data_dir,
                        facebook_uuid_table, facebook_source):
    log.info("Fetching data from Facebook...")
    log.info("Downloading Facebook access token...")
    facebook_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, facebook_source.token_file_url).strip()
    facebook = FacebookClient(facebook_token)

    for dataset in facebook_source.datasets:
        log.info(f"Exporting comments for dataset {dataset.name}...")
        raw_comments_output_path = f"{raw_data_dir}/{dataset.name}_raw.json"
        traced_comments_output_path = f"{raw_data_dir}/{dataset.name}.jsonl"

        # Download all the comments on all the posts in this dataset, logging the raw data returned by Facebook.
        raw_comments = []
        for post_id in dataset.post_ids:
            comments_log_path = f"{raw_data_dir}/{post_id}_comments_log.jsonl"
            with open(comments_log_path, "a") as raw_comments_log_file:
                post_comments = facebook.get_all_comments_on_post(
                    post_id, raw_export_log_file=raw_comments_log_file,
                    fields=["from{id}", "parent", "attachments", "created_time", "message"])

            # Download the post and add it as context to all the comments. Adding a reference to the post under
            # which a comment was made enables downstream features such as post-type labelling and comment context
            # in Coda, as well as allowing us to track how many comments were made on each post.
            post = facebook.get_post(post_id, fields=["attachments"])
            for comment in post_comments:
                comment["post"] = post

            raw_comments.extend(post_comments)

        # Facebook only returns a parent if the comment is a reply to another comment.
        # If there is no parent, set it to the empty dict.
        for comment in raw_comments:
            if "parent" not in comment:
                comment["parent"] = {}

        # Convert the comments to TracedData.
        traced_comments = facebook.convert_facebook_comments_to_traced_data(
            user, dataset.name, raw_comments, facebook_uuid_table)

        # Export to disk.
        log.info(f"Saving {len(raw_comments)} raw comments to {raw_comments_output_path}...")
        IOUtils.ensure_dirs_exist_for_file(raw_comments_output_path)
        with open(raw_comments_output_path, "w") as raw_comments_output_file:
            json.dump(raw_comments, raw_comments_output_file)
        log.info(f"Saved {len(raw_comments)} raw comments")

        log.info(f"Saving {len(traced_comments)} traced comments to {traced_comments_output_path}...")
        IOUtils.ensure_dirs_exist_for_file(traced_comments_output_path)
        with open(traced_comments_output_path, "w") as traced_comments_output_file:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(traced_comments, traced_comments_output_file)
        log.info(f"Saved {len(traced_comments)} traced comments")
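# A hedged illustration of what a single raw comment looks like after the processing in
# fetch_from_facebook. The field values are invented; only the keys follow from the
# `fields` list and the post/parent handling above.
example_comment = {
    "message": "example comment text",
    "created_time": "2020-01-01T12:00:00+0000",
    "from": {"id": "1234567890"},           # requested via "from{id}"
    "parent": {},                           # empty dict when the comment is not a reply
    "post": {"attachments": {"data": []}},  # the post downloaded with fields=["attachments"]
}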
help="Path to a directory to save the raw data to") args = parser.parse_args() user = args.user pipeline_configuration_file_path = args.pipeline_configuration_file_path google_cloud_credentials_file_path = args.google_cloud_credentials_file_path raw_data_dir = args.raw_data_dir # Read the settings from the configuration file log.info("Loading Pipeline Configuration File...") with open(pipeline_configuration_file_path) as f: pipeline_configuration = PipelineConfiguration.from_configuration_file(f) log.info("Downloading Rapid Pro access token...") rapid_pro_token = google_cloud_utils.download_blob_to_string( google_cloud_credentials_file_path, pipeline_configuration.rapid_pro_token_file_url).strip() log.info("Downloading Firestore UUID Table credentials...") firestore_uuid_table_credentials = json.loads(google_cloud_utils.download_blob_to_string( google_cloud_credentials_file_path, pipeline_configuration.phone_number_uuid_table.firebase_credentials_file_url )) phone_number_uuid_table = FirestoreUuidTable( pipeline_configuration.phone_number_uuid_table.table_name, firestore_uuid_table_credentials, "avf-phone-uuid-" ) log.info("Initialised the Firestore UUID table") rapid_pro = RapidProClient(pipeline_configuration.rapid_pro_domain, rapid_pro_token)
args = parser.parse_args()
gzip_export_file_path = args.gzip_export_file_path
gcs_upload_path = args.gcs_upload_path
google_cloud_credentials_file_path = args.google_cloud_credentials_file_path
firebase_credentials_file_url = args.firebase_credentials_file_url
table_names = args.table_names

if gzip_export_file_path is None and gcs_upload_path is None:
    log.error("No output locations specified. Please provide at least one of --gzip-export-file-path or "
              "--gcs-upload-path")
    exit(1)

log.info("Downloading Firestore UUID Table credentials...")
firestore_uuid_table_credentials = json.loads(google_cloud_utils.download_blob_to_string(
    google_cloud_credentials_file_path, firebase_credentials_file_url
))
id_tables = FirestoreUuidInfrastructure.init_from_credentials(firestore_uuid_table_credentials)

if len(table_names) == 0:
    table_names = id_tables.list_table_names()
log.info(f"Found {len(table_names)} uuid tables to export")

export = dict()  # of table_name -> {"mappings": dict of data -> uuid}
for i, table_name in enumerate(table_names):
    log.info(f"Fetching mappings from table {i + 1}/{len(table_names)}: {table_name}...")
    mappings = id_tables.get_table(table_name, None).get_all_mappings()
    export[table_name] = {
        "mappings": mappings
    }
    log.info(f"Fetched {len(mappings)} mappings")
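# A minimal, hedged sketch of how the export dict built above might be serialised to the
# optional gzip file. The script's actual export step is not shown in this snippet, so
# this is an assumption about what follows, not the project's real implementation.
import gzip

if gzip_export_file_path is not None:
    log.info(f"Writing export of {len(export)} tables to {gzip_export_file_path}...")
    with gzip.open(gzip_export_file_path, "wt") as gzip_file:  # "wt" opens a text-mode stream
        json.dump(export, gzip_file)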