google_cloud_credentials_file_path = args.google_cloud_credentials_file_path
pipeline_configuration_file_path = args.pipeline_configuration_file_path
raw_data_dir = args.raw_data_dir
prev_coded_dir_path = args.prev_coded_dir_path
messages_json_output_path = args.messages_json_output_path
individuals_json_output_path = args.individuals_json_output_path
icr_output_dir = args.icr_output_dir
coded_dir_path = args.coded_dir_path
csv_by_message_output_path = args.csv_by_message_output_path
csv_by_individual_output_path = args.csv_by_individual_output_path
production_csv_output_path = args.production_csv_output_path

# Load the pipeline configuration file
log.info("Loading Pipeline Configuration File...")
with open(pipeline_configuration_file_path) as f:
    pipeline_configuration = PipelineConfiguration.from_configuration_file(f)

if pipeline_configuration.drive_upload is not None:
    log.info("Downloading Google Drive service account credentials...")
    credentials_info = json.loads(google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, pipeline_configuration.drive_upload.drive_credentials_file_url))
    drive_client_wrapper.init_client_from_info(credentials_info)


# Load the input datasets
def load_datasets(flow_names):
    datasets = []
    for i, flow_name in enumerate(flow_names):
        raw_flow_path = f"{raw_data_dir}/{flow_name}.jsonl"
        log.info(f"Loading {i + 1}/{len(flow_names)}: {raw_flow_path}...")

args = parser.parse_args()

user = args.user
pipeline_configuration_file_path = args.pipeline_configuration_file_path
messages_json_input_path = args.messages_json_input_path
individuals_json_input_path = args.individuals_json_input_path
automated_analysis_output_dir = args.automated_analysis_output_dir

IOUtils.ensure_dirs_exist(automated_analysis_output_dir)
IOUtils.ensure_dirs_exist(f"{automated_analysis_output_dir}/maps/counties")
IOUtils.ensure_dirs_exist(f"{automated_analysis_output_dir}/maps/constituencies")
IOUtils.ensure_dirs_exist(f"{automated_analysis_output_dir}/graphs")

log.info("Loading Pipeline Configuration File...")
with open(pipeline_configuration_file_path) as f:
    pipeline_configuration = PipelineConfiguration.from_configuration_file(f)
Logger.set_project_name(pipeline_configuration.pipeline_name)
log.debug(f"Pipeline name is {pipeline_configuration.pipeline_name}")

sys.setrecursionlimit(30000)

# Read the messages dataset
log.info(f"Loading the messages dataset from {messages_json_input_path}...")
with open(messages_json_input_path) as f:
    messages = TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)
    for i in range(len(messages)):
        messages[i] = dict(messages[i].items())
log.info(f"Loaded {len(messages)} messages")

# Read the individuals dataset

parser.add_argument("-t", "--time_frame", metavar="time-frame", type=lambda s: datetime.strptime(s, '%H:%M:%S'), default="00:00:10", help="The time frame (HH:MM:SS) to generate dates in intervals between the start and end date") args = parser.parse_args() raw_messages_input_file_path = args.raw_messages_input_file_path messages_difference_per_two_firebase_time_period_output_file_path = args.messages_difference_per_two_firebase_time_period_output_file_path target_operator = args.target_operator target_message_direction = args.target_message_direction start_date = args.start_date end_date = args.end_date if args.time_frame: time_frame = args.time_frame with open(raw_messages_input_file_path, mode="r") as f: log.info(f"Loading messages from {raw_messages_input_file_path}...") input = json.load(f) messages = [Message.deserialize(val) for val in input] log.info(f"Loaded {len(messages)} messages") # Filter messages based on the target operator and target direction of the message log.info(f"Filtering messages based on {target_operator} and " f"message direction as '{target_message_direction}' from {len(messages)} total messages ") filtered_messages = [] for msg in messages: if msg.urn.startswith("tel:"): operator = PhoneCleaner.clean_operator(msg.urn.split(":")[1]) else: operator = msg.urn.split(":")[0] if operator == target_operator and msg.direction == target_message_direction: msg_direction = msg.direction
user = args.user
pipeline_configuration_file_path = args.pipeline_configuration_file_path
google_cloud_credentials_file_path = args.google_cloud_credentials_file_path
raw_data_dir = args.raw_data_dir
prev_coded_dir_path = args.prev_coded_dir_path
json_output_path = args.json_output_path
icr_output_dir = args.icr_output_dir
coded_dir_path = args.coded_dir_path
csv_by_message_output_path = args.csv_by_message_output_path
csv_by_individual_output_path = args.csv_by_individual_output_path
production_csv_output_path = args.production_csv_output_path

# Load the pipeline configuration file
log.info("Loading Pipeline Configuration File...")
with open(pipeline_configuration_file_path) as f:
    pipeline_configuration = PipelineConfiguration.from_configuration_file(f)

log.info("Downloading Firestore Uuid Table credentials...")
firestore_uuid_table_credentials = json.loads(google_cloud_utils.download_blob_to_string(
    google_cloud_credentials_file_path,
    pipeline_configuration.phone_number_uuid_table.firebase_credentials_file_url))

phone_number_uuid_table = FirestoreUuidTable(
    pipeline_configuration.phone_number_uuid_table.table_name,
    firestore_uuid_table_credentials,
    "avf-phone-uuid-")

if pipeline_configuration.drive_upload is not None:
    log.info("Downloading Google Drive service account credentials...")

metavar="data-archive-file-path", help="Path to the data archive file to upload") args = parser.parse_args() user = args.user google_cloud_credentials_file_path = args.google_cloud_credentials_file_path pipeline_configuration_file_path = args.pipeline_configuration_file_path run_id = args.run_id production_csv_input_path = args.production_csv_input_path messages_csv_input_path = args.messages_csv_input_path individuals_csv_input_path = args.individuals_csv_input_path memory_profile_file_path = args.memory_profile_file_path data_archive_file_path = args.data_archive_file_path log.info("Loading Pipeline Configuration File...") with open(pipeline_configuration_file_path) as f: pipeline_configuration = PipelineConfiguration.from_configuration_file( f) Logger.set_project_name(pipeline_configuration.pipeline_name) log.debug(f"Pipeline name is {pipeline_configuration.pipeline_name}") # Upload to Google Drive, if requested. if pipeline_configuration.drive_upload is not None: log.info(f"Downloading Google Drive service account credentials...") credentials_info = json.loads( google_cloud_utils.download_blob_to_string( google_cloud_credentials_file_path, pipeline_configuration. drive_upload.drive_credentials_file_url)) drive_client_wrapper.init_client_from_info(credentials_info)
parser.add_argument("output_dir", metavar="output-dir", help="Directory to write the output graphs to") args = parser.parse_args() user = args.user google_cloud_credentials_file_path = args.google_cloud_credentials_file_path pipeline_configuration_file_path = args.pipeline_configuration_file_path messages_json_input_path = args.messages_json_input_path individuals_json_input_path = args.individuals_json_input_path output_dir = args.output_dir IOUtils.ensure_dirs_exist(output_dir) log.info("Loading Pipeline Configuration File...") with open(pipeline_configuration_file_path) as f: pipeline_configuration = PipelineConfiguration.from_configuration_file(f) Logger.set_project_name(pipeline_configuration.pipeline_name) log.debug(f"Pipeline name is {pipeline_configuration.pipeline_name}") if pipeline_configuration.drive_upload is not None: log.info(f"Downloading Google Drive service account credentials...") credentials_info = json.loads(google_cloud_utils.download_blob_to_string( google_cloud_credentials_file_path, pipeline_configuration.drive_upload.drive_credentials_file_url)) drive_client_wrapper.init_client_from_info(credentials_info) # Read the messages dataset log.info(f"Loading the messages dataset from {messages_json_input_path}...") with open(messages_json_input_path) as f: messages = TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)
                    type=lambda s: isoparse(s),
                    help="The end date, as an ISO 8601 string, at which the window of downtime computation will end")

args = parser.parse_args()
raw_messages_file_path = args.raw_messages_file_path
window_of_downtimes_output_file_path = args.window_of_downtimes_output_file_path
target_operator = args.target_operator
target_message_direction = args.target_message_direction
start_date = args.start_date
end_date = args.end_date

with open(raw_messages_file_path, mode="r") as f:
    log.info(f"Loading messages from {raw_messages_file_path}...")
    raw_messages_data = json.load(f)
    messages = [Message.deserialize(val) for val in raw_messages_data]
    log.info(f"Loaded {len(messages)} messages")

msg_sent_on_timestamps = []
msg_sent_on_timestamps.append(start_date)

# Append `sent_on` timestamps to `msg_sent_on_timestamps` list
# based on the target operator and target direction of the message
for msg in messages:
    if msg.urn.startswith("tel:"):
        operator = PhoneCleaner.clean_operator(msg.urn.split(":")[1])
    else:
        operator = msg.urn.split(":")[0]

    if operator == target_operator and msg.direction == target_message_direction:
        msg_sent_on_timestamps.append(msg.sent_on)

help="Path to a Google Cloud service account credentials file to use to access the " "credentials bucket") parser.add_argument("rapid_pro_domain", help="URL of the Rapid Pro server to download data from") parser.add_argument("rapid_pro_token_file_url", metavar="rapid-pro-token-file-url", help="GS URLs of a text file containing the authorisation token for the Rapid Pro server") parser.add_argument("output_file_path", metavar="output-file-path", help="Output CSV file to write the phone numbers to") args = parser.parse_args() google_cloud_credentials_file_path = args.google_cloud_credentials_file_path rapid_pro_domain = args.rapid_pro_domain rapid_pro_token_file_url = args.rapid_pro_token_file_url output_file_path = args.output_file_path log.info("Downloading the Rapid Pro access token...") rapid_pro_token = google_cloud_utils.download_blob_to_string( google_cloud_credentials_file_path, rapid_pro_token_file_url).strip() rapid_pro = RapidProClient(rapid_pro_domain, rapid_pro_token) all_messages = rapid_pro.get_raw_messages() inbound_messages = [msg for msg in all_messages if msg.direction == "in"] inbound_phone_numbers = set() for msg in inbound_messages: if msg.urn.startswith("tel:"): phone_number = msg.urn.split(":")[1] inbound_phone_numbers.add(phone_number) else: log.warning(f"Skipped non-telephone URN type {msg.urn.split(':')[0]}")
help="CSV file to write the ADSS contacts from Bossaso to") parser.add_argument( "baidoa_output_path", metavar="baidoa-output-path", help="CSV file to write the ADSS contacts from Baidoa to") args = parser.parse_args() google_cloud_credentials_file_path = args.google_cloud_credentials_file_path pipeline_configuration_file_path = args.pipeline_configuration_file_path traced_data_path = args.traced_data_path bossaso_output_path = args.bossaso_output_path baidoa_output_path = args.baidoa_output_path # Read the settings from the configuration file log.info("Loading Pipeline Configuration File...") with open(pipeline_configuration_file_path) as f: pipeline_configuration = PipelineConfiguration.from_configuration_file( f) log.info("Downloading Firestore UUID Table credentials...") firestore_uuid_table_credentials = json.loads( google_cloud_utils.download_blob_to_string( google_cloud_credentials_file_path, pipeline_configuration. phone_number_uuid_table.firebase_credentials_file_url)) phone_number_uuid_table = FirestoreUuidTable( pipeline_configuration.phone_number_uuid_table.table_name, firestore_uuid_table_credentials, "avf-phone-uuid-") log.info("Initialised the Firestore UUID table")
args = parser.parse_args()
user = args.user
google_cloud_credentials_file_path = args.google_cloud_credentials_file_path
pipeline_configuration_file_path = args.pipeline_configuration_file_path
messages_json_input_path = args.messages_json_input_path
individuals_json_input_path = args.individuals_json_input_path
output_dir = args.output_dir

IOUtils.ensure_dirs_exist(output_dir)
IOUtils.ensure_dirs_exist(f"{output_dir}/maps")
IOUtils.ensure_dirs_exist(f"{output_dir}/graphs")

log.info("Loading Pipeline Configuration File...")
with open(pipeline_configuration_file_path) as f:
    pipeline_configuration = PipelineConfiguration.from_configuration_file(f)
Logger.set_project_name(pipeline_configuration.pipeline_name)
log.debug(f"Pipeline name is {pipeline_configuration.pipeline_name}")

if pipeline_configuration.drive_upload is not None:
    log.info("Downloading Google Drive service account credentials...")
    credentials_info = json.loads(google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, pipeline_configuration.drive_upload.drive_credentials_file_url))
    drive_client_wrapper.init_client_from_info(credentials_info)

# Read the messages dataset
log.info(f"Loading the messages dataset from {messages_json_input_path}...")
with open(messages_json_input_path) as f:
    messages = TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)

help= "Path to a Google Cloud service account credentials file to use to access the " "credentials bucket") parser.add_argument( "firestore_credentials_url", metavar="firestore-credentials-url", help= "GS URL to the credentials file to use to access the Firestore instance containing " "the operations statistics") args = parser.parse_args() google_cloud_credentials_file_path = args.google_cloud_credentials_file_path firestore_credentials_url = args.firestore_credentials_url log.info("Initialising the Firestore client...") firestore_credentials = json.loads( google_cloud_utils.download_blob_to_string( google_cloud_credentials_file_path, firestore_credentials_url)) firestore_wrapper = FirestoreWrapper(firestore_credentials) log.info("Loading the active project details...") active_projects = firestore_wrapper.get_active_projects() log.info(f"Loaded the details for {len(active_projects)} active projects") for project in active_projects: if project.flow_definitions_upload_url_prefix is None: log.info( f"Not archiving flow definitions for project {project.project_name} because its " f"'flow_definitions_upload_url_prefix' is unspecified.") continue
parser.add_argument("phone_number_uuid_table_path", metavar="phone-number-uuid-table-path", help="JSON file containing the phone number <-> UUID lookup table for the messages/surveys " "datasets") parser.add_argument("output_path", metavar="output-path", help="CSV file to write the REACH contacts to") args = parser.parse_args() traced_data_path = args.traced_data_path phone_number_uuid_table_path = args.phone_number_uuid_table_path output_path = args.output_path sys.setrecursionlimit(15000) # Load the phone number <-> uuid table log.info(f"Loading the phone number <-> uuid table from file '{phone_number_uuid_table_path}'...") with open(phone_number_uuid_table_path, "r") as f: phone_number_uuid_table = PhoneNumberUuidTable.load(f) log.info(f"Loaded {len(phone_number_uuid_table.numbers())} contacts") # Load the REACH traced data log.info(f"Loading REACH traced data from file '{traced_data_path}'...") with open(traced_data_path, "r") as f: data = TracedDataJsonIO.import_json_to_traced_data_iterable(f) log.info(f"Loaded {len(data)} traced data objects") # Search the TracedData for consenting contacts log.info("Searching for consenting uuids...") consenting_uuids = set() for td in data: if td["withdrawn_consent"] == Codes.TRUE:
user = args.user
pipeline_configuration_file_path = args.pipeline_configuration_file_path
messages_json_input_path = args.messages_json_input_path
individuals_json_input_path = args.individuals_json_input_path
automated_analysis_output_dir = args.automated_analysis_output_dir

IOUtils.ensure_dirs_exist(automated_analysis_output_dir)
IOUtils.ensure_dirs_exist(f"{automated_analysis_output_dir}/maps/regions")
IOUtils.ensure_dirs_exist(f"{automated_analysis_output_dir}/maps/districts")
IOUtils.ensure_dirs_exist(f"{automated_analysis_output_dir}/maps/mogadishu")
IOUtils.ensure_dirs_exist(f"{automated_analysis_output_dir}/graphs")

log.info("Loading Pipeline Configuration File...")
with open(pipeline_configuration_file_path) as f:
    pipeline_configuration = PipelineConfiguration.from_configuration_file(f)
Logger.set_project_name(pipeline_configuration.pipeline_name)
log.debug(f"Pipeline name is {pipeline_configuration.pipeline_name}")

# Read the messages dataset
log.info(f"Loading the messages dataset from {messages_json_input_path}...")
with open(messages_json_input_path) as f:
    messages = TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)
log.info(f"Loaded {len(messages)} messages")

# Read the individuals dataset
log.info(

help= "Path to analysis dataset CSV where respondents are the unit for analysis (i.e. one " "respondent per row, with all their messages joined into a single cell)" ), args = parser.parse_args() user = args.user google_cloud_credentials_file_path = args.google_cloud_credentials_file_path pipeline_configuration_file_path = args.pipeline_configuration_file_path run_id = args.run_id production_csv_input_path = args.production_csv_input_path messages_csv_input_path = args.messages_csv_input_path individuals_csv_input_path = args.individuals_csv_input_path log.info("Loading Pipeline Configuration File...") with open(pipeline_configuration_file_path) as f: pipeline_configuration = PipelineConfiguration.from_configuration_file( f) Logger.set_project_name(pipeline_configuration.pipeline_name) log.debug(f"Pipeline name is {pipeline_configuration.pipeline_name}") # Upload to Google Drive, if requested. if pipeline_configuration.drive_upload is not None: log.info(f"Downloading Google Drive service account credentials...") credentials_info = json.loads( google_cloud_utils.download_blob_to_string( google_cloud_credentials_file_path, pipeline_configuration. drive_upload.drive_credentials_file_url)) drive_client_wrapper.init_client_from_info(credentials_info)
parser.add_argument( "data_archive_dir_path", metavar="data-archive-dir-path", help="Path to the data archive directory with file to upload") args = parser.parse_args() user = args.user google_cloud_credentials_file_path = args.google_cloud_credentials_file_path pipeline_configuration_file_path = args.pipeline_configuration_file_path memory_profile_dir_path = args.memory_profile_dir_path data_archive_dir_path = args.data_archive_dir_path date_pattern = r'\d{4}-\d{2}-\d{2}' log.info("Loading Pipeline Configuration File...") with open(pipeline_configuration_file_path) as f: pipeline_configuration = PipelineConfiguration.from_configuration_file( f) Logger.set_project_name(pipeline_configuration.pipeline_name) log.debug(f"Pipeline name is {pipeline_configuration.pipeline_name}") uploaded_memory_logs = google_cloud_utils.list_blobs( google_cloud_credentials_file_path, pipeline_configuration.memory_profile_upload_bucket, pipeline_configuration.bucket_dir_path, ) uploaded_memory_log_dates = get_uploaded_file_dates( uploaded_memory_logs, date_pattern) uploaded_data_archives = google_cloud_utils.list_blobs(
            raise
        doc_count += 1

    time_end = time.perf_counter_ns()
    ms_elapsed = (time_end - time_start) / (1000 * 1000)
    log.info(f"validated {doc_count} ids in {ms_elapsed} ms")


def usage():
    print("Usage: python validate_firebase.py crypto_token")


if len(sys.argv) != 2:
    usage()
    exit(1)

crypto_token_path = sys.argv[1]
if not os.path.isfile(crypto_token_path):
    print(f"Expected crypto token file {crypto_token_path}")
    usage()
    exit(1)

firebase_client = firebase_util.init_firebase_client(crypto_token_path)

validate_documents("systemMessages", model.validate_SystemMessage_doc)
validate_documents("suggestedReplies", model.validate_SuggestedReply_doc)
validate_documents("conversationTags", custom.validate_ConversationTag)
validate_documents("messageTags", custom.validate_MessageTag)
validate_documents("nook_conversations", custom.validate_Conversation)

log.info("Validation complete")

help="Directory to read the automated analysis outputs from") args = parser.parse_args() user = args.user pipeline_run_mode = args.pipeline_run_mode google_cloud_credentials_file_path = args.google_cloud_credentials_file_path pipeline_configuration_file_path = args.pipeline_configuration_file_path run_id = args.run_id production_csv_input_path = args.production_csv_input_path messages_csv_input_path = args.messages_csv_input_path individuals_csv_input_path = args.individuals_csv_input_path automated_analysis_input_dir = args.automated_analysis_input_dir log.info("Loading Pipeline Configuration File...") with open(pipeline_configuration_file_path) as f: pipeline_configuration = PipelineConfiguration.from_configuration_file( f) Logger.set_project_name(pipeline_configuration.pipeline_name) log.debug(f"Pipeline name is {pipeline_configuration.pipeline_name}") # Upload to Google Drive, if requested. if pipeline_configuration.drive_upload is not None: log.info(f"Downloading Google Drive service account credentials...") credentials_info = json.loads( google_cloud_utils.download_blob_to_string( google_cloud_credentials_file_path, pipeline_configuration. drive_upload.drive_credentials_file_url)) drive_client_wrapper.init_client_from_info(credentials_info)
help="Identifier of this pipeline run") parser.add_argument("memory_profile_file_path", metavar="memory-profile-file-path", help="Path to the memory profile log file to upload") parser.add_argument("data_archive_file_path", metavar="data-archive-file-path", help="Path to the data archive file to upload") args = parser.parse_args() user = args.user google_cloud_credentials_file_path = args.google_cloud_credentials_file_path pipeline_configuration_file_path = args.pipeline_configuration_file_path run_id = args.run_id memory_profile_file_path = args.memory_profile_file_path data_archive_file_path = args.data_archive_file_path log.info("Loading Pipeline Configuration File...") with open(pipeline_configuration_file_path) as f: pipeline_configuration = PipelineConfiguration.from_configuration_file(f) memory_profile_upload_location = f"{pipeline_configuration.memory_profile_upload_url_prefix}{run_id}.profile" log.info(f"Uploading the memory profile from {memory_profile_file_path} to " f"{memory_profile_upload_location}...") with open(memory_profile_file_path, "rb") as f: google_cloud_utils.upload_file_to_blob( google_cloud_credentials_file_path, memory_profile_upload_location, f ) data_archive_upload_location = f"{pipeline_configuration.data_archive_upload_url_prefix}{run_id}.tar.gzip" log.info(f"Uploading the data archive from {data_archive_file_path} to " f"{data_archive_upload_location}...") with open(data_archive_file_path, "rb") as f:
args = parser.parse_args()
user = args.user
pipeline_configuration_file_path = args.pipeline_configuration_file_path
messages_json_input_path = args.messages_json_input_path
individuals_json_input_path = args.individuals_json_input_path
automated_analysis_output_dir = args.automated_analysis_output_dir

IOUtils.ensure_dirs_exist(automated_analysis_output_dir)
IOUtils.ensure_dirs_exist(f"{automated_analysis_output_dir}/maps/regions")
IOUtils.ensure_dirs_exist(f"{automated_analysis_output_dir}/maps/districts")
IOUtils.ensure_dirs_exist(f"{automated_analysis_output_dir}/maps/mogadishu")
IOUtils.ensure_dirs_exist(f"{automated_analysis_output_dir}/graphs")

log.info("Loading Pipeline Configuration File...")
with open(pipeline_configuration_file_path) as f:
    pipeline_configuration = PipelineConfiguration.from_configuration_file(f)
Logger.set_project_name(pipeline_configuration.pipeline_name)
log.debug(f"Pipeline name is {pipeline_configuration.pipeline_name}")

sys.setrecursionlimit(30000)

# Read the messages dataset
log.info(f"Loading the messages dataset from {messages_json_input_path}...")
with open(messages_json_input_path) as f:
    messages = TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)
    for i in range(len(messages)):
        messages[i] = dict(messages[i].items())
log.info(f"Loaded {len(messages)} messages")

# Read the individuals dataset

                                                     IGNORE_STOP)

# prepare for writing to a json file that can be uploaded to firebase
daily_metrics_list = []
for day in daily_metrics:
    day_metrics = daily_metrics[day]
    day_metrics["__id"] = day
    day_metrics["__reference_path"] = f"{DAILY_TAG_METRICS_COLLECTION_KEY}/{day}"
    day_metrics["__subcollections"] = []
    daily_metrics_list.append(day_metrics)
daily_metrics_json = {DAILY_TAG_METRICS_COLLECTION_KEY: daily_metrics_list}

daily_metrics_file = f"{OUTPUT_FOLDER}/nook-analysis-daily_metrics.json"
with open(daily_metrics_file, mode="w", encoding='utf-8') as output_file:
    json.dump(daily_metrics_json, output_file, indent=2)
    log.info(f"compute_daily_tag_distribution saved to {daily_metrics_file}")

total_counts = compute_total_counts(nook_conversations, IGNORE_STOP)

# prepare for writing to a json file that can be uploaded to firebase
total_counts["__id"] = TOTAL_COUNTS_METRICS_COLLECTION_KEY
total_counts["__reference_path"] = f"{TOTAL_COUNTS_METRICS_COLLECTION_KEY}/{TOTAL_COUNTS_METRICS_COLLECTION_KEY}"
total_counts["__subcollections"] = []
total_counts_json = {TOTAL_COUNTS_METRICS_COLLECTION_KEY: [total_counts]}

total_counts_file = f"{OUTPUT_FOLDER}/nook-analysis-total_counts.json"
with open(total_counts_file, mode="w", encoding='utf-8') as output_file:
    json.dump(total_counts_json, output_file, indent=2)
    log.info(f"compute_total_counts saved to {total_counts_file}")

needs_reply_metrics = compute_needs_reply_metrics(nook_conversations)

if not reset_flag and not replay_flag:
    print(f"Unknown flag: {sys.argv[3]}")
    usage()
    exit(1)

if len(sys.argv) > 4:
    print(f"Unexpected argument {sys.argv[4]}")
    usage()
    exit(1)

doc_count = 0
skip_count = 0
migration_count = 0

firebase_client = firebase_util.init_firebase_client(crypto_token_path)

# migrate_collection("suggestedReplies", migrate_nook_model.migrate_SuggestedReply)
migrate_collection("nook_conversations", migrate_nook_model.migrate_Conversation)
migrate_collection("conversationTags", migrate_nook_model.migrate_Tag)
migrate_collection("messageTags", migrate_nook_model.migrate_Tag)

log.info("Migration complete")
log.info(f" {migration_count} documents migrated")
log.info(f" {skip_count} documents already migrated")
log.info(f" {doc_count - skip_count - migration_count} documents unchanged")

if migrate_nook_model.warning_count == 0:
    log.info(" no warnings")
else:
    log.info("")
    log.info(f" {migrate_nook_model.warning_count} WARNINGS")
    log.info("")

google_cloud_credentials_file_path = args.google_cloud_credentials_file_path
pipeline_configuration_file_path = args.pipeline_configuration_file_path
raw_data_dir = args.raw_data_dir
prev_coded_dir_path = args.prev_coded_dir_path
messages_json_output_path = args.messages_json_output_path
individuals_json_output_path = args.individuals_json_output_path
icr_output_dir = args.icr_output_dir
coded_dir_path = args.coded_dir_path
csv_by_message_output_path = args.csv_by_message_output_path
csv_by_individual_output_path = args.csv_by_individual_output_path
production_csv_output_path = args.production_csv_output_path

# Load the pipeline configuration file
log.info("Loading Pipeline Configuration File...")
with open(pipeline_configuration_file_path) as f:
    pipeline_configuration = PipelineConfiguration.from_configuration_file(f)
log.info(f"Running {pipeline_configuration.pipeline_name}")

if pipeline_configuration.drive_upload is not None:
    log.info("Downloading Google Drive service account credentials...")
    credentials_info = json.loads(google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, pipeline_configuration.drive_upload.drive_credentials_file_url))
    drive_client_wrapper.init_client_from_info(credentials_info)

log.info("Loading the raw data...")
data = LoadData.load_raw_data(user, raw_data_dir, pipeline_configuration)

metavar="pipeline-configuration-file", help="Path to the pipeline configuration json file") parser.add_argument( "avf_uuid_file_path", metavar="avf-uuid-file-path", help= "Path to a json file containing a list of avf uuids that it's safe to trigger " "the demog flow to") args = parser.parse_args() google_cloud_credentials_file_path = args.google_cloud_credentials_file_path pipeline_configuration_file_path = args.pipeline_configuration_file_path avf_uuid_file_path = args.avf_uuid_file_path log.info("Loading Pipeline Configuration File...") with open(pipeline_configuration_file_path) as f: pipeline_configuration = PipelineConfiguration.from_configuration_file( f) Logger.set_project_name(pipeline_configuration.pipeline_name) log.debug(f"Pipeline name is {pipeline_configuration.pipeline_name}") log.info("Downloading Rapid Pro access token...") rapid_pro_token = google_cloud_utils.download_blob_to_string( google_cloud_credentials_file_path, rapid_pro_token_url).strip() rapid_pro = RapidProClient(rapid_pro_domain, rapid_pro_token) log.info("Downloading Firestore UUID Table credentials...") firestore_uuid_table_credentials = json.loads( google_cloud_utils.download_blob_to_string(
metavar="merged-code-id", help="Id of the code to merge the source codes to") parser.add_argument( "messages_output_file_path", metavar="messages-output-file-path", help= "Path to the Coda messages file to write the messages to after performing the code merge" ) args = parser.parse_args() messages_input_file_path = args.messages_input_file_path code_ids_to_merge = args.code_ids_to_merge merged_code_id = args.merged_code_id messages_output_file_path = args.messages_output_file_path log.info(f"Loading Coda messages from '{messages_input_file_path}'...") with open(messages_input_file_path) as f: messages = [Message.from_firebase_map(d) for d in json.load(f)] log.info(f"Loaded {len(messages)} messages") log.info(f"Performing merge ({code_ids_to_merge} -> '{merged_code_id}')...") merged_count = 0 # A count of the number of labels that were remapped to the merged value, for sense-check logging for msg in messages: processed_scheme_ids = set() for label in list(msg.labels): # Skip labels that are not the latest assignment under each scheme if label.scheme_id in processed_scheme_ids: continue processed_scheme_ids.add(label.scheme_id) if label.code_id in code_ids_to_merge:
pipeline_configuration_file_path = args.pipeline_configuration_file_path
raw_data_dir = args.raw_data_dir
prev_coded_dir_path = args.prev_coded_dir_path
auto_coding_json_output_path = args.auto_coding_json_output_path
messages_json_output_path = args.messages_json_output_path
individuals_json_output_path = args.individuals_json_output_path
icr_output_dir = args.icr_output_dir
coded_dir_path = args.coded_dir_path
csv_by_message_output_path = args.csv_by_message_output_path
csv_by_individual_output_path = args.csv_by_individual_output_path
production_csv_output_path = args.production_csv_output_path

# Load the pipeline configuration file
log.info("Loading Pipeline Configuration File...")
with open(pipeline_configuration_file_path) as f:
    pipeline_configuration = PipelineConfiguration.from_configuration_file(f)
Logger.set_project_name(pipeline_configuration.pipeline_name)
log.debug(f"Pipeline name is {pipeline_configuration.pipeline_name}")

log.info("Loading the raw data...")
data = LoadData.load_raw_data(user, raw_data_dir, pipeline_configuration)

log.info("Translating source Keys...")
data = TranslateSourceKeys.translate_source_keys(user, data, pipeline_configuration)

if pipeline_configuration.move_ws_messages:
    log.info("Pre-filtering empty message objects...")

user = args.user
google_cloud_credentials_file_path = args.google_cloud_credentials_file_path
pipeline_configuration_file_path = args.pipeline_configuration_file_path
messages_json_input_path = args.messages_json_input_path
individuals_json_input_path = args.individuals_json_input_path
automated_analysis_output_dir = args.automated_analysis_output_dir

IOUtils.ensure_dirs_exist(automated_analysis_output_dir)
IOUtils.ensure_dirs_exist(f"{automated_analysis_output_dir}/maps/counties")
IOUtils.ensure_dirs_exist(f"{automated_analysis_output_dir}/maps/constituencies")
IOUtils.ensure_dirs_exist(f"{automated_analysis_output_dir}/maps/urban")
IOUtils.ensure_dirs_exist(f"{automated_analysis_output_dir}/graphs")

log.info("Loading Pipeline Configuration File...")
with open(pipeline_configuration_file_path) as f:
    pipeline_configuration = PipelineConfiguration.from_configuration_file(f)
Logger.set_project_name(pipeline_configuration.pipeline_name)
log.debug(f"Pipeline name is {pipeline_configuration.pipeline_name}")

# Read the messages dataset
log.info(f"Loading the messages dataset from {messages_json_input_path}...")
with open(messages_json_input_path) as f:
    messages = TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)
log.info(f"Loaded {len(messages)} messages")

# Read the individuals dataset
log.info(

parser.add_argument( "csv_output_file_path", metavar="csv-output-file-path", help= "Path to a CSV file to write the contacts from the locations of interest to. " "Exported file is in a format suitable for direct upload to Rapid Pro") args = parser.parse_args() exclusion_list_file_path = args.exclusion_list_file_path google_cloud_credentials_file_path = args.google_cloud_credentials_file_path pipeline_configuration_file_path = args.pipeline_configuration_file_path traced_data_paths = args.traced_data_paths csv_output_file_path = args.csv_output_file_path log.info("Loading Pipeline Configuration File...") with open(pipeline_configuration_file_path) as f: pipeline_configuration = PipelineConfiguration.from_configuration_file( f) Logger.set_project_name(pipeline_configuration.pipeline_name) log.debug(f"Pipeline name is {pipeline_configuration.pipeline_name}") log.info("Downloading Firestore UUID Table credentials...") firestore_uuid_table_credentials = json.loads( google_cloud_utils.download_blob_to_string( google_cloud_credentials_file_path, pipeline_configuration. phone_number_uuid_table.firebase_credentials_file_url)) phone_number_uuid_table = FirestoreUuidTable( pipeline_configuration.phone_number_uuid_table.table_name, firestore_uuid_table_credentials, "avf-phone-uuid-")
help="Path to the pipeline configuration json file") parser.add_argument("column_to_de_identify", metavar="column-to-de-identify", help="Name of the column containing phone numbers to be de-identified") parser.add_argument("de_identified_csv_output_path", metavar="de-identified-csv-output-path", help="Path to write the de-identified CSV to") args = parser.parse_args() csv_input_path = args.csv_input_path pipeline_configuration_file_path = args.pipeline_configuration_file_path google_cloud_credentials_file_path = args.google_cloud_credentials_file_path column_to_de_identify = args.column_to_de_identify de_identified_csv_output_path = args.de_identified_csv_output_path # Read the settings from the configuration file log.info("Loading Pipeline Configuration File...") with open(pipeline_configuration_file_path) as f: pipeline_configuration = PipelineConfiguration.from_configuration_file(f) log.info("Downloading Firestore UUID Table credentials...") firestore_uuid_table_credentials = json.loads(google_cloud_utils.download_blob_to_string( google_cloud_credentials_file_path, pipeline_configuration.phone_number_uuid_table.firebase_credentials_file_url )) phone_number_uuid_table = FirestoreUuidTable( pipeline_configuration.phone_number_uuid_table.table_name, firestore_uuid_table_credentials, "avf-phone-uuid-" ) log.info("Initialised the Firestore UUID table")
user = args.user
pipeline_configuration_file_path = args.pipeline_configuration_file_path
messages_json_input_path = args.messages_json_input_path
individuals_json_input_path = args.individuals_json_input_path
automated_analysis_output_dir = args.automated_analysis_output_dir

IOUtils.ensure_dirs_exist(automated_analysis_output_dir)
IOUtils.ensure_dirs_exist(f"{automated_analysis_output_dir}/maps/regions")
IOUtils.ensure_dirs_exist(f"{automated_analysis_output_dir}/maps/districts")
IOUtils.ensure_dirs_exist(f"{automated_analysis_output_dir}/maps/mogadishu")
IOUtils.ensure_dirs_exist(f"{automated_analysis_output_dir}/graphs")

log.info("Loading Pipeline Configuration File...")
with open(pipeline_configuration_file_path) as f:
    pipeline_configuration = PipelineConfiguration.from_configuration_file(f)
Logger.set_project_name(pipeline_configuration.pipeline_name)
log.debug(f"Pipeline name is {pipeline_configuration.pipeline_name}")

sys.setrecursionlimit(30000)

# Read the messages dataset
log.info(f"Loading the messages dataset from {messages_json_input_path}...")
with open(messages_json_input_path) as f:
    messages = TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)
    for i in range(len(messages)):
        messages[i] = dict(messages[i].items())
log.info(f"Loaded {len(messages)} messages")

help="Directory to write the output graphs to") args = parser.parse_args() user = args.user google_cloud_credentials_file_path = args.google_cloud_credentials_file_path pipeline_configuration_file_path = args.pipeline_configuration_file_path messages_json_input_path = args.messages_json_input_path individuals_json_input_path = args.individuals_json_input_path output_dir = args.output_dir IOUtils.ensure_dirs_exist(output_dir) IOUtils.ensure_dirs_exist(f"{output_dir}/graphs") log.info("Loading Pipeline Configuration File...") with open(pipeline_configuration_file_path) as f: pipeline_configuration = PipelineConfiguration.from_configuration_file( f) if pipeline_configuration.drive_upload is not None: log.info(f"Downloading Google Drive service account credentials...") credentials_info = json.loads( google_cloud_utils.download_blob_to_string( google_cloud_credentials_file_path, pipeline_configuration. drive_upload.drive_credentials_file_url)) drive_client_wrapper.init_client_from_info(credentials_info) # Read the messages dataset log.info( f"Loading the messages dataset from {messages_json_input_path}...")