google_cloud_credentials_file_path = args.google_cloud_credentials_file_path
    pipeline_configuration_file_path = args.pipeline_configuration_file_path
    avf_uuid_file_path = args.avf_uuid_file_path

    log.info("Loading Pipeline Configuration File...")
    with open(pipeline_configuration_file_path) as f:
        pipeline_configuration = PipelineConfiguration.from_configuration_file(
            f)
    Logger.set_project_name(pipeline_configuration.pipeline_name)
    log.debug(f"Pipeline name is {pipeline_configuration.pipeline_name}")

    log.info("Downloading Rapid Pro access token...")
    rapid_pro_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, rapid_pro_token_url).strip()

    rapid_pro = RapidProClient(rapid_pro_domain, rapid_pro_token)

    log.info("Downloading Firestore UUID Table credentials...")
    firestore_uuid_table_credentials = json.loads(
        google_cloud_utils.download_blob_to_string(
            google_cloud_credentials_file_path, pipeline_configuration.
            phone_number_uuid_table.firebase_credentials_file_url))

    phone_number_uuid_table = FirestoreUuidTable(
        pipeline_configuration.phone_number_uuid_table.table_name,
        firestore_uuid_table_credentials, "avf-phone-uuid-")
    log.info("Initialised the Firestore UUID table")

    log.info(f"Loading the uuids that are safe to send to")
    with open(avf_uuid_file_path) as f:
        safe_uuids = json.load(f)
def fetch_from_rapid_pro(user, google_cloud_credentials_file_path,
                         raw_data_dir, phone_number_uuid_table,
                         rapid_pro_source):
    """
    Incrementally exports contacts and flow runs from Rapid Pro to `raw_data_dir`.

    For every flow named in `rapid_pro_source.activation_flow_names` and
    `rapid_pro_source.survey_flow_names`, the following files are written:
     - `<raw_data_dir>/<flow>_raw.json`: the serialized raw runs, reused as
       the starting point of the next export.
     - `<raw_data_dir>/<flow>_log.jsonl`: log file handed to the Rapid Pro
       client as `raw_export_log_file` (opened in append mode).
     - `<raw_data_dir>/<flow>.jsonl`: the runs converted to TracedData.
    The raw contacts are cached in the same way, in
    `<raw_data_dir>/<contacts_file_name>_raw.json` and
    `<raw_data_dir>/<contacts_file_name>_log.jsonl`.

    :param user: Passed to `RapidProClient.convert_runs_to_traced_data`.
    :param google_cloud_credentials_file_path: Passed to
        `google_cloud_utils.download_blob_to_string` to download the
        Rapid Pro access token.
    :param raw_data_dir: Directory the cached raw exports are read from and
        all outputs are written to.
    :param phone_number_uuid_table: Passed to
        `RapidProClient.convert_runs_to_traced_data`.
    :param rapid_pro_source: Object providing `token_file_url`, `domain`,
        `contacts_file_name`, `activation_flow_names`, `survey_flow_names`
        and `test_contact_uuids`.
    """
    log.info("Fetching data from Rapid Pro...")
    log.info("Downloading Rapid Pro access token...")
    rapid_pro_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path,
        rapid_pro_source.token_file_url).strip()

    rapid_pro = RapidProClient(rapid_pro_source.domain, rapid_pro_token)

    # Load the previous export of contacts if it exists, otherwise fetch all contacts from Rapid Pro.
    raw_contacts_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_raw.json"
    contacts_log_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_log.jsonl"

    # Ensure the output directory exists before the first write into it.
    # Opening a log file in append mode creates the file but not its parent
    # directories, so a fresh `raw_data_dir` would otherwise fail here.
    IOUtils.ensure_dirs_exist_for_file(contacts_log_path)

    try:
        log.info(f"Loading raw contacts from file '{raw_contacts_path}'...")
        with open(raw_contacts_path) as raw_contacts_file:
            raw_contacts = [
                Contact.deserialize(contact_json)
                for contact_json in json.load(raw_contacts_file)
            ]
    except FileNotFoundError:
        log.info(
            f"File '{raw_contacts_path}' not found, will fetch all contacts from the Rapid Pro server"
        )
        with open(contacts_log_path, "a") as contacts_log_file:
            raw_contacts = rapid_pro.get_raw_contacts(
                raw_export_log_file=contacts_log_file)
    else:
        log.info(f"Loaded {len(raw_contacts)} contacts")

    # Download all the runs for each of the radio shows
    for flow in rapid_pro_source.activation_flow_names + rapid_pro_source.survey_flow_names:
        runs_log_path = f"{raw_data_dir}/{flow}_log.jsonl"
        raw_runs_path = f"{raw_data_dir}/{flow}_raw.json"
        traced_runs_output_path = f"{raw_data_dir}/{flow}.jsonl"
        log.info(f"Exporting flow '{flow}' to '{traced_runs_output_path}'...")

        flow_id = rapid_pro.get_flow_id(flow)

        # All three per-flow files share the same parent directory, so
        # ensuring it once here covers every write made for this flow.
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)

        # Load the previous export of runs for this flow, and update them with the newest runs.
        # If there is no previous export for this flow, fetch all the runs from Rapid Pro.
        with open(runs_log_path, "a") as raw_runs_log_file:
            # Only reading the cached export is guarded: a FileNotFoundError
            # raised while updating must not be mistaken for a missing cache.
            try:
                log.info(f"Loading raw runs from file '{raw_runs_path}'...")
                with open(raw_runs_path) as raw_runs_file:
                    raw_runs = [
                        Run.deserialize(run_json)
                        for run_json in json.load(raw_runs_file)
                    ]
            except FileNotFoundError:
                log.info(
                    f"File '{raw_runs_path}' not found, will fetch all runs from the Rapid Pro server for flow '{flow}'"
                )
                raw_runs = rapid_pro.get_raw_runs_for_flow_id(
                    flow_id, raw_export_log_file=raw_runs_log_file)
            else:
                log.info(f"Loaded {len(raw_runs)} runs")
                raw_runs = rapid_pro.update_raw_runs_with_latest_modified(
                    flow_id,
                    raw_runs,
                    raw_export_log_file=raw_runs_log_file,
                    ignore_archives=True)

        # Fetch the latest contacts from Rapid Pro.
        # This is repeated for every flow, after that flow's runs have been
        # downloaded and before they are converted.
        with open(contacts_log_path, "a") as raw_contacts_log_file:
            raw_contacts = rapid_pro.update_raw_contacts_with_latest_modified(
                raw_contacts, raw_export_log_file=raw_contacts_log_file)

        # Convert the runs to TracedData.
        traced_runs = rapid_pro.convert_runs_to_traced_data(
            user, raw_runs, raw_contacts, phone_number_uuid_table,
            rapid_pro_source.test_contact_uuids)

        log.info(f"Saving {len(raw_runs)} raw runs to {raw_runs_path}...")
        with open(raw_runs_path, "w") as raw_runs_file:
            json.dump([run.serialize() for run in raw_runs], raw_runs_file)
        log.info(f"Saved {len(raw_runs)} raw runs")

        log.info(
            f"Saving {len(traced_runs)} traced runs to {traced_runs_output_path}..."
        )
        with open(traced_runs_output_path, "w") as traced_runs_output_file:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(
                traced_runs, traced_runs_output_file)
        log.info(f"Saved {len(traced_runs)} traced runs")

    # The contacts cache is written once, after all flows have been processed.
    log.info(
        f"Saving {len(raw_contacts)} raw contacts to file '{raw_contacts_path}'..."
    )
    with open(raw_contacts_path, "w") as raw_contacts_file:
        json.dump([contact.serialize() for contact in raw_contacts],
                  raw_contacts_file)
    log.info(f"Saved {len(raw_contacts)} contacts")
        if project.flow_definitions_upload_url_prefix is None:
            log.info(
                f"Not archiving flow definitions for project {project.project_name} because its "
                f"'flow_definitions_upload_url_prefix' is unspecified.")
            continue

        log.info(
            f"Archiving the latest flow definitions for project {project.project_name}..."
        )

        log.info(
            "Downloading the Rapid Pro token file and initialising the Rapid Pro client..."
        )
        rapid_pro_token = google_cloud_utils.download_blob_to_string(
            google_cloud_credentials_file_path,
            project.rapid_pro_token_url).strip()
        rapid_pro = RapidProClient(project.rapid_pro_domain, rapid_pro_token)

        log.info("Downloading all the flow definitions for this instance...")
        flow_ids = rapid_pro.get_all_flow_ids()
        flow_definitions_request_timestamp = TimeUtils.utc_now_as_iso_string()
        flow_definitions = rapid_pro.get_flow_definitions_for_flow_ids(
            flow_ids)

        log.info("Uploading the flow definitions to a cloud bucket...")
        upload_url = f"{project.flow_definitions_upload_url_prefix}{flow_definitions_request_timestamp}.json"
        flow_definitions_json = json.dumps(flow_definitions.serialize())
        google_cloud_utils.upload_string_to_blob(
            google_cloud_credentials_file_path, upload_url,
            flow_definitions_json)
Ejemplo n.º 4
0
                        help="GS URLs of a text file containing the authorisation token for the Rapid Pro server")
    parser.add_argument("output_file_path", metavar="output-file-path",
                        help="Output CSV file to write the phone numbers to")

    args = parser.parse_args()

    google_cloud_credentials_file_path = args.google_cloud_credentials_file_path
    rapid_pro_domain = args.rapid_pro_domain
    rapid_pro_token_file_url = args.rapid_pro_token_file_url
    output_file_path = args.output_file_path

    log.info("Downloading the Rapid Pro access token...")
    rapid_pro_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, rapid_pro_token_file_url).strip()

    rapid_pro = RapidProClient(rapid_pro_domain, rapid_pro_token)

    all_messages = rapid_pro.get_raw_messages()
    inbound_messages = [msg for msg in all_messages if msg.direction == "in"]

    inbound_phone_numbers = set()
    for msg in inbound_messages:
        if msg.urn.startswith("tel:"):
            phone_number = msg.urn.split(":")[1]
            inbound_phone_numbers.add(phone_number)
        else:
            log.warning(f"Skipped non-telephone URN type {msg.urn.split(':')[0]}")

    log.warning(f"Exporting {len(inbound_phone_numbers)} inbound phone numbers to {output_file_path}...")
    with open(output_file_path, "w") as f:
        writer = csv.DictWriter(f, fieldnames=["URN:Tel", "Name"])
Ejemplo n.º 5
0
        "GS URL to the organisation access token file for authenticating to the target instance"
    )

    args = parser.parse_args()

    google_cloud_credentials_file_path = args.google_cloud_credentials_file_path
    source_domain = args.source_domain
    source_credentials_url = args.source_credentials_url
    target_domain = args.target_domain
    target_credentials_url = args.target_credentials_url

    # Initialise the source/target instances
    log.info("Downloading the source instance access token...")
    source_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, source_credentials_url).strip()
    source_instance = RapidProClient(source_domain, source_token)

    log.info("Downloading the target instance access token...")
    target_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, target_credentials_url).strip()
    target_instance = RapidProClient(target_domain, target_token)

    # For each contact field in the source instance, create a matching contact field in the target instance if it
    # does not already exist
    log.info("Copying contact fields...")
    source_fields = source_instance.get_fields()
    target_field_keys = {f.key for f in target_instance.get_fields()}
    for field in source_fields:
        if field.key not in target_field_keys:
            target_instance.create_field(field.label)
    log.info("Contact fields copied")
    args = parser.parse_args()

    google_cloud_credentials_file_path = args.google_cloud_credentials_file_path
    force_update = args.force

    instance_1_domain = args.instance_1_domain
    instance_1_credentials_url = args.instance_1_credentials_url
    instance_2_domain = args.instance_2_domain
    instance_2_credentials_url = args.instance_2_credentials_url

    # Initialise the two instances
    log.info("Downloading the access token for instance 1...")
    instance_1_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path,
        instance_1_credentials_url).strip()
    instance_1 = RapidProClient(instance_1_domain, instance_1_token)

    log.info("Downloading the target instance access token...")
    instance_2_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path,
        instance_2_credentials_url).strip()
    instance_2 = RapidProClient(instance_2_domain, instance_2_token)

    # Synchronise the contact fields
    log.info("Synchronising contact fields...")
    instance_1_fields = instance_1.get_fields()
    instance_2_fields = instance_2.get_fields()
    for field in instance_1_fields:
        if field.key not in {f.key for f in instance_2_fields}:
            instance_2.create_field(field.label)
    for field in instance_2_fields:
Ejemplo n.º 7
0
def fetch_from_rapid_pro(user, google_cloud_credentials_file_path, raw_data_dir, phone_number_uuid_table,
                         rapid_pro_source):
    """
    Incrementally exports contacts and flow runs from one Rapid Pro source to `raw_data_dir`.

    Previously exported raw contacts/runs are loaded from `raw_data_dir` when present and updated with the latest
    changes from Rapid Pro; otherwise everything is fetched from the server. For each flow in
    `rapid_pro_source.activation_flow_names` and `rapid_pro_source.survey_flow_names` this writes
    `<flow>_raw.json` (serialized raw runs), `<flow>_log.jsonl` (export log passed to the Rapid Pro client) and
    `<flow>.jsonl` (runs converted to TracedData). Runs of activation flows are additionally tagged with
    `source_raw`/`source_coded` derived from `rapid_pro_source.source_name`.
    Raw contacts are saved to `<contacts_file_name>_raw.json` once all flows have been exported.

    :param user: Passed to `RapidProClient.convert_runs_to_traced_data` and recorded in the `Metadata` of the
                 appended source labels.
    :param google_cloud_credentials_file_path: Passed to `google_cloud_utils.download_blob_to_string` to download
                                               the Rapid Pro access token.
    :param raw_data_dir: Directory that cached raw exports are read from and all outputs are written to.
    :param phone_number_uuid_table: Passed to `RapidProClient.convert_runs_to_traced_data`.
    :param rapid_pro_source: Object providing `token_file_url`, `domain`, `contacts_file_name`, `source_name`,
                             `activation_flow_names`, `survey_flow_names` and `test_contact_uuids`.
    """
    log.info("Fetching data from Rapid Pro...")
    log.info("Downloading Rapid Pro access token...")
    rapid_pro_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, rapid_pro_source.token_file_url).strip()

    rapid_pro = RapidProClient(rapid_pro_source.domain, rapid_pro_token)

    # Load the previous export of contacts if it exists, otherwise fetch all contacts from Rapid Pro.
    raw_contacts_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_raw.json"
    contacts_log_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_log.jsonl"

    # Make sure the output directory exists before anything is written to it. Append-mode `open` creates the file
    # but not missing parent directories, so without this a first run against a fresh `raw_data_dir` would fail.
    IOUtils.ensure_dirs_exist_for_file(contacts_log_path)

    try:
        log.info(f"Loading raw contacts from file '{raw_contacts_path}'...")
        with open(raw_contacts_path) as raw_contacts_file:
            raw_contacts = [Contact.deserialize(contact_json) for contact_json in json.load(raw_contacts_file)]
    except FileNotFoundError:
        log.info(f"File '{raw_contacts_path}' not found, will fetch all contacts from the Rapid Pro server")
        with open(contacts_log_path, "a") as contacts_log_file:
            raw_contacts = rapid_pro.get_raw_contacts(raw_export_log_file=contacts_log_file)
    else:
        log.info(f"Loaded {len(raw_contacts)} contacts")

    # Download all the runs for each of the radio shows
    for flow in rapid_pro_source.activation_flow_names + rapid_pro_source.survey_flow_names:
        runs_log_path = f"{raw_data_dir}/{flow}_log.jsonl"
        raw_runs_path = f"{raw_data_dir}/{flow}_raw.json"
        traced_runs_output_path = f"{raw_data_dir}/{flow}.jsonl"
        log.info(f"Exporting flow '{flow}' to '{traced_runs_output_path}'...")

        flow_id = rapid_pro.get_flow_id(flow)

        # The log, raw and traced files for a flow all live in the same directory, so ensuring it here (before the
        # log file is opened) covers every write for this flow.
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)

        # Load the previous export of runs for this flow, and update them with the newest runs.
        # If there is no previous export for this flow, fetch all the runs from Rapid Pro.
        with open(runs_log_path, "a") as raw_runs_log_file:
            # Guard only the read of the cached export, so that a FileNotFoundError raised during the update step
            # is not misinterpreted as "no previous export" and silently turned into a full re-fetch.
            try:
                log.info(f"Loading raw runs from file '{raw_runs_path}'...")
                with open(raw_runs_path) as raw_runs_file:
                    raw_runs = [Run.deserialize(run_json) for run_json in json.load(raw_runs_file)]
            except FileNotFoundError:
                log.info(f"File '{raw_runs_path}' not found, will fetch all runs from the Rapid Pro server for flow '{flow}'")
                raw_runs = rapid_pro.get_raw_runs_for_flow_id(flow_id, raw_export_log_file=raw_runs_log_file)
            else:
                log.info(f"Loaded {len(raw_runs)} runs")
                raw_runs = rapid_pro.update_raw_runs_with_latest_modified(
                    flow_id, raw_runs, raw_export_log_file=raw_runs_log_file, ignore_archives=True)

        # Fetch the latest contacts from Rapid Pro.
        # This is repeated for every flow, after that flow's runs have been downloaded and before they are converted.
        with open(contacts_log_path, "a") as raw_contacts_log_file:
            raw_contacts = rapid_pro.update_raw_contacts_with_latest_modified(raw_contacts,
                                                                              raw_export_log_file=raw_contacts_log_file)

        # Convert the runs to TracedData.
        traced_runs = rapid_pro.convert_runs_to_traced_data(
            user, raw_runs, raw_contacts, phone_number_uuid_table, rapid_pro_source.test_contact_uuids)

        if flow in rapid_pro_source.activation_flow_names:
            # Append the Rapid Pro source name to each run.
            # Only do this for activation flows because this is the only place where this is interesting.
            # Also, demogs may come from either instance, which causes problems downstream.
            for td in traced_runs:
                td.append_data({
                    "source_raw": rapid_pro_source.source_name,
                    "source_coded": CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.SOURCE, CodeSchemes.SOURCE.get_code_with_match_value(rapid_pro_source.source_name),
                        Metadata.get_call_location()
                    ).to_dict()
                }, Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

        log.info(f"Saving {len(raw_runs)} raw runs to {raw_runs_path}...")
        with open(raw_runs_path, "w") as raw_runs_file:
            json.dump([run.serialize() for run in raw_runs], raw_runs_file)
        log.info(f"Saved {len(raw_runs)} raw runs")

        log.info(f"Saving {len(traced_runs)} traced runs to {traced_runs_output_path}...")
        with open(traced_runs_output_path, "w") as traced_runs_output_file:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(traced_runs, traced_runs_output_file)
        log.info(f"Saved {len(traced_runs)} traced runs")

    # The contacts cache is written once, after all flows have been processed.
    log.info(f"Saving {len(raw_contacts)} raw contacts to file '{raw_contacts_path}'...")
    with open(raw_contacts_path, "w") as raw_contacts_file:
        json.dump([contact.serialize() for contact in raw_contacts], raw_contacts_file)
    log.info(f"Saved {len(raw_contacts)} contacts")
Ejemplo n.º 8
0
    google_cloud_credentials_file_path = args.google_cloud_credentials_file_path
    workspace_1_domain = args.workspace_1_domain
    workspace_1_credentials_url = args.workspace_1_credentials_url
    workspace_2_domain = args.workspace_2_domain
    workspace_2_credentials_url = args.workspace_2_credentials_url
    raw_data_log_directory = args.raw_data_log_directory

    if dry_run:
        log.info("Performing a dry-run")

    # Initialise the two Rapid Pro clients
    log.info("Downloading the access token for workspace 1...")
    workspace_1_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path,
        workspace_1_credentials_url).strip()
    workspace_1 = RapidProClient(workspace_1_domain, workspace_1_token)
    workspace_1_name = workspace_1.get_workspace_name()
    log.info(f"Done. workspace 1 is called {workspace_1_name}")

    log.info("Downloading the access token for workspace 2...")
    workspace_2_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path,
        workspace_2_credentials_url).strip()
    workspace_2 = RapidProClient(workspace_2_domain, workspace_2_token)
    workspace_2_name = workspace_2.get_workspace_name()
    log.info(f"Done. workspace 2 is called {workspace_2_name}")

    # Download the data from Rapid Pro
    log.info("Downloading contact fields...")
    log.info(f"Downloading all fields from {workspace_1_name}...")
    workspace_1_fields = workspace_1.get_fields()
Ejemplo n.º 9
0
        google_cloud_credentials_file_path, pipeline_configuration.rapid_pro_token_file_url).strip()

    log.info("Downloading Firestore UUID Table credentials...")
    firestore_uuid_table_credentials = json.loads(google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path,
        pipeline_configuration.phone_number_uuid_table.firebase_credentials_file_url
    ))

    phone_number_uuid_table = FirestoreUuidTable(
        pipeline_configuration.phone_number_uuid_table.table_name,
        firestore_uuid_table_credentials,
        "avf-phone-uuid-"
    )
    log.info("Initialised the Firestore UUID table")

    rapid_pro = RapidProClient(pipeline_configuration.rapid_pro_domain, rapid_pro_token)

    # Load the previous export of contacts if it exists, otherwise fetch all contacts from Rapid Pro.
    raw_contacts_path = f"{raw_data_dir}/contacts_raw.json"
    contacts_log_path = f"{raw_data_dir}/contacts_log.jsonl"
    try:
        log.info(f"Loading raw contacts from file '{raw_contacts_path}'...")
        with open(raw_contacts_path) as raw_contacts_file:
            raw_contacts = [Contact.deserialize(contact_json) for contact_json in json.load(raw_contacts_file)]
        log.info(f"Loaded {len(raw_contacts)} contacts")
    except FileNotFoundError:
        log.info(f"File '{raw_contacts_path}' not found, will fetch all contacts from the Rapid Pro server")
        with open(contacts_log_path, "a") as contacts_log_file:
            raw_contacts = rapid_pro.get_raw_contacts(raw_export_log_file=contacts_log_file)

    # Download all the runs for each of the radio shows