firestore_uuid_table_credentials, "avf-phone-uuid-")
    log.info("Initialised the Firestore UUID table")

    log.info(f"Loading the uuids that are safe to send to")
    with open(avf_uuid_file_path) as f:
        safe_uuids = json.load(f)
    log.info(f"Loaded {len(safe_uuids)} uuids")

    log.info(f"Re-identifying the uuids")
    safe_numbers = phone_number_uuid_table.uuid_to_data_batch(
        safe_uuids).values()
    safe_urns = {f"tel:+{number}" for number in safe_numbers}
    log.info(f"Re-identified {len(safe_urns)} uuids")

    log.info("Downloading the latest contacts fields from Rapid Pro")
    contacts = rapid_pro.get_raw_contacts()
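    # Index every contact by each of its URNs so the demogs_attempted field can be checked per URN below.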
    urn_to_contact = dict()
    for c in contacts:
        for urn in c.urns:
            urn_to_contact[urn] = c

    log.info(f"Filtering the urns who haven't received demogs before")
    urns_to_send_to = {
        urn
        for urn in safe_urns
        if urn_to_contact[urn].fields[demogs_attempted_variable] is None
        # Filter for people created since the project started. People created before then went through
        # the demogs flow, but may not have been asked any questions/been assigned the demogs_attempted
        # variable if they had already completed them last season, so we can skip these.
        and urn_to_contact[urn].created_on >
        pipeline_configuration.project_start_date
    }

Example #2

def fetch_from_rapid_pro(user, google_cloud_credentials_file_path,
                         raw_data_dir, phone_number_uuid_table,
                         rapid_pro_source):
    log.info("Fetching data from Rapid Pro...")
    log.info("Downloading Rapid Pro access token...")
    rapid_pro_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path,
        rapid_pro_source.token_file_url).strip()

    rapid_pro = RapidProClient(rapid_pro_source.domain, rapid_pro_token)

    # Load the previous export of contacts if it exists, otherwise fetch all contacts from Rapid Pro.
    raw_contacts_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_raw.json"
    contacts_log_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_log.jsonl"
    try:
        log.info(f"Loading raw contacts from file '{raw_contacts_path}'...")
        with open(raw_contacts_path) as raw_contacts_file:
            raw_contacts = [
                Contact.deserialize(contact_json)
                for contact_json in json.load(raw_contacts_file)
            ]
        log.info(f"Loaded {len(raw_contacts)} contacts")
    except FileNotFoundError:
        log.info(
            f"File '{raw_contacts_path}' not found, will fetch all contacts from the Rapid Pro server"
        )
        with open(contacts_log_path, "a") as contacts_log_file:
            raw_contacts = rapid_pro.get_raw_contacts(
                raw_export_log_file=contacts_log_file)

    # Download all the runs for each of the radio shows
    for flow in rapid_pro_source.activation_flow_names + rapid_pro_source.survey_flow_names:
        runs_log_path = f"{raw_data_dir}/{flow}_log.jsonl"
        raw_runs_path = f"{raw_data_dir}/{flow}_raw.json"
        traced_runs_output_path = f"{raw_data_dir}/{flow}.jsonl"
        log.info(f"Exporting flow '{flow}' to '{traced_runs_output_path}'...")

        flow_id = rapid_pro.get_flow_id(flow)

        # Load the previous export of runs for this flow, and update them with the newest runs.
        # If there is no previous export for this flow, fetch all the runs from Rapid Pro.
        with open(runs_log_path, "a") as raw_runs_log_file:
            try:
                log.info(f"Loading raw runs from file '{raw_runs_path}'...")
                with open(raw_runs_path) as raw_runs_file:
                    raw_runs = [
                        Run.deserialize(run_json)
                        for run_json in json.load(raw_runs_file)
                    ]
                log.info(f"Loaded {len(raw_runs)} runs")
                raw_runs = rapid_pro.update_raw_runs_with_latest_modified(
                    flow_id,
                    raw_runs,
                    raw_export_log_file=raw_runs_log_file,
                    ignore_archives=True)
            except FileNotFoundError:
                log.info(
                    f"File '{raw_runs_path}' not found, will fetch all runs from the Rapid Pro server for flow '{flow}'"
                )
                raw_runs = rapid_pro.get_raw_runs_for_flow_id(
                    flow_id, raw_export_log_file=raw_runs_log_file)

        # Fetch the latest contacts from Rapid Pro.
        with open(contacts_log_path, "a") as raw_contacts_log_file:
            raw_contacts = rapid_pro.update_raw_contacts_with_latest_modified(
                raw_contacts, raw_export_log_file=raw_contacts_log_file)

        # Convert the runs to TracedData.
        traced_runs = rapid_pro.convert_runs_to_traced_data(
            user, raw_runs, raw_contacts, phone_number_uuid_table,
            rapid_pro_source.test_contact_uuids)

        log.info(f"Saving {len(raw_runs)} raw runs to {raw_runs_path}...")
        with open(raw_runs_path, "w") as raw_runs_file:
            json.dump([run.serialize() for run in raw_runs], raw_runs_file)
        log.info(f"Saved {len(raw_runs)} raw runs")

        log.info(
            f"Saving {len(traced_runs)} traced runs to {traced_runs_output_path}..."
        )
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
        with open(traced_runs_output_path, "w") as traced_runs_output_file:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(
                traced_runs, traced_runs_output_file)
        log.info(f"Saved {len(traced_runs)} traced runs")

    log.info(
        f"Saving {len(raw_contacts)} raw contacts to file '{raw_contacts_path}'..."
    )
    with open(raw_contacts_path, "w") as raw_contacts_file:
        json.dump([contact.serialize() for contact in raw_contacts],
                  raw_contacts_file)
    log.info(f"Saved {len(raw_contacts)} contacts")
Example #3
    target_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, target_credentials_url).strip()
    target_instance = RapidProClient(target_domain, target_token)

    # For each contact field in the source instance, create a matching contact field in the target instance if it
    # does not already exist
    log.info("Copying contact fields...")
    source_fields = source_instance.get_fields()
    target_field_keys = {f.key for f in target_instance.get_fields()}
    for field in source_fields:
        if field.key not in target_field_keys:
            target_instance.create_field(field.label)
    log.info("Contact fields copied")

    log.info("Fetching all contacts from the source instance...")
    contacts = source_instance.get_raw_contacts()
    log.info(f"Fetched {len(contacts)} contacts")

    log.info("Updating contacts in the target instance...")
    # Update each contact's name and fields.
    # Language, groups, blocked, and stopped properties are not touched.
    multiple_urns_count = 0
    telephone_with_no_country_code_count = 0
    updated_count = 0
    for i, contact in enumerate(contacts):
        log.debug(f"Updating contact {i + 1}/{len(contacts)}...")
        if len(contact.urns) != 1:
            log.warning(
                f"Found a contact in the source instance with multiple URNS. "
                f"The RapidPro UUID is '{contact.uuid}'")
            multiple_urns_count += 1

Example #4

    # Synchronise the contact fields
    log.info("Synchronising contact fields...")
    instance_1_fields = instance_1.get_fields()
    instance_2_fields = instance_2.get_fields()
    for field in instance_1_fields:
        if field.key not in {f.key for f in instance_2_fields}:
            instance_2.create_field(field.label)
    for field in instance_2_fields:
        if field.key not in {f.key for f in instance_1_fields}:
            instance_1.create_field(field.label)
    log.info("Contact fields synchronised")

    # Synchronise the contacts
    log.info("Synchronising contacts...")
    instance_1_contacts = instance_1.get_raw_contacts()
    instance_2_contacts = instance_2.get_raw_contacts()

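    # Helper that drops contacts which cannot be synchronised cleanly: contacts with multiple URNs,
    # or with a telephone URN that is missing a country code. Each skipped contact is logged with its UUID.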
    def filter_valid_contacts(contacts):
        valid_contacts = []
        for contact in contacts:
            if len(contact.urns) != 1:
                log.warning(
                    f"Found a contact in the source instance with multiple URNS; skipping. "
                    f"The RapidPro UUID is '{contact.uuid}'")
                continue
            if contact.urns[0].startswith(
                    "tel:") and not contact.urns[0].startswith("tel:+"):
                log.warning(
                    f"Found a contact with a telephone number but without a country code; skipping. "
                    f"The RapidPro UUID is '{contact.uuid}'")
Example #5
def fetch_from_rapid_pro(user, google_cloud_credentials_file_path, raw_data_dir, phone_number_uuid_table,
                         rapid_pro_source):
    log.info("Fetching data from Rapid Pro...")
    log.info("Downloading Rapid Pro access token...")
    rapid_pro_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, rapid_pro_source.token_file_url).strip()

    rapid_pro = RapidProClient(rapid_pro_source.domain, rapid_pro_token)

    # Load the previous export of contacts if it exists, otherwise fetch all contacts from Rapid Pro.
    raw_contacts_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_raw.json"
    contacts_log_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_log.jsonl"
    try:
        log.info(f"Loading raw contacts from file '{raw_contacts_path}'...")
        with open(raw_contacts_path) as raw_contacts_file:
            raw_contacts = [Contact.deserialize(contact_json) for contact_json in json.load(raw_contacts_file)]
        log.info(f"Loaded {len(raw_contacts)} contacts")
    except FileNotFoundError:
        log.info(f"File '{raw_contacts_path}' not found, will fetch all contacts from the Rapid Pro server")
        with open(contacts_log_path, "a") as contacts_log_file:
            raw_contacts = rapid_pro.get_raw_contacts(raw_export_log_file=contacts_log_file)

    # Download all the runs for each of the radio shows
    for flow in rapid_pro_source.activation_flow_names + rapid_pro_source.survey_flow_names:
        runs_log_path = f"{raw_data_dir}/{flow}_log.jsonl"
        raw_runs_path = f"{raw_data_dir}/{flow}_raw.json"
        traced_runs_output_path = f"{raw_data_dir}/{flow}.jsonl"
        log.info(f"Exporting flow '{flow}' to '{traced_runs_output_path}'...")

        flow_id = rapid_pro.get_flow_id(flow)

        # Load the previous export of runs for this flow, and update them with the newest runs.
        # If there is no previous export for this flow, fetch all the runs from Rapid Pro.
        with open(runs_log_path, "a") as raw_runs_log_file:
            try:
                log.info(f"Loading raw runs from file '{raw_runs_path}'...")
                with open(raw_runs_path) as raw_runs_file:
                    raw_runs = [Run.deserialize(run_json) for run_json in json.load(raw_runs_file)]
                log.info(f"Loaded {len(raw_runs)} runs")
                raw_runs = rapid_pro.update_raw_runs_with_latest_modified(
                    flow_id, raw_runs, raw_export_log_file=raw_runs_log_file, ignore_archives=True)
            except FileNotFoundError:
                log.info(f"File '{raw_runs_path}' not found, will fetch all runs from the Rapid Pro server for flow '{flow}'")
                raw_runs = rapid_pro.get_raw_runs_for_flow_id(flow_id, raw_export_log_file=raw_runs_log_file)

        # Fetch the latest contacts from Rapid Pro.
        with open(contacts_log_path, "a") as raw_contacts_log_file:
            raw_contacts = rapid_pro.update_raw_contacts_with_latest_modified(raw_contacts,
                                                                              raw_export_log_file=raw_contacts_log_file)

        # Convert the runs to TracedData.
        traced_runs = rapid_pro.convert_runs_to_traced_data(
            user, raw_runs, raw_contacts, phone_number_uuid_table, rapid_pro_source.test_contact_uuids)

        if flow in rapid_pro_source.activation_flow_names:
            # Append the Rapid Pro source name to each run.
            # Only do this for activation flows because this is the only place where this is interesting.
            # Also, demogs may come from either instance, which causes problems downstream.
            for td in traced_runs:
                td.append_data({
                    "source_raw": rapid_pro_source.source_name,
                    "source_coded": CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.SOURCE, CodeSchemes.SOURCE.get_code_with_match_value(rapid_pro_source.source_name),
                        Metadata.get_call_location()
                    ).to_dict()
                }, Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

        log.info(f"Saving {len(raw_runs)} raw runs to {raw_runs_path}...")
        with open(raw_runs_path, "w") as raw_runs_file:
            json.dump([run.serialize() for run in raw_runs], raw_runs_file)
        log.info(f"Saved {len(raw_runs)} raw runs")

        log.info(f"Saving {len(traced_runs)} traced runs to {traced_runs_output_path}...")
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
        with open(traced_runs_output_path, "w") as traced_runs_output_file:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(traced_runs, traced_runs_output_file)
        log.info(f"Saved {len(traced_runs)} traced runs")

    log.info(f"Saving {len(raw_contacts)} raw contacts to file '{raw_contacts_path}'...")
    with open(raw_contacts_path, "w") as raw_contacts_file:
        json.dump([contact.serialize() for contact in raw_contacts], raw_contacts_file)
    log.info(f"Saved {len(raw_contacts)} contacts")
Example #6
    log.info(f"Done. workspace 2 is called {workspace_2_name}")

    # Download the data from Rapid Pro
    log.info("Downloading contact fields...")
    log.info(f"Downloading all fields from {workspace_1_name}...")
    workspace_1_fields = workspace_1.get_fields()
    log.info(f"Downloading all fields from {workspace_2_name}...")
    workspace_2_fields = workspace_2.get_fields()

    # Synchronise the contacts
    log.info("Downloading contacts...")
    IOUtils.ensure_dirs_exist(raw_data_log_directory)
    log.info(f"Downloading all contacts from {workspace_1_name}...")
    with open(f"{raw_data_log_directory}/{workspace_1_name}_raw_contacts.json",
              "w") as f:
        workspace_1_contacts = workspace_1.get_raw_contacts(
            raw_export_log_file=f)
    log.info(f"Downloading all contacts from {workspace_2_name}...")
    with open(f"{raw_data_log_directory}/{workspace_2_name}_raw_contacts.json",
              "w") as f:
        workspace_2_contacts = workspace_2.get_raw_contacts(
            raw_export_log_file=f)

    # If in dry_run mode, dereference workspace_1 and workspace_2 as an added safety. This prevents accidental
    # writes to either workspace.
    if dry_run:
        workspace_1 = None
        workspace_2 = None

    # Synchronise the data
    # Synchronise the contact fields
    new_contact_fields_in_workspace_2 = 0
Example #7
    log.info("Initialised the Firestore UUID table")

    rapid_pro = RapidProClient(pipeline_configuration.rapid_pro_domain, rapid_pro_token)

    # Load the previous export of contacts if it exists, otherwise fetch all contacts from Rapid Pro.
    raw_contacts_path = f"{raw_data_dir}/contacts_raw.json"
    contacts_log_path = f"{raw_data_dir}/contacts_log.jsonl"
    try:
        log.info(f"Loading raw contacts from file '{raw_contacts_path}'...")
        with open(raw_contacts_path) as raw_contacts_file:
            raw_contacts = [Contact.deserialize(contact_json) for contact_json in json.load(raw_contacts_file)]
        log.info(f"Loaded {len(raw_contacts)} contacts")
    except FileNotFoundError:
        log.info(f"File '{raw_contacts_path}' not found, will fetch all contacts from the Rapid Pro server")
        with open(contacts_log_path, "a") as contacts_log_file:
            raw_contacts = rapid_pro.get_raw_contacts(raw_export_log_file=contacts_log_file)

    # Download all the runs for each of the radio shows
    for flow in pipeline_configuration.activation_flow_names + pipeline_configuration.survey_flow_names:
        runs_log_path = f"{raw_data_dir}/{flow}_log.jsonl"
        raw_runs_path = f"{raw_data_dir}/{flow}_raw.json"
        traced_runs_output_path = f"{raw_data_dir}/{flow}.json"
        log.info(f"Exporting flow '{flow}' to '{traced_runs_output_path}'...")

        flow_id = rapid_pro.get_flow_id(flow)

        # Load the previous export of runs for this flow, and update them with the newest runs.
        # If there is no previous export for this flow, fetch all the runs from Rapid Pro.
        with open(runs_log_path, "a") as raw_runs_log_file:
            try:
                log.info(f"Loading raw runs from file '{raw_runs_path}'...")