def fetch_from_recovery_csv(user, google_cloud_credentials_file_path,
                            raw_data_dir, phone_number_uuid_table,
                            recovery_csv_source):
    """
    Downloads recovered-message CSVs and exports each one as a JSONL file of TracedData.

    Each blob URL in `recovery_csv_source.activation_flow_urls` followed by
    `recovery_csv_source.survey_flow_urls` is downloaded, every row is converted to a TracedData object with the
    keys "avf_phone_id", "message", "received_on" and "run_id", and the result is written to
    `<raw_data_dir>/<flow_name>.jsonl`. A URL whose output file already exists is skipped without downloading.

    :param user: Identifier of the user running this fetch, recorded in the Metadata of each TracedData item.
    :param google_cloud_credentials_file_path: Credentials file path passed to
                                               google_cloud_utils.download_blob_to_string.
    :param raw_data_dir: Directory to write the exported .jsonl files to.
    :param phone_number_uuid_table: Not used by this function (presumably kept so all fetch_from_* functions share
                                    a signature - confirm against the caller).
    :param recovery_csv_source: Object exposing `activation_flow_urls` and `survey_flow_urls`, both lists of
                                blob URLs.
    :raises AssertionError: If a row's 'Sender' value does not start with "avf-phone-uuid-".
    :raises ValueError: If a row's 'ReceivedOn' value matches neither supported date format.
    """
    log.info("Fetching data from a Recovery CSV...")

    # Every row is localized to the same timezone, so look it up once rather than once per message.
    received_on_timezone = pytz.timezone("Africa/Mogadishu")

    for blob_url in recovery_csv_source.activation_flow_urls + recovery_csv_source.survey_flow_urls:
        # Takes the name between the last '/' and the first '.' that follows it
        flow_name = blob_url.split('/')[-1].split('.')[0]
        traced_runs_output_path = f"{raw_data_dir}/{flow_name}.jsonl"
        if os.path.exists(traced_runs_output_path):
            log.info(
                f"File '{traced_runs_output_path}' for blob '{blob_url}' already exists; skipping download"
            )
            continue

        log.info(f"Downloading recovered data from '{blob_url}'...")
        raw_csv_string = StringIO(
            google_cloud_utils.download_blob_to_string(
                google_cloud_credentials_file_path, blob_url))
        raw_data = list(csv.DictReader(raw_csv_string))
        log.info(f"Downloaded {len(raw_data)} recovered messages")

        log.info("Converting the recovered messages to TracedData...")
        traced_runs = []
        for row in raw_data:
            # 'ReceivedOn' is accepted both with and without a seconds component; the string length selects
            # which of the two formats to parse with.
            raw_date = row["ReceivedOn"]
            if len(raw_date) == len("dd/mm/YYYY HH:MM"):
                date_format = "%d/%m/%Y %H:%M"
            else:
                date_format = "%d/%m/%Y %H:%M:%S"
            localized_date = received_on_timezone.localize(
                datetime.strptime(raw_date, date_format))

            # Raised explicitly (rather than via an `assert` statement) so that this de-identification guard
            # still runs when Python is started with -O. The exception type is unchanged for callers.
            if not row["Sender"].startswith("avf-phone-uuid-"):
                raise AssertionError(
                    f"The 'Sender' column for '{blob_url}' contains an item that has not been de-identified "
                    "into Africa's Voices Foundation's de-identification format. This may be done with "
                    "de_identify_csv.py.")

            d = {
                "avf_phone_id": row["Sender"],
                "message": row["Message"],
                "received_on": localized_date.isoformat(),
                "run_id": SHAUtils.sha_dict(row)
            }

            traced_runs.append(
                TracedData(
                    d,
                    Metadata(user, Metadata.get_call_location(),
                             TimeUtils.utc_now_as_iso_string())))
        log.info("Converted the recovered messages to TracedData")

        log.info(
            f"Exporting {len(traced_runs)} TracedData items to {traced_runs_output_path}..."
        )
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
        with open(traced_runs_output_path, "w") as f:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(
                traced_runs, f)
        log.info("Exported TracedData")
Esempio n. 2
0
    def test_round_trip(self):
        """
        Tests that TracedData exported to JSONL and then imported again has the same attributes as the original.
        """
        def comparable_attributes(td):
            # Snapshot of every attribute that is not a dunder, not callable, and not the "_cache" attribute.
            return {
                k: getattr(td, k) for k in dir(td)
                if not k.startswith("__") and not callable(getattr(td, k)) and k != "_cache"
            }

        expected = self.generate_test_data()

        # Use a temporary directory rather than re-opening a still-open NamedTemporaryFile by name: the latter
        # is not portable across platforms and left the temporary file handle unclosed.
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_file_path = f"{temp_dir}/round_trip.jsonl"

            with open(temp_file_path, "w") as f:
                TracedDataJsonIO.export_traced_data_iterable_to_jsonl(expected, f)

            with open(temp_file_path, "r") as f:
                imported = list(TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f))

        self.assertEqual(len(expected), len(imported))
        for x, y in zip(expected, imported):
            self.assertDictEqual(comparable_attributes(x), comparable_attributes(y))
Esempio n. 3
0
    def test_export_traced_data_iterable_to_jsonl(self):
        """
        Tests that exporting a non-iterable of TracedData is rejected, and that a normal export matches the
        expected resource file.
        """
        file_path = path.join(self.test_dir, "json_test.json")

        # Test exporting wrong data type.
        # assertRaises is used instead of try/self.fail/except AssertionError, because self.fail itself raises an
        # AssertionError which the except clause would catch and misreport as a message mismatch.
        data = self.generate_test_data()
        with open(file_path, "w") as f:
            with self.assertRaises(
                    AssertionError,
                    msg="Exporting the wrong data type did not raise an assertion error") as raised:
                TracedDataJsonIO.export_traced_data_iterable_to_jsonl(data[0], f)
        self.assertEqual(str(raised.exception), _td_type_error_string)

        # Test normal export
        data = self.generate_test_data()
        with open(file_path, "w") as f:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(data, f)
        # shallow=False forces a comparison of the file contents rather than trusting matching os.stat signatures.
        self.assertTrue(
            filecmp.cmp(file_path, "tests/traced_data/resources/json_export_expected.json", shallow=False))
Esempio n. 4
0
    def test_flush_history_from_traced_data_iterable(self):
        """
        Tests that flushing history writes the expected history file, shortens the in-memory history, links each
        item to its pre-flush SHA, and leaves data that can still be exported and re-imported.
        """
        history_file_path = path.join(self.test_dir, "flush_test_history.jsonl")

        data = self.generate_test_data()
        data_0_sha = data[0].get_sha()
        self.assertEqual(len(data[1].get_history("Gender")), 2)

        TracedDataJsonIO.flush_history_from_traced_data_iterable("test_user", data, history_file_path)

        # shallow=False forces a comparison of the file contents rather than trusting matching os.stat signatures.
        self.assertTrue(
            filecmp.cmp(history_file_path, "tests/traced_data/resources/flush_history_expected_history.jsonl",
                        shallow=False))
        self.assertEqual(len(data[1].get_history("Gender")), 1)
        self.assertEqual(data[0]["_PrevTracedDataSHA"], data_0_sha)

        # Test the remaining data can be round-tripped
        latest_file_path = path.join(self.test_dir, "flush_test_latest.jsonl")
        with open(latest_file_path, "w") as f:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(data, f)
        with open(latest_file_path, "r") as f:
            # Materialise the result while the file is open so the import is actually executed even if it is
            # lazy; previously the return value was discarded, so nothing about the import was checked.
            imported = list(TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f))
        self.assertEqual(len(imported), len(data))
def fetch_from_rapid_pro(user, google_cloud_credentials_file_path,
                         raw_data_dir, phone_number_uuid_table,
                         rapid_pro_source):
    """
    Fetches contacts and runs from a Rapid Pro server and exports the runs for each flow as TracedData JSONL.

    Previously saved raw contacts/runs in `raw_data_dir` are loaded and updated with the latest changes from
    Rapid Pro when they exist; otherwise everything is fetched from the server. For each flow in
    `rapid_pro_source.activation_flow_names` followed by `rapid_pro_source.survey_flow_names`, this writes
    `<flow>_raw.json` (serialized raw runs), `<flow>.jsonl` (TracedData) and appends to `<flow>_log.jsonl`.
    The raw contacts are saved once, after all the flows have been processed.

    :param user: Identifier of the user running this fetch, passed to convert_runs_to_traced_data.
    :param google_cloud_credentials_file_path: Credentials file path used to download the Rapid Pro token.
    :param raw_data_dir: Directory to read previous exports from and to write all outputs to.
    :param phone_number_uuid_table: Passed through to RapidProClient.convert_runs_to_traced_data.
    :param rapid_pro_source: Object exposing `token_file_url`, `domain`, `contacts_file_name`,
                             `activation_flow_names`, `survey_flow_names` and `test_contact_uuids`.
    """
    log.info("Fetching data from Rapid Pro...")
    log.info("Downloading Rapid Pro access token...")
    rapid_pro_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path,
        rapid_pro_source.token_file_url).strip()

    rapid_pro = RapidProClient(rapid_pro_source.domain, rapid_pro_token)

    # Load the previous export of contacts if it exists, otherwise fetch all contacts from Rapid Pro.
    raw_contacts_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_raw.json"
    contacts_log_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_log.jsonl"

    # The contacts log is opened for appending below, before anything else is written, so the directory guard
    # has to run first (presumably this creates any missing parent directories - confirm against IOUtils).
    IOUtils.ensure_dirs_exist_for_file(contacts_log_path)

    try:
        log.info(f"Loading raw contacts from file '{raw_contacts_path}'...")
        with open(raw_contacts_path) as raw_contacts_file:
            raw_contacts = [
                Contact.deserialize(contact_json)
                for contact_json in json.load(raw_contacts_file)
            ]
        log.info(f"Loaded {len(raw_contacts)} contacts")
    except FileNotFoundError:
        log.info(
            f"File '{raw_contacts_path}' not found, will fetch all contacts from the Rapid Pro server"
        )
        with open(contacts_log_path, "a") as contacts_log_file:
            raw_contacts = rapid_pro.get_raw_contacts(
                raw_export_log_file=contacts_log_file)

    # Download all the runs for each of the radio shows
    for flow in rapid_pro_source.activation_flow_names + rapid_pro_source.survey_flow_names:
        runs_log_path = f"{raw_data_dir}/{flow}_log.jsonl"
        raw_runs_path = f"{raw_data_dir}/{flow}_raw.json"
        traced_runs_output_path = f"{raw_data_dir}/{flow}.jsonl"
        log.info(f"Exporting flow '{flow}' to '{traced_runs_output_path}'...")

        # All three per-flow files share one parent directory, so a single guard here covers the runs log opened
        # below as well as the outputs written at the end of this iteration.
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)

        flow_id = rapid_pro.get_flow_id(flow)

        # Load the previous export of runs for this flow, and update them with the newest runs.
        # If there is no previous export for this flow, fetch all the runs from Rapid Pro.
        with open(runs_log_path, "a") as raw_runs_log_file:
            try:
                log.info(f"Loading raw runs from file '{raw_runs_path}'...")
                with open(raw_runs_path) as raw_runs_file:
                    raw_runs = [
                        Run.deserialize(run_json)
                        for run_json in json.load(raw_runs_file)
                    ]
                log.info(f"Loaded {len(raw_runs)} runs")
                raw_runs = rapid_pro.update_raw_runs_with_latest_modified(
                    flow_id,
                    raw_runs,
                    raw_export_log_file=raw_runs_log_file,
                    ignore_archives=True)
            except FileNotFoundError:
                log.info(
                    f"File '{raw_runs_path}' not found, will fetch all runs from the Rapid Pro server for flow '{flow}'"
                )
                raw_runs = rapid_pro.get_raw_runs_for_flow_id(
                    flow_id, raw_export_log_file=raw_runs_log_file)

        # Fetch the latest contacts from Rapid Pro.
        # This is repeated for every flow, after that flow's runs have been downloaded.
        with open(contacts_log_path, "a") as raw_contacts_log_file:
            raw_contacts = rapid_pro.update_raw_contacts_with_latest_modified(
                raw_contacts, raw_export_log_file=raw_contacts_log_file)

        # Convert the runs to TracedData.
        traced_runs = rapid_pro.convert_runs_to_traced_data(
            user, raw_runs, raw_contacts, phone_number_uuid_table,
            rapid_pro_source.test_contact_uuids)

        log.info(f"Saving {len(raw_runs)} raw runs to {raw_runs_path}...")
        with open(raw_runs_path, "w") as raw_runs_file:
            json.dump([run.serialize() for run in raw_runs], raw_runs_file)
        log.info(f"Saved {len(raw_runs)} raw runs")

        log.info(
            f"Saving {len(traced_runs)} traced runs to {traced_runs_output_path}..."
        )
        with open(traced_runs_output_path, "w") as traced_runs_output_file:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(
                traced_runs, traced_runs_output_file)
        log.info(f"Saved {len(traced_runs)} traced runs")

    log.info(
        f"Saving {len(raw_contacts)} raw contacts to file '{raw_contacts_path}'..."
    )
    with open(raw_contacts_path, "w") as raw_contacts_file:
        json.dump([contact.serialize() for contact in raw_contacts],
                  raw_contacts_file)
    log.info(f"Saved {len(raw_contacts)} contacts")
    if pipeline_run_mode == "all-stages":
        log.info("Running post labelling pipeline stages...")

        log.info("Applying Manual Codes from Coda...")
        data = ApplyManualCodes.apply_manual_codes(user, data,
                                                   prev_coded_dir_path)

        log.info("Generating Analysis CSVs...")
        messages_data, individuals_data = AnalysisFile.generate(
            user, data, csv_by_message_output_path,
            csv_by_individual_output_path)

        log.info("Writing messages TracedData to file...")
        IOUtils.ensure_dirs_exist_for_file(messages_json_output_path)
        with open(messages_json_output_path, "w") as f:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(
                messages_data, f)

        log.info("Writing individuals TracedData to file...")
        IOUtils.ensure_dirs_exist_for_file(individuals_json_output_path)
        with open(individuals_json_output_path, "w") as f:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(
                individuals_data, f)
    else:
        assert pipeline_run_mode == "auto-code-only", "pipeline run mode must be either auto-code-only or all-stages"
        log.info("Writing Auto-Coding TracedData to file...")
        IOUtils.ensure_dirs_exist_for_file(auto_coding_json_output_path)
        with open(auto_coding_json_output_path, "w") as f:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(data, f)

    log.info("Python script complete")
Esempio n. 7
0
    print("Exporting advert CSV...")
    advert_phone_numbers = AdvertPhoneNumbers.generate(
        data, phone_number_uuid_table, advert_phone_numbers_csv_output_path)

    print("Filtering out RQA Messages labelled as Noise_Other_Project...")
    data = FilterNOP.filter_rqa_noise_other_project(data)

    print("Generating Analysis CSVs...")
    data = AnalysisFile.generate(user, data, csv_by_message_output_path,
                                 csv_by_individual_output_path)

    print("Writing TracedData to file...")
    IOUtils.ensure_dirs_exist_for_file(json_output_path)
    with open(json_output_path, "w") as f:
        TracedDataJsonIO.export_traced_data_iterable_to_jsonl(data, f)

    # Upload to Google Drive, if requested.
    # Note: This should happen as late as possible in order to reduce the risk of the remainder of the pipeline failing
    # after a Drive upload has occurred. Failures could result in inconsistent outputs or outputs with no
    # traced data log.
    if pipeline_configuration.drive_upload is not None:
        print("Uploading CSVs to Google Drive...")

        production_csv_drive_dir = os.path.dirname(
            pipeline_configuration.drive_upload.production_upload_path)
        production_csv_drive_file_name = os.path.basename(
            pipeline_configuration.drive_upload.production_upload_path)
        drive_client_wrapper.update_or_create(
            production_csv_output_path,
            production_csv_drive_dir,
Esempio n. 8
0
def fetch_from_rapid_pro(user, google_cloud_credentials_file_path, raw_data_dir, phone_number_uuid_table,
                         rapid_pro_source):
    """
    Fetches contacts and runs from a Rapid Pro server and exports the runs for each flow as TracedData JSONL.

    Previously saved raw contacts/runs in `raw_data_dir` are loaded and updated with the latest changes from
    Rapid Pro when they exist; otherwise everything is fetched from the server. For each flow in
    `rapid_pro_source.activation_flow_names` followed by `rapid_pro_source.survey_flow_names`, this writes
    `<flow>_raw.json` (serialized raw runs), `<flow>.jsonl` (TracedData) and appends to `<flow>_log.jsonl`.
    Runs from activation flows are additionally tagged with "source_raw" and "source_coded" keys derived from
    `rapid_pro_source.source_name`. The raw contacts are saved once, after all the flows have been processed.

    :param user: Identifier of the user running this fetch, recorded in the Metadata of appended data.
    :param google_cloud_credentials_file_path: Credentials file path used to download the Rapid Pro token.
    :param raw_data_dir: Directory to read previous exports from and to write all outputs to.
    :param phone_number_uuid_table: Passed through to RapidProClient.convert_runs_to_traced_data.
    :param rapid_pro_source: Object exposing `token_file_url`, `domain`, `contacts_file_name`, `source_name`,
                             `activation_flow_names`, `survey_flow_names` and `test_contact_uuids`.
    """
    log.info("Fetching data from Rapid Pro...")
    log.info("Downloading Rapid Pro access token...")
    rapid_pro_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, rapid_pro_source.token_file_url).strip()

    rapid_pro = RapidProClient(rapid_pro_source.domain, rapid_pro_token)

    # Load the previous export of contacts if it exists, otherwise fetch all contacts from Rapid Pro.
    raw_contacts_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_raw.json"
    contacts_log_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_log.jsonl"

    # The contacts log is opened for appending below, before anything else is written, so the directory guard
    # has to run first (presumably this creates any missing parent directories - confirm against IOUtils).
    IOUtils.ensure_dirs_exist_for_file(contacts_log_path)

    try:
        log.info(f"Loading raw contacts from file '{raw_contacts_path}'...")
        with open(raw_contacts_path) as raw_contacts_file:
            raw_contacts = [Contact.deserialize(contact_json) for contact_json in json.load(raw_contacts_file)]
        log.info(f"Loaded {len(raw_contacts)} contacts")
    except FileNotFoundError:
        log.info(f"File '{raw_contacts_path}' not found, will fetch all contacts from the Rapid Pro server")
        with open(contacts_log_path, "a") as contacts_log_file:
            raw_contacts = rapid_pro.get_raw_contacts(raw_export_log_file=contacts_log_file)

    # Download all the runs for each of the radio shows
    for flow in rapid_pro_source.activation_flow_names + rapid_pro_source.survey_flow_names:
        runs_log_path = f"{raw_data_dir}/{flow}_log.jsonl"
        raw_runs_path = f"{raw_data_dir}/{flow}_raw.json"
        traced_runs_output_path = f"{raw_data_dir}/{flow}.jsonl"
        log.info(f"Exporting flow '{flow}' to '{traced_runs_output_path}'...")

        # All three per-flow files share one parent directory, so a single guard here covers the runs log opened
        # below as well as the outputs written at the end of this iteration.
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)

        flow_id = rapid_pro.get_flow_id(flow)

        # Load the previous export of runs for this flow, and update them with the newest runs.
        # If there is no previous export for this flow, fetch all the runs from Rapid Pro.
        with open(runs_log_path, "a") as raw_runs_log_file:
            try:
                log.info(f"Loading raw runs from file '{raw_runs_path}'...")
                with open(raw_runs_path) as raw_runs_file:
                    raw_runs = [Run.deserialize(run_json) for run_json in json.load(raw_runs_file)]
                log.info(f"Loaded {len(raw_runs)} runs")
                raw_runs = rapid_pro.update_raw_runs_with_latest_modified(
                    flow_id, raw_runs, raw_export_log_file=raw_runs_log_file, ignore_archives=True)
            except FileNotFoundError:
                log.info(f"File '{raw_runs_path}' not found, will fetch all runs from the Rapid Pro server for flow '{flow}'")
                raw_runs = rapid_pro.get_raw_runs_for_flow_id(flow_id, raw_export_log_file=raw_runs_log_file)

        # Fetch the latest contacts from Rapid Pro.
        # This is repeated for every flow, after that flow's runs have been downloaded.
        with open(contacts_log_path, "a") as raw_contacts_log_file:
            raw_contacts = rapid_pro.update_raw_contacts_with_latest_modified(raw_contacts,
                                                                              raw_export_log_file=raw_contacts_log_file)

        # Convert the runs to TracedData.
        traced_runs = rapid_pro.convert_runs_to_traced_data(
            user, raw_runs, raw_contacts, phone_number_uuid_table, rapid_pro_source.test_contact_uuids)

        if flow in rapid_pro_source.activation_flow_names:
            # Append the Rapid Pro source name to each run.
            # Only do this for activation flows because this is the only place where this is interesting.
            # Also, demogs may come from either instance, which causes problems downstream.
            for td in traced_runs:
                td.append_data({
                    "source_raw": rapid_pro_source.source_name,
                    "source_coded": CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.SOURCE, CodeSchemes.SOURCE.get_code_with_match_value(rapid_pro_source.source_name),
                        Metadata.get_call_location()
                    ).to_dict()
                }, Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

        log.info(f"Saving {len(raw_runs)} raw runs to {raw_runs_path}...")
        with open(raw_runs_path, "w") as raw_runs_file:
            json.dump([run.serialize() for run in raw_runs], raw_runs_file)
        log.info(f"Saved {len(raw_runs)} raw runs")

        log.info(f"Saving {len(traced_runs)} traced runs to {traced_runs_output_path}...")
        with open(traced_runs_output_path, "w") as traced_runs_output_file:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(traced_runs, traced_runs_output_file)
        log.info(f"Saved {len(traced_runs)} traced runs")

    log.info(f"Saving {len(raw_contacts)} raw contacts to file '{raw_contacts_path}'...")
    with open(raw_contacts_path, "w") as raw_contacts_file:
        json.dump([contact.serialize() for contact in raw_contacts], raw_contacts_file)
    log.info(f"Saved {len(raw_contacts)} contacts")
def fetch_from_facebook(user, google_cloud_credentials_file_path, raw_data_dir,
                        facebook_uuid_table, facebook_source):
    """
    Fetches the comments on each dataset's Facebook posts and exports them as raw JSON and TracedData JSONL.

    For each dataset in `facebook_source.datasets`, all comments on every post in `dataset.post_ids` are
    downloaded, each comment is annotated with its post (under the "post" key) and given an empty-dict "parent"
    if it has none, and the results are written to `<raw_data_dir>/<dataset.name>_raw.json` (raw comments) and
    `<raw_data_dir>/<dataset.name>.jsonl` (TracedData). The raw responses for each post are appended to
    `<raw_data_dir>/<post_id>_comments_log.jsonl`.

    :param user: Identifier of the user running this fetch, passed to convert_facebook_comments_to_traced_data.
    :param google_cloud_credentials_file_path: Credentials file path used to download the Facebook token.
    :param raw_data_dir: Directory to write the logs and exported files to.
    :param facebook_uuid_table: Passed through to FacebookClient.convert_facebook_comments_to_traced_data.
    :param facebook_source: Object exposing `token_file_url` and `datasets`; each dataset exposes `name` and
                            `post_ids`.
    """
    log.info("Fetching data from Facebook...")
    log.info("Downloading Facebook access token...")
    facebook_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path,
        facebook_source.token_file_url).strip()

    facebook = FacebookClient(facebook_token)

    for dataset in facebook_source.datasets:
        log.info(f"Exporting comments for dataset {dataset.name}...")
        raw_comments_output_path = f"{raw_data_dir}/{dataset.name}_raw.json"
        traced_comments_output_path = f"{raw_data_dir}/{dataset.name}.jsonl"

        # Download all the comments on all the posts in this dataset, logging the raw data returned by Facebook.
        raw_comments = []
        for post_id in dataset.post_ids:
            comments_log_path = f"{raw_data_dir}/{post_id}_comments_log.jsonl"
            # The log is the first file opened in raw_data_dir, so run the directory guard before opening it
            # (presumably this creates any missing parent directories - confirm against IOUtils).
            IOUtils.ensure_dirs_exist_for_file(comments_log_path)
            with open(comments_log_path, "a") as raw_comments_log_file:
                post_comments = facebook.get_all_comments_on_post(
                    post_id,
                    raw_export_log_file=raw_comments_log_file,
                    fields=[
                        "from{id}", "parent", "attachments", "created_time",
                        "message"
                    ])

            # Download the post and add it as context to all the comments. Adding a reference to the post under
            # which a comment was made enables downstream features such as post-type labelling and comment context
            # in Coda, as well as allowing us to track how many comments were made on each post.
            post = facebook.get_post(post_id, fields=["attachments"])
            for comment in post_comments:
                comment["post"] = post

            raw_comments.extend(post_comments)

        # Facebook only returns a parent if the comment is a reply to another comment.
        # If there is no parent, set one to the empty-dict.
        for comment in raw_comments:
            comment.setdefault("parent", {})

        # Convert the comments to TracedData.
        traced_comments = facebook.convert_facebook_comments_to_traced_data(
            user, dataset.name, raw_comments, facebook_uuid_table)

        # Export to disk.
        log.info(
            f"Saving {len(raw_comments)} raw comments to {raw_comments_output_path}..."
        )
        IOUtils.ensure_dirs_exist_for_file(raw_comments_output_path)
        with open(raw_comments_output_path, "w") as raw_comments_output_file:
            json.dump(raw_comments, raw_comments_output_file)
        log.info(f"Saved {len(raw_comments)} raw comments")

        log.info(
            f"Saving {len(traced_comments)} traced comments to {traced_comments_output_path}..."
        )
        IOUtils.ensure_dirs_exist_for_file(traced_comments_output_path)
        with open(traced_comments_output_path,
                  "w") as traced_comments_output_file:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(
                traced_comments, traced_comments_output_file)
        log.info(f"Saved {len(traced_comments)} traced comments")
Esempio n. 10
0
        with open(contacts_log_path, "a") as raw_contacts_log_file:
            raw_contacts = rapid_pro.update_raw_contacts_with_latest_modified(
                raw_contacts, raw_export_log_file=raw_contacts_log_file)

        # Convert the runs to TracedData.
        traced_runs = rapid_pro.convert_runs_to_traced_data(
            user, raw_runs, raw_contacts, phone_number_uuid_table,
            pipeline_configuration.rapid_pro_test_contact_uuids)

        log.info(f"Saving {len(raw_runs)} raw runs to {raw_runs_path}...")
        with open(raw_runs_path, "w") as raw_runs_file:
            json.dump([run.serialize() for run in raw_runs], raw_runs_file)
        log.info(f"Saved {len(raw_runs)} raw runs")

        log.info(
            f"Saving {len(traced_runs)} traced runs to {traced_runs_output_path}..."
        )
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
        with open(traced_runs_output_path, "w") as traced_runs_output_file:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(
                traced_runs, traced_runs_output_file)
        log.info(f"Saved {len(traced_runs)} traced runs")

    log.info(
        f"Saving {len(raw_contacts)} raw contacts to file '{raw_contacts_path}'..."
    )
    with open(raw_contacts_path, "w") as raw_contacts_file:
        json.dump([contact.serialize() for contact in raw_contacts],
                  raw_contacts_file)
    log.info(f"Saved {len(raw_contacts)} contacts")