def fetch_from_recovery_csv(user, google_cloud_credentials_file_path,
                            raw_data_dir, phone_number_uuid_table,
                            recovery_csv_source):
    log.info("Fetching data from a Recovery CSV...")
    for blob_url in recovery_csv_source.activation_flow_urls + recovery_csv_source.survey_flow_urls:
        flow_name = blob_url.split('/')[-1].split('.')[0]  # Takes the name between the last '/' and the '.csv' ending
        traced_runs_output_path = f"{raw_data_dir}/{flow_name}.jsonl"
        if os.path.exists(traced_runs_output_path):
            log.info(
                f"File '{traced_runs_output_path}' for blob '{blob_url}' already exists; skipping download"
            )
            continue

        log.info(f"Downloading recovered data from '{blob_url}'...")
        raw_csv_string = StringIO(
            google_cloud_utils.download_blob_to_string(
                google_cloud_credentials_file_path, blob_url))
        raw_data = list(csv.DictReader(raw_csv_string))
        log.info(f"Downloaded {len(raw_data)} recovered messages")

        log.info("Converting the recovered messages to TracedData...")
        traced_runs = []
        for i, row in enumerate(raw_data):
            raw_date = row["ReceivedOn"]
            # Recovery CSV timestamps appear both with and without a seconds component,
            # so choose the strptime format that matches the string's length.
            if len(raw_date) == len("dd/mm/YYYY HH:MM"):
                parsed_raw_date = datetime.strptime(raw_date, "%d/%m/%Y %H:%M")
            else:
                parsed_raw_date = datetime.strptime(raw_date, "%d/%m/%Y %H:%M:%S")
            localized_date = pytz.timezone("Africa/Mogadishu").localize(
                parsed_raw_date)

            assert row["Sender"].startswith("avf-phone-uuid-"), \
                f"The 'Sender' column for '{blob_url} contains an item that has not been de-identified " \
                f"into Africa's Voices Foundation's de-identification format. This may be done with de_identify_csv.py."

            d = {
                "avf_phone_id": row["Sender"],
                "message": row["Message"],
                "received_on": localized_date.isoformat(),
                "run_id": SHAUtils.sha_dict(row)
            }

            traced_runs.append(
                TracedData(
                    d,
                    Metadata(user, Metadata.get_call_location(),
                             TimeUtils.utc_now_as_iso_string())))
        log.info("Converted the recovered messages to TracedData")

        log.info(
            f"Exporting {len(traced_runs)} TracedData items to {traced_runs_output_path}..."
        )
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
        with open(traced_runs_output_path, "w") as f:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(
                traced_runs, f)
        log.info(f"Exported TracedData")
    def test_round_trip(self):
        expected = self.generate_test_data()
        temp_file = tempfile.NamedTemporaryFile()

        with open(temp_file.name, "w") as f:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(expected, f)

        with open(temp_file.name, "r") as f:
            imported = list(TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f))

        self.assertEqual(len(expected), len(imported))
        for x, y in zip(expected, imported):
            x_attributes = {k: getattr(x, k) for k in dir(x) if not k.startswith("__") and not callable(getattr(x, k)) and k != "_cache"}
            y_attributes = {k: getattr(y, k) for k in dir(y) if not k.startswith("__") and not callable(getattr(y, k)) and k != "_cache"}

            self.assertDictEqual(x_attributes, y_attributes)
    def test_import_json_to_traced_data_iterable(self):
        file_path = "tests/traced_data/resources/json_export_expected.json"
        expected = self.generate_test_data()

        with open(file_path, "r") as f:
            imported = list(TracedDataJsonIO.import_json_to_traced_data_iterable(f))

        self.assertListEqual(expected, imported)
    def test_export_traced_data_iterable_to_jsonl(self):
        file_path = path.join(self.test_dir, "json_test.json")

        # Test exporting wrong data type
        data = self.generate_test_data()
        with open(file_path, "w") as f:
            try:
                TracedDataJsonIO.export_traced_data_iterable_to_jsonl(data[0], f)
                self.fail("Exporting the wrong data type did not raise an assertion error")
            except AssertionError as e:
                self.assertEqual(str(e), _td_type_error_string)

        # Test normal export
        data = self.generate_test_data()
        with open(file_path, "w") as f:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(data, f)
        self.assertTrue(filecmp.cmp(file_path, "tests/traced_data/resources/jsonl_export_expected.jsonl"))
def load_show(show_name):
    show_path = path.join(messages_input_path, "{}.json".format(show_name))
    if not path.exists(show_path):
        print("Warning: No show found with file name '{}.json'".format(show_name))
        return []
    with open(show_path, "r") as f:
        return list(TracedDataJsonIO.import_json_to_traced_data_iterable(f))

def load_datasets(flow_names):
    datasets = []
    for i, flow_name in enumerate(flow_names):
        raw_flow_path = f"{raw_data_dir}/{flow_name}.jsonl"
        log.info(f"Loading {i + 1}/{len(flow_names)}: {raw_flow_path}...")
        with open(raw_flow_path, "r") as f:
            runs = TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)
        log.info(f"Loaded {len(runs)} runs")
        datasets.append(runs)
    return datasets
    def load_survey_dict(file_path):
        """
        Loads a survey from a TracedData JSON file into a dict indexed by avf_phone_id

        :param file_path: Path to survey file to load
        :type file_path: str
        :return: Dictionary mapping contact id ('avf_phone_id') to the survey TracedData for that contact.
        :rtype: dict of str -> TracedData
        """
        with open(file_path, "r") as f:
            return {td["avf_phone_id"]: td for td in TracedDataJsonIO.import_json_to_traced_data_iterable(f)}
    def test_flush_history_from_traced_data_iterable(self):
        history_file_path = path.join(self.test_dir, "flush_test_history.jsonl")

        data = self.generate_test_data()
        data_0_sha = data[0].get_sha()
        self.assertEqual(len(data[1].get_history("Gender")), 2)

        TracedDataJsonIO.flush_history_from_traced_data_iterable("test_user", data, history_file_path)

        self.assertTrue(filecmp.cmp(history_file_path, "tests/traced_data/resources/flush_history_expected_history.jsonl"))
        self.assertEqual(len(data[1].get_history("Gender")), 1)
        self.assertEqual(data[0]["_PrevTracedDataSHA"], data_0_sha)

        # Test the remaining data can be round-tripped
        latest_file_path = path.join(self.test_dir, "flush_test_latest.jsonl")
        with open(latest_file_path, "w") as f:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(data, f)
        with open(latest_file_path, "r") as f:
            TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)
    parser.add_argument("demogs_traced_json_output_path",
                        help="Path to updted survey TraceData JSON")

    args = parser.parse_args()
    user = args.user
    messages_input_path = args.messages_input_path
    surveys_input_path = args.surveys_input_path
    demogs_input_path = args.demogs_input_path
    messages_traced_json_output_path = args.messages_traced_json_output_path
    surveys_traced_json_output_path = args.surveys_traced_json_output_path
    demogs_traced_json_output_path = args.demogs_traced_json_output_path

    # Load messages
    for filename in os.listdir(messages_input_path):
        with open(os.path.join(messages_input_path, filename)) as f:
            messages = TracedDataJsonIO.import_json_to_traced_data_iterable(f)

            # Create a set of all the keys appearing in the data
            keys = {key for message in messages for key in message.keys()}
            keys = list(keys)
            keys.remove("avf_phone_id")

            # Add group name to each key
            group_name = filename.split("_with_id.json")[0]
            create_unique_keys(messages, keys, group_name)

        # Output updated td-s
        message_output_path = os.path.join(
            messages_traced_json_output_path,
            "{}_updated_keys.json".format(group_name))
        with open(message_output_path, "w") as f:
            TracedDataJsonIO.export_traced_data_iterable_to_json(
                messages, f, pretty_print=True)
    user = args.user
    phone_uuid_path = args.phone_uuid_table_path
    demog_dataset_path = args.demog_dataset_path
    json_output_path = args.json_output_path

    with open(phone_uuid_path, "r") as f:
        phone_uuids = PhoneNumberUuidTable.load(f)

    with open(demog_dataset_path, "r") as f:
        traced_demog = TracedDataCSVIO.import_csv_to_traced_data_iterable(
            user, f)
        traced_demog = list(traced_demog)
        for td in traced_demog:
            uuid_dict = {
                "avf_phone_id": phone_uuids.add_phone(td["final_phone"])
            }
            td.append_data(
                uuid_dict,
                Metadata(user, Metadata.get_call_location(), time.time()))

    # Write the UUIDs out to a file
    with open(phone_uuid_path, "w") as f:
        phone_uuids.dump(f)

    # Output TracedData to JSON.
    IOUtils.ensure_dirs_exist_for_file(json_output_path)
    with open(json_output_path, "w") as f:
        TracedDataJsonIO.export_traced_data_iterable_to_json(traced_demog,
                                                             f,
                                                             pretty_print=True)
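    # A minimal round-trip sketch for PhoneNumberUuidTable using only the calls exercised
    # above (load, add_phone, dump). The file name and phone number are hypothetical, and
    # add_phone is assumed to return a stable uuid for a number it has seen before.
    with open("phone_uuid_table.json", "r") as f:
        table = PhoneNumberUuidTable.load(f)
    uuid = table.add_phone("+252700000000")
    with open("phone_uuid_table.json", "w") as f:
        table.dump(f)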
Example #11
        metavar="json-output-path",
        help="Path to a JSON file to write processed messages to")

    args = parser.parse_args()
    user = args.user
    messages_input_path = args.messages_input_path
    survey_input_path = args.survey_input_path
    demog_input_path = args.demog_input_path
    json_output_path = args.json_output_path

    # Load messages
    messages_datasets = []
    for filename in os.listdir(messages_input_path):
        with open(os.path.join(messages_input_path, filename)) as f:
            messages_datasets.append(
                TracedDataJsonIO.import_json_to_traced_data_iterable(f))

    # Load followup surveys
    survey_datasets = []
    for filename in os.listdir(survey_input_path):
        with open(os.path.join(survey_input_path, filename)) as f:
            survey_datasets.append(
                TracedDataJsonIO.import_json_to_traced_data_iterable(f))

    # Load demogs
    print("Loading Demographics...")
    with open(demog_input_path, "r") as f:
        demographics = TracedDataJsonIO.import_json_to_traced_data_iterable(f)

    # Add survey data to the messages
    print("Combining Datasets...")
Example #12
    coda_output_path = args.coda_output_path
    icr_output_path = args.icr_output_path
    csv_output_path = args.csv_output_path

    ICR_MESSAGES_COUNT = 200  # Number of messages to export in the ICR file

    # Convert date/time of messages to EAT
    utc_key = "{} (Time) - {}".format(variable_name, flow_name)
    eat_key = "{} (Time EAT) - {}".format(variable_name, flow_name)
    inside_time_window = []
    START_TIME = isoparse("2018-10-18T00+03:00")
    END_TIME = isoparse("2018-10-27T00+03:00")

    # Load data from JSON file
    with open(json_input_path, "r") as f:
        show_messages = TracedDataJsonIO.import_json_to_traced_data_iterable(f)

    # Filter out test messages sent by AVF.
    show_messages = [
        td for td in show_messages if not td.get("test_run", False)
    ]

    # Filter for runs which contain a response to this week's question.
    show_message_key = "{} (Text) - {}".format(variable_name, flow_name)
    show_messages = [td for td in show_messages if show_message_key in td]

    # Filter out messages sent outside the project run period
    for td in show_messages:
        utc_time = isoparse(td[utc_key])
        eat_time = utc_time.astimezone(
            pytz.timezone("Africa/Nairobi")).isoformat()
        td.append_data(
            {eat_key: eat_time},
            Metadata(user, Metadata.get_call_location(), time.time()))
        if START_TIME <= isoparse(eat_time) < END_TIME:
            inside_time_window.append(td)
    with open(fixup_table_path, "r") as f:
        fixup_table = json.load(f)

    # print ("Loading code schemes")
    code_scheme_paths_list = [
        os.path.join(code_schemes_in_folder, f)
        for f in os.listdir(code_schemes_in_folder)
        if os.path.isfile(os.path.join(code_schemes_in_folder, f))
    ]

    code_schemes = {}
    for code_scheme_path in code_scheme_paths_list:
        with open(code_scheme_path, "r") as f:
            scheme = json.load(f)
        code_schemes[scheme["SchemeID"]] = scheme

    # print ("Loading demog data")
    with open(os.path.join(demogs_in_folder, "Demog_survey_with_id.json"), "r") as f:
        demog_td = TracedDataJsonIO.import_json_to_traced_data_iterable(f)

    # print ("Remapping demogs")
    for msg in demog_td:
        for demog_map in demog_maps:
            message_id = msg[demog_map["MessageId"]]
            dataset_in_principle = demog_map["Coda-Dataset"]
            remap(message_id, dataset_in_principle, fixup_table, code_schemes)

    # print ("Remapping messages")
    for message_map in message_maps:
        # print ("Loading message_map: {}".format(message_map["FileName"]))
        with open(os.path.join(messages_in_folder, message_map["FileName"]), "r") as f:
            messages_td = TracedDataJsonIO.import_json_to_traced_data_iterable(f)
        "json_output",
        metavar="json-output",
        help="Path to write results of merging to, as a serialised TracedData JSON file",
        nargs=1)

    args = parser.parse_args()
    user = args.user[0]
    input_path_messages = args.input_messages[0]
    group = args.group[0]
    input_path_adverts = args.input_adverts[0]
    json_output_path = args.json_output[0]

    # Load data from JSON file
    with open(input_path_messages, "r") as f:
        messages_data = TracedDataJsonIO.import_json_to_traced_data_iterable(f)

    # Load data from JSON file
    with open(input_path_adverts, "r") as f:
        adverts_data = TracedDataJsonIO.import_json_to_traced_data_iterable(f)

    # Map "QUESTION_R" => "Message"
    for td in adverts_data:
        assert "QUESTION_R" in td.keys()
        td.append_data(
            {
                "Message": td["QUESTION_R"],
                "Date": td["start_date"],
                "Group": group
            }, Metadata(user, Metadata.get_call_location(), time.time()))
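    # A small sketch of what append_data does to a TracedData object's history, following
    # the constructor and get_history usage seen elsewhere in this listing; the keys and
    # values are illustrative.
    td = TracedData(
        {"Message": "hello"},
        Metadata(user, Metadata.get_call_location(), time.time()))
    td.append_data(
        {"Group": group},
        Metadata(user, Metadata.get_call_location(), time.time()))
    assert td["Message"] == "hello"
    assert len(td.get_history("Group")) == 1  # one write to this key so far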
Example #15
        drive_client_wrapper.init_client_from_info(credentials_info)

    # Load phone number <-> UUID table
    print("Loading Phone Number <-> UUID Table...")
    with open(phone_number_uuid_table_path, "r") as f:
        phone_number_uuid_table = PhoneNumberUuidTable.load(f)

    # Load messages
    messages_datasets = []
    for i, activation_flow_name in enumerate(
            pipeline_configuration.activation_flow_names):
        raw_activation_path = f"{raw_data_dir}/{activation_flow_name}.jsonl"
        log.info(f"Loading {raw_activation_path}...")
        with open(raw_activation_path, "r") as f:
            messages = TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)
        log.debug(f"Loaded {len(messages)} messages")
        messages_datasets.append(messages)

    # Load surveys
    survey_datasets = []
    for i, survey_flow_name in enumerate(
            pipeline_configuration.survey_flow_names):
        raw_survey_path = f"{raw_data_dir}/{survey_flow_name}.jsonl"
        log.info(f"Loading {raw_survey_path}...")
        with open(raw_survey_path, "r") as f:
            messages = TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)
        log.debug(f"Loaded {len(messages)} messages")
        survey_datasets.append(messages)

    # Add survey data to the messages
Example #16
    message_paths = [s02e01_input_path]

    # Load the pipeline configuration file
    print("Loading Pipeline Configuration File...")
    with open(pipeline_configuration_file_path) as f:
        pipeline_configuration = PipelineConfiguration.from_configuration_file(f)

    # Load phone number <-> UUID table
    print("Loading Phone Number <-> UUID Table...")
    with open(phone_number_uuid_table_path, "r") as f:
        phone_number_uuid_table = PhoneNumberUuidTable.load(f)

    # Load demographics
    print("Loading Demographics 1/1...")
    with open(demog_input_path, "r") as f:
        demographics = TracedDataJsonIO.import_json_to_traced_data_iterable(f)
    print(f"Loaded {len(demographics)} contacts")

    # Load messages
    messages_datasets = []
    for i, path in enumerate(message_paths):
        print("Loading Episode {}/{}...".format(i + 1, len(message_paths)))
        with open(path, "r") as f:
            messages_datasets.append(TracedDataJsonIO.import_json_to_traced_data_iterable(f))
    
    # Add survey data to the messages
    print("Combining Datasets...")
Example #17
def fetch_from_rapid_pro(user, google_cloud_credentials_file_path, raw_data_dir, phone_number_uuid_table,
                         rapid_pro_source):
    log.info("Fetching data from Rapid Pro...")
    log.info("Downloading Rapid Pro access token...")
    rapid_pro_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, rapid_pro_source.token_file_url).strip()

    rapid_pro = RapidProClient(rapid_pro_source.domain, rapid_pro_token)

    # Load the previous export of contacts if it exists, otherwise fetch all contacts from Rapid Pro.
    raw_contacts_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_raw.json"
    contacts_log_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_log.jsonl"
    try:
        log.info(f"Loading raw contacts from file '{raw_contacts_path}'...")
        with open(raw_contacts_path) as raw_contacts_file:
            raw_contacts = [Contact.deserialize(contact_json) for contact_json in json.load(raw_contacts_file)]
        log.info(f"Loaded {len(raw_contacts)} contacts")
    except FileNotFoundError:
        log.info(f"File '{raw_contacts_path}' not found, will fetch all contacts from the Rapid Pro server")
        with open(contacts_log_path, "a") as contacts_log_file:
            raw_contacts = rapid_pro.get_raw_contacts(raw_export_log_file=contacts_log_file)

    # Download all the runs for each of the radio shows
    for flow in rapid_pro_source.activation_flow_names + rapid_pro_source.survey_flow_names:
        runs_log_path = f"{raw_data_dir}/{flow}_log.jsonl"
        raw_runs_path = f"{raw_data_dir}/{flow}_raw.json"
        traced_runs_output_path = f"{raw_data_dir}/{flow}.jsonl"
        log.info(f"Exporting flow '{flow}' to '{traced_runs_output_path}'...")

        flow_id = rapid_pro.get_flow_id(flow)

        # Load the previous export of runs for this flow, and update them with the newest runs.
        # If there is no previous export for this flow, fetch all the runs from Rapid Pro.
        with open(runs_log_path, "a") as raw_runs_log_file:
            try:
                log.info(f"Loading raw runs from file '{raw_runs_path}'...")
                with open(raw_runs_path) as raw_runs_file:
                    raw_runs = [Run.deserialize(run_json) for run_json in json.load(raw_runs_file)]
                log.info(f"Loaded {len(raw_runs)} runs")
                raw_runs = rapid_pro.update_raw_runs_with_latest_modified(
                    flow_id, raw_runs, raw_export_log_file=raw_runs_log_file, ignore_archives=True)
            except FileNotFoundError:
                log.info(f"File '{raw_runs_path}' not found, will fetch all runs from the Rapid Pro server for flow '{flow}'")
                raw_runs = rapid_pro.get_raw_runs_for_flow_id(flow_id, raw_export_log_file=raw_runs_log_file)

        # Fetch the latest contacts from Rapid Pro.
        with open(contacts_log_path, "a") as raw_contacts_log_file:
            raw_contacts = rapid_pro.update_raw_contacts_with_latest_modified(raw_contacts,
                                                                              raw_export_log_file=raw_contacts_log_file)

        # Convert the runs to TracedData.
        traced_runs = rapid_pro.convert_runs_to_traced_data(
            user, raw_runs, raw_contacts, phone_number_uuid_table, rapid_pro_source.test_contact_uuids)

        if flow in rapid_pro_source.activation_flow_names:
            # Append the Rapid Pro source name to each run.
            # Only do this for activation flows because this is the only place where this is interesting.
            # Also, demogs may come from either instance, which causes problems downstream.
            for td in traced_runs:
                td.append_data({
                    "source_raw": rapid_pro_source.source_name,
                    "source_coded": CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.SOURCE, CodeSchemes.SOURCE.get_code_with_match_value(rapid_pro_source.source_name),
                        Metadata.get_call_location()
                    ).to_dict()
                }, Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

        log.info(f"Saving {len(raw_runs)} raw runs to {raw_runs_path}...")
        with open(raw_runs_path, "w") as raw_runs_file:
            json.dump([run.serialize() for run in raw_runs], raw_runs_file)
        log.info(f"Saved {len(raw_runs)} raw runs")

        log.info(f"Saving {len(traced_runs)} traced runs to {traced_runs_output_path}...")
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
        with open(traced_runs_output_path, "w") as traced_runs_output_file:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(traced_runs, traced_runs_output_file)
        log.info(f"Saved {len(traced_runs)} traced runs")

    log.info(f"Saving {len(raw_contacts)} raw contacts to file '{raw_contacts_path}'...")
    with open(raw_contacts_path, "w") as raw_contacts_file:
        json.dump([contact.serialize() for contact in raw_contacts], raw_contacts_file)
    log.info(f"Saved {len(raw_contacts)} contacts")
def fetch_from_facebook(user, google_cloud_credentials_file_path, raw_data_dir,
                        facebook_uuid_table, facebook_source):
    log.info("Fetching data from Facebook...")
    log.info("Downloading Facebook access token...")
    facebook_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path,
        facebook_source.token_file_url).strip()

    facebook = FacebookClient(facebook_token)

    for dataset in facebook_source.datasets:
        log.info(f"Exporting comments for dataset {dataset.name}...")
        raw_comments_output_path = f"{raw_data_dir}/{dataset.name}_raw.json"
        traced_comments_output_path = f"{raw_data_dir}/{dataset.name}.jsonl"

        # Download all the comments on all the posts in this dataset, logging the raw data returned by Facebook.
        raw_comments = []
        for post_id in dataset.post_ids:
            comments_log_path = f"{raw_data_dir}/{post_id}_comments_log.jsonl"
            with open(comments_log_path, "a") as raw_comments_log_file:
                post_comments = facebook.get_all_comments_on_post(
                    post_id,
                    raw_export_log_file=raw_comments_log_file,
                    fields=[
                        "from{id}", "parent", "attachments", "created_time",
                        "message"
                    ])

            # Download the post and add it as context to all the comments. Adding a reference to the post under
            # which a comment was made enables downstream features such as post-type labelling and comment context
            # in Coda, as well as allowing us to track how many comments were made on each post.
            post = facebook.get_post(post_id, fields=["attachments"])
            for comment in post_comments:
                comment["post"] = post

            raw_comments.extend(post_comments)

        # Facebook only returns a parent if the comment is a reply to another comment.
        # If there is no parent, set one to the empty-dict.
        for comment in raw_comments:
            if "parent" not in comment:
                comment["parent"] = {}

        # Convert the comments to TracedData.
        traced_comments = facebook.convert_facebook_comments_to_traced_data(
            user, dataset.name, raw_comments, facebook_uuid_table)

        # Export to disk.
        log.info(
            f"Saving {len(raw_comments)} raw comments to {raw_comments_output_path}..."
        )
        IOUtils.ensure_dirs_exist_for_file(raw_comments_output_path)
        with open(raw_comments_output_path, "w") as raw_comments_output_file:
            json.dump(raw_comments, raw_comments_output_file)
        log.info(f"Saved {len(raw_comments)} raw comments")

        log.info(
            f"Saving {len(traced_comments)} traced comments to {traced_comments_output_path}..."
        )
        IOUtils.ensure_dirs_exist_for_file(traced_comments_output_path)
        with open(traced_comments_output_path,
                  "w") as traced_comments_output_file:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(
                traced_comments, traced_comments_output_file)
        log.info(f"Saved {len(traced_comments)} traced comments")
    json_output_path = args.json_output_path
    fgd_csv_output_path = args.fgd_csv_output_path
    cc_csv_output_path = args.cc_csv_output_path
    prev_exports_path = args.prev_exports_path

    MINIMUM_AGE = 18
    TOTAL_CC_CONTACTS = 160
    TOTAL_FGD_CONTACTS = 100

    # Load phone uuid table
    with open(phone_uuid_table_path, "r") as f:
        phone_uuids = PhoneNumberUuidTable.load(f)

    # Load FGD/CC survey responses
    with open(fgd_cc_input_path, "r") as f:
        fgd_cc_data = TracedDataJsonIO.import_json_to_traced_data_iterable(f)

    # Load the previous export
    prev_exports = []
    if prev_exports_path is not None:
        with open(prev_exports_path, "r") as f:
            prev_exports = list(
                TracedDataCSVIO.import_csv_to_traced_data_iterable(user, f))

    # Load coded demog surveys
    with open(demog_surveys_input_path, "r") as f:
        surveys = TracedDataJsonIO.import_json_to_traced_data_iterable(f)

    # Filter out people who haven't answered the fgd_cc consent question
    fgd_cc_consent_key = "Response_1 (Category) - wt_fgd_cc"
    fgd_cc_data = [td for td in fgd_cc_data if fgd_cc_consent_key in td]
    if pipeline_configuration.drive_upload is not None:
        log.info(f"Downloading Google Drive service account credentials...")
        credentials_info = json.loads(
            google_cloud_utils.download_blob_to_string(
                google_cloud_credentials_file_path, pipeline_configuration.
                drive_upload.drive_credentials_file_url))
        drive_client_wrapper.init_client_from_info(credentials_info)

    # Load messages
    messages_datasets = []
    for i, activation_flow_name in enumerate(
            pipeline_configuration.activation_flow_names):
        raw_activation_path = f"{raw_data_dir}/{activation_flow_name}.json"
        log.info(f"Loading {raw_activation_path}...")
        with open(raw_activation_path, "r") as f:
            messages = TracedDataJsonIO.import_json_to_traced_data_iterable(f)
        log.info(f"Loaded {len(messages)} messages")
        messages_datasets.append(messages)

    log.info("Loading surveys datasets:")
    surveys_datasets = []
    for i, survey_flow_name in enumerate(
            pipeline_configuration.survey_flow_names):
        raw_survey_path = f"{raw_data_dir}/{survey_flow_name}.json"
        log.info(f"Loading {raw_survey_path}...")
        with open(raw_survey_path, "r") as f:
            contacts = TracedDataJsonIO.import_json_to_traced_data_iterable(f)
        log.info(f"Loaded {len(contacts)} contacts")
        surveys_datasets.append(contacts)

    # Add survey data to the messages
    if pipeline_run_mode == "all-stages":
        log.info("Running post labelling pipeline stages...")

        log.info("Applying Manual Codes from Coda...")
        data = ApplyManualCodes.apply_manual_codes(user, data,
                                                   prev_coded_dir_path)

        log.info("Generating Analysis CSVs...")
        messages_data, individuals_data = AnalysisFile.generate(
            user, data, csv_by_message_output_path,
            csv_by_individual_output_path)

        log.info("Writing messages TracedData to file...")
        IOUtils.ensure_dirs_exist_for_file(messages_json_output_path)
        with open(messages_json_output_path, "w") as f:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(
                messages_data, f)

        log.info("Writing individuals TracedData to file...")
        IOUtils.ensure_dirs_exist_for_file(individuals_json_output_path)
        with open(individuals_json_output_path, "w") as f:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(
                individuals_data, f)
    else:
        assert pipeline_run_mode == "auto-code-only", "pipeline run mode must be either auto-code-only or all-stages"
        log.info("Writing Auto-Coding TracedData to file...")
        IOUtils.ensure_dirs_exist_for_file(auto_coding_json_output_path)
        with open(auto_coding_json_output_path, "w") as f:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(data, f)

    log.info("Python script complete")
        "Path to a coded survey JSON file, containing a list of serialized TracedData objects"
    )
    parser.add_argument(
        "csv_output_path",
        metavar="csv-output-path",
        help="Path to a CSV file to write the summarised stats to")

    args = parser.parse_args()
    user = args.user
    messages_input_path = args.messages_input_path
    survey_input_path = args.survey_input_path
    csv_output_path = args.csv_output_path

    # Load surveys
    with open(survey_input_path, "r") as f:
        surveys = TracedDataJsonIO.import_json_to_traced_data_iterable(f)

    def load_show(show_name):
        show_path = path.join(messages_input_path, "{}.json".format(show_name))
        if not path.exists(show_path):
            print("Warning: No show found with file name '{}.json'".format(
                show_name))
            return []
        with open(show_path, "r") as f:
            return list(
                TracedDataJsonIO.import_json_to_traced_data_iterable(f))

    survey_keys = {
        "District (Text) - wt_demog_1": surveys,
        "Gender (Text) - wt_demog_1": surveys,
        "Urban_Rural (Text) - wt_demog_1": surveys,
    # Convert times to ISO
    for td in messages:
        td.append_data(
            {"Date": session.echo_mobile_date_to_iso(td["Date"])},
            Metadata(user, Metadata.get_call_location(), time.time())
        )

    # Filter out messages sent outside the desired time range.
    messages = list(filter(lambda td: echo_mobile_start_date <= isoparse(td["Date"]) < echo_mobile_end_date, messages))

    # Add a unique id to each message
    for td in messages:
        td.append_data(
            {"avf_message_id": message_uuids.add_message(
                EchoMobileSession.normalise_message(td, "avf_phone_id", "Date", "Message"))},
            Metadata(user, Metadata.get_call_location(), time.time())
        )

    # Write the UUIDs out to a file
    with open(phone_uuid_path, "w") as f:
        phone_uuids.dump(f)
    with open(message_uuid_path, "w") as f:
        message_uuids.dump(f)

    # Write the parsed messages to a json file
    if os.path.dirname(json_output_path) != "" and not os.path.exists(os.path.dirname(json_output_path)):
        os.makedirs(os.path.dirname(json_output_path))
    with open(json_output_path, "w") as f:
        TracedDataJsonIO.export_traced_data_iterable_to_json(messages, f, pretty_print=True)
Example #24
        with open(contacts_log_path, "a") as raw_contacts_log_file:
            raw_contacts = rapid_pro.update_raw_contacts_with_latest_modified(
                raw_contacts, raw_export_log_file=raw_contacts_log_file)

        # Convert the runs to TracedData.
        traced_runs = rapid_pro.convert_runs_to_traced_data(
            user, raw_runs, raw_contacts, phone_number_uuid_table,
            pipeline_configuration.rapid_pro_test_contact_uuids)

        log.info(f"Saving {len(raw_runs)} raw runs to {raw_runs_path}...")
        with open(raw_runs_path, "w") as raw_runs_file:
            json.dump([run.serialize() for run in raw_runs], raw_runs_file)
        log.info(f"Saved {len(raw_runs)} raw runs")

        log.info(
            f"Saving {len(traced_runs)} traced runs to {traced_runs_output_path}..."
        )
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
        with open(traced_runs_output_path, "w") as traced_runs_output_file:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(
                traced_runs, traced_runs_output_file)
        log.info(f"Saved {len(traced_runs)} traced runs")

    log.info(
        f"Saving {len(raw_contacts)} raw contacts to file '{raw_contacts_path}'..."
    )
    with open(raw_contacts_path, "w") as raw_contacts_file:
        json.dump([contact.serialize() for contact in raw_contacts],
                  raw_contacts_file)
    log.info(f"Saved {len(raw_contacts)} contacts")
Example #25
        google_cloud_utils.download_blob_to_string(
            google_cloud_credentials_file_path,
            pipeline_configuration.phone_number_uuid_table.firebase_credentials_file_url))

    phone_number_uuid_table = FirestoreUuidTable(
        pipeline_configuration.phone_number_uuid_table.table_name,
        firestore_uuid_table_credentials, "avf-phone-uuid-")
    log.info("Initialised the Firestore UUID table")

    uuids = set()
    skipped_nr = 0
    for path in traced_data_paths:
        # Load the traced data
        log.info(f"Loading previous traced data from file '{path}'...")
        with open(path) as f:
            data = TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)
        log.info(f"Loaded {len(data)} traced data objects")

        for td in data:
            if td["consent_withdrawn"] == Codes.TRUE:
                continue

            uuids.add(td["uid"])
    log.info(
        f"Loaded {len(uuids)} uuids from TracedData (skipped {skipped_nr} items with an NR property)"
    )

    if exclusion_list_file_path is not None:
        # Load the exclusion list
        log.info(
            f"Loading the exclusion list from {exclusion_list_file_path}...")
Example #26
        "Path to input file to concatenate, containing a list of TracedData objects as JSON",
        nargs=1)
    parser.add_argument("json_output",
                        metavar="json-output",
                        help="Path to write results of cleaning to",
                        nargs=1)

    args = parser.parse_args()
    user = args.user[0]
    json_input_path_1 = args.json_input_1[0]
    json_input_path_2 = args.json_input_2[0]
    json_output_path = args.json_output[0]

    # Load data from JSON file
    with open(json_input_path_1, "r") as f:
        input_data_1 = TracedDataJsonIO.import_json_to_traced_data_iterable(f)
    with open(json_input_path_2, "r") as f:
        input_data_2 = TracedDataJsonIO.import_json_to_traced_data_iterable(f)

    # Concatenate files
    output_data = list(input_data_1)
    output_data.extend(input_data_2)

    # Write json output
    if os.path.dirname(json_output_path) != "" and not os.path.exists(
            os.path.dirname(json_output_path)):
        os.makedirs(os.path.dirname(json_output_path))
    with open(json_output_path, "w") as f:
        TracedDataJsonIO.export_traced_data_iterable_to_json(output_data,
                                                             f,
                                                             pretty_print=True)
Example #27
        f"{automated_analysis_output_dir}/maps/mogadishu")
    IOUtils.ensure_dirs_exist(f"{automated_analysis_output_dir}/graphs")

    log.info("Loading Pipeline Configuration File...")
    with open(pipeline_configuration_file_path) as f:
        pipeline_configuration = PipelineConfiguration.from_configuration_file(
            f)
    Logger.set_project_name(pipeline_configuration.pipeline_name)
    log.debug(f"Pipeline name is {pipeline_configuration.pipeline_name}")

    sys.setrecursionlimit(30000)
    # Read the messages dataset
    log.info(
        f"Loading the messages dataset from {messages_json_input_path}...")
    with open(messages_json_input_path) as f:
        messages = TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)
        for i in range(len(messages)):
            messages[i] = dict(messages[i].items())
    log.info(f"Loaded {len(messages)} messages")

    # Read the individuals dataset
    log.info(
        f"Loading the individuals dataset from {individuals_json_input_path}..."
    )
    with open(individuals_json_input_path) as f:
        individuals = TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)
        for i in range(len(individuals)):
            individuals[i] = dict(individuals[i].items())
    log.info(f"Loaded {len(individuals)} individuals")

    # Compute the number of messages, individuals, and relevant messages per episode and overall.
Example #28
    if pipeline_configuration.drive_upload is not None:
        log.info(f"Downloading Google Drive service account credentials...")
        credentials_info = json.loads(
            google_cloud_utils.download_blob_to_string(
                google_cloud_credentials_file_path, pipeline_configuration.
                drive_upload.drive_credentials_file_url))
        drive_client_wrapper.init_client_from_info(credentials_info)

    # Load messages
    messages_datasets = []
    for i, activation_flow_name in enumerate(
            pipeline_configuration.activation_flow_names):
        raw_activation_path = f"{raw_data_dir}/{activation_flow_name}.jsonl"
        log.info(f"Loading {raw_activation_path}...")
        with open(raw_activation_path, "r") as f:
            messages = TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)
        log.info(f"Loaded {len(messages)} messages")
        messages_datasets.append(messages)

    log.info("Loading surveys datasets:")
    surveys_datasets = []
    for i, survey_flow_name in enumerate(
            pipeline_configuration.survey_flow_names):
        raw_survey_path = f"{raw_data_dir}/{survey_flow_name}.jsonl"
        log.info(f"Loading {raw_survey_path}...")
        with open(raw_survey_path, "r") as f:
            contacts = TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)
        log.info(f"Loaded {len(contacts)} contacts")
        surveys_datasets.append(contacts)

    # Add survey data to the messages
def fetch_from_rapid_pro(user, google_cloud_credentials_file_path,
                         raw_data_dir, phone_number_uuid_table,
                         rapid_pro_source):
    log.info("Fetching data from Rapid Pro...")
    log.info("Downloading Rapid Pro access token...")
    rapid_pro_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path,
        rapid_pro_source.token_file_url).strip()

    rapid_pro = RapidProClient(rapid_pro_source.domain, rapid_pro_token)

    # Load the previous export of contacts if it exists, otherwise fetch all contacts from Rapid Pro.
    raw_contacts_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_raw.json"
    contacts_log_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_log.jsonl"
    try:
        log.info(f"Loading raw contacts from file '{raw_contacts_path}'...")
        with open(raw_contacts_path) as raw_contacts_file:
            raw_contacts = [
                Contact.deserialize(contact_json)
                for contact_json in json.load(raw_contacts_file)
            ]
        log.info(f"Loaded {len(raw_contacts)} contacts")
    except FileNotFoundError:
        log.info(
            f"File '{raw_contacts_path}' not found, will fetch all contacts from the Rapid Pro server"
        )
        with open(contacts_log_path, "a") as contacts_log_file:
            raw_contacts = rapid_pro.get_raw_contacts(
                raw_export_log_file=contacts_log_file)

    # Download all the runs for each of the radio shows
    for flow in rapid_pro_source.activation_flow_names + rapid_pro_source.survey_flow_names:
        runs_log_path = f"{raw_data_dir}/{flow}_log.jsonl"
        raw_runs_path = f"{raw_data_dir}/{flow}_raw.json"
        traced_runs_output_path = f"{raw_data_dir}/{flow}.jsonl"
        log.info(f"Exporting flow '{flow}' to '{traced_runs_output_path}'...")

        flow_id = rapid_pro.get_flow_id(flow)

        # Load the previous export of runs for this flow, and update them with the newest runs.
        # If there is no previous export for this flow, fetch all the runs from Rapid Pro.
        with open(runs_log_path, "a") as raw_runs_log_file:
            try:
                log.info(f"Loading raw runs from file '{raw_runs_path}'...")
                with open(raw_runs_path) as raw_runs_file:
                    raw_runs = [
                        Run.deserialize(run_json)
                        for run_json in json.load(raw_runs_file)
                    ]
                log.info(f"Loaded {len(raw_runs)} runs")
                raw_runs = rapid_pro.update_raw_runs_with_latest_modified(
                    flow_id,
                    raw_runs,
                    raw_export_log_file=raw_runs_log_file,
                    ignore_archives=True)
            except FileNotFoundError:
                log.info(
                    f"File '{raw_runs_path}' not found, will fetch all runs from the Rapid Pro server for flow '{flow}'"
                )
                raw_runs = rapid_pro.get_raw_runs_for_flow_id(
                    flow_id, raw_export_log_file=raw_runs_log_file)

        # Fetch the latest contacts from Rapid Pro.
        with open(contacts_log_path, "a") as raw_contacts_log_file:
            raw_contacts = rapid_pro.update_raw_contacts_with_latest_modified(
                raw_contacts, raw_export_log_file=raw_contacts_log_file)

        # Convert the runs to TracedData.
        traced_runs = rapid_pro.convert_runs_to_traced_data(
            user, raw_runs, raw_contacts, phone_number_uuid_table,
            rapid_pro_source.test_contact_uuids)

        log.info(f"Saving {len(raw_runs)} raw runs to {raw_runs_path}...")
        with open(raw_runs_path, "w") as raw_runs_file:
            json.dump([run.serialize() for run in raw_runs], raw_runs_file)
        log.info(f"Saved {len(raw_runs)} raw runs")

        log.info(
            f"Saving {len(traced_runs)} traced runs to {traced_runs_output_path}..."
        )
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
        with open(traced_runs_output_path, "w") as traced_runs_output_file:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(
                traced_runs, traced_runs_output_file)
        log.info(f"Saved {len(traced_runs)} traced runs")

    log.info(
        f"Saving {len(raw_contacts)} raw contacts to file '{raw_contacts_path}'..."
    )
    with open(raw_contacts_path, "w") as raw_contacts_file:
        json.dump([contact.serialize() for contact in raw_contacts],
                  raw_contacts_file)
    log.info(f"Saved {len(raw_contacts)} contacts")
Example #30
                raw_runs = rapid_pro.update_raw_runs_with_latest_modified(
                    flow_id, raw_runs, raw_export_log_file=raw_runs_log_file)
            except FileNotFoundError:
                log.info(f"File '{raw_runs_path}' not found, will fetch all runs from the Rapid Pro server for flow '{flow}'")
                raw_runs = rapid_pro.get_raw_runs_for_flow_id(flow_id, raw_export_log_file=raw_runs_log_file)

        # Fetch the latest contacts from Rapid Pro.
        with open(contacts_log_path, "a") as raw_contacts_log_file:
            raw_contacts = rapid_pro.update_raw_contacts_with_latest_modified(raw_contacts,
                                                                              raw_export_log_file=raw_contacts_log_file)

        # Convert the runs to TracedData.
        traced_runs = rapid_pro.convert_runs_to_traced_data(
            user, raw_runs, raw_contacts, phone_number_uuid_table, pipeline_configuration.rapid_pro_test_contact_uuids)

        log.info(f"Saving {len(raw_runs)} raw runs to {raw_runs_path}...")
        with open(raw_runs_path, "w") as raw_runs_file:
            json.dump([run.serialize() for run in raw_runs], raw_runs_file)
        log.info(f"Saved {len(raw_runs)} raw runs")

        log.info(f"Saving {len(traced_runs)} traced runs to {traced_runs_output_path}...")
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
        with open(traced_runs_output_path, "w") as traced_runs_output_file:
            TracedDataJsonIO.export_traced_data_iterable_to_json(traced_runs, traced_runs_output_file, pretty_print=True)
        log.info(f"Saved {len(traced_runs)} traced runs")

    log.info(f"Saving {len(raw_contacts)} raw contacts to file '{raw_contacts_path}'...")
    with open(raw_contacts_path, "w") as raw_contacts_file:
        json.dump([contact.serialize() for contact in raw_contacts], raw_contacts_file)
    log.info(f"Saved {len(raw_contacts)} contacts")