Example #1
    def test_import_json_to_traced_data_iterable(self):
        file_path = "tests/traced_data/resources/json_export_expected.json"
        expected = self.generate_test_data()

        with open(file_path, "r") as f:
            imported = list(TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f))

        self.assertListEqual(expected, imported)
Example #2
def load_datasets(flow_names):
    datasets = []
    for i, flow_name in enumerate(flow_names):
        raw_flow_path = f"{raw_data_dir}/{flow_name}.jsonl"
        log.info(f"Loading {i + 1}/{len(flow_names)}: {raw_flow_path}...")
        with open(raw_flow_path, "r") as f:
            runs = TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)
        log.info(f"Loaded {len(runs)} runs")
        datasets.append(runs)
    return datasets
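A minimal usage sketch for the helper above, which assumes the enclosing script supplies a log object and a raw_data_dir variable. Here the standard logging module stands in for the project's own logger, and the directory and flow names are hypothetical:

import logging

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)  # stand-in for the pipeline's own logger

raw_data_dir = "data/raw"  # hypothetical directory holding one <flow_name>.jsonl export per flow

# Each entry of the returned list holds the runs loaded from one flow export.
datasets = load_datasets(["s01e01_activation", "demographics"])  # hypothetical flow names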
Example #3
    def test_flush_history_from_traced_data_iterable(self):
        history_file_path = path.join(self.test_dir, "flush_test_history.jsonl")

        data = self.generate_test_data()
        data_0_sha = data[0].get_sha()
        self.assertEqual(len(data[1].get_history("Gender")), 2)

        TracedDataJsonIO.flush_history_from_traced_data_iterable("test_user", data, history_file_path)

        self.assertTrue(filecmp.cmp(history_file_path, "tests/traced_data/resources/flush_history_expected_history.jsonl"))
        self.assertEqual(len(data[1].get_history("Gender")), 1)
        self.assertEqual(data[0]["_PrevTracedDataSHA"], data_0_sha)

        # Test the remaining data can be round-tripped
        latest_file_path = path.join(self.test_dir, "flush_test_latest.jsonl")
        with open(latest_file_path, "w") as f:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(data, f)
        with open(latest_file_path, "r") as f:
            TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)
Example #4
    def test_round_trip(self):
        expected = self.generate_test_data()
        temp_file = tempfile.NamedTemporaryFile()

        with open(temp_file.name, "w") as f:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(expected, f)

        with open(temp_file.name, "r") as f:
            imported = list(TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f))

        self.assertEqual(len(expected), len(imported))
        for x, y in zip(expected, imported):
            x_attributes = {k: getattr(x, k) for k in dir(x)
                            if not k.startswith("__") and not callable(getattr(x, k))
                            and k != "_cache"}
            y_attributes = {k: getattr(y, k) for k in dir(y)
                            if not k.startswith("__") and not callable(getattr(y, k))
                            and k != "_cache"}

            self.assertDictEqual(x_attributes, y_attributes)
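The attribute comparison above also covers internal fields such as the traced history. When only the current key/value view of each TracedData matters, a lighter check can compare the plain-dict projections, mirroring the dict(td.items()) conversion used in Example #5 below. A minimal sketch; the helper name is hypothetical:

def current_values_equal(expected, imported):
    # Compare only the latest values each TracedData exposes, not its history.
    if len(expected) != len(imported):
        return False
    return all(dict(x.items()) == dict(y.items()) for x, y in zip(expected, imported))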
Example #5
        f"{automated_analysis_output_dir}/maps/mogadishu")
    IOUtils.ensure_dirs_exist(f"{automated_analysis_output_dir}/graphs")

    log.info("Loading Pipeline Configuration File...")
    with open(pipeline_configuration_file_path) as f:
        pipeline_configuration = PipelineConfiguration.from_configuration_file(f)
    Logger.set_project_name(pipeline_configuration.pipeline_name)
    log.debug(f"Pipeline name is {pipeline_configuration.pipeline_name}")

    sys.setrecursionlimit(30000)
    # Read the messages dataset
    log.info(f"Loading the messages dataset from {messages_json_input_path}...")
    with open(messages_json_input_path) as f:
        messages = TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)
        for i in range(len(messages)):
            messages[i] = dict(messages[i].items())
    log.info(f"Loaded {len(messages)} messages")

    # Read the individuals dataset
    log.info(f"Loading the individuals dataset from {individuals_json_input_path}...")
    with open(individuals_json_input_path) as f:
        individuals = TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)
        for i in range(len(individuals)):
            individuals[i] = dict(individuals[i].items())
    log.info(f"Loaded {len(individuals)} individuals")

    # Compute the number of messages, individuals, and relevant messages per episode and overall.
Example #6
        google_cloud_utils.download_blob_to_string(
            google_cloud_credentials_file_path,
            pipeline_configuration.phone_number_uuid_table.firebase_credentials_file_url))

    phone_number_uuid_table = FirestoreUuidTable(
        pipeline_configuration.phone_number_uuid_table.table_name,
        firestore_uuid_table_credentials, "avf-phone-uuid-")
    log.info("Initialised the Firestore UUID table")

    uuids = set()
    skipped_nr = 0
    for path in traced_data_paths:
        # Load the traced data
        log.info(f"Loading previous traced data from file '{path}'...")
        with open(path) as f:
            data = TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)
        log.info(f"Loaded {len(data)} traced data objects")

        for td in data:
            if td["consent_withdrawn"] == Codes.TRUE:
                continue

            uuids.add(td["uid"])
    log.info(
        f"Loaded {len(uuids)} uuids from TracedData (skipped {skipped_nr} items with an NR property)"
    )

    if exclusion_list_file_path is not None:
        # Load the exclusion list
        log.info(
            f"Loading the exclusion list from {exclusion_list_file_path}...")
Example #7
    if pipeline_configuration.drive_upload is not None:
        log.info(f"Downloading Google Drive service account credentials...")
        credentials_info = json.loads(
            google_cloud_utils.download_blob_to_string(
                google_cloud_credentials_file_path,
                pipeline_configuration.drive_upload.drive_credentials_file_url))
        drive_client_wrapper.init_client_from_info(credentials_info)

    # Load messages
    messages_datasets = []
    for i, activation_flow_name in enumerate(
            pipeline_configuration.activation_flow_names):
        raw_activation_path = f"{raw_data_dir}/{activation_flow_name}.jsonl"
        log.info(f"Loading {raw_activation_path}...")
        with open(raw_activation_path, "r") as f:
            messages = TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)
        log.info(f"Loaded {len(messages)} messages")
        messages_datasets.append(messages)

    log.info("Loading surveys datasets:")
    surveys_datasets = []
    for i, survey_flow_name in enumerate(
            pipeline_configuration.survey_flow_names):
        raw_survey_path = f"{raw_data_dir}/{survey_flow_name}.jsonl"
        log.info(f"Loading {raw_survey_path}...")
        with open(raw_survey_path, "r") as f:
            contacts = TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)
        log.info(f"Loaded {len(contacts)} contacts")
        surveys_datasets.append(contacts)

    # Add survey data to the messages