Ejemplo n.º 1
0
    def test_all_files_unparseable_output(self, mock_check_cron):
        """Validation of a folder whose CDM files are all unparseable.

        Every CDM file is uploaded with garbage content; the result CSV
        should mark each file as found but neither parsed nor loaded.
        """
        # TODO possible bug: if no pre-existing table, results in bq table not found error
        folder_prefix = 'dummy-prefix-2018-03-22/'
        # upload a garbage payload under every expected CDM file name
        for file_name in common.CDM_FILES:
            test_util.write_cloud_str(self.hpo_bucket,
                                      folder_prefix + file_name, ".\n .")

        main.app.testing = True
        with main.app.test_client() as c:
            c.get(test_util.VALIDATE_HPO_FILES_URL)

            # the bucket should now contain exactly the uploaded CDM files
            # plus the generated report files
            bucket_listing = gcs_utils.list_bucket(self.hpo_bucket)
            actual_names = set()
            for entry in bucket_listing:
                name = entry['name']
                if name.startswith(folder_prefix):
                    actual_names.add(name)
            expected_names = {
                folder_prefix + name
                for name in common.CDM_FILES + common.IGNORE_LIST
            }
            self.assertSetEqual(actual_names, expected_names)

            # every CDM file should be reported found=1, parsed=0, loaded=0
            result_csv = test_util.read_cloud_file(
                self.hpo_bucket, folder_prefix + common.RESULT_CSV)
            actual = resources._csv_file_to_list(
                StringIO.StringIO(result_csv))
            expected = []
            for cdm_file_name in common.CDM_FILES:
                expected.append({
                    'cdm_file_name': cdm_file_name,
                    'found': '1',
                    'parsed': '0',
                    'loaded': '0'
                })
            self.assertEqual(expected, actual)
Ejemplo n.º 2
0
def get_full_result_log():
    """Build the combined validation result log across all HPO sites.

    Iterates every HPO listed in the sites CSV, fetches its result CSV
    from the site's bucket (skipping sites whose bucket is unavailable),
    and converts each row to a log object via ``hpo_log_item_to_obj``.

    :return: list of result-log objects for all reachable HPO sites
    """
    full_log = []
    for hpo in resources.hpo_csv():
        hpo_id = hpo['hpo_id']
        hpo_bucket = gcs_utils.get_hpo_bucket(hpo_id)

        try:
            # TODO : figure out possible errors and catch specific bucket inexistence error
            obj_metadata = gcs_utils.get_metadata(hpo_bucket, RESULT_CSV)
        except Exception:
            # was a bare `except:`; narrowed so SystemExit/KeyboardInterrupt
            # are no longer swallowed. Intentionally best-effort: skip the
            # site and keep aggregating the rest.
            logging.warning('skipping hpo %s. bucket does not exist.', hpo)
            continue

        if obj_metadata is None:
            # lazy %-style args avoid formatting when the level is disabled
            logging.info('%s was not found in %s', RESULT_CSV, hpo_bucket)
        else:
            hpo_result = gcs_utils.get_object(hpo_bucket, RESULT_CSV)
            hpo_result_file = StringIO.StringIO(hpo_result)
            hpo_result_items = resources._csv_file_to_list(hpo_result_file)
            full_log.extend(
                hpo_log_item_to_obj(hpo_id, item)
                for item in hpo_result_items)
    return full_log
Ejemplo n.º 3
0
    def test_bad_file_names(self, mock_check_cron):
        """Files with unknown names or extensions land in the warnings CSV."""
        folder_prefix = 'dummy-prefix-2018-03-22/'
        bad_file_names = ["person_final.csv",
                          "condition_occurence.csv",  # misspelled
                          "avisit_occurrence.csv",
                          "procedure_occurrence.tsv"]  # unsupported file extension

        uploaded_names = []
        expected_result_items = []
        for bad_name in bad_file_names:
            object_name = folder_prefix + bad_name
            uploaded_names.append(object_name)
            test_util.write_cloud_str(self.hpo_bucket, object_name, ".")
            expected_result_items.append(
                dict(file_name=object_name.split('/')[1],
                     message=main.UNKNOWN_FILE))

        main.app.testing = True
        with main.app.test_client() as c:
            c.get(test_util.VALIDATE_HPO_FILES_URL)

            # bucket should hold the bad files plus the generated report files
            expected_bucket_items = uploaded_names + [
                folder_prefix + item for item in common.IGNORE_LIST
            ]
            # [common.RESULT_CSV, common.WARNINGS_CSV]
            bucket_listing = gcs_utils.list_bucket(self.hpo_bucket)
            actual_bucket_items = [entry['name'] for entry in bucket_listing]
            self.assertSetEqual(set(expected_bucket_items),
                                set(actual_bucket_items))

            # each bad file should be flagged as UNKNOWN_FILE in the warnings
            warnings_csv = test_util.read_cloud_file(
                self.hpo_bucket, folder_prefix + common.WARNINGS_CSV)
            actual_result_items = resources._csv_file_to_list(
                StringIO.StringIO(warnings_csv))
            # sort in order to compare
            expected_result_items.sort()
            actual_result_items.sort()
            self.assertListEqual(expected_result_items, actual_result_items)
Ejemplo n.º 4
0
    def _test_validate_missing_files_output(self, mock_check_cron):
        """With no CDM files uploaded, every required file is reported absent."""
        # enable exception propagation as described at https://goo.gl/LqDgnj
        folder_prefix = 'dummy-prefix-2018-03-22/'
        main.app.testing = True
        with main.app.test_client() as c:
            c.get(test_util.VALIDATE_HPO_FILES_URL)

            # the generated report files should be present under the folder
            bucket_items = gcs_utils.list_bucket(self.hpo_bucket)
            item_names = [item['name'] for item in bucket_items]
            for ignore_file in common.IGNORE_LIST:
                self.assertIn(folder_prefix + ignore_file, item_names)

            # check content of result.csv is correct
            # TODO fix this for all cdm files and use object comparison
            result_csv = test_util.read_cloud_file(
                self.hpo_bucket, folder_prefix + common.RESULT_CSV)
            actual = resources._csv_file_to_list(
                StringIO.StringIO(result_csv))
            expected = []
            for cdm_file_name in common.REQUIRED_FILES:
                expected.append({
                    'cdm_file_name': cdm_file_name,
                    'found': '0',
                    'parsed': '0',
                    'loaded': '0'
                })
            self.assertEqual(expected, actual)
            # nothing was loaded, so the required-files check must fail
            self.assertFalse(
                main.all_required_files_loaded(test_util.FAKE_HPO_ID,
                                               folder_prefix))
Ejemplo n.º 5
0
    def test_errors_csv(self, mock_check_cron):
        """A malformed person.csv produces an entry in the errors CSV."""
        folder_prefix = 'dummy-prefix-2018-03-22/'
        test_util.write_cloud_str(self.hpo_bucket,
                                  folder_prefix + 'person.csv', ".\n .,.,.")

        main.app.testing = True
        with main.app.test_client() as c:
            c.get(test_util.VALIDATE_HPO_FILES_URL)

            # bucket should contain person.csv plus the generated report files
            expected_items = {
                folder_prefix + item
                for item in ['person.csv'] + common.IGNORE_LIST
            }
            bucket_listing = gcs_utils.list_bucket(self.hpo_bucket)
            actual_items = {
                entry['name'] for entry in bucket_listing
                if entry['name'].startswith(folder_prefix)
            }
            self.assertSetEqual(actual_items, expected_items)

            # the errors CSV should flag person.csv with an error row
            errors_csv = test_util.read_cloud_file(
                self.hpo_bucket, folder_prefix + common.ERRORS_CSV)
            actual = resources._csv_file_to_list(
                StringIO.StringIO(errors_csv))
            # drop the free-text message column before comparing
            for row in actual:
                row.pop('message', None)
            expected = [{'file_name': 'person.csv', 'type': 'error'}]
            self.assertEqual(actual, expected)
Ejemplo n.º 6
0
    def test_validate_five_persons_success(self, mock_check_cron):
        """End-to-end happy path: the five-person data set loads fully.

        Uploads the five_persons fixture files, runs validation, and
        checks bucket contents, the result CSV, and table clustering.
        """
        prefix = 'dummy-prefix-2018-03-22/'
        expected_result_items = resources._csv_to_list(
            test_util.FIVE_PERSONS_SUCCESS_RESULT_CSV)
        json_export_files = self.get_json_export_files(test_util.FAKE_HPO_ID)

        # upload all five_persons files
        for cdm_file in test_util.FIVE_PERSONS_FILES:
            test_util.write_cloud_file(self.hpo_bucket, cdm_file,
                                       prefix=prefix)

        expected_tables = [
            'person', 'visit_occurrence', 'condition_occurrence',
            'procedure_occurrence', 'drug_exposure', 'measurement'
        ]
        cdm_files = ['%s.csv' % table for table in expected_tables]

        main.app.testing = True
        with main.app.test_client() as c:
            c.get(test_util.VALIDATE_HPO_FILES_URL)

            # bucket should contain the CDM files, the report files and
            # the json export files
            expected_object_names = (
                cdm_files + common.IGNORE_LIST + json_export_files)
            expected_objects = [
                prefix + name for name in expected_object_names
            ]

            bucket_listing = gcs_utils.list_bucket(self.hpo_bucket)
            actual_objects = [entry['name'] for entry in bucket_listing]
            self.assertSetEqual(set(expected_objects), set(actual_objects))

            # result says file found, parsed, loaded
            result_csv = test_util.read_cloud_file(
                self.hpo_bucket, prefix + common.RESULT_CSV)
            actual_result_items = resources._csv_file_to_list(
                StringIO.StringIO(result_csv))

            expected_result_items.sort()
            actual_result_items.sort()
            self.assertListEqual(expected_result_items, actual_result_items)
            self.assertTrue(
                main.all_required_files_loaded(test_util.FAKE_HPO_ID,
                                               folder_prefix=prefix))

        # check tables exist and are clustered as expected
        for table in expected_tables:
            fields_file = os.path.join(resources.fields_path, table + '.json')
            table_id = bq_utils.get_table_id(test_util.FAKE_HPO_ID, table)
            table_info = bq_utils.get_table_info(table_id)
            with open(fields_file, 'r') as fp:
                field_names = [field['name'] for field in json.load(fp)]
            # tables carrying person_id should be clustered on it
            if 'person_id' in field_names:
                self.table_has_clustering(table_info)
Ejemplo n.º 7
0
def all_required_files_loaded(hpo_id, folder_prefix):
    """Determine whether every required CDM file for an HPO was loaded.

    Reads the result CSV from the HPO's bucket and checks the 'loaded'
    flag of each row whose file name is in the required list.

    :param hpo_id: identifier of the HPO site
    :param folder_prefix: submission folder path, including trailing slash
    :return: True if every required file has loaded == '1', else False
    """
    bucket = gcs_utils.get_hpo_bucket(hpo_id)
    result_contents = gcs_utils.get_object(bucket,
                                           folder_prefix + common.RESULT_CSV)
    result_items = resources._csv_file_to_list(
        StringIO.StringIO(result_contents))
    # vacuously True when no required-file rows are present, matching
    # the original loop's behavior
    return all(
        item['loaded'] == '1'
        for item in result_items
        if item['file_name'] in common.REQUIRED_FILES)
Ejemplo n.º 8
0
    def test_validate_five_persons_success(self, mock_check_cron):
        """Uploading the five-person data set yields a fully loaded result."""
        prefix = 'dummy-prefix-2018-03-22/'
        expected_result_items = resources._csv_to_list(
            test_util.FIVE_PERSONS_SUCCESS_RESULT_CSV)
        json_export_files = self.get_json_export_files(test_util.FAKE_HPO_ID)

        # upload all five_persons files
        for cdm_file in test_util.FIVE_PERSONS_FILES:
            test_util.write_cloud_file(self.hpo_bucket, cdm_file,
                                       prefix=prefix)

        main.app.testing = True
        with main.app.test_client() as c:
            c.get(test_util.VALIDATE_HPO_FILES_URL)

            # check the result file was put in bucket.
            # want to keep this test the same, so the old required files
            # are appended alongside the current required list.
            legacy_required = [
                'measurement.csv', 'procedure_occurrence.csv',
                'drug_exposure.csv', 'condition_occurrence.csv',
                'visit_occurrence.csv'
            ]
            expected_bucket_items = [
                prefix + item
                for item in (common.REQUIRED_FILES + common.IGNORE_LIST +
                             json_export_files + legacy_required)
            ]

            bucket_listing = gcs_utils.list_bucket(self.hpo_bucket)
            actual_bucket_items = [entry['name'] for entry in bucket_listing]
            self.assertSetEqual(set(expected_bucket_items),
                                set(actual_bucket_items))

            # result says file found, parsed, loaded
            result_csv = test_util.read_cloud_file(
                self.hpo_bucket, prefix + common.RESULT_CSV)
            actual_result_items = resources._csv_file_to_list(
                StringIO.StringIO(result_csv))

            expected_result_items.sort()
            actual_result_items.sort()
            self.assertListEqual(expected_result_items, actual_result_items)
            self.assertTrue(
                main.all_required_files_loaded(test_util.FAKE_HPO_ID,
                                               folder_prefix=prefix))
Ejemplo n.º 9
0
    def test_pii_files_loaded(self, mock_check_cron):
        """PII files uploaded alongside CDM data appear in the result CSV."""
        # tests if pii files are loaded
        folder_prefix = 'dummy-prefix-2018-03-22/'
        expected_result_items = resources._csv_to_list(
            test_util.PII_FILE_LOAD_RESULT_CSV)
        # upload one well-formed PII file and one with a bad person id
        pii_files = (test_util.PII_NAME_FILE,
                     test_util.PII_MRN_BAD_PERSON_ID_FILE)
        for pii_file in pii_files:
            test_util.write_cloud_file(self.hpo_bucket,
                                       pii_file,
                                       prefix=folder_prefix)

        main.app.testing = True
        with main.app.test_client() as c:
            c.get(test_util.VALIDATE_HPO_FILES_URL)
            result_csv = test_util.read_cloud_file(
                self.hpo_bucket, folder_prefix + common.RESULT_CSV)
            actual_result_items = resources._csv_file_to_list(
                StringIO.StringIO(result_csv))
            # sort in order to compare
            expected_result_items.sort()
            actual_result_items.sort()
            self.assertListEqual(expected_result_items, actual_result_items)