Example No. 1
def main(args=None):
    """
    :param args: list of all the arguments to apply the cleaning rules
    :return:
    """
    args, kwargs = fetch_args_kwargs(args)

    rules = DATA_STAGE_RULES_MAPPING[args.data_stage.value]
    validate_custom_params(rules, **kwargs)

    if args.list_queries:
        clean_engine.add_console_logging()
        query_list = clean_engine.get_query_list(
            project_id=args.project_id,
            dataset_id=args.dataset_id,
            sandbox_dataset_id=args.sandbox_dataset_id,
            rules=rules,
            **kwargs)
        for query in query_list:
            LOGGER.info(query)
    else:
        clean_engine.add_console_logging(args.console_log)
        clean_engine.clean_dataset(project_id=args.project_id,
                                   dataset_id=args.dataset_id,
                                   sandbox_dataset_id=args.sandbox_dataset_id,
                                   rules=rules,
                                   **kwargs)
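fetch_args_kwargs is defined elsewhere in the cleaning engine. As a rough, hypothetical sketch of the split it appears to perform, known flags could be parsed into a namespace while any leftover --key value pairs become keyword arguments for the rules. The flag names below are assumptions, and the real parser also maps data_stage to an enum (which is why args.data_stage.value is used above); this sketch leaves it as a plain string.

import argparse


def fetch_args_kwargs_sketch(argv=None):
    # Hypothetical stand-in for the real fetch_args_kwargs helper.
    arg_parser = argparse.ArgumentParser()
    # Assumed flags; the project's args_parser defines the actual options.
    arg_parser.add_argument('--project_id', required=True)
    arg_parser.add_argument('--dataset_id', required=True)
    arg_parser.add_argument('--sandbox_dataset_id', required=True)
    arg_parser.add_argument('--data_stage', required=True)
    arg_parser.add_argument('--list_queries', action='store_true')
    arg_parser.add_argument('--console_log', action='store_true')
    args, extra = arg_parser.parse_known_args(argv)
    # Pair any leftover tokens, e.g. ['--cutoff_date', '2022-01-01'], into kwargs.
    kwargs = dict(zip((key.lstrip('-') for key in extra[::2]), extra[1::2]))
    return args, kwargs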
Example No. 2
def clean_combined_de_identified_clean_dataset(project_id=None,
                                               dataset_id=None):
    """
    Run all cleaning rules defined for the de-identified EHR and RDR clean dataset.
    :param project_id:  Name of the BigQuery project.
    :param dataset_id:  Name of the dataset to clean
    """
    if project_id is None:
        project_id = app_identity.get_application_id()
        LOGGER.info('Project is unspecified.  Using default value of:\t%s',
                    project_id)

    if dataset_id is None:
        dataset_id = bq_utils.get_combined_deid_clean_dataset_id()
        LOGGER.info('Dataset is unspecified.  Using default value of:\t%s',
                    dataset_id)

    sandbox_dataset_id = sandbox.create_sandbox_dataset(project_id=project_id,
                                                        dataset_id=dataset_id)

    query_list = _gather_combined_de_identified_clean_queries(
        project_id, dataset_id, sandbox_dataset_id)

    LOGGER.info("Cleaning de-identified dataset")
    clean_engine.clean_dataset(project_id, query_list, stage.DEID_CLEAN)
Example No. 3
    def test_execute_queries(self):
        gender_nonbinary_concept_id = 1585841
        gender_nonbinary_source_concept_id = 123
        sex_female_concept_id = 1585847
        sex_female_source_concept_id = 45878463
        for tmpl in INSERT_FAKE_PARTICIPANTS_TMPLS:
            query = tmpl.render(
                project_id=self.project_id,
                dataset_id=self.dataset_id,
                gender_concept_id=repopulate_person_post_deid.GENDER_CONCEPT_ID,
                gender_nonbinary_concept_id=gender_nonbinary_concept_id,
                gender_nonbinary_source_concept_id=gender_nonbinary_source_concept_id,
                sex_at_birth_concept_id=repopulate_person_post_deid.SEX_AT_BIRTH_CONCEPT_ID,
                sex_female_concept_id=sex_female_concept_id,
                sex_female_source_concept_id=sex_female_source_concept_id)
            try:
                resp = bq_utils.query(query)
            except HttpError as e:
                self.fail("failed to execute query '{}': {}".format(
                    query, e.content))
            self.assertTrue(resp["jobComplete"])

        clean_cdr_engine.clean_dataset(
            self.project_id, self.dataset_id, self.sandbox_dataset_id,
            [(repopulate_person_post_deid.
              get_repopulate_person_post_deid_queries,)])

        rows = bq_utils.response2rows(
            bq_utils.query("SELECT * FROM `{}.{}.person`".format(
                self.project_id, self.dataset_id)))
        self.assertEqual(len(rows), 2)

        by_participant = {r["person_id"]: r for r in rows}
        self.assertPersonFields(
            by_participant[1], {
                "gender_concept_id": gender_nonbinary_concept_id,
                "gender_source_value": "nonbinary_src",
                "gender_source_concept_id": gender_nonbinary_source_concept_id,
                "sex_at_birth_concept_id": sex_female_concept_id,
                "sex_at_birth_source_value": "female_src",
                "sex_at_birth_source_concept_id": sex_female_source_concept_id
            })
        self.assertPersonFields(
            by_participant[2], {
                "gender_concept_id": 0,
                "gender_source_value": "No matching concept",
                "gender_source_concept_id": 0,
                "sex_at_birth_concept_id": 0,
                "sex_at_birth_source_value": "No matching concept",
                "sex_at_birth_source_concept_id": 0
            })
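INSERT_FAKE_PARTICIPANTS_TMPLS is defined outside this snippet. Assuming its entries are Jinja2 templates for test INSERT statements, the render pattern used above can be illustrated with a toy template like the one below; the target table, columns, and values are placeholders, not the project's actual fixtures.

from jinja2 import Template

# Purely illustrative template; the real ones live in the test module.
EXAMPLE_TMPL = Template("""
INSERT INTO `{{project_id}}.{{dataset_id}}.observation`
    (person_id, observation_concept_id, value_source_concept_id, value_source_value)
VALUES
    (1, {{gender_concept_id}}, {{gender_nonbinary_source_concept_id}}, 'nonbinary_src')
""")

rendered_sql = EXAMPLE_TMPL.render(
    project_id='my-project',
    dataset_id='my_dataset',
    gender_concept_id=0,  # placeholder; the test passes GENDER_CONCEPT_ID
    gender_nonbinary_source_concept_id=123)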
Example No. 4
def clean_ehr_rdr_de_identified_dataset(project_id=None, dataset_id=None):
    """
    Run all cleaning rules defined for the de-identified EHR and RDR dataset.

    :param project_id:  Name of the BigQuery project.
    :param dataset_id:  Name of the dataset to clean
    """
    if dataset_id is None or dataset_id == '' or dataset_id.isspace():
        dataset_id = bq_utils.get_combined_deid_dataset_id()
        LOGGER.info('Dataset is unspecified.  Using default value of:\t%s', dataset_id)

    query_list = _gather_ehr_rdr_de_identified_queries(project_id, dataset_id)

    LOGGER.info("Cleaning de-identified dataset")
    clean_engine.clean_dataset(project_id, dataset_id, query_list)
Example No. 5
    def test_execute_queries(self):
        project_id = bq_utils.app_identity.get_application_id()
        dataset_id = bq_utils.get_combined_dataset_id()
        sandbox_id = bq_utils.get_unioned_dataset_id()
        test_util.delete_all_tables(dataset_id)

        create_tables = (
            ['person'] + common.CLINICAL_DATA_TABLES +
            ['_mapping_' + t for t in common.MAPPED_CLINICAL_DATA_TABLES])
        # TODO(calbach): Make the setup/teardown of these concept tables hermetic.
        for tbl in ['concept', 'concept_ancestor']:
            if not bq_utils.table_exists(tbl, dataset_id=dataset_id):
                create_tables.append(tbl)
        for tbl in create_tables:
            bq_utils.create_standard_table(tbl,
                                           tbl,
                                           dataset_id=dataset_id,
                                           force_all_nullable=True)

        for tmpl in INSERT_FAKE_PARTICIPANTS_TMPLS:
            resp = bq_utils.query(
                tmpl.render(project_id=project_id,
                            dataset_id=dataset_id,
                            rdr_basics_concept_id=123,
                            rdr_consent_concept_id=345,
                            ehr_obs_concept_id=567,
                            rdr_basics_module_concept_id=drop_participants_without_ppi_or_ehr.BASICS_MODULE_CONCEPT_ID))
            self.assertTrue(resp["jobComplete"])

        clean_cdr_engine.clean_dataset(
            project_id, dataset_id, sandbox_id,
            [(drop_participants_without_ppi_or_ehr.get_queries, )])

        def table_to_person_ids(t):
            rows = bq_utils.response2rows(
                bq_utils.query("SELECT person_id FROM `{}.{}.{}`".format(
                    project_id, dataset_id, t)))
            return set([r["person_id"] for r in rows])

        # We expect participants 1, 5 to have been removed from all tables.
        self.assertEqual(set([2, 3, 4, 6]), table_to_person_ids("person"))
        self.assertEqual(set([2, 4, 6]), table_to_person_ids("observation"))
        self.assertEqual(set([3, 4]), table_to_person_ids("drug_exposure"))

        test_util.delete_all_tables(dataset_id)
Example No. 6
    def test_clean_dataset(self, mock_bq_utils, mock_wait_on_jobs,
                           mock_job_status_errored,
                           mock_format_failure_message):

        mock_bq_utils.side_effect = [
            self.job_results_success, self.job_results_failure
        ]
        mock_wait_on_jobs.return_value = []
        mock_job_status_errored.side_effect = [(False, None),
                                               (True,
                                                self.exception_statement_one)]

        clean_cdr_engine.clean_dataset(self.project, self.statements)

        self.assertEqual(mock_bq_utils.call_count, len(self.statements))
        self.assertEqual(mock_format_failure_message.call_count, 1)

        mock_bq_utils.assert_any_call(
            self.statement_one.get(cdr_consts.QUERY),
            use_legacy_sql=False,
            destination_table_id=self.statement_one.get(
                cdr_consts.DESTINATION_TABLE),
            retry_count=bq_consts.BQ_DEFAULT_RETRY_COUNT,
            write_disposition=self.statement_one.get(cdr_consts.DISPOSITION),
            destination_dataset_id=self.statement_one.get(
                cdr_consts.DESTINATION_DATASET),
            batch=None)

        mock_bq_utils.assert_any_call(
            self.statement_two.get(cdr_consts.QUERY),
            use_legacy_sql=False,
            destination_table_id=self.statement_two.get(
                cdr_consts.DESTINATION_TABLE),
            retry_count=bq_consts.BQ_DEFAULT_RETRY_COUNT,
            write_disposition=self.statement_two.get(cdr_consts.DISPOSITION),
            destination_dataset_id=self.statement_two.get(
                cdr_consts.DESTINATION_DATASET),
            batch=None)
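self.statement_one and self.statement_two come from the test fixture, which is not shown here. Judging from the keys read in the assertions above, each is presumably a query-specification dictionary along these lines; the string keys below merely stand in for the cdr_consts constants (QUERY, DESTINATION_TABLE, DESTINATION_DATASET, DISPOSITION) and every value is a placeholder.

# Hypothetical query specification; real specs are produced by the cleaning rules.
statement_one = {
    'query': 'SELECT * FROM `my-project.my_dataset.observation` WHERE observation_id > 0',
    'destination_table_id': 'observation',
    'destination_dataset_id': 'my_sandbox_dataset',
    'write_disposition': 'WRITE_TRUNCATE',
}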
Example No. 7
    def test_clean_dataset_exceptions(self, mock_bq_utils, mock_wait_on_jobs,
                                      mock_job_status_errored,
                                      mock_format_failure_message):

        # Test the case where BigQuery throws an error
        mock_bq_utils.side_effect = HttpError(
            mock.Mock(return_value={'status': 404}),
            self.exception_statement_one)

        clean_cdr_engine.clean_dataset(self.project, self.statements)

        self.assertEqual(mock_wait_on_jobs.call_count, 0)
        self.assertEqual(mock_job_status_errored.call_count, 0)
        self.assertEqual(mock_format_failure_message.call_count, 2)

        # Test the case where there is an incomplete job
        mock_bq_utils.reset_mock()
        mock_format_failure_message.reset_mock()

        mock_bq_utils.side_effect = [self.job_results_success]
        mock_wait_on_jobs.return_value = [self.job_id_success]

        with self.assertRaises(bq_utils.BigQueryJobWaitError):
            clean_cdr_engine.clean_dataset(self.project, self.statements)
Example No. 8
    import cdr_cleaner.args_parser as parser
    import cdr_cleaner.clean_cdr_engine as clean_engine

    combined_dataset_arg = {
        parser.SHORT_ARGUMENT: '-c',
        parser.LONG_ARGUMENT: '--combined_dataset_id',
        parser.ACTION: 'store',
        parser.DEST: 'combined_dataset_id',
        parser.HELP: 'Identifies the combined dataset',
        parser.REQUIRED: True
    }

    ARGS = parser.default_parse_args([combined_dataset_arg])

    if ARGS.list_queries:
        clean_engine.add_console_logging()
        query_list = clean_engine.get_query_list(
            ARGS.project_id,
            ARGS.dataset_id,
            ARGS.sandbox_dataset_id, [(RemoveFitbitDataIfMaxAgeExceeded, )],
            combined_dataset_id=ARGS.combined_dataset_id)
        for query in query_list:
            LOGGER.info(query)
    else:
        clean_engine.add_console_logging(ARGS.console_log)
        clean_engine.clean_dataset(
            ARGS.project_id,
            ARGS.dataset_id,
            ARGS.sandbox_dataset_id, [(RemoveFitbitDataIfMaxAgeExceeded, )],
            combined_dataset_id=ARGS.combined_dataset_id)
Example No. 9
                    project=self.project_id,
                    dataset=self.dataset_id,
                    domain_table=table,
                    string_fields=string_fields)
                result = client.query(validation_query).result()
                if result.total_rows > 0:
                    raise RuntimeError(
                        f'{table} has {result.total_rows} records that have non-null string values'
                    )


if __name__ == '__main__':
    import cdr_cleaner.args_parser as parser
    import cdr_cleaner.clean_cdr_engine as clean_engine

    ARGS = parser.default_parse_args()

    if ARGS.list_queries:
        clean_engine.add_console_logging()
        query_list = clean_engine.get_query_list(ARGS.project_id,
                                                 ARGS.dataset_id,
                                                 ARGS.sandbox_dataset_id,
                                                 [(StringFieldsSuppression, )])
        for query in query_list:
            LOGGER.info(query)
    else:
        clean_engine.add_console_logging(ARGS.console_log)
        clean_engine.clean_dataset(ARGS.project_id, ARGS.dataset_id,
                                   ARGS.sandbox_dataset_id,
                                   [(StringFieldsSuppression, )])
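The validation_query rendered at the top of this fragment comes from a template that is not shown. Assuming the rule's validation simply looks for rows whose string fields were not nulled out, one way to build such a check is sketched below; the table and field names are placeholders.

# Hypothetical check for leftover string values after suppression.
string_fields = ['value_as_string', 'value_source_value']  # placeholder field names
validation_query = (
    'SELECT * FROM `{project}.{dataset}.{domain_table}` WHERE {checks}'.format(
        project='my-project',
        dataset='my_dataset',
        domain_table='observation',
        checks=' OR '.join(f'{field} IS NOT NULL' for field in string_fields)))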
Example No. 10
    }

    mapping_table_arg = {
        parser.SHORT_ARGUMENT: '-t',
        parser.LONG_ARGUMENT: '--mapping_table_id',
        parser.ACTION: 'store',
        parser.DEST: 'mapping_table_id',
        parser.HELP: 'Identifies the pid-rid map table, typically _deid_map',
        parser.REQUIRED: True
    }

    ARGS = parser.default_parse_args([mapping_dataset_arg, mapping_table_arg])

    if ARGS.list_queries:
        clean_engine.add_console_logging()
        query_list = clean_engine.get_query_list(
            ARGS.project_id,
            ARGS.dataset_id,
            ARGS.sandbox_dataset_id, [(PIDtoRID, )],
            mapping_dataset_id=ARGS.mapping_dataset_id,
            mapping_table_id=ARGS.mapping_table_id)
        for query in query_list:
            LOGGER.info(query)
    else:
        clean_engine.add_console_logging(ARGS.console_log)
        clean_engine.clean_dataset(ARGS.project_id,
                                   ARGS.dataset_id,
                                   ARGS.sandbox_dataset_id, [(PIDtoRID, )],
                                   mapping_dataset_id=ARGS.mapping_dataset_id,
                                   mapping_table_id=ARGS.mapping_table_id)
Example No. 11
        A helper function to retrieve the sandbox table name for the affected_table
        :param affected_table: name of a table affected by this cleaning rule
        :return: the sandbox table name for affected_table (see the sketch at the
            end of this example)
        """
        if affected_table not in self._affected_tables:
            raise LookupError(
                f'{affected_table} is not defined as an affected table in {self._affected_tables}'
            )
        return f'{"_".join(self._issue_numbers).lower()}_{affected_table}'


if __name__ == '__main__':
    import cdr_cleaner.args_parser as parser
    import cdr_cleaner.clean_cdr_engine as clean_engine

    ARGS = parser.parse_args()

    if ARGS.list_queries:
        clean_engine.add_console_logging()
        query_list = clean_engine.get_query_list(ARGS.project_id,
                                                 ARGS.dataset_id,
                                                 ARGS.sandbox_dataset_id,
                                                 [(FixUnmappedSurveyAnswers,)])
        for query in query_list:
            LOGGER.info(query)
    else:
        clean_engine.add_console_logging(ARGS.console_log)
        clean_engine.clean_dataset(ARGS.project_id, ARGS.dataset_id,
                                   ARGS.sandbox_dataset_id,
                                   [(FixUnmappedSurveyAnswers,)])
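For illustration, the sandbox-table naming done by the helper at the top of this example works out as follows; the issue numbers and table name are hypothetical.

# Hypothetical inputs showing only the string construction performed by the helper.
issue_numbers = ['DC1234', 'DC2345']
affected_table = 'observation'
sandbox_table = f'{"_".join(issue_numbers).lower()}_{affected_table}'
# sandbox_table == 'dc1234_dc2345_observation'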
Example No. 12
    queries_list = []

    query = dict()
    query[cdr_consts.QUERY] = REMOVE_ADDITIONAL_RESPONSES_OTHER_THAN_NOT.format(
        dataset=dataset_id,
        project=project_id,
    )
    queries_list.append(query)

    return queries_list


if __name__ == '__main__':
    import cdr_cleaner.args_parser as parser
    import cdr_cleaner.clean_cdr_engine as clean_engine

    ARGS = parser.parse_args()

    if ARGS.list_queries:
        clean_engine.add_console_logging()
        query_list = clean_engine.get_query_list(
            ARGS.project_id, ARGS.dataset_id, ARGS.sandbox_dataset_id,
            [(get_remove_multiple_race_ethnicity_answers_queries,)])
        for query in query_list:
            LOGGER.info(query)
    else:
        clean_engine.add_console_logging(ARGS.console_log)
        clean_engine.clean_dataset(
            ARGS.project_id, ARGS.dataset_id, ARGS.sandbox_dataset_id,
            [(get_remove_multiple_race_ethnicity_answers_queries,)])
Example No. 13
        Run required steps for validation setup
        """
        raise NotImplementedError("Please fix me.")

    def validate_rule(self, client, *args, **keyword_args):
        """
        Validates the cleaning rule which deletes or updates the data from the tables
        """
        raise NotImplementedError("Please fix me.")


if __name__ == '__main__':
    import cdr_cleaner.args_parser as parser
    import cdr_cleaner.clean_cdr_engine as clean_engine

    ARGS = parser.parse_args()
    pipeline_logging.configure(level=logging.DEBUG, add_console_handler=True)

    if ARGS.list_queries:
        clean_engine.add_console_logging()
        query_list = clean_engine.get_query_list(
            ARGS.project_id, ARGS.dataset_id, ARGS.sandbox_dataset_id,
            [(ExplicitIdentifierSuppression, )])
        for query in query_list:
            LOGGER.info(query)
    else:
        clean_engine.add_console_logging(ARGS.console_log)
        clean_engine.clean_dataset(ARGS.project_id, ARGS.dataset_id,
                                   ARGS.sandbox_dataset_id,
                                   [(ExplicitIdentifierSuppression, )])
Example No. 14
    def validate_rule(self):
        """
        Validate the cleaning rule which deletes or updates the data from the tables
        """

        raise NotImplementedError("Please fix me.")

    def get_sandbox_tablenames(self):
        return [HEIGHT_TABLE, WEIGHT_TABLE, NEW_HEIGHT_ROWS, NEW_WEIGHT_ROWS]


if __name__ == '__main__':
    import cdr_cleaner.clean_cdr_engine as clean_engine
    import cdr_cleaner.args_parser as parser

    ARGS = parser.parse_args()

    if ARGS.list_queries:
        clean_engine.add_console_logging()
        query_list = clean_engine.get_query_list(ARGS.project_id,
                                                 ARGS.dataset_id,
                                                 ARGS.sandbox_dataset_id,
                                                 [(CleanHeightAndWeight,)])
        for query in query_list:
            LOGGER.info(query)
    else:
        clean_engine.add_console_logging(ARGS.console_log)
        clean_engine.clean_dataset(ARGS.project_id, ARGS.dataset_id,
                                   ARGS.sandbox_dataset_id,
                                   [(CleanHeightAndWeight,)])
Example No. 15
        raise NotImplementedError("Please fix me.")

    def get_sandbox_tablenames(self):
        """
        Generates list of sandbox table names created by this rule.
        """
        return [
            self.sandbox_table_for(table) for table in self.affected_tables
        ]


if __name__ == '__main__':
    import cdr_cleaner.args_parser as parser
    import cdr_cleaner.clean_cdr_engine as clean_engine

    ext_parser = parser.get_argument_parser()
    ARGS = ext_parser.parse_args()

    if ARGS.list_queries:
        clean_engine.add_console_logging()
        query_list = clean_engine.get_query_list(
            ARGS.project_id, ARGS.dataset_id, ARGS.sandbox_dataset_id,
            [(RemoveEhrDataWithoutConsent, )])
        for query in query_list:
            LOGGER.info(query)
    else:
        clean_engine.add_console_logging(ARGS.console_log)
        clean_engine.clean_dataset(ARGS.project_id, ARGS.dataset_id,
                                   ARGS.sandbox_dataset_id, ARGS.cutoff_date,
                                   [(RemoveEhrDataWithoutConsent, )])
Example No. 16
        Validates the cleaning rule which deletes or updates the data from the tables
        """
        pass

    def get_sandbox_table_name(self):
        return f'{self._issue_numbers[0].lower()}_measurement'

    def get_sandbox_tablenames(self):
        return [self.get_sandbox_table_name()]


if __name__ == '__main__':
    import cdr_cleaner.args_parser as parser
    import cdr_cleaner.clean_cdr_engine as clean_engine

    ARGS = parser.parse_args()

    if ARGS.list_queries:
        clean_engine.add_console_logging()
        query_list = clean_engine.get_query_list(ARGS.project_id,
                                                 ARGS.dataset_id,
                                                 ARGS.sandbox_dataset_id,
                                                 [(UnitNormalization, )])
        for query in query_list:
            LOGGER.info(query)
    else:
        clean_engine.add_console_logging(ARGS.console_log)
        clean_engine.clean_dataset(ARGS.project_id, ARGS.dataset_id,
                                   ARGS.sandbox_dataset_id,
                                   [(UnitNormalization, )])
Example No. 17
        Run required steps for validation setup
        """
        raise NotImplementedError("Please fix me.")

    def validate_rule(self, client, *args, **keyword_args):
        """
        Validates the cleaning rule which deletes or updates the data from the tables
        """
        raise NotImplementedError("Please fix me.")


if __name__ == '__main__':
    import cdr_cleaner.args_parser as parser
    import cdr_cleaner.clean_cdr_engine as clean_engine

    ARGS = parser.parse_args()

    if ARGS.list_queries:
        clean_engine.add_console_logging()
        query_list = clean_engine.get_query_list(ARGS.project_id,
                                                 ARGS.dataset_id,
                                                 ARGS.sandbox_dataset_id,
                                                 [(ValidDeathDates, )])
        for query in query_list:
            LOGGER.info(query)
    else:
        clean_engine.add_console_logging(ARGS.console_log)
        clean_engine.clean_dataset(ARGS.project_id, ARGS.dataset_id,
                                   ARGS.sandbox_dataset_id,
                                   [(ValidDeathDates, )])
Example No. 18
    def get_suppressed_concept_ids(self):
        # https://athena.ohdsi.org/search-terms/terms/1585259
        # https://athena.ohdsi.org/search-terms/terms/4083587
        return [1585259, 4083587]

    def setup_validation(self, client, *args, **keyword_args):
        pass

    def validate_rule(self, client, *args, **keyword_args):
        pass


if __name__ == '__main__':
    import cdr_cleaner.args_parser as parser
    import cdr_cleaner.clean_cdr_engine as clean_engine

    ARGS = parser.default_parse_args()

    if ARGS.list_queries:
        clean_engine.add_console_logging()
        query_list = clean_engine.get_query_list(
            ARGS.project_id, ARGS.dataset_id, ARGS.sandbox_dataset_id,
            [(BirthInformationSuppression, )])
        for query in query_list:
            LOGGER.info(query)
    else:
        clean_engine.add_console_logging(ARGS.console_log)
        clean_engine.clean_dataset(ARGS.project_id, ARGS.dataset_id,
                                   ARGS.sandbox_dataset_id,
                                   [(BirthInformationSuppression, )])
Example No. 19
        affected_tables=[],
        project_id=project,
        dataset_id=dataset,
        sandbox_dataset_id=sandbox_dataset_id,
        namer='data_stage')

    # generate queries to remove person_ids of people not in the person table
    query_list.extend(
        drop_rows_for_missing_persons_rule_instance.get_query_specs())

    return query_list


if __name__ == '__main__':
    import cdr_cleaner.args_parser as parser
    import cdr_cleaner.clean_cdr_engine as clean_engine

    ARGS = parser.parse_args()
    if ARGS.list_queries:
        clean_engine.add_console_logging()
        query_list = clean_engine.get_query_list(
            ARGS.project_id, ARGS.dataset_id, ARGS.sandbox_dataset_id,
            [(get_person_id_validation_queries, )])
        for query in query_list:
            LOGGER.info(query)
    else:
        clean_engine.add_console_logging(ARGS.console_log)
        clean_engine.clean_dataset(ARGS.project_id, ARGS.dataset_id,
                                   ARGS.sandbox_dataset_id,
                                   [(get_person_id_validation_queries, )])
Example No. 20
        """
        Validates the cleaning rule which deletes or updates the data from the tables

        This abstract method was added to the base class after this rule was authored.
        This rule needs to implement logic to run validation on cleaning rules that will
        be updating or deleting the values.
        Until this is done, no tracking issue exists for it yet.
        """
        raise NotImplementedError("Please fix me.")


if __name__ == '__main__':
    import cdr_cleaner.args_parser as parser
    import cdr_cleaner.clean_cdr_engine as clean_engine

    ARGS = parser.parse_args()

    if ARGS.list_queries:
        clean_engine.add_console_logging()
        query_list = clean_engine.get_query_list(ARGS.project_id,
                                                 ARGS.dataset_id,
                                                 ARGS.sandbox_dataset_id,
                                                 [(CleanMappingExtTables, )])
        for query in query_list:
            LOGGER.info(query)
    else:
        clean_engine.add_console_logging(ARGS.console_log)
        clean_engine.clean_dataset(ARGS.project_id, ARGS.dataset_id,
                                   ARGS.sandbox_dataset_id,
                                   [(CleanMappingExtTables, )])
Example No. 21
def parse_args():
    """
    Add file_path to the default cdr_cleaner.args_parser argument list

    :return: an expanded argument list object
    """
    import cdr_cleaner.args_parser as parser
    help_text = 'path to csv file (with header row) containing pids whose observation records are to be removed'
    additional_argument_1 = {
        parser.SHORT_ARGUMENT: '-f',
        parser.LONG_ARGUMENT: '--file_path',
        parser.ACTION: 'store',
        parser.DEST: 'file_path',
        parser.HELP: help_text,
        parser.REQUIRED: True
    }

    args = parser.default_parse_args([additional_argument_1])
    return args


if __name__ == '__main__':
    import cdr_cleaner.clean_cdr_engine as clean_engine

    ARGS = parse_args()

    clean_engine.add_console_logging(ARGS.console_log)
    query_list = main(ARGS.project_id, ARGS.dataset_id, ARGS.file_path)
    clean_engine.clean_dataset(ARGS.project_id, ARGS.dataset_id, query_list)
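default_parse_args belongs to cdr_cleaner.args_parser and is not shown here. A rough sketch of how an argument dictionary like additional_argument_1 could be turned into an argparse option follows, assuming the dictionary keys map directly onto add_argument parameters; the real helper may work differently.

import argparse


def add_argument_from_dict(arg_parser, arg_dict):
    # Keys assumed to mirror parser.SHORT_ARGUMENT, LONG_ARGUMENT, ACTION,
    # DEST, HELP and REQUIRED in the real module.
    arg_parser.add_argument(arg_dict['short_argument'],
                            arg_dict['long_argument'],
                            action=arg_dict['action'],
                            dest=arg_dict['dest'],
                            help=arg_dict['help'],
                            required=arg_dict['required'])


ap = argparse.ArgumentParser()
add_argument_from_dict(
    ap, {
        'short_argument': '-f',
        'long_argument': '--file_path',
        'action': 'store',
        'dest': 'file_path',
        'help': 'path to a csv file of pids to remove',
        'required': True
    })
args = ap.parse_args(['-f', 'pids.csv'])  # args.file_path == 'pids.csv'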
Example No. 22
        def default_test(self, tables_and_test_values):
            """
            Test passing the query specifications to the clean engine module.

            This should test that the specifications for the query perform
            as designed.  The rule should drop only what it is designed to
            drop.  No more and no less.  This expects the rule to create the
            required sandbox tables.  Because it is using the clean_dataset
            function, special setup features do not need to be accounted for
            in this test because they will be executed by the engine.

            :param tables_and_test_values: a list of dictionaries where each
                dictionary defines table expectations for each OMOP table to
                validate with rule execution.  The dictionaries require:
             'fq_table_name':  the fully qualified name of the table being cleaned
             'fq_sandbox_table_name':  the fully qualified name of the sandbox
                                      table the rule will create if one is
                                      expected.
             'loaded_ids':  The list of ids loaded by the sql insert statement
             'sandboxed_ids':  the list of ids that will be in the sandbox if
                               the rule sandboxes information
             'cleaned_values':  the list of tuples of ids and expected values that
                                will exist in the cleaned table after running the
                                cleaning rule.  the order of the expected values
                                must match the order of the fields defined in
                                'fields'.
             'fields': a list of fields to select from the table after it has
                       been cleaned. the listed order should match the expected
                       order of the cleaned_values tuples.  the first item in
                       the list should be a unique identifier, e.g. primary key field
            """
            # pre-conditions
            # validate sandbox tables don't exist yet
            for fq_table_name in self.fq_sandbox_table_names:
                self.assertTableDoesNotExist(fq_table_name)

            # validate only anticipated input records exist before starting
            for table_info in tables_and_test_values:
                fq_table_name = table_info.get('fq_table_name', 'UNSET')
                values = table_info.get('loaded_ids', [])
                # this is assuming the uniquely identifiable field name is specified
                # first in the fields list.  this check verifies by id field
                # that the table data loaded correctly.
                fields = [table_info.get('fields', [])[0]]
                self.assertRowIDsMatch(fq_table_name, fields, values)

            if self.rule_instance:
                # test: run the queries
                rule_class = self.rule_instance.__class__
                engine.clean_dataset(self.project_id, self.dataset_id,
                                     self.sandbox_id, [(rule_class, )],
                                     **self.kwargs)
            else:
                raise RuntimeError(f"Cannot use the default_test method for "
                                   f"{self.__class__.__name__} because "
                                   f"rule_instance is undefined.")

            # post conditions
            for table_info in tables_and_test_values:
                # validate records are dropped
                fq_table_name = table_info.get('fq_table_name', 'UNSET')
                values = table_info.get('cleaned_values', [])
                fields = table_info.get('fields', [])
                self.assertTableValuesMatch(fq_table_name, fields, values)

                # validate records are sandboxed
                fq_sandbox_name = table_info.get('fq_sandbox_table_name')
                if fq_sandbox_name:
                    values = table_info.get('sandboxed_ids', [])
                    # this is assuming the uniquely identifiable field name is specified
                    # first in the fields list.  this check verifies by id field
                    # that the table data loaded correctly.
                    fields = [table_info.get('fields', [])[0]]
                    self.assertRowIDsMatch(fq_sandbox_name, fields, values)
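As a concrete illustration of the structure described in the docstring above, one entry of tables_and_test_values might look like the following; every identifier and value here is hypothetical.

# Hypothetical expectations for a rule that sandboxes and drops two observation rows.
tables_and_test_values = [{
    'fq_table_name': 'my-project.my_dataset.observation',
    'fq_sandbox_table_name': 'my-project.my_sandbox.dc0000_observation',
    'loaded_ids': [801, 802, 803, 804],
    'sandboxed_ids': [803, 804],
    'fields': ['observation_id', 'observation_source_value'],
    'cleaned_values': [(801, 'keep_me'), (802, 'keep_me_too')],
}]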
Example No. 23
    def validate_rule(self, client):
        """
        Validates the cleaning rule which deletes or updates the data from the tables
        """
        raise NotImplementedError("Please fix me.")

    def get_sandbox_tablenames(self):
        pass


if __name__ == '__main__':
    import cdr_cleaner.args_parser as parser
    import cdr_cleaner.clean_cdr_engine as clean_engine

    ARGS = parser.parse_args()
    pipeline_logging.configure(level=logging.DEBUG, add_console_handler=True)

    if ARGS.list_queries:
        clean_engine.add_console_logging()
        query_list = clean_engine.get_query_list(ARGS.project_id,
                                                 ARGS.dataset_id,
                                                 ARGS.sandbox_dataset_id,
                                                 [(NegativeAges, )])
        for query in query_list:
            LOGGER.info(query)
    else:
        clean_engine.add_console_logging(ARGS.console_log)
        clean_engine.clean_dataset(ARGS.project_id, ARGS.dataset_id,
                                   ARGS.sandbox_dataset_id, [(NegativeAges, )])
Example No. 24
            raise RuntimeError(
                f'Backup table {backup_table_obj.table_id} for branching cleaning rule was not '
                f'found on the server')
        query = BACKUP_ROWS_QUERY.render(lookup_table=self.lookup_table,
                                         src_table=self.observation_table)
        result = client.query(query).result()
        if result.total_rows > 0:
            raise RuntimeError(
                f'Branching cleaning rule was run but still identifies {result.total_rows} '
                f'rows from the observation table to drop')


if __name__ == '__main__':
    import cdr_cleaner.args_parser as parser
    import cdr_cleaner.clean_cdr_engine as clean_engine

    ARGS = parser.parse_args()

    if ARGS.list_queries:
        clean_engine.add_console_logging()
        query_list = clean_engine.get_query_list(ARGS.project_id,
                                                 ARGS.dataset_id,
                                                 ARGS.sandbox_dataset_id,
                                                 [(PpiBranching, )])
        for query in query_list:
            LOGGER.info(query)
    else:
        clean_engine.add_console_logging(ARGS.console_log)
        clean_engine.clean_dataset(ARGS.project_id, ARGS.dataset_id,
                                   ARGS.sandbox_dataset_id, [(PpiBranching, )])
Example No. 25
            1333234, 1310066, 715725, 1310147, 702686, 1310054, 715726, 715724,
            715714, 1310146, 1310058
        ]

    def setup_validation(self, client, *args, **keyword_args):
        pass

    def validate_rule(self, client, *args, **keyword_args):
        pass


if __name__ == '__main__':
    import cdr_cleaner.args_parser as parser
    import cdr_cleaner.clean_cdr_engine as clean_engine

    ARGS = parser.default_parse_args()
    pipeline_logging.configure(level=logging.DEBUG, add_console_handler=True)

    if ARGS.list_queries:
        clean_engine.add_console_logging()
        query_list = clean_engine.get_query_list(
            ARGS.project_id, ARGS.dataset_id, ARGS.sandbox_dataset_id,
            [(CopeSurveyResponseSuppression, )])
        for query in query_list:
            LOGGER.info(query)
    else:
        clean_engine.add_console_logging(ARGS.console_log)
        clean_engine.clean_dataset(ARGS.project_id, ARGS.dataset_id,
                                   ARGS.sandbox_dataset_id,
                                   [(CopeSurveyResponseSuppression, )])
Example No. 26
    def validate_rule(self, client):
        """
        Validates the cleaning rule which deletes or updates the data from the tables
        """
        pass

    def get_sandbox_tablenames(self):
        return [self.sandbox_table_for(table) for table in self.affected_tables]


if __name__ == '__main__':
    import cdr_cleaner.args_parser as parser
    import cdr_cleaner.clean_cdr_engine as clean_engine

    ARGS = parser.parse_args()

    if ARGS.list_queries:
        clean_engine.add_console_logging()
        query_list = clean_engine.get_query_list(ARGS.project_id,
                                                 ARGS.dataset_id,
                                                 ARGS.sandbox_dataset_id,
                                                 [(DateShiftCopeResponses,)])
        for query in query_list:
            LOGGER.info(query)
    else:
        clean_engine.add_console_logging(ARGS.console_log)
        clean_engine.clean_dataset(ARGS.project_id, ARGS.dataset_id,
                                   ARGS.sandbox_dataset_id,
                                   [(DateShiftCopeResponses,)])
Example No. 27
        raise NotImplementedError("Please fix me.")

    def get_sandbox_tablenames(self):
        return [
            self.sandbox_table_for(table) for table in [
                INVALID_VALUES_RECORDS, SITES_WITH_ONLY_BAD_DATA,
                SAVE_BAD_SITE_DATA, SAVE_NULL_VALUE_RECORDS,
                SAVE_DUPLICATE_RECORDS
            ]
        ]


if __name__ == '__main__':
    import cdr_cleaner.args_parser as parser
    import cdr_cleaner.clean_cdr_engine as clean_engine

    ARGS = parser.parse_args()

    if ARGS.list_queries:
        clean_engine.add_console_logging()
        query_list = clean_engine.get_query_list(
            ARGS.project_id, ARGS.dataset_id, ARGS.sandbox_dataset_id,
            [(MeasurementRecordsSuppression, )])
        for query in query_list:
            LOGGER.info(query)
    else:
        clean_engine.add_console_logging(ARGS.console_log)
        clean_engine.clean_dataset(ARGS.project_id, ARGS.dataset_id,
                                   ARGS.sandbox_dataset_id,
                                   [(MeasurementRecordsSuppression, )])
Example No. 28
        if hasattr(result, 'errors') and result.errors:
            LOGGER.error(f"Error running job {result.job_id}: {result.errors}")
            raise GoogleCloudError(
                f"Error running job {result.job_id}: {result.errors}")

    def setup_validation(self, client, *args, **keyword_args):
        pass

    def validate_rule(self, client, *args, **keyword_args):
        pass


if __name__ == '__main__':
    import cdr_cleaner.args_parser as parser
    import cdr_cleaner.clean_cdr_engine as clean_engine

    ARGS = parser.default_parse_args()

    if ARGS.list_queries:
        clean_engine.add_console_logging()
        query_list = clean_engine.get_query_list(
            ARGS.project_id, ARGS.dataset_id, ARGS.sandbox_dataset_id,
            [(MotorVehicleAccidentSuppression,)])
        for query in query_list:
            LOGGER.info(query)
    else:
        clean_engine.add_console_logging(ARGS.console_log)
        clean_engine.clean_dataset(ARGS.project_id, ARGS.dataset_id,
                                   ARGS.sandbox_dataset_id,
                                   [(MotorVehicleAccidentSuppression,)])
Example No. 29
        parser.REQUIRED: True
    }, {
        parser.SHORT_ARGUMENT: '-v',
        parser.LONG_ARGUMENT: '--validation_dataset_id',
        parser.ACTION: 'store',
        parser.DEST: 'validation_dataset_id',
        parser.HELP: 'validation_dataset_id',
        parser.REQUIRED: True
    }]
    args = parser.default_parse_args(additional_arguments)
    return args


if __name__ == '__main__':
    import cdr_cleaner.clean_cdr_engine as clean_engine

    ARGS = parse_args()

    if ARGS.list_queries:
        clean_engine.add_console_logging()
        query_list = clean_engine.get_query_list(
            ARGS.project_id, ARGS.dataset_id, ARGS.sandbox_dataset_id,
            [(delete_records_for_non_matching_participants, )])
        for query in query_list:
            LOGGER.info(query)
    else:
        clean_engine.add_console_logging(ARGS.console_log)
        clean_engine.clean_dataset(
            ARGS.project_id, ARGS.dataset_id, ARGS.sandbox_dataset_id,
            [(delete_records_for_non_matching_participants, )])
Example No. 30
    def validate_rule(self, client):
        """
        Validates the cleaning rule which deletes or updates the data from the tables

        This abstract method was added to the base class after this rule was authored.
        This rule needs to implement logic to run validation on cleaning rules that will
        be updating or deleting the values.
        Until this is done, no tracking issue exists for it yet.
        """
        raise NotImplementedError("Please fix me.")


if __name__ == '__main__':
    import cdr_cleaner.clean_cdr_engine as clean_engine
    import cdr_cleaner.args_parser as parser

    ARGS = parser.parse_args()

    if ARGS.list_queries:
        clean_engine.add_console_logging()
        query_list = clean_engine.get_query_list(
            ARGS.project_id, ARGS.dataset_id, ARGS.sandbox_dataset_id,
            [(EnsureDateDatetimeConsistency, )])
        for query in query_list:
            LOGGER.info(query)
    else:
        clean_engine.add_console_logging(ARGS.console_log)
        clean_engine.clean_dataset(ARGS.project_id, ARGS.dataset_id,
                                   ARGS.sandbox_dataset_id,
                                   [(EnsureDateDatetimeConsistency, )])