コード例 #1
0
ファイル: clean_cdr.py プロジェクト: amcgrenera-vumc/curation
def _gather_ehr_rdr_de_identified_queries(project_id, dataset_id):
    """
    Collect every query needed to clean a de_identified dataset.

    Each query builder listed below is called with
    ``(project_id, dataset_id)``; the items it returns are appended to a
    single flat list, builder by builder, in the order listed.

    :param project_id: project name
    :param dataset_id: de_identified dataset name
    :return: list of the queries returned by all builders, in call order
    """
    # The resulting list preserves exactly this builder sequence.
    query_builders = (
        id_dedup.get_id_deduplicate_queries,
        clean_years.get_year_of_birth_queries,
        neg_ages.get_negative_ages_queries,
        bad_end_dates.get_bad_end_date_queries,
        person_validator.get_person_id_validation_queries,
        valid_death_dates.get_valid_death_date_queries,
        drug_refills_supply.get_days_supply_refills_queries,
        fill_source_value.get_fill_freetext_source_value_fields_queries,
    )
    return [
        query
        for build_queries in query_builders
        for query in build_queries(project_id, dataset_id)
    ]
コード例 #2
0
    def test_get_person_id_validation_queries_deid(self):
        """
        Check the queries generated for project 'foo', dataset 'bar_deid'.

        The expected list holds one entry per table in ``self.mapped_tables``
        (consent template, formatted with mapping dataset 'bar') followed by
        one entry per table in ``self.all_tables`` (existing-person template).
        Every entry targets the same table in 'bar_deid' with the
        WRITE_TRUNCATE disposition.
        """
        # pre conditions
        project = 'foo'
        deid_dataset = 'bar_deid'

        def entry_fields(table_name):
            # Comma-separated column list, each column prefixed with 'entry.'
            return ', '.join('entry.' + column['name']
                             for column in resources.fields_for(table_name))

        def expected_entry(rendered_query, table_name):
            # Shape of one expected result dictionary.
            return {
                clean_consts.QUERY: rendered_query,
                clean_consts.DESTINATION_TABLE: table_name,
                clean_consts.DESTINATION_DATASET: deid_dataset,
                clean_consts.DISPOSITION: bq_consts.WRITE_TRUNCATE,
            }

        # test
        results = validator.get_person_id_validation_queries(
            project, deid_dataset)

        # post conditions
        self.assertEqual(len(results), ((len(self.all_tables) * 2) - 1))

        consenting_template = validator.EXISTING_AND_VALID_CONSENTING_RECORDS
        person_template = validator.SELECT_EXISTING_PERSON_IDS

        expected = [
            expected_entry(
                consenting_template.format(project=project,
                                           dataset=deid_dataset,
                                           mapping_dataset='bar',
                                           table=table,
                                           fields=entry_fields(table)),
                table) for table in self.mapped_tables
        ]
        expected.extend(
            expected_entry(
                person_template.format(project=project,
                                       dataset=deid_dataset,
                                       table=table,
                                       fields=entry_fields(table)), table)
            for table in self.all_tables)

        self.assertEqual(expected, results)
コード例 #3
0
    def test_get_person_id_validation_queries(self):
        """
        Check the queries generated for ``self.project`` / ``self.dataset``
        when ``self.sandbox`` is supplied.

        The expected list holds one consent-template entry (query,
        destination table, destination dataset, disposition) per table in
        ``self.mapped_tables``, followed by two query-only entries per table
        in ``self.drop_tables``: the RECORDS_FOR_NON_EXISTING_PIDS template
        rendered first with the rendered SANDBOX_DDL statement as
        ``query_type`` and then with ``'DELETE'``.
        """
        # pre conditions

        # test
        results = validator.get_person_id_validation_queries(
            self.project, self.dataset, self.sandbox)

        # post conditions
        expected_count = len(self.mapped_tables) + len(self.drop_tables) * 2
        self.assertEqual(len(results), expected_count)

        consenting_template = validator.EXISTING_AND_VALID_CONSENTING_RECORDS
        missing_person_template = drop_rows_for_missing_persons.RECORDS_FOR_NON_EXISTING_PIDS

        expected = []
        for table in self.mapped_tables:
            # Comma-separated column list, each column prefixed with 'entry.'
            columns = ', '.join('entry.' + column['name']
                                for column in resources.fields_for(table))
            consenting_query = consenting_template.format(
                project=self.project,
                dataset=self.dataset,
                mapping_dataset=self.dataset,
                table=table,
                fields=columns)
            expected.append({
                clean_consts.QUERY: consenting_query,
                clean_consts.DESTINATION_TABLE: table,
                clean_consts.DESTINATION_DATASET: self.dataset,
                clean_consts.DISPOSITION: bq_consts.WRITE_TRUNCATE,
            })

        for table in self.drop_tables:
            ddl_template = common.JINJA_ENV.from_string(SANDBOX_DDL)
            sandbox_ddl_query = ddl_template.render(
                project=self.project,
                sandbox_dataset=self.sandbox,
                sandbox_table=drop_rows_for_missing_persons_sandbox_for(table))
            # Sandbox statement first, then the DELETE, for each table.
            expected.extend({
                clean_consts.QUERY:
                    missing_person_template.render(query_type=query_type,
                                                   project=self.project,
                                                   dataset=self.dataset,
                                                   table=table)
            } for query_type in (sandbox_ddl_query, 'DELETE'))

        self.assertEqual(expected, results)