Example #1
    def test_cdm_schemas(self):
        schemas = resources.cdm_schemas()
        table_names = schemas.keys()

        # filter() returns a lazy iterator in Python 3, so materialize it as
        # a list before comparing it with assertListEqual.
        result_internal_tables = list(
            filter(resources.is_internal_table, table_names))
        self.assertListEqual(
            [],
            result_internal_tables,
            msg='Internal tables %s should not be in result of cdm_schemas()' %
            result_internal_tables)

        achilles_tables = common.ACHILLES_TABLES + common.ACHILLES_HEEL_TABLES
        result_achilles_tables = [
            table_name for table_name in table_names
            if table_name in achilles_tables
        ]
        self.assertListEqual(
            [],
            result_achilles_tables,
            msg='Achilles tables %s should not be in result of cdm_schemas()' %
            result_achilles_tables)

        result_vocab_tables = [
            table_name for table_name in table_names
            if table_name in resources.VOCABULARY_TABLES
        ]
        self.assertListEqual(
            [],
            result_vocab_tables,
            msg='Vocabulary tables %s should not be in result of cdm_schemas()'
            % result_vocab_tables)
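
Note on Example #1: in Python 3, filter() returns a lazy iterator, not a list, so comparing it directly against [] with assertListEqual fails even when it yields nothing; the list() call above materializes it first. A minimal standalone illustration (the is_even helper is hypothetical, not from the codebase):

def is_even(n):
    return n % 2 == 0

result = filter(is_even, [1, 3, 5])
print(result == [])        # False: a filter object never equals a list
print(list(result) == [])  # True once the iterator is materialized
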
Example #2
    def test_cdm_schemas(self):
        schemas = resources.cdm_schemas()
        table_names = schemas.keys()

        result_internal_tables = [
            table_name for table_name in table_names
            if resources.is_internal_table(table_name)
        ]
        self.assertCountEqual(
            [],
            result_internal_tables,
            msg='Internal tables %s should not be in result of cdm_schemas()' %
            result_internal_tables)

        achilles_tables = common.ACHILLES_TABLES + common.ACHILLES_HEEL_TABLES
        result_achilles_tables = [
            table_name for table_name in table_names
            if table_name in achilles_tables
        ]
        self.assertCountEqual(
            [],
            result_achilles_tables,
            msg='Achilles tables %s should not be in result of cdm_schemas()' %
            result_achilles_tables)

        vocabulary_tables = common.VOCABULARY_TABLES + [
            common.SOURCE_TO_CONCEPT_MAP
        ]
        result_vocab_tables = [
            table_name for table_name in table_names
            if table_name in vocabulary_tables
        ]
        self.assertCountEqual(
            [],
            result_vocab_tables,
            msg='Vocabulary tables %s should not be in result of cdm_schemas()'
            % result_vocab_tables)
Example #3
DOMAIN_SPECIFIC_FIELDS = 'specific_fields'
DOMAIN_DATE_FIELDS = 'date_fields'

COMMON_DOMAIN_FIELD_SUFFIXES = [
    'person_id', 'visit_occurrence_id', 'visit_detail_id', 'provider_id',
    '_concept_id', '_type_concept_id', '_source_value', '_source_concept_id'
]

DATE_FIELD_SUFFIXES = [
    '_start_date', '_start_datetime', '_end_date', '_end_datetime', '_date',
    '_datetime'
]

ATTRIBUTE_MAPPING_TEMPLATE = '{src_table},{dest_table},{src_field},{dest_field},{translation}'

CDM_TABLE_SCHEMAS = resources.cdm_schemas(False, False)

FIELD_MAPPING_HEADER = 'src_table,dest_table,src_field,dest_field,translation\n'


def generate_field_mappings(_src_table, _dest_table, src_table_fields,
                            dest_table_fields):
    """
    This function generates a list of field mappings between src_table and dest_table

    :param _src_table: the source CDM table
    :param _dest_table: the destination CDM table
    :param src_table_fields: the dictionary that contains all the source fields (common fields, date fields and domain specific fields)
    :param dest_table_fields: the dictionary that contains all the destination fields (common fields, date fields and domain specific fields)
    :return: a list of field mappings between _src_table and _dest_table
    """
Example #4
import logging
import os
from datetime import datetime

from google.api_core.exceptions import NotFound
from google.cloud import bigquery

# bq is the project-local utility module that provides get_table_schema.
import bq
import resources

LOGGER = logging.getLogger(__name__)

CDM_TABLES = list(resources.cdm_schemas().keys())
CDM_FILES = [table + '.csv' for table in CDM_TABLES]
ACHILLES_INDEX_FILES = resources.achilles_index_files()
ALL_ACHILLES_INDEX_FILES = [
    name.split(resources.resource_path + os.sep)[1].strip()
    for name in ACHILLES_INDEX_FILES
]
DATASOURCES_JSON = os.path.join(resources.achilles_index_path,
                                'data/datasources.json')
RESULT_CSV = 'result.csv'
ERRORS_CSV = 'errors.csv'
WARNINGS_CSV = 'warnings.csv'
PROCESSED_TXT = 'processed.txt'
LOG_JSON = 'log.json'
ACHILLES_HEEL_REPORT = 'achillesheel'
PERSON_REPORT = 'person'
DATA_DENSITY_REPORT = 'datadensity'
ALL_REPORTS = [ACHILLES_HEEL_REPORT, PERSON_REPORT, DATA_DENSITY_REPORT]
ALL_REPORT_FILES = [report + '.json' for report in ALL_REPORTS]
IGNORE_LIST = [RESULT_CSV, ERRORS_CSV, WARNINGS_CSV,
               PROCESSED_TXT] + ALL_ACHILLES_INDEX_FILES
VOCABULARY_TABLES = [
    'concept', 'concept_ancestor', 'concept_class', 'concept_relationship',
    'concept_synonym', 'domain', 'drug_strength', 'relationship', 'vocabulary'
]
REQUIRED_TABLES = ['person']
REQUIRED_FILES = [table + '.csv' for table in REQUIRED_TABLES]
ACHILLES_EXPORT_PREFIX_STRING = "curation_report/data/"
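

# Hedged usage sketch (not part of the original module): one way these
# constants could screen a submission. `submitted_files` is a hypothetical
# input list of file names found in a bucket.
def files_to_process(submitted_files):
    """Return the submitted CDM files worth loading, skipping known artifacts."""
    return [
        f for f in submitted_files if f in CDM_FILES and f not in IGNORE_LIST
    ]
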
def create_rdr_tables(client, rdr_dataset, bucket):
    """
    Create tables from the data in the RDR bucket.

    Uses the client to load data directly from the bucket into
    a table.

    :param client: a bigquery client object
    :param rdr_dataset: The existing dataset to load file data into
    :param bucket: the gcs bucket containing the file data.
    """
    schema_dict = resources.cdm_schemas()
    schema_dict.update(resources.rdr_specific_schemas())

    project = client.project

    for table, schema in schema_dict.items():
        schema_list = bq.get_table_schema(table, schema)
        table_id = f'{project}.{rdr_dataset}.{table}'
        job_config = bigquery.LoadJobConfig(
            schema=schema_list,
            skip_leading_rows=1,
            source_format=bigquery.SourceFormat.CSV,
            field_delimiter=',',
            allow_quoted_newlines=True,
            quote_character='"',
            write_disposition=bigquery.job.WriteDisposition.WRITE_TRUNCATE)
        if table == 'observation_period':
            job_config.allow_jagged_rows = True

        # Cluster and partition on person_id for any table that carries a
        # person_id column, except the pid_rid_mapping lookup table.
        for schema_item in schema_list:
            if ('person_id' in schema_item.name and
                    table.lower() != 'pid_rid_mapping'):
                # clustering_fields expects a list of field names
                job_config.clustering_fields = ['person_id']
                job_config.time_partitioning = bigquery.table.TimePartitioning(
                    type_='DAY')

        # path to bucketed csv file
        uri = f'gs://{bucket}/{table}.csv'

        # job_id defined to the second precision
        job_id = f'rdr_load_{table.lower()}_{datetime.now().strftime("%Y%m%d_%H%M%S")}'

        LOGGER.info(f'Loading `{uri}` into `{table_id}`')
        try:
            load_job = client.load_table_from_uri(
                uri, table_id, job_config=job_config,
                job_id=job_id)  # Make an API request.

            load_job.result()  # Waits for the job to complete.
        except NotFound:
            LOGGER.info(
                f'{table} not provided by RDR team.  Creating empty table '
                f'in dataset: `{rdr_dataset}`')

            LOGGER.info(f'Creating empty CDM table, `{table}`')
            destination_table = bigquery.Table(table_id, schema=schema_list)
            destination_table = client.create_table(destination_table)
            LOGGER.info(f'Created empty table `{destination_table.table_id}`')
        else:
            destination_table = client.get_table(
                table_id)  # Make an API request.
        LOGGER.info(f'Loaded {destination_table.num_rows} rows into '
                    f'`{destination_table.table_id}`.')

    LOGGER.info(f"Finished RDR table LOAD from bucket gs://{bucket}")