def test_cdm_schemas(self):
    schemas = resources.cdm_schemas()
    table_names = schemas.keys()
    # filter() returns an iterator in Python 3; materialize it so the
    # assertion and the error message both show the offending tables.
    result_internal_tables = list(
        filter(resources.is_internal_table, table_names))
    self.assertListEqual(
        [],
        result_internal_tables,
        msg='Internal tables %s should not be in result of cdm_schemas()' %
        result_internal_tables)

    achilles_tables = common.ACHILLES_TABLES + common.ACHILLES_HEEL_TABLES
    result_achilles_tables = [
        table_name for table_name in table_names
        if table_name in achilles_tables
    ]
    self.assertListEqual(
        [],
        result_achilles_tables,
        msg='Achilles tables %s should not be in result of cdm_schemas()' %
        result_achilles_tables)

    result_vocab_tables = [
        table_name for table_name in table_names
        if table_name in resources.VOCABULARY_TABLES
    ]
    self.assertListEqual(
        [],
        result_vocab_tables,
        msg='Vocabulary tables %s should not be in result of cdm_schemas()' %
        result_vocab_tables)
def test_cdm_schemas(self):
    schemas = resources.cdm_schemas()
    table_names = schemas.keys()
    result_internal_tables = [
        table_name for table_name in table_names
        if resources.is_internal_table(table_name)
    ]
    self.assertCountEqual(
        [],
        result_internal_tables,
        msg='Internal tables %s should not be in result of cdm_schemas()' %
        result_internal_tables)

    achilles_tables = common.ACHILLES_TABLES + common.ACHILLES_HEEL_TABLES
    result_achilles_tables = [
        table_name for table_name in table_names
        if table_name in achilles_tables
    ]
    self.assertCountEqual(
        [],
        result_achilles_tables,
        msg='Achilles tables %s should not be in result of cdm_schemas()' %
        result_achilles_tables)

    result_vocab_tables = [
        table_name for table_name in table_names
        if table_name in common.VOCABULARY_TABLES +
        [common.SOURCE_TO_CONCEPT_MAP]
    ]
    self.assertCountEqual(
        [],
        result_vocab_tables,
        msg='Vocabulary tables %s should not be in result of cdm_schemas()' %
        result_vocab_tables)
DOMAIN_SPECIFIC_FIELDS = 'specific_fields'
DOMAIN_DATE_FIELDS = 'date_fields'
COMMON_DOMAIN_FIELD_SUFFIXES = [
    'person_id', 'visit_occurrence_id', 'visit_detail_id', 'provider_id',
    '_concept_id', '_type_concept_id', '_source_value', '_source_concept_id'
]
DATE_FIELD_SUFFIXES = [
    '_start_date', '_start_datetime', '_end_date', '_end_datetime', '_date',
    '_datetime'
]
ATTRIBUTE_MAPPING_TEMPLATE = '{src_table},{dest_table},{src_field},{dest_field},{translation}'
CDM_TABLE_SCHEMAS = resources.cdm_schemas(False, False)
FIELD_MAPPING_HEADER = 'src_table,dest_table,src_field,dest_field,translation\n'


def generate_field_mappings(_src_table, _dest_table, src_table_fields,
                            dest_table_fields):
    """
    This function generates a list of field mappings between the src_table and
    dest_table

    :param _src_table: the source CDM table
    :param _dest_table: the destination CDM table
    :param src_table_fields: the dictionary that contains all the source fields
        (common fields, date fields and domain specific fields)
    :param dest_table_fields: the dictionary that contains all the destination
        fields (common fields, date fields and domain specific fields)
    :return: a list of field mappings between _src_table and _dest_table
    """
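# NOTE: the body of generate_field_mappings is not shown above. The sketch
# below is one hypothetical way to pair fields using the suffix constants in
# this module; the suffix-matching rule, the translation value of 0 and the
# name _sketch_generate_field_mappings are assumptions, not the project's
# actual implementation.
def _sketch_generate_field_mappings(_src_table, _dest_table, src_table_fields,
                                    dest_table_fields):
    known_suffixes = COMMON_DOMAIN_FIELD_SUFFIXES + DATE_FIELD_SUFFIXES

    def longest_suffix(field):
        # Longest known suffix the field ends with, or None
        # (e.g. condition_type_concept_id -> '_type_concept_id').
        matches = [s for s in known_suffixes if field.endswith(s)]
        return max(matches, key=len) if matches else None

    field_mappings = []
    # Only compare fields grouped under the same key (e.g. date fields with
    # date fields) in both dictionaries.
    for field_type in set(src_table_fields) & set(dest_table_fields):
        for src_field in src_table_fields[field_type]:
            for dest_field in dest_table_fields[field_type]:
                # Pair fields that share the same common/date suffix,
                # e.g. condition_start_date -> procedure_start_date.
                suffix = longest_suffix(src_field)
                if suffix is not None and suffix == longest_suffix(dest_field):
                    field_mappings.append(
                        ATTRIBUTE_MAPPING_TEMPLATE.format(
                            src_table=_src_table,
                            dest_table=_dest_table,
                            src_field=src_field,
                            dest_field=dest_field,
                            translation=0))
    return field_mappings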
import resources
import os

# Materialize these as lists so membership checks and repeated iteration
# work under Python 3 (map() and dict.keys() return one-shot views/iterators).
CDM_TABLES = list(resources.cdm_schemas().keys())
CDM_FILES = [table + '.csv' for table in CDM_TABLES]
ACHILLES_INDEX_FILES = resources.achilles_index_files()
ALL_ACHILLES_INDEX_FILES = [
    name.split(resources.resource_path + os.sep)[1].strip()
    for name in ACHILLES_INDEX_FILES
]
DATASOURCES_JSON = os.path.join(resources.achilles_index_path,
                                'data/datasources.json')
RESULT_CSV = 'result.csv'
ERRORS_CSV = 'errors.csv'
WARNINGS_CSV = 'warnings.csv'
PROCESSED_TXT = 'processed.txt'
LOG_JSON = 'log.json'
ACHILLES_HEEL_REPORT = 'achillesheel'
PERSON_REPORT = 'person'
DATA_DENSITY_REPORT = 'datadensity'
ALL_REPORTS = [ACHILLES_HEEL_REPORT, PERSON_REPORT, DATA_DENSITY_REPORT]
ALL_REPORT_FILES = [report + '.json' for report in ALL_REPORTS]
IGNORE_LIST = [RESULT_CSV, ERRORS_CSV, WARNINGS_CSV, PROCESSED_TXT
              ] + ALL_ACHILLES_INDEX_FILES
VOCABULARY_TABLES = [
    'concept', 'concept_ancestor', 'concept_class', 'concept_relationship',
    'concept_synonym', 'domain', 'drug_strength', 'relationship', 'vocabulary'
]
REQUIRED_TABLES = ['person']
REQUIRED_FILES = [table + '.csv' for table in REQUIRED_TABLES]
ACHILLES_EXPORT_PREFIX_STRING = "curation_report/data/"
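# A hypothetical helper illustrating how the constants above could be used to
# sort a bucket listing; classify_file is not part of this module and the
# category labels are illustrative only.
def classify_file(filename):
    """Return a rough category for an uploaded file name."""
    if filename in IGNORE_LIST or filename.startswith(
            ACHILLES_EXPORT_PREFIX_STRING):
        return 'ignored'
    if filename in REQUIRED_FILES:
        return 'required cdm file'
    if filename in CDM_FILES:
        return 'cdm file'
    if filename in ALL_REPORT_FILES:
        return 'report file'
    return 'unknown'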
def create_rdr_tables(client, rdr_dataset, bucket):
    """
    Create tables from the data in the RDR bucket.

    Uses the client to load data directly from the bucket into a table.

    :param client: a bigquery client object
    :param rdr_dataset: The existing dataset to load file data into
    :param bucket: the gcs bucket containing the file data.
    """
    schema_dict = resources.cdm_schemas()
    schema_dict.update(resources.rdr_specific_schemas())

    project = client.project

    for table, schema in schema_dict.items():
        schema_list = bq.get_table_schema(table, schema)
        table_id = f'{project}.{rdr_dataset}.{table}'
        job_config = bigquery.LoadJobConfig(
            schema=schema_list,
            skip_leading_rows=1,
            source_format=bigquery.SourceFormat.CSV,
            field_delimiter=',',
            allow_quoted_newlines=True,
            quote_character='"',
            write_disposition=bigquery.job.WriteDisposition.WRITE_TRUNCATE)

        if table == 'observation_period':
            job_config.allow_jagged_rows = True

        for schema_item in schema_list:
            if 'person_id' in schema_item.name and table.lower(
            ) != 'pid_rid_mapping':
                # clustering_fields expects a list of field names
                job_config.clustering_fields = ['person_id']
                job_config.time_partitioning = bigquery.table.TimePartitioning(
                    type_='DAY')

        # path to bucketed csv file
        uri = f'gs://{bucket}/{table}.csv'

        # job_id defined to the second precision
        job_id = f'rdr_load_{table.lower()}_{datetime.now().strftime("%Y%m%d_%H%M%S")}'

        LOGGER.info(f'Loading `{uri}` into `{table_id}`')

        try:
            load_job = client.load_table_from_uri(
                uri, table_id, job_config=job_config,
                job_id=job_id)  # Make an API request.

            load_job.result()  # Waits for the job to complete.
        except NotFound:
            LOGGER.info(f'{table} not provided by RDR team. Creating empty table '
                        f'in dataset: `{rdr_dataset}`')

            LOGGER.info(f'Creating empty CDM table, `{table}`')
            destination_table = bigquery.Table(table_id, schema=schema_list)
            destination_table = client.create_table(destination_table)
            LOGGER.info(f'Created empty table `{destination_table.table_id}`')
        else:
            destination_table = client.get_table(
                table_id)  # Make an API request.
            LOGGER.info(f'Loaded {destination_table.num_rows} rows into '
                        f'`{destination_table.table_id}`.')

    LOGGER.info(f"Finished RDR table LOAD from bucket gs://{bucket}")
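# A hypothetical invocation of create_rdr_tables, assuming the module's other
# imports (resources, bq, LOGGER, etc.) are already in place; the project,
# dataset and bucket names below are placeholders, not values used by the
# pipeline.
if __name__ == '__main__':
    from google.cloud import bigquery

    rdr_client = bigquery.Client(project='my-gcp-project')  # placeholder project
    create_rdr_tables(rdr_client,
                      rdr_dataset='rdr_import_dataset',  # placeholder dataset
                      bucket='my-rdr-export-bucket')  # placeholder bucket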