def tearDown(self):
    """Restore state from any setUp or test steps"""
    log.info("Purge test records from the database")

    DBUtil.execute(
        """delete from patient_item
           where clinical_item_id in
           (    select clinical_item_id
                from clinical_item as ci, clinical_item_category as cic
                where ci.clinical_item_category_id = cic.clinical_item_category_id
                and cic.source_table = '%s'
           )
        """ % TEST_SOURCE_TABLE)
    DBUtil.execute(
        """delete from clinical_item
           where clinical_item_category_id in
           (    select clinical_item_category_id
                from clinical_item_category
                where source_table = '%s'
           )
        """ % TEST_SOURCE_TABLE)
    DBUtil.execute("delete from clinical_item_category where source_table = '%s';" % TEST_SOURCE_TABLE)

    bqCursor = self.bqConn.cursor()
    bqCursor.execute('DELETE FROM %s.patient_item WHERE true;' % TEST_DEST_DATASET)
    bqCursor.execute('DELETE FROM %s.clinical_item WHERE true;' % TEST_DEST_DATASET)
    bqCursor.execute('DELETE FROM %s.clinical_item_category WHERE true;' % TEST_DEST_DATASET)

    DBTestCase.tearDown(self)
def dumpPatientItemToCsv(self, tempDir, batchCounter=999):
    log.info('Dumping patient_item for batch {} to CSV'.format(batchCounter))
    DBUtil.dumpTableToCsv(
        'patient_item',
        '{}/{}_patient_item.csv'.format(tempDir, batchCounter))
def setUp(self):
    """Prepare state for test cases"""
    DBTestCase.setUp(self)
    ClinicalItemDataLoader.build_clinical_item_psql_schemata()

    log.info("Sourcing from BigQuery DB")
    self.patientIds = [
        'JCd5ef6e', 'JCce317d', 'JCe83f82', 'JCe5fc81', 'JCdb8fe4',
        'JCcdc6a0', 'JCd37637', 'JCdbb57e', 'JCcebdef', 'JCcc41b3',
        'JCe160b3', 'JCe8415d', 'JCdb1735', 'JCeb8fe9', 'JCe362b1',
        'JCcca36e', 'JCddddf4', 'JCe683c1', 'JCe74388', 'JCd30ac4',
        'JCd1bb22', 'JCe3397c', 'JCccb16c', 'JCd5da6d', 'JCd6f915',
        'JCe3e96d', 'JCd43db0', 'JCe5a52f', 'JCd9f7b5', 'JCd60bb3',
        'JCe66004', 'JCe4a6c2', 'JCceb239', 'JCda9846', 'JCce3176',
        'JCe098ca', 'JCd31af1', 'JCe796fd', 'JCcc9243', 'JCd05308',
        'JCea3982', 'JCd99619', 'JCd99366', 'JCdb087f', 'JCd9f2b3',
        'JCe8a2d4', 'JCd19201', 'JCcdc146', 'JCe05414', 'JCd98ef5'
    ]

    # Write the test patient IDs (with a 'rit_uid' header row) to a temp CSV.
    # Text mode ('w') rather than 'wb', since string rows are written.
    self.pat_id_csv = '/tmp/tmp_test_pat_id.csv'
    with open(self.pat_id_csv, 'w') as f:
        for patient_id in ['rit_uid'] + self.patientIds:
            f.write("%s\n" % patient_id)

    self.bqConn = bigQueryUtil.connection()
    self.converter = STARRDemographicsConversion()  # Instance to test on
def convertSourceItems(self, convOptions, conn=None):
    """Primary run function to process the contents of the raw source table
    and convert them into equivalent patient_item, clinical_item, and clinical_item_category entries.
    Should look for redundancies after the fact to catch repeated conversions.

    startDate - If provided, only return items whose ordering_date is on or after that date.
    endDate - If provided, only return items whose ordering_date is before that date.
    """
    log.info("Conversion for items dated %s to %s" % (convOptions.startDate, convOptions.endDate))
    progress = ProgressDots()

    # Only open (and later close) a new connection if the caller did not supply one
    extConn = conn is not None
    if not extConn:
        conn = self.connFactory.connection()

    try:
        # Next round for medications directly from order_med table not addressed in medmix
        # TODO (nodir) seems like an unrelated comment?
        category = self.categoryFromSourceItem(conn)
        for sourceItem in self.querySourceItems(convOptions):
            log.debug('sourceItem: {}'.format(sourceItem))
            self.convertSourceItem(category, sourceItem, conn=conn)
            progress.Update()
    finally:
        if not extConn:
            conn.close()
    progress.PrintStatus()
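# Hedged usage sketch (not part of the original module): roughly how a caller might drive
# convertSourceItems() for a bounded date window. The ConversionOptions attribute names
# below are assumptions inferred from the docstring above.
#
#   from datetime import datetime
#
#   convOptions = ConversionOptions()
#   convOptions.startDate = datetime(2018, 1, 1)   # include items dated on or after this
#   convOptions.endDate = datetime(2018, 2, 1)     # exclude items dated on or after this
#   converter.convertSourceItems(convOptions)      # opens and closes its own DB connection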
def create_new_table_from_schema(self, dataset_id, table_id, schema):
    '''
    https://cloud.google.com/bigquery/docs/tables#create-table
    :param dataset_id: dataset name
    :param table_id: table name
    :param schema: schema = [
        bigquery.SchemaField('full_name', 'STRING', mode='REQUIRED', description='blah'),
        bigquery.SchemaField('age', 'INTEGER', mode='REQUIRED'),
    ]
    :return: None
    '''
    dataset_ref = self.client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_id)

    try:
        self.client.get_table(table_ref)
        # print(f'Table {table_id} in dataset {dataset_id} already exists! Skipping create operation.')
    except NotFound:
        # Construct a full Table object to send to the API.
        table = bigquery.Table(table_ref, schema=schema)
        table = self.client.create_table(table)  # API request
        log.info('Table {} in dataset {} created successfully, project: {}.'.format(
            table.table_id, dataset_id, self.client.project))
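# Hedged usage sketch (illustrative only; the dataset and table names are placeholders,
# and bq_util stands in for an instance of this class): build a schema and create the
# table only if it does not already exist.
#
#   from google.cloud import bigquery
#
#   schema = [
#       bigquery.SchemaField('full_name', 'STRING', mode='REQUIRED'),
#       bigquery.SchemaField('age', 'INTEGER', mode='REQUIRED'),
#   ]
#   bq_util.create_new_table_from_schema('my_dataset', 'my_table', schema)
#   # If 'my_table' already exists, get_table() succeeds and the create step is skipped.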
def uploadClinicalTablesCsvToBQ(self, tempDir, datasetId):
    log.info('Uploading clinical_item_category CSV to BQ dataset %s' % datasetId)
    clinical_item_category_schema = [
        bigquery.SchemaField('clinical_item_category_id', 'INT64', 'REQUIRED', None, ()),
        bigquery.SchemaField('source_table', 'STRING', 'REQUIRED', None, ()),
        bigquery.SchemaField('description', 'STRING', 'NULLABLE', None, ()),
        bigquery.SchemaField('default_recommend', 'INT64', 'NULLABLE', None, ())]
    clinical_item_category_csv_path = tempDir + '/clinical_item_category.csv'
    bigQueryUtil.headerChecker(clinical_item_category_csv_path,
                               [sf.name for sf in clinical_item_category_schema])
    self.bqClient.load_csv_to_table(datasetId, 'clinical_item_category',
                                    clinical_item_category_csv_path,
                                    skip_rows=1, append_to_table=True)
    # auto_detect_schema=False, schema=clinical_item_category_schema)

    log.info('Uploading clinical_item CSV to BQ dataset %s' % datasetId)
    clinical_item_schema = [
        bigquery.SchemaField('clinical_item_id', 'INT64', 'REQUIRED', None, ()),
        bigquery.SchemaField('clinical_item_category_id', 'INT64', 'REQUIRED', None, ()),
        bigquery.SchemaField('external_id', 'INT64', 'NULLABLE', None, ()),
        bigquery.SchemaField('name', 'STRING', 'REQUIRED', None, ()),
        bigquery.SchemaField('description', 'STRING', 'NULLABLE', None, ()),
        bigquery.SchemaField('default_recommend', 'INT64', 'NULLABLE', None, ()),
        bigquery.SchemaField('item_count', 'FLOAT64', 'NULLABLE', None, ()),
        bigquery.SchemaField('patient_count', 'FLOAT64', 'NULLABLE', None, ()),
        bigquery.SchemaField('encounter_count', 'FLOAT64', 'NULLABLE', None, ()),
        bigquery.SchemaField('analysis_status', 'INT64', 'NULLABLE', None, ()),
        bigquery.SchemaField('outcome_interest', 'INT64', 'NULLABLE', None, ())]
    clinical_item_csv_path = tempDir + '/clinical_item.csv'
    bigQueryUtil.headerChecker(clinical_item_csv_path,
                               [sf.name for sf in clinical_item_schema])
    self.bqClient.load_csv_to_table(datasetId, 'clinical_item', clinical_item_csv_path,
                                    skip_rows=1, append_to_table=True)
def dumpItemCollectionTablesToCsv(self, tempDir):
    log.info('Dumping item_collection_item and item_collection to CSV')
    DBUtil.dumpTableToCsv('item_collection_item',
                          '{}/item_collection_item.csv'.format(tempDir))
    DBUtil.dumpTableToCsv('item_collection',
                          '{}/item_collection.csv'.format(tempDir))
def patientItemFromSourceItem(self, sourceItem, clinicalItem, conn):
    # Produce a patient_item record model for the given sourceItem
    patientItem = RowItemModel({
        "external_id": int(sourceItem["prov_map_id"][2:], 16),
        "patient_id": int(sourceItem["rit_uid"][2:], 16),
        "encounter_id": sourceItem["pat_enc_csn_id_coded"],
        "clinical_item_id": clinicalItem["clinical_item_id"],
        "item_date": sourceItem["trtmnt_tm_begin_dt_jittered"],
    })

    insertQuery = DBUtil.buildInsertQuery("patient_item", list(patientItem.keys()))
    insertParams = list(patientItem.values())
    try:
        # Optimistic insert of a new unique item
        DBUtil.execute(insertQuery, insertParams, conn=conn)
        patientItem["patient_item_id"] = DBUtil.execute(
            DBUtil.identityQuery("patient_item"), conn=conn)[0][0]
    except conn.IntegrityError as err:
        # If turns out to be a duplicate, okay, pull out existing ID and continue to insert whatever else is possible
        log.info(err)
        # Lookup just by the composite key components to avoid attempting duplicate insertion again
        searchPatientItem = {
            "patient_id": patientItem["patient_id"],
            "clinical_item_id": patientItem["clinical_item_id"],
            "item_date": patientItem["item_date"],
        }
        (patientItem["patient_item_id"], isNew) = \
            DBUtil.findOrInsertItem("patient_item", searchPatientItem, conn=conn)
def convertSourceItems(self, convOptions):
    """Primary run function to process the contents of the order_med table
    and convert them into equivalent patient_item, clinical_item, and clinical_item_category entries.
    Should look for redundancies after the fact to catch repeated conversions.

    startDate - If provided, only return items whose order_time_jittered is on or after that date.
    endDate - If provided, only return items whose order_time_jittered is before that date.
    """
    log.info("Conversion for items dated {} to {}".format(convOptions.startDate, convOptions.endDate))
    progress = ProgressDots()

    conn = self.connFactory.connection()
    try:
        # Load up the medication mapping table to facilitate subsequent conversions
        rxcuiDataByMedId = self.loadRXCUIData()

        # Next round for medications directly from order_med table not addressed in medmix
        for sourceItem in self.querySourceItems(rxcuiDataByMedId, convOptions,
                                                progress=progress, conn=conn):
            self.convertSourceItem(sourceItem, conn=conn)
            progress.Update()
    finally:
        conn.close()
    progress.PrintStatus()
def dumpClinicalTablesToCsv(self, tempDir):
    log.info('Dumping clinical_item and clinical_item_category to CSV')
    DBUtil.dumpTableToCsv('clinical_item',
                          '{}/clinical_item.csv'.format(tempDir))
    DBUtil.dumpTableToCsv('clinical_item_category',
                          '{}/clinical_item_category.csv'.format(tempDir))
def uploadClinicalTablesCsvToBQ(self, tempDir, datasetId):
    log.info('Uploading clinical_item_category CSV to BQ dataset %s' % datasetId)
    clinical_item_category_schema = self.bqClient.client.get_table(
        self.bqClient.client.dataset('clinical_item2018', 'mining-clinical-decisions')
            .table('clinical_item_category')
    ).schema
    clinical_item_category_csv_path = tempDir + '/clinical_item_category.csv'
    bigQueryUtil.headerChecker(clinical_item_category_csv_path,
                               [sf.name for sf in clinical_item_category_schema])
    self.bqClient.load_csv_to_table(datasetId, 'clinical_item_category',
                                    clinical_item_category_csv_path,
                                    skip_rows=1, append_to_table=True)
    # auto_detect_schema=False, schema=clinical_item_category_schema)

    log.info('Uploading clinical_item CSV to BQ dataset %s' % datasetId)
    clinical_item_schema = self.bqClient.client.get_table(
        self.bqClient.client.dataset('clinical_item2018', 'mining-clinical-decisions')
            .table('clinical_item')
    ).schema
    clinical_item_csv_path = tempDir + '/clinical_item.csv'
    bigQueryUtil.headerChecker(clinical_item_csv_path,
                               [sf.name for sf in clinical_item_schema])
    self.bqClient.load_csv_to_table(datasetId, 'clinical_item', clinical_item_csv_path,
                                    skip_rows=1, append_to_table=True)
def uploadPatientItemCsvToBQ(self, tempDir, datasetId, batchCounter=999):
    log.info('Uploading patient_item CSV to BQ dataset %s for batch %s' % (datasetId, batchCounter))
    patient_item_schema = [
        bigquery.SchemaField('patient_item_id', 'INT64', 'REQUIRED', None, ()),
        bigquery.SchemaField('external_id', 'INT64', 'NULLABLE', None, ()),
        bigquery.SchemaField('patient_id', 'INT64', 'REQUIRED', None, ()),
        bigquery.SchemaField('clinical_item_id', 'INT64', 'REQUIRED', None, ()),
        bigquery.SchemaField('item_date', 'TIMESTAMP', 'REQUIRED', None, ()),
        bigquery.SchemaField('analyze_date', 'TIMESTAMP', 'NULLABLE', None, ()),
        bigquery.SchemaField('encounter_id', 'INT64', 'NULLABLE', None, ()),
        bigquery.SchemaField('text_value', 'STRING', 'NULLABLE', None, ()),
        bigquery.SchemaField('num_value', 'FLOAT64', 'NULLABLE', None, ()),
        bigquery.SchemaField('source_id', 'INT64', 'NULLABLE', None, ()),
        bigquery.SchemaField('item_date_utc', 'TIMESTAMP', 'NULLABLE', None, ())
    ]

    csv_path = tempDir + os.path.sep + str(batchCounter) + '_patient_item.csv'
    bigQueryUtil.headerChecker(csv_path, [sf.name for sf in patient_item_schema])

    self.bqClient.load_csv_to_table(datasetId, 'patient_item', csv_path,
                                    schema=patient_item_schema,
                                    skip_rows=1, append_to_table=True)
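# Note (illustrative assumption): bigQueryUtil.headerChecker is relied on above to confirm
# that the CSV's header row matches the declared schema column names before rows are
# appended. A minimal sketch of an equivalent check, in case that helper is unavailable:
#
#   import csv
#   with open(csv_path) as f:
#       header = next(csv.reader(f))
#   expected = [sf.name for sf in patient_item_schema]
#   assert header == expected, 'CSV header %s does not match schema %s' % (header, expected)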
def removePatientItemCsv(self, tempDir, batchCounter):
    log.info('Removing patient_item CSV for batch %s' % batchCounter)
    csv_path = tempDir + '/' + str(batchCounter) + '_patient_item.csv'
    if os.path.exists(csv_path):
        os.remove(csv_path)
    else:
        print(csv_path + ' does not exist')
def dumpPatientItemToCsv(self, tempDir, batchCounter):
    log.info('Dumping patient_item for batch %s to CSV' % batchCounter)
    DBUtil.execute(
        '''
        COPY patient_item TO '%s/%s_patient_item.csv' DELIMITER ',' CSV HEADER;
        ''' % (tempDir, batchCounter)
    )
def dumpPatientItemCollectionLinkToCsv(self, tempDir, batchCounter=999):
    log.info('Dumping patient_item_collection_link for batch {} to CSV'.format(batchCounter))
    DBUtil.dumpTableToCsv(
        'patient_item_collection_link',
        '{}/{}_patient_item_collection_link.csv'.format(tempDir, batchCounter))
def setUp(self):
    """Prepare state for test cases"""
    DBTestCase.setUp(self)

    log.info("Sourcing from BigQuery DB")
    ClinicalItemDataLoader.build_clinical_item_psql_schemata()

    self.converter = STARRTreatmentTeamConversion()  # Instance to test on
    self.bqConn = self.converter.bqConn
    self.starrUtil = STARRUtil.StarrCommonUtils(self.converter.bqClient)
def removeClinicalTablesCsv(self, tempDir):
    log.info('Removing clinical_item and clinical_item_category CSVs')
    if os.path.exists(tempDir + '/clinical_item.csv'):
        os.remove(tempDir + '/clinical_item.csv')
    else:
        print(tempDir + '/clinical_item.csv does not exist')

    if os.path.exists(tempDir + '/clinical_item_category.csv'):
        os.remove(tempDir + '/clinical_item_category.csv')
    else:
        print(tempDir + '/clinical_item_category.csv does not exist')
def tearDown(self):
    """Restore state from any setUp or test steps"""
    log.info("Purge test records from the database")
    bqCursor = self.bqConn.cursor()
    bqCursor.execute('DELETE FROM %s.%s WHERE true;' % (TEST_DEST_DATASET, TEST_TABLE_ID))

    log.info("Removing tmp CSV files")
    if os.path.exists(self.tmp_csv_path):
        os.remove(self.tmp_csv_path)
    if os.path.exists(self.tmp_dummy_csv_path):
        os.remove(self.tmp_dummy_csv_path)
def uploadPatientItemCsvToBQ(self, tempDir, batchCounter, datasetId):
    log.info('Uploading patient_item CSV to BQ dataset %s for batch %s' % (datasetId, batchCounter))
    patient_item_schema = self.bqClient.client.get_table(
        self.bqClient.client.dataset('clinical_item2018', 'mining-clinical-decisions')
            .table('patient_item')
    ).schema
    csv_path = tempDir + '/' + str(batchCounter) + '_patient_item.csv'
    bigQueryUtil.headerChecker(csv_path, [sf.name for sf in patient_item_schema])
    self.bqClient.load_csv_to_table(datasetId, 'patient_item', csv_path,
                                    skip_rows=1, append_to_table=True)
def patientItemFromSourceItem(self, sourceItem, clinicalItem, conn):
    # some prov_map_id values are NULL in starr_datalake2018
    if sourceItem["prov_map_id"] is not None:
        # prov_map_id starts with letters, we're interested only in number parts
        external_id = int(re.sub("[A-Z]+(\\d+)", "\\1", sourceItem["prov_map_id"]), 16)
    else:
        external_id = None

    # Produce a patient_item record model for the given sourceItem
    patientItem = RowItemModel({
        "external_id": external_id,
        "patient_id": int(sourceItem["rit_uid"][2:], 16),
        "encounter_id": sourceItem["pat_enc_csn_id_coded"],
        "clinical_item_id": clinicalItem["clinical_item_id"],
        # without str(), the time is being converted in postgres
        "item_date": str(sourceItem["trtmnt_tm_begin_dt_jittered"]),
        # without str(), the time is being converted in postgres
        "item_date_utc": str(sourceItem["trtmnt_tm_begin_dt_jittered_utc"]),
    })

    insertQuery = DBUtil.buildInsertQuery("patient_item", list(patientItem.keys()))
    insertParams = list(patientItem.values())
    try:
        # Optimistic insert of a new unique item
        DBUtil.execute(insertQuery, insertParams, conn=conn)
        # Retrieve id of just inserted row
        patientItem["patient_item_id"] = DBUtil.execute(
            DBUtil.identityQuery("patient_item"), conn=conn)[0][0]
    except conn.IntegrityError as err:
        # If turns out to be a duplicate, okay, pull out existing ID and continue to insert whatever else is possible
        log.info(err)
        # Lookup just by the composite key components to avoid attempting duplicate insertion again
        searchPatientItem = {
            "patient_id": patientItem["patient_id"],
            "clinical_item_id": patientItem["clinical_item_id"],
            "item_date": patientItem["item_date"],
        }
        (patientItem["patient_item_id"], isNew) = \
            DBUtil.findOrInsertItem("patient_item", searchPatientItem, conn=conn)

    return patientItem
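# Worked example of the ID de-identification scheme used above ('JCd5ef6e' is taken from the
# test patient list in this file; the prov_map_id value is hypothetical): prefixed STARR
# identifiers are converted to integers by stripping the letters and parsing the rest as hex.
#
#   >>> int("JCd5ef6e"[2:], 16)
#   14020462
#   >>> import re
#   >>> int(re.sub("[A-Z]+(\\d+)", "\\1", "S123456"), 16)
#   1193046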
def removePatientItemAddedLines(self, source_table):
    """delete added records"""
    log.info('Removing patient_item added lines in PSQL DB')
    DBUtil.execute(
        """delete from patient_item
           where clinical_item_id in
           (    select clinical_item_id
                from clinical_item as ci, clinical_item_category as cic
                where ci.clinical_item_category_id = cic.clinical_item_category_id
                and cic.source_table = '{}'
           );
        """.format(source_table), conn=self.pgConn)
def removePatientItemCollectionLinkAddedLines(self, source_table):
    """delete added records"""
    log.info('Removing patient_item_collection_link added lines in PSQL DB')
    DBUtil.execute(
        """delete from patient_item_collection_link pi
           using item_collection_item ici, clinical_item ci, clinical_item_category cic
           where pi.item_collection_item_id = ici.item_collection_item_id
           and ici.clinical_item_id = ci.clinical_item_id
           and ci.clinical_item_category_id = cic.clinical_item_category_id
           and cic.source_table = '{}';
        """.format(source_table), conn=self.pgConn)
def dumpClinicalTablesToCsv(self, tempDir):
    log.info('Dumping clinical_item and clinical_item_category to CSV')
    DBUtil.execute(
        '''
        COPY clinical_item TO '%s/clinical_item.csv' DELIMITER ',' CSV HEADER;
        ''' % tempDir
    )
    DBUtil.execute(
        '''
        COPY clinical_item_category TO '%s/clinical_item_category.csv' DELIMITER ',' CSV HEADER;
        ''' % tempDir
    )
def convertSourceItems(self, patientIds=None):
    """Primary run function to process the contents of the starr_datalake2018.demographic table
    and convert them into equivalent patient_item, clinical_item, and clinical_item_category entries.
    Should look for redundancies to avoid repeating conversion.

    patientIds - If provided, only process items for patient IDs matching those provided
    """
    log.info("Conversion for patients starting with: %s, %s total" % (patientIds[:5], len(patientIds)))
    progress = ProgressDots()

    with self.connFactory.connection() as conn:
        # only 1 category - no need to have it in the loop
        category_model = self.categoryFromSourceItem(conn)
        for sourceItem in self.querySourceItems(patientIds, progress):
            self.convertSourceItem(category_model, sourceItem, conn)
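# Hedged usage sketch (illustrative only): converting demographics for a small subset of
# patients. The patient IDs below are drawn from the test fixture earlier in this file;
# the converter instance name is a placeholder.
#
#   converter = STARRDemographicsConversion()
#   converter.convertSourceItems(patientIds=['JCd5ef6e', 'JCce317d', 'JCe83f82'])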
def removeClinicalTablesAddedLines(self, source_table):
    """delete added records"""
    log.info('Removing clinical_item and clinical_item_category added lines in PSQL DB')
    DBUtil.execute(
        """delete from clinical_item
           where clinical_item_category_id in
           (    select clinical_item_category_id
                from clinical_item_category
                where source_table = '%s'
           );
        """ % source_table, conn=self.pgConn)
    DBUtil.execute("delete from clinical_item_category where source_table = '%s';" % source_table,
                   conn=self.pgConn)
def test_dataConversion(self, name, aggregation):
    log.info("Generating test source data")
    self.generate_test_and_expected_data(self.TEST_DATA_SIZE, aggregate=aggregation)
    self.starrUtil.dump_test_data_to_csv(self.converter.HEADERS, self.test_data, self.test_data_csv)
    self.starrUtil.upload_csv_to_bigquery('starr_datalake2018', 'treatment_team',
                                          TEST_DEST_DATASET, 'starr_treatment_team',
                                          self.test_data_csv, self.converter.HEADERS)

    log.debug("Run the conversion process...")
    conv_options = STARRTreatmentTeamConversion.ConversionOptions()
    conv_options.aggregate = aggregation
    temp_dir = tempfile.gettempdir()
    self.converter.convertAndUpload(conv_options, tempDir=temp_dir, targetDatasetId=TEST_DEST_DATASET)

    # Just query back for the same data, de-normalizing the data back to a general table
    test_query = \
        """
        select pi.external_id as pi_external_id, pi.patient_id, pi.encounter_id,
               cic.description as cic_description,
               ci.external_id as ci_external_id, ci.name, ci.description as ci_description,
               pi.item_date, pi.item_date_utc
        from %s.patient_item as pi, %s.clinical_item as ci, %s.clinical_item_category as cic
        where pi.clinical_item_id = ci.clinical_item_id
        and ci.clinical_item_category_id = cic.clinical_item_category_id
        and cic.source_table = '%s'
        order by pi.external_id desc, ci.external_id desc
        """ % (TEST_DEST_DATASET, TEST_DEST_DATASET, TEST_DEST_DATASET, TEST_SOURCE_TABLE)

    bq_cursor = self.bqConn.cursor()
    bq_cursor.execute(test_query)
    actual_data = [row.values() for row in bq_cursor.fetchall()]
    log.debug('actual data: {}'.format(actual_data))
    log.debug('expected data: {}'.format(self.expected_data))

    self.assertEqualTable(self.expected_data, actual_data)
def removePatientItemAddedLines(self):
    """delete added records"""
    log.info('Removing patient_item added lines in PSQL DB')
    DBUtil.execute(
        """delete from patient_item
           where clinical_item_id in
           (    select clinical_item_id
                from clinical_item as ci, clinical_item_category as cic
                where ci.clinical_item_category_id = cic.clinical_item_category_id
                and cic.source_table = '%s'
           );
        """ % SOURCE_TABLE)
def main(self, argv):
    """Main method, callable from command line"""
    log.setLevel(logging.FATAL)

    usage_str = "usage: %prog [options]\n"
    parser = OptionParser(usage=usage_str)
    parser.add_option(
        "-s", "--startDate", dest="startDate", metavar="<startDate>",
        help="Date string (e.g., 2011-12-15); if provided, will only run conversion on items with ordering time on or after this date.")
    parser.add_option(
        "-e", "--endDate", dest="endDate", metavar="<endDate>",
        help="Date string (e.g., 2011-12-15); if provided, will only run conversion on items with ordering time before this date.")
    parser.add_option(
        "-n", "--normalizeMixtures", dest="normalizeMixtures", action="store_true",
        help="If set, when medication mixtures are found, unravel / normalize them into separate entries, one for each ingredient.")
    parser.add_option(
        "-d", "--doseCountLimit", dest="doseCountLimit",
        help="Medication orders with a finite number of doses specified less than this limit will be labeled as different items than those without a number specified, or whose number is >= this limit. Intended to distinguish things like single IV bolus use vs. continuous infusions and standing medication orders.")
    (options, args) = parser.parse_args(argv[1:])

    log.info("Starting: " + str.join(" ", argv))
    timer = time.time()

    conv_options = ConversionOptions()
    conv_options.extract_parser_options(options)

    self.convertAndUpload(conv_options)

    timer = time.time() - timer
    log.info("%.3f seconds to complete", timer)
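# Hedged example invocation (the script name is a placeholder, not confirmed by this file):
#
#   python STARROrderMedConversion.py -s 2018-01-01 -e 2018-02-01 -d 5
#
# i.e., convert order_med rows with ordering time in [2018-01-01, 2018-02-01), labeling
# orders with fewer than 5 specified doses as distinct items from continuous infusions
# and standing medication orders.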
def convertSourceItems(self, patientIds=None):
    """Primary run function to process the contents of the stride_patient table
    and convert them into equivalent patient_item, clinical_item, and clinical_item_category entries.
    Should look for redundancies to avoid repeating conversion.

    patientIds - If provided, only process items for patient IDs matching those provided
    """
    log.info("Conversion for patients starting with: %s, %s total" % (patientIds[:5], len(patientIds)))
    progress = ProgressDots()

    with self.connFactory.connection() as conn:
        for sourceItem in self.querySourceItems(patientIds, progress=progress):
            self.convertSourceItem(sourceItem, conn=conn)
def setUp(self):
    """Prepare state for test cases"""
    log.setLevel(logging.INFO)  # without this no logs are printed
    DBTestCase.setUp(self)

    log.info("Sourcing from BigQuery DB")
    ClinicalItemDataLoader.build_clinical_item_psql_schemata()

    self.converter = STARRTreatmentTeamConversion.STARRTreatmentTeamConversion()  # Instance to test on
    self.bqConn = self.converter.bqConn
    self.starrUtil = STARRUtil.StarrCommonUtils(self.converter.bqClient)

    # point the converter to dummy source table
    STARRTreatmentTeamConversion.SOURCE_TABLE = TEST_SOURCE_TABLE