def setUp(self):
    """Prepare state for test cases"""
    DBTestCase.setUp(self)
    ClinicalItemDataLoader.build_clinical_item_psql_schemata()

    log.info("Sourcing from BigQuery DB")
    self.patientIds = [
        'JCd5ef6e', 'JCce317d', 'JCe83f82', 'JCe5fc81', 'JCdb8fe4',
        'JCcdc6a0', 'JCd37637', 'JCdbb57e', 'JCcebdef', 'JCcc41b3',
        'JCe160b3', 'JCe8415d', 'JCdb1735', 'JCeb8fe9', 'JCe362b1',
        'JCcca36e', 'JCddddf4', 'JCe683c1', 'JCe74388', 'JCd30ac4',
        'JCd1bb22', 'JCe3397c', 'JCccb16c', 'JCd5da6d', 'JCd6f915',
        'JCe3e96d', 'JCd43db0', 'JCe5a52f', 'JCd9f7b5', 'JCd60bb3',
        'JCe66004', 'JCe4a6c2', 'JCceb239', 'JCda9846', 'JCce3176',
        'JCe098ca', 'JCd31af1', 'JCe796fd', 'JCcc9243', 'JCd05308',
        'JCea3982', 'JCd99619', 'JCd99366', 'JCdb087f', 'JCd9f2b3',
        'JCe8a2d4', 'JCd19201', 'JCcdc146', 'JCe05414', 'JCd98ef5'
    ]

    self.pat_id_csv = '/tmp/tmp_test_pat_id.csv'
    with open(self.pat_id_csv, 'wb') as f:
        for patient_id in ['rit_uid'] + self.patientIds:
            f.write("%s\n" % patient_id)

    self.bqConn = bigQueryUtil.connection()
    self.converter = STARRDemographicsConversion()  # Instance to test on
def __init__(self):
    """Default constructor"""
    self.bqConn = bigQueryUtil.connection()
    self.bqClient = bigQueryUtil.BigQueryClient()

    self.connFactory = DBUtil.ConnectionFactory()  # Default connection source

    # Local cache to track the clinical item category table contents
    self.categoryBySourceDescr = dict()
    # Local cache to track clinical item table contents
    self.clinicalItemByCategoryIdExtId = dict()
def __init__(self):
    """Default constructor"""
    self.bqConn = bigQueryUtil.connection()
    self.bqClient = bigQueryUtil.BigQueryClient()

    # Default connection source, but allow specification of an alternative DB connection source
    self.connFactory = DBUtil.ConnectionFactory()

    # Local cache to track the clinical item category table contents
    self.categoryBySourceDescr = dict()
    # Local cache to track clinical item table contents
    self.clinicalItemByCompositeKey = dict()
def setUp(self):
    """Prepare state for test cases"""
    # Create dummy CSV: header row plus 26 (num, char) rows, (0, 'a') through (25, 'z')
    self.tmp_dummy_csv_path = TMP_DIR + '/unittest_bq_dummy.csv'
    self.dummy_table = lines = [['num', 'char']] + [[n, chr(ord('a') + n)] for n in range(26)]
    with open(self.tmp_dummy_csv_path, 'wb') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerows(lines)

    self.tmp_csv_path = TMP_DIR + '/unittest_bq.csv'

    self.bqConn = bigQueryUtil.connection()
    self.bqClient = bigQueryUtil.BigQueryClient()
def __init__(self):
    """Default constructor"""
    self.bqConn = bigQueryUtil.connection()
    self.bqClient = bigQueryUtil.BigQueryClient()

    # Default connection source, but allow specification of an alternative DB connection source
    self.connFactory = DBUtil.ConnectionFactory()

    self.starrUtil = STARRUtil.StarrCommonUtils(self.bqClient)

    # Local cache to track the clinical item category table contents
    self.categoryBySourceDescr = dict()
    # Local cache to track clinical item table contents
    self.clinicalItemByCategoryIdExtId = dict()
    # Local cache to track item collections
    self.itemCollectionByKeyStr = dict()
    # Local cache to track item collection items
    self.itemCollectionItemByCollectionIdItemId = dict()
    # Local cache of processed patient items
    self.patient_items = dict()
    # Local cache of processed patient item collection links
    self.patient_item_collection_links = set()
class TestSTARRDemographicsConversion(DBTestCase):
    TEST_DATA_SIZE = 50
    BATCH_SIZE = 10
    STARTING_BATCH = 4

    header = [
        'rit_uid', 'birth_date_jittered', 'death_date_jittered', 'gender',
        'canonical_race', 'canonical_ethnicity', 'marital_status', 'religion',
        'language', 'intrptr_needed_yn', 'insurance_payor_name',
        'cur_pcp_prov_map_id', 'recent_conf_enc_jittered', 'recent_ht_in_cms',
        'recent_wt_in_kgs', 'bmi', 'charlson_score', 'n_hospitalizations',
        'days_in_hospital', 'pat_status'
    ]

    patientIds = []
    test_data = []
    expected_data = []

    test_data_csv = '/tmp/test_starr_demographic_dummy_data.csv'
    pat_id_csv = '/tmp/tmp_test_pat_id.csv'

    bqConn = bigQueryUtil.connection()
    converter = STARRDemographicsConversion.STARRDemographicsConversion()  # Instance to test on
    starrUtil = StarrCommonUtils(converter.bqClient)

    def setUp(self):
        """Prepare state for test cases"""
        log.setLevel(logging.INFO)  # without this no logs are printed

        DBTestCase.setUp(self)
        ClinicalItemDataLoader.build_clinical_item_psql_schemata()

        # point the converter to dummy source table
        STARRDemographicsConversion.SOURCE_TABLE = TEST_SOURCE_TABLE

        log.info("Generating test source data")
        self.generate_test_and_expected_data(self.TEST_DATA_SIZE)
        self.starrUtil.dump_test_data_to_csv(self.header, self.test_data, self.test_data_csv)
        self.starrUtil.upload_csv_to_bigquery('starr_datalake2018', 'demographic',
                                              'test_dataset', 'starr_demographic',
                                              self.test_data_csv)
        self.dump_patient_ids_to_test_to_csv(self.pat_id_csv)

    def generate_test_and_expected_data(self, test_data_size):
        for curr_row in range(test_data_size):
            patient_id = 'JC' + format(curr_row, '06')
            self.patientIds.append(patient_id)
            test_data_row = self.generate_test_data_row(
                curr_row, StarrCommonUtils.random_period(), patient_id)
            self.test_data.append(test_data_row)

            # prepare expected data starting from requested batch
            if curr_row >= self.STARTING_BATCH * self.BATCH_SIZE:
                self.generate_expected_data_rows(test_data_row, self.expected_data)

        self.expected_data.sort(key=lambda tup: (-tup[1], tup[5]))  # patient_id desc, name asc

    def dump_patient_ids_to_test_to_csv(self, pat_id_csv):
        with open(pat_id_csv, 'wb') as f:
            for rit_uid in ['rit_uid'] + self.patientIds:
                f.write("%s\n" % rit_uid)

    @staticmethod
    def generate_test_data_row(curr_row, lifespan, patient_id):
        return (
            patient_id,
            lifespan[0],
            [None, lifespan[1]][random.randint(0, 1)],
            GENDER[random.randint(0, len(GENDER) - 1)],
            RACE[random.randint(0, len(RACE) - 1)],
            ETHNICITY[random.randint(0, len(ETHNICITY) - 1)],
            MARITAL_STATUS[random.randint(0, len(MARITAL_STATUS) - 1)],
            RELIGION[random.randint(0, len(RELIGION) - 1)],
            LANGUAGE[random.randint(0, len(LANGUAGE) - 1)],
            [None, 'N', 'Y'][random.randint(0, 2)],
            ''.join(random.choice(string.ascii_uppercase) for _ in range(10)),
            'SS' + format(curr_row, '07'),
            datetime.fromtimestamp(random.randint(1, int(time.time())), pytz.utc),
            random.randint(150, 210),
            random.randint(50, 150),
            random.randint(18, 24),
            random.randint(1, 27),
            random.randint(0, 300),
            random.randint(0, 1000),
            PAT_STATUS[random.randint(0, len(PAT_STATUS) - 1)]
        )

    def generate_expected_data_rows(self, row, expected_data):
        birth_list = [
            None, StarrCommonUtils.convertPatIdToSTRIDE(row[0]), None,
            "Demographics", None, "Birth", "Birth Year",
            datetime(row[1].year, 1, 1, tzinfo=pytz.UTC)
        ]
        expected_data.append(tuple(birth_list))
        expected_data.append(self.birth_decade_tuple_from(birth_list, row))
        expected_data.append(self.race_tuple_from(birth_list, row))
        expected_data.append(self.gender_tuple_from(birth_list, row))
        if row[2]:
            expected_data.append(self.death_date_tuple_from(birth_list, row))

    @staticmethod
    def birth_decade_tuple_from(birth_list, row):
        birth_decade_list = list(birth_list)
        decade = (row[1].year / 10) * 10
        birth_decade_list[5] = "Birth%ds" % decade
        birth_decade_list[6] = "Birth Decade %ds" % decade
        return tuple(birth_decade_list)

    def race_tuple_from(self, birth_list, row):
        race_list = list(birth_list)
        race_ethnicity = self.converter.summarizeRaceEthnicity(row[4], row[5])
        race_list[5] = "Race%s" % race_ethnicity.translate(None, " ()-/")
        race_list[6] = "Race/Ethnicity: %s" % race_ethnicity
        return tuple(race_list)

    @staticmethod
    def gender_tuple_from(birth_list, row):
        gender_list = list(birth_list)
        gender_list[5] = row[3]
        gender_list[6] = "%s Gender" % row[3]
        return tuple(gender_list)

    @staticmethod
    def death_date_tuple_from(birth_list, row):
        death_list = list(birth_list)
        death_list[5] = "Death"
        death_list[6] = "Death Date"
        death_list[7] = row[2]
        return tuple(death_list)

    def tearDown(self):
        """Restore state from any setUp or test steps"""
        log.info("Purge test records from the database")

        os.remove(self.pat_id_csv)
        os.remove(self.test_data_csv)

        DBUtil.execute("""delete from patient_item
                          where clinical_item_id in
                          (   select clinical_item_id
                              from clinical_item as ci, clinical_item_category as cic
                              where ci.clinical_item_category_id = cic.clinical_item_category_id
                                and cic.source_table = '%s'
                          );
                       """ % TEST_SOURCE_TABLE)
        DBUtil.execute("""delete from clinical_item
                          where clinical_item_category_id in
                          (   select clinical_item_category_id
                              from clinical_item_category
                              where source_table = '%s'
                          );
                       """ % TEST_SOURCE_TABLE)
        DBUtil.execute("delete from clinical_item_category where source_table = '%s';"
                       % TEST_SOURCE_TABLE)

        bq_cursor = self.bqConn.cursor()
        bq_cursor.execute('DELETE FROM %s.patient_item WHERE true;' % TEST_DEST_DATASET)
        bq_cursor.execute('DELETE FROM %s.clinical_item WHERE true;' % TEST_DEST_DATASET)
        bq_cursor.execute('DELETE FROM %s.clinical_item_category WHERE true;' % TEST_DEST_DATASET)
        bq_cursor.execute('DROP TABLE %s;' % TEST_SOURCE_TABLE)

        DBTestCase.tearDown(self)

    def test_batchDataConversion(self):
        # Run the data conversion on the same data and look for expected records
        log.debug("Run the batch conversion process, and upload to test dataset in BigQuery...")
        self.converter.convertItemsByBatch(self.pat_id_csv, self.BATCH_SIZE,
                                           datasetId=TEST_DEST_DATASET,
                                           startBatch=self.STARTING_BATCH)

        # Just query back for the same data, de-normalizing the data back to a general table
        test_query = \
            """
            select pi.external_id as pi_external_id, pi.patient_id, pi.encounter_id,
                   cic.description as cic_description,
                   ci.external_id as ci_external_id, ci.name, ci.description as ci_description,
                   pi.item_date
            from %s.patient_item as pi, %s.clinical_item as ci, %s.clinical_item_category as cic
            where pi.clinical_item_id = ci.clinical_item_id
              and ci.clinical_item_category_id = cic.clinical_item_category_id
              and cic.source_table = '%s'
            order by pi.patient_id desc, ci.name
            """ % (TEST_DEST_DATASET, TEST_DEST_DATASET, TEST_DEST_DATASET, TEST_SOURCE_TABLE)

        bq_cursor = self.bqConn.cursor()
        bq_cursor.execute(test_query)
        actual_data = [row.values() for row in bq_cursor.fetchall()]
        log.debug('actual data %s' % actual_data)
        log.debug('expected data %s' % self.expected_data)

        self.assertEqualTable(self.expected_data, actual_data)
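# --- Usage sketch (not part of the test) -----------------------------------
# Minimal sketch of driving the converter the same way test_batchDataConversion
# does, assuming a cohort CSV with a 'rit_uid' header row (as written by setUp).
# The import path, dataset name, and CSV path below are placeholders/assumptions,
# not taken from the source; only the call signature mirrors the test.
#
#   from starr_conv import STARRDemographicsConversion  # assumed module layout
#
#   converter = STARRDemographicsConversion.STARRDemographicsConversion()
#   converter.convertItemsByBatch('/tmp/tmp_test_pat_id.csv',  # patient-ID CSV, 'rit_uid' header
#                                 10,                          # batch size
#                                 datasetId='test_dataset',    # destination BigQuery dataset
#                                 startBatch=0)                # resume from a given batch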