def generate_samples(fraction_missing):
  """Creates fake sample CSV data in GCS.

  Args:
    fraction_missing: This many samples which exist as BiobankStoredSamples
        will not have rows generated in the fake CSV.
  """
  bucket_name = config.getSetting(config.BIOBANK_SAMPLES_BUCKET_NAME)
  now = clock.CLOCK.now()
  file_name = '/%s/fake_%s.csv' % (bucket_name, now.strftime(INPUT_CSV_TIME_FORMAT))
  num_rows = 0
  sample_id_start = random.randint(1000000, 10000000)
  with cloudstorage_api.open(file_name, mode='w') as dest:
    writer = csv.writer(dest, delimiter="\t")
    writer.writerow(_HEADERS)

    def sample_row(biobank_id, test, confirmed):
      # One fake sample row; the confirmed time is reused as the created time.
      confirmed_str = confirmed.strftime(_TIME_FORMAT)
      return [sample_id_start + num_rows,
              None,  # no parent
              confirmed_str,
              to_client_biobank_id(biobank_id),
              test,
              confirmed_str,
              'KIT']

    # Rows backed by actual biobank orders.
    biobank_order_dao = BiobankOrderDao()
    with biobank_order_dao.session() as session:
      ordered = biobank_order_dao.get_ordered_samples_sample(
          session, 1 - fraction_missing, _BATCH_SIZE)
      for biobank_id, collected_time, test in ordered:
        if collected_time is None:
          logging.warning('biobank_id=%s test=%s skipped (collected=%s)',
                          biobank_id, test, collected_time)
          continue
        delta = datetime.timedelta(minutes=random.randint(
            0, _MAX_MINUTES_BETWEEN_SAMPLE_COLLECTED_AND_CONFIRMED))
        writer.writerow(sample_row(biobank_id, test, collected_time + delta))
        num_rows += 1

    # Orphaned samples: participants without a matching order.
    participant_dao = ParticipantDao()
    with participant_dao.session() as session:
      orphans = participant_dao.get_biobank_ids_sample(
          session, _PARTICIPANTS_WITH_ORPHAN_SAMPLES, _BATCH_SIZE)
      for biobank_id, sign_up_time in orphans:
        delta = datetime.timedelta(minutes=random.randint(
            0, _MAX_MINUTES_BETWEEN_PARTICIPANT_CREATED_AND_CONFIRMED))
        confirmed = sign_up_time + delta
        for test in random.sample(BIOBANK_TESTS,
                                  random.randint(1, len(BIOBANK_TESTS))):
          writer.writerow(sample_row(biobank_id, test, confirmed))
          num_rows += 1
  logging.info("Generated %d samples in %s.", num_rows, file_name)
def testDeidentifiedExport_participantIds(self):
  """Deidentified export must replace participant IDs with obfuscated IDs."""
  TableExporter.export_tables('rdr', ['ppi_participant_view'], 'dir', deidentify=True)
  p1 = self._participant_with_defaults(
      participantId=1, version=2, biobankId=2,
      providerLink=make_primary_provider_link_for_name('PITT'))
  ParticipantDao().insert(p1)
  p2 = self._participant_with_defaults(
      participantId=2, version=3, biobankId=3,
      providerLink=make_primary_provider_link_for_name('PITT'))
  ParticipantDao().insert(p2)
  # The export runs as exactly one deferred task; execute it synchronously.
  tasks = self.taskqueue_stub.get_filtered_tasks()
  self.assertEqual(len(tasks), 1)
  csv_path = deferred.run(tasks[0].payload)
  with cloudstorage_api.open('/' + csv_path, mode='r') as output:
    reader = csv.reader(output)
    rows = list(reader)[1:]  # drop the header row
    self.assertEqual(2, len(rows))
    pmi_ids = set([p1.participantId, p2.participantId])
    obf_ids = set([row[0] for row in rows])
    self.assertFalse(pmi_ids.intersection(obf_ids),
                     'should be no overlap between pmi_ids and obfuscated IDs')
    # assertEquals is a deprecated unittest alias; use assertEqual.
    self.assertEqual(2, len(obf_ids))
def test_file_exists(self):
  """_raise_if_gcloud_file_missing raises only until the file is written."""
  pdf_path = '/%s/Participant/somefile.pdf' % _FAKE_BUCKET
  # Before the upload the check must raise BadRequest.
  with self.assertRaises(BadRequest):
    _raise_if_gcloud_file_missing(pdf_path)
  # After writing the object, the same check should pass silently.
  with cloudstorage_api.open(pdf_path, mode='w') as cloud_file:
    cloud_file.write('I am a fake PDF in a fake Cloud.')
  _raise_if_gcloud_file_missing(pdf_path)
def process_genotyping_manifest_file_from_bucket(bucket_name, genotyping_folder_name):
  """Finds the newest genotyping manifest CSV in the folder and processes it.

  Returns None without processing when the bucket is empty, contains no
  matching CSVs, or the newest matching CSV is older than _MAX_INPUT_AGE.
  """
  bucket_stat_list = cloudstorage_api.listbucket('/' + bucket_name)
  if not bucket_stat_list:
    logging.info('No files in cloud bucket %r.' % bucket_name)
    return None
  bucket_stat_list = [
      s for s in bucket_stat_list
      if s.filename.lower().endswith('.csv')
      and '%s' % genotyping_folder_name in s.filename]
  if not bucket_stat_list:
    logging.info('No CSVs in cloud bucket %r folder %r (all files: %s).' %
                 (bucket_name, genotyping_folder_name, bucket_stat_list))
    return None
  bucket_stat_list.sort(key=lambda s: s.st_ctime)
  path = bucket_stat_list[-1].filename
  timestamp = datetime.datetime.utcfromtimestamp(bucket_stat_list[-1].st_ctime)
  now = clock.CLOCK.now()
  # Check freshness BEFORE opening the stream. The original opened the file
  # first and then returned without closing it when the input was stale,
  # leaking the handle (and logging "Opening..." for a file never processed).
  if now - timestamp > _MAX_INPUT_AGE:
    logging.info(
        'Input %r (timestamp %s UTC) is > 24h old (relative to %s UTC), not processing.'
        % (path, timestamp, now))
    return None
  csv_file = cloudstorage_api.open(path)
  logging.info('Opening latest genotyping manifest CSV in %r: %r.',
               bucket_name + '/' + genotyping_folder_name, path)
  update_sample_info_from_genotyping_manifest_file(csv_file)
def open_writer(self, file_name, predicate=None):
  """Generator yielding a SqlExportFileWriter bound to a new GCS export file.

  The yield happens inside the open file's context, so the stream is closed
  once the caller's block finishes.
  """
  destination = '/%s/%s' % (self._bucket_name, file_name)
  logging.info('Exporting data to %s...', destination)
  with cloudstorage_api.open(destination, mode='w') as out_stream:
    yield SqlExportFileWriter(out_stream, predicate, use_unicode=self._use_unicode)
  logging.info('Export to %s complete.', destination)
def _open_latest_genomic_set_file(cloud_bucket_name):
  """Returns (open stream, bare filename) for the newest genomic-set CSV in the bucket."""
  path = _find_latest_genomic_set_csv(cloud_bucket_name)
  bucket_prefix = '/' + cloud_bucket_name + '/'
  filename = path.replace(bucket_prefix, '')
  logging.info('Opening latest samples CSV in %r: %r.', cloud_bucket_name, path)
  return cloudstorage_api.open(path), filename
def process_genomic_manifest_result_file_from_bucket():
  """Finds the newest genomic manifest result CSV and updates package IDs from it.

  Returns None without processing when the bucket is empty, no matching CSV
  exists, or the newest matching CSV is older than _MAX_INPUT_AGE.
  """
  bucket_name = config.getSetting(config.BIOBANK_SAMPLES_BUCKET_NAME)
  result_folder_name = config.getSetting(GENOMIC_BIOBANK_MANIFEST_RESULT_FOLDER_NAME)
  bucket_stat_list = cloudstorage_api.listbucket('/' + bucket_name)
  if not bucket_stat_list:
    logging.info('No files in cloud bucket %r.' % bucket_name)
    return None
  bucket_stat_list = [s for s in bucket_stat_list
                      if s.filename.lower().endswith('.csv')
                      and '%s' % result_folder_name in s.filename]
  if not bucket_stat_list:
    logging.info(
        'No CSVs in cloud bucket %r folder %r (all files: %s).' %
        (bucket_name, result_folder_name, bucket_stat_list))
    return None
  bucket_stat_list.sort(key=lambda s: s.st_ctime)
  path = bucket_stat_list[-1].filename
  filename = path.replace('/' + bucket_name + '/' + result_folder_name + '/', '')
  timestamp = timestamp_from_filename(filename)
  now = clock.CLOCK.now()
  # Check freshness BEFORE opening the stream: the original opened the file
  # first and leaked the handle when the input was stale. Also removed a
  # stray debug print() that duplicated this log line.
  if now - timestamp > _MAX_INPUT_AGE:
    logging.info('Input %r (timestamp %s UTC) is > 24h old (relative to %s UTC), not processing.'
                 % (filename, timestamp, now))
    return None
  csv_file = cloudstorage_api.open(path)
  logging.info('Opening latest genomic manifest result CSV in %r: %r.',
               bucket_name + '/' + result_folder_name, path)
  genomic_set_id = _get_genomic_set_id_from_filename(filename)
  update_package_id_from_manifest_result_file(genomic_set_id, csv_file)
def uploadFile(self, bucket, memeImage): # Upload to Cloud Storage uniqueIdString = str(uuid.uuid4()) uniqueIdString = uniqueIdString.replace('-', '') filename = bucket + '/meme-' + uniqueIdString + '.png' url = CLOUD_STORAGE_URL + filename write_retry_params = api_utils.RetryParams(backoff_factor=1.1) cloud_file = cloudstorage_api.open(filename, 'w', content_type='image/png', options={'x-goog-acl': 'public-read'}, retry_params=write_retry_params) output = StringIO.StringIO() memeImage.save(output, format="PNG") contents = output.getvalue() output.close() try: cloud_file.write(contents) cloud_file.close() except Exception, e: logging.error(e) return Meme(image_url="", text="")
def _write_cloud_csv(self, file_name, contents_str, bucket=None, folder=None):
  """Writes contents_str (UTF-8 encoded) to a fake-GCS path.

  Defaults to _FAKE_BUCKET; an optional folder is inserted between bucket
  and file name.
  """
  target_bucket = bucket if bucket is not None else _FAKE_BUCKET
  if folder is None:
    segments = [target_bucket, file_name]
  else:
    segments = [target_bucket, folder, file_name]
  path = '/' + '/'.join(segments)
  with cloudstorage_api.open(path, mode='w') as cloud_file:
    cloud_file.write(contents_str.encode('utf-8'))
def _get_participant_ids_from_person_file(person_file):
  """Reads the specified CSV file from cloud storage and returns a list of the
  first column integers.

  Rows with an empty or non-numeric first column are skipped.
  """
  def parse_pid(row):
    # A csv row is a list: an empty row raises IndexError and a non-numeric
    # value raises ValueError. The original caught KeyError, which list
    # indexing never raises, so a blank line crashed the whole import.
    try:
      return int(row[0])
    except (IndexError, ValueError):
      pass

  with cloudstorage_api.open(person_file) as gcs_file:
    return filter(bool, map(parse_pid, csv.reader(gcs_file)))
def _rewrite_tmpfile(self, mainfile, tmpfile, writer_spec):
  """Copies contents of tmpfile (name) to mainfile (buffer)."""
  if mainfile.closed:
    # can happen when finalize fails
    return
  account_id = self._get_tmp_account_id(writer_spec)
  src = cloudstorage_api.open(tmpfile, _account_id=account_id)
  # both reads and writes are buffered - the number here doesn't matter
  while True:
    chunk = src.read(self._REWRITE_BLOCK_SIZE)
    if not chunk:
      break
    mainfile.write(chunk)
  src.close()
  mainfile.flush()
def _open_latest_samples_file(cloud_bucket_name):
  """Returns an open stream for the most recently created CSV in the given bucket."""
  latest_path = _find_latest_samples_csv(cloud_bucket_name)
  logging.info('Opening latest samples CSV in %r: %r.', cloud_bucket_name, latest_path)
  return cloudstorage_api.open(latest_path), latest_path
def test_create_and_upload_biobank_manifest_file(self):
  """Manifest upload writes one CSV row per genomic set member, in order."""
  # (participant/biobank id, order identifier, validation status, sex, genome type, NY flag)
  member_specs = [
      (123, u'12345678', GenomicValidationStatus.VALID, 'F', 'aou_array', 'Y'),
      (124, u'12345679', GenomicValidationStatus.INVALID_AGE, 'M', 'aou_array', 'N'),
      (125, u'12345680', GenomicValidationStatus.INVALID_CONSENT, 'F', 'aou_wgs', 'Y'),
  ]
  inserted = []
  for pid, identifier_value, _status, _sex, _genome, _ny in member_specs:
    participant = self.participant_dao.insert(
        Participant(participantId=pid, biobankId=pid))
    self.summary_dao.insert(self.participant_summary(participant))
    order = self._make_biobank_order(
        participantId=participant.participantId, biobankOrderId=str(pid),
        identifiers=[BiobankOrderIdentifier(
            system=u'https://www.pmi-ops.org', value=identifier_value)])
    BiobankOrderDao().insert(order)
    inserted.append((participant, order))
  genomic_set = self._create_fake_genomic_set(
      'fake_genomic_set_name', 'fake_genomic_set_criteria',
      'Genomic-Test-Set-v12019-04-05-00-30-10.CSV')
  for (participant, order), spec in zip(inserted, member_specs):
    _pid, _value, status, sex, genome_type, ny = spec
    self._create_fake_genomic_member(
        genomic_set.id, participant.participantId, order.biobankOrderId,
        participant.biobankId, order.identifiers[0].value,
        validation_status=status, sex_at_birth=sex,
        genome_type=genome_type, ny_flag=ny)

  now = clock.CLOCK.now()
  genomic_biobank_menifest_handler\
      .create_and_upload_genomic_biobank_manifest_file(genomic_set.id, now)
  bucket_name = config.getSetting(config.BIOBANK_SAMPLES_BUCKET_NAME)
  # convert UTC to CDT
  now_cdt_str = _UTC.localize(now).astimezone(_US_CENTRAL).replace(tzinfo=None) \
      .strftime(_OUTPUT_CSV_TIME_FORMAT)

  class ExpectedCsvColumns(object):
    VALUE = 'value'
    BIOBANK_ID = 'biobank_id'
    SEX_AT_BIRTH = 'sex_at_birth'
    GENOME_TYPE = 'genome_type'
    NY_FLAG = 'ny_flag'
    REQUEST_ID = 'request_id'
    PACKAGE_ID = 'package_id'
    ALL = (VALUE, SEX_AT_BIRTH, GENOME_TYPE, NY_FLAG, REQUEST_ID, PACKAGE_ID)

  expected_result_filename = ('rdr_fake_sub_folder/Genomic-Manifest-AoU-1-v1' +
                              now_cdt_str + '.CSV')
  path = '/' + bucket_name + '/' + expected_result_filename
  csv_file = cloudstorage_api.open(path)
  csv_reader = csv.DictReader(csv_file, delimiter=',')
  missing_cols = set(ExpectedCsvColumns.ALL) - set(csv_reader.fieldnames)
  self.assertEqual(len(missing_cols), 0)
  rows = list(csv_reader)
  expected_rows = [
      ('12345678', '123', 'F', 'aou_array', 'Y'),
      ('12345679', '124', 'M', 'aou_array', 'N'),
      ('12345680', '125', 'F', 'aou_wgs', 'Y'),
  ]
  self.assertEqual(len(rows), len(expected_rows))
  for row, (value, biobank_id, sex, genome_type, ny) in zip(rows, expected_rows):
    self.assertEqual(row[ExpectedCsvColumns.VALUE], value)
    self.assertEqual(row[ExpectedCsvColumns.BIOBANK_ID], biobank_id)
    self.assertEqual(row[ExpectedCsvColumns.SEX_AT_BIRTH], sex)
    self.assertEqual(row[ExpectedCsvColumns.GENOME_TYPE], genome_type)
    self.assertEqual(row[ExpectedCsvColumns.NY_FLAG], ny)
def _write_cloud_csv(self, file_name, contents_str):
  """Writes contents_str (UTF-8 encoded) to file_name in the fake test bucket."""
  cloud_path = '/%s/%s' % (_FAKE_BUCKET, file_name)
  with cloudstorage_api.open(cloud_path, mode='w') as cloud_file:
    cloud_file.write(contents_str.encode('utf-8'))
def _read_csv_from_gcs(self, bucket_name, file_name):
  """Reads a GCS CSV and returns its rows as dicts keyed by the header row."""
  gcs_path = '/%s/%s' % (bucket_name, file_name)
  with cloudstorage_api.open(gcs_path, mode='r') as infile:
    reader = csv.DictReader(infile)
    return list(reader)
def test_end_to_end_valid_case(self):
  """Full water-line run: ingest a set file, validate, emit manifest + result files."""
  # Three participants, each with a summary and one biobank order.
  for identifier_value in (u'12345678', u'12345679', u'12345680'):
    participant = self._make_participant()
    self._make_summary(participant)
    self._make_biobank_order(
        participantId=participant.participantId,
        biobankOrderId=participant.participantId,
        identifiers=[BiobankOrderIdentifier(
            system=u'https://www.pmi-ops.org', value=identifier_value)])

  samples_file = test_data.open_genomic_set_file('Genomic-Test-Set-test-2.csv')
  input_filename = 'Genomic-Test-Set-v1%s.csv' % (
      self._naive_utc_to_naive_central(clock.CLOCK.now())
      .strftime(genomic_set_file_handler.INPUT_CSV_TIME_FORMAT))
  self._write_cloud_csv(input_filename, samples_file)

  manifest_result_file = test_data.open_genomic_set_file(
      'Genomic-Manifest-Result-test.csv')
  manifest_result_filename = 'Genomic-Manifest-Result-AoU-1-v1%s.csv' % (
      self._naive_utc_to_naive_central(clock.CLOCK.now())
      .strftime(genomic_set_file_handler.INPUT_CSV_TIME_FORMAT))
  self._write_cloud_csv(manifest_result_filename, manifest_result_file,
                        bucket=_FAKE_BIOBANK_SAMPLE_BUCKET,
                        folder=_FAKE_BUCKET_RESULT_FOLDER)

  genomic_pipeline.process_genomic_water_line()

  # verify result file
  bucket_name = config.getSetting(config.GENOMIC_SET_BUCKET_NAME)
  path = self._find_latest_genomic_set_csv(bucket_name, 'Validation-Result')
  csv_file = cloudstorage_api.open(path)
  csv_reader = csv.DictReader(csv_file, delimiter=',')

  class ResultCsvColumns(object):
    """Names of CSV columns that we read from the genomic set upload."""
    GENOMIC_SET_NAME = 'genomic_set_name'
    GENOMIC_SET_CRITERIA = 'genomic_set_criteria'
    PID = 'pid'
    BIOBANK_ORDER_ID = 'biobank_order_id'
    NY_FLAG = 'ny_flag'
    SEX_AT_BIRTH = 'sex_at_birth'
    GENOME_TYPE = 'genome_type'
    STATUS = 'status'
    INVALID_REASON = 'invalid_reason'
    ALL = (GENOMIC_SET_NAME, GENOMIC_SET_CRITERIA, PID, BIOBANK_ORDER_ID,
           NY_FLAG, SEX_AT_BIRTH, GENOME_TYPE, STATUS, INVALID_REASON)

  self.assertEqual(len(set(ResultCsvColumns.ALL) - set(csv_reader.fieldnames)), 0)
  rows = list(csv_reader)
  self.assertEqual(len(rows), 3)
  # (pid, biobank_order_id, ny_flag, genome_type, sex_at_birth)
  expected_results = [
      ('1', '1', 'Y', 'aou_wgs', 'M'),
      ('2', '2', 'N', 'aou_array', 'F'),
      ('3', '3', 'N', 'aou_array', 'M'),
  ]
  for row, (pid, order_id, ny, genome_type, sex) in zip(rows, expected_results):
    self.assertEqual(row[ResultCsvColumns.GENOMIC_SET_NAME], 'name_xxx')
    self.assertEqual(row[ResultCsvColumns.GENOMIC_SET_CRITERIA], 'criteria_xxx')
    self.assertEqual(row[ResultCsvColumns.STATUS], 'valid')
    self.assertEqual(row[ResultCsvColumns.INVALID_REASON], '')
    self.assertEqual(row[ResultCsvColumns.PID], pid)
    self.assertEqual(row[ResultCsvColumns.BIOBANK_ORDER_ID], order_id)
    self.assertEqual(row[ResultCsvColumns.NY_FLAG], ny)
    self.assertEqual(row[ResultCsvColumns.GENOME_TYPE], genome_type)
    self.assertEqual(row[ResultCsvColumns.SEX_AT_BIRTH], sex)

  # verify manifest files
  bucket_name = config.getSetting(config.BIOBANK_SAMPLES_BUCKET_NAME)

  class ExpectedCsvColumns(object):
    VALUE = 'value'
    BIOBANK_ID = 'biobank_id'
    SEX_AT_BIRTH = 'sex_at_birth'
    GENOME_TYPE = 'genome_type'
    NY_FLAG = 'ny_flag'
    REQUEST_ID = 'request_id'
    PACKAGE_ID = 'package_id'
    ALL = (VALUE, SEX_AT_BIRTH, GENOME_TYPE, NY_FLAG, REQUEST_ID, PACKAGE_ID)

  path = self._find_latest_genomic_set_csv(bucket_name, 'Manifest')
  csv_file = cloudstorage_api.open(path)
  csv_reader = csv.DictReader(csv_file, delimiter=',')
  self.assertEqual(len(set(ExpectedCsvColumns.ALL) - set(csv_reader.fieldnames)), 0)
  rows = list(csv_reader)
  # (value, biobank_id, sex_at_birth, genome_type, ny_flag)
  expected_manifest = [
      ('12345678', '1', 'M', 'aou_wgs', 'Y'),
      ('12345679', '2', 'F', 'aou_array', 'N'),
      ('12345680', '3', 'M', 'aou_array', 'N'),
  ]
  for row, (value, biobank_id, sex, genome_type, ny) in zip(rows, expected_manifest):
    self.assertEqual(row[ExpectedCsvColumns.VALUE], value)
    self.assertEqual(row[ExpectedCsvColumns.BIOBANK_ID], biobank_id)
    self.assertEqual(row[ExpectedCsvColumns.SEX_AT_BIRTH], sex)
    self.assertEqual(row[ExpectedCsvColumns.GENOME_TYPE], genome_type)
    self.assertEqual(row[ExpectedCsvColumns.NY_FLAG], ny)

  # verify manifest result files
  path = self._find_latest_genomic_set_csv(bucket_name, 'Manifest-Result')
  csv_file = cloudstorage_api.open(path)
  csv_reader = csv.DictReader(csv_file, delimiter=',')
  self.assertEqual(len(set(ExpectedCsvColumns.ALL) - set(csv_reader.fieldnames)), 0)
  rows = list(csv_reader)
  expected_package_ids = ('PKG-XXXX-XXXX1', 'PKG-XXXX-XXXX2', 'PKG-XXXX-XXXX3')
  for row, spec, package_id in zip(rows, expected_manifest, expected_package_ids):
    value, biobank_id, sex, genome_type, ny = spec
    self.assertEqual(row[ExpectedCsvColumns.VALUE], value)
    self.assertEqual(row[ExpectedCsvColumns.BIOBANK_ID], biobank_id)
    self.assertEqual(row[ExpectedCsvColumns.SEX_AT_BIRTH], sex)
    self.assertEqual(row[ExpectedCsvColumns.GENOME_TYPE], genome_type)
    self.assertEqual(row[ExpectedCsvColumns.NY_FLAG], ny)
    self.assertEqual(row[ExpectedCsvColumns.PACKAGE_ID], package_id)

  # verify package id in database
  members = GenomicSetMemberDao().get_all()
  for member in members:
    self.assertIn(member.packageId, expected_package_ids)
def assertCsvContents(test, bucket_name, file_name, contents):
  """Asserts the CSV at /bucket_name/file_name holds exactly `contents`.

  Row order is ignored: both sides are sorted before comparison.

  Args:
    test: the TestCase providing the assertion methods.
    bucket_name: GCS bucket containing the CSV.
    file_name: object name within the bucket.
    contents: expected rows (list of lists of strings).
  """
  with cloudstorage_api.open('/%s/%s' % (bucket_name, file_name), mode='r') as output:
    reader = csv.reader(output)
    rows = sorted(reader)
  # assertEquals is a deprecated unittest alias; use assertEqual.
  test.assertEqual(sorted(contents), rows)
def test_end_to_end_invalid_case(self):
  """Each of four participants fails validation for a distinct reason."""
  participant = self._make_participant()
  self._make_summary(participant, dateOfBirth='2018-02-14')  # -> INVALID_AGE
  self._make_biobank_order(
      participantId=participant.participantId,
      biobankOrderId=participant.participantId,
      identifiers=[BiobankOrderIdentifier(
          system=u'https://www.pmi-ops.org', value=u'12345678')])

  participant2 = self._make_participant()
  self._make_summary(  # stale consent -> INVALID_CONSENT
      participant2, consentForStudyEnrollmentTime=datetime.datetime(1990, 1, 1))
  self._make_biobank_order(
      participantId=participant2.participantId,
      biobankOrderId=participant2.participantId,
      identifiers=[BiobankOrderIdentifier(
          system=u'https://www.pmi-ops.org', value=u'12345679')])

  participant3 = self._make_participant()
  self._make_summary(participant3, zipCode='')  # -> INVALID_NY_ZIPCODE
  self._make_biobank_order(
      participantId=participant3.participantId,
      biobankOrderId=participant3.participantId,
      identifiers=[BiobankOrderIdentifier(
          system=u'https://www.pmi-ops.org', value=u'12345680')])

  participant4 = self._make_participant()
  self._make_summary(participant4)
  # Wrong identifier system -> INVALID_BIOBANK_ORDER_CLIENT_ID.
  self._make_biobank_order(
      participantId=participant4.participantId,
      biobankOrderId=participant4.participantId,
      identifiers=[BiobankOrderIdentifier(system=u'c', value=u'e')])

  samples_file = test_data.open_genomic_set_file('Genomic-Test-Set-test-3.csv')
  input_filename = 'Genomic-Test-Set-v1%s.csv' % (
      self._naive_utc_to_naive_central(clock.CLOCK.now())
      .strftime(genomic_set_file_handler.INPUT_CSV_TIME_FORMAT))
  self._write_cloud_csv(input_filename, samples_file)

  genomic_pipeline.process_genomic_water_line()

  # verify result file
  bucket_name = config.getSetting(config.GENOMIC_SET_BUCKET_NAME)
  path = self._find_latest_genomic_set_csv(bucket_name, 'Validation-Result')
  csv_file = cloudstorage_api.open(path)
  csv_reader = csv.DictReader(csv_file, delimiter=',')

  class ResultCsvColumns(object):
    """Names of CSV columns that we read from the genomic set upload."""
    GENOMIC_SET_NAME = 'genomic_set_name'
    GENOMIC_SET_CRITERIA = 'genomic_set_criteria'
    PID = 'pid'
    BIOBANK_ORDER_ID = 'biobank_order_id'
    NY_FLAG = 'ny_flag'
    SEX_AT_BIRTH = 'sex_at_birth'
    GENOME_TYPE = 'genome_type'
    STATUS = 'status'
    INVALID_REASON = 'invalid_reason'
    ALL = (GENOMIC_SET_NAME, GENOMIC_SET_CRITERIA, PID, BIOBANK_ORDER_ID,
           NY_FLAG, SEX_AT_BIRTH, GENOME_TYPE, STATUS, INVALID_REASON)

  self.assertEqual(len(set(ResultCsvColumns.ALL) - set(csv_reader.fieldnames)), 0)
  rows = list(csv_reader)
  self.assertEqual(len(rows), 4)
  # (invalid_reason, pid, biobank_order_id, ny_flag, genome_type, sex_at_birth)
  expected = [
      ('INVALID_AGE', '1', '1', 'Y', 'aou_wgs', 'M'),
      ('INVALID_CONSENT', '2', '2', 'N', 'aou_array', 'F'),
      ('INVALID_NY_ZIPCODE', '3', '3', 'N', 'aou_array', 'M'),
      ('INVALID_BIOBANK_ORDER_CLIENT_ID', '4', '4', 'Y', 'aou_wgs', 'F'),
  ]
  for row, (reason, pid, order_id, ny, genome_type, sex) in zip(rows, expected):
    self.assertEqual(row[ResultCsvColumns.GENOMIC_SET_NAME], 'name_xxx')
    self.assertEqual(row[ResultCsvColumns.GENOMIC_SET_CRITERIA], 'criteria_xxx')
    self.assertEqual(row[ResultCsvColumns.STATUS], 'invalid')
    self.assertEqual(row[ResultCsvColumns.INVALID_REASON], reason)
    self.assertEqual(row[ResultCsvColumns.PID], pid)
    self.assertEqual(row[ResultCsvColumns.BIOBANK_ORDER_ID], order_id)
    self.assertEqual(row[ResultCsvColumns.NY_FLAG], ny)
    self.assertEqual(row[ResultCsvColumns.GENOME_TYPE], genome_type)
    self.assertEqual(row[ResultCsvColumns.SEX_AT_BIRTH], sex)
def test_create_genomic_set_result_file(self):
  """Status result file lists every member with its validation outcome."""
  # (participant/biobank id, order identifier, validation status, sex, genome type, NY flag)
  member_specs = [
      (123, u'12345678', GenomicValidationStatus.VALID, 'F', 'aou_array', 'Y'),
      (124, u'12345679', GenomicValidationStatus.INVALID_AGE, 'M', 'aou_array', 'N'),
      (125, u'12345680', GenomicValidationStatus.INVALID_CONSENT, 'F', 'aou_wgs', 'Y'),
  ]
  inserted = []
  for pid, identifier_value, _status, _sex, _genome, _ny in member_specs:
    participant = self.participant_dao.insert(
        Participant(participantId=pid, biobankId=pid))
    self.summary_dao.insert(self.participant_summary(participant))
    order = self._make_biobank_order(
        participantId=participant.participantId, biobankOrderId=str(pid),
        identifiers=[BiobankOrderIdentifier(
            system=u'https://www.pmi-ops.org', value=identifier_value)])
    BiobankOrderDao().insert(order)
    inserted.append((participant, order))
  genomic_set = self._create_fake_genomic_set(
      'fake_genomic_set_name', 'fake_genomic_set_criteria',
      'Genomic-Test-Set-v12019-04-05-00-30-10.CSV')
  for (participant, order), spec in zip(inserted, member_specs):
    _pid, _value, status, sex, genome_type, ny = spec
    self._create_fake_genomic_member(
        genomic_set.id, participant.participantId, order.biobankOrderId,
        participant.biobankId, order.identifiers[0].value,
        validation_status=status, sex_at_birth=sex,
        genome_type=genome_type, ny_flag=ny)

  genomic_set_file_handler.create_genomic_set_status_result_file(genomic_set.id)

  expected_result_filename = (
      'Genomic-Test-Set-v12019-04-05-00-30-10-Validation-Result.CSV')
  bucket_name = config.getSetting(config.GENOMIC_SET_BUCKET_NAME)
  path = '/' + bucket_name + '/' + expected_result_filename
  csv_file = cloudstorage_api.open(path)
  csv_reader = csv.DictReader(csv_file, delimiter=',')

  class ResultCsvColumns(object):
    """Names of CSV columns that we read from the genomic set upload."""
    GENOMIC_SET_NAME = 'genomic_set_name'
    GENOMIC_SET_CRITERIA = 'genomic_set_criteria'
    PID = 'pid'
    BIOBANK_ORDER_ID = 'biobank_order_id'
    NY_FLAG = 'ny_flag'
    SEX_AT_BIRTH = 'sex_at_birth'
    GENOME_TYPE = 'genome_type'
    STATUS = 'status'
    INVALID_REASON = 'invalid_reason'
    ALL = (GENOMIC_SET_NAME, GENOMIC_SET_CRITERIA, PID, BIOBANK_ORDER_ID,
           NY_FLAG, SEX_AT_BIRTH, GENOME_TYPE, STATUS, INVALID_REASON)

  self.assertEqual(len(set(ResultCsvColumns.ALL) - set(csv_reader.fieldnames)), 0)
  rows = list(csv_reader)
  self.assertEqual(len(rows), 3)
  # (status, invalid_reason, pid, biobank_order_id, ny_flag, genome_type, sex)
  expected = [
      ('valid', '', '123', '123', 'Y', 'aou_array', 'F'),
      ('invalid', 'INVALID_AGE', '124', '124', 'N', 'aou_array', 'M'),
      ('invalid', 'INVALID_CONSENT', '125', '125', 'Y', 'aou_wgs', 'F'),
  ]
  for row, (status, reason, pid, order_id, ny, genome_type, sex) in zip(rows, expected):
    self.assertEqual(row[ResultCsvColumns.GENOMIC_SET_NAME], 'fake_genomic_set_name')
    self.assertEqual(row[ResultCsvColumns.GENOMIC_SET_CRITERIA],
                     'fake_genomic_set_criteria')
    self.assertEqual(row[ResultCsvColumns.STATUS], status)
    self.assertEqual(row[ResultCsvColumns.INVALID_REASON], reason)
    self.assertEqual(row[ResultCsvColumns.PID], pid)
    self.assertEqual(row[ResultCsvColumns.BIOBANK_ORDER_ID], order_id)
    self.assertEqual(row[ResultCsvColumns.NY_FLAG], ny)
    self.assertEqual(row[ResultCsvColumns.GENOME_TYPE], genome_type)
    self.assertEqual(row[ResultCsvColumns.SEX_AT_BIRTH], sex)