def generate_samples(fraction_missing):
  """Creates fake sample CSV data in GCS.

  Args:
    fraction_missing: This many samples which exist as BiobankStoredSamples will not have rows
        generated in the fake CSV.
  """
  bucket_name = config.getSetting(config.BIOBANK_SAMPLES_BUCKET_NAME)
  now = clock.CLOCK.now()
  file_name = '/%s/fake_%s.csv' % (bucket_name, now.strftime(INPUT_CSV_TIME_FORMAT))
  num_rows = 0
  sample_id_start = random.randint(1000000, 10000000)
  with cloudstorage_api.open(file_name, mode='w') as dest:
    writer = csv.writer(dest, delimiter='\t')
    writer.writerow(_HEADERS)
    biobank_order_dao = BiobankOrderDao()
    with biobank_order_dao.session() as session:
      rows = biobank_order_dao.get_ordered_samples_sample(
          session, 1 - fraction_missing, _BATCH_SIZE)
      for biobank_id, collected_time, test in rows:
        if collected_time is None:
          logging.warning(
              'biobank_id=%s test=%s skipped (collected=%s)', biobank_id, test, collected_time)
          continue
        minutes_delta = random.randint(0, _MAX_MINUTES_BETWEEN_SAMPLE_COLLECTED_AND_CONFIRMED)
        confirmed_time = collected_time + datetime.timedelta(minutes=minutes_delta)
        writer.writerow([
            sample_id_start + num_rows,
            None,  # no parent
            confirmed_time.strftime(_TIME_FORMAT),
            to_client_biobank_id(biobank_id),
            test,
            confirmed_time.strftime(_TIME_FORMAT),
            'KIT'])  # reuse confirmed time as created time
        num_rows += 1
    participant_dao = ParticipantDao()
    with participant_dao.session() as session:
      rows = participant_dao.get_biobank_ids_sample(
          session, _PARTICIPANTS_WITH_ORPHAN_SAMPLES, _BATCH_SIZE)
      for biobank_id, sign_up_time in rows:
        minutes_delta = random.randint(0, _MAX_MINUTES_BETWEEN_PARTICIPANT_CREATED_AND_CONFIRMED)
        confirmed_time = sign_up_time + datetime.timedelta(minutes=minutes_delta)
        tests = random.sample(BIOBANK_TESTS, random.randint(1, len(BIOBANK_TESTS)))
        for test in tests:
          writer.writerow([
              sample_id_start + num_rows,
              None,
              confirmed_time.strftime(_TIME_FORMAT),
              to_client_biobank_id(biobank_id),
              test,
              confirmed_time.strftime(_TIME_FORMAT),
              'KIT'])
          num_rows += 1
  logging.info("Generated %d samples in %s.", num_rows, file_name)
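
# Illustrative usage only (not part of the generator module): `fraction_missing`
# is the share of ordered samples deliberately left out of the fake CSV, so the
# arbitrary example value 0.1 below would write rows for roughly 90% of ordered
# samples, plus orphan samples for a small set of participants, into a
# tab-delimited fake_<timestamp>.csv in the configured biobank samples bucket.
if __name__ == '__main__':
  generate_samples(0.1)
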
class ParticipantDaoTest(SqlTestBase):
  def setUp(self):
    super(ParticipantDaoTest, self).setUp()
    self.dao = ParticipantDao()
    self.participant_summary_dao = ParticipantSummaryDao()
    self.participant_history_dao = ParticipantHistoryDao()

  def test_get_before_insert(self):
    self.assertIsNone(self.dao.get(1))
    self.assertIsNone(self.participant_summary_dao.get(1))
    self.assertIsNone(self.participant_history_dao.get([1, 1]))

  def test_insert(self):
    p = Participant()
    time = datetime.datetime(2016, 1, 1)
    with random_ids([1, 2]):
      with FakeClock(time):
        self.dao.insert(p)
    expected_participant = self._participant_with_defaults(
        participantId=1, version=1, biobankId=2, lastModified=time, signUpTime=time)
    self.assertEquals(expected_participant.asdict(), p.asdict())

    p2 = self.dao.get(1)
    self.assertEquals(p.asdict(), p2.asdict())

    # Creating a participant also creates a ParticipantHistory row, but not a
    # ParticipantSummary row.
    ps = self.participant_summary_dao.get(1)
    self.assertIsNone(ps)
    ph = self.participant_history_dao.get([1, 1])
    expected_ph = self._participant_history_with_defaults(
        participantId=1, biobankId=2, lastModified=time, signUpTime=time)
    self.assertEquals(expected_ph.asdict(), ph.asdict())

  def test_insert_with_external_id(self):
    p = Participant(externalId=3)
    time = datetime.datetime(2016, 1, 1)
    with random_ids([1, 2]):
      with FakeClock(time):
        self.dao.insert(p)
    expected_participant = self._participant_with_defaults(
        participantId=1, externalId=3, version=1, biobankId=2, lastModified=time,
        signUpTime=time)
    self.assertEquals(expected_participant.asdict(), p.asdict())

    p2 = self.dao.get(1)
    self.assertEquals(p.asdict(), p2.asdict())

    # Creating a participant also creates a ParticipantHistory row, but not a
    # ParticipantSummary row.
    ps = self.participant_summary_dao.get(1)
    self.assertIsNone(ps)
    ph = self.participant_history_dao.get([1, 1])
    expected_ph = self._participant_history_with_defaults(
        participantId=1, externalId=3, biobankId=2, lastModified=time, signUpTime=time)
    self.assertEquals(expected_ph.asdict(), ph.asdict())

  def test_insert_duplicate_participant_id_retry(self):
    p = Participant()
    with random_ids([1, 2]):
      self.dao.insert(p)
    p2 = Participant()
    time = datetime.datetime(2016, 1, 1)
    with random_ids([1, 3, 2, 3]):
      with FakeClock(time):
        p2 = self.dao.insert(p2)
    expected_participant = self._participant_with_defaults(
        participantId=2, version=1, biobankId=3, lastModified=time, signUpTime=time)
    self.assertEquals(expected_participant.asdict(), p2.asdict())

  def test_insert_duplicate_participant_id_give_up(self):
    p = Participant()
    with random_ids([1, 2]):
      self.dao.insert(p)
    rand_ints = []
    for i in range(0, MAX_INSERT_ATTEMPTS):
      rand_ints.append(1)
      rand_ints.append(i)
    p2 = Participant()
    with random_ids(rand_ints):
      with self.assertRaises(ServiceUnavailable):
        self.dao.insert(p2)

  def test_insert_duplicate_biobank_id_give_up(self):
    p = Participant()
    with random_ids([1, 2]):
      self.dao.insert(p)
    rand_ints = []
    for i in range(0, MAX_INSERT_ATTEMPTS):
      rand_ints.append(i + 2)
      rand_ints.append(2)
    p2 = Participant()
    with random_ids(rand_ints):
      with self.assertRaises(ServiceUnavailable):
        self.dao.insert(p2)

  def test_update_no_expected_version_no_ps(self):
    p = Participant()
    time = datetime.datetime(2016, 1, 1)
    with random_ids([1, 2]):
      with FakeClock(time):
        self.dao.insert(p)
    p.providerLink = make_primary_provider_link_for_name('PITT')
    time2 = datetime.datetime(2016, 1, 2)
    with FakeClock(time2):
      self.dao.update(p)

    # lastModified, hpoId, and version are updated on p after being passed in.
    p2 = self.dao.get(1)
    expected_participant = self._participant_with_defaults(
        participantId=1, version=2, biobankId=2, lastModified=time2, signUpTime=time,
        hpoId=PITT_HPO_ID, providerLink=p2.providerLink)
    self.assertEquals(expected_participant.asdict(), p2.asdict())
    self.assertEquals(p.asdict(), p2.asdict())

    ps = self.participant_summary_dao.get(1)
    self.assertIsNone(ps)

    expected_ph = self._participant_history_with_defaults(
        participantId=1, biobankId=2, lastModified=time, signUpTime=time)
    # Updating the participant adds a new ParticipantHistory row.
    ph = self.participant_history_dao.get([1, 1])
    self.assertEquals(expected_ph.asdict(), ph.asdict())
    ph2 = self.participant_history_dao.get([1, 2])
    expected_ph2 = self._participant_history_with_defaults(
        participantId=1, version=2, biobankId=2, lastModified=time2, signUpTime=time,
        hpoId=PITT_HPO_ID, providerLink=p2.providerLink)
    self.assertEquals(expected_ph2.asdict(), ph2.asdict())

  def test_update_no_expected_version_with_ps(self):
    p = Participant()
    time = datetime.datetime(2016, 1, 1)
    with random_ids([1, 2]):
      with FakeClock(time):
        self.dao.insert(p)
    p.providerLink = make_primary_provider_link_for_name('PITT')
    time2 = datetime.datetime(2016, 1, 2)
    with FakeClock(time2):
      self.dao.update(p)
    summary = self.participant_summary(p)
    self.participant_summary_dao.insert(summary)

    # lastModified, hpoId, and version are updated on p after being passed in.
    p2 = self.dao.get(1)
    expected_participant = self._participant_with_defaults(
        participantId=1, version=2, biobankId=2, lastModified=time2, signUpTime=time,
        hpoId=PITT_HPO_ID, providerLink=p2.providerLink)
    self.assertEquals(expected_participant.asdict(), p2.asdict())
    self.assertEquals(p.asdict(), p2.asdict())

    # Updating the participant provider link also updates the HPO ID on the
    # participant summary.
    ps = self.participant_summary_dao.get(1)
    expected_ps = self._participant_summary_with_defaults(
        participantId=1, biobankId=2, signUpTime=time, hpoId=PITT_HPO_ID, lastModified=time2,
        firstName=summary.firstName, lastName=summary.lastName, email=summary.email)
    self.assertEquals(expected_ps.asdict(), ps.asdict())

    p2_last_modified = p2.lastModified
    p2.hpoId = 2
    self.dao.update(p2)
    p2_update = self.dao.get(1)
    self.assertNotEquals(p2_last_modified, p2_update.lastModified)
    self.assertEquals(p2_update.lastModified, p2.lastModified)

    expected_ph = self._participant_history_with_defaults(
        participantId=1, biobankId=2, lastModified=time, signUpTime=time)
    # And updating the participant adds a new ParticipantHistory row.
    ph = self.participant_history_dao.get([1, 1])
    self.assertEquals(expected_ph.asdict(), ph.asdict())
    ph2 = self.participant_history_dao.get([1, 2])
    expected_ph2 = self._participant_history_with_defaults(
        participantId=1, version=2, biobankId=2, lastModified=time2, signUpTime=time,
        hpoId=PITT_HPO_ID, providerLink=p2.providerLink)
    self.assertEquals(expected_ph2.asdict(), ph2.asdict())

  def test_update_right_expected_version(self):
    p = Participant()
    time = datetime.datetime(2016, 1, 1)
    with random_ids([1, 2]):
      with FakeClock(time):
        self.dao.insert(p)
    p.version = 1
    p.providerLink = make_primary_provider_link_for_name('PITT')
    time2 = datetime.datetime(2016, 1, 2)
    with FakeClock(time2):
      self.dao.update(p)

    p2 = self.dao.get(1)
    expected_participant = self._participant_with_defaults(
        participantId=1, version=2, biobankId=2, lastModified=time2, signUpTime=time,
        hpoId=PITT_HPO_ID, providerLink=p2.providerLink)
    self.assertEquals(expected_participant.asdict(), p2.asdict())

  def test_update_withdraw(self):
    p = Participant()
    time = datetime.datetime(2016, 1, 1)
    with random_ids([1, 2]):
      with FakeClock(time):
        self.dao.insert(p)
    p.version = 1
    p.withdrawalStatus = WithdrawalStatus.NO_USE
    time2 = datetime.datetime(2016, 1, 2)
    with FakeClock(time2):
      self.dao.update(p)

    p2 = self.dao.get(1)
    expected_participant = self._participant_with_defaults(
        participantId=1, version=2, biobankId=2, lastModified=time2, signUpTime=time,
        withdrawalStatus=WithdrawalStatus.NO_USE, withdrawalTime=time2)
    self.assertEquals(expected_participant.asdict(), p2.asdict())

    p.version = 2
    p.providerLink = make_primary_provider_link_for_name('PITT')
    p.withdrawalTime = None
    time3 = datetime.datetime(2016, 1, 3)
    with FakeClock(time3):
      self.dao.update(p)

    # Withdrawal time should get copied over.
    p2 = self.dao.get(1)
    expected_participant = self._participant_with_defaults(
        participantId=1, version=3, biobankId=2, lastModified=time3, signUpTime=time,
        withdrawalStatus=WithdrawalStatus.NO_USE, withdrawalTime=time2,
        hpoId=PITT_HPO_ID, providerLink=p2.providerLink)
    self.assertEquals(expected_participant.asdict(), p2.asdict())

  def test_update_suspend(self):
    p = Participant()
    time = datetime.datetime(2016, 1, 1)
    with random_ids([1, 2]):
      with FakeClock(time):
        self.dao.insert(p)
    p.version = 1
    p.suspensionStatus = SuspensionStatus.NO_CONTACT
    time2 = datetime.datetime(2016, 1, 2)
    with FakeClock(time2):
      self.dao.update(p)

    p2 = self.dao.get(1)
    expected_participant = self._participant_with_defaults(
        participantId=1, version=2, biobankId=2, lastModified=time2, signUpTime=time,
        suspensionStatus=SuspensionStatus.NO_CONTACT, suspensionTime=time2)
    self.assertEquals(expected_participant.asdict(), p2.asdict())

    p.version = 2
    p.providerLink = make_primary_provider_link_for_name('PITT')
    p.suspensionTime = None
    time3 = datetime.datetime(2016, 1, 3)
    with FakeClock(time3):
      self.dao.update(p)

    # Suspension time should get copied over.
    p2 = self.dao.get(1)
    expected_participant = self._participant_with_defaults(
        participantId=1, version=3, biobankId=2, lastModified=time3, signUpTime=time,
        suspensionStatus=SuspensionStatus.NO_CONTACT, suspensionTime=time2,
        hpoId=PITT_HPO_ID, providerLink=p2.providerLink)
    self.assertEquals(expected_participant.asdict(), p2.asdict())

  def test_update_wrong_expected_version(self):
    p = Participant()
    time = datetime.datetime(2016, 1, 1)
    with random_ids([1, 2]):
      with FakeClock(time):
        self.dao.insert(p)
    p.version = 2
    p.providerLink = make_primary_provider_link_for_name('PITT')
    time2 = datetime.datetime(2016, 1, 2)
    with FakeClock(time2):
      with self.assertRaises(PreconditionFailed):
        self.dao.update(p)

  def test_update_withdrawn_hpo_succeeds(self):
    p = Participant(withdrawalStatus=WithdrawalStatus.NO_USE)
    time = datetime.datetime(2016, 1, 1)
    with random_ids([1, 2]):
      with FakeClock(time):
        self.dao.insert(p)
    expected_participant = self._participant_with_defaults(
        participantId=1, version=1, biobankId=2, lastModified=time, signUpTime=time,
        withdrawalStatus=WithdrawalStatus.NO_USE)
    self.assertEquals(expected_participant.asdict(), p.asdict())

    p2 = self.dao.get(1)
    self.assertEquals(p.asdict(), p2.asdict())

    p.version = 1
    p.providerLink = make_primary_provider_link_for_name('PITT')
    self.dao.update(p)

  def test_update_withdrawn_status_fails(self):
    p = Participant(withdrawalStatus=WithdrawalStatus.NO_USE)
    time = datetime.datetime(2016, 1, 1)
    with random_ids([1, 2]):
      with FakeClock(time):
        self.dao.insert(p)
    expected_participant = self._participant_with_defaults(
        participantId=1, version=1, biobankId=2, lastModified=time, signUpTime=time,
        withdrawalStatus=WithdrawalStatus.NO_USE)
    self.assertEquals(expected_participant.asdict(), p.asdict())

    p2 = self.dao.get(1)
    self.assertEquals(p.asdict(), p2.asdict())

    p.version = 1
    p.withdrawalStatus = WithdrawalStatus.NOT_WITHDRAWN
    with self.assertRaises(Forbidden):
      self.dao.update(p)

  def test_update_not_exists(self):
    p = self._participant_with_defaults(participantId=1, biobankId=2)
    with self.assertRaises(NotFound):
      self.dao.update(p)

  def test_bad_hpo_insert(self):
    p = Participant(
        participantId=1, version=1, biobankId=2,
        providerLink=make_primary_provider_link_for_name('FOO'))
    with self.assertRaises(BadRequest):
      self.dao.insert(p)

  def test_bad_hpo_update(self):
    p = Participant(participantId=1, biobankId=2)
    time = datetime.datetime(2016, 1, 1)
    with FakeClock(time):
      self.dao.insert(p)
    p.providerLink = make_primary_provider_link_for_name('FOO')
    with self.assertRaises(BadRequest):
      self.dao.update(p)

  def test_pairs_unset(self):
    participant_id = 22
    self.dao.insert(Participant(participantId=participant_id, biobankId=2))
    refetched = self.dao.get(participant_id)
    self.assertEquals(refetched.hpoId, UNSET_HPO_ID)  # sanity check
    self.participant_summary_dao.insert(self.participant_summary(refetched))

    with self.dao.session() as session:
      self.dao.add_missing_hpo_from_site(session, participant_id, self._test_db.site_id)

    paired = self.dao.get(participant_id)
    self.assertEquals(paired.hpoId, self._test_db.hpo_id)
    self.assertEquals(
        paired.providerLink, make_primary_provider_link_for_id(self._test_db.hpo_id))
    self.assertEquals(
        self.participant_summary_dao.get(participant_id).hpoId, self._test_db.hpo_id)
    self.assertEquals(paired.organizationId, self._test_db.organization_id)
    self.assertEquals(paired.siteId, self._test_db.site_id)

  def test_overwrite_existing_pairing(self):
    participant_id = 99
    created = self.dao.insert(Participant(
        participantId=participant_id, biobankId=2, hpoId=self._test_db.hpo_id,
        providerLink=make_primary_provider_link_for_id(self._test_db.hpo_id)))
    self.participant_summary_dao.insert(self.participant_summary(created))
    self.assertEquals(created.hpoId, self._test_db.hpo_id)  # sanity check

    other_hpo = HPODao().insert(HPO(hpoId=PITT_HPO_ID + 1, name='DIFFERENT_HPO'))
    other_site = SiteDao().insert(Site(
        hpoId=other_hpo.hpoId, siteName='Arbitrary Site',
        googleGroup='*****@*****.**'))

    with self.dao.session() as session:
      self.dao.add_missing_hpo_from_site(session, participant_id, other_site.siteId)

    # Original Participant + summary is affected.
    refetched = self.dao.get(participant_id)
    self.assertEquals(refetched.hpoId, other_hpo.hpoId)
    self.assertEquals(refetched.providerLink,
                      make_primary_provider_link_for_id(other_hpo.hpoId))
    self.assertEquals(
        self.participant_summary_dao.get(participant_id).hpoId, other_hpo.hpoId)

  def test_pairing_at_different_levels(self):
    p = Participant()
    time = datetime.datetime(2016, 1, 1)
    with random_ids([1, 2]):
      with FakeClock(time):
        self.dao.insert(p)
    p.version = 1
    p.siteId = 1
    time2 = datetime.datetime(2016, 1, 2)
    with FakeClock(time2):
      self.dao.update(p)

    p2 = self.dao.get(1)
    ep = self._participant_with_defaults(
        participantId=1, version=2, biobankId=2, lastModified=time2, signUpTime=time,
        hpoId=PITT_HPO_ID, siteId=1, organizationId=PITT_ORG_ID,
        providerLink=p2.providerLink)
    self.assertEquals(ep.siteId, p2.siteId)
    # Ensure that p2 gets paired with the expected awardee and organization from update().
    self.assertEquals(ep.hpoId, p2.hpoId)
    self.assertEquals(ep.organizationId, p2.organizationId)
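
# The tests above pin otherwise-random participant and biobank IDs to known
# values via the `random_ids` helper from the project's test utilities. The
# sketch below only illustrates the idea and is not the project's
# implementation; the helper name and the patched target
# ('dao.base_dao._new_random_id') are assumptions made for this example.
import contextlib

import mock


@contextlib.contextmanager
def _random_ids_sketch(id_sequence):
  """Illustrative only: yields with the DAO's random-ID source patched to a fixed sequence."""
  # With an iterable side_effect, each call to the patched function returns the
  # next value from the list, so inserts consume IDs in the given order.
  with mock.patch('dao.base_dao._new_random_id', side_effect=list(id_sequence)):
    yield
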
class BiobankSamplesPipelineTest(CloudStorageSqlTestBase, NdbTestBase):
  def setUp(self):
    super(BiobankSamplesPipelineTest, self).setUp(use_mysql=True)
    NdbTestBase.doSetUp(self)
    TestBase.setup_fake(self)
    config.override_setting(config.BASELINE_SAMPLE_TEST_CODES, _BASELINE_TESTS)
    # Everything is stored as a list, so override bucket name as a 1-element list.
    config.override_setting(config.BIOBANK_SAMPLES_BUCKET_NAME, [_FAKE_BUCKET])
    self.participant_dao = ParticipantDao()
    self.summary_dao = ParticipantSummaryDao()

  def _write_cloud_csv(self, file_name, contents_str):
    with cloudstorage_api.open('/%s/%s' % (_FAKE_BUCKET, file_name), mode='w') as cloud_file:
      cloud_file.write(contents_str.encode('utf-8'))

  def _make_biobank_order(self, **kwargs):
    """Makes a new BiobankOrder (same values every time) with valid/complete defaults.

    Kwargs pass through to BiobankOrder constructor, overriding defaults.
    """
    participantId = kwargs['participantId']
    modified = datetime.datetime(2019, 3, 25, 15, 59, 30)
    for k, default_value in (
        ('biobankOrderId', u'1'),
        ('created', clock.CLOCK.now()),
        # ('participantId', self.participant.participantId),
        ('sourceSiteId', 1),
        ('sourceUsername', u'*****@*****.**'),
        ('collectedSiteId', 1),
        ('collectedUsername', u'*****@*****.**'),
        ('processedSiteId', 1),
        ('processedUsername', u'*****@*****.**'),
        ('finalizedSiteId', 2),
        ('finalizedUsername', u'*****@*****.**'),
        ('version', 1),
        ('identifiers', [BiobankOrderIdentifier(system=u'a', value=u'c')]),
        ('samples', [BiobankOrderedSample(
            test=u'1SAL2', description=u'description', processingRequired=True)]),
        ('dvOrders', [BiobankDVOrder(
            participantId=participantId, modified=modified, version=1)])):
      if k not in kwargs:
        kwargs[k] = default_value
    return BiobankOrder(**kwargs)

  def test_dv_order_sample_update(self):
    """Test Biobank Direct Volunteer order."""
    participant = self.participant_dao.insert(Participant())
    self.summary_dao.insert(self.participant_summary(participant))

    created_ts = datetime.datetime(2019, 3, 22, 18, 30, 45)
    confirmed_ts = datetime.datetime(2019, 3, 23, 12, 13, 0)

    bo = self._make_biobank_order(participantId=participant.participantId)
    BiobankOrderDao().insert(bo)
    boi = bo.identifiers[0]

    bss = BiobankStoredSample(
        biobankStoredSampleId=u'23523523', biobankId=participant.biobankId, test=u'1SAL2',
        created=created_ts, biobankOrderIdentifier=boi.value, confirmed=confirmed_ts)

    with self.participant_dao.session() as session:
      session.add(bss)

    ps = self.summary_dao.get(participant.participantId)
    self.assertIsNone(ps.sampleStatusDV1SAL2)
    self.assertIsNone(ps.sampleStatusDV1SAL2Time)

    self.summary_dao.update_from_biobank_stored_samples()

    ps = self.summary_dao.get(participant.participantId)
    self.assertEqual(ps.sampleStatus1SAL2, SampleStatus.RECEIVED)
    self.assertEqual(ps.sampleStatus1SAL2Time, confirmed_ts)

  def test_end_to_end(self):
    dao = BiobankStoredSampleDao()
    self.assertEquals(dao.count(), 0)

    # Create nids participants and pass their (random) IDs into sample rows.
    summary_dao = ParticipantSummaryDao()
    biobank_ids = []
    participant_ids = []
    nids = 16  # equal to the number of parent rows in 'biobank_samples_1.csv'
    cids = 1  # equal to the number of child rows in 'biobank_samples_1.csv'
    for _ in xrange(nids):
      participant = self.participant_dao.insert(Participant())
      summary_dao.insert(self.participant_summary(participant))
      participant_ids.append(participant.participantId)
      biobank_ids.append(participant.biobankId)
      self.assertEquals(
          summary_dao.get(participant.participantId).numBaselineSamplesArrived, 0)
    test_codes = random.sample(_BASELINE_TESTS, nids)
    samples_file = test_data.open_biobank_samples(biobank_ids=biobank_ids, tests=test_codes)
    lines = samples_file.split('\n')[1:]  # remove field name line

    input_filename = 'cloud%s.csv' % self._naive_utc_to_naive_central(
        clock.CLOCK.now()).strftime(biobank_samples_pipeline.INPUT_CSV_TIME_FORMAT)
    self._write_cloud_csv(input_filename, samples_file)
    biobank_samples_pipeline.upsert_from_latest_csv()

    self.assertEquals(dao.count(), nids - cids)

    for x in range(0, nids):
      # Column indexes below follow the sample CSV fixture used by this test:
      # 2 = status, 8 = disposal reason, 9 = disposal date, 10 = parent sample id,
      # 11 = confirmed date.
      cols = lines[x].split('\t')

      if cols[10].strip():  # skip child sample
        continue

      # If status is 'In Prep', then sample confirmed timestamp should be empty.
      if cols[2] == 'In Prep':
        self.assertEquals(len(cols[11]), 0)
      else:
        status = SampleStatus.RECEIVED
        ts_str = cols[11]
        # DA-814 - Participant Summary test status should be: Unset, Received or Disposed only.
        # If sample is disposed, then check disposed timestamp, otherwise check confirmed
        # timestamp.
        # DA-871 - Only check status is disposed when reason code is a bad disposal.
        if cols[2] == 'Disposed' and get_sample_status_enum_value(cols[8]) > SampleStatus.UNKNOWN:
          status = SampleStatus.DISPOSED
          ts_str = cols[9]
        ts = datetime.datetime.strptime(ts_str, '%Y/%m/%d %H:%M:%S')
        self._check_summary(participant_ids[x], test_codes[x], ts, status)

  def test_old_csv_not_imported(self):
    now = clock.CLOCK.now()
    too_old_time = now - datetime.timedelta(hours=25)
    input_filename = 'cloud%s.csv' % self._naive_utc_to_naive_central(too_old_time).strftime(
        biobank_samples_pipeline.INPUT_CSV_TIME_FORMAT)
    self._write_cloud_csv(input_filename, '')
    with self.assertRaises(biobank_samples_pipeline.DataError):
      biobank_samples_pipeline.upsert_from_latest_csv()

  def _naive_utc_to_naive_central(self, naive_utc_date):
    utc_date = pytz.utc.localize(naive_utc_date)
    central_date = utc_date.astimezone(pytz.timezone('US/Central'))
    return central_date.replace(tzinfo=None)

  def _check_summary(self, participant_id, test, date_formatted, status):
    summary = ParticipantSummaryDao().get(participant_id)
    self.assertEquals(summary.numBaselineSamplesArrived, 1)
    # DA-614 - All specific disposal statuses in biobank_stored_samples are changed to DISPOSED
    # in the participant summary.
    self.assertEquals(status, getattr(summary, 'sampleStatus' + test))
    sample_time = self._naive_utc_to_naive_central(
        getattr(summary, 'sampleStatus' + test + 'Time'))
    self.assertEquals(date_formatted, sample_time)

  def test_find_latest_csv(self):
    # The cloud storage testbed does not expose an injectable time function.
    # Creation time is stored at second granularity.
    self._write_cloud_csv('a_lex_first_created_first.csv', 'any contents')
    time.sleep(1.0)
    self._write_cloud_csv('z_lex_last_created_middle.csv', 'any contents')
    time.sleep(1.0)
    created_last = 'b_lex_middle_created_last.csv'
    self._write_cloud_csv(created_last, 'any contents')
    self._write_cloud_csv(
        '%s/created_last_in_subdir.csv' % biobank_samples_pipeline._REPORT_SUBDIR,
        'any contents')

    latest_filename = biobank_samples_pipeline._find_latest_samples_csv(_FAKE_BUCKET)
    self.assertEquals(latest_filename, '/%s/%s' % (_FAKE_BUCKET, created_last))

  def test_sample_from_row(self):
    samples_file = test_data.open_biobank_samples([112, 222, 333], [])
    reader = csv.DictReader(StringIO.StringIO(samples_file), delimiter='\t')
    row = reader.next()

    sample = biobank_samples_pipeline._create_sample_from_row(row, get_biobank_id_prefix())

    self.assertIsNotNone(sample)
    cols = biobank_samples_pipeline.CsvColumns
    self.assertEquals(sample.biobankStoredSampleId, row[cols.SAMPLE_ID])
    self.assertEquals(to_client_biobank_id(sample.biobankId), row[cols.EXTERNAL_PARTICIPANT_ID])
    self.assertEquals(sample.test, row[cols.TEST_CODE])
    confirmed_date = self._naive_utc_to_naive_central(sample.confirmed)
    self.assertEquals(
        confirmed_date.strftime(biobank_samples_pipeline._INPUT_TIMESTAMP_FORMAT),
        row[cols.CONFIRMED_DATE])
    received_date = self._naive_utc_to_naive_central(sample.created)
    self.assertEquals(
        received_date.strftime(biobank_samples_pipeline._INPUT_TIMESTAMP_FORMAT),
        row[cols.CREATE_DATE])

  def test_sample_from_row_wrong_prefix(self):
    samples_file = test_data.open_biobank_samples([111, 222, 333], [])
    reader = csv.DictReader(StringIO.StringIO(samples_file), delimiter='\t')
    row = reader.next()
    row[biobank_samples_pipeline.CsvColumns.CONFIRMED_DATE] = '2016 11 19'
    self.assertIsNone(biobank_samples_pipeline._create_sample_from_row(row, 'Q'))

  def test_sample_from_row_invalid(self):
    samples_file = test_data.open_biobank_samples([111, 222, 333], [])
    reader = csv.DictReader(StringIO.StringIO(samples_file), delimiter='\t')
    row = reader.next()
    row[biobank_samples_pipeline.CsvColumns.CONFIRMED_DATE] = '2016 11 19'
    with self.assertRaises(biobank_samples_pipeline.DataError):
      biobank_samples_pipeline._create_sample_from_row(row, get_biobank_id_prefix())

  def test_sample_from_row_old_test(self):
    samples_file = test_data.open_biobank_samples([111, 222, 333], [])
    reader = csv.DictReader(StringIO.StringIO(samples_file), delimiter='\t')
    row = reader.next()
    row[biobank_samples_pipeline.CsvColumns.TEST_CODE] = '2PST8'
    sample = biobank_samples_pipeline._create_sample_from_row(row, get_biobank_id_prefix())
    self.assertIsNotNone(sample)
    cols = biobank_samples_pipeline.CsvColumns
    self.assertEquals(sample.biobankStoredSampleId, row[cols.SAMPLE_ID])
    self.assertEquals(sample.test, row[cols.TEST_CODE])

  def test_column_missing(self):
    with open(test_data.data_path('biobank_samples_missing_field.csv')) as samples_file:
      reader = csv.DictReader(samples_file, delimiter='\t')
      with self.assertRaises(biobank_samples_pipeline.DataError):
        biobank_samples_pipeline._upsert_samples_from_csv(reader)

  def test_get_reconciliation_report_paths(self):
    dt = datetime.datetime(2016, 12, 22, 18, 30, 45)
    expected_prefix = 'reconciliation/report_2016-12-22'
    paths = biobank_samples_pipeline._get_report_paths(dt)
    self.assertEquals(len(paths), 4)
    for path in paths:
      self.assertTrue(
          path.startswith(expected_prefix),
          'Report path %r must start with %r.' % (path, expected_prefix))
      self.assertTrue(path.endswith('.csv'))