def generate_samples(fraction_missing):
    """Write a fake, tab-delimited biobank samples CSV to the samples bucket.

    After the header row, two groups of rows are written:
      * one row per ordered sample returned by
        BiobankOrderDao.get_ordered_samples_sample (entries without a
        collected time are logged and skipped);
      * one row per randomly chosen test for every participant returned by
        ParticipantDao.get_biobank_ids_sample.

    Sample IDs are consecutive, starting from a random base.

    Args:
      fraction_missing: Fraction of ordered samples to leave out of the fake
          CSV; ``1 - fraction_missing`` is handed to the order DAO sampler.
    """
    bucket = config.getSetting(config.BIOBANK_SAMPLES_BUCKET_NAME)
    started = clock.CLOCK.now()
    file_name = '/%s/fake_%s.csv' % (
        bucket, started.strftime(INPUT_CSV_TIME_FORMAT))
    num_rows = 0
    first_sample_id = random.randint(1000000, 10000000)

    def build_row(sample_id, biobank_id, test, confirmed):
        # The confirmed time doubles as the created time; there is no parent.
        stamp = confirmed.strftime(_TIME_FORMAT)
        return [
            sample_id,
            None,
            stamp,
            to_client_biobank_id(biobank_id),
            test,
            stamp,
            'KIT',
        ]

    with cloudstorage_api.open(file_name, mode='w') as dest:
        writer = csv.writer(dest, delimiter="\t")
        writer.writerow(_HEADERS)

        # Rows for samples that were actually ordered.
        order_dao = BiobankOrderDao()
        with order_dao.session() as session:
            ordered = order_dao.get_ordered_samples_sample(
                session, 1 - fraction_missing, _BATCH_SIZE)
            for biobank_id, collected_time, test in ordered:
                if collected_time is None:
                    logging.warning(
                        'biobank_id=%s test=%s skipped (collected=%s)',
                        biobank_id, test, collected_time)
                    continue
                delay = datetime.timedelta(minutes=random.randint(
                    0, _MAX_MINUTES_BETWEEN_SAMPLE_COLLECTED_AND_CONFIRMED))
                writer.writerow(build_row(
                    first_sample_id + num_rows, biobank_id, test,
                    collected_time + delay))
                num_rows += 1

        # Rows for participants sampled directly from the participant table.
        participant_dao = ParticipantDao()
        with participant_dao.session() as session:
            sampled = participant_dao.get_biobank_ids_sample(
                session, _PARTICIPANTS_WITH_ORPHAN_SAMPLES, _BATCH_SIZE)
            for biobank_id, sign_up_time in sampled:
                delay = datetime.timedelta(minutes=random.randint(
                    0, _MAX_MINUTES_BETWEEN_PARTICIPANT_CREATED_AND_CONFIRMED))
                confirmed_time = sign_up_time + delay
                test_count = random.randint(1, len(BIOBANK_TESTS))
                for test in random.sample(BIOBANK_TESTS, test_count):
                    writer.writerow(build_row(
                        first_sample_id + num_rows, biobank_id, test,
                        confirmed_time))
                    num_rows += 1
    logging.info("Generated %d samples in %s.", num_rows, file_name)
# Example #2
# 0
class ParticipantDaoTest(SqlTestBase):
    def setUp(self):
        """Run the base-class setup, then create the DAOs used by these tests."""
        super(ParticipantDaoTest, self).setUp()
        self.dao = ParticipantDao()
        self.participant_summary_dao = ParticipantSummaryDao()
        self.participant_history_dao = ParticipantHistoryDao()

    def test_get_before_insert(self):
        """Every DAO lookup returns None when nothing has been inserted."""
        participant = self.dao.get(1)
        self.assertIsNone(participant)
        summary = self.participant_summary_dao.get(1)
        self.assertIsNone(summary)
        # The history DAO is looked up with a two-element key.
        history = self.participant_history_dao.get([1, 1])
        self.assertIsNone(history)

    def test_insert(self):
        """Insert fills in IDs, version and timestamps and writes a history row."""
        p = Participant()
        time = datetime.datetime(2016, 1, 1)
        # The expectations below show the first queued random ID becoming the
        # participant ID and the second the biobank ID.
        with random_ids([1, 2]), FakeClock(time):
            self.dao.insert(p)
        expected_participant = self._participant_with_defaults(
            participantId=1,
            version=1,
            biobankId=2,
            lastModified=time,
            signUpTime=time)
        self.assertEqual(expected_participant.asdict(), p.asdict())

        # The stored row matches the object that was passed to insert().
        p2 = self.dao.get(1)
        self.assertEqual(p.asdict(), p2.asdict())

        # Creating a participant also creates a ParticipantHistory row, but not a ParticipantSummary row
        ps = self.participant_summary_dao.get(1)
        self.assertIsNone(ps)
        ph = self.participant_history_dao.get([1, 1])
        expected_ph = self._participant_history_with_defaults(
            participantId=1, biobankId=2, lastModified=time, signUpTime=time)
        self.assertEqual(expected_ph.asdict(), ph.asdict())

    def test_insert_with_external_id(self):
        """Insert keeps a caller-supplied externalId on the row and in history."""
        p = Participant(externalId=3)
        time = datetime.datetime(2016, 1, 1)
        with random_ids([1, 2]), FakeClock(time):
            self.dao.insert(p)
        expected_participant = self._participant_with_defaults(
            participantId=1,
            externalId=3,
            version=1,
            biobankId=2,
            lastModified=time,
            signUpTime=time)
        self.assertEqual(expected_participant.asdict(), p.asdict())

        # The stored row matches the object that was passed to insert().
        p2 = self.dao.get(1)
        self.assertEqual(p.asdict(), p2.asdict())

        # Creating a participant also creates a ParticipantHistory row, but not a ParticipantSummary row
        ps = self.participant_summary_dao.get(1)
        self.assertIsNone(ps)
        ph = self.participant_history_dao.get([1, 1])
        expected_ph = self._participant_history_with_defaults(
            participantId=1,
            externalId=3,
            biobankId=2,
            lastModified=time,
            signUpTime=time)
        self.assertEqual(expected_ph.asdict(), ph.asdict())

    def test_insert_duplicate_participant_id_retry(self):
        """Insert succeeds with later queued IDs after a participant ID collision."""
        p = Participant()
        with random_ids([1, 2]):
            self.dao.insert(p)
        p2 = Participant()
        time = datetime.datetime(2016, 1, 1)
        # Participant ID 1 is already taken by p; the expectation below shows
        # the insert ending up with the later pair (2, 3) from the queue.
        with random_ids([1, 3, 2, 3]), FakeClock(time):
            p2 = self.dao.insert(p2)
        expected_participant = self._participant_with_defaults(
            participantId=2,
            version=1,
            biobankId=3,
            lastModified=time,
            signUpTime=time)
        self.assertEqual(expected_participant.asdict(), p2.asdict())

    def test_insert_duplicate_participant_id_give_up(self):
        """Insert raises ServiceUnavailable when every queued participant ID is 1."""
        existing = Participant()
        with random_ids([1, 2]):
            self.dao.insert(existing)
        # Two values per attempt: the already-used ID 1, then the attempt index.
        colliding_ids = [
            value
            for attempt in range(0, MAX_INSERT_ATTEMPTS)
            for value in (1, attempt)
        ]
        duplicate = Participant()
        with random_ids(colliding_ids), self.assertRaises(ServiceUnavailable):
            self.dao.insert(duplicate)

    def test_insert_duplicate_biobank_id_give_up(self):
        """Insert raises ServiceUnavailable when every queued biobank ID is 2."""
        existing = Participant()
        with random_ids([1, 2]):
            self.dao.insert(existing)
        # Two values per attempt: a fresh value (index + 2), then the
        # already-used ID 2.
        colliding_ids = [
            value
            for attempt in range(0, MAX_INSERT_ATTEMPTS)
            for value in (attempt + 2, 2)
        ]
        duplicate = Participant()
        with random_ids(colliding_ids), self.assertRaises(ServiceUnavailable):
            self.dao.insert(duplicate)

    def test_update_no_expected_version_no_ps(self):
        """Update without an explicit version bump, with no summary row present.

        Checks the updated participant, that no ParticipantSummary appears,
        and that both history versions (1 and 2) exist afterwards.
        """
        p = Participant()
        time = datetime.datetime(2016, 1, 1)
        with random_ids([1, 2]), FakeClock(time):
            self.dao.insert(p)

        p.providerLink = make_primary_provider_link_for_name('PITT')
        time2 = datetime.datetime(2016, 1, 2)
        with FakeClock(time2):
            self.dao.update(p)
        # lastModified, hpoId, version is updated on p after being passed in
        p2 = self.dao.get(1)
        expected_participant = self._participant_with_defaults(
            participantId=1,
            version=2,
            biobankId=2,
            lastModified=time2,
            signUpTime=time,
            hpoId=PITT_HPO_ID,
            providerLink=p2.providerLink)
        self.assertEqual(expected_participant.asdict(), p2.asdict())
        self.assertEqual(p.asdict(), p2.asdict())

        ps = self.participant_summary_dao.get(1)
        self.assertIsNone(ps)

        expected_ph = self._participant_history_with_defaults(
            participantId=1, biobankId=2, lastModified=time, signUpTime=time)
        # Updating the participant adds a new ParticipantHistory row.
        ph = self.participant_history_dao.get([1, 1])
        self.assertEqual(expected_ph.asdict(), ph.asdict())
        ph2 = self.participant_history_dao.get([1, 2])
        expected_ph2 = self._participant_history_with_defaults(
            participantId=1,
            version=2,
            biobankId=2,
            lastModified=time2,
            signUpTime=time,
            hpoId=PITT_HPO_ID,
            providerLink=p2.providerLink)
        self.assertEqual(expected_ph2.asdict(), ph2.asdict())

    def test_update_no_expected_version_with_ps(self):
        """Update without an explicit version bump, with a summary row present.

        Checks the updated participant, the inserted summary, that a second
        update changes lastModified, and both history versions (1 and 2).
        """
        p = Participant()
        time = datetime.datetime(2016, 1, 1)
        with random_ids([1, 2]), FakeClock(time):
            self.dao.insert(p)
        p.providerLink = make_primary_provider_link_for_name('PITT')
        time2 = datetime.datetime(2016, 1, 2)
        with FakeClock(time2):
            self.dao.update(p)

        summary = self.participant_summary(p)
        self.participant_summary_dao.insert(summary)

        # lastModified, hpoId, version is updated on p after being passed in
        p2 = self.dao.get(1)
        expected_participant = self._participant_with_defaults(
            participantId=1,
            version=2,
            biobankId=2,
            lastModified=time2,
            signUpTime=time,
            hpoId=PITT_HPO_ID,
            providerLink=p2.providerLink)
        self.assertEqual(expected_participant.asdict(), p2.asdict())
        self.assertEqual(p.asdict(), p2.asdict())

        # Updating the participant provider link also updates the HPO ID on the participant summary.
        ps = self.participant_summary_dao.get(1)
        expected_ps = self._participant_summary_with_defaults(
            participantId=1,
            biobankId=2,
            signUpTime=time,
            hpoId=PITT_HPO_ID,
            lastModified=time2,
            firstName=summary.firstName,
            lastName=summary.lastName,
            email=summary.email)
        self.assertEqual(expected_ps.asdict(), ps.asdict())

        # A further update (outside FakeClock) must move lastModified forward
        # both in the database and on the object that was passed in.
        p2_last_modified = p2.lastModified
        p2.hpoId = 2
        self.dao.update(p2)
        p2_update = self.dao.get(1)
        self.assertNotEqual(p2_last_modified, p2_update.lastModified)
        self.assertEqual(p2_update.lastModified, p2.lastModified)

        expected_ph = self._participant_history_with_defaults(
            participantId=1, biobankId=2, lastModified=time, signUpTime=time)
        # And updating the participant adds a new ParticipantHistory row.
        ph = self.participant_history_dao.get([1, 1])
        self.assertEqual(expected_ph.asdict(), ph.asdict())
        ph2 = self.participant_history_dao.get([1, 2])
        expected_ph2 = self._participant_history_with_defaults(
            participantId=1,
            version=2,
            biobankId=2,
            lastModified=time2,
            signUpTime=time,
            hpoId=PITT_HPO_ID,
            providerLink=p2.providerLink)
        self.assertEqual(expected_ph2.asdict(), ph2.asdict())

    def test_update_right_expected_version(self):
        """Update with version=1 on a freshly inserted row succeeds and stores version 2."""
        p = Participant()
        time = datetime.datetime(2016, 1, 1)
        with random_ids([1, 2]), FakeClock(time):
            self.dao.insert(p)
        p.version = 1
        p.providerLink = make_primary_provider_link_for_name('PITT')
        time2 = datetime.datetime(2016, 1, 2)
        with FakeClock(time2):
            self.dao.update(p)

        p2 = self.dao.get(1)
        expected_participant = self._participant_with_defaults(
            participantId=1,
            version=2,
            biobankId=2,
            lastModified=time2,
            signUpTime=time,
            hpoId=PITT_HPO_ID,
            providerLink=p2.providerLink)
        self.assertEqual(expected_participant.asdict(), p2.asdict())

    def test_update_withdraw(self):
        """Withdrawing sets withdrawalTime, and later updates keep that time."""
        p = Participant()
        time = datetime.datetime(2016, 1, 1)
        with random_ids([1, 2]), FakeClock(time):
            self.dao.insert(p)
        p.version = 1
        p.withdrawalStatus = WithdrawalStatus.NO_USE
        time2 = datetime.datetime(2016, 1, 2)
        with FakeClock(time2):
            self.dao.update(p)

        # withdrawalTime is expected to be the clock time of the update.
        p2 = self.dao.get(1)
        expected_participant = self._participant_with_defaults(
            participantId=1,
            version=2,
            biobankId=2,
            lastModified=time2,
            signUpTime=time,
            withdrawalStatus=WithdrawalStatus.NO_USE,
            withdrawalTime=time2)
        self.assertEqual(expected_participant.asdict(), p2.asdict())

        # Second update: clear withdrawalTime on the request object.
        p.version = 2
        p.providerLink = make_primary_provider_link_for_name('PITT')
        p.withdrawalTime = None
        time3 = datetime.datetime(2016, 1, 3)
        with FakeClock(time3):
            self.dao.update(p)

        # Withdrawal time should get copied over.
        p2 = self.dao.get(1)
        expected_participant = self._participant_with_defaults(
            participantId=1,
            version=3,
            biobankId=2,
            lastModified=time3,
            signUpTime=time,
            withdrawalStatus=WithdrawalStatus.NO_USE,
            withdrawalTime=time2,
            hpoId=PITT_HPO_ID,
            providerLink=p2.providerLink)
        self.assertEqual(expected_participant.asdict(), p2.asdict())

    def test_update_suspend(self):
        """Suspending sets suspensionTime, and later updates keep that time."""
        p = Participant()
        time = datetime.datetime(2016, 1, 1)
        with random_ids([1, 2]), FakeClock(time):
            self.dao.insert(p)
        p.version = 1
        p.suspensionStatus = SuspensionStatus.NO_CONTACT
        time2 = datetime.datetime(2016, 1, 2)
        with FakeClock(time2):
            self.dao.update(p)

        # suspensionTime is expected to be the clock time of the update.
        p2 = self.dao.get(1)
        expected_participant = self._participant_with_defaults(
            participantId=1,
            version=2,
            biobankId=2,
            lastModified=time2,
            signUpTime=time,
            suspensionStatus=SuspensionStatus.NO_CONTACT,
            suspensionTime=time2)
        self.assertEqual(expected_participant.asdict(), p2.asdict())

        # Second update: clear suspensionTime on the request object.
        p.version = 2
        p.providerLink = make_primary_provider_link_for_name('PITT')
        p.suspensionTime = None
        time3 = datetime.datetime(2016, 1, 3)
        with FakeClock(time3):
            self.dao.update(p)

        # Suspension time should get copied over.
        p2 = self.dao.get(1)
        expected_participant = self._participant_with_defaults(
            participantId=1,
            version=3,
            biobankId=2,
            lastModified=time3,
            signUpTime=time,
            suspensionStatus=SuspensionStatus.NO_CONTACT,
            suspensionTime=time2,
            hpoId=PITT_HPO_ID,
            providerLink=p2.providerLink)
        self.assertEqual(expected_participant.asdict(), p2.asdict())

    def test_update_wrong_expected_version(self):
        """Updating with version=2 right after insert raises PreconditionFailed."""
        participant = Participant()
        inserted_at = datetime.datetime(2016, 1, 1)
        with random_ids([1, 2]), FakeClock(inserted_at):
            self.dao.insert(participant)

        participant.version = 2
        participant.providerLink = make_primary_provider_link_for_name('PITT')
        updated_at = datetime.datetime(2016, 1, 2)
        with FakeClock(updated_at), self.assertRaises(PreconditionFailed):
            self.dao.update(participant)

    def test_update_withdrawn_hpo_succeeds(self):
        """Changing the provider link of a withdrawn participant does not raise."""
        p = Participant(withdrawalStatus=WithdrawalStatus.NO_USE)
        time = datetime.datetime(2016, 1, 1)
        with random_ids([1, 2]), FakeClock(time):
            self.dao.insert(p)

        expected_participant = self._participant_with_defaults(
            participantId=1,
            version=1,
            biobankId=2,
            lastModified=time,
            signUpTime=time,
            withdrawalStatus=WithdrawalStatus.NO_USE)
        self.assertEqual(expected_participant.asdict(), p.asdict())

        p2 = self.dao.get(1)
        self.assertEqual(p.asdict(), p2.asdict())

        # No assertion follows: the test passes as long as update() returns.
        p.version = 1
        p.providerLink = make_primary_provider_link_for_name('PITT')
        self.dao.update(p)

    def test_update_withdrawn_status_fails(self):
        """Setting a withdrawn participant back to NOT_WITHDRAWN raises Forbidden."""
        p = Participant(withdrawalStatus=WithdrawalStatus.NO_USE)
        time = datetime.datetime(2016, 1, 1)
        with random_ids([1, 2]), FakeClock(time):
            self.dao.insert(p)

        expected_participant = self._participant_with_defaults(
            participantId=1,
            version=1,
            biobankId=2,
            lastModified=time,
            signUpTime=time,
            withdrawalStatus=WithdrawalStatus.NO_USE)
        self.assertEqual(expected_participant.asdict(), p.asdict())

        p2 = self.dao.get(1)
        self.assertEqual(p.asdict(), p2.asdict())

        p.version = 1
        p.withdrawalStatus = WithdrawalStatus.NOT_WITHDRAWN
        with self.assertRaises(Forbidden):
            self.dao.update(p)

    def test_update_not_exists(self):
        """Updating a participant that was never inserted raises NotFound."""
        never_inserted = self._participant_with_defaults(
            participantId=1, biobankId=2)
        with self.assertRaises(NotFound):
            self.dao.update(never_inserted)

    def test_bad_hpo_insert(self):
        """Inserting with a provider link built for 'FOO' raises BadRequest."""
        foo_link = make_primary_provider_link_for_name('FOO')
        candidate = Participant(
            participantId=1, version=1, biobankId=2, providerLink=foo_link)
        with self.assertRaises(BadRequest):
            self.dao.insert(candidate)

    def test_bad_hpo_update(self):
        """Updating to a provider link built for 'FOO' raises BadRequest."""
        participant = Participant(participantId=1, biobankId=2)
        inserted_at = datetime.datetime(2016, 1, 1)
        with FakeClock(inserted_at):
            self.dao.insert(participant)

        foo_link = make_primary_provider_link_for_name('FOO')
        participant.providerLink = foo_link
        with self.assertRaises(BadRequest):
            self.dao.update(participant)

    def test_pairs_unset(self):
        """add_missing_hpo_from_site pairs an unpaired participant with the site's HPO.

        Afterwards the participant's hpoId, providerLink, organizationId and
        siteId, and the summary's hpoId, match the test database fixtures.
        """
        participant_id = 22
        self.dao.insert(Participant(participantId=participant_id, biobankId=2))
        refetched = self.dao.get(participant_id)
        self.assertEqual(refetched.hpoId, UNSET_HPO_ID)  # sanity check
        self.participant_summary_dao.insert(
            self.participant_summary(refetched))

        with self.dao.session() as session:
            self.dao.add_missing_hpo_from_site(session, participant_id,
                                               self._test_db.site_id)

        paired = self.dao.get(participant_id)
        self.assertEqual(paired.hpoId, self._test_db.hpo_id)
        self.assertEqual(
            paired.providerLink,
            make_primary_provider_link_for_id(self._test_db.hpo_id))
        self.assertEqual(
            self.participant_summary_dao.get(participant_id).hpoId,
            self._test_db.hpo_id)
        self.assertEqual(paired.organizationId, self._test_db.organization_id)
        self.assertEqual(paired.siteId, self._test_db.site_id)

    def test_overwrite_existing_pairing(self):
        """add_missing_hpo_from_site re-pairs an already paired participant.

        The participant starts paired with the fixture HPO; after the call
        with a site from a different HPO, the participant and its summary
        carry the other HPO.
        """
        participant_id = 99
        created = self.dao.insert(
            Participant(participantId=participant_id,
                        biobankId=2,
                        hpoId=self._test_db.hpo_id,
                        providerLink=make_primary_provider_link_for_id(
                            self._test_db.hpo_id)))
        self.participant_summary_dao.insert(self.participant_summary(created))
        self.assertEqual(created.hpoId, self._test_db.hpo_id)  # sanity check

        # A second HPO with its own site, to pair against.
        other_hpo = HPODao().insert(
            HPO(hpoId=PITT_HPO_ID + 1, name='DIFFERENT_HPO'))
        other_site = SiteDao().insert(
            Site(hpoId=other_hpo.hpoId,
                 siteName='Arbitrary Site',
                 googleGroup='*****@*****.**'))

        with self.dao.session() as session:
            self.dao.add_missing_hpo_from_site(session, participant_id,
                                               other_site.siteId)

        # Original Participant + summary is affected.
        refetched = self.dao.get(participant_id)

        self.assertEqual(refetched.hpoId, other_hpo.hpoId)
        self.assertEqual(refetched.providerLink,
                         make_primary_provider_link_for_id(other_hpo.hpoId))
        self.assertEqual(
            self.participant_summary_dao.get(participant_id).hpoId,
            other_hpo.hpoId)

    def test_pairing_at_different_levels(self):
        """Setting only siteId on update also pairs the HPO and organization."""
        p = Participant()
        time = datetime.datetime(2016, 1, 1)
        with random_ids([1, 2]), FakeClock(time):
            self.dao.insert(p)

        p.version = 1
        p.siteId = 1
        time2 = datetime.datetime(2016, 1, 2)
        with FakeClock(time2):
            self.dao.update(p)

        p2 = self.dao.get(1)
        ep = self._participant_with_defaults(participantId=1,
                                             version=2,
                                             biobankId=2,
                                             lastModified=time2,
                                             signUpTime=time,
                                             hpoId=PITT_HPO_ID,
                                             siteId=1,
                                             organizationId=PITT_ORG_ID,
                                             providerLink=p2.providerLink)
        self.assertEqual(ep.siteId, p2.siteId)
        # ensure that p2 get paired with expected awardee and organization from update().
        self.assertEqual(ep.hpoId, p2.hpoId)
        self.assertEqual(ep.organizationId, p2.organizationId)
# Example #3
# 0
class BiobankSamplesPipelineTest(CloudStorageSqlTestBase, NdbTestBase):
    def setUp(self):
        """Set up the test bases, override the config settings and create DAOs."""
        # The cooperative super() call is made with use_mysql=True; the other
        # two setups are invoked explicitly on their classes.
        super(BiobankSamplesPipelineTest, self).setUp(use_mysql=True)
        NdbTestBase.doSetUp(self)
        TestBase.setup_fake(self)
        config.override_setting(config.BASELINE_SAMPLE_TEST_CODES,
                                _BASELINE_TESTS)
        # Everything is stored as a list, so override bucket name as a 1-element list.
        config.override_setting(config.BIOBANK_SAMPLES_BUCKET_NAME,
                                [_FAKE_BUCKET])
        self.participant_dao = ParticipantDao()
        self.summary_dao = ParticipantSummaryDao()

    def _write_cloud_csv(self, file_name, contents_str):
        """Write contents_str, UTF-8 encoded, to file_name in the fake bucket."""
        cloud_path = '/%s/%s' % (_FAKE_BUCKET, file_name)
        with cloudstorage_api.open(cloud_path, mode='w') as out:
            encoded = contents_str.encode('utf-8')
            out.write(encoded)

    def _make_biobank_order(self, **kwargs):
        """Makes a new BiobankOrder with valid/complete defaults.

        Kwargs pass through to the BiobankOrder constructor, overriding the
        defaults listed below. ``participantId`` has no default and must be
        supplied (KeyError otherwise); it is also used for the default
        BiobankDVOrder.
        """
        participantId = kwargs['participantId']
        # Plain decimal literals: a leading zero (e.g. 03) is an octal literal
        # in Python 2 and a SyntaxError in Python 3.
        modified = datetime.datetime(2019, 3, 25, 15, 59, 30)

        defaults = (
            ('biobankOrderId', u'1'),
            ('created', clock.CLOCK.now()),
            ('sourceSiteId', 1),
            ('sourceUsername', u'*****@*****.**'),
            ('collectedSiteId', 1),
            ('collectedUsername', u'*****@*****.**'),
            ('processedSiteId', 1),
            ('processedUsername', u'*****@*****.**'),
            ('finalizedSiteId', 2),
            ('finalizedUsername', u'*****@*****.**'),
            ('version', 1),
            ('identifiers', [BiobankOrderIdentifier(system=u'a', value=u'c')]),
            ('samples', [
                BiobankOrderedSample(test=u'1SAL2',
                                     description=u'description',
                                     processingRequired=True)
            ]),
            ('dvOrders', [
                BiobankDVOrder(participantId=participantId,
                               modified=modified,
                               version=1)
            ]),
        )
        # Only fill in keys the caller did not pass explicitly.
        for k, default_value in defaults:
            kwargs.setdefault(k, default_value)
        return BiobankOrder(**kwargs)

    def test_dv_order_sample_update(self):
        """Test Biobank Direct Volunteer order.

        Stores a 1SAL2 sample for a participant with a DV order, then checks
        that update_from_biobank_stored_samples marks the summary RECEIVED
        at the sample's confirmed time.
        """
        participant = self.participant_dao.insert(Participant())
        self.summary_dao.insert(self.participant_summary(participant))

        # Plain decimal literals: leading zeros (03, 00) are octal in
        # Python 2 and a SyntaxError in Python 3.
        created_ts = datetime.datetime(2019, 3, 22, 18, 30, 45)
        confirmed_ts = datetime.datetime(2019, 3, 23, 12, 13, 0)

        bo = self._make_biobank_order(participantId=participant.participantId)
        BiobankOrderDao().insert(bo)

        boi = bo.identifiers[0]

        bss = BiobankStoredSample(biobankStoredSampleId=u'23523523',
                                  biobankId=participant.biobankId,
                                  test=u'1SAL2',
                                  created=created_ts,
                                  biobankOrderIdentifier=boi.value,
                                  confirmed=confirmed_ts)

        with self.participant_dao.session() as session:
            session.add(bss)

        # NOTE(review): the pre-check reads sampleStatusDV1SAL2* while the
        # post-check reads sampleStatus1SAL2* - confirm this is intended.
        ps = self.summary_dao.get(participant.participantId)
        self.assertIsNone(ps.sampleStatusDV1SAL2)
        self.assertIsNone(ps.sampleStatusDV1SAL2Time)

        self.summary_dao.update_from_biobank_stored_samples()
        ps = self.summary_dao.get(participant.participantId)
        self.assertEqual(ps.sampleStatus1SAL2, SampleStatus.RECEIVED)
        self.assertEqual(ps.sampleStatus1SAL2Time, confirmed_ts)

    def test_end_to_end(self):
        """Import a samples CSV from the fake bucket and verify the results.

        Checks the number of stored samples and, for every non-child row that
        is not 'In Prep', the matching participant summary status and time.
        """
        dao = BiobankStoredSampleDao()
        self.assertEqual(dao.count(), 0)

        # Create `nids` participants and pass their (random) IDs into sample rows.
        summary_dao = ParticipantSummaryDao()
        biobank_ids = []
        participant_ids = []
        nids = 16  # equal to the number of parent rows in 'biobank_samples_1.csv'
        cids = 1  # equal to the number of child rows in 'biobank_samples_1.csv'

        for _ in range(nids):
            participant = self.participant_dao.insert(Participant())
            summary_dao.insert(self.participant_summary(participant))
            participant_ids.append(participant.participantId)
            biobank_ids.append(participant.biobankId)
            self.assertEqual(
                summary_dao.get(
                    participant.participantId).numBaselineSamplesArrived, 0)

        test_codes = random.sample(_BASELINE_TESTS, nids)
        samples_file = test_data.open_biobank_samples(biobank_ids=biobank_ids,
                                                      tests=test_codes)
        lines = samples_file.split('\n')[1:]  # remove field name line

        # The input file name carries the current time, converted to
        # naive US/Central, in the pipeline's expected format.
        input_filename = 'cloud%s.csv' % self._naive_utc_to_naive_central(
            clock.CLOCK.now()).strftime(
                biobank_samples_pipeline.INPUT_CSV_TIME_FORMAT)
        self._write_cloud_csv(input_filename, samples_file)
        biobank_samples_pipeline.upsert_from_latest_csv()

        self.assertEqual(dao.count(), nids - cids)

        # Row x of the CSV corresponds to participant_ids[x] / test_codes[x].
        for x in range(0, nids):
            cols = lines[x].split('\t')

            if cols[10].strip():  # skip child sample
                continue

            # If status is 'In Prep', then sample confirmed timestamp should be empty
            if cols[2] == 'In Prep':
                self.assertEqual(len(cols[11]), 0)
            else:
                status = SampleStatus.RECEIVED
                ts_str = cols[11]
                # DA-814 - Participant Summary test status should be: Unset, Received or Disposed only.
                # If sample is disposed, then check disposed timestamp, otherwise check confirmed timestamp.
                # DA-871 - Only check status is disposed when reason code is a bad disposal.
                if cols[2] == 'Disposed' and get_sample_status_enum_value(
                        cols[8]) > SampleStatus.UNKNOWN:
                    status = SampleStatus.DISPOSED
                    ts_str = cols[9]

                ts = datetime.datetime.strptime(ts_str, '%Y/%m/%d %H:%M:%S')
                self._check_summary(participant_ids[x], test_codes[x], ts,
                                    status)

    def test_old_csv_not_imported(self):
        """A CSV whose file name is stamped 25 hours ago raises DataError on import."""
        stale_utc = clock.CLOCK.now() - datetime.timedelta(hours=25)
        stale_central = self._naive_utc_to_naive_central(stale_utc)
        stamp = stale_central.strftime(
            biobank_samples_pipeline.INPUT_CSV_TIME_FORMAT)
        self._write_cloud_csv('cloud%s.csv' % stamp, '')
        with self.assertRaises(biobank_samples_pipeline.DataError):
            biobank_samples_pipeline.upsert_from_latest_csv()

    def _naive_utc_to_naive_central(self, naive_utc_date):
        """Convert a naive UTC datetime to a naive US/Central datetime."""
        aware_utc = pytz.utc.localize(naive_utc_date)
        central_tz = pytz.timezone('US/Central')
        # Drop tzinfo again so the result compares against naive datetimes.
        return aware_utc.astimezone(central_tz).replace(tzinfo=None)

    def _check_summary(self, participant_id, test, date_formatted, status):
        summary = ParticipantSummaryDao().get(participant_id)
        self.assertEquals(summary.numBaselineSamplesArrived, 1)
        # DA-614 - All specific disposal statuses in biobank_stored_samples are changed to DISPOSED
        # in the participant summary.
        self.assertEquals(status, getattr(summary, 'sampleStatus' + test))
        sample_time = self._naive_utc_to_naive_central(
            getattr(summary, 'sampleStatus' + test + 'Time'))
        self.assertEquals(date_formatted, sample_time)

    def test_find_latest_csv(self):
        """_find_latest_samples_csv picks the newest top-level CSV.

        File names are chosen so lexical order differs from creation order,
        and a file written later into the report subdirectory must not win.
        """
        # The cloud storage testbed does not expose an injectable time function.
        # Creation time is stored at second granularity.
        self._write_cloud_csv('a_lex_first_created_first.csv', 'any contents')
        time.sleep(1.0)
        self._write_cloud_csv('z_lex_last_created_middle.csv', 'any contents')
        time.sleep(1.0)
        created_last = 'b_lex_middle_created_last.csv'
        self._write_cloud_csv(created_last, 'any contents')
        self._write_cloud_csv(
            '%s/created_last_in_subdir.csv' %
            biobank_samples_pipeline._REPORT_SUBDIR, 'any contents')

        latest_filename = biobank_samples_pipeline._find_latest_samples_csv(
            _FAKE_BUCKET)
        self.assertEqual(latest_filename,
                         '/%s/%s' % (_FAKE_BUCKET, created_last))

    def test_sample_from_row(self):
        """_create_sample_from_row maps CSV columns onto the sample's fields.

        Timestamps on the sample are converted to naive US/Central before
        being compared with the CSV text.
        """
        samples_file = test_data.open_biobank_samples([112, 222, 333], [])
        reader = csv.DictReader(StringIO.StringIO(samples_file),
                                delimiter='\t')
        # next(reader) works on Python 2.6+ and 3; reader.next() is 2-only.
        row = next(reader)
        sample = biobank_samples_pipeline._create_sample_from_row(
            row, get_biobank_id_prefix())
        self.assertIsNotNone(sample)

        cols = biobank_samples_pipeline.CsvColumns
        self.assertEqual(sample.biobankStoredSampleId, row[cols.SAMPLE_ID])
        self.assertEqual(to_client_biobank_id(sample.biobankId),
                         row[cols.EXTERNAL_PARTICIPANT_ID])
        self.assertEqual(sample.test, row[cols.TEST_CODE])
        confirmed_date = self._naive_utc_to_naive_central(sample.confirmed)
        self.assertEqual(
            confirmed_date.strftime(
                biobank_samples_pipeline._INPUT_TIMESTAMP_FORMAT),
            row[cols.CONFIRMED_DATE])
        received_date = self._naive_utc_to_naive_central(sample.created)
        self.assertEqual(
            received_date.strftime(
                biobank_samples_pipeline._INPUT_TIMESTAMP_FORMAT),
            row[cols.CREATE_DATE])

    def test_sample_from_row_wrong_prefix(self):
        """_create_sample_from_row returns None when called with prefix 'Q'."""
        samples_file = test_data.open_biobank_samples([111, 222, 333], [])
        reader = csv.DictReader(StringIO.StringIO(samples_file),
                                delimiter='\t')
        # next(reader) works on Python 2.6+ and 3; reader.next() is 2-only.
        row = next(reader)
        # The confirmed date is also made unparseable here, yet None (not a
        # DataError) is expected for the non-matching prefix.
        row[biobank_samples_pipeline.CsvColumns.CONFIRMED_DATE] = '2016 11 19'
        self.assertIsNone(
            biobank_samples_pipeline._create_sample_from_row(row, 'Q'))

    def test_sample_from_row_invalid(self):
        """_create_sample_from_row raises DataError for confirmed date '2016 11 19'."""
        samples_file = test_data.open_biobank_samples([111, 222, 333], [])
        reader = csv.DictReader(StringIO.StringIO(samples_file),
                                delimiter='\t')
        # next(reader) works on Python 2.6+ and 3; reader.next() is 2-only.
        row = next(reader)
        row[biobank_samples_pipeline.CsvColumns.CONFIRMED_DATE] = '2016 11 19'
        with self.assertRaises(biobank_samples_pipeline.DataError):
            biobank_samples_pipeline._create_sample_from_row(
                row, get_biobank_id_prefix())

    def test_sample_from_row_old_test(self):
        """_create_sample_from_row still builds a sample for test code '2PST8'."""
        samples_file = test_data.open_biobank_samples([111, 222, 333], [])
        reader = csv.DictReader(StringIO.StringIO(samples_file),
                                delimiter='\t')
        # next(reader) works on Python 2.6+ and 3; reader.next() is 2-only.
        row = next(reader)
        row[biobank_samples_pipeline.CsvColumns.TEST_CODE] = '2PST8'
        sample = biobank_samples_pipeline._create_sample_from_row(
            row, get_biobank_id_prefix())
        self.assertIsNotNone(sample)
        cols = biobank_samples_pipeline.CsvColumns
        self.assertEqual(sample.biobankStoredSampleId, row[cols.SAMPLE_ID])
        self.assertEqual(sample.test, row[cols.TEST_CODE])

    def test_column_missing(self):
        """Upserting from 'biobank_samples_missing_field.csv' raises DataError."""
        csv_path = test_data.data_path('biobank_samples_missing_field.csv')
        with open(csv_path) as samples_file:
            tab_reader = csv.DictReader(samples_file, delimiter='\t')
            with self.assertRaises(biobank_samples_pipeline.DataError):
                biobank_samples_pipeline._upsert_samples_from_csv(tab_reader)

    def test_get_reconciliation_report_paths(self):
        """_get_report_paths returns four dated .csv paths under 'reconciliation/'."""
        dt = datetime.datetime(2016, 12, 22, 18, 30, 45)
        expected_prefix = 'reconciliation/report_2016-12-22'
        paths = biobank_samples_pipeline._get_report_paths(dt)
        self.assertEqual(len(paths), 4)
        for path in paths:
            # Message arguments are (path, prefix), matching the wording;
            # they were previously passed in the opposite order.
            self.assertTrue(
                path.startswith(expected_prefix),
                'Report path %r must start with %r.' % (path, expected_prefix))
            self.assertTrue(path.endswith('.csv'))