def update_package_id_from_manifest_result_file(genomic_set_id, csv_file):
    """Copy package ids from a manifest-result CSV onto genomic set members.

    The file is parsed as comma-delimited first; when that yields exactly one
    header column it is re-read from the start as tab-delimited.  Every row
    that has both a ``CsvColumns.VALUE`` and a ``CsvColumns.PACKAGE_ID`` value
    contributes one (client_id, package_id) pair, and all pairs are handed to
    ``GenomicSetMemberDao.bulk_update_package_id`` in a single call.

    Args:
        genomic_set_id: Passed through to the DAO together with the pairs.
        csv_file: Seekable file-like object holding the manifest result.

    Raises:
        DataError: If the file has no header row, if any column listed in
            ``CsvColumns.ALL`` is missing, or if a ``ValueError`` is raised
            while reading the rows or running the bulk update.
    """
    csv_reader = csv.DictReader(csv_file, delimiter=',')
    # DictReader.fieldnames is None for an empty file; treat that as "no
    # columns" so the caller gets a DataError instead of a TypeError.
    fieldnames = csv_reader.fieldnames or []
    if len(fieldnames) == 1:
        # A single column usually means the delimiter guess was wrong:
        # rewind and retry as tab-separated.
        csv_file.seek(0, 0)
        csv_reader = csv.DictReader(csv_file, delimiter='\t')
        fieldnames = csv_reader.fieldnames or []
    missing_cols = set(CsvColumns.ALL) - set(fieldnames)
    if missing_cols:
        raise DataError('CSV is missing columns %s, had columns %s.' %
                        (missing_cols, csv_reader.fieldnames))

    ClientIdPackageIdPair = collections.namedtuple('ClientIdPackageIdPair', [
        'client_id',
        'package_id',
    ])
    update_queue = collections.deque()

    dao = GenomicSetMemberDao()

    try:
        for row in csv_reader:
            # Rows lacking either value carry nothing to update.
            if row[CsvColumns.VALUE] and row[CsvColumns.PACKAGE_ID]:
                update_queue.append(
                    ClientIdPackageIdPair(row[CsvColumns.VALUE],
                                          row[CsvColumns.PACKAGE_ID]))

        dao.bulk_update_package_id(genomic_set_id, update_queue)

    except ValueError as e:
        raise DataError(e)
Beispiel #2
0
def update_sample_info_from_genotyping_manifest_file(csv_file):
    """Load sample info from a genotyping manifest CSV into genomic set members.

    Each row that has a biobank id, sample id, sample type and test name
    contributes one record (biobank_id, genome_type, sample_id, sample_type),
    where ``genome_type`` is taken from the ``CsvColumns.TEST_NAME`` column
    and a leading ``BIOBANK_ID_PREFIX`` is stripped from the biobank id.  All
    records are handed to
    ``GenomicSetMemberDao.bulk_update_genotyping_sample_manifest_data`` in a
    single call.

    Args:
        csv_file: File-like object holding the comma-delimited manifest.

    Raises:
        DataError: If the file has no header row, if any column listed in
            ``CsvColumns.REQUIRED_COLS`` is missing, or if a ``ValueError``
            is raised while reading the rows or running the bulk update.
    """
    csv_reader = csv.DictReader(csv_file, delimiter=',')
    # DictReader.fieldnames is None for an empty file; treat that as "no
    # columns" so the caller gets a DataError instead of a TypeError.
    missing_cols = set(CsvColumns.REQUIRED_COLS) - set(csv_reader.fieldnames or [])
    if missing_cols:
        # Report only the columns that are actually absent.
        raise DataError('CSV is missing columns %s, had columns %s.' %
                        (missing_cols, csv_reader.fieldnames))

    genotyping_data = collections.namedtuple('genotypingData', [
        'biobank_id',
        'genome_type',
        'sample_id',
        'sample_type',
    ])
    update_queue = collections.deque()

    dao = GenomicSetMemberDao()

    try:
        for row in csv_reader:
            if not (row[CsvColumns.BIOBANK_ID] and row[CsvColumns.SAMPLE_ID]
                    and row[CsvColumns.SAMPLE_TYPE] and row[CsvColumns.TEST_NAME]):
                # Incomplete rows carry nothing to update.
                continue
            biobank_id = row[CsvColumns.BIOBANK_ID]
            if biobank_id.startswith(BIOBANK_ID_PREFIX):
                biobank_id = biobank_id[len(BIOBANK_ID_PREFIX):]
            update_queue.append(
                genotyping_data(biobank_id, row[CsvColumns.TEST_NAME],
                                row[CsvColumns.SAMPLE_ID],
                                row[CsvColumns.SAMPLE_TYPE]))

        dao.bulk_update_genotyping_sample_manifest_data(update_queue)

    except ValueError as e:
        raise DataError(e)
Beispiel #3
0
 def setUp(self, with_data=True, use_mysql=False):
     """Run the parent setUp, create the DAOs used by tests, then call setup_data()."""
     parent = super(GenomicSetValidationBaseTestCase, self)
     parent.setUp(with_data=with_data, use_mysql=use_mysql)

     # DAOs shared by the tests.
     self.participant_dao = ParticipantDao()
     self.summary_dao = ParticipantSummaryDao()
     self.genomic_set_dao = GenomicSetDao()
     self.genomic_member_dao = GenomicSetMemberDao()

     # Participant counter, starting at zero.
     self._participant_i = 0

     self.setup_data()
Beispiel #4
0
def _save_genomic_set_from_csv(csv_reader, csv_filename, timestamp):
  """Inserts GenomicSet and GenomicSetMember from a csv.DictReader.

  The genomic set itself is created from the first data row, which must carry
  both a genomic set name and genomic set criteria.  Every row (including the
  first) becomes a member; members are upserted in batches of _BATCH_SIZE.
  After all members are stored, update_biobank_id is called on the member DAO
  for the set.

  Args:
    csv_reader: csv.DictReader positioned at the start of the upload.
    csv_filename: Passed to _insert_genomic_set_from_row with the first row.
    timestamp: Passed to _insert_genomic_set_from_row with the first row.

  Returns:
    The id of the inserted genomic set, or None when the CSV has no data rows.

  Raises:
    DataError: If the CSV has no header row, lacks any column listed in
      CsvColumns.ALL, the first row has no set name or criteria, or a
      ValueError is raised while building or storing the rows.
  """
  # fieldnames is None when the underlying file is empty; treat that as
  # "no columns" so the caller gets a DataError instead of a TypeError.
  missing_cols = set(CsvColumns.ALL) - set(csv_reader.fieldnames or ())
  if missing_cols:
    raise DataError(
        'CSV is missing columns %s, had columns %s.' % (missing_cols, csv_reader.fieldnames))
  member_dao = GenomicSetMemberDao()
  genomic_set_id = None
  try:
    members = []
    # Read every row before touching the database.
    rows = list(csv_reader)
    for i, row in enumerate(rows):
      if i == 0:
        # Only the first row is consulted for the set-level fields.
        if row[CsvColumns.GENOMIC_SET_NAME] and row[CsvColumns.GENOMIC_SET_CRITERIA]:
          genomic_set = _insert_genomic_set_from_row(row, csv_filename, timestamp)
          genomic_set_id = genomic_set.id
        else:
          raise DataError('CSV is missing columns genomic_set_name or genomic_set_criteria')
      member = _create_genomic_set_member_from_row(genomic_set_id, row)
      members.append(member)
      if len(members) >= _BATCH_SIZE:
        member_dao.upsert_all(members)
        members = []

    # Flush the final, partially filled batch.
    if members:
      member_dao.upsert_all(members)

    member_dao.update_biobank_id(genomic_set_id)

    return genomic_set_id
  except ValueError as e:
    raise DataError(e)
  def test_read_from_csv_file(self):
    """Reads an uploaded genomic set CSV and checks the stored set and members.

    Three participants (each with a summary and a biobank order) are inserted,
    the sample CSV is written through _write_cloud_csv under a timestamped
    name, read_genomic_set_from_bucket() is run, and the resulting GenomicSet
    and GenomicSetMember rows are checked.
    """
    # (participant id, biobank id, biobank order id, order identifier value)
    fixtures = (
        (123, 1234, '123', u'12345678'),
        (124, 1235, '124', u'12345679'),
        (125, 1236, '125', u'12345680'),
    )
    for participant_id, biobank_id, order_id, client_id in fixtures:
      participant = self.participant_dao.insert(
          Participant(participantId=participant_id, biobankId=biobank_id))
      self.summary_dao.insert(self.participant_summary(participant))
      order = self._make_biobank_order(
          participantId=participant.participantId,
          biobankOrderId=order_id,
          identifiers=[
              BiobankOrderIdentifier(system=u'https://www.pmi-ops.org', value=client_id)
          ])
      BiobankOrderDao().insert(order)

    samples_file = test_data.open_genomic_set_file('Genomic-Test-Set-test-1.csv')

    central_now = self._naive_utc_to_naive_central(clock.CLOCK.now())
    input_filename = 'cloud%s.csv' % central_now.strftime(
        genomic_set_file_handler.INPUT_CSV_TIME_FORMAT)

    self._write_cloud_csv(input_filename, samples_file)
    genomic_set_file_handler.read_genomic_set_from_bucket()

    # Set-level checks on the first stored set.
    stored_set = GenomicSetDao().get_all()[0]
    self.assertEqual(stored_set.genomicSetName, 'name_xxx')
    self.assertEqual(stored_set.genomicSetCriteria, 'criteria_xxx')
    self.assertEqual(stored_set.genomicSetVersion, 1)

    # Member-level checks: each field must be one of the expected values.
    for member in GenomicSetMemberDao().get_all():
      self.assertIn(member.participantId, [123, 124, 125])
      self.assertIn(member.biobankOrderId, ['123', '124', '125'])
      self.assertIn(member.biobankId, ['1234', '1235', '1236'])
      self.assertIn(member.biobankOrderClientId, ['12345678', '12345679', '12345680'])
      self.assertEqual(member.genomicSetId, 1)
      self.assertIn(member.genomeType, ['aou_wgs', 'aou_array'])
      self.assertIn(member.nyFlag, [0, 1])
      self.assertIn(member.sexAtBirth, ['F', 'M'])
Beispiel #6
0
    def test_over_24hours_genomic_set_file_case(self):
        """A set file whose name is stamped 25 hours in the past yields no members."""
        samples_file = test_data.open_genomic_set_file(
            'Genomic-Test-Set-test-3.csv')

        # Timestamp for the file name: 25 hours before the current clock time.
        stale_time = clock.CLOCK.now() - datetime.timedelta(hours=25)
        stale_stamp = self._naive_utc_to_naive_central(stale_time).strftime(
            genomic_set_file_handler.INPUT_CSV_TIME_FORMAT)
        input_filename = 'Genomic-Test-Set-v1%s.csv' % stale_stamp

        self._write_cloud_csv(input_filename, samples_file)

        genomic_pipeline.process_genomic_water_line()

        # No members may have been stored after the pipeline run.
        stored_members = GenomicSetMemberDao().get_all()
        self.assertEqual(len(stored_members), 0)
Beispiel #7
0
    def _create_fake_genomic_member(
            self,
            genomic_set_id,
            participant_id,
            biobank_order_id,
            validation_status=GenomicValidationStatus.VALID,
            sex_at_birth='F',
            genome_type='aou_array',
            ny_flag='Y'):
        """Build a GenomicSetMember from the arguments and insert it.

        ``ny_flag`` is stored as 1 when it equals 'Y' and as 0 otherwise.
        The member is inserted through a fresh GenomicSetMemberDao; nothing
        is returned.
        """
        member = GenomicSetMember()
        member.genomicSetId = genomic_set_id
        member.validationStatus = validation_status
        member.participantId = participant_id
        member.sexAtBirth = sex_at_birth
        member.genomeType = genome_type
        if ny_flag == 'Y':
            member.nyFlag = 1
        else:
            member.nyFlag = 0
        member.biobankOrderId = biobank_order_id

        GenomicSetMemberDao().insert(member)
Beispiel #8
0
  def _create_fake_genomic_member(self, genomic_set_id, participant_id, biobank_order_id,
                                  biobank_id, biobank_order_client_id,
                                  validation_status=GenomicSetMemberStatus.VALID,
                                  validation_flags=None,
                                  sex_at_birth='F', genome_type='aou_array', ny_flag='Y'):
    """Builds a GenomicSetMember from the arguments and inserts it.

    created and modified are both set to the current clock time, and ny_flag
    is stored as 1 when it equals 'Y' and as 0 otherwise.  The member is
    inserted through a fresh GenomicSetMemberDao; nothing is returned.
    """
    timestamp = clock.CLOCK.now()
    member = GenomicSetMember()
    member.genomicSetId = genomic_set_id
    member.created = timestamp
    member.modified = timestamp
    member.validationStatus = validation_status
    member.validationFlags = validation_flags
    member.participantId = participant_id
    member.sexAtBirth = sex_at_birth
    member.genomeType = genome_type
    if ny_flag == 'Y':
      member.nyFlag = 1
    else:
      member.nyFlag = 0
    member.biobankOrderId = biobank_order_id
    member.biobankId = biobank_id
    member.biobankOrderClientId = biobank_order_client_id

    GenomicSetMemberDao().insert(member)
Beispiel #9
0
class GenomicSetValidationBaseTestCase(SqlTestBase):
    """Test base class with factories for participants, summaries, sets and members."""

    def setUp(self, with_data=True, use_mysql=False):
        """Run the parent setUp, create the DAOs, then call setup_data()."""
        parent = super(GenomicSetValidationBaseTestCase, self)
        parent.setUp(with_data=with_data, use_mysql=use_mysql)

        self.participant_dao = ParticipantDao()
        self.summary_dao = ParticipantSummaryDao()
        self.genomic_set_dao = GenomicSetDao()
        self.genomic_member_dao = GenomicSetMemberDao()

        # Source of the ids handed out by make_participant().
        self._participant_i = 0

        self.setup_data()

    def setup_data(self):
        """Hook called at the end of setUp(); does nothing in this class."""

    def make_participant(self, **kwargs):
        """Insert and return a participant with custom settings.

        The next counter value is used for both participantId and biobankId;
        the defaults should create a valid participant.
        """
        next_id = self._participant_i
        self._participant_i = next_id + 1
        new_participant = Participant(participantId=next_id,
                                      biobankId=next_id,
                                      **kwargs)
        self.participant_dao.insert(new_participant)
        return new_participant

    def make_summary(self, participant, **override_kwargs):
        """Insert and return a summary for ``participant`` with custom settings.

        ``override_kwargs`` replace the defaults below; the defaults should
        create a valid summary.
        """
        summary_kwargs = {
            'participantId': participant.participantId,
            'biobankId': participant.biobankId,
            'withdrawalStatus': participant.withdrawalStatus,
            'dateOfBirth': datetime.datetime(2000, 1, 1),
            'firstName': 'foo',
            'lastName': 'bar',
            'zipCode': '12345',
            'sampleStatus1ED04': SampleStatus.RECEIVED,
            'sampleStatus1SAL2': SampleStatus.RECEIVED,
            'samplesToIsolateDNA': SampleStatus.RECEIVED,
            'consentForStudyEnrollmentTime': datetime.datetime(2019, 1, 1),
        }
        summary_kwargs.update(override_kwargs)
        new_summary = self._participant_summary_with_defaults(**summary_kwargs)
        self.summary_dao.insert(new_summary)
        return new_summary

    def make_genomic_set(self, **override_kwargs):
        """Insert and return a genomic set with custom settings.

        ``override_kwargs`` replace the defaults below; the defaults should
        create a valid set.
        """
        set_kwargs = {
            'genomicSetName': 'foo',
            'genomicSetCriteria': 'something',
            'genomicSetVersion': 1,
            'genomicSetStatus': GenomicSetStatus.UNSET,
        }
        set_kwargs.update(override_kwargs)
        new_set = GenomicSet(**set_kwargs)
        self.genomic_set_dao.insert(new_set)
        return new_set

    def make_genomic_member(self, genomic_set, participant, **override_kwargs):
        """Insert and return a genomic set member with custom settings.

        ``override_kwargs`` replace the defaults below; the defaults should
        create a valid member.
        """
        member_kwargs = {
            'genomicSetId': genomic_set.id,
            'participantId': participant.participantId,
            'sexAtBirth': 'F',
            'biobankId': participant.biobankId,
            'biobankOrderClientId': '12345678',
        }
        member_kwargs.update(override_kwargs)
        new_member = GenomicSetMember(**member_kwargs)
        self.genomic_member_dao.insert(new_member)
        return new_member
Beispiel #10
0
    def test_end_to_end_valid_case(self):
        """Run the genomic pipeline end to end and check every output.

        Three participants with summaries and biobank orders are created, a
        genomic set CSV and a manifest result CSV are written through
        _write_cloud_csv, and process_genomic_water_line() is run.  The test
        then checks the latest 'Validation-Result', 'Manifest' and
        'Manifest-Result' CSVs and the package ids stored on the members.
        """
        # One participant, summary and biobank order per order identifier.
        for client_id in (u'12345678', u'12345679', u'12345680'):
            participant = self._make_participant()
            self._make_summary(participant)
            self._make_biobank_order(
                participantId=participant.participantId,
                biobankOrderId=participant.participantId,
                identifiers=[
                    BiobankOrderIdentifier(system=u'https://www.pmi-ops.org',
                                           value=client_id)
                ])

        # Write the genomic set input file.
        samples_file = test_data.open_genomic_set_file(
            'Genomic-Test-Set-test-2.csv')
        set_stamp = self._naive_utc_to_naive_central(
            clock.CLOCK.now()).strftime(
                genomic_set_file_handler.INPUT_CSV_TIME_FORMAT)
        input_filename = 'Genomic-Test-Set-v1%s.csv' % set_stamp
        self._write_cloud_csv(input_filename, samples_file)

        # Write the manifest result file into the result folder.
        manifest_result_file = test_data.open_genomic_set_file(
            'Genomic-Manifest-Result-test.csv')
        result_stamp = self._naive_utc_to_naive_central(
            clock.CLOCK.now()).strftime(
                genomic_set_file_handler.INPUT_CSV_TIME_FORMAT)
        manifest_result_filename = (
            'Genomic-Manifest-Result-AoU-1-v1%s.csv' % result_stamp)
        self._write_cloud_csv(manifest_result_filename,
                              manifest_result_file,
                              bucket=_FAKE_BIOBANK_SAMPLE_BUCKET,
                              folder=_FAKE_BUCKET_RESULT_FOLDER)

        genomic_pipeline.process_genomic_water_line()

        # ---- verify result file ----
        class ResultCsvColumns(object):
            """Column names expected in the validation result CSV."""
            GENOMIC_SET_NAME = 'genomic_set_name'
            GENOMIC_SET_CRITERIA = 'genomic_set_criteria'
            PID = 'pid'
            BIOBANK_ORDER_ID = 'biobank_order_id'
            NY_FLAG = 'ny_flag'
            SEX_AT_BIRTH = 'sex_at_birth'
            GENOME_TYPE = 'genome_type'
            STATUS = 'status'
            INVALID_REASON = 'invalid_reason'

            ALL = (GENOMIC_SET_NAME, GENOMIC_SET_CRITERIA, PID,
                   BIOBANK_ORDER_ID, NY_FLAG, SEX_AT_BIRTH, GENOME_TYPE,
                   STATUS, INVALID_REASON)

        bucket_name = config.getSetting(config.GENOMIC_SET_BUCKET_NAME)
        path = self._find_latest_genomic_set_csv(bucket_name,
                                                 'Validation-Result')
        csv_reader = csv.DictReader(cloudstorage_api.open(path),
                                    delimiter=',')

        missing_cols = set(ResultCsvColumns.ALL) - set(csv_reader.fieldnames)
        self.assertEqual(len(missing_cols), 0)
        rows = list(csv_reader)
        self.assertEqual(len(rows), 3)

        # Per row: (pid and biobank order id, ny flag, genome type, sex at birth)
        expected_results = (
            ('1', 'Y', 'aou_wgs', 'M'),
            ('2', 'N', 'aou_array', 'F'),
            ('3', 'N', 'aou_array', 'M'),
        )
        for index, (pid, ny_flag, genome_type, sex) in enumerate(expected_results):
            row = rows[index]
            self.assertEqual(row[ResultCsvColumns.GENOMIC_SET_NAME],
                             'name_xxx')
            self.assertEqual(row[ResultCsvColumns.GENOMIC_SET_CRITERIA],
                             'criteria_xxx')
            self.assertEqual(row[ResultCsvColumns.STATUS], 'valid')
            self.assertEqual(row[ResultCsvColumns.INVALID_REASON], '')
            self.assertEqual(row[ResultCsvColumns.PID], pid)
            self.assertEqual(row[ResultCsvColumns.BIOBANK_ORDER_ID], pid)
            self.assertEqual(row[ResultCsvColumns.NY_FLAG], ny_flag)
            self.assertEqual(row[ResultCsvColumns.GENOME_TYPE], genome_type)
            self.assertEqual(row[ResultCsvColumns.SEX_AT_BIRTH], sex)

        # ---- verify manifest and manifest result files ----
        class ExpectedCsvColumns(object):
            """Column names shared by the manifest and manifest result checks."""
            VALUE = 'value'
            BIOBANK_ID = 'biobank_id'
            SEX_AT_BIRTH = 'sex_at_birth'
            GENOME_TYPE = 'genome_type'
            NY_FLAG = 'ny_flag'
            REQUEST_ID = 'request_id'
            PACKAGE_ID = 'package_id'

            ALL = (VALUE, SEX_AT_BIRTH, GENOME_TYPE, NY_FLAG, REQUEST_ID,
                   PACKAGE_ID)

        # Per row: (value, biobank id, sex at birth, genome type, ny flag, package id)
        expected_manifest = (
            ('12345678', '1', 'M', 'aou_wgs', 'Y', 'PKG-XXXX-XXXX1'),
            ('12345679', '2', 'F', 'aou_array', 'N', 'PKG-XXXX-XXXX2'),
            ('12345680', '3', 'M', 'aou_array', 'N', 'PKG-XXXX-XXXX3'),
        )

        # The package id is only asserted for the 'Manifest-Result' file.
        for file_tag, check_package_id in (('Manifest', False),
                                           ('Manifest-Result', True)):
            bucket_name = config.getSetting(
                config.BIOBANK_SAMPLES_BUCKET_NAME)
            path = self._find_latest_genomic_set_csv(bucket_name, file_tag)
            csv_reader = csv.DictReader(cloudstorage_api.open(path),
                                        delimiter=',')

            missing_cols = (set(ExpectedCsvColumns.ALL) -
                            set(csv_reader.fieldnames))
            self.assertEqual(len(missing_cols), 0)
            rows = list(csv_reader)
            for index, expected in enumerate(expected_manifest):
                value, biobank_id, sex, genome_type, ny_flag, package_id = expected
                row = rows[index]
                self.assertEqual(row[ExpectedCsvColumns.VALUE], value)
                self.assertEqual(row[ExpectedCsvColumns.BIOBANK_ID],
                                 biobank_id)
                self.assertEqual(row[ExpectedCsvColumns.SEX_AT_BIRTH], sex)
                self.assertEqual(row[ExpectedCsvColumns.GENOME_TYPE],
                                 genome_type)
                self.assertEqual(row[ExpectedCsvColumns.NY_FLAG], ny_flag)
                if check_package_id:
                    self.assertEqual(row[ExpectedCsvColumns.PACKAGE_ID],
                                     package_id)

        # ---- verify package id in database ----
        known_package_ids = [
            'PKG-XXXX-XXXX1', 'PKG-XXXX-XXXX2', 'PKG-XXXX-XXXX3'
        ]
        for member in GenomicSetMemberDao().get_all():
            self.assertIn(member.packageId, known_package_ids)