def generate_samples(fraction_missing):
    """Creates fake sample CSV data in GCS.

  Args:
    fraction_missing: This many samples which exist as BiobankStoredSamples will not have rows
        generated in the fake CSV.
  """
    bucket_name = config.getSetting(config.BIOBANK_SAMPLES_BUCKET_NAME)
    now = clock.CLOCK.now()
    file_name = '/%s/fake_%s.csv' % (bucket_name,
                                     now.strftime(INPUT_CSV_TIME_FORMAT))
    num_rows = 0
    sample_id_start = random.randint(1000000, 10000000)
    with cloudstorage_api.open(file_name, mode='w') as dest:
        writer = csv.writer(dest, delimiter="\t")
        writer.writerow(_HEADERS)
        biobank_order_dao = BiobankOrderDao()
        with biobank_order_dao.session() as session:
            rows = biobank_order_dao.get_ordered_samples_sample(
                session, 1 - fraction_missing, _BATCH_SIZE)
            for biobank_id, collected_time, test in rows:
                if collected_time is None:
                    logging.warning(
                        'biobank_id=%s test=%s skipped (collected=%s)',
                        biobank_id, test, collected_time)
                    continue
                minutes_delta = random.randint(
                    0, _MAX_MINUTES_BETWEEN_SAMPLE_COLLECTED_AND_CONFIRMED)
                confirmed_time = collected_time + datetime.timedelta(
                    minutes=minutes_delta)
                writer.writerow([
                    sample_id_start + num_rows,
                    None,  # no parent
                    confirmed_time.strftime(_TIME_FORMAT),
                    to_client_biobank_id(biobank_id),
                    test,
                    confirmed_time.strftime(_TIME_FORMAT),
                    'KIT'
                ])  # reuse confirmed time as created time
                num_rows += 1
        participant_dao = ParticipantDao()
        with participant_dao.session() as session:
            rows = participant_dao.get_biobank_ids_sample(
                session, _PARTICIPANTS_WITH_ORPHAN_SAMPLES, _BATCH_SIZE)
            for biobank_id, sign_up_time in rows:
                minutes_delta = random.randint(
                    0, _MAX_MINUTES_BETWEEN_PARTICIPANT_CREATED_AND_CONFIRMED)
                confirmed_time = sign_up_time + datetime.timedelta(
                    minutes=minutes_delta)
                tests = random.sample(BIOBANK_TESTS,
                                      random.randint(1, len(BIOBANK_TESTS)))
                for test in tests:
                    writer.writerow([
                        sample_id_start + num_rows, None,
                        confirmed_time.strftime(_TIME_FORMAT),
                        to_client_biobank_id(biobank_id), test,
                        confirmed_time.strftime(_TIME_FORMAT), 'KIT'
                    ])
                    num_rows += 1
    logging.info("Generated %d samples in %s.", num_rows, file_name)
Ejemplo n.º 2
0
  def testDeidentifiedExport_participantIds(self):
    TableExporter.export_tables('rdr', ['ppi_participant_view'], 'dir', deidentify=True)

    p1 = self._participant_with_defaults(
        participantId=1,
        version=2,
        biobankId=2,
        providerLink=make_primary_provider_link_for_name('PITT'))
    ParticipantDao().insert(p1)
    p2 = self._participant_with_defaults(
        participantId=2,
        version=3,
        biobankId=3,
        providerLink=make_primary_provider_link_for_name('PITT'))
    ParticipantDao().insert(p2)

    tasks = self.taskqueue_stub.get_filtered_tasks()
    self.assertEqual(len(tasks), 1)
    csv_path = deferred.run(tasks[0].payload)

    with cloudstorage_api.open('/' + csv_path, mode='r') as output:
      reader = csv.reader(output)
      rows = list(reader)[1:]
    self.assertEqual(2, len(rows))

    pmi_ids = set([p1.participantId, p2.participantId])
    obf_ids = set([row[0] for row in rows])
    self.assertFalse(pmi_ids.intersection(obf_ids),
                     'should be no overlap between pmi_ids and obfuscated IDs')
    self.assertEquals(2, len(obf_ids))
Ejemplo n.º 3
0
 def test_file_exists(self):
     consent_pdf_path = '/%s/Participant/somefile.pdf' % _FAKE_BUCKET
     with self.assertRaises(BadRequest):
         _raise_if_gcloud_file_missing(consent_pdf_path)
     with cloudstorage_api.open(consent_pdf_path, mode='w') as cloud_file:
         cloud_file.write('I am a fake PDF in a fake Cloud.')
     _raise_if_gcloud_file_missing(consent_pdf_path)
Ejemplo n.º 4
0
def process_genotyping_manifest_file_from_bucket(bucket_name,
                                                 genotyping_folder_name):
    bucket_stat_list = cloudstorage_api.listbucket('/' + bucket_name)
    if not bucket_stat_list:
        logging.info('No files in cloud bucket %r.' % bucket_name)
        return None
    bucket_stat_list = [
        s for s in bucket_stat_list
        if s.filename.lower().endswith('.csv') and '%s' %
        genotyping_folder_name in s.filename
    ]
    if not bucket_stat_list:
        logging.info('No CSVs in cloud bucket %r folder %r (all files: %s).' %
                     (bucket_name, genotyping_folder_name, bucket_stat_list))
        return None

    bucket_stat_list.sort(key=lambda s: s.st_ctime)
    path = bucket_stat_list[-1].filename
    timestamp = datetime.datetime.utcfromtimestamp(
        bucket_stat_list[-1].st_ctime)
    csv_file = cloudstorage_api.open(path)

    logging.info('Opening latest genotyping manifest CSV in %r: %r.',
                 bucket_name + '/' + genotyping_folder_name, path)

    now = clock.CLOCK.now()
    if now - timestamp > _MAX_INPUT_AGE:
        logging.info(
            'Input %r (timestamp %s UTC) is > 24h old (relative to %s UTC), not processing.'
            % (path, timestamp, now))

        return None

    update_sample_info_from_genotyping_manifest_file(csv_file)
Ejemplo n.º 5
0
 def open_writer(self, file_name, predicate=None):
   gcs_path = '/%s/%s' % (self._bucket_name, file_name)
   logging.info('Exporting data to %s...', gcs_path)
   with cloudstorage_api.open(gcs_path, mode='w') as dest:
     writer = SqlExportFileWriter(dest, predicate, use_unicode=self._use_unicode)
     yield writer
   logging.info('Export to %s complete.', gcs_path)
def _open_latest_genomic_set_file(cloud_bucket_name):
    """Returns an open stream for the most recently created CSV in the given bucket."""
    path = _find_latest_genomic_set_csv(cloud_bucket_name)
    filename = path.replace('/' + cloud_bucket_name + '/', '')
    logging.info('Opening latest samples CSV in %r: %r.', cloud_bucket_name,
                 path)
    return cloudstorage_api.open(path), filename
def process_genomic_manifest_result_file_from_bucket():
  bucket_name = config.getSetting(config.BIOBANK_SAMPLES_BUCKET_NAME)
  result_folder_name = config.getSetting(GENOMIC_BIOBANK_MANIFEST_RESULT_FOLDER_NAME)

  bucket_stat_list = cloudstorage_api.listbucket('/' + bucket_name)
  if not bucket_stat_list:
    logging.info('No files in cloud bucket %r.' % bucket_name)
    return None
  bucket_stat_list = [s for s in bucket_stat_list if s.filename.lower().endswith('.csv')
                      and '%s' % result_folder_name in s.filename]
  if not bucket_stat_list:
    logging.info(
      'No CSVs in cloud bucket %r folder %r (all files: %s).' % (bucket_name, result_folder_name,
                                                                 bucket_stat_list))
    return None

  bucket_stat_list.sort(key=lambda s: s.st_ctime)
  path = bucket_stat_list[-1].filename
  csv_file = cloudstorage_api.open(path)
  filename = path.replace('/' + bucket_name + '/' + result_folder_name + '/', '')
  logging.info('Opening latest genomic manifest result CSV in %r: %r.', bucket_name + '/' + result_folder_name,
               path)
  timestamp = timestamp_from_filename(filename)
  now = clock.CLOCK.now()
  if now - timestamp > _MAX_INPUT_AGE:
    logging.info('Input %r (timestamp %s UTC) is > 24h old (relative to %s UTC), not processing.'
                 % (filename, timestamp, now))
    print('Input %r (timestamp %s UTC) is > 24h old (relative to %s UTC), not processing.'
                 % (filename, timestamp, now))
    return None

  genomic_set_id = _get_genomic_set_id_from_filename(filename)
  update_package_id_from_manifest_result_file(genomic_set_id, csv_file)
Ejemplo n.º 8
0
  def uploadFile(self, bucket, memeImage):

    # Upload to Cloud Storage
    uniqueIdString = str(uuid.uuid4())
    uniqueIdString = uniqueIdString.replace('-', '')

    filename = bucket + '/meme-' + uniqueIdString + '.png'
    url = CLOUD_STORAGE_URL + filename

    write_retry_params = api_utils.RetryParams(backoff_factor=1.1)
    cloud_file = cloudstorage_api.open(filename,
                                       'w',
                                       content_type='image/png',
                                       options={'x-goog-acl': 'public-read'},
                                       retry_params=write_retry_params)
    output = StringIO.StringIO()
    memeImage.save(output, format="PNG")
    contents = output.getvalue()
    output.close()

    try:
      cloud_file.write(contents)
      cloud_file.close()
    except Exception, e:
      logging.error(e)
      return Meme(image_url="", text="")
Ejemplo n.º 9
0
 def _write_cloud_csv(self,
                      file_name,
                      contents_str,
                      bucket=None,
                      folder=None):
     bucket = _FAKE_BUCKET if bucket is None else bucket
     if folder is None:
         path = '/%s/%s' % (bucket, file_name)
     else:
         path = '/%s/%s/%s' % (bucket, folder, file_name)
     with cloudstorage_api.open(path, mode='w') as cloud_file:
         cloud_file.write(contents_str.encode('utf-8'))
def _get_participant_ids_from_person_file(person_file):
    """
  reads the specified CSV file from cloud storage and returns a list of the first column integers
  """
    def parse_pid(row):
        try:
            return int(row[0])
        except (KeyError, ValueError):
            pass

    with cloudstorage_api.open(person_file) as gcs_file:
        return filter(bool, map(parse_pid, csv.reader(gcs_file)))
Ejemplo n.º 11
0
  def _rewrite_tmpfile(self, mainfile, tmpfile, writer_spec):
    """Copies contents of tmpfile (name) to mainfile (buffer)."""
    if mainfile.closed:
      # can happen when finalize fails
      return

    account_id = self._get_tmp_account_id(writer_spec)
    f = cloudstorage_api.open(tmpfile, _account_id=account_id)
    # both reads and writes are buffered - the number here doesn't matter
    data = f.read(self._REWRITE_BLOCK_SIZE)
    while data:
      mainfile.write(data)
      data = f.read(self._REWRITE_BLOCK_SIZE)
    f.close()
    mainfile.flush()
def _open_latest_samples_file(cloud_bucket_name):
    """Returns an open stream for the most recently created CSV in the given bucket."""
    path = _find_latest_samples_csv(cloud_bucket_name)
    logging.info('Opening latest samples CSV in %r: %r.', cloud_bucket_name,
                 path)
    return cloudstorage_api.open(path), path
  def test_create_and_upload_biobank_manifest_file(self):
    participant = self.participant_dao.insert(Participant(participantId=123, biobankId=123))
    self.summary_dao.insert(self.participant_summary(participant))
    bo = self._make_biobank_order(participantId=participant.participantId, biobankOrderId='123',
                                  identifiers=[BiobankOrderIdentifier(
                                    system=u'https://www.pmi-ops.org', value=u'12345678')])
    BiobankOrderDao().insert(bo)

    participant2 = self.participant_dao.insert(Participant(participantId=124, biobankId=124))
    self.summary_dao.insert(self.participant_summary(participant2))
    bo2 = self._make_biobank_order(participantId=participant2.participantId, biobankOrderId='124',
                                   identifiers=[BiobankOrderIdentifier(
                                     system=u'https://www.pmi-ops.org', value=u'12345679')])
    BiobankOrderDao().insert(bo2)

    participant3 = self.participant_dao.insert(Participant(participantId=125, biobankId=125))
    self.summary_dao.insert(self.participant_summary(participant3))
    bo3 = self._make_biobank_order(participantId=participant3.participantId, biobankOrderId='125',
                                   identifiers=[BiobankOrderIdentifier(
                                     system=u'https://www.pmi-ops.org', value=u'12345680')])
    BiobankOrderDao().insert(bo3)

    genomic_set = self._create_fake_genomic_set('fake_genomic_set_name',
                                                'fake_genomic_set_criteria',
                                                'Genomic-Test-Set-v12019-04-05-00-30-10.CSV')
    self._create_fake_genomic_member(genomic_set.id, participant.participantId, bo.biobankOrderId,
                                     participant.biobankId, bo.identifiers[0].value,
                                     validation_status=GenomicValidationStatus.VALID,
                                     sex_at_birth='F', genome_type='aou_array', ny_flag='Y')

    self._create_fake_genomic_member(genomic_set.id, participant2.participantId, bo2.biobankOrderId,
                                     participant2.biobankId, bo2.identifiers[0].value,
                                     validation_status=GenomicValidationStatus.INVALID_AGE,
                                     sex_at_birth='M', genome_type='aou_array', ny_flag='N')

    self._create_fake_genomic_member(genomic_set.id, participant3.participantId, bo3.biobankOrderId,
                                     participant3.biobankId, bo3.identifiers[0].value,
                                     validation_status=GenomicValidationStatus.INVALID_CONSENT,
                                     sex_at_birth='F', genome_type='aou_wgs', ny_flag='Y')

    now = clock.CLOCK.now()
    genomic_biobank_menifest_handler\
      .create_and_upload_genomic_biobank_manifest_file(genomic_set.id, now)

    bucket_name = config.getSetting(config.BIOBANK_SAMPLES_BUCKET_NAME)
    # convert UTC to CDT
    now_cdt_str = _UTC.localize(now).astimezone(_US_CENTRAL).replace(tzinfo=None) \
      .strftime(_OUTPUT_CSV_TIME_FORMAT)

    class ExpectedCsvColumns(object):
      VALUE = 'value'
      BIOBANK_ID = 'biobank_id'
      SEX_AT_BIRTH = 'sex_at_birth'
      GENOME_TYPE = 'genome_type'
      NY_FLAG = 'ny_flag'
      REQUEST_ID = 'request_id'
      PACKAGE_ID = 'package_id'

      ALL = (VALUE, SEX_AT_BIRTH, GENOME_TYPE, NY_FLAG, REQUEST_ID, PACKAGE_ID)

    expected_result_filename = 'rdr_fake_sub_folder/Genomic-Manifest-AoU-1-v1' + \
                               now_cdt_str + '.CSV'
    path = '/' + bucket_name + '/' + expected_result_filename
    csv_file = cloudstorage_api.open(path)
    csv_reader = csv.DictReader(csv_file, delimiter=',')

    missing_cols = set(ExpectedCsvColumns.ALL) - set(csv_reader.fieldnames)
    self.assertEqual(len(missing_cols), 0)
    rows = list(csv_reader)
    self.assertEqual(rows[0][ExpectedCsvColumns.VALUE], '12345678')
    self.assertEqual(rows[0][ExpectedCsvColumns.BIOBANK_ID], '123')
    self.assertEqual(rows[0][ExpectedCsvColumns.SEX_AT_BIRTH], 'F')
    self.assertEqual(rows[0][ExpectedCsvColumns.GENOME_TYPE], 'aou_array')
    self.assertEqual(rows[0][ExpectedCsvColumns.NY_FLAG], 'Y')
    self.assertEqual(rows[1][ExpectedCsvColumns.VALUE], '12345679')
    self.assertEqual(rows[1][ExpectedCsvColumns.BIOBANK_ID], '124')
    self.assertEqual(rows[1][ExpectedCsvColumns.SEX_AT_BIRTH], 'M')
    self.assertEqual(rows[1][ExpectedCsvColumns.GENOME_TYPE], 'aou_array')
    self.assertEqual(rows[1][ExpectedCsvColumns.NY_FLAG], 'N')
    self.assertEqual(rows[2][ExpectedCsvColumns.VALUE], '12345680')
    self.assertEqual(rows[2][ExpectedCsvColumns.BIOBANK_ID], '125')
    self.assertEqual(rows[2][ExpectedCsvColumns.SEX_AT_BIRTH], 'F')
    self.assertEqual(rows[2][ExpectedCsvColumns.GENOME_TYPE], 'aou_wgs')
    self.assertEqual(rows[2][ExpectedCsvColumns.NY_FLAG], 'Y')
 def _write_cloud_csv(self, file_name, contents_str):
     with cloudstorage_api.open('/%s/%s' % (_FAKE_BUCKET, file_name),
                                mode='w') as cloud_file:
         cloud_file.write(contents_str.encode('utf-8'))
Ejemplo n.º 15
0
 def _read_csv_from_gcs(self, bucket_name, file_name):
   with cloudstorage_api.open('/%s/%s' % (bucket_name, file_name), mode='r') as infile:
     return list(csv.DictReader(infile))
Ejemplo n.º 16
0
    def test_end_to_end_valid_case(self):
        participant = self._make_participant()
        self._make_summary(participant)
        self._make_biobank_order(participantId=participant.participantId,
                                 biobankOrderId=participant.participantId,
                                 identifiers=[
                                     BiobankOrderIdentifier(
                                         system=u'https://www.pmi-ops.org',
                                         value=u'12345678')
                                 ])

        participant2 = self._make_participant()
        self._make_summary(participant2)
        self._make_biobank_order(participantId=participant2.participantId,
                                 biobankOrderId=participant2.participantId,
                                 identifiers=[
                                     BiobankOrderIdentifier(
                                         system=u'https://www.pmi-ops.org',
                                         value=u'12345679')
                                 ])

        participant3 = self._make_participant()
        self._make_summary(participant3)
        self._make_biobank_order(participantId=participant3.participantId,
                                 biobankOrderId=participant3.participantId,
                                 identifiers=[
                                     BiobankOrderIdentifier(
                                         system=u'https://www.pmi-ops.org',
                                         value=u'12345680')
                                 ])

        samples_file = test_data.open_genomic_set_file(
            'Genomic-Test-Set-test-2.csv')

        input_filename = 'Genomic-Test-Set-v1%s.csv' % self\
          ._naive_utc_to_naive_central(clock.CLOCK.now())\
          .strftime(genomic_set_file_handler.INPUT_CSV_TIME_FORMAT)

        self._write_cloud_csv(input_filename, samples_file)

        manifest_result_file = test_data.open_genomic_set_file(
            'Genomic-Manifest-Result-test.csv')

        manifest_result_filename = 'Genomic-Manifest-Result-AoU-1-v1%s.csv' % self \
          ._naive_utc_to_naive_central(clock.CLOCK.now()) \
          .strftime(genomic_set_file_handler.INPUT_CSV_TIME_FORMAT)

        self._write_cloud_csv(manifest_result_filename,
                              manifest_result_file,
                              bucket=_FAKE_BIOBANK_SAMPLE_BUCKET,
                              folder=_FAKE_BUCKET_RESULT_FOLDER)

        genomic_pipeline.process_genomic_water_line()

        # verify result file
        bucket_name = config.getSetting(config.GENOMIC_SET_BUCKET_NAME)
        path = self._find_latest_genomic_set_csv(bucket_name,
                                                 'Validation-Result')
        csv_file = cloudstorage_api.open(path)
        csv_reader = csv.DictReader(csv_file, delimiter=',')

        class ResultCsvColumns(object):
            """Names of CSV columns that we read from the genomic set upload."""
            GENOMIC_SET_NAME = 'genomic_set_name'
            GENOMIC_SET_CRITERIA = 'genomic_set_criteria'
            PID = 'pid'
            BIOBANK_ORDER_ID = 'biobank_order_id'
            NY_FLAG = 'ny_flag'
            SEX_AT_BIRTH = 'sex_at_birth'
            GENOME_TYPE = 'genome_type'
            STATUS = 'status'
            INVALID_REASON = 'invalid_reason'

            ALL = (GENOMIC_SET_NAME, GENOMIC_SET_CRITERIA, PID,
                   BIOBANK_ORDER_ID, NY_FLAG, SEX_AT_BIRTH, GENOME_TYPE,
                   STATUS, INVALID_REASON)

        missing_cols = set(ResultCsvColumns.ALL) - set(csv_reader.fieldnames)
        self.assertEqual(len(missing_cols), 0)
        rows = list(csv_reader)
        self.assertEqual(len(rows), 3)
        self.assertEqual(rows[0][ResultCsvColumns.GENOMIC_SET_NAME],
                         'name_xxx')
        self.assertEqual(rows[0][ResultCsvColumns.GENOMIC_SET_CRITERIA],
                         'criteria_xxx')
        self.assertEqual(rows[0][ResultCsvColumns.STATUS], 'valid')
        self.assertEqual(rows[0][ResultCsvColumns.INVALID_REASON], '')
        self.assertEqual(rows[0][ResultCsvColumns.PID], '1')
        self.assertEqual(rows[0][ResultCsvColumns.BIOBANK_ORDER_ID], '1')
        self.assertEqual(rows[0][ResultCsvColumns.NY_FLAG], 'Y')
        self.assertEqual(rows[0][ResultCsvColumns.GENOME_TYPE], 'aou_wgs')
        self.assertEqual(rows[0][ResultCsvColumns.SEX_AT_BIRTH], 'M')

        self.assertEqual(rows[1][ResultCsvColumns.GENOMIC_SET_NAME],
                         'name_xxx')
        self.assertEqual(rows[1][ResultCsvColumns.GENOMIC_SET_CRITERIA],
                         'criteria_xxx')
        self.assertEqual(rows[1][ResultCsvColumns.STATUS], 'valid')
        self.assertEqual(rows[1][ResultCsvColumns.INVALID_REASON], '')
        self.assertEqual(rows[1][ResultCsvColumns.PID], '2')
        self.assertEqual(rows[1][ResultCsvColumns.BIOBANK_ORDER_ID], '2')
        self.assertEqual(rows[1][ResultCsvColumns.NY_FLAG], 'N')
        self.assertEqual(rows[1][ResultCsvColumns.GENOME_TYPE], 'aou_array')
        self.assertEqual(rows[1][ResultCsvColumns.SEX_AT_BIRTH], 'F')

        self.assertEqual(rows[2][ResultCsvColumns.GENOMIC_SET_NAME],
                         'name_xxx')
        self.assertEqual(rows[2][ResultCsvColumns.GENOMIC_SET_CRITERIA],
                         'criteria_xxx')
        self.assertEqual(rows[2][ResultCsvColumns.STATUS], 'valid')
        self.assertEqual(rows[2][ResultCsvColumns.INVALID_REASON], '')
        self.assertEqual(rows[2][ResultCsvColumns.PID], '3')
        self.assertEqual(rows[2][ResultCsvColumns.BIOBANK_ORDER_ID], '3')
        self.assertEqual(rows[2][ResultCsvColumns.NY_FLAG], 'N')
        self.assertEqual(rows[2][ResultCsvColumns.GENOME_TYPE], 'aou_array')
        self.assertEqual(rows[2][ResultCsvColumns.SEX_AT_BIRTH], 'M')

        # verify manifest files
        bucket_name = config.getSetting(config.BIOBANK_SAMPLES_BUCKET_NAME)

        class ExpectedCsvColumns(object):
            VALUE = 'value'
            BIOBANK_ID = 'biobank_id'
            SEX_AT_BIRTH = 'sex_at_birth'
            GENOME_TYPE = 'genome_type'
            NY_FLAG = 'ny_flag'
            REQUEST_ID = 'request_id'
            PACKAGE_ID = 'package_id'

            ALL = (VALUE, SEX_AT_BIRTH, GENOME_TYPE, NY_FLAG, REQUEST_ID,
                   PACKAGE_ID)

        path = self._find_latest_genomic_set_csv(bucket_name, 'Manifest')
        csv_file = cloudstorage_api.open(path)
        csv_reader = csv.DictReader(csv_file, delimiter=',')

        missing_cols = set(ExpectedCsvColumns.ALL) - set(csv_reader.fieldnames)
        self.assertEqual(len(missing_cols), 0)
        rows = list(csv_reader)
        self.assertEqual(rows[0][ExpectedCsvColumns.VALUE], '12345678')
        self.assertEqual(rows[0][ExpectedCsvColumns.BIOBANK_ID], '1')
        self.assertEqual(rows[0][ExpectedCsvColumns.SEX_AT_BIRTH], 'M')
        self.assertEqual(rows[0][ExpectedCsvColumns.GENOME_TYPE], 'aou_wgs')
        self.assertEqual(rows[0][ExpectedCsvColumns.NY_FLAG], 'Y')
        self.assertEqual(rows[1][ExpectedCsvColumns.VALUE], '12345679')
        self.assertEqual(rows[1][ExpectedCsvColumns.BIOBANK_ID], '2')
        self.assertEqual(rows[1][ExpectedCsvColumns.SEX_AT_BIRTH], 'F')
        self.assertEqual(rows[1][ExpectedCsvColumns.GENOME_TYPE], 'aou_array')
        self.assertEqual(rows[1][ExpectedCsvColumns.NY_FLAG], 'N')
        self.assertEqual(rows[2][ExpectedCsvColumns.VALUE], '12345680')
        self.assertEqual(rows[2][ExpectedCsvColumns.BIOBANK_ID], '3')
        self.assertEqual(rows[2][ExpectedCsvColumns.SEX_AT_BIRTH], 'M')
        self.assertEqual(rows[2][ExpectedCsvColumns.GENOME_TYPE], 'aou_array')
        self.assertEqual(rows[2][ExpectedCsvColumns.NY_FLAG], 'N')

        # verify manifest result files
        bucket_name = config.getSetting(config.BIOBANK_SAMPLES_BUCKET_NAME)

        class ExpectedCsvColumns(object):
            VALUE = 'value'
            BIOBANK_ID = 'biobank_id'
            SEX_AT_BIRTH = 'sex_at_birth'
            GENOME_TYPE = 'genome_type'
            NY_FLAG = 'ny_flag'
            REQUEST_ID = 'request_id'
            PACKAGE_ID = 'package_id'

            ALL = (VALUE, SEX_AT_BIRTH, GENOME_TYPE, NY_FLAG, REQUEST_ID,
                   PACKAGE_ID)

        path = self._find_latest_genomic_set_csv(bucket_name,
                                                 'Manifest-Result')
        csv_file = cloudstorage_api.open(path)
        csv_reader = csv.DictReader(csv_file, delimiter=',')

        missing_cols = set(ExpectedCsvColumns.ALL) - set(csv_reader.fieldnames)
        self.assertEqual(len(missing_cols), 0)
        rows = list(csv_reader)
        self.assertEqual(rows[0][ExpectedCsvColumns.VALUE], '12345678')
        self.assertEqual(rows[0][ExpectedCsvColumns.BIOBANK_ID], '1')
        self.assertEqual(rows[0][ExpectedCsvColumns.SEX_AT_BIRTH], 'M')
        self.assertEqual(rows[0][ExpectedCsvColumns.GENOME_TYPE], 'aou_wgs')
        self.assertEqual(rows[0][ExpectedCsvColumns.NY_FLAG], 'Y')
        self.assertEqual(rows[0][ExpectedCsvColumns.PACKAGE_ID],
                         'PKG-XXXX-XXXX1')

        self.assertEqual(rows[1][ExpectedCsvColumns.VALUE], '12345679')
        self.assertEqual(rows[1][ExpectedCsvColumns.BIOBANK_ID], '2')
        self.assertEqual(rows[1][ExpectedCsvColumns.SEX_AT_BIRTH], 'F')
        self.assertEqual(rows[1][ExpectedCsvColumns.GENOME_TYPE], 'aou_array')
        self.assertEqual(rows[1][ExpectedCsvColumns.NY_FLAG], 'N')
        self.assertEqual(rows[1][ExpectedCsvColumns.PACKAGE_ID],
                         'PKG-XXXX-XXXX2')

        self.assertEqual(rows[2][ExpectedCsvColumns.VALUE], '12345680')
        self.assertEqual(rows[2][ExpectedCsvColumns.BIOBANK_ID], '3')
        self.assertEqual(rows[2][ExpectedCsvColumns.SEX_AT_BIRTH], 'M')
        self.assertEqual(rows[2][ExpectedCsvColumns.GENOME_TYPE], 'aou_array')
        self.assertEqual(rows[2][ExpectedCsvColumns.NY_FLAG], 'N')
        self.assertEqual(rows[2][ExpectedCsvColumns.PACKAGE_ID],
                         'PKG-XXXX-XXXX3')

        # verify package id in database
        member_dao = GenomicSetMemberDao()
        members = member_dao.get_all()
        for member in members:
            self.assertIn(
                member.packageId,
                ['PKG-XXXX-XXXX1', 'PKG-XXXX-XXXX2', 'PKG-XXXX-XXXX3'])
Ejemplo n.º 17
0
def assertCsvContents(test, bucket_name, file_name, contents):
    with cloudstorage_api.open('/%s/%s' % (bucket_name, file_name),
                               mode='r') as output:
        reader = csv.reader(output)
        rows = sorted(reader)
    test.assertEquals(sorted(contents), rows)
Ejemplo n.º 18
0
    def test_end_to_end_invalid_case(self):
        participant = self._make_participant()
        self._make_summary(participant, dateOfBirth='2018-02-14')
        self._make_biobank_order(participantId=participant.participantId,
                                 biobankOrderId=participant.participantId,
                                 identifiers=[
                                     BiobankOrderIdentifier(
                                         system=u'https://www.pmi-ops.org',
                                         value=u'12345678')
                                 ])

        participant2 = self._make_participant()
        self._make_summary(participant2,
                           consentForStudyEnrollmentTime=datetime.datetime(
                               1990, 1, 1))
        self._make_biobank_order(participantId=participant2.participantId,
                                 biobankOrderId=participant2.participantId,
                                 identifiers=[
                                     BiobankOrderIdentifier(
                                         system=u'https://www.pmi-ops.org',
                                         value=u'12345679')
                                 ])

        participant3 = self._make_participant()
        self._make_summary(participant3, zipCode='')
        self._make_biobank_order(participantId=participant3.participantId,
                                 biobankOrderId=participant3.participantId,
                                 identifiers=[
                                     BiobankOrderIdentifier(
                                         system=u'https://www.pmi-ops.org',
                                         value=u'12345680')
                                 ])

        participant4 = self._make_participant()
        self._make_summary(participant4)
        self._make_biobank_order(
            participantId=participant4.participantId,
            biobankOrderId=participant4.participantId,
            identifiers=[BiobankOrderIdentifier(system=u'c', value=u'e')])

        samples_file = test_data.open_genomic_set_file(
            'Genomic-Test-Set-test-3.csv')

        input_filename = 'Genomic-Test-Set-v1%s.csv' % self\
          ._naive_utc_to_naive_central(clock.CLOCK.now())\
          .strftime(genomic_set_file_handler.INPUT_CSV_TIME_FORMAT)

        self._write_cloud_csv(input_filename, samples_file)

        genomic_pipeline.process_genomic_water_line()

        # verify result file
        bucket_name = config.getSetting(config.GENOMIC_SET_BUCKET_NAME)
        path = self._find_latest_genomic_set_csv(bucket_name,
                                                 'Validation-Result')
        csv_file = cloudstorage_api.open(path)
        csv_reader = csv.DictReader(csv_file, delimiter=',')

        class ResultCsvColumns(object):
            """Names of CSV columns that we read from the genomic set upload."""
            GENOMIC_SET_NAME = 'genomic_set_name'
            GENOMIC_SET_CRITERIA = 'genomic_set_criteria'
            PID = 'pid'
            BIOBANK_ORDER_ID = 'biobank_order_id'
            NY_FLAG = 'ny_flag'
            SEX_AT_BIRTH = 'sex_at_birth'
            GENOME_TYPE = 'genome_type'
            STATUS = 'status'
            INVALID_REASON = 'invalid_reason'

            ALL = (GENOMIC_SET_NAME, GENOMIC_SET_CRITERIA, PID,
                   BIOBANK_ORDER_ID, NY_FLAG, SEX_AT_BIRTH, GENOME_TYPE,
                   STATUS, INVALID_REASON)

        missing_cols = set(ResultCsvColumns.ALL) - set(csv_reader.fieldnames)
        self.assertEqual(len(missing_cols), 0)
        rows = list(csv_reader)
        self.assertEqual(len(rows), 4)
        self.assertEqual(rows[0][ResultCsvColumns.GENOMIC_SET_NAME],
                         'name_xxx')
        self.assertEqual(rows[0][ResultCsvColumns.GENOMIC_SET_CRITERIA],
                         'criteria_xxx')
        self.assertEqual(rows[0][ResultCsvColumns.STATUS], 'invalid')
        self.assertEqual(rows[0][ResultCsvColumns.INVALID_REASON],
                         'INVALID_AGE')
        self.assertEqual(rows[0][ResultCsvColumns.PID], '1')
        self.assertEqual(rows[0][ResultCsvColumns.BIOBANK_ORDER_ID], '1')
        self.assertEqual(rows[0][ResultCsvColumns.NY_FLAG], 'Y')
        self.assertEqual(rows[0][ResultCsvColumns.GENOME_TYPE], 'aou_wgs')
        self.assertEqual(rows[0][ResultCsvColumns.SEX_AT_BIRTH], 'M')

        self.assertEqual(rows[1][ResultCsvColumns.GENOMIC_SET_NAME],
                         'name_xxx')
        self.assertEqual(rows[1][ResultCsvColumns.GENOMIC_SET_CRITERIA],
                         'criteria_xxx')
        self.assertEqual(rows[1][ResultCsvColumns.STATUS], 'invalid')
        self.assertEqual(rows[1][ResultCsvColumns.INVALID_REASON],
                         'INVALID_CONSENT')
        self.assertEqual(rows[1][ResultCsvColumns.PID], '2')
        self.assertEqual(rows[1][ResultCsvColumns.BIOBANK_ORDER_ID], '2')
        self.assertEqual(rows[1][ResultCsvColumns.NY_FLAG], 'N')
        self.assertEqual(rows[1][ResultCsvColumns.GENOME_TYPE], 'aou_array')
        self.assertEqual(rows[1][ResultCsvColumns.SEX_AT_BIRTH], 'F')

        self.assertEqual(rows[2][ResultCsvColumns.GENOMIC_SET_NAME],
                         'name_xxx')
        self.assertEqual(rows[2][ResultCsvColumns.GENOMIC_SET_CRITERIA],
                         'criteria_xxx')
        self.assertEqual(rows[2][ResultCsvColumns.STATUS], 'invalid')
        self.assertEqual(rows[2][ResultCsvColumns.INVALID_REASON],
                         'INVALID_NY_ZIPCODE')
        self.assertEqual(rows[2][ResultCsvColumns.PID], '3')
        self.assertEqual(rows[2][ResultCsvColumns.BIOBANK_ORDER_ID], '3')
        self.assertEqual(rows[2][ResultCsvColumns.NY_FLAG], 'N')
        self.assertEqual(rows[2][ResultCsvColumns.GENOME_TYPE], 'aou_array')
        self.assertEqual(rows[2][ResultCsvColumns.SEX_AT_BIRTH], 'M')

        self.assertEqual(rows[3][ResultCsvColumns.GENOMIC_SET_NAME],
                         'name_xxx')
        self.assertEqual(rows[3][ResultCsvColumns.GENOMIC_SET_CRITERIA],
                         'criteria_xxx')
        self.assertEqual(rows[3][ResultCsvColumns.STATUS], 'invalid')
        self.assertEqual(rows[3][ResultCsvColumns.INVALID_REASON],
                         'INVALID_BIOBANK_ORDER_CLIENT_ID')
        self.assertEqual(rows[3][ResultCsvColumns.PID], '4')
        self.assertEqual(rows[3][ResultCsvColumns.BIOBANK_ORDER_ID], '4')
        self.assertEqual(rows[3][ResultCsvColumns.NY_FLAG], 'Y')
        self.assertEqual(rows[3][ResultCsvColumns.GENOME_TYPE], 'aou_wgs')
        self.assertEqual(rows[3][ResultCsvColumns.SEX_AT_BIRTH], 'F')
  def test_create_genomic_set_result_file(self):
    participant = self.participant_dao.insert(Participant(participantId=123, biobankId=123))
    self.summary_dao.insert(self.participant_summary(participant))
    bo = self._make_biobank_order(participantId=participant.participantId, biobankOrderId='123',
                                  identifiers=[BiobankOrderIdentifier(
                                    system=u'https://www.pmi-ops.org', value=u'12345678')])
    BiobankOrderDao().insert(bo)

    participant2 = self.participant_dao.insert(Participant(participantId=124, biobankId=124))
    self.summary_dao.insert(self.participant_summary(participant2))
    bo2 = self._make_biobank_order(participantId=participant2.participantId, biobankOrderId='124',
                                   identifiers=[BiobankOrderIdentifier(
                                     system=u'https://www.pmi-ops.org', value=u'12345679')])
    BiobankOrderDao().insert(bo2)

    participant3 = self.participant_dao.insert(Participant(participantId=125, biobankId=125))
    self.summary_dao.insert(self.participant_summary(participant3))
    bo3 = self._make_biobank_order(participantId=participant3.participantId, biobankOrderId='125',
                                   identifiers=[BiobankOrderIdentifier(
                                     system=u'https://www.pmi-ops.org', value=u'12345680')])
    BiobankOrderDao().insert(bo3)

    genomic_set = self._create_fake_genomic_set('fake_genomic_set_name',
                                                'fake_genomic_set_criteria',
                                                'Genomic-Test-Set-v12019-04-05-00-30-10.CSV')
    self._create_fake_genomic_member(genomic_set.id, participant.participantId, bo.biobankOrderId,
                                     participant.biobankId, bo.identifiers[0].value,
                                     validation_status=GenomicValidationStatus.VALID,
                                     sex_at_birth='F', genome_type='aou_array', ny_flag='Y')

    self._create_fake_genomic_member(genomic_set.id, participant2.participantId, bo2.biobankOrderId,
                                     participant2.biobankId, bo2.identifiers[0].value,
                                     validation_status=GenomicValidationStatus.INVALID_AGE,
                                     sex_at_birth='M', genome_type='aou_array', ny_flag='N')

    self._create_fake_genomic_member(genomic_set.id, participant3.participantId, bo3.biobankOrderId,
                                     participant3.biobankId, bo3.identifiers[0].value,
                                     validation_status=GenomicValidationStatus.INVALID_CONSENT,
                                     sex_at_birth='F', genome_type='aou_wgs', ny_flag='Y')

    genomic_set_file_handler.create_genomic_set_status_result_file(genomic_set.id)

    expected_result_filename = 'Genomic-Test-Set-v12019-04-05-00-30-10-Validation-Result.CSV'
    bucket_name = config.getSetting(config.GENOMIC_SET_BUCKET_NAME)
    path = '/' + bucket_name + '/' + expected_result_filename
    csv_file = cloudstorage_api.open(path)
    csv_reader = csv.DictReader(csv_file, delimiter=',')

    class ResultCsvColumns(object):
      """Names of CSV columns that we read from the genomic set upload."""
      GENOMIC_SET_NAME = 'genomic_set_name'
      GENOMIC_SET_CRITERIA = 'genomic_set_criteria'
      PID = 'pid'
      BIOBANK_ORDER_ID = 'biobank_order_id'
      NY_FLAG = 'ny_flag'
      SEX_AT_BIRTH = 'sex_at_birth'
      GENOME_TYPE = 'genome_type'
      STATUS = 'status'
      INVALID_REASON = 'invalid_reason'

      ALL = (GENOMIC_SET_NAME, GENOMIC_SET_CRITERIA, PID, BIOBANK_ORDER_ID, NY_FLAG, SEX_AT_BIRTH,
             GENOME_TYPE, STATUS, INVALID_REASON)

    missing_cols = set(ResultCsvColumns.ALL) - set(csv_reader.fieldnames)
    self.assertEqual(len(missing_cols), 0)
    rows = list(csv_reader)
    self.assertEqual(len(rows), 3)
    self.assertEqual(rows[0][ResultCsvColumns.GENOMIC_SET_NAME], 'fake_genomic_set_name')
    self.assertEqual(rows[0][ResultCsvColumns.GENOMIC_SET_CRITERIA], 'fake_genomic_set_criteria')
    self.assertEqual(rows[0][ResultCsvColumns.STATUS], 'valid')
    self.assertEqual(rows[0][ResultCsvColumns.INVALID_REASON], '')
    self.assertEqual(rows[0][ResultCsvColumns.PID], '123')
    self.assertEqual(rows[0][ResultCsvColumns.BIOBANK_ORDER_ID], '123')
    self.assertEqual(rows[0][ResultCsvColumns.NY_FLAG], 'Y')
    self.assertEqual(rows[0][ResultCsvColumns.GENOME_TYPE], 'aou_array')
    self.assertEqual(rows[0][ResultCsvColumns.SEX_AT_BIRTH], 'F')

    self.assertEqual(rows[1][ResultCsvColumns.GENOMIC_SET_NAME], 'fake_genomic_set_name')
    self.assertEqual(rows[1][ResultCsvColumns.GENOMIC_SET_CRITERIA], 'fake_genomic_set_criteria')
    self.assertEqual(rows[1][ResultCsvColumns.STATUS], 'invalid')
    self.assertEqual(rows[1][ResultCsvColumns.INVALID_REASON], 'INVALID_AGE')
    self.assertEqual(rows[1][ResultCsvColumns.PID], '124')
    self.assertEqual(rows[1][ResultCsvColumns.BIOBANK_ORDER_ID], '124')
    self.assertEqual(rows[1][ResultCsvColumns.NY_FLAG], 'N')
    self.assertEqual(rows[1][ResultCsvColumns.GENOME_TYPE], 'aou_array')
    self.assertEqual(rows[1][ResultCsvColumns.SEX_AT_BIRTH], 'M')

    self.assertEqual(rows[2][ResultCsvColumns.GENOMIC_SET_NAME], 'fake_genomic_set_name')
    self.assertEqual(rows[2][ResultCsvColumns.GENOMIC_SET_CRITERIA], 'fake_genomic_set_criteria')
    self.assertEqual(rows[2][ResultCsvColumns.STATUS], 'invalid')
    self.assertEqual(rows[2][ResultCsvColumns.INVALID_REASON], 'INVALID_CONSENT')
    self.assertEqual(rows[2][ResultCsvColumns.PID], '125')
    self.assertEqual(rows[2][ResultCsvColumns.BIOBANK_ORDER_ID], '125')
    self.assertEqual(rows[2][ResultCsvColumns.NY_FLAG], 'Y')
    self.assertEqual(rows[2][ResultCsvColumns.GENOME_TYPE], 'aou_wgs')
    self.assertEqual(rows[2][ResultCsvColumns.SEX_AT_BIRTH], 'F')