Example #1
def process_genotyping_manifest_file_from_bucket(bucket_name,
                                                 genotyping_folder_name):
    bucket_stat_list = cloudstorage_api.listbucket('/' + bucket_name)
    if not bucket_stat_list:
        logging.info('No files in cloud bucket %r.' % bucket_name)
        return None
    all_files = bucket_stat_list
    bucket_stat_list = [
        s for s in bucket_stat_list
        if s.filename.lower().endswith('.csv')
        and genotyping_folder_name in s.filename
    ]
    if not bucket_stat_list:
        logging.info('No CSVs in cloud bucket %r folder %r (all files: %s).' %
                     (bucket_name, genotyping_folder_name, all_files))
        return None

    bucket_stat_list.sort(key=lambda s: s.st_ctime)
    path = bucket_stat_list[-1].filename
    timestamp = datetime.datetime.utcfromtimestamp(
        bucket_stat_list[-1].st_ctime)
    csv_file = cloudstorage_api.open(path)

    logging.info('Opening latest genotyping manifest CSV in %r: %r.',
                 bucket_name + '/' + genotyping_folder_name, path)

    now = clock.CLOCK.now()
    if now - timestamp > _MAX_INPUT_AGE:
        logging.info(
            'Input %r (timestamp %s UTC) is > 24h old (relative to %s UTC), not processing.'
            % (path, timestamp, now))

        return None

    update_sample_info_from_genotyping_manifest_file(csv_file)
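
The freshness check above relies on a module-level _MAX_INPUT_AGE that is not part of this excerpt. A minimal sketch of what it and a calling site might look like, assuming the 24-hour cutoff implied by the log message; the folder name literal is a placeholder:

import datetime

# Assumed from the "> 24h old" log message above; the real constant is defined
# elsewhere in the module.
_MAX_INPUT_AGE = datetime.timedelta(hours=24)

# Hypothetical invocation; the folder name is a placeholder, and the bucket
# setting mirrors the config lookup used in the next example.
process_genotyping_manifest_file_from_bucket(
    config.getSetting(config.BIOBANK_SAMPLES_BUCKET_NAME),
    'genotyping_manifests')
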
def process_genomic_manifest_result_file_from_bucket():
  bucket_name = config.getSetting(config.BIOBANK_SAMPLES_BUCKET_NAME)
  result_folder_name = config.getSetting(GENOMIC_BIOBANK_MANIFEST_RESULT_FOLDER_NAME)

  bucket_stat_list = cloudstorage_api.listbucket('/' + bucket_name)
  if not bucket_stat_list:
    logging.info('No files in cloud bucket %r.' % bucket_name)
    return None
  all_files = bucket_stat_list
  bucket_stat_list = [s for s in bucket_stat_list if s.filename.lower().endswith('.csv')
                      and result_folder_name in s.filename]
  if not bucket_stat_list:
    logging.info(
      'No CSVs in cloud bucket %r folder %r (all files: %s).' % (bucket_name, result_folder_name,
                                                                 all_files))
    return None

  bucket_stat_list.sort(key=lambda s: s.st_ctime)
  path = bucket_stat_list[-1].filename
  csv_file = cloudstorage_api.open(path)
  filename = path.replace('/' + bucket_name + '/' + result_folder_name + '/', '')
  logging.info('Opening latest genomic manifest result CSV in %r: %r.', bucket_name + '/' + result_folder_name,
               path)
  timestamp = timestamp_from_filename(filename)
  now = clock.CLOCK.now()
  if now - timestamp > _MAX_INPUT_AGE:
    logging.info('Input %r (timestamp %s UTC) is > 24h old (relative to %s UTC), not processing.'
                 % (filename, timestamp, now))
    return None

  genomic_set_id = _get_genomic_set_id_from_filename(filename)
  update_package_id_from_manifest_result_file(genomic_set_id, csv_file)
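
timestamp_from_filename and _get_genomic_set_id_from_filename are not part of this excerpt. Purely as an illustration of the kind of parsing involved, here is a hypothetical timestamp_from_filename that assumes the result filename embeds a YYYY-MM-DD-HH-MM-SS timestamp; the actual naming convention used by the pipeline is not shown:

import datetime
import re

def timestamp_from_filename(filename):
  # Hypothetical helper: pull a YYYY-MM-DD-HH-MM-SS timestamp out of the filename
  # and return it as a naive UTC datetime. The real filename convention is not
  # shown in this example.
  match = re.search(r'(\d{4}-\d{2}-\d{2}-\d{2}-\d{2}-\d{2})', filename)
  if not match:
    raise ValueError('No timestamp found in filename %r.' % filename)
  return datetime.datetime.strptime(match.group(1), '%Y-%m-%d-%H-%M-%S')
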
Example #3
def _find_latest_genomic_set_csv(self, cloud_bucket_name, keyword=None):
    bucket_stat_list = cloudstorage_api.listbucket('/' + cloud_bucket_name)
    if not bucket_stat_list:
        raise RuntimeError('No files in cloud bucket %r.' % cloud_bucket_name)
    all_files = bucket_stat_list
    bucket_stat_list = [
        s for s in bucket_stat_list if s.filename.lower().endswith('.csv')
    ]
    if not bucket_stat_list:
        raise RuntimeError('No CSVs in cloud bucket %r (all files: %s).' %
                           (cloud_bucket_name, all_files))
    if keyword:
        bucket_stat_keyword_list = []
        for item in bucket_stat_list:
            if keyword in item.filename:
                bucket_stat_keyword_list.append(item)
        if bucket_stat_keyword_list:
            bucket_stat_keyword_list.sort(key=lambda s: s.st_ctime)
            return bucket_stat_keyword_list[-1].filename
        else:
            raise RuntimeError(
                'No CSVs in cloud bucket %r with keyword %s (all files: %s).'
                % (cloud_bucket_name, keyword, all_files))
    bucket_stat_list.sort(key=lambda s: s.st_ctime)
    return bucket_stat_list[-1].filename
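
A hypothetical call site for the method above, assuming it is invoked from another method of the same class and that cloudstorage_api.open returns a file-like object as in the other examples; the bucket name and keyword are placeholders:

import csv

latest_path = self._find_latest_genomic_set_csv('my-genomic-bucket',
                                                 keyword='manifest-result')
csv_file = cloudstorage_api.open(latest_path)
reader = csv.DictReader(csv_file)
for row in reader:
    pass  # process each row of the newest matching CSV here
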
def update_ehr_status():
    """
  Entrypoint, executed as a cron job
  """
    now = clock.CLOCK.now()
    cutoff_date = (now - datetime.timedelta(days=1)).date()
    bucket_name = _get_curation_bucket_name()
    try:
        organization_info_list = _get_organization_info_list(
            cloudstorage_api.listbucket('/' + bucket_name), cutoff_date)
    except config.MissingConfigException as e:
        LOG.info(str(e))
        return
    for org_info in organization_info_list:
        deferred.defer(_do_update_for_organization, *org_info)
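
_get_organization_info_list and _do_update_for_organization are defined elsewhere. As a rough sketch of the filtering step only (the real implementation, including how organization identifiers are parsed from the filenames, may differ), a hypothetical version that keeps files created on or after the cutoff date:

import datetime

def _get_organization_info_list(bucket_stat_list, cutoff_date):
    # Hypothetical helper: keep only files created on or after cutoff_date and
    # build the argument tuples handed to _do_update_for_organization. How
    # organization identifiers are derived from the filenames is not shown here.
    org_info_list = []
    for stat in bucket_stat_list:
        created = datetime.datetime.utcfromtimestamp(stat.st_ctime).date()
        if created >= cutoff_date:
            org_info_list.append((stat.filename, created))  # placeholder tuple
    return org_info_list
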
Example #5
def _find_latest_genomic_set_csv(cloud_bucket_name):
  """Returns the full path (including bucket name) of the most recently created CSV in the bucket.

  Raises:
    RuntimeError: if no CSVs are found in the cloud storage bucket.
  """
  bucket_stat_list = cloudstorage_api.listbucket('/' + cloud_bucket_name)
  if not bucket_stat_list:
    raise RuntimeError('No files in cloud bucket %r.' % cloud_bucket_name)
  # GCS does not really have the concept of directories (it's just a filename convention), so all
  # directory listings are recursive and we must filter out subdirectory contents.
  all_files = bucket_stat_list
  bucket_stat_list = [s for s in bucket_stat_list if s.filename.lower().endswith('.csv')
                      and _RESULT_FILE_SUFFIX not in s.filename]
  if not bucket_stat_list:
    raise RuntimeError(
      'No CSVs in cloud bucket %r (all files: %s).' % (cloud_bucket_name, all_files))
  bucket_stat_list.sort(key=lambda s: s.st_ctime)
  return bucket_stat_list[-1].filename
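
_RESULT_FILE_SUFFIX is defined elsewhere; the filter presumably keeps result files written back into the same bucket from being picked up again as input. A minimal usage sketch, with the suffix value and bucket name as placeholder assumptions:

# Placeholder value; the real suffix is defined elsewhere in the module.
_RESULT_FILE_SUFFIX = '-result'

# Hypothetical invocation: locate the newest input CSV and open it for reading.
latest_csv_path = _find_latest_genomic_set_csv('my-genomic-bucket')
csv_file = cloudstorage_api.open(latest_csv_path)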