def query_for_project_names_within_program(program_id):
    '''
    Gets a mapping of the available project names within a GDC program
    (e.g. all the TCGA cancer types within the TCGA program).

    Returns a dict that maps the project ID (e.g. TCGA-LUAD) to a "real"
    name like lung adenocarcinoma.

    `program_id` is a string like TCGA or TARGET, one of the GDC
    top-level programs.
    '''
    filters = GDCDataSource.create_program_filter(program_id)

    # 'program.name' gives the ID like "TCGA-LUAD" and
    # 'name' gives a "readable" name like "Lung adenocarcinoma"
    fields = ['program.name', 'name']

    query_params = GDCDataSource.create_query_params(
        fields,
        page_size=10000,  # gets all types at once
        filters=json.dumps(filters))
    r = get_with_retry(
        GDCDataSource.GDC_PROJECTS_ENDPOINT, params=query_params)
    response_json = r.json()
    project_mapping_dict = {
        x['id']: x['name'] for x in response_json['data']['hits']
    }
    return project_mapping_dict
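# A hypothetical usage sketch of the query above; the 'TCGA' argument and
# the exact names in the returned mapping are illustrative assumptions,
# not a recorded GDC response:
#
#   mapping = query_for_project_names_within_program('TCGA')
#   # e.g. {'TCGA-LUAD': 'Lung Adenocarcinoma',
#   #       'TCGA-BRCA': 'Breast Invasive Carcinoma', ...}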
def check_if_ready(self):
    '''
    Makes sure all the proper environment variables, etc. are present
    to use this job runner. Should be invoked at startup of the Django app.
    '''
    # Check that we can reach the Cromwell server
    url = self.CROMWELL_URL + self.VERSION_ENDPOINT
    try:
        response = get_with_retry(url)
    except Exception as ex:
        logger.info(
            'An exception was raised when checking if the remote Cromwell'
            ' runner was ready.'
            ' The exception reads: {ex}'.format(ex=ex))
        raise ImproperlyConfigured(
            'Failed to check the remote Cromwell runner. See logs.')
    if response.status_code != 200:
        logger.info('The Cromwell server located at: {url}'
            ' was not ready.'.format(url=url))
        raise ImproperlyConfigured('Failed to reach Cromwell server.')

    # The Cromwell bucket and this instance should be co-located
    # in the same region.
    bucket_region = get_storage_backend().get_bucket_region(
        self.CROMWELL_BUCKET)
    instance_region = get_instance_region()
    if bucket_region != instance_region:
        raise ImproperlyConfigured(
            'The application is running on a'
            ' machine in the following region: {instance_region}. The'
            ' Cromwell bucket was found in {bucket_region}. They should'
            ' be located in the same region.'.format(
                bucket_region=bucket_region,
                instance_region=instance_region))
def get_instance_zone():
    '''
    Queries the GCP metadata server for the zone of the current
    instance, e.g. 'us-east4-c'. Raises if the query fails.
    '''
    try:
        response = get_with_retry(
            'http://metadata/computeMetadata/v1/instance/zone',
            headers={'Metadata-Flavor': 'Google'})
        # zone_str is something like 'projects/{project ID number}/zones/us-east4-c'
        zone_str = response.text
        zone = zone_str.split('/')[-1]  # now like 'us-east4-c'
        return zone
    except Exception as ex:
        # We could not get the zone of the instance. Log and re-raise
        # for the caller to handle.
        logger.error('Failed to query the instance metadata for the'
            ' zone. The exception reads: {ex}'.format(ex=ex))
        raise
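# check_if_ready above calls get_instance_region(), which is not shown in
# this section. A minimal sketch of how it could be derived from the zone,
# assuming the standard GCP convention that a region is the zone name
# minus its trailing letter (e.g. 'us-east4-c' -> 'us-east4'); the actual
# helper may differ:

def get_instance_region():
    zone = get_instance_zone()  # e.g. 'us-east4-c'
    return '-'.join(zone.split('-')[:-1])  # e.g. 'us-east4'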
def query_for_status(self, job_uuid):
    '''
    Performs the work of querying the Cromwell server.

    Returns either a dict (i.e. the response) or None, if the response
    did not have the expected 200 status code.
    '''
    endpoint = self.STATUS_ENDPOINT.format(cromwell_job_id=job_uuid)
    status_url = self.CROMWELL_URL + endpoint
    response = get_with_retry(status_url)
    bad_codes = [404, 400, 500]
    if response.status_code in bad_codes:
        logger.info('Request for Cromwell job status returned'
            ' a {code} status.'.format(code=response.status_code))
    elif response.status_code == 200:
        return response.json()
    else:
        logger.info('Received an unexpected status code when querying'
            ' the status of a Cromwell job.')
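# Illustrative use of query_for_status; the 'status' key and values like
# 'Running', 'Succeeded', and 'Failed' are standard Cromwell workflow
# states, but this snippet is a sketch, not part of the runner:
#
#   status_json = runner.query_for_status(job_uuid)
#   if status_json is not None:
#       state = status_json.get('status')  # e.g. 'Running' or 'Succeeded'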
def query_for_metadata(self, job_uuid):
    '''
    Calls out to the Cromwell server to get metadata about a job. See
    https://cromwell.readthedocs.io/en/stable/api/RESTAPI/#get-workflow-and-call-level-metadata-for-a-specified-workflow
    '''
    endpoint = self.METADATA_ENDPOINT.format(cromwell_job_id=job_uuid)
    metadata_url = self.CROMWELL_URL + endpoint
    response = get_with_retry(metadata_url)
    bad_codes = [404, 400, 500]
    if response.status_code in bad_codes:
        logger.info('Request for Cromwell job metadata returned'
            ' a {code} status.'.format(code=response.status_code))
    elif response.status_code == 200:
        return response.json()
    else:
        logger.info('Received an unexpected status code when querying'
            ' the metadata of a Cromwell job.')
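# Sketch of pulling workflow-level outputs from the metadata payload. The
# 'outputs' key is part of Cromwell's documented metadata response, though
# the specific output names are workflow-dependent:
#
#   metadata = runner.query_for_metadata(job_uuid)
#   if metadata is not None:
#       outputs = metadata.get('outputs', {})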
def get_data_dictionary(self):
    '''
    The GDC defines a data schema which we query here. This gives the
    universe of data fields, which are used by child classes.
    '''
    # When querying the data dictionary, we also get extraneous fields
    # we don't care about. Add those to this list:
    IGNORED_PROPERTIES = [
        'cases', 'state', 'type', 'updated_datetime', 'created_datetime',
        'id', 'submitter_id', 'releasable', 'released',
        'intended_release_date', 'batch_id', 'programs'
    ]

    # Rather than getting EVERYTHING, we only query which fields are
    # available within these general categories:
    ATTRIBUTES = ['demographic', 'diagnosis', 'exposure', 'project']

    d = {}
    for attr in ATTRIBUTES:
        property_list = []
        url = self.GDC_DICTIONARY_ENDPOINT.format(attribute=attr)
        response = get_with_retry(url)
        j = response.json()
        properties = j['properties']
        for k in properties.keys():
            if k in IGNORED_PROPERTIES:
                continue
            # Not every property carries a description
            description = properties[k].get('description')
            property_list.append({'field': k, 'description': description})
        d[attr] = property_list
    return d
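# The returned dictionary maps each attribute category to a list of
# field/description pairs. A hypothetical illustration of its shape
# (the field names shown are examples, not a guaranteed GDC response):
#
#   {
#       'demographic': [
#           {'field': 'gender', 'description': '...'},
#           ...
#       ],
#       'diagnosis': [...],
#       'exposure': [...],
#       'project': [...]
#   }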
def _download_cohort(self, project_id, data_fields):
    '''
    Handles the download of metadata and actual data for a single GDC
    project (e.g. TCGA-LUAD).

    Returns a tuple of:
    - dataframe giving the metadata (i.e. patient info)
    - count matrix
    (or None if a request to the GDC failed)
    '''
    final_query_params = self._create_rnaseq_query_params(project_id)

    # Prepare some temporary loop variables
    finished = False
    i = 0
    downloaded_archives = []

    # We have to keep a map of the file ID to the aliquot so we can
    # properly concatenate the files later
    file_to_aliquot_mapping = {}
    annotation_df = pd.DataFrame()

    while not finished:
        logger.info('Downloading batch %d for %s...' % (i, project_id))

        # The records are paginated, so we have to keep track of which
        # page we are currently requesting
        start_index = i * GDCDataSource.PAGE_SIZE
        end_index = (i + 1) * GDCDataSource.PAGE_SIZE
        final_query_params.update({'from': start_index})
        try:
            response = get_with_retry(
                GDCDataSource.GDC_FILES_ENDPOINT,
                params=final_query_params)
        except Exception as ex:
            logger.info(
                'An exception was raised when querying the GDC for'
                ' metadata. The exception reads: {ex}'.format(ex=ex))
            return

        if response.status_code == 200:
            response_json = response.json()
        else:
            logger.error('The response code was NOT 200, but the request'
                ' exception was not handled.')
            return

        # On the first request, we can get the total number of records
        # by examining the pagination data
        if i == 0:
            pagination_response = response_json['data']['pagination']
            total_records = int(pagination_response['total'])

        # Now collect the file UUIDs and metadata for this batch
        file_uuid_list = []
        case_id_list = []
        exposures = []
        diagnoses = []
        demographics = []
        projects = []
        aliquot_ids = []

        for hit in response_json['data']['hits']:
            # hit['cases'] is a list. To date, we have only seen a length
            # of 1, and it's not clear what a greater length would mean.
            # Hence, catch this and issue an error so we can investigate.
            if len(hit['cases']) > 1:
                logger.info(
                    'Encountered an unexpected issue when iterating through the returned hits'
                    ' of a GDC RNA-seq query. We expect the "cases" key for a hit to be of length 1,'
                    ' but this was greater. Returned data was: {k}'.format(
                        k=json.dumps(response_json)))
                continue

            file_uuid_list.append(hit['file_id'])
            case_item = hit['cases'][0]
            case_id_list.append(case_item['case_id'])

            # Not every case carries all the metadata blocks; fall back
            # to an empty record so the lists stay aligned.
            try:
                exposures.append(case_item['exposures'][0])
            except KeyError:
                exposures.append({})
            try:
                diagnoses.append(case_item['diagnoses'][0])
            except KeyError:
                diagnoses.append({})
            try:
                demographics.append(case_item['demographic'])
            except KeyError:
                demographics.append({})
            try:
                projects.append(case_item['project'])
            except KeyError:
                projects.append({})
            try:
                aliquot_ids.append(
                    case_item['samples'][0]['portions'][0]['analytes'][0]
                    ['aliquots'][0]['aliquot_id'])
            except (KeyError, IndexError):
                # We need an aliquot ID to uniquely identify the column.
                # Fail out. (IndexError is also caught here since any of
                # the intermediate lists could be empty.)
                logger.error(
                    'Encountered an unexpected issue when iterating through the returned hits'
                    ' of a GDC RNA-seq query. We expect that we should be able to drill down'
                    ' to find a unique aliquot ID.'
                    ' The returned data was: {k}'.format(
                        k=json.dumps(response_json)))
                return

        logger.info('Adding {n} aliquots'.format(n=len(aliquot_ids)))
        file_to_aliquot_mapping.update(
            dict(zip(file_uuid_list, aliquot_ids)))

        exposure_df = GDCDataSource.merge_with_full_record(
            data_fields['exposure'], exposures, aliquot_ids)
        demographic_df = GDCDataSource.merge_with_full_record(
            data_fields['demographic'], demographics, aliquot_ids)
        diagnoses_df = GDCDataSource.merge_with_full_record(
            data_fields['diagnosis'], diagnoses, aliquot_ids)

        # Note that we keep the extra 'project_id' field in this method
        # call. That gives us the cancer type such as "TCGA-BRCA", etc.
        project_df = GDCDataSource.merge_with_full_record(
            data_fields['project'],
            projects,
            aliquot_ids,
            extra_fields=['project_id'])

        # Remove the extra project_id column from the exposure,
        # demographic, and diagnoses dataframes. Otherwise we get
        # duplicated columns that we have to carry around:
        exposure_df = exposure_df.drop('project_id', axis=1)
        diagnoses_df = diagnoses_df.drop('project_id', axis=1)
        demographic_df = demographic_df.drop('project_id', axis=1)

        # Now merge all the dataframes (concatenate horizontally)
        # to get the full metadata/annotations
        ann_df = pd.concat(
            [exposure_df, demographic_df, diagnoses_df, project_df],
            axis=1)

        # Create another series which maps the aliquot IDs to the case
        # ID. That is then added to the annotation dataframe so we know
        # which case each row of metadata belongs to.
        s = pd.Series(dict(zip(aliquot_ids, case_id_list)), name='case_id')
        ann_df = pd.concat([ann_df, s], axis=1)

        # Add to the master dataframe for this cancer type
        annotation_df = pd.concat([annotation_df, ann_df], axis=0)

        # Go get the actual count data for this batch.
        downloaded_archives.append(
            self._download_expression_archives(file_uuid_list))

        i += 1

        # Are we done yet?
        if end_index >= total_records:
            finished = True

    logger.info('Completed looping through the batches for {ct}'.format(
        ct=project_id))

    # Merge and write the count files
    count_df = self._merge_downloaded_archives(
        downloaded_archives, file_to_aliquot_mapping)
    logger.info(
        'For {ct}, created a count matrix with {n} aliquots.'.format(
            ct=project_id, n=count_df.shape[1]))

    # Clean up the downloaded archives
    for archive in downloaded_archives:
        os.remove(archive)

    return annotation_df, count_df
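# Illustrative call of the cohort download above; 'TCGA-LUAD' is just an
# example project ID, and data_fields would come from get_data_dictionary:
#
#   result = self._download_cohort('TCGA-LUAD', data_fields)
#   if result is not None:  # None signals a failed GDC request
#       annotation_df, count_df = result
#       # annotation_df: one row per aliquot, with the
#       #   exposure/demographic/diagnosis/project fields plus case_id
#       # count_df: count matrix with one column per aliquot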