Code example #1
File: gdc.py  Project: web-mev/mev-backend
    def query_for_project_names_within_program(program_id):
        '''
        Gets a mapping of the available project names within a 
        GDC program (e.g. all the TCGA cancer types within the TCGA
        program)

        Returns a dict that maps the project ID (e.g. TCGA-LUAD)
        to a "real" name like lung adenocarcinoma

        `program_id` is a string like TCGA or TARGET. One of the GDC
        top-level programs

        '''
        filters = GDCDataSource.create_program_filter(program_id)

        # 'program.name' gives the ID like "TCGA-LUAD" and
        # 'name' gives a "readable" name like "Lung adenocarcinoma"
        fields = ['program.name', 'name']
        query_params = GDCDataSource.create_query_params(
            fields,
            page_size=10000,  # gets all types at once
            filters=json.dumps(filters))
        r = get_with_retry(GDCDataSource.GDC_PROJECTS_ENDPOINT,
                           params=query_params)
        response_json = r.json()
        project_mapping_dict = {
            x['id']: x['name']
            for x in response_json['data']['hits']
        }
        return project_mapping_dict
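
A rough usage sketch for the query above. It assumes the method is exposed as a static method on GDCDataSource (the missing `self` suggests this, but the decorator is not shown), and the example mapping values are illustrative, not taken from a live GDC response.

# Hypothetical usage: list the projects within the TCGA program.
project_map = GDCDataSource.query_for_project_names_within_program('TCGA')
# project_map might look like:
# {'TCGA-LUAD': 'Lung Adenocarcinoma', 'TCGA-BRCA': 'Breast Invasive Carcinoma', ...}
for project_id, readable_name in project_map.items():
    print(project_id, '->', readable_name)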
Code example #2
    def check_if_ready(self):
        '''
        Makes sure all the proper environment variables, etc.
        are present to use this job runner. Should be invoked
        at startup of the Django app.
        '''

        # check that we can reach the Cromwell server
        url = self.CROMWELL_URL + self.VERSION_ENDPOINT
        try:
            response = get_with_retry(url)
        except Exception as ex:
            logger.info(
                'An exception was raised when checking if the remote Cromwell runner was ready.'
                ' The exception reads: {ex}'.format(ex=ex))
            raise ImproperlyConfigured(
                'Failed to check the remote Cromwell runner. See logs.')
        if response.status_code != 200:
            logger.info('The Cromwell server located at: {url}'
                        ' was not ready.'.format(url=url))
            raise ImproperlyConfigured('Failed to reach Cromwell server.')

        bucket_region = get_storage_backend().get_bucket_region(
            self.CROMWELL_BUCKET)
        instance_region = get_instance_region()
        if bucket_region != instance_region:
            raise ImproperlyConfigured(
                'The application is running on a'
                ' machine in the following region: {instance_region}. The'
                ' Cromwell bucket was found in {bucket_region}. They should'
                ' be located in the same region.'.format(
                    bucket_region=bucket_region,
                    instance_region=instance_region))
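
Every snippet on this page depends on a get_with_retry helper whose implementation is not shown here. As an assumption-laden sketch only, it is likely a thin wrapper around requests with a bounded retry loop, roughly along these lines (retry count, backoff, and which exceptions are retried may differ in the real mev-backend code):

import time
import requests

def get_with_retry(url, max_retries=3, backoff_seconds=2, **kwargs):
    # Minimal sketch of a retrying GET helper; not the actual implementation.
    for attempt in range(1, max_retries + 1):
        try:
            return requests.get(url, **kwargs)
        except requests.exceptions.RequestException:
            if attempt == max_retries:
                # out of retries: surface the original exception to the caller
                raise
            time.sleep(backoff_seconds * attempt)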
Code example #3
File: google_cloud.py  Project: web-mev/mev-backend
def get_instance_zone():
    try:
        response = get_with_retry(
            'http://metadata/computeMetadata/v1/instance/zone',
            headers={'Metadata-Flavor': 'Google'})
        # zone_str is something like 'projects/{project ID number}/zones/us-east4-c'
        zone_str = response.text
        zone = zone_str.split('/')[-1]  # now like us-east4-c
        return zone
    except Exception:
        # if we could not determine the instance zone, re-raise so the
        # caller can decide how to handle it
        raise
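
check_if_ready above compares the bucket region against get_instance_region(), which is not part of this excerpt. Assuming it follows the usual GCE convention that a zone string like 'us-east4-c' is the region plus a one-letter suffix, a sketch might be (the real function in mev-backend may be implemented differently):

def get_instance_region():
    # Hypothetical sketch: derive the region from the zone returned above.
    zone = get_instance_zone()                 # e.g. 'us-east4-c'
    region = '-'.join(zone.split('-')[:-1])    # e.g. 'us-east4'
    return region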
Code example #4
    def query_for_status(self, job_uuid):
        '''
        Performs the work of querying the Cromwell server.
        Returns either a dict (i.e. the response) or None, if
        the response did not have the expected 200 status code.
        '''
        endpoint = self.STATUS_ENDPOINT.format(cromwell_job_id=job_uuid)
        status_url = self.CROMWELL_URL + endpoint
        response = get_with_retry(status_url)
        bad_codes = [404, 400, 500]
        if response.status_code in bad_codes:
            logger.info('Request for Cromwell job status returned'
                        ' a {code} status.'.format(code=response.status_code))
        elif response.status_code == 200:
            response_json = json.loads(response.text)
            return response_json
        else:
            logger.info('Received an unexpected status code when querying'
                        ' the status of a Cromwell job.')
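
An illustrative polling loop built on query_for_status (not from the repo). The terminal state names and the 'status' key follow the Cromwell REST API; the function name and polling interval here are invented for illustration.

import time

def wait_for_completion(runner, job_uuid, poll_seconds=30):
    # Repeatedly ask Cromwell for a job's status until a terminal state is seen.
    terminal_states = {'Succeeded', 'Failed', 'Aborted'}
    while True:
        status_response = runner.query_for_status(job_uuid)
        if status_response is not None:
            state = status_response.get('status')
            if state in terminal_states:
                return state
        time.sleep(poll_seconds)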
Code example #5
    def query_for_metadata(self, job_uuid):
        '''
        Calls out to the Cromwell server to get metadata about
        a job. See
        https://cromwell.readthedocs.io/en/stable/api/RESTAPI/#get-workflow-and-call-level-metadata-for-a-specified-workflow
        '''
        endpoint = self.METADATA_ENDPOINT.format(cromwell_job_id=job_uuid)
        metadata_url = self.CROMWELL_URL + endpoint
        response = get_with_retry(metadata_url)
        bad_codes = [404, 400, 500]
        if response.status_code in bad_codes:
            logger.info('Request for Cromwell job metadata returned'
                        ' a {code} status.'.format(code=response.status_code))
        elif response.status_code == 200:
            response_json = json.loads(response.text)
            return response_json
        else:
            logger.info('Received an unexpected status code when querying'
                        ' the metadata of a Cromwell job.')
Code example #6
File: gdc.py  Project: web-mev/mev-backend
    def get_data_dictionary(self):
        '''
        The GDC defines a data schema which we query here. This gives the universe
        of data fields, which are used by child classes.
        '''

        # When querying the data dictionary, we also get extraneous fields
        # we don't care about. Add those to this list:
        IGNORED_PROPERTIES = [
            'cases', 'state', 'type', 'updated_datetime', 'created_datetime',
            'id', 'submitter_id', 'releasable', 'released',
            'intended_release_date', 'batch_id', 'programs'
        ]

        # Rather than getting EVERYTHING, we only query which fields are
        # available within these general categories:
        ATTRIBUTES = ['demographic', 'diagnosis', 'exposure', 'project']

        d = {}
        for attr in ATTRIBUTES:
            property_list = []
            url = self.GDC_DICTIONARY_ENDPOINT.format(attribute=attr)
            response = get_with_retry(url)
            j = response.json()
            properties = j['properties']

            for k in properties.keys():
                if k in IGNORED_PROPERTIES:
                    continue
                try:
                    description = properties[k]['description']
                except KeyError as ex:
                    description = None
                property_list.append({'field': k, 'description': description})
            d[attr] = property_list
        return d
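
A hedged usage sketch showing the shape of the returned dictionary: the result is keyed on each queried attribute and maps to a list of field/description pairs. The instance name `gds` and the printed fields are assumptions for illustration only.

# Hypothetical usage: inspect which annotation fields are available.
# 'gds' is assumed to be an instance of GDCDataSource (or a subclass).
data_dict = gds.get_data_dictionary()
for attribute, fields in data_dict.items():
    print(attribute)
    for item in fields:
        # each item is {'field': <field name>, 'description': <text or None>}
        print('  ', item['field'], '-', item['description'])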
Code example #7
File: gdc.py  Project: web-mev/mev-backend
    def _download_cohort(self, project_id, data_fields):
        '''
        Handles the download of metadata and actual data for a single
        GDC project (e.g. TCGA-LUAD). Will return a tuple of:
        - dataframe giving the metadata (i.e. patient info)
        - count matrix 
        '''
        final_query_params = self._create_rnaseq_query_params(project_id)

        # prepare some temporary loop variables
        finished = False
        i = 0
        downloaded_archives = []

        # We have to keep a map of the fileId to the aliquot so we can properly
        # concatenate the files later
        file_to_aliquot_mapping = {}
        annotation_df = pd.DataFrame()
        while not finished:
            logger.info('Downloading batch %d for %s...' % (i, project_id))

            # the records are paginated, so we have to keep track of which page we are currently requesting
            start_index = i * GDCDataSource.PAGE_SIZE
            end_index = (i + 1) * GDCDataSource.PAGE_SIZE
            final_query_params.update({'from': start_index})

            try:
                response = get_with_retry(GDCDataSource.GDC_FILES_ENDPOINT,
                                          params=final_query_params)
            except Exception as ex:
                logger.info(
                    'An exception was raised when querying the GDC for'
                    ' metadata. The exception reads: {ex}'.format(ex=ex))
                return

            if response.status_code == 200:
                response_json = json.loads(response.content.decode("utf-8"))
            else:
                logger.error('The response code was NOT 200, but the request'
                             ' exception was not handled.')
                return

            # On the first request, we can get the total record count by
            # examining the pagination data
            if i == 0:
                pagination_response = response_json['data']['pagination']
                total_records = int(pagination_response['total'])

            # now collect the file UUIDs and download
            file_uuid_list = []
            case_id_list = []
            exposures = []
            diagnoses = []
            demographics = []
            projects = []
            aliquot_ids = []

            for hit in response_json['data']['hits']:

                # hit['cases'] is a list. To date, we have only seen a length of 1,
                # and it's not clear what a greater length would mean.
                # Hence, catch this case and log it so we can investigate
                if len(hit['cases']) > 1:
                    logger.info(
                        'Encountered an unexpected issue when iterating through the returned hits'
                        ' of a GDC RNA-seq query. We expect the "cases" key for a hit to be of length 1,'
                        ' but this was greater. Returned data was: {k}'.format(
                            k=json.dumps(response_json)))
                    continue

                file_uuid_list.append(hit['file_id'])

                case_item = hit['cases'][0]
                case_id_list.append(case_item['case_id'])

                try:
                    exposures.append(case_item['exposures'][0])
                except KeyError as ex:
                    exposures.append({})

                try:
                    diagnoses.append(case_item['diagnoses'][0])
                except KeyError as ex:
                    diagnoses.append({})

                try:
                    demographics.append(case_item['demographic'])
                except KeyError as ex:
                    demographics.append({})

                try:
                    projects.append(case_item['project'])
                except KeyError as ex:
                    projects.append({})

                try:
                    aliquot_ids.append(
                        case_item['samples'][0]['portions'][0]['analytes'][0]
                        ['aliquots'][0]['aliquot_id'])
                except KeyError as ex:
                    # Need an aliquot ID to uniquely identify the column. Fail out
                    logger.error(
                        'Encountered an unexpected issue when iterating through the returned hits'
                        ' of a GDC RNA-seq query. We expect that we should be able to drill-down to find a unique aliquot ID.'
                        ' The returned data was: {k}'.format(
                            k=json.dumps(response_json)))
                    return

            logger.info('Adding {n} aliquots'.format(n=len(aliquot_ids)))
            file_to_aliquot_mapping.update(
                dict(zip(file_uuid_list, aliquot_ids)))

            exposure_df = GDCDataSource.merge_with_full_record(
                data_fields['exposure'], exposures, aliquot_ids)

            demographic_df = GDCDataSource.merge_with_full_record(
                data_fields['demographic'], demographics, aliquot_ids)

            diagnoses_df = GDCDataSource.merge_with_full_record(
                data_fields['diagnosis'], diagnoses, aliquot_ids)

            # note that we keep the extra 'project_id' field in this method call.
            # That gives us the cancer type such as "TCGA-BRCA", etc.
            project_df = GDCDataSource.merge_with_full_record(
                data_fields['project'],
                projects,
                aliquot_ids,
                extra_fields=['project_id'])

            # Remove the extra project_id column from the exposure, demo, and diagnoses dataframes. Otherwise we get duplicated
            # columns that we have to carry around:
            exposure_df = exposure_df.drop('project_id', axis=1)
            diagnoses_df = diagnoses_df.drop('project_id', axis=1)
            demographic_df = demographic_df.drop('project_id', axis=1)

            # Now merge all the dataframes (concatenate horizontally)
            # to get the full metadata/annotations
            ann_df = pd.concat(
                [exposure_df, demographic_df, diagnoses_df, project_df],
                axis=1)

            # Create another series which maps the aliquot IDs to the case ID.
            # That will then be added to the annotation dataframe so we know which
            # metadata is mapped to each case
            s = pd.Series(dict(zip(aliquot_ids, case_id_list)), name='case_id')

            ann_df = pd.concat([ann_df, s], axis=1)

            # Add to the master dataframe for this cancer type
            annotation_df = pd.concat([annotation_df, ann_df], axis=0)

            # Go get the actual count data for this batch.
            downloaded_archives.append(
                self._download_expression_archives(file_uuid_list))

            i += 1

            # are we done yet???
            if end_index >= total_records:
                finished = True

        logger.info('Completed looping through the batches for {ct}'.format(
            ct=project_id))

        # Merge and write the count files
        count_df = self._merge_downloaded_archives(downloaded_archives,
                                                   file_to_aliquot_mapping)

        logger.info(
            'For {ct}, created a count matrix with {n} aliquots.'.format(
                ct=project_id, n=count_df.shape[1]))

        # Clean up the downloaded archives
        for archive_path in downloaded_archives:
            os.remove(archive_path)

        return annotation_df, count_df
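
A hedged sketch of how _download_cohort might be driven and its outputs persisted. The `data_fields` argument is assumed to be the dictionary produced by get_data_dictionary above, `gds` is again an assumed GDCDataSource instance, and the output filenames are invented; the method returns None on failure, hence the guard.

# Illustrative driver (not from the repo): download one cohort and write
# the annotation and count matrices to tab-delimited files.
data_fields = gds.get_data_dictionary()
result = gds._download_cohort('TCGA-LUAD', data_fields)
if result is not None:
    annotation_df, count_df = result
    annotation_df.to_csv('tcga_luad_annotations.tsv', sep='\t')
    count_df.to_csv('tcga_luad_counts.tsv', sep='\t')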