Exemple #1
0
def update_cloudsql_from_bigquery(config,
                                  postproc_config,
                                  project_name,
                                  cloudsql_table,
                                  log,
                                  data_type=None,
                                  endpt_type=None):
    '''
    runs the configured post-processing BigQuery query and applies every page
    of its results to a CloudSQL table through a parameterized update statement

    parameters:
        config: configuration mappings; 'update_cloudsql' decides whether the
            update is actually executed
        postproc_config: mapping that supplies 'postproc_columns' (its keys are
            selected in the query, its values are the columns that get set),
            'postproc_key_column', 'postproc_query', 'postproc_project' and
            'postproc_fetch_count'
        project_name: when truthy, substituted into the query and appended to
            the progress messages
        cloudsql_table: name of the table to update
        log: where to write progress and other messages
        data_type: when truthy (and project_name is truthy), substituted into
            the query together with endpt_type
        endpt_type: only read when data_type is supplied
    '''
    columns = postproc_config['postproc_columns']

    # one "<column> = %s" assignment per configured column
    assignments = ',\n\t'.join('%s = %%s' % columns[key]
                               for key in columns.keys())
    update_stmt = 'update %s\nset \n\t%s\nwhere %s = %%s' % (
        cloudsql_table, assignments, postproc_config['postproc_key_column'])

    # the query template takes one, two or four substitutions depending on
    # which of project_name/data_type were supplied
    select_columns = ', '.join(columns.keys())
    if not project_name:
        query_args = select_columns
    elif data_type:  # assumes that endpt_type is also supplied
        query_args = (select_columns, endpt_type, project_name, data_type)
    else:
        query_args = (select_columns, project_name)
    query_results = query_bq_table(
        postproc_config['postproc_query'] % query_args, False,
        postproc_config['postproc_project'], log)

    log.info('\t\t\tupdate_stmt\n%s' % (update_stmt))
    next_page = None
    rows_done = 0
    while True:
        total_rows, rows, next_page = fetch_paged_results(
            query_results, postproc_config['postproc_fetch_count'],
            project_name, next_page, log)
        if not 0 < total_rows:
            log.info('\t\t\tno rows')
            return
        log.info('\t\t\ttotal rows: %s\n\t%s\n\t\t...\n\t%s' %
                 (total_rows, str(rows[0]), str(rows[-1])))

        if config['update_cloudsql']:
            ISBCGC_database_helper.update(config, update_stmt, log, rows, True)
        rows_done += len(rows)

        project_note = ' for ' + project_name if project_name else ''
        log.info('\t\t\tupdated %s so far%s' % (rows_done, project_note))
        if not next_page:
            log.info('\t\t\tupdated total of %s rows%s' %
                     (rows_done, project_note))
            return
    def select_sample_bq_barcodes(self, program):
        '''
        reads every (case_barcode, sample_barcode) pair from the program's
        Biospecimen BigQuery table

        parameters:
            program: program name, substituted into the dataset name

        returns:
            (set of sample barcodes, set of case barcodes), whitespace stripped

        raises:
            ValueError if the same sample barcode is returned more than once
        '''
        self.log.info('start select %s bq samples' % (program.lower()))
        query = 'SELECT case_barcode, sample_barcode FROM [isb-cgc:%s_bioclin_v0.Biospecimen]' % (
            program)
        results = query_bq_table(query, True, 'isb-cgc', self.log)

        cases_seen = set()
        samples_seen = set()
        next_page = None
        while True:
            total_rows, rows, next_page = fetch_paged_results(
                results, 10000, None, next_page, self.log)
            for row in rows:
                case_barcode, sample_barcode = row[0].strip(), row[1].strip()
                if sample_barcode in samples_seen:
                    raise ValueError('found duplicate sample entry: %s' %
                                     (sample_barcode))
                cases_seen.add(case_barcode)
                samples_seen.add(sample_barcode)

            if next_page:
                self.log.info('\t\tselect %d sample barcodes' % (len(rows)))
                continue

            # no further page: report the overall count and stop
            self.log.info('\tselected total of %s sample_barcodes' %
                          (total_rows))
            break

        return samples_seen, cases_seen
Exemple #3
0
def main(config_file_name):
    '''
    updates the datafilenamekey column of the CCLE bam rows in
    main.metadata_data with the paths found in the BigQuery
    GDCfileID_to_GCSurl table

    parameters:
        config_file_name: path to the JSON configuration file

    failures are logged rather than raised; if the failure happens before the
    log exists, the traceback is printed to stderr instead of being lost
    '''
    import traceback

    log = None
    try:
        with open(config_file_name) as configFile:
            config = json.load(configFile)

        log_dir = str(date.today()).replace('-', '_') + '_' + 'ccle/'
        log_name = create_log(log_dir, 'update_ccle_gcs_paths')
        log = logging.getLogger(log_name)

        log.info('begin updating CCLE paths in production')
        # first thing to do is to read in the file paths from BigQuery
        query = 'SELECT file_gdc_id, file_gcs_url ' \
            'FROM [isb-cgc:GDC_metadata.GDCfileID_to_GCSurl] ' \
            'where 0 < instr(file_gcs_url, \'CCLE\')'

        query_results = query_bq_table(query, True, 'isb-cgc', log)

        log.info('\tcreate map of filename to path')
        name2path = {}
        page_token = None
        while True:
            # walk every page; reading only the first one would silently drop
            # all rows past the fetch count
            _, rows, page_token = fetch_paged_results(
                query_results, 2000, None, page_token, log)
            for row in rows:
                fields = row[1].split('/')
                # key on the last path component, keep everything after the
                # first three '/'-separated fields as the stored path
                name2path[fields[-1]] = '/'.join(fields[3:])
            if not page_token:
                break
        log.info('\tfinished map of filename to path')

        # get the db rows from production cloudsql
        log.info('\tselect ccle filenames from cloudsql')
        query = 'SELECT datafilename ' \
            'FROM main.metadata_data ' \
            'where 0 < instr(datafilename, \'bam\') and project = \'CCLE\''

        rows = helper.select(config, query, log, [])
        log.info('\tselected %s ccle filenames from cloudsql' % (len(rows)))

        # now setup and do the update of paths in cloud sql
        log.info('\tstart updating paths in cloudsql')
        params = []
        not_matched = []
        for row in rows:
            if row[0] in name2path:
                params.append([name2path[row[0]], row[0]])
            else:
                not_matched.append(row[0])
        update = 'update main.metadata_data set datafilenamekey = %s where datafilename = %s'
        helper.update(config, update, log, params)
        log.info('\tcompleted update of paths in cloudsql. updated %d, did not find matches from BQ in cloudsql for %s' % (len(params), ', '.join(not_matched)))

        log.info('finished updating CCLE paths in production')
    except Exception:
        if log:
            log.exception('failed to update ccle GCS filepaths')
        else:
            # the failure happened before logging was set up (e.g. unreadable
            # config file); report it instead of swallowing it silently
            traceback.print_exc()
    finally:
        if log:
            close_log(log)
Exemple #4
0
def call_bigquery(table, log):
    '''
    collects every file_name row BigQuery holds for the program that the
    given metadata table belongs to

    parameters:
        table: metadata table name; the text before its first '_' is used as
            the program name, and the two HG38 tables are looked up in the
            "current" file data table, everything else in the "legacy" one
        log: where to write progress and other messages

    returns:
        set of the rows returned by the query
    '''
    hg38_tables = ('TARGET_metadata_data_HG38', 'TCGA_metadata_data_HG38')
    if table not in hg38_tables:
        bq_table = '[isb-cgc:GDC_metadata.rel8_fileData_legacy]'
    else:
        bq_table = '[isb-cgc:GDC_metadata.rel8_fileData_current]'

    program = table.split('_')[0]
    select_stmt = 'select file_name from {} where cases__project__program__name = "{}"'.format(bq_table, program)
    query_results = query_bq_table(select_stmt, True, 'isb-cgc', log)

    found = set()
    next_page = None
    more_pages = True
    while more_pages:
        _, rows, next_page = fetch_paged_results(query_results, 20000, None, next_page, log)
        found.update(rows)
        more_pages = bool(next_page)

    log.info('\t\tfound %s rows ' % (len(found)))
    return found
    def get_bq_data_type_barcodes(self, program_name, bq_table, sample_barcode,
                                  has_file, log):
        '''
        groups the case barcodes, sample barcodes and file ids found in a
        BigQuery data table by project

        parameters:
            program_name: only used in the progress messages
            bq_table: table to select from; tables with 'Methylation' in the
                name are grouped under the literal project "ALL" instead of
                their project_short_name column
            sample_barcode: name of the column holding the sample barcode
            has_file: when truthy the file_gdc_id column is selected as well,
                otherwise an empty string stands in for it
            log: where to write the start/finish/problem messages (paging
                messages go to self.log)

        returns:
            (project2cases, project2samples, project2files), each mapping the
            project to a set; project2files only collects non-empty file ids

        raises:
            whatever the query or paging raises, after logging it
        '''
        log.info('\t\tgetting bq data type barcodes {}-{} for gcs'.format(
            program_name, bq_table))

        try:
            if 'Methylation' in bq_table:
                project = '"ALL"'
            else:
                project = 'project_short_name'
            if has_file:
                stmt = 'select {}, case_barcode, {}, file_gdc_id from {} group by 1, 2, 3, 4'.format(
                    project, sample_barcode, bq_table)
            else:
                stmt = 'select {}, case_barcode, {}, "" from {} group by 1, 2, 3'.format(
                    project, sample_barcode, bq_table)
            project2cases = {}
            project2samples = {}
            project2files = {}
            results = query_bq_table(stmt, True, 'isb-cgc', self.log)
            page_token = None
            while True:
                total, rows, page_token = fetch_paged_results(
                    results, 1000, None, page_token, self.log)
                for row in rows:
                    project2cases.setdefault(row[0], set()).add(row[1])
                    project2samples.setdefault(row[0], set()).add(row[2])
                    files = project2files.setdefault(row[0], set())
                    # skip the "" placeholder selected when has_file is false;
                    # testing the (initially empty) set here instead would
                    # mean no file id is ever recorded
                    if row[3]:
                        files.add(row[3])

                if not page_token:
                    self.log.info(
                        '\tfinished select from {}.  selected {} total rows'.
                        format(bq_table, total))
                    break

            log.info('\t\tfinished bq data type barcodes {}-{} for gcs'.format(
                program_name, bq_table))
            return project2cases, project2samples, project2files
        except Exception:
            log.exception('problem in get_bq_data_type_barcodes()')
            raise
    def get_project_counts(self, table, column, values):
        '''
        logs, per project_short_name, how many rows of a BigQuery table have
        one of the given values in the given column

        parameters:
            table: table to select from
            column: column the values are matched against
            values: strings placed, single-quoted, into the "in" list
        '''
        self.log.info('start get project counts for %s:%s' % (table, column))
        quoted_values = "', '".join(values)
        clinical_query = "SELECT project_short_name, count(*) FROM %s where %s in ('%s') group by 1" % (
            table, column, quoted_values)
        results = query_bq_table(clinical_query, True, 'isb-cgc', self.log)

        # header first, then one tab separated line per project
        report = ['\tproject\tcount\n']
        next_page = None
        while True:
            _, rows, next_page = fetch_paged_results(results, 50, None,
                                                     next_page, self.log)
            report.extend('\t%s\t%d\n' % (row[0], row[1]) for row in rows)
            if not next_page:
                break

        self.log.info('project counts:\n%s' % (''.join(report)))
Exemple #7
0
def process_image_type(config, image_type, log):
    '''
    for every program listed under 'program_names_for_images', queries
    BigQuery for the image rows of the given type, hands each page of results
    to process_image_records() and finally passes all of the rows to
    verify_barcodes_filenames()

    parameters:
        config: configuration mappings
        image_type: key of the image section to use from each program's
            'images' configuration
        log: where to write progress and other messages
    '''
    for program in config['program_names_for_images']:
        # for programs with images, select the appropriate section from the config file
        image_config = config[program]['process_files']['images']
        type_config = image_config[image_type]

        # query the big query table
        select_stmt = type_config['bq_select_template'].format(
            ','.join(type_config['bq_columns']))
        query_results = query_bq_table(select_stmt, type_config['use_legacy'],
                                       image_config['target_program'], log)

        all_rows = []
        next_page = None
        while True:
            # loop through the big query results
            total_rows, rows, next_page = fetch_paged_results(
                query_results, image_config['fetch_count'], None, next_page,
                log)
            all_rows.extend(rows)
            # process this page of rows
            process_image_records(config, program, image_config, image_type,
                                  rows, log)
            if next_page:
                continue
            log.info('\tchecked total of %s rows' % (total_rows))
            break

        # verification runs once per program, over every page's rows
        verify_barcodes_filenames(config, program, image_config, image_type,
                                  all_rows, log)
    def select_clinical_bq_barcodes(self, program):
        '''
        reads every case_barcode from the program's clinical BigQuery table

        parameters:
            program: program name, substituted into the dataset name; CCLE
                uses the clinical_v0 table, every other program uses Clinical

        returns:
            set of the distinct case barcodes, whitespace stripped; barcodes
            that were returned more than once are reported through
            self.print_partial_list()
        '''
        self.log.info('start select %s bq cases' % (program.lower()))
        if program != 'CCLE':
            query_template = 'SELECT case_barcode FROM [isb-cgc:%s_bioclin_v0.Clinical]'
        else:
            query_template = 'SELECT case_barcode FROM [isb-cgc:%s_bioclin_v0.clinical_v0]'
        clinical_results = query_bq_table(query_template % (program), True,
                                          'isb-cgc', self.log)

        distinct_barcodes = set()
        repeated_barcodes = set()
        next_page = None
        while True:
            total_rows, rows, next_page = fetch_paged_results(
                clinical_results, 10000, None, next_page, self.log)
            for row in rows:
                case_barcode = row[0].strip()
                # the first sighting goes to the result, any later one to the
                # duplicates
                if case_barcode in distinct_barcodes:
                    bucket = repeated_barcodes
                else:
                    bucket = distinct_barcodes
                bucket.add(case_barcode)

            if next_page:
                self.log.info('\t\tselect %d barcodes' % (len(rows)))
                continue
            self.log.info('\tselected total of %s case_barcodes' %
                          (total_rows))
            break

        if repeated_barcodes:
            self.print_partial_list(
                'duplicate case barcodes in BQ (%s)' %
                (len(repeated_barcodes)), repeated_barcodes)

        return distinct_barcodes
    def get_bq_case_info(self, program, barcode_type, barcodes, tag):
        '''
        looks up which of the given barcodes' cases exist in the program's
        clinical BigQuery table

        parameters:
            program: program name; CCLE uses the clinical_v0 table, every
                other program uses Clinical
            barcode_type: not used by this method
            barcodes: barcodes to look up; the part of each barcode before its
                last '-' is what gets matched against case_barcode
            tag: not used by this method

        returns:
            dict mapping each case barcode that was found to a set holding
            that same barcode
        '''
        if program != 'CCLE':
            table = 'isb-cgc:{}_bioclin_v0.Clinical'.format(program)
        else:
            table = 'isb-cgc:CCLE_bioclin_v0.clinical_v0'

        # rpartition keeps everything before the last '-' (and yields '' when
        # the barcode has no '-')
        quoted_cases = ', '.join('"{}"'.format(barcode.rpartition('-')[0])
                                 for barcode in set(barcodes))
        stmt = 'SELECT case_barcode FROM [{}] where case_barcode in ({})'.format(
            table, quoted_cases)
        results = query_bq_table(stmt, True, 'isb-cgc', self.log)

        found = {}
        next_page = None
        more_pages = True
        while more_pages:
            _, rows, next_page = fetch_paged_results(results, 10000, None,
                                                     next_page, self.log)
            for row in rows:
                case_barcode = row[0].strip()
                if case_barcode not in found:
                    found[case_barcode] = set()
                found[case_barcode].add(case_barcode)
            more_pages = bool(next_page)

        return found
    def test_populate_sample_availibility(self):
        '''
        rebuilds the TCGA sample data availability records for the HG38
        Somatic_Mutation table: deletes the existing records, reads the
        tumor barcode/file pairs from BigQuery, shapes one info map per
        distinct (tumor sample, file) pair and passes them to
        populate_sample_availibility()
        '''
        bqTable2data_type = {'Somatic_Mutation': 'Masked Somatic Mutation'}
        for bqTable, data_type in bqTable2data_type.items():
            self.log.info('populate_sample_availibility() for %s' %
                          (data_type))
            # remove existing records
            # NOTE: the isb_label is hard-coded here rather than taken from
            # bqTable, which matches the only entry in bqTable2data_type
            stmt = 'delete from TCGA_metadata_sample_data_availability where metadata_data_type_availability_id = ' \
            '(select metadata_data_type_availability_id from TCGA_metadata_data_type_availability where isb_label = "Somatic_Mutation" and genomic_build = "HG38")'
            ISBCGC_database_helper.update(self.config, stmt, self.log, [[]])

            query_results = query_bq_table(
                'select Tumor_Sample_Barcode, Matched_Norm_Sample_Barcode, fileUUID from [isb-cgc:TCGA_hg38_data_v0.{}] group by 1,2,3'
                .format(bqTable), True, None, self.log)
            page_token = None
            barcode2seen_files = {}
            barcode2infos = {}
            while True:
                # loop through the big query results and get the sample_barcode into the info list as populate_sample_availibility()
                # expects it
                total_rows, rows, page_token = fetch_paged_results(
                    query_results, 200000, None, page_token, self.log)
                for row in rows:
                    # only the tumor barcode is used; the matched normal
                    # barcode in row[1] is ignored
                    tumor = row[0][:16]
                    seen_files = barcode2seen_files.setdefault(tumor, set())
                    for curfile in row[2].split('|'):
                        # compare the individual file id, not the whole
                        # '|'-joined field, so repeats are actually skipped
                        if curfile in seen_files:
                            continue
                        seen_files.add(curfile)

                        info = {
                            'access': 'open',
                            'cases': [{
                                'samples': [{
                                    'submitter_id': tumor
                                }]
                            }]
                        }
                        barcode2infos.setdefault(tumor, []).append(info)

                if not page_token:
                    self.log.info('\tprocessed total of %s rows for %s' %
                                  (total_rows, bqTable))
                    break

            # flatten once, after the last page: barcode2infos accumulates
            # across pages, so flattening per page would repeat earlier pages
            infos = [
                info for curinfos in barcode2infos.values()
                for info in curinfos
            ]
            populate_sample_availibility(self.config, 'current', 'TCGA', 'all',
                                         data_type, infos, self.log)
            self.log.info('finished populate_sample_availibility() for %s' %
                          (data_type))
        bq2cases = {}
        bq2samples = {}
        bq2files = {}
        for table in bq_tables:
            if not table[0]:
                continue
            if not table[2]:
                sql = 'select {}, "" from {}'.format(case, table[0])
            elif not table[1]:
                sql = 'select "", {} from {}'.format(sample, table[0])
            else:
                sql = 'select {}, {} from {}'.format(case, sample, table[0])

            self.log.info('\tstart select for {} from bq{}'.format(
                table[0], ' where {}'.format(where) if where else ''))
            results = query_bq_table(sql, True, 'isb-cgc', self.log)
            count = 0
            page_token = None
            cases = set()
            samples = set()
            while True:
                total, rows, page_token = fetch_paged_results(
                    results, 1000, None, page_token, self.log)
                count += 1000
                for row in rows:
                    cases.add(row[0])
                    samples.add(row[1])

                if not page_token:
                    self.log.info(
                        '\tfinished select from {}.  select {} total rows'.
    def get_bq_file_info(self, program, barcode_type, barcodes, tag):
        '''
        collects, per barcode, the file metadata that BigQuery holds for it
        across every data table known for the program

        parameters:
            program: program name; only 'TARGET' and 'TCGA' have data tables
                configured here, any other program yields an empty result
            barcode_type: name of the barcode column to filter on, except for
                the Somatic_Mutation tables which use their own tumor barcode
                columns; 'case_barcode' also controls how many leading
                characters of the returned barcode are kept
            barcodes: barcodes to look up
            tag: not used by this method

        returns:
            dict mapping barcode to a list of
            [data_type, data_format, experimental_strategy, platform,
            analysis_workflow_type-or-"None"] lists, the last three as strings

        raises:
            ValueError if a row comes back for a barcode that was not asked for
        '''
        program2dataset2data_type2column_name = {
            'TARGET': {
                'TARGET_hg38_data_v0': {
                    'RNAseq_Gene_Expression': 'file_gdc_id',
                    'miRNAseq_Expression': 'file_gdc_id',
                    'miRNAseq_Isoform_Expression': 'file_gdc_id'
                },
            },
            'TCGA': {
                'TCGA_hg19_data_v0': {
                    'Copy_Number_Segment_Masked': 'aliquot_barcode',
                    'DNA_Methylation': 'aliquot_barcode',
                    'Protein_Expression': 'aliquot_barcode',
                    'RNAseq_Gene_Expression_UNC_RSEM': 'aliquot_barcode',
                    'miRNAseq_Expression': 'file_gdc_id',
                    'miRNAseq_Isoform_Expression': 'file_gdc_id',
                    'Somatic_Mutation_DCC': 'aliquot_barcode_tumor',
                    'Somatic_Mutation_MC3': 'aliquot_barcode_tumor'
                },
                'TCGA_hg38_data_v0': {
                    'Copy_Number_Segment_Masked': 'file_gdc_id',
                    'RNAseq_Gene_Expression': 'file_gdc_id',
                    'miRNAseq_Expression': 'file_gdc_id',
                    'miRNAseq_Isoform_Expression': 'file_gdc_id',
                    'DNA_Methylation': 'file_gdc_id',
                    'Protein_Expression': 'aliquot_barcode',
                    'Somatic_Mutation': 'fileName'
                }
            }
        }

        # build the set once: it feeds the "in" list and gives constant-time
        # membership checks for every returned row
        barcode_set = set(barcodes)
        barcodes_in = ','.join('"{}"'.format(barcode)
                               for barcode in barcode_set)
        query_template = 'select left(data.{0}, {8}), meta.data_type, meta.data_format, meta.experimental_strategy, meta.platform, {1} from [isb-cgc:{2}.{3}] data join [isb-cgc:GDC_metadata.{4}] meta\n' \
            '  on data.{5} = meta.{6}\nwhere data.{0} in ({7})\ngroup by 1,2,3,4,5,6'

        # number of leading characters of the barcode column to return; it
        # depends only on the arguments, so work it out once
        if 'case_barcode' == barcode_type:
            if program == "CCLE":
                length = 200
            elif program == 'TARGET':
                length = 16
            else:
                length = 12
        else:
            length = 200

        barcode2infos = {}
        dataset2data_type2column_name = program2dataset2data_type2column_name.get(
            program, {})
        for dataset, data_type2column_name in dataset2data_type2column_name.items():
            if 'hg19' in dataset:
                # the legacy metadata is queried with a constant in place of
                # the workflow type column
                column_sub = '"None"'
                table = 'rel5_legacy_fileData'
            else:
                column_sub = 'meta.analysis_workflow_type'
                table = 'rel5_current_fileData'
            for data_type, column_name in data_type2column_name.items():
                if column_name in ('aliquot_barcode', 'Tumor_Sample_Barcode',
                                   'aliquot_barcode_tumor'):
                    join_col = 'associated_entities__entity_submitter_id'
                elif column_name == 'fileName':
                    join_col = 'file_name'
                else:
                    join_col = column_name

                if data_type in ('Somatic_Mutation', ):
                    barcode_t = 'Tumor_Sample_Barcode'
                elif data_type in ('Somatic_Mutation_MC3',
                                   'Somatic_Mutation_DCC'):
                    barcode_t = 'sample_barcode_tumor'
                else:
                    barcode_t = barcode_type

                query = query_template.format(barcode_t, column_sub, dataset,
                                              data_type, table, column_name,
                                              join_col, barcodes_in, length)
                results = query_bq_table(query, True, 'isb-cgc', self.log)

                page_token = None
                while True:
                    _, rows, page_token = fetch_paged_results(
                        results, 10000, None, page_token, self.log)
                    for row in rows:
                        barcode = row[0]
                        if barcode not in barcode_set:
                            # report both what was asked for and the row that
                            # did not match
                            raise ValueError(
                                'unexpected mismatch of return with barcodes:\n{}\nrow: {}\n'
                                .format(', '.join(barcodes),
                                        ', '.join(str(field)
                                                  for field in row)))
                        barcode2infos.setdefault(barcode, []).append([
                            row[1], row[2],
                            str(row[3]),
                            str(row[4]),
                            str(row[5])
                        ])

                    if not page_token:
                        break

        return barcode2infos
def main(config_file_name):
    '''
    updates datafilenamekey and datafileuploaded for the TCGA bam rows in
    metadata_data, page by page, from the legacy TCGA bam paths found in the
    BigQuery GDCfileID_to_GCSurl table

    parameters:
        config_file_name: path to the JSON configuration file

    failures are logged rather than raised; if the failure happens before the
    log exists, the traceback is printed to stderr instead of being lost
    '''
    import traceback

    log = None
    try:
        with open(config_file_name) as configFile:
            config = json.load(configFile)

        log_dir = str(date.today()).replace('-', '_') + '_' + 'tcga/'
        log_name = create_log(log_dir, 'update_tcga_gcs_paths')
        log = logging.getLogger(log_name)

        log.info('begin updating TCGA paths in production')

        # get the db rows from production cloudsql
        log.info('\tselect tcga filenames from cloudsql')
        query = 'SELECT datafilename ' \
            'FROM metadata_data ' \
            'where 0 < instr(datafilename, \'bam\') and project = \'TCGA\''

        cloudsql_rows = set(row[0]
                            for row in helper.select(config, query, log, []))
        log.info('\tselected %s tcga filenames from cloudsql' %
                 (len(cloudsql_rows)))

        # read in the file paths from BigQuery
        query = 'SELECT file_gdc_id, file_gcs_url ' \
            'FROM [isb-cgc:GDC_metadata.GDCfileID_to_GCSurl] ' \
            'where 0 < instr(file_gcs_url, \'TCGA\') and 0 < instr(file_gcs_url, \'legacy\') and 0 < instr(file_gcs_url, \'bam\') ' \
            'order by file_gcs_url'

        query_results = query_bq_table(query, True, 'isb-cgc', log)
        total_not_matched = 0
        total_distinct = set()
        page_token = None
        while True:
            total_rows, rows, page_token = fetch_paged_results(
                query_results, 2000, None, page_token, log)

            log.info('\t\tcreate map of filename to path')
            name2path = {}
            for row in rows:
                fields = row[1].split('/')
                # key on the last path component, keep everything after the
                # first three '/'-separated fields as the stored path
                name2path[fields[-1]] = '/'.join(fields[3:])
            log.info('\t\tfinished map of filename to path')

            # now setup and do the update of paths in cloud sql
            log.info('\t\tstart updating paths in cloudsql')
            params = []
            select_params = []
            not_matched = []
            for name, path in name2path.items():
                if name in cloudsql_rows:
                    total_distinct.add(name)
                    params.append([path, name])
                    select_params.append(name)
                else:
                    not_matched.append(path)

            if params:
                update = 'update metadata_data set datafilenamekey = %s, datafileuploaded = \'true\' where datafilename = %s'
                helper.update(config, update, log, params)
                select_in = ','.join(['%s'] * len(select_params))
                select_query = 'select count(*) from metadata_data where datafilename in (%s)' % (
                    select_in)
                count = helper.select(config, select_query, log, select_params)
                log.info('select %s file name matches for %s file names.' %
                         (count[0][0], len(select_params)))
            else:
                # nothing on this page matched: an empty "in ()" list is not
                # valid SQL and there is nothing to update
                log.info('\t\tno file names on this page matched cloudsql, skipping update')

            total_not_matched += len(not_matched)
            if not page_token:
                log.info(
                    '\t\tupdated total of %s rows for TCGA with %d distinct file names'
                    % (total_rows - total_not_matched, len(total_distinct)))
                break
            else:
                log.info(
                    '\t\tupdated %d rows, did not find matches from BQ in cloudsql for %d:\n\t%s'
                    %
                    (len(params), len(not_matched), '\n\t'.join(not_matched)))

        log.info('\tcompleted update of paths in cloudsql')

        log.info('finished updating TCGA paths in production')
    except Exception:
        if log:
            log.exception('failed to update tcga GCS filepaths')
        else:
            # the failure happened before logging was set up (e.g. unreadable
            # config file); report it instead of swallowing it silently
            traceback.print_exc()
    finally:
        if log:
            close_log(log)