def select_sample_sql_barcodes(self, program):
    self.log.info('start select %s sql sample barcodes' % (program.lower()))
    sample_query = 'SELECT case_barcode, sample_barcode FROM %s_metadata_biospecimen' % (
        program)
    rows = ISBCGC_database_helper.select(self.config,
                                         sample_query,
                                         self.log,
                                         params=[])
    sample_case_barcodes = set()
    sample_sample_barcodes = set()
    duplicates = list()
    for row in rows:
        case_barcode = row[0].strip()
        sample_barcode = row[1].strip()
        if sample_barcode in sample_sample_barcodes:
            duplicates += [sample_barcode]
        sample_case_barcodes.add(case_barcode)
        sample_sample_barcodes.add(sample_barcode)
    if 0 < len(duplicates):
        self.print_partial_list('found sample duplicates', duplicates)
        query = 'select l.endpoint_type, c.endpoint_type, count(distinct l.sample_barcode), count(distinct c.sample_barcode) ' \
                'from {0}_metadata_biospecimen l left join {0}_metadata_biospecimen c ' \
                'on l.sample_barcode = c.sample_barcode ' \
                'group by 1, 2'.format(program)
        rows = ISBCGC_database_helper.select(self.config,
                                             query,
                                             self.log,
                                             params=[])
        self.log.info(
            '\nendpoint\tendpoint\tlegacy count\tcurrent count\n%s\n' %
            ('\n'.join('%s\t%s\t%d\t%d' % (row[0], row[1], row[2], row[3])
                       for row in rows)))
    return sample_sample_barcodes, sample_case_barcodes

def select_clinical_sql_barcodes(self, program):
    self.log.info('start select %s sql case barcodes' % (program.lower()))
    clinical_query = 'SELECT case_barcode FROM %s_metadata_clinical' % (
        program)
    rows = ISBCGC_database_helper.select(self.config,
                                         clinical_query,
                                         self.log,
                                         params=[])
    clinical_case_barcodes = set()
    duplicates = list()
    for row in rows:
        case_barcode = row[0].strip()
        if case_barcode in clinical_case_barcodes:
            duplicates += [case_barcode]
            continue
        clinical_case_barcodes.add(case_barcode)
    if 0 < len(duplicates):
        self.print_partial_list('found case duplicates', duplicates)
        query = 'select l.endpoint_type, c.endpoint_type, count(distinct l.case_barcode), count(distinct c.case_barcode) ' \
                'from {0}_metadata_clinical l left join {0}_metadata_clinical c ' \
                'on l.case_barcode = c.case_barcode ' \
                'group by 1, 2'.format(program)
        rows = ISBCGC_database_helper.select(self.config,
                                             query,
                                             self.log,
                                             params=[])
        self.log.info(
            '\nendpoint\tendpoint\tlegacy count\tcurrent count\n%s' %
            ('\n'.join('%s\t%s\t%d\t%d' % (row[0], row[1], row[2], row[3])
                       for row in rows)))
    return clinical_case_barcodes
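
# Example usage (a sketch; `checker` stands in for an instance of the class
# these methods were taken from, which is not shown in this listing):
#     samples, sample_cases = checker.select_sample_sql_barcodes('TCGA')
#     clinical_cases = checker.select_clinical_sql_barcodes('TCGA')
#     # every case with a biospecimen row should also have a clinical row
#     checker.log.info('cases missing a clinical row: %d'
#                      % (len(sample_cases - clinical_cases)))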
# standard-library imports assumed by this example; project helpers such as
# create_log and ISBCGC_database_helper come from modules not shown here
import json
import logging
from datetime import date, datetime

def main(configfilename, baminfo_filename):
    log_dir = str(date.today()).replace('-', '_') + '_gg_update' + '/'
    log_name = create_log(log_dir, 'update_gg_metadata')
    log = logging.getLogger(log_name)
    log.info('begin update gg metadata')

    idcol = 0
    ggdatasetcol = 7
    ggreadgroupset = 6
    nexist = 'NA'
    updates = []
    count = 0
    with open(baminfo_filename) as baminfo:
        baminfo.readline()
        for line in baminfo:
            fields = line.strip().split('\t')
            if fields[ggreadgroupset] == nexist:
                continue
            values = [
                fields[ggdatasetcol], fields[ggreadgroupset], fields[idcol]
            ]
            if 0 == count % 1000:
                log.info('%s processing row %s--%s' %
                         (datetime.now(), count, ':'.join(values)))
            count += 1
            updates += [values]

    stmt = 'update metadata_data set GG_dataset_id = %s, GG_readgroupset_id = %s where analysis_id = %s'
    with open(configfilename) as configFile:
        config = json.load(configFile)
    ISBCGC_database_helper.update(config, stmt, log, updates, False)

    log.info('finished update gg metadata')
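
if __name__ == '__main__':
    # minimal invocation sketch; both file names here are hypothetical
    main('config.json', 'baminfo.tsv')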
Example #4
def process_biospecimen_image_records(config, program, image_config,
                                      image_type, rows, log):
    '''
    based on the sample_barcode (for tissue or diagnostic images),
    creates a new metadata biospecimen record if necessary in the config-specified metadata biospecimen table
    
    parameters:
        config: configuration mappings
        image_config: section of the config file with specific mappings for these image types
        rows: rows selected from the BigQuery table
        log: where to write progress and other messages

        endpoint_type: 'legacy'
        sample_gdc_id: ?
        sample_barcode: row[0]
        sample_type: row[0][13:15]
        case_gdc_id: ?
        case_barcode: row[2]
        program_name: 'TCGA'
        disease_code: project_name.split('-')[1]
        project_short_name: row[4].split('/')[6]
    '''
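    # Hypothetical illustration of the mapping above (the barcode and the gcs
    # path layout are made up, not taken from the original source):
    #   row[0] = 'TCGA-AB-1234-01A'  -> sample_barcode; row[0][13:15] = '01' is the sample_type
    #   row[2] = 'TCGA-AB-1234'      -> case_barcode
    #   row[4] = a gcs url whose seventh '/'-separated field (index 6) is the
    #            project_short_name, e.g. 'TCGA-OV', giving disease_code 'OV'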
    # get the information from the config mapping
    log.info('\tchecking biospecimen records')
    barcode2row = dict([(row[image_config[image_type]['sample_barcode_index']],
                         row) for row in rows])
    log.info('\tbarcodes--{}:{}'.format(len(set(barcode2row)),
                                        len(barcode2row)))
    biospecimen_select_template = image_config['biospecimen_select_template']
    biospecimen_rows = ISBCGC_database_helper.select(
        config,
        biospecimen_select_template % ("'" + "','".join(barcode2row) + "'"),
        log, [])
    barcode_db = set(
        [biospecimen_row[0] for biospecimen_row in biospecimen_rows])
    new_barcodes = set(barcode2row) - barcode_db
    if 0 < len(new_barcodes):
        log.info('\t\tinserting {} new biospecimen records'.format(
            len(new_barcodes)))
        rows = []
        for barcode in new_barcodes:
            row = barcode2row[barcode]
            project_short_name = row[4].split('/')[6]
            rows += [[
                'legacy', row[0], row[0][13:15], row[2], 'TCGA',
                project_short_name.split('-')[1], project_short_name
            ]]
            rows += [[
                'current', row[0], row[0][13:15], row[2], 'TCGA',
                project_short_name.split('-')[1], project_short_name
            ]]
        ISBCGC_database_helper.column_insert(
            config, rows, image_config['biospecimen_table'],
            image_config['biospecimen_columns'], log)
    else:
        log.info('\t\tno rows to insert for biospecimen records')
Example #5
def main(config_file_name):
    log = None
    try:
        with open(config_file_name) as configFile:
            config = json.load(configFile)
        
        log_dir = str(date.today()).replace('-', '_') + '_' + 'ccle/'
        log_name = create_log(log_dir, 'update_ccle_gcs_paths')
        log = logging.getLogger(log_name)
        
        log.info('begin updating CCLE paths in production')
        # first thing to do is to read in the file paths from BigQuery
        query = 'SELECT file_gdc_id, file_gcs_url ' \
            'FROM [isb-cgc:GDC_metadata.GDCfileID_to_GCSurl] ' \
            'where 0 < instr(file_gcs_url, \'CCLE\')'

        query_results = query_bq_table(query, True, 'isb-cgc', log)
        _, rows, _ = fetch_paged_results(query_results, 2000, None, None, log)
        
        log.info('\tcreate map of filename to path')
        name2path = {}
        for row in rows:
            fields = row[1].split('/')
            name2path[fields[-1]] = '/'.join(fields[3:])
        log.info('\tfinished map of filename to path')
        
        # get the db rows from production cloudsql
        log.info('\tselect ccle filenames from cloudsql')
        query = 'SELECT datafilename ' \
            'FROM main.metadata_data ' \
            'where 0 < instr(datafilename, \'bam\') and project = \'CCLE\''
        
        rows = helper.select(config, query, log, [])
        log.info('\tselected %s ccle filenames from cloudsql' % (len(rows)))
        
        # now setup and do the update of paths in cloud sql
        log.info('\tstart updating paths in cloudsql')
        params = []
        not_matched = []
        for row in rows:
            if row[0] in name2path:
                params += [[name2path[row[0]], row[0]]]
            else:
                not_matched += [row[0]]
        update = 'update main.metadata_data set datafilenamekey = %s where datafilename = %s'
        helper.update(config, update, log, params)
        log.info('\tcompleted update of paths in cloudsql. updated %d, did not find matches from BQ in cloudsql for %s' % (len(params), ', '.join(not_matched)))

        log.info('finished updating CCLE paths in production')
    except:
        if log:
            log.exception('failed to update ccle GCS filepaths')
    finally:
        if log:
            close_log(log)
Example #6
def update_cloudsql_from_bigquery(config,
                                  postproc_config,
                                  project_name,
                                  cloudsql_table,
                                  log,
                                  data_type=None,
                                  endpt_type=None):
    update_stmt = 'update %s\nset \n\t%s\nwhere %s = %%s' % (
        cloudsql_table, '\n\t'.join(
            '%s = %%s,' % (postproc_config['postproc_columns'][key])
            for key in postproc_config['postproc_columns'].keys())[:-1],
        postproc_config['postproc_key_column'])
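    # For a hypothetical postproc_columns of {'file_gdc_id': 'file_gdc_id',
    # 'file_gcs_url': 'file_name_key'} with postproc_key_column 'file_gdc_id',
    # this builds:
    #   update <cloudsql_table>
    #   set
    #       file_gdc_id = %s,
    #       file_name_key = %s
    #   where file_gdc_id = %s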
    if project_name:
        if data_type:  # assumes that endpt_type is also supplied
            query_results = query_bq_table(
                postproc_config['postproc_query'] %
                (', '.join(postproc_config['postproc_columns'].keys()),
                 endpt_type, project_name, data_type), False,
                postproc_config['postproc_project'], log)
        else:
            query_results = query_bq_table(
                postproc_config['postproc_query'] % (', '.join(
                    postproc_config['postproc_columns'].keys()), project_name),
                False, postproc_config['postproc_project'], log)
    else:
        query_results = query_bq_table(
            postproc_config['postproc_query'] %
            (', '.join(postproc_config['postproc_columns'].keys())), False,
            postproc_config['postproc_project'], log)
    page_token = None
    log.info('\t\t\tupdate_stmt\n%s' % (update_stmt))
    update_count = 0
    while True:
        total_rows, rows, page_token = fetch_paged_results(
            query_results, postproc_config['postproc_fetch_count'],
            project_name, page_token, log)
        if 0 < total_rows:
            log.info('\t\t\ttotal rows: %s\n\t%s\n\t\t...\n\t%s' %
                     (total_rows, str(rows[0]), str(rows[-1])))
        else:
            log.info('\t\t\tno rows')
            return
        if config['update_cloudsql']:
            ISBCGC_database_helper.update(config, update_stmt, log, rows, True)
        update_count += len(rows)
        log.info(
            '\t\t\tupdated %s so far%s' %
            (update_count, ' for ' + project_name if project_name else ''))
        if not page_token:
            log.info(
                '\t\t\tupdated total of %s rows%s' %
                (update_count, ' for ' + project_name if project_name else ''))
            return
def set_uploaded_path(config, endpt_type, program_name, project_id, data_type,
                      log):
    log.info('\tbegin set_uploaded_path()')
    postproc_config = config['postprocess_keypath']
    # first set file_name_key to null
    cloudsql_table = '%s_metadata_data_%s' % (
        program_name, config['endpt2genomebuild'][endpt_type])
    ISBCGC_database_helper.update(
        config, postproc_config['postproc_file_name_key_null_update'] %
        (cloudsql_table, project_id, data_type), log, [[]])
    # now use the BigQuery table to set file_name_key
    update_cloudsql_from_bigquery(config, postproc_config, project_id,
                                  cloudsql_table, log, data_type, endpt_type)
    ISBCGC_database_helper.update(
        config,
        postproc_config['postproc_file_uploaded_update'] % (cloudsql_table),
        log, [[]])
    log.info('\tfinished set_uploaded_path()')
def populate_sample_availibility(config, endpt_type, program_name, project_id,
                                 data_type, infos, log):
    log.info('\tbegin populate_sample_availibility() for %s:%s' %
             (project_id, data_type))

    # iterate through the gdc info and put together the counts for the sample barcodes
    sample_barcode2count = {}
    for info in infos:
        mapping = config[program_name]['process_files']['data_table_mapping']
        flattened = flatten_map(info, mapping)
        for index in range(len(flattened)):
            if (data_type in ('Simple somatic mutation', 'Masked Somatic Mutation') and 'controlled' == flattened[index]['access']) or \
                (data_type in ('Aligned Reads',) and 'open' == flattened[index]['access']):
                continue
            sample_barcode = flattened[index]['sample_barcode']
            count = sample_barcode2count.setdefault(sample_barcode, 0)
            sample_barcode2count[sample_barcode] = count + 1

    # read in the appropriate data availability row to get the foreign key
    isb_label = config['data_type2isb_label'][data_type]
    stmt = 'select metadata_data_type_availability_id from %s_metadata_data_type_availability where genomic_build = %%s and isb_label = %%s' % (
        program_name)
    foreign_key = ISBCGC_database_helper.select(
        config, stmt, log,
        [config['endpt2genomebuild'][endpt_type], isb_label])[0][0]

    params = []
    for sample_barcode, count in sample_barcode2count.iteritems():
        params += [[foreign_key, sample_barcode, count]]

    ISBCGC_database_helper.column_insert(
        config, params,
        '%s_metadata_sample_data_availability' % (program_name),
        ['metadata_data_type_availability_id', 'sample_barcode', 'count'], log)

    log.info('\tfinished populate_sample_availibility() for %s:%s' %
             (project_id, data_type))
Example #9
def process_data_availability_records(config, program, image_config,
                                      image_type, rows, log):
    '''
    NOTE: this assumes that, at the start of the run, no sample barcode is associated with an Image ISB label.  it is
    possible that a sample_barcode is added and that a subsequent batch then associates more files with that barcode
    '''
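    # Worked example (hypothetical counts): if previous runs stored {S1: 2} and
    # this batch contains {S1: 1, S2: 3}, then S2 is inserted with count 3 and
    # S1 is updated to 2 + 1 = 3.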
    if 'Radiology' == image_type:
        return

    stmt = 'select metadata_data_type_availability_id from TCGA_metadata_data_type_availability where %s = isb_label and %s = genomic_build'
    image_rows = ISBCGC_database_helper.select(config, stmt, log,
                                               ["Pathology_Image", "HG19"])
    if 1 != len(image_rows):
        raise ValueError(
            'unexpected number of rows returned for availability id {}'.format(
                image_rows))
    data_type_availability_id = image_rows[0][0]

    # get the current count for a barcode based on previous runs
    stmt = 'select sample_barcode, count(*) from {}_metadata_sample_data_availability where metadata_data_type_availability_id = %s group by 1'.format(
        program)
    db_rows = ISBCGC_database_helper.select(config, stmt, log, [
        data_type_availability_id,
    ])
    db_barcode2count = {}
    for db_row in db_rows:
        db_barcode2count[db_row[0]] = db_row[1]
    if len(db_rows) != len(db_barcode2count):
        raise ValueError(
            'sample_barcode should only appear once per isb_label')

    # get the current count from the current batch for each sample barcode
    barcode2count = {}
    for row in rows:
        barcode2count[row[0]] = barcode2count.setdefault(row[0], 0) + 1

    new_barcodes = set(barcode2count) - set(db_barcode2count)
    new_params = []
    for new_barcode in new_barcodes:
        new_params += [[
            data_type_availability_id, new_barcode, barcode2count[new_barcode]
        ]]
    ISBCGC_database_helper.column_insert(
        config, new_params, 'TCGA_metadata_sample_data_availability',
        ['metadata_data_type_availability_id', 'sample_barcode', 'count'], log)

    update_barcodes = set(barcode2count) & set(db_barcode2count)
    update_params = []
    for update_barcode in update_barcodes:
        update_params += [[
            barcode2count[update_barcode] + db_barcode2count[update_barcode],
            data_type_availability_id, update_barcode
        ]]
    stmt = 'update TCGA_metadata_sample_data_availability set count = %s where metadata_data_type_availability_id = %s and sample_barcode = %s'
    ISBCGC_database_helper.update(config, stmt, log, update_params, False)
    def get_gcs_data_types_barcodes(self, program_name, data_types, log):
        log.info('\t\tgetting gcs data types barcodes {}-{} for gcs'.format(
            program_name, '"{}"'.format(', '.join(data_types))))

        try:
            stmt = 'select project_short_name, case_barcode, sample_barcode, file_gdc_id from {} where data_type in ({})'
            project2cases = {}
            project2samples = {}
            project2files = {}
            for data_type in data_types:
                build = 'HG38' if data_type in ('Aligned Reads', \
                                                'miRNA Expression Quantification', \
                                                'Isoform Expression Quantification', \
                                                'Gene Expression Quantification', \
                                                'Masked Copy Number Segment', \
                                                'Methylation Beta Value', \
                                                'Masked Somatic Mutation') \
                        else 'HG19'
                rows = ISBCGC_database_helper.select(
                    self.config,
                    stmt.format(
                        '{}_metadata_data_{}'.format(program_name, build),
                        '"{}"'.format(data_type)), log, [])
                for row in rows:
                    cases = project2cases.setdefault(row[0], set())
                    cases.add(row[1])
                    samples = project2samples.setdefault(row[0], set())
                    samples.add(row[2])
                    files = project2files.setdefault(row[0], set())
                    files.add(row[3])
                log.info(
                    '\t\tget_gcs_data_types_barcodes(): {}-{} cumulative counts, cases={}, samples={}'
                    .format(
                        program_name, data_type,
                        sum(
                            len(cases)
                            for cases in project2cases.itervalues()),
                        sum(
                            len(samples)
                            for samples in project2samples.itervalues())))

            log.info(
                '\t\tfinished gcs data types barcodes {}-{} for gcs'.format(
                    program_name, '"{}"'.format(', '.join(data_types))))
            return project2cases, project2samples, project2files
        except:
            log.exception('problem in get_gcs_data_types_barcodes()')
            raise
    def get_sql_file_info(self, program, barcode_type, barcodes, tag):
        barcode2infos = {}
        query_template = 'select {0}, data_type, data_format, experimental_strategy, platform, {1}\n' \
            'from {2}_metadata_data_HG{3}\n' \
            'where {0} in ({4})\n'
        barcodes_in = ','.join('"{}"'.format(barcode)
                               for barcode in set(barcodes))
        query = ''
        for build in ('19', '38'):
            if '38' == build:
                if 'CCLE' == program:
                    break
                else:
                    query += 'union\n'
                    column = 'analysis_workflow_type'
            else:
                column = '"None"'
            query += query_template.format(barcode_type, column, program,
                                           build, barcodes_in)
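        # For a hypothetical TCGA sample barcode, the assembled query unions the
        # HG19 and HG38 data tables:
        #   select sample_barcode, data_type, data_format, experimental_strategy, platform, "None"
        #   from TCGA_metadata_data_HG19
        #   where sample_barcode in ("TCGA-AB-1234-01A")
        #   union
        #   select sample_barcode, data_type, data_format, experimental_strategy, platform, analysis_workflow_type
        #   from TCGA_metadata_data_HG38
        #   where sample_barcode in ("TCGA-AB-1234-01A")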
        rows = ISBCGC_database_helper.select(self.config, query, self.log, [])
        for row in rows:
            barcode = row[0]
            if barcode in barcodes:
                barcode2infos[barcode] = barcode2infos.setdefault(
                    barcode, []) + [[
                        row[1], row[2],
                        str(row[3]),
                        str(row[4]),
                        str(row[5])
                    ]]
            else:
                raise ValueError(
                    'unexpected mismatch of return with barcodes:\n{}\n{}'.
                    format(', '.join(barcodes),
                           ', '.join(str(field) for field in row)))

        return barcode2infos
    def get_gcs_isb_label_barcodes(self, program_name, isb_label, log):
        log.info('\t\tgetting isb_label barcodes {}-{} for gcs'.format(
            program_name, isb_label))

        try:
            project2cases = {}
            project2samples = {}
            stmt = 'select bs.project_short_name, bs.case_barcode, sa.sample_barcode ' \
                'from {0}_metadata_sample_data_availability sa join {0}_metadata_data_type_availability da on sa.metadata_data_type_availability_id = da.metadata_data_type_availability_id ' \
                'join {0}_metadata_biospecimen bs on sa.sample_barcode = bs.sample_barcode ' \
                'where isb_label = %s group by 1, 2, 3'.format(program_name)
            rows = ISBCGC_database_helper().select(self.config, stmt, log,
                                                   [isb_label])
            for row in rows:
                cases = project2cases.setdefault(row[0], set())
                cases.add(row[1])
                samples = project2samples.setdefault(row[0], set())
                samples.add(row[2])
            log.info(
                '\t\tget_gcs_isb_label_barcodes(): {}-{} cumulative counts, cases={}, samples={}'
                .format(
                    program_name, isb_label,
                    sum(len(cases) for cases in project2cases.itervalues()),
                    sum(
                        len(samples)
                        for samples in project2samples.itervalues())))

            log.info('\t\tfinished get_gcs_isb_label_barcodes() {}-{} for gcs'.
                     format(program_name, isb_label))
            return project2cases, project2samples
        except:
            log.exception('problem in get_gcs_isb_label_barcodes()')
            raise
Example #13
def verify_barcodes_filenames(config, program, image_config, image_type, rows,
                              log):
    barcode_bq = set()
    if 'Radiology' == image_type:
        barcode_bq = set([(row[0], row[1].split('/')[-1]) for row in rows])
    else:
        barcode_bq = set([(row[0], row[1]) for row in rows])

    data_select_template = image_config[image_type][
        'data_verify_select_template']
    data_rows = ISBCGC_database_helper.select(config, data_select_template,
                                              log, [])
    barcode_db = set([(data_row[0], data_row[1]) for data_row in data_rows])

    log.info(
        '\n\tBQ length:{}\tSQL length:{}\n\tbarcode/file combinations in BQ and not SQL:{}\n\tbarcode/file combinations in SQL and not BQ:{}'
        .format(len(barcode_bq), len(barcode_db), len(barcode_bq - barcode_db),
                len(barcode_db - barcode_bq)))
    if 0 < len(barcode_bq - barcode_db):
        log.info('\n\tfirst barcodes in bq only: {}'.format(
            list(barcode_bq - barcode_db)[:20]))
    if 0 < len(barcode_db - barcode_bq):
        log.info('first barcodes in sql only: {}'.format(
            list(barcode_db - barcode_bq)[:20]))
    def test_populate_sample_availibility(self):
        bqTable2data_type = {'Somatic_Mutation': 'Masked Somatic Mutation'}
        for bqTable, data_type in bqTable2data_type.iteritems():
            self.log.info('populate_sample_availibility() for %s' %
                          (data_type))
            #remove existing records
            stmt = 'delete from TCGA_metadata_sample_data_availability where metadata_data_type_availability_id = ' \
            '(select metadata_data_type_availability_id from TCGA_metadata_data_type_availability where isb_label = "Somatic_Mutation" and genomic_build = "HG38")'
            ISBCGC_database_helper.update(self.config, stmt, self.log, [[]])

            query_results = query_bq_table(
                'select Tumor_Sample_Barcode, Matched_Norm_Sample_Barcode, fileUUID from [isb-cgc:TCGA_hg38_data_v0.{}] group by 1,2,3'
                .format(bqTable), True, None, self.log)
            page_token = None
            barcode2seen_files = {}
            barcode2infos = {}
            infos = []
            while True:
                # loop through the big query results and get the sample_barcode into the info list as populate_sample_availibility()
                # expects it
                total_rows, rows, page_token = fetch_paged_results(
                    query_results, 200000, None, page_token, self.log)
                for row in rows:
                    tumor = row[0][:16]
                    #                     normal = row[1][:16]
                    files = row[2].split('|')
                    for curfile in files:
                        if tumor in barcode2seen_files:
                            seen_files = barcode2seen_files[tumor]
                            if curfile in seen_files:
                                continue
                            seen_files.add(curfile)
                        else:
                            barcode2seen_files[tumor] = set([curfile])
                        samples_tumor = {'submitter_id': tumor}
                        sample_list = [samples_tumor]

                        info = {'access': 'open'}
                        case_list = info.setdefault('cases', [])
                        case_list += [{'samples': sample_list}]
                        barcode2infos[tumor] = barcode2infos.setdefault(
                            tumor, []) + [info]
#
#                     samples_normal = {'submitter_id': normal}
#                     sample_list = [samples_normal]
#
#                     info = {'access': 'open'}
#                     case_list = info.setdefault('cases', [])
#                     case_list += [{'samples': sample_list}]
#                     barcode2infos[normal] = barcode2infos.setdefault(normal, []) + [info]
                if not page_token:
                    self.log.info('\tprocessed total of %s rows for %s' %
                                  (total_rows, bqTable))
                    break

            # gather the accumulated infos once paging completes; doing this inside
            # the while loop would re-add earlier pages' infos on every iteration
            infos = [
                info for curinfos in barcode2infos.itervalues()
                for info in curinfos
            ]
            populate_sample_availibility(self.config, 'current', 'TCGA', 'all',
                                         data_type, infos, self.log)
            self.log.info('finished populate_sample_availibility() for %s' %
                          (data_type))
Example #15
def update_database(path2bam2bai, config, log):
    log.info('\tbegin update database to match bucket index files')
    
    table2dbpath2bam2bai = {}
    table2program_filenames = {}
    for table in ['CCLE_metadata_data_HG19', 'TARGET_metadata_data_HG19', 'TARGET_metadata_data_HG38', 'TCGA_metadata_data_HG19', 'TCGA_metadata_data_HG38']:
        # remember each table's filenames so the comparison loop below doesn't
        # rely on this loop variable leaking out of scope
        program_filenames = table2program_filenames[table] = call_bigquery(table, log)
        
        dbpath2bam2bai = table2dbpath2bam2bai.setdefault(table, {})
        rows = ISBCGC_database_helper.select(config, 'select file_name_key, index_file_name from {} where "BAM" = data_format order by 1'.
                                             format(table), log, [])
        log.info('\t\tfound {} rows for {}: {}...{}'.format(len(rows), table, rows[:5], rows[-5:]))
        count = 0
        
        print_at = max(1, len(rows) // 11)  # guard against tables with fewer than 11 rows
        missing_files = set()
        bucket2missing_files = {}
        mismatchs = 0
        bucket2mismatchs = {}
        params = []
        for row in rows:
            if not row[0]:
                continue
            if 0 == count % print_at:
                log.info('\t\t\tlooking at row {} for {}: {}'.format(count, table, row))
            count += 1
            parts = row[0].split('/')
            path = '/'.join(parts[:-1])
            bam = parts[-1]
            bai = row[1]
            bam2bai = path2bam2bai.get(path)
            if bam2bai: 
                assert bam in bam2bai, 'didn\'t find bam {} in tsv files'.format(bam)
                if bai != bam2bai[bam]:
                    mismatchs += 1
                    curcount = bucket2mismatchs.setdefault(parts[2], 0)
                    bucket2mismatchs[parts[2]] = curcount + 1
                    params += [[bam2bai[bam], row[0]]]
            else:
                missing_files.add(row[0])
                curcount = bucket2missing_files.setdefault(parts[2], 0)
                bucket2missing_files[parts[2]] = curcount + 1
            dbbam2bai = dbpath2bam2bai.setdefault(path, {})
            if bam in dbbam2bai:
                log.warning('\t\t\tfound {} already in the db map for {}'.format(bam, table))
            assert bai, 'didn\'t find an index file for {} in the db'.format(bam)
            dbbam2bai[bam] = bai

        if 0 < len(params):
#             ISBCGC_database_helper.update(config, 'update {} set index_file_name = %s where file_name_key = %s'.format(table), log, params)
            log.info('there were {} param sets for update'.format(len(params)))
        else:
            log.info('there were no param sets for update')
        
        if 0 < len(missing_files):
            missing_files = sorted(missing_files)
            log.warning('\n\t\t\tfound {} missing files in tsv that are in db out of {} total: \n\t{}\n'.format(len(missing_files), len(rows), '\n\t'.join(missing_files)))
            for bucket, bmissing_files in bucket2missing_files.iteritems():
                if 0 < bmissing_files:
                    log.warning('\n\t\t\tfound {} missing files in tsv that are in bucket {}\n'.format(bmissing_files, bucket))
        if 0 < mismatchs:
            log.warning('\n\t\t\tfound {} index mismatches between tsv and db out of {} total\n'.format(mismatchs, len(rows)))
            for bucket, bmismatchs in bucket2mismatchs.iteritems():
                if 0 < bmismatchs:
                    log.warning('\n\t\t\tfound {} mismatches between tsv and db files in bucket {}\n'.format(bmismatchs, bucket))
        log.info('\t\tfinished table {}\n\n'.format(table))
    
    log.info('\t\tfind discrepancies in the db from tsv')
    for table, dbpath2bam2bai in table2dbpath2bam2bai.iteritems():
        program_filenames = table2program_filenames[table]
        bucket2bambai_diff = {}
        missing_paths = set()
        path2missing_dbfiles = {}
        for path, bam2bai in path2bam2bai.iteritems():
            if path not in dbpath2bam2bai:
                not_curpath = True
                for bam in bam2bai:
                    if bam in program_filenames:
                        not_curpath = False
                        break
                if not_curpath:
                    continue
                missing_paths.add(path)
                continue
            dbbam2bai = dbpath2bam2bai[path]
            for bam, bai in bam2bai.iteritems():
                if bam not in program_filenames:
                    continue
                if bam not in dbbam2bai:
                    path2missing_dbfiles.setdefault(path, set()).add(bam)
                    continue
                dbbai = dbbam2bai[bam]
                if dbbai != bai: 
                    if dbbai.endswith('bam.bai') and bai.endswith('bai'):
                        bambai_diff = bucket2bambai_diff.setdefault(path.split('/')[2], 0)
                        bucket2bambai_diff[path.split('/')[2]] = bambai_diff + 1
                    else:
                        log.warning('\t\t\tdidn\'t find matching bai in db for table {}: {} vs {}'.format(table, bai, dbbai))
        missing_paths = sorted(list(missing_paths))
        log.warning('\t\t\tfound {} paths missing from db for table {}: {}...{}\n'.format(len(missing_paths), table, missing_paths[:4], missing_paths[-3:]))
        for path, bams in path2missing_dbfiles.iteritems():
            bams = sorted(list(bams))
            log.warning('\t\t\tfound {} files missing from db for path {} for table {}: {}...{}'.format(len(bams), path, table, bams[:4], bams[-3:]))
        log.info('\n\t\t\ttotal of {} files missing from the db that are in the tsv for table {}\n'.format(sum(len(dbfiles) for dbfiles in path2missing_dbfiles.itervalues()), table))
        for bucket, bambai_diff in bucket2bambai_diff.iteritems():
            if bambai_diff > 0:
                log.warning('\t\t\tfound {} cases of \'bai\' in tsv and \'bam.bai\' in db for {} for table {}'.format(bambai_diff, bucket, table))
    log.info('\n\t\tfinished finding discrepancies in the db from tsv')
    log.info('\tfinished update database to match bucket index files')

    def get_sql_barcodes(self,
                         tables,
                         case='case_barcode',
                         sample='sample_barcode'):
        table2cases = {}
        table2samples = {}
        for table, info in tables.iteritems():
            if not info[2]:
                sql = 'select {}, "" from {}'.format(case, table)
            elif not info[1]:
                sql = 'select "", {} from {}'.format(sample, table)
            else:
                sql = 'select {}, {} from {}'.format(case, sample, table)
            rows = ISBCGC_database_helper().select(self.config, sql, self.log,
                                                   [])

            cases = set()
            samples = set()
            for row in rows:
                cases.add(row[0])
                samples.add(row[1])
            table2cases[table] = cases if 1 < len(cases) else set()
            table2samples[table] = samples if 1 < len(samples) else set()
        return table2cases, table2samples
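    # Example usage (hypothetical tables mapping; per the branches above, info[1]
    # indicates the table has a case column and info[2] a sample column):
    #     tables = {'TCGA_metadata_clinical': (None, True, False),
    #               'TCGA_metadata_biospecimen': (None, True, True)}
    #     table2cases, table2samples = self.get_sql_barcodes(tables)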

    def get_bq_barcodes(self,
                        bq_tables,
                        case='case_barcode',
                        sample='sample_barcode',
                        where=None):
        # the body of this method was cut off in this listing; a sketch mirroring
        # get_sql_barcodes() would select the case and sample columns from each
        # BigQuery table (honoring the optional where clause) and aggregate the
        # results into table2cases/table2samples dicts
        raise NotImplementedError('method body not included in this listing')
def main(config_file_name):
    log = None
    try:
        with open(config_file_name) as configFile:
            config = json.load(configFile)

        log_dir = str(date.today()).replace('-', '_') + '_' + 'tcga/'
        log_name = create_log(log_dir, 'update_tcga_gcs_paths')
        log = logging.getLogger(log_name)

        log.info('begin updating TCGA paths in production')

        # get the db rows from production cloudsql
        log.info('\tselect tcga filenames from cloudsql')
        query = 'SELECT datafilename ' \
            'FROM metadata_data ' \
            'where 0 < instr(datafilename, \'bam\') and project = \'TCGA\''

        cloudsql_rows = set(row[0]
                            for row in helper.select(config, query, log, []))
        log.info('\tselected %s tcga filenames from cloudsql' %
                 (len(cloudsql_rows)))

        # read in the file paths from BigQuery
        query = 'SELECT file_gdc_id, file_gcs_url ' \
            'FROM [isb-cgc:GDC_metadata.GDCfileID_to_GCSurl] ' \
            'where 0 < instr(file_gcs_url, \'TCGA\') and 0 < instr(file_gcs_url, \'legacy\') and 0 < instr(file_gcs_url, \'bam\') ' \
            'order by file_gcs_url'

        query_results = query_bq_table(query, True, 'isb-cgc', log)
        total_not_matched = 0
        total_distinct = set()
        page_token = None
        while True:
            total_rows, rows, page_token = fetch_paged_results(
                query_results, 2000, None, page_token, log)

            log.info('\t\tcreate map of filename to path')
            name2path = {}
            for row in rows:
                fields = row[1].split('/')
                name2path[fields[-1]] = '/'.join(fields[3:])
            log.info('\t\tfinished map of filename to path')

            # now setup and do the update of paths in cloud sql
            log.info('\t\tstart updating paths in cloudsql')
            params = []
            select_params = []
            not_matched = []
            for name, path in name2path.iteritems():
                if name in cloudsql_rows:
                    total_distinct.add(name)
                    params += [[path, name]]
                    select_params += [name]
                else:
                    not_matched += [path]
            update = 'update metadata_data set datafilenamekey = %s, datafileuploaded = \'true\' where datafilename = %s'
            helper.update(config, update, log, params)
            select_in = '%s,' * len(select_params)
            select_in = select_in[:-1]
            select_query = 'select count(*) from metadata_data where datafilename in (%s)' % (
                select_in)
            count = helper.select(config, select_query, log, select_params)
            log.info('select %s file name matches for %s file names.' %
                     (count[0][0], len(select_params)))

            total_not_matched += len(not_matched)
            if not page_token:
                log.info(
                    '\t\tupdated total of %s rows for TCGA with %d distinct file names'
                    % (total_rows - total_not_matched, len(total_distinct)))
                break
            else:
                log.info(
                    '\t\tupdated %d rows, did not find matches from BQ in cloudsql for %d:\n\t%s'
                    %
                    (len(params), len(not_matched), '\n\t'.join(not_matched)))

        log.info('\tcompleted update of paths in cloudsql')

        log.info('finished updating TCGA paths in production')
    except:
        if log:
            log.exception('failed to update tcga GCS filepaths')
    finally:
        if log:
            close_log(log)
Example #18
def process_clinical_image_records(config, program, image_config, image_type,
                                   rows, log):
    '''
    based on either the case_barcode (for radiology images) or the sample_barcode (for tissue or diagnostic images),
    creates a new metadata clinical record in the config-specified metadata clinical table
    
    parameters:
        config: configuration mappings
        image_config: section of the config file with specific mappings for these image types
        rows: rows selected from the BigQuery table
        log: where to write progress and other messages

        endpoint_type: 'legacy'
        case_gdc_id: ?
        case_barcode: row[2]
        program_name: 'TCGA'
        disease_code: project_name.split('-')[1]
        project_short_name: row[4].split('/')[6]
    '''
    # get the information from the config mapping
    log.info('\tchecking clinical records.')
    barcode2row = dict([(row[image_config[image_type]['case_barcode_index']],
                         row) for row in rows])
    log.info('\tbarcodes--{}:{}'.format(len(set(barcode2row)),
                                        len(barcode2row)))
    clinical_select_template = image_config['clinical_select_template']
    clinical_rows = ISBCGC_database_helper.select(
        config,
        clinical_select_template % ("'" + "','".join(barcode2row) + "'"), log,
        [])
    barcode_db = set([clinical_row[0] for clinical_row in clinical_rows])
    new_barcodes = set(barcode2row) - barcode_db
    if 0 < len(new_barcodes):
        log.info('\t\tinserting {} new clinical records'.format(
            len(new_barcodes)))

        rows = []
        for barcode in new_barcodes:
            row = barcode2row[barcode]
            if 'Radiology' == image_type:
                case_barcode = row[0]
                project_short_name = row[1].split('/')[5]
            else:
                case_barcode = row[2]
                project_short_name = row[4].split('/')[6]
            if 'NA' == project_short_name:
                continue
            rows += [[
                'legacy', case_barcode, program,
                project_short_name.split('-')[1], project_short_name
            ]]
            rows += [[
                'current', case_barcode, program,
                project_short_name.split('-')[1], project_short_name
            ]]
        ISBCGC_database_helper.column_insert(config, rows,
                                             image_config['clinical_table'],
                                             image_config['clinical_columns'],
                                             log)

    else:
        log.info('\t\tno rows to insert for clinical records')
Example #19
def process_data_image_records(config, program, image_config, image_type, rows,
                               log):
    '''
    based on either the case_barcode (for radiology images) or the sample_barcode (for tissue or diagnostic images),
    either updates or creates a new metadata data record in the config-specified metadata data table
    
    parameters:
        config: configuration mappings
        image_config: section of the config file with specific mappings for these image types
        rows: rows selected from the BigQuery table
        log: where to write progress and other messages

        file_gdc_id: ?
        case_gdc_id: ?
        case_barcode: row[2]
        sample_gdc_id: ?
        sample_barcode: row[0]
        sample_type: sample_barcode[13:15]
        aliquot_barcode: row[3]
        aliquot_gdc_id: ?
        project_short_name: row[4].split('/')[6]
        disease_code: project_short_name.split('-')[1]
        program_name: 'TCGA'
        data_type: image_type
        data_category: 'Clinical'
        type: 'file'
        file_name: row[1]
        data_format: 'SVS'
        access: 'open'
        acl: 'open'
        platform: 'Clinical'
        file_name_key: row[4]
        file_uploaded: 'true'
        endpoint_type: 'legacy'
        species: 'Homo sapien'
    '''
    # get the information from the config mapping
    log.info('\tchecking data records')
    barcode2rows = {}
    for row in rows:
        if 'Radiology' == image_type:
            bcrows = barcode2rows.setdefault((row[0], row[1].split('/')[-1]),
                                             [])
            bcrows += [row]
        else:
            bcrows = barcode2rows.setdefault((row[0], row[1]), [])
            bcrows += [row]
    data_select_template = image_config[image_type]['data_select_template']
    if 0 == len(data_select_template):
        barcode_db = set()
    else:
        data_rows = ISBCGC_database_helper.select(
            config, data_select_template %
            (','.join('("{}", "{}")'.format(barcode, file_name)
                      for barcode, file_name in barcode2rows)), log, [])
        barcode_db = set([(data_row[0], data_row[1])
                          for data_row in data_rows])

    new_barcodes = set(barcode2rows) - barcode_db
    if 0 < len(new_barcodes):
        log.info('\t\tinserting {} new data records'.format(len(new_barcodes)))
        db_rows = []
        for barcode in new_barcodes:
            for row in barcode2rows[barcode]:
                row_method = image_config['image_tag2row_method'][image_type]
                next_row = getattr(sys.modules[__name__],
                                   row_method)(row, program, image_config,
                                               image_type)
                if next_row is not None:
                    db_rows += next_row
        ISBCGC_database_helper.column_insert(
            config, db_rows, image_config['data_table'],
            image_config[image_type]['data_columns'], log)
    else:
        log.info('\t\tno rows to insert for data records')

    if 0 < len(barcode_db):
        log.info('\t\tupdating {} existing data records'.format(
            len(barcode_db)))
        rows = []
        for barcode in barcode_db:
            for row in barcode2rows[barcode]:
                if 'Radiology' == image_type:
                    rows += [[
                        row[1], row[0], row[1].split('/')[5],
                        image_config['image_tag2data_type'][image_type]
                    ]]
                else:
                    rows += [[
                        row[4], row[0], row[1],
                        image_config['image_tag2data_type'][image_type]
                    ]]
        ISBCGC_database_helper.update(
            config, image_config[image_type]['data_update_template'], log,
            rows)
    else:
        log.info('\t\tno rows to update for data records')