Esempio n. 1
0
def process_data_availability_records(config, program, image_config,
                                      image_type, rows, log):
    '''
    records, per sample barcode, how many image files of the current batch are available

    sample barcodes of the batch that have no availability row yet are inserted with the
    batch count; sample barcodes that already have a row get the batch count added to the
    count stored by previous runs.  nothing is done for 'Radiology' images.

    NOTE: it is possible that a sample_barcode is added by one batch and that a subsequent
    batch will contain that barcode associated with more files, which is why existing rows
    are updated rather than inserted again.

    parameters:
        config: configuration mappings, passed through to the database helper
        program: program name, used to name the table the existing counts are read from
        image_config: not used by this function
        image_type: image type tag; 'Radiology' returns immediately
        rows: rows of the current batch, row[0] is used as the sample barcode
        log: where to write progress and other messages

    raises:
        ValueError: if the availability id lookup does not return exactly one row, or if a
            sample barcode has more than one existing availability row
    '''
    if 'Radiology' == image_type:
        return

    # NOTE(review): the existing counts are read from the {program}_ table while the
    # availability id lookup, the insert and the update use the TCGA_ tables; these only
    # agree when program is 'TCGA' -- confirm before running this for another program
    stmt = 'select metadata_data_type_availability_id from TCGA_metadata_data_type_availability where %s = isb_label and %s = genomic_build'
    image_rows = ISBCGC_database_helper.select(config, stmt, log,
                                               ["Pathology_Image", "HG19"])
    if 1 != len(image_rows):
        raise ValueError(
            'unexpected number of rows returned for availability id {}'.format(
                image_rows))
    data_type_availability_id = image_rows[0][0]

    # get the count stored for each barcode by previous runs.  the stored 'count' column is
    # read (not count(*), which only counts table rows) so that the update below adds the
    # batch count to the real running total
    stmt = 'select sample_barcode, count from {}_metadata_sample_data_availability where metadata_data_type_availability_id = %s'.format(
        program)
    db_rows = ISBCGC_database_helper.select(config, stmt, log, [
        data_type_availability_id,
    ])
    db_barcode2count = {}
    for db_row in db_rows:
        if db_row[0] in db_barcode2count:
            raise ValueError(
                'sample_barcode should only appear once per isb_label')
        db_barcode2count[db_row[0]] = db_row[1]

    # get the count from the current batch for each sample barcode
    barcode2count = {}
    for row in rows:
        barcode2count[row[0]] = barcode2count.get(row[0], 0) + 1

    # barcodes not seen by previous runs get a new row.  the split is made against the
    # barcodes of the existing rows, not against the row tuples themselves
    new_params = [[data_type_availability_id, barcode, count]
                  for barcode, count in barcode2count.items()
                  if barcode not in db_barcode2count]
    if new_params:
        ISBCGC_database_helper.column_insert(
            config, new_params, 'TCGA_metadata_sample_data_availability',
            ['metadata_data_type_availability_id', 'sample_barcode', 'count'],
            log)

    # barcodes seen by previous runs get the batch count added to the stored count
    update_params = [[
        count + db_barcode2count[barcode], data_type_availability_id, barcode
    ] for barcode, count in barcode2count.items()
                     if barcode in db_barcode2count]
    if update_params:
        stmt = 'update TCGA_metadata_sample_data_availability set count = %s where metadata_data_type_availability_id = %s and sample_barcode = %s'
        ISBCGC_database_helper.update(config, stmt, log, update_params, False)
Esempio n. 2
0
def process_biospecimen_image_records(config, program, image_config,
                                      image_type, rows, log):
    '''
    inserts biospecimen records for the sample barcodes of the batch that the
    config-specified biospecimen select does not return

    the sample barcode of a row is read at image_config[image_type]['sample_barcode_index'].
    when several rows share a barcode, the last one is used.  each missing barcode yields two
    records, one with endpoint type 'legacy' and one with 'current'.

    parameters:
        config: configuration mappings, passed through to the database helper
        program: not used by this function, the program name is written as 'TCGA'
        image_config: section of the config file with specific mappings for these image types
        image_type: key into image_config for the settings of this kind of image
        rows: rows selected from the BigQuery table
        log: where to write progress and other messages

    values of an inserted record, in order:
        endpoint_type: 'legacy' or 'current'
        sample_barcode: row[0]
        sample_type: row[0][13:15]
        case_barcode: row[2]
        program_name: 'TCGA'
        disease_code: project_short_name.split('-')[1]
        project_short_name: row[4].split('/')[6]
    '''
    log.info('\tchecking biospecimen records')

    # later rows overwrite earlier ones that carry the same barcode
    barcode2row = {}
    for record in rows:
        barcode2row[record[image_config[image_type]
                           ['sample_barcode_index']]] = record
    log.info('\tbacodes--{}:{}'.format(len(set(barcode2row)),
                                       len(barcode2row)))

    # the template takes the barcodes as one quoted, comma separated list
    select_template = image_config['biospecimen_select_template']
    quoted_barcodes = "'" + "','".join(barcode2row) + "'"
    found_rows = ISBCGC_database_helper.select(
        config, select_template % quoted_barcodes, log, [])
    barcode_db = {found_row[0] for found_row in found_rows}

    new_barcodes = set(barcode2row) - barcode_db
    if not new_barcodes:
        log.info('\t\tno rows to insert for biospecimen records')
        return

    log.info('\t\tinserting {} new biospecimen records'.format(
        len(new_barcodes)))
    insert_rows = []
    for barcode in new_barcodes:
        record = barcode2row[barcode]
        project_short_name = record[4].split('/')[6]
        # one record per endpoint type, 'legacy' first
        for endpoint_type in ('legacy', 'current'):
            insert_rows.append([
                endpoint_type, record[0], record[0][13:15], record[2], 'TCGA',
                project_short_name.split('-')[1], project_short_name
            ])
    ISBCGC_database_helper.column_insert(config, insert_rows,
                                         image_config['biospecimen_table'],
                                         image_config['biospecimen_columns'],
                                         log)
def populate_sample_availibility(config, endpt_type, program_name, project_id,
                                 data_type, infos, log):
    '''
    counts the files per sample barcode for one data type and inserts the counts into the
    program's sample data availability table

    parameters:
        config: configuration mappings; supplies the data table mapping of the program, the
            data type to isb_label mapping and the endpoint type to genome build mapping
        endpt_type: endpoint type, used to look up the genomic build
        program_name: program name, used as the prefix of the table names
        project_id: only used in the log messages
        data_type: data type whose files are counted
        infos: the gdc infos to count; each one is flattened with flatten_map()
        log: where to write progress and other messages

    raises:
        IndexError: if no data type availability row matches the genomic build and isb_label
    '''
    log.info('\tbegin populate_sample_availibility() for %s:%s' %
             (project_id, data_type))

    # iterate through the gdc info and put together the counts for the sample barcodes
    sample_barcode2count = {}
    for info in infos:
        mapping = config[program_name]['process_files']['data_table_mapping']
        for record in flatten_map(info, mapping):
            # controlled access mutation files and open access aligned reads are not counted.
            # the access literal is 'controlled' (the original compared against 'controlled '
            # with a trailing space) and the aligned reads test is an equality test (the
            # original `in ('Aligned reads')` was a substring test on a plain string)
            if data_type in ('Simple somatic mutation', 'Masked Somatic Mutation') \
                    and 'controlled' == record['access']:
                continue
            if 'Aligned reads' == data_type and 'open' == record['access']:
                continue
            sample_barcode = record['sample_barcode']
            sample_barcode2count[sample_barcode] = \
                sample_barcode2count.get(sample_barcode, 0) + 1

    # read in the appropriate data availability row to get the foreign key
    isb_label = config['data_type2isb_label'][data_type]
    stmt = 'select metadata_data_type_availability_id from %s_metadata_data_type_availability where genomic_build = %%s and isb_label = %%s' % (
        program_name)
    foreign_key = ISBCGC_database_helper.select(
        config, stmt, log,
        [config['endpt2genomebuild'][endpt_type], isb_label])[0][0]

    # items() rather than iteritems() so this runs under python 2 and python 3
    params = [[foreign_key, sample_barcode, count]
              for sample_barcode, count in sample_barcode2count.items()]

    ISBCGC_database_helper.column_insert(
        config, params,
        '%s_metadata_sample_data_availability' % (program_name),
        ['metadata_data_type_availability_id', 'sample_barcode', 'count'], log)

    log.info('\tfinished populate_sample_availibility() for %s:%s' %
             (project_id, data_type))
Esempio n. 4
0
def process_clinical_image_records(config, program, image_config, image_type,
                                   rows, log):
    '''
    inserts clinical records for the case barcodes of the batch that the config-specified
    clinical select does not return

    the case barcode of a row is read at image_config[image_type]['case_barcode_index'].
    when several rows share a barcode, the last one is used.  each missing barcode yields two
    records, one with endpoint type 'legacy' and one with 'current', unless its project short
    name is 'NA', in which case it is skipped.

    parameters:
        config: configuration mappings, passed through to the database helper
        program: program name written into the new records
        image_config: section of the config file with specific mappings for these image types
        image_type: key into image_config; 'Radiology' rows use a different layout
        rows: rows selected from the BigQuery table
        log: where to write progress and other messages

    values of an inserted record, in order:
        endpoint_type: 'legacy' or 'current'
        case_barcode: row[0] for radiology, otherwise row[2]
        program_name: program
        disease_code: project_short_name.split('-')[1]
        project_short_name: row[1].split('/')[5] for radiology, otherwise row[4].split('/')[6]
    '''
    log.info('\tchecking clinical records.')

    # later rows overwrite earlier ones that carry the same barcode
    barcode2row = {}
    for record in rows:
        barcode2row[record[image_config[image_type]
                           ['case_barcode_index']]] = record
    log.info('\tbacodes--{}:{}'.format(len(set(barcode2row)),
                                       len(barcode2row)))

    # the template takes the barcodes as one quoted, comma separated list
    select_template = image_config['clinical_select_template']
    quoted_barcodes = "'" + "','".join(barcode2row) + "'"
    found_rows = ISBCGC_database_helper.select(
        config, select_template % quoted_barcodes, log, [])
    barcode_db = {found_row[0] for found_row in found_rows}

    new_barcodes = set(barcode2row) - barcode_db
    if not new_barcodes:
        log.info('\t\tno rows to insert for clinical records')
        return

    log.info('\t\tinserting {} new clinical records'.format(
        len(new_barcodes)))
    insert_rows = []
    for barcode in new_barcodes:
        record = barcode2row[barcode]
        if 'Radiology' == image_type:
            case_barcode = record[0]
            project_short_name = record[1].split('/')[5]
        else:
            case_barcode = record[2]
            project_short_name = record[4].split('/')[6]
        if 'NA' == project_short_name:
            continue
        # one record per endpoint type, 'legacy' first
        for endpoint_type in ('legacy', 'current'):
            insert_rows.append([
                endpoint_type, case_barcode, program,
                project_short_name.split('-')[1], project_short_name
            ])
    ISBCGC_database_helper.column_insert(config, insert_rows,
                                         image_config['clinical_table'],
                                         image_config['clinical_columns'],
                                         log)
Esempio n. 5
0
def process_data_image_records(config, program, image_config, image_type, rows,
                               log):
    '''
    inserts a metadata data record for every (barcode, file name) pair of the batch that the
    config-specified data select does not return, and updates the records of the pairs that
    it does return

    rows are grouped by (row[0], file name), where the file name is the last '/'-separated
    part of row[1] for radiology images and row[1] itself otherwise.  new records are built
    by the function of this module named by image_config['image_tag2row_method'][image_type].

    parameters:
        config: configuration mappings, passed through to the database helper
        program: program name, passed to the row building function
        image_config: section of the config file with specific mappings for these image types
        image_type: key into image_config; 'Radiology' rows use a different layout
        rows: rows selected from the BigQuery table
        log: where to write progress and other messages

    column notes carried over from the original documentation (the row building functions
    are defined elsewhere, verify against them):
        case_barcode: row[2]
        sample_barcode: row[0]
        sample_type: sample_barcode[13:15]
        aliquot_barcode: row[3]
        project_short_name: row[4].split('/')[6]
        disease_code: project_short_name.split('-')[1]
        program_name: 'TCGA'
        data_type: image_type
        data_category: 'Clinical'
        type: 'file'
        file_name: row[1]
        data_format: 'SVS'
        access: 'open'
        acl: 'open'
        platform: 'Clinical'
        file_name_key: row[4]
        file_uploaded: 'true'
        endpoint_type: 'legacy'
        species: 'Homo sapien'
    '''
    log.info('\tchecking data records')
    is_radiology = 'Radiology' == image_type

    # group the rows of the batch by (barcode, file name)
    barcode2rows = {}
    for row in rows:
        file_name = row[1].split('/')[-1] if is_radiology else row[1]
        barcode2rows.setdefault((row[0], file_name), []).append(row)

    # find the pairs that already have a record.  an empty template means there is nothing
    # to look up, and an empty batch is not queried to avoid an empty value list
    data_select_template = image_config[image_type]['data_select_template']
    if 0 == len(data_select_template) or not barcode2rows:
        barcode_db = set()
    else:
        pairs = ','.join('("{}", "{}")'.format(barcode, file_name)
                         for barcode, file_name in barcode2rows)
        data_rows = ISBCGC_database_helper.select(
            config, data_select_template % pairs, log, [])
        barcode_db = set((data_row[0], data_row[1]) for data_row in data_rows)

    new_barcodes = set(barcode2rows) - barcode_db
    if new_barcodes:
        log.info('\t\tinserting {} new data records'.format(len(new_barcodes)))
        # the row builder is the same for the whole batch, resolve it once
        row_method = getattr(sys.modules[__name__],
                             image_config['image_tag2row_method'][image_type])
        db_rows = []
        for barcode in new_barcodes:
            for row in barcode2rows[barcode]:
                next_row = row_method(row, program, image_config, image_type)
                # a builder may return None to skip the row
                if next_row is not None:
                    db_rows += next_row
        ISBCGC_database_helper.column_insert(
            config, db_rows, image_config['data_table'],
            image_config[image_type]['data_columns'], log)
    else:
        log.info('\t\tno rows to insert for data records')

    if barcode_db:
        log.info('\t\tupdating {} existing data records'.format(
            len(barcode_db)))
        data_type = image_config['image_tag2data_type'][image_type]
        update_rows = []
        for barcode in barcode_db:
            for row in barcode2rows[barcode]:
                if is_radiology:
                    update_rows.append(
                        [row[1], row[0], row[1].split('/')[5], data_type])
                else:
                    update_rows.append([row[4], row[0], row[1], data_type])
        ISBCGC_database_helper.update(
            config, image_config[image_type]['data_update_template'], log,
            update_rows)
    else:
        log.info('\t\tno rows to update for data records')