# NOTE: these snippets rely on the ISB-CGC project helpers (create_log,
# close_log, ISBCGC_database_helper, helper, query_bq_table,
# fetch_paged_results, populate_sample_availibility) being importable from
# the surrounding codebase; only the standard-library imports are shown here.
import json
import logging
import sys
import unittest
from datetime import date, datetime


def main(configfilename, baminfo_filename):
    log_dir = str(date.today()).replace('-', '_') + '_gg_update' + '/'
    log_name = create_log(log_dir, 'update_gg_metadata')
    log = logging.getLogger(log_name)
    log.info('begin update gg metadata')

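    # column offsets in the tab-separated baminfo file; rows whose
    # readgroupset column is 'NA' have nothing to update and are skipped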
    idcol = 0
    ggdatasetcol = 7
    ggreadgroupset = 6
    nexist = 'NA'
    updates = []
    count = 0
    with open(baminfo_filename) as baminfo:
        baminfo.readline()
        for line in baminfo:
            fields = line.strip().split('\t')
            if fields[ggreadgroupset] == nexist:
                continue
            values = [
                fields[ggdatasetcol], fields[ggreadgroupset], fields[idcol]
            ]
            if 0 == count % 1000:
                log.info('%s processing row %s--%s' %
                         (datetime.now(), count, ':'.join(values)))
            count += 1
            updates += [values]

    stmt = 'update metadata_data set GG_dataset_id = %s, GG_readgroupset_id = %s where analysis_id = %s'
    with open(configfilename) as configFile:
        config = json.load(configFile)
    ISBCGC_database_helper.update(config, stmt, log, updates, False)

    log.info('finished update gg metadata')
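

# Illustrative sketch (not the actual helper): ISBCGC_database_helper.update()
# is assumed to execute a parameterized statement over a list of value lists
# against Cloud SQL, batching roughly like the hypothetical function below.
def _update_in_batches(cursor, stmt, params, batch_size=1000):
    # executemany() a slice at a time so a very large update list doesn't
    # ride in a single statement/transaction
    for start in range(0, len(params), batch_size):
        cursor.executemany(stmt, params[start:start + batch_size])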
def process_data_availability_records(config, program, image_config,
                                      image_type, rows, log):
    '''
    NOTE: this assumes that, at the start of the run, no sample barcode is associated with an Image ISB label.  it is
    possible that a sample_barcode is added and then a subsequent batch will contain that barcode associated with 
    more files
    '''
    if 'Radiology' == image_type:
        return

    stmt = 'select metadata_data_type_availability_id from TCGA_metadata_data_type_availability where %s = isb_label and %s = genomic_build'
    image_rows = ISBCGC_database_helper.select(config, stmt, log,
                                               ["Pathology_Image", "HG19"])
    if 1 != len(image_rows):
        raise ValueError(
            'expected exactly one availability id row, got {}'.format(
                image_rows))
    data_type_availability_id = image_rows[0][0]

    # get the current count for a barcode based on previous runs
    stmt = 'select sample_barcode, count(*) from {}_metadata_sample_data_availability where metadata_data_type_availability_id = %s group by 1'.format(
        program)
    db_rows = ISBCGC_database_helper.select(config, stmt, log, [
        data_type_availability_id,
    ])
    db_barcode2count = {}
    for db_row in db_rows:
        db_barcode2count[db_row[0]] = db_row[1]
    # each barcode should appear at most once per isb_label, so building the
    # map must not collapse any rows
    if len(db_rows) != len(db_barcode2count):
        raise ValueError(
            'sample_barcode should only appear once per isb_label')

    # get the current count from the current batch for each sample barcode
    barcode2count = {}
    for row in rows:
        barcode2count[row[0]] = barcode2count.setdefault(row[0], 0) + 1

    new_barcodes = set(barcode2count) - set(db_barcode2count)
    new_params = []
    for new_barcode in new_barcodes:
        new_params += [[
            data_type_availability_id, new_barcode, barcode2count[new_barcode]
        ]]
    ISBCGC_database_helper.column_insert(
        config, new_params, 'TCGA_metadata_sample_data_availability',
        ['metadata_data_type_availability_id', 'sample_barcode', 'count'], log)

    update_barcodes = set(barcode2count) & set(db_barcode2count)
    update_params = []
    for update_barcode in update_barcodes:
        update_params += [[
            barcode2count[update_barcode] + db_barcode2count[update_barcode],
            data_type_availability_id, update_barcode
        ]]
    stmt = 'update TCGA_metadata_sample_data_availability set count = %s where metadata_data_type_availability_id = %s and sample_barcode = %s'
    ISBCGC_database_helper.update(config, stmt, log, update_params, False)
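

# Illustrative example of the merge performed above (made-up barcodes and
# counts): barcodes new to the table are inserted with the batch count;
# barcodes already present have the batch count added to the stored count.
def _merge_counts(db_barcode2count, barcode2count):
    new = dict((b, c) for b, c in barcode2count.items()
               if b not in db_barcode2count)
    updated = dict((b, c + db_barcode2count[b])
                   for b, c in barcode2count.items() if b in db_barcode2count)
    return new, updated

# _merge_counts({'TCGA-OR-A5J1-01A': 2},
#               {'TCGA-OR-A5J1-01A': 1, 'TCGA-OR-A5J2-01A': 3})
# -> ({'TCGA-OR-A5J2-01A': 3}, {'TCGA-OR-A5J1-01A': 3})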
def main(config_file_name):
    log = None
    try:
        with open(config_file_name) as configFile:
            config = json.load(configFile)
        
        log_dir = str(date.today()).replace('-', '_') + '_' + 'ccle/'
        log_name = create_log(log_dir, 'update_ccle_gcs_paths')
        log = logging.getLogger(log_name)
        
        log.info('begin updating CCLE paths in production')
        # first thing to do is to read in the file paths from BigQuery
        query = 'SELECT file_gdc_id, file_gcs_url ' \
            'FROM [isb-cgc:GDC_metadata.GDCfileID_to_GCSurl] ' \
            'where 0 < instr(file_gcs_url, \'CCLE\')'

        query_results = query_bq_table(query, True, 'isb-cgc', log)
        _, rows, _ = fetch_paged_results(query_results, 2000, None, None, log)
        
        log.info('\tcreate map of filename to path')
        name2path = {}
        for row in rows:
            fields = row[1].split('/')
            name2path[fields[-1]] = '/'.join(fields[3:])
        log.info('\tfinished map of filename to path')
        
        # get the db rows from production cloudsql
        log.info('\tselect ccle filenames from cloudsql')
        query = 'SELECT datafilename ' \
            'FROM main.metadata_data ' \
            'where 0 < instr(datafilename, \'bam\') and project = \'CCLE\''
        
        rows = helper.select(config, query, log, [])
        log.info('\tselected %s ccle filenames from cloudsql' % (len(rows)))
        
        # now setup and do the update of paths in cloud sql
        log.info('\tstart updating paths in cloudsql')
        params = []
        not_matched = []
        for row in rows:
            if row[0] in name2path:
                params += [[name2path[row[0]], row[0]]]
            else:
                not_matched += [row[0]]
        update = 'update main.metadata_data set datafilenamekey = %s where datafilename = %s'
        helper.update(config, update, log, params)
        log.info('\tcompleted update of paths in cloudsql. updated %d, did not find matches from BQ in cloudsql for %s' % (len(params), ', '.join(not_matched)))

        log.info('finished updating CCLE paths in production')
    except Exception:
        if log:
            log.exception('failed to update ccle GCS filepaths')
    finally:
        if log:
            close_log(log)
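

# Illustrative example of the filename-to-path mapping used above (made-up
# URL): splitting a gs:// URL on '/' leaves '' at index 1 and the bucket at
# index 2, so fields[3:] is the object path and fields[-1] the bare filename.
def _gcs_url_to_name_and_path(url):
    fields = url.split('/')
    return fields[-1], '/'.join(fields[3:])

# _gcs_url_to_name_and_path('gs://bucket/ccle/bams/G1234.bam')
# -> ('G1234.bam', 'ccle/bams/G1234.bam')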
def update_cloudsql_from_bigquery(config,
                                  postproc_config,
                                  project_name,
                                  cloudsql_table,
                                  log,
                                  data_type=None,
                                  endpt_type=None):
    update_stmt = 'update %s\nset \n\t%s\nwhere %s = %%s' % (
        cloudsql_table, '\n\t'.join(
            '%s = %%s,' % (postproc_config['postproc_columns'][key])
            for key in postproc_config['postproc_columns'].keys())[:-1],
        postproc_config['postproc_key_column'])
    if project_name:
        if data_type:  # assumes that endpt_type is also supplied
            query_results = query_bq_table(
                postproc_config['postproc_query'] %
                (', '.join(postproc_config['postproc_columns'].keys()),
                 endpt_type, project_name, data_type), False,
                postproc_config['postproc_project'], log)
        else:
            query_results = query_bq_table(
                postproc_config['postproc_query'] % (', '.join(
                    postproc_config['postproc_columns'].keys()), project_name),
                False, postproc_config['postproc_project'], log)
    else:
        query_results = query_bq_table(
            postproc_config['postproc_query'] %
            (', '.join(postproc_config['postproc_columns'].keys())), False,
            postproc_config['postproc_project'], log)
    page_token = None
    log.info('\t\t\tupdate_stmt\n%s' % (update_stmt))
    update_count = 0
    while True:
        total_rows, rows, page_token = fetch_paged_results(
            query_results, postproc_config['postproc_fetch_count'],
            project_name, page_token, log)
        if 0 < total_rows:
            log.info('\t\t\ttotal rows: %s\n\t%s\n\t\t...\n\t%s' %
                     (total_rows, str(rows[0]), str(rows[-1])))
        else:
            log.info('\t\t\tno rows')
            return
        if config['update_cloudsql']:
            ISBCGC_database_helper.update(config, update_stmt, log, rows, True)
        update_count += len(rows)
        log.info(
            '\t\t\tupdated %s so far%s' %
            (update_count, ' for ' + project_name if project_name else ''))
        if not page_token:
            log.info(
                '\t\t\tupdated total of %s rows%s' %
                (update_count, ' for ' + project_name if project_name else ''))
            return
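
# Illustrative example of the statement assembled above (hypothetical config):
# the trailing [:-1] strips the comma after the last set clause, and each
# BigQuery result row must supply the set-clause values followed by the key.
#
#   postproc_config = {
#       'postproc_columns': {'file_gcs_url': 'datafilenamekey'},
#       'postproc_key_column': 'file_gdc_id',
#   }
#   update_stmt == ('update TCGA_metadata_data_HG19\n'
#                   'set \n'
#                   '\tdatafilenamekey = %s\n'
#                   'where file_gdc_id = %s')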
def set_uploaded_path(config, endpt_type, program_name, project_id, data_type,
                      log):
    log.info('\tbegin set_uploaded_path()')
    postproc_config = config['postprocess_keypath']
    # first set file_name_key to null
    cloudsql_table = '%s_metadata_data_%s' % (
        program_name, config['endpt2genomebuild'][endpt_type])
    ISBCGC_database_helper.update(
        config, postproc_config['postproc_file_name_key_null_update'] %
        (cloudsql_table, project_id, data_type), log, [[]])
    # now use the BigQuery table to set file_name_key
    update_cloudsql_from_bigquery(config, postproc_config, project_id,
                                  cloudsql_table, log, data_type, endpt_type)
    ISBCGC_database_helper.update(
        config,
        postproc_config['postproc_file_uploaded_update'] % (cloudsql_table),
        log, [[]])
    log.info('\tfinished set_uploaded_path()')
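
# Hypothetical shapes of the config entries set_uploaded_path() relies on
# (inferred from the calls above, not confirmed by the source):
#   config['endpt2genomebuild']  e.g. {'legacy': 'HG19', 'current': 'HG38'}
#   postproc_config['postproc_file_name_key_null_update'] % (table, project, type)
#       -> an update that nulls file_name_key for the project/data type
#   postproc_config['postproc_file_uploaded_update'] % (table,)
#       -> an update that flags rows that received a file_name_key as uploaded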
class MetadataTestCase(unittest.TestCase):
    # hypothetical wrapper class: the original snippet is a test method that
    # relies on self.config and self.log being initialized elsewhere (e.g. setUp)
    def test_populate_sample_availibility(self):
        bqTable2data_type = {'Somatic_Mutation': 'Masked Somatic Mutation'}
        for bqTable, data_type in bqTable2data_type.iteritems():
            self.log.info('populate_sample_availibility() for %s' %
                          (data_type))
            #remove existing records
            stmt = 'delete from TCGA_metadata_sample_data_availability where metadata_data_type_availability_id = ' \
            '(select metadata_data_type_availability_id from TCGA_metadata_data_type_availability where isb_label = "Somatic_Mutation" and genomic_build = "HG38")'
            ISBCGC_database_helper.update(self.config, stmt, self.log, [[]])

            query_results = query_bq_table(
                'select Tumor_Sample_Barcode, Matched_Norm_Sample_Barcode, fileUUID from [isb-cgc:TCGA_hg38_data_v0.{}] group by 1,2,3'
                .format(bqTable), True, None, self.log)
            page_token = None
            barcode2seen_files = {}
            barcode2infos = {}
            infos = []
            while True:
                # loop through the big query results and get the sample_barcode into the info list as populate_sample_availibility()
                # expects it
                total_rows, rows, page_token = fetch_paged_results(
                    query_results, 200000, None, page_token, self.log)
                for row in rows:
                    tumor = row[0][:16]
                    #                     normal = row[1][:16]
                    files = row[2].split('|')
                    for curfile in files:
                        if tumor in barcode2seen_files:
                            seen_files = barcode2seen_files[tumor]
                            if curfile in seen_files:
                                continue
                            seen_files.add(curfile)
                        else:
                            barcode2seen_files[tumor] = set([curfile])
                        samples_tumor = {'submitter_id': tumor}
                        sample_list = [samples_tumor]

                        info = {'access': 'open'}
                        case_list = info.setdefault('cases', [])
                        case_list += [{'samples': sample_list}]
                        barcode2infos[tumor] = barcode2infos.setdefault(
                            tumor, []) + [info]
#
#                     samples_normal = {'submitter_id': normal}
#                     sample_list = [samples_normal]
#
#                     info = {'access': 'open'}
#                     case_list = info.setdefault('cases', [])
#                     case_list += [{'samples': sample_list}]
#                     barcode2infos[normal] = barcode2infos.setdefault(normal, []) + [info]
                if not page_token:
                    self.log.info('\tprocessed total of %s rows for %s' %
                                  (total_rows, bqTable))
                    break

            # flatten the per-barcode info lists once, after paging completes,
            # so infos from earlier pages are not duplicated on every iteration
            infos = [
                info for curinfos in barcode2infos.itervalues()
                for info in curinfos
            ]

            # populate_sample_availibility() creates inserts for big query rows
            # that don't already have a match in the metadata data table
            populate_sample_availibility(self.config, 'current', 'TCGA', 'all',
                                         data_type, infos, self.log)
            self.log.info('finished populate_sample_availibility() for %s' %
                          (data_type))
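
# Illustrative example of the truncation above (made-up aliquot barcode):
# the first 16 characters of a TCGA aliquot barcode form the sample barcode.
#   'TCGA-OR-A5J1-01A-11D-A29I-10'[:16] -> 'TCGA-OR-A5J1-01A'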
def process_data_image_records(config, program, image_config, image_type, rows,
                               log):
    '''
    based on either the case_barcode (for radiology images) or the sample_barcode (for tissue or diagnostic images),
    either updates or creates a new metadata data record in the config-specified metadata data table
    
    parameters:
        config: configuration mappings
        program: name of the program (e.g. 'TCGA') whose metadata tables are updated
        image_config: section of the config file with specific mappings for these image types
        image_type: the type of image being processed (radiology, tissue, or diagnostic)
        rows: rows selected from the BigQuery table
        log: where to write progress and other messages

        file_gdc_id: ?
        case_gdc_id: ?
        case_barcode: row[2]
        sample_gdc_id: ?
        sample_barcode: row[0]
        sample_type: sample_barcode[13:15]
        aliquot_barcode: row[3]
        aliquot_gdc_id: ?
        project_short_name: row[4].split('/')[6]
        disease_code: project_short_name.split('-')[1]
        program_name: 'TCGA'
        data_type: image_type
        data_category: 'Clinical'
        type: 'file'
        file_name: row[1]
        data_format: 'SVS'
        access: 'open'
        acl: 'open'
        platform: 'Clinical'
        file_name_key: row[4]
        file_uploaded: 'true'
        endpoint_type: 'legacy'
        species: 'Homo sapien'
    '''
    # get the information from the config mapping
    log.info('\tchecking data records')
    barcode2rows = {}
    for row in rows:
        if 'Radiology' == image_type:
            bcrows = barcode2rows.setdefault((row[0], row[1].split('/')[-1]),
                                             [])
            bcrows += [row]
        else:
            bcrows = barcode2rows.setdefault((row[0], row[1]), [])
            bcrows += [row]
    data_select_template = image_config[image_type]['data_select_template']
    if 0 == len(data_select_template):
        barcode_db = set()
    else:
        # build the '("barcode", "file_name")' tuples inline for the select
        data_rows = ISBCGC_database_helper.select(
            config, data_select_template %
            (','.join('("{}", "{}")'.format(barcode, file_name)
                      for barcode, file_name in barcode2rows)), log, [])
        barcode_db = set([(data_row[0], data_row[1])
                          for data_row in data_rows])

    new_barcodes = set(barcode2rows) - barcode_db
    if 0 < len(new_barcodes):
        log.info('\t\tinserting {} new data records'.format(len(new_barcodes)))
        db_rows = []
        for barcode in new_barcodes:
            for row in barcode2rows[barcode]:
                row_method = image_config['image_tag2row_method'][image_type]
                next_row = getattr(sys.modules[__name__],
                                   row_method)(row, program, image_config,
                                               image_type)
                if next_row is not None:
                    db_rows += next_row
        ISBCGC_database_helper.column_insert(
            config, db_rows, image_config['data_table'],
            image_config[image_type]['data_columns'], log)
    else:
        log.info('\t\tno rows to insert for data records')

    if 0 < len(barcode_db):
        log.info('\t\tupdating {} existing data records'.format(
            len(barcode_db)))
        rows = []
        for barcode in barcode_db:
            for row in barcode2rows[barcode]:
                if 'Radiology' == image_type:
                    rows += [[
                        row[1], row[0], row[1].split('/')[5],
                        image_config['image_tag2data_type'][image_type]
                    ]]
                else:
                    rows += [[
                        row[4], row[0], row[1],
                        image_config['image_tag2data_type'][image_type]
                    ]]
        ISBCGC_database_helper.update(
            config, image_config[image_type]['data_update_template'], log,
            rows)
    else:
        log.info('\t\tno rows to update for data records')
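

# Illustrative example of the non-radiology rows handled above (made-up
# values): row[4] is the GCS path, from which the project_short_name and
# disease_code in the docstring mapping are derived.
#   row = ['TCGA-OR-A5J1-01A',               # sample_barcode
#          'TCGA-OR-A5J1-01A-01-TS1.svs',    # file_name
#          'TCGA-OR-A5J1',                   # case_barcode
#          'TCGA-OR-A5J1-01A-01-TS1',        # aliquot_barcode
#          'gs://bucket/legacy/images/tissue/TCGA-ACC/TCGA-OR-A5J1-01A-01-TS1.svs']
#   row[4].split('/')[6]      -> 'TCGA-ACC'  (project_short_name)
#   'TCGA-ACC'.split('-')[1]  -> 'ACC'       (disease_code)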
def main(config_file_name):
    log = None
    try:
        with open(config_file_name) as configFile:
            config = json.load(configFile)

        log_dir = str(date.today()).replace('-', '_') + '_' + 'tcga/'
        log_name = create_log(log_dir, 'update_tcga_gcs_paths')
        log = logging.getLogger(log_name)

        log.info('begin updating TCGA paths in production')

        # get the db rows from production cloudsql
        log.info('\tselect tcga filenames from cloudsql')
        query = 'SELECT datafilename ' \
            'FROM metadata_data ' \
            'where 0 < instr(datafilename, \'bam\') and project = \'TCGA\''

        cloudsql_rows = set(row[0]
                            for row in helper.select(config, query, log, []))
        log.info('\tselected %s tcga filenames from cloudsql' %
                 (len(cloudsql_rows)))

        # read in the file paths from BigQuery
        query = 'SELECT file_gdc_id, file_gcs_url ' \
            'FROM [isb-cgc:GDC_metadata.GDCfileID_to_GCSurl] ' \
            'where 0 < instr(file_gcs_url, \'TCGA\') and 0 < instr(file_gcs_url, \'legacy\') and 0 < instr(file_gcs_url, \'bam\') ' \
            'order by file_gcs_url'

        query_results = query_bq_table(query, True, 'isb-cgc', log)
        total_not_matched = 0
        total_distinct = set()
        page_token = None
        while True:
            total_rows, rows, page_token = fetch_paged_results(
                query_results, 2000, None, page_token, log)

            log.info('\t\tcreate map of filename to path')
            name2path = {}
            for row in rows:
                fields = row[1].split('/')
                name2path[fields[-1]] = '/'.join(fields[3:])
            log.info('\t\tfinished map of filename to path')

            # now setup and do the update of paths in cloud sql
            log.info('\t\tstart updating paths in cloudsql')
            params = []
            select_params = []
            not_matched = []
            for name, path in name2path.iteritems():
                if name in cloudsql_rows:
                    total_distinct.add(name)
                    params += [[path, name]]
                    select_params += [name]
                else:
                    not_matched += [path]
            update = 'update metadata_data set datafilenamekey = %s, datafileuploaded = \'true\' where datafilename = %s'
            helper.update(config, update, log, params)
            select_in = '%s,' * len(select_params)
            select_in = select_in[:-1]
            select_query = 'select count(*) from metadata_data where datafilename in (%s)' % (
                select_in)
            count = helper.select(config, select_query, log, select_params)
            log.info('select %s file name matches for %s file names.' %
                     (count[0][0], len(select_params)))

            total_not_matched += len(not_matched)
            if not page_token:
                log.info(
                    '\t\tupdated total of %s rows for TCGA with %d distinct file names'
                    % (total_rows - total_not_matched, len(total_distinct)))
                break
            else:
                log.info(
                    '\t\tupdated %d rows, did not find matches from BQ in cloudsql for %d:\n\t%s'
                    %
                    (len(params), len(not_matched), '\n\t'.join(not_matched)))

        log.info('\tcompleted update of paths in cloudsql')

        log.info('finished updating TCGA paths in production')
    except Exception:
        if log:
            log.exception('failed to update tcga GCS filepaths')
    finally:
        if log:
            close_log(log)
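
# Illustrative sketch of the paging pattern these scripts share:
# fetch_paged_results() is assumed to return (total_rows, rows, page_token),
# with a falsy page_token on the final page, giving the canonical loop:
#
#   page_token = None
#   while True:
#       total_rows, rows, page_token = fetch_paged_results(
#           query_results, fetch_count, None, page_token, log)
#       # ...process rows...
#       if not page_token:
#           break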