def parse_file(project_id, bq_dataset, bucket_name, file_data, filename, outfilename, metadata, cloudsql_tables):

    print 'Begin processing {0}.'.format(filename)

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #main steps: download, convert to df, cleanup, transform, add metadata
    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)

    # Get basic column information depending on datatype
    column_mapping = get_column_mapping(metadata['DataType'])

    data_df = cleanup_dataframe(data_df)
    data_df.rename(columns=column_mapping, inplace=True)

    # Get barcodes and update metadata_data table
    # Assuming second scenario where each file is a different platform/pipeline combination
    # TODO: Put in functionality for other scenario where all lists are in one file.
    sample_barcodes = list([k for d, k in data_df['SampleBarcode'].iteritems()])
    file_list = list([k for d, k in data_df['filenamepath'].iteritems()])
    sample_metadata_list = []
    for idx, barcode in enumerate(sample_barcodes):
        new_metadata = metadata.copy()
        new_metadata['sample_barcode'] = barcode
        new_metadata['file_path'] = file_list[idx].replace('gs://', '')
        sample_metadata_list.append(new_metadata)
    update_metadata_data_list(cloudsql_tables['METADATA_DATA'], sample_metadata_list)
Example #2
def get_sdrf_info(project_id, bucket_name, disease_codes, header, set_index_col, search_patterns):

    client = storage.Client(project_id)
    bucket = client.get_bucket(bucket_name)

    # connect to google cloud storage
    gcs = GcsConnector(project_id, bucket_name)

    sdrf_info = pd.DataFrame()
    for disease_code in disease_codes:
        for blob in bucket.list_blobs(prefix=disease_code):
            sdrf_filename = blob.name
            if not all(x in sdrf_filename for x in search_patterns):
                continue
            print(sdrf_filename)

            filebuffer = gcs.download_blob_to_file(sdrf_filename)
            # convert to a dataframe
            sdrf_df = convert_file_to_dataframe(filebuffer, skiprows=0)

            sdrf_df = cleanup_dataframe(sdrf_df)

            sdrf_df["Study"] = disease_code

            try:
                sdrf_df = sdrf_df.set_index(set_index_col)
            except KeyError:
                sdrf_df = sdrf_df.set_index("Derived_Array_Data_File")

            sdrf_info = sdrf_info.append(sdrf_df)

    print("Done loading SDRF files.")
    return sdrf_info
def parse_file(project_id, bq_dataset, bucket_name, file_data, filename,
               outfilename, metadata, cloudsql_tables):

    print 'Begin processing {0}.'.format(filename)

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #main steps: download, convert to df, cleanup, transform, add metadata
    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)

    # Get basic column information depending on datatype
    column_mapping = get_column_mapping(metadata['DataType'])

    data_df = cleanup_dataframe(data_df)
    data_df.rename(columns=column_mapping, inplace=True)

    # Get barcodes and update metadata_data table
    # Assuming second scenario where each file is a different platform/pipeline combination
    # TODO: Put in functionality for other scenario where all lists are in one file.
    sample_barcodes = list(
        [k for d, k in data_df['SampleBarcode'].iteritems()])
    file_list = list([k for d, k in data_df['filenamepath'].iteritems()])
    sample_metadata_list = []
    for idx, barcode in enumerate(sample_barcodes):
        new_metadata = metadata.copy()
        new_metadata['sample_barcode'] = barcode
        new_metadata['file_path'] = file_list[idx].replace('gs://', '')
        sample_metadata_list.append(new_metadata)
    update_metadata_data_list(cloudsql_tables['METADATA_DATA'],
                              sample_metadata_list)
Example #4
def get_sdrf_info(project_id, bucket_name, disease_codes, header,
                  set_index_col, search_patterns):

    client = storage.Client(project_id)
    bucket = client.get_bucket(bucket_name)

    # connect to google cloud storage
    gcs = GcsConnector(project_id, bucket_name)

    sdrf_info = pd.DataFrame()
    for disease_code in disease_codes:
        for blob in bucket.list_blobs(prefix=disease_code):
            sdrf_filename = blob.name
            if not all(x in sdrf_filename for x in search_patterns):
                continue
            print(sdrf_filename)

            filebuffer = gcs.download_blob_to_file(sdrf_filename)
            # convert to a dataframe
            sdrf_df = convert_file_to_dataframe(filebuffer, skiprows=0)

            sdrf_df = cleanup_dataframe(sdrf_df)

            sdrf_df['Study'] = disease_code

            try:
                sdrf_df = sdrf_df.set_index(set_index_col)
            except KeyError:
                sdrf_df = sdrf_df.set_index("Derived_Array_Data_File")

            sdrf_info = sdrf_info.append(sdrf_df)

    print("Done loading SDRF files.")
    return sdrf_info
Example #5
    def process_file(self, config, outputdir, data_type, path, info,
                     program_name, project, log):
        if config[program_name]['process_files']['datatype2bqscript'][
                data_type]['file_compressed']:
            with gzip.open(outputdir + path) as input_file:
                file_df = convert_file_to_dataframe(input_file)
        else:
            with open(outputdir + path) as input_file:
                file_df = convert_file_to_dataframe(input_file)

        #now filter down to the desired columns
        use_columns = config[program_name]['process_files'][
            'datatype2bqscript'][data_type]['use_columns']
        file_df = file_df[use_columns.keys()]
        #modify to BigQuery desired names, checking for columns that will be split in the next step
        new_names = []
        for colname in file_df.columns:
            fields = use_columns[colname].split('~')
            if 1 == len(fields):
                new_names += [use_columns[colname]]
            else:
                new_names += [colname]
        file_df.columns = new_names

        # now process the splits
        for colname in use_columns:
            fields = use_columns[colname].split('~')
            if 2 == len(fields) and 'split' == fields[0]:
                extracted_df = file_df[colname].str.extract(fields[1],
                                                            expand=True)
                file_df = pd.concat([file_df, extracted_df], axis=1)
        # add the metadata columns
        file_df = self.add_metadata(file_df, data_type, info, program_name,
                                    project, config)
        # allow subclasses to make final updates
        self.data_type_specific(config, file_df)
        # and reorder them
        file_df = file_df[config[program_name]['process_files']
                          ['datatype2bqscript'][data_type]['order_columns']]

        return file_df
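
The use_columns mapping consumed above serves double duty: a plain value renames the source column to its BigQuery name, while a value of the form split~<regex> keeps the original column and pulls new columns out of it with str.extract. A minimal sketch of such a mapping follows; the column names and pattern are hypothetical, not taken from the real configuration.

# Hypothetical illustration of the '~' convention read by process_file();
# the real mapping lives under config[program_name]['process_files']
# ['datatype2bqscript'][data_type]['use_columns'].
use_columns_example = {
    # plain value: rename the source column to the BigQuery column name
    'Gene Symbol': 'gene_symbol',
    # 'split~<regex>': keep the column and extract named groups as new columns
    'Composite Element REF': 'split~(?P<probe_id>[^_]+)_(?P<platform_tag>.+)',
}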
Example #6
def convert_blob_to_dataframe(gcs, project_id, bucket_name, filename, skiprows=0):
    """
    Function to connect to google cloud storage, download the file,
    and convert to a dataframe
    """

    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    data_df = convert_file_to_dataframe(filebuffer, skiprows=skiprows)

    # clean-up dataframe
    data_df = cleanup_dataframe(data_df)

    return data_df
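
A minimal usage sketch for the helper above, assuming GcsConnector and the convert/cleanup utilities are importable from this codebase; the project, bucket, and object names below are placeholders.

# Illustrative only: the identifiers below are placeholders, not real resources.
gcs = GcsConnector('my-project-id', 'my-bucket')
df = convert_blob_to_dataframe(gcs, 'my-project-id', 'my-bucket',
                               'path/to/table.tsv', skiprows=0)
print(df.head())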
def melt_matrix(matrix_file, Platform, studies_map, config, log):
    """
    # melt matrix
    """
    log.info('\tbegin melt matrix: \'%s\'' % (matrix_file))
    # begin parsing the data
    data_df2 = pd.read_csv(matrix_file, delimiter='\t', header=0)
    data_df2 = data_df2.set_index(["Gene"])

    # create a StringIO object with this info
    # call utils.convert_file_to_dataframe(buffer, sep=",")
    # call tools.cleanup_dataframe()
    # gcs.convert_df_to_njson_and_upload()
    log.info('\t\tstart processing saved matrix.  size: %s' % (len(data_df2)))
    mod = max(int(len(data_df2) / 20), 1)  # avoid modulo by zero for small matrices
    count = 0
    buf = StringIO()
    buf.write(
        "ParticipantBarcode	SampleBarcode	AliquotBarcode	SampleTypeLetterCode	Study	Platform	mirna_id	mirna_accession	normalized_count\n"
    )
    for i, j in data_df2.T.iteritems():
        if 0 == count % mod:
            log.info('\t\t\tprocessed %s lines' % (count))
        count += 1
        for k, m in j.iteritems():
            aliquot = k.strip(".mirbase20")
            aliquot = aliquot.strip(".hg19")
            SampleBarcode = "-".join(aliquot.split("-")[0:4])
            ParticipantBarcode = "-".join(aliquot.split("-")[0:3])
            SampleTypeLetterCode = config["sample_code2letter"][aliquot.split(
                "-")[3][0:2]]
            Study = studies_map[aliquot].upper()
            buf.write("\t".join(
                map(str, (ParticipantBarcode, SampleBarcode,
                          aliquot, SampleTypeLetterCode, Study, Platform,
                          i.split(".")[0], i.split(".")[1], m))) + '\n')
    log.info('\t\tprocessed %s total lines' % (count))

    file_name = matrix_file.split('/')[-1]
    log.info('\t\tsave %s to GCS' % file_name)
    buf.seek(0)
    df = convert_file_to_dataframe(buf)
    df = cleanup_dataframe(df)
    gcs = GcsConnector(config['project_id'], config['buckets']['open'])
    gcs.convert_df_to_njson_and_upload(
        df, config['mirna_isoform_matrix'][Platform]['output_dir'] + file_name)
    log.info('\t\tcompleted save to GCS')
    log.info('\tfinished melt matrix')
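
The nested iteritems loops above effectively melt a genes x aliquots matrix into one row per (gene, aliquot) pair before the barcode fields are derived. For reference, a hedged sketch of the same reshaping with pandas' built-in DataFrame.melt (requires a pandas version that has it; the toy values are made up):

import pandas as pd

# Toy genes x aliquots matrix (values are made up).
matrix = pd.DataFrame(
    {'TCGA-AA-0001-01A-11R-A123-13': [1.2, 0.0],
     'TCGA-AB-0002-01A-11R-A124-13': [3.4, 5.6]},
    index=pd.Index(['hsa-mir-21.MIMAT0000076', 'hsa-mir-22.MIMAT0000077'],
                   name='Gene'))

# Equivalent long format: one row per (Gene, aliquot) pair.
long_df = matrix.reset_index().melt(
    id_vars='Gene', var_name='AliquotBarcode', value_name='normalized_count')
print(long_df)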
Example #8
    def process_per_sample_files(self, config, outputdir, associated_paths,
                                 types, info, program_name, project, log):
        dfs = [None] * 3
        curindex = 0
        for associated_path in associated_paths:
            # convert blob into dataframe
            log.info('\t\tcalling convert_file_to_dataframe() for %s' %
                     (associated_path))
            dfs[curindex] = convert_file_to_dataframe(
                gzip.open(outputdir + associated_path), header=None)
            dfs[curindex].columns = [
                'Ensembl_versioned_gene_ID', types[curindex]
            ]
            self.add_metadata(dfs[curindex], info, program_name, project,
                              config)
            if 'HTSeq - Counts' == types[curindex]:
                dfs[curindex] = dfs[curindex].drop(
                    dfs[curindex].index[[60483, 60484, 60485, 60486, 60487]])
            log.info('\t\tdone calling convert_file_to_dataframe() for %s' %
                     (associated_path))
            curindex += 1

        merge_df = dfs[0]
        for df in dfs[1:]:
            merge_df = merge_df.merge(
                df,
                how='inner',
                on=[
                    'Ensembl_versioned_gene_ID', 'file_gdc_id',
                    'aliquot_barcode', 'sample_gdc_id',
                    'sample_barcode', 'case_gdc_id', 'case_barcode',
                    'program_name', 'project_short_name',
                    'sample_type_letter_code', 'data_type',
                    'experimental_strategy'
                ])

        log.info('merge workflow(%d):\n%s\n\t...\n%s' %
                 (len(merge_df), merge_df.head(3), merge_df.tail(3)))
        return merge_df
Example #9
def convert_blob_to_dataframe(gcs, project_id, bucket_name, filename, skiprows=0, log = None):
    """
    Function to connect to google cloud storage, download the file,
    and convert to a dataframe
    """

    try:
        logit(log, 'calling download_blob_to_file() for %s' % (filename), 'info')
        filebuffer = gcs.download_blob_to_file(filename)
        logit(log, 'done calling download_blob_to_file() for %s' % (filename), 'info')
    
        # convert blob into dataframe
        logit(log, 'calling convert_file_to_dataframe() for %s' % (filename), 'info')
        data_df = convert_file_to_dataframe(filebuffer, skiprows=skiprows)
        logit(log, 'done calling convert_file_to_dataframe() for %s' % (filename), 'info')
    
        # clean-up dataframe
        logit(log, 'calling cleanup_dataframe() for %s' % (filename), 'info')
        data_df = cleanup_dataframe(data_df)
        logit(log, 'done calling cleanup_dataframe() for %s' % (filename), 'info')
    except Exception as e:
        logit(log, 'problem in convert_blob_to_dataframe(%s): %s' % (filename, e), 'exception')
        # re-raise so callers do not hit a NameError from the undefined data_df below
        raise

    return data_df
Example #10
def process_oncotator_output(project_id, bucket_name, data_library, bq_columns,
                             sample_code2letter, oncotator_object_path):
    study = data_library['Study'].iloc[0]

    # this needed to stop pandas from converting them to FLOAT
    dtype = {
        "Transcript_Exon": "object",
        "NCBI_Build": "object",
        "COSMIC_Total_Alterations_In_Gene": "object",
        "CCLE_ONCOMAP_Total_Mutations_In_Gene": "object",
        "HGNC_HGNC_ID": "object",
        "UniProt_AApos": "object",
        "Transcript_Position": "object",
        "HGNC_OMIM_ID_Supplied_By_NCBI": "object"
    }

    file_count = 0

    # create an empty dataframe; we use it to accumulate the per-file dataframes
    disease_bigdata_df = pd.DataFrame()

    # iterate over the selected files
    for oncotator_file in data_library['filename']:
        file_count += 1

        log.info(
            '-' * 10 +
            "{0}: Processing file {1}".format(file_count, oncotator_file) +
            '-' * 10)

        try:
            # convert the file to a dataframe
            filename = oncotator_object_path + oncotator_file
            with open(filename) as infile:
                filestring = StringIO(infile.read())
            df = convert_file_to_dataframe(filestring)
            try:
                df = cleanup_dataframe(df)
            except RuntimeError as re:
                log.warning('%s: problem cleaning dataframe for %s: %s' %
                            (study, filename, re))
        except Exception as e:
            print e
            raise

        if df.empty:
            log.debug('empty dataframe for file: ' + str(oncotator_file))
            continue
        #------------------------------
        # different operations on the frame
        #------------------------------
        # get only the required BigQuery columns
        df = df[bq_columns]

        # format oncotator columns; name changes etc
        df = format_oncotator_columns(df)

        # add new columns
        df = add_columns(df, sample_code2letter, study)

        disease_bigdata_df = disease_bigdata_df.append(df, ignore_index=True)

        log.info('-' * 10 + "{0}: Finished file {1}. rows: {2}".format(
            file_count, oncotator_file, len(df)) + '-' * 10)

    # this is a merged dataframe
    if not disease_bigdata_df.empty:

        # remove duplicates; various rules; see check_duplicates

        log.info(
            '\tcalling check_duplicates to collapse aliquots with %s rows' %
            (len(disease_bigdata_df)))
        disease_bigdata_df = check_duplicates.remove_maf_duplicates(
            disease_bigdata_df, sample_code2letter, log)
        log.info(
            '\tfinished check_duplicates to collapse aliquots with %s rows' %
            (len(disease_bigdata_df)))

        # enforce unique mutation--previous
        # unique_mutation = ['Chromosome', 'Start_Position', 'End_Position', 'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2', 'Tumor_AliquotBarcode']
        # enforce unique mutation
        unique_mutation = [
            'Hugo_Symbol', 'Entrez_Gene_Id', 'Chromosome', 'Start_Position',
            'End_Position', 'Reference_Allele', 'Tumor_Seq_Allele1',
            'Tumor_Seq_Allele2', 'Tumor_AliquotBarcode'
        ]
        # merge mutations from multiple centers
        log.info('\tconsolidate the centers for duplicate mutations into list')
        seencenters = set()

        def concatcenters(df_group):
            if len(df_group) > 1:
                centers = list(set(df_group['Center'].tolist()))
                uniquecenters = set()
                delim = config['maf']['center_delim']
                for center in centers:
                    fields = center.split(delim)
                    for field in fields:
                        uniquecenters.add(field)
                sortedunique = delim.join(sorted(list(uniquecenters)))
                df_group.loc[:, 'Center'] = sortedunique
                if sortedunique not in seencenters:
                    log.info('unique centers: %s' % sortedunique)
                    seencenters.add(sortedunique)
            return df_group

        disease_bigdata_df = disease_bigdata_df.groupby(unique_mutation).apply(
            concatcenters)
        log.info('\tfinished consolidating centers for duplicate mutations')

        # enforce unique mutation
        log.info(
            '\tcalling remove_duplicates to collapse mutations with %s rows' %
            (len(disease_bigdata_df)))
        disease_bigdata_df = remove_duplicates(disease_bigdata_df,
                                               unique_mutation)
        log.info(
            '\tfinished remove_duplicates to collapse mutations with %s rows' %
            (len(disease_bigdata_df)))

        # convert the disease_bigdata_df to new-line JSON and the upload the file
        file_to_upload = StringIO()

        log.info('writing %s rows' % (len(disease_bigdata_df)))
        for _, rec in disease_bigdata_df.iterrows():
            file_to_upload.write(
                rec.convert_objects(convert_numeric=False).to_json() + "\n")
        file_to_upload.seek(0)
        with open(oncotator_object_path + "{0}.json".format(study),
                  'w') as outfile:
            outfile.write(file_to_upload.getvalue())
    else:
        log.warning('Empty dataframe for %s in %s!' % (oncotator_file, study))
    return True
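
The concatcenters helper above collapses duplicate mutation rows by joining their distinct Center values with the configured delimiter. A self-contained toy illustration of that groupby/apply pattern (the delimiter and values below are made up):

import pandas as pd

# Made-up duplicate mutation rows reported by two centers.
toy = pd.DataFrame({
    'Chromosome': ['1', '1'],
    'Start_Position': [12345, 12345],
    'Center': ['broad.mit.edu', 'bcgsc.ca'],
})

def concat_centers(group, delim=';'):
    if len(group) > 1:
        centers = sorted({c for cell in group['Center'] for c in cell.split(delim)})
        group.loc[:, 'Center'] = delim.join(centers)
    return group

toy = toy.groupby(['Chromosome', 'Start_Position']).apply(concat_centers)
# both rows now carry 'bcgsc.ca;broad.mit.edu' in their Center column
print(toy['Center'].tolist())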
Example #11
def generate_oncotator_inputfiles(project_id, bucket_name, filename,
                                  outputfilename, oncotator_columns):

    print(filename)

    # NEW connection
    gcs = GcsConnector(project_id, bucket_name)

    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    try:
        maf_df = convert_file_to_dataframe(filebuffer)
    except:
        print 'problem converting %s to a dataframe' % (filename)
        raise

    # clean-up dataframe
    maf_df = cleanup_dataframe(maf_df)

    print maf_df.columns
    # lowercase the column names (WHY?)
    maf_df.columns = map(lambda x: x.lower(), maf_df.columns)

    #--------------------------------------------
    # data - manipulation
    #--------------------------------------------
    maf_df["ncbi_build"] = maf_df["ncbi_build"].replace({
        'hg19': '37',
        'GRCh37': '37',
        'GRCh37-lite': '37'
    })

    #---------------------------------------------
    ## Filters
    ## remember all the column names are lowercase
    #---------------------------------------------
    filters = {
        "chromosome": map(str, range(1, 23)) + ['X', 'Y'],
        "mutation_status": ['somatic', 'Somatic'],
        "sequencer": ['Illumina HiSeq', 'Illumina GAIIx', 'Illumina MiSeq'],
        "ncbi_build": ['37']
    }

    filter_checklist_df = maf_df.isin(filters)

    filter_string = ((filter_checklist_df["chromosome"] == True)
                     & (filter_checklist_df["mutation_status"] == True)
                     & (filter_checklist_df["sequencer"] == True)
                     & (filter_checklist_df["ncbi_build"] == True))

    maf_df = maf_df[filter_string]

    #---------------------
    #Oncotator part: generate intermediate files for Oncotator
    #---------------------

    # oncotator needs these columns
    replace_column_names = {
        "ncbi_build": 'build',
        'chromosome': 'chr',
        'start_position': 'start',
        'end_position': 'end',
        'reference_allele': 'ref_allele',
        'tumor_seq_allele1': 'tum_allele1',
        'tumor_seq_allele2': 'tum_allele2',
        'tumor_sample_barcode': 'tumor_barcode',
        'matched_norm_sample_barcode': 'normal_barcode'
    }

    # replace columns with new headings; just name change
    for rcol in replace_column_names:
        maf_df.columns = [
            replace_column_names[x] if x == rcol else x for x in maf_df.columns
        ]
        oncotator_columns = [
            replace_column_names[y] if y == rcol else y
            for y in oncotator_columns
        ]

    # remove/mangle any duplicate columns (renaming them like a, a.1, a.2, etc.)
    maf_df.columns = mangle_dupe_cols(maf_df.columns.values)

    #---------------------
    #Oncotator part: generate intermediate files for Oncotator
    #---------------------

    oncotator_df = maf_df[oncotator_columns]

    print "df_columns", len(oncotator_df.columns)

    df_stringIO = oncotator_df.to_csv(sep='\t',
                                      index=False,
                                      columns=oncotator_columns)

    # upload the file
    gcs.upload_blob_from_string(outputfilename, df_stringIO)

    return True
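
The row filter above leans on DataFrame.isin with a dict of allowed values per column, then ANDs the per-column boolean results into one mask. A self-contained toy example of the same pattern (the data below is made up):

import pandas as pd

# Toy MAF-like rows illustrating the isin-based filter used above.
toy_df = pd.DataFrame({
    'chromosome': ['1', 'X', 'MT'],
    'mutation_status': ['Somatic', 'somatic', 'Germline'],
})
allowed = {
    'chromosome': [str(c) for c in range(1, 23)] + ['X', 'Y'],
    'mutation_status': ['somatic', 'Somatic'],
}
checklist = toy_df.isin(allowed)
mask = checklist['chromosome'] & checklist['mutation_status']
print(toy_df[mask])  # keeps the first two rows, drops the MT/Germline row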
Example #12
def generate_oncotator_inputfiles(project_id, bucket_name, filename, outputfilename, oncotator_columns):

    print (filename)
    
    # NEW connection
    gcs = GcsConnector(project_id, bucket_name)

    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    maf_df = convert_file_to_dataframe(filebuffer)

    # clean-up dataframe
    maf_df = cleanup_dataframe(maf_df)

    print maf_df.columns
    # lowercase the column names (WHY?)
    maf_df.columns = map(lambda x: x.lower(), maf_df.columns) 
    
    #--------------------------------------------
    # data - manipulation
    #--------------------------------------------
    maf_df["ncbi_build"] = maf_df["ncbi_build"].replace({ 'hg19': '37'
                                  ,'GRCh37': '37'
                                  ,'GRCh37-lite': '37'
                                 })

   
    #---------------------------------------------
    ## Filters
    ## remember all the column names are lowercase
    #---------------------------------------------
    filters = {
        "chromosome" : map(str,range(1,23)) + ['X', 'Y']
        ,"mutation_status": ['somatic', 'Somatic']
        ,"sequencer": ['Illumina HiSeq', 'Illumina GAIIx', 'Illumina MiSeq']
        ,"ncbi_build" : ['37']
    }

    filter_checklist_df = maf_df.isin(filters)
    
    filter_string = (
                       (filter_checklist_df["chromosome"] == True)
                        &   
                       (filter_checklist_df["mutation_status"] == True)
                        &
                       (filter_checklist_df["sequencer"] == True)
                        &
                       (filter_checklist_df["ncbi_build"] == True)
                    )

    maf_df = maf_df[filter_string]

    #---------------------
    #Oncotator part: generate intermediate files for Oncotator
    #---------------------
   
    # oncotator needs these columns
    replace_column_names = {
        "ncbi_build" : 'build'
       ,'chromosome' : 'chr'
       ,'start_position' : 'start'
       ,'end_position' : 'end'
       ,'reference_allele' : 'ref_allele'
       ,'tumor_seq_allele1' : 'tum_allele1'
       ,'tumor_seq_allele2' : 'tum_allele2'
       ,'tumor_sample_barcode': 'tumor_barcode'
       ,'matched_norm_sample_barcode': 'normal_barcode'
    }

    # replace columns with new headings; just name change
    for rcol in replace_column_names:
        maf_df.columns = [replace_column_names[x] if x==rcol else x for x in maf_df.columns]
        oncotator_columns = [replace_column_names[y] if y==rcol else y for y in oncotator_columns]         

    # remove/mangle any duplicate columns (renaming them like a, a.1, a.2, etc.)
    maf_df.columns = mangle_dupe_cols(maf_df.columns.values)

    #---------------------      
    #Oncotator part: generate intermediate files for Oncotator
    #---------------------

    oncotator_df = maf_df[oncotator_columns]

    print "df_columns", len(oncotator_df.columns)
   
    df_stringIO =  oncotator_df.to_csv(sep='\t', index=False, columns= oncotator_columns)

    # upload the file
    gcs.upload_blob_from_string(outputfilename, df_stringIO)
    
    return True
def parse_file(project_id, bq_dataset, bucket_name, file_data, filename,
               outfilename, metadata, cloudsql_tables):

    print 'Begin processing {0}.'.format(filename)

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #main steps: download, convert to df, cleanup, transform, add metadata
    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)

    # clean-up dataframe
    data_df = cleanup_dataframe(data_df)
    new_df_data = []

    map_values = {}

    # Get basic column information depending on datatype
    column_map = get_column_mapping(metadata['data_type'])

    # Column headers are sample ids
    for i, j in data_df.iteritems():
        if i in column_map.keys():
            map_values[column_map[i]] = [k for d, k in j.iteritems()]

        else:
            for k, m in j.iteritems():
                new_df_obj = {}

                new_df_obj[
                    'sample_barcode'] = i  # Normalized to match user_gen
                new_df_obj['Project'] = metadata['project_id']
                new_df_obj['Study'] = metadata['study_id']
                new_df_obj['Platform'] = metadata['platform']
                new_df_obj['Pipeline'] = metadata['pipeline']

                # Optional values
                new_df_obj['Symbol'] = map_values['Symbol'][
                    k] if 'Symbol' in map_values.keys() else ''
                new_df_obj['ID'] = map_values['ID'][
                    k] if 'ID' in map_values.keys() else ''
                new_df_obj['TAB'] = map_values['TAB'][
                    k] if 'TAB' in map_values.keys() else ''

                new_df_obj['Level'] = m
                new_df_data.append(new_df_obj)
    new_df = pd.DataFrame(new_df_data)

    # Get unique barcodes and update metadata_data table
    sample_barcodes = list(
        set([k for d, k in new_df['sample_barcode'].iteritems()]))
    sample_metadata_list = []
    for barcode in sample_barcodes:
        new_metadata = metadata.copy()
        new_metadata['sample_barcode'] = barcode
        sample_metadata_list.append(new_metadata)
    update_metadata_data_list(cloudsql_tables['METADATA_DATA'],
                              sample_metadata_list)

    # Update metadata_samples table
    update_molecular_metadata_samples_list(cloudsql_tables['METADATA_SAMPLES'],
                                           metadata['data_type'],
                                           sample_barcodes)

    # Generate feature names and bq_mappings
    table_name = file_data['BIGQUERY_TABLE_NAME']
    feature_defs = generate_feature_Defs(metadata['data_type'],
                                         metadata['study_id'], project_id,
                                         bq_dataset, table_name, new_df)

    # Update feature_defs table
    insert_feature_defs_list(cloudsql_tables['FEATURE_DEFS'], feature_defs)

    # upload the contents of the dataframe in njson format
    tmp_bucket = os.environ.get('tmp_bucket_location')
    gcs.convert_df_to_njson_and_upload(new_df,
                                       outfilename,
                                       metadata=metadata,
                                       tmp_bucket=tmp_bucket)

    # Load into BigQuery
    # Using temporary file location (in case we don't have write permissions on user's bucket?)
    source_path = 'gs://' + tmp_bucket + '/' + outfilename
    schema = get_molecular_schema()

    load_data_from_file.run(project_id,
                            bq_dataset,
                            table_name,
                            schema,
                            source_path,
                            source_format='NEWLINE_DELIMITED_JSON',
                            write_disposition='WRITE_APPEND',
                            is_schema_file=False)

    # Delete temporary files
    print 'Deleting temporary file {0}'.format(outfilename)
    gcs = GcsConnector(project_id, tmp_bucket)
    gcs.delete_blob(outfilename)
Example #14
    def melt_matrix(self, matrix_file, platform, file2info, program_name,
                    config, log):
        """
        # melt matrix
        """
        log.info('\t\t\tbegin melt matrix: \'%s\'' % (matrix_file))
        # begin parsing the data
        data_df2 = pd.read_csv(matrix_file, delimiter='\t', header=0)
        data_df2 = data_df2.set_index(["Gene"])

        # create a StringIO object with this info
        # call utils.convert_file_to_dataframe(buffer, sep=",")
        # call tools.cleanup_dataframe()
        # gcs.convert_df_to_njson_and_upload()
        log.info('\t\t\t\tstart processing saved matrix.  size: %s' %
                 (len(data_df2)))
        mod = max(int(len(data_df2) / 20), 1)  # avoid modulo by zero for small matrices
        count = 0
        total_count = 0
        buf = StringIO()
        buf.write(
            "sample_barcode	mirna_id	mirna_accession	normalized_count	platform	project_short_name	program_name	sample_type_code"
            +
            "	file_name	file_gdc_id	aliquot_barcode	case_barcode	case_gdc_id	sample_gdc_id	aliquot_gdc_id\n"
        )
        for i, j in data_df2.T.iteritems():
            for k, m in j.iteritems():
                aliquot = file2info[k]['aliquot_barcode']
                SampleBarcode = "-".join(aliquot.split("-")[0:4])
                ParticipantBarcode = "-".join(aliquot.split("-")[0:3])
                SampleTypeCode = aliquot.split("-")[3][0:2]
                info = file2info[k]
                line = "\t".join(
                    map(str, (SampleBarcode, i.split(".")[0], i.split(".")[1],
                              m, platform, info['project_short_name'],
                              info['program_name'], SampleTypeCode,
                              info['file_name'], info['file_gdc_id'], aliquot,
                              ParticipantBarcode, info['case_gdc_id'],
                              info['sample_gdc_id'],
                              info['aliquot_gdc_id']))) + '\n'
                buf.write(line)
                total_count += 1
            if 0 == count % mod:
                log.info('\t\t\t\t\tprocessed %s lines:\n%s' % (count, line))
                file_name = '%s_%s' % (matrix_file.split('/')[-1], count)
                log.info('\t\t\t\tsave %s to GCS' % file_name)
                buf.seek(0)
                df = convert_file_to_dataframe(buf)
                df = cleanup_dataframe(df, log)
                gcs = GcsConnector(config['cloud_projects']['open'],
                                   config['buckets']['open'])
                gcs.convert_df_to_njson_and_upload(
                    df,
                    config[program_name]['process_files']['datatype2bqscript']
                    ['Isoform Expression Quantification']['gcs_output_path'] +
                    file_name,
                    logparam=log)
                buf = StringIO()
                buf.write(
                    "sample_barcode	mirna_id	mirna_accession	normalized_count	platform	project_short_name	program_name	sample_type_code"
                    +
                    "	file_name	file_gdc_id	aliquot_barcode	case_barcode	case_gdc_id	sample_gdc_id	aliquot_gdc_id\n"
                )
            count += 1
        log.info('\t\t\t\tprocessed %s total lines created %s records' %
                 (count, total_count))

        log.info('\t\t\t\tcompleted save to GCS')
        log.info('\t\t\tfinished melt matrix')
def parse_file(project_id, bq_dataset, bucket_name, file_data, filename, outfilename, metadata, cloudsql_tables):

    print 'Begin processing {0}.'.format(filename)

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #main steps: download, convert to df, cleanup, transform, add metadata
    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)

    # clean-up dataframe
    data_df = cleanup_dataframe(data_df)
    new_df_data = []

    map_values = {}

    # Get basic column information depending on datatype
    column_map = get_column_mapping(metadata['data_type'])

    # Column headers are sample ids
    for i, j in data_df.iteritems():
        if i in column_map.keys():
            map_values[column_map[i]] = [k for d, k in j.iteritems()]

        else:
            for k, m in j.iteritems():
                new_df_obj = {}

                new_df_obj['sample_barcode'] = i # Normalized to match user_gen
                new_df_obj['project_id'] = metadata['project_id']
                new_df_obj['study_id'] = metadata['study_id']
                new_df_obj['Platform'] = metadata['platform']
                new_df_obj['Pipeline'] = metadata['pipeline']

                # Optional values
                new_df_obj['Symbol'] = map_values['Symbol'][k] if 'Symbol' in map_values.keys() else ''
                new_df_obj['ID'] = map_values['ID'][k] if 'ID' in map_values.keys() else ''
                new_df_obj['TAB'] = map_values['TAB'][k] if 'TAB' in map_values.keys() else ''

                new_df_obj['Level'] = m
                new_df_data.append(new_df_obj)
    new_df = pd.DataFrame(new_df_data)

    # Get unique barcodes and update metadata_data table
    sample_barcodes = list(set([k for d, k in new_df['sample_barcode'].iteritems()]))
    sample_metadata_list = []
    for barcode in sample_barcodes:
        new_metadata = metadata.copy()
        new_metadata['sample_barcode'] = barcode
        sample_metadata_list.append(new_metadata)
    update_metadata_data_list(cloudsql_tables['METADATA_DATA'], sample_metadata_list)

    # Update metadata_samples table
    update_molecular_metadata_samples_list(cloudsql_tables['METADATA_SAMPLES'], metadata['data_type'], sample_barcodes)

    # Generate feature names and bq_mappings
    table_name = file_data['BIGQUERY_TABLE_NAME']
    feature_defs = generate_feature_Defs(metadata['data_type'], metadata['study_id'], project_id, bq_dataset, table_name, new_df)

    # Update feature_defs table
    insert_feature_defs_list(cloudsql_tables['FEATURE_DEFS'], feature_defs)

    # upload the contents of the dataframe in njson format
    tmp_bucket = os.environ.get('tmp_bucket')
    gcs.convert_df_to_njson_and_upload(new_df, outfilename, metadata=metadata, tmp_bucket=tmp_bucket)

    # Load into BigQuery
    # Using temporary file location (in case we don't have write permissions on user's bucket?)
    source_path = 'gs://' + tmp_bucket + '/' + outfilename
    schema = get_molecular_schema()

    load_data_from_file.run(
        project_id,
        bq_dataset,
        table_name,
        schema,
        source_path,
        source_format='NEWLINE_DELIMITED_JSON',
        write_disposition='WRITE_APPEND',
        is_schema_file=False)

    # Delete temporary files
    print 'Deleting temporary file {0}'.format(outfilename)
    gcs = GcsConnector(project_id, tmp_bucket)
    gcs.delete_blob(outfilename)
Example #16
        gcs.convert_df_to_njson_and_upload(disease_bigdata_df, "tcga/intermediary/MAF/bigquery_data_files/{0}.json".format(study))

    else:
        raise Exception('Empty dataframe!')
    return True

if __name__ == '__main__':

    config = json.load(open(sys.argv[1]))
  
    project_id = config['project_id']
    bucket_name = config['buckets']['open']
    sample_code2letter = config['sample_code2letter']
 
    # get disease_codes/studies (TODO: this must be changed to get the disease code from the file name)
    df = convert_file_to_dataframe(open(sys.argv[2]))
    df = cleanup_dataframe(df)
    studies = list(set(df['Study'].tolist()))

    # get bq columns (this allows the user to select the columns
    # without worrying about the index, case sensitivity, etc.)
    selected_columns = pd.read_table(sys.argv[3], names=['bq_columns'])
    transposed = selected_columns.T
    transposed.columns = transposed.loc['bq_columns']
    transposed = cleanup_dataframe(transposed)
    bq_columns = transposed.columns.values

    # submit threads by disease code
    pm = process_manager.ProcessManager(max_workers=33, db='maf.db', table='task_queue_status')
    for idx, df_group in df.groupby(['Study']):
        future = pm.submit(process_oncotator_output, project_id, bucket_name, df_group, bq_columns, sample_code2letter)
def process_user_gen_files(project_id, user_project_id, study_id, bucket_name, bq_dataset, cloudsql_tables, files):

    print 'Begin processing user_gen files.'

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)
    data_df = pd.DataFrame()

    # Collect all columns that get passed in for generating BQ schema later
    all_columns = []

    # For each file, download, convert to df
    for idx, file in enumerate(files):
        blob_name = file['FILENAME'].split('/')[1:]
        all_columns += file['COLUMNS']

        metadata = {
            'sample_barcode': file.get('SAMPLEBARCODE', ''),
            'participant_barcode': file.get('PARTICIPANTBARCODE', ''),
            'study_id': study_id,
            'platform': file.get('PLATFORM', ''),
            'pipeline': file.get('PIPELINE', ''),
            'file_path': file['FILENAME'],
            'file_name': file['FILENAME'].split('/')[-1],
            'data_type': file['DATATYPE']
        }

        # download, convert to df
        filebuffer = gcs.download_blob_to_file(blob_name)

        # Get column mapping
        column_mapping = get_column_mapping(file['COLUMNS'])
        if idx == 0:
            data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)
            data_df = cleanup_dataframe(data_df)
            data_df.rename(columns=column_mapping, inplace=True)

            # Generate Metadata for this file
            insert_metadata(data_df, metadata, cloudsql_tables['METADATA_DATA'])

        else:
            # convert blob into dataframe
            new_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)
            new_df = cleanup_dataframe(new_df)
            new_df.rename(columns=column_mapping, inplace=True)

            # Generate Metadata for this file
            insert_metadata(new_df, metadata, cloudsql_tables['METADATA_DATA'])

            # TODO: Write function to check for participant barcodes, for now, we assume each file contains SampleBarcode Mapping
            data_df = pd.merge(data_df, new_df, on='sample_barcode', how='outer')

    # For complete dataframe, create metadata_samples rows
    print 'Inserting data into {0}.'.format(cloudsql_tables['METADATA_SAMPLES'])
    data_df = cleanup_dataframe(data_df)
    data_df['has_mrna'] = 0
    data_df['has_mirna'] = 0
    data_df['has_protein'] = 0
    data_df['has_meth'] = 0
    insert_metadata_samples(data_df, cloudsql_tables['METADATA_SAMPLES'])

    # Update and create bq table file
    temp_outfile = cloudsql_tables['METADATA_SAMPLES'] + '.out'
    tmp_bucket = os.environ.get('tmp_bucket_location')
    gcs.convert_df_to_njson_and_upload(data_df, temp_outfile, tmp_bucket=tmp_bucket)

    # Using temporary file location (in case we don't have write permissions on user's bucket?)
    source_path = 'gs://' + tmp_bucket + '/' + temp_outfile

    schema = generate_bq_schema(all_columns)
    table_name = 'cgc_user_{0}_{1}'.format(user_project_id, study_id)
    load_data_from_file.run(
        project_id,
        bq_dataset,
        table_name,
        schema,
        source_path,
        source_format='NEWLINE_DELIMITED_JSON',
        write_disposition='WRITE_APPEND',
        is_schema_file=False)

    # Generate feature_defs
    feature_defs = generate_feature_defs(study_id, project_id, bq_dataset, table_name, schema)

    # Update feature_defs table
    insert_feature_defs_list(cloudsql_tables['FEATURE_DEFS'], feature_defs)

    # Delete temporary files
    print 'Deleting temporary file {0}'.format(temp_outfile)
    gcs = GcsConnector(project_id, tmp_bucket)
    gcs.delete_blob(temp_outfile)
def process_user_gen_files(project_id, user_project_id, study_id, bucket_name, bq_dataset, cloudsql_tables, files):

    print 'Begin processing user_gen files.'

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)
    data_df = pd.DataFrame()

    # Collect all columns that get passed in for generating BQ schema later
    all_columns = []

    # For each file, download, convert to df
    for idx, file in enumerate(files):
        blob_name = file['FILENAME'].split('/')[1:]
        all_columns += file['COLUMNS']

        metadata = {
            'sample_barcode': file.get('SAMPLEBARCODE', ''),
            'participant_barcode': file.get('PARTICIPANTBARCODE', ''),
            'study_id': study_id,
            'platform': file.get('PLATFORM', ''),
            'pipeline': file.get('PIPELINE', ''),
            'file_path': file['FILENAME'],
            'file_name': file['FILENAME'].split('/')[-1],
            'data_type': file['DATATYPE']
        }

        # download, convert to df
        filebuffer = gcs.download_blob_to_file(blob_name)

        # Get column mapping
        column_mapping = get_column_mapping(file['COLUMNS'])
        if idx == 0:
            data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)
            data_df = cleanup_dataframe(data_df)
            data_df.rename(columns=column_mapping, inplace=True)

            # Generate Metadata for this file
            insert_metadata(data_df, metadata, cloudsql_tables['METADATA_DATA'])

        else:
            # convert blob into dataframe
            new_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)
            new_df = cleanup_dataframe(new_df)
            new_df.rename(columns=column_mapping, inplace=True)

            # Generate Metadata for this file
            insert_metadata(new_df, metadata, cloudsql_tables['METADATA_DATA'])

            # TODO: Write function to check for participant barcodes, for now, we assume each file contains SampleBarcode Mapping
            data_df = pd.merge(data_df, new_df, on='sample_barcode', how='outer')

    # For complete dataframe, create metadata_samples rows
    print 'Inserting data into {0}.'.format(cloudsql_tables['METADATA_SAMPLES'])
    data_df = cleanup_dataframe(data_df)
    data_df['has_mrna'] = 0
    data_df['has_mirna'] = 0
    data_df['has_protein'] = 0
    data_df['has_meth'] = 0
    insert_metadata_samples(data_df, cloudsql_tables['METADATA_SAMPLES'])

    # Update and create bq table file
    temp_outfile = cloudsql_tables['METADATA_SAMPLES'] + '.out'
    tmp_bucket = os.environ.get('tmp_bucket')
    gcs.convert_df_to_njson_and_upload(data_df, temp_outfile, tmp_bucket=tmp_bucket)

    # Using temporary file location (in case we don't have write permissions on user's bucket?)
    source_path = 'gs://' + tmp_bucket + '/' + temp_outfile

    schema = generate_bq_schema(all_columns)
    table_name = 'cgc_user_{0}_{1}'.format(user_project_id, study_id)
    load_data_from_file.run(
        project_id,
        bq_dataset,
        table_name,
        schema,
        source_path,
        source_format='NEWLINE_DELIMITED_JSON',
        write_disposition='WRITE_APPEND',
        is_schema_file=False)

    # Generate feature_defs
    feature_defs = generate_feature_defs(study_id, project_id, bq_dataset, table_name, schema)

    # Update feature_defs table
    insert_feature_defs_list(cloudsql_tables['FEATURE_DEFS'], feature_defs)

    # Delete temporary files
    print 'Deleting temporary file {0}'.format(temp_outfile)
    gcs = GcsConnector(project_id, tmp_bucket)
    gcs.delete_blob(temp_outfile)