def parse_file(project_id, bq_dataset, bucket_name, file_data, filename,
               outfilename, metadata, cloudsql_tables):

    print 'Begin processing {0}.'.format(filename)

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #main steps: download, convert to df, cleanup, transform, add metadata
    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)

    # Get basic column information depending on datatype
    column_mapping = get_column_mapping(metadata['DataType'])

    data_df = cleanup_dataframe(data_df)
    data_df.rename(columns=column_mapping, inplace=True)

    # Get barcodes and update metadata_data table
    # Assuming second scenario where each file is a different platform/pipeline combination
    # TODO: Put in functionality for other scenario where all lists are in one file.
    sample_barcodes = data_df['SampleBarcode'].tolist()
    file_list = data_df['filenamepath'].tolist()
    sample_metadata_list = []
    for idx, barcode in enumerate(sample_barcodes):
        new_metadata = metadata.copy()
        new_metadata['sample_barcode'] = barcode
        new_metadata['file_path'] = file_list[idx].replace('gs://', '')
        sample_metadata_list.append(new_metadata)
    update_metadata_data_list(cloudsql_tables['METADATA_DATA'],
                              sample_metadata_list)
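The rename step above relies on get_column_mapping, which is defined elsewhere in this codebase. A minimal sketch of the shape such a helper might take, where the data-type key and raw header names are illustrative assumptions rather than the project's actual mapping:

# Hypothetical sketch; the real get_column_mapping lives elsewhere in this
# codebase, and the keys below are illustrative assumptions only.
def get_column_mapping(data_type):
    mappings = {
        'file_listing': {                      # assumed data-type name
            'sample_barcode': 'SampleBarcode',
            'file_name_path': 'filenamepath',
        },
    }
    return mappings.get(data_type, {})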
Example 2
def get_sdrf_info(project_id, bucket_name, disease_codes, header, set_index_col, search_patterns):

    client = storage.Client(project_id)
    bucket = client.get_bucket(bucket_name)

    # connect to google cloud storage
    gcs = GcsConnector(project_id, bucket_name)

    sdrf_info = pd.DataFrame()
    for disease_code in disease_codes:
        for blob in bucket.list_blobs(prefix=disease_code):
            sdrf_filename = blob.name
            if not all(x in sdrf_filename for x in search_patterns):
                continue
            print(sdrf_filename)

            filebuffer = gcs.download_blob_to_file(sdrf_filename)
            # convert to a dataframe
            sdrf_df = convert_file_to_dataframe(filebuffer, skiprows=0)

            sdrf_df = cleanup_dataframe(sdrf_df)

            sdrf_df["Study"] = disease_code

            try:
                sdrf_df = sdrf_df.set_index(set_index_col)
            except KeyError:
                # fall back to the standard SDRF file-name column
                sdrf_df = sdrf_df.set_index("Derived_Array_Data_File")

            sdrf_info = sdrf_info.append(sdrf_df)

    print("Done loading SDRF files.")
    return sdrf_info
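A hedged usage sketch for the loader above; the project, bucket, disease codes, and SDRF search patterns below are placeholders, not real values:

# Illustrative call only; all argument values are placeholders.
sdrf_info = get_sdrf_info(
    project_id='my-project-id',
    bucket_name='my-tcga-bucket',
    disease_codes=['BRCA', 'LUAD'],
    header=0,
    set_index_col='Derived_Array_Data_File',
    search_patterns=['mage-tab', '.sdrf.txt'],
)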
Example 5
def download_antibody_annotation_files(config, log):
    object_key_template = config['protein']['aa_object_key_template']
    aa_file_dir = config['protein']['aa_file_dir']
    gcs = GcsConnector(config['project_id'], config['buckets']['open'])
    studies = config['all_tumor_types']
    nonrppa_studies = config['protein']['nonrppa']

    log.info('\tstart downloading antibody annotation files to %s from %s:%s' %
             (aa_file_dir, config['project_id'], config['buckets']['open']))
    if not os.path.isdir(aa_file_dir):
        os.makedirs(aa_file_dir)
    for study in studies:
        if study in nonrppa_studies:
            continue
        keypath = object_key_template % (study.lower(), study.upper())
        log.info('\t\tdownloading %s' % (keypath))
        tmpfile = gcs.download_blob_to_file(keypath)
        with open(aa_file_dir + keypath[keypath.rindex('/'):], 'w') as outfile:
            outfile.write(tmpfile.getvalue())
    log.info('\tdone downloading antibody annotation files')
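The loop above fills aa_object_key_template with the lower- and upper-case study code. A sketch of how such a template might expand; the template string itself is an assumption, since the real one comes from the config file:

# Hypothetical template shape; the real aa_object_key_template may differ.
object_key_template = '%s/antibody_annotation/%s.antibody_annotation.txt'
keypath = object_key_template % ('brca', 'BRCA')
# keypath == 'brca/antibody_annotation/BRCA.antibody_annotation.txt'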
Example 6
def generate_oncotator_inputfiles(project_id, bucket_name, filename,
                                  outputfilename, oncotator_columns):

    print(filename)

    # NEW connection
    gcs = GcsConnector(project_id, bucket_name)

    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    try:
        maf_df = convert_file_to_dataframe(filebuffer)
    except Exception:
        print 'problem converting %s to a dataframe' % (filename)
        raise

    # clean-up dataframe
    maf_df = cleanup_dataframe(maf_df)

    print maf_df.columns
    # lowercase the column names (WHY?)
    maf_df.columns = map(lambda x: x.lower(), maf_df.columns)

    #--------------------------------------------
    # data - manipulation
    #--------------------------------------------
    maf_df["ncbi_build"] = maf_df["ncbi_build"].replace({
        'hg19': '37',
        'GRCh37': '37',
        'GRCh37-lite': '37'
    })

    #---------------------------------------------
    ## Filters
    ## remember all the column names are lowercase
    #---------------------------------------------
    filters = {
        "chromosome": map(str, range(1, 23)) + ['X', 'Y'],
        "mutation_status": ['somatic', 'Somatic'],
        "sequencer": ['Illumina HiSeq', 'Illumina GAIIx', 'Illumina MiSeq'],
        "ncbi_build": ['37']
    }

    filter_checklist_df = maf_df.isin(filters)

    filter_string = ((filter_checklist_df["chromosome"] == True)
                     & (filter_checklist_df["mutation_status"] == True)
                     & (filter_checklist_df["sequencer"] == True)
                     & (filter_checklist_df["ncbi_build"] == True))

    maf_df = maf_df[filter_string]

    #---------------------
    #Oncotator part: generate intermediate files for Oncotator
    #---------------------

    # oncotator needs these columns
    replace_column_names = {
        "ncbi_build": 'build',
        'chromosome': 'chr',
        'start_position': 'start',
        'end_position': 'end',
        'reference_allele': 'ref_allele',
        'tumor_seq_allele1': 'tum_allele1',
        'tumor_seq_allele2': 'tum_allele2',
        'tumor_sample_barcode': 'tumor_barcode',
        'matched_norm_sample_barcode': 'normal_barcode'
    }

    # replace columns with new headings; just name change
    for rcol in replace_column_names:
        maf_df.columns = [
            replace_column_names[x] if x == rcol else x for x in maf_df.columns
        ]
        oncotator_columns = [
            replace_column_names[y] if y == rcol else y
            for y in oncotator_columns
        ]

    # remove/mangle any duplicate columns (renaming them a, a.1, a.2, etc.)
    maf_df.columns = mangle_dupe_cols(maf_df.columns.values)

    #---------------------
    #Oncotator part: generate intermediate files for Oncotator
    #---------------------

    oncotator_df = maf_df[oncotator_columns]

    print "df_columns", len(oncotator_df.columns)

    df_stringIO = oncotator_df.to_csv(sep='\t',
                                      index=False,
                                      columns=oncotator_columns)

    # upload the file
    gcs.upload_blob_from_string(outputfilename, df_stringIO)

    return True
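mangle_dupe_cols is a project helper not shown here. A minimal sketch of the a, a.1, a.2 renaming the comment above describes, offered as an assumption rather than the project's actual implementation:

# Sketch of the duplicate-column renaming described above (a, a.1, a.2, ...);
# the project's own mangle_dupe_cols may differ in detail.
def mangle_dupe_cols(columns):
    seen = {}
    mangled = []
    for col in columns:
        if col in seen:
            seen[col] += 1
            mangled.append('%s.%d' % (col, seen[col]))
        else:
            seen[col] = 0
            mangled.append(col)
    return mangled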
Example 7
def generate_oncotator_inputfiles(project_id, bucket_name, filename, outputfilename, oncotator_columns):

    print(filename)

    # NEW connection
    gcs = GcsConnector(project_id, bucket_name)

    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    maf_df = convert_file_to_dataframe(filebuffer)

    # clean-up dataframe
    maf_df = cleanup_dataframe(maf_df)

    print maf_df.columns
    # lowercase the column names (WHY?)
    maf_df.columns = map(lambda x: x.lower(), maf_df.columns) 
    
    #--------------------------------------------
    # data - manipulation
    #--------------------------------------------
    maf_df["ncbi_build"] = maf_df["ncbi_build"].replace({ 'hg19': '37'
                                  ,'GRCh37': '37'
                                  ,'GRCh37-lite': '37'
                                 })

   
    #---------------------------------------------
    ## Filters
    ## remember all the column names are lowercase
    #---------------------------------------------
    filters = {
        "chromosome": map(str, range(1, 23)) + ['X', 'Y'],
        "mutation_status": ['somatic', 'Somatic'],
        "sequencer": ['Illumina HiSeq', 'Illumina GAIIx', 'Illumina MiSeq'],
        "ncbi_build": ['37']
    }

    filter_checklist_df = maf_df.isin(filters)

    filter_string = ((filter_checklist_df["chromosome"] == True)
                     & (filter_checklist_df["mutation_status"] == True)
                     & (filter_checklist_df["sequencer"] == True)
                     & (filter_checklist_df["ncbi_build"] == True))

    maf_df = maf_df[filter_string]

    #---------------------
    #Oncotator part: generate intermediate files for Oncotator
    #---------------------
   
    # oncotator needs these columns
    replace_column_names = {
        "ncbi_build": 'build',
        'chromosome': 'chr',
        'start_position': 'start',
        'end_position': 'end',
        'reference_allele': 'ref_allele',
        'tumor_seq_allele1': 'tum_allele1',
        'tumor_seq_allele2': 'tum_allele2',
        'tumor_sample_barcode': 'tumor_barcode',
        'matched_norm_sample_barcode': 'normal_barcode'
    }

    # replace columns with new headings; just name change
    for rcol in replace_column_names:
        maf_df.columns = [replace_column_names[x] if x == rcol else x for x in maf_df.columns]
        oncotator_columns = [replace_column_names[y] if y == rcol else y for y in oncotator_columns]

    # remove/mangle any duplicate columns (renaming them a, a.1, a.2, etc.)
    maf_df.columns = mangle_dupe_cols(maf_df.columns.values)

    #---------------------      
    #Oncotator part: generate intermediate files for Oncotator
    #---------------------

    oncotator_df = maf_df[oncotator_columns]

    print "df_columns", len(oncotator_df.columns)
   
    df_stringIO =  oncotator_df.to_csv(sep='\t', index=False, columns= oncotator_columns)

    # upload the file
    gcs.upload_blob_from_string(outputfilename, df_stringIO)
    
    return True
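Both copies of this function filter MAF rows with DataFrame.isin followed by a boolean AND across the checked columns. A small self-contained illustration of that pattern on toy data:

import pandas as pd

# Toy data standing in for the MAF dataframe filtered above.
df = pd.DataFrame({'chromosome': ['1', 'X', 'MT'],
                   'mutation_status': ['Somatic', 'Germline', 'somatic']})
filters = {'chromosome': ['1', 'X'],
           'mutation_status': ['somatic', 'Somatic']}
checklist = df.isin(filters)
mask = checklist['chromosome'] & checklist['mutation_status']
filtered = df[mask]  # keeps only the first row, which passes every filter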
Example 8
def main():
    """Parse GCT file, merge with barcodes info, melt(tidy) 
        and load to Google Storage and BigQuery
    """

    project_id = ''
    bucket_name = ''
    # example file in bucket
    filename = 'ccle/mRNA-gene-exp/CCLE_Expression_Entrez_2012-09-29.gct'
    outfilename = 'test'
    writer = ExcelWriter('ccle.xlsx')

    # connect to the google cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #------------------------------
    # load the GCT file
    #  * the file has duplicate samples (columns)
    #------------------------------
    # To remove duplicates, load only the first few lines of the file, take the unique columns,
    # and then select those columns from the full dataframe.
    # This is a hack, but I can't find an elegant way to remove duplicates.
    gct_df = pd.read_table('CCLE_Expression_Entrez_2012-09-29.gct',
                           sep='\t',
                           skiprows=2,
                           mangle_dupe_cols=False,
                           nrows=2)
    unique_columns = np.unique(gct_df.columns)
    gct_df = pd.read_table('CCLE_Expression_Entrez_2012-09-29.gct',
                           sep='\t',
                           skiprows=2,
                           mangle_dupe_cols=True)
    # clean-up the dataset/dataframe
    gct_df = cleanup_dataframe(gct_df)
    gct_df = gct_df[unique_columns]

    # remove any gene_id starting with 'AFFX-'
    gct_df[gct_df['Name'].str.startswith('AFFX-')].to_excel(
        writer, sheet_name="affy_info")
    gct_df = gct_df[~gct_df['Name'].str.startswith('AFFX-')]

    #------------------------------
    # HGNC validation
    #-----------------------------
    hgnc_df = hgnc_validation.get_hgnc_map()
    hgnc_df.to_excel(writer, sheet_name="hgnc_info")
    hgnc_dict = dict(zip(hgnc_df.entrez_id, hgnc_df.symbol))
    gct_df['HGNC_gene_symbol'] = gct_df['Name'].map(
        lambda gene_id: hgnc_dict.get(gene_id.replace('_at', ''), np.nan))
    gct_df[['HGNC_gene_symbol', 'Name',
            'Description']].to_excel(writer, sheet_name="gene_info")
    gct_df['Name'] = gct_df['Name'].map(
        lambda gene_id: gene_id.replace('_at', ''))

    #------------------------------
    # barcodes info
    #------------------------------
    barcodes_filename = 'ccle/mRNA-gene-exp/mRNA_names.out.tsv'
    filebuffer = gcs.download_blob_to_file(barcodes_filename)
    barcodes_df = pd.read_table(
        filebuffer,
        header=None,
        names=['ParticipantBarcode', 'SampleBarcode',
               'CCLE_long_name'])  # convert into dataframe
    barcodes_df = cleanup_dataframe(barcodes_df)  # clean-up dataframe

    #------------------------------
    # ignore (drop) all of the columns from the gene-expression matrix
    #that don't have corresponding Participant and Sample barcodes,
    #------------------------------
    columns_df = pd.DataFrame(unique_columns)
    columns_df.columns = ['CCLE_long_name']
    samples_map_df = pd.merge(columns_df,
                              barcodes_df,
                              on='CCLE_long_name',
                              how='inner')
    samples_map_df.to_excel(writer, sheet_name="sample_info")

    # select columns that are overlapping
    overlapping_samples = samples_map_df['CCLE_long_name'].tolist()
    overlapping_samples = overlapping_samples + [
        'Name', 'Description', 'HGNC_gene_symbol'
    ]
    gct_df = gct_df[overlapping_samples]
    print gct_df

    # melt the matrix
    value_vars = [
        col for col in gct_df.columns
        if col not in ['Name', 'Description', 'HGNC_gene_symbol']
    ]
    melted_df = pd.melt(gct_df,
                        id_vars=['Name', 'Description', 'HGNC_gene_symbol'],
                        value_vars=value_vars)
    melted_df = melted_df.rename(
        columns={
            'Name': 'gene_id',
            'Description': 'original_gene_symbol',
            'variable': 'CCLE_long_name',
            'value': 'RMA_normalized_expression'
        })

    # merge to get barcode information
    # Changed from an outer join to an inner join. In this case it shouldn't matter,
    # since we already did an inner join when selecting the samples above.
    data_df = pd.merge(melted_df,
                       samples_map_df,
                       on='CCLE_long_name',
                       how='inner')
    data_df['Platform'] = "Affymetrix U133 Plus 2.0"

    # reorder columns
    col_order = [
        "ParticipantBarcode", "SampleBarcode", "CCLE_long_name", "gene_id",
        "HGNC_gene_symbol", "original_gene_symbol", "Platform",
        "RMA_normalized_expression"
    ]
    data_df = data_df[col_order]

    # upload the contents of the dataframe in CSV format
    print "Convert to CSV"
    outfilename = "tcga/intermediary/CCLE_mrna_expr/bq_data_files/ccle_mrna_expr.csv"
    df_string = data_df.to_csv(index=False, header=False)
    status = gcs.upload_blob_from_string(outfilename, df_string)
    print status

    # save the excel file
    writer.save()
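The melt step in main() reshapes the wide expression matrix into one row per gene/sample pair. A toy sketch of the same pd.melt call on made-up data:

import pandas as pd

# Toy wide matrix standing in for the GCT expression data.
wide = pd.DataFrame({'Name': ['G1', 'G2'],
                     'SAMPLE_A': [1.2, 3.4],
                     'SAMPLE_B': [5.6, 7.8]})
long_df = pd.melt(wide, id_vars=['Name'],
                  value_vars=['SAMPLE_A', 'SAMPLE_B'],
                  var_name='CCLE_long_name',
                  value_name='RMA_normalized_expression')
# long_df has one row per (Name, sample) pair.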
def process_user_gen_files(project_id, user_project_id, study_id, bucket_name, bq_dataset, cloudsql_tables, files):

    print 'Begin processing user_gen files.'

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)
    data_df = pd.DataFrame()

    # Collect all columns that get passed in for generating BQ schema later
    all_columns = []

    # For each file, download, convert to df
    for idx, file in enumerate(files):
        blob_name = '/'.join(file['FILENAME'].split('/')[1:])  # object path without the leading bucket component
        all_columns += file['COLUMNS']

        metadata = {
            'sample_barcode': file.get('SAMPLEBARCODE', ''),
            'participant_barcode': file.get('PARTICIPANTBARCODE', ''),
            'study_id': study_id,
            'platform': file.get('PLATFORM', ''),
            'pipeline': file.get('PIPELINE', ''),
            'file_path': file['FILENAME'],
            'file_name': file['FILENAME'].split('/')[-1],
            'data_type': file['DATATYPE']
        }

        # download, convert to df
        filebuffer = gcs.download_blob_to_file(blob_name)

        # Get column mapping
        column_mapping = get_column_mapping(file['COLUMNS'])
        if idx == 0:
            data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)
            data_df = cleanup_dataframe(data_df)
            data_df.rename(columns=column_mapping, inplace=True)

            # Generate Metadata for this file
            insert_metadata(data_df, metadata, cloudsql_tables['METADATA_DATA'])

        else:
            # convert blob into dataframe
            new_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)
            new_df = cleanup_dataframe(new_df)
            new_df.rename(columns=column_mapping, inplace=True)

            # Generate Metadata for this file
            insert_metadata(new_df, metadata, cloudsql_tables['METADATA_DATA'])

            # TODO: Write function to check for participant barcodes, for now, we assume each file contains SampleBarcode Mapping
            data_df = pd.merge(data_df, new_df, on='sample_barcode', how='outer')

    # For complete dataframe, create metadata_samples rows
    print 'Inserting data into {0}.'.format(cloudsql_tables['METADATA_SAMPLES'])
    data_df = cleanup_dataframe(data_df)
    data_df['has_mrna'] = 0
    data_df['has_mirna'] = 0
    data_df['has_protein'] = 0
    data_df['has_meth'] = 0
    insert_metadata_samples(data_df, cloudsql_tables['METADATA_SAMPLES'])

    # Update and create bq table file
    temp_outfile = cloudsql_tables['METADATA_SAMPLES'] + '.out'
    tmp_bucket = os.environ.get('tmp_bucket')
    gcs.convert_df_to_njson_and_upload(data_df, temp_outfile, tmp_bucket=tmp_bucket)

    # Use a temporary file location (in case we don't have write permissions on the user's bucket)
    source_path = 'gs://' + tmp_bucket + '/' + temp_outfile

    schema = generate_bq_schema(all_columns)
    table_name = 'cgc_user_{0}_{1}'.format(user_project_id, study_id)
    load_data_from_file.run(
        project_id,
        bq_dataset,
        table_name,
        schema,
        source_path,
        source_format='NEWLINE_DELIMITED_JSON',
        write_disposition='WRITE_APPEND',
        is_schema_file=False)

    # Generate feature_defs
    feature_defs = generate_feature_defs(study_id, project_id, bq_dataset, table_name, schema)

    # Update feature_defs table
    insert_feature_defs_list(cloudsql_tables['FEATURE_DEFS'], feature_defs)

    # Delete temporary files
    print 'Deleting temporary file {0}'.format(temp_outfile)
    gcs = GcsConnector(project_id, tmp_bucket)
    gcs.delete_blob(temp_outfile)
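generate_bq_schema is another project helper not shown here. A hypothetical sketch of building a simple all-STRING BigQuery schema from the collected column names; the project's actual helper may infer types and handle duplicates differently:

# Hypothetical sketch only; not the project's actual generate_bq_schema.
def generate_bq_schema(all_columns):
    seen = set()
    fields = []
    for col in all_columns:
        if col in seen:
            continue
        seen.add(col)
        fields.append({'name': col, 'type': 'STRING', 'mode': 'NULLABLE'})
    return fields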
def parse_file(project_id, bq_dataset, bucket_name, file_data, filename, outfilename, metadata, cloudsql_tables):

    print 'Begin processing {0}.'.format(filename)

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #main steps: download, convert to df, cleanup, transform, add metadata
    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)

    # clean-up dataframe
    data_df = cleanup_dataframe(data_df)
    new_df_data = []

    map_values = {}

    # Get basic column information depending on datatype
    column_map = get_column_mapping(metadata['data_type'])

    # Column headers are sample ids
    for i, j in data_df.iteritems():
        if i in column_map.keys():
            map_values[column_map[i]] = [k for d, k in j.iteritems()]

        else:
            for k, m in j.iteritems():
                new_df_obj = {}

                new_df_obj['sample_barcode'] = i # Normalized to match user_gen
                new_df_obj['project_id'] = metadata['project_id']
                new_df_obj['study_id'] = metadata['study_id']
                new_df_obj['Platform'] = metadata['platform']
                new_df_obj['Pipeline'] = metadata['pipeline']

                # Optional values
                new_df_obj['Symbol'] = map_values['Symbol'][k] if 'Symbol' in map_values.keys() else ''
                new_df_obj['ID'] = map_values['ID'][k] if 'ID' in map_values.keys() else ''
                new_df_obj['TAB'] = map_values['TAB'][k] if 'TAB' in map_values.keys() else ''

                new_df_obj['Level'] = m
                new_df_data.append(new_df_obj)
    new_df = pd.DataFrame(new_df_data)

    # Get unique barcodes and update metadata_data table
    sample_barcodes = list(set(new_df['sample_barcode'].tolist()))
    sample_metadata_list = []
    for barcode in sample_barcodes:
        new_metadata = metadata.copy()
        new_metadata['sample_barcode'] = barcode
        sample_metadata_list.append(new_metadata)
    update_metadata_data_list(cloudsql_tables['METADATA_DATA'], sample_metadata_list)

    # Update metadata_samples table
    update_molecular_metadata_samples_list(cloudsql_tables['METADATA_SAMPLES'], metadata['data_type'], sample_barcodes)

    # Generate feature names and bq_mappings
    table_name = file_data['BIGQUERY_TABLE_NAME']
    feature_defs = generate_feature_Defs(metadata['data_type'], metadata['study_id'], project_id, bq_dataset, table_name, new_df)

    # Update feature_defs table
    insert_feature_defs_list(cloudsql_tables['FEATURE_DEFS'], feature_defs)

    # upload the contents of the dataframe in njson format
    tmp_bucket = os.environ.get('tmp_bucket')
    gcs.convert_df_to_njson_and_upload(new_df, outfilename, metadata=metadata, tmp_bucket=tmp_bucket)

    # Load into BigQuery
    # Using temporary file location (in case we don't have write permissions on user's bucket?)
    source_path = 'gs://' + tmp_bucket + '/' + outfilename
    schema = get_molecular_schema()

    load_data_from_file.run(
        project_id,
        bq_dataset,
        table_name,
        schema,
        source_path,
        source_format='NEWLINE_DELIMITED_JSON',
        write_disposition='WRITE_APPEND',
        is_schema_file=False)

    # Delete temporary files
    print 'Deleting temporary file {0}'.format(outfilename)
    gcs = GcsConnector(project_id, tmp_bucket)
    gcs.delete_blob(outfilename)
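convert_df_to_njson_and_upload is a GcsConnector method defined elsewhere; the newline-delimited JSON format it implies for the BigQuery load can be sketched directly with pandas, as an assumption about the format rather than the connector's code:

import pandas as pd

# One JSON object per line, the format the NEWLINE_DELIMITED_JSON load expects.
df = pd.DataFrame([{'sample_barcode': 'TCGA-AA-0001', 'Level': 0.5},
                   {'sample_barcode': 'TCGA-AA-0002', 'Level': 0.7}])
njson_string = df.to_json(orient='records', lines=True)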
def parse_file(project_id, bq_dataset, bucket_name, file_data, filename,
               outfilename, metadata, cloudsql_tables):

    print 'Begin processing {0}.'.format(filename)

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #main steps: download, convert to df, cleanup, transform, add metadata
    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)

    # clean-up dataframe
    data_df = cleanup_dataframe(data_df)
    new_df_data = []

    map_values = {}

    # Get basic column information depending on datatype
    column_map = get_column_mapping(metadata['data_type'])

    # Column headers are sample ids
    for i, j in data_df.iteritems():
        if i in column_map.keys():
            map_values[column_map[i]] = [k for d, k in j.iteritems()]

        else:
            for k, m in j.iteritems():
                new_df_obj = {}

                new_df_obj['sample_barcode'] = i  # Normalized to match user_gen
                new_df_obj['Project'] = metadata['project_id']
                new_df_obj['Study'] = metadata['study_id']
                new_df_obj['Platform'] = metadata['platform']
                new_df_obj['Pipeline'] = metadata['pipeline']

                # Optional values
                new_df_obj['Symbol'] = map_values['Symbol'][k] if 'Symbol' in map_values else ''
                new_df_obj['ID'] = map_values['ID'][k] if 'ID' in map_values else ''
                new_df_obj['TAB'] = map_values['TAB'][k] if 'TAB' in map_values else ''

                new_df_obj['Level'] = m
                new_df_data.append(new_df_obj)
    new_df = pd.DataFrame(new_df_data)

    # Get unique barcodes and update metadata_data table
    sample_barcodes = list(set(new_df['sample_barcode'].tolist()))
    sample_metadata_list = []
    for barcode in sample_barcodes:
        new_metadata = metadata.copy()
        new_metadata['sample_barcode'] = barcode
        sample_metadata_list.append(new_metadata)
    update_metadata_data_list(cloudsql_tables['METADATA_DATA'],
                              sample_metadata_list)

    # Update metadata_samples table
    update_molecular_metadata_samples_list(cloudsql_tables['METADATA_SAMPLES'],
                                           metadata['data_type'],
                                           sample_barcodes)

    # Generate feature names and bq_mappings
    table_name = file_data['BIGQUERY_TABLE_NAME']
    feature_defs = generate_feature_Defs(metadata['data_type'],
                                         metadata['study_id'], project_id,
                                         bq_dataset, table_name, new_df)

    # Update feature_defs table
    insert_feature_defs_list(cloudsql_tables['FEATURE_DEFS'], feature_defs)

    # upload the contents of the dataframe in njson format
    tmp_bucket = os.environ.get('tmp_bucket_location')
    gcs.convert_df_to_njson_and_upload(new_df,
                                       outfilename,
                                       metadata=metadata,
                                       tmp_bucket=tmp_bucket)

    # Load into BigQuery
    # Using temporary file location (in case we don't have write permissions on user's bucket?)
    source_path = 'gs://' + tmp_bucket + '/' + outfilename
    schema = get_molecular_schema()

    load_data_from_file.run(project_id,
                            bq_dataset,
                            table_name,
                            schema,
                            source_path,
                            source_format='NEWLINE_DELIMITED_JSON',
                            write_disposition='WRITE_APPEND',
                            is_schema_file=False)

    # Delete temporary files
    print 'Deleting temporary file {0}'.format(outfilename)
    gcs = GcsConnector(project_id, tmp_bucket)
    gcs.delete_blob(outfilename)
def process_user_gen_files(project_id, user_project_id, study_id, bucket_name, bq_dataset, cloudsql_tables, files):

    print 'Begin processing user_gen files.'

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)
    data_df = pd.DataFrame()

    # Collect all columns that get passed in for generating BQ schema later
    all_columns = []

    # For each file, download, convert to df
    for idx, file in enumerate(files):
        blob_name = '/'.join(file['FILENAME'].split('/')[1:])  # object path without the leading bucket component
        all_columns += file['COLUMNS']

        metadata = {
            'sample_barcode': file.get('SAMPLEBARCODE', ''),
            'participant_barcode': file.get('PARTICIPANTBARCODE', ''),
            'study_id': study_id,
            'platform': file.get('PLATFORM', ''),
            'pipeline': file.get('PIPELINE', ''),
            'file_path': file['FILENAME'],
            'file_name': file['FILENAME'].split('/')[-1],
            'data_type': file['DATATYPE']
        }

        # download, convert to df
        filebuffer = gcs.download_blob_to_file(blob_name)

        # Get column mapping
        column_mapping = get_column_mapping(file['COLUMNS'])
        if idx == 0:
            data_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)
            data_df = cleanup_dataframe(data_df)
            data_df.rename(columns=column_mapping, inplace=True)

            # Generate Metadata for this file
            insert_metadata(data_df, metadata, cloudsql_tables['METADATA_DATA'])

        else:
            # convert blob into dataframe
            new_df = convert_file_to_dataframe(filebuffer, skiprows=0, header=0)
            new_df = cleanup_dataframe(new_df)
            new_df.rename(columns=column_mapping, inplace=True)

            # Generate Metadata for this file
            insert_metadata(new_df, metadata, cloudsql_tables['METADATA_DATA'])

            # TODO: Write function to check for participant barcodes, for now, we assume each file contains SampleBarcode Mapping
            data_df = pd.merge(data_df, new_df, on='sample_barcode', how='outer')

    # For complete dataframe, create metadata_samples rows
    print 'Inserting data into {0}.'.format(cloudsql_tables['METADATA_SAMPLES'])
    data_df = cleanup_dataframe(data_df)
    data_df['has_mrna'] = 0
    data_df['has_mirna'] = 0
    data_df['has_protein'] = 0
    data_df['has_meth'] = 0
    insert_metadata_samples(data_df, cloudsql_tables['METADATA_SAMPLES'])

    # Update and create bq table file
    temp_outfile = cloudsql_tables['METADATA_SAMPLES'] + '.out'
    tmp_bucket = os.environ.get('tmp_bucket_location')
    gcs.convert_df_to_njson_and_upload(data_df, temp_outfile, tmp_bucket=tmp_bucket)

    # Use a temporary file location (in case we don't have write permissions on the user's bucket)
    source_path = 'gs://' + tmp_bucket + '/' + temp_outfile

    schema = generate_bq_schema(all_columns)
    table_name = 'cgc_user_{0}_{1}'.format(user_project_id, study_id)
    load_data_from_file.run(
        project_id,
        bq_dataset,
        table_name,
        schema,
        source_path,
        source_format='NEWLINE_DELIMITED_JSON',
        write_disposition='WRITE_APPEND',
        is_schema_file=False)

    # Generate feature_defs
    feature_defs = generate_feature_defs(study_id, project_id, bq_dataset, table_name, schema)

    # Update feature_defs table
    insert_feature_defs_list(cloudsql_tables['FEATURE_DEFS'], feature_defs)

    # Delete temporary files
    print 'Deleting temporary file {0}'.format(temp_outfile)
    gcs = GcsConnector(project_id, tmp_bucket)
    gcs.delete_blob(temp_outfile)