def parse_methylation(project_id, bucket_name, filename, outfilename,
                      metadata):
    """Download and convert blob into dataframe
       Transform the file: includes data cleaning
       Add Metadata information
    """
    # setup logging
    configure_logging('methylation',
                      "logs/" + metadata['AliquotBarcode'] + '.log')

    # connect to the cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    # main steps: download, convert to dataframe, clean up, transform, add metadata
    data_df = gcutils.convert_blob_to_dataframe(gcs,
                                                project_id,
                                                bucket_name,
                                                filename,
                                                skiprows=1)
    data_df.columns = [
        'Probe_Id', 'Beta_Value', 'Gene_Symbol', 'Chromosome',
        'Genomic_Coordinate'
    ]

    data_df = add_metadata(data_df, metadata)
    data_df = additional_changes(data_df)

    # upload the contents of the dataframe in CSV format
    df_string = data_df.to_csv(index=False, header=False, float_format='%.2f')
    status = gcs.upload_blob_from_string(outfilename, df_string)

    return status
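
# Example invocation (hypothetical values; real callers supply these from the
# ETL configuration and per-aliquot metadata):
#
#   metadata = {'AliquotBarcode': 'TCGA-XX-XXXX-XXX-XXX-XXXX-XX'}
#   parse_methylation('my-project-id', 'my-bucket',
#                     'path/to/methylation_file.txt',
#                     'path/to/output_file.csv', metadata)
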
def generate_oncotator_inputfiles(project_id, bucket_name, filename,
                                  outputfilename, oncotator_columns):
    """Download a MAF blob, filter and rename its columns,
       and upload a tab-separated Oncotator input file.
    """
    print(filename)

    # open a new GCS connection
    gcs = GcsConnector(project_id, bucket_name)

    filebuffer = gcs.download_blob_to_file(filename)

    # convert blob into dataframe
    try:
        maf_df = convert_file_to_dataframe(filebuffer)
    except Exception:
        print('problem converting %s to a dataframe' % filename)
        raise

    # clean-up dataframe
    maf_df = cleanup_dataframe(maf_df)

    print(maf_df.columns)
    # lowercase the column names so the filter and rename keys below match
    maf_df.columns = [col.lower() for col in maf_df.columns]

    #--------------------------------------------
    # data manipulation: normalize genome-build labels
    # (hg19, GRCh37, and GRCh37-lite all map to build '37')
    #--------------------------------------------
    maf_df["ncbi_build"] = maf_df["ncbi_build"].replace({
        'hg19': '37',
        'GRCh37': '37',
        'GRCh37-lite': '37'
    })

    #---------------------------------------------
    # filters (remember: all the column names are lowercase)
    #---------------------------------------------
    filters = {
        "chromosome": map(str, range(1, 23)) + ['X', 'Y'],
        "mutation_status": ['somatic', 'Somatic'],
        "sequencer": ['Illumina HiSeq', 'Illumina GAIIx', 'Illumina MiSeq'],
        "ncbi_build": ['37']
    }

    # DataFrame.isin with a dict checks each named column against its own
    # list of allowed values, producing a boolean dataframe
    filter_checklist_df = maf_df.isin(filters)

    # keep only the rows that pass every filter
    filter_mask = (filter_checklist_df["chromosome"]
                   & filter_checklist_df["mutation_status"]
                   & filter_checklist_df["sequencer"]
                   & filter_checklist_df["ncbi_build"])

    maf_df = maf_df[filter_mask]

    #---------------------
    #Oncotator part: generate intermediate files for Oncotator
    #---------------------

    # oncotator needs these columns
    replace_column_names = {
        "ncbi_build": 'build',
        'chromosome': 'chr',
        'start_position': 'start',
        'end_position': 'end',
        'reference_allele': 'ref_allele',
        'tumor_seq_allele1': 'tum_allele1',
        'tumor_seq_allele2': 'tum_allele2',
        'tumor_sample_barcode': 'tumor_barcode',
        'matched_norm_sample_barcode': 'normal_barcode'
    }

    # replace columns with new headings; just a name change
    maf_df.columns = [replace_column_names.get(col, col)
                      for col in maf_df.columns]
    oncotator_columns = [replace_column_names.get(col, col)
                         for col in oncotator_columns]

    # remove/mangle any duplicate column names (renaming them like a, a.1,
    # a.2, etc.; see the sketch after this function)
    maf_df.columns = mangle_dupe_cols(maf_df.columns.values)

    #---------------------
    # Oncotator part: select the expected columns and write a TSV
    #---------------------

    oncotator_df = maf_df[oncotator_columns]

    print "df_columns", len(oncotator_df.columns)

    # to_csv with no path returns the file contents as a string
    df_string = oncotator_df.to_csv(sep='\t',
                                    index=False,
                                    columns=oncotator_columns)

    # upload the file
    gcs.upload_blob_from_string(outputfilename, df_string)

    return True
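
# Note: mangle_dupe_cols (defined elsewhere in this repo) is assumed to
# deduplicate column names the way pandas does, e.g. ['a', 'a', 'b'] ->
# ['a', 'a.1', 'b']. A minimal sketch under that assumption:
#
#   def mangle_dupe_cols(columns):
#       counts = {}
#       mangled = []
#       for col in columns:
#           n = counts.get(col, 0)
#           mangled.append(col if n == 0 else '%s.%d' % (col, n))
#           counts[col] = n + 1
#       return mangled
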
def main():
    """Parse GCT file, merge with barcodes info, melt(tidy) 
        and load to Google Storage and BigQuery
    """

    project_id = ''
    bucket_name = ''
    # example file in bucket
    filename = 'ccle/mRNA-gene-exp/CCLE_Expression_Entrez_2012-09-29.gct'
    outfilename = 'test'
    writer = ExcelWriter('ccle.xlsx')

    # connect to the google cloud bucket
    gcs = GcsConnector(project_id, bucket_name)

    #------------------------------
    # load the GCT file
    #  * the file has duplicate samples (columns)
    #------------------------------
    # To remove duplicates, load only the first few lines of the file, take
    # the unique column names, and re-select the dataframe with them.
    # This is a hack, but I can't find an elegant way to remove duplicates.
    gct_df = pd.read_table('CCLE_Expression_Entrez_2012-09-29.gct',
                           sep='\t',
                           skiprows=2,
                           mangle_dupe_cols=False,
                           nrows=2)
    unique_columns = np.unique(gct_df.columns)
    gct_df = pd.read_table('CCLE_Expression_Entrez_2012-09-29.gct',
                           sep='\t',
                           skiprows=2,
                           mangle_dupe_cols=True)
    # clean-up the dataset/dataframe
    gct_df = cleanup_dataframe(gct_df)
    gct_df = gct_df[unique_columns]

    # archive rows whose gene_id starts with 'AFFX-' (Affymetrix control
    # probes) to the spreadsheet, then drop them from the matrix
    gct_df[gct_df['Name'].str.startswith('AFFX-')].to_excel(
        writer, sheet_name="affy_info")
    gct_df = gct_df[~gct_df['Name'].str.startswith('AFFX-')]

    #------------------------------
    # HGNC validation
    #-----------------------------
    hgnc_df = hgnc_validation.get_hgnc_map()
    hgnc_df.to_excel(writer, sheet_name="hgnc_info")
    hgnc_dict = dict(zip(hgnc_df.entrez_id, hgnc_df.symbol))
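    # 'Name' values appear to be Affymetrix-style ids of the form
    # '<entrez_id>_at'; strip the suffix to look up the HGNC symbol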
    gct_df['HGNC_gene_symbol'] = gct_df['Name'].map(
        lambda gene_id: hgnc_dict.get(gene_id.replace('_at', ''), np.nan))
    gct_df[['HGNC_gene_symbol', 'Name',
            'Description']].to_excel(writer, sheet_name="gene_info")
    gct_df['Name'] = gct_df['Name'].map(
        lambda gene_id: gene_id.replace('_at', ''))

    #------------------------------
    # barcodes info
    #------------------------------
    barcodes_filename = 'ccle/mRNA-gene-exp/mRNA_names.out.tsv'
    filebuffer = gcs.download_blob_to_file(barcodes_filename)
    barcodes_df = pd.read_table(
        filebuffer,
        header=None,
        names=['ParticipantBarcode', 'SampleBarcode',
               'CCLE_long_name'])  # convert into dataframe
    barcodes_df = cleanup_dataframe(barcodes_df)  # clean-up dataframe

    #------------------------------
    # ignore (drop) all of the columns from the gene-expression matrix
    # that don't have corresponding Participant and Sample barcodes
    #------------------------------
    columns_df = pd.DataFrame(unique_columns)
    columns_df.columns = ['CCLE_long_name']
    samples_map_df = pd.merge(columns_df,
                              barcodes_df,
                              on='CCLE_long_name',
                              how='inner')
    samples_map_df.to_excel(writer, sheet_name="sample_info")

    # select columns that are overlapping
    overlapping_samples = samples_map_df['CCLE_long_name'].tolist()
    overlapping_samples = overlapping_samples + [
        'Name', 'Description', 'HGNC_gene_symbol'
    ]
    gct_df = gct_df[overlapping_samples]
    print(gct_df)

    # melt the matrix
    value_vars = [
        col for col in gct_df.columns
        if col not in ['Name', 'Description', 'HGNC_gene_symbol']
    ]
    melted_df = pd.melt(gct_df,
                        id_vars=['Name', 'Description', 'HGNC_gene_symbol'],
                        value_vars=value_vars)
    melted_df = melted_df.rename(
        columns={
            'Name': 'gene_id',
            'Description': 'original_gene_symbol',
            'variable': 'CCLE_long_name',
            'value': 'RMA_normalized_expression'
        })

    # merge to get barcode information
    # changed from outer join to inner join; it shouldn't matter here, since
    # we already did an inner join while selecting the samples above
    data_df = pd.merge(melted_df,
                       samples_map_df,
                       on='CCLE_long_name',
                       how='inner')
    data_df['Platform'] = "Affymetrix U133 Plus 2.0"

    # reorder columns
    col_order = [
        "ParticipantBarcode", "SampleBarcode", "CCLE_long_name", "gene_id",
        "HGNC_gene_symbol", "original_gene_symbol", "Platform",
        "RMA_normalized_expression"
    ]
    data_df = data_df[col_order]

    # upload the contents of the dataframe in CSV format
    print "Convert to CSV"
    outfilename = "tcga/intermediary/CCLE_mrna_expr/bq_data_files/ccle_mrna_expr.csv"
    df_string = data_df.to_csv(index=False, header=False)
    status = gcs.upload_blob_from_string(outfilename, df_string)
    print(status)

    # save the excel file
    writer.save()
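

# entry-point guard so the pipeline runs when the file is executed directly
if __name__ == '__main__':
    main()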