Ejemplo n.º 1
0
def get_gistic_genes(data_path, cancer, filter_with_rna=True,
                     collapse_on_bands=True, min_patients=5):
    """
    Get a matrix of high-grade amplification and homozygous deletion events.

    Rows of the GISTIC gene matrix are kept as deletions when strictly more
    than `min_patients` of their entries equal -2, and as amplifications
    when strictly more than `min_patients` of their entries equal 2.
    The list is optionally filtered down by asserting that a copy number
    event corresponds with a resultant expression change.
    The final matrix merges gene-level events on the same band to combine
    redundant events and reduce the test space.

    Parameters
    ----------
    data_path, cancer
        Passed through unchanged to the `FH` loader functions.
    filter_with_rna : bool
        If true, load RNA-seq data with `FH.read_rnaSeq` and pass each event
        matrix through `rna_filter`.
    collapse_on_bands : bool
        If false, return the gene-level events without merging them.
    min_patients : int
        A row is kept only if its event count is strictly greater than this.

    Returns
    -------
    pandas.DataFrame
        Amplification rows followed by deletion rows, indexed by a
        three-level MultiIndex.  Without collapsing, the levels are
        (event type, s[0], s[2]) taken from the original index tuples
        (presumably band and gene name -- confirm against
        `FH.get_gistic_gene_matrix`).  With collapsing, there is one row per
        (event type, s[0]) group holding the rounded group mean, and the
        third level is the tuple of merged third-level labels.
    """
    # NOTE(review): '01' looks like a sample-type code -- confirm against FH.
    gistic = FH.get_gistic_gene_matrix(data_path, cancer, '01')
    deletion = gistic[(gistic == -2).sum(1) > min_patients]
    amp = gistic[(gistic == 2).sum(1) > min_patients]

    from_tuples = pd.MultiIndex.from_tuples  # short alias for a long name
    deletion.index = from_tuples([('Deletion', s[0], s[2])
                                  for s in deletion.index])
    amp.index = from_tuples([('Amplification', s[0], s[2])
                             for s in amp.index])

    if filter_with_rna:
        rna = FH.read_rnaSeq(data_path, cancer)
        deletion = rna_filter(deletion, -2, rna)
        amp = rna_filter(amp, 2, rna)

    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows
    # in the same order (amplifications first, then deletions).
    cna_genes = pd.concat([amp, deletion])
    if not collapse_on_bands:
        return cna_genes

    # Merge every (event type, second-level label) group into a single row:
    # the rounded mean of its members, keyed by the tuple of merged labels.
    collapsed = {}
    for (event, band), members in cna_genes.groupby(level=[0, 1]):
        merged = tuple(members.index.get_level_values(2))
        collapsed[(event, band, merged)] = members.mean().round()
    cna_genes = pd.DataFrame(collapsed).T
    cna_genes.index = pd.MultiIndex.from_tuples(cna_genes.index)
    return cna_genes
Ejemplo n.º 2
0
def get_gistic_genes(data_path,
                     cancer,
                     filter_with_rna=True,
                     collapse_on_bands=True,
                     min_patients=5):
    """
    Get a matrix of high-grade amplification and homozygous deletion events.

    Rows of the GISTIC gene matrix are kept as deletions when strictly more
    than `min_patients` of their entries equal -2, and as amplifications
    when strictly more than `min_patients` of their entries equal 2.
    The list is optionally filtered down by asserting that a copy number
    event corresponds with a resultant expression change.
    The final matrix merges gene-level events on the same band to combine
    redundant events and reduce the test space.

    Parameters
    ----------
    data_path, cancer
        Passed through unchanged to the `FH` loader functions.
    filter_with_rna : bool
        If true, load RNA-seq data with `FH.read_rnaSeq` and pass each event
        matrix through `rna_filter`.
    collapse_on_bands : bool
        If false, return the gene-level events without merging them.
    min_patients : int
        A row is kept only if its event count is strictly greater than this.

    Returns
    -------
    pandas.DataFrame
        Amplification rows followed by deletion rows, indexed by a
        three-level MultiIndex.  Without collapsing, the levels are
        (event type, s[0], s[2]) taken from the original index tuples
        (presumably band and gene name -- confirm against
        `FH.get_gistic_gene_matrix`).  With collapsing, there is one row per
        (event type, s[0]) group holding the rounded group mean, and the
        third level is the tuple of merged third-level labels.
    """
    # NOTE(review): '01' looks like a sample-type code -- confirm against FH.
    gistic = FH.get_gistic_gene_matrix(data_path, cancer, '01')
    deletion = gistic[(gistic == -2).sum(1) > min_patients]
    amp = gistic[(gistic == 2).sum(1) > min_patients]

    from_tuples = pd.MultiIndex.from_tuples  # short alias for a long name
    deletion.index = from_tuples(
        [('Deletion', s[0], s[2]) for s in deletion.index])
    amp.index = from_tuples(
        [('Amplification', s[0], s[2]) for s in amp.index])

    if filter_with_rna:
        rna = FH.read_rnaSeq(data_path, cancer)
        deletion = rna_filter(deletion, -2, rna)
        amp = rna_filter(amp, 2, rna)

    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows
    # in the same order (amplifications first, then deletions).
    cna_genes = pd.concat([amp, deletion])
    if not collapse_on_bands:
        return cna_genes

    # Merge every (event type, second-level label) group into a single row:
    # the rounded mean of its members, keyed by the tuple of merged labels.
    collapsed = {}
    for (event, band), members in cna_genes.groupby(level=[0, 1]):
        merged = tuple(members.index.get_level_values(2))
        collapsed[(event, band, merged)] = members.mean().round()
    cna_genes = pd.DataFrame(collapsed).T
    cna_genes.index = pd.MultiIndex.from_tuples(cna_genes.index)
    return cna_genes
Ejemplo n.º 3
0
def get_cna_rates(data_path, cancer, patients=None):
    """
    Get copy-number aberration rates from the GISTIC processing pipeline.

    This function depends on the current Firehose output of this program
    as of July 2013.

    Parameters
    ----------
    data_path, cancer
        Passed through unchanged to the `FH` loader functions.
    patients : sequence of labels, optional
        If given, the result is restricted to these row labels, in this
        order.  Labels missing from the data, and any row with a missing
        value, are dropped.

    Returns
    -------
    pandas.DataFrame
        One column per statistic: `gene_amp`, `gene_amp_high`, `gene_del`,
        `gene_del_homo`, `lesion_amp`, `lesion_amp_high`, `lesion_del`,
        `lesion_del_homo` and `chrom_instability`.  Rows correspond to the
        columns of the GISTIC tables (presumably patients -- confirm
        against the `FH` loaders).
    """
    # Gene-level counts: number of rows per column hitting each threshold.
    gistic = FH.get_gistic_gene_matrix(data_path, cancer)
    amp_gene_all = (gistic >= 1).astype(int).sum()
    amp_gene_high = (gistic == 2).astype(int).sum()
    del_gene_all = (gistic <= -1).astype(int).sum()
    del_gene_homo = (gistic <= -2).astype(int).sum()

    # Lesion-level counts.  The `.ix` indexer was removed in pandas 1.0;
    # `.loc` performs the same label-based selection of each event type.
    lesions = FH.get_gistic_lesions(data_path, cancer)
    amplifications = lesions.loc['Amplification']
    deletions = lesions.loc['Deletion']
    amp_lesion_all = (amplifications >= 1).sum()
    amp_lesion_high = (amplifications == 2).sum()
    del_lesion_all = (deletions <= -1).sum()
    del_lesion_homo = (deletions == -2).sum()

    # Mean absolute arm-level value per column.
    arm_cn = FH.get_gistic_arm_values(data_path, cancer)
    chromosomal_instability = arm_cn.abs().mean()

    cna_df = {'gene_amp': amp_gene_all, 'gene_amp_high': amp_gene_high,
              'gene_del': del_gene_all, 'gene_del_homo': del_gene_homo,
              'lesion_amp': amp_lesion_all, 'lesion_amp_high': amp_lesion_high,
              'lesion_del': del_lesion_all, 'lesion_del_homo': del_lesion_homo,
              'chrom_instability': chromosomal_instability}
    cna_df = pd.DataFrame(cna_df)
    if patients is not None:
        # reindex yields all-NaN rows for unknown labels (as the old `.ix`
        # list lookup did), which dropna() then removes; `.loc` would raise
        # KeyError on a missing label instead.
        cna_df = cna_df.reindex(patients).dropna()
    return cna_df
Ejemplo n.º 4
0
def get_cna_rates(data_path, cancer, patients=None):
    """
    Get copy-number aberration rates from the GISTIC processing pipeline.

    This function depends on the current Firehose output of this program
    as of July 2013.

    Parameters
    ----------
    data_path, cancer
        Passed through unchanged to the `FH` loader functions.
    patients : sequence of labels, optional
        If given, the result is restricted to these row labels, in this
        order.  Labels missing from the data, and any row with a missing
        value, are dropped.

    Returns
    -------
    pandas.DataFrame
        One column per statistic: `gene_amp`, `gene_amp_high`, `gene_del`,
        `gene_del_homo`, `lesion_amp`, `lesion_amp_high`, `lesion_del`,
        `lesion_del_homo` and `chrom_instability`.  Rows correspond to the
        columns of the GISTIC tables (presumably patients -- confirm
        against the `FH` loaders).
    """
    # Gene-level counts: number of rows per column hitting each threshold.
    gistic = FH.get_gistic_gene_matrix(data_path, cancer)
    amp_gene_all = (gistic >= 1).astype(int).sum()
    amp_gene_high = (gistic == 2).astype(int).sum()
    del_gene_all = (gistic <= -1).astype(int).sum()
    del_gene_homo = (gistic <= -2).astype(int).sum()

    # Lesion-level counts.  The `.ix` indexer was removed in pandas 1.0;
    # `.loc` performs the same label-based selection of each event type.
    lesions = FH.get_gistic_lesions(data_path, cancer)
    amplifications = lesions.loc['Amplification']
    deletions = lesions.loc['Deletion']
    amp_lesion_all = (amplifications >= 1).sum()
    amp_lesion_high = (amplifications == 2).sum()
    del_lesion_all = (deletions <= -1).sum()
    del_lesion_homo = (deletions == -2).sum()

    # Mean absolute arm-level value per column.
    arm_cn = FH.get_gistic_arm_values(data_path, cancer)
    chromosomal_instability = arm_cn.abs().mean()

    cna_df = {
        'gene_amp': amp_gene_all,
        'gene_amp_high': amp_gene_high,
        'gene_del': del_gene_all,
        'gene_del_homo': del_gene_homo,
        'lesion_amp': amp_lesion_all,
        'lesion_amp_high': amp_lesion_high,
        'lesion_del': del_lesion_all,
        'lesion_del_homo': del_lesion_homo,
        'chrom_instability': chromosomal_instability,
    }
    cna_df = pd.DataFrame(cna_df)
    if patients is not None:
        # reindex yields all-NaN rows for unknown labels (as the old `.ix`
        # list lookup did), which dropna() then removes; `.loc` would raise
        # KeyError on a missing label instead.
        cna_df = cna_df.reindex(patients).dropna()
    return cna_df