Python match_series Exemples, Processing.Helpers.match_series Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : Survival.py Projet : anyone1985/TCGA_Working

def interaction_empirical_p_resample(a, b, surv, num_perm=101, check_first=True):
    '''
    Calculate an empirical p-value for an interaction by sampling
    with replacement.

    We first test if there is an improvement in model fit by
    considering the interaction of the two events.  If so, we
    then derive an empirical p-value.

    a, b: Series of event calls; aligned to each other with match_series
    surv: stacked survival Series; it is unstacked into two columns that
          are relabelled 'days' and 'event'
    num_perm: number of resampled label vectors to draw
    check_first: when true, return p = 1 without resampling if the
                 observed interaction statistic is negative

    Returns a Series with 'p' (fraction of resampled statistics that are
    not <= the observed one; NaN when nothing was resampled) and
    'interaction' ('both' when the Fisher odds ratio of a and b is above
    1, otherwise 'neither').
    '''
    a, b = match_series(a, b)
    if fisher_exact_test(a, b)['odds_ratio'] > 1:
        int_direction = 'both'
    else:
        int_direction = 'neither'
    r = get_interaction(a, b, surv)
    if check_first and (r < 0):
        return pd.Series({'p': 1, 'interaction': int_direction})

    n_obs = len(a.index)
    # Each row of mat is one draw (with replacement) of labels from a.index.
    mat = np.random.choice(a.index, size=(num_perm, n_obs))

    # The survival values never depend on the resampled labels: they are
    # always taken in the order of a.index, so extract them once. reindex
    # keeps the old `.ix` behaviour of giving NaN rows for labels that are
    # absent from surv.
    surv_values = surv.unstack().reindex(a.index).values
    positions = range(n_obs)

    vec = {}
    for i, idx in enumerate(mat):
        # Relabel every resampled vector 0..n-1 so that a, b and the
        # survival table line up by position.
        a_p = pd.Series(list(a.loc[idx]), positions)
        b_p = pd.Series(list(b.loc[idx]), positions)
        surv_p = pd.DataFrame(surv_values, index=positions,
                              columns=['days', 'event']).stack()
        vec[i] = get_interaction(a_p, b_p, surv_p, int_direction)
    vec = pd.Series(vec)

    if len(vec) == 0:
        # num_perm == 0: there is no null distribution to compare against.
        empirical_p = np.nan
    else:
        # NaN statistics compare False against r, so they count as
        # exceeding the observed value here.
        empirical_p = 1. * (len(vec) - (vec <= r).sum()) / len(vec)
    return pd.Series({'p': empirical_p, 'interaction': int_direction})

Exemple #2

0

Afficher le fichier

Fichier : Boxplots.py Projet : Krysia/TCGA

def box_plot_pandas(bin_vec, real_vec, ax=None):
    """
    Draw a box plot of real_vec grouped by the labels in bin_vec.

    Inputs
        bin_vec: Series of labels
        real_vec: Series of measurements to be grouped according to bin_vec
        ax: axes handed to init_ax; the axes it returns is drawn on

    Only the boxes and medians stay visible; fliers, caps and whiskers
    are hidden, and boxes built from fewer than 3 values are made fully
    transparent.
    """
    _, ax = init_ax(ax)
    bin_vec, real_vec = match_series(bin_vec, real_vec)

    # One group per label, in value_counts order.
    categories = bin_vec.value_counts().index
    data = [real_vec[bin_vec == label] for label in categories]
    n_groups = len(categories)
    bp = ax.boxplot(data, positions=range(n_groups), widths=.3,
                    patch_artist=True)

    if real_vec.name:
        ax.set_ylabel(real_vec.name)
    if bin_vec.name:
        ax.set_xlabel(bin_vec.name)

    # Hide everything except the boxes and the median lines.
    for part in ('fliers', 'caps', 'whiskers'):
        for artist in bp[part]:
            artist.set_visible(False)

    for median in bp['medians']:
        median.set_color(colors[0])
        median.set_lw(3)
        median.set_alpha(.8)

    for pos, box in enumerate(bp['boxes']):
        box.set_color('grey')
        box.set_lw(3)
        box.set_alpha(.7)
        too_small = len(data[pos]) < 3
        if too_small:
            box.set_alpha(0)

Exemple #3

0

Afficher le fichier

def box_plot_pandas(bin_vec, real_vec, ax=None):
    """
    Box plot of the values in real_vec, one box per label in bin_vec.

    Inputs
        bin_vec: Series of labels
        real_vec: Series of measurements to be grouped according to bin_vec
        ax: axes passed through init_ax; drawing happens on its result

    Fliers, caps and whiskers are hidden. Boxes that hold fewer than 3
    values are drawn with zero alpha.
    """
    _, ax = init_ax(ax)
    bin_vec, real_vec = match_series(bin_vec, real_vec)

    categories = bin_vec.value_counts().index
    data = []
    for label in categories:
        data.append(real_vec[bin_vec == label])

    box_positions = range(len(categories))
    bp = ax.boxplot(data,
                    positions=box_positions,
                    widths=.3,
                    patch_artist=True)

    # Axis labels come from the Series names, when they have one.
    if real_vec.name:
        ax.set_ylabel(real_vec.name)
    if bin_vec.name:
        ax.set_xlabel(bin_vec.name)

    for hidden_part in ('fliers', 'caps', 'whiskers'):
        for artist in bp[hidden_part]:
            artist.set_visible(False)

    for line in bp['medians']:
        line.set_color(colors[0])
        line.set_lw(3)
        line.set_alpha(.8)

    for k, patch in enumerate(bp['boxes']):
        patch.set_color('grey')
        patch.set_lw(3)
        patch.set_alpha(.7)
        if len(data[k]) < 3:
            # too few points for a meaningful box: make it invisible
            patch.set_alpha(0)

Exemple #4

0

Afficher le fichier

def interaction_empirical_p(a, b, surv, num_perm=101):
    '''
    Calculate an empirical p-value for an interaction by permuting
    the sample labels of the two events.

    The observed interaction statistic is compared against the
    statistics obtained from num_perm random permutations of a.index
    (a and b are permuted together, the survival table is not).

    a, b: Series of event calls; aligned to each other with match_series
    surv: stacked survival Series; it is unstacked into two columns that
          are relabelled 'days' and 'event'
    num_perm: number of permutations to draw

    Returns a Series with 'p' (fraction of non-NaN permuted statistics
    that are not <= the observed one; NaN when no usable statistic is
    left) and 'interaction' ('both' when the Fisher odds ratio of a and
    b is above 1, otherwise 'neither').
    '''
    a, b = match_series(a, b)
    if fisher_exact_test(a, b)['odds_ratio'] > 1:
        int_direction = 'both'
    else:
        int_direction = 'neither'
    r = get_interaction(a, b, surv)
    mat = np.array([np.random.permutation(a.index) for i in range(num_perm)])

    # The survival values are always taken in the order of a.index, so
    # extract them once instead of on every iteration. reindex keeps the
    # old `.ix` behaviour of giving NaN rows for labels missing from surv.
    surv_values = surv.unstack().reindex(a.index).values
    positions = range(len(a.index))

    vec = {}
    for i, idx in enumerate(mat):
        # Relabel everything 0..n-1 so the permuted calls line up with
        # the survival rows by position.
        a_p = pd.Series(list(a.loc[idx]), positions)
        b_p = pd.Series(list(b.loc[idx]), positions)
        surv_p = pd.DataFrame(surv_values, index=positions,
                              columns=['days', 'event']).stack()
        vec[i] = get_interaction(a_p, b_p, surv_p, int_direction)
    vec = pd.Series(vec).dropna()

    if len(vec) == 0:
        # No permutations requested, or every statistic was NaN.
        empirical_p = np.nan
    else:
        empirical_p = 1. * (len(vec) - (vec <= r).sum()) / len(vec)
    return pd.Series({'p': empirical_p, 'interaction': int_direction})

Exemple #5

0

Afficher le fichier

Fichier : Tests.py Projet : anyone1985/TCGA_Working

def pearson_p(a,b):
    '''
    Run pearsonr on two Series and return only its second result
    (the p-value).
    ------------------------------------------------
    a, b: Series with continuous measurements; missing values are
          dropped from each before the two are aligned with match_series
    '''
    x, y = match_series(a.dropna(), b.dropna())
    result = pearsonr(x, y)
    _, p_value = result
    return p_value

Exemple #6

0

Afficher le fichier

def violin_plot_pandas(bin_vec,
                       real_vec,
                       ann='p',
                       order=None,
                       ax=None,
                       filename=None):
    """
    http://pyinsci.blogspot.com/2009/09/violin-plot-with-matplotlib.html
    Wrapper around matplotlib's boxplot function to add violin profile.
    
    Inputs
        bin_vec: Series of labels
        real_vec: Series of measurements to be grouped according to bin_vec
        ann: annotation mode.
            'p'       -> print the Kruskal p-value as 'p = 1.0e-03'
            'p_fancy' -> print the p-value through latex_float
            None      -> no annotation
            any other value is printed verbatim as the annotation text
        order: explicit category order; defaults to value_counts order
        ax: axes handed to init_ax
        filename: when given, the figure is saved to this path

    If the violin plot cannot be drawn, a plain box plot is drawn instead.
    """
    fig, ax = init_ax(ax)
    ax.set_ylabel(real_vec.name)
    ax.set_xlabel(bin_vec.name)
    bin_vec, real_vec = match_series(bin_vec, real_vec)
    try:
        if order is None:
            categories = bin_vec.value_counts().index
        else:
            categories = order
        _violin_plot(ax, [real_vec[bin_vec == num] for num in categories],
                     pos=categories,
                     bp=True)
        ax.set_xticklabels(
            [str(c) + '\n(n=%i)' % sum(bin_vec == c) for c in categories])
    except Exception:
        # Deliberate best-effort fallback; Exception (rather than a bare
        # except) lets KeyboardInterrupt / SystemExit propagate.
        box_plot_pandas(bin_vec, real_vec, ax=ax)

    p_value = Stats.kruskal_pandas(bin_vec, real_vec)['p']
    # The three modes are mutually exclusive. With two separate `if`
    # statements 'p_fancy' used to fall through to the generic branch and
    # the literal text 'p_fancy' was drawn on the axes as well.
    if ann == 'p_fancy':
        ax.annotate('$p = {}$'.format(latex_float(p_value)), (.95, -.02),
                    xycoords='axes fraction',
                    ha='right',
                    va='bottom',
                    size=14)
    elif ann == 'p':
        ax.annotate('p = {0:.1e}'.format(p_value), (.95, .02),
                    xycoords='axes fraction',
                    ha='right',
                    va='bottom',
                    size=12)
    elif ann is not None:
        ax.annotate(ann, (.95, .02),
                    xycoords='axes fraction',
                    ha='right',
                    va='bottom',
                    size=12)
    if filename is not None:
        fig.savefig(filename)
    return

Exemple #7

0

Afficher le fichier

Fichier : Tests.py Projet : anyone1985/TCGA_Working

def kruskal_p(hit_vec, response_vec, min_size=5):
    '''
    Wrapper to run `kruskal` on pandas Series and return only the
    second element of its result (the p-value).
    ------------------------------------------------
    hit_vec: Series of labels
    response_vec: Series of measurements
    min_size: accepted for signature compatibility; not used here

    Returns nan if the inputs cannot be matched or the test fails.
    '''
    try:
        hit_vec, response_vec = match_series(hit_vec, response_vec)
        groups = [response_vec[hit_vec == num] for num in hit_vec.unique()]
        return kruskal(*groups)[1]
    except Exception:
        # Best-effort by design, but do not swallow KeyboardInterrupt /
        # SystemExit the way a bare `except:` does.
        return nan

Exemple #8

0

Afficher le fichier

Fichier : Tests.py Projet : anyone1985/TCGA_Working

def bartlett_pandas(group_vec, response_vec, min_size=5):
    '''
    Wrapper to run `bartlett` on pandas Series, one sample per label.
    ------------------------------------------------
    group_vec: Series of labels
    response_vec: Series of measurements
    min_size: if the rarest label in group_vec occurs fewer times than
              this, nan is returned and no test is run

    Returns a Series indexed ['T', 'p'] holding the result of `bartlett`.
    '''
    smallest_group = group_vec.value_counts().min()
    if smallest_group < min_size:
        return nan

    group_vec, response_vec = match_series(group_vec, response_vec)
    samples = [response_vec[group_vec == label]
               for label in group_vec.unique()]
    outcome = bartlett(*samples)
    return pd.Series(outcome, index=['T','p'])

Exemple #9

0

Afficher le fichier

Fichier : Tests.py Projet : anyone1985/TCGA_Working

def pearson_pandas(a, b, min_size=5):
    '''
    Wrapper to run stats.pearsonr on two pandas Series.
    ------------------------------------------------
    a, b: Series of measurements; aligned with match_series first
    min_size: accepted for signature compatibility; not used here

    Returns a Series indexed ['rho', 'p']. If matching or the test
    fails, both entries are NaN.
    '''
    try:
        a, b = match_series(a, b)
        res = stats.pearsonr(a,b)
        return pd.Series(res, index=['rho','p'])
    except Exception:
        # Best-effort by design, but let KeyboardInterrupt / SystemExit
        # through. The explicit NaN fill keeps the fallback float-typed;
        # a data-less Series(index=...) is object dtype on current pandas.
        return pd.Series(float('nan'), index=['rho','p'])

Exemple #10

0

Afficher le fichier

Fichier : Tests.py Projet : anyone1985/TCGA_Working

def anova(hit_vec, response_vec, min_size=5):
    '''
    Wrapper to do a one way anova on pandas Series
    ------------------------------------------------
    hit_vec: Series of labels
    response_vec: Series of measurements
    min_size: if the rarest label in hit_vec occurs fewer times than
              this, nan is returned and no test is run

    Returns a Series indexed ['F', 'p'] holding the result of f_oneway.
    '''
    # .min must be *called*: comparing the bound method itself to an int
    # raises TypeError on Python 3 and is not a size test on Python 2.
    if hit_vec.value_counts().min() < min_size:
        return nan
    hit_vec, response_vec = match_series(hit_vec, response_vec)
    res = f_oneway(*[response_vec[hit_vec == num] for num in 
                     hit_vec.unique()])
    return pd.Series(res, index=['F','p'])

Exemple #11

0

Afficher le fichier

Fichier : Tests.py Projet : anyone1985/TCGA_Working

def kruskal_pandas(hit_vec, response_vec, min_size=5):
    '''
    Wrapper to run `kruskal` on pandas Series, one sample per label.
    ------------------------------------------------
    hit_vec: Series of labels
    response_vec: Series of measurements
    min_size: accepted for signature compatibility; not used here

    Returns a Series indexed ['H', 'p']. If matching or the test
    fails, both entries are NaN.
    '''
    try:
        hit_vec, response_vec = match_series(hit_vec, response_vec)
        res = kruskal(*[response_vec[hit_vec == num] for num in 
                          hit_vec.unique()])
        return pd.Series(res, index=['H','p'])
    except Exception:
        # Best-effort by design, but let KeyboardInterrupt / SystemExit
        # through. The explicit NaN fill keeps the fallback float-typed;
        # a data-less Series(index=...) is object dtype on current pandas.
        return pd.Series(float('nan'), index=['H','p'])

Exemple #12

0

Afficher le fichier

Fichier : Reports_Old.py Projet : anyone1985/TCGA_Working

def single_gene_section(cancer, hit_matrix, cutoff=.25):
    '''
    Build the 'Gene Mutations' report sub-section.

    cancer: object providing report_folder, patients, q_genes and
            clinical.age
    hit_matrix: gene-by-patient matrix; rows sharing an index label are
                collapsed to the first one
    cutoff: a violin plot / survival curve is attached to a table cell
            only when the value in that cell is below this

    The full gene table is written to <report_folder>/gene_table.csv.
    The table shown in the report keeps the genes with at least one
    q_genes value under .2, limited to the first 20 rows.
    '''
    # ---- assemble the table and write the full version to disk ----
    report_dir = cancer.report_folder + '/'
    csv_path = report_dir + 'gene_table.csv'
    hit_matrix = hit_matrix.groupby(level=0).first()  # make index unique
    n_hit = (hit_matrix.ix[:, cancer.patients] > 0).sum(1)
    n_hit.name = 'n_patients'
    gene_names = Series(dict((g, g) for g in cancer.q_genes.index),
                        name='gene')
    full_table = cancer.q_genes.join(n_hit).join(gene_names)
    full_table = full_table.ix[:, ::-1]
    if 'survival' in full_table:
        full_table = full_table.sort(columns='survival')
    full_table.to_csv(csv_path)

    # ---- restrict to the genes that go into the report ----
    has_low_q = (cancer.q_genes < .2).sum(1) > 0
    genes_to_show = cancer.q_genes[has_low_q].index
    shown = full_table.ix[genes_to_show]
    if 'survival' in shown:
        shown = shown.sort(columns='survival')
    shown = shown.head(20)
    shown_r = com.convert_to_r_dataframe(shown)  #@UndefinedVariable

    if not len(shown):
        empty_section = nz.newSubSection('Gene Mutations')
        return nz.addTo(empty_section, nz.newParagraph(''))

    # ---- overview table ----
    caption = "Association of gene mutations with patient clinical features."
    table1 = nz.newTable(shown_r, caption, file=csv_path,
                         significantDigits=2)

    # 1-based row / column positions of every gene and column
    row_of = dict((g, pos) for pos, g in enumerate(shown.index, 1))
    col_of = dict((c, pos) for pos, c in enumerate(shown.columns, 1))

    # ---- age violin plots ----
    if 'age' in shown:
        for gene, q_val in shown['age'].iteritems():
            matched_hits = match_series(hit_matrix.ix[gene],
                                        cancer.clinical.age)[0]
            n_mutated = (matched_hits > 0).sum()
            if q_val < cutoff and n_mutated > 2:
                cell = (row_of[gene], col_of['age'])
                table1 = add_violin_plot(hit_matrix.ix[gene], cancer, table1,
                                         cell, report_dir + FIG_EXT)

    # ---- survival curves ----
    if 'survival' in shown:
        for gene, q_val in shown['survival'].iteritems():
            if q_val < cutoff:
                cell = (row_of[gene], col_of['survival'])
                table1 = add_survival_curve(hit_matrix.ix[gene], cancer,
                                            table1, cell,
                                            report_dir + FIG_EXT)

    return nz.addTo(nz.newSubSection('Gene Mutations'), table1)

Exemple #13

0

Afficher le fichier

Fichier : Pandas.py Projet : anyone1985/TCGA_Working

def series_scatter(s1, s2, ax=None, ann='p', filename=None, **plot_args):
    '''
    Scatter plot of two Series against each other.

    s1, s2: Series for the x and y axes; their names label the axes and
            they are aligned with match_series before plotting
    ann: 'p' annotates the Spearman p-value as plain text, 'fancy_p'
         annotates it through latex_float; anything else adds nothing
    filename: when given, the figure is saved to this path
    plot_args: forwarded to ax.scatter ('s' defaults to 75 and 'alpha'
               to .5 when not supplied)
    '''
    fig, ax = init_ax(ax, figsize=(6,4))
    plot_args.setdefault('s', 75)
    plot_args.setdefault('alpha', .5)

    matched = match_series(s1, s2)
    ax.scatter(*matched, **plot_args)
    ax.set_xlabel(s1.name)
    ax.set_ylabel(s2.name)

    # Placement shared by both annotation flavours.
    where = (.95, -.02)
    look = dict(xycoords='axes fraction', ha='right', va='bottom', size=14)
    if ann == 'p':
        p_val = Tests.spearman_pandas(s1, s2)['p']
        ax.annotate('p = {0:.1e}'.format(p_val), where, **look)
    if ann == 'fancy_p':
        p_val = Tests.spearman_pandas(s1, s2)['p']
        ax.annotate('$p = {}$'.format(latex_float(p_val)), where, **look)

    if filename is not None:
        fig.savefig(filename)

Exemple #14

0

Afficher le fichier

Fichier : Pandas.py Projet : xulijunji/TCGA

def series_scatter(s1, s2, ax=None, ann='p', filename=None, **plot_args):
    '''
    Plot s1 against s2 as a scatter plot on axes obtained from init_ax.

    s1, s2: Series for the x and y axes (aligned with match_series);
            the axis labels are taken from their names
    ann: 'p' -> plain-text Spearman p-value; 'fancy_p' -> p-value
         formatted by latex_float; other values -> no annotation
    filename: optional path the figure is saved to
    plot_args: extra keyword arguments for ax.scatter; 's' and 'alpha'
               fall back to 75 and .5
    '''
    fig, ax = init_ax(ax, figsize=(6, 4))

    for key, fallback in (('s', 75), ('alpha', .5)):
        if key not in plot_args:
            plot_args[key] = fallback

    points = match_series(s1, s2)
    ax.scatter(*points, **plot_args)
    ax.set_xlabel(s1.name)
    ax.set_ylabel(s2.name)

    if ann == 'p':
        text = 'p = {0:.1e}'.format(Tests.spearman_pandas(s1, s2)['p'])
        ax.annotate(text, (.95, -.02), xycoords='axes fraction',
                    ha='right', va='bottom', size=14)
    if ann == 'fancy_p':
        text = '$p = {}$'.format(
            latex_float(Tests.spearman_pandas(s1, s2)['p']))
        ax.annotate(text, (.95, -.02), xycoords='axes fraction',
                    ha='right', va='bottom', size=14)

    if filename is not None:
        fig.savefig(filename)

Exemple #15

0

Afficher le fichier

Fichier : Reports_Old.py Projet : anyone1985/TCGA_Working

def pathway_mutation_section(cancer, gene_sets, cutoff=.25):
    '''
    Build the 'Pathway Mutations' report sub-section.

    cancer: object providing report_folder, q_pathways, meta_matrix and
            clinical.age
    gene_sets: passed to format_pathway_table together with cancer
    cutoff: a violin plot / survival curve is attached to a table cell
            only when the value in that cell is below this

    The full pathway table is written to
    <report_folder>/pathway_table.csv. The table shown in the report
    keeps the pathways with at least one q_pathways value under .25,
    limited to the first 20 rows.
    '''
    #Format data for report
    path = cancer.report_folder + '/'
    pathway_table_file = path + 'pathway_table.csv'
    pathway_table = format_pathway_table(cancer, gene_sets)    
    if 'survival' in pathway_table:
        pathway_table = pathway_table.sort(columns='survival')
    pathway_table.to_csv(pathway_table_file)
    keepers = cancer.q_pathways[(cancer.q_pathways < .25).sum(1) > 0].index
    pathway_table = pathway_table.ix[keepers]
    if 'survival' in pathway_table:
        pathway_table = pathway_table.sort(columns='survival')
    pathway_table = pathway_table.head(20)
    #NaN cells are replaced by 1.23 before the conversion to an R data frame
    pathway_table_r = com.convert_to_r_dataframe(pathway_table.replace(nan, 1.23)) #@UndefinedVariable
    if len(pathway_table) == 0:
        return nz.addTo(nz.newSubSection('Pathway Mutations'), nz.newParagraph(''))
    
    #Overview
    #The trailing space matters: without it the two halves were joined
    #as "patientclinical features."
    tableCaption1 = ('Association of pathway level mutations with patient ' + 
                     'clinical features.')
    table1 = nz.newTable(pathway_table_r, tableCaption1, file=pathway_table_file, 
                         significantDigits=2)
   
    #Fill in the details (1-based row / column positions in the table)
    pathway_pos = dict((p,i+1) for i,p in enumerate(pathway_table.index))
    col_pos = dict((c,i+1) for i,c in enumerate(pathway_table.columns))
    
    #age violin plots
    if 'age' in pathway_table:
        for g,val in pathway_table['age'].iteritems():
            num_patients = (match_series(cancer.meta_matrix.ix[g], cancer.clinical.age)[0] > 0).sum()
            if val < cutoff and num_patients > 2:
                table1 = add_violin_plot(cancer.meta_matrix.ix[g], cancer, table1, 
                                         (pathway_pos[g], col_pos['age']),
                                         path + FIG_EXT)        
    
    #survival curves
    if 'survival' in pathway_table:
        for g,val in pathway_table['survival'].iteritems():
            if val < cutoff:
                table1 = add_survival_curve_pathway(cancer.meta_matrix.ix[g], cancer, table1, 
                            (pathway_pos[g], col_pos['survival']), path + FIG_EXT) 
                
    section = nz.addTo(nz.newSubSection('Pathway Mutations'), table1)
    return section

Exemple #16

0

Afficher le fichier

Fichier : Boxplots.py Projet : Krysia/TCGA

def violin_plot_pandas(bin_vec, real_vec, ann='p', order=None, ax=None,
                       filename=None):
    """
    http://pyinsci.blogspot.com/2009/09/violin-plot-with-matplotlib.html
    Wrapper around matplotlib's boxplot function to add violin profile.
    
    Inputs
        bin_vec: Series of labels
        real_vec: Series of measurements to be grouped according to bin_vec
        ann: annotation mode.
            'p'       -> print the Kruskal p-value as 'p = 1.0e-03'
            'p_fancy' -> print the p-value through latex_float
            None      -> no annotation
            any other value is printed verbatim as the annotation text
        order: explicit category order; defaults to value_counts order
        ax: axes handed to init_ax
        filename: when given, the figure is saved to this path

    If the violin plot cannot be drawn, a plain box plot is drawn instead.
    """
    fig, ax = init_ax(ax)
    ax.set_ylabel(real_vec.name)
    ax.set_xlabel(bin_vec.name)
    bin_vec, real_vec = match_series(bin_vec, real_vec)
    try:
        if order is None:
            categories = bin_vec.value_counts().index
        else:
            categories = order
        _violin_plot(ax, [real_vec[bin_vec == num] for num in categories],
                     pos=categories, bp=True)
        ax.set_xticklabels([str(c) + '\n(n=%i)' % sum(bin_vec == c) 
                            for c in categories])
    except Exception:
        # Deliberate best-effort fallback; Exception (rather than a bare
        # except) lets KeyboardInterrupt / SystemExit propagate.
        box_plot_pandas(bin_vec, real_vec, ax=ax)
        
    p_value = Stats.kruskal_pandas(bin_vec, real_vec)['p']
    # The three modes are mutually exclusive. With two separate `if`
    # statements 'p_fancy' used to fall through to the generic branch and
    # the literal text 'p_fancy' was drawn on the axes as well.
    if ann == 'p_fancy':
        ax.annotate('$p = {}$'.format(latex_float(p_value)), (.95, -.02),
                    xycoords='axes fraction', ha='right', va='bottom', size=14)
    elif ann == 'p':
        ax.annotate('p = {0:.1e}'.format(p_value), (.95, .02),
                    xycoords='axes fraction', ha='right', va='bottom', size=12)
    elif ann is not None:
        ax.annotate(ann, (.95, .02), xycoords='axes fraction', ha='right',
                    va='bottom', size=12)
    if filename is not None:
        fig.savefig(filename)
    return

Exemple #17

0

Afficher le fichier

Fichier : Tests.py Projet : anyone1985/TCGA_Working

 def test(hit_vec):
     # Run f_oneway on the enclosing object's response_vec, split by the
     # labels in hit_vec, and report the first two result values as
     # 'stat' and 'p'.
     labels, values = match_series(hit_vec, self.response_vec)
     samples = [values[labels == level] for level in labels.unique()]
     outcome = f_oneway(*samples)
     return Series({'stat': outcome[0], 'p': outcome[1]})