def get_corrs(data, adjust=identity, corr_func='pearson'):
    """Correlate each slice of every embryo with the nearest slice of every other embryo.

    Column names of ``data`` are parsed as ``<embryo>_sl<NN>...``: the text
    before ``_sl`` names the embryo, the two characters after it are read as
    the slice number, and the text before the first ``_`` is the genotype.
    Each slice gets a relative position (its slice number divided by the
    largest slice number seen for its embryo).  For every ordered pair of
    distinct embryos, each slice of the first embryo is correlated with the
    slice of the second embryo whose relative position is closest.

    Args:
        data: DataFrame whose column names follow the pattern above.  A column
            name without ``_sl`` raises ``IndexError`` while parsing.
        adjust: Function applied to every value (through ``applymap``) before
            correlating.  It is applied once per embryo, not once per embryo
            pair, so it is assumed to be deterministic.
        corr_func: Correlation method forwarded to ``Series.corr``.

    Returns:
        ``[corrs_diff, corrs_same]``: two ``defaultdict(list)`` objects keyed
        by the genotype of the first embryo, holding the correlations against
        embryos of a different / the same genotype respectively.
    """
    # Parse every column name exactly once: (embryo name, slice number).
    max_slice = defaultdict(int)
    parsed = []
    for column in data.columns:
        parts = column.split('_sl')
        emb = parts[0]
        slice_num = int(parts[1][0:2])
        parsed.append((emb, slice_num))
        max_slice[emb] = max(max_slice[emb], slice_num)

    # Relative position of each slice within its own embryo.
    xs = pd.Series(index=data.columns,
                   data=[slice_num / max_slice[emb]
                         for emb, slice_num in parsed])

    # Select and adjust each embryo once up front; doing it inside the
    # pairwise loop repeats the element-wise applymap for every pair.
    # NOTE(review): `sel_startswith` / `startswith` are project helpers,
    # presumably prefix matches on the column names -- confirm.
    embryos = {}
    positions = {}
    for name in max_slice:
        embryos[name] = data.select(**sel_startswith(name)).applymap(adjust)
        positions[name] = xs.select(startswith(name))

    corrs_same = defaultdict(list)
    corrs_diff = defaultdict(list)
    # Indexed with a bool below: False -> different genotype, True -> same.
    all_corrs = [corrs_diff, corrs_same]
    for emb1_name in pb()(max_slice):
        emb1 = embryos[emb1_name]
        xs1 = positions[emb1_name]
        genotype = emb1_name.split('_')[0]
        for emb2_name in max_slice:
            if emb1_name == emb2_name:
                continue
            emb2 = embryos[emb2_name]
            xs2 = positions[emb2_name]
            same = genotype == emb2_name.split('_')[0]
            # For each slice of emb1, the emb2 column at the nearest relative
            # position (ties resolved by the smaller column name).
            closest = {
                column: min((abs(x2 - x1), c2) for c2, x2 in xs2.items())[1]
                for column, x1 in xs1.items()
            }
            for col in emb1.columns:
                all_corrs[same][genotype].append(emb1.loc[:, col].corr(
                    emb2.loc[:, closest[col]],
                    corr_func,
                ))
    return all_corrs
def get_corrs(data, adjust=identity, corr_func='pearson'):
    """Correlate each slice of every embryo with the nearest slice of every other embryo.

    Column names of ``data`` are parsed as ``<embryo>_sl<NN>...``: the text
    before ``_sl`` names the embryo, the two characters after it are read as
    the slice number, and the text before the first ``_`` is the genotype.
    Each slice gets a relative position (its slice number divided by the
    largest slice number seen for its embryo).  For every ordered pair of
    distinct embryos, each slice of the first embryo is correlated with the
    slice of the second embryo whose relative position is closest.

    Args:
        data: DataFrame whose column names follow the pattern above.  A column
            name without ``_sl`` raises ``IndexError`` while parsing.
        adjust: Function applied to every value (through ``applymap``) before
            correlating.  It is applied once per embryo, not once per embryo
            pair, so it is assumed to be deterministic.
        corr_func: Correlation method forwarded to ``Series.corr``.

    Returns:
        ``[corrs_diff, corrs_same]``: two ``defaultdict(list)`` objects keyed
        by the genotype of the first embryo, holding the correlations against
        embryos of a different / the same genotype respectively.
    """
    # Parse every column name exactly once: (embryo name, slice number).
    max_slice = defaultdict(int)
    parsed = []
    for column in data.columns:
        parts = column.split('_sl')
        emb = parts[0]
        slice_num = int(parts[1][0:2])
        parsed.append((emb, slice_num))
        max_slice[emb] = max(max_slice[emb], slice_num)

    # Relative position of each slice within its own embryo.
    xs = pd.Series(index=data.columns,
                   data=[slice_num / max_slice[emb]
                         for emb, slice_num in parsed])

    # Select and adjust each embryo once up front; doing it inside the
    # pairwise loop repeats the element-wise applymap for every pair.
    # NOTE(review): `sel_startswith` / `startswith` are project helpers,
    # presumably prefix matches on the column names -- confirm.
    embryos = {}
    positions = {}
    for name in max_slice:
        embryos[name] = data.select(**sel_startswith(name)).applymap(adjust)
        positions[name] = xs.select(startswith(name))

    corrs_same = defaultdict(list)
    corrs_diff = defaultdict(list)
    # Indexed with a bool below: False -> different genotype, True -> same.
    all_corrs = [corrs_diff, corrs_same]
    for emb1_name in pb()(max_slice):
        emb1 = embryos[emb1_name]
        xs1 = positions[emb1_name]
        genotype = emb1_name.split('_')[0]
        for emb2_name in max_slice:
            if emb1_name == emb2_name:
                continue
            emb2 = embryos[emb2_name]
            xs2 = positions[emb2_name]
            same = genotype == emb2_name.split('_')[0]
            # For each slice of emb1, the emb2 column at the nearest relative
            # position (ties resolved by the smaller column name).
            closest = {
                column: min((abs(x2 - x1), c2) for c2, x2 in xs2.items())[1]
                for column, x1 in xs1.items()
            }
            for col in emb1.columns:
                all_corrs[same][genotype].append(emb1.loc[:, col].corr(
                    emb2.loc[:, closest[col]],
                    corr_func,
                ))
    return all_corrs
Esempio n. 3
0
def is_directionally_biased(ase, gene, bias_direction=None, style='ttest', ase_level=0.33,
                            min_slices=10, too_few_slices_val=99,
                            frac_for_biased=0.65, two_tailed=False, alpha=.05):
    """Decide, per genotype, whether one gene's ASE values lean in one direction.

    The genotype of a column is the text before the first ``_`` in its name.
    The gene's row of ``ase`` is multiplied element-wise by ``bias_direction``
    and then tested separately for each genotype.

    Args:
        ase: Table of ASE values; ``ase.ix[gene]`` must yield one value per
            column.
        gene: Row label of the gene to test.
        bias_direction: Per-column multipliers applied before testing;
            defaults to 1 for every column.
        style: ``'ttest'`` or ``'cutoff'`` (see Returns).
        ase_level: ``'cutoff'`` only: magnitude a value must exceed to count.
        min_slices: ``'cutoff'`` only: minimum number of non-missing values,
            and lower bound on the number of values that must exceed
            ``ase_level``.
        too_few_slices_val: Value stored when a genotype cannot be tested.
        frac_for_biased: ``'cutoff'`` only: fraction of non-missing values
            that must exceed ``ase_level`` in one direction.
        two_tailed: ``'ttest'`` only: report a signed result instead of a
            one-sided boolean.
        alpha: ``'ttest'`` only: significance threshold, compared against the
            p-value multiplied by ``len(ase)``.

    Returns:
        dict mapping genotype to its result:

        * ``'ttest'``: one-sample t-test against 0 with NaNs omitted.  With
          ``two_tailed`` the result is ``sign(tstat)`` when significant and 0
          otherwise; without it, a boolean that is true when the mean is
          significantly above 0.  ``too_few_slices_val`` when the p-value
          comes back masked.
        * ``'cutoff'``: -1 or 1 when more than
          ``max(frac_for_biased * n, min_slices)`` values exceed ``ase_level``
          in that direction (``n`` = number of non-missing values), 0 when
          neither does, ``too_few_slices_val`` when ``n < min_slices``.

    Raises:
        NotImplementedError: if ``style`` is not one of the two known styles.
    """
    # Reject an unknown style before doing any work, so it is reported even
    # when there are no genotypes to iterate over.
    if style not in ('ttest', 'cutoff'):
        raise NotImplementedError("Don't know how to use test style '{}'".format(style))
    if bias_direction is None:
        bias_direction = [1 for col in ase.columns]
    genotypes = {col.split('_')[0] for col in ase.columns}

    # Both of these are independent of the genotype, so compute them once.
    directed = ase.ix[gene] * bias_direction
    # p-values are scaled by the number of rows in `ase` before comparing to
    # alpha (presumably a Bonferroni-style correction over genes).
    n_tests = len(ase)

    biases = {}
    for genotype in genotypes:
        genease = directed.select(startswith(genotype))
        if style == 'ttest':
            tstat, pval = ttest_1samp(genease, 0, nan_policy='omit')
            if isinstance(pval, np.ma.core.MaskedConstant):
                biases[genotype] = too_few_slices_val
                continue
            if two_tailed:
                biases[genotype] = np.sign(tstat) * (pval * n_tests < alpha)
            else:
                # Convert the two-sided p-value into a one-sided one for the
                # alternative "mean > 0".
                pval = pval/2 if tstat > 0 else 1-pval/2
                biases[genotype] = pval * n_tests < alpha
        else:  # style == 'cutoff'
            slices_with_aseval = genease.count()
            if slices_with_aseval < min_slices:
                biases[genotype] = too_few_slices_val
                continue
            needed = max(frac_for_biased * slices_with_aseval, min_slices)
            biases[genotype] = 0
            for direction in [-1, 1]:
                if (direction * genease > ase_level).sum() > needed:
                    biases[genotype] = direction
                    break
    return biases
Esempio n. 4
0
def get_class(gene, ase, subset='', slices_with_expr=None, expr=None):
    """Assign a class code to one gene from its ASE values in a subset of columns.

    The number of expressed slices is taken from ``slices_with_expr[gene]``
    when ``slices_with_expr`` is given, otherwise it is counted from ``expr``
    as the number of selected values above ``EXPR_MIN``.

    Returns:
        * ``nan`` if no expressed-slice count can be obtained for the gene;
        * 99 if fewer than 90% of the selected slices are expressed;
        * 999 if fewer than half of the expressed slices have a finite ASE;
        * 1 / -1 if more than ``FRAC_FOR_MATERNAL`` of the finite-ASE slices
          exceed ``ASE_MIN`` in magnitude with positive / negative sign;
        * 0 otherwise.
    """
    sample = ase.ix[gene].select(startswith(subset))

    # Work out how many slices count as expressed, or give up with nan.
    if slices_with_expr is None:
        if expr is None or gene not in expr.index:
            return nan
        n_expressed = (expr.ix[gene].select(startswith(subset)) > EXPR_MIN).sum()
    elif gene in slices_with_expr.index:
        n_expressed = slices_with_expr.ix[gene]
    else:
        return nan

    # Signed call per slice: +1 / -1 where |ASE| exceeds ASE_MIN, else 0.
    calls = (abs(sample) > ASE_MIN) * sign(sample)
    n_with_ase = isfinite(sample).sum()

    if n_expressed < len(sample) * .90:
        return 99
    if n_with_ase < .5 * n_expressed:
        return 999

    # The positive direction is checked first, then the negative one.
    for direction in (1, -1):
        if sum(calls == direction) > n_with_ase * FRAC_FOR_MATERNAL:
            return direction
    return 0
Esempio n. 5
0
def get_class(gene, ase, subset='', slices_with_expr=None, expr=None):
    """Assign a class code to one gene from its ASE values in a subset of columns.

    The number of expressed slices is taken from ``slices_with_expr[gene]``
    when ``slices_with_expr`` is given, otherwise it is counted from ``expr``
    as the number of selected values above ``EXPR_MIN``.

    Returns:
        * ``nan`` if no expressed-slice count can be obtained for the gene;
        * 99 if fewer than 90% of the selected slices are expressed;
        * 999 if fewer than half of the expressed slices have a finite ASE;
        * 1 / -1 if more than ``FRAC_FOR_MATERNAL`` of the finite-ASE slices
          exceed ``ASE_MIN`` in magnitude with positive / negative sign;
        * 0 otherwise.
    """
    sample = ase.ix[gene].select(startswith(subset))

    # Work out how many slices count as expressed, or give up with nan.
    if slices_with_expr is None:
        if expr is None or gene not in expr.index:
            return nan
        n_expressed = (expr.ix[gene].select(startswith(subset)) > EXPR_MIN).sum()
    elif gene in slices_with_expr.index:
        n_expressed = slices_with_expr.ix[gene]
    else:
        return nan

    # Signed call per slice: +1 / -1 where |ASE| exceeds ASE_MIN, else 0.
    calls = (abs(sample) > ASE_MIN) * sign(sample)
    n_with_ase = isfinite(sample).sum()

    if n_expressed < len(sample) * .90:
        return 99
    if n_with_ase < .5 * n_expressed:
        return 999

    # The positive direction is checked first, then the negative one.
    for direction in (1, -1):
        if sum(calls == direction) > n_with_ase * FRAC_FOR_MATERNAL:
            return direction
    return 0
Esempio n. 6
0
def get_diffs(expr, mel_spline, sim_spline, col_headers, offset=EXPR_MIN):
    """Compute earth-mover differences between parental and hybrid expression.

    ``expr`` is split by label prefix into the two parental groups
    (``melXmel_``, ``simXsim_``) and the two hybrid groups (``melXsim_``,
    ``simXmel_``).  ``offset`` is added to both operands of every group
    comparison.  The parental average is built from the two splines and
    compared against the hybrids as well.

    Returns:
        A 10-tuple, in this order: hybrid-vs-hybrid diffs, parental diffs,
        mel-vs-melXsim diffs, sim-vs-simXmel diffs, the parental average
        Series, average-vs-hybrid diffs, the maximum of the average, a list
        with the maximum of each of five named hybrid replicates, and the
        within-melXsim and within-simXmel diffs.
    """
    def with_prefix(prefix):
        # Entries of `expr` picked by the project's `startswith` selector.
        return expr.select(startswith(prefix))

    mel = with_prefix('melXmel_')
    sim = with_prefix('simXsim_')
    melXsim = with_prefix('melXsim_')
    simXmel = with_prefix('simXmel_')
    hybrids = with_prefix(('melXsim', 'simXmel'))

    # Group-vs-group comparisons; no explicit normer is passed to these.
    parental_diffs = dd.earth_mover_multi_rep(mel + offset, sim + offset)
    mel_hyb_diffs = dd.earth_mover_multi_rep(mel + offset, melXsim + offset)
    sim_hyb_diffs = dd.earth_mover_multi_rep(sim + offset, simXmel + offset)
    hyb_hyb_diffs = dd.earth_mover_multi_rep(melXsim + offset, simXmel + offset)

    # Replicate-vs-replicate comparisons inside each hybrid cross.
    within_melXsim_diff = dd.earth_mover_within(melXsim + offset)
    within_simXmel_diff = dd.earth_mover_within(simXmel + offset)

    # NOTE(review): `xs` is neither a parameter nor a local here, so it has to
    # come from module scope -- confirm it is defined before this is called.
    parental_mean = (mel_spline(xs) + sim_spline(xs)) / 2
    avgs = pd.Series(parental_mean, index=col_headers)

    # Unlike the comparisons above, this one uses no offset and normalises by
    # the maximum of `expr`; the averages are clipped to [0, 1e6] first.
    clipped_avgs = avgs.astype(float).clip(0, 1e6)
    avg_hyb_diffs = dd.earth_mover_multi_rep(
        clipped_avgs,
        hybrids,
        normer=lambda x: expr.max(),
    )
    avg_level = avgs.max()

    replicates = (
        'melXsim_cyc14C_rep1',
        'melXsim_cyc14C_rep2',
        'melXsim_cyc14C_rep3',
        'simXmel_cyc14C_rep1',
        'simXmel_cyc14C_rep2',
    )
    hyb_level = []
    for replicate in replicates:
        hyb_level.append(hybrids.select(startswith(replicate)).max())

    return (
        hyb_hyb_diffs,
        parental_diffs,
        mel_hyb_diffs,
        sim_hyb_diffs,
        avgs,
        avg_hyb_diffs,
        avg_level,
        hyb_level,
        within_melXsim_diff,
        within_simXmel_diff,
    )
Esempio n. 7
0
def is_directionally_biased(ase,
                            gene,
                            bias_direction=None,
                            style='ttest',
                            ase_level=0.33,
                            min_slices=10,
                            too_few_slices_val=99,
                            frac_for_biased=0.65,
                            two_tailed=False,
                            alpha=.05):
    """Decide, per genotype, whether one gene's ASE values lean in one direction.

    The genotype of a column is the text before the first ``_`` in its name.
    The gene's row of ``ase`` is multiplied element-wise by ``bias_direction``
    and then tested separately for each genotype.

    Args:
        ase: Table of ASE values; ``ase.ix[gene]`` must yield one value per
            column.
        gene: Row label of the gene to test.
        bias_direction: Per-column multipliers applied before testing;
            defaults to 1 for every column.
        style: ``'ttest'`` or ``'cutoff'`` (see Returns).
        ase_level: ``'cutoff'`` only: magnitude a value must exceed to count.
        min_slices: ``'cutoff'`` only: minimum number of non-missing values,
            and lower bound on the number of values that must exceed
            ``ase_level``.
        too_few_slices_val: Value stored when a genotype cannot be tested.
        frac_for_biased: ``'cutoff'`` only: fraction of non-missing values
            that must exceed ``ase_level`` in one direction.
        two_tailed: ``'ttest'`` only: report a signed result instead of a
            one-sided boolean.
        alpha: ``'ttest'`` only: significance threshold, compared against the
            p-value multiplied by ``len(ase)``.

    Returns:
        dict mapping genotype to its result:

        * ``'ttest'``: one-sample t-test against 0 with NaNs omitted.  With
          ``two_tailed`` the result is ``sign(tstat)`` when significant and 0
          otherwise; without it, a boolean that is true when the mean is
          significantly above 0.  ``too_few_slices_val`` when the p-value
          comes back masked.
        * ``'cutoff'``: -1 or 1 when more than
          ``max(frac_for_biased * n, min_slices)`` values exceed ``ase_level``
          in that direction (``n`` = number of non-missing values), 0 when
          neither does, ``too_few_slices_val`` when ``n < min_slices``.

    Raises:
        NotImplementedError: if ``style`` is not one of the two known styles.
    """
    # Reject an unknown style before doing any work, so it is reported even
    # when there are no genotypes to iterate over.
    if style not in ('ttest', 'cutoff'):
        raise NotImplementedError(
            "Don't know how to use test style '{}'".format(style))
    if bias_direction is None:
        bias_direction = [1 for col in ase.columns]
    genotypes = {col.split('_')[0] for col in ase.columns}

    # Both of these are independent of the genotype, so compute them once.
    directed = ase.ix[gene] * bias_direction
    # p-values are scaled by the number of rows in `ase` before comparing to
    # alpha (presumably a Bonferroni-style correction over genes).
    n_tests = len(ase)

    biases = {}
    for genotype in genotypes:
        genease = directed.select(startswith(genotype))
        if style == 'ttest':
            tstat, pval = ttest_1samp(genease, 0, nan_policy='omit')
            if isinstance(pval, np.ma.core.MaskedConstant):
                biases[genotype] = too_few_slices_val
                continue
            if two_tailed:
                biases[genotype] = np.sign(tstat) * (pval * n_tests < alpha)
            else:
                # Convert the two-sided p-value into a one-sided one for the
                # alternative "mean > 0".
                pval = pval / 2 if tstat > 0 else 1 - pval / 2
                biases[genotype] = pval * n_tests < alpha
        else:  # style == 'cutoff'
            slices_with_aseval = genease.count()
            if slices_with_aseval < min_slices:
                biases[genotype] = too_few_slices_val
                continue
            needed = max(frac_for_biased * slices_with_aseval, min_slices)
            biases[genotype] = 0
            for direction in [-1, 1]:
                if (direction * genease > ase_level).sum() > needed:
                    biases[genotype] = direction
                    break
    return biases
        path.join(cwd, 'analysis_godot/ase_summary_by_read.tsv'),
        **pd_kwargs)
        .select(**sel_startswith(('melXsim', 'simXmel')))
    )
    n_slices = slices_per_embryo(ase)
    actual = []
    computed = []

    for embryo, n in n_slices.items():
        if n not in virtual_slices:
            virtual_slices[n] = make_virtual_slices(
                mel_expr_at_stage, sim_expr_at_matching,
                mel_atlas_pos.ix[:, :, mel_stage].T,
                n
            )
        actual.extend(ase.ix[target].select(startswith(embryo)))
        computed.extend(virtual_slices[n][1][0])

    vslice_25 = virtual_slices[25][1][0].copy()
    vslice_25[13:19] = np.nan
    vslice_25 = pd.Series(index=['virtual_sl{}'.format(i+1) for i in range(25)],
                          data=vslice_25)
    kw = pu.kwargs_ase_heatmap.copy()
    kw.pop('draw_row_labels')
    kw.pop('draw_name')
    kw['box_height'] = 60
    kw['total_width'] = 200
    pu.svg_heatmap(vslice_25,
                   'analysis/results/hb_atlas_ase_slice_25_pu_M{}S{}.svg'
                   .format(mel_atlas_expr.minor_axis.get_loc(mel_stage),
                           sim_atlas_expr.minor_axis.get_loc(sim_stage)),
Esempio n. 9
0
        'analysis/results/{prefix}peak{suffix}_fits'.format(
            prefix=args.prefix, suffix=args.suffix), )

    if args.print_keggs:
        synonyms = get_synonyms()
        wnt_genes = [line.strip() for line in open('prereqs/wnt.kegg.genes')]
        wnt_scores = pd.Series(index=synonyms[wnt_genes],
                               data=best_r2[synonyms[wnt_genes]])
        wnt_scores.index = ['dme:Dmel_' + CG for CG in wnt_genes]
        wnt_scores.index.name = '#dme'
        wnt_scores.name = 'svASE'
        (wnt_scores.sort_values(na_position='first').to_csv(
            'analysis/results/wnt_scores.tsv', sep='\t', header=True))

        all_cgs = synonyms.select(
            startswith(('CG1', 'CG2', 'CG3', 'CG4', 'CG5', 'CG6', 'CG7', 'CG8',
                        'CG9')))
        all_scores = pd.Series(index=all_cgs, data=best_r2[all_cgs])
        all_scores.index = ['dme:Dmel_' + CG for CG in all_cgs.index]
        all_scores.index.name = '#dme'
        all_scores.name = 'svASE'
        all_scores2 = all_scores.copy()
        all_scores2.index = [ix.split(':')[1] for ix in all_scores2.index]
        (all_scores.sort_values(na_position='first').dropna().to_csv(
            'analysis/results/all_svase_scores_cg.tsv', sep='\t', header=True))
        keggs = {
            line.split()[0]: line.split()[1].strip().strip(',').split(',')
            for line in open('prereqs/kegg_database.txt')
        }
        kegg_stats = Counter()
        kegg_pvals = Counter()
        for pathway in ProgressBar()(keggs):
Esempio n. 10
0
    from GetASEStats import slices_per_embryo
    virtual_slices = {}
    ase = (pd.read_table(
        path.join(cwd, 'analysis_godot/ase_summary_by_read.tsv'),
        **pd_kwargs).select(**sel_startswith(('melXsim', 'simXmel'))))
    n_slices = slices_per_embryo(ase)
    actual = []
    computed = []

    for embryo, n in n_slices.items():
        if n not in virtual_slices:
            virtual_slices[n] = make_virtual_slices(
                mel_expr_at_stage, sim_expr_at_matching,
                mel_atlas_pos.ix[:, :, mel_stage].T, n)
        actual.extend(ase.ix[target].select(startswith(embryo)))
        computed.extend(virtual_slices[n][1][0])

    vslice_25 = virtual_slices[25][1][0].copy()
    vslice_25[13:19] = np.nan
    vslice_25 = pd.Series(
        index=['virtual_sl{}'.format(i + 1) for i in range(25)],
        data=vslice_25)
    kw = pu.kwargs_ase_heatmap.copy()
    kw.pop('draw_row_labels')
    kw.pop('draw_name')
    kw['box_height'] = 60
    kw['total_width'] = 200
    pu.svg_heatmap(
        vslice_25,
        'analysis/results/hb_atlas_ase_slice_25_pu_M{}S{}.svg'.format(
Esempio n. 11
0
    if args.print_keggs:
        synonyms = get_synonyms()
        wnt_genes = [line.strip() for line in open('prereqs/wnt.kegg.genes')]
        wnt_scores = pd.Series(index=synonyms[wnt_genes],
                               data=best_r2[synonyms[wnt_genes]])
        wnt_scores.index = ['dme:Dmel_' + CG for CG in wnt_genes]
        wnt_scores.index.name = '#dme'
        wnt_scores.name = 'svASE'
        (wnt_scores
         .sort_values(na_position='first')
         .to_csv('analysis/results/wnt_scores.tsv',
                 sep='\t',
                 header=True)
        )

        all_cgs = synonyms.select(startswith(('CG1', 'CG2', 'CG3', 'CG4', 'CG5',
                                              'CG6', 'CG7', 'CG8', 'CG9')))
        all_scores = pd.Series(index=all_cgs,
                               data=best_r2[all_cgs])
        all_scores.index = ['dme:Dmel_' + CG for CG in all_cgs.index]
        all_scores.index.name = '#dme'
        all_scores.name = 'svASE'
        all_scores2 = all_scores.copy()
        all_scores2.index = [ix.split(':')[1] for ix in all_scores2.index]
        (all_scores
         .sort_values(na_position='first')
         .dropna()
         .to_csv('analysis/results/all_svase_scores_cg.tsv',
                 sep='\t',
                 header=True)
        )
        keggs = {line.split()[0]:line.split()[1].strip().strip(',').split(',')
Esempio n. 12
0
def get_diffs(expr, mel_spline, sim_spline, col_headers, offset=EXPR_MIN):
    """Compute earth-mover differences between parental and hybrid expression.

    ``expr`` is split by label prefix into the two parental groups
    (``melXmel_``, ``simXsim_``) and the two hybrid groups (``melXsim_``,
    ``simXmel_``).  ``offset`` is added to both operands of every group
    comparison.  The parental average is built from the two splines and
    compared against the hybrids as well.

    Returns:
        A 10-tuple, in this order: hybrid-vs-hybrid diffs, parental diffs,
        mel-vs-melXsim diffs, sim-vs-simXmel diffs, the parental average
        Series, average-vs-hybrid diffs, the maximum of the average, a list
        with the maximum of each of five named hybrid replicates, and the
        within-melXsim and within-simXmel diffs.
    """
    def with_prefix(prefix):
        # Entries of `expr` picked by the project's `startswith` selector.
        return expr.select(startswith(prefix))

    mel = with_prefix('melXmel_')
    sim = with_prefix('simXsim_')
    melXsim = with_prefix('melXsim_')
    simXmel = with_prefix('simXmel_')
    hybrids = with_prefix(('melXsim', 'simXmel'))

    # Group-vs-group comparisons; no explicit normer is passed to these.
    parental_diffs = dd.earth_mover_multi_rep(mel + offset, sim + offset)
    mel_hyb_diffs = dd.earth_mover_multi_rep(mel + offset, melXsim + offset)
    sim_hyb_diffs = dd.earth_mover_multi_rep(sim + offset, simXmel + offset)
    hyb_hyb_diffs = dd.earth_mover_multi_rep(melXsim + offset, simXmel + offset)

    # Replicate-vs-replicate comparisons inside each hybrid cross.
    within_melXsim_diff = dd.earth_mover_within(melXsim + offset)
    within_simXmel_diff = dd.earth_mover_within(simXmel + offset)

    # NOTE(review): `xs` is neither a parameter nor a local here, so it has to
    # come from module scope -- confirm it is defined before this is called.
    parental_mean = (mel_spline(xs) + sim_spline(xs)) / 2
    avgs = pd.Series(parental_mean, index=col_headers)

    # Unlike the comparisons above, this one uses no offset and normalises by
    # the maximum of `expr`; the averages are clipped to [0, 1e6] first.
    clipped_avgs = avgs.astype(float).clip(0, 1e6)
    avg_hyb_diffs = dd.earth_mover_multi_rep(
        clipped_avgs,
        hybrids,
        normer=lambda x: expr.max(),
    )
    avg_level = avgs.max()

    replicates = (
        'melXsim_cyc14C_rep1',
        'melXsim_cyc14C_rep2',
        'melXsim_cyc14C_rep3',
        'simXmel_cyc14C_rep1',
        'simXmel_cyc14C_rep2',
    )
    hyb_level = []
    for replicate in replicates:
        hyb_level.append(hybrids.select(startswith(replicate)).max())

    return (
        hyb_hyb_diffs,
        parental_diffs,
        mel_hyb_diffs,
        sim_hyb_diffs,
        avgs,
        avg_hyb_diffs,
        avg_level,
        hyb_level,
        within_melXsim_diff,
        within_simXmel_diff,
    )