コード例 #1
0
ファイル: Run.py プロジェクト: airanmehr/bio
 def one(method):
     """Merge every pickled run of ``method`` into a single pickle.

     Files under ``utl.outpath + 'ROC/runs/'`` whose name contains ``method``
     are read and concatenated; the result is pickled to
     ``utl.outpath + 'ROC/' + method``.  For the 'MarkovChain' method each
     frame is first reduced to ``(alt - null) * sign(s)``, with missing values
     replaced by 0 and the index sorted.
     """

     def signed_score(frame):
         # (alt - null) carrying the sign of the ``s`` column.
         signed = (frame.alt - frame.null) * frame.s.apply(np.sign)
         return signed.fillna(0).sort_index()

     runs_dir = utl.outpath + 'ROC/runs/'
     names = pd.Series(utl.files(runs_dir))
     selected = names[names.apply(lambda name: method in name)]

     frames = []
     for name in selected:
         frame = pd.read_pickle(runs_dir + name)
         if method == 'MarkovChain':
             frame = signed_score(frame)
         frames.append(frame)

     pd.concat(frames).to_pickle(utl.outpath + 'ROC/' + method)
コード例 #2
0
ファイル: Utils.py プロジェクト: airanmehr/bio
def loadAllScores(h=None, scores=True):
    """Load pickled results from ``utl.outpath + 'real/HMM/'``.

    :param h: when None, every file in the directory is read and the frames
        are concatenated column-wise.  Otherwise the file named
        ``h{h:E}.df`` is read and its ``h`` entry selected.
    :param scores: only used when ``h`` is given; if true the selection is
        reduced to ``(alt - null) * sign(s)``, otherwise it is returned as is.
    :return: the concatenated frame, the selected entry, or its signed score.
    """
    hmm_dir = utl.outpath + 'real/HMM/'

    if h is not None:
        selected = pd.read_pickle('{}h{:E}.df'.format(hmm_dir, h))[h]
        if not scores:
            return selected
        return (selected.alt - selected.null) * selected.s.apply(np.sign)

    frames = [pd.read_pickle(hmm_dir + name) for name in utl.files(hmm_dir)]
    return pd.concat(frames, axis=1)
コード例 #3
0
ファイル: GeneAnalysis.py プロジェクト: airanmehr/bio
def saveLatex():
    """Render every ``.tsv`` file of the gowinda output directory as a LaTeX table.

    For each file, columns 0 and 4-9 are kept and renamed, rows with fewer
    than 3 hits are dropped, the ``-logPval`` column is replaced by the
    negated, one-decimal log10 of its values, and commas in ``Genes`` are
    replaced by runs of spaces.  All columns except the first (``GO``) are
    handed to ``utl.DataframetolaTexTable`` and written next to the paper
    sources with a ``.tex`` extension.
    """
    gowinda_dir = '/home/arya/out/real/gowinda/'
    kept_columns = [0, 4, 5, 6, 7, 8, 9]
    column_names = ['GO', '-logPval', 'Hits', 'VarGenes', 'TotGenes', 'Term', 'Genes']
    alignment = list('cccc') + ['p{2in}', 'p{2in}']

    for name in utl.files(gowinda_dir):
        if not name.endswith('.tsv'):
            continue
        # name='cand.local.damped.0.out.tsv'
        table = pd.read_csv(gowinda_dir + name, sep='\t', header=None)
        a = table[kept_columns]
        a.columns = column_names
        a = a[a.Hits >= 3]

        rounded_log = a['-logPval'].apply(np.log10).round(1)
        a['-logPval'] = -rounded_log
        a['Genes'] = a['Genes'].apply(lambda genes: genes.replace(',', '          '))

        tex_path = utl.paperPath + 'new/' + name.replace('.tsv', '.tex')
        utl.DataframetolaTexTable(a.iloc[:, 1:], fname=tex_path, alignment=alignment)
コード例 #4
0
ファイル: Utils.py プロジェクト: airanmehr/bio
def load(dampHighCov=True):
    """Load the hypoxia read-count table, rebuilding the pickle caches if needed.

    The cached pickle (``data.damped.df`` when ``dampHighCov`` is true,
    ``data.df`` otherwise) is read and passed through ``add_gen4_hyperoxia``.
    If the cache cannot be read, the table is rebuilt from the tab-separated
    files in the ``readcounts/`` directory, both pickles are written, and the
    requested variant is returned the same way.

    Fixes relative to the previous implementation:
      * the raw table is always written to ``data.df`` and the damped one to
        ``data.damped.df``; previously, with ``dampHighCov=True``, the raw
        table overwrote ``data.damped.df`` and the damped one ended up in
        ``data.damped.damped.df``;
      * the rebuild path now returns the data instead of ``None``;
      * errors raised by ``add_gen4_hyperoxia`` are no longer swallowed.

    :param dampHighCov: if true, return the table in which sites whose depth
        exceeds the per-group cutoff have their C and D counts set to 0.
    :return: whatever ``add_gen4_hyperoxia`` returns for the selected table.
    """

    def dampHighCoverage(x):
        # ``x`` is one column group produced by the groupby over the first
        # three column levels; ``x.name`` is that group's key.
        key = (x.name[0], x.name[1], x.name[2])
        depth = x[x.name].D
        # Cutoff: 99.9% depth quantile passed through utl.ceilto(..., 10), capped at 500.
        cutoff = min(500, utl.ceilto(depth.quantile(0.999), 10))
        too_deep = (depth > cutoff).values
        x.loc[too_deep, key + ("C",)] = 0
        x.loc[too_deep, key + ("D",)] = 0
        return x

    raw_fname = "/home/arya/storage/Data/Dmelanogaster/Hypoxia/data.df"
    damped_fname = raw_fname.replace(".df", ".damped.df")
    fname = damped_fname if dampHighCov else raw_fname

    try:
        cached = pd.read_pickle(fname)
    except Exception:
        # Any unreadable or missing cache triggers a rebuild below.
        cached = None
    if cached is not None:
        return add_gen4_hyperoxia(cached)

    print("Loading..")
    path = "/home/arya/storage/Data/Dmelanogaster/Hypoxia/readcounts/"

    def read_counts(names, f):
        # One column per file: read it, index by (CHROM, POS), reduce it with
        # ``f`` and name the result after the part of the file name before
        # the first dot.
        return pd.concat(
            [
                f(pd.read_csv(path + n, sep="\t", na_values=".").set_index(["CHROM", "POS"])).rename(
                    n.split(".")[0]
                )
                for n in names
            ],
            axis=1,
        )

    def fixcols(a):
        # Turn every column label into a (GEN, POP, REP) tuple.
        parsed = []
        for c in a.columns:
            label = c
            if len(c) == 2:
                # Two-character labels carry no generation; prefix "F200".
                label = "F200" + label
            if "HOF" in c or "LOF" in c:
                label = "F200" + label[0] + label[-1]
            parsed.append((int(label[1:-2]), label[-2], int(label[-1])))
        a.columns = pd.MultiIndex.from_tuples(parsed, names=["GEN", "POP", "REP"]).reorder_levels([1, 0, 2])
        a = a.reset_index().replace("dmel_mitochondrion_genome", "M").set_index(["CHROM", "POS"])
        a = a.T.reset_index()
        # Generation 200 is relabelled as 180.
        a["GEN"] = a["GEN"].replace(200, 180)
        a = a.set_index(["POP", "GEN", "REP"]).T
        return a.sort_index(axis=1).sort_index()

    def fff(x):
        # Sum a comma-separated list of numbers; cells that are not strings
        # or do not parse as numbers yield None.
        try:
            return np.sum([float(v) for v in x.split(",")])
        except (AttributeError, ValueError):
            return None

    files = pd.Series(utl.files(path))
    r = fixcols(read_counts(files[files.apply(lambda x: "RO." in x)], f=lambda x: x.astype(float).sum(1)))
    a = fixcols(
        read_counts(files[files.apply(lambda x: "AO." in x)], f=lambda x: x.applymap(fff).astype(float).sum(1))
    )
    # C is the summed "AO." table, D is the sum of the "AO." and "RO." tables.
    d = a + r
    a = pd.concat([a, d], axis=1, keys=["C", "D"]).reorder_levels([1, 2, 3, 0], axis=1).sort_index(axis=1)
    a.columns.set_names(["POP", "GEN", "REP", "READ"], inplace=True)
    a.to_pickle(raw_fname)

    # Damp a fresh copy read back from disk so that ``a`` stays untouched.
    damped = (
        pd.read_pickle(raw_fname)
        .groupby(level=list(range(3)), axis=1)
        .apply(dampHighCoverage)
        .fillna(0)
    )
    damped.to_pickle(damped_fname)

    return add_gen4_hyperoxia(damped if dampHighCov else a)
コード例 #5
0
ファイル: msmsData.py プロジェクト: airanmehr/bio
        gens = np.arange(start, start + 50 + 1, step);
        origin_count = (a - np.random.rand()).abs().idxmin() + 100
        fname = path + 'sim{}.OC{:.0f}.s{:.0E}.g'.format(i, origin_count, 0)
        Simulation.MSMSSelection(msms, Ne, n, numReplicates, theta, rho, window_size, 0, origin_count,
                                 posUnderSelection, gens, fname)
        # print  Simulation.MSMS.load(fname +'1.msms')[0][25000].mean(),Simulation.MSMS.load(fname +'51.msms')[0][25000].mean()


# For each selection strength, pick 100 start generations and run ``one`` on
# every (index, s, start) triple using 4 worker processes.
for s in [0.005, 0.01, 0.05, 0.1]:
    T = int(2 * np.log(2 * Ne * s) / s)
    if finale:
        # Every replicate starts at the same generation, int(1.7 * T).
        starts = np.ones(100) * int(1.7 * T)
        print(starts)
    else:
        # 100 distinct start generations drawn from 1..T, in increasing order.
        starts = np.sort(np.random.choice(T, 100, replace=False)) + 1
    args = [(i, s, start) for i, start in enumerate(starts)]
    # A pool is still created per iteration (so workers are forked at the same
    # point as before), but it is now closed and joined instead of leaking
    # four worker processes on every pass.
    pool = multiprocessing.Pool(4)
    try:
        pool.map(one, args)
    finally:
        pool.close()
        pool.join()

# Collect the simulation outputs: file names are split on '.', grouped by
# their first and third fields, and for each group the per-file means are
# assembled into one frame indexed by the integer parsed from the fourth field.
files = utl.files(path)
df = pd.DataFrame([fname.split('.') for fname in files])
res = []
for name, a in df.groupby([0, 2]):
    member_files = a.apply(lambda parts: '.'.join(parts), axis=1)
    means = []
    for f in member_files:
        means.append(Simulation.MSMS.load(path + f)[0].mean())
    data = pd.concat(means, axis=1)
    data.columns = a.iloc[:, 3].apply(lambda field: int(field[1:]))
    data.columns.name = None
    data = data.T.sort_index()
    # Key: (number after the first character of field 2, integer after the
    # first three characters of field 0).
    key = (float(name[1][1:]), int(name[0][3:]))
    res.append((key, data))
unzipped = list(zip(*res))
df = pd.Series(list(unzipped[1]), index=pd.MultiIndex.from_tuples(unzipped[0], names=['s', 'i']))
out_name = ('sweep', 'finale')[finale]
df.sort_index().to_pickle(utl.outpath + 'msmsSelection/{}.df'.format(out_name))