def one(method):
    """Merge all per-run ROC pickles of ``method`` into a single pickle.

    Every file under ``<utl.outpath>ROC/runs/`` whose name contains ``method``
    is unpickled, the pieces are concatenated and the result is written to
    ``<utl.outpath>ROC/<method>``.

    For ``method == 'MarkovChain'`` each piece is first reduced to a signed
    score, ``(alt - null) * sign(s)``, with missing values set to 0 and the
    index sorted. Every other method is concatenated unchanged.
    """
    runs_dir = utl.outpath + 'ROC/runs/'
    run_files = pd.Series(utl.files(runs_dir))
    run_files = run_files[run_files.apply(lambda fname: method in fname)]

    def signed_score(frame):
        # Likelihood difference carrying the sign of the estimated s.
        score = (frame.alt - frame.null) * frame.s.apply(np.sign)
        return score.fillna(0).sort_index()

    def unchanged(frame):
        return frame

    if method == 'MarkovChain':
        transform = signed_score
    else:
        transform = unchanged

    pieces = [transform(pd.read_pickle(runs_dir + fname)) for fname in run_files]
    pd.concat(pieces).to_pickle(utl.outpath + 'ROC/' + method)
def loadAllScores(h=None, scores=True):
    """Load HMM results from ``<utl.outpath>real/HMM/``.

    Parameters
    ----------
    h : number or None
        When ``None``, every pickle in the directory is loaded and the
        pieces are concatenated column-wise. Otherwise the single file
        ``h<h in E-notation>.df`` is read and its ``h`` entry is selected.
    scores : bool
        Only used when ``h`` is given. If true, the selection is reduced to
        ``(alt - null) * sign(s)``; if false it is returned as loaded.
    """
    hmm_dir = utl.outpath + 'real/HMM/'

    if h is None:
        frames = [pd.read_pickle(hmm_dir + fname) for fname in utl.files(hmm_dir)]
        return pd.concat(frames, axis=1)

    result = pd.read_pickle('{}h{:E}.df'.format(hmm_dir, h))[h]
    if not scores:
        return result
    return (result.alt - result.null) * result.s.apply(np.sign)
def saveLatex():
    """Write one LaTeX table per Gowinda ``.tsv`` result file.

    For every ``*.tsv`` file in ``/home/arya/out/real/gowinda/``:

    * columns 0 and 4-9 are read (no header row) and named ``GO``,
      ``-logPval``, ``Hits``, ``VarGenes``, ``TotGenes``, ``Term``, ``Genes``;
    * rows with fewer than 3 hits are dropped;
    * the ``-logPval`` column is replaced by ``-round(log10(value), 1)``;
    * commas in ``Genes`` are replaced by spaces;
    * every column except ``GO`` is handed to ``utl.DataframetolaTexTable``,
      which writes ``<utl.paperPath>new/<name>.tex``.
    """
    gowinda_dir = '/home/arya/out/real/gowinda/'
    suffix = '.tsv'

    for name in utl.files(gowinda_dir):
        if not name.endswith(suffix):
            continue
        # name='cand.local.damped.0.out.tsv'
        table = pd.read_csv(gowinda_dir + name, sep='\t', header=None)[[0, 4, 5, 6, 7, 8, 9]]
        table.columns = ['GO', '-logPval', 'Hits', 'VarGenes', 'TotGenes', 'Term', 'Genes']

        # Explicit copy: columns are assigned below, and assigning into a
        # boolean-filtered slice triggers pandas' SettingWithCopy hazard.
        table = table[table.Hits >= 3].copy()
        table['-logPval'] = -table['-logPval'].apply(np.log10).round(1)
        table['Genes'] = table['Genes'].apply(lambda genes: genes.replace(',', ' '))

        # Swap only the trailing extension; str.replace would also rewrite a
        # '.tsv' occurring earlier in the file name.
        tex_name = name[:-len(suffix)] + '.tex'
        utl.DataframetolaTexTable(
            table.iloc[:, 1:],
            fname=utl.paperPath + 'new/' + tex_name,
            alignment=list('cccc') + ['p{2in}', 'p{2in}'],
        )
def load(dampHighCov=True):
    """Load the Hypoxia read-count table, rebuilding the pickle cache on a miss.

    Two pickles are kept next to each other: ``data.df`` (raw counts) and
    ``data.damped.df`` (same table with very high-coverage sites zeroed).
    The requested one is read and passed through ``add_gen4_hyperoxia``.
    If it cannot be read, both pickles are regenerated from the per-sample
    ``RO.``/``AO.`` files in the ``readcounts/`` directory, then the requested
    one is loaded.

    Parameters
    ----------
    dampHighCov : bool
        If true, use the damped table; otherwise the raw one.

    Returns
    -------
    Whatever ``add_gen4_hyperoxia`` returns for the loaded table.
    """
    raw_fname = "/home/arya/storage/Data/Dmelanogaster/Hypoxia/data.df"
    # Keep the two cache names separate so a rebuild never writes the raw
    # table under the damped name.
    damped_fname = raw_fname.replace(".df", ".damped.df")
    fname = damped_fname if dampHighCov else raw_fname
    path = "/home/arya/storage/Data/Dmelanogaster/Hypoxia/readcounts/"

    def damp_high_coverage(x):
        # x is one (POP, GEN, REP) column group; x.name is that group key.
        key = (x.name[0], x.name[1], x.name[2])
        depth = x[x.name].D
        # Cutoff: the 99.9th depth percentile passed through utl.ceilto(..., 10),
        # capped at 500.
        cutoff = min(500, utl.ceilto(depth.quantile(0.999), 10))
        too_deep = (depth > cutoff).values
        x.loc[too_deep, key + ("C",)] = 0
        x.loc[too_deep, key + ("D",)] = 0
        return x

    def read_counts(names, f):
        # One column per file: reduce each table with `f` and name the
        # resulting Series after the file-name prefix before the first '.'.
        return pd.concat(
            [
                f(pd.read_csv(path + n, sep="\t", na_values=".").set_index(["CHROM", "POS"])).rename(
                    n.split(".")[0]
                )
                for n in names
            ],
            axis=1,
        )

    def parse_sample(name):
        # Normalise a sample label to 'F<gen><pop><rep>' and split it into
        # (GEN, POP, REP). Two-character labels and 'HOF'/'LOF' labels carry
        # no generation and are assigned generation 200.
        if len(name) == 2:
            name = "F200" + name
        elif "HOF" in name or "LOF" in name:
            name = "F200" + name[0] + name[-1]
        return (int(name[1:-2]), name[-2], int(name[-1]))

    def fixcols(a):
        tuples = [parse_sample(c) for c in a.columns]
        a.columns = pd.MultiIndex.from_tuples(tuples, names=["GEN", "POP", "REP"]).reorder_levels([1, 0, 2])
        a = a.reset_index().replace("dmel_mitochondrion_genome", "M").set_index(["CHROM", "POS"])
        # Relabel generation 200 as 180 via a transpose round-trip.
        a = a.T.reset_index()
        a["GEN"] = a["GEN"].replace(200, 180)
        a = a.set_index(["POP", "GEN", "REP"]).T
        return a.sort_index(axis=1).sort_index()

    def sum_alleles(x):
        # Sum a comma-separated list of counts; anything that is not a
        # parsable string (e.g. NaN) yields None, which becomes NaN below.
        # NOTE(review): cells already parsed as numbers also land here and
        # are dropped -- confirm the AO columns are always read as strings.
        try:
            return sum(float(v) for v in x.split(","))
        except (AttributeError, ValueError):
            return None

    try:
        cached = pd.read_pickle(fname)
    except Exception:
        # Cache missing or unreadable: fall through and rebuild it.
        print("Loading..")
    else:
        return add_gen4_hyperoxia(cached)

    files = pd.Series(utl.files(path))
    ref = fixcols(
        read_counts(files[files.apply(lambda n: "RO." in n)], f=lambda x: x.astype(float).sum(1))
    )
    alt = fixcols(
        read_counts(
            files[files.apply(lambda n: "AO." in n)],
            f=lambda x: x.applymap(sum_alleles).astype(float).sum(1),
        )
    )
    total = alt + ref

    # READ level: 'C' holds the summed AO counts, 'D' holds AO + RO.
    counts = (
        pd.concat([alt, total], axis=1, keys=["C", "D"])
        .reorder_levels([1, 2, 3, 0], axis=1)
        .sort_index(axis=1)
    )
    counts.columns.set_names(["POP", "GEN", "REP", "READ"], inplace=True)
    counts.to_pickle(raw_fname)

    # Damp a freshly unpickled copy so the raw table written above is
    # independent of the in-place edits made by damp_high_coverage.
    damped = (
        pd.read_pickle(raw_fname)
        .groupby(level=[0, 1, 2], axis=1)
        .apply(damp_high_coverage)
        .fillna(0)
    )
    damped.to_pickle(damped_fname)

    # Return through the same path as a cache hit.
    return add_gen4_hyperoxia(pd.read_pickle(fname))
gens = np.arange(start, start + 50 + 1, step); origin_count = (a - np.random.rand()).abs().idxmin() + 100 fname = path + 'sim{}.OC{:.0f}.s{:.0E}.g'.format(i, origin_count, 0) Simulation.MSMSSelection(msms, Ne, n, numReplicates, theta, rho, window_size, 0, origin_count, posUnderSelection, gens, fname) # print Simulation.MSMS.load(fname +'1.msms')[0][25000].mean(),Simulation.MSMS.load(fname +'51.msms')[0][25000].mean() for s in [0.005, 0.01, 0.05, 0.1]: T = int(2 * np.log(2 * Ne * s) / s) if finale: starts = np.ones(100) * int(1.7 * T) print starts else: starts = np.sort(np.random.choice(T, 100, replace=False)) + 1 args = [(i, s, start) for i, start in enumerate(starts)] multiprocessing.Pool(4).map(one, args) files = utl.files(path) df = pd.DataFrame(map(lambda x: x.split('.'), files)) res = [] for name, a in df.groupby([0, 2]): data = pd.concat([(Simulation.MSMS.load(path + f)[0].mean()) for f in a.apply(lambda x: '.'.join(x), axis=1)], axis=1) data.columns = a.iloc[:, 3].apply(lambda x: int(x[1:])) data.columns.name = None data = data.T.sort_index() res += [((float(name[1][1:]), int(name[0][3:])), data)] df = pd.Series(list(zip(*res)[1]), index=pd.MultiIndex.from_tuples(zip(*res)[0], names=['s', 'i'])) df.sort_index().to_pickle(utl.outpath + 'msmsSelection/{}.df'.format(('sweep', 'finale')[finale]))