def onexp(args):
    """Merge the per-chromosome XP-EHH result files of one population pair.

    Parameters
    ----------
    args : tuple
        ``(pop, root)``. ``pop`` is a dotted name; the part before its first
        dot names the sub-folder of ``root`` that is passed to
        ``utl.mergeResults`` as ``path``.

    Notes
    -----
    ``outpath`` is not defined here; it is read from the enclosing/module
    scope. Nothing is returned.
    """
    pop, root = args
    folder = pop.split('.')[0]
    out = '{}.{}'.format(pop, 'xpehh')
    # The '{}' in the pattern is left unformatted for mergeResults to fill in
    # (presumably with the chromosome id -- confirm against utl.mergeResults).
    pattern = 'chr{}.' + out + '.gz'
    utl.mergeResults(
        path='{}{}/'.format(root, folder),
        f=pattern,
        out=out,
        outpath=outpath,
    )
def saveFolder(d):
    """Load every ``.norm`` file found in folder ``path + d`` into one table.

    The file names are annotated with method, population, cross-population
    and chromosome, each file is read through ``load`` and the per-file
    series are unstacked into columns keyed by (method, POP, POPXP).

    ``path``, ``getMethod``, ``getPopXP`` and ``load`` are not defined here;
    they are taken from the enclosing/module scope.

    Returns ``None`` when the folder holds no ``.norm`` file.
    """
    folder = path + d
    names = pd.Series(utl.files(folder))
    table = pd.DataFrame(names[names.apply(lambda name: name[-5:] == '.norm')])
    if not table.size:
        return

    # Column 0 holds the file name; the remaining columns are derived from it.
    table['method'] = table[0].apply(getMethod)
    table['POP'] = d
    table['POPXP'] = 'NA'
    is_xp = (table.method == 'xpehh')
    table.loc[is_xp, 'POPXP'] = table.loc[is_xp, 0].apply(getPopXP)
    # Chromosome id: the text before the first dot with its first three
    # characters dropped (presumably the 'chr' prefix).
    table['CHROM'] = table[0].apply(
        lambda name: utl.INT(name.split('.')[0][3:]))
    table.set_index(['method', 'CHROM', 'POP', 'POPXP'], inplace=True)

    grouped = table.groupby(level=[0, 1, 2, 3])
    loaded = grouped.apply(lambda grp: load(grp.loc[grp.name], folder))
    return loaded.unstack(['method', 'POP', 'POPXP']).sort_index()
def getCHROM(VCFin):
    """Return the chromosome id found in the first data line of a VCF file.

    Runs ``zgrep -v "#" -m1 <VCFin> | cut -f1`` through the shell, i.e. takes
    the first field of the first line that contains no ``#``. stderr is
    merged into stdout, and only the last line of the combined output is
    kept before it is converted with ``utl.INT``.

    NOTE(review): ``VCFin`` is interpolated into the shell command unquoted,
    so a path containing spaces or shell metacharacters will not work.
    """
    command = 'zgrep -v "#" -m1 {} | cut -f1'.format(VCFin)
    process = Popen([command], stdout=PIPE, stdin=PIPE, stderr=STDOUT,
                    shell=True)
    output = process.communicate()[0]
    last_line = output.strip().split('\n')[-1]
    return utl.INT(last_line)
def mergeidf(path='/home/arya/storage/Data/Human/scan/selscan/'):
    """Concatenate all pickled ``.idf`` frames of ``path`` into ``panel.idf``.

    Every file in ``path`` whose name ends in ``.idf`` (except a previously
    written ``panel.idf``) is unpickled; the frames are joined column-wise,
    the columns are sorted and the result is pickled to
    ``path + 'panel.idf'``. Nothing is returned.
    """
    print('merging idfs')
    names = pd.Series(utl.files(path))
    names = names[names.apply(lambda name: name[-4:] == '.idf')]
    names = names[names != 'panel.idf']  # do not merge an earlier output

    frames = [pd.read_pickle(path + name) for name in names.values[:]]
    panel = pd.concat(frames, axis=1).sort_index(axis=1)
    panel.to_pickle(path + 'panel.idf')
    # Fraction of missing values per column; the result is computed but not
    # used or returned (kept as in the original).
    panel.isnull().mean().sort_values()
def scan1000GP(pop, wins=[50, 200, 500, 1000]):
    """Scan the genome of one population at several window sizes and pickle it.

    Parameters
    ----------
    pop : str
        Population label looked up in the panel returned by
        ``utl.VCF.loadPanel()``, or ``'ALL'``.
    wins : list of int
        Window sizes; each is multiplied by 1000 before being handed to
        ``scanGenome``. The list is only read, never modified.

    The per-window results are concatenated column-wise (keyed by ``wins``)
    and written to ``<parent of utl.dataPath1000GP>/scan/<pop>.SFS.df``.
    Nothing is returned.
    """
    # 2504 is used as the sample count for 'ALL' (presumably the size of the
    # whole panel -- confirm); otherwise the count comes from the panel.
    n = (2504 if pop == 'ALL'
         else utl.VCF.loadPanel().groupby('pop').size()[pop])

    per_window = [scanGenome(w * 1000, pop, n) for w in wins]
    df = pd.concat(per_window, axis=1, keys=wins)

    target = utl.parentdir(utl.dataPath1000GP) + '/scan/{}.SFS.df'.format(pop)
    df.to_pickle(target)
def scanChrom(args):
    """Run the windowed estimate scan on one chromosome.

    Parameters
    ----------
    args : tuple
        ``(CHROM, winSize, pop, n)``. ``CHROM`` is either already-loaded data
        or a ``str``/``int`` identifier, in which case it is loaded with
        ``loadChrom(CHROM, pop)`` first.

    Returns
    -------
    Whatever ``utl.scanGenome`` returns for the given window size.
    """
    CHROM, winSize, pop, n = args
    if isinstance(CHROM, (str, int)):
        CHROM = loadChrom(CHROM, pop)

    def estimate(window):
        # n is doubled before being passed on (presumably two haplotypes per
        # sample -- confirm against est.Estimate.getAllEstimatesX).
        return est.Estimate.getAllEstimatesX(window, n=n * 2)

    return utl.scanGenome(CHROM, uf=estimate, winSize=winSize)
def mergeXP():
    """Merge per-chromosome XP-EHH results for a fixed set of population pairs.

    For each pair ``A.B`` the merge is first attempted in
    ``/home/arya/POP/HAT/A+B/A/`` and, if that fails, in
    ``/home/arya/POP/A/``. When both attempts fail, ``Error in A.B`` is
    printed and the next pair is processed. Nothing is returned.

    The fallback path used to be built with ``x.plit('.')``, a typo for
    ``x.split('.')`` that raised ``AttributeError`` on every call, so the
    second location was never tried.
    """
    # The full list of pairs is still read from disk (and still fails loudly
    # if the file is missing), but only the two pairs below are processed.
    all_pairs = pd.read_csv(
        '/home/arya/workspace/bio/Scripts/LearningSelection/XPSFS/pops',
        header=None).iloc[:, 0]
    # for x in all_pairs:
    for x in ['CEU.YRI', 'CHB.YRI']:
        f = 'chr{}.xpehh.' + x + '.gz'
        out = 'xpehh.{}'.format(x)
        first_pop = x.split('.')[0]
        candidates = [
            '/home/arya/POP/HAT/{}/{}/'.format(x.replace('.', '+'), first_pop),
            '/home/arya/POP/{}/'.format(first_pop),
        ]
        for path in candidates:
            try:
                utl.mergeResults(path=path, f=f, out=out)
            except Exception:
                # Best effort: fall through to the next candidate location.
                continue
            break
        else:
            # No candidate location could be merged.
            print('Error in {}'.format(x))
def SAVE(d):
    """Collect the normalized scan files of folder ``d`` and pickle them.

    All ``*.norm`` files under ``path + d`` are loaded into one DataFrame
    that is pickled to ``<outpath><d>.df``. The absolute values of that frame
    are then passed through ``utl.scanGenome`` and the result is pickled next
    to it with the ``.idf`` extension.

    Parameters
    ----------
    d : str
        Folder name, also used as the population label and output file stem.

    Notes
    -----
    * ``path`` is not defined in this function; it is read from module scope.
    * A folder without ``.norm`` files is now reported and skipped; before,
      ``saveFolder`` returned ``None`` and the following ``to_pickle`` call
      raised ``AttributeError``.
    * A failure of the ``.idf`` step is still non-fatal, but it is now
      printed instead of being silently discarded.
    """
    POPS = None  # optional whitelist of folders; None disables the filter
    outpath = '/home/arya/storage/Data/Human/scan/selscan/'

    def load(x, path):
        """Read one result file into a Series indexed by POS.

        ``x`` is a row whose name is the (method, CHROM, POP, POPXP) index
        tuple and whose element 0 is the file name.
        """
        method, chrom, _, _ = x.name
        # One leading row is skipped for xpehh files (presumably a header
        # line -- confirm against the file format).
        skp = (0, 1)[method == 'xpehh']
        raw = pd.read_csv(path + '/' + x[0], skiprows=skp, sep='\t',
                          header=None)
        # Second column becomes the index, second-to-last column the values.
        a = raw.iloc[:, [1, -2]].set_index(1).iloc[:, 0].rename(method)
        a.index.name = 'POS'
        return a

    def getMethod(x):
        """Return the first of 'ihs', 'nsl', 'xpehh' contained in ``x``."""
        for tag in ('ihs', 'nsl', 'xpehh'):
            if tag in x:
                return tag

    def getPopXP(x):
        """Return the text between the first '_' and the following '.'."""
        return x.split('_')[1].split('.')[0]

    print(d)

    def saveFolder(d):
        """Build the combined table for folder ``d``; None if nothing found."""
        folder = path + d
        a = pd.Series(utl.files(folder))
        a = pd.DataFrame(a[a.apply(lambda x: x[-5:] == '.norm')])
        if not a.size:
            return
        a['method'] = a[0].apply(getMethod)
        a['POP'] = d
        a['POPXP'] = 'NA'
        I = (a.method == 'xpehh')
        a.loc[I, 'POPXP'] = a.loc[I, 0].apply(getPopXP)
        a['CHROM'] = a[0].apply(lambda x: utl.INT(x.split('.')[0][3:]))
        a.set_index(['method', 'CHROM', 'POP', 'POPXP'], inplace=True)
        grouped = a.groupby(level=[0, 1, 2, 3])
        loaded = grouped.apply(lambda x: load(x.loc[x.name], folder))
        return loaded.unstack(['method', 'POP', 'POPXP']).sort_index()

    if POPS is not None and d not in POPS:
        return

    fout = outpath + d + '.df'
    table = saveFolder(d)
    if table is None:
        # Nothing to save for this folder.
        print('No .norm files found for {}'.format(d))
        return
    table.to_pickle(fout)

    try:
        print(table)
        scanned = utl.scanGenome(pd.read_pickle(fout).abs())
        scanned.to_pickle(fout.replace('.df', '.idf'))
    except Exception as err:
        # The .idf file is optional; report the failure and carry on.
        print('Could not create {}: {}'.format(
            fout.replace('.df', '.idf'), err))
def load(f):
    """Load one scan output file as a Series indexed by (CHROM, POS).

    The chromosome id is parsed from the text between ``'chr'`` and the next
    dot in ``f``. The normalized companion file
    (``.out`` replaced by ``.out.100bins.norm``) is tried first; if reading
    it fails for any reason, ``f`` itself is read with its first row skipped.
    """
    CHROM = utl.INT(f.split('chr')[1].split('.')[0])

    def one(f, CHROM, skiprows=0):
        """Read ``f`` and return its second-to-last column keyed by CHROM/POS."""
        print(f)
        raw = pd.read_csv(f, sep='\t', header=None, skiprows=skiprows)
        # Second column is the position, second-to-last column the score.
        by_pos = raw.iloc[:, [1, -2]].set_index(1)
        a = pd.concat([by_pos], keys=[CHROM]).iloc[:, 0]
        a.index.names = ['CHROM', 'POS']
        return a

    normalized = f.replace('.out', '.out.100bins.norm')
    try:
        return one(normalized, CHROM, 0)
    except:
        # NOTE(review): deliberately catches everything, as in the original;
        # any failure on the normalized file falls back to the raw file.
        return one(f, CHROM, 1)
], keys=[CHROM]).iloc[:, 0] a.index.names = ['CHROM', 'POS'] return a try: a = one(f.replace('.out', '.out.100bins.norm'), CHROM, 0) except: a = one(f, CHROM, 1) return a if __name__ == "__main__": VCF, VCFXP, method, pop, panel, proc, popxp = options.vcf, options.vcfXP, options.method, options.pop, options.panel, options.proc, options.popxp # proc=10;VCF='/pedigree2/projects/HA_selection2/Beagle/filtered/chr2.1kg.phase3.v5a.vcf.gz';method='ihs';pop='CEU';panel='/home/arya/HA_selection2/Beagle/panel' #proc=10;VCF='/pedigree2/projects/HA_selection2/1000GP/hg19/POP/CEU/chr22.vcf.gz';method='ihs';pop=None;panel='/home/arya/HA_selection2/Beagle/panel' #proc=10;VCF='/pedigree2/projects/HA_selection2/Kyrgyz/hg19/phased/chr22.vcf.gz';method='ihs';pop='Sick';popxp='Healthy';panel='~/HA_selection2/Kyrgyz/kyrgyz.panel' #proc=1;VCF='VCF=/pedigree2/projects/HA_selection2/1000GP/hg19/POP/KGZ/phased/HAPH/chr22.vcf.gz';pop='HAPH';method='nsl'; popxp=None;panel='~/HA_selection2/Kyrgyz/panel/kyrgyz.panel' #split() if popxp is not None or VCFXP is not None: out = scanXP(VCF, VCFXP, pop, popxp, panel, proc) else: out = scan(VCF, method, pop, panel, proc) print 'Normalizing...', out # os.system("grep -v 'nan' {0} > {0}.tmp && mv {0}.tmp {0} && {1} --{2} --files {0}".format(out,selscanNorm,method.replace('nsl','ihs'))) # for x in $(ls * out); do grep -v 'nan' $x > $x.tmp & & mv $x.tmp $x & & ~ / workspace / bio / Scan / selscan / bin / linux / norm --xpehh --files $x; done pops = (pop, '{}.{}'.format(pop, popxp))[method == 'xpehh'] f = os.path.dirname(out) + '/chr{}.{}.{}.gz'.format( utl.INT(out.split('chr')[1].split('.')[0]), pops, method) utl.gz.save(load(out).rename(method), f)
def one(args):
    """Merge the per-chromosome result files of one population and method.

    Parameters
    ----------
    args : tuple
        ``(pop, method)``. The files are looked up under
        ``/home/arya/POP/<pop>/``.

    Notes
    -----
    ``outpath`` is not defined here; it is read from the enclosing/module
    scope. Nothing is returned.
    """
    pop, method = args
    out = '{}.{}'.format(pop, method)
    # The '{}' in the pattern is left unformatted for mergeResults to fill in
    # (presumably with the chromosome id -- confirm against utl.mergeResults).
    pattern = 'chr{}.' + out + '.gz'
    utl.mergeResults(
        path='/home/arya/POP/{}/'.format(pop),
        f=pattern,
        out=out,
        outpath=outpath,
    )
pd.options.display.max_rows = 20
pd.options.display.expand_frame_repr = False
import seaborn as sns
import pylab as plt
import matplotlib as mpl
import UTILS.Util as utl
import os
import sys
import UTILS.Estimate as est

home = os.path.expanduser('~') + '/'

# Script: windowed SFS statistics per super-population for one chromosome.
# The chromosome id is the first command-line argument.
#CHROM=22
CHROM = sys.argv[1]
fname = '/home/arya/HA_selection2/Beagle/filtered/chr{}.1kg.phase3.v5a.aa.df'.format(
    CHROM)

# Number of panel entries per super-population; its index selects the
# columns of the pickled frame below.
pop = utl.VCF.loadPanel().groupby('super_pop').size()
pop  # bare expression without effect (kept from the original)
df = pd.read_pickle(fname)[pop.index]
winSize = 50000


def f(x):
    """Scan the columns of one super-population group with its own size."""
    group = x.name

    def estimate(X):
        return est.Estimate.getEstimate(X,
                                        n=pop[group],
                                        bins=20,
                                        removeFixedSites=True,
                                        normalizeTajimaD=False)

    return pd.DataFrame(utl.scanGenome(x[group], uf=estimate, winSize=winSize))


# Apply per top-level column group, then drop the outermost level that
# groupby/apply adds to the columns (done on the transposed frame).
per_group = df.groupby(level=0, axis=1).apply(f)
stats = per_group.T.reset_index(level=0, drop=True).T
stats.to_pickle(fname.replace('.df', '.SFS.df'))
def genesA():
    """Load the CEU gene table and the CEU SFS scan.

    Reads ``loadGenes().loc['CEU']`` and the pickled scan
    ``<parent of utl.dataPath1000GP>/scan/CEU.SFS.df``, then relabels the
    scan's columns as ``[50, 200, 500, 1000]``. As visible here, both
    objects stay local and nothing is returned.
    """
    pop = 'CEU'
    genes = loadGenes().loc[pop]
    scan_file = utl.parentdir(utl.dataPath1000GP) + '/scan/{}.SFS.df'.format(pop)
    scan = pd.read_pickle(scan_file)
    scan.columns = [50, 200, 500, 1000]
'''
Copyleft May 01, 2017 Arya Iranmehr, PhD Student, Bafna Lab, UC San Diego, Email: [email protected]
'''
import numpy as np
import sys

# Make the project packages (UTILS) importable before they are imported below.
sys.path.insert(1, '/home/arya/workspace/bio/')
np.set_printoptions(linewidth=200, precision=5, suppress=True)
import pandas as pd

pd.options.display.max_rows = 20
pd.options.display.expand_frame_repr = False
import seaborn as sns
import pylab as plt
import matplotlib as mpl
import os

home = os.path.expanduser('~') + '/'
import UTILS.Util as utl

# Usage: <script> <vcf> <chrom>
# Recomputes the genetic map of the given VCF for the given chromosome.
if __name__ == "__main__":
    vcf, chrom = sys.argv[1:]
    print('{} {}'.format(chrom, vcf))
    chrom_id = utl.INT(chrom)
    utl.VCF.createGeneticMap(vcf, chrom_id, recompute=True)
'''
Copyleft Apr 17, 2017 Arya Iranmehr, PhD Student, Bafna Lab, UC San Diego, Email: [email protected]
'''
import sys

# Make the project packages (UTILS) importable before they are imported below.
sys.path.insert(1, '/home/arya/workspace/bio')
import numpy as np

np.set_printoptions(linewidth=200, precision=5, suppress=True)
import pandas as pd

pd.options.display.max_rows = 20
pd.options.display.expand_frame_repr = False
import seaborn as sns
import pylab as plt
import matplotlib as mpl
import UTILS.Util as utl
import os
import sys
import UTILS.Estimate as est

home = os.path.expanduser('~') + '/'
#CHROM=22
# Read at import time; not used by the entry point below.
CHROM = sys.argv[1]

# Usage: <script> <dot-separated names> <int>
# The first argument is split on '.', the second is converted with utl.INT,
# and both are handed to utl.scanXPSFS.
if __name__ == '__main__':
    print(sys.argv)
    first_arg_parts = sys.argv[1].split('.')
    second_arg = utl.INT(sys.argv[2])
    utl.scanXPSFS(first_arg_parts, second_arg)