Ejemplo n.º 1
0
def onexp(args):
    """Merge the per-chromosome XP-EHH result files of one population entry.

    Args:
        args: a ``(pop, root)`` pair. The part of ``pop`` before its first
            '.' names the sub-directory of ``root`` that is searched
            (presumably ``pop`` looks like 'CEU.YRI' -- confirm with callers).

    The merge itself is delegated to ``utl.mergeResults``; ``outpath`` is
    read from the enclosing module scope. Nothing is returned.
    """
    method = 'xpehh'
    pop, root = args
    folder = pop.split('.')[0]
    merged_name = '{}.{}'.format(pop, method)
    # 'chr{}' stays an unfilled placeholder; it is passed on as a template.
    file_template = ''.join(('chr{}.', merged_name, '.gz'))
    utl.mergeResults(path='{}{}/'.format(root, folder),
                     f=file_template,
                     out=merged_name,
                     outpath=outpath)
Ejemplo n.º 2
0
 def saveFolder(d):
     """Load every '.norm' file found in folder ``d`` into one wide table.

     ``path``, ``getMethod``, ``getPopXP`` and ``load`` are taken from the
     enclosing scope. Returns None when the folder holds no '.norm' file;
     otherwise returns the per-file results unstacked by
     method / POP / POPXP and sorted by index.
     """
     listing = pd.Series(utl.files(path + d))
     is_norm = listing.apply(lambda name: name[-5:] == '.norm')
     table = pd.DataFrame(listing[is_norm])
     if not table.size:
         return
     # Column 0 holds the file name; derive the index fields from it.
     table['method'] = table[0].apply(getMethod)
     table['POP'] = d
     table['POPXP'] = 'NA'
     xp_rows = (table.method == 'xpehh')
     # Only xpehh files get a second population parsed from their name.
     table.loc[xp_rows, 'POPXP'] = table.loc[xp_rows, 0].apply(getPopXP)
     table['CHROM'] = table[0].apply(
         lambda name: utl.INT(name.split('.')[0][3:]))
     table.set_index(['method', 'CHROM', 'POP', 'POPXP'], inplace=True)
     loaded = table.groupby(level=[0, 1, 2, 3]).apply(
         lambda grp: load(grp.loc[grp.name], path + d))
     return loaded.unstack(['method', 'POP', 'POPXP']).sort_index()
Ejemplo n.º 3
0
def getCHROM(VCFin):
    """Return the chromosome field of the first non-'#' line of ``VCFin``.

    Runs ``zgrep -v "#" -m1 <VCFin> | cut -f1`` through the shell, takes the
    last line of the combined stdout/stderr output and converts it with
    ``utl.INT``.

    NOTE(review): ``VCFin`` is interpolated into the shell command unquoted,
    so paths containing spaces or shell metacharacters will misbehave.
    """
    command = 'zgrep -v "#" -m1 {} | cut -f1'.format(VCFin)
    process = Popen([command],
                    stdout=PIPE,
                    stdin=PIPE,
                    stderr=STDOUT,
                    shell=True)
    output = process.communicate()[0]
    # stderr is merged into stdout, so only the final line is trusted.
    last_line = output.strip().split('\n')[-1]
    return utl.INT(last_line)
Ejemplo n.º 4
0
def mergeidf(path='/home/arya/storage/Data/Human/scan/selscan/'):
    """Combine every '.idf' pickle under ``path`` into ``panel.idf``.

    All files whose name ends in '.idf' (except 'panel.idf' itself) are
    unpickled, concatenated side by side, column-sorted and written back to
    ``path + 'panel.idf'``. Nothing is returned.
    """
    print('merging idfs')
    names = pd.Series(utl.files(path))
    has_idf_suffix = names.apply(lambda name: name[-4:] == '.idf')
    names = names[has_idf_suffix]
    # Skip the output of a previous run so it is not merged into itself.
    names = names[names != 'panel.idf']
    frames = [pd.read_pickle(path + name) for name in names.values[:]]
    panel = pd.concat(frames, 1).sort_index(1)
    panel.to_pickle(path + 'panel.idf')
    # Per-column missing fraction; the result is computed but not used.
    panel.isnull().mean().sort_values()
Ejemplo n.º 5
0
def scan1000GP(pop, wins=[50, 200, 500, 1000]):
    """Scan the genome of ``pop`` at several window sizes and pickle the result.

    Args:
        pop: population label, or 'ALL'.
        wins: window sizes; each is multiplied by 1000 before being handed to
            ``scanGenome`` (so presumably given in kb). The default list is
            only read, never mutated.

    The per-window results are concatenated along axis 1, keyed by ``wins``,
    and written to '<parent of utl.dataPath1000GP>/scan/<pop>.SFS.df'.
    """
    # 2504 is used for 'ALL'; presumably the size of the full panel.
    n = (2504 if pop == 'ALL'
         else utl.VCF.loadPanel().groupby('pop').size()[pop])
    per_window = [scanGenome(w * 1000, pop, n) for w in wins]
    df = pd.concat(per_window, 1, keys=wins)
    target = utl.parentdir(utl.dataPath1000GP) + '/scan/{}.SFS.df'.format(pop)
    df.to_pickle(target)
Ejemplo n.º 6
0
def scanChrom(args):
    """Run ``utl.scanGenome`` over one chromosome with all estimators.

    Args:
        args: a ``(CHROM, winSize, pop, n)`` tuple. ``CHROM`` is either
            already-loaded data or a str/int identifier, in which case it is
            loaded through ``loadChrom(CHROM, pop)``.

    Returns:
        Whatever ``utl.scanGenome`` returns for the given window size.
    """
    chrom, win_size, pop, n = args
    data = loadChrom(chrom, pop) if isinstance(chrom, (str, int)) else chrom
    # The estimator is given n * 2 (presumably two haplotypes per sample).
    estimator = lambda x: est.Estimate.getAllEstimatesX(x, n=n * 2)
    return utl.scanGenome(data, uf=estimator, winSize=win_size)
Ejemplo n.º 7
0
def mergeXP():
    """Merge per-chromosome XP-EHH results for a fixed set of population pairs.

    For each pair the results are looked up first under
    '/home/arya/POP/HAT/<A+B>/<A>/' and, if merging from there fails, under
    '/home/arya/POP/<A>/'. When both locations fail, 'Error in <pair>' is
    printed and processing continues with the next pair.
    """
    # NOTE(review): the population list is read but only the hard-coded
    # subset below is processed; the read is kept so a missing file still
    # surfaces exactly as before.
    a = pd.read_csv(
        '/home/arya/workspace/bio/Scripts/LearningSelection/XPSFS/pops',
        header=None).iloc[:, 0]
    for x in ['CEU.YRI', 'CHB.YRI']:
        f = 'chr{}.xpehh.' + x + '.gz'
        out = 'xpehh.{}'.format(x)
        first_pop = x.split('.')[0]  # fallback path previously used x.plit
        candidates = (
            '/home/arya/POP/HAT/{}/{}/'.format(x.replace('.', '+'), first_pop),
            '/home/arya/POP/{}/'.format(first_pop),
        )
        for path in candidates:
            try:
                utl.mergeResults(path=path, f=f, out=out)
            except Exception:
                # Best effort: try the next location, but let
                # KeyboardInterrupt / SystemExit propagate.
                continue
            break
        else:
            # Same output as the original "print 'Error in', x".
            print(' '.join(('Error in', x)))
Ejemplo n.º 8
0
def SAVE(d):
    """Gather the '.norm' result files of folder ``d`` into pickled tables.

    Writes '<outpath><d>.df' with the combined scores and then, best effort,
    '<outpath><d>.idf' with ``utl.scanGenome`` applied to their absolute
    values. Does nothing (beyond printing ``d``) when the folder contains no
    '.norm' file. ``path`` is read from the enclosing module scope.

    Args:
        d: folder name, appended to ``path``; also used as the POP label.
    """
    POPS = None  # optional whitelist of folder names; None disables it
    outpath = '/home/arya/storage/Data/Human/scan/selscan/'

    def load(x, path):
        # x is one row of the file table: x[0] is the file name and x.name
        # the (method, CHROM, POP, POPXP) index key.
        method, chrom, _, _ = x.name
        # xpehh files get one leading row skipped (presumably a header).
        skp = 1 if method == 'xpehh' else 0
        a = pd.read_csv(path + '/' + x[0], skiprows=skp, sep='\t', header=None)
        # Column 1 becomes the index, the second-to-last column the values.
        a = a.iloc[:, [1, -2]].set_index(1).iloc[:, 0].rename(method)
        a.index.name = 'POS'
        return a

    def getMethod(x):
        # First matching method tag in the file name; None when none match.
        for tag in ('ihs', 'nsl', 'xpehh'):
            if tag in x:
                return tag

    getPopXP = lambda x: x.split('_')[1].split('.')[0]
    print(d)

    def saveFolder(d):
        a = pd.Series(utl.files(path + d))
        a = pd.DataFrame(a[a.apply(lambda x: x[-5:] == '.norm')])
        if not a.size:
            return
        a['method'] = a[0].apply(getMethod)
        a['POP'] = d
        a['POPXP'] = 'NA'
        I = (a.method == 'xpehh')
        a.loc[I, 'POPXP'] = a.loc[I, 0].apply(getPopXP)
        a['CHROM'] = a[0].apply(lambda x: utl.INT(x.split('.')[0][3:]))
        a.set_index(['method', 'CHROM', 'POP', 'POPXP'], inplace=True)
        loaded = a.groupby(level=[0, 1, 2, 3]).apply(
            lambda x: load(x.loc[x.name], path + d))
        return loaded.unstack(['method', 'POP', 'POPXP']).sort_index()

    if POPS is not None and d not in POPS:
        return
    fout = outpath + d + '.df'
    df = saveFolder(d)
    if df is None:
        # No '.norm' files: previously this crashed on None.to_pickle.
        return
    df.to_pickle(fout)
    try:
        print(df)
        utl.scanGenome(pd.read_pickle(fout).abs()).to_pickle(
            fout.replace('.df', '.idf'))
    except Exception as err:
        # Still best effort, but no longer silent.
        print('scanGenome failed for {}: {}'.format(fout, err))
Ejemplo n.º 9
0
def load(f):
    """Read one scan output file into a Series indexed by (CHROM, POS).

    The chromosome is parsed from the text between 'chr' and the next '.' in
    ``f``. The normalized companion file ('.out' -> '.out.100bins.norm') is
    tried first with no rows skipped; on any failure the file ``f`` itself is
    read with its first row skipped.
    """
    CHROM = utl.INT(f.split('chr')[1].split('.')[0])

    def read_scores(fname, chrom, skiprows=0):
        print(fname)
        table = pd.read_csv(fname, sep='\t', header=None, skiprows=skiprows)
        # Column 1 becomes the index, the second-to-last column the values.
        scores = table.iloc[:, [1, -2]].set_index(1)
        series = pd.concat([scores], keys=[chrom]).iloc[:, 0]
        series.index.names = ['CHROM', 'POS']
        return series

    normalized = f.replace('.out', '.out.100bins.norm')
    try:
        return read_scores(normalized, CHROM, 0)
    except:
        return read_scores(f, CHROM, 1)
Ejemplo n.º 10
0
        ],
                      keys=[CHROM]).iloc[:, 0]
        a.index.names = ['CHROM', 'POS']
        return a

    try:
        a = one(f.replace('.out', '.out.100bins.norm'), CHROM, 0)
    except:
        a = one(f, CHROM, 1)
    return a


if __name__ == "__main__":
    # Script entry point: run one scan, then store its scores as a gzipped
    # per-chromosome file in the same directory as the raw scan output.
    # NOTE(review): `options`, `scan` and `scanXP` are not defined in this
    # section -- presumably a parsed command-line options object and sibling
    # functions of this file; confirm.
    VCF, VCFXP, method, pop, panel, proc, popxp = options.vcf, options.vcfXP, options.method, options.pop, options.panel, options.proc, options.popxp
    # proc=10;VCF='/pedigree2/projects/HA_selection2/Beagle/filtered/chr2.1kg.phase3.v5a.vcf.gz';method='ihs';pop='CEU';panel='/home/arya/HA_selection2/Beagle/panel'
    #proc=10;VCF='/pedigree2/projects/HA_selection2/1000GP/hg19/POP/CEU/chr22.vcf.gz';method='ihs';pop=None;panel='/home/arya/HA_selection2/Beagle/panel'
    #proc=10;VCF='/pedigree2/projects/HA_selection2/Kyrgyz/hg19/phased/chr22.vcf.gz';method='ihs';pop='Sick';popxp='Healthy';panel='~/HA_selection2/Kyrgyz/kyrgyz.panel'
    #proc=1;VCF='VCF=/pedigree2/projects/HA_selection2/1000GP/hg19/POP/KGZ/phased/HAPH/chr22.vcf.gz';pop='HAPH';method='nsl'; popxp=None;panel='~/HA_selection2/Kyrgyz/panel/kyrgyz.panel'
    #split()
    # A second population or a second VCF selects the cross-population scan;
    # otherwise a single-population scan with `method` is run.
    if popxp is not None or VCFXP is not None:
        out = scanXP(VCF, VCFXP, pop, popxp, panel, proc)
    else:
        out = scan(VCF, method, pop, panel, proc)
    # Only the message is printed here: the external normalization commands
    # below are commented out.
    print 'Normalizing...', out
    # os.system("grep -v 'nan' {0} > {0}.tmp && mv {0}.tmp {0} && {1} --{2} --files {0}".format(out,selscanNorm,method.replace('nsl','ihs')))
    # for x in $(ls * out); do grep -v 'nan' $x > $x.tmp & & mv $x.tmp $x & & ~ / workspace / bio / Scan / selscan / bin / linux / norm --xpehh --files $x; done
    # Population tag of the output name: 'pop.popxp' when method is 'xpehh',
    # plain `pop` otherwise.
    # NOTE(review): this keys on `method`, not on which branch ran above --
    # confirm `method` is always 'xpehh' for cross-population runs.
    pops = (pop, '{}.{}'.format(pop, popxp))[method == 'xpehh']
    # The chromosome is parsed from the text between 'chr' and the next '.'
    # in the scan output path.
    f = os.path.dirname(out) + '/chr{}.{}.{}.gz'.format(
        utl.INT(out.split('chr')[1].split('.')[0]), pops, method)
    utl.gz.save(load(out).rename(method), f)
Ejemplo n.º 11
0
def one(args):
    """Merge the per-chromosome result files of one (population, method) pair.

    Args:
        args: a ``(pop, method)`` pair. Files are looked up under
            '/home/arya/POP/<pop>/'.

    The merge itself is delegated to ``utl.mergeResults``; ``outpath`` is
    read from the enclosing module scope. Nothing is returned.
    """
    pop, method = args
    source_dir = '/home/arya/POP/{}/'.format(pop)
    merged_name = '{}.{}'.format(pop, method)
    # 'chr{}' stays an unfilled placeholder; it is passed on as a template.
    template = 'chr{}.' + merged_name + '.gz'
    utl.mergeResults(path=source_dir,
                     f=template,
                     out=merged_name,
                     outpath=outpath)
Ejemplo n.º 12
0
# Flat script: computes windowed statistics per super-population for one
# chromosome and pickles them next to the input file.
# NOTE(review): `pd` is used below but imported outside this section.
pd.options.display.max_rows = 20
pd.options.display.expand_frame_repr = False
import seaborn as sns
import pylab as plt
import matplotlib as mpl
import UTILS.Util as utl
import os, sys
import UTILS.Estimate as est
home = os.path.expanduser('~') + '/'
#CHROM=22
# Chromosome identifier comes from the first command-line argument.
CHROM = sys.argv[1]
fname = '/home/arya/HA_selection2/Beagle/filtered/chr{}.1kg.phase3.v5a.aa.df'.format(
    CHROM)
# Number of panel rows per 'super_pop' value.
pop = utl.VCF.loadPanel().groupby('super_pop').size()
pop  # bare expression: no effect when run as a script
# Keep only the columns labelled with the super-population names.
df = pd.read_pickle(fname)[pop.index]

winSize = 50000
# Applied to one column group: x.name is the group's key, used both to select
# the group's columns and to look up its size `n` in `pop`.
f = lambda x: pd.DataFrame(
    utl.scanGenome(x[x.name],
                   uf=lambda X: est.Estimate.getEstimate(X,
                                                         n=pop[x.name],
                                                         bins=20,
                                                         removeFixedSites=True,
                                                         normalizeTajimaD=False
                                                         ),
                   winSize=winSize))
# Group columns by their first level, scan each group, then drop the first
# column level of the result (done on the transpose and transposed back).
stats = df.groupby(level=0, axis=1).apply(f).T.reset_index(level=0,
                                                           drop=True).T
# Output path is the input path with '.df' replaced by '.SFS.df'.
stats.to_pickle(fname.replace('.df', '.SFS.df'))
Ejemplo n.º 13
0
def genesA():
    """Load the gene table and the pickled SFS scan for population 'CEU'.

    The scan's columns are relabelled to 50, 200, 500 and 1000. Both objects
    are only bound to local names; nothing is returned.
    """
    pop = 'CEU'
    genes = loadGenes().loc[pop]
    scan_file = (utl.parentdir(utl.dataPath1000GP)
                 + '/scan/{}.SFS.df'.format(pop))
    scan = pd.read_pickle(scan_file)
    scan.columns = [50, 200, 500, 1000]
Ejemplo n.º 14
0
'''
Copyleft May 01, 2017 Arya Iranmehr, PhD Student, Bafna Lab, UC San Diego,  Email: [email protected]
'''
import numpy as np
import sys
# Make the project checkout importable (needed for UTILS below).
sys.path.insert(1, '/home/arya/workspace/bio/')
np.set_printoptions(linewidth=200, precision=5, suppress=True)
import pandas as pd

pd.options.display.max_rows = 20
pd.options.display.expand_frame_repr = False
import seaborn as sns
import pylab as plt
import matplotlib as mpl
import os

home = os.path.expanduser('~') + '/'
import UTILS.Util as utl

if __name__ == "__main__":
    # Exactly two command-line arguments are required (VCF path, chromosome);
    # any other count makes the unpacking raise ValueError.
    vcf, chrom = sys.argv[1:]
    print chrom, vcf
    # The chromosome is converted with utl.INT before being passed on;
    # recompute=True is always requested.
    utl.VCF.createGeneticMap(vcf, utl.INT(chrom), recompute=True)
Ejemplo n.º 15
0
'''
Copyleft Apr 17, 2017 Arya Iranmehr, PhD Student, Bafna Lab, UC San Diego,  Email: [email protected]
'''
import sys
# Make the project checkout importable (needed for UTILS below).
sys.path.insert(1, '/home/arya/workspace/bio')

import numpy as np

np.set_printoptions(linewidth=200, precision=5, suppress=True)
import pandas as pd

pd.options.display.max_rows = 20
pd.options.display.expand_frame_repr = False
import seaborn as sns
import pylab as plt
import matplotlib as mpl
import UTILS.Util as utl
import os, sys
import UTILS.Estimate as est
home = os.path.expanduser('~') + '/'
#CHROM=22
# Read at import time, so the module raises IndexError without arguments.
# NOTE(review): CHROM is not used by the visible code below.
CHROM = sys.argv[1]
if __name__ == '__main__':
    print sys.argv
    # First argument: '.'-separated names, passed on as a list.
    # Second argument: converted with utl.INT.
    utl.scanXPSFS(sys.argv[1].split('.'), utl.INT(sys.argv[2]))