Example #1
0
import os
from glob import glob as G


def get_list(path='.'):
    # open the list files and turn them into a dictionary
    lists = G(os.path.join(path, '*.list'))
    groups = {}
    if lists:
        for l in lists:
            group = l[:-5]  # drop the '.list' extension
            groups[group] = []
            with open(l) as L:
                for line in L:
                    if line.strip() == '':
                        continue
                    # slice off the '.pdb' suffix; str.strip('.pdb') would
                    # drop any of the characters '.', 'p', 'd', 'b' from
                    # both ends instead
                    bl = line.strip()
                    if bl.endswith('.pdb'):
                        bl = bl[:-4]
                    if bl not in groups[group]:
                        groups[group].append(bl)
    else:
        pdbs = G(os.path.join(path, '*.pdb'))
        for p in pdbs:
            q = os.path.split(p)[-1].strip()
            if q.endswith('.pdb'):
                q = q[:-4]
            q = os.path.join(path, q)
            if '.CXCR4.' in p:
                if 'X4' not in groups:
                    groups['X4'] = []
                groups['X4'].append(q)
            elif '.CCR5_CXCR4.' in p:
                if 'DUAL' not in groups:
                    groups['DUAL'] = []
                groups['DUAL'].append(q)
            elif '.CCR5.' in p:
                if 'R5' not in groups:
                    groups['R5'] = []
                groups['R5'].append(q)

    return groups
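A minimal usage sketch for get_list; the directory name below is a hypothetical placeholder for a folder holding either *.list group files or coreceptor-annotated *.pdb files:

# Hypothetical usage: './structures' is assumed for illustration
groups = get_list('./structures')
for name, members in groups.items():
    print(name, len(members))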
Example #2
0
from glob import glob as G


def split_aln(pathtorefal, pathtopdbs, datadict, equi):
    tout = open('Test.gm', 'w')
    dout = open('Alignment.gm', 'w')
    # slice off the '.pdb' suffix; str.strip('.pdb') would strip any of the
    # characters '.', 'p', 'd', 'b' from both ends instead
    old = [x[:-4] for x in G(pathtorefal + '/*.pdb')]
    new = [y[:-4] for y in G(pathtopdbs + '/*.pdb')]
    for i in old:
        i = i.split('/')[-1]
        dout.write('>' + equi[i] + ':' + i + ';' + ';'.join(datadict[i]) +
                   '\n')
    for j in new:
        j = j.split('/')[-1]
        tout.write('>' + equi[j] + ':' + j + ';' + ';'.join(datadict[j]) +
                   '\n')
    tout.close()
    dout.close()
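A hedged usage sketch for split_aln; the paths, landmark values, and the equi mapping below are all made-up placeholders:

# Hypothetical inputs for illustration only; real runs derive these from
# the alignment step
data = {'model1': ['0.1', '0.2'], 'ref1': ['0.3', '0.4']}
equi = {'model1': 'X4', 'ref1': 'R5'}
split_aln('refs', 'models', data, equi)  # writes Alignment.gm and Test.gm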
Example #3
0
import os
from glob import glob as G
from subprocess import Popen
from time import sleep


def Parallel_GRM(args, qsub=qsub):
    """
    Create the triangular genetic relationship matrix using plink in an HPC
    cluster (Guillimin)
    """
    # 'qsub' is a module-level submission-script template string not shown
    # in this excerpt (a sketch of one follows Example #4)
    pref = args.prefix
    comm = (
        '%s --bfile %s --keep-allele-order --make-rel --parallel %d %d --out'
        ' %s.GRM')
    cat = ['cat']
    for i in range(1, args.chunks + 1):
        tname = '%s_%d' % (pref, i)
        c = comm % (args.PlinkExe, args.GenotypeFile, i, args.chunks, pref)
        # open the submission script for writing; the mode belongs in open(),
        # not in write()
        with open('temp.%d.sh' % i, 'w') as qs:
            qs.write(qsub % (tname, tname, tname, c))
        ch = Popen('qsub temp.%d.sh' % i, shell=True)
        ch.communicate()
        # each chunk produces its own numbered piece of the matrix
        cat.append('%s.GRM.rel.%d' % (pref, i))
    ## join the results once the last chunk exists and all pieces are present
    while not os.path.isfile('%s.GRM.rel.%d' % (pref, args.chunks)) and \
          (len(G('%s.GRM.rel.*' % pref)) != args.chunks):
        sleep(1800)
    fn = '%s.fullGRM' % pref
    catp = Popen('%s > %s' % (' '.join(cat), fn), shell=True)
    catp.communicate()
    return fn
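A hedged driver sketch for Parallel_GRM; every attribute on the namespace below is a hypothetical placeholder, and the module-level qsub template is assumed to exist:

from argparse import Namespace

# Hypothetical arguments for illustration only
args = Namespace(prefix='mystudy', chunks=10, PlinkExe='./plink',
                 GenotypeFile='mystudy_geno')
grm = Parallel_GRM(args)  # blocks until all chunks are concatenated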
Example #4
0
import os
from glob import glob as G
from subprocess import Popen
from time import sleep


def Parallel_LD(args, qsub=qsub):
    """
    Estimate linkage disequilibrium (D or r2) with plink in an HPC cluster
    (Guillimin)
    """
    pref = '%s_%s' % (args.prefix, args.LDwindow)
    comm = '%s --bfile %s --ld-window-cm %d --%s d with-freqs --parallel %d %d '
    comm += '--keep-allele-order --out %s'
    cat = ['cat']
    shs = []
    for i in range(1, args.chunks + 1):
        tname = '%s_%d' % (pref, i)
        c = comm % (args.PlinkExe, args.GenotypeFile, args.LDwindow,
                    args.typeLD, i, args.chunks, pref)
        with open('%s.sh' % tname, 'w') as qs:
            qs.write(qsub % (args.walltime, tname, tname, tname, c))
            shs.append('%s.sh' % tname)
        ch = Popen('qsub %s.sh' % tname, shell=True)
        ch.communicate()
        cat.append('%s.ld.%d' % (pref, i))
    ## join the results once every chunk has finished
    while not os.path.isfile('%s.ld.%d' % (pref, args.chunks)) and \
          (len(G('%s.ld.*' % pref)) != args.chunks):
        sleep(600)
    fn = '%s.%s.fullLD' % (pref, args.typeLD)
    catp = Popen('%s > %s' % (' '.join(cat), fn), shell=True)
    catp.communicate()
    # map() is lazy in Python 3, so delete the per-chunk files explicitly
    for piece in cat[1:]:
        os.remove(piece)
    for sh in shs:
        os.remove(sh)
    return fn
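Both Parallel_* functions fill a module-level qsub template that is not shown in these excerpts. Below is a hypothetical PBS sketch matching the five-slot call in Parallel_LD (walltime, three copies of the job name, then the command); Parallel_GRM fills only four slots, so its template presumably omitted the walltime line:

# Hypothetical PBS submission-script template; not taken from the source
qsub = '''#!/bin/bash
#PBS -l walltime=%s
#PBS -N %s
#PBS -o %s.out
#PBS -e %s.err
cd $PBS_O_WORKDIR
%s
'''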
Example #5
0
from glob import glob as G


def list2dict():
    ld = {}
    lists = G('*.list')
    for f in lists:
        typ = f[:-5]  # drop the '.list' extension
        with open(f) as inf:
            for line in inf:
                # the original or-test was always true; just skip blank lines
                if line.strip():
                    ld[line.strip()[:-4]] = typ
    return ld
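A minimal usage sketch, run from a directory containing *.list files; the file contents named in the comment are hypothetical:

# Hypothetical: if R5.list contains the line 'seq1.pdb', list2dict()
# returns {'seq1': 'R5'}
mapping = list2dict()
print(mapping)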
Example #6
0
    def cleanup(self, top10):
        """
        cleanup files and report the top 10 predictions
        """
        files = self.results.File
        tocle = files[~files.isin(top10.File)]
        tocle = [x for y in tocle for x in G('%s*' % y)]
        print('Cleaning up ...')
        for fi in tqdm(tocle, total=len(tocle)):
            if os.path.isfile(fi):
                os.remove(fi)
Example #7
0
def main(pathtopdbs, pathtorefal, options):
    ''' execute code '''
    if options.knownTest:
        pathtopdbs = get_test_pdbs(options.knownTest)
    if not isfile('alignment_final.gm'):
        align(pathtopdbs, pathtorefal, options, [pathtopdbs, pathtorefal])
    data = readAlGM()
    for f in G('./_input/*.pdb'):
        copy(f, getcwd())
    GP120classifier('alignment_final', True)
    equi = get_equi('alignment_final.pdbequi')
    split_aln(pathtorefal, pathtopdbs, data, equi)
    copy('alignment_final_beforesampling.landmarks', 'Alignment.landmarks')
    copy('alignment_final.gm', 'Alignment.gm')
    copy('alignment_final.cls', 'Alignment.cls')
    ar = 'Alignment R5.gm R5 dual.gm dual X4.gm X4'
    op = opti(options)
    classifyGM(op, ar.split())
    #classify = Popen('classifyGM.py Alignment R5.gm R5 R5X4.gm dual X4.gm X4  -f -p Test.gm -s .',shell=True)
    #classify.wait()
    #translate_pred('Alignment.prediction',equi)
    for x in G('*.pdb'):
        remove(x)
Example #8
0
    def executeRange(self):
        """
        perform the grid optimization
        """
        combos = ((x, y) for y in self.Ps for x in self.LDs)
        print('Performing clumping in %s...' % self.outpref)
        for r2, pval in tqdm(combos, total=(len(self.Ps) * len(self.LDs))):
            self.ScoreClumped(self.clumpVars(pval, r2))
        self.results = pd.DataFrame(self.results)
        self.results.sort_values('R2', inplace=True, ascending=False)
        self.results.to_csv('%s.results' % self.outpref,
                            sep='\t',
                            index=False)
        top10 = self.results.nlargest(10, 'pR2')
        top = top10.nlargest(1, 'pR2')
        # positional access: after sorting, the row labelled 0 need not exist
        for i in G(top.File.iloc[0]):
            shutil.copy(i, '%s.BEST' % i)
        if self.clean:
            self.cleanup(top10)
        if not os.path.isdir('LOGs'):
            os.mkdir('LOGs')
        for f in G('*.log'):
            shutil.move(f, 'LOGs')
def PED2BED(prefix, path, plinkexe):
    """
    Convert all chromosomes in a ped file to a single bed file

    :param str prefix: prefix for output
    :param str path: path to the individual chr{i}.ped files
    :param str plinkexe: path to plink executable (including executable's name)
    """
    # rstrip: a plain strip('/') would also drop the leading slash of an
    # absolute path
    files = G('%s/*.ped' % path.rstrip('/'))
    with open('mergelist.txt', 'w') as M:
        for ped in files:
            # one fileset per line: the .ped and its companion .map file
            M.write('%s %s.map\n' % (ped, ped[:ped.find('.')]))
    line = '%s --merge-list mergelist.txt --make-bed --out %s'
    merge = Popen(line % (plinkexe, prefix), shell=True)
    merge.communicate()
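A hedged call sketch for PED2BED; the output prefix, ped directory, and plink location are hypothetical:

# Hypothetical: merges ./ped_dir/chr*.ped (+ .map) into merged.bed/.bim/.fam
PED2BED('merged', './ped_dir', 'plink')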
def main(ptCloudDir, ptCloudType):
	initPBPXMLDoc()
	# os.path.join avoids relying on a trailing separator in ptCloudDir
	ptCloudList = G(os.path.join(ptCloudDir, "*.%s" % ptCloudType))
	importLoop(ptCloudList)
	otherSettings()
	exportSection(ptCloudDir)
def ProcessOutput(prefix, expectedfiles, parallel=True, verbose=True):
    """
    wait for outputs and process them
    """
    if verbose: print('Processing LDs')
    processed = []
    R = []
    Rappend = R.append
    D = []
    Dappend = D.append
    if parallel:
        while len(set(expectedfiles).difference(processed)) != 0:
            files = G('*.ld')
            if not files: sleep(10)
            actual = [f for f in files if f not in processed]
            for fn in actual:
                pref = fn[:fn.find('.ld')]
                # the chunk index extracted here fed the commented-out dict
                # below and is no longer used ('iappend' was never defined)
                #data[(idx, pref)] = read_LD(fn)
                data = read_LD(fn)
                Dappend(data.D)
                Rappend(data.R)
                processed.append(pref)
                #os.remove(fn)
    #sorted_keys = sorted(data.keys(), key=lambda v: v[0])
    #full = sorted_keys.pop(0)

        with open('%s_DR.pickle' % pref, 'wb') as DR:
            P.dump((D, R), DR)
        # build the post-processing job as newline-separated statements:
        # with-blocks are compound statements, so they cannot follow a ';'
        # in a single python -c one-liner
        stmts = [
            "import pandas as pd",
            "import numpy as np",
            "import pickle as P",
            "picklefile = '%s_DR.pickle'" % pref,
            "with open(picklefile, 'rb') as RDR: D, R = P.load(RDR)",
            "oriD = D.pop(0)",
            "D = pd.concat(D, axis=1)",
            "n = D.shape[1]",
            "thetadot = D.apply(np.mean, axis=1)",
            "l2tract = [thetadot - D.loc[:, col] for col in D.columns]",
            "iminusdot = pd.concat(l2tract, axis=1)",
            "Var_j = np.divide((n - 1), n) * (iminusdot**2).sum()",
            "Bias = (n - 1) * (thetadot - oriD)",
            # the bias-corrected estimate is built from the original oriD
            "D_corrected = (n * oriD) - ((n - 1) * thetadot)",
            "obj = (D, thetadot, Var_j, Bias, D_corrected)",
            "na = '%s_Djack.pickle'" % pref,
            "with open(na, 'wb') as DF: P.dump(obj, DF)",
            "del D, D_corrected",
            "report = 'Jackknife-estimated D uncertainty description: '",
            "report += str(Var_j.describe())",
            "report += 'Jackknife D bias: ' + str(Bias.describe())",
            "oriR = R.pop(0)",
            "R = pd.concat(R, axis=1)",
            "n = R.shape[1]",
            "thetadot = R.apply(np.mean, axis=1)",
            "l2tract = [thetadot - R.loc[:, col] for col in R.columns]",
            "iminusdot = pd.concat(l2tract, axis=1)",
            "Var_j = np.divide((n - 1), n) * (iminusdot**2).sum()",
            "Bias = (n - 1) * (thetadot - oriR)",
            "R_corrected = (n * oriR) - ((n - 1) * thetadot)",
            "obj = (R, thetadot, Var_j, Bias, R_corrected)",
            "nam = '%s_Rjack.pickle'" % pref,
            "with open(nam, 'wb') as RF: P.dump(obj, RF)",
            "del R, R_corrected, iminusdot, obj, l2tract",
            "report += 'Jackknife-estimated R uncertainty description: '",
            "report += str(Var_j.describe())",
            "report += 'Jackknife R bias description: ' + str(Bias.describe())",
            "print('Uncertainty report:')",
            "print(report)",
        ]
        comm = 'python -c "%s"' % '\n'.join(stmts)
        name = '%s_processed' % prefix
        a = Popen(qsub % (1, name, name, name, comm), shell=True)
        a.communicate()

    else:
        for fn in G('*.ld'):
            pref = fn[:fn.find('.ld')]
            name = 'jack%s' % (re.findall(r'\d+', pref)[0])
            #data[(idx, pref)] = read_LD(fn)
            data = read_LD(fn)
            data.D.name = name
            data.R.name = name
            Dappend(data.D)
            Rappend(data.R)
            processed.append(pref)
        ##Process D
        oriD = D.pop(0)
        D = pd.concat(
            D, axis=1
        )  #.dropna()#.rename(columns={x:'n-%d'%(x) for x in indices})
        mean = D.apply(np.mean, axis=1)
        n = D.shape[1]
        PSi = pd.concat([(n * oriD) - ((n - 1) * D.loc[:, col])
                         for col in D.columns],
                        axis=1)
        PS = PSi.apply(np.mean, axis=1)
        # subtract PS row-wise; a bare 'PSi - PS' would align PS's index
        # with PSi's columns and yield NaNs
        Vps = (PSi.sub(PS, axis=0)**2).sum(axis=1) * (1 / (n - 1))
        e = (1.960) * np.sqrt(Vps / n)
        low, high = (PS - e), (PS + e)
        plotEstimation(oriD, mean, low, high, '%s_D' % (prefix))
        #thetadot = D.apply(np.mean,axis=1)
        #iminusdot = pd.concat([thetadot - D.loc[:,col] for col in D.columns],
        #                      axis=1)
        #Var_j = ((n-1)/n) * (iminusdot**2).sum()
        #Bias = (n-1) * (thetadot - oriD)
        #D_corrected = (n * oriD) - ((n-1) * thetadot)
        with open('%s_Djack.pickle' % prefix, 'wb') as DF:
            P.dump((D, PSi, PS, Vps, low, high), DF)
        #del D, D_corrected
        #report = 'Jackknife-estimated uncertainty of D description:\n %s \n'%(
        #Var_j.describe())
        #report += 'Jackknife bias of D description: %s\n'%(Bias.describe())
        ##Process R
        oriR = R.pop(0)
        R = pd.concat(R, axis=1)
        mean = R.apply(np.mean, axis=1)
        PSi = pd.concat([(n * oriR) - ((n - 1) * R.loc[:, col])
                         for col in R.columns],
                        axis=1)
        PS = PSi.apply(np.mean, axis=1)
        Vps = (PSi.sub(PS, axis=0)**2).sum(axis=1) * (1 / (n - 1))  # row-wise
        e = (1.960) * np.sqrt(Vps / n)
        low, high = (PS - e), (PS + e)
        plotEstimation(oriR, mean, low, high, '%s_R' % (prefix))

        ##.dropna()#.rename(columns={x:'n-%d'%(x) for x in indices})
        #n = R.shape[1]
        #thetadot = R.apply(np.mean,axis=1)
        #iminusdot = pd.concat([thetadot - R.loc[:,col] for col in R.columns],
        #axis=1)
        #Var_j = ((n-1)/n) * (iminusdot**2).sum()
        #Bias = (n-1) * (thetadot - oriR)
        #R_corrected = (n * oriR) - ((n-1) * thetadot)
        with open('%s_Rjack.pickle' % (prefix), 'wb') as RF:
            P.dump((R, PSi, PS, Vps, low, high), RF)
        #del R, R_corrected
        #report += 'Jackknife-estimated uncertainty of R description:\n %s'%(
        #Var_j.describe())
        #report+= 'Jackknife bias of R description: %s'%(Bias.describe())
        if verbose: print('Processing done...\n')
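For reference, the delete-one jackknife used above builds pseudo-values PS_i = n*theta_hat - (n-1)*theta_hat_(i) from the full-data estimate theta_hat and the leave-one-out estimates theta_hat_(i); their mean is the bias-corrected estimate and their variance gives the confidence band plotted by plotEstimation. A self-contained toy sketch with made-up data:

import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(loc=5.0, scale=2.0, size=50)
n = x.size
theta_full = x.mean()  # full-sample estimate
# leave-one-out estimates
loo = np.array([np.delete(x, i).mean() for i in range(n)])
ps = n * theta_full - (n - 1) * loo  # pseudo-values
estimate = ps.mean()  # jackknife (bias-corrected) estimate
var_ps = ps.var(ddof=1)  # variance of the pseudo-values
ci = 1.96 * np.sqrt(var_ps / n)  # ~95% confidence half-width
print(estimate - ci, estimate + ci)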