def get_list(path='.'):
    """Build a dictionary mapping group names to lists of PDB names.

    If ``path`` contains ``*.list`` files, each file becomes one group: the
    key is the list file's path without its ``.list`` extension and the value
    holds the unique, non-blank entries of the file (a trailing ``.pdb`` is
    removed from each entry), in order of first appearance.

    Otherwise the ``*.pdb`` files in ``path`` are grouped by a marker in
    their path: ``.CXCR4.`` -> ``'X4'``, ``.CCR5_CXCR4.`` -> ``'DUAL'`` and
    ``.CCR5.`` -> ``'R5'``.  Values are ``path`` joined with the file name
    minus its ``.pdb`` extension.  Files without a marker are ignored.

    :param str path: directory to scan
    :return: dictionary of group name -> list of names
    """

    def _drop_pdb_suffix(name):
        # str.strip('.pdb') removes any of the characters '.', 'p', 'd', 'b'
        # from BOTH ends ('1abd.pdb' -> '1a'), so cut the suffix explicitly.
        name = name.strip()
        if name.endswith('.pdb'):
            return name[:-len('.pdb')]
        return name

    groups = {}
    lists = G(os.path.join(path, '*.list'))
    if lists:
        for list_file in lists:
            # Drop the 5 characters of '.list'; the directory part is kept.
            members = groups[list_file[:-5]] = []
            with open(list_file) as handle:
                for line in handle:
                    entry = _drop_pdb_suffix(line)
                    if entry and entry not in members:
                        members.append(entry)
        return groups

    for pdb in G(os.path.join(path, '*.pdb')):
        name = os.path.join(path, _drop_pdb_suffix(os.path.basename(pdb)))
        # The markers are looked up in the full glob result, not only in the
        # file name.
        if '.CXCR4.' in pdb:
            key = 'X4'
        elif '.CCR5_CXCR4.' in pdb:
            key = 'DUAL'
        elif '.CCR5.' in pdb:
            key = 'R5'
        else:
            continue
        groups.setdefault(key, []).append(name)
    return groups
def split_aln(pathtorefal, pathtopdbs, datadict, equi):
    """Write ``Alignment.gm`` and ``Test.gm`` in the current directory.

    One line is written per ``*.pdb`` file found in each folder, in the form
    ``>{equi[name]}:{name};{';'.join(datadict[name])}``, where ``name`` is
    the file name without directory and ``.pdb`` extension.

    :param str pathtorefal: folder whose PDB names go to ``Alignment.gm``
    :param str pathtopdbs: folder whose PDB names go to ``Test.gm``
    :param dict datadict: name -> sequence of strings joined with ``;``
    :param dict equi: name -> label written before the colon
    :raises KeyError: if a PDB name is missing from ``equi`` or ``datadict``
    """
    import os

    def _pdb_names(folder):
        # The glob only returns '*.pdb', so slicing off four characters is
        # safe; str.strip('.pdb') would also eat leading/trailing '.', 'p',
        # 'd' and 'b' characters of the name itself ('2pdb.pdb' -> '2').
        return [os.path.basename(found)[:-len('.pdb')]
                for found in G(os.path.join(folder, '*.pdb'))]

    def _write(handle, names):
        for name in names:
            handle.write('>' + equi[name] + ':' + name + ';' +
                         ';'.join(datadict[name]) + '\n')

    old = _pdb_names(pathtorefal)
    new = _pdb_names(pathtopdbs)
    # Context managers close both files even if a lookup raises.
    with open('Test.gm', 'w') as tout, open('Alignment.gm', 'w') as dout:
        _write(dout, old)
        _write(tout, new)
def Parallel_GRM(args, qsub=qsub):
    """
    Create the triangular genetic relationship matrix using plink in a HPC
    cluster (Guillimin)

    One job script ``temp.<i>.sh`` is written and submitted with ``qsub`` for
    each of ``args.chunks`` chunks; the function then polls every 30 minutes
    for the chunk outputs and finally concatenates them with ``cat``.

    :param args: object providing ``prefix``, ``chunks``, ``PlinkExe`` and
        ``GenotypeFile``
    :param str qsub: job-script template with four ``%s`` fields (three times
        the job name, then the command)
    :return: name of the concatenated file, ``<prefix>.fullGRM``
    """
    pref = args.prefix
    comm = ('%s --bfile %s --keep-allele-order --make-rel --parallel %d %d -out'
            ' %s.GRM')
    cat = ['cat']
    # range() instead of xrange() so the loop runs on Python 2 and 3.
    for i in range(1, args.chunks + 1):
        tname = '%s_%d' % (pref, i)
        c = comm % (args.PlinkExe, args.GenotypeFile, i, args.chunks, pref)
        # The mode belongs to open(), not to write(): the script used to be
        # opened read-only and write() was given 'w' as a second argument,
        # so no job script could ever be created.
        with open('temp.%d.sh' % (i), 'w') as qs:
            qs.write(qsub % (tname, tname, tname, c))
        ch = Popen('qsub temp.%d.sh' % (i), shell=True)
        ch.communicate()
        # NOTE(review): the same name is appended for every chunk and it does
        # not match the '<prefix>.GRM.rel.bin.*' pattern polled below --
        # confirm the per-chunk output names against the plink docs.
        cat.append('%s.GRM.rel' % (pref))
    ## join the results
    # NOTE(review): the first test looks for chunk number chunks + 1, which
    # is never requested above; in practice the loop waits until the number
    # of '<prefix>.GRM.rel.bin.*' files equals args.chunks.
    while not os.path.isfile('%s.GRM.rel.bin.%d' % (pref, args.chunks + 1)) and \
            (len(G('%s.GRM.rel.bin.*' % (pref))) != args.chunks):
        sleep(1800)
    fn = '%s.fullGRM' % (pref)
    cat = Popen('%s > %s' % (' '.join(cat), fn), shell=True)
    cat.communicate()
    return fn
def Parallel_LD(args, qsub=qsub):
    """
    Run plink's LD computation in ``args.chunks`` parallel cluster jobs and
    concatenate the per-chunk outputs.

    A job script ``<prefix>_<LDwindow>_<i>.sh`` is written and submitted with
    ``qsub`` for every chunk.  The function polls every 10 minutes, joins the
    chunk files with ``cat`` and then deletes the chunk files and job
    scripts.

    :param args: object providing ``prefix``, ``LDwindow``, ``chunks``,
        ``PlinkExe``, ``GenotypeFile``, ``typeLD`` and ``walltime``
    :param str qsub: job-script template with five ``%s`` fields (walltime,
        three times the job name, then the command)
    :return: name of the concatenated file,
        ``<prefix>_<LDwindow>.<typeLD>.fullLD``
    """
    pref = '%s_%s' % (args.prefix, args.LDwindow)
    comm = '%s --bfile %s --ld-window-cm %d --%s d with-freqs --parallel %d %d '
    comm += '--keep-allele-order --out %s'
    cat = ['cat']
    shs = []
    # range() instead of xrange() so the loop runs on Python 2 and 3.
    for i in range(1, args.chunks + 1):
        tname = '%s_%d' % (pref, i)
        c = comm % (args.PlinkExe, args.GenotypeFile, args.LDwindow,
                    args.typeLD, i, args.chunks, pref)
        with open('%s.sh' % (tname), 'w') as qs:
            qs.write(qsub % (args.walltime, tname, tname, tname, c))
        shs.append('%s.sh' % (tname))
        ch = Popen('qsub %s.sh' % (tname), shell=True)
        ch.communicate()
        cat.append('%s.ld.%d' % (pref, i))
    ## join the results
    # NOTE(review): polling stops as soon as the LAST chunk file exists, or
    # the number of '<pref>.ld.*' files equals args.chunks; neither
    # guarantees that every chunk is present and completely written.
    while not os.path.isfile('%s.ld.%d' % (pref, args.chunks)) and \
            (len(G('%s.ld.*' % (pref))) != args.chunks):
        sleep(600)
    fn = '%s.%s.fullLD' % (pref, args.typeLD)
    catp = Popen('%s > %s' % (' '.join(cat), fn), shell=True)
    catp.communicate()
    # Explicit loop: map() is lazy on Python 3, so map(os.remove, ...) would
    # silently delete nothing there.
    for leftover in cat[1:] + shs:
        os.remove(leftover)
    return fn
def list2dict():
    """Map every entry of the ``*.list`` files in the current directory to
    the name of the list it came from.

    The key is the stripped line without its last four characters
    (presumably a ``.pdb`` extension -- verify against the list files); the
    value is the list file name without ``.list``.  Blank lines are skipped.

    :return: dictionary of entry -> list name
    """
    ld = {}
    for list_file in G('*.list'):
        typ = list_file[:-5]
        # 'with' closes the handle; the original left it open.
        with open(list_file) as inf:
            for line in inf:
                entry = line.strip()
                # The original test (not == '\n' or not == '') was always
                # true, so blank lines added a spurious '' key.
                if not entry:
                    continue
                ld[entry[:-4]] = typ
    return ld
def cleanup(self, top10):
    """
    Delete the files belonging to every result that is not in ``top10``.

    Each ``File`` value of ``self.results`` that is absent from
    ``top10.File`` is used as a glob prefix (``<File>*``); every match that
    is a regular file is removed.
    """
    all_files = self.results.File
    discarded = all_files[~all_files.isin(top10.File)]
    doomed = []
    for stem in discarded:
        doomed.extend(G('%s*' % stem))
    print('Cleaning up ...')
    for path in tqdm(doomed, total=len(doomed)):
        if not os.path.isfile(path):
            continue
        os.remove(path)
def main(pathtopdbs, pathtorefal, options):
    """
    Run the whole pipeline: align (unless ``alignment_final.gm`` already
    exists), classify, split the alignment into reference and test sets,
    run ``classifyGM`` and finally delete the PDB files copied into the
    working directory.
    """
    if options.knownTest:
        pathtopdbs = get_test_pdbs(options.knownTest)
    if not isfile('alignment_final.gm'):
        align(pathtopdbs, pathtorefal, options, [pathtopdbs, pathtorefal])
    landmarks = readAlGM()
    for pdb in G('./_input/*.pdb'):
        copy(pdb, getcwd())
    GP120classifier('alignment_final', True)
    equivalences = get_equi('alignment_final.pdbequi')
    split_aln(pathtorefal, pathtopdbs, landmarks, equivalences)
    # Publish the alignment products under the names classifyGM is given.
    renames = (
        ('alignment_final_beforesampling.landmarks', 'Alignment.landmarks'),
        ('alignment_final.gm', 'Alignment.gm'),
        ('alignment_final.cls', 'Alignment.cls'),
    )
    for source, target in renames:
        copy(source, target)
    classifier_args = 'Alignment R5.gm R5 dual.gm dual X4.gm X4'
    classifier_options = opti(options)
    classifyGM(classifier_options, classifier_args.split())
    # Remove every PDB file from the working directory.
    for leftover in G('*.pdb'):
        remove(leftover)
def executeRange(self):
    """
    perform the grid optimization

    Every combination of ``self.LDs`` and ``self.Ps`` is clumped and scored.
    The collected results are sorted by ``R2``, written tab-separated to
    ``<outpref>.results``, and the file of the row with the largest ``pR2``
    is copied to ``<file>.BEST``.  If ``self.clean`` is set, files of results
    outside the ten largest ``pR2`` rows are removed.  All ``*.log`` files
    are moved into ``LOGs``.
    """
    combos = ((x, y) for y in self.Ps for x in self.LDs)
    print('Performing clumping in %s...' % self.outpref)
    for r2, pval in tqdm(combos, total=(len(self.Ps) * len(self.LDs))):
        self.ScoreClumped(self.clumpVars(pval, r2))
    self.results = pd.DataFrame(self.results)
    self.results.sort_values('R2', inplace=True, ascending=False)
    self.results.to_csv('%s.results' % (self.outpref), sep='\t', index=False)
    top10 = self.results.nlargest(10, 'pR2')
    top = top10.nlargest(1, 'pR2')
    # Positional access: sorting keeps the original index labels, so
    # ``top.File[0]`` looked up *label* 0 and raised KeyError (or picked an
    # unrelated row) whenever the best row was not the first one scored.
    best = top.File.iloc[0]
    # NOTE(review): cleanup() globs '<File>*' while this globs the bare
    # value -- confirm whether File is a full file name or only a prefix.
    for i in G(best):
        shutil.copy(i, '%s.BEST' % (i))
    if self.clean:
        self.cleanup(top10)
    if not os.path.isdir('LOGs'):
        os.mkdir('LOGs')
    for f in G('*.log'):
        shutil.move(f, 'LOGs')
def PED2BED(prefix, path, plinkexe):
    """
    Convert all chromosomes in a ped file to a single bed file

    Writes ``mergelist.txt`` in the current directory (one line per ``*.ped``
    file found in ``path``) and runs plink with ``--merge-list`` and
    ``--make-bed``, waiting for it to finish.

    :param str prefix: prefix for output
    :param str path: path to the individual chr{i}.ped files
    :param str plinkexe: path to plink executable (including executable's name)
    """
    import os

    # os.path.join keeps the leading '/' of absolute paths, which
    # path.strip('/') used to remove together with the trailing one.
    files = G(os.path.join(path, '*.ped'))
    with open('mergelist.txt', 'w') as M:
        for ped in files:
            # splitext drops only the final extension; cutting at the first
            # '.' broke on './dir/chr1.ped' or directory names with dots.
            stem = os.path.splitext(ped)[0]
            # One entry per line -- without the newline every entry ended up
            # on a single line.
            # NOTE(review): the second column is '<stem>.fam'; confirm the
            # companion file plink expects next to a .ped in a merge list.
            M.write('%s %s.fam\n' % (ped, stem))
    line = '%s --merge-list mergelist.txt --make-bed -out %s'
    merge = Popen(line % (plinkexe, prefix), shell=True)
    # Wait for plink so the merged files exist when the function returns.
    merge.communicate()
def main(ptCloudDir, ptCloudType):
    """Initialise the XML document, import every file matching
    ``<ptCloudDir>*.<ptCloudType>``, apply the remaining settings and export.

    ``ptCloudDir`` is concatenated directly with the pattern, so it has to
    end with a path separator to be treated as a directory.
    """
    initPBPXMLDoc()
    pattern = ptCloudDir + "*.%s" % (ptCloudType)
    importLoop(G(pattern))
    otherSettings()
    exportSection(ptCloudDir)
def _jackknife_interval(original, replicates):
    """Summarise jackknife replicates through pseudo-values.

    :param original: Series holding the estimate from the full data
    :param replicates: DataFrame with one column per jackknife replicate
    :return: tuple ``(PSi, PS, Vps, low, high)``: the pseudo-values
        ``n * original - (n - 1) * replicate``, their row-wise mean, their
        row-wise variance and the bounds ``PS -/+ 1.96 * sqrt(Vps / n)``
    """
    n = replicates.shape[1]
    PSi = pd.concat([(n * original) - ((n - 1) * replicates.loc[:, col])
                     for col in replicates.columns], axis=1)
    PS = PSi.apply(np.mean, axis=1)
    # Subtract the row means along the rows: a plain ``PSi - PS`` aligns the
    # Series index with the *columns* of PSi.  float() avoids the integer
    # division that turned 1 / (n - 1) into 0 on Python 2.
    Vps = (PSi.sub(PS, axis=0) ** 2).sum(axis=1) / float(n - 1)
    e = (1.960) * np.sqrt(Vps / n)
    return PSi, PS, Vps, (PS - e), (PS + e)


def ProcessOutput(prefix, expectedfiles, parallel=True, verbose=True):
    """
    wait for outputs and process them

    ``parallel=True``: poll the current directory for ``*.ld`` files until
    everything in ``expectedfiles`` has been read, pickle the collected D and
    R series to ``<prefix>_DR.pickle`` and run a generated Python job (wrapped
    in the module-level ``qsub`` template) that writes
    ``<prefix>_Djack.pickle`` and ``<prefix>_Rjack.pickle``.

    ``parallel=False``: read the ``*.ld`` files already present, compute
    jackknife pseudo-value intervals for D and R locally, plot them with
    ``plotEstimation`` and pickle them to ``<prefix>_Djack.pickle`` and
    ``<prefix>_Rjack.pickle``.

    :param str prefix: prefix of every output file
    :param expectedfiles: names awaited in parallel mode; an entry counts as
        done when it equals a processed file name or that name cut at '.ld'
    :param bool parallel: wait for the files and delegate the statistics
    :param bool verbose: print progress messages
    """
    if verbose:
        print('Processing LDs')
    processed = []
    R = []
    Rappend = R.append
    D = []
    Dappend = D.append
    if parallel:
        # File names already read.  ``processed`` stores prefixes, so testing
        # file names against it never matched and every file was read again
        # on each poll.
        seen = set()
        expected = set(expectedfiles)
        while expected.difference(processed).difference(seen):
            fresh = [f for f in G('*.ld') if f not in seen]
            if not fresh:
                # Sleep whenever there is nothing new, not only when there
                # are no files at all, to avoid a busy loop.
                sleep(10)
                continue
            for fn in fresh:
                pref = fn[:fn.find('.ld')]
                # The original called an undefined ``iappend`` here; the
                # collected index was not used anywhere in this function.
                # NOTE(review): a file may still be being written when read.
                data = read_LD(fn)
                Dappend(data.D)
                Rappend(data.R)
                processed.append(pref)
                seen.add(fn)
        # Outputs are named after ``prefix`` as in the serial branch; the
        # original used the prefix of whichever file was read last, which
        # is unbound when nothing was read.
        with open('%s_DR.pickle' % (prefix), 'wb') as DR:
            P.dump((D, R), DR)
        # One statement per line: the original one-liner put 'with' blocks
        # after ';', omitted several separators and left a string literal
        # unterminated, so it could not be parsed.  The statements are
        # otherwise the original ones.
        # NOTE(review): these statistics differ from the pseudo-value
        # computation of the serial branch and have not been validated.
        job = [
            "import os, re, argparse, itertools;",
            "from subprocess import Popen;from glob import glob as G;",
            "from time import sleep;import pandas as pd;",
            "import numpy as np;import pickle as P;",
            "picklefile = '%s_DR.pickle'" % (prefix),
            "with open(picklefile, 'rb') as RDR:",
            "    D, R = P.load(RDR)",
            "oriD = D.pop(0); D = pd.concat(D, axis=1);n = D.shape[1];",
            "thetadot = D.apply(np.mean,axis=1);",
            "l2tract = [thetadot - D.loc[:,col] for col in D.columns];",
            "iminusdot = pd.concat(l2tract, axis=1);",
            "Var_j = np.divide((n-1),n) * (iminusdot**2).sum();",
            "Bias = (n-1) * (thetadot - oriD);",
            "D_corrected = (n * D) - ((n-1) * thetadot);",
            "na = '%s_Djack.pickle';" % (prefix),
            "obj = (D, thetadot, Var_j, Bias, D_corrected)",
            "with open(na,'wb') as DF:",
            "    P.dump(obj,DF)",
            "del D, D_corrected;",
            "report = 'Jackknife-estimated D uncertainty description: ';",
            "report += str(Var_j.describe());",
            "report += 'Jackknife D bias: ' + str(Bias.describe());",
            "oriR = R.pop(0); R =pd.concat(R, axis=1); n = R.shape[1];",
            "thetadot = R.apply(np.mean, axis=1);",
            "l2tract = [thetadot - R.loc[:,col] for col in R.columns]",
            "iminusdot = pd.concat(l2tract, axis=1);",
            "Var_j = np.divide((n-1),n)* (iminusdot**2).sum();",
            "Bias = (n-1) * (thetadot - oriR);",
            "R_corrected = (n * R) - ((n-1) * thetadot);",
            "obj = (R, thetadot, Var_j, Bias, R_corrected)",
            "nam = '%s_Rjack.pickle'" % (prefix),
            "with open(nam, 'wb') as RF:",
            "    P.dump(obj, RF)",
            "del R, R_corrected, iminusdot, obj, l2tract;",
            "report += 'Jackknife-estimated uncertainty of R descrition:'"
            "+str(Var_j.describe());",
            # '\\n' keeps the escape inside the generated string literal
            # instead of breaking it across two lines.
            "report += 'Jackknife bias of R description:\\n'"
            " + str(Bias.describe());",
            "print('Uncertainty report:\\n' + report)",
        ]
        pyline = '\n'.join(job)
        comm = 'python -c "%s"' % (pyline)
        # The job name was the literal '%s_processed'; fill in the prefix.
        name = '%s_processed' % (prefix)
        # NOTE(review): the filled template is executed through the shell
        # directly, not written to a script and submitted with `qsub` --
        # confirm this is intended.
        a = Popen(qsub % (1, name, name, name, comm), shell=True)
        a.communicate()
    else:
        for fn in G('*.ld'):
            pref = fn[:fn.find('.ld')]
            name = 'jack%s' % (re.findall(r'\d+', pref)[0])
            data = read_LD(fn)
            data.D.name = name
            data.R.name = name
            Dappend(data.D)
            Rappend(data.R)
            processed.append(pref)
        # NOTE(review): the first file returned by the glob is taken as the
        # full-data estimate, but glob order is not guaranteed -- confirm
        # how the original estimate is meant to be identified.
        for label, series in (('D', D), ('R', R)):
            original = series.pop(0)
            replicates = pd.concat(series, axis=1)
            mean = replicates.apply(np.mean, axis=1)
            PSi, PS, Vps, low, high = _jackknife_interval(original, replicates)
            plotEstimation(original, mean, low, high,
                           '%s_%s' % (prefix, label))
            with open('%s_%sjack.pickle' % (prefix, label), 'wb') as out:
                P.dump((replicates, PSi, PS, Vps, low, high), out)
    if verbose:
        print('Processing done...\n')