# Driver: for every entry of a list file, run threeBodyContactPotential.py on
# the entry's .cmap file (if that file exists) with a fixed threshold of 0.01.
import os, sys
import subprocess as sub

# directory holding this script; the helper script is looked up next to it
selfbin = os.path.realpath(os.path.dirname(sys.argv[0]))
sys.path.insert(0, '/home/grigoryanlab/home/fzheng/modules_py')
import General

if len(sys.argv) - 1 != 1:
    print('<usage> [a list file]')
    sys.exit(0)

lst = sys.argv[1]
# read the list up front so the handle is closed before any child process runs
with open(lst) as lst_fh:
    entries = lst_fh.readlines()

for l in entries:
    info = l.strip().split('/')
    # an entry must look like '.../<name>/<file>'; blank or slash-less lines
    # used to raise IndexError on info[-2]
    if len(info) < 2:
        continue
    name, pds = info[-2], info[-1]
    cmap = General.changeExt(pds, 'cmap')
    if os.path.isfile(name + '/' + cmap):
        sub.call(['python', selfbin + '/threeBodyContactPotential.py', name + '/' + cmap, '0.01'])
par.add_argument('--conR', action = 'store_true', help = 'whether this is for a pair of contact')
par.add_argument('--env', help = 'if not None, also modify an .env file')
args = par.parse_args()

# every sub-directory of the current directory is visited, in sorted order
dirs = sorted(x for x in os.listdir('.') if os.path.isdir(x))
odir = os.getcwd()

for d in dirs:
    # names in `dirs` are relative, so go back to the start directory first
    os.chdir(odir)
    os.chdir(d)
    pdbs = glob.glob('*.pdb')
    cmds = []
    # residue number: second '_'-separated field of the directory name,
    # with its first two characters dropped
    resn = int(d.split('_')[1][2:])
    for pdb in pdbs:
        matchf = args.head + '_' + General.changeExt(pdb, 'match')
        if not os.path.isfile(matchf):
            continue
        pos = PDB.findPositionInPDB(pdb, resn)
        # if output file is already there, skip the job
        if os.path.isfile('nr' + args.id + '_' + matchf):
            continue
        cmd = ['python', selfbin + '/removeLocalRedundancy.py']
        cmd += ['--m', matchf]
        cmd += ['--cres', str(pos)]
        cmd += ['--id', args.id]
        cmd += ['--outh', 'nr' + args.id]
        if args.db is not None:
            cmd += ['--db', args.db]
        if args.conR:
            # contacting residue: third '_'-separated field of the pdb base
            # name, with its first character dropped
            conresn = General.getBase(pdb).split('_')[2][1:]
            conpos = PDB.findPositionInPDB(pdb, conresn)
            cmd += ['--conres', str(conpos)]
# this script create .env file from .cmap files import os, sys, csv, re import General, Fragment, PDB if len(sys.argv) -1 <1: print '[Usage] <.cmap files as arguments>' exit(0) cmap_files = sys.argv[1:] headers = ['residue', 'AA', 'total_contact_degree', 'crowdness', 'phipsi', 'permanent_contacts'] for cmap in cmap_files: outf = General.changeExt(cmap, 'env_dun') if os.path.isfile(outf): continue residues = {} cmap_fh = open(cmap) out_fh = open(outf, 'w') f_csv = csv.DictWriter(out_fh, headers, delimiter = '\t') f_csv.writeheader() for line in cmap_fh: if line.startswith('contact'): res1, res2, degree, resname1, resname2 = line.rstrip('\n').split()[1:] # resnum1, resnum2 = res1.split(',')[1], res2.split(',')[1] # if not (resnum1.isdigit() and resnum2.isdigit()): # continue if not res1 in residues:
par.add_argument('--o', required = True, help = 'the output files') args = par.parse_args() odir = os.getcwd() aas = [x for x in PDB.a2aaa] pairtable = {} for i in aas: for j in aas: pairtable[i+'|'+j] = 0 ctable = {x : 0 for x in aas} for l in open(args.l): info = l.strip().split('/') subdir, name = info[-2], info[-1] cfile = General.changeExt(subdir + '/' + name, args.ext) if not os.path.isfile(cfile): continue for ll in open(cfile): # if ll.find('contact') != 0: # continue info2 = ll.strip().split() cond, aa1, aa2 = [info2[args.coln[i]] for i in range(3)] aa1, aa2 = PDB.t2s(aa1), PDB.t2s(aa2) # optional, for sc cond sc_cond = float(info2[-1]) if sc_cond > 0.01: continue if (float(cond) >= args.range[0]) and (float(cond) <= args.range[1]):
import os
import sys
import argparse
import csv
import General

par = argparse.ArgumentParser()
par.add_argument('--cmap', nargs = '+', help = '.cmap files, one or more')
args = par.parse_args()

for cmapf in args.cmap:
    # both the contact map and its sibling .env file must exist
    assert os.path.isfile(cmapf)
    envf = General.changeExt(cmapf, 'env')
    assert os.path.isfile(envf)
    outf = General.changeExt(cmapf, 'cmapm')

    # residue id -> total contact degree, read from the tab-separated .env file
    total_cd = {}
    with open(envf) as ef:
        f_csv = csv.DictReader(ef, delimiter = '\t')
        for row in f_csv:
            resid = row['residue']
            tcd = float(row['total_contact_degree'])
            total_cd[resid] = tcd

    out = open(outf, 'w')
    for l in open(cmapf):
        # only lines beginning with 'contact' are processed
        if not l.startswith('contact'):
            continue
        items = l.strip().split()
        res1, res2, cd = items[1:4]
        cd = round(float(cd), 3)
        # drop contacts that round to zero or touch a residue missing from the .env file
        if cd == 0:
            continue
        if res1 not in total_cd or res2 not in total_cd:
            continue
        # contact degree scaled by the mean of the two reciprocal totals
        mutual_cd = cd * (0.5/total_cd[res1] + 0.5/total_cd[res2])
        nitems = list(items)
import General, PDB

par = argparse.ArgumentParser()
par.add_argument('--l', required = True, help = 'the used list file')
par.add_argument('--e', default = 'env', help = 'the extension of files with environment score')
par.add_argument('--c', default = 'cmap', help = 'the extension of files with contacts')
# type = float keeps user-supplied bounds numeric like the default; without it
# values given on the command line arrive as strings
par.add_argument('--range', nargs = 2, type = float, default = [0.01, 1], help = 'the range of contact degree')
par.add_argument('--o', required = True, help = 'the output file')
args = par.parse_args()

out = open(args.o, 'w')
# read the list up front so its handle is closed immediately
with open(args.l) as list_fh:
    list_lines = list_fh.readlines()

for l in list_lines:
    info = l.strip().split('/')
    # an entry must contain at least '<subdir>/<name>'; skip blank or bare lines
    if len(info) < 2:
        continue
    subdir, name = info[-2], info[-1]
    # use the environment file for the whole protein but cmap file for single chain, don't know if this is good
    envfile = General.changeExt(subdir + '/' + name.split('_')[0], args.e)
    cmapfile = General.changeExt(subdir + '/' + name, args.c)
    if not (os.path.isfile(envfile) and os.path.isfile(cmapfile)):
        continue
    # residue id -> environment score (kept as the raw string from the file)
    env = {}
    with open(envfile) as ef:
        f_csv = csv.DictReader(ef, delimiter = '\t')
        for row in f_csv:
            env[row['residue']] = row['environment_score']
    cf = open(cmapfile)
    for cfl in cf:
        # only lines beginning with 'contact' are processed
        if not cfl.startswith('contact'):
            continue
par.add_argument('--sl', help = 'use a searchDB list file')
par.add_argument('--o', required = True, help = 'name of the output file')
args = par.parse_args()

out = open(args.o, 'w')


def outputSeq(seqs, name, out, chains = None):
    """Write sequences to `out` as '>name_key' header / sequence line pairs.

    seqs   -- mapping of key to sequence string; keys are written in sorted order
    name   -- prefix used in every header line
    out    -- writable file-like object
    chains -- optional restriction on the keys to write; anything that is not
              already a list is converted with list() (a string becomes a
              list of its characters)
    """
    if chains is not None and not isinstance(chains, list):
        chains = list(chains)
    for k in sorted(seqs.keys()):
        if chains is not None and k not in chains:
            continue
        out.write('>' + name + '_' + k + '\n')
        out.write(seqs[k] + '\n')


if args.sl is None:
    # list entries have the form '<pdb id>_<chain ids>'
    for l in open(args.pl):
        pid, cid = l.strip().split('_')
        lower_pid = pid.lower()
        p = lower_pid + '.clean.pdb'
        seqs = PDB.pdb2seq(p)
        outputSeq(seqs, lower_pid, out, cid)
else:
    # list entries are file paths; the matching .pdb file is read, all keys written
    for l in open(args.sl):
        p = General.changeExt(l.rstrip('\n'), 'pdb')
        seqs = PDB.pdb2seq(p)
        name = General.removePath(p).split('.')[0]
        outputSeq(seqs, name, out)

out.close()
assert args.ppfile != None pp_file = args.ppfile pp_lines = open(pp_file).readlines() pp_bins = [float(x) for x in pp_lines[0].strip().split()] pp_aatypes = pp_lines[2].strip().split() pp_aaindex = {pp_aatypes[x] : x for x in range(20)} pp_pots = np.zeros((36, 36, 20)) for ppl in pp_lines[4:]: ppl_items = ppl.strip().split() pp_pots[int(ppl_items[0])-1, int(ppl_items[1])-1] = map(float, ppl_items[2:]) seqfs = glob.glob(args.head + '*.seq') cid, resnum = args.resid for seqf in seqfs: pdbf = General.changeExt( seqf.replace(args.head + '_', ''), 'pdb') if not os.path.isfile(pdbf): print(pdbf + ' doesn\'t exist!') continue outf = General.changeExt(pdbf, args.o) if args.wgap != None: # specific to gap assert args.conR == False, 'wgap and conR cannot be specified simultaneously' dirname = General.getBase(pdbf) pdbf = args.wgap + '/' + dirname + '/'+ pdbf index = PDB.findPositionInPDB(pdbf, resnum, cid) aacol = Analyze.readColumn(seqf, index, top = args.uplimit) if args.conR: # should contacting residue be constrained?
key=lambda x: ( x.split()[0].split(",")[0], int(re.search("\d+", x.split()[0]).group(0)), x.split()[1].split(",")[0], int(re.search("\d+", x.split()[1]).group(0)), ), ) for outstring in sort_outstrs: out_fh.write(outstring + "\n") out_fh.close() for pdb in args.p: assert os.path.isfile(pdb) # run confind in verbose mode cmapv = General.changeExt(pdb, "cmapv") if not os.path.isfile(cmapv): cmap_fh = open(cmapv, "w") cmd_confind = [CONFIND, "--p", pdb, "--rLib", rotLib, "--verb"] sub.call(cmd_confind, stdout=cmap_fh) cmap_fh.close() # parse cmapv file for all pairs of contacts G, V = VerboseToGraph(cmapv, pdb) outf = General.changeExt(pdb, args.ext) if not args.var: calculator(G, outf) else: calculator(G, outf, V) os.remove(cmapv)
uplimit = args.uplimit nseq = 0 for match_line in open(args.m): if (uplimit != None) and (nseq == uplimit): break match_line = match_line.strip() indices = Analyze.index_from_match(match_line) index1, index2 = indices[args.n[0]], indices[args.n[1]] target_pds = match_line.split()[1] targetid = General.getBase( General.removePath(match_line.split()[1]) ) env_dict = database_path + '/' + targetid[1:3] + '/' + targetid + '.freedom.db' db = shelve.open(env_dict, 'r') # extract post-processed pdb files from target_pds resfile = database_path + '/' + targetid[1:3] + '/' + General.changeExt( General.removePath(target_pds), 'post.res') allres = open(resfile).read().splitlines() resid1, resid2 = allres[index1], allres[index2] resid1, resid2 = resid1[0] + ',' + resid1[1:], resid2[0] + ',' + resid2[1:] fields = ['sumcond', 'crwdnes', 'freedom', 'phi', 'psi', 'aa'] outfh.write(targetid + '\t') if not resid1 in db: outfh.write('\t'.join([resid1] + ['NA' for x in range(len(fields))])) else: res_info = db[resid1] outfields = [str(res_info[x]) if x in res_info else 'NA' for x in fields] outfh.write('\t'.join([resid1] + outfields)) outfh.write('\t') if not resid2 in db: outfh.write('\t'.join([resid2] + ['NA' for x in range(len(fields))])) else:
args = parser.parse_args()

# parameters required for using smart rmsd cutoff from Craig
# rmsdmax, perLen = 1,1, 15

# dependencies between arguments
no_filter_given = (args.rmsd is None) and (args.bbrmsd is None) and (args.nohomo is None)
if no_filter_given and not (args.uniq or args.smart):
    raise General.myerror('I am doing nothing...')
if (args.uniq == False) and (args.nonat == True):
    raise General.myerror('cannot specify nonat without uniq')
if args.ohead == args.head:
    raise General.myerror('after process the head name is the same, not allowed...')

# the part of the pdb name before the first '_'
pid = args.pdb.split('_')[0]
matchf = args.head + '_' + General.changeExt(args.pdb, 'match')
seqf = General.changeExt(matchf, 'seq')
conres = PDB.ConRes(args.pdb)

# if using smart rmsd cutoff, need to create a list in which each element is the length of a segment
if args.smart:
    resnums = sorted(r.getResnum() for r in conres)
    segments = [1]
    # consecutive residue numbers extend the current segment, a gap starts a new one
    for prev_num, cur_num in zip(resnums, resnums[1:]):
        if cur_num - 1 == prev_num:
            segments[-1] += 1
        else:
            segments.append(1)
par.add_argument(
    '--db',
    default = '/home/anthill/fzheng/home/searchDB/statistics/bc-30-sc-20141022.peprm2.db',
    help = 'a shelve db object which contains the sequences of database targets')
par.add_argument(
    '--cres', required = True, type = int,
    help = 'the index of the central position in the match, start from 1')
par.add_argument(
    '--wd', default = 15, type = int,
    help = 'the size of comparing window on each side of the central position, so 7 means a 15 residue window')
par.add_argument('--id', required = True, help = 'identity cutoff for clustering')
par.add_argument('--outh', default = 'nr', help = 'a head to put before processed seq and match file')
par.add_argument('--conres', type = int, help = 'the index of the contacting position')
par.add_argument('--env', default = 'envpair', help = 'the extension of the environment file')
args = par.parse_args()

matches = open(args.m)
database = shelve.open(args.db)

# create a temporary file of all sequence context
odir = os.getcwd()
ldir = General.createLocalSpace()
tempfile = General.changeExt(args.m, 'seqcontext.fasta')
tempfh = open(ldir + '/' + tempfile, 'w')
matchind = 0

# if consider the contacting position
if args.conres is not None:
    tempfile2 = General.changeExt(args.m, 'seqcontext.fasta2')
    tempfh2 = open(ldir + '/' + tempfile2, 'w')

# output file names
nr_matchf = args.outh + '_' + args.m
nr_seqf = General.changeExt(nr_matchf, 'seq')
# an output environment file is named only when the input one exists
oenv = General.changeExt(args.m, args.env)
nr_env = General.changeExt(nr_matchf, args.env) if os.path.isfile(oenv) else None
# Extract three-body contacts from a .cmap file: every triple of residues whose
# three pairwise contacts all exceed the threshold is written to '<cmap>.con3'.
import itertools
sys.path.insert(0, '/home/grigoryanlab/home/fzheng/modules_py')
import General

if len(sys.argv) - 1 != 2:
    print('<Usage> [a .cmap file] [a threshold]')
    # stop here: without an exit the unpacking below raises ValueError
    sys.exit(0)

cmap, cut = sys.argv[1:]
threshold = float(cut)  # converted once instead of on every contact line

condict = {}   # 'res1-res2' -> contact degree (string as read from the file)
residues = {}  # residue id -> residue name
with open(cmap) as cmap_fh:
    for l in cmap_fh:
        if not l.startswith('contact'):
            continue
        info = l.strip().split()
        if float(info[3]) > threshold:
            residues[info[1]] = info[4]
            residues[info[2]] = info[5]
            condict[info[1] + '-' + info[2]] = info[3]
            # Triples below are enumerated in string-sorted order, which need
            # not match the order in the file (e.g. 'A,10' sorts before 'A,9'),
            # so register the reversed key too. setdefault never overrides a
            # value the file gives explicitly for that direction.
            condict.setdefault(info[2] + '-' + info[1], info[3])

resi = sorted(residues.keys())
with open(General.changeExt(cmap, 'con3'), 'w') as out:
    for r1, r2, r3 in itertools.combinations(resi, 3):
        k12, k13, k23 = r1 + '-' + r2, r1 + '-' + r3, r2 + '-' + r3
        if (k12 in condict) and (k13 in condict) and (k23 in condict):
            out.write('\t'.join([r1, r2, r3,
                                 residues[r1], residues[r2], residues[r3],
                                 condict[k12], condict[k13], condict[k23]]) + '\n')
errorfree = True
# with no explicit --rmsd the cutoff is computed per query inside the loop
smart = args.rmsd is None
if not smart:
    rmsdeff = args.rmsd

for pdb in args.p:
    # set rmsd cutoff using Craig's function
    residues = PDB.ConRes(pdb)
    segments = Fragment.getSegments(residues)
    if smart:
        rmsdeff = mustpress.rmsdEff(segments, args.params[0], args.params[1])

    seqout = args.head + '_' + General.changeExt(pdb, 'seq')
    matchout = General.changeExt(seqout, 'match')
    # an existing sequence output means this query is already done
    if os.path.isfile(seqout):
        continue

    # search is additional to a previous round
    if args.more is not None:
        oseqf = args.more + '_' + General.changeExt(pdb, 'seq')
        if not os.path.isfile(oseqf):
            continue
        # number of lines in the previous round's sequence file
        olen = sum(1 for x in open(oseqf))
        if olen >= args.topn:
            # the previous round already has enough lines: link its outputs
            omatchf = General.changeExt(oseqf, 'match')
            for src, dst in ((oseqf, seqout), (omatchf, matchout)):
                os.system('ln -s ' + src + ' ' + dst)
pdbs = sorted(glob.glob(FRAGMENTS_OUT + '*.pdb'))
ldir = General.createLocalSpace()

for p in pdbs:
    base = General.removePath(General.getBase(p))
    # fragments whose base name starts with 'hit1' use the abundance directory
    # and the second header; all others use designscore and the first header
    if base.startswith('hit1'):
        odir, seq_head = ABUNDANCE_OUT, args.h[1]
    else:
        odir, seq_head = DESIGNSCORE_OUT, args.h[0]
    seqf = odir + seq_head + '_' + base + '.seq'

    seqout = args.nh + '_' + base + '.seq'
    matchout = General.changeExt(seqout, 'match')
    if os.path.isfile(odir + seqout):
        continue

    # count lines of the existing sequence file, stopping once args.cut is reached
    nlines = 0
    if os.path.isfile(seqf):
        for l in open(seqf):
            nlines += 1
            if nlines == args.cut:
                break
    # skip fragments whose existing file already reaches the cutoff
    if nlines >= args.cut:
        continue

    pds_file = changeExt(p, 'pds')
    cmd_master = [
        Master + 'master',
        '--query', pds_file,
        '--targetList', list_tmp,
        '--rmsdCut', args.rmsd,
        '--topN', str(args.cut),
        '--bbRMSD',
        '--matchOut', ldir + '/' + matchout,
        '--seqOut', ldir + '/' + seqout,
    ]
    sub.call(cmd_master)
exit(0) lst, out = sys.argv[1:] # aa is in a certain order aatypes = ['G', 'A', 'V', 'L', 'I', 'M', 'F', 'W', 'P', 'S', 'T', 'C', 'Y', 'N', 'Q', 'D', 'E', 'K', 'R', 'H'] aaindex = {} for i in range(20): aaindex[aatypes[i]] = i # make an 20^3 matrix mat = np.zeros((20,20,20), dtype=int) for l in open(lst): info = l.strip().split('/') name, pds = info[-2], info[-1] con3f = name + '/' + General.changeExt(pds, 'con3') if not os.path.isfile(con3f): continue fh = open(con3f) for ll in fh: ll = ll.strip() aas = ll.split()[3:6] aas = map(PDB.t2s, aas) permut = itertools.permutations(aas, 3) for k in permut: mat[aaindex[k[0]], aaindex[k[1]], aaindex[k[2]]] += 1 np.save(out, mat)
par = argparse.ArgumentParser()
par.add_argument('--f', required = True, help = 'a .tab file')
par.add_argument('--verb', action = 'store_true', help = 'if output verbose')
args = par.parse_args()

CONFIND = '/home/grigoryanlab/home/gevorg/work/MSL/latest/trunk/bin/confind'
rotLib = '/home/anthill/fzheng/home/scripts/termanal/support.default/rotlib/RR2000.rotlib'

# read the table up front so its handle is closed before confind is launched
with open(args.f) as tab_fh:
    tab_lines = tab_fh.readlines()

for l in tab_lines:
    l = l.rstrip('\n')
    fields = l.split()
    # the first two columns are required; blank or short lines used to raise ValueError
    if len(fields) < 2:
        continue
    pid, cid = fields[0:2]
    cleanpdb = pid.lower() + '.clean.pdb'
    if not os.path.isfile(cleanpdb):
        continue
    # verbose runs are stored under a different extension
    if args.verb:
        cmap = General.changeExt(cleanpdb, 'cmapv')
    else:
        cmap = General.changeExt(cleanpdb, 'cmap')
    # an existing output means this structure is already done
    if os.path.isfile(cmap):
        continue
    # NOTE(review): `except Warning` only fires if a Python warning is raised as
    # an exception; no filter is installed here -- confirm intent.
    with warnings.catch_warnings():
        try:
            if args.verb:
                cmd_confind = [CONFIND, '--p', cleanpdb, '--rLib', rotLib, '--verb']
                # verbose output comes on stdout; 'with' closes the capture file
                # (the original left this handle open)
                with open(cmap, 'w') as out:
                    sub.call(cmd_confind, stdout = out)
            else:
                cmd_confind = [CONFIND, '--p', cleanpdb, '--o', cmap, '--rLib', rotLib]
                sub.call(cmd_confind)
        except Warning:
            print('warnings in ' + pid)
env_data_path = '/home/anthill/fzheng/home/searchDB/support_bc-30-sc-correct-20141022/others'

for res in residues:
    cid = res.getChid().strip()
    resnum = res.getResnum()
    # file stem shared by the fragment and sequence files of this residue
    frag_stem = base + '_' + cid + str(resnum)
    # file locations depend on whether --post is set
    if args.post:
        frag_dir = outdir + '/' + base + '/fragments'
        seq_dir = outdir + '/' + base + '/designscore'
    else:
        frag_dir = FRAGMENTS_OUT
        seq_dir = DESIGNSCORE_OUT
    frag_pdb = frag_dir + '/' + frag_stem + '.pdb'
    pds_file = changeExt(frag_pdb, 'pds')
    seqf1 = seq_dir + '/' + head[1] + '_' + frag_stem + '.seq'
    if not os.path.isfile(frag_pdb):
        continue

    matchf1 = General.changeExt(seqf1, 'match')
    cmd_matchInFile = [
        MASTER + '/master',
        '--query', pds_file,
        '--matchIn', matchf1,
        '--structOut', General.changeExt(seqf1, 'sout'),
        '--outType', 'match',
        '--bbRMSD',
        '--topN', str(topN),
    ]
    # the name is rebound to the return value of sub.call
    cmd_matchInFile = sub.call(cmd_matchInFile)

    # keep the first topN lines of the match file
    match_lines = []
    nmatch = 0
    for match_l in open(matchf1):
        nmatch += 1
        match_lines.append(match_l)
        if nmatch == topN:
            break

    # determine the index of original residue in its own TERM
    rn = findPositionInPDB(frag_pdb, resnum, cid)
    envf = open(General.changeExt(seqf1, 'envi'), 'w')
    # match indices are zero-padded to the width of the largest index
    midx_width = len(str(len(match_lines)))
    for i in range(1, len(match_lines) + 1):
        midx = str(i).rjust(midx_width, '0')
# Filter a two-line-per-record fasta file by sequence length. Writes a list of
# .pds paths ('list_<o>') and a filtered fasta ('<fa base>.<o>.fa').
sys.path.insert(0, "/home/grigoryanlab/home/fzheng/modules_py/")
import General

par = argparse.ArgumentParser()
par.add_argument("--fa", required=True, help="a fasta file")
par.add_argument("--o", default="peprm", help="extension of the output file")
# type=int: without it a cutoff given on the command line is a string, and the
# length comparison below no longer compares numbers
par.add_argument(
    "--cut",
    type=int,
    default=30,
    help="the cutoff, when the sequence length is lower than this, it is removed from the list",
)
args = par.parse_args()

# parent of the directory holding this script, resolved once
# (the original resolved it per record and stored it under the builtin name `dir`)
db_root = os.path.realpath(os.path.dirname(os.path.realpath(sys.argv[0])) + "/../")

with open(args.fa) as fa_fh:
    lines = fa_fh.readlines()

# will have two output file, new list and new fa
with open("list_" + args.o, "w") as out_l, \
        open(General.changeExt(args.fa, args.o + ".fa"), "w") as out_fa:
    # records are (header, sequence) line pairs; a dangling final line with no
    # partner is ignored instead of raising IndexError
    for header, seq in zip(lines[0::2], lines[1::2]):
        if len(seq.strip()) < args.cut:
            continue
        # header minus its first character, with the extension changed to .pds
        name = General.changeExt(header.strip()[1:], "pds")
        # files are grouped in sub-directories named by characters 1-2 of the name
        path = db_root + "/" + name[1:3] + "/" + name
        out_l.write(path + "\n")
        out_fa.write(header)
        out_fa.write(seq)