import os, sys
import subprocess as sub
selfbin = os.path.realpath(os.path.dirname(sys.argv[0]))
sys.path.insert(0, '/home/grigoryanlab/home/fzheng/modules_py')
import General

# Driver: for every entry of a list file, run threeBodyContactPotential.py
# on the matching .cmap file (if that file exists) with a 0.01 threshold.
if len(sys.argv) - 1 != 1:
	print('<usage> [a list file]')
	sys.exit(0)
lst = sys.argv[1]

with open(lst) as lst_fh:
	for l in lst_fh:
		info = l.strip().split('/')
		# the last two '/'-separated fields are needed; skip blank lines and
		# entries without a directory part instead of failing on info[-2]
		if len(info) < 2:
			continue
		name, pds = info[-2], info[-1]
		cmap = General.changeExt(pds, 'cmap')
		# the .cmap file is looked up relative to the current directory
		if os.path.isfile(name + '/' + cmap):
			sub.call(['python', selfbin + '/threeBodyContactPotential.py', name + '/' + cmap, '0.01'])
par.add_argument('--conR', action = 'store_true', help = 'whether this is for a pair of contact')
par.add_argument('--env', help = 'if not None, also modify an .env file')
args = par.parse_args()

dirs = [x for x in os.listdir('.') if os.path.isdir(x)]
dirs.sort()
odir = os.getcwd()

for d in dirs:
	os.chdir(odir)
	os.chdir(d)
	pdbs = glob.glob('*.pdb')
	cmds = []
	resn = int(d.split('_')[1][2:])
	for pdb in pdbs:
		matchf = args.head + '_' + General.changeExt(pdb, 'match')
		if not os.path.isfile(matchf):
			continue

		pos = PDB.findPositionInPDB(pdb, resn)
		# if output file is already there, skip the job
		if os.path.isfile('nr'+args.id +'_'+matchf):
			continue

		cmd = ['python', selfbin + '/removeLocalRedundancy.py', '--m', matchf, '--cres', str(pos), '--id', args.id, '--outh', 'nr'+args.id]
		if not args.db == None:
			cmd.extend(['--db', args.db])
		if args.conR:
			conresn = General.getBase(pdb).split('_')[2][1:]
			conpos = PDB.findPositionInPDB(pdb, conresn)
			cmd.extend(['--conres', str(conpos)])
# this script creates .env files from .cmap files

import os, sys, csv, re
import General, Fragment, PDB

if len(sys.argv) -1 <1:
    print '[Usage] <.cmap files as arguments>'
    exit(0)
cmap_files = sys.argv[1:]

headers = ['residue', 'AA', 'total_contact_degree', 'crowdness', 'phipsi', 'permanent_contacts']
for cmap in cmap_files:
    outf = General.changeExt(cmap, 'env_dun')
    if os.path.isfile(outf):
        continue

    residues = {}

    cmap_fh = open(cmap)
    out_fh = open(outf, 'w')
    f_csv = csv.DictWriter(out_fh, headers, delimiter = '\t')
    f_csv.writeheader()
    
    for line in cmap_fh:
        
        if line.startswith('contact'):
            res1, res2, degree, resname1, resname2 = line.rstrip('\n').split()[1:]
            # resnum1, resnum2 = res1.split(',')[1], res2.split(',')[1]
            # if not (resnum1.isdigit() and resnum2.isdigit()):
            #     continue
            if not res1 in residues:
Example #4
0
par.add_argument('--o', required = True, help = 'the output files')
args = par.parse_args()

odir = os.getcwd()

aas = [x for x in PDB.a2aaa]
pairtable = {}
for i in aas:
    for j in aas:
        pairtable[i+'|'+j] = 0
ctable = {x : 0 for x in aas} 

for l in open(args.l):
    info = l.strip().split('/')
    subdir, name = info[-2], info[-1]
    cfile = General.changeExt(subdir + '/' + name, args.ext)
    if not os.path.isfile(cfile):
        continue
    for ll in open(cfile):
        # if ll.find('contact') != 0:
        #     continue
        info2 = ll.strip().split()
        cond, aa1, aa2 = [info2[args.coln[i]] for i in range(3)] 
        aa1, aa2 = PDB.t2s(aa1), PDB.t2s(aa2)

        # optional, for sc cond
        sc_cond = float(info2[-1])
        if sc_cond > 0.01:
            continue

        if (float(cond) >= args.range[0]) and (float(cond) <= args.range[1]):
import os, sys, argparse, csv
import General

par = argparse.ArgumentParser()
par.add_argument('--cmap', nargs = '+', help = '.cmap files, one or more')
args = par.parse_args()

for cmapf in args.cmap:
	assert os.path.isfile(cmapf)
	envf = General.changeExt(cmapf, 'env')
	assert os.path.isfile(envf)
	outf = General.changeExt(cmapf, 'cmapm')

	total_cd = {}
	with open(envf) as ef:
		f_csv = csv.DictReader(ef, delimiter = '\t')
		for row in f_csv:
			resid, tcd = row['residue'], float(row['total_contact_degree'])
			total_cd[resid] = tcd

	out = open(outf, 'w')
	for l in open(cmapf):
		if not l.startswith('contact'):
			continue
		items = l.strip().split()
		res1, res2, cd = items[1:4]
		cd = round(float(cd), 3)
		if (cd == 0) or (not res1 in total_cd) or (not res2 in total_cd):
			continue
		mutual_cd = cd * (0.5/total_cd[res1] + 0.5/total_cd[res2])
		nitems = [x for x in items]
Example #6
0
import General, PDB

# Command-line interface: a list file, the two file extensions to look up,
# a contact-degree range and the output file.
par = argparse.ArgumentParser()
par.add_argument('--l', required = True, help = 'the used list file')
par.add_argument('--e', default = 'env', help = 'the extension of files with environment score')
par.add_argument('--c', default = 'cmap', help = 'the extension of files with contacts')
# type = float: without it, bounds given on the command line arrive as strings
# while the default bounds are numbers
par.add_argument('--range', nargs = 2, type = float, default = [0.01, 1], help = 'the range of contact degree')
par.add_argument('--o', required = True, help = 'the output file')
args = par.parse_args()

out = open(args.o, 'w')
for l in open(args.l):
    info = l.strip().split('/')
    subdir, name = info[-2], info[-1]
    # use the environment file for the whole protein but cmap file for single chain, don't know if this is good
    envfile = General.changeExt(subdir + '/' + name.split('_')[0], args.e)
    cmapfile = General.changeExt(subdir + '/' + name, args.c)
    
    if not (os.path.isfile(envfile) and os.path.isfile(cmapfile)):
        continue
    
    env = {}
    with open(envfile) as ef:
        f_csv = csv.DictReader(ef, delimiter = '\t')
        for row in f_csv:
            env[row['residue']] = row['environment_score']
    
    cf = open(cmapfile)
    for cfl in cf:
        if not cfl.startswith('contact'):
            continue
Example #7
0
par.add_argument('--sl', help = 'use a searchDB list file')
par.add_argument('--o', required = True, help = 'name of the output file')
args = par.parse_args()

out = open(args.o, 'w')

def outputSeq(seqs, name, out, chains = None):
	"""Write the sequences in seqs to out as FASTA records, sorted by key.

	seqs: mapping from a key (used as the record suffix) to a sequence string.
	name: record prefix; each title line is '>' + name + '_' + key.
	out: any object with a write() method.
	chains: optional collection of keys to keep. A value that is not a list
	(for example a string) is expanded with list(), so 'AB' keeps 'A' and 'B'.
	None keeps every key.
	"""
	if (chains is not None) and (not isinstance(chains, list)):
		chains = list(chains)
	# sorted() accepts both a Python 2 key list and a Python 3 key view,
	# unlike calling .sort() on the result of keys()
	for k in sorted(seqs):
		if (chains is not None) and (k not in chains):
			continue
		out.write('>' + name + '_' + k + '\n')
		out.write(seqs[k]+'\n')

if args.sl is None:
	# no --sl list: read args.pl, whose lines split on '_' into exactly two
	# fields; the second field is handed to outputSeq as the chain filter
	with open(args.pl) as pl_fh:
		for l in pl_fh:
			if not l.strip():
				# a blank line cannot be unpacked into two fields
				continue
			pid, cid = l.strip().split('_')
			p = pid.lower() + '.clean.pdb'
			seqs = PDB.pdb2seq(p)
			outputSeq(seqs, pid.lower(), out, cid)
else:
	# --sl list: one path per line; its extension is swapped for 'pdb' and
	# every sequence found in that file is written
	with open(args.sl) as sl_fh:
		for l in sl_fh:
			if not l.strip():
				continue
			p = General.changeExt(l.rstrip('\n'), 'pdb')
			seqs = PDB.pdb2seq(p)
			name = General.removePath(p).split('.')[0]
			outputSeq(seqs, name, out)
out.close()
	assert args.ppfile != None
	pp_file = args.ppfile
	pp_lines = open(pp_file).readlines()
	pp_bins = [float(x) for x in pp_lines[0].strip().split()]
	pp_aatypes = pp_lines[2].strip().split()
	pp_aaindex = {pp_aatypes[x] : x for x in range(20)}
	pp_pots = np.zeros((36, 36, 20))
	for ppl in pp_lines[4:]:
		ppl_items = ppl.strip().split()
		pp_pots[int(ppl_items[0])-1, int(ppl_items[1])-1] = map(float, ppl_items[2:])

seqfs = glob.glob(args.head + '*.seq')
cid, resnum = args.resid

for seqf in seqfs:
	pdbf = General.changeExt( seqf.replace(args.head + '_', ''), 'pdb')
	if not os.path.isfile(pdbf):
		print(pdbf + ' doesn\'t exist!')
		continue

	outf = General.changeExt(pdbf, args.o)

	if args.wgap != None: # specific to gap
		assert args.conR == False, 'wgap and conR cannot be specified simultaneously'
		dirname = General.getBase(pdbf)
		pdbf = args.wgap + '/' + dirname + '/'+ pdbf

	index = PDB.findPositionInPDB(pdbf, resnum, cid)
	aacol = Analyze.readColumn(seqf, index, top = args.uplimit)

	if args.conR: # should contacting residue be constrained?
Example #9
0
        key=lambda x: (
            x.split()[0].split(",")[0],
            int(re.search("\d+", x.split()[0]).group(0)),
            x.split()[1].split(",")[0],
            int(re.search("\d+", x.split()[1]).group(0)),
        ),
    )
    for outstring in sort_outstrs:
        out_fh.write(outstring + "\n")
    out_fh.close()


# For each input structure: produce a verbose confind contact map if one is
# not already on disk, turn it into a graph, run the calculator and finally
# delete the verbose contact map.
for pdb in args.p:
    assert os.path.isfile(pdb)
    # run confind in verbose mode
    cmapv = General.changeExt(pdb, "cmapv")
    if not os.path.isfile(cmapv):
        cmd_confind = [CONFIND, "--p", pdb, "--rLib", rotLib, "--verb"]
        try:
            # confind's stdout is captured straight into the .cmapv file
            with open(cmapv, "w") as cmap_fh:
                sub.call(cmd_confind, stdout=cmap_fh)
        except OSError:
            # confind could not be started: do not leave an empty .cmapv
            # behind, a later run would find it and parse it as if valid
            if os.path.isfile(cmapv):
                os.remove(cmapv)
            raise

    # parse cmapv file for all pairs of contacts
    G, V = VerboseToGraph(cmapv, pdb)

    outf = General.changeExt(pdb, args.ext)
    if not args.var:
        calculator(G, outf)
    else:
        calculator(G, outf, V)
    os.remove(cmapv)
Example #10
0
uplimit = args.uplimit
nseq = 0
for match_line in open(args.m):
    if (uplimit != None) and (nseq == uplimit):
        break
    match_line = match_line.strip()
    indices = Analyze.index_from_match(match_line)
    index1, index2 = indices[args.n[0]], indices[args.n[1]]

    target_pds = match_line.split()[1]
    targetid = General.getBase( General.removePath(match_line.split()[1]) )
    env_dict = database_path + '/' + targetid[1:3] + '/' + targetid + '.freedom.db'
    db = shelve.open(env_dict, 'r')

    # extract post-processed pdb files from target_pds
    resfile = database_path + '/' + targetid[1:3] + '/' + General.changeExt( General.removePath(target_pds), 'post.res')
    allres = open(resfile).read().splitlines()
    resid1, resid2 = allres[index1], allres[index2]
    resid1, resid2 = resid1[0] + ',' + resid1[1:], resid2[0] + ',' + resid2[1:]
    fields = ['sumcond', 'crwdnes', 'freedom', 'phi', 'psi', 'aa']
    outfh.write(targetid + '\t')
    if not resid1 in db:
        outfh.write('\t'.join([resid1] + ['NA' for x in range(len(fields))]))
    else:
        res_info = db[resid1]
        outfields = [str(res_info[x]) if x in res_info else 'NA' for x in fields]
        outfh.write('\t'.join([resid1] + outfields))
    outfh.write('\t')
    if not resid2 in db:
        outfh.write('\t'.join([resid2] + ['NA' for x in range(len(fields))]))
    else:
Example #11
0
args = parser.parse_args()

# parameters required for using smart rmsd cutoff from Craig
# rmsdmax, perLen = 1,1, 15

# dependencies between arguments
if (args.rmsd == None) and (args.bbrmsd == None) and (args.nohomo == None) and (not args.uniq) and (not args.smart):
    raise General.myerror('I am doing nothing...')
if (args.uniq == False) and (args.nonat == True):
    raise General.myerror('cannot specify nonat without uniq')
if args.ohead == args.head:
    raise General.myerror('after process the head name is the same, not allowed...')

pid = args.pdb.split('_')[0]
matchf = args.head+'_'+General.changeExt(args.pdb, 'match')
seqf = General.changeExt(matchf, 'seq')

conres = PDB.ConRes(args.pdb)

# if using smart rmsd cutoff, need to create a list in which each element is the length of a segment
if args.smart:
    # residue numbers of every residue in conres, in ascending order
    resnums = [r.getResnum() for r in conres]
    resnums.sort()
    # segments[k] counts the residues in the k-th run of consecutive residue
    # numbers; it starts with one run of length 1 for the first residue
    # NOTE(review): only residue numbers are compared here (no chain id), and
    # an empty conres still yields [1] -- confirm both are intended
    segments = [1]
    for i in range(1, len(resnums)):
        if resnums[i] - 1 == resnums[i-1]:
            # directly follows the previous residue number: extend the current run
            segments[-1] += 1
        else:
            # any other difference starts a new run
            segments.append(1)
par.add_argument('--db', default = '/home/anthill/fzheng/home/searchDB/statistics/bc-30-sc-20141022.peprm2.db', help = 'a shelve db object which contains the sequences of database targets')
par.add_argument('--cres', required = True, type = int, help = 'the index of the central position in the match, start from 1')
par.add_argument('--wd', default = 15, type = int, help = 'the size of comparing window on each side of the central position, so 7 means a 15 residue window')
par.add_argument('--id', required = True, help = 'identity cutoff for clustering')
par.add_argument('--outh', default = 'nr', help = 'a head to put before processed seq and match file')
par.add_argument('--conres', type = int, help = 'the index of the contacting position')
par.add_argument('--env', default = 'envpair', help = 'the extension of the environment file')
args = par.parse_args()

matches = open(args.m)
database = shelve.open(args.db)

# create a temporary file of all sequence context
odir = os.getcwd()
ldir = General.createLocalSpace()
tempfile = General.changeExt(args.m, 'seqcontext.fasta') 
tempfh = open(ldir + '/' + tempfile, 'w')
matchind = 0

# if consider the contacting position
if args.conres != None:
	tempfile2 = General.changeExt(args.m, 'seqcontext.fasta2')
	tempfh2 = open(ldir + '/' + tempfile2, 'w')

# output file names
nr_matchf = args.outh + '_' + args.m
nr_seqf = General.changeExt(nr_matchf, 'seq')
nr_env = None
oenv = General.changeExt(args.m, args.env)
if os.path.isfile(oenv):
	nr_env = General.changeExt(nr_matchf, args.env)
import itertools
sys.path.insert(0, '/home/grigoryanlab/home/fzheng/modules_py')
import General

def findThreeBodyContacts(lines, cut):
	"""Return all residue triples that are pairwise in contact.

	lines: iterable of .cmap lines. Only lines starting with 'contact' are
	used; their whitespace-separated fields are read as
	'contact res1 res2 degree resname1 resname2'.
	cut: threshold (number or numeric string); a contact is kept only when
	its degree is strictly greater than cut.

	Returns a list of rows, one per triple, each a list of strings:
	[r1, r2, r3, name1, name2, name3, degree12, degree13, degree23],
	with the residues of a triple in sorted (string) order.
	"""
	cut = float(cut)
	condict = {}
	residues = {}
	for l in lines:
		info = l.strip().split()
		if l.startswith('contact') and (float(info[3]) > cut):
			residues[info[1]] = info[4]
			residues[info[2]] = info[5]
			# a contact is an unordered pair: store both orientations, because
			# the triples below come in string-sorted order ('A,10' < 'A,2'),
			# which need not match the order used in the file
			condict[(info[1], info[2])] = info[3]
			condict[(info[2], info[1])] = info[3]

	rows = []
	for r1, r2, r3 in itertools.combinations(sorted(residues), 3):
		if ((r1, r2) in condict) and ((r1, r3) in condict) and ((r2, r3) in condict):
			rows.append([r1, r2, r3, residues[r1], residues[r2], residues[r3],
				condict[(r1, r2)], condict[(r1, r3)], condict[(r2, r3)]])
	return rows


if __name__ == '__main__':
	if len(sys.argv) - 1 != 2:
		print('<Usage> [a .cmap file] [a threshold]')
		# stop here: unpacking sys.argv below would fail anyway
		sys.exit(1)

	cmap, cut = sys.argv[1:]

	with open(cmap) as cmap_fh:
		rows = findThreeBodyContacts(cmap_fh, cut)

	# one tab-separated line per triple, written next to the input as .con3
	out = open(General.changeExt(cmap,'con3'), 'w')
	for row in rows:
		out.write('\t'.join(row) + '\n')
	out.close()


Example #14
0
errorfree = True

smart = False
if args.rmsd == None:
	smart = True
else:
	rmsdeff = args.rmsd

for pdb in args.p:
	# set rmsd cutoff using Craig's function
	residues = PDB.ConRes(pdb)
	segments = Fragment.getSegments(residues)
	if smart:
		rmsdeff = mustpress.rmsdEff(segments, args.params[0], args.params[1])

	seqout = args.head + '_' + General.changeExt(pdb, 'seq')
	matchout = General.changeExt(seqout, 'match')

	if os.path.isfile(seqout):
		continue

	# search is additional to a previous round
	if args.more != None:
		oseqf = args.more + '_' + General.changeExt(pdb, 'seq')
		if not os.path.isfile(oseqf):
			continue
		olen = sum([1 for x in open(oseqf)])
		if olen >= args.topn:
			omatchf = General.changeExt(oseqf, 'match')
			os.system('ln -s ' + oseqf + ' ' + seqout)
			os.system('ln -s ' + omatchf + ' ' + matchout)
pdbs = glob.glob(FRAGMENTS_OUT + '*.pdb')
pdbs.sort()
ldir = General.createLocalSpace()

for p in pdbs:
	base = General.removePath( General.getBase(p) )

	if base.startswith('hit1'):
		odir = ABUNDANCE_OUT
		seqf = ABUNDANCE_OUT + args.h[1] + '_' + base + '.seq'
	else:
		odir = DESIGNSCORE_OUT
		seqf = DESIGNSCORE_OUT + args.h[0] + '_' + base + '.seq'

	seqout = args.nh + '_' + base + '.seq'
	matchout = General.changeExt(seqout, 'match')
	if os.path.isfile(odir + seqout):
		continue

	nlines = 0
	if os.path.isfile(seqf):
		for l in open(seqf):
			nlines += 1
			if nlines == args.cut:
				break
	if nlines >= args.cut:
		continue

	pds_file = changeExt(p, 'pds')
	cmd_master = [Master + 'master', '--query', pds_file, '--targetList', list_tmp, '--rmsdCut', args.rmsd, '--topN', str(args.cut), '--bbRMSD', '--matchOut', ldir + '/' + matchout, '--seqOut', ldir + '/' +seqout]
	sub.call(cmd_master)
Example #16
0
	exit(0)
lst, out = sys.argv[1:]

# aa is in a certain order
aatypes = ['G', 'A', 'V', 'L', 'I', 'M', 'F', 'W', 'P', 'S', 'T', 'C', 'Y', 'N', 'Q', 'D', 'E', 'K', 'R', 'H']
# position of each one-letter code in aatypes, used as the matrix index
aaindex = {aa : i for i, aa in enumerate(aatypes)}

# make an 20^3 matrix
mat = np.zeros((20,20,20), dtype=int)

with open(lst) as lst_fh:
	for l in lst_fh:
		info = l.strip().split('/')
		# the last two '/'-separated fields are needed; skip blank lines and
		# entries without a directory part instead of failing on info[-2]
		if len(info) < 2:
			continue
		name, pds = info[-2], info[-1]
		con3f = name + '/' + General.changeExt(pds, 'con3')
		if not os.path.isfile(con3f):
			continue
		# close each .con3 file as soon as it has been counted
		with open(con3f) as fh:
			for ll in fh:
				# fields 3-5 of a .con3 line are converted with PDB.t2s
				aas = [PDB.t2s(x) for x in ll.strip().split()[3:6]]
				# every ordering of the three residues is counted
				for k in itertools.permutations(aas, 3):
					mat[aaindex[k[0]], aaindex[k[1]], aaindex[k[2]]] += 1

np.save(out, mat)


Example #17
0
par = argparse.ArgumentParser()
par.add_argument('--f', required = True, help = 'a .tab file')
par.add_argument('--verb', action = 'store_true', help = 'if output verbose')
args = par.parse_args()

CONFIND = '/home/grigoryanlab/home/gevorg/work/MSL/latest/trunk/bin/confind'
rotLib = '/home/anthill/fzheng/home/scripts/termanal/support.default/rotlib/RR2000.rotlib'

# For every row of the .tab file, run confind on '<pid>.clean.pdb' unless the
# structure is missing or the contact map already exists.
with open(args.f) as tab_fh:
    for l in tab_fh:
        fields = l.split()
        # two leading columns are needed; skip blank or short rows
        if len(fields) < 2:
            continue
        pid, cid = fields[0:2]
        cleanpdb = pid.lower() + '.clean.pdb'
        if not os.path.isfile(cleanpdb):
            continue
        # verbose runs are stored under a separate extension
        cmap = General.changeExt(cleanpdb, 'cmapv' if args.verb else 'cmap')
        if os.path.isfile(cmap):
            continue
        # NOTE(review): no warnings filter is set in this block, so the
        # 'except Warning' branch only fires if warnings are turned into
        # errors elsewhere -- confirm this is intended
        with warnings.catch_warnings():
            try:
                if args.verb:
                    # verbose output goes to stdout, so capture it in the
                    # file and close the handle once confind returns
                    cmd_confind = [CONFIND, '--p', cleanpdb, '--rLib', rotLib, '--verb']
                    with open(cmap, 'w') as out:
                        sub.call(cmd_confind, stdout = out)
                else:
                    cmd_confind = [CONFIND, '--p', cleanpdb, '--o', cmap, '--rLib', rotLib]
                    sub.call(cmd_confind)
            except Warning:
                print('warnings in ' + pid)
Example #18
0
    env_data_path = '/home/anthill/fzheng/home/searchDB/support_bc-30-sc-correct-20141022/others'
    for res in residues:
        cid, resnum = res.getChid().strip(), res.getResnum()
        if not args.post:
            frag_pdb = FRAGMENTS_OUT + '/' + base + '_' + cid + str(resnum) + '.pdb'
            pds_file = changeExt(frag_pdb, 'pds')
            seqf1 = DESIGNSCORE_OUT + '/' + head[1] + '_' + base + '_' + cid + str(resnum) + '.seq'
        else:
            frag_pdb = outdir + '/' + base + '/fragments/' + base + '_' + cid + str(resnum) + '.pdb'
            pds_file = changeExt(frag_pdb, 'pds')
            seqf1 = outdir + '/' + base + '/designscore/' + head[1] + '_' + base + '_' + cid + str(resnum) + '.seq'

        if not os.path.isfile(frag_pdb):
            continue

        matchf1 = General.changeExt(seqf1, 'match')

        cmd_matchInFile = [MASTER + '/master', '--query', pds_file, '--matchIn', matchf1, '--structOut', General.changeExt(seqf1, 'sout'), '--outType', 'match', '--bbRMSD', '--topN', str(topN)]
        cmd_matchInFile = sub.call(cmd_matchInFile)
        match_lines = []
        nmatch = 0
        for match_l in open(matchf1):
            nmatch += 1
            match_lines.append(match_l)
            if nmatch == topN:
                break
        # determine the index of original residue in its own TERM
        rn = findPositionInPDB(frag_pdb, resnum, cid)
        envf = open(General.changeExt(seqf1, 'envi'), 'w')
        for i in range(1, len(match_lines)+1):
            midx = str(i).rjust(len(str(len(match_lines))), '0')
Example #19
0
sys.path.insert(0, "/home/grigoryanlab/home/fzheng/modules_py/")
import General

par = argparse.ArgumentParser()
par.add_argument("--fa", required=True, help="a fasta file")
par.add_argument("--o", default="peprm", help="extension of the output file")
# type=int: without it a cutoff given on the command line is a string and the
# length comparison below no longer compares numbers
par.add_argument(
    "--cut",
    type=int,
    default=30,
    help="the cutoff, when the sequence length is lower than this, it is removed from the list",
)
args = par.parse_args()

# resolved parent of the directory that holds this script; the list entries
# written below are paths under it
parent_dir = os.path.realpath(os.path.dirname(os.path.realpath(sys.argv[0])) + "/../")

with open(args.fa) as fa_fh:
    lines = fa_fh.readlines()

# will have two output file, new list and new fa
with open("list_" + args.o, "w") as out_l, open(General.changeExt(args.fa, args.o + ".fa"), "w") as out_fa:
    # the fasta file is read as (title line, sequence line) pairs; zip() drops
    # a trailing title line that has no sequence line after it
    for title, seq in zip(lines[0::2], lines[1::2]):
        if len(seq.strip()) < args.cut:
            continue
        # title without its first character, with the extension set to 'pds'
        name = General.changeExt(title.strip()[1:], "pds")
        # entries are grouped in subdirectories named after characters 1-2 of the name
        path = parent_dir + "/" + name[1:3] + "/" + name
        out_l.write(path + "\n")
        out_fa.write(title)
        out_fa.write(seq)