def main():
    """
    main for getnrpdblist.py

    Usage: getnrpdblist.py

    Input is CD-HIT cluster file on stdin.
    Output is list of PDB identifiers on stdout.
    Each has format:

        <repr-id>: <list of ids in cluster>

    where <repr-id> is the PDB (+ chain e.g. 2ssp_B) id in pdb_seqres
    format that has been chosen as the representative for the cluster
    and following is list of pdb_seqres identifiers in that cluster.
    Only proteins are output, not DNA, RNA, etc. (see module docstring
    at top of file).

    Any command line argument is an error: usage() is called with the
    program name (usage() is defined elsewhere in this file).
    """
    import tempfile

    if len(sys.argv) > 1:
        usage(sys.argv[0])

    # mkdtemp creates the directory itself, atomically and with a unique
    # name, so there is no window between choosing the name and creating
    # the directory (unlike os.tempnam followed by os.mkdir).
    TMPDIR = tempfile.mkdtemp(prefix="pdbgz")
    try:
        get_nr_pdb_list(TMPDIR)
    finally:
        # always remove the working directory, even on error
        cleanup_tmpdir(TMPDIR)
def get_tableau_from_pdbstruct(pdbid, domain, pdb_structure, ptnode_list):
    """
    Build a PTTableau object for the tableau by first creating a
    simple PDB file with only the ATOM records for residues in the
    domain we are processing, and also a .SSEsInfo file containing the
    secondary structure assignments we already have, then running
    TableauCreator on it (using our simple PDB file and SSEsInfo) and
    parsing the output.

    Parameters:
       pdbid - PDB identifier of the structure
       domain - The PTDomain object for our current domain
       pdb_structure - parsed Bio.PDB structure
       ptnode_list - list of PTNode objects (ie iterable of PTNode)
                     representing the SSEs (helices,strands) the
                     tableau is for.

    Return value:
       PTTableau object built from TableauCreator output, or None if
       read_tableau_from_tableaucreator() could not produce one.
    """
    import tempfile

    # mkdtemp creates a uniquely named directory atomically; the old
    # os.tempnam + os.mkdir pair left a window in which another process
    # could create the same path.
    TMPDIR = tempfile.mkdtemp(prefix="pttabin")
    try:
        # file is named <pdbid>[-<domainid>].pdb
        filename = pdbid
        if domain.domainid is not None:
            filename += '-' + domain.domainid
        filename += '.pdb'
        domain_pdb_filename = os.path.join(TMPDIR, filename)

        # write only the part of the structure selected by DomainSelect
        io = PDBIO()
        io.set_structure(pdb_structure)
        io.save(domain_pdb_filename, DomainSelect(domain))

        # write our own SSE assignments for TableauCreator to use
        ssesinfo_filename = os.path.join(TMPDIR,
                                         filename + ".input-SSEsInfo")
        write_ssesinfo(ssesinfo_filename, ptnode_list)

        tableau = read_tableau_from_tableaucreator(domain_pdb_filename,
                                                   ptnode_list,
                                                   ssesinfo_filename)
        os.unlink(domain_pdb_filename)
        os.unlink(ssesinfo_filename)
    finally:
        cleanup_tmpdir(TMPDIR)
    return tableau
def get_tableaux(pdb_filename,
                 secstruct_program='dssp',
                 domain_program='none',
                 include_310_helices=True,
                 include_pi_helices=True,
                 sse_id_list=None,
                 min_sse_len=None,
                 use_numeric=False,
                 use_hk=False,
                 build_dist_matrix=False):
    """
    Get a tableau for a single PDB or ASTRAL pdb-style file
    (compressed files e.g. pdb1qlp.ent.gz) or uncompressed
    or the ASTRAL pdb-style hierarchy
    (uncompressed files e.g. d1qlpa_.ent).

    Parameters:
       pdb_filename - filename of PDB or ASTRAL pdb-style file, as above.
       secstruct_program - secondary structure definition program
                       ('stride' or 'dssp' or 'pdb') to use.
       domain_program - domain decomposition method ('ddomain','cath', etc.)
       include_310_helices - if True, include 3_10 helices in the graph
       include_pi_helices - if True, include pi helices in the graph
       sse_id_list - list of ints representing SSE sequential id numbers
                     to include in tableau. Default None.
                     When None, all SSEs are included.
       min_sse_len - min number of residues in SSE to be included.
                     Default None (no min length).
       use_numeric - if True build Numeric.array Omega matrix not PTTableau
       use_hk - If True build tableaux with HH and KK codes for strands in
                same sheet. default False.
       build_dist_matrix - If True, build SSE midpoint distance matrices
                instead of tableaux.

    Return value:
       tuple (pdbid, tableaux_list, sse_string_list)
       from the pdb file, only one in lists unless domain decomposition
       is used and finds multidomains in input.
       tableaux_list is list of tableaux or omega matrices
       sse_string_list is SSE string description e.g. 'EEHHE' etc.
    """
    import subprocess
    import tempfile

    pdb_file_basename = os.path.basename(pdb_filename)
    (name, extension) = os.path.splitext(pdb_file_basename)

    # TMPDIR stays None unless we had to uncompress the input; it is
    # created inside the try so that it is removed on any failure.
    TMPDIR = None
    try:
        # check for compressed files. We only support gzip (.gz)
        # Note we are not using the zlib or GzipFile python modules
        # since we are calling to external programs which require the
        # file uncompressed themselves anyway so we'll just run gzip
        # to uncompress the file to a temporary directory.
        if extension == '.gz':
            TMPDIR = tempfile.mkdtemp(prefix="ptgz")
            our_pdb_filename = os.path.join(TMPDIR, name)
            # Pass the arguments as a list (no shell) so filenames with
            # spaces or shell metacharacters are handled correctly.
            # As before, the gzip exit status is not checked.
            out_fh = open(our_pdb_filename, "wb")
            try:
                subprocess.call(["gzip", "-d", "-c", pdb_filename],
                                stdout=out_fh)
            finally:
                out_fh.close()
        else:
            our_pdb_filename = pdb_filename

        # derive the identifier: for names like pdb1qlp(.ent) use the
        # four characters after the 'pdb' prefix, upper-cased.
        pdbid = name
        if len(pdbid) >= 6 and pdbid[:3].upper() == "PDB":
            pdbid = pdbid[3:7].upper()

        # parse PDB file
        pdb_parser = PDBParser()
        pdb_struct = pdb_parser.get_structure(pdbid, our_pdb_filename)

        # create the Tableaux (note make_tableaux takes use_numeric
        # before sse_id_list and use_hk before min_sse_len)
        (tableaux_list, sse_string_list) = make_tableaux(
            our_pdb_filename,
            pdb_struct,
            secstruct_program,
            domain_program,
            include_310_helices,
            include_pi_helices,
            use_numeric,
            sse_id_list,
            use_hk,
            min_sse_len,
            build_dist_matrix)
    finally:
        if TMPDIR is not None:
            cleanup_tmpdir(TMPDIR)
    return (pdbid, tableaux_list, sse_string_list)
def read_tableau_from_tableaucreator(pdb_filename, ptnode_list,
                                     ssesinfo_filename):
    """
    Run Arun's TableauCreator program on the supplied pdb_filename
    using SSEsInfo file.

    Parameters:
       pdb_filename - PDB file to run TableauCreator on
       ptnode_list - list of PTNode objects (ie iterable of PTNode)
                     representing the SSEs (helices,strands) the
                     tableau is for.
       ssesinfo_filename - filename of the .SSEsInfo file that was written
                           to define SSEs for TableauCreator.

    Return value:
       PTTableau object built from TableauCreator output, or None if
       TableauCreator failed or its output could not be matched/parsed
       (a message is written to stderr in those cases).

    Uses the module global 'verbose'.

    NB: TableauCreator is not yet published or available (October 2007)
    and I am using a private version which Arun sent me, which I modified
    to add the -s option to use STRIDE rather than DSSP
    and to have the -i option to parse .SSEsInfo files.
    """
    import tempfile

    # TableauCreator needs an output directory where it writes all its
    # intermediate/output files, only puts progress information/errors
    # to stdout/stderr.
    # mkdtemp creates the directory atomically with a unique name
    # (os.tempnam followed by os.mkdir was open to a race).
    tmpdir = tempfile.mkdtemp(prefix="pttab")
    try:
        # NOTE(review): the filenames are interpolated unquoted into a
        # shell command, so paths containing spaces or shell
        # metacharacters will break it.
        command = "TableauCreator "
        command += "-i " + ssesinfo_filename + " "
        command += pdb_filename + " " + tmpdir
        command += " >/dev/null"
        if verbose:
            sys.stderr.write("running '" + command + "'...")
        os.system(command)
        if verbose:
            sys.stderr.write("done\n")

        # output files are:
        #  <pdbfilename>.angles
        #  <pdbfilename>.SSEsInfo
        #  <pdbfilename>.stride or <pdbfilename>.dssp
        #  <pdbfilename>.tableau
        outfile_prefix = os.path.join(tmpdir, os.path.basename(pdb_filename))

        # success is signalled by a TABCREATE_OK file in the output dir
        if not os.path.isfile(os.path.join(tmpdir, "TABCREATE_OK")):
            sys.stderr.write("ERROR: TableauCreator failed\n")
            return None

        # Now the tricky thing is TableauCreator indexes its matrix just with
        # purely sequential numbers from 0 (as conventional)
        # assuming all SSEs in one domain and in fact one chain
        # (so we handle this by creating our own simple PDB file with only
        # ATOM records for our current domain, and only one TER record on
        # end so chains concatenated effectively).
        # And also (as in comments above functions) we have the dodginess of
        # doing the same thing in different ways in multiple places
        # (DSSP/STRIDE parsing, PDB parsing, etc.).
        # So let's check that the TableauCreator SSE info lines up with ours
        # (otherwise we can't use the tableau data).

        # parse the SSEsInfo file and check lines up with ptnodes,
        # returns list of ptnodes corresponding to Tableau entries (may be
        # shorter than our input node list; some removed as no equivalent
        # in tableau).
        nodelist = parse_tableaucreator_ssesinfo(outfile_prefix + '.SSEsInfo',
                                                 ptnode_list)
        if nodelist is None:
            sys.stderr.write('WARNING: problem with TableauCreator output;\n'
                             ' tableau information will not be used\n')
            return None

        tableau_filename = outfile_prefix + ".tableau"
        tableau = parse_tableaucreator_output(tableau_filename, nodelist)
        if tableau is None:
            sys.stderr.write(
                'WARNING: problem parsing TableauCreator output;\n'
                ' tableau information will not be used\n')
        elif verbose:
            sys.stderr.write(str(tableau))
        return tableau
    finally:
        # runs on every exit path, including exceptions from the parsers,
        # so the output directory is never left behind
        cleanup_tmpdir(tmpdir)
def read_domains_from_ddomain(pdb_filename, pdb_model, chainid=None):
    """
    Use the DDOMAIN program to parse the structure from a PDB file into
    domains and return the corresponding list of PTDomain objects.

    DDOMAIN is described in

    Zhou, Xue, Zhou 2007 'DDOMAIN: Dividing structures into domains using a
    normalized domain-domain interaction profile' Protein Science 16:947-955.

    It is available as a 64-bit linux executable and FORTRAN-77 source code
    from http://sparks.informatics.iupui.edu/Resource_files/DDOMAIN.tar.gz

    Parameters:
       pdb_filename - filename of PDB file to run DDOMAIN on
       pdb_model - Bio.PDB model struct for this PDB entry. Note that this
                   is only needed in the case that a DDomain domain has
                   different chain identifiers for start and end and
                   is then used just to find last residue number in chain.
       chainid - (default None). If not None, only the specified chain
                 is requested.

    Return value:
       List of PTDomain objects, one for each domain.
       If the filename is in neither the 1QLP.pdb nor the pdb1qlp.ent
       format, a warning is written to stderr, DDomain is not run, and a
       list with a single PTDomain(None, None) is returned.

       NOTE: if there is only one domain, we will return a list with a single
       PTDomain with all data None, signifying a single domain protein
       with no further information. This is mainly because of when
       there are multiple chains, in which case the single domain is reported
       by DDOMAIN as having a different chain id for start and end.
       If there is a single domain we really don't want to do anything special,
       so it is better to just have it as a special case where no domain
       processing is done.

    Side effects:
       Temporarily changes the process working directory (restored before
       returning). Uses the module global 'verbose'.
    """
    import subprocess
    import tempfile

    # DDOMAIN needs the PDB file in its working directory, and it reads
    # the PDB code (e.g. 1QLP for PDB file 1QLP.pdb) from stdin
    # (optionally with chain suffix, which we won't use)
    # Note it requires this filename format, so for format like pdb1qlp.ent
    # we need to rename the file for DDOMAIN to 1QLP.pdb
    #
    # This is nasty, but otherwise have to modify DDOMAIN FORTRAN-77 source
    # so that's even more hassle to have to have a custom version (like we
    # did with STRIDE).
    # So we'll work in a temporary directory, make a symlink (TODO: only
    # UNIX allows this, maybe should actually copy file so works on other
    # platforms) and run DDOMAIN there.
    oldcwd = os.getcwd()
    # mkdtemp creates the directory atomically with a unique name
    # (os.tempnam followed by os.mkdir was open to a race).
    TMPDIR = tempfile.mkdtemp(prefix="ptdd")
    symlink_path = None   # set only once the symlink really exists
    try:
        pdb_file_basename = os.path.basename(pdb_filename)
        (name, extension) = os.path.splitext(pdb_file_basename)
        if extension.lower() == '.pdb':
            # e.g. 1QLP.pdb: link under the same name
            pdb_identifier = name
            link_name = pdb_file_basename
        elif extension == '.ent' and name[:3].lower() == 'pdb':
            # e.g. pdb1qlp.ent, make a symlink to it in format 1QLP.pdb
            pdb_identifier = name[3:7].upper()
            link_name = pdb_identifier + '.pdb'
        else:
            sys.stderr.write('WARNING: unknown PDB filename format "'
                             + pdb_file_basename + '"\n')
            sys.stderr.write(' Not running DDomain\n')
            # one-domain protein, no further info
            return [PTDomain(None, None)]

        link_path = os.path.join(TMPDIR, link_name)
        os.symlink(os.path.abspath(pdb_filename), link_path)
        # recorded after creation so the finally clause never tries to
        # unlink a link that was not made (which would mask the real error)
        symlink_path = link_path

        os.chdir(TMPDIR)
        if verbose:
            sys.stderr.write("running DDomain...")

        if chainid is not None:
            pdbchainid = pdb_identifier + chainid
        else:
            pdbchainid = pdb_identifier

        # subprocess.Popen is the documented replacement for the
        # deprecated os.popen2; the command is a constant so running it
        # through the shell (as popen2 did) is safe.
        ddomain = subprocess.Popen("DDomain", shell=True,
                                   stdin=subprocess.PIPE,
                                   stdout=subprocess.PIPE,
                                   close_fds=True,
                                   universal_newlines=True)
        try:
            ddomain.stdin.write(pdbchainid + '\n')
            ddomain.stdin.close()
            domain_list = parse_ddomain_output(ddomain.stdout, pdb_model)
        finally:
            ddomain.stdout.close()
            ddomain.wait()   # reap the child so no zombie is left
        if verbose:
            sys.stderr.write("done\n")
    finally:
        if symlink_path:
            os.unlink(symlink_path)
        os.chdir(oldcwd)
        cleanup_tmpdir(TMPDIR)
    return domain_list
def get_tableaux(
    pdb_filename,
    secstruct_program="dssp",
    domain_program="none",
    include_310_helices=True,
    include_pi_helices=True,
    sse_id_list=None,
    min_sse_len=None,
    use_numeric=False,
    use_hk=False,
    build_dist_matrix=False,
):
    """
    Get a tableau for a single PDB or ASTRAL pdb-style file
    (compressed files e.g. pdb1qlp.ent.gz) or uncompressed
    or the ASTRAL pdb-style hierarchy
    (uncompressed files e.g. d1qlpa_.ent).

    Parameters:
       pdb_filename - filename of PDB or ASTRAL pdb-style file, as above.
       secstruct_program - secondary structure definition program
                       ('stride' or 'dssp' or 'pdb') to use.
       domain_program - domain decomposition method ('ddomain','cath', etc.)
       include_310_helices - if True, include 3_10 helices in the graph
       include_pi_helices - if True, include pi helices in the graph
       sse_id_list - list of ints representing SSE sequential id numbers
                     to include in tableau. Default None.
                     When None, all SSEs are included.
       min_sse_len - min number of residues in SSE to be included.
                     Default None (no min length).
       use_numeric - if True build Numeric.array Omega matrix not PTTableau
       use_hk - If True build tableaux with HH and KK codes for strands in
                same sheet. default False.
       build_dist_matrix - If True, build SSE midpoint distance matrices
                instead of tableaux.

    Return value:
       tuple (pdbid, tableaux_list, sse_string_list)
       from the pdb file, only one in lists unless domain decomposition
       is used and finds multidomains in input.
       tableaux_list is list of tableaux or omega matrices
       sse_string_list is SSE string description e.g. 'EEHHE' etc.
    """
    import subprocess
    import tempfile

    pdb_file_basename = os.path.basename(pdb_filename)
    (name, extension) = os.path.splitext(pdb_file_basename)

    # TMPDIR is None unless the input had to be uncompressed. It is
    # created inside the try block so it is cleaned up on any failure.
    TMPDIR = None
    try:
        # check for compressed files. We only support gzip (.gz)
        # Note we are not using the zlib or GzipFile python modules
        # since we are calling to external programs which require the
        # file uncompressed themselves anyway so we'll just run gzip
        # to uncompress the file to a temporary directory.
        if extension == ".gz":
            TMPDIR = tempfile.mkdtemp(prefix="ptgz")
            our_pdb_filename = os.path.join(TMPDIR, name)
            # Argument list, no shell: filenames containing spaces or
            # shell metacharacters are passed through unharmed.
            # The gzip exit status is (still) not checked.
            uncompressed_fh = open(our_pdb_filename, "wb")
            try:
                subprocess.call(
                    ["gzip", "-d", "-c", pdb_filename],
                    stdout=uncompressed_fh,
                )
            finally:
                uncompressed_fh.close()
        else:
            our_pdb_filename = pdb_filename

        # for names like pdb1qlp(.ent) the identifier is the four
        # characters following the 'pdb' prefix, upper-cased
        pdbid = name
        if len(pdbid) >= 6 and pdbid[:3].upper() == "PDB":
            pdbid = pdbid[3:7].upper()

        # parse PDB file
        pdb_parser = PDBParser()
        pdb_struct = pdb_parser.get_structure(pdbid, our_pdb_filename)

        # create the Tableaux (make_tableaux positional order differs
        # from ours: use_numeric precedes sse_id_list, use_hk precedes
        # min_sse_len)
        (tableaux_list, sse_string_list) = make_tableaux(
            our_pdb_filename,
            pdb_struct,
            secstruct_program,
            domain_program,
            include_310_helices,
            include_pi_helices,
            use_numeric,
            sse_id_list,
            use_hk,
            min_sse_len,
            build_dist_matrix,
        )
    finally:
        if TMPDIR is not None:
            cleanup_tmpdir(TMPDIR)
    return (pdbid, tableaux_list, sse_string_list)
try: exit_status = 0 # parse PDB file - only needed for DDOMAIN when segment spans chains pdb_parser = PDBParser() pdb_struct = pdb_parser.get_structure(pdbid, our_pdb_filename) test_domains = get_domains(test_domain_method, pdbid, our_pdb_filename, pdb_struct) ref_domains = get_domains(ref_domain_method, pdbid, our_pdb_filename, pdb_struct) if print_domains: print test_domain_method write_domains(sys.stdout, test_domains) print ref_domain_method write_domains(sys.stdout, ref_domains) print domeval.domain_eval(test_domains, ref_domains) except NotInCATH_Exception,ex_pdbid: sys.stderr.write(str(ex_pdbid) + " not found in CATH CDF file\n") exit_status = 1 finally: if used_tmp_file: cleanup_tmpdir(TMPDIR) sys.exit(exit_status) if __name__ == "__main__": warnings.filterwarnings('ignore', 'tempnam', RuntimeWarning) main()
def main():
    """
    main for pytableaucreate.py

    Usage: pytableaucreate [-35knefuv] [-d|-b] [-t structprog] [-p domainprog]
                [-a domainid]
                [-s sse_num_list] [-c chainid] [-m min_sse_len]
                [-i identifier] [-o savefile] <PDBfile>

    -3 specifies to include 3_10 helices in the diagram. Default is only
       alpha helices.

    -5 specifies to include pi helices in the diagram. Default is only
       alpha helices.

    -k use the HH and KK codes for respectively antiparallel and parallel
       strands in the same sheet, rather than the O, P etc. codes.

    -n output a numeric omega matrix instead of tableau.

    -e output numeric tableau angles in degrees, in the original
       TableauCreator .angles file format, with number of entries on
       first line, SSE sequence description on second line (E/H), then
       (full) matrix with angles in degrees (rather than radians).
       For distance matrix, same format with distances between SSEs
       in Angstroms.

    -f output the matrix in 'FORTRAN style' lower triangle with
       header line suitable for input to TMATN.

    -d build SSE axis midpoint distance matrix rather than tableau.

    -b build both the tableau and distance matrix and output together,
       for use with tsrchd etc. for example. If -u is used to permute
       the matrices, they are permuted the same way so they are still
       consistent.

    -p specify the domain decomposition method.
       Valid values are 'none' (default), 'ddomain', 'cath:cdf_filename'.

    -a domainid : only output specified domain

    -t specifies the secondary structure assignment program to use.
       Currently supported are 'pdb', 'dssp', 'stride' and 'pmml'.
       Default 'pdb'.

    -s sse_num_list specifies a comma-separated list of SSE sequential
       ids to build the tableau for. SSE sequential id's start at 1 and
       go from N to C terminus. E.g. -s1,5,8 includes only the 1st, 5th
       and 8th SSEs. Numbers do not restart at chains (but do restart in
       each domain). These numbers are those assigned by
       'ptgraph2 -b sequential' option.

       TODO: this currently does not make sense when multiple domains
       are being processed, this option applies to each domain.

    -c chainid : specify chain identifier; only build tableau for that chain

    -m min_sse_len : minimum number of residues in SSE for it to be included

    -i identifier : when using fortran format (-f), specify the identifier
       to use in the output rather than deriving it from the filename

    -o savefile : save tableau in packed format for use in other
       programs, such as tabsearchqpml.py
       WARNING: savefile is overwritten if it exists

       TODO: this currently does not make sense when multiple domains
       are being processed, this option only saves first domain.

    -u randomly permute the rows+cols (symmetric) of the tableau/distance
       matrix. Writes the permutation vector in form
         permutation = i,j,..,m
       e.g.
         permutation = 3,1,2,4
       as first line of output before identifier information and tableau

    -v specifies verbose mode: debugging output is written to stderr.
    """
    global verbose

    import subprocess
    import tempfile

    # usage() is defined elsewhere in this file; the code below relies on
    # it not returning (presumably it prints a message and exits).
    try:
        opts, args = getopt.getopt(sys.argv[1:],
                                   "35bdfknep:a:t:s:c:m:i:o:uv?")
    except getopt.GetoptError:
        usage(os.path.basename(sys.argv[0]))

    valid_secstruct_programs = ["dssp", "stride", "pdb", "pmml"]
    valid_domain_programs = getdomains.valid_domain_programs + [r"none"]
    valid_domain_programs_re = [re.compile(re_str)
                                for re_str in valid_domain_programs]

    verbose = False  # global (python globals are only 'global' to module though)
    secstruct_program = "pdb"
    include_310_helices = False
    include_pi_helices = False
    domain_program = "none"
    sse_id_list = None
    use_numeric = False
    use_hk = False
    savefilename = None
    min_sse_len = None
    fortran_format = False
    build_distance_matrix = False
    chainid = None
    fident = None
    do_shuffle = False
    build_both = False       # both tableau and dist matrix
    use_old_format = False   # size + SSE chain + degrees omega matrix
    domainid = None

    for opt, arg in opts:
        if opt == "-3":    # include 3_10 helices
            include_310_helices = True
        elif opt == "-5":  # include pi helices
            include_pi_helices = True
        elif opt == "-d":  # build SSE midpoint distance matrix not tableau
            build_distance_matrix = True
        elif opt == "-b":  # build both tableau and distance matrix
            build_both = True
        elif opt == "-k":  # use HH and KK codes
            use_hk = True
        elif opt == "-n":  # output numeric matrix not tableau
            use_numeric = True
        elif opt == "-e":  # use TableauCreator .angles file format
            use_old_format = True
        elif opt == "-f":  # FORTRAN style format for TMATN
            fortran_format = True
        elif opt == "-p":  # domain parsing program
            # accept the argument if any valid pattern matches it
            domain_program = None
            for valid_domarg_re in valid_domain_programs_re:
                if valid_domarg_re.match(arg):
                    domain_program = arg
                    break
            if domain_program is None:
                sys.stderr.write("valid values for -p are: " +
                                 str(valid_domain_programs) + "\n")
                usage(sys.argv[0])
        elif opt == "-a":  # only output tableau for specified domain id
            domainid = arg
        elif opt == "-t":
            if arg not in valid_secstruct_programs:
                sys.stderr.write("valid values for -t are: " +
                                 str(valid_secstruct_programs) + "\n")
                usage(sys.argv[0])
            secstruct_program = arg
        elif opt == "-s":
            sse_id_list = []
            seen_sse_ids = set()   # just for checking all unique
            for sse_id_str in arg.split(','):
                if not sse_id_str.isdigit():
                    sys.stderr.write("not a valid SSE sequential number '" +
                                     sse_id_str + "'\n")
                    usage(sys.argv[0])
                sse_id = int(sse_id_str)
                if sse_id in seen_sse_ids:
                    sys.stderr.write("duplicate SSE sequential number " +
                                     sse_id_str + "\n")
                    usage(sys.argv[0])
                seen_sse_ids.add(sse_id)
                sse_id_list.append(sse_id)
            sse_id_list.sort()  # ensure SSEs are in order
        elif opt == "-c":  # chain identifier
            if len(arg) != 1:
                sys.stderr.write("invalid chain identifier for -c option\n")
                usage(sys.argv[0])
            chainid = arg.upper()
        elif opt == "-m":  # min sse len
            try:
                min_sse_len = int(arg)
            except ValueError:
                # report a bad number like any other bad option value
                # instead of dying with a traceback
                sys.stderr.write("invalid minimum SSE length for "
                                 "-m option\n")
                usage(sys.argv[0])
        elif opt == "-i":  # identifier to use for fortran format
            fident = arg
        elif opt == "-o":  # save tableau in packed format
            savefilename = arg
        elif opt == "-u":  # randomly permute the tableau/matrix
            do_shuffle = True
        elif opt == "-v":  # verbose
            verbose = True  # this module only
            ptnode_set_verbose(True)  # ptnode module
            ptsecstruct.ptsecstruct_set_verbose(True)  # ptsecstruct module
            ptdomain_set_verbose(True)  # ptdomain module
        else:
            usage(sys.argv[0])

    # ---- check option combinations ----
    if use_numeric and use_hk:
        sys.stderr.write("-n (numeric) and -k (use HH and KK codes) are "
                         "mutually exlusive\n")
        usage(sys.argv[0])

    if build_distance_matrix and build_both:
        sys.stderr.write("WARNING: both -d (build dist matrix) and -b "
                         "(build both) specified, ignoring -d\n")
        build_distance_matrix = False

    if savefilename and do_shuffle:
        sys.stderr.write('WARNING: saved tableau will not be shuffled\n')

    if build_distance_matrix:
        if use_numeric:
            use_numeric = False
            sys.stderr.write("WARNING: -n (numeric) ignored for -d (distance matrix)\n")
        if use_hk:
            sys.stderr.write("-k (use HH and KK) invalid for -d (distance matrix)\n")
            usage(sys.argv[0])

    if (secstruct_program == "pmml" and
            (min_sse_len is None or min_sse_len < 3)):
        sys.stderr.write("WARNING: PMML can give SSEs of length 1 or 2 causing axis fitting to fail, setting minimum length to 3 as if -m3 were specfified\n")
        min_sse_len = 3

    if fident:
        if not fortran_format:
            sys.stderr.write("-i is only valid with -f\n")
            usage(sys.argv[0])
        elif len(fident) > 8:
            sys.stderr.write("identifier must be 8 chars or less\n")
            usage(sys.argv[0])

    if use_old_format and (build_both or use_hk or use_numeric or
                           fortran_format or do_shuffle or savefilename):
        sys.stderr.write("-e (use old .angles format) is not compatible "
                         "with -b -k or -n or -f or -u or -o\n")
        usage(os.path.basename(sys.argv[0]))

    if len(args) != 1:
        usage(os.path.basename(sys.argv[0]))

    pdb_filename = args[0]
    pdb_file_basename = os.path.basename(pdb_filename)
    (name, extension) = os.path.splitext(pdb_file_basename)

    # TMPDIR stays None unless the input had to be uncompressed; it is
    # created inside the try so it is removed on any failure.
    TMPDIR = None
    try:
        # check for compressed files. We only support gzip (.gz)
        # Note we are not using the zlib or GzipFile python modules
        # since we are calling to external programs which require the
        # file uncompressed themselves anyway so we'll just run gzip
        # to uncompress the file to a temporary directory.
        if extension == '.gz':
            TMPDIR = tempfile.mkdtemp(prefix="ptgz")
            our_pdb_filename = os.path.join(TMPDIR, name)
            # Argument list, no shell: the filename comes from the
            # command line and may contain spaces or metacharacters.
            # As before, the gzip exit status is not checked.
            out_fh = open(our_pdb_filename, "wb")
            try:
                subprocess.call(["gzip", "-d", "-c", pdb_filename],
                                stdout=out_fh)
            finally:
                out_fh.close()
        else:
            our_pdb_filename = pdb_filename

        if fortran_format and fident:
            pdbid = fident
        else:
            pdbid = name.upper()
            if len(pdbid) >= 6 and pdbid[:3] == "PDB":
                pdbid = pdbid[3:7]
            if chainid:
                pdbid += '_' + chainid

        # parse PDB file
        pdb_parser = PDBParser()
        pdb_struct = pdb_parser.get_structure(pdbid, our_pdb_filename)

        # create the Tableaux and output them
        # (the old .angles format needs the numeric matrix)
        (tableaux_list, ssestr_list) = make_tableaux(
            our_pdb_filename,
            pdb_struct,
            secstruct_program,
            domain_program,
            include_310_helices,
            include_pi_helices,
            (use_numeric or use_old_format),
            sse_id_list,
            use_hk,
            min_sse_len,
            build_distance_matrix,
            chainid,
            domainid)
        if build_both:
            (distmatrix_list, ssestr_list) = make_tableaux(
                our_pdb_filename,
                pdb_struct,
                secstruct_program,
                domain_program,
                include_310_helices,
                include_pi_helices,
                use_numeric,
                sse_id_list,
                use_hk,
                min_sse_len,
                True,  # build_distance_matrix
                chainid,
                domainid)

        for (index, tableau) in enumerate(tableaux_list):
            domain_num = index + 1
            n = len(tableau)
            # used to permute rows/cols: null permutation by default
            permutation = list(range(n))
            if do_shuffle:
                random.shuffle(permutation)  # actually permute for shuffle mode
                if verbose:
                    sys.stderr.write('permutation is: ' +
                                     str(permutation) + '\n')
                sys.stdout.write('permutation = ' +
                                 ','.join([str(x+1) for x in permutation]) +
                                 '\n')
            if domain_num > 1:
                sys.stdout.write('\ndomain ' + str(domain_num) + ':\n')

            if fortran_format:
                sys.stdout.write("%7.7s %4d\n" % (pdbid.upper(), n))

            if use_old_format:
                if build_distance_matrix:
                    write_distmatrix_old_format(n, tableau,
                                                ssestr_list[index])
                else:
                    write_tableau_old_format(n, tableau,
                                             ssestr_list[index])
            else:
                write_tableau(n, tableau, permutation, use_numeric,
                              fortran_format, build_distance_matrix)
                if build_both:
                    # same permutation so the two stay consistent
                    write_tableau(n, distmatrix_list[index], permutation,
                                  use_numeric, fortran_format, True)
    finally:
        if TMPDIR is not None:
            cleanup_tmpdir(TMPDIR)

    if savefilename:
        if not tableaux_list:
            sys.stderr.write('ERROR: no tableau to save\n')
            return
        if verbose:
            sys.stderr.write('writing tableau to ' + savefilename + '\n')
        if len(tableaux_list) > 1:
            sys.stderr.write('WARNING: only saving first tableau in list\n')
        # With -d or -n, make_tableaux was asked for distance matrices /
        # numeric Omega matrices, so tableaux_list already holds them.
        # (This code used to reference undefined names distmatrix and
        # Omega.)  Numeric/numpy seems to have no 'packed' format for
        # symmetric matrices, so we just have to dump the whole thing.
        if build_distance_matrix or use_numeric:
            to_save = tableaux_list[0]
        else:
            to_save = PTTableauPacked(tableaux_list[0])
        # mode "w" kept so existing readers of these files are unaffected
        fh = open(savefilename, "w")
        try:
            pickle.dump(to_save, fh)
        finally:
            fh.close()
def read_tableau_from_tableaucreator(pdb_filename, ptnode_list,
                                     ssesinfo_filename):
    """
    Run Arun's TableauCreator program on the supplied pdb_filename
    using SSEsInfo file.

    Parameters:
       pdb_filename - PDB file to run TableauCreator on
       ptnode_list - list of PTNode objects (ie iterable of PTNode)
                     representing the SSEs (helices,strands) the
                     tableau is for.
       ssesinfo_filename - filename of the .SSEsInfo file that was written
                           to define SSEs for TableauCreator.

    Return value:
       PTTableau object built from TableauCreator output, or None when
       TableauCreator failed or its output could not be matched/parsed
       (a message is written to stderr in those cases).

    Uses the module global 'verbose'.

    NB: TableauCreator is not yet published or available (October 2007)
    and I am using a private version which Arun sent me, which I modified
    to add the -s option to use STRIDE rather than DSSP
    and to have the -i option to parse .SSEsInfo files.
    """
    import tempfile

    # TableauCreator needs an output directory where it writes all its
    # intermediate/output files, only puts progress information/errors
    # to stdout/stderr.
    # tempfile.mkdtemp both names and creates the directory in one
    # atomic step, unlike os.tempnam followed by os.mkdir.
    tmpdir = tempfile.mkdtemp(prefix="pttab")
    try:
        # NOTE(review): filenames are placed unquoted into a shell
        # command line; paths with spaces or shell metacharacters will
        # not work.
        command = ("TableauCreator " +
                   "-i " + ssesinfo_filename + " " +
                   pdb_filename + " " + tmpdir +
                   " >/dev/null")
        if verbose:
            sys.stderr.write("running '" + command + "'...")
        os.system(command)
        if verbose:
            sys.stderr.write("done\n")

        # output files are:
        #  <pdbfilename>.angles
        #  <pdbfilename>.SSEsInfo
        #  <pdbfilename>.stride or <pdbfilename>.dssp
        #  <pdbfilename>.tableau
        outfile_prefix = os.path.join(tmpdir, os.path.basename(pdb_filename))

        # a TABCREATE_OK file in the output directory signals success
        ok_marker = os.path.join(tmpdir, "TABCREATE_OK")
        if not os.path.isfile(ok_marker):
            sys.stderr.write("ERROR: TableauCreator failed\n")
            return None

        # Now the tricky thing is TableauCreator indexes its matrix just with
        # purely sequential numbers from 0 (as conventional)
        # assuming all SSEs in one domain and in fact one chain
        # (so we handle this by creating our own simple PDB file with only
        # ATOM records for our current domain, and only one TER record on
        # end so chains concatenated effectively).
        # And also (as in comments above functions) we have the dodginess of
        # doing the same thing in different ways in multiple places
        # (DSSP/STRIDE parsing, PDB parsing, etc.).
        # So let's check that the TableauCreator SSE info lines up with ours
        # (otherwise we can't use the tableau data).

        # parse the SSEsInfo file and check lines up with ptnodes,
        # returns list of ptnodes corresponding to Tableau entries (may be
        # shorter than our input node list; some removed as no equivalent
        # in tableau).
        nodelist = parse_tableaucreator_ssesinfo(outfile_prefix + '.SSEsInfo',
                                                 ptnode_list)
        if nodelist is None:
            sys.stderr.write('WARNING: problem with TableauCreator output;\n'
                             ' tableau information will not be used\n')
            return None

        tableau = parse_tableaucreator_output(outfile_prefix + ".tableau",
                                              nodelist)
        if tableau is None:
            sys.stderr.write('WARNING: problem parsing TableauCreator output;\n'
                             ' tableau information will not be used\n')
        elif verbose:
            sys.stderr.write(str(tableau))
        return tableau
    finally:
        # executed on every exit path (including exceptions raised by the
        # parsers) so the output directory never leaks
        cleanup_tmpdir(tmpdir)
def main():
    """
    main for pytableaucreate.py

    Usage: pytableaucreate [-35knefuv] [-d|-b] [-t structprog] [-p domainprog]
                [-a domainid]
                [-s sse_num_list] [-c chainid] [-m min_sse_len]
                [-i identifier] [-o savefile] <PDBfile>

    -3 specifies to include 3_10 helices in the diagram. Default is only
       alpha helices.

    -5 specifies to include pi helices in the diagram. Default is only
       alpha helices.

    -k use the HH and KK codes for respectively antiparallel and parallel
       strands in the same sheet, rather than the O, P etc. codes.

    -n output a numeric omega matrix instead of tableau.

    -e output numeric tableau angles in degrees, in the original
       TableauCreator .angles file format, with number of entries on
       first line, SSE sequence description on second line (E/H), then
       (full) matrix with angles in degrees (rather than radians).
       For distance matrix, same format with distances between SSEs
       in Angstroms.

    -f output the matrix in 'FORTRAN style' lower triangle with
       header line suitable for input to TMATN.

    -d build SSE axis midpoint distance matrix rather than tableau.

    -b build both the tableau and distance matrix and output together,
       for use with tsrchd etc. for example. If -u is used to permute
       the matrices, they are permuted the same way so they are still
       consistent.

    -p specify the domain decomposition method.
       Valid values are 'none' (default), 'ddomain', 'cath:cdf_filename'.

    -a domainid : only output specified domain

    -t specifies the secondary structure assignment program to use.
       Currently supported is 'pdb' and 'dssp' and 'stride' or 'pmml'.
       Default 'pdb'.

    -s sse_num_list specifies a comma-separated
       list of SSE sequential ids to build the
       tableau for. SSE sequential id's start at 1 and go from N to C
       terminus. E.g. -s1,5,8 includes only the 1st, 5th and 8th SSEs.
       Numbers do not restart at chains (but do restart in each domain).
       These numbers are those assigned by 'ptgraph2 -b sequential' option.

       TODO: this currently does not make sense when multiple domains
       are being processed, this option applies to each domain.

    -c chainid : specify chain identifier; only build tableau for that chain

    -m min_sse_len : minimum number of residues in SSE for it to be included

    -i identifier : when using fortran format (-f), specify the identifier
       to use in the output rather than deriving it from the filename

    -o savefile : save tableau in packed format for use in other
       programs, such as tabsearchqpml.py
       WARNING: savefile is overwritten if it exists

       TODO: this currently does not make sense when multiple domains
       are being processed, this option only saves first domain.

    -u randomly permute the rows+cols (symmetric) of the tableau/distance
       matrix. writes the permutation vector in form
       permutation = i,j,..,m
       e.g.
       permutation = 3,1,2,4
       as first line of output before identifier information and tableau

    -v specifies verbose mode: debugging output is written to stderr.
    """
    global verbose

    # Imported here (rather than relying on the module import block) so
    # that this function is self-contained.
    import subprocess
    import tempfile

    try:
        opts, args = getopt.getopt(sys.argv[1:],
                                   "35bdfknep:a:t:s:c:m:i:o:uv?")
    except getopt.GetoptError:
        usage(os.path.basename(sys.argv[0]))

    valid_secstruct_programs = ["dssp", "stride", "pdb", "pmml"]
    valid_domain_programs = getdomains.valid_domain_programs + [r"none"]
    valid_domain_programs_re = [re.compile(re_str)
                                for re_str in valid_domain_programs]

    verbose = False  # global (python globals are only 'global' to module though)
    secstruct_program = "pdb"
    include_310_helices = False
    include_pi_helices = False
    domain_program = "none"
    sse_id_list = None
    use_numeric = False
    use_hk = False
    savefilename = None
    min_sse_len = None
    fortran_format = False
    build_distance_matrix = False
    chainid = None
    fident = None
    do_shuffle = False
    build_both = False      # both tableau and dist matrix
    use_old_format = False  # size + SSE chain + degrees omega matrix
    domainid = None

    for opt, arg in opts:
        if opt == "-3":    # include 3_10 helices
            include_310_helices = True
        elif opt == "-5":  # include pi helices
            include_pi_helices = True
        elif opt == "-d":  # build SSE midpoint distance matrix not tableau
            build_distance_matrix = True
        elif opt == "-b":  # build both tableau and distance matrix
            build_both = True
        elif opt == "-k":  # use HH and KK codes
            use_hk = True
        elif opt == "-n":  # output numeric matrix not tableau
            use_numeric = True
        elif opt == "-e":  # use TableauCreator .angles file format
            use_old_format = True
        elif opt == "-f":  # FORTRAN style format for TMATN
            fortran_format = True
        elif opt == "-p":  # domain parsing program
            domain_program = None
            for valid_domarg_re in valid_domain_programs_re:
                if valid_domarg_re.match(arg):
                    domain_program = arg
                    break
            if domain_program is None:
                sys.stderr.write("valid values for -p are: " +
                                 str(valid_domain_programs) + "\n")
                usage(sys.argv[0])
        elif opt == "-a":  # only output tableau for specified domain id
            domainid = arg
        elif opt == "-t":
            if arg not in valid_secstruct_programs:
                sys.stderr.write("valid values for -t are: " +
                                 str(valid_secstruct_programs) + "\n")
                usage(sys.argv[0])
            secstruct_program = arg
        elif opt == "-s":
            sse_id_list = []
            seen_sse_ids = set()  # just for checking all ids are unique
            for sse_id_str in arg.split(','):
                if sse_id_str.isdigit():
                    sse_id = int(sse_id_str)
                    if sse_id in seen_sse_ids:
                        sys.stderr.write("duplicate SSE sequential number " +
                                         sse_id_str + "\n")
                        usage(sys.argv[0])
                    seen_sse_ids.add(sse_id)
                    sse_id_list.append(sse_id)
                else:
                    sys.stderr.write("not a valid SSE sequential number '" +
                                     sse_id_str + "'\n")
                    usage(sys.argv[0])
            sse_id_list.sort()  # ensure SSEs are in order
        elif opt == "-c":  # chain identifier
            if len(arg) != 1:
                sys.stderr.write("invalid chain identifier for -c option\n")
                usage(sys.argv[0])
            chainid = arg.upper()
        elif opt == "-m":  # min sse len
            try:
                min_sse_len = int(arg)
            except ValueError:
                # report bad input like the other options do, rather than
                # dying with a traceback
                sys.stderr.write("invalid minimum SSE length for -m option\n")
                usage(sys.argv[0])
        elif opt == "-i":  # identifier to use for fortran format
            fident = arg
        elif opt == "-o":  # save tableau in packed format
            savefilename = arg
        elif opt == "-u":  # randomly permute the tableau/matrix
            do_shuffle = True
        elif opt == "-v":  # verbose
            verbose = True                             # this module only
            ptnode_set_verbose(True)                   # ptnode module
            ptsecstruct.ptsecstruct_set_verbose(True)  # ptsecstruct module
            ptdomain_set_verbose(True)                 # ptdomain module
        else:
            usage(sys.argv[0])

    # Cross-option consistency checks.
    if use_numeric and use_hk:
        sys.stderr.write("-n (numeric) and -k (use HH and KK codes) are "
                         "mutually exlusive\n")
        usage(sys.argv[0])

    if build_distance_matrix and build_both:
        sys.stderr.write("WARNING: both -d (build dist matrix) and -b "
                         "(build both) specified, ignoring -d\n")
        build_distance_matrix = False

    if savefilename and do_shuffle:
        sys.stderr.write('WARNING: saved tableau will not be shuffled\n')

    if build_distance_matrix:
        if use_numeric:
            use_numeric = False
            sys.stderr.write(
                "WARNING: -n (numeric) ignored for -d (distance matrix)\n")
        if use_hk:
            sys.stderr.write(
                "-k (use HH and KK) invalid for -d (distance matrix)\n")
            usage(sys.argv[0])

    if (secstruct_program == "pmml" and
            (min_sse_len is None or min_sse_len < 3)):
        sys.stderr.write(
            "WARNING: PMML can give SSEs of length 1 or 2 causing axis "
            "fitting to fail, setting minimum length to 3 as if -m3 were "
            "specfified\n")
        min_sse_len = 3

    if fident:
        if not fortran_format:
            sys.stderr.write("-i is only valid with -f\n")
            usage(sys.argv[0])
        elif len(fident) > 8:
            sys.stderr.write("identifier must be 8 chars or less\n")
            usage(sys.argv[0])

    if use_old_format and (build_both or use_hk or use_numeric or
                           fortran_format or do_shuffle or savefilename):
        sys.stderr.write("-e (use old .angles format) is not compatible "
                         "with -b -k or -n or -f or -u or -o\n")
        usage(os.path.basename(sys.argv[0]))

    if len(args) != 1:
        usage(os.path.basename(sys.argv[0]))

    pdb_filename = args[0]

    # check for compressed files. We only support gzip (.gz)
    # Note we are not using the zlib or GzipFile python modules
    # since we are calling to external programs which require the
    # file uncompressed themselves anyway so we'll just run gzip
    # to uncompress the file to a temporary directory.
    pdb_file_basename = os.path.basename(pdb_filename)
    (name, extension) = os.path.splitext(pdb_file_basename)

    gz_tmpdir = None  # set only if we had to uncompress to a scratch dir
    try:
        if extension == '.gz':
            # mkdtemp() creates the directory atomically, avoiding the race
            # inherent in os.tempnam() followed by os.mkdir().
            gz_tmpdir = tempfile.mkdtemp(prefix="ptgz")
            our_pdb_filename = os.path.join(gz_tmpdir, name)
            uncompressed_fh = open(our_pdb_filename, "wb")
            try:
                # Argument list (no shell) so filenames with spaces or shell
                # metacharacters are handled correctly.
                gzip_status = subprocess.call(
                    ["gzip", "-d", "-c", pdb_filename],
                    stdout=uncompressed_fh)
            finally:
                uncompressed_fh.close()
            if gzip_status != 0:
                # gzip can exit non-zero for mere warnings, so carry on but
                # make the problem visible.
                sys.stderr.write("WARNING: gzip exited with status " +
                                 str(gzip_status) + " uncompressing " +
                                 pdb_filename + "\n")
        else:
            our_pdb_filename = pdb_filename

        # Identifier for output: the -i identifier in FORTRAN format,
        # otherwise derived from the filename (pdbXXXX... -> XXXX),
        # with the chain appended if one was requested.
        if fortran_format and fident:
            pdbid = fident
        else:
            pdbid = name.upper()
            if len(pdbid) >= 6 and pdbid[:3] == "PDB":
                pdbid = pdbid[3:7]
            if chainid:
                pdbid += '_' + chainid

        # parse PDB file
        pdb_parser = PDBParser()
        pdb_struct = pdb_parser.get_structure(pdbid, our_pdb_filename)
        # create the Tableaux and output them
        (tableaux_list, ssestr_list) = make_tableaux(
            our_pdb_filename,
            pdb_struct,
            secstruct_program,
            domain_program,
            include_310_helices,
            include_pi_helices,
            (use_numeric or use_old_format),
            sse_id_list,
            use_hk,
            min_sse_len,
            build_distance_matrix,
            chainid,
            domainid)
        if build_both:
            (distmatrix_list, ssestr_list) = make_tableaux(
                our_pdb_filename,
                pdb_struct,
                secstruct_program,
                domain_program,
                include_310_helices,
                include_pi_helices,
                use_numeric,
                sse_id_list,
                use_hk,
                min_sse_len,
                True,  # build_distance_matrix
                chainid,
                domainid)

        for domain_index, tableau in enumerate(tableaux_list):
            domain_num = domain_index + 1  # domains are numbered from 1
            n = len(tableau)
            # used to permute rows/cols: null permutation.
            # Must be a real list so random.shuffle() can permute in place.
            permutation = list(range(n))
            if do_shuffle:
                random.shuffle(permutation)  # actually permute for shuffle mode
                if verbose:
                    sys.stderr.write('permutation is: ' + str(permutation) +
                                     '\n')
                sys.stdout.write('permutation = ' +
                                 ','.join([str(x + 1) for x in permutation]) +
                                 '\n')
            if domain_num > 1:
                sys.stdout.write('\ndomain ' + str(domain_num) + ':\n')

            if fortran_format:
                sys.stdout.write("%7.7s %4d\n" % (pdbid.upper(), n))

            if use_old_format:
                if build_distance_matrix:
                    write_distmatrix_old_format(n, tableau,
                                                ssestr_list[domain_index])
                else:
                    write_tableau_old_format(n, tableau,
                                             ssestr_list[domain_index])
            else:
                write_tableau(n, tableau, permutation, use_numeric,
                              fortran_format, build_distance_matrix)
                # -b is rejected together with -e above, so the distance
                # matrix only ever follows the new-format tableau.
                if build_both:
                    write_tableau(n, distmatrix_list[domain_index],
                                  permutation, use_numeric, fortran_format,
                                  True)
    finally:
        if gz_tmpdir is not None:
            cleanup_tmpdir(gz_tmpdir)

    if savefilename:
        if not tableaux_list:
            # Nothing was built; do not truncate an existing save file.
            sys.stderr.write('WARNING: no tableau built, nothing saved to ' +
                             savefilename + '\n')
        else:
            if verbose:
                sys.stderr.write('writing tableau to ' + savefilename + '\n')
            if len(tableaux_list) > 1:
                sys.stderr.write(
                    'WARNING: only saving first tableau in list\n')
            if build_distance_matrix or use_numeric:
                # NOTE(review): assumes make_tableaux() returns the distance
                # matrices (-d) or numeric Omega matrices (-n) in
                # tableaux_list; the names previously dumped here were never
                # assigned in this function. Confirm against make_tableaux.
                # Numeric/numpy seems to have no 'packed' format for symmetric
                # matrices, so we just have to dump the whole thing.
                save_obj = tableaux_list[0]
            else:
                save_obj = PTTableauPacked(tableaux_list[0])
            fh = open(savefilename, "wb")  # pickle data: binary mode
            try:
                pickle.dump(save_obj, fh)
            finally:
                fh.close()