def main():
    this_dir = os.path.dirname(os.path.realpath(__file__))
    make_fragments_script_path = os.path.join(this_dir, make_fragments_script)

    errors = []
    # check_configuration_paths()  # The configuration check is currently disabled.
    if errors:
        colorprinter.error("There is an error in the configuration files:")
        for flname, es in errors:
            print("")
            colorprinter.warning(flname)
            for e in es:
                colorprinter.error(e)
        sys.exit(ERRCODE_CONFIG)

    options = parse_args()
    if options['outpath'] and options['job_inputs']:
        # Build the cluster job and write its submission script into the output directory.
        job_script = None
        try:
            cluster_job = ClusterEngine.FragmentsJob(make_fragments_script_path, options, test_mode=test_mode)
            job_script = cluster_job.script
        except JobInitializationException as e:
            colorprinter.error(str(e))
            sys.exit(ERRCODE_ARGUMENTS)

        submission_script = os.path.join(options['outpath'], 'submission_script.py')
        write_file(submission_script, job_script, 'w')

        # Submit the job to the cluster.
        try:
            send_mail = options['sendmail']
            username = None
            if send_mail:
                username = get_username()
            (jobid, output) = ClusterEngine.submit(submission_script, options['outpath'], send_mail=send_mail, username=username)
        except Exception as e:
            colorprinter.error("An exception occurred during submission to the cluster.")
            colorprinter.error(str(e))
            colorprinter.error(traceback.format_exc())
            sys.exit(ERRCODE_CLUSTER)
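
# Illustrative sketch only (not called by this script): how a FASTA record
# header of the form '>protein_id|chain_letter[|optional text]' documented in
# parse_args() below maps onto the 5-character ID expected by the underlying
# Perl script. The helper name and regular expression here are assumptions
# made for illustration; the real parsing logic lives elsewhere.
def _example_five_char_id(header):
    m = re.match(r'^>(\w+)\|(\w)(?:\|.*)?$', header)
    if m:
        # Only the first 4 characters of the identifier are used and PDB IDs
        # are lowercased before the chain letter is appended.
        return m.group(1)[:4].lower() + m.group(2)
    return None

# e.g. _example_five_char_id('>1A2P_001|A|some information') returns '1a2pA'.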
def parse_args():
    global errcode
    errors = []
    pdbpattern = re.compile(r"^\w{4}$")
    logfile_name = logfile.getName()
    script_name = os.path.split(sys.argv[0])[1]
    description = '\n' + """\
*** Help ***

The output of the computation will be saved in the output directory, along with
the input FASTA file which is generated from the supplied FASTA file. To allow
jobs to be queried, a log of the output directories for cluster jobs is saved
in {logfile_name} in the current directory.

The FASTA description lines must begin with '>protein_id|chain_letter'. This
information may optionally be followed by a '|' and more text.

There are a few caveats:

1. The underlying Perl script requires a 5-character ID for the sequence
identifier, which is typically a PDB ID followed by a chain ID e.g. "1a2pA".
For this reason, our script expects FASTA record headers to have a form like
">xxxx|y" where xxxx is a 4-letter identifier e.g. a PDB ID and y is a chain
identifier. The protein_id may be longer than 4 characters and chain_letter
must be a single character. However, only the first 4 characters of the
identifier are used by the script. Any information after the chain identifier
must be preceded by a '|' character.

For example, ">1A2P_001|A|some information" is a valid header but the
generated ID will be "1a2pA" (we convert PDB IDs to lowercase).

2. If you are submitting a batch job, the list of 5-character IDs generated
from the FASTA files using the method above must be unique. For example, if
you have two records ">1A2P_001|A|" and ">1A2P_002|A|" then the job will fail.
On the other hand, ">1A2P_001|A|" and ">1A2P_001|B|" is perfectly fine and the
script will output fragments for 1a2pA and 1a2pB.

3. By design, residue ID ranges are capped at chain boundaries. For example,
if a PDB has chains A (residues 1-50), B (residues 51-100), and C (residues
101-150) and the user selects 9mers for the ranges 35-48 and 101-110 then,
since neither range overlaps with chain B - even though they will when 9mers
are considered - we will not generate any fragments for chain B. This behavior
was chosen as it seems the most intuitive/expected.

*** Examples ***

Single-sequence fragment generation:
1: {script_name} -d results /path/to/1CYO.fasta.txt

Multi-sequence fragment generation (batch job):
2: {script_name} -d results /some/path/*.fa??? /some/other/path/

Fragment generation for a specific chain:
3: {script_name} -d results /path/to/1CYO.fasta.txt -cA

Fragment generation using a loops file applied to: a) a FASTA file; b) a PDB
identifier; c) a directory of FASTA/PDB files and a PDB ID, using the
short/test queue:
4a: {script_name} -d results -l input/loops_file input/fragments/0001.fasta
4b: {script_name} -d results -l input/loops_file 4un3
4c: {script_name} -d results -l input/loops_file -q short.q input/fragments 4un3

*** Example secondary structure definition file ***

# Comments are allowed. A line has two columns: the first specifies the
# residue(s), the second specifies the expected secondary structure using
# H(elix), E(xtended/sheet), or L(oop). The second column is case-insensitive.
#
# A single residue, any structure
1339 HEL
# An expected helix
1354-1359 H
# A helical or sheet structure
1360,1370-1380 HE
""".format(**locals())

    parser = OptionParserWithNewlines(usage="usage: %prog [options] <inputs>...", version="%prog 1.1A", option_class=MultiOption)
    parser.epilog = description
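
    # Illustrative sketch only: a secondary structure definition line of the
    # kind shown in the epilog above could be matched with a pattern such as
    #   re.match(r'^(\d+(?:-\d+)?(?:,\d+(?:-\d+)?)*)\s+([HEL]+)$', line, re.IGNORECASE)
    # e.g. '1360,1370-1380 HE' yields the residue spec and the structure codes.
    # This pattern is an assumption for illustration; the filter used in
    # postprocessing is implemented elsewhere.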
    group = OptionGroup(parser, "Fragment generation options")
    group.add_option("-N", "--nohoms", dest="nohoms", action="store_true", help="Optional. If this option is set then homologs are omitted from the search.")
    group.add_option("-s", "--frag_sizes", dest="frag_sizes", help="Optional. A list of fragment sizes e.g. -s 3,6,9 specifies that 3-mer, 6-mer, and 9-mer fragments are to be generated. The default is for 3-mer and 9-mer fragments to be generated.")
    group.add_option("-c", "--chain", dest="chain", help="Chain used for the fragment. This is optional so long as the FASTA file only contains one chain.", metavar="CHAIN")
    group.add_option("-l", "--loops_file", dest="loops_file", help="Optional but recommended. A Rosetta loops file which will be used to select sections of the FASTA sequences from which fragments will be generated. This saves a lot of time on large sequences.")
    group.add_option("-i", "--indices", dest="indices", help="Optional. A comma-separated list of ranges. A range can be a single index or a hyphenated range. For example, '10-30,66,90-93' is a valid set of indices. The indices are used to pick out parts of the supplied sequences for fragment generation and start at 1 (1-indexed). Similarly to the loops_file option, this restriction may save a lot of computational resources. If this option is used in addition to the loops_file option then the sections defined by the indices are combined with those in the loops file.")
    group.add_option("--ss", dest="secondary_structure_file", help="Optional. A secondary structure definition file. This is used in postprocessing to filter out fragments which do not match the requested secondary structure.")
    group.add_option("--n_frags", dest="n_frags", help="Optional. The number of fragments to generate. This must be less than the number of candidates. The default value is 200.")
    group.add_option("--n_candidates", dest="n_candidates", help="Optional. The number of candidates to generate. The default value is 1000.")
    group.add_option("--add_vall_files", dest="add_vall_files", help="Optional and untested. This option allows extra Vall files to be added to the run. The files must be comma-separated.")
    group.add_option("--use_vall_files", dest="use_vall_files", help="Optional and untested. This option specifies that the run should use only the following Vall files. The files must be comma-separated.")
    group.add_option("--add_pdbs_to_vall", dest="add_pdbs_to_vall", help="Optional and untested. This option adds extra pdb Vall files to the run. The files must be comma-separated.")
    parser.add_option_group(group)

    group = OptionGroup(parser, "General options")
    group.add_option("-d", "--outdir", dest="outdir", help="Optional. Output directory relative to user space on netapp. Defaults to the current directory so long as that is within the user's netapp space.", metavar="OUTPUT_DIRECTORY")
    group.add_option("-V", "--overwrite", dest="overwrite", action="store_true", help="Optional. If the output directory <PDBID><CHAIN> for the fragment job(s) exists, delete the current contents.")
    group.add_option("-F", "--force", dest="force", action="store_true", help="Optional. Create the output directory without prompting.")
    group.add_option("-M", "--email", dest="sendmail", action="store_true", help="Optional. If this option is set, an email is sent when the job finishes or fails (cluster-dependent). WARNING: On an SGE cluster, an email will be sent for each FASTA file i.e. for each task in the job array.")
    group.add_option("-Z", "--nozip", dest="nozip", action="store_true", help="Optional, false by default. If this option is set then the resulting fragments are not compressed with gzip. We compress output by default as this can reduce the output size by 90% and the resulting zipped files can be passed directly to Rosetta.")
    parser.add_option_group(group)

    group = OptionGroup(parser, "Cluster options")
    group.add_option("-q", "--queue", dest="queue", help="Optional. Specify which cluster queue to use. Whether this option works and what this value should be will depend on your cluster architecture. Valid arguments for the QB3 SGE cluster are long.q, lab.q, and short.q. By default, no queue is specified. This may be a single value or a comma-separated list of queues. The short.q is only allowed on its own for test runs.", metavar="QUEUE_NAME")
    group.add_option("-x", "--scratch", type="int", dest="scratch", help="Optional. Specifies the amount of /scratch space in GB to reserve for the job.")
    group.add_option("-m", "--memfree", type="int", dest="memfree", help="Optional. Specifies the amount of RAM in GB that the job will require on the cluster. This must be at least 2GB.")
    group.add_option("-r", "--runtime", type="int", dest="runtime", help="Optional. Specifies the runtime in hours that the job will require on the cluster.")
    parser.add_option_group(group)

    group = OptionGroup(parser, "Querying options")
    group.add_option("-K", "--check", dest="check", help="Optional, needs to be fixed for batch mode. Query whether or not a job is running. If it has finished, query %s and print whether the job was successful." % logfile.getName(), metavar="JOBID")
    group.add_option("-Q", "--query", dest="query", action="store_true", help="Optional, needs to be fixed for batch mode. Query the progress of the cluster job against %s and then quit." % logfile.getName())
    parser.add_option_group(group)

    parser.set_defaults(
        outdir=os.getcwd(),
        overwrite=False,
        nohoms=False,
        force=False,
        query=False,
        sendmail=False,
        queue=[],
        nozip=False,
        scratch=10,
        memfree=40,
        runtime=6,
        frag_sizes='3,9',
        n_frags='200',
        n_candidates='1000',
        add_vall_files='',
        use_vall_files='',
        add_pdbs_to_vall='',
    )
    (options, args) = parser.parse_args()

    username = get_username()

    # QUERY
    if options.query:
        ClusterEngine.query(logfile)
    # CHECK
    elif options.check:
        if not options.check.isdigit():
            errors.append("Please enter a valid job identifier.")
        else:
            # The job has finished. Check the output file.
            jobID = int(options.check)
            errors.extend(ClusterEngine.check(logfile, jobID, cluster_job_name))
    validOptions = options.query or options.check

    # Queue
    if options.queue:
        options.queue = sorted(set(options.queue.split(',')))

    # RAM / scratch
    if options.scratch < 1:
        errors.append("The amount of scratch space requested must be at least 1 (GB).")
    if options.memfree < 2:
        errors.append("The amount of RAM requested must be at least 2 (GB).")
    if options.runtime < 6:
        errors.append("The requested runtime must be at least 6 (hours).")

    # CHAIN
    if options.chain and len(options.chain) != 1:
        errors.append("Chain must only be one character.")

    # OUTDIR
    outpath = options.outdir
    if outpath[0] != "/":
        outpath = os.path.abspath(outpath)
    outpath = os.path.normpath(outpath)

    # Loops file
    if options.loops_file:
        if not os.path.isabs(options.loops_file):
            options.loops_file = os.path.realpath(options.loops_file)
        if not os.path.exists(options.loops_file):
            errors.append('The loops file %s does not exist.' % options.loops_file)

    # Indices
    if options.indices:
        try:
            options.indices = parse_range_pairs(options.indices, range_separator='-')
        except Exception:
            errors.append('The indices argument must be a list of valid indices into the sequences for which fragments are to be generated.')
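    # For reference, parse_range_pairs (defined elsewhere in this package) is
    # expected to turn a string such as '10-30,66,90-93' into start/end pairs
    # along the lines of [(10, 30), (66, 66), (90, 93)]; the exact return type
    # is an assumption here and depends on that implementation.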