def create_inputs(options, outpath, found_sequences):
    ''' Creates one subdirectory of outpath per sequence and writes the sequence into it as a FASTA file.
          - options is the options object; only options.overwrite is read here.
          - outpath is the directory under which the per-sequence subdirectories are created.
          - found_sequences maps (pdb_id, chain, file_name) tuples to sequences. Each sequence is an iterable of
            strings which are joined with newlines to form the FASTA file contents.

        Returns a pair (job_inputs, errors): job_inputs is a list with one JobInput per FASTA file written and errors
        is a list of error messages. If creating any input fails, the subdirectory created for that input is removed,
        job_inputs is emptied, and no further sequences are processed.

        Exits the process with ERRCODE_OLDRESULTS when a subdirectory already exists, overwriting is disabled, and
        all of the numbered fallback directories (_001 to _999) are already taken.
    '''
    errors = []
    job_inputs = []

    # Create subdirectories
    for (pdb_id, chain, file_name), sequence in sorted(found_sequences.items()):
        created_new_subdirectory = False
        subdir_path = os.path.join(outpath, "%s%s" % (pdb_id, chain))
        try:
            if os.path.exists(subdir_path):
                if options.overwrite:
                    colorprinter.warning("Path %s exists. Removing all files in that path as per the override option." % subdir_path)
                    shutil.rmtree(subdir_path)
                else:
                    # NOTE(review): this error is recorded even though a numbered fallback directory is used
                    # below. The original comment on this line suggested only one of the two behaviours was
                    # meant to be active at a time - confirm which one is intended.
                    errors.append('The directory %s already exists.' % subdir_path)
                    count = 1
                    while count < 1000:
                        subdir_path = os.path.join(outpath, "%s%s_%.3i" % (pdb_id, chain, count))
                        if not os.path.exists(subdir_path):
                            break
                        count += 1
                    if count == 1000:
                        message = "The directory %s contains too many previous results. Please clean up the old results or choose a new output directory." % outpath
                        errors.append(message)
                        # The errors list is lost when the process exits so report the problem here.
                        colorprinter.error(message)
                        sys.exit(ERRCODE_OLDRESULTS)
            os.makedirs(subdir_path, 0o755)
            # From here on the directory is ours so it is safe to remove it if a later step fails.
            created_new_subdirectory = True

            # Create a FASTA file for the sequence in the output directory
            fasta_file = os.path.join(subdir_path, "%s%s.fasta" % (pdb_id, chain))
            colorprinter.message("Creating a new FASTA file %s." % fasta_file)

            assert(not(os.path.exists(fasta_file)))
            write_file(fasta_file, '\n'.join(sequence) + '\n', 'w') # The file must terminate in a newline for the Perl script to work
            job_inputs.append(JobInput(fasta_file, pdb_id, chain))
        except Exception:
            # Catch Exception rather than everything so that the sys.exit above (SystemExit) and Ctrl-C
            # (KeyboardInterrupt) are not swallowed and turned into a generic error message.
            if created_new_subdirectory and os.path.exists(subdir_path):
                shutil.rmtree(subdir_path)
            errors.append('An error occurred creating the input for %s%s.' % (pdb_id, chain))
            job_inputs = []
            break

    return job_inputs, errors
def get_sequences(options, fasta_file_contents):
    ''' This function parses the FASTA contents and returns a triple (fasta_records, reverse_mapping, errors):
          - fasta_records is the first value returned by parse_FASTA_files (setup_jobs treats it as a dict mapping
            (pdbid, chain, file_name) tuples to sequences), or None if no sequences were found or parsing failed;
          - reverse_mapping is the second value returned by parse_FASTA_files, or {} if no sequences were found or
            parsing failed;
          - errors is a list of error messages.

        Arguments:
          - options is the OptionParser member;
          - fasta_file_contents is a map from input filenames to the associated FASTA file contents.
    '''
    errors = []
    fasta_files_str = ", ".join(fasta_file_contents)
    fasta_records = None
    reverse_mapping = {}

    try:
        fasta_records, reverse_mapping = parse_FASTA_files(options, fasta_file_contents)
        if not fasta_records:
            errors.append("No protein sequences found in the FASTA file(s) %s." % fasta_files_str)
    except Exception as e:
        # Report both the traceback and the exception text, skipping whichever is blank.
        details = '\n'.join([part for part in (traceback.format_exc(), str(e)) if part.strip()])
        errors.append("Error parsing FASTA file(s) %s:\n%s" % (fasta_files_str, details))

    if not fasta_records:
        return None, {}, errors

    colorprinter.message('Found %d protein sequence(s).' % len(fasta_records))
    return fasta_records, reverse_mapping, errors
def setup_jobs(outpath, options, input_files):
    ''' This function sets up the jobs by creating the necessary input files as expected.
          - outpath is where the output is to be stored.
          - options is the optparse options object.
          - input_files is a list of paths to input files.

        Returns a triple (job_inputs, has_reverse_mapping, errors):
          - job_inputs is the list of job inputs returned by create_inputs;
          - has_reverse_mapping is True if get_sequences returned a reverse mapping that is not None;
          - errors is a list of error messages.
        If extracting the sequences reports any errors, (None, False, errors) is returned before create_inputs is
        called or any of the files below are written.
    '''
    fasta_file_contents = {}

    # Generate FASTA files for PDB inputs
    # fasta_file_contents is a mapping from a file path to a pair (FASTA contents, file type). We remember the file type
    # since we offset residue IDs depending on file type i.e. for FASTA files, we treat each sequence separately and do
    # not renumber the fragments in postprocessing. For PDB files, however, we need to respect the order and length of
    # sequences so that we renumber the fragments appropriately in postprocessing - we assume that if a PDB file is passed in
    # then all chains (protein, RNA, or DNA) will be used in a Rosetta run.
    for input_file in input_files:
        assert(not(fasta_file_contents.get(input_file)))
        if any(fnmatch(input_file, x) for x in pdb_file_wildcards):
            pdb = PDB.from_filepath(input_file, strict=True)
            pdb.pdb_id = os.path.basename(input_file).split('.')[0]
            if pdb.pdb_id.startswith('pdb') and len(pdb.pdb_id) >= 7:
                # Hack to rename FASTA identifiers for pdb*.ent files which are present in mirrors of the PDB.
                # Only the leading 'pdb' is removed so that an identifier which itself contains 'pdb'
                # (e.g. pdb1pdb.ent) is not mangled.
                pdb.pdb_id = pdb.pdb_id.replace('pdb', '', 1)
            fasta_file_contents[input_file] = (pdb.create_fasta(prefer_seqres_order = False), 'PDB')
        else:
            fasta_file_contents[input_file] = (read_file(input_file), 'FASTA')

    # Extract sequences from the input FASTA files.
    found_sequences, reverse_mapping, errors = get_sequences(options, fasta_file_contents)
    if found_sequences:
        reformat(found_sequences)
    if errors:
        return None, False, errors

    # Discard sequences that are the wrong chain.
    desired_sequences = {}
    for key, sequence in found_sequences.items():
        chain = key[1] # keys are (pdb_id, chain, file_name) tuples
        if options.chain is None or chain == options.chain:
            desired_sequences[key] = sequence

    # Create the input FASTA and script files.
    job_inputs, errors = create_inputs(options, outpath, desired_sequences)

    # Create the reverse mapping file
    if reverse_mapping:
        segment_mapping_file = os.path.join(outpath, "segment_map.json")
        colorprinter.message("Creating a reverse mapping file %s." % segment_mapping_file)
        write_file(segment_mapping_file, json.dumps(reverse_mapping))

    # Create the post-processing script file
    # The script is copied from the directory containing this file.
    script_directory = os.path.split(os.path.realpath(__file__))[0]
    post_processing_script = read_file(os.path.join(script_directory, 'post_processing.py'))
    write_file(os.path.join(outpath, 'post_processing.py'), post_processing_script, 'w')

    # Create the secondary structure filter file
    if options.secondary_structure_file:
        ss_definition = SecondaryStructureDefinition.from_filepath(options.secondary_structure_file)
        write_file(os.path.join(outpath, 'ss_filter.json'), json.dumps({'secondary_structure_filter' : ss_definition.data}), 'w')

    return job_inputs, reverse_mapping is not None, errors
colorprinter.error(str(e)) sys.exit(ERRCODE_ARGUMENTS) submission_script = os.path.join(options["outpath"], 'submission_script.py') write_file(submission_script, job_script, 'w') try: send_mail = options['sendmail'] username = None if send_mail: username = get_username() (jobid, output) = ClusterEngine.submit(submission_script, options["outpath"], send_mail = send_mail, username = username ) except Exception, e: colorprinter.error("An exception occurred during submission to the cluster.") colorprinter.error(str(e)) colorprinter.error(traceback.format_exc()) sys.exit(ERRCODE_CLUSTER) colorprinter.message("\nFragment generation jobs started with job ID %d. Results will be saved in %s." % (jobid, options["outpath"])) if options['no_homologs']: print("The --nohoms option was selected.") if options['no_zip']: print("The --nozip option was selected.") if ClusterEngine.ClusterType == "SGE": print("The jobs have been submitted using the %s queue(s)." % (', '.join(sorted(options['queue'])) or 'default')) print('') logfile.writeToLogfile(datetime.now(), jobid, options["outpath"]) if __name__ == "__main__": main()