Ejemplo n.º 1
0
def create_inputs(options, outpath, found_sequences):
    ''' This function creates one subdirectory of outpath per sequence and writes the sequence to a FASTA file inside it:
          - options is the options object; only options.overwrite is read here;
          - outpath is the directory under which the per-sequence subdirectories are created;
          - found_sequences is a dict mapping (pdb_id, chain, file_name) tuples to sequences. Each sequence is joined
            with newlines when it is written, so it is presumably a list of lines (TODO: confirm against the caller).

        Returns a pair (job_inputs, errors):
          - job_inputs is a list of JobInput objects, one per FASTA file written. It is emptied if any input fails;
          - errors is a list of error message strings.

        The process exits with ERRCODE_OLDRESULTS if all of the numbered fallback directories (_001 to _999) exist.
    '''
    errors = []
    job_inputs = []

    # Create subdirectories, processing the sequences in sorted key order
    for (pdb_id, chain, file_name), sequence in sorted(found_sequences.items()):
        # Only set once os.makedirs has succeeded so that we never remove a directory we did not create
        created_new_subdirectory = False
        subdir_path = os.path.join(outpath, "%s%s" % (pdb_id, chain))
        try:
            if os.path.exists(subdir_path):
                if options.overwrite:
                    colorprinter.warning("Path %s exists. Removing all files in that path as per the override option." % subdir_path)
                    shutil.rmtree(subdir_path)
                else:
                    # NOTE(review): this error is recorded even though a numbered fallback directory (_001, _002, ...)
                    # is created below. Presumably the caller treats any error as fatal; remove the append if the
                    # numbered directories should be usable without an error being reported.
                    errors.append('The directory %s already exists.' % subdir_path)
                    count = 1
                    while count < 1000:
                        subdir_path = os.path.join(outpath, "%s%s_%.3i" % (pdb_id, chain, count))
                        if not os.path.exists(subdir_path):
                            break
                        count += 1
                    if count == 1000:
                        errors.append("The directory %s contains too many previous results. Please clean up the old results or choose a new output directory." % outpath)
                        sys.exit(ERRCODE_OLDRESULTS)
            os.makedirs(subdir_path, 0o755)
            created_new_subdirectory = True

            # Create a FASTA file for the sequence in the output directory
            fasta_file = os.path.join(subdir_path, "%s%s.fasta" % (pdb_id, chain))
            colorprinter.message("Creating a new FASTA file %s." % fasta_file)

            assert(not(os.path.exists(fasta_file)))
            write_file(fasta_file, '\n'.join(sequence) + '\n', 'w') # The file must terminate in a newline for the Perl script to work
            job_inputs.append(JobInput(fasta_file, pdb_id, chain))
        except Exception:
            # Catch Exception rather than using a bare except: a bare except would also swallow the SystemExit
            # raised by sys.exit above (and KeyboardInterrupt), so the process would never actually exit.
            if created_new_subdirectory and os.path.exists(subdir_path):
                shutil.rmtree(subdir_path)
            errors.append('An error occurred creating the input for %s%s.' % (pdb_id, chain))
            # A single failure invalidates the whole batch
            job_inputs = []
            break

    return job_inputs, errors
Ejemplo n.º 2
0
def get_sequences(options, fasta_file_contents):
    ''' This function parses the FASTA contents and returns a triple (fasta_records, reverse_mapping, errors):
          - options is the OptionParser member; it is passed through to parse_FASTA_files;
          - fasta_file_contents is a map from input filenames to the associated FASTA file contents.

        On success, fasta_records and reverse_mapping are the two values returned by parse_FASTA_files and errors is
        empty. If parsing raises an exception or yields no records, the function returns (None, {}, errors) where
        errors is a list containing a message describing the problem.
    '''
    errors = []
    fasta_files_str = ", ".join(fasta_file_contents.keys())
    fasta_records = None
    reverse_mapping = {}

    try:
        fasta_records, reverse_mapping = parse_FASTA_files(options, fasta_file_contents)
        if not fasta_records:
            errors.append("No protein sequences found in the FASTA file(s) %s." % fasta_files_str)
    except Exception as e:
        # Report both the traceback and the exception message, skipping whichever is blank
        details = '\n'.join([part for part in (traceback.format_exc(), str(e)) if part.strip()])
        errors.append("Error parsing FASTA file(s) %s:\n%s" % (fasta_files_str, details))

    if not fasta_records:
        return None, {}, errors

    colorprinter.message('Found %d protein sequence(s).' % len(fasta_records))
    return fasta_records, reverse_mapping, errors
Ejemplo n.º 3
0
def setup_jobs(outpath, options, input_files):
    ''' This function sets up the jobs by creating the necessary input files as expected.
          - outpath is where the output is to be stored.
          - options is the optparse options object.
          - input_files is a list of paths to input files.

        Returns a triple (job_inputs, has_reverse_mapping, errors):
          - job_inputs is the list returned by create_inputs, or None if the sequences could not be extracted;
          - has_reverse_mapping is True if get_sequences returned a reverse mapping that is not None;
          - errors is a list of error message strings.
    '''

    fasta_file_contents = {}

    # Generate FASTA files for PDB inputs
    # fasta_file_contents is a mapping from a file path to a pair (FASTA contents, file type). We remember the file type
    # since we offset residue IDs depending on file type i.e. for FASTA files, we treat each sequence separately and do
    # not renumber the fragments in postprocessing. For PDB files, however, we need to respect the order and length of
    # sequences so that we renumber the fragments appropriately in postprocessing - we assume that if a PDB file is passed in
    # then all chains (protein, RNA, or DNA) will be used in a Rosetta run.
    for input_file in input_files:
        # Each input file should only be listed once
        assert(not(fasta_file_contents.get(input_file)))
        if any(fnmatch(input_file, x) for x in pdb_file_wildcards):
            pdb = PDB.from_filepath(input_file, strict=True)
            pdb.pdb_id = os.path.basename(input_file).split('.')[0]
            if pdb.pdb_id.startswith('pdb') and len(pdb.pdb_id) >= 7:
                # Hack to rename FASTA identifiers for pdb*.ent files which are present in mirrors of the PDB.
                # Only the leading 'pdb' is stripped; str.replace would also remove any later occurrence of 'pdb'
                # e.g. it would turn 'pdb1pdb' into '1' rather than '1pdb'.
                pdb.pdb_id = pdb.pdb_id[len('pdb'):]
            fasta_file_contents[input_file] = (pdb.create_fasta(prefer_seqres_order = False), 'PDB')
        else:
            fasta_file_contents[input_file] = (read_file(input_file), 'FASTA')

    # Extract sequences from the input FASTA files.
    found_sequences, reverse_mapping, errors = get_sequences(options, fasta_file_contents)
    if found_sequences:
        reformat(found_sequences)
    if errors:
        return None, False, errors

    # Discard sequences that are the wrong chain.
    desired_sequences = {}
    for key, sequence in found_sequences.items():
        pdb_id, chain, file_name = key
        if options.chain is None or chain == options.chain:
            desired_sequences[key] = sequence

    # Create the input FASTA and script files.
    job_inputs, errors = create_inputs(options, outpath, desired_sequences)

    # Create the reverse mapping file
    if reverse_mapping:
        segment_mapping_file = os.path.join(outpath, "segment_map.json")
        colorprinter.message("Creating a reverse mapping file %s." % segment_mapping_file)
        write_file(segment_mapping_file, json.dumps(reverse_mapping))

    # Create the post-processing script file by copying post_processing.py from the directory containing this script
    post_processing_script = read_file(os.path.join(os.path.split(os.path.realpath(__file__))[0], 'post_processing.py'))
    write_file(os.path.join(outpath, 'post_processing.py'), post_processing_script, 'w')

    # Create the secondary structure filter file
    if options.secondary_structure_file:
        write_file(os.path.join(outpath, 'ss_filter.json'), json.dumps({'secondary_structure_filter' : SecondaryStructureDefinition.from_filepath(options.secondary_structure_file).data}), 'w')

    return job_inputs, reverse_mapping is not None, errors
Ejemplo n.º 4
0
            colorprinter.error(str(e))
            sys.exit(ERRCODE_ARGUMENTS)

        submission_script = os.path.join(options["outpath"], 'submission_script.py')
        write_file(submission_script, job_script, 'w')

        try:
            send_mail = options['sendmail']
            username = None
            if send_mail:
                username = get_username()
            (jobid, output) = ClusterEngine.submit(submission_script, options["outpath"], send_mail = send_mail, username = username )
        except Exception, e:
            colorprinter.error("An exception occurred during submission to the cluster.")
            colorprinter.error(str(e))
            colorprinter.error(traceback.format_exc())
            sys.exit(ERRCODE_CLUSTER)

        colorprinter.message("\nFragment generation jobs started with job ID %d. Results will be saved in %s." % (jobid, options["outpath"]))
        if options['no_homologs']:
            print("The --nohoms option was selected.")
        if options['no_zip']:
            print("The --nozip option was selected.")
        if ClusterEngine.ClusterType == "SGE":
            print("The jobs have been submitted using the %s queue(s)." % (', '.join(sorted(options['queue'])) or 'default'))
        print('')
        logfile.writeToLogfile(datetime.now(), jobid, options["outpath"])

# Call main() only when this file is executed as a script, not when it is imported as a module.
if __name__ == "__main__":
    main()