Code example #1
File: prune_pairs.py Project: amorehead/DIPS
def main(pair_dir, to_keep_dir, output_dir, num_cpus):
    """Run write_pairs on all provided complexes."""
    to_keep_filenames = \
        db.get_structures_filenames(to_keep_dir, extension='.txt')
    if len(to_keep_filenames) == 0:
        logging.warning(
            "There is no to_keep file in {:}. All pair files from {:} "
            "will be copied into {:}".format(to_keep_dir, pair_dir,
                                             output_dir))

    to_keep_df = __load_to_keep_files_into_dataframe(to_keep_filenames)
    logging.info("There are {:} rows, cols in to_keep_df".format(
        to_keep_df.shape))

    logging.info("Looking for all pairs in {:}".format(pair_dir))
    work_filenames = \
        db.get_structures_filenames(pair_dir, extension='.dill')
    work_keys = [db.get_pdb_name(x) for x in work_filenames]
    logging.info("Found {:} pairs in {:}".format(len(work_keys), output_dir))

    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        output_filenames.append(sub_dir + '/' + db.get_pdb_name(pdb_filename) +
                                ".dill")

    inputs = [(i, o, to_keep_df)
              for i, o in zip(work_filenames, output_filenames)]
    ncopied = 0
    ncopied += np.sum(par.submit_jobs(process_pairs_to_keep, inputs, num_cpus))
    logging.info("{:} out of {:} pairs was copied".format(
        ncopied, len(work_keys)))
Code example #2
def map_all_pssms(pdb_dataset, blastdb, output_dir, num_cpus):
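    """Generate PSSMs for all structures in pdb_dataset not yet produced in output_dir."""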
    ext = '.pkl'
    requested_filenames = \
        db.get_structures_filenames(pdb_dataset, extension=ext)
    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    produced_filenames = db.get_structures_filenames(output_dir,
                                                     extension='.pkl')
    produced_keys = [db.get_pdb_name(x) for x in produced_filenames]
    work_keys = [key for key in requested_keys if key not in produced_keys]
    work_filenames = [
        x[0] for x in db.get_all_filenames(work_keys,
                                           pdb_dataset,
                                           extension=ext,
                                           keyer=lambda x: db.get_pdb_name(x))
    ]

    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        output_filenames.append(sub_dir + '/' + db.get_pdb_name(pdb_filename) +
                                ".pkl")

    logging.info("{:} requested keys, {:} produced keys, {:} work keys".format(
        len(requested_keys), len(produced_keys), len(work_keys)))
    inputs = [(key, blastdb, output)
              for key, output in zip(work_filenames, output_filenames)]
    par.submit_jobs(map_pssms, inputs, num_cpus)
Code example #3
def parse_all(pdb_dataset, output_dir, num_cpus):
    """Parse pdb dataset (pdb files) to pandas dataframes."""
    requested_filenames = db.get_structures_filenames(pdb_dataset)
    produced_filenames = db.get_structures_filenames(
        output_dir, extension='.pkl')

    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    produced_keys = [db.get_pdb_name(x) for x in produced_filenames]
    work_keys = [key for key in requested_keys if key not in produced_keys]
    work_filenames = [x[0] for x in
                      db.get_all_filenames(work_keys, pdb_dataset, enforcement=2)]

    logging.info("{:} requested keys, {:} produced keys, {:} work keys"
                 .format(len(requested_keys), len(produced_keys),
                         len(work_keys)))

    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        output_filenames.append(
            sub_dir + '/' + db.get_pdb_name(pdb_filename) + ".pkl")

    inputs = [(key, output)
              for key, output in zip(work_filenames, output_filenames)]
    par.submit_jobs(parse, inputs, num_cpus)
Code example #4
File: tfrecord.py Project: amorehead/DIPS
def main(pair_dir, tfrecord_dir, num_cpus):
    """Run write_pairs on all provided complexes."""
    requested_filenames = \
        db.get_structures_filenames(pair_dir, extension='.dill')
    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    produced_filenames = \
        db.get_structures_filenames(tfrecord_dir, extension='.tfrecord')
    produced_keys = [db.get_pdb_name(x) for x in produced_filenames]

    work_keys = [key for key in requested_keys if key not in produced_keys]
    logging.info("{:} requested keys, {:} produced keys, {:} work keys".format(
        len(requested_keys), len(produced_keys), len(work_keys)))
    work_filenames = [
        x[0]
        for x in db.get_all_filenames(work_keys, pair_dir, extension='.dill')
    ]

    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = tfrecord_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        output_filenames.append(sub_dir + '/' + db.get_pdb_name(pdb_filename) +
                                ".tfrecord")

    inputs = [(i, o) for i, o in zip(work_filenames, output_filenames)]
    par.submit_jobs(pairs_to_tfrecord, inputs, num_cpus)
Code example #5
File: sequence.py Project: NabinGiri/atom3
def pdb_to_fasta(pdb_filename, fasta_filename, id_filename, separate):
    """Write a pdb file as a fasta file."""
    flat_map = {}
    pdb_name = db.get_pdb_name(pdb_filename)
    structure = pd.read_pickle(pdb_filename)
    fasta_name_to_chain = {}
    for (chain, residues) in struct.get_chain_to_valid_residues(structure):
        fasta_name = pdb_name + '-' + chain[-2] + '-' + chain[-1]
        flat_map[fasta_name] = residues
        fasta_name_to_chain[fasta_name] = chain
    names = []
    filenames = []
    id_filenames = []
    if not separate:
        write_fasta(flat_map, fasta_filename, id_out=id_filename)
        filenames.append(fasta_filename)
        id_filenames.append(id_filename)
        names.append('all')
    else:
        for (name, seq) in flat_map.items():
            new_dict = {}
            new_dict[name] = seq
            filename = fasta_filename.format(name)
            filename2 = id_filename.format(name)
            write_fasta(new_dict, filename, id_out=filename2)
            names.append(fasta_name_to_chain[name])
            filenames.append(filename)
            id_filenames.append(filename2)
    return (names, filenames, id_filenames)
Code example #6
File: prune_pairs.py Project: amorehead/DIPS
def __should_keep(pair_filename, to_keep_df):
    assert (not to_keep_df.empty)
    # pair_name example: 20gs.pdb1_0
    pair_name_regex = re.compile(
        r'(?P<pdb_code>\w{4})(\.pdb(?P<struct_id>\d+))*(_(?P<pair_id>\d+))')

    pair_name = db.get_pdb_name(pair_filename)
    pair_metadata = pair_name_regex.match(pair_name).groupdict()

    # The order to check is: pdb_code, struct_id, pair_id, chain
    if pair_metadata['pdb_code'] not in set(to_keep_df.pdb_code):
        return False
    # Check if we need to select based on struct_id
    slice = to_keep_df[to_keep_df.pdb_code == pair_metadata['pdb_code']]
    if 'struct_id' in slice.columns:
        if pair_metadata['struct_id'] not in set(slice.struct_id):
            return False
        slice = slice[slice.struct_id == pair_metadata['struct_id']]
    # Check if we need to select based on pair_id
    if 'pair_id' in slice.columns:
        if pair_metadata['pair_id'] not in set(slice.pair_id):
            return False
        slice = slice[slice.pair_id == pair_metadata['pair_id']]
    # Check if we need to select based on chain
    if 'chain' in slice.columns:
        pair = pa.read_pair_from_dill(pair_filename)
        pair_chains = set(pair.df0.chain) | set(pair.df1.chain)
        # Convert chain names to lowercase
        pair_chains = set([c.lower() for c in pair_chains])
        # All chains in the pair need to be in to_keep_df to be valid
        if not pair_chains.issubset(set(slice.chain)):
            return False
    return True
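
As a hedged illustration of the pair-name regex above (the sample name comes from the code's own comment; the snippet itself is not part of the project):

import re

# Minimal, self-contained sketch: parse the example pair name into its named groups.
pair_name_regex = re.compile(
    r'(?P<pdb_code>\w{4})(\.pdb(?P<struct_id>\d+))*(_(?P<pair_id>\d+))')
print(pair_name_regex.match('20gs.pdb1_0').groupdict())
# -> {'pdb_code': '20gs', 'struct_id': '1', 'pair_id': '0'}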
Code example #7
File: structure.py Project: stephaniewankowicz/atom3
def get_missing_sidechains(pdb_dataset, output_scwrl):
    """Get residues that are missing atoms."""
    for pdb_filename in db.get_structures_filenames(pdb_dataset):
        biopy_structure = db.parse_biopython_structure(pdb_filename)
        pdb_name = db.get_pdb_name(pdb_filename)
        missing = 0
        scwrl_list = []
        logging.info("Processing {:}".format(pdb_name))
        for model in biopy_structure:
            for chain in model:
                for i, residue in enumerate(chain):
                    res_name = residue.resname
                    if res_name not in expected:
                        logging.warning("Non-standard residue found: {:}. "
                                        "Skipping.".format(res_name))
                        continue
                    res_code = poly.three_to_one(res_name)
                    res_id = residue.id[1]
                    curr_count = len(
                        Bio.PDB.Selection.unfold_entities(residue, 'A'))
                    if curr_count != expected[res_name]:
                        logging.debug(
                            "Missing residue {:} at position {:} (with id {:})"
                            " which has {:} instead of the expected {:} atoms."
                            .format(res_name, i, res_id, curr_count,
                                    expected[res_name]))
                        missing += 1
                        scwrl_list.append(res_code.upper())
                    else:
                        scwrl_list.append(res_code.lower())

        logging.debug("Missing {:} residue total".format(missing))
        with open(output_scwrl, 'w') as f:
            f.write("".join(scwrl_list))
Code example #8
def complexes_from_pair_dir(pair_dir):
    """Get all complex names from provided pair directory."""
    filenames = db.get_structures_filenames(pair_dir, extension='.dill')
    # Remove per-chain identifier.
    # TODO: This could cause issues when only some of the pairs have been
    # written.
    return ['_'.join(db.get_pdb_name(x).split('_')[:-1]) for x in filenames]
Code example #9
File: case.py Project: stephaniewankowicz/atom3
def is_of_type(pdb_name, style, receptor=None, bound=None):
    """Check if pdb_name is of requested type."""
    pdb_name = db.get_pdb_name(pdb_name, with_type=False)
    if receptor is None:
        if bound is None:
            return True
        elif bound:
            if style == 'db5':
                return _has_symbol('b', pdb_name)
            elif style == 'dockground':
                # Dockground only has pdb code.
                return len(pdb_name) == 4
        else:
            return _has_symbol('u', pdb_name)

    if bound is None:
        if receptor:
            return _has_symbol('r', pdb_name) or _has_symbol('2', pdb_name)
        else:
            return _has_symbol('l', pdb_name) or _has_symbol('1', pdb_name)

    if receptor and bound:
        return _has_symbol('r_b', pdb_name) or _has_symbol('2_b', pdb_name)
    elif receptor and not bound:
        return _has_symbol('r_u', pdb_name) or _has_symbol('2_u', pdb_name)
    elif not receptor and bound:
        return _has_symbol('l_b', pdb_name) or _has_symbol('1_b', pdb_name)
    else:
        return _has_symbol('l_u', pdb_name) or _has_symbol('1_u', pdb_name)

    return False
Code example #10
def map_pssms(pdb_filename, blastdb, output_filename):
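    """Generate PSSMs for one structure, merge them into the parsed
    DataFrame, and write the result to output_filename."""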
    pdb_name = db.get_pdb_name(pdb_filename)
    start_time = timeit.default_timer()
    start_time_blasting = timeit.default_timer()
    pis = gen_pssm(pdb_filename, blastdb, output_filename)
    num_chains = len(pis.groupby(['pdb_name', 'model', 'chain']))
    elapsed_blasting = timeit.default_timer() - start_time_blasting

    parsed = pd.read_pickle(pdb_filename)
    parsed = parsed.merge(
        pis, on=['model', 'pdb_name', 'chain', 'residue'])

    start_time_writing = timeit.default_timer()
    parsed.to_pickle(output_filename)
    elapsed_writing = timeit.default_timer() - start_time_writing

    elapsed = timeit.default_timer() - start_time
    logging.info(
        ('For {:d} PSSMs generated from {}, spent {:05.2f} blasting, '
         '{:05.2f} writing, and {:05.2f} overall.')
        .format(
             num_chains,
             pdb_name,
             elapsed_blasting,
             elapsed_writing,
             elapsed))
Code example #11
File: structure.py Project: stephaniewankowicz/atom3
def get_chain(pdb_filename):
    """Get chain from split pdb filename."""
    pdb_name = db.get_pdb_name(pdb_filename, with_type=False)
    tokens = pdb_name.split("_")
    if len(tokens) < 3:
        return 0
    else:
        return tokens[1]
Code example #12
File: complex.py Project: NabinGiri/atom3
def _get_rcsb_complexes(filenames):
    """Get complexes for RCSB type dataset."""
    complexes = {}
    for filename in filenames:
        name = db.get_pdb_name(filename)
        complexes[name] = Complex(name=name,
                                  bound_filenames=[filename],
                                  unbound_filenames=[])
    return complexes
Code example #13
File: complex.py Project: amorehead/atom3
def _get_casp_capri_complexes(filenames, keyer=db.get_pdb_code):
    """Get complexes for CASP-CAPRI type dataset."""
    complexes = {}
    for filename in filenames:
        name = db.get_pdb_name(filename)
        complexes[name] = Complex(name=name,
                                  bound_filenames=[filename],
                                  unbound_filenames=[])
    return complexes
Code example #14
File: structure.py Project: stephaniewankowicz/atom3
def get_model(pdb_filename):
    """Get model from split pdb filename."""
    pdb_name = db.get_pdb_name(pdb_filename, with_type=False)
    tokens = pdb_name.split("_")
    if len(tokens) < 3:
        return 0
    elif not tokens[2].isdigit():
        return 0
    else:
        return int(tokens[2])
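
A minimal, self-contained sketch of the token logic shared by get_chain (example #11) and get_model; the sample name '1abc_A_3' and the assumption that db.get_pdb_name(..., with_type=False) would return it are illustrative only:

# Hypothetical split name of the form <pdb>_<chain>_<model>.
tokens = '1abc_A_3'.split('_')
chain = tokens[1] if len(tokens) >= 3 else 0  # mirrors get_chain
model = int(tokens[2]) if len(tokens) >= 3 and tokens[2].isdigit() else 0  # mirrors get_model
print(chain, model)  # -> A 3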
Code example #15
def main(raw_pdb_dir, pruned_pairs_dir, output_dir, neighbor_def, cutoff,
         num_cpus):
    """Run postprocess_pruned_pairs on all provided complexes."""
    logging.info("Looking for all pairs in {:}".format(pruned_pairs_dir))
    work_filenames = \
        db.get_structures_filenames(pruned_pairs_dir, extension='.dill')
    work_keys = [db.get_pdb_name(x) for x in work_filenames]
    logging.info("Found {:} pairs in {:}".format(len(work_keys), output_dir))

    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        output_filenames.append(sub_dir + '/' + db.get_pdb_name(pdb_filename) +
                                ".dill")

    inputs = [(raw_pdb_dir, neighbor_def, cutoff, i, o)
              for i, o in zip(work_filenames, output_filenames)]
    n_copied = 0
    n_copied += np.sum(
        par.submit_jobs(postprocess_pruned_pairs, inputs, num_cpus))
    logging.info("{:} out of {:} pairs was copied".format(
        n_copied, len(work_keys)))
Code example #16
File: case.py Project: stephaniewankowicz/atom3
def _get_seq_and_atoms(filename):
    """Form dictionaries mapping from chain to their sequence and atoms."""
    seqs = {}
    all_atoms = {}
    structure = struct.parse_structure(filename)
    pdb_name = db.get_pdb_name(filename)
    for (chain, residues) in \
            struct.get_chain_to_valid_residues(structure, pdb_name):
        atoms = []
        for residue in residues:
            atoms.append(np.array(residue[['x', 'y', 'z']], dtype='f4'))
        if len(residues) != 0:
            # Ignore zero-length peptides.
            seqs[chain] = residues
            all_atoms[chain] = np.array(atoms)
    return all_atoms, seqs
Code example #17
File: conservation.py Project: amorehead/atom3
def map_profile_hmms(num_cpus, pkl_filename, output_filename, hhsuite_db,
                     source_type, num_iter):
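    """Generate profile HMMs for one structure, write them to
    output_filename, and log timing information."""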
    pdb_name = db.get_pdb_name(pkl_filename)
    start_time = timeit.default_timer()
    start_time_blitsing = timeit.default_timer()
    profile_hmms, num_chains = gen_profile_hmm(num_cpus, pkl_filename,
                                               output_filename, hhsuite_db,
                                               source_type, num_iter)
    elapsed_blitsing = timeit.default_timer() - start_time_blitsing

    start_time_writing = timeit.default_timer()
    profile_hmms.to_pickle(output_filename)
    elapsed_writing = timeit.default_timer() - start_time_writing

    elapsed = timeit.default_timer() - start_time
    logging.info(
        ('For {:d} profile HMMs generated from {}, spent {:05.2f} blitsing,'
         ' {:05.2f} writing, and {:05.2f} overall.').format(
             num_chains, pdb_name, elapsed_blitsing, elapsed_writing, elapsed))
Code example #18
File: case.py Project: stephaniewankowicz/atom3
def _generate_reference(pdb_filename, s2r_chain, s2r_res, output_filename,
                        style):
    """Transform PDB structure to a reference structure."""
    biopy_structure = db.parse_biopython_structure(pdb_filename)
    pdb_name = db.get_pdb_name(pdb_filename)

    new_model = Bio.PDB.Model.Model('0')
    new_structure = Bio.PDB.Structure.Structure('')
    for (chain, residues) in \
            struct.get_chain_to_valid_residues(biopy_structure, pdb_name):
        if style == 'dockground' and chain not in s2r_chain:
            # If we are in dockground, we allow ourselves to remove unmapped
            # chains.
            continue
        ref_chain = s2r_chain[chain]

        if chain in s2r_res:
            # If we have an alignment for this chain.
            new_chain = Bio.PDB.Chain.Chain(ref_chain)
            for i, residue in enumerate(residues):
                if residue.id[0] != ' ':
                    continue
                residue.segid = ""
                residue.id = (' ', s2r_res[chain][i], residue.id[2])
                new_chain.add(residue)
        else:
            # Else, just remove segment ID.
            new_chain = Bio.PDB.Chain.Chain(ref_chain)
            for i, residue in enumerate(residues):
                residue.segid = ""
        new_model.add(new_chain)

    new_structure.add(new_model)
    w = Bio.PDB.PDBIO()
    w.set_structure(new_structure)
    w.save(output_filename)
Code example #19
File: conservation.py Project: amorehead/atom3
def map_all_profile_hmms(pkl_dataset, pruned_dataset, output_dir, hhsuite_db,
                         num_cpu_jobs, num_cpus_per_job, source_type, num_iter,
                         rank, size, write_file):
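    """Generate profile HMM features for all work structures, optionally
    batching the workload across MPI ranks via a work-filenames CSV."""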
    ext = '.pkl'
    if write_file:
        if source_type.lower() == 'rcsb':
            # Filter out pairs that did not survive pruning previously to reduce complexity
            pruned_pdb_names = [
                db.get_pdb_name(filename)
                for filename in db.get_structures_filenames(pruned_dataset,
                                                            extension='.dill')
            ]
            requested_filenames = [
                os.path.join(pkl_dataset,
                             db.get_pdb_code(pruned_pdb_name)[1:3],
                             pruned_pdb_name.split('_')[0] + ext)
                for pruned_pdb_name in pruned_pdb_names
            ]
        else:  # DB5 does not employ pair pruning, so there are no pairs to filter
            requested_filenames = [
                filename
                for filename in db.get_structures_filenames(pkl_dataset,
                                                            extension=ext)
            ]

        # Filter DB5 filenames to unbound type and get all work filenames
        requested_filenames = [
            filename for filename in requested_filenames
            if (source_type.lower() == 'db5' and '_u_' in filename) or (
                source_type.lower() in
                ['rcsb', 'evcoupling', 'casp_capri', 'input'])
        ]
        requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
        produced_filenames = db.get_structures_filenames(output_dir,
                                                         extension='.pkl')
        produced_keys = [db.get_pdb_name(x) for x in produced_filenames]
        work_keys = [key for key in requested_keys if key not in produced_keys]
        establish_pdb_code_case = lambda pdb_code, source_type: pdb_code.lower() \
            if source_type.lower() == 'casp_capri' \
            else pdb_code.upper()
        work_filenames = [
            os.path.join(
                pkl_dataset,
                establish_pdb_code_case(db.get_pdb_code(work_key),
                                        source_type)[1:3], work_key + ext)
            for work_key in work_keys
        ]

        # Remove any duplicate filenames
        work_filenames = list(set(work_filenames))
        logging.info(
            "{:} requested keys, {:} produced keys, {:} work filenames".format(
                len(requested_keys), len(produced_keys), len(work_filenames)))

        if source_type.lower() == 'input':
            # Directly generate profile HMM features after aggregating input filenames
            logging.info("{:} work filenames".format(len(work_filenames)))

            output_filenames = []
            for pdb_filename in work_filenames:
                sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
                if not os.path.exists(sub_dir):
                    os.makedirs(sub_dir, exist_ok=True)
                output_filenames.append(sub_dir + '/' +
                                        db.get_pdb_name(pdb_filename) + '.pkl')

            inputs = [(num_cpus_per_job, key, output, hhsuite_db, source_type,
                       num_iter)
                      for key, output in zip(work_filenames, output_filenames)]
            par.submit_jobs(map_profile_hmms, inputs, num_cpu_jobs)
        else:
            # Write out a local file containing all work filenames
            temp_df = pd.DataFrame({'filename': work_filenames})
            temp_df.to_csv(f'{source_type}_work_filenames.csv')
            logging.info(
                'File containing work filenames written to storage. Exiting...'
            )

    # Read from previously-created work filenames CSV
    else:
        work_filenames = pd.read_csv(
            f'{source_type}_work_filenames.csv').iloc[:, 1].to_list()
        work_filenames = list(
            set(work_filenames))  # Remove any duplicate filenames

        # Reserve an equally-sized portion of the full workload for a given rank in the MPI world
        work_filename_rank_batches = slice_list(work_filenames, size)
        work_filenames = work_filename_rank_batches[rank]

        logging.info("{:} work filenames".format(len(work_filenames)))

        output_filenames = []
        for pdb_filename in work_filenames:
            sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
            if not os.path.exists(sub_dir):
                os.makedirs(sub_dir, exist_ok=True)
            output_filenames.append(sub_dir + '/' +
                                    db.get_pdb_name(pdb_filename) + '.pkl')

        inputs = [(num_cpus_per_job, key, output, hhsuite_db, source_type,
                   num_iter)
                  for key, output in zip(work_filenames, output_filenames)]
        par.submit_jobs(map_profile_hmms, inputs, num_cpu_jobs)
Code example #20
File: conservation.py Project: amorehead/atom3
def gen_profile_hmm(num_cpus, pkl_filename, output_filename, hhsuite_db,
                    source_type, num_iter):
    """Generate profile HMM from sequence."""
    pdb_name = db.get_pdb_name(pkl_filename)
    out_dir = os.path.dirname(output_filename)
    work_dir = os.path.join(out_dir, 'work')
    if not os.path.exists(work_dir):
        os.makedirs(work_dir, exist_ok=True)
    fasta_format = work_dir + "/{:}.fa"
    id_format = work_dir + "/{:}.cpkl"

    # Get FASTA sequence-chain representations of PDB structures
    chains, chain_fasta_filenames, id_filenames = sequ.pdb_to_fasta(
        pkl_filename, fasta_format, id_format, True)

    # Process each profile HMM for a given PDB structure or complex
    num_chains = 0
    profile_hmms = []
    for chain, chain_fasta_filename, id_filename in zip(
            chains, chain_fasta_filenames, id_filenames):
        basename = os.path.splitext(chain_fasta_filename)[0]
        profile_hmm_filename = "{}.hhm".format(basename)
        hhblits_filename = "{}.a3m".format(basename)

        if not os.path.exists(profile_hmm_filename):
            logging.info("HHblits'ing {:}".format(chain_fasta_filename))
            _hhsuite(num_cpus, chain_fasta_filename, hhblits_filename,
                     profile_hmm_filename, hhsuite_db, num_iter)

        if not os.path.exists(profile_hmm_filename):
            logging.warning("No hits for {:}".format(chain_fasta_filename))
            # Create empty file
            open(profile_hmm_filename, 'w').close()

        if os.stat(profile_hmm_filename).st_size != 0:
            with open(chain_fasta_filename, 'r') as fasta:
                with open(profile_hmm_filename, 'r') as hmm:
                    sequence = ''
                    for seq_line in fasta.readlines()[1:]:
                        sequence += " ".join(seq_line.splitlines())
                    profile_hmm = extract_hmm_profile(hmm.read(), sequence)
        else:
            logging.warning(
                "No profile HMM found for {:} (model {:}, chain {:})".format(
                    pdb_name, chain[-2], chain[-1]))
            profile_hmm = None

        pdb_name = db.get_pdb_name(pkl_filename)
        key = pdb_name + '-' + chain[-2] + '-' + chain[-1]
        pos_to_res = pickle.load(open(id_filename, 'rb'))[key]

        if profile_hmm is not None:  # Skip if profile HMM was not found
            profile_hmm = pd.DataFrame(data=profile_hmm)
            profile_hmm.insert(0, 'pdb_name', db.get_pdb_name(pkl_filename))
            profile_hmm.insert(1, 'model', chain[0])
            profile_hmm.insert(2, 'chain', chain[1])
            profile_hmm.insert(3, 'residue', pos_to_res)
            profile_hmms.append(profile_hmm)
        # Keep track of how many chains have been processed
        num_chains += 1
    # Merge related DataFrames into a single one
    profile_hmms = pd.concat(profile_hmms)
    return profile_hmms, num_chains
Code example #21
File: conservation.py Project: amorehead/atom3
def map_all_protrusion_indices(psaia_dir, psaia_config_file, pdb_dataset,
                               pkl_dataset, pruned_dataset, output_dir,
                               source_type):
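    """Run PSAIA over all requested PDB structures to compute protrusion
    indices and other requested features."""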
    ext = '.pkl'
    if source_type.lower() == 'rcsb':
        # Filter out pairs that did not survive pruning previously to reduce complexity
        pruned_pdb_names = [
            db.get_pdb_name(filename)
            for filename in db.get_structures_filenames(pruned_dataset,
                                                        extension='.dill')
        ]
        requested_filenames = [
            os.path.join(pkl_dataset,
                         db.get_pdb_code(pruned_pdb_name)[1:3],
                         pruned_pdb_name.split('_')[0] + ext)
            for pruned_pdb_name in pruned_pdb_names
        ]
    else:  # DB5 does not employ pair pruning, so there are no pairs to filter
        requested_filenames = [
            filename for filename in db.get_structures_filenames(pkl_dataset,
                                                                 extension=ext)
        ]

    # Filter DB5 filenames to unbound type and get all work filenames
    requested_filenames = [
        filename for filename in requested_filenames
        if (source_type.lower() == 'db5' and '_u_' in filename) or
        (source_type.lower() in ['rcsb', 'evcoupling', 'casp_capri', 'input'])
    ]
    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    requested_pdb_codes = [db.get_pdb_code(x) for x in requested_filenames]
    produced_filenames_path = os.path.join(output_dir, 'PSAIA',
                                           source_type.upper())
    produced_filenames = [
        path.as_posix()
        for path in Path(produced_filenames_path).rglob('*.tbl')
    ]
    produced_keys = [db.get_pdb_code(x) for x in produced_filenames]
    work_keys = [
        key for key, pdb_code in zip(requested_keys, requested_pdb_codes)
        if pdb_code not in produced_keys
    ]
    format_pdb_code_for_inputs = lambda pdb_code, source_type: pdb_code[1:3] \
        if source_type.lower() in ['input'] \
        else pdb_code.upper()
    if source_type.lower() == 'rcsb' or source_type.lower() == 'casp_capri':
        work_filenames = [
            os.path.join(pdb_dataset,
                         db.get_pdb_code(work_key)[1:3], work_key)
            for work_key in work_keys
        ]
    else:
        work_filenames = [
            os.path.join(
                pdb_dataset,
                format_pdb_code_for_inputs(db.get_pdb_code(work_key),
                                           source_type), work_key)
            for work_key in work_keys
        ]

    # Remove any duplicate filenames
    work_filenames = list(set(work_filenames))

    # Report how many PDB files still need to be processed
    logging.info("{:} PDB files to process with PSAIA".format(
        len(work_filenames)))

    # Create a comprehensive filename list for PSAIA to process (single-threaded) for the requested features (e.g., protrusion)
    file_list_file = os.path.join(output_dir, 'PSAIA', source_type.upper(),
                                  'pdb_list.fls')
    with open(file_list_file, 'w') as file:
        for requested_pdb_filename in work_filenames:
            file.write(f'{requested_pdb_filename}\n')

    inputs = [(psaia_dir, psaia_config_file, file_list_file)]
    par.submit_jobs(map_protrusion_indices, inputs,
                    1)  # PSAIA is inherently single-threaded in execution
Code example #22
File: structure.py Project: stephaniewankowicz/atom3
def parse_structure(structure_filename, concoord=False, one_model=False):
    """Parse a file into chain,model-to-residue mapping."""
    _, ext = os.path.splitext(structure_filename)
    detailed = ext == '.pkl'
    if detailed:
        # If detailed we are reading pandas pickle file outputted by
        # protprep.
        df = pd.read_pickle(structure_filename)
        # Set model to 0, because a multi-model file was either already split
        # into separate files (using the split command) or was pared down to a
        # single model by the autodock portion of the protprep pipeline.
        # This might need to be revisited if/when autodock is removed from
        # pipeline or we decide to actually keep track of correct model.
        df['model'] = get_model(structure_filename)
        # Remove hydrogens, for now, to maintain compatibility.
        df = df[df['maestro_atom_name'].apply(lambda x: x.strip()[0]) != 'H']
    else:
        # BioPython.PDB Structure extracted from PDB file.
        biopy_structure = db.parse_biopython_structure(structure_filename)
        pdb_name = db.get_pdb_name(structure_filename)
        if concoord:
            # need to set model number to be correct (drawn from filename)
            # TODO: I (Raphael) moved this out of core Structure code, need to
            # make sure it is correct still for CONCOORD.
            new_structure = Bio.PDB.Structure.Structure(biopy_structure.id)

            chainmodel = pdb_name.split('_')[1]
            model_id = str(int(re.split(r'(\d+)', chainmodel)[1]) + 1)

            # Copy each model's chains into a fresh structure under the
            # corrected model id, then use that structure going forward.
            for model_obj in biopy_structure:
                new_model = Bio.PDB.Model.Model(model_id)
                for chain in model_obj:
                    new_model.add(chain)
                new_structure.add(new_model)
            biopy_structure = new_structure

        if one_model:
            new_structure = Bio.PDB.Structure.Structure(biopy_structure.id)
            new_structure.add(biopy_structure[0])
            biopy_structure = new_structure
        atoms = []
        for residue in Bio.PDB.Selection.unfold_entities(biopy_structure, 'R'):
            # Prune out things that aren't actually residue atoms.
            if 'CA' in residue and residue.get_id()[0] == ' ':
                for atom in residue:
                    atoms.append(atom)

        df = pd.DataFrame(
            [(pdb_name,
              str(atom.get_parent().get_parent().get_parent().serial_num),
              atom.get_parent().get_full_id()[2],
              str(atom.get_parent().get_id()[1]) +
              atom.get_parent().get_id()[2], atom.get_parent().get_resname(),
              atom.get_coord()[0], atom.get_coord()[1], atom.get_coord()[2],
              atom.get_id()[0], atom.get_name(), str(atom.serial_number))
             for atom in atoms],
            columns=[
                'pdb_name', 'model', 'chain', 'residue', 'resname', 'x', 'y',
                'z', 'element', 'atom_name', 'aid'
            ])
    return df
Code example #23
def gen_pssm(pdb_filename, blastdb, output_filename):
    """Generate PSSM and PSFM from sequence."""
    pdb_name = db.get_pdb_name(pdb_filename)
    out_dir = os.path.dirname(output_filename)
    work_dir = os.path.join(out_dir, 'work')
    if not os.path.exists(work_dir):
        os.makedirs(work_dir)
    fasta_format = work_dir + "/{:}.fa"
    id_format = work_dir + "/{:}.cpkl"
    chains, chain_fasta_filenames, id_filenames = sequ.pdb_to_fasta(
        pdb_filename, fasta_format, id_format, True)

    pssms = []
    for chain, chain_fasta_filename, id_filename in \
            zip(chains, chain_fasta_filenames, id_filenames):
        basename = os.path.splitext(chain_fasta_filename)[0]
        pssm_filename = "{}.pssm".format(basename)
        blast_filename = "{}.blast".format(basename)
        clustal_filename = "{}.clustal".format(basename)
        al2co_filename = "{}.al2co".format(basename)
        if not os.path.exists(pssm_filename):
            logging.info("Blasting {:}".format(chain_fasta_filename))
            _blast(chain_fasta_filename, pssm_filename, blast_filename,
                   blastdb)

        if not os.path.exists(pssm_filename):
            logging.warning("No hits for {:}".format(chain_fasta_filename))
            # Create empty file.
            open(pssm_filename, 'w').close()

        if not os.path.exists(clustal_filename):
            logging.info("Converting {:}".format(blast_filename))
            _to_clustal(blast_filename, clustal_filename)

        if not os.path.exists(al2co_filename):
            logging.info("Al2co {:}".format(al2co_filename))
            _al2co(clustal_filename, al2co_filename)

        if os.stat(pssm_filename).st_size != 0:
            pssm = pd.read_csv(pssm_filename,
                               skiprows=2,
                               skipfooter=6,
                               delim_whitespace=True,
                               engine='python',
                               usecols=range(20),
                               index_col=[0, 1])
            pssm = pssm.reset_index()
            del pssm['level_0']
            pssm.rename(columns={'level_1': 'orig'}, inplace=True)

            pscm = pd.read_csv(pssm_filename,
                               skiprows=2,
                               skipfooter=6,
                               delim_whitespace=True,
                               engine='python',
                               usecols=range(20, 40),
                               index_col=[0, 1])
            psfm = pscm.applymap(lambda x: x / 100.)
            psfm = psfm.reset_index()
            del psfm['level_0']
            psfm.columns = pssm.columns
            del psfm['orig']
            del pssm['orig']

            # Combine both into one.
            psfm = psfm.add_prefix('psfm_')
            pssm = pssm.add_prefix('pssm_')
            al2co = pd.read_csv(al2co_filename,
                                delim_whitespace=True,
                                usecols=[2],
                                names=['al2co'])
            pssm = pd.concat([pssm, psfm, al2co], axis=1)

        else:
            logging.warning(
                "No pssm found for {:} (model {:}, chain {:})".format(
                    pdb_name, chain[-2], chain[-1]))
            pssm, psfm = None, None

        pdb_name = db.get_pdb_name(pdb_filename)
        key = pdb_name + '-' + chain[-2] + '-' + chain[-1]
        pos_to_res = pickle.load(open(id_filename, 'rb'))[key]

        if pssm is not None:  # Skip chains for which no PSSM could be generated
            pssm['pdb_name'] = db.get_pdb_name(pdb_filename)
            pssm['model'] = chain[0]
            pssm['chain'] = chain[1]
            pssm['residue'] = pos_to_res
            pssms.append(pssm)
    pssms = pd.concat(pssms)
    return pssms
Code example #24
File: conservation.py Project: amorehead/atom3
def map_all_pssms(pkl_dataset, pruned_dataset, blastdb, output_dir, num_cpus,
                  source_type, rank, size):
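    """Generate PSSM features for this MPI rank's share of the structures
    not yet present in output_dir."""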
    ext = '.pkl'
    if source_type.lower() == 'rcsb':
        # Filter out pairs that did not survive pruning previously to reduce complexity
        pruned_pdb_names = [
            db.get_pdb_name(filename)
            for filename in db.get_structures_filenames(pruned_dataset,
                                                        extension='.dill')
        ]
        requested_filenames = [
            os.path.join(pkl_dataset,
                         db.get_pdb_code(pruned_pdb_name)[1:3],
                         pruned_pdb_name.split('_')[0] + ext)
            for pruned_pdb_name in pruned_pdb_names
        ]
    else:  # DB5 does not employ pair pruning, so there are no pairs to filter
        requested_filenames = [
            filename for filename in db.get_structures_filenames(pkl_dataset,
                                                                 extension=ext)
        ]

    # Filter DB5 filenames to unbound type and get all work filenames
    requested_filenames = [
        filename for filename in requested_filenames
        if (source_type.lower() == 'db5' and '_u_' in filename) or
        (source_type.lower() in ['rcsb', 'evcoupling', 'casp_capri'])
    ]
    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    produced_filenames = db.get_structures_filenames(output_dir,
                                                     extension='.pkl')
    produced_keys = [db.get_pdb_name(x) for x in produced_filenames]
    work_keys = [key for key in requested_keys if key not in produced_keys]
    if source_type.lower() == 'rcsb' or source_type.lower() == 'casp_capri':
        work_filenames = [
            os.path.join(pkl_dataset,
                         db.get_pdb_code(work_key)[1:3], work_key + ext)
            for work_key in work_keys
        ]
    else:
        work_filenames = [
            os.path.join(pkl_dataset,
                         db.get_pdb_code(work_key)[1:3].upper(),
                         work_key + ext) for work_key in work_keys
        ]

    # Remove any duplicate filenames
    work_filenames = list(set(work_filenames))
    # Reserve an equally-sized portion of the full workload for a given rank in the MPI world
    work_filename_rank_batches = slice_list(work_filenames, size)
    work_filenames = work_filename_rank_batches[rank]

    logging.info(
        "{:} requested keys, {:} produced keys, {:} work filenames".format(
            len(requested_keys), len(produced_keys), len(work_filenames)))

    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir, exist_ok=True)
        output_filenames.append(sub_dir + '/' + db.get_pdb_name(pdb_filename) +
                                ".pkl")

    inputs = [(key, blastdb, output)
              for key, output in zip(work_filenames, output_filenames)]
    par.submit_jobs(map_pssms, inputs, num_cpus)