Ejemplo n.º 1
0
    def test_clean_pdb_and_get_chain(self):
        """Clean a PDB file down to chain A and compare it to a reference file.

        For each ``(input, expected)`` pair the input structure is written out
        through a ``CleanPDB(keep_chains='A')`` selection, and three things are
        checked: the output file name, that the output matches the stored
        reference file, and that the output differs from the input file.
        """
        files = [('1kf6.pdb', '1kf6_clean_chainA_tester.pdb')]
        outdir = op.join('test_files', 'out')
        working_dir = 'test_structures'
        out_suffix = '_clean_chainA'
        custom_clean = CleanPDB(keep_chains='A')

        for infile, outfile in files:
            outfile_new = '{}{}.pdb'.format(op.splitext(infile)[0], out_suffix)
            infile_path = op.join(working_dir, infile)

            my_pdb = StructureIO(infile_path)
            default_cleaned_pdb = my_pdb.write_pdb(
                custom_selection=custom_clean,
                out_suffix=out_suffix,
                out_dir=outdir,
                force_rerun=True)
            default_cleaned_pdb_basename = op.basename(default_cleaned_pdb)

            # test if the filename is correct
            self.assertEqual(default_cleaned_pdb_basename, outfile_new)

            # Read every file inside a context manager so the handles are
            # closed deterministically; the cleaned output is read only once.
            with open(default_cleaned_pdb, 'r') as cleaned_handle:
                cleaned_contents = cleaned_handle.read()
            with open(op.join(working_dir, outfile), 'r') as expected_handle:
                expected_contents = expected_handle.read()

            # test if the file contents are equal
            self.assertEqual(cleaned_contents, expected_contents)

            with open(infile_path, 'r') as original_handle:
                original_contents = original_handle.read()

            # test that the file does not equal the original file
            self.assertNotEqual(cleaned_contents, original_contents)
Ejemplo n.º 2
0
def clean_pdb(pdb_file,
              out_suffix='_clean',
              outdir=None,
              force_rerun=False,
              remove_atom_alt=True,
              keep_atom_alt_id='A',
              remove_atom_hydrogen=True,
              add_atom_occ=True,
              remove_res_hetero=True,
              keep_chemicals=None,
              keep_res_only=None,
              add_chain_id_if_empty='X',
              keep_chains=None):
    """Write a cleaned copy of a PDB file and return its path.

    The expected output path is built from ``pdb_file``, ``out_suffix`` and
    ``outdir``. When ``ssbio.utils.force_rerun`` reports that no run is needed
    for that path, the path is returned without writing anything. Otherwise
    the structure is loaded and written through a ``CleanPDB`` selection
    configured with the options below.

    Args:
        pdb_file (str): Path to input PDB file
        out_suffix (str): Suffix to append to original filename
        outdir (str): Path to output directory
        force_rerun (bool): If structure should be re-cleaned if a clean file exists already
        remove_atom_alt (bool): Remove alternate positions
        keep_atom_alt_id (str): If removing alternate positions, which alternate ID to keep
        remove_atom_hydrogen (bool): Remove hydrogen atoms
        add_atom_occ (bool): Add atom occupancy fields if not present
        remove_res_hetero (bool): Remove all HETATMs
        keep_chemicals (str, list): If removing HETATMs, keep specified chemical names
        keep_res_only (str, list): Keep ONLY specified resnames, deletes everything else!
        add_chain_id_if_empty (str): Add a chain ID if not present
        keep_chains (str, list): Keep only these chains

    Returns:
        str: Path to cleaned PDB file

    """
    expected_path = ssbio.utils.outfile_maker(inname=pdb_file,
                                              append_to_name=out_suffix,
                                              outdir=outdir,
                                              outext='.pdb')

    # Nothing to redo: hand back the path the cleaned file is expected at
    if not ssbio.utils.force_rerun(flag=force_rerun, outfile=expected_path):
        return expected_path

    structure = StructureIO(pdb_file)

    # Collect the cleaning options in one place before building the selector
    cleaner_options = dict(remove_atom_alt=remove_atom_alt,
                           remove_atom_hydrogen=remove_atom_hydrogen,
                           keep_atom_alt_id=keep_atom_alt_id,
                           add_atom_occ=add_atom_occ,
                           remove_res_hetero=remove_res_hetero,
                           keep_res_only=keep_res_only,
                           add_chain_id_if_empty=add_chain_id_if_empty,
                           keep_chains=keep_chains,
                           keep_chemicals=keep_chemicals)
    cleaner = CleanPDB(**cleaner_options)

    return structure.write_pdb(out_suffix=out_suffix,
                               out_dir=outdir,
                               custom_selection=cleaner,
                               force_rerun=force_rerun)
Ejemplo n.º 3
0
    def copy_results(self,
                     copy_to_dir,
                     rename_model_to=None,
                     force_rerun=False):
        """Copy the I-TASSER model and its associated result files to a new folder.

        The model structure is written (through a default ``CleanPDB``
        selection) to ``<copy_to_dir>/<name>.pdb`` and becomes the new
        structure path of this object. Every file referenced by an attribute
        named in ``_attrs_to_copy`` is copied into
        ``<copy_to_dir>/<name>_itasser`` and the attribute is updated to the
        new location. Nothing happens when there is no structure path.

        Args:
            copy_to_dir (str): Directory to copy the minimal set of results per sequence.
            rename_model_to (str): New file name (without extension); defaults
                to ``self.model_to_use``.
            force_rerun (bool): If existing models and results should be overwritten.

        """
        # Fall back to the model's own name when no new name is requested
        rename_model_to = rename_model_to or self.model_to_use
        new_model_path = op.join(copy_to_dir, '{}.pdb'.format(rename_model_to))

        # Without a structure there is nothing to copy
        if not self.structure_path:
            return

        if ssbio.utils.force_rerun(flag=force_rerun, outfile=new_model_path):
            # Clean and save it
            cleaner = CleanPDB()
            source_structure = StructureIO(self.structure_path)
            new_model_path = source_structure.write_pdb(
                custom_selection=cleaner,
                custom_name=rename_model_to,
                out_dir=copy_to_dir,
                force_rerun=force_rerun)

        # Point this object at the (new or pre-existing) clean file
        self.load_structure_path(structure_path=new_model_path,
                                 file_type='pdb')

        # The remaining modeling results live in their own sub-folder
        results_dir = op.join(copy_to_dir,
                              '{}_itasser'.format(rename_model_to))
        if not op.exists(results_dir):
            os.mkdir(results_dir)

        for attr_name in self._attrs_to_copy:
            source_path = getattr(self, attr_name)
            target_path = op.join(results_dir, op.basename(source_path))

            if not ssbio.utils.force_rerun(flag=force_rerun,
                                           outfile=target_path):
                log.debug('{}: file already exists'.format(target_path))
            else:
                shutil.copy2(source_path, target_path)
                log.debug('{}: copied from {}'.format(
                    target_path, source_path))

            # The attribute always ends up pointing at the copied location
            setattr(self, attr_name, target_path)
Ejemplo n.º 4
0
    def parse_structure(self, store_in_memory=False):
        """Load the structure file and register the chains of its first model.

        The file at ``structure_path`` is parsed with ``StructureIO``. The
        chain IDs of the first model are passed to ``add_chain_ids``, the
        first model is passed to ``get_structure_seqs``, and, only when no
        mapped chains are set yet, the chain IDs are also passed to
        ``add_mapped_chain_ids``. The ``parsed`` flag is set afterwards.

        Args:
            store_in_memory (bool): If the parsed structure object should be
                stored in the attribute ``structure``.

        Returns:
            The parsed ``StructureIO`` object, or None when no structure file
            is set (an error is logged in that case).

        """
        # TODO: perhaps add option to parse into ProDy object?
        if not self.structure_file:
            log.error('{}: no structure file, unable to parse'.format(self.id))
            return None

        parsed = StructureIO(self.structure_path, self.file_type)

        # Register every chain of the first model
        chain_ids = [chain.id for chain in parsed.first_model.child_list]
        self.add_chain_ids(chain_ids)
        self.get_structure_seqs(parsed.first_model)

        # Mapped chains default to all chains, but only if none were specified
        if not self.mapped_chains:
            self.add_mapped_chain_ids(chain_ids)

        self.parsed = True

        if store_in_memory:
            self.structure = parsed

        return parsed
Ejemplo n.º 5
0
    def parse_structure(self):
        """Load the structure file and register the chains of its first model.

        The file at ``structure_path`` is parsed with ``StructureIO``. The
        chain IDs of the first model are passed to ``add_chain_ids``, the
        first model is passed to ``get_structure_seqs``, and, only when no
        mapped chains are set yet, the chain IDs are also passed to
        ``add_mapped_chain_ids``.

        Returns:
            The parsed ``StructureIO`` object, or None when no structure path
            is set (an error is logged in that case).

        """
        # TODO: perhaps add option to parse into ProDy object?
        if not self.structure_path:
            log.error('{}: no structure file, unable to parse'.format(self.id))
            return None

        parsed = StructureIO(self.structure_path, self.file_type)

        # Register every chain of the first model
        chain_ids = [chain.id for chain in parsed.first_model.child_list]
        self.add_chain_ids(chain_ids)
        self.get_structure_seqs(parsed.first_model)

        # Mapped chains default to all chains, but only if none were specified
        if not self.mapped_chains:
            self.add_mapped_chain_ids(chain_ids)

        return parsed
Ejemplo n.º 6
0
def get_msms_df_on_file(pdb_file,
                        outfile=None,
                        outdir=None,
                        outext='_msms.df',
                        force_rerun=False):
    """Run MSMS (using Biopython) on a PDB file, reusing saved results if present.

    The results file is expected to be a CSV file of:
        chain: chain ID
        resnum: residue number (PDB numbering)
        icode: residue insertion code
        res_depth: average depth of all atoms in a residue
        ca_depth: depth of the alpha carbon atom

    Depths are in units of Angstroms. 1 A = 10^-10 m = 0.1 nm

    When ``ssbio.utils.force_rerun`` reports that a run is needed, the first
    model of the structure is passed to ``get_msms_df``. Otherwise the
    existing results file is loaded with ``pandas.read_csv``.

    Args:
        pdb_file: Path to PDB file
        outfile: Optional name of output file (without extension)
        outdir: Optional output directory
        outext: Optional suffix/extension appended to the results file name
        force_rerun: Rerun MSMS even if results exist already

    Returns:
        Pandas DataFrame: ResidueDepth property_dict, reformatted

    """
    # Create the output file name
    outfile = ssbio.utils.outfile_maker(inname=pdb_file,
                                        outname=outfile,
                                        outdir=outdir,
                                        outext=outext)

    if ssbio.utils.force_rerun(flag=force_rerun, outfile=outfile):
        # Load the structure
        my_structure = StructureIO(pdb_file)
        model = my_structure.first_model
        # NOTE(review): ``outfile`` is by now the path built by outfile_maker,
        # yet it is passed on together with ``outdir`` and ``outext``. Confirm
        # that get_msms_df does not apply the directory/extension a second time.
        df = get_msms_df(model,
                         pdb_file,
                         outfile=outfile,
                         outdir=outdir,
                         outext=outext,
                         force_rerun=force_rerun)
    else:
        log.debug(
            '{}: already ran MSMS and force_rerun={}, loading results'.format(
                outfile, force_rerun))
        df = pd.read_csv(outfile, index_col=0)

    return df
Ejemplo n.º 7
0
    # Positional argument: the comma-separated mutations to apply
    p.add_argument(
        'mutations',
        help=
        'Mutations in the form of Chain1.ResNum1.Mutation1,Chain2.ResNum2.Mutation2. Example: A.4.TYR,B.4.TYR'
    )
    # Suffix used when naming the mutated output file
    p.add_argument('--outsuffix',
                   '-o',
                   default='_mutated',
                   help='Suffix appended to PDB file')
    # Optional flag: clean the structure before mutating it
    p.add_argument('--clean',
                   '-c',
                   action='store_true',
                   help='Clean PDB and keep only chain with mutation')
    args = p.parse_args()

    mutations = parse_mutation_input(args.mutations)

    my_pdb = StructureIO(args.infile)
    if args.clean:
        # Keep only the chains named by the mutations; presumably the first
        # element of each parsed mutation is its chain ID (verify against
        # parse_mutation_input).
        my_cleaner = CleanPDB(keep_chains=[x[0] for x in mutations])
        # The cleaned intermediate is written to the system temp directory and
        # then reloaded so that the mutation is applied to the cleaned file.
        my_clean_pdb = my_pdb.write_pdb(out_suffix='_clean',
                                        out_dir=tempfile.gettempdir(),
                                        custom_selection=my_cleaner)
        my_pdb = StructureIO(my_clean_pdb)

    # Apply the mutations while writing the structure out
    # NOTE(review): the output directory 'mutated_pdbs' is hard-coded and is
    # not created here; confirm it exists or that write_pdb creates it.
    my_mutation = MutatePDB(mutations)
    my_mutated_pdb = my_pdb.write_pdb(out_suffix=args.outsuffix,
                                      out_dir='mutated_pdbs',
                                      custom_selection=my_mutation)
    print('Mutated PDB at: {}'.format(my_mutated_pdb))
Ejemplo n.º 8
0
    # Make sure the output directory exists before any file is written
    if not op.isdir(args.outdir):
        os.mkdir(args.outdir)

    infiles = ssbio.utils.input_list_parser(args.infile)

    # Clean every input structure, showing a progress bar
    for pdb in tqdm(infiles):

        # Path the cleaned file is expected at; used only for the rerun check
        outfile = ssbio.utils.outfile_maker(inname=pdb,
                                            append_to_name=args.outsuffix,
                                            outdir=args.outdir,
                                            outext='.pdb')

        if ssbio.utils.force_rerun(flag=args.force, outfile=outfile):

            my_pdb = StructureIO(pdb)
            # NOTE(review): the ``remove_*`` options are fed from arguments
            # named ``keep*``. Presumably those flags are defined with
            # store_false semantics; confirm against the parser setup.
            my_cleaner = CleanPDB(remove_atom_alt=args.keepalt,
                                  remove_atom_hydrogen=args.keephydro,
                                  keep_atom_alt_id='A',
                                  add_atom_occ=True,
                                  remove_res_hetero=args.keephetero,
                                  add_chain_id_if_empty='X',
                                  keep_chains=args.chain)

            # The returned path is not used afterwards in this snippet
            my_clean_pdb = my_pdb.write_pdb(out_suffix=args.outsuffix,
                                            out_dir=args.outdir,
                                            custom_selection=my_cleaner,
                                            force_rerun=args.force)

    print('Clean PDBs at: {}'.format(args.outdir))
Ejemplo n.º 9
0
def hse_output(pdb_file, file_type):
    """Run Biopython's solvent exposure calculations on a structure's first model.

    The solvent exposure of an amino acid residue is important for analyzing,
    understanding and predicting aspects of protein structure and function.
    Half-sphere exposure (HSE), introduced by Hamelryck et al., is a 2D
    measure that conceptually divides the sphere around a residue into two
    halves (HSE-up and HSE-down), giving a more detailed description of the
    residue's spatial neighborhood than a single exposure value.

    http://onlinelibrary.wiley.com/doi/10.1002/prot.20379/abstract

    Three calculations are run on the first model: ``HSExposureCA``,
    ``HSExposureCB`` and ``ExposureCN``.

    Args:
        pdb_file (str): Path to the structure file
        file_type (str): Type of the structure file, passed on to
            ``StructureIO`` so that the file is read with the matching parser

    Returns:
        None: the calculations are run but their results are currently not
        returned to the caller.

    """
    # Get the first model; the declared file type is forwarded rather than
    # being ignored, matching how parse_structure calls StructureIO
    my_structure = StructureIO(pdb_file, file_type)
    model = my_structure.first_model

    # Calculate HSEalpha
    HSExposureCA(model)
    # Calculate HSEbeta
    HSExposureCB(model)
    # Calculate classical coordination number
    ExposureCN(model)

    # NOTE(review): nothing is returned; presumably this function is an
    # unfinished stub. Confirm what callers expect before adding a result.
    return None


# def magni(a, b, c):
#     """Calculate the magnitude of distance vector
#     """
#     return pow((pow(a, 2) + pow(b, 2) + pow(c, 2)), 1.0 / 2.0)

# @cachetools.func.ttl_cache(maxsize=256)
# def calculate_res_distance(res_1, res_2, pdb_file):
#     """Calculate distance of one residue number to another in a PDB file
#
#     Args:
#         res_1: Residue number 1
#         res_2: Residue number 2
#         pdb_file: Path to PDB file
#
#     Returns:
#
#     """
#
#     my_structure = StructureIO(pdb_file)
#     model = my_structure.first_model
#
#     res_list = PDB.Selection.unfold_entities(model, 'R')
#
#     ires_list = []
#     res_chk_1 = ''
#     res_chk_2 = ''
#     for j in res_list:
#         if j.id[1] in [res_1, res_2] and j.resname != 'HOH':
#             ires_list.append(j)
#             if res_chk_1 == '' and res_chk_2 == '':
#                 res_chk_1 = j.id[1]
#             else:
#                 res_chk_2 = j.id[1]
#
#     paired = ssbio.utils.combinations(ires_list, 2)
#     try:
#         for k in paired:
#             chainA = PDB.Selection.unfold_entities(k[0], 'C')[0]
#             chainB = PDB.Selection.unfold_entities(k[1], 'C')[0]
#             vec = list(
#                 np.array([x.get_coord() for x in k[0]]).mean(axis=0) - np.array([x.get_coord() for x in k[1]]).mean(
#                     axis=0))
#             distance = magni(vec[0], vec[1], vec[2])
#
#         return distance
#     except UnboundLocalError:
#         log.error("Unknown interaction")
#         return None
Ejemplo n.º 10
0
def get_structure_seqs(pdb_file, file_type):
    """Get a dictionary of the sequences of each chain in a structure's first model.

    Only standard amino acid residues contribute a letter. Special cases:
        - Gaps in residue numbering. When the residue number jumps by more
          than one (counting starts from residue number 1), the missing
          positions are filled with "X". Residues that are not standard amino
          acids (ie. selenomethionine, HETATMs) are skipped, so they show up
          as such "X" gaps once the next standard residue is reached.
        - Insertion codes. In the case of residue numbers like "15A", "15B",
          both residues are written out. Example: 9LPR

    Args:
        pdb_file (str): Path to the structure file
        file_type (str): Type of the structure file, passed on to
            ``StructureIO`` so that the file is read with the matching parser

    Returns:
        dict: Dictionary of:
        {chain_id: sequence}

    """

    # TODO: Please check out capitalization of chain IDs in mmcif files. example: 5afi - chain "l" is present but
    # it seems like biopython capitalizes it to chain L

    # Get the first model; the declared file type is forwarded rather than
    # being ignored, matching how parse_structure calls StructureIO
    my_structure = StructureIO(pdb_file, file_type)
    model = my_structure.first_model

    structure_seqs = {}

    # Loop over each chain of the structure
    for chain in model:
        # Sequence pieces are collected and joined once at the end
        seq_parts = []
        # Residue number the next residue is compared against
        tracker = 0

        for res in chain.get_residues():
            # Non-standard residues are skipped here; the numbering gap they
            # leave is filled with X's when the next standard residue arrives
            if not Polypeptide.is_aa(res, standard=True):
                continue

            full_id = res.get_full_id()
            end_tracker = full_id[3][1]  # residue number
            i_code = full_id[3][2]       # insertion code (' ' when absent)
            aa = Polypeptide.three_to_one(res.get_resname())

            # Residue number is not the direct successor of the previous one
            if end_tracker != (tracker + 1):
                if i_code != ' ':
                    # Inserted residue: write it out without any gap filling
                    seq_parts.append(aa)
                    tracker = end_tracker + 1
                    continue
                # A non-positive gap length yields an empty string, so
                # repeated or backwards numbering adds no X's
                seq_parts.append('X' * (end_tracker - tracker - 1))

            seq_parts.append(aa)
            tracker = end_tracker

        structure_seqs[chain.get_id()] = ''.join(seq_parts)

    return structure_seqs