Example #1
    def load_PDB_to_system(self, filename = None):
        parser    = PDBParser(QUIET=True)
        structure = parser.get_structure('X', filename)
        self.residues = []

        for model in structure:

            c = 1
            for chain in model:

                self.id   = 1
                #self.name = "protein"

                n = 1
                r = 1

                for pdb_residue in chain:
                    residue = Residue(id=r,  name=pdb_residue.resname)
                    for pdb_atom in pdb_residue:

                        atom = Atom(id=n,
                                    name=pdb_atom.name,
                                    pos=pdb_atom.coord)
                        n += 1

                        residue.atoms.append(atom)
                    self.residues.append(residue)
                    r += 1
    def test_to_string(self):
        """Write structure as string"""

        stream = StringIO()
        stream.write(dummy_1)
        stream.seek(0)

        mol = MolProcesser(stream)
        n_models = sum(1 for _ in mol.structure.get_models())  # 1
        n_chains = sum(1 for _ in mol.structure.get_chains())  # 2
        n_resids = sum(1 for _ in mol.structure.get_residues())  # 2
        n_atoms = sum(1 for _ in mol.structure.get_atoms())  # 15
        has_docc = sum(1 for a in mol.structure.get_atoms() if a.is_disordered())
        has_hatm = sum(1 for r in mol.structure.get_residues() if r.id[0] != ' ')

        stream_2 = StringIO()
        stream_2.write(mol.tostring)
        stream_2.seek(0)

        p = PDBParser(QUIET=1)
        mol_2 = p.get_structure('xyz', stream_2)

        n_models_2 = sum(1 for _ in mol_2.get_models())  # 1
        n_resids_2 = sum(1 for _ in mol_2.get_residues())  # 2
        n_atoms_2 = sum(1 for _ in mol_2.get_atoms())  # 15
        has_docc_2 = sum(1 for a in mol_2.get_atoms() if a.is_disordered())
        has_hatm_2 = sum(1 for r in mol_2.get_residues() if r.id[0] != ' ')

        self.assertEqual(n_models, n_models_2)
        self.assertEqual(n_resids, n_resids_2)
        self.assertEqual(n_atoms, n_atoms_2)
        self.assertEqual(has_docc, has_docc_2)
        self.assertEqual(has_hatm, has_hatm_2)
Example #3
 def test_get_sequence_from_pdb_structure(self):
     pdb_file = "./test.pdb"
     p = PDBParser()
     structure = p.get_structure('test', pdb_file)
     structure_of_chain = structure[0]['A']
     sequence = construct_protein_graph.get_sequence_from_pdb_structure(structure_of_chain)
     self.assertEqual("VNIKTNPFK", sequence)
Example #4
def selectChain(ifn, ofn, chainID='A'):
    parser = PDBParser()
    structure = parser.get_structure('x', ifn)

    class ChainSelector():
        def __init__(self, chainID=chainID):
            self.chainID = chainID

        def accept_chain(self, chain):
            if chain.get_id() == self.chainID:
                return 1
            return 0

        def accept_model(self, model):
            return 1

        def accept_residue(self, residue):
            return 1

        def accept_atom(self, atom):
            return 1

    sel = ChainSelector(chainID)
    io = PDBIO()
    io.set_structure(structure)
    io.save(ofn, sel)
Example #5
def RemoveLigandsOneBioUnit(biounit, ligandlist):
    # ligandlist is a residue list with residue chain id, name and residue number
    p = PDBParser(PERMISSIVE = 1)
    pdbname= biounit.split("/")[-1]
    try:
        models = p.get_structure(pdbname, biounit)
    except:
        return None
    #for model in models:
    #    for chain in model:
    #        for residue in chain:
    #            print residue
    for rligand in ligandlist:
        for model in models:
            for chain in model:
                for residue in list(chain):
                    if chain.id == rligand["ChainID"] and int(rligand["ResNum"]) == residue.id[1]:
                        chain.detach_child(residue.id)
                    elif residue.id[0] == "W":
                        chain.detach_child(residue.id)
                    elif len(rligand["LigName"].split()) > 1 and int(rligand["ResNum"]) <= residue.id[1]:
                        LongLigand(chain, residue, rligand)
    io = PDBIO()
    io.set_structure(models)
    filepath = os.path.join(BIOSTRDIR, models.id)
    io.save(filepath)
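The ligandlist argument above is expected to be a list of records carrying a chain ID, ligand name and residue number. A minimal sketch of a call, assuming dictionary-style records and a made-up biounit path (the LongLigand helper and BIOSTRDIR constant come from the surrounding module and are not defined here):

# Hypothetical input; the path and ligand entries are placeholders for illustration.
ligands = [
    {"ChainID": "A", "LigName": "HEM", "ResNum": "201"},
    {"ChainID": "B", "LigName": "NAG NAG BMA", "ResNum": "301"},
]
RemoveLigandsOneBioUnit("biounits/1abc.pdb1", ligands)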
Example #6
    def test_1_warnings(self):
        """Check warnings: Parse a flawed PDB file in permissive mode.

        NB: The try/finally block is adapted from the warnings.catch_warnings
        context manager in the Python 2.6 standard library.
        """
        warnings.simplefilter('always', PDBConstructionWarning)
        try:
            # Equivalent to warnings.catch_warnings -- hackmagic
            orig_showwarning = warnings.showwarning
            all_warns = []
            def showwarning(*args, **kwargs):
                all_warns.append(args[0])
            warnings.showwarning = showwarning
            # Trigger warnings
            p = PDBParser(PERMISSIVE=True)
            p.get_structure("example", "PDB/a_structure.pdb")
            for wrn, msg in zip(all_warns, [
                # Expected warning messages:
                'Atom N defined twice in residue <Residue ARG het=  resseq=2 icode= > at line 19.',
                'disordered atom found with blank altloc before line 31.',
                "Residue (' ', 4, ' ') redefined at line 41.",
                "Blank altlocs in duplicate residue SER (' ', 4, ' ') at line 41.",
                "Residue (' ', 10, ' ') redefined at line 73.",
                "Residue (' ', 14, ' ') redefined at line 104.",
                "Residue (' ', 16, ' ') redefined at line 133.",
                "Residue (' ', 80, ' ') redefined at line 631.",
                "Residue (' ', 81, ' ') redefined at line 644.",
                'Atom O defined twice in residue <Residue HOH het=W resseq=67 icode= > at line 820.'
                ]):
                self.assertTrue(msg in str(wrn))
        finally:
            warnings.showwarning = orig_showwarning
Example #7
def chain2pos_scan_str(chain, pdb, mutation_set='a'):
  """
  Takes a chain ID and a model.PDBFile object, returns a string
  suitable as the PositionScan line for FoldX.
  """
  parser = PDBParser(PERMISSIVE=1)
  pdbfn = pdb.fullpath()
  struct = parser.get_structure(pdb.uuid, pdbfn)[0]
  #chains = pdb_extract_chain_seqs(struct)
  
  chainlist = Selection.unfold_entities(struct, 'C')
  
  position_scan_str = ''
  for c in chainlist:
    if c.id == chain:
      for r in c:
        try:
          aa = three_to_one(r.get_resname())
          resnum = r.id[1]
          position_scan_str += '%s%s%i%s,' % (aa, chain, resnum, mutation_set)
        except:
          # non-native amino acid or water
          pass


  position_scan_str = position_scan_str[:-1]
  
  return position_scan_str
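For a chain starting with residues MET 1, ASP 2 and ILE 3 and the default mutation_set, the string built above would look like 'MA1a,DA2a,IA3a'. Below is a hedged, self-contained sketch of the same idea that takes a plain PDB path instead of the project-specific model.PDBFile wrapper; the path is a placeholder, and three_to_one is imported from Bio.PDB.Polypeptide, where it lives in the Biopython versions these snippets target (it is deprecated in recent releases):

from Bio.PDB import PDBParser, Selection
from Bio.PDB.Polypeptide import three_to_one

def position_scan_line(pdb_path, chain_id, mutation_set='a'):
    """Build a FoldX PositionScan-style string from a PDB file (illustrative sketch)."""
    struct = PDBParser(PERMISSIVE=1, QUIET=True).get_structure('x', pdb_path)[0]
    parts = []
    for c in Selection.unfold_entities(struct, 'C'):
        if c.id != chain_id:
            continue
        for r in c:
            try:
                parts.append('%s%s%i%s' % (three_to_one(r.get_resname()), chain_id, r.id[1], mutation_set))
            except KeyError:
                # non-native amino acid or water
                pass
    return ','.join(parts)

print(position_scan_line('PDB/1A8O.pdb', 'A'))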
def parse_structure(path):
    """
    Parses a structure using Biopython's PDB/mmCIF Parser
    Verifies the integrity of the structure (gaps) and its
    suitability for the calculation (is it a complex?).
    """
    # setup logging
    logger = logging.getLogger('Prodigy')
    logger.info('[+] Reading structure file: {0}'.format(path))
    fname = os.path.basename(path)
    sname = '.'.join(fname.split('.')[:-1])
    s_ext = fname.split('.')[-1]

    _ext = {'pdb', 'ent', 'cif'}
    if s_ext not in _ext:
        raise IOError('[!] Structure format \'{0}\' is not supported. Use \'.pdb\' or \'.cif\'.'.format(s_ext))

    sparser = PDBParser(QUIET=1) if s_ext in {'pdb', 'ent'} else MMCIFParser()

    try:
        s = sparser.get_structure(sname, path)
    except Exception as exception:
        logger.error('[!] Structure \'{0}\' could not be parsed'.format(sname))
        raise Exception(exception)

    return (validate_structure(s),
            len(set([c.id for c in s.get_chains()])),
            len(list(s.get_residues())))
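A short usage sketch for the function above; the file name is a placeholder and validate_structure is assumed to be defined elsewhere in the module, as it is in the original code:

# Illustrative call only.
structure, n_chains, n_residues = parse_structure('complex.pdb')
print('[+] Parsed {0} chain(s) and {1} residue(s)'.format(n_chains, n_residues))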
 def test_NACCESS(self):
     """Test NACCESS generation from PDB"""
     p = PDBParser()
     pdbfile = "PDB/1A8O.pdb"
     model = p.get_structure("1A8O", pdbfile)[0]
     naccess = NACCESS(model, pdbfile)
     self.assertEqual(len(naccess), 66)
Example #10
 def test_dssp(self):
     """Test DSSP generation from PDB."""
     p = PDBParser()
     pdbfile = "PDB/2BEG.pdb"
     model = p.get_structure("2BEG", pdbfile)[0]
     dssp = DSSP(model, pdbfile)
     self.assertEqual(len(dssp), 130)
Example #11
    def run(self):
        mypath = self.getPath()
        lig_ifn = mypath.sdf
        prt_ifn = mypath.pdb

        lig_ext = os.path.basename(lig_ifn).split('.')[-1]
        lig = next(pybel.readfile(lig_ext, lig_ifn))
        lig.removeh()
        parser = PDBParser(QUIET=True)
        structure = parser.get_structure('prt', prt_ifn)

        typetable = OBTypeTable()
        typetable.SetFromType('INT')
        typetable.SetToType('SYB')

        dat = []
        atom_types = [typetable.Translate(atom.type) for atom in lig.atoms]
        atom_types = shuffle(atom_types)
        for residue in structure.get_residues():
            dists = residueDistances2LigandAtoms(residue, lig)
            dat.append({"dists": dists,
                        "atom_types": atom_types,
                        "residue": residue.get_resname()})

        to_write = json.dumps(dat, indent=4, separators=(',', ':'))
        with self.output().open('w') as ofs:
            ofs.write(to_write)
    def test_conversion(self):
        """Parse 1A8O.cif, write 1A8O.pdb, parse again and compare"""

        cif_parser = MMCIFParser(QUIET=1)
        cif_struct = cif_parser.get_structure("example", "PDB/1LCD.cif")

        pdb_writer = PDBIO()
        pdb_writer.set_structure(cif_struct)
        filenumber, filename = tempfile.mkstemp()
        pdb_writer.save(filename)

        pdb_parser = PDBParser(QUIET=1)
        pdb_struct = pdb_parser.get_structure('example_pdb', filename)

        # comparisons
        self.assertEqual(len(pdb_struct), len(cif_struct))

        pdb_atom_names = [a.name for a in pdb_struct.get_atoms()]
        cif_atom_names = [a.name for a in cif_struct.get_atoms()]
        self.assertEqual(len(pdb_atom_names), len(cif_atom_names))
        self.assertSequenceEqual(pdb_atom_names, cif_atom_names)

        pdb_atom_elems = [a.element for a in pdb_struct.get_atoms()]
        cif_atom_elems = [a.element for a in cif_struct.get_atoms()]
        self.assertSequenceEqual(pdb_atom_elems, cif_atom_elems)
def parse_freesasa_output(fpath):
    """
    Returns per-residue relative accessibility of side-chain and main-chain
    atoms as calculated by freesasa.
    """

    asa_data, rsa_data = {}, {}

    _rsa = rel_asa
    _bb = set(('CA', 'C', 'N', 'O'))

    P = PDBParser(QUIET=1)
    s = P.get_structure('bogus', fpath.name)
    for res in s.get_residues():
        res_id = (res.parent.id, res.resname, res.id[1])
        asa_mc, asa_sc, total_asa = 0, 0, 0
        for atom in res:
            aname = atom.name
            at_id = (res.parent.id, res.resname, res.id[1], aname)
            asa = atom.bfactor
            # if atom.name in _bb:
            #     asa_mc += asa
            # else:
            #     asa_sc += asa
            total_asa += asa
            asa_data[at_id] = asa

        rsa_data[res_id] = total_asa / _rsa['total'][res.resname]

    return asa_data, rsa_data
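The two dictionaries returned above are keyed by (chain id, residue name, residue number, atom name) for asa_data and (chain id, residue name, residue number) for rsa_data. A hedged consumption sketch; the path is a placeholder, rel_asa must be defined as in the surrounding module, and only fpath.name is used, so the freesasa-annotated PDB must be reachable by that bare name:

from pathlib import Path

asa, rsa = parse_freesasa_output(Path('complex.asa.pdb'))  # placeholder file
buried = [res_id for res_id, rel in rsa.items() if rel < 0.05]  # relative ASA below 5%
print('{0} residues below 5% relative ASA'.format(len(buried)))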
Example #14
	def __init__(self, table, pdb):
		table = table.reset_index(drop=True)
		struct = PDBParser().get_structure(table['pdb_id'][0], pdb)		
		table = table.fillna('')
		alpha_num = sum([1 for x in table['tcr_v_allele'].tolist() if x.find('TRA') != -1])
		beta_num = table.shape[0] - alpha_num
		table.insert(table.columns.get_loc('tcr_chain'), 'tcr_chain_name', ['alpha'] * alpha_num + ['beta'] * beta_num)
	
		print(table)
		self.__table = table
		self.__name = str(struct.get_id())		
		self.__struct = struct
		self.__chains = [chain.get_id() for chain in struct[0]]
		self.__regions = table.groupby(['tcr_chain_name', 'tcr_region'])
	
		# Dictionary of regions residues;
		# looks like : { ('alpha', 'CDR1') : [residue list],
		# ('alpha', 'CDR2') : [residue list], ... } 
		self.__regions_res = self.__regions.groups 	
		for key in self.__regions_res.keys():	
			self.__regions_res[key] = []
	
		# Pepdide residue list	
		self.__peptide = []				
	
		# Dictionaries with pairwise region matrices;
		# look like : { (('alpha', 'CDR1'), ('alpha', 'CDR2')) : dataframe, 
		# (('alpha', 'CDR1'), ('alpha', 'CDR3')) : dataframe, ... }
		self.__d_matrices = {}	
		self.__e_matrices = {}	
		self.verbose = True
		if not self.getRegionsResidues():
			print('SOME REGION WAS NOT FOUND IN PDB')
		if not self.definePeptideChain():
			print('PEPTIDE WAS NOT FOUND IN PDB')
 def test_3_bad_xyz(self):
     """Check error: Parse an entry with bad x,y,z value."""
     data = "ATOM      9  N   ASP A 152      21.554  34.953  27.691  1.00 19.26           N\n"
     parser = PDBParser(PERMISSIVE=False)
     s = parser.get_structure("example", StringIO(data))
     data = "ATOM      9  N   ASP A 152      21.ish  34.953  27.691  1.00 19.26           N\n"
     self.assertRaises(PDBConstructionException,
             parser.get_structure, "example", StringIO(data))       
Example #16
def main():
    p = PDBParser()
    filename = "pdb10gs.ent"
    models = p.get_structure("10GS", filename)
    for model in models:
        print(models[0])
        print(model.get_full_id())
        TestDSSP(models[0], filename)
Example #17
def main():
    p = PDBParser()
    filename = "test/10gs.bio1"
    models = p.get_structure("10gs", filename)
    for model in models:
        print(models[0])
        print(model.get_full_id())
        TestNACCESS(models[0], filename)
Example #18
def build_all_angles_model(pdb_filename):
    parser=PDBParser()
    structure=parser.get_structure('sample', \
                                    path.join(PDBdir, pdb_filename))
    model=structure[0]
    chain=model['A']
    model_structure_geo=[]
    prev="0"
    N_prev="0"
    CA_prev="0"
    CO_prev="0"
    prev_res=""
    rad=180.0/math.pi
    for res in chain:
        if(res.get_resname() in resdict.keys()):
            geo=Geometry.geometry(resdict[res.get_resname()])
            if(prev=="0"):
                N_prev=res['N']
                CA_prev=res['CA']
                C_prev=res['C']
                prev="1"
            else:
                n1=N_prev.get_vector()
                ca1=CA_prev.get_vector()
                c1=C_prev.get_vector()
                                
                C_curr=res['C']
                N_curr=res['N']
                CA_curr=res['CA']
                                                
                c=C_curr.get_vector()
                n=N_curr.get_vector()
                ca=CA_curr.get_vector()

                geo.CA_C_N_angle=calc_angle(ca1, c1, n)*rad
                geo.C_N_CA_angle=calc_angle(c1, n, ca)*rad

                psi= calc_dihedral(n1, ca1, c1, n) ##goes to current res
                omega= calc_dihedral(ca1, c1, n, ca) ##goes to current res
                phi= calc_dihedral(c1, n, ca, c) ##goes to current res

                geo.psi_im1=psi*rad
                geo.omega=omega*rad
                geo.phi=phi*rad

                geo.N_CA_C_angle= calc_angle(n, ca, c)*rad
                ##geo.CA_C_O_angle= calc_angle(ca, c, o)*rad

                ##geo.N_CA_C_O= calc_dihedral(n, ca, c, o)*rad

                N_prev=res['N']
                CA_prev=res['CA']
                C_prev=res['C']
                ##O_prev=res['O']
                                
                        
            model_structure_geo.append(geo)
    return model_structure_geo
def pdb2dfromactivesite(pdb_fh,active_sites=[]):
    """
    This calculates distances between each ligand atom or optionally provided amino acids (sources) and each residue in the protein.
    
    :param pdb_fh: path to .pdb file.
    :param active_sites: optional list of residue numbers as sources. 
    :returns dfromligands: pandas table with distances from ligand
    """
    junk_residues = ["HOH"," MG","CA"," NA","SO4","IOD","NA","CL","GOL","PO4"]
    pdb_parser=PDBParser()
    pdb_data=pdb_parser.get_structure("pdb_name",pdb_fh)
    model = pdb_data[0]
    chainA = model["A"] #only a chain
    residues   = list(chainA.get_residues())
    ligands_residue_objs=[]
    for residue in chainA:
        if not residue.get_resname() in junk_residues:
            if not residue.get_resname() in aas_21_3letter: #only aas 
                ligands_residue_objs.append(residue)
            elif residue.id[1] in active_sites:
                ligands_residue_objs.append(residue)
            
    dfromligands=pd.DataFrame()
    for ligandi in range(len(ligands_residue_objs)):
        ligand_residue_obj=ligands_residue_objs[ligandi]
        for ligand_atom_obj in ligand_residue_obj:
            for residue in chainA:
                if residue.get_resname() in aas_21_3letter: #only aas 
                    dfromligands.loc[residue.id[1],"ref_pdb"]=residue.get_resname()
                    if not ligand_residue_obj.get_resname() in aas_21_3letter:
                        dfromligands.loc[residue.id[1],"Distance from Ligand: %s (ATOM: %s)" % \
                                         (ligand_residue_obj.get_resname(),ligand_atom_obj.get_name())]\
                        =ligand_residue_obj[ligand_atom_obj.get_name()]-residue["CA"]
                    else:
                        dfromligands.loc[residue.id[1],"Distance from active site residue: %s %d (ATOM: %s)" % \
                                         (ligand_residue_obj.get_resname(),ligand_residue_obj.get_id()[1],\
                                          ligand_atom_obj.get_name())]\
                        =ligand_residue_obj[ligand_atom_obj.get_name()]-residue["CA"]

    dfromligands.index.name="aasi"
    if "ref_pdb" in dfromligands:
        del dfromligands["ref_pdb"]
    #average and minimum distances
    cols_all=dfromligands.columns.tolist()
    for moltype in ['Distance from Ligand:','Distance from active site residue:']:
        cols_moltype=[c for c in cols_all if moltype in c]
        if len(cols_all)>0:
            dfromligands.loc[:,'%s average' % moltype]=dfromligands.loc[:,cols_moltype].T.mean()
            dfromligands.loc[:,'%s minimum' % moltype]=dfromligands.loc[:,cols_moltype].T.min()
            mols=np.unique([c[c.find(moltype):c.find(' (ATOM')] for c in cols_moltype])
            if len(mols)>1:
                for mol in mols:
                    cols_mol=[c for c in cols_moltype if mol in c]
                    dfromligands.loc[:,'%s: average' % mol]=dfromligands.loc[:,cols_mol].T.mean()
                    dfromligands.loc[:,'%s: minimum' % mol]=dfromligands.loc[:,cols_mol].T.min()    

    return dfromligands
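A brief usage sketch for the function above; the path and residue numbers are placeholders, and aas_21_3letter must be defined as in the surrounding module:

# Illustrative call only; distances come from Bio.PDB atom subtraction, in Angstrom.
dists = pdb2dfromactivesite('receptor.pdb', active_sites=[57, 102, 195])
print(dists.filter(like='minimum').head())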
def getPdbAtomsBySerialNum(pdb_fn, serial_nums):
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure('x', pdb_fn)
    atoms = {atom.serial_number : atom for atom in structure.get_atoms()}
    re_ordered = []
    for num in serial_nums:
        re_ordered.append(atoms[num])

    return re_ordered
 def test_fragment_mapper(self):
     """Self test for FragmentMapper module."""
     p = PDBParser()
     pdb1 = "PDB/1A8O.pdb"
     s = p.get_structure("X", pdb1)
     m = s[0]
     fm = FragmentMapper(m, 10, 5, "PDB")
     for r in Selection.unfold_entities(m, "R"):
         if r in fm:
             self.assertTrue(str(fm[r]).startswith("<Fragment length=5 id="))
	def _get_ligand_name(self):
		p = PDBParser(QUIET=True)
		ligand = p.get_structure('ligand', self.out_filename)
		chain = ligand[0]['A']
		for residue in chain.get_residues():
			if residue.resname in self.ignore:
				pass
			else:
				self.ligands.append(residue.resname)
		print "Ligands found: ", self.ligands
	def _get_resmapping(self):
		res_mapping = []
		filepath = self._get_filepath('', pdb_file=True)
		p = PDBParser(QUIET=True)
		structure = p.get_structure('protein', filepath)
		chain = structure[0]['A']
		for residue in chain.get_residues():
			if str(residue.id[1]) in self.resnums:
				res_mapping.append((self.codes[residue.resname], residue.id[1]))
		return res_mapping
 def check_msms(self, prot_file, first_100_residues):
     p = PDBParser()
     with warnings.catch_warnings():
         warnings.simplefilter("ignore", PDBConstructionWarning)
         s = p.get_structure("X", prot_file)
     model = s[0]
     rd = ResidueDepth(model)
     res_chain = ''
     for item in rd.property_list[:100]:
         res_chain = res_chain + item[0].get_resname()
     self.assertEqual(res_chain, first_100_residues)
 def test_empty(self):
     """Parse an empty file."""
     parser = PDBParser()
     filenumber, filename = tempfile.mkstemp()
     os.close(filenumber)
     try:
         struct = parser.get_structure('MT', filename)
         # Structure has no children (models)
         self.assertFalse(len(struct))
     finally:
         os.remove(filename)
Example #26
def experimental_method(pdb_path):
    """
    Get a string representation of the experimental method used for the file
    of interest, as read from the PDB header.

    :param pdb_path: Path to PDB file
    :return:
    """
    parser = PDBParser(get_header=True)
    parser.get_structure('', pdb_path)

    return parser.get_header()['structure_method']
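A quick, hedged usage sketch; 'PDB/1A8O.pdb' is reused from other examples in this collection, and the returned string depends on that file's header (e.g. 'x-ray diffraction'):

method = experimental_method('PDB/1A8O.pdb')
print(method)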
def read_pdb(pdbfile):
    '''
    Read a PDB file into a structure object with Bio.PDB

    :param pdbfile: path to pdb file
    :return:  structure
    '''

    parser = PDBParser()
    structure = parser.get_structure('pdb', pdbfile)

    return structure
def open_pdb(pdbfn):
    """Open pdb with Biopython.

    Args:
       pdbfn (str): a path to a PDB structure

    Returns:
       Structure: a Bio.PDB structure object

    """
    parser = PDBParser()
    return parser.get_structure('', pdbfn)
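A minimal usage sketch; the file name is a placeholder:

structure = open_pdb('1xyz.pdb')
print(structure)        # Bio.PDB Structure object
print(len(structure))   # number of models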
Example #29
 def test_c_n(self):
     """Extract polypeptides from 1A80."""
     warnings.resetwarnings()
     parser = PDBParser(PERMISSIVE=False)
     structure = parser.get_structure("example", "PDB/1A8O.pdb")
     self.assertEqual(len(structure), 1)
     for ppbuild in [PPBuilder(), CaPPBuilder()]:
         #==========================================================
         #First try allowing non-standard amino acids,
         polypeptides = ppbuild.build_peptides(structure[0], False)
         self.assertEqual(len(polypeptides), 1)
         pp = polypeptides[0]
         # Check the start and end positions
         self.assertEqual(pp[0].get_id()[1], 151)
         self.assertEqual(pp[-1].get_id()[1], 220)
         # Check the sequence
         s = pp.get_sequence()
         self.assertTrue(isinstance(s, Seq))
         self.assertEqual(s.alphabet, generic_protein)
         #Here non-standard MSE are shown as M
         self.assertEqual("MDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQ"
                          "NANPDCKTILKALGPGATLEEMMTACQG", str(s))
         #==========================================================
         #Now try strict version with only standard amino acids
         #Should ignore MSE 151 at start, and then break the chain
         #at MSE 185, and MSE 214,215
         polypeptides = ppbuild.build_peptides(structure[0], True)
         self.assertEqual(len(polypeptides), 3)
         #First fragment
         pp = polypeptides[0]
         self.assertEqual(pp[0].get_id()[1], 152)
         self.assertEqual(pp[-1].get_id()[1], 184)
         s = pp.get_sequence()
         self.assertTrue(isinstance(s, Seq))
         self.assertEqual(s.alphabet, generic_protein)
         self.assertEqual("DIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNW", str(s))
         #Second fragment
         pp = polypeptides[1]
         self.assertEqual(pp[0].get_id()[1], 186)
         self.assertEqual(pp[-1].get_id()[1], 213)
         s = pp.get_sequence()
         self.assertTrue(isinstance(s, Seq))
         self.assertEqual(s.alphabet, generic_protein)
         self.assertEqual("TETLLVQNANPDCKTILKALGPGATLEE", str(s))
         #Third fragment
         pp = polypeptides[2]
         self.assertEqual(pp[0].get_id()[1], 216)
         self.assertEqual(pp[-1].get_id()[1], 220)
         s = pp.get_sequence()
         self.assertTrue(isinstance(s, Seq))
         self.assertEqual(s.alphabet, generic_protein)
         self.assertEqual("TACQG", str(s))
 def _select_residues(self, name, chain_id, selected):
     pdb_id = name.split('_')[0]
     pdb_path = DecompressedPdb(pdb_id).output().path
     pdb_parser = PDBParser(QUIET=True)
     pdb_structure = pdb_parser.get_structure(name, pdb_path)
     chain = pdb_structure[0][chain_id]
     coords = []
     names = []
     for res_id in selected:
         for atom in chain[res_id].get_unpacked_list():
             names.append(atom.get_name())
             coords.append(atom.get_coord())
     return coords, names
class WriteTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.io = PDBIO()
        cls.parser = PDBParser(PERMISSIVE=1)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", PDBConstructionWarning)
            cls.structure = cls.parser.get_structure("example",
                                                     "PDB/1A8O.pdb")

    def test_pdbio_write_structure(self):
        """Write a full structure using PDBIO."""
        struct1 = self.structure
        # Ensure that set_structure doesn't alter parent
        parent = struct1.parent

        # Write full model to temp file
        self.io.set_structure(struct1)
        self.assertIs(parent, struct1.parent)

        filenumber, filename = tempfile.mkstemp()
        os.close(filenumber)

        try:
            self.io.save(filename)

            struct2 = self.parser.get_structure("1a8o", filename)
            nresidues = len(list(struct2.get_residues()))

            self.assertEqual(len(struct2), 1)
            self.assertEqual(nresidues, 158)
        finally:
            os.remove(filename)

    def test_pdbio_write_auto_numbering(self):
        """Test writing PDB and do not preserve atom numbering."""
        self.io.set_structure(self.structure)

        filenumber, filename = tempfile.mkstemp()
        os.close(filenumber)

        try:
            self.io.save(filename)  # default preserve_atom_numbering=False

            struct = self.parser.get_structure("1a8o", filename)
            serials = [a.serial_number for a in struct.get_atoms()]
            og_serials = list(range(1, len(serials) + 1))
            self.assertEqual(og_serials, serials)
        finally:
            os.remove(filename)

    def test_pdbio_write_preserve_numbering(self):
        """Test writing PDB and preserve atom numbering."""
        self.io.set_structure(self.structure)

        filenumber, filename = tempfile.mkstemp()
        os.close(filenumber)

        try:
            self.io.save(filename, preserve_atom_numbering=True)

            struct = self.parser.get_structure("1a8o", filename)
            serials = [a.serial_number for a in struct.get_atoms()]
            og_serials = [a.serial_number for a in self.structure.get_atoms()]

            self.assertEqual(og_serials, serials)
        finally:
            os.remove(filename)

    def test_pdbio_write_residue(self):
        """Write a single residue using PDBIO."""
        struct1 = self.structure
        residue1 = list(struct1.get_residues())[0]

        # Ensure that set_structure doesn't alter parent
        parent = residue1.parent

        # Write full model to temp file
        self.io.set_structure(residue1)
        self.assertIs(parent, residue1.parent)
        filenumber, filename = tempfile.mkstemp()
        os.close(filenumber)
        try:
            self.io.save(filename)
            struct2 = self.parser.get_structure("1a8o", filename)
            nresidues = len(list(struct2.get_residues()))
            self.assertEqual(nresidues, 1)
        finally:
            os.remove(filename)

    def test_pdbio_write_residue_w_chain(self):
        """Write a single residue (chain id == X) using PDBIO."""
        struct1 = self.structure.copy()  # make copy so we can change it
        residue1 = list(struct1.get_residues())[0]

        # Modify parent id
        parent = residue1.parent
        parent.id = "X"

        # Write full model to temp file
        self.io.set_structure(residue1)
        filenumber, filename = tempfile.mkstemp()
        os.close(filenumber)
        try:
            self.io.save(filename)
            struct2 = self.parser.get_structure("1a8o", filename)
            nresidues = len(list(struct2.get_residues()))
            self.assertEqual(nresidues, 1)

            # Assert chain remained the same
            chain_id = [c.id for c in struct2.get_chains()][0]
            self.assertEqual(chain_id, "X")
        finally:
            os.remove(filename)

    def test_pdbio_write_residue_wout_chain(self):
        """Write a single orphan residue using PDBIO."""
        struct1 = self.structure
        residue1 = list(struct1.get_residues())[0]

        residue1.parent = None  # detach residue

        # Write full model to temp file
        self.io.set_structure(residue1)

        filenumber, filename = tempfile.mkstemp()
        os.close(filenumber)
        try:
            self.io.save(filename)
            struct2 = self.parser.get_structure("1a8o", filename)
            nresidues = len(list(struct2.get_residues()))
            self.assertEqual(nresidues, 1)

            # Assert chain is default: "A"
            chain_id = [c.id for c in struct2.get_chains()][0]
            self.assertEqual(chain_id, "A")
        finally:
            os.remove(filename)

    def test_pdbio_write_custom_residue(self):
        """Write a chainless residue using PDBIO."""
        res = Residue.Residue((" ", 1, " "), "DUM", "")
        atm = Atom.Atom("CA", [0.1, 0.1, 0.1], 1.0, 1.0, " ", "CA", 1, "C")
        res.add(atm)

        # Ensure that set_structure doesn't alter parent
        parent = res.parent

        # Write full model to temp file
        self.io.set_structure(res)

        self.assertIs(parent, res.parent)
        filenumber, filename = tempfile.mkstemp()
        os.close(filenumber)
        try:
            self.io.save(filename)
            struct2 = self.parser.get_structure("res", filename)
            latoms = list(struct2.get_atoms())
            self.assertEqual(len(latoms), 1)
            self.assertEqual(latoms[0].name, "CA")
            self.assertEqual(latoms[0].parent.resname, "DUM")
            self.assertEqual(latoms[0].parent.parent.id, "A")
        finally:
            os.remove(filename)

    def test_pdbio_select(self):
        """Write a selection of the structure using a Select subclass."""

        # Selection class to filter all alpha carbons
        class CAonly(Select):
            """Accepts only CA residues."""
            def accept_atom(self, atom):
                if atom.name == "CA" and atom.element == "C":
                    return 1

        struct1 = self.structure
        # Ensure that set_structure doesn't alter parent
        parent = struct1.parent
        # Write to temp file
        self.io.set_structure(struct1)

        self.assertIs(parent, struct1.parent)
        filenumber, filename = tempfile.mkstemp()
        os.close(filenumber)
        try:
            self.io.save(filename, CAonly())
            struct2 = self.parser.get_structure("1a8o", filename)
            nresidues = len(list(struct2.get_residues()))
            self.assertEqual(nresidues, 70)
        finally:
            os.remove(filename)

    def test_pdbio_missing_occupancy(self):
        """Write PDB file with missing occupancy."""
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", PDBConstructionWarning)
            structure = self.parser.get_structure("test", "PDB/occupancy.pdb")

        self.io.set_structure(structure)
        filenumber, filename = tempfile.mkstemp()
        os.close(filenumber)
        try:
            with warnings.catch_warnings(record=True) as w:
                warnings.simplefilter("always", BiopythonWarning)
                self.io.save(filename)
                self.assertEqual(len(w), 1, w)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", PDBConstructionWarning)
                struct2 = self.parser.get_structure("test", filename)
            atoms = struct2[0]["A"][(" ", 152, " ")]
            self.assertIsNone(atoms["N"].get_occupancy())
        finally:
            os.remove(filename)

    def test_pdbio_write_truncated(self):
        """Test parsing of truncated lines."""
        struct = self.structure

        # Write to temp file
        self.io.set_structure(struct)
        filenumber, filename = tempfile.mkstemp()
        os.close(filenumber)
        try:
            self.io.save(filename)
            # Check if there are lines besides 'ATOM', 'TER' and 'END'
            with open(filename) as handle:
                record_set = {l[0:6] for l in handle}
            record_set -= {
                "ATOM  ",
                "HETATM",
                "MODEL ",
                "ENDMDL",
                "TER\n",
                "TER   ",
                "END\n",
                "END   ",
            }
            self.assertEqual(record_set, set())
        finally:
            os.remove(filename)

    def test_model_numbering(self):
        """Preserve model serial numbers during I/O."""
        def confirm_numbering(struct):
            self.assertEqual(len(struct), 3)
            for idx, model in enumerate(struct):
                self.assertEqual(model.serial_num, idx + 1)
                self.assertEqual(model.serial_num, model.id + 1)

        def confirm_single_end(fname):
            """Ensure there is only one END statement in multi-model files."""
            with open(fname) as handle:
                end_stment = []
                for iline, line in enumerate(handle):
                    if line.strip() == "END":
                        end_stment.append((line, iline))
            self.assertEqual(len(end_stment), 1)  # Only one?
            self.assertEqual(end_stment[0][1], iline)  # Last line of the file?

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", PDBConstructionWarning)
            struct1 = self.parser.get_structure("1lcd", "PDB/1LCD.pdb")

        confirm_numbering(struct1)

        # Round trip: serialize and parse again
        self.io.set_structure(struct1)
        filenumber, filename = tempfile.mkstemp()
        os.close(filenumber)
        try:
            self.io.save(filename)
            struct2 = self.parser.get_structure("1lcd", filename)
            confirm_numbering(struct2)
            confirm_single_end(filename)
        finally:
            os.remove(filename)

    def test_pdbio_write_x_element(self):
        """Write a structure with atomic element X with PDBIO."""
        struct1 = self.structure

        # Change element of one atom
        atom = next(struct1.get_atoms())
        atom.element = "X"  # X is assigned in Atom.py as last resort

        self.io.set_structure(struct1)

        filenumber, filename = tempfile.mkstemp()
        os.close(filenumber)

        try:
            self.io.save(filename)
        finally:
            os.remove(filename)

    def test_pdbio_write_unk_element(self):
        """PDBIO raises ValueError when writing unrecognised atomic elements."""
        struct1 = self.structure

        atom = next(struct1.get_atoms())
        atom.element = "1"

        self.io.set_structure(struct1)

        filenumber, filename = tempfile.mkstemp()
        os.close(filenumber)

        with self.assertRaises(ValueError):
            self.io.save(filename)
        os.remove(filename)
 def __init__(self, structId="subset"):
     self.structId = structId
     self.pdbParser = PDBParser(QUIET=True)
     self.structure = Structure(structId)
Example #33
import json
import re
from abc import ABCMeta, abstractmethod

from Bio.PDB import PDBParser, NeighborSearch, PPBuilder, Residue
from enum import Enum

from Constants import *

LIGAND_STRUCT_ID = 'ligand'
RECEPTOR_STRUCT_ID = 'receptor'
N_PATCH_DOCK_SCORE_COMPONENTS = 4

pdb_parser = PDBParser(QUIET=True)


class ComplexType(Enum):
    zdock_benchmark_bound = 1
    zdock_benchmark_unbound = 2
    patch_dock = 3


class Complex(object):
    __metaclass__ = ABCMeta

    def __init__(self, complex_id, reprocess=False):
        self._complex_id = complex_id
        self._neighbours = None
        if (not self._is_processed()) or reprocess:
            self._process_complex()
        data = self._load_processed_data()
Example #34
def build_backbone_model(pdb_filename):
    parser = PDBParser()
    structure = parser.get_structure("sample", path.join(PDBdir, pdb_filename))
    model = structure[0]
    chain = model["A"]
    model_structure_geo = []
    prev = "0"
    N_prev = "0"
    CA_prev = "0"
    CO_prev = "0"
    ##O_prev="0"
    prev_res = ""
    rad = 180.0 / math.pi
    for res in chain:
        if res.get_resname() in resdict.keys():
            geo = Geometry.geometry(resdict[res.get_resname()])
            if prev == "0":
                N_prev = res["N"]
                CA_prev = res["CA"]
                C_prev = res["C"]
                ##O_prev=res['O']
                prev = "1"
            else:
                n1 = N_prev.get_vector()
                ca1 = CA_prev.get_vector()
                c1 = C_prev.get_vector()
                ##o1=O_prev.get_vector()

                ##O_curr=res['O']
                C_curr = res["C"]
                N_curr = res["N"]
                CA_curr = res["CA"]

                ##o=O_curr.get_vector()
                c = C_curr.get_vector()
                n = N_curr.get_vector()
                ca = CA_curr.get_vector()

                geo.CA_C_N_angle = calc_angle(ca1, c1, n) * rad
                geo.C_N_CA_angle = calc_angle(c1, n, ca) * rad
                geo.CA_N_length = CA_curr - N_curr
                geo.CA_C_length = CA_curr - C_curr
                geo.peptide_bond = N_curr - C_prev

                psi = calc_dihedral(n1, ca1, c1, n)  ##goes to current res
                omega = calc_dihedral(ca1, c1, n, ca)  ##goes to current res
                phi = calc_dihedral(c1, n, ca, c)  ##goes to current res

                geo.psi_im1 = psi * rad
                geo.omega = omega * rad
                geo.phi = phi * rad

                geo.CA_N_length = CA_curr - N_curr
                geo.CA_C_length = CA_curr - C_curr
                ##geo.C_O_length= C_curr - O_curr

                geo.N_CA_C_angle = calc_angle(n, ca, c) * rad
                ##geo.CA_C_O_angle= calc_angle(ca, c, o)*rad

                ##geo.N_CA_C_O= calc_dihedral(n, ca, c, o)*rad

                N_prev = res["N"]
                CA_prev = res["CA"]
                C_prev = res["C"]
                ##O_prev=res['O']

            model_structure_geo.append(geo)
    return model_structure_geo
class ResidueMutator(object):
    def __init__(self,
                 tripeptides=None,
                 components=None,
                 standard_residues=None):
        """ The mutator object takes a non-standard residue or incomplete residue and modifies it
        """
        try:
            from Bio.PDB import PDBParser
            from Bio.SVDSuperimposer import SVDSuperimposer
        except ModuleNotFoundError:
            raise ModuleNotFoundError(
                "BioPython is required for this functionality")

        # get defaults if not provided
        if standard_residues is None:
            standard_residues = data.standard_residues
        if tripeptides is None:
            tripeptides = data.tripeptides
        if components is None:
            components = data.chem_components
        self.components = components
        self.candidates = {}
        self.standard_residues = standard_residues
        self.imposer = SVDSuperimposer()
        self.parser = PDBParser(PERMISSIVE=1, QUIET=True)

        # build up candidate structures
        for fn in tripeptides:
            structure = self.parser.get_structure("", fn)
            resn = structure[0][" "][2].get_resname()
            self.candidates[resn] = []
            for model in structure:
                self.candidates[resn].append(model[" "][2])

    def mutate(self, residue, replace_backbone=True):
        resn = residue.get_resname()

        if self.standard(resn):
            # the residue is already a standard residue, here for repair
            parn = resn
        else:
            parn = self.components[resn]['_chem_comp.mon_nstd_parent_comp_id']
            if not self.standard(parn):
                # the parent residue is a nonstandard residue, can't mutate
                return False

        if parn not in self.candidates:
            # parent not in candidate structures
            return False

        sc_fixed = set(
            self.components[resn]
            ['side_chain_atoms'])  # side chain atoms of fixed residue
        sc_movin = set(
            self.components[parn]
            ['side_chain_atoms'])  # side chain atoms of standard parent
        atom_names = sc_fixed.intersection(sc_movin)

        # get list of side chain atoms present in residue
        atom_list = []
        for atom in atom_names:
            if atom in residue:
                atom_list.append(atom)

        if len(atom_list) == 0:
            return False

        # get side chain atom coordinates
        fixed_coord = np.zeros((len(atom_list), 3))
        for i in range(len(atom_list)):
            fixed_coord[i] = residue[atom_list[i]].get_coord()

        # loop over candidates, finding best RMSD
        moved_coord = np.zeros((len(atom_list), 3))
        min_rms = 99999
        rotm = None
        tran = None
        min_candidate = None
        for candidate in self.candidates[parn]:
            for j in range(len(atom_list)):
                moved_coord[j] = candidate[atom_list[j]].get_coord()
            # perfom SVD fitting
            self.imposer.set(fixed_coord, moved_coord)
            self.imposer.run()
            if self.imposer.get_rms() < min_rms:
                min_rms = self.imposer.get_rms()
                rotm, tran = self.imposer.get_rotran()
                min_candidate = candidate

        # copy the candidate to a new object
        candidate = min_candidate.copy()
        candidate.transform(rotm, tran)
        stripHydrogens(candidate)

        if replace_backbone:
            # replace backbone atoms of candidate
            backbone_atoms = self.components[resn]['main_chain_atoms']
            for atom in backbone_atoms:
                if atom not in residue:
                    continue
                if atom not in candidate:
                    candidate.add(residue[atom].copy())
                candidate[atom].set_coord(residue[atom].get_coord())

        return candidate

    def standard(self, resname):
        return resname in self.standard_residues

    def modified(self, resname):
        if resname in self.standard_residues:
            # it's standard, not modified
            return False

        if resname in self.components and '_chem_comp.mon_nstd_parent_comp_id' in self.components[
                resname]:
            return (
                (resname not in self.standard_residues) and
                (self.components[resname]['_chem_comp.mon_nstd_parent_comp_id']
                 in self.standard_residues))
        else:
            # has no standard parent field - can't be modified
            return False
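A hedged sketch of driving the mutator above; the PDB path is a placeholder and the default data tables bundled with the surrounding package (tripeptides, chem_components, standard_residues) are assumed to be available:

from Bio.PDB import PDBParser

mutator = ResidueMutator()  # fall back to the package's default tripeptide/component tables
structure = PDBParser(QUIET=True).get_structure('x', 'protein.pdb')  # placeholder path
for residue in structure[0].get_residues():
    resname = residue.get_resname()
    if mutator.modified(resname):
        candidate = mutator.mutate(residue)
        if candidate:
            print('{0} can be replaced by its standard parent {1}'.format(resname, candidate.get_resname()))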
Example #36
def PdbAtomIterator(source):
    """Return SeqRecord objects for each chain in a PDB file.

    Argument source is a file-like object or a path to a file.

    The sequences are derived from the 3D structure (ATOM records), not the
    SEQRES lines in the PDB file header.

    Unrecognised three letter amino acid codes (e.g. "CSD") from HETATM entries
    are converted to "X" in the sequence.

    In addition to information from the PDB header (which is the same for all
    records), the following chain specific information is placed in the
    annotation:

    record.annotations["residues"] = List of residue ID strings
    record.annotations["chain"] = Chain ID (typically A, B ,...)
    record.annotations["model"] = Model ID (typically zero)

    Where amino acids are missing from the structure, as indicated by residue
    numbering, the sequence is filled in with 'X' characters to match the size
    of the missing region, and  None is included as the corresponding entry in
    the list record.annotations["residues"].

    This function uses the Bio.PDB module to do most of the hard work. The
    annotation information could be improved but this extra parsing should be
    done in parse_pdb_header, not this module.

    This gets called internally via Bio.SeqIO for the atom based interpretation
    of the PDB file format:

    >>> from Bio import SeqIO
    >>> for record in SeqIO.parse("PDB/1A8O.pdb", "pdb-atom"):
    ...     print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...
    Record id 1A8O:A, chain A

    Equivalently,

    >>> with open("PDB/1A8O.pdb") as handle:
    ...     for record in PdbAtomIterator(handle):
    ...         print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    ...
    Record id 1A8O:A, chain A

    """
    # TODO - Add record.annotations to the doctest, esp the residues (not working?)

    # Only import PDB when needed, to avoid/delay NumPy dependency in SeqIO
    from Bio.PDB import PDBParser

    structure = PDBParser().get_structure(None, source)
    pdb_id = structure.header["idcode"]
    if not pdb_id:
        warnings.warn("'HEADER' line not found; can't determine PDB ID.",
                      BiopythonParserWarning)
        pdb_id = "????"

    for record in AtomIterator(pdb_id, structure):
        # The PDB header was loaded as a dictionary, so let's reuse it all
        record.annotations.update(structure.header)

        # ENH - add letter annotations -- per-residue info, e.g. numbers

        yield record
        self.shared_program.vert['radius'] = radius
        self.shared_program.frag['radius'] = radius
        self.shared_program.frag['color'] = color

        self._draw_mode = 'points'

    def _prepare_transforms(self, view):
        view.view_program.vert['transform'] = view.get_transform()


from Bio.PDB import PDBParser, DSSP

from molecular_data import crgbaDSSP, restype, colorrgba, vrad, resdict

pdbdata = 'data/1yd9.pdb'
parser = PDBParser(QUIET=True, PERMISSIVE=True)
structure = parser.get_structure('model', pdbdata)


def centroid(arr):
    length = arr.shape[0]
    sum_x = np.sum(arr[:, 0])
    sum_y = np.sum(arr[:, 1])
    sum_z = np.sum(arr[:, 2])
    return sum_x / length, sum_y / length, sum_z / length


atoms = [atom for atom in structure.get_atoms()]
natoms = len(atoms)
#atom coordinates
coordinates = np.array([atom.coord for atom in atoms])
def pdb_neighbors(pdb_f, pdb_id):
    structure = PDBParser().get_structure(pdb_id, pdb_f)
    atom_list = Selection.unfold_entities(structure, 'A')
    ns = NeighborSearch(atom_list)
    center_res = [
        res for res in structure.get_residues()
        if res.get_resname() in ['PTR', 'SEP', 'TPO']
    ]

    neighbors = []
    for res in center_res:
        if res.get_resname() == 'PTR':
            central_atoms = [
                atom for atom in res.child_list
                if atom.get_name() in ['O1P', 'O2P', 'O3P', 'OH']
            ]
        elif res.get_resname() == 'SEP':
            central_atoms = [
                atom for atom in res.child_list
                if atom.get_name() in ['O1P', 'O2P', 'O3P', 'OG']
            ]
        elif res.get_resname() == 'TPO':
            central_atoms = [
                atom for atom in res.child_list
                if atom.get_name() in ['O1P', 'O2P', 'O3P', 'OG1']
            ]

        atom_neighbors = [
            ns.search(a.get_coord(), BOND_CUTOFF) for a in central_atoms
        ]
        atom_neighbors = [atom for atoms in atom_neighbors for atom in atoms]

        positive_atom_neighbors = [
            ns.search(a.get_coord(), POSITIVE_BOND_CUTOFF)
            for a in central_atoms
        ]
        positive_atom_neighbors = [
            atom for atoms in positive_atom_neighbors for atom in atoms
        ]
        positive_atom_neighbors = [
            atom for atom in positive_atom_neighbors
            if atom.get_name() in ['NE2', 'ND1', 'NZ', 'NE', 'NH2', 'NH1']
        ]

        atom_neighbors.extend(positive_atom_neighbors)
        atom_neighbors = list(set(atom_neighbors))

        #filter self
        atom_neighbors = [
            atom for atom in atom_neighbors if not atom.get_parent() == res
        ]

        # only consider those containing N or O
        atom_neighbors = [
            atom for atom in atom_neighbors
            if 'N' in atom.get_name() or 'O' in atom.get_name()
        ]

        ## ignore water
        atom_neighbors = [
            atom for atom in atom_neighbors
            if not atom.get_parent().get_resname() == 'HOH'
        ]

        # filter main_chain O, they are not donor
        atom_neighbors = [
            atom for atom in atom_neighbors if not atom.get_name() == 'O'
        ]

        # filter O in N Q, they are not donor
        atom_neighbors = [
            atom for atom in atom_neighbors
            if not (atom.get_name() == 'OD1'
                    and atom.get_parent().get_resname() == 'ASN')
        ]
        atom_neighbors = [
            atom for atom in atom_neighbors
            if not (atom.get_name() == 'OE1'
                    and atom.get_parent().get_resname() == 'GLN')
        ]

        # filter O in D E, they are not donor
        # atom_neighbors = [atom for atom in atom_neighbors if not (atom.get_name() == 'OD1' and atom.get_parent().get_resname() == 'ASP')]
        # atom_neighbors = [atom for atom in atom_neighbors if not (atom.get_name() == 'OD2' and atom.get_parent().get_resname() == 'ASP')]
        # atom_neighbors = [atom for atom in atom_neighbors if not (atom.get_name() == 'OE1' and atom.get_parent().get_resname() == 'GLU')]
        # atom_neighbors = [atom for atom in atom_neighbors if not (atom.get_name() == 'OE2' and atom.get_parent().get_resname() == 'GLU')]

        # ignore residues on the same chain of res using main-chain atom
        # atom_neighbors = [atom for atom in atom_neighbors if not (atom.get_name() == 'N' and atom.get_parent().get_parent() == res.get_parent())]

        ## filter non-standard residues
        STAND_RES = [
            'VAL', 'ILE', 'LEU', 'GLU', 'GLN', 'ASP', 'ASN', 'HIS', 'TRP',
            'PHE', 'TYR', 'ARG', 'LYS', 'SER', 'THR', 'MET', 'ALA', 'GLY',
            'PRO', 'CYS'
        ]
        for atom in atom_neighbors:
            if atom.get_parent().get_resname() not in STAND_RES:
                atom_neighbors = []

        ## filter same chain
        # for atom in atom_neighbors:
        # if atom.get_parent().get_parent() == res.get_parent():
        # atom_neighbors = []

        ## filter entry containing main_chain O of residues on different chain of res
        # for atom in atom_neighbors:
        # if atom.get_name() == 'N':
        # atom_neighbors = []

        atom_neighbors = list(
            set(Selection.unfold_entities(atom_neighbors, 'R')))
        atom_neighbors = [r for r in atom_neighbors if r != res]

        if len(atom_neighbors) > 0:
            res = res.get_resname() + '_' + str(
                res.get_id()[1]) + '_' + res.get_parent().get_id()
            atom_neighbors = [
                n.get_resname() + '_' + str(n.get_id()[1]) + '_' +
                n.get_parent().get_id() for n in atom_neighbors
            ]
            neighbors.append((pdb_id, res, atom_neighbors))

    return neighbors
#!/usr/bin/env python
# coding: utf-8

from Bio.PDB import Atom
from math import sqrt
from Bio.PDB import PDBParser
import argparse
import sys


prot_id = "5AGY.pdb"
prot_file = sys.argv[1]


parser = PDBParser(PERMISSIVE = 1)
structure = parser.get_structure(prot_id, prot_file)
model = structure[0]


if "-h" in sys.argv or "--help" in sys.argv:
    print("Ce programme identifie les interactions entre cycles aromatiques à partir d'un fichier Protein Data Bank (PDB). Les critères pris en compte proviennent du Protein Interaction Calculator que l'on peut retrouver en suivant le lien : http://pic.mbu.iisc.ernet.in/PIC_Criteria.pdf. Le parser de Biopython est strucutré de la manière suivante : Structure/model/chain/residu/atome.")
    print("Fonctions utilisées :")
    print('parser.get_structure -->  ', 'Creation of a structure object from a PDB file')
    print('objet.get_name -->  ', 'Renvoie le nom correspondant à l objet : Structure/model/chain/residu/atome')
    print('parser.get_structure -->  ', 'Renvoie le numéro rattaché au résidue dans le fichier PDB')
    print('')


residues = []
aroaro = ["PHE", "TRP", "TYR"]
for chain in model:  # protein -> chain -> residues involved in aromatic/aromatic interactions
Example #40
import numpy
import argparse
from Bio.PDB import PDBParser
from Bio.PDB.PDBExceptions import PDBConstructionWarning
import warnings

warnings.filterwarnings("ignore", category=PDBConstructionWarning)

arg_parser = argparse.ArgumentParser()
arg_parser.add_argument("infile", help="Input file")
args = arg_parser.parse_args()

fname = args.infile

pdb_parser = PDBParser()

# Ignore PDB warnings, we are just interested in the size

structure = pdb_parser.get_structure(0, fname)
atoms = list(structure.get_atoms())

natoms = len(atoms)

coords = numpy.zeros((natoms, 3))

for index, this_atom in enumerate(atoms):
    coords[index, :] = this_atom.get_vector().get_array()

coords /= 10  # Convert from Ångström to nm.

x_size = coords[:, 0].max() - coords[:, 0].min()
Example #41
def pdb2dfromactivesite(pdb_fh, active_sites=[]):
    """
    This calculates distances between each ligand atom or optionally provided amino acids (sources) and each residue in the protein.
    
    :param pdb_fh: path to .pdb file.
    :param active_sites: optional list of residue numbers as sources. 
    :returns dfromligands: pandas table with distances from ligand
    """
    junk_residues = [
        "HOH", " MG", "CA", " NA", "SO4", "IOD", "NA", "CL", "GOL", "PO4"
    ]
    pdb_parser = PDBParser()
    pdb_data = pdb_parser.get_structure("pdb_name", pdb_fh)
    model = pdb_data[0]
    chainA = model["A"]  #only a chain
    residues = list(chainA.get_residues())
    ligands_residue_objs = []
    for residue in chainA:
        if not residue.get_resname() in junk_residues:
            if not residue.get_resname() in aas_21_3letter:  #only aas
                ligands_residue_objs.append(residue)
            elif residue.id[1] in active_sites:
                ligands_residue_objs.append(residue)

    dfromligands = pd.DataFrame()
    for ligandi in range(len(ligands_residue_objs)):
        ligand_residue_obj = ligands_residue_objs[ligandi]
        for ligand_atom_obj in ligand_residue_obj:
            for residue in chainA:
                if residue.get_resname() in aas_21_3letter:  #only aas
                    dfromligands.loc[residue.id[1],
                                     "ref_pdb"] = residue.get_resname()
                    if not ligand_residue_obj.get_resname() in aas_21_3letter:
                        dfromligands.loc[residue.id[1],"Distance from Ligand: %s (ATOM: %s)" % \
                                         (ligand_residue_obj.get_resname(),ligand_atom_obj.get_name())]\
                        =ligand_residue_obj[ligand_atom_obj.get_name()]-residue["CA"]
                    else:
                        dfromligands.loc[residue.id[1],"Distance from active site residue: %s %d (ATOM: %s)" % \
                                         (ligand_residue_obj.get_resname(),ligand_residue_obj.get_id()[1],\
                                          ligand_atom_obj.get_name())]\
                        =ligand_residue_obj[ligand_atom_obj.get_name()]-residue["CA"]

    dfromligands.index.name = "aasi"
    if "ref_pdb" in dfromligands:
        del dfromligands["ref_pdb"]
    # Add average and minimum distances, per molecule type and per molecule.
    cols_all = dfromligands.columns.tolist()
    for moltype in ['Distance from Ligand:', 'Distance from active site residue:']:
        cols_moltype = [c for c in cols_all if moltype in c]
        if len(cols_moltype) > 0:
            dfromligands.loc[:, '%s average' % moltype] = dfromligands.loc[:, cols_moltype].T.mean()
            dfromligands.loc[:, '%s minimum' % moltype] = dfromligands.loc[:, cols_moltype].T.min()
            mols = np.unique([c[c.find(moltype):c.find(' (ATOM')] for c in cols_moltype])
            if len(mols) > 1:
                for mol in mols:
                    cols_mol = [c for c in cols_moltype if mol in c]
                    dfromligands.loc[:, '%s: average' % mol] = dfromligands.loc[:, cols_mol].T.mean()
                    dfromligands.loc[:, '%s: minimum' % mol] = dfromligands.loc[:, cols_mol].T.min()

    return dfromligands
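
# Hedged usage sketch for pdb2dfromactivesite(); the file name and the
# active-site residue numbers are placeholders, not values from the original.
if __name__ == "__main__":
    dists = pdb2dfromactivesite("example.pdb", active_sites=[57, 102, 195])
    print(dists.filter(like="minimum").head())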
Exemple #42
0
def PdbAtomIterator(handle):
    """Returns SeqRecord objects for each chain in a PDB file

    The sequences are derived from the 3D structure (ATOM records), not the
    SEQRES lines in the PDB file header.

    Unrecognised three letter amino acid codes (e.g. "CSD") from HETATM entries
    are converted to "X" in the sequence.

    In addition to information from the PDB header (which is the same for all
    records), the following chain specific information is placed in the
    annotation:

    record.annotations["residues"] = List of residue ID strings
    record.annotations["chain"] = Chain ID (typically A, B ,...)
    record.annotations["model"] = Model ID (typically zero)

    Where amino acids are missing from the structure, as indicated by residue
    numbering, the sequence is filled in with 'X' characters to match the size
    of the missing region, and None is included as the corresponding entry in
    the list record.annotations["residues"].

    This function uses the Bio.PDB module to do most of the hard work. The
    annotation information could be improved but this extra parsing should be
    done in parse_pdb_header, not this module.
    """
    # Only import PDB when needed, to avoid/delay NumPy dependency in SeqIO
    from Bio.PDB import PDBParser
    from Bio.SCOP.three_to_one_dict import to_one_letter_code

    def restype(residue):
        """Return a residue's type as a one-letter code.

        Non-standard residues (e.g. CSD, ANP) are returned as 'X'.
        """
        return to_one_letter_code.get(residue.resname, 'X')

    # Deduce the PDB ID from the PDB header
    # ENH: or filename?
    from Bio.File import UndoHandle
    undo_handle = UndoHandle(handle)
    firstline = undo_handle.peekline()
    if firstline.startswith("HEADER"):
        pdb_id = firstline[62:66]
    else:
        warnings.warn("First line is not a 'HEADER'; can't determine PDB ID")
        pdb_id = '????'

    struct = PDBParser().get_structure(pdb_id, undo_handle)
    model = struct[0]
    for chn_id, chain in sorted(model.child_dict.iteritems()):
        # HETATM mod. res. policy: remove mod if in sequence, else discard
        residues = [res for res in chain.get_unpacked_list()
                    if res.get_resname().upper() in to_one_letter_code]
        if not residues:
            continue
        # Identify missing residues in the structure
        # (fill the sequence with 'X' residues in these regions)
        gaps = []
        rnumbers = [r.id[1] for r in residues]
        for i, rnum in enumerate(rnumbers[:-1]):
            if rnumbers[i+1] != rnum + 1:
                # It's a gap!
                gaps.append((i+1, rnum, rnumbers[i+1]))
        if gaps:
            res_out = []
            prev_idx = 0
            for i, pregap, postgap in gaps:
                if postgap > pregap:
                    gapsize = postgap - pregap - 1
                    res_out.extend(map(restype, residues[prev_idx:i]))
                    prev_idx = i
                    res_out.append('X' * gapsize)
                else:
                    warnings.warn("Ignoring out-of-order residues after a gap",
                                  UserWarning)
                    # Keep the normal part, drop the out-of-order segment
                    # (presumably modified or hetatm residues, e.g. 3BEG)
                    res_out.extend(map(restype, residues[prev_idx:i]))
                    break
            else:
                # Last segment, after the final gap
                res_out.extend(map(restype, residues[prev_idx:]))
        else:
            # No gaps
            res_out = map(restype, residues)
        record_id = "%s:%s" % (pdb_id, chn_id)
        # ENH - model number in SeqRecord id if multiple models?
        # id = "Chain%s" % str(chain.id)
        # if len(structure) > 1 :
        #     id = ("Model%s|" % str(model.id)) + id

        record = SeqRecord(Seq(''.join(res_out), generic_protein),
                id=record_id,
                description=record_id,
                )

        # The PDB header was loaded as a dictionary, so let's reuse it all
        record.annotations = struct.header.copy()
        # Plus some chain specifics:
        record.annotations["model"] = model.id
        record.annotations["chain"] = chain.id

        # Start & end
        record.annotations["start"] = int(rnumbers[0])
        record.annotations["end"] = int(rnumbers[-1])

        # ENH - add letter annotations -- per-residue info, e.g. numbers

        yield record
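
# Hedged usage sketch for PdbAtomIterator(); "example.pdb" is a placeholder path.
if __name__ == "__main__":
    with open("example.pdb") as handle:
        for rec in PdbAtomIterator(handle):
            print("%s length %i" % (rec.id, len(rec)))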
Exemple #43
0
def main():
    """The main routine for conkit-validate functionality"""
    parser = create_argument_parser()
    args = parser.parse_args()

    global logger
    logger = conkit.command_line.setup_logging(level="info")

    if os.path.isfile(args.output) and not args.overwrite:
        raise FileExistsError('The output file {} already exists!'.format(args.output))
    if args.pdbformat != 'pdb':
        raise ValueError('Model file format can only be PDB')

    logger.info(os.linesep + "Working directory:                           %s", os.getcwd())
    logger.info("Reading input sequence:                      %s", args.seqfile)
    sequence = conkit.io.read(args.seqfile, args.seqformat).top

    if len(sequence) < 5:
        raise ValueError('Cannot validate model with less than 5 residues')

    logger.info("Length of the sequence:                      %d", len(sequence))
    logger.info("Reading input distance prediction:           %s", args.distfile)
    prediction = conkit.io.read(args.distfile, args.distformat).top
    logger.info("Reading input PDB model:                     %s", args.pdbfile)
    model = conkit.io.read(args.pdbfile, args.pdbformat).top
    p = PDBParser()
    structure = p.get_structure('structure', args.pdbfile)[0]
    dssp = DSSP(structure, args.pdbfile, dssp=args.dssp, acc_array='Wilke')

    logger.info(os.linesep + "Validating model.")

    if len(sequence) > 500:
        logger.info("Input model has more than 500 residues, this might take a while...")

    figure = conkit.plot.ModelValidationFigure(model, prediction, sequence, dssp, map_align_exe=args.map_align_exe)
    figure.savefig(args.output, overwrite=args.overwrite)
    logger.info(os.linesep + "Validation plot written to %s", args.output)

    residue_info = figure.data.loc[:, ['RESNUM', 'SCORE', 'MISALIGNED']]
    table = PrettyTable()
    table.field_names = ["Residue", "Predicted score", "Suggested register"]

    _resnum_template = '{} ({})'
    _error_score_template = '*** {0:.2f} ***'
    _correct_score_template = '    {0:.2f}    '
    _register_template = '*** {} ({}) ***'
    _empty_register = '               '

    for residue in residue_info.values:
        resnum, score, misalignment = residue
        current_residue = _resnum_template.format(sequence.seq[resnum - 1], resnum)
        score = _error_score_template.format(score) if score > 0.5 else _correct_score_template.format(score)

        if misalignment and resnum in figure.alignment.keys():
            register = _register_template.format(sequence.seq[figure.alignment[resnum] - 1], figure.alignment[resnum])
        else:
            register = _empty_register

        table.add_row([current_residue, score, register])

    logger.info(os.linesep)
    logger.info(table)
# Benchmark the parsing of a PDB file given as an argument

import sys
import time
from Bio.PDB import PDBParser

pdb_filepath = sys.argv[1]
parser = PDBParser()

start = time.time()
parser.get_structure("", pdb_filepath)
elapsed = time.time() - start

print(elapsed)
Exemple #46
0
def parsePDBStructure( pdb_id ):
    parser = PDBParser()
    structure = parser.get_structure('test_rsa', pdb_id)
    return structure
Exemple #47
0
search_dict = pypdb.Query(query)     # create a dictionary containing search information
found = search_dict.search(search_dict)[:500]      # create a list of these PDBs by searching RCSB

# create a list with the information and the metadata
metadata = []

for proteins in found:  # for each PDB entry in the results list,
    metadata.append(pypdb.describe_pdb(proteins))  # append its metadata dictionary

# Save the metadata list as a CSV file
dfm = pd.DataFrame(metadata) # convert to a Pandas DF
dfm.to_csv('metadata_'+now+'.csv')      # save as a CSV file

# %%
parser = PDBParser()       # create a parser
pdbs = list()
pdbl = PDBList()

# Download all PDB structures in the previous list if they aren't there
for id in found:
    pdbl.retrieve_pdb_file(pdb_code=id, file_format='pdb', pdir=PDB_dl_dir)   # Retrieve in PDB format, put in directory 'PDB'

# Finished, print "Downloading ... finished!"
print('\n#############~DOWNLOADING COMPLETE~#############\n')
# %%
# convert pdb*.ent to *.pdb
for file in os.scandir(PDB_dl_dir):
    if (file.path.endswith(".ent") and file.is_file()):
        newfn = file.name.replace("pdb","").replace(".ent",".pdb")
        os.rename(file, PDB_dl_dir+"/"+newfn)
Exemple #48
0
 def test_Superimposer(self):
     """Test on module that superimpose two protein structures."""
     pdb1 = "PDB/1A8O.pdb"
     p = PDBParser()
     s1 = p.get_structure("FIXED", pdb1)
     fixed = Selection.unfold_entities(s1, "A")
     s2 = p.get_structure("MOVING", pdb1)
     moving = Selection.unfold_entities(s2, "A")
     rot = numpy.identity(3).astype("f")
     tran = numpy.array((1.0, 2.0, 3.0), "f")
     for atom in moving:
         atom.transform(rot, tran)
     sup = Superimposer()
     sup.set_atoms(fixed, moving)
     self.assertTrue(numpy.allclose(sup.rotran[0], numpy.identity(3)))
     self.assertTrue(
         numpy.allclose(sup.rotran[1], numpy.array([-1.0, -2.0, -3.0])))
     self.assertAlmostEqual(sup.rms, 0.0, places=3)
     # Turn black code style off
     # fmt: off
     atom_list = [
         "N", "C", "C", "O", "C", "C", "SE", "C", "N", "C", "C", "O", "C",
         "C", "O", "O", "N", "C", "C", "O", "C", "C", "C", "C", "N", "C",
         "C", "O", "C", "C", "C", "N", "C", "N", "N", "N", "C", "C", "O",
         "C", "C", "C", "O", "N", "N", "C", "C", "O", "N", "C", "C", "O",
         "C", "C", "C", "N", "C", "C", "O", "C", "C", "C", "C", "N", "N",
         "C", "C", "O", "C", "C", "C", "O", "O", "N", "C", "C", "O", "C",
         "C", "C", "N", "C", "C", "O", "C", "C", "C", "C", "C", "C", "C",
         "N", "C", "C", "O", "C", "C", "C", "N", "C", "N", "N", "N", "C",
         "C", "O", "C", "C", "O", "O", "N", "C", "C", "O", "C", "C", "C",
         "C", "C", "C", "C", "O", "N", "C", "C", "O", "C", "C", "C", "N",
         "C", "C", "O", "C", "C", "O", "O", "N", "C", "C", "O", "C", "C",
         "C", "N", "C", "N", "N", "N", "C", "C", "O", "C", "C", "C", "C",
         "C", "C", "C", "N", "C", "C", "O", "C", "C", "C", "C", "C", "C",
         "C", "O", "N", "C", "C", "O", "C", "C", "C", "C", "N", "N", "C",
         "C", "O", "C", "O", "C", "N", "C", "C", "O", "C", "C", "C", "C",
         "N", "C", "C", "O", "C", "C", "C", "N", "C", "N", "N", "N", "C",
         "C", "O", "C", "N", "C", "C", "O", "C", "C", "C", "O", "O", "N",
         "C", "C", "O", "C", "C", "C", "O", "N", "N", "C", "C", "O", "C",
         "N", "C", "C", "O", "C", "O", "N", "C", "C", "O", "C", "C", "C",
         "O", "N", "N", "C", "C", "O", "C", "C", "C", "O", "O", "N", "C",
         "C", "O", "C", "C", "C", "N", "C", "C", "O", "C", "C", "C", "C",
         "N", "N", "C", "C", "O", "C", "C", "O", "N", "N", "C", "C", "O",
         "C", "C", "C", "C", "N", "C", "C", "C", "C", "C", "N", "C", "C",
         "O", "C", "C", "SE", "C", "N", "C", "C", "O", "C", "O", "C", "N",
         "C", "C", "O", "C", "C", "C", "O", "O", "N", "C", "C", "O", "C",
         "O", "C", "N", "C", "C", "O", "C", "C", "C", "C", "N", "C", "C",
         "O", "C", "C", "C", "C", "N", "C", "C", "O", "C", "C", "C", "N",
         "C", "C", "O", "C", "C", "C", "O", "N", "N", "C", "C", "O", "C",
         "C", "O", "N", "N", "C", "C", "O", "C", "N", "C", "C", "O", "C",
         "C", "O", "N", "N", "C", "C", "O", "C", "C", "C", "N", "C", "C",
         "O", "C", "C", "O", "O", "N", "C", "C", "O", "C", "S", "N", "C",
         "C", "O", "C", "C", "C", "C", "N", "N", "C", "C", "O", "C", "O",
         "C", "N", "C", "C", "O", "C", "C", "C", "C", "N", "C", "C", "O",
         "C", "C", "C", "C", "N", "C", "C", "O", "C", "C", "C", "C", "N",
         "N", "C", "C", "O", "C", "N", "C", "C", "O", "C", "C", "C", "C",
         "N", "C", "C", "O", "N", "C", "C", "O", "C", "C", "C", "N", "C",
         "C", "O", "N", "C", "C", "O", "C", "N", "C", "C", "O", "C", "O",
         "C", "N", "C", "C", "O", "C", "C", "C", "C", "N", "C", "C", "O",
         "C", "C", "C", "O", "O", "N", "C", "C", "O", "C", "C", "C", "O",
         "O", "N", "C", "C", "O", "C", "C", "SE", "C", "N", "C", "C", "O",
         "C", "C", "SE", "C", "N", "C", "C", "O", "C", "O", "C", "N", "C",
         "C", "O", "C", "N", "C", "C", "O", "C", "S", "N", "C", "C", "O",
         "C", "C", "C", "O", "N", "N", "C", "C", "O", "O", "O", "O", "O",
         "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O",
         "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O",
         "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O",
         "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O",
         "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O",
         "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O",
         "O", "O", "O", "O", "O", "O", "O"
     ]
     # Turn black code style on
     # fmt: on
     sup.apply(moving)
     atom_moved = []
     for aa in moving:
         atom_moved.append(aa.element)
     self.assertEqual(atom_moved, atom_list)
Exemple #49
0
            return 0

    def accept_residue(self, residue):
        hetatm_flag, resseq, icode = residue.get_id()
        # print(residue.get_id())
        if hetatm_flag != " ":
            # skip HETATMS
            return 0
        if icode != " ":
            warnings.warn("WARNING: Icode %s at position %s" % (icode, resseq),
                          BiopythonWarning)
        return 1


for pdb_id_a in pdb_list:
    pdb_id = pdb_id_a[3:7]
    if len(pdb_id_a.split('_')) != 3:
        continue
    pdb_chain = pdb_id_a.split('_')[2]

    data_path = f'data/validation_pdb/pdb{pdb_id.lower()}.ent'
    if not os.path.exists(data_path):
        continue
    p = PDBParser()
    structure = p.get_structure('X', data_path)

    sel = ChainSelect(pdb_chain)
    io = PDBIO()
    io.set_structure(structure)
    io.save(f'data/validation_pdb/chain/{pdb_id}{pdb_chain}.pdb', sel)
def getStructure(name, filename):
    #faster in shell ?
    from Bio.PDB import PDBParser
    parser = PDBParser(PERMISSIVE=1)
    structure = parser.get_structure(name, filename)
    return structure
Exemple #51
0
                       type = str, \
                       help = "Chain from which residues should be removed")
argparser.add_argument("--start", \
                       dest = "start", \
                       type = int, \
                       help = "First residue to be removed from the chain")
argparser.add_argument("--end", \
                       dest = "end", \
                       type = int, \
                       help = "Last residue to be removed from the chain")

# Get the arguments
args = argparser.parse_args()
in_pdb_file = args.in_pdb_file
out_pdb_file = args.out_pdb_file
chain = args.chain
start = args.start
end = args.end
# Create a PDB parser
parser = PDBParser()
# Parse the structure
name = in_pdb_file.replace(".pdb", "")
structure = parser.get_structure(name, in_pdb_file)
# Save the processed structure
w = PDBIO()
w.set_structure(structure)
w.save(out_pdb_file, \
       NotInRangeResSelect(chain = chain, \
                           start = start, \
                           end = end))
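
# Hedged sketch of the residue selector the script above relies on; its
# definition was cut from this excerpt, so the logic below is an assumption:
# keep every residue except those numbered start..end on the given chain.
# (In the original script this class would be defined before it is used.)
from Bio.PDB import Select

class NotInRangeResSelect(Select):
    def __init__(self, chain, start, end):
        self.chain = chain
        self.start = start
        self.end = end

    def accept_residue(self, residue):
        in_chain = residue.get_parent().get_id() == self.chain
        in_range = self.start <= residue.get_id()[1] <= self.end
        return 0 if (in_chain and in_range) else 1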
Exemple #52
0

aa = [
    'PRO', 'TYR', 'THR', 'VAL', 'PHE', 'ARG', 'GLY', 'CYS', 'ALA', 'LEU',
    'MET', 'ASP', 'GLN', 'SER', 'TRP', 'LYS', 'GLU', 'ASN', 'ILE', 'HIS'
]

from os import chdir
from os.path import exists
from glob import glob

from pandas import DataFrame
from Bio.PDB import PDBParser

b = '/Users/nicholassofroniew/Documents/DATA-proteins/'
chdir(b)

files = glob(b + 'pdb/*/*.ent')
p = PDBParser()
# Note: parse() and check() are helpers defined elsewhere in the original project.

for f in files:
    name = f[len(b):]
    print(name)
    if not exists(b + 'pdb-parsed' + name[6:-4] +
                  '.csv') and not exists(b + 'pdb-rejected' + name[6:-4] +
                                         '.csv'):
        try:
            structure = p.get_structure('X', b + name)
            df = parse(structure)
            flag = check(df)
            if not flag:
                df = DataFrame([])
        except:
            df = DataFrame([])
Exemple #53
0
def from_pdb_string(pdb_str: str, chain_id: Optional[str] = None) -> Protein:
    """Takes a PDB string and constructs a Protein object.

  WARNING: All non-standard residue types will be converted into UNK. All
    non-standard atoms will be ignored.

  Args:
    pdb_str: The contents of the pdb file
    chain_id: If None, then the pdb file must contain a single chain (which
      will be parsed). If chain_id is specified (e.g. A), then only that chain
      is parsed.

  Returns:
    A new `Protein` parsed from the pdb contents.
  """
    pdb_fh = io.StringIO(pdb_str)
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure('none', pdb_fh)
    models = list(structure.get_models())
    if len(models) != 1:
        raise ValueError(
            f'Only single model PDBs are supported. Found {len(models)} models.'
        )
    model = models[0]

    if chain_id is not None:
        chain = model[chain_id]
        chains = [chain]
    else:
        chains = list(model.get_chains())
    #  if len(chains) != 1:
    #    raise ValueError(
    #        'Only single chain PDBs are supported when chain_id not specified. '
    #        f'Found {len(chains)} chains.')
    #  else:
    #    chain = chains[0]

    atom_positions = []
    aatype = []
    atom_mask = []
    residue_index = []
    b_factors = []

    PARAM_CHAIN_BREAK = 100
    residue_index_prev = 0
    for k, chain in enumerate(chains):
        for res in chain:
            if res.id[2] != ' ':
                raise ValueError(
                    f'PDB contains an insertion code at chain {chain.id} and residue '
                    f'index {res.id[1]}. These are not supported.')
            res_shortname = residue_constants.restype_3to1.get(
                res.resname, 'X')
            restype_idx = residue_constants.restype_order.get(
                res_shortname, residue_constants.restype_num)
            pos = np.zeros((residue_constants.atom_type_num, 3))
            mask = np.zeros((residue_constants.atom_type_num, ))
            res_b_factors = np.zeros((residue_constants.atom_type_num, ))
            for atom in res:
                if atom.name not in residue_constants.atom_types:
                    continue
                pos[residue_constants.atom_order[atom.name]] = atom.coord
                mask[residue_constants.atom_order[atom.name]] = 1.
                res_b_factors[residue_constants.atom_order[
                    atom.name]] = atom.bfactor
            if np.sum(mask) < 0.5:
                # If no known atom positions are reported for the residue then skip it.
                continue
            aatype.append(restype_idx)
            atom_positions.append(pos)
            atom_mask.append(mask)
            residue_index.append(res.id[1] + residue_index_prev +
                                 PARAM_CHAIN_BREAK * k)
            b_factors.append(res_b_factors)
        residue_index_prev = residue_index[-1]

    return Protein(atom_positions=np.array(atom_positions),
                   atom_mask=np.array(atom_mask),
                   aatype=np.array(aatype),
                   residue_index=np.array(residue_index),
                   b_factors=np.array(b_factors))
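
# Hedged usage sketch for from_pdb_string(); "example.pdb" is a placeholder
# path and chain "A" is an assumption.
if __name__ == "__main__":
    with open("example.pdb") as fh:
        prot = from_pdb_string(fh.read(), chain_id="A")
    print(prot.atom_positions.shape, prot.atom_mask.shape)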
Exemple #54
0
    for i in range(0, L):
        residues[i].xtra["SS_PSEA"] = ss_seq[i]
    #os.system("rm "+fname)


class PSEA:
    def __init__(self, model, filename):
        ss_seq = psea(filename)
        ss_seq = psea2HEC(ss_seq)
        annotate(model, ss_seq)
        self.ss_seq = ss_seq

    def get_seq(self):
        """
        Return secondary structure string.
        """
        return self.ss_seq


if __name__ == "__main__":

    import sys
    from Bio.PDB import PDBParser

    # Parse PDB file
    p = PDBParser()
    s = p.get_structure('X', sys.argv[1])

    # Annotate structure with PSEA secondary structure info
    PSEA(s[0], sys.argv[1])
Exemple #55
0
def parse_pdb_coordinates(pdb_path: str,
                          start_position: int,
                          end_position: int,
                          position_correction: int,
                          chain: str,
                          sasa: bool = False) -> DataFrame:
    """
    Parse coordinate of CA atoms. Will also return the bfactor and SASA using freesasa.
    If PDB is missing atoms, it can handle it.
    """

    # Get structure from PDB
    structure = PDBParser().get_structure('pdb', pdb_path)

    coordinates = []
    commands = []
    bfactors = []
    positions_worked = []  # positions present in pdb

    # Iterate over each CA atom and get its coordinates
    for i in np.arange(start_position + position_correction,
                       end_position + position_correction):
        # first check if atom exists
        try:
            structure[0][chain][int(i)].has_id("CA")
            # Get the CA atom from the pdb and get its coordinates
            atom = list(structure[0][chain][int(i)]["CA"].get_vector()) + [i]
            coordinates.append(atom)
            # Get SASA command for each residue and bfactor
            residue = "s{}, chain {} and resi {}".format(str(i), chain, str(i))
            commands.append(residue)
            bfactor = (structure[0][chain][int(i)]["CA"].get_bfactor())
            bfactors.append(np.log10(bfactor))
            positions_worked.append(i)
        except:
            print("residue {} not found".format(str(i)))
            coordinates.append([np.nan, np.nan, np.nan, i])

    # Convert to df
    df_coordinates = DataFrame(columns=['x', 'y', 'z', 'Position'],
                               data=coordinates)

    # Center data
    x, y, z = centroid(df_coordinates)
    df_coordinates['x_cent'] = (df_coordinates['x'] - x).abs()**2
    df_coordinates['y_cent'] = (df_coordinates['y'] - y).abs()**2
    df_coordinates['z_cent'] = (df_coordinates['z'] - z).abs()**2
    df_coordinates['Distance'] = df_coordinates['x_cent'] + df_coordinates[
        'y_cent'] + df_coordinates['z_cent']

    # Add sasa values
    if sasa:
        # Get structure for SASA
        structure_sasa = freesasa.Structure(pdb_path)
        result = freesasa.calc(structure_sasa)
        # Calculate sasa
        sasa_area = freesasa.selectArea(commands, structure_sasa, result)
        df_sasa: DataFrame = DataFrame(columns=['SASA'],
                                       data=sasa_area.values())
        df_sasa['log B-factor'] = bfactors
        df_sasa['Position'] = positions_worked

        # Merge
        df_coordinates = df_coordinates.merge(df_sasa,
                                              how='outer',
                                              on='Position')

    return df_coordinates
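
# Hedged usage sketch for parse_pdb_coordinates(); the path, chain and residue
# range below are placeholders, not values from the original.
if __name__ == "__main__":
    df_coords = parse_pdb_coordinates("example.pdb",
                                      start_position=1,
                                      end_position=100,
                                      position_correction=0,
                                      chain="A",
                                      sasa=False)
    print(df_coords.head())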
Exemple #56
0
        """
        if self.rotran is None:
            raise PDBException("No transformation has been calculated yet")
        rot, tran = self.rotran
        rot = rot.astype('f')
        tran = tran.astype('f')
        for atom in atom_list:
            atom.transform(rot, tran)


if __name__ == "__main__":
    import sys

    from Bio.PDB import PDBParser, Selection

    p = PDBParser()
    s1 = p.get_structure("FIXED", sys.argv[1])
    fixed = Selection.unfold_entities(s1, "A")

    s2 = p.get_structure("MOVING", sys.argv[1])
    moving = Selection.unfold_entities(s2, "A")

    rot = numpy.identity(3).astype('f')
    tran = numpy.array((1.0, 2.0, 3.0), 'f')

    for atom in moving:
        atom.transform(rot, tran)

    sup = Superimposer()

    sup.set_atoms(fixed, moving)
class PandasMolStructure:
    def __init__(self):
        self.parser = PDBParser(QUIET=True, PERMISSIVE=True)
        self.df_structure = None
        self.pairwise_dist = None

    def get_pandas_structure(self,
                             pdb_file: str = None,
                             het_atom=False) -> pd.DataFrame:
        # TODO split get and init method
        """Constructs a pandas.DataFrame representation of PDB protein structure
        Args:
            pdb_file (str): Path to PDB file

        Returns:
            pd.DataFrame: DataFrame with the following structure
            {"model":[],"chain":[],"residue":[],"atom":[],"x":[],"y":[],"z":[]}
        """
        assert not (self.df_structure is None and pdb_file is None),\
            "Data has not been initialized yet and no pdb file was provided"

        if self.df_structure is None:
            df_dict = {
                "model": [],
                "chain": [],
                "residue": [],
                "res_pos": [],
                "atom": [],
                "atom_pos": [],
                "is_hetatom": [],
                "x": [],
                "y": [],
                "z": []
            }

            structure = self.parser.get_structure("protein_1", pdb_file)

            # TODO An option should be given to choose whether to use all models from NMR pdb_samples
            model = next(structure.get_models())
            # for model in structure.get_models():
            for chain in model.get_chains():
                for residue in chain.get_residues():

                    for atom in residue.get_atoms():
                        df_dict["model"].append(model.id)
                        df_dict["chain"].append(chain.id)
                        df_dict["residue"].append(residue.get_resname())
                        df_dict["res_pos"].append(residue.id[1])
                        df_dict["atom"].append(atom.get_name())
                        df_dict["atom_pos"].append(atom.serial_number)
                        df_dict["is_hetatom"].append(
                            not bool(re.search('het= ', residue.__repr__())))
                        cords = atom.get_coord()
                        df_dict["x"].append(cords[0])
                        df_dict["y"].append(cords[1])
                        df_dict["z"].append(cords[2])
            self.df_structure = pd.DataFrame(df_dict)

        if het_atom:
            return self.df_structure
        else:
            # het_atoms_to_ignore = ["HOH","NAG", "FUC", "MAN", "GAL", "SO4"]
            # FIXME Atoms to ignore should be based on HETATOM
            atoms_to_not_ignore = utils.get_AA_list(
                config.folder_structure_cfg.aminoacids_csv)
            return self.df_structure[self.df_structure["residue"].isin(
                atoms_to_not_ignore)]

    def get_atom_3Dcoord(self, pdb_file: str) -> np.array:
        """Returns numpy array of 3D atom positions
        Args:
            pdb_file (str): Path to pdb file

        Returns:
            np.array: Array of shape Nx3 where N is the number of atoms
            in pdb_file
        """
        # TODO filter out heteroatoms (water)
        structure = self.parser.get_structure("protein_1", pdb_file)
        atoms = []
        for atom in structure.get_atoms():
            cords = atom.get_coord()
            atoms.append(cords)
        return np.array(atoms)

    def get_protein_sequence(self):
        structure_df = self.get_pandas_structure()
        sequence = structure_df[structure_df['residue'].shift() != structure_df['residue']]\
            .reset_index(drop=True)[["residue", "res_pos"]]
        return sequence

    @staticmethod
    def get_pairwise_euclidean_atom(structure_df: pd.DataFrame):
        # TODO upgrade distances to energy calculations based on distance and charge
        # TODO find data for atom charges
        # TODO what to do with missing hydrogen atoms? X-ray doesn't determine H positions

        # if self.pairwise_dist is None:
        cords = structure_df[["x", "y", "z"]]
        return euclidean_distances(cords, cords)
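
# Hedged usage sketch for PandasMolStructure; "example.pdb" is a placeholder path.
if __name__ == "__main__":
    mol = PandasMolStructure()
    structure_df = mol.get_pandas_structure("example.pdb")
    dist_matrix = PandasMolStructure.get_pairwise_euclidean_atom(structure_df)
    print(structure_df.shape, dist_matrix.shape)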
Exemple #58
0
from Bio.PDB import DSSP, PDBParser
import os

dir = '*/casp11.domains/'
list = os.listdir('*/casp11.domains')
#print(list)
q1 = open("/home/ystroot/Documents/seqcasp11.txt", "a")
q2 = open("/home/ystroot/Documents/sscasp11.txt", "a")

for i in list:
    print(i)
    l = dir + i
    p = PDBParser()
    structure = p.get_structure("Model", l)
    model = structure[0]
    dssp = DSSP(model, l)
    for row in dssp:
        #if row[0] < 1000:
        q1.write(str(row[1]))
        #with open("/home/ystroot/Documents/sscasp11.txt","a") as q2:
        q2.write(str(row[2]))
    q1.write('\n')
    q2.write('\n')

q1.close()
q2.close()
from Bio.PDB import PDBParser
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import copy
from numpy import random

groupfilename = sys.argv[1]
structurename = sys.argv[2]

# create parser
parser = PDBParser()
# read structure from file
structure = parser.get_structure('Closed', structurename)
# store key locations in the DHFR structure
model = structure[0]
chain = model['A']
m20 = chain[20]['CA']
sheet = chain[112]['CA']
globular = chain[41]['CA']
ligand = model['X']
for residue in ligand:
    if "N" in residue.get_id()[0]:
        hydride = residue['H4']
        adenosine = residue['C18']

hydride_distances = {}
hydride_distance_list = []
adenosine_distances = {}
adenosine_distance_list = []
Exemple #60
0
def get_residue_depth(pdb_fh, msms_fh):
    """
    Extracts Residue depth from PDB structure 

    :param pdb_fh: path to PDB structure file
    :param msms_fh: path to MSMS libraries
    :returns data_depth: pandas table with residue depth per residue
    """
    from Bio.PDB import Selection, PDBParser
    from Bio.PDB.Polypeptide import is_aa
    from Bio.PDB.ResidueDepth import get_surface, _read_vertex_array, residue_depth, ca_depth, min_dist
    surface_fh = "%s/%s.msms.vert" % (dirname(msms_fh), basename(pdb_fh))
    if not exists(surface_fh):
        pdb_to_xyzr_fh = "%s/pdb_to_xyzr" % dirname(msms_fh)
        xyzr_fh = "%s/%s.xyzr" % (dirname(msms_fh), basename(pdb_fh))
        pdb_to_xyzr_com = "%s %s > %s" % (pdb_to_xyzr_fh, pdb_fh, xyzr_fh)
        msms_com = "%s -probe_radius 1.5 -if %s -of %s > %s.log" % (
            msms_fh, xyzr_fh, splitext(surface_fh)[0], splitext(surface_fh)[0])
        log_fh = "%s.log" % msms_fh
        log_f = open(log_fh, 'a')
        log_f.write("%s;\n%s\n" % (pdb_to_xyzr_com, msms_com))
        subprocess.call("%s;%s" % (pdb_to_xyzr_com, msms_com),
                        shell=True,
                        stdout=log_f,
                        stderr=subprocess.STDOUT)
        log_f.close()

    surface = _read_vertex_array(surface_fh)

    pdb_parser = PDBParser()
    pdb_data = pdb_parser.get_structure("pdb_name", pdb_fh)
    model = pdb_data[0]
    residue_list = Selection.unfold_entities(model, 'R')

    depth_dict = {}
    depth_list = []
    depth_keys = []
    for residue in residue_list:
        if not is_aa(residue):
            continue
        rd = residue_depth(residue, surface)
        ca_rd = ca_depth(residue, surface)
        # Get the key
        res_id = residue.get_id()
        chain_id = residue.get_parent().get_id()
        if chain_id == "A":
            depth_dict[(chain_id, res_id)] = (rd, ca_rd)
            depth_list.append((residue, (rd, ca_rd)))
            depth_keys.append((chain_id, res_id))
            # Update xtra information
            residue.xtra['EXP_RD'] = rd
            residue.xtra['EXP_RD_CA'] = ca_rd
        else:
            break
    depth_df = pd.DataFrame(depth_dict).T.reset_index()
    depth_df = depth_df.drop("level_0", axis=1)
    aasi_prev = 0
    for i in range(len(depth_df)):
        if depth_df.loc[i, "level_1"][1] != aasi_prev:
            depth_df.loc[i, "aasi"] = depth_df.loc[i, "level_1"][1]
            aasi_prev = depth_df.loc[i, "level_1"][1]

    depth_df = depth_df.drop("level_1", axis=1)
    depth_df = depth_df.loc[~pd.isnull(depth_df.loc[:, "aasi"]), :]
    depth_df = depth_df.set_index("aasi", drop=True)
    depth_df.columns = ["Residue depth", "Residue (C-alpha) depth"]
    return depth_df
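
# Hedged usage sketch for get_residue_depth(); both paths are placeholders and
# a local MSMS installation is assumed.
if __name__ == "__main__":
    depth_df = get_residue_depth("example.pdb", "/path/to/msms/msms")
    print(depth_df.head())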