Ejemplo n.º 1
0
    def getPairsOfResiduesInContact(self, structureL, structureR):
        '''
        Finds the ligand/receptor residue pairs that lie within self.res2res_dist
        of each other, as reported by NeighborSearch.search_all at residue level.

        Only the first model of each structure is searched, and atoms whose name
        starts with "H" are left out. A pair is kept only when one residue carries
        the id of structureL and the other one the id of structureR.

        @param structureL: Bio.PDB.Structure. Structure of ligand (bound state if available)
        @param structureR: Bio.PDB.Structure. Structure of receptor (bound state if available)
        @return positiveContacts:  Set {(full residue id from structureL, full residue id from structureR)}
        @return chainsNotContactL: Set { chain id of structureL } chains not found in contact
        @return chainsNotContactR: Set { chain id of structureR } chains not found in contact
        @raise NoValidPDBFile: if collecting the atoms of a structure raises IndexError
        '''

        def collectHeavyAtoms(structure, errorMsg):
            # The whole comprehension lives inside the try so that any
            # IndexError raised while walking the structure is reported alike.
            try:
                return [
                    atom
                    for atom in structure.child_list[0].get_atoms()
                    if not atom.name.startswith("H")
                ]
            except IndexError:
                raise NoValidPDBFile(errorMsg)

        ligandAtoms = collectHeavyAtoms(structureL,
                                        "Problems parsing pdbFile 1")
        receptorAtoms = collectHeavyAtoms(structureR,
                                          "Problems parsing pdbFile 2")

        neighbourPairs = NeighborSearch(ligandAtoms + receptorAtoms).search_all(
            self.res2res_dist, level="R")
        ligandStructId = structureL.get_id()
        receptorStructId = structureR.get_id()

        positiveContacts = set()
        chainsInContactL = set()
        chainsInContactR = set()
        for resA, resB in neighbourPairs:
            fullIdA = resA.get_full_id()
            fullIdB = resB.get_full_id()
            structIdA, _modelA, _chainA, _resIdA = fullIdA
            structIdB, _modelB, _chainB, _resIdB = fullIdB
            # Work out which member of the pair is the ligand one; pairs that
            # do not span both structures are skipped.
            if structIdA == ligandStructId and structIdB == receptorStructId:
                ligandFullId, receptorFullId = fullIdA, fullIdB
            elif structIdA == receptorStructId and structIdB == ligandStructId:
                ligandFullId, receptorFullId = fullIdB, fullIdA
            else:
                continue
            positiveContacts.add((ligandFullId, receptorFullId))
            chainsInContactL.add(ligandFullId[2])
            chainsInContactR.add(receptorFullId[2])

        if CONSIDER_HOMOOLIG_AS_POS:
            (positiveContacts, chainsInContactL,
             chainsInContactR) = self.fixHomooligomers(structureL, structureR,
                                                       positiveContacts,
                                                       chainsInContactL,
                                                       chainsInContactR)

        allChainsL = {chain.get_id() for chain in structureL[0].get_list()}
        allChainsR = {chain.get_id() for chain in structureR[0].get_list()}
        return (positiveContacts,
                allChainsL.difference(chainsInContactL),
                allChainsR.difference(chainsInContactR))
Ejemplo n.º 2
0
    def getPairsOfResiduesInContact(self, structureL, structureR):
        '''
        Finds the ligand/receptor residue pairs that lie within self.res2res_dist
        of each other, as reported by NeighborSearch.search_all at residue level.

        Only the first model of each structure is searched, and atoms whose name
        starts with "H" are left out. A pair is kept only when one residue carries
        the id of structureL and the other one the id of structureR.

        :param structureL: Bio.PDB.Structure. Structure of ligand
        :param structureR: Bio.PDB.Structure. Structure of receptor
        :return positiveContactsResidues, chainsNotContactL, chainsNotContactR

                positiveContactsResidues: Set {(residue of structureL, residue of structureR)}
                                          holding the residue objects yielded by the neighbour search
                chainsNotContactL: Set { Bio.PDB.Chain.get_id()}  for ligand chains that are not in contact
                chainsNotContactR: Set { Bio.PDB.Chain.get_id()}  for receptor chains that are not in contact
        :raise NoValidPDBFile: if collecting the atoms of a structure raises IndexError
        '''
        heavyAtoms = []
        partners = ((structureL, "Problems parsing pdbFile 1"),
                    (structureR, "Problems parsing pdbFile 2"))
        for structure, errorMsg in partners:
            # The generator is consumed inside the try, so an IndexError raised
            # anywhere while walking the structure is reported as a parsing problem.
            try:
                heavyAtoms.extend(
                    atom for atom in structure.child_list[0].get_atoms()
                    if not atom.name.startswith("H"))
            except IndexError:
                raise NoValidPDBFile(errorMsg)

        neighbourPairs = NeighborSearch(heavyAtoms).search_all(
            self.res2res_dist, level="R")
        ligandStructId = structureL.get_id()
        receptorStructId = structureR.get_id()

        positiveContactsResidues = set()
        chainsInContactL = set()
        chainsInContactR = set()
        for resA, resB in neighbourPairs:
            structIdA, _modelA, chainA, _resIdA = resA.get_full_id()
            structIdB, _modelB, chainB, _resIdB = resB.get_full_id()
            # Orient the pair as (ligand, receptor); anything else is skipped.
            if structIdA == ligandStructId and structIdB == receptorStructId:
                orientedPair, ligandChain, receptorChain = (resA, resB), chainA, chainB
            elif structIdA == receptorStructId and structIdB == ligandStructId:
                orientedPair, ligandChain, receptorChain = (resB, resA), chainB, chainA
            else:
                continue
            positiveContactsResidues.add(orientedPair)
            chainsInContactL.add(ligandChain)
            chainsInContactR.add(receptorChain)

        allChainsL = {chain.get_id() for chain in structureL[0].get_list()}
        allChainsR = {chain.get_id() for chain in structureR[0].get_list()}
        return (positiveContactsResidues,
                allChainsL.difference(chainsInContactL),
                allChainsR.difference(chainsInContactR))
Ejemplo n.º 3
0
def checkIfSuccess(fname, maxNumberOfChains=MAX_NUMBER_OF_CHAINS):
    '''
    Checks that a pdb file can be parsed and does not hold too many chains.

    :param fname: str. Path of the pdb file to check
    :param maxNumberOfChains: int. Largest accepted value of len(struct[0])
    :return: True if the file was parsed, contains a model 0 and that model has
             at most maxNumberOfChains chains. False if model 0 is missing or if
             any exception other than NoValidPDBFile was raised (it is printed).
    :raise NoValidPDBFile: if model 0 has more than maxNumberOfChains chains
    '''
    try:
        parser = myPDBParser()
        struct = parser.get_structure("pdb", fname)
        if 0 not in struct:
            return False
        nChains = len(struct[0])
        if nChains > maxNumberOfChains:
            # Report fname: no pdbId exists in this scope, and referencing one
            # made this branch die with a NameError that was then swallowed.
            raise NoValidPDBFile(
                "The maximun number of allowed chains is %d (%d) for %s" %
                (maxNumberOfChains, nChains, fname))
        return True
    except NoValidPDBFile as e:
        print(e)
        raise
    except Exception as e:
        # Best-effort check: any other problem just means "not a usable file".
        print(e)
        return False
Ejemplo n.º 4
0
def downloadUsingMmtf(pdbId, fnameOut, maxNumberOfChains=MAX_NUMBER_OF_CHAINS):
    '''
    Fetches a structure through MMTFParser.get_structure_from_url and saves it
    to fnameOut with PDBIO.

    :param pdbId: str. Id handed to MMTFParser.get_structure_from_url
    :param fnameOut: str. Path where the structure is saved
    :param maxNumberOfChains: int. Largest accepted value of len(struct[0])
    :return: True if the structure was saved. False if model 0 is missing or if
             any exception other than NoValidPDBFile was raised (it is printed).
    :raise NoValidPDBFile: if model 0 has more than maxNumberOfChains chains
    '''
    print("downloadUsingMmtf")
    try:
        struct = MMTFParser().get_structure_from_url(pdbId)
        if 0 not in struct:
            return False
        nChains = len(struct[0])
        if nChains > maxNumberOfChains:
            raise NoValidPDBFile(
                "The maximun number of allowed chains is %d (%d) for %s" %
                (maxNumberOfChains, nChains, pdbId))
        pdbWriter = PDBIO()
        pdbWriter.set_structure(struct)
        pdbWriter.save(fnameOut)
        return True
    except (Exception, ValueError, HTTPError) as err:
        print(err)
        # Only the "too many chains" style error is allowed to escape.
        if not isinstance(err, NoValidPDBFile):
            return False
        raise err
Ejemplo n.º 5
0
    def createFileForError(self, pdbStruct, outName):
        '''
        Creates a fake DSSP raw output, to be used when DSSP fails. All residues
        will be assigned secStruc= Z.

        The file starts with DsspComputer.DSSP_HEADER followed by one line per
        residue of model 0 accepted by is_aa. The first column is the 1-based
        position of the residue inside its chain (residues rejected by is_aa
        still consume a position), the second one is res.get_id()[1].

        @param pdbStruct: Bio.PDB.Structure. Structure of the pdb that is being analyzed
        @param outName: str. output fname
        @return 0
        @raise NoValidPDBFile: if pdbStruct contains no models
        Any exception (KeyboardInterrupt included) is re-raised after calling
        tryToRemove(outName).
        '''
        oneResLine = "%5d%5d%2s%2s %2s\n"
        fakeSecStruct = "Z"
        try:
            # "with" closes the handle even when something fails half way, so
            # the file is never still open when tryToRemove is called below.
            with open(outName, "w") as f:
                f.write(DsspComputer.DSSP_HEADER)
                if len(pdbStruct) == 0:
                    raise NoValidPDBFile(
                        "No valid pdb File. There are no models contained")
                for chain in pdbStruct[0]:
                    chainId = chain.get_id()
                    for seqIndex, res in enumerate(chain, start=1):
                        if not is_aa(res):
                            continue
                        structIndex = res.get_id()[1]
                        letter = self.threeLetterAA_to_one(res.resname)
                        f.write(oneResLine % (seqIndex, structIndex, chainId,
                                              letter, fakeSecStruct))
        except (KeyboardInterrupt, Exception):
            print("Exception happend computing %s" % outName)
            tryToRemove(outName)
            raise
        return 0
Ejemplo n.º 6
0
    def computeFeaturesOneComplex(self,
                                  fnameL,
                                  fnameR,
                                  lPdbId=None,
                                  rPdbId=None,
                                  isHomoComplex=False):
        '''
        Runs, in this order, the contact map computation, the structural feature
        computation (skipped when self.methodProtocol starts with "seq") and the
        sequence feature computation for one complex.

        :param fnameL: str. Path to the the pdb or fasta file of the ligand of the complex
        :param fnameR: str. Path to the the pdb or fasta file of the receptor of the complex
        :param lPdbId: str. pdbId forwarded as lPdbId to SeqFeatComputer.computeComplex
                       (per the original documentation it is used to query 3dCons and,
                       if None, psiblast will be launched directly)
        :param rPdbId: str. pdbId forwarded as rPdbId to SeqFeatComputer.computeComplex
                       (same remark as lPdbId)
        :param isHomoComplex: boolean. If True, just lPartner is provided and both partners are equal.
                       When it is True and self.areForTrainAndTest is False only the ligand is
                       computed and self.copyFeaturesToReceptor is called at the end.
        :raise NoValidPDBFile: if self.areForTrainAndTest and any of the inputs could not be
                       loaded as a structure
        '''
        extendedPrefixL = self.getExtendedPrefix(fnameL)
        prefixL, _chainTypeL = self.splitExtendedPrefix(extendedPrefixL)[:2]
        extendedPrefixR = self.getExtendedPrefix(fnameR)
        prefixR, _chainTypeR = self.splitExtendedPrefix(extendedPrefixR)[:2]
        assert prefixL == prefixR, "Error, prefixes are different for %s - %s" % (
            fnameL, fnameR)

        # loadPdbIfIsPath gives None as structure when the path is a fasta file
        structureL, _extraL = loadPdbIfIsPath(fnameL)
        structureR, _extraR = loadPdbIfIsPath(fnameR)
        someStructureMissing = structureL is None or structureR is None
        if someStructureMissing and self.areForTrainAndTest:
            raise NoValidPDBFile(
                "Error, Training requeries both inputs to be pdbs. \n" +
                "For prediction they can be either pdbs or sequences. You are training"
            )

        # Prediction on a homo-complex: work on the ligand only and copy later.
        ligandOnly = isHomoComplex and not self.areForTrainAndTest
        sharedKwargs = dict(computedFeatsRootDir=self.computedFeatsRootDir,
                            statusManager=self.statusManager)

        cMapper = ContactMapper(self.prefix,
                                areForTrainAndTest=self.areForTrainAndTest,
                                boundAvailable=self.boundAvailable,
                                **sharedKwargs)
        if ligandOnly:
            _resDictL, _resDictR = cMapper.computeComplex(
                fnameL, None, structureL, None)
        else:
            _resDictL, _resDictR = cMapper.computeComplex(
                fnameL,
                fnameR,
                structureL,
                structureR,
                isHomoLR=isHomoComplex)

        seqFeatComputer = SeqFeatComputer(self.prefix, **sharedKwargs)

        isSeqProtocol = self.methodProtocol.startswith("seq")
        if not isSeqProtocol:
            structComputer = StructFeatComputer(self.prefix, **sharedKwargs)
            if ligandOnly:
                structArgs = (fnameL, None, structureL, None)
            else:
                structArgs = (fnameL, fnameR, structureL, structureR)
            structComputer.computeComplex(*structArgs)

        seqFeatComputer.computeComplex(fnameL,
                                       fnameR,
                                       structureL,
                                       structureR,
                                       lPdbId=lPdbId,
                                       rPdbId=rPdbId,
                                       areLRequivalentProteins=ligandOnly)

        if ligandOnly:
            self.copyFeaturesToReceptor(fnameL,
                                        fnameR,
                                        isSeqInput=isSeqProtocol)
Ejemplo n.º 7
0
def downloadPDB(pdbId,
                pdbOutPath,
                chainId=None,
                bioUnit=None,
                removeOtherChains=False,
                checkObsolete=True):
    '''
    Downloads a pdb entry to <pdbOutPath>/<pdbId lowercased>.pdb unless that
    file already exists, and optionally extracts one chain from it.

    When bioUnit is falsy, downloadUsingMmtf is tried first. If that fails, or if
    a bioUnit was requested, the gzipped file is fetched with wget | zcat and
    validated with checkIfSuccess.

    :param pdbId: str. 4 alphanumeric characters
    :param pdbOutPath: str. Directory where the file is written
    :param chainId: str or None. If not None, the chain is extracted with
                    ChainSplitter.make_pdb and the path it returns is returned
    :param bioUnit: int or None. If truthy, number of the biounit file to fetch
    :param removeOtherChains: bool. If True and chainId is given, the full pdb
                    file is removed after the extraction
    :param checkObsolete: bool. If True, the PDBe summary endpoint is queried
                    before calling wget
    :return: str. Path of the downloaded file, or of the extracted chain file
    :raise ValueError: if pdbId is malformed or the PDBe summary request does
                    not answer with status 200
    :raise NoValidPDBFile: if wget wrote to stderr or checkIfSuccess rejected
                    the downloaded file (the file is removed)
    :raise NoChainInPdb: propagated from ChainSplitter.make_pdb
    '''
    try:
        from shlex import quote as shellQuote
    except ImportError:  # Python 2 ships the same helper in pipes
        from pipes import quote as shellQuote

    # pdbId ends up inside a shell command line, so besides the length only
    # plain alphanumeric ids (no shell metacharacters) are accepted.
    if not (len(pdbId) == 4 and pdbId.isalnum()):
        raise ValueError("bad format pdbId '%s'" % (pdbId))

    pdbId = pdbId.lower()
    outName = os.path.join(pdbOutPath, pdbId + '.pdb')
    print("Trying to download pdbId %s to %s" % (pdbId, outName))
    success = False
    if not os.path.isfile(outName):
        if not bioUnit:
            success = downloadUsingMmtf(pdbId, outName)
        if not success or bioUnit:
            if checkObsolete:
                # Same 30 s limit as wget below; without a timeout requests
                # may wait forever on an unresponsive server.
                r = requests.get(
                    "https://www.ebi.ac.uk/pdbe/api/pdb/entry/summary/%s" %
                    (pdbId),
                    timeout=30)
                if r.status_code != 200:
                    raise ValueError(
                        "pdb id %s does not exist according to https://www.ebi.ac.uk/pdbe/api/pdb/entry/summary/%s. It may be obsolete or cif"
                        % (pdbId, pdbId))
            # Quote the redirection target so that paths with spaces or shell
            # metacharacters are passed to the shell as a single word.
            quotedOutName = shellQuote(outName)
            if bioUnit:
                cmd = 'wget -qO- ftp://ftp.wwpdb.org/pub/pdb/data/biounit/coordinates/all/%s.pdb%d.gz --timeout=30 | zcat  > %s' % (
                    pdbId, bioUnit, quotedOutName)
            else:
                cmd = 'wget -qO- http://www.rcsb.org/pdb/files/%s.pdb.gz --timeout=30 | zcat  > %s' % (
                    pdbId, quotedOutName)
            print(cmd)
            p = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE, shell=True)
            out = p.communicate()
            # Anything on stderr, or an unparsable result, counts as a failure.
            if len(out[1]) > 0 or not checkIfSuccess(outName):
                print(out)
                try:
                    os.remove(outName)
                except OSError:
                    pass
                msg= "Error downloading pdb_id: %s biounit: %s. Does biounit exists?"%(pdbId, bioUnit) if bioUnit else \
                      "Error downloading pdb_id: %s"%pdbId
                raise NoValidPDBFile(msg)
    else:
        print("Already downloaded pdb_id %s" % (pdbId))

    if chainId is None:
        return outName

    print("extracting chain '%s'" % (chainId))
    outName_extract = None  # stays None if make_pdb fails before returning
    try:
        splitter = ChainSplitter(os.path.split(outName)[0])
        outName_extract = splitter.make_pdb(outName,
                                            chainId,
                                            rejectInsteadAccept=False)
        if removeOtherChains:
            os.remove(outName)
    except NoChainInPdb:
        # Listed first so that it never goes through the cleanup branch below.
        raise
    except (ValueError, OSError):
        # Do not leave a partially usable chain file behind.
        if outName_extract is not None:
            try:
                os.remove(outName_extract)
            except OSError:
                pass
        raise
    return outName_extract
Ejemplo n.º 8
0
def prepareInput(allRootDir,
                 idName,
                 pdbId,
                 fname,
                 sequence,
                 chainType="l",
                 removeInputs=False):
    '''
    Turns the input given for one partner (pdb id, file or raw sequence) into a
    file named <idName>_<chainType>_u.pdb or <idName>_<chainType>_u.fasta inside
    the directory returned by myMakeDir(allRootDir, "finalInput").

    :param allRootDir: str. Path where files will be generated
    :param idName: an id for the complex, typically a pdbId
    :param pdbId: A pdbId, potentially augmented with chainId and bioUnit info
                  (e.g. 1abc, 1abc:A, 1abc_1, 1abc:A_1). It is only used for downloading
                  when both fname and sequence are None.
    :param fname: str. A path to a fasta or pdb file. Takes precedence over sequence
    :param sequence: str. A  sequence of amino acids. Used when fname is not provided
    :param chainType: "l" or "r"
    :param removeInputs: bool. Remove input files
    :return: (isThisParnerSeq, newFname, pdbId). isThisParnerSeq is True when the
             partner ended up as a sequence (fasta file or raw sequence), newFname is
             the path of the generated file and pdbId is the 4 character id when a
             download took place, otherwise the value that was received.
    :raise NoAvailableForDownloadPDB: if pdbId is malformed, downloadPDB raised
             ValueError or the downloaded file could not be copied/moved
    :raise NoValidPDBFile: if fname is not a fasta file and could not be parsed
    :raise ValueError: if none of pdbId, fname, sequence is provided
    '''
    assert chainType in ["l",
                         "r"], "Error, chain type must be either 'l' or 'r'"
    finalInputPath = myMakeDir(allRootDir, "finalInput")

    newFname = os.path.join(finalInputPath,
                            "%s_%s_u.pdb" % (idName, chainType))

    if pdbId and fname is None and sequence is None:
        print("%s_%s" % (pdbId, chainType))
        matchObj = re.match(r"(^\d[a-zA-Z0-9]{3})(:[a-zA-Z0-9])?(_\d*)?$",
                            pdbId)
        if not matchObj:
            raise NoAvailableForDownloadPDB("Error: bad pdb provided %s" %
                                            (pdbId))
        pdbId = matchObj.group(1)
        chainId = matchObj.group(2)
        biounit = matchObj.group(3)
        isThisParnerSeq = False
        if chainId is not None:
            chainId = chainId[1:]  # drop the leading ":"
        # A missing suffix, a bare "_" and "_0" all mean "no biounit".
        if biounit is None or len(biounit) == 1:
            biounit = None
        else:
            biounit = int(biounit[1:]) or None
        try:
            fname = downloadPDB(pdbId,
                                finalInputPath,
                                chainId=chainId,
                                bioUnit=biounit,
                                removeOtherChains=removeInputs)
            print(type(fname), fname)
        except ValueError as e:
            traceback.print_exc()
            raise NoAvailableForDownloadPDB(str(e) + "-> %s" % (pdbId))
        try:
            if not removeInputs:
                shutil.copyfile(fname, newFname)
            else:
                os.rename(fname, newFname)
        except OSError:
            # The exception used to be built but never raised, so a failed
            # copy/move went unnoticed and a path to a missing file was returned.
            raise NoAvailableForDownloadPDB(
                "Error. pdb %s was not recover from wwpdb.org" % pdbId)
    elif fname:
        if checkIfIsFasta(fname):
            sequence = parseFasta(fname)
            if removeInputs:
                os.remove(fname)
            isThisParnerSeq = True
        else:  # check if fname is valid pdb
            success = moveAndWriteAsPDBIfMmcif(fname,
                                               newFname,
                                               removeInput=removeInputs)
            if not success:
                raise NoValidPDBFile(
                    "Error. It was not possible to parse your pdb file %d" %
                    (1 if chainType == "l" else 2))
            isThisParnerSeq = False
    elif sequence:
        sequence = parseSeq(sequence)
        isThisParnerSeq = True
    else:
        raise ValueError(
            "One of the following arguments should be provided pdbId, fname, sequence"
        )
    if sequence:  # Save sequence
        newFname = os.path.join(finalInputPath,
                                "%s_%s_u.fasta" % (idName, chainType))
        with open(newFname, "w") as f:
            f.write(">" + os.path.basename(newFname) + "\n" + sequence)
    return isThisParnerSeq, newFname, pdbId