def matchChains(atoms1, atoms2, **kwargs):
    """Returns pairs of chains matched based on sequence similarity.  Makes an
    all-to-all comparison of chains in *atoms1* and *atoms2*.  Chains are
    obtained from hierarchical views (:class:`.HierView`) of atom groups.
    This function returns a list of matching chains in tuples that contain
    4 items:

      * matching chain from *atoms1* as a :class:`.AtomMap` instance,
      * matching chain from *atoms2* as a :class:`.AtomMap` instance,
      * percent sequence identity of the match,
      * percent sequence overlap of the match.

    List of matches is sorted in decreasing percent sequence identity order.
    **None** is returned when no pair of chains satisfies the *seqid* and
    *overlap* criteria.  :class:`.AtomMap` instances can be used to calculate
    RMSD values and superpose atom groups.

    :arg atoms1: atoms that contain a chain
    :type atoms1: :class:`.Chain`, :class:`.AtomGroup`, :class:`.Selection`

    :arg atoms2: atoms that contain a chain
    :type atoms2: :class:`.Chain`, :class:`.AtomGroup`, :class:`.Selection`

    :keyword subset: one of the following well-defined subsets of atoms:
        ``"calpha"`` (or ``"ca"``), ``"backbone"`` (or ``"bb"``),
        ``"heavy"`` (or ``"noh"``), or ``"all"``, default is ``"calpha"``
    :type subset: string

    :keyword seqid: percent sequence identity, default is 90
    :type seqid: float

    :keyword overlap: percent overlap, default is 90; the keyword
        *coverage* is read instead when *overlap* is not given
    :type overlap: float

    :keyword pwalign: perform pairwise sequence alignment
    :type pwalign: bool

    :raises TypeError: when *atoms1* or *atoms2* is not an
        :class:`.AtomGroup`, :class:`.Chain`, or :class:`.Selection`
    :raises ValueError: when *subset* is not a recognized subset name

    If *subset* is set to *calpha* or *backbone*, only alpha carbon
    atoms or backbone atoms will be paired.  If set to *all*, all atoms
    common to matched residues will be returned.

    This function tries to match chains based on residue numbers and names.
    All chains in *atoms1* are compared to all chains in *atoms2*.  This works
    well for different structures of the same protein.  When it fails,
    :mod:`Bio.pairwise2` is used for pairwise sequence alignment, and matching
    is performed based on the sequence alignment.  User can control, whether
    sequence alignment is performed or not with *pwalign* keyword.  If
    ``pwalign=True`` is passed, pairwise alignment is enforced; if
    ``pwalign=False`` is passed, it is never attempted."""

    if not isinstance(atoms1, (AtomGroup, Chain, Selection)):
        raise TypeError('atoms1 must be an AtomGroup, Chain, or Selection')
    if not isinstance(atoms2, (AtomGroup, Chain, Selection)):
        raise TypeError('atoms2 must be an AtomGroup, Chain, or Selection')

    subset = kwargs.get('subset', 'calpha')
    if subset not in _SUBSETS:
        raise ValueError('{0} is not a valid subset argument'.format(
            str(subset)))

    # NOTE: the assertions are kept (rather than converted to raises) so that
    # the exception type seen by existing callers does not change.
    seqid = kwargs.get('seqid', 90.)
    assert isinstance(seqid, (float, int)), 'seqid must be float'
    assert 0 < seqid <= 100, 'seqid must be in the range from 0 to 100'
    coverage = kwargs.get('overlap')
    if coverage is None:
        coverage = kwargs.get('coverage', 90.)
    assert isinstance(coverage, (float, int)), 'overlap must be float'
    assert 0 < coverage <= 100, 'overlap must be in the range from 0 to 100'
    pwalign = kwargs.get('pwalign', None)

    # Collect the chains of the first argument and rebind atoms1 to its
    # underlying atom group, which is what the AtomMap instances index into.
    if isinstance(atoms1, Chain):
        chains1 = [atoms1]
        atoms1 = atoms1.getAtomGroup()
    else:
        chains1 = list(atoms1.getHierView().iterChains())
        if not isinstance(atoms1, AtomGroup):
            atoms1 = atoms1.getAtomGroup()
    # chains that yield an empty SimpleChain are dropped
    chains1 = [simpch for simpch in (SimpleChain(ch) for ch in chains1)
               if len(simpch) > 0]
    # NOTE(review): atoms1 has been rebound to an atom group above, so this
    # check does not distinguish Chain input anymore; kept as in the original.
    if not isinstance(atoms1, Chain):
        LOGGER.debug('Checking {0}: {1} chains are identified'.format(
            str(atoms1), len(chains1)))

    # Same treatment for the second argument.
    if isinstance(atoms2, Chain):
        chains2 = [atoms2]
        atoms2 = atoms2.getAtomGroup()
    else:
        chains2 = list(atoms2.getHierView().iterChains())
        if not isinstance(atoms2, AtomGroup):
            atoms2 = atoms2.getAtomGroup()
    chains2 = [simpch for simpch in (SimpleChain(ch) for ch in chains2)
               if len(simpch) > 0]
    if not isinstance(atoms2, Chain):
        LOGGER.debug('Checking {0}: {1} chains are identified'.format(
            str(atoms2), len(chains2)))

    # Stage 1: all-to-all comparison based on residue numbers and names.
    matches = []
    unmatched = []
    LOGGER.debug('Trying to match chains based on residue numbers and names:')
    for simpch1 in chains1:
        for simpch2 in chains2:
            LOGGER.debug(' Comparing {0} (len={1}) and {2} (len={3}):'.format(
                simpch1.getTitle(), len(simpch1),
                simpch2.getTitle(), len(simpch2)))

            match1, match2, nmatches = getTrivialMatch(simpch1, simpch2)
            _seqid = nmatches * 100 / min(len(simpch1), len(simpch2))
            _cover = len(match2) * 100 / max(len(simpch1), len(simpch2))

            if _seqid >= seqid and _cover >= coverage:
                LOGGER.debug('\tMatch: {0} residues match with {1:.0f}% '
                             'sequence identity and {2:.0f}% overlap.'.format(
                                 len(match1), _seqid, _cover))
                matches.append(
                    (match1, match2, _seqid, _cover, simpch1, simpch2))
            else:
                LOGGER.debug('\tFailed to match chains (seqid={0:.0f}%, '
                             'overlap={1:.0f}%).'.format(_seqid, _cover))
                unmatched.append((simpch1, simpch2))

    # Stage 2: pairwise alignment of the pairs that failed stage 1.  It runs
    # when explicitly requested, or as a fallback when nothing matched and
    # the caller expressed no preference (pwalign is None).
    if pwalign or (not matches and pwalign is None):
        pairwise2 = importBioPairwise2()
        if pairwise2:
            LOGGER.debug('Trying to match chains based on {0} sequence '
                         'alignment:'.format(ALIGNMENT_METHOD))
            for simpch1, simpch2 in unmatched:
                LOGGER.debug(' Comparing {0} (len={1}) and {2} '
                             '(len={3}):'.format(
                                 simpch1.getTitle(), len(simpch1),
                                 simpch2.getTitle(), len(simpch2)))
                match1, match2, nmatches = getAlignedMatch(simpch1, simpch2)
                _seqid = nmatches * 100 / min(len(simpch1), len(simpch2))
                _cover = len(match2) * 100 / max(len(simpch1), len(simpch2))
                if _seqid >= seqid and _cover >= coverage:
                    LOGGER.debug(
                        '\tMatch: {0} residues match with {1:.0f}% '
                        'sequence identity and {2:.0f}% overlap.'.format(
                            len(match1), _seqid, _cover))
                    matches.append(
                        (match1, match2, _seqid, _cover, simpch1, simpch2))
                else:
                    LOGGER.debug('\tFailed to match chains (seqid={0:.0f}%, '
                                 'overlap={1:.0f}%).'.format(_seqid, _cover))
        else:
            LOGGER.warning('Pairwise alignment could not be performed.')

    if not matches:
        return None

    # Stage 3: turn the matched residue pairs into paired atom indices.
    subset = _SUBSETS[subset]
    for mi, result in enumerate(matches):
        residues1, residues2, _seqid, _cover, simpch1, simpch2 = result

        indices1 = []
        indices2 = []

        for ares, bres in zip(residues1, residues2):
            if subset == 'ca':
                try:
                    aid = ares.getNames().tolist().index('CA')
                except ValueError:
                    aid = None
                try:
                    bid = bres.getNames().tolist().index('CA')
                    # pair only when both residues have an alpha carbon
                    if aid is not None:
                        indices1.append(ares._indices[aid])
                        indices2.append(bres._indices[bid])
                except ValueError:
                    pass
            elif subset == 'bb':
                for bban in ('N', 'CA', 'C', 'O'):
                    try:
                        aid = ares.getNames().tolist().index(bban)
                    except ValueError:
                        continue
                    try:
                        bid = bres.getNames().tolist().index(bban)
                    except ValueError:
                        continue
                    else:
                        indices1.append(ares._indices[aid])
                        indices2.append(bres._indices[bid])
            elif subset == 'noh':
                for han, aid, noh in zip(ares.getNames(), ares._indices,
                                         ares.getFlags('noh')):
                    if not noh:
                        continue
                    try:
                        bid = bres.getNames().tolist().index(han)
                    except ValueError:
                        continue
                    else:
                        indices1.append(aid)
                        indices2.append(bres._indices[bid])
            # equality, not identity: ``is 'all'`` only worked by virtue of
            # string interning
            elif subset is None or subset == 'all':
                aans = ares.getNames()
                bans = bres.getNames().tolist()
                aids = ares.getIndices()
                for j in range(len(aans)):
                    try:
                        bid = bres._indices[bans.index(aans[j])]
                        indices1.append(aids[j])
                        indices2.append(bid)
                    except ValueError:
                        pass

        indices1 = np.array(indices1, int)
        indices2 = np.array(indices2, int)

        match1 = AM(atoms1, indices1, atoms1.getACSIndex(),
                    title=simpch1.getTitle() + ' -> ' + simpch2.getTitle(),
                    intarrays=True)
        match2 = AM(atoms2, indices2, atoms2.getACSIndex(),
                    title=simpch2.getTitle() + ' -> ' + simpch1.getTitle(),
                    intarrays=True)

        matches[mi] = (match1, match2, _seqid, _cover)

    if len(matches) > 1:
        # key-based sort works on both Python 2 and 3 (the ``cmp`` builtin
        # and positional comparison functions were removed in Python 3);
        # it is stable, so ties keep their discovery order as before
        matches.sort(key=lambda m: m[2], reverse=True)
    return matches
def mapOntoChain(atoms, chain, **kwargs):
    """Map *atoms* onto *chain*.  This function returns a list of mappings.
    Each mapping is a tuple that contains 4 items:

      * Mapped chain as an :class:`.AtomMap` instance,
      * *chain* as an :class:`.AtomMap` instance,
      * Percent sequence identity,
      * Percent sequence overlap

    Mappings are returned in decreasing percent sequence identity order.
    :class:`.AtomMap` that keeps mapped atom indices contains dummy atoms
    in place of unmapped atoms.  An empty list is returned when no chain
    could be mapped.

    :arg atoms: atoms that will be mapped to the target *chain*
    :type atoms: :class:`.Chain`, :class:`.AtomGroup`, :class:`.Selection`

    :arg chain: chain to which atoms will be mapped
    :type chain: :class:`.Chain`

    :keyword subset: one of the following well-defined subsets of atoms:
        ``"calpha"`` (or ``"ca"``), ``"backbone"`` (or ``"bb"``),
        ``"heavy"`` (or ``"noh"``), or ``"all"``, default is ``"calpha"``
    :type subset: string

    :keyword seqid: percent sequence identity, default is 90
    :type seqid: float

    :keyword overlap: percent overlap, default is 70; the keyword
        *coverage* is read instead when *overlap* is not given
    :type overlap: float

    :keyword pwalign: perform pairwise sequence alignment
    :type pwalign: bool

    :keyword fast: when not **False**, debug messages of the residue
        number based mapping stage are suppressed, default is **False**
    :type fast: bool

    :raises TypeError: when *atoms* or *chain* is of an unsupported type
    :raises ValueError: when *subset* is not a recognized subset name

    This function tries to map *atoms* to *chain* based on residue
    numbers and types.  Each individual chain in *atoms* is compared to
    target *chain*.  This works well for different structures of the same
    protein.  When it fails, :mod:`Bio.pairwise2` is used for sequence
    alignment, and mapping is performed based on the sequence alignment.
    User can control, whether sequence alignment is performed or not with
    *pwalign* keyword.  If ``pwalign=True`` is passed, pairwise alignment is
    enforced; if ``pwalign=False`` is passed, it is never attempted."""

    target_chain = chain
    if not isinstance(atoms, (AtomGroup, Chain, Selection)):
        raise TypeError('atoms must be an AtomGroup, a Chain, or a '
                        'Selection instance')
    if not isinstance(target_chain, Chain):
        raise TypeError('chain must be Chain instance')

    subset = str(kwargs.get('subset', 'calpha')).lower()
    if subset not in _SUBSETS:
        raise ValueError('{0} is not a valid subset argument'.format(
            str(subset)))
    seqid = kwargs.get('seqid', 90.)
    coverage = kwargs.get('overlap')
    if coverage is None:
        coverage = kwargs.get('coverage', 70.)
    pwalign = kwargs.get('pwalign', None)
    fast = kwargs.get('fast', False)
    verbose = fast is False

    # Collect the chains to be mapped and the atom group they index into.
    if isinstance(atoms, Chain):
        chains = [atoms]
        map_ag = atoms.getAtomGroup()
    else:
        if isinstance(atoms, AtomGroup):
            map_ag = atoms
        else:
            map_ag = atoms.getAtomGroup()
        chains = list(atoms.getHierView().iterChains())
        LOGGER.debug('Evaluating {0}: {1} chains are identified'.format(
            str(atoms), len(chains)))

    # Restrict the target to the requested subset of atoms.
    if subset != 'all':
        target_chain = target_chain.select(subset).getHierView()[
            target_chain.getChid()]

    mappings = []
    unmapped = []
    target_ag = target_chain.getAtomGroup()
    simple_target = SimpleChain(target_chain, True)

    # Stage 1: mapping based on residue numbers and identities.
    if verbose:
        LOGGER.debug('Trying to map atoms based on residue numbers and '
                     'identities:')
    for mobile_chain in chains:
        # NOTE(review): ``True`` is passed as the first positional argument
        # here, whereas the target above is built with
        # ``SimpleChain(target_chain, True)`` -- confirm the intended setting.
        simple_chain = SimpleChain(True)
        simple_chain.buildFromChain(mobile_chain)
        if len(simple_chain) == 0:
            if verbose:
                LOGGER.debug(
                    ' Skipping {0}, which does not contain any amino '
                    'acid residues.'.format(simple_chain))
            continue
        if verbose:
            LOGGER.debug(' Comparing {0} (len={1}) with {2}:'.format(
                simple_chain.getTitle(), len(simple_chain),
                simple_target.getTitle()))

        target_list, chain_list, n_match, n_mapped = getTrivialMapping(
            simple_target, simple_chain)
        if n_mapped > 0:
            _seqid = n_match * 100 / n_mapped
            _cover = n_mapped * 100 / max(len(simple_target),
                                          len(simple_chain))
        else:
            _seqid = 0
            _cover = 0

        if _seqid >= seqid and _cover >= coverage:
            if verbose:
                LOGGER.debug('\tMapped: {0} residues match with {1:.0f}% '
                             'sequence identity and {2:.0f}% overlap.'.format(
                                 n_mapped, _seqid, _cover))
            mappings.append((target_list, chain_list, _seqid, _cover))
        else:
            if verbose:
                LOGGER.debug(
                    '\tFailed to match chains based on residue numbers '
                    '(seqid={0:.0f}%, overlap={1:.0f}%).'.format(
                        _seqid, _cover))
            unmapped.append(simple_chain)

    # Stage 2: pairwise alignment of the chains that failed stage 1.  It runs
    # when explicitly requested, or as a fallback when nothing was mapped and
    # the caller expressed no preference (pwalign is None).  Messages of this
    # stage are logged regardless of *fast*, as in the original code.
    if pwalign or (not mappings and pwalign is None):
        LOGGER.debug(
            'Trying to map atoms based on {0} sequence alignment:'.format(
                ALIGNMENT_METHOD))
        for simple_chain in unmapped:
            LOGGER.debug(' Comparing {0} (len={1}) with {2}:'.format(
                simple_chain.getTitle(), len(simple_chain),
                simple_target.getTitle()))
            result = getAlignedMapping(simple_target, simple_chain)
            if result is not None:
                target_list, chain_list, n_match, n_mapped = result
                if n_mapped > 0:
                    _seqid = n_match * 100 / n_mapped
                    _cover = n_mapped * 100 / max(len(simple_target),
                                                  len(simple_chain))
                else:
                    _seqid = 0
                    _cover = 0
                if _seqid >= seqid and _cover >= coverage:
                    LOGGER.debug(
                        '\tMapped: {0} residues match with {1:.0f}%'
                        ' sequence identity and {2:.0f}% overlap.'.format(
                            n_mapped, _seqid, _cover))
                    mappings.append((target_list, chain_list, _seqid, _cover))
                else:
                    LOGGER.debug('\tFailed to match chains (seqid={0:.0f}%, '
                                 'overlap={1:.0f}%).'.format(_seqid, _cover))

    # Stage 3: convert residue pairs into AtomMap instances.  Every target
    # atom gets a slot; slots without a partner atom become dummies.
    for mi, result in enumerate(mappings):
        residues_target, residues_chain, _seqid, _cover = result
        indices_target = []
        indices_chain = []
        indices_mapping = []
        indices_dummies = []
        counter = 0
        for res_tar, res_chn in zip(residues_target, residues_chain):
            for atom_tar in res_tar:
                indices_target.append(atom_tar.getIndex())
                if res_chn is not None:
                    atom_chn = res_chn.getAtom(atom_tar.getName())
                    if atom_chn is not None:
                        indices_chain.append(atom_chn.getIndex())
                        indices_mapping.append(counter)
                    else:
                        indices_dummies.append(counter)
                else:
                    indices_dummies.append(counter)
                counter += 1

        ch_tar = next((r for r in residues_target if r is not None)).getChain()
        ch_chn = next((r for r in residues_chain if r is not None)).getChain()
        title_tar = 'Chain {0} from {1}'.format(
            ch_tar.getChid(), ch_tar.getAtomGroup().getTitle())
        title_chn = 'Chain {0} from {1}'.format(
            ch_chn.getChid(), ch_chn.getAtomGroup().getTitle())

        # The original code read the ACS index from the loop variable that
        # shadowed the *chain* argument, i.e. from the last chain iterated in
        # *atoms*; ``chains[-1]`` is that same object, stated explicitly.
        atommap = AM(map_ag, indices_chain, chains[-1].getACSIndex(),
                     mapping=indices_mapping, dummies=indices_dummies,
                     title=title_chn + ' -> ' + title_tar)
        selection = AM(target_ag, indices_target, target_chain.getACSIndex(),
                       title=title_tar + ' -> ' + title_chn, intarrays=True)

        mappings[mi] = (atommap, selection, _seqid, _cover)

    if len(mappings) > 1:
        # key-based sort works on both Python 2 and 3 (the ``cmp`` builtin
        # and positional comparison functions were removed in Python 3);
        # it is stable, so ties keep their discovery order as before
        mappings.sort(key=lambda m: m[2], reverse=True)
    return mappings
def mapOntoChain(atoms, chain, **kwargs):
    """Map *atoms* onto *chain*.  This function returns a list of mappings.
    Each mapping is a tuple that contains 4 items:

      * Mapped chain as an :class:`.AtomMap` instance,
      * *chain* as an :class:`.AtomMap` instance,
      * Percent sequence identity,
      * Percent sequence overlap

    Mappings are returned in decreasing percent sequence identity order
    (ties are ordered by decreasing overlap).  :class:`.AtomMap` that keeps
    mapped atom indices contains dummy atoms in place of unmapped atoms.
    An empty list is returned when no chain could be mapped.

    :arg atoms: atoms that will be mapped to the target *chain*
    :type atoms: :class:`.Chain`, :class:`.AtomGroup`, :class:`.Selection`

    :arg chain: chain to which atoms will be mapped
    :type chain: :class:`.Chain`

    :keyword subset: one of the following well-defined subsets of atoms:
        ``"calpha"`` (or ``"ca"``), ``"backbone"`` (or ``"bb"``),
        ``"heavy"`` (or ``"noh"``), or ``"all"``, default is ``"calpha"``
    :type subset: string

    :keyword seqid: percent sequence identity, default is **90**; when the
        CE algorithm or a predefined alignment is used and *seqid* is not
        given, **0** is used for the alignment stage
    :type seqid: float

    :keyword overlap: percent overlap, default is **70**; the keyword
        *coverage*, when given, takes precedence
    :type overlap: float

    :keyword mapping: if ``"ce"`` or ``"cealign"``, then the CE algorithm
        [IS98]_ will be performed.  It can also be a list of prealigned
        sequences, a :class:`.MSA` instance, or a dict of indices such as
        that derived from a :class:`.DaliRecord`.  If set to anything other
        than the options listed above, including the default value
        (**None**), a simple mapping will be first attempted and if that
        failed then sequence alignment with a function from
        :mod:`~Bio.pairwise2` will be used unless *pwalign* is set to
        **False**, in which case the mapping will fail.
    :type mapping: list, str

    :keyword pwalign: if **True**, then pairwise sequence alignment will
        be performed.  If **False** then a simple mapping will be performed
        based on residue numbers (as well as insertion codes).  This will be
        overridden by the *mapping* keyword's value.
    :type pwalign: bool

    :raises TypeError: when *atoms* or *chain* is of an unsupported type
    :raises ValueError: when *subset* is not a recognized subset name

    This function tries to map *atoms* to *chain* based on residue
    numbers and types.  Each individual chain in *atoms* is compared to
    target *chain*.

    .. [IS98] Shindyalov IN, Bourne PE. Protein structure alignment by
       incremental combinatorial extension (CE) of the optimal path.
       *Protein engineering* **1998** 11(9):739-47.
    """

    if not isinstance(atoms, (AtomGroup, AtomSubset)):
        raise TypeError('atoms must be an AtomGroup or a AtomSubset instance')
    if not isinstance(chain, Chain):
        raise TypeError('chain must be Chain instance')

    subset = str(kwargs.get('subset', 'calpha')).lower()
    if subset not in _SUBSETS:
        raise ValueError('{0} is not a valid subset argument'.format(
            str(subset)))
    seqid = kwargs.get('seqid', 90.)
    # *coverage* wins over *overlap* when both are given
    coverage = kwargs.get('overlap', 70.)
    coverage = kwargs.get('coverage', coverage)
    # *mapping* wins over *pwalign* when both are given
    pwalign = kwargs.get('pwalign', None)
    pwalign = kwargs.get('mapping', pwalign)
    alignment = None

    # Normalize the tri-state flag: a string selects a method by name, a bool
    # is taken as is, and any other object is treated as a predefined
    # alignment that forces the alignment stage.
    if pwalign is not None:
        if isinstance(pwalign, basestring):
            pwalign = str(pwalign).strip().lower()
        elif not isinstance(pwalign, bool):
            alignment = pwalign
            pwalign = True

    # Restrict both the target chain and the mobile atoms to the subset.
    if subset != 'all':
        chid = chain.getChid()
        segname = chain.getSegname()
        chain_subset = chain.select(subset)
        target_chain = chain_subset.getHierView()[segname, chid]

        mobile = atoms.select(subset)
    else:
        target_chain = chain
        mobile = atoms

    # Collect the chains to be mapped and the atom group they index into.
    if isinstance(mobile, Chain):
        chains = [mobile]
        map_ag = mobile.getAtomGroup()
    else:
        if isinstance(mobile, AtomGroup):
            map_ag = mobile
        else:
            map_ag = mobile.getAtomGroup()
        chains = list(mobile.getHierView().iterChains())
        LOGGER.debug('Evaluating {0}: {1} chains are identified'.format(
            str(atoms), len(chains)))

    mappings = []
    unmapped = []
    target_ag = target_chain.getAtomGroup()
    simple_target = SimpleChain(target_chain, False)

    # Stage 1: mapping based on residue numbers and identities.
    LOGGER.debug('Trying to map atoms based on residue numbers and '
                 'identities:')
    for mobile_chain in chains:
        simple_chain = SimpleChain(mobile_chain, False)
        if len(simple_chain) == 0:
            LOGGER.debug(' Skipping {0}, which does not contain any amino '
                         'acid residues.'.format(simple_chain))
            continue
        LOGGER.debug(' Comparing {0} (len={1}) with {2}:'.format(
            simple_chain.getTitle(), len(simple_chain),
            simple_target.getTitle()))

        # trivial mapping serves as a first simple trial of aligning the two
        # sequences based on residue number, therefore the sequence identity
        # (TRIVIAL_SEQID) criterion is strict.
        _seqid = _cover = -1
        target_list, chain_list, n_match, n_mapped = getTrivialMapping(
            simple_target, simple_chain)
        if n_mapped > 0:
            _seqid = n_match * 100 / n_mapped
            _cover = n_mapped * 100 / max(len(simple_target),
                                          len(simple_chain))

        # When an alignment stage is going to follow, the trivial mapping is
        # held to the TRIVIAL_* thresholds; otherwise the caller's thresholds
        # apply directly.
        trivial_seqid = TRIVIAL_SEQID if pwalign else seqid
        trivial_cover = TRIVIAL_COVERAGE if pwalign else coverage
        if _seqid >= trivial_seqid and _cover >= trivial_cover:
            LOGGER.debug('\tMapped: {0} residues match with {1:.0f}% '
                         'sequence identity and {2:.0f}% overlap.'.format(
                             n_mapped, _seqid, _cover))
            mappings.append((target_list, chain_list, _seqid, _cover))
        else:
            if not pwalign:
                LOGGER.debug(
                    '\tFailed to match chains based on residue numbers '
                    '(seqid={0:.0f}%, overlap={1:.0f}%).'.format(
                        _seqid, _cover))
            unmapped.append(simple_chain)

    # Fall back to alignment when nothing was mapped and the caller expressed
    # no preference.
    if not mappings and pwalign is None:
        pwalign = True

    # Stage 2: alignment-based mapping of the chains that failed stage 1.
    if pwalign and unmapped:
        if alignment is None:
            if pwalign in ('ce', 'cealign'):
                aln_type = 'structure alignment'
                method = 'CE'
                # sequence identity is not enforced for structure alignment
                # unless the caller asked for it explicitly
                if 'seqid' not in kwargs:
                    seqid = 0.
            else:
                aln_type = 'sequence alignment'
                method = ALIGNMENT_METHOD
        else:
            aln_type = 'alignment'
            method = 'predefined'
            if 'seqid' not in kwargs:
                seqid = 0.

        LOGGER.debug('Trying to map atoms based on {0} {1}:'.format(
            method, aln_type))

        for simple_chain in unmapped:
            LOGGER.debug(' Comparing {0} (len={1}) with {2}:'.format(
                simple_chain.getTitle(), len(simple_chain),
                simple_target.getTitle()))
            if method == 'CE':
                result = getCEAlignMapping(simple_target, simple_chain)
            elif isinstance(alignment, dict):
                result = getDictMapping(simple_target, simple_chain,
                                        map_dict=alignment)
            else:
                result = getAlignedMapping(simple_target, simple_chain,
                                           alignment=alignment)

            if result is not None:
                target_list, chain_list, n_match, n_mapped = result
                if n_mapped > 0:
                    _seqid = n_match * 100 / n_mapped
                    _cover = n_mapped * 100 / max(len(simple_target),
                                                  len(simple_chain))
                else:
                    _seqid = 0
                    _cover = 0
                if _seqid >= seqid and _cover >= coverage:
                    LOGGER.debug(
                        '\tMapped: {0} residues match with {1:.0f}%'
                        ' sequence identity and {2:.0f}% overlap.'.format(
                            n_mapped, _seqid, _cover))
                    mappings.append((target_list, chain_list, _seqid, _cover))
                else:
                    LOGGER.debug('\tFailed to match chains (seqid={0:.0f}%, '
                                 'overlap={1:.0f}%).'.format(_seqid, _cover))

    # Stage 3: convert residue pairs into AtomMap instances.  Every target
    # atom gets a slot; slots without a partner atom become dummies.
    for mi, result in enumerate(mappings):
        residues_target, residues_chain, _seqid, _cover = result
        indices_target = []
        indices_chain = []
        indices_mapping = []
        indices_dummies = []
        counter = 0
        for res_tar, res_chn in zip(residues_target, residues_chain):
            for atom_tar in res_tar:
                indices_target.append(atom_tar.getIndex())
                if res_chn is not None:
                    atom_chn = res_chn.getAtom(atom_tar.getName())
                    if atom_chn is not None:
                        indices_chain.append(atom_chn.getIndex())
                        indices_mapping.append(counter)
                    else:
                        indices_dummies.append(counter)
                else:
                    indices_dummies.append(counter)
                counter += 1

        ch_tar = next((r for r in residues_target if r is not None)).getChain()
        ch_chn = next((r for r in residues_chain if r is not None)).getChain()
        title_tar = 'Chain {0} from {1}'.format(
            ch_tar.getChid(), ch_tar.getAtomGroup().getTitle())
        title_chn = 'Chain {0} from {1}'.format(
            ch_chn.getChid(), ch_chn.getAtomGroup().getTitle())

        # The original code read the ACS index from the loop variable that
        # shadowed the *chain* argument, i.e. from the last chain iterated in
        # *atoms*; ``chains[-1]`` is that same object, stated explicitly.
        atommap = AM(map_ag, indices_chain, chains[-1].getACSIndex(),
                     mapping=indices_mapping, dummies=indices_dummies,
                     title=title_chn + ' -> ' + title_tar)
        selection = AM(target_ag, indices_target, target_chain.getACSIndex(),
                       title=title_tar + ' -> ' + title_chn, intarrays=True)

        mappings[mi] = (atommap, selection, _seqid, _cover)

    if len(mappings) > 1:
        # sort on (seqid, overlap), best first
        mappings.sort(key=lambda m: m[-2:], reverse=True)
    return mappings