class Hit(_BaseSearchObject):
    """Class representing a single database hit of a search result.

    Hit objects are the second-level container in the SearchIO module. They
    are the objects contained within a QueryResult (see QueryResult). They
    themselves are containers for HSP objects and will contain at least one
    HSP.

    To have a quick look at a Hit and its contents, invoke ``print`` on it::

        >>> from Bio import SearchIO
        >>> qresult = next(SearchIO.parse('Blast/mirna.xml', 'blast-xml'))
        >>> hit = qresult[3]
        >>> print(hit)
        Query: 33211
         mir_1
         Hit: gi|301171322|ref|NR_035857.1| (86)
         Pan troglodytes microRNA mir-520c (MIR520C), microRNA
         HSPs: ---- -------- --------- ------ --------------- ---------------------
                  #  E-value Bit score   Span     Query range             Hit range
               ---- -------- --------- ------ --------------- ---------------------
                  0  8.9e-20    100.47     60          [1:61]               [13:73]
                  1  3.3e-06     55.39     60          [0:60]               [13:73]

    You can invoke ``len`` on a Hit object to see how many HSP objects it
    contains::

        >>> len(hit)
        2

    Hit objects behave very similar to Python lists. You can retrieve the HSP
    object inside a Hit using the HSP's integer index. Hit objects can also be
    sliced, which will return a new Hit object containing only the sliced
    HSPs::

        # HSP items inside the Hit can be retrieved using its integer index
        >>> hit[0]
        HSP(hit_id='gi|301171322|ref|NR_035857.1|', query_id='33211', 1 fragments)

        # slicing returns a new Hit
        >>> hit
        Hit(id='gi|301171322|ref|NR_035857.1|', query_id='33211', 2 hsps)
        >>> hit[:1]
        Hit(id='gi|301171322|ref|NR_035857.1|', query_id='33211', 1 hsps)
        >>> print(hit[1:])
        Query: 33211
         mir_1
         Hit: gi|301171322|ref|NR_035857.1| (86)
         Pan troglodytes microRNA mir-520c (MIR520C), microRNA
         HSPs: ---- -------- --------- ------ --------------- ---------------------
                  #  E-value Bit score   Span     Query range             Hit range
               ---- -------- --------- ------ --------------- ---------------------
                  0  3.3e-06     55.39     60          [0:60]               [13:73]

    Hit objects provide ``filter`` and ``map`` methods, which are analogous to
    Python's built-in ``filter`` and ``map`` except that they return a new Hit
    object instead of a list.

    Here is an example of using ``filter`` to select for HSPs whose e-value is
    less than 1e-10::

        >>> evalue_filter = lambda hsp: hsp.evalue < 1e-10
        >>> filtered_hit = hit.filter(evalue_filter)
        >>> len(hit)
        2
        >>> len(filtered_hit)
        1
        >>> print(filtered_hit)
        Query: 33211
         mir_1
         Hit: gi|301171322|ref|NR_035857.1| (86)
         Pan troglodytes microRNA mir-520c (MIR520C), microRNA
         HSPs: ---- -------- --------- ------ --------------- ---------------------
                  #  E-value Bit score   Span     Query range             Hit range
               ---- -------- --------- ------ --------------- ---------------------
                  0  8.9e-20    100.47     60          [1:61]               [13:73]

    There are also other methods which are counterparts of Python lists'
    methods with the same names: ``append``, ``index``, ``pop``, and ``sort``.
    Consult their respective documentations for more details and examples of
    their usage.

    """

    # attributes we don't want to transfer when creating a new Hit class
    # from this one
    _NON_STICKY_ATTRS = ("_items", )

    def __init__(self, hsps=(), id=None, query_id=None):
        """Initialize a Hit object.

        :param hsps: HSP objects contained in the Hit object
        :type hsps: iterable yielding HSP
        :param id: hit ID
        :type id: string
        :param query_id: query ID
        :type query_id: string

        If multiple HSP objects are used for initialization, they must all
        have the same ``query_id``, ``query_description``, ``hit_id``, and
        ``hit_description`` properties.

        :raises ValueError: if the given HSPs disagree on any of the four
            properties listed above.
        :raises TypeError: if any item is not an HSP object.
        """
        # default attribute values
        self._id = id
        self._id_alt = []
        self._query_id = query_id
        self._description = None
        self._description_alt = []
        self._query_description = None
        self.attributes = {}
        self.dbxrefs = []

        # The HSPs are walked once per checked attribute and once more when
        # they are stored; materialize them first so that a single-use
        # iterator (e.g. a generator) is not exhausted by the first pass,
        # which would silently produce an empty Hit.
        hsps = list(hsps)

        for attr in ("query_id", "query_description", "hit_id",
                     "hit_description"):
            # HACK: setting the if clause to '> 1' allows for empty hit objects.
            # This makes it easier to work with file formats with unpredictable
            # hit-hsp ordering. The empty hit object itself is nonfunctional,
            # however, since all its cascading properties are empty.
            if len({getattr(hsp, attr) for hsp in hsps}) > 1:
                raise ValueError(
                    "Hit object can not contain HSPs with more than one %s."
                    % attr)

        self._items = []
        for hsp in hsps:
            # validate each HSP
            self._validate_hsp(hsp)
            # and store it as an instance attribute
            self.append(hsp)

    def __repr__(self):
        """Return string representation of Hit object."""
        return "Hit(id=%r, query_id=%r, %r hsps)" % (self.id, self.query_id,
                                                     len(self))

    def __iter__(self):
        """Iterate over hsps."""
        return iter(self.hsps)

    def __len__(self):
        """Return number of hsps."""
        return len(self.hsps)

    # Python 3:
    def __bool__(self):
        """Return True if there are hsps."""
        return bool(self.hsps)

    # Python 2:
    __nonzero__ = __bool__

    def __contains__(self, hsp):
        """Return True if hsp in items."""
        return hsp in self._items

    def __str__(self):
        """Return a human readable summary of the Hit object."""
        lines = []

        # set query id line
        qid_line = "Query: %s" % self.query_id
        if self.query_description:
            qid_line += trim_str("\n %s" % self.query_description, 80, "...")
        lines.append(qid_line)

        # set hit id line
        hid_line = " Hit: %s" % self.id
        if hasattr(self, "seq_len"):
            hid_line += " (%i)" % self.seq_len
        if self.description:
            hid_line += trim_str("\n %s" % self.description, 80, "...")
        lines.append(hid_line)

        # set attributes lines
        for key, value in sorted(self.attributes.items()):
            lines.append(" %s: %s" % (key, value))

        # set dbxrefs line
        if self.dbxrefs:
            lines.append("Database cross-references: "
                         + ", ".join(self.dbxrefs))

        # set hsp line and table
        if not self.hsps:
            lines.append(" HSPs: ?")
        else:
            lines.append(
                " HSPs: %s %s %s %s %s %s"
                % ("-" * 4, "-" * 8, "-" * 9, "-" * 6, "-" * 15, "-" * 21))
            pattern = "%11s %8s %9s %6s %15s %21s"
            lines.append(pattern % ("#", "E-value", "Bit score", "Span",
                                    "Query range", "Hit range"))
            lines.append(
                pattern
                % ("-" * 4, "-" * 8, "-" * 9, "-" * 6, "-" * 15, "-" * 21))
            for idx, hsp in enumerate(self.hsps):
                # evalue
                evalue = getattr_str(hsp, "evalue", fmt="%.2g")
                # bitscore
                bitscore = getattr_str(hsp, "bitscore", fmt="%.2f")
                # alignment length
                aln_span = getattr_str(hsp, "aln_span")
                # query region
                query_start = getattr_str(hsp, "query_start")
                query_end = getattr_str(hsp, "query_end")
                query_range = "[%s:%s]" % (query_start, query_end)
                # max column length is 15
                query_range = trim_str(query_range, 15, "~]")
                # hit region
                hit_start = getattr_str(hsp, "hit_start")
                hit_end = getattr_str(hsp, "hit_end")
                hit_range = "[%s:%s]" % (hit_start, hit_end)
                # max column length is 21
                hit_range = trim_str(hit_range, 21, "~]")
                # append the hsp row
                lines.append(pattern % (str(idx), evalue, bitscore, aln_span,
                                        query_range, hit_range))

        return "\n".join(lines)

    def __getitem__(self, idx):
        """Return the HSP object at the given index.

        A slice returns a new Hit holding the selected HSPs, with this Hit's
        attributes transferred to it.
        """
        # if key is slice, return a new Hit instance
        if isinstance(idx, slice):
            obj = self.__class__(self.hsps[idx])
            self._transfer_attrs(obj)
            return obj
        return self._items[idx]

    def __setitem__(self, idx, hsps):
        """Assign hsps to index idx.

        Every HSP is validated before the assignment takes place.
        """
        # handle case if hsps is a list of hsp
        if isinstance(hsps, (list, tuple)):
            for hsp in hsps:
                self._validate_hsp(hsp)
        else:
            self._validate_hsp(hsps)

        self._items[idx] = hsps

    def __delitem__(self, idx):
        """Delete item of index idx."""
        del self._items[idx]

    # hsp properties #
    def _validate_hsp(self, hsp):
        """Validate an HSP object (PRIVATE).

        Valid HSP objects have the same hit_id as the Hit object ID and the
        same query_id as the Hit object's query_id. The hit and query
        descriptions are compared in the same way. Any of these four values
        that is still ``None`` on the Hit is adopted from the HSP instead of
        being compared.

        :raises TypeError: if ``hsp`` is not an HSP instance.
        :raises ValueError: if an ID or description does not match.
        """
        if not isinstance(hsp, HSP):
            raise TypeError("Hit objects can only contain HSP objects.")
        # HACK: to make validation during __init__ work
        if self._items:
            if self.id is not None:
                if hsp.hit_id != self.id:
                    raise ValueError(
                        "Expected HSP with hit ID %r, found %r instead."
                        % (self.id, hsp.hit_id))
            else:
                self.id = hsp.hit_id

            if self.description is not None:
                if hsp.hit_description != self.description:
                    raise ValueError(
                        "Expected HSP with hit description %r, found %r instead."
                        % (self.description, hsp.hit_description))
            else:
                self.description = hsp.hit_description

            if self.query_id is not None:
                if hsp.query_id != self.query_id:
                    raise ValueError(
                        "Expected HSP with query ID %r, found %r instead."
                        % (self.query_id, hsp.query_id))
            else:
                self.query_id = hsp.query_id

            if self.query_description is not None:
                if hsp.query_description != self.query_description:
                    raise ValueError(
                        "Expected HSP with query description %r, found %r instead."
                        % (self.query_description, hsp.query_description))
            else:
                self.query_description = hsp.query_description

    # properties #
    description = optionalcascade("_description", "hit_description",
                                  """Hit description""")
    query_description = optionalcascade(
        "_query_description",
        "query_description",
        """Description of the query that produced the hit""",
    )
    id = optionalcascade("_id", "hit_id", """Hit ID string.""")
    query_id = optionalcascade(
        "_query_id", "query_id",
        """ID string of the query that produced the hit""")
    # returns all hsps
    hsps = allitems(doc="""HSP objects contained in the Hit""")

    @property
    def id_all(self):
        """Alternative ID(s) of the Hit."""
        return [self.id] + self._id_alt

    @property
    def description_all(self):
        """Alternative descriptions of the Hit."""
        return [self.description] + self._description_alt

    @property
    def fragments(self):
        """Access the HSPFragment objects contained in the Hit."""
        # each HSP is itself an iterable of fragments; flatten one level
        return list(chain.from_iterable(self._items))

    # public methods #
    def append(self, hsp):
        """Add a HSP object to the end of Hit.

        Parameters
        hsp -- HSP object to append.

        Any HSP object appended must have the same ``hit_id`` property as the
        Hit object's ``id`` property and the same ``query_id`` property as the
        Hit object's ``query_id`` property.

        """
        self._validate_hsp(hsp)
        self._items.append(hsp)

    def filter(self, func=None):
        """Create new Hit object whose HSP objects pass the filter function.

        :param func: function for filtering
        :type func: callable, accepts HSP, returns bool

        ``filter`` is analogous to Python's built-in ``filter`` function,
        except that instead of returning a list it returns a ``Hit`` object.
        If no HSP passes the filter, ``None`` is returned instead.
        Here is an example of using ``filter`` to select for HSPs having
        bitscores bigger than 60::

            >>> from Bio import SearchIO
            >>> qresult = next(SearchIO.parse('Blast/mirna.xml', 'blast-xml'))
            >>> hit = qresult[3]
            >>> bitscore_filter = lambda hsp: hsp.bitscore > 60
            >>> filtered_hit = hit.filter(bitscore_filter)
            >>> len(hit)
            2
            >>> len(filtered_hit)
            1
            >>> print(filtered_hit)
            Query: 33211
             mir_1
             Hit: gi|301171322|ref|NR_035857.1| (86)
             Pan troglodytes microRNA mir-520c (MIR520C), microRNA
             HSPs: ---- -------- --------- ------ --------------- ---------------------
                      #  E-value Bit score   Span     Query range             Hit range
                   ---- -------- --------- ------ --------------- ---------------------
                      0  8.9e-20    100.47     60          [1:61]               [13:73]

        """
        hsps = list(filter(func, self.hsps))
        if hsps:
            obj = self.__class__(hsps)
            self._transfer_attrs(obj)
            return obj
        # nothing passed the filter: fall through and return None

    def index(self, hsp):
        """Return the index of a given HSP object, zero-based.

        :param hsp: object to look up
        :type hsp: HSP

        """
        return self._items.index(hsp)

    def map(self, func=None):
        """Create new Hit object, mapping the given function to its HSPs.

        :param func: function for mapping
        :type func: callable, accepts HSP, returns HSP

        ``map`` is analogous to Python's built-in ``map`` function. It is
        applied to all HSPs contained in the Hit object and returns a new Hit
        object. If the Hit holds no HSPs, ``None`` is returned instead.

        """
        if func is not None:
            hsps = [func(x) for x in self.hsps[:]]  # this creates a shallow copy
        else:
            hsps = self.hsps[:]
        if hsps:
            obj = self.__class__(hsps)
            self._transfer_attrs(obj)
            return obj
        # no HSPs to map over: fall through and return None

    def pop(self, index=-1):
        """Remove and return the HSP object at the specified index.

        :param index: index of HSP object to pop
        :type index: int

        """
        return self._items.pop(index)

    def sort(self, key=None, reverse=False, in_place=True):
        """Sort the HSP objects.

        :param key: sorting function
        :type key: callable, accepts HSP, returns key for sorting
        :param reverse: whether to reverse sorting results or not
        :type reverse: bool
        :param in_place: whether to do in-place sorting or not
        :type in_place: bool

        ``sort`` defaults to sorting in-place, to mimic Python's ``list.sort``
        method. If you set the ``in_place`` argument to False, it will
        return a new, sorted Hit object and keep the initial one unsorted.

        """
        if in_place:
            self._items.sort(key=key, reverse=reverse)
        else:
            hsps = self.hsps[:]
            hsps.sort(key=key, reverse=reverse)
            obj = self.__class__(hsps)
            self._transfer_attrs(obj)
            return obj
class HSP(_BaseHSP):
    """Class representing high-scoring region(s) between query and hit.

    HSP (high-scoring pair) objects are contained by Hit objects (see Hit).
    In most cases, HSP objects store the bulk of the statistics and results
    (e.g. e-value, bitscores, query sequence, etc.) produced by a search
    program.

    Depending on the search output file format, a given HSP will contain one
    or more HSPFragment object(s). Examples of search programs that produce HSP
    with one HSPFragments are BLAST, HMMER, and FASTA. Other programs such as
    BLAT or Exonerate may produce HSPs containing more than one HSPFragment.
    However, their native terminologies may differ: in BLAT these fragments
    are called 'blocks' while in Exonerate they are called exons or NER.

    Here are examples from each type of HSP. The first one comes from a BLAST
    search::

        >>> from Bio import SearchIO
        >>> blast_qresult = next(SearchIO.parse('Blast/mirna.xml', 'blast-xml'))
        >>> blast_hsp = blast_qresult[1][0]     # the first HSP from the second hit
        >>> blast_hsp
        HSP(hit_id='gi|301171311|ref|NR_035856.1|', query_id='33211', 1 fragments)
        >>> print(blast_hsp)
              Query: 33211 mir_1
                Hit: gi|301171311|ref|NR_035856.1| Pan troglodytes microRNA mir-520b ...
        Query range: [1:61] (1)
          Hit range: [0:60] (1)
        Quick stats: evalue 1.7e-22; bitscore 109.49
          Fragments: 1 (60 columns)
             Query - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG
                     ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
               Hit - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG

    For HSPs with a single HSPFragment, you can invoke ``print`` on it and see
    the underlying sequence alignment, if it exists. This is not the case for
    HSPs with more than one HSPFragment. Below is an example, using an HSP
    from a BLAT search. Invoking ``print`` on these HSPs will instead show a
    table of the HSPFragment objects it contains::

        >>> blat_qresult = SearchIO.read('Blat/mirna.pslx', 'blat-psl', pslx=True)
        >>> blat_hsp = blat_qresult[1][0]       # the first HSP from the second hit
        >>> blat_hsp
        HSP(hit_id='chr11', query_id='blat_1', 2 fragments)
        >>> print(blat_hsp)
              Query: blat_1 <unknown description>
                Hit: chr11 <unknown description>
        Query range: [42:67] (-1)
          Hit range: [59018929:59018955] (1)
        Quick stats: evalue ?; bitscore ?
         Fragments: --- -------------- ---------------------- ----------------------
                       #           Span            Query range              Hit range
                     --- -------------- ---------------------- ----------------------
                       0              6                [61:67]    [59018929:59018935]
                       1             16                [42:58]    [59018939:59018955]

    Notice that in HSPs with more than one HSPFragments, the HSP's
    ``query_range`` and ``hit_range`` properties encompass all fragments it
    contains.

    You can check whether an HSP has more than one HSPFragments or not using
    the ``is_fragmented`` property::

        >>> blast_hsp.is_fragmented
        False
        >>> blat_hsp.is_fragmented
        True

    Since HSP objects are also containers similar to Python lists, you can
    access a single fragment in an HSP using its integer index::

        >>> blat_fragment = blat_hsp[0]
        >>> print(blat_fragment)
              Query: blat_1 <unknown description>
                Hit: chr11 <unknown description>
        Query range: [61:67] (-1)
          Hit range: [59018929:59018935] (1)
          Fragments: 1 (6 columns)
             Query - tatagt
               Hit - tatagt

    This applies to HSPs objects with a single fragment as well::

        >>> blast_fragment = blast_hsp[0]
        >>> print(blast_fragment)
              Query: 33211 mir_1
                Hit: gi|301171311|ref|NR_035856.1| Pan troglodytes microRNA mir-520b ...
        Query range: [1:61] (1)
          Hit range: [0:60] (1)
          Fragments: 1 (60 columns)
             Query - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG
                     ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
               Hit - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG

    Regardless of the search output file format, HSP objects provide the
    properties listed below. These properties always return values in a list,
    due to the HSP object itself being a list-like container. However, for
    HSP objects with a single HSPFragment, shortcut properties that fetches
    the item from the list are also provided.

    +----------------------+---------------------+-----------------------------+
    | Property             | Shortcut            | Value                       |
    +======================+=====================+=============================+
    | aln_all              | aln                 | HSP alignments as           |
    |                      |                     | MultipleSeqAlignment object |
    +----------------------+---------------------+-----------------------------+
    | aln_annotation_all   | aln_annotation      | dictionary of annotation(s) |
    |                      |                     | of all fragments' alignments|
    +----------------------+---------------------+-----------------------------+
    | fragments            | fragment            | HSPFragment objects         |
    +----------------------+---------------------+-----------------------------+
    | hit_all              | hit                 | hit sequence as SeqRecord   |
    |                      |                     | objects                     |
    +----------------------+---------------------+-----------------------------+
    | hit_features_all     | hit_features        | SeqFeatures of all hit      |
    |                      |                     | fragments                   |
    +----------------------+---------------------+-----------------------------+
    | hit_start_all        | hit_start*          | start coordinates of the    |
    |                      |                     | hit fragments               |
    +----------------------+---------------------+-----------------------------+
    | hit_end_all          | hit_end*            | end coordinates of the hit  |
    |                      |                     | fragments                   |
    +----------------------+---------------------+-----------------------------+
    | hit_span_all         | hit_span*           | sizes of each hit fragments |
    +----------------------+---------------------+-----------------------------+
    | hit_strand_all       | hit_strand          | strand orientations of the  |
    |                      |                     | hit fragments               |
    +----------------------+---------------------+-----------------------------+
    | hit_frame_all        | hit_frame           | reading frames of the hit   |
    |                      |                     | fragments                   |
    +----------------------+---------------------+-----------------------------+
    | hit_range_all        | hit_range           | tuples of start and end     |
    |                      |                     | coordinates of each hit     |
    |                      |                     | fragment                    |
    +----------------------+---------------------+-----------------------------+
    | query_all            | query               | query sequence as SeqRecord |
    |                      |                     | object                      |
    +----------------------+---------------------+-----------------------------+
    | query_features_all   | query_features      | SeqFeatures of all query    |
    |                      |                     | fragments                   |
    +----------------------+---------------------+-----------------------------+
    | query_start_all      | query_start*        | start coordinates of the    |
    |                      |                     | fragments                   |
    +----------------------+---------------------+-----------------------------+
    | query_end_all        | query_end*          | end coordinates of the      |
    |                      |                     | query fragments             |
    +----------------------+---------------------+-----------------------------+
    | query_span_all       | query_span*         | sizes of each query         |
    |                      |                     | fragments                   |
    +----------------------+---------------------+-----------------------------+
    | query_strand_all     | query_strand        | strand orientations of the  |
    |                      |                     | query fragments             |
    +----------------------+---------------------+-----------------------------+
    | query_frame_all      | query_frame         | reading frames of the query |
    |                      |                     | fragments                   |
    +----------------------+---------------------+-----------------------------+
    | query_range_all      | query_range         | tuples of start and end     |
    |                      |                     | coordinates of each query   |
    |                      |                     | fragment                    |
    +----------------------+---------------------+-----------------------------+

    Shortcuts marked with an asterisk may be used in HSPs with multiple
    fragments.

    For all types of HSP objects, the property will return the values in a
    list. Shortcuts are only applicable for HSPs with one fragment. Except the
    ones noted, if they are used on an HSP with more than one fragments, an
    exception will be raised.

    For properties that may be used in HSPs with multiple or single fragments
    (``*_start``, ``*_end``, and ``*_span`` properties), their interpretation
    depends on how many fragment the HSP has:

    +------------+---------------------------------------------------+
    | Property   | Value                                             |
    +============+===================================================+
    | hit_start  | smallest coordinate value of all hit fragments    |
    +------------+---------------------------------------------------+
    | hit_end    | largest coordinate value of all hit fragments     |
    +------------+---------------------------------------------------+
    | hit_span   | difference between ``hit_start`` and ``hit_end``  |
    +------------+---------------------------------------------------+
    | query_start| smallest coordinate value of all query fragments  |
    +------------+---------------------------------------------------+
    | query_end  | largest coordinate value of all query fragments   |
    +------------+---------------------------------------------------+
    | query_span | difference between ``query_start`` and            |
    |            | ``query_end``                                     |
    +------------+---------------------------------------------------+

    In addition to the objects listed above, HSP objects also provide the
    following properties:

    +--------------------+------------------------------------------------------+
    | Property           | Value                                                |
    +====================+======================================================+
    | aln_span           | total number of residues in all HSPFragment objects  |
    +--------------------+------------------------------------------------------+
    | alphabet           | alphabet used in hit and query SeqRecord objects     |
    +--------------------+------------------------------------------------------+
    | is_fragmented      | boolean, whether there are multiple fragments or not |
    +--------------------+------------------------------------------------------+
    | hit_id             | ID of the hit sequence                               |
    +--------------------+------------------------------------------------------+
    | hit_description    | description of the hit sequence                      |
    +--------------------+------------------------------------------------------+
    | hit_inter_ranges   | list of hit sequence coordinates of the regions      |
    |                    | between fragments                                    |
    +--------------------+------------------------------------------------------+
    | hit_inter_spans    | list of lengths of the regions between hit fragments |
    +--------------------+------------------------------------------------------+
    | query_id           | ID of the query sequence                             |
    +--------------------+------------------------------------------------------+
    | query_description  | description of the query sequence                    |
    +--------------------+------------------------------------------------------+
    | query_inter_ranges | list of query sequence coordinates of the regions    |
    |                    | between fragments                                    |
    +--------------------+------------------------------------------------------+
    | query_inter_spans  | list of lengths of the regions between query         |
    |                    | fragments                                            |
    +--------------------+------------------------------------------------------+

    """

    # attributes we don't want to transfer when creating a new HSP class
    # from this one
    _NON_STICKY_ATTRS = ('_items', )

    def __init__(self, fragments=()):
        """Initialize an HSP object.

        :param fragments: fragments contained in the HSP object
        :type fragments: iterable yielding HSPFragment

        HSP objects must be initialized with a list containing at least one
        HSPFragment object. If multiple HSPFragment objects are used for
        initialization, they must all have the same ``query_id``,
        ``query_description``, ``hit_id``, ``hit_description``, and alphabet
        properties.

        :raises ValueError: if no fragment is given, or if the fragments
            disagree on any of the properties listed above.
        :raises TypeError: if any item is not an HSPFragment object.
        """
        # The fragments are walked once per checked attribute and once more
        # when they are stored; materialize them first so that a single-use
        # iterator is not exhausted early and an empty iterator is reported
        # as empty rather than slipping past the truthiness test below.
        fragments = list(fragments)
        if not fragments:
            raise ValueError("HSP objects must have at least one HSPFragment "
                             "object.")
        # check that all fragments contain the same IDs, descriptions, alphabet
        for attr in ('query_id', 'query_description', 'hit_id',
                     'hit_description', 'alphabet'):
            if len({getattr(frag, attr) for frag in fragments}) != 1:
                raise ValueError("HSP object can not contain fragments with "
                                 "more than one %s." % attr)

        self._items = []
        for fragment in fragments:
            self._validate_fragment(fragment)
            self._items.append(fragment)

    def __repr__(self):
        """Return string representation of HSP object."""
        return "%s(hit_id=%r, query_id=%r, %r fragments)" % \
            (self.__class__.__name__, self.hit_id, self.query_id, len(self))

    def __iter__(self):
        """Iterate over the HSPFragment objects."""
        return iter(self._items)

    def __contains__(self, fragment):
        """Return True if the fragment is stored in this HSP."""
        return fragment in self._items

    def __len__(self):
        """Return the number of HSPFragment objects."""
        return len(self._items)

    # Python 3:
    def __bool__(self):
        """Return True if the HSP holds any fragment."""
        return bool(self._items)

    # Python 2:
    __nonzero__ = __bool__

    def __str__(self):
        """Return a human readable summary of the HSP object.

        A single-fragment HSP shows the header, the quick stats line and the
        fragment's alignment; a multi-fragment HSP shows the header, the
        quick stats line and a table with one row per fragment.
        """
        lines = []
        # set hsp info line
        statline = []
        # evalue
        evalue = getattr_str(self, 'evalue', fmt='%.2g')
        statline.append('evalue ' + evalue)
        # bitscore
        bitscore = getattr_str(self, 'bitscore', fmt='%.2f')
        statline.append('bitscore ' + bitscore)
        lines.append('Quick stats: ' + '; '.join(statline))

        if len(self.fragments) == 1:
            return '\n'.join([
                self._str_hsp_header(), '\n'.join(lines),
                self.fragments[0]._str_aln()
            ])
        else:
            lines.append(' Fragments: %s %s %s %s' %
                         ('-' * 3, '-' * 14, '-' * 22, '-' * 22))
            pattern = '%16s %14s %22s %22s'
            lines.append(pattern % ('#', 'Span', 'Query range', 'Hit range'))
            lines.append(pattern % ('-' * 3, '-' * 14, '-' * 22, '-' * 22))
            for idx, block in enumerate(self.fragments):
                # set hsp line and table
                # alignment span
                aln_span = getattr_str(block, 'aln_span')
                # query region
                query_start = getattr_str(block, 'query_start')
                query_end = getattr_str(block, 'query_end')
                query_range = '[%s:%s]' % (query_start, query_end)
                # max column length is 22
                query_range = trim_str(query_range, 22, '~]')
                # hit region
                hit_start = getattr_str(block, 'hit_start')
                hit_end = getattr_str(block, 'hit_end')
                hit_range = '[%s:%s]' % (hit_start, hit_end)
                hit_range = trim_str(hit_range, 22, '~]')
                # append the hsp row
                lines.append(pattern % (str(idx), aln_span, query_range,
                                        hit_range))

            return self._str_hsp_header() + '\n' + '\n'.join(lines)

    def __getitem__(self, idx):
        """Return the fragment at the given index, or a new HSP for a slice."""
        # if key is slice, return a new HSP instance
        if isinstance(idx, slice):
            obj = self.__class__(self._items[idx])
            self._transfer_attrs(obj)
            return obj
        return self._items[idx]

    def __setitem__(self, idx, fragments):
        """Assign fragment(s) to index idx after validating each of them."""
        # handle case if fragments is a list of fragments
        if isinstance(fragments, (list, tuple)):
            for fragment in fragments:
                self._validate_fragment(fragment)
        else:
            self._validate_fragment(fragments)

        self._items[idx] = fragments

    def __delitem__(self, idx):
        """Delete the fragment at index idx."""
        # note that this may result in an empty HSP object, which should be
        # invalid
        del self._items[idx]

    def _validate_fragment(self, fragment):
        """Validate an HSPFragment object (PRIVATE).

        Once the HSP holds at least one fragment, a new fragment must share
        the HSP's hit ID, hit description, query ID and query description.

        :raises TypeError: if ``fragment`` is not an HSPFragment instance.
        :raises ValueError: if an ID or description does not match.
        """
        if not isinstance(fragment, HSPFragment):
            raise TypeError("HSP objects can only contain HSPFragment "
                            "objects.")
        # HACK: to make validation during __init__ work
        if self._items:
            # The messages below read ``hit_id`` / ``hit_description``: this
            # class defines no ``id`` or ``description`` attribute, so using
            # those names would mask the ValueError with an AttributeError.
            if fragment.hit_id != self.hit_id:
                raise ValueError("Expected HSPFragment with hit ID %r, "
                                 "found %r instead." %
                                 (self.hit_id, fragment.hit_id))

            if fragment.hit_description != self.hit_description:
                raise ValueError("Expected HSPFragment with hit "
                                 "description %r, found %r instead." %
                                 (self.hit_description,
                                  fragment.hit_description))

            if fragment.query_id != self.query_id:
                raise ValueError("Expected HSPFragment with query ID %r, "
                                 "found %r instead." %
                                 (self.query_id, fragment.query_id))

            if fragment.query_description != self.query_description:
                raise ValueError(
                    "Expected HSP with query description %r, "
                    "found %r instead." %
                    (self.query_description, fragment.query_description))

    def _aln_span_get(self):
        """Return the sum of all fragments' ``aln_span`` values (PRIVATE)."""
        # length of all alignments
        # alignment span can be its own attribute, or computed from
        # query / hit length
        return sum(frg.aln_span for frg in self.fragments)

    aln_span = property(
        fget=_aln_span_get,
        doc="""Total number of columns in all HSPFragment objects.""")

    # coordinate properties #
    def _get_coords(self, seq_type, coord_type):
        """Return the named coordinate of every fragment as a list (PRIVATE).

        :param seq_type: ``'hit'`` or ``'query'``
        :param coord_type: ``'start'`` or ``'end'``

        A BiopythonWarning is issued when any coordinate is ``None``.
        """
        assert seq_type in ('hit', 'query')
        assert coord_type in ('start', 'end')
        coord_name = '%s_%s' % (seq_type, coord_type)
        coords = [getattr(frag, coord_name) for frag in self.fragments]
        if None in coords:
            # NOTE(review): the message says "ignored", yet the None values
            # are returned unfiltered; min()/max() over a list mixing None
            # and integers raises TypeError on Python 3 - confirm the
            # intended behavior before changing what is returned here.
            warnings.warn(
                "'None' exist in %s coordinates; ignored" % (coord_name),
                BiopythonWarning)
        return coords

    def _hit_start_get(self):
        """Return the smallest hit start coordinate (PRIVATE)."""
        return min(self._get_coords('hit', 'start'))

    hit_start = property(
        fget=_hit_start_get,
        doc="""Smallest coordinate value of all hit fragments""")

    def _query_start_get(self):
        """Return the smallest query start coordinate (PRIVATE)."""
        return min(self._get_coords('query', 'start'))

    query_start = property(
        fget=_query_start_get,
        doc="""Smallest coordinate value of all query fragments""")

    def _hit_end_get(self):
        """Return the largest hit end coordinate (PRIVATE)."""
        return max(self._get_coords('hit', 'end'))

    hit_end = property(fget=_hit_end_get,
                       doc="""Largest coordinate value of all hit fragments""")

    def _query_end_get(self):
        """Return the largest query end coordinate (PRIVATE)."""
        return max(self._get_coords('query', 'end'))

    query_end = property(
        fget=_query_end_get,
        doc="""Largest coordinate value of all query fragments""")

    # coordinate-dependent properties #
    def _hit_span_get(self):
        """Return ``hit_end - hit_start``, or None on a TypeError (PRIVATE)."""
        try:
            return self.hit_end - self.hit_start
        except TypeError:  # triggered if any of the coordinates are None
            return None

    hit_span = property(
        fget=_hit_span_get,
        doc="""The number of hit residues covered by the HSP.""")

    def _query_span_get(self):
        """Return ``query_end - query_start``, or None on a TypeError (PRIVATE)."""
        try:
            return self.query_end - self.query_start
        except TypeError:  # triggered if any of the coordinates are None
            return None

    query_span = property(
        fget=_query_span_get,
        doc="""The number of query residues covered by the HSP.""")

    def _hit_range_get(self):
        """Return the ``(hit_start, hit_end)`` tuple (PRIVATE)."""
        return (self.hit_start, self.hit_end)

    hit_range = property(fget=_hit_range_get,
                         doc="""Tuple of HSP hit start and end coordinates.""")

    def _query_range_get(self):
        """Return the ``(query_start, query_end)`` tuple (PRIVATE)."""
        return (self.query_start, self.query_end)

    query_range = property(
        fget=_query_range_get,
        doc="""Tuple of HSP query start and end coordinates.""")

    def _inter_ranges_get(self, seq_type):
        """Return the ranges lying between consecutive fragments (PRIVATE).

        :param seq_type: ``'query'`` or ``'hit'``

        Only the strand of the first fragment is consulted.
        """
        # this property assumes that there are no mixed strands in a hit/query
        assert seq_type in ('query', 'hit')
        strand = getattr(self, '%s_strand_all' % seq_type)[0]
        coords = getattr(self, '%s_range_all' % seq_type)

        # determine function used to set inter range
        # start and end coordinates, given two pairs
        # of fragment start and end coordinates
        if strand == -1:
            startfunc, endfunc = min, max
        else:
            startfunc, endfunc = max, min

        inter_coords = []
        # pair every fragment range with the range of the fragment after it
        for current, following in zip(coords, coords[1:]):
            start = startfunc(current)
            end = endfunc(following)
            inter_coords.append((min(start, end), max(start, end)))

        return inter_coords

    def _hit_inter_ranges_get(self):
        """Return the hit ranges between fragments (PRIVATE)."""
        return self._inter_ranges_get('hit')

    hit_inter_ranges = property(
        fget=_hit_inter_ranges_get,
        doc="""Hit sequence coordinates of the regions between fragments""")

    def _query_inter_ranges_get(self):
        """Return the query ranges between fragments (PRIVATE)."""
        return self._inter_ranges_get('query')

    query_inter_ranges = property(
        fget=_query_inter_ranges_get,
        doc="""Query sequence coordinates of the regions between fragments""")

    def _inter_spans_get(self, seq_type):
        """Return the length of every inter-fragment range (PRIVATE)."""
        assert seq_type in ('query', 'hit')
        attr_name = '%s_inter_ranges' % seq_type
        return [coord[1] - coord[0] for coord in getattr(self, attr_name)]

    def _hit_inter_spans_get(self):
        """Return the lengths of the regions between hit fragments (PRIVATE)."""
        return self._inter_spans_get('hit')

    hit_inter_spans = property(
        fget=_hit_inter_spans_get,
        doc="""Lengths of regions between hit fragments""")

    def _query_inter_spans_get(self):
        """Return the lengths of the regions between query fragments (PRIVATE)."""
        return self._inter_spans_get('query')

    query_inter_spans = property(
        fget=_query_inter_spans_get,
        doc="""Lengths of regions between query fragments""")

    # shortcuts for fragments' properties #

    # bool check if there's more than one fragments
    is_fragmented = property(
        lambda self: len(self) > 1,
        doc="""Whether the HSP has more than one HSPFragment objects""")

    # first item properties with setters
    hit_description = fullcascade('hit_description',
                                  doc="""Description of the hit sequence""")

    query_description = fullcascade(
        'query_description', doc="""Description of the query sequence""")

    hit_id = fullcascade('hit_id', doc="""ID of the hit sequence""")

    query_id = fullcascade('query_id', doc="""ID of the query sequence""")

    alphabet = fullcascade(
        'alphabet',
        doc="""Alphabet used in hit and query SeqRecord objects""")

    # properties for single-fragment HSPs
    fragment = singleitem(doc="""HSPFragment object, first fragment""")

    hit = singleitem(
        'hit', doc="""Hit sequence as a SeqRecord object, first fragment""")

    query = singleitem(
        'query',
        doc="""Query sequence as a SeqRecord object, first fragment""")

    aln = singleitem(
        'aln',
        doc="""Alignment of the first fragment as a MultipleSeqAlignment object""")

    aln_annotation = singleitem(
        'aln_annotation',
        doc="""Dictionary of annotation(s) of the first fragment's alignment""")

    hit_features = singleitem('hit_features',
                              doc="""Hit sequence features, first fragment""")

    query_features = singleitem(
        'query_features', doc="""Query sequence features, first fragment""")

    hit_strand = singleitem('hit_strand',
                            doc="""Hit strand orientation, first fragment""")

    query_strand = singleitem(
        'query_strand', doc="""Query strand orientation, first fragment""")

    hit_frame = singleitem(
        'hit_frame', doc="""Hit sequence reading frame, first fragment""")

    query_frame = singleitem(
        'query_frame', doc="""Query sequence reading frame, first fragment""")

    # properties for multi-fragment HSPs
    fragments = allitems(doc="""List of all HSPFragment objects""")

    hit_all = allitems(
        'hit',
        doc="""List of all fragments' hit sequences as SeqRecord objects""")

    query_all = allitems(
        'query',
        doc="""List of all fragments' query sequences as SeqRecord objects""")

    aln_all = allitems(
        'aln',
        doc="""List of all fragments' alignments as MultipleSeqAlignment objects""")

    aln_annotation_all = allitems(
        'aln_annotation',
        doc="""Dictionary of annotation(s) of all fragments' alignments""")

    hit_features_all = allitems('hit_features',
                                doc="""List of all hit sequence features""")

    query_features_all = allitems(
        'query_features', doc="""List of all query sequence features""")

    hit_strand_all = allitems(
        'hit_strand', doc="""List of all fragments' hit sequence strands""")

    query_strand_all = allitems(
        'query_strand',
        doc="""List of all fragments' query sequence strands""")

    hit_frame_all = allitems(
        'hit_frame',
        doc="""List of all fragments' hit sequence reading frames""")

    query_frame_all = allitems(
        'query_frame',
        doc="""List of all fragments' query sequence reading frames""")

    hit_start_all = allitems(
        'hit_start', doc="""List of all fragments' hit start coordinates""")

    query_start_all = allitems(
        'query_start',
        doc="""List of all fragments' query start coordinates""")

    hit_end_all = allitems(
        'hit_end', doc="""List of all fragments' hit end coordinates""")

    query_end_all = allitems(
        'query_end', doc="""List of all fragments' query end coordinates""")

    hit_span_all = allitems('hit_span',
                            doc="""List of all fragments' hit sequence size""")

    query_span_all = allitems(
        'query_span', doc="""List of all fragments' query sequence size""")

    hit_range_all = allitems(
        'hit_range',
        doc="""List of all fragments' hit start and end coordinates""")

    query_range_all = allitems(
        'query_range',
        doc="""List of all fragments' query start and end coordinates""")
class Hit(_BaseSearchObject):
    """Class representing a single database hit of a search result.

    Hit objects are the second-level container in the SearchIO module. They
    are the objects contained within a QueryResult (see QueryResult). They
    themselves are container for HSP objects and will contain at least one
    HSP.

    To have a quick look at a Hit and its contents, invoke ``print`` on it::

        >>> from Bio import SearchIO
        >>> qresult = next(SearchIO.parse('Blast/mirna.xml', 'blast-xml'))
        >>> hit = qresult[3]
        >>> print(hit)
        Query: 33211
         mir_1
         Hit: gi|301171322|ref|NR_035857.1| (86)
         Pan troglodytes microRNA mir-520c (MIR520C), microRNA
         HSPs: ---- -------- --------- ------ --------------- ---------------------
                   #  E-value Bit score   Span     Query range             Hit range
                ---- -------- --------- ------ --------------- ---------------------
                   0  8.9e-20    100.47     60          [1:61]               [13:73]
                   1  3.3e-06     55.39     60          [0:60]               [13:73]

    You can invoke ``len`` on a Hit object to see how many HSP objects it
    contains::

        >>> len(hit)
        2

    Hit objects behave very similar to Python lists. You can retrieve the HSP
    object inside a Hit using the HSP's integer index. Hit objects can also be
    sliced, which will return a new Hit objects containing only the sliced
    HSPs::

        # HSP items inside the Hit can be retrieved using its integer index
        >>> hit[0]
        HSP(hit_id='gi|301171322|ref|NR_035857.1|', query_id='33211', 1 fragments)

        # slicing returns a new Hit
        >>> hit
        Hit(id='gi|301171322|ref|NR_035857.1|', query_id='33211', 2 hsps)
        >>> hit[:1]
        Hit(id='gi|301171322|ref|NR_035857.1|', query_id='33211', 1 hsps)
        >>> print(hit[1:])
        Query: 33211
         mir_1
         Hit: gi|301171322|ref|NR_035857.1| (86)
         Pan troglodytes microRNA mir-520c (MIR520C), microRNA
         HSPs: ---- -------- --------- ------ --------------- ---------------------
                   #  E-value Bit score   Span     Query range             Hit range
                ---- -------- --------- ------ --------------- ---------------------
                   0  3.3e-06     55.39     60          [0:60]               [13:73]

    Hit objects provide ``filter`` and ``map`` methods, which are analogous to
    Python's built-in ``filter`` and ``map`` except that they return a new Hit
    object instead of a list.

    Here is an example of using ``filter`` to select for HSPs whose e-value is
    less than 1e-10::

        >>> evalue_filter = lambda hsp: hsp.evalue < 1e-10
        >>> filtered_hit = hit.filter(evalue_filter)
        >>> len(hit)
        2
        >>> len(filtered_hit)
        1
        >>> print(filtered_hit)
        Query: 33211
         mir_1
         Hit: gi|301171322|ref|NR_035857.1| (86)
         Pan troglodytes microRNA mir-520c (MIR520C), microRNA
         HSPs: ---- -------- --------- ------ --------------- ---------------------
                   #  E-value Bit score   Span     Query range             Hit range
                ---- -------- --------- ------ --------------- ---------------------
                   0  8.9e-20    100.47     60          [1:61]               [13:73]

    There are also other methods which are counterparts of Python lists'
    methods with the same names: ``append``, ``index``, ``pop``, and ``sort``.
    Consult their respective documentations for more details and examples of
    their usage.

    """

    # attributes we don't want to transfer when creating a new Hit class
    # from this one
    _NON_STICKY_ATTRS = ('_items', )

    def __init__(self, hsps=(), id=None, query_id=None):
        """Initialize a Hit object.

        Arguments:
        hsps     -- Iterable containing HSP objects.
        id       -- String of the Hit ID
        query_id -- String of the Hit's query ID

        If multiple HSP objects are used for initialization, they must all
        have the same ``query_id``, ``query_description``, ``hit_id``, and
        ``hit_description`` properties; otherwise ValueError is raised.
        TypeError is raised if any item is not an HSP object.

        """
        # default attribute values
        self._id = id
        self._query_id = query_id
        self._description = None
        self._query_description = None

        # The HSPs are walked once per checked attribute and once more to
        # store them, so materialize them first; a one-shot iterable would
        # otherwise be exhausted by the first check.
        hsps = list(hsps)

        for attr in ('query_id', 'query_description', 'hit_id',
                     'hit_description'):
            # HACK: setting the if clause to '> 1' allows for empty hit objects.
            # This makes it easier to work with file formats with unpredictable
            # hit-hsp ordering. The empty hit object itself is nonfunctional,
            # however, since all its cascading properties are empty.
            if len({getattr(hsp, attr) for hsp in hsps}) > 1:
                raise ValueError("Hit object can not contain HSPs with "
                                 "more than one %s." % attr)

        self._items = []
        for hsp in hsps:
            # append() validates each HSP before storing it
            self.append(hsp)

    def __repr__(self):
        """Return a short, one-line summary of the Hit."""
        return "Hit(id=%r, query_id=%r, %r hsps)" % (self.id, self.query_id,
                                                     len(self))

    def __iter__(self):
        """Iterate over the contained HSP objects."""
        return iter(self.hsps)

    def __len__(self):
        """Return the number of contained HSP objects."""
        return len(self.hsps)

    def __nonzero__(self):
        """Return True if the Hit contains at least one HSP."""
        return bool(self.hsps)

    # Python 3 looks up __bool__ instead of __nonzero__
    __bool__ = __nonzero__

    def __contains__(self, hsp):
        """Return True if the given HSP object is stored in the Hit."""
        return hsp in self._items

    def __str__(self):
        """Return a human-readable, multi-line summary of the Hit."""
        # NOTE(review): the 'Query:' / ' Hit:' / ' HSPs:' labels and the
        # description indent below do not line up in one column; the literal
        # spacing is kept exactly as found -- confirm the intended alignment.
        lines = []

        # set query id line
        qid_line = 'Query: %s' % self.query_id
        if self.query_description:
            qid_line += trim_str('\n %s' % self.query_description, 80, '...')
        lines.append(qid_line)

        # set hit id line
        hid_line = ' Hit: %s' % self.id
        if hasattr(self, 'seq_len'):
            hid_line += ' (%i)' % self.seq_len
        if self.description:
            hid_line += trim_str('\n %s' % self.description, 80, '...')
        lines.append(hid_line)

        # set hsp line and table
        if not self.hsps:
            lines.append(' HSPs: ?')
            return '\n'.join(lines)

        rule = ('-' * 4, '-' * 8, '-' * 9, '-' * 6, '-' * 15, '-' * 21)
        pattern = '%11s %8s %9s %6s %15s %21s'
        lines.append(' HSPs: %s %s %s %s %s %s' % rule)
        lines.append(pattern % ('#', 'E-value', 'Bit score', 'Span',
                                'Query range', 'Hit range'))
        lines.append(pattern % rule)

        for idx, hsp in enumerate(self.hsps):
            # evalue
            evalue = getattr_str(hsp, 'evalue', fmt='%.2g')
            # bitscore
            bitscore = getattr_str(hsp, 'bitscore', fmt='%.2f')
            # alignment length
            aln_span = getattr_str(hsp, 'aln_span')
            # query region, trimmed to the 15-character column width
            query_start = getattr_str(hsp, 'query_start')
            query_end = getattr_str(hsp, 'query_end')
            query_range = '[%s:%s]' % (query_start, query_end)
            query_range = trim_str(query_range, 15, '~]')
            # hit region, trimmed to the 21-character column width
            hit_start = getattr_str(hsp, 'hit_start')
            hit_end = getattr_str(hsp, 'hit_end')
            hit_range = '[%s:%s]' % (hit_start, hit_end)
            hit_range = trim_str(hit_range, 21, '~]')
            # append the hsp row
            lines.append(pattern % (str(idx), evalue, bitscore, aln_span,
                                    query_range, hit_range))

        return '\n'.join(lines)

    def __getitem__(self, idx):
        """Return the HSP at an integer index, or a new Hit for a slice."""
        # if key is slice, return a new Hit instance
        if isinstance(idx, slice):
            obj = self.__class__(self.hsps[idx])
            self._transfer_attrs(obj)
            return obj
        return self._items[idx]

    def __setitem__(self, idx, hsps):
        """Validate and store an HSP, or a list/tuple of HSPs, at ``idx``."""
        # handle case if hsps is a list of hsp
        if isinstance(hsps, (list, tuple)):
            for hsp in hsps:
                self._validate_hsp(hsp)
        else:
            self._validate_hsp(hsps)

        self._items[idx] = hsps

    def __delitem__(self, idx):
        """Delete the HSP(s) at the given index or slice."""
        del self._items[idx]

    ## hsp properties ##
    def _validate_hsp(self, hsp):
        """Validate an HSP object.

        Valid HSP objects have the same hit_id as the Hit object ID and the
        same query_id as the Hit object's query_id. The hit and query
        descriptions are checked in the same way. Any of these values that is
        still None on the Hit is adopted from the HSP instead of compared.

        Raises TypeError if ``hsp`` is not an HSP object and ValueError if one
        of its values disagrees with the Hit's.

        """
        if not isinstance(hsp, HSP):
            raise TypeError("Hit objects can only contain HSP objects.")

        # HACK: to make validation during __init__ work -- nothing is compared
        # or adopted while the Hit does not hold any HSP yet
        if not self._items:
            return

        if self.id is not None:
            if hsp.hit_id != self.id:
                raise ValueError("Expected HSP with hit ID %r, "
                                 "found %r instead." % (self.id, hsp.hit_id))
        else:
            self.id = hsp.hit_id

        if self.description is not None:
            if hsp.hit_description != self.description:
                raise ValueError("Expected HSP with hit description %r, "
                                 "found %r instead."
                                 % (self.description, hsp.hit_description))
        else:
            self.description = hsp.hit_description

        if self.query_id is not None:
            if hsp.query_id != self.query_id:
                raise ValueError("Expected HSP with query ID %r, "
                                 "found %r instead."
                                 % (self.query_id, hsp.query_id))
        else:
            self.query_id = hsp.query_id

        if self.query_description is not None:
            if hsp.query_description != self.query_description:
                raise ValueError("Expected HSP with query description %r, "
                                 "found %r instead."
                                 % (self.query_description,
                                    hsp.query_description))
        else:
            self.query_description = hsp.query_description

    ## properties ##
    description = optionalcascade('_description', 'hit_description',
                                  """Hit description""")
    query_description = optionalcascade(
        '_query_description', 'query_description',
        """Description of the query that produced the hit""")
    id = optionalcascade('_id', 'hit_id', """Hit ID string.""")
    query_id = optionalcascade(
        '_query_id', 'query_id',
        """ID string of the query that produced the hit""")
    # returns all hsps
    hsps = allitems(doc="""HSP objects contained in the Hit""")

    @property
    def fragments(self):
        """HSPFragment objects contained in the Hit"""
        # each stored HSP is iterated to yield its fragments
        return list(chain.from_iterable(self._items))

    ## public methods ##
    def append(self, hsp):
        """Add a HSP object to the end of Hit.

        Arguments:
        hsp -- HSP object to append.

        Any HSP object appended must have the same ``hit_id`` property as the
        Hit object's ``id`` property and the same ``query_id`` property as the
        Hit object's ``query_id`` property.

        """
        self._validate_hsp(hsp)
        self._items.append(hsp)

    def filter(self, func=None):
        """Create a new Hit object whose HSP objects pass the filter function.

        Arguments:
        func -- Callback function that accepts a HSP object as its parameter,
                does a boolean check, and returns True or False. If None,
                the HSPs that evaluate as true are kept, as with the built-in
                ``filter``.

        ``filter`` is analogous to Python's built-in ``filter`` function,
        except that instead of returning a list it returns a ``Hit`` object.
        If no HSP passes the filter, None is returned.

        Here is an example of using ``filter`` to select for HSPs having
        bitscores bigger than 60::

            >>> from Bio import SearchIO
            >>> qresult = next(SearchIO.parse('Blast/mirna.xml', 'blast-xml'))
            >>> hit = qresult[3]
            >>> evalue_filter = lambda hsp: hsp.bitscore > 60
            >>> filtered_hit = hit.filter(evalue_filter)
            >>> len(hit)
            2
            >>> len(filtered_hit)
            1
            >>> print(filtered_hit)
            Query: 33211
             mir_1
             Hit: gi|301171322|ref|NR_035857.1| (86)
             Pan troglodytes microRNA mir-520c (MIR520C), microRNA
             HSPs: ---- -------- --------- ------ --------------- ---------------------
                       #  E-value Bit score   Span     Query range             Hit range
                    ---- -------- --------- ------ --------------- ---------------------
                       0  8.9e-20    100.47     60          [1:61]               [13:73]

        """
        # The built-in filter returns a lazy, always-truthy iterator on
        # Python 3; build a real list so that the emptiness check below works
        # and the constructor can walk the HSPs more than once.
        hsps = list(filter(func, self.hsps))
        if hsps:
            obj = self.__class__(hsps)
            self._transfer_attrs(obj)
            return obj
        return None

    def index(self, hsp):
        """Return the index of a given HSP object, zero-based.

        Arguments:
        hsp -- HSP object to be looked up.

        """
        return self._items.index(hsp)

    def map(self, func=None):
        """Create a new Hit object, mapping the given function to its HSPs.

        Arguments:
        func -- Callback function that accepts a HSP object as its parameter
                and also returns a HSP object. If None, the HSPs are reused
                unchanged.

        ``map`` is analogous to Python's built-in ``map`` function. It is
        applied to all HSPs contained in the Hit object and returns a new Hit
        object. If the Hit holds no HSP, None is returned.

        """
        # self.hsps[:] creates a shallow copy of the HSP list
        if func is not None:
            # a list (not the lazy Python 3 ``map`` object) is needed for the
            # emptiness check and for the constructor's repeated iteration
            hsps = [func(hsp) for hsp in self.hsps[:]]
        else:
            hsps = self.hsps[:]
        if hsps:
            obj = self.__class__(hsps)
            self._transfer_attrs(obj)
            return obj
        return None

    def pop(self, index=-1):
        """Remove and return the HSP object at the specified index.

        Arguments:
        index -- Integer denoting the index of the HSP object to remove.

        """
        return self._items.pop(index)

    def sort(self, key=None, reverse=False, in_place=True):
        """Sort the HSP objects.

        Arguments:
        key -- Function used to sort the HSP objects.
        reverse -- Boolean, whether to reverse the sorting or not.
        in_place -- Boolean, whether to perform sorting in place (in the same
                    object) or not (creating a new object).

        ``sort`` defaults to sorting in-place, to mimic Python's ``list.sort``
        method, and returns None in that case. If you set the ``in_place``
        argument to False, it will return a new, sorted Hit object and keep
        the initial one unsorted.

        """
        if in_place:
            self._items.sort(key=key, reverse=reverse)
            return None

        hsps = self.hsps[:]
        hsps.sort(key=key, reverse=reverse)
        obj = self.__class__(hsps)
        self._transfer_attrs(obj)
        return obj