Esempio n. 1
0
def parse(handle, format=None, **kwargs):
    """Iterate over a search output file, yielding QueryResult objects.

     - handle - Handle to the file, or the filename as a string.
     - format - Lower case string denoting one of the supported formats.
     - kwargs - Format-specific keyword arguments.

    This is a generator function: nothing is looked up or opened until the
    first item is requested, and the file stays open (inside the `as_handle`
    context) for as long as the generator is being consumed.

    Use it to loop over each query in a given search output file:

    >>> from anarci.Bio import SearchIO
    >>> qresults = SearchIO.parse('Blast/mirna.xml', 'blast-xml')
    >>> qresults
    <generator object ...>
    >>> for qresult in qresults:
    ...     print("Search %s has %i hits" % (qresult.id, len(qresult)))
    ...
    Search 33211 has 100 hits
    Search 33212 has 44 hits
    Search 33213 has 95 hits

    Depending on the file format, `parse` may also accept additional keyword
    argument(s) that modify the behavior of the format parser. Here is a
    simple example, where the keyword argument enables parsing of a commented
    BLAST tabular output file:

    >>> from anarci.Bio import SearchIO
    >>> for qresult in SearchIO.parse('Blast/mirna.tab', 'blast-tab', comments=True):
    ...     print("Search %s has %i hits" % (qresult.id, len(qresult)))
    ...
    Search 33211 has 100 hits
    Search 33212 has 44 hits
    Search 33213 has 95 hits

    """
    on_py3 = sys.version_info[0] > 2

    # get the iterator object and do error checking
    iterator = get_processor(format, _ITERATOR_MAP)

    # The 'U' (universal newlines) flag only has an effect on Python 2. On
    # Python 3 text mode already translates newlines, the flag has been
    # deprecated since 3.4 and open() rejects it from 3.11 onwards, so it is
    # only requested where it is actually needed.
    mode = 'r' if on_py3 else 'rU'

    # HACK: force BLAST XML decoding to use utf-8
    handle_kwargs = {}
    if format == 'blast-xml' and on_py3:
        handle_kwargs['encoding'] = 'utf-8'

    # and start iterating; the handle is kept open while results are yielded
    with as_handle(handle, mode, **handle_kwargs) as source_file:
        for qresult in iterator(source_file, **kwargs):
            yield qresult
Esempio n. 2
0
def write(qresults, handle, format=None, **kwargs):
    """Write QueryResult object(s) to a file in the requested format.

     - qresults - An iterator returning QueryResult objects or a single
                  QueryResult object.
     - handle   - Handle to the file, or the filename as a string.
     - format   - Lower case string denoting one of the supported formats.
     - kwargs   - Format-specific keyword arguments.

    Either a single QueryResult or an iterable of QueryResult objects may be
    supplied. In both cases the return value is a tuple of four numbers: how
    many QueryResult, Hit, HSP, and HSPFragment objects were written to the
    output file::

        from anarci.Bio import SearchIO
        qresults = SearchIO.parse('Blast/mirna.xml', 'blast-xml')
        SearchIO.write(qresults, 'results.tab', 'blast-tab')
        <stdout> (3, 239, 277, 277)

    Format-specific keyword arguments are handed to the writer and can be
    used to adjust the output. For instance, this writes a BLAT PSL output
    file with a header::

        from anarci.Bio import SearchIO
        qresults = SearchIO.parse('Blat/psl_34_001.psl', 'blat-psl')
        SearchIO.write(qresults, 'results.tab', 'blat-psl', header=True)
        <stdout> (2, 13, 22, 26)

    """
    # a lone QueryResult is wrapped in a list so the writer always receives
    # an iterator, whatever the caller passed in
    is_single = isinstance(qresults, QueryResult)
    qresult_iter = iter([qresults] if is_single else qresults)

    # resolve the writer for this format (error checking happens here),
    # before the output handle is opened
    writer_class = get_processor(format, _WRITER_MAP)

    with as_handle(handle, 'w') as target_file:
        counts = writer_class(target_file, **kwargs).write_file(qresult_iter)
        # the writer reports exactly four tallies:
        # qresults, hits, hsps and hsp fragments
        n_qresults, n_hits, n_hsps, n_frags = counts

    return n_qresults, n_hits, n_hsps, n_frags
Esempio n. 3
0
 def proxy_factory(format, filename=None):
     """Build an indexer proxy for a file, or just check that a format is known.

      - format   - Format name, looked up in _INDEXER_MAP.
      - filename - Optional name of the file to build the proxy for.

     Without a (truthy) filename, returns a boolean telling whether the
     format has an entry in _INDEXER_MAP. With a filename, returns the
     object obtained by calling the processor that get_processor resolves
     for the format with the filename and `kwargs`.

     NOTE(review): `kwargs` is not a parameter of this function; presumably
     it is captured from the enclosing scope, which is not visible here --
     confirm against the surrounding function.
     """
     if not filename:
         # no file to open: only report whether the format is supported
         return format in _INDEXER_MAP
     indexer_class = get_processor(format, _INDEXER_MAP)
     return indexer_class(filename, **kwargs)
Esempio n. 4
0
def index(filename, format=None, key_function=None, **kwargs):
    """Index a search output file and return a dictionary-like object.

     - filename     - string giving name of file to be indexed
     - format       - Lower case string denoting one of the supported formats.
     - key_function - Optional callback function which when given a
                      QueryResult should return a unique key for the dictionary.
     - kwargs       - Format-specific keyword arguments.

    Raises TypeError when `filename` is not a string (a file handle is not
    accepted).

    The returned pseudo-dictionary has string identifiers as keys and
    QueryResult objects as values. It is mainly useful for large search
    output files, since it gives access to any given QueryResult object much
    faster than using parse or read.

    Index works by storing in memory the start locations of all queries in a
    file. When a query is requested, it jumps to the start position of that
    query, parses the whole query, and returns it as a QueryResult object:

    >>> from anarci.Bio import SearchIO
    >>> search_idx = SearchIO.index('Blast/wnts.xml', 'blast-xml')
    >>> search_idx
    SearchIO.index('Blast/wnts.xml', 'blast-xml', key_function=None)
    >>> sorted(search_idx)
    ['gi|156630997:105-1160', 'gi|195230749:301-1383', ..., 'gi|53729353:216-1313']
    >>> search_idx['gi|195230749:301-1383']
    QueryResult(id='gi|195230749:301-1383', 5 hits)
    >>> search_idx.close()

    If the file is BGZF compressed, this is detected automatically. Ordinary
    GZIP files are not supported:

    >>> from anarci.Bio import SearchIO
    >>> search_idx = SearchIO.index('Blast/wnts.xml.bgz', 'blast-xml')
    >>> search_idx
    SearchIO.index('Blast/wnts.xml.bgz', 'blast-xml', key_function=None)
    >>> search_idx['gi|195230749:301-1383']
    QueryResult(id='gi|195230749:301-1383', 5 hits)
    >>> search_idx.close()

    A custom callback function can be supplied to alter the default
    identifier string. It should accept the QueryResult ID string as its
    input and return a modified version of it.

    >>> from anarci.Bio import SearchIO
    >>> key_func = lambda id: id.split('|')[1]
    >>> search_idx = SearchIO.index('Blast/wnts.xml', 'blast-xml', key_func)
    >>> search_idx
    SearchIO.index('Blast/wnts.xml', 'blast-xml', key_function=<function <lambda> at ...>)
    >>> sorted(search_idx)
    ['156630997:105-1160', ..., '371502086:108-1205', '53729353:216-1313']
    >>> search_idx['156630997:105-1160']
    QueryResult(id='gi|156630997:105-1160', 5 hits)
    >>> search_idx.close()

    Note that the callback function does not change the QueryResult's ID value.
    It only changes the key value used to retrieve the associated QueryResult.

    """
    # only a file name can be indexed, so anything that is not a string
    # is rejected up front
    is_path = isinstance(filename, basestring)
    if not is_path:
        raise TypeError("Need a filename (not a handle)")

    from anarci.Bio.File import _IndexedSeqFileDict

    # resolve the indexer for this format (error checking happens here)
    indexer_class = get_processor(format, _INDEXER_MAP)

    # text handed to the dictionary-like object as its representation; it
    # mirrors the call that created the index (see the examples above)
    description = "SearchIO.index(%r, %r, key_function=%r)" % (
        filename, format, key_function)

    proxy = indexer_class(filename, **kwargs)
    return _IndexedSeqFileDict(proxy, key_function, description,
                               "QueryResult")