Example #1
0
    def __init__(self, filename, format, key_function=None, **kwargs):
        """Initializes _IndexedSearch instance.

        filename -- The source filename as string.
        format -- Lower case string denoting one of the supported formats.
        key_function -- Optional callback function which when given a Result
                        should return a unique key for the dictionary.

        """
        self._filename = filename
        self._format = format
        self._key_function = key_function

        # Resolve the format-specific indexer class and scan the file,
        # producing (key, offset, length) triples.
        indexer_class = get_processor(format, _INDEXER_MAP)
        indexed_obj = indexer_class(filename, **kwargs)
        self._indexer = indexed_obj

        # default key function is lambda rec: rec.id; a user-supplied
        # key_function remaps each raw key produced by the indexer
        if key_function:
            offset_iter = ((key_function(key), offset, length)
                           for (key, offset, length) in indexed_obj)
        else:
            offset_iter = indexed_obj

        # Build the in-memory key -> offset map; only the offset is kept.
        # Duplicate keys are rejected so lookups stay unambiguous.
        index = {}
        for key, offset, length in offset_iter:
            if key in index:
                # close the underlying handle before bailing out so the
                # open file is not leaked
                self._indexer._handle.close()
                raise ValueError("Duplicate key %r" % key)
            index[key] = offset

        self._index = index
Example #2
0
    def __init__(self, filename, format, key_function=None, **kwargs):
        """Initializes _IndexedSearch instance.

        filename -- The source filename as string.
        format -- Lower case string denoting one of the supported formats.
        key_function -- Optional callback function which when given a Result
                        should return a unique key for the dictionary.

        """
        self._filename = filename
        self._format = format
        self._key_function = key_function

        # look up the indexer class registered for this format and have it
        # scan the file into (key, offset, length) triples
        indexer = get_processor(format, _INDEXER_MAP)(filename, **kwargs)
        self._indexer = indexer

        # a user-supplied key_function remaps each raw key; by default the
        # indexer's own keys (record ids) pass through unchanged
        if key_function:
            triples = ((key_function(k), o, l) for (k, o, l) in indexer)
        else:
            triples = indexer

        # build the in-memory key -> offset mapping, refusing duplicates
        mapping = {}
        for key, offset, _length in triples:
            if key in mapping:
                self._indexer._handle.close()
                raise ValueError("Duplicate key %r" % key)
            else:
                mapping[key] = offset

        self._index = mapping
Example #3
0
def parse(handle, format=None, **kwargs):
    """Turns a search output file into a generator that yields QueryResult
    objects.

    Arguments:
    handle -- Handle to the file, or the filename as a string.
    format -- Lower case string denoting one of the supported formats.
    kwargs -- Format-specific keyword arguments.

    This function is used to iterate over each query in a given search output
    file:

    >>> from Bio import SearchIO
    >>> qresults = SearchIO.parse('Blast/mirna.xml', 'blast-xml')
    >>> qresults
    <generator object ...>
    >>> for qresult in qresults:
    ...     print "Search %s has %i hits" % (qresult.id, len(qresult))
    ...
    Search 33211 has 100 hits
    Search 33212 has 44 hits
    Search 33213 has 95 hits

    Depending on the file format, `parse` may also accept additional keyword
    argument(s) that modifies the behavior of the format parser. Here is a
    simple example, where the keyword argument enables parsing of a commented
    BLAST tabular output file:

    >>> from Bio import SearchIO
    >>> for qresult in SearchIO.parse('Blast/mirna.tab', 'blast-tab', comments=True):
    ...     print "Search %s has %i hits" % (qresult.id, len(qresult))
    ...
    Search 33211 has 100 hits
    Search 33212 has 44 hits
    Search 33213 has 95 hits

    """
    # resolve the format-specific iterator class; this also validates
    # that the requested format is supported
    iterator = get_processor(format, _ITERATOR_MAP)

    # HACK: force BLAST XML decoding to use utf-8 (Python 3 only)
    handle_kwargs = {}
    if sys.version_info[0] > 2 and format == 'blast-xml':
        handle_kwargs['encoding'] = 'utf-8'

    # open (or pass through) the handle and lazily yield each query result
    with as_handle(handle, 'rU', **handle_kwargs) as in_file:
        for qresult in iterator(in_file, **kwargs):
            yield qresult
Example #4
0
def parse(handle, format=None, **kwargs):
    """Turns a search output file into a generator that yields QueryResult
    objects.

    Arguments:
    handle -- Handle to the file, or the filename as a string.
    format -- Lower case string denoting one of the supported formats.
    kwargs -- Format-specific keyword arguments.

    This function is used to iterate over each query in a given search output
    file:

    >>> from Bio import SearchIO
    >>> qresults = SearchIO.parse('Blast/mirna.xml', 'blast-xml')
    >>> qresults
    <generator object ...>
    >>> for qresult in qresults:
    ...     print "Search %s has %i hits" % (qresult.id, len(qresult))
    ...
    Search 33211 has 100 hits
    Search 33212 has 44 hits
    Search 33213 has 95 hits

    Depending on the file format, `parse` may also accept additional keyword
    argument(s) that modifies the behavior of the format parser. Here is a
    simple example, where the keyword argument enables parsing of a commented
    BLAST tabular output file:

    >>> from Bio import SearchIO
    >>> for qresult in SearchIO.parse('Blast/mirna.tab', 'blast-tab', comments=True):
    ...     print "Search %s has %i hits" % (qresult.id, len(qresult))
    ...
    Search 33211 has 100 hits
    Search 33212 has 44 hits
    Search 33213 has 95 hits

    """
    # fetch the iterator class for this format (raises on unknown formats)
    iterator_class = get_processor(format, _ITERATOR_MAP)

    # HACK: force BLAST XML decoding to use utf-8
    open_kwargs = {}
    on_python3 = sys.version_info[0] > 2
    if format == 'blast-xml' and on_python3:
        open_kwargs['encoding'] = 'utf-8'

    # stream query results one at a time from the (possibly opened) handle
    with as_handle(handle, 'rU', **open_kwargs) as src:
        results = iterator_class(src, **kwargs)
        for result in results:
            yield result
Example #5
0
def write(qresults, handle, format=None, **kwargs):
    """Writes QueryResult objects to a file in the given format.

    Arguments:
    qresults -- An iterator returning QueryResult objects or a single
                QueryResult object.
    handle -- Handle to the file, or the filename as a string.
    format -- Lower case string denoting one of the supported formats.
    kwargs -- Format-specific keyword arguments.

    The `write` function writes QueryResult object(s) into the given output
    handle / filename. You can supply it with a single QueryResult object or an
    iterable returning one or more QueryResult objects. In both cases, the
    function will return a tuple of four values: the number of QueryResult, Hit,
    HSP, and HSPFragment objects it writes to the output file.

    from Bio import SearchIO
    qresults = SearchIO.parse('Blast/mirna.xml', 'blast-xml')
    SearchIO.write(qresults, 'results.tab', 'blast-tab')
    <stdout> (3, 239, 277, 277)

    The output of different formats may be adjusted using the format-specific
    keyword arguments. Here is an example that writes BLAT PSL output file with
    a header:

    from Bio import SearchIO
    qresults = SearchIO.parse('Blat/psl_34_001.psl', 'blat-psl')
    SearchIO.write(qresults, 'results.tab', 'blat-psl', header=True)
    <stdout> (2, 13, 22, 26)

    """
    # normalize the input so we always hand the writer an iterator
    if isinstance(qresults, QueryResult):
        qresults = iter([qresults])
    else:
        qresults = iter(qresults)

    # resolve the format-specific writer class (validates the format too)
    writer_class = get_processor(format, _WRITER_MAP)

    # write everything out, collecting the per-level object counts
    with as_handle(handle, 'w') as out_file:
        writer = writer_class(out_file, **kwargs)
        counts = writer.write_file(qresults)
        qresult_count, hit_count, hsp_count, frag_count = counts

    return qresult_count, hit_count, hsp_count, frag_count
Example #6
0
def write(qresults, handle, format=None, **kwargs):
    """Writes QueryResult objects to a file in the given format.

    Arguments:
    qresults -- An iterator returning QueryResult objects or a single
                QueryResult object.
    handle -- Handle to the file, or the filename as a string.
    format -- Lower case string denoting one of the supported formats.
    kwargs -- Format-specific keyword arguments.

    The `write` function writes QueryResult object(s) into the given output
    handle / filename. You can supply it with a single QueryResult object or an
    iterable returning one or more QueryResult objects. In both cases, the
    function will return a tuple of four values: the number of QueryResult, Hit,
    HSP, and HSPFragment objects it writes to the output file.

    from Bio import SearchIO
    qresults = SearchIO.parse('Blast/mirna.xml', 'blast-xml')
    SearchIO.write(qresults, 'results.tab', 'blast-tab')
    <stdout> (3, 239, 277, 277)

    The output of different formats may be adjusted using the format-specific
    keyword arguments. Here is an example that writes BLAT PSL output file with
    a header:

    from Bio import SearchIO
    qresults = SearchIO.parse('Blat/psl_34_001.psl', 'blat-psl')
    SearchIO.write(qresults, 'results.tab', 'blat-psl', header=True)
    <stdout> (2, 13, 22, 26)

    """
    # a bare QueryResult is wrapped so the writer always sees an iterator
    if isinstance(qresults, QueryResult):
        wrapped = iter([qresults])
    else:
        wrapped = iter(qresults)

    # get the writer object and do error checking
    writer_class = get_processor(format, _WRITER_MAP)

    # write to the handle, unpacking the qresult/hit/hsp/fragment counts
    with as_handle(handle, 'w') as target:
        writer = writer_class(target, **kwargs)
        q_count, h_count, s_count, f_count = writer.write_file(wrapped)

    return q_count, h_count, s_count, f_count
Example #7
0
 def proxy_factory(format, filename=None):
     """Given a filename returns proxy object, else boolean if format OK."""
     # No filename given: act purely as a capability check for the format.
     if not filename:
         return format in _INDEXER_MAP
     # NOTE: ``kwargs`` is a closure variable from the enclosing scope.
     return get_processor(format, _INDEXER_MAP)(filename, **kwargs)
Example #8
0
def index(filename, format=None, key_function=None, **kwargs):
    """Indexes a search output file and returns a dictionary-like object.

    Arguments:
    filename -- string giving name of file to be indexed
    format -- Lower case string denoting one of the supported formats.
    key_function -- Optional callback function which when given a
                    QueryResult should return a unique key for the dictionary.
    kwargs -- Format-specific keyword arguments.

    Index returns a pseudo-dictionary object with QueryResult objects as its
    values and a string identifier as its keys. The function is mainly useful
    for dealing with large search output files, as it enables access to any
    given QueryResult object much faster than using parse or read.

    Index works by storing in-memory the start locations of all queries in a
    file. When a user requested access to the query, this function will jump
    to its start position, parse the whole query, and return it as a
    QueryResult object:

    >>> from Bio import SearchIO
    >>> search_idx = SearchIO.index('Blast/wnts.xml', 'blast-xml')
    >>> search_idx
    SearchIO.index('Blast/wnts.xml', 'blast-xml', key_function=None)
    >>> sorted(search_idx.keys())
    ['gi|156630997:105-1160', 'gi|195230749:301-1383', ..., 'gi|53729353:216-1313']
    >>> search_idx['gi|195230749:301-1383']
    QueryResult(id='gi|195230749:301-1383', 5 hits)

    If the file is BGZF compressed, this is detected automatically. Ordinary
    GZIP files are not supported:

    >>> from Bio import SearchIO
    >>> search_idx = SearchIO.index('Blast/wnts.xml.bgz', 'blast-xml')
    >>> search_idx
    SearchIO.index('Blast/wnts.xml.bgz', 'blast-xml', key_function=None)
    >>> search_idx['gi|195230749:301-1383']
    QueryResult(id='gi|195230749:301-1383', 5 hits)

    You can supply a custom callback function to alter the default identifier
    string. This function should accept as its input the QueryResult ID string
    and return a modified version of it.

    >>> from Bio import SearchIO
    >>> key_func = lambda id: id.split('|')[1]
    >>> search_idx = SearchIO.index('Blast/wnts.xml', 'blast-xml', key_func)
    >>> search_idx
    SearchIO.index('Blast/wnts.xml', 'blast-xml', key_function=<function <lambda> at ...>)
    >>> sorted(search_idx.keys())
    ['156630997:105-1160', ..., '371502086:108-1205', '53729353:216-1313']
    >>> search_idx['156630997:105-1160']
    QueryResult(id='gi|156630997:105-1160', 5 hits)

    Note that the callback function does not change the QueryResult's ID value.
    It only changes the key value used to retrieve the associated QueryResult.

    """
    if not isinstance(filename, basestring):
        raise TypeError("Need a filename (not a handle)")

    # deferred import to avoid a circular dependency with Bio.File
    from Bio.File import _IndexedSeqFileDict
    proxy_class = get_processor(format, _INDEXER_MAP)
    # named index_repr instead of ``repr`` so the builtin is not shadowed
    index_repr = ("SearchIO.index(%r, %r, key_function=%r)"
                  % (filename, format, key_function))
    return _IndexedSeqFileDict(proxy_class(filename, **kwargs),
                               key_function, index_repr, "QueryResult")
Example #9
0
 def proxy_factory(format, filename=None):
     """Given a filename returns proxy object, else boolean if format OK."""
     # ``kwargs`` is taken from the enclosing scope (closure variable).
     if filename:
         proxy_class = get_processor(format, _INDEXER_MAP)
         return proxy_class(filename, **kwargs)
     return format in _INDEXER_MAP
Example #10
0
def index(filename, format=None, key_function=None, **kwargs):
    """Indexes a search output file and returns a dictionary-like object.

     - filename     - string giving name of file to be indexed
     - format       - Lower case string denoting one of the supported formats.
     - key_function - Optional callback function which when given a
                      QueryResult should return a unique key for the dictionary.
     - kwargs       - Format-specific keyword arguments.

    Index returns a pseudo-dictionary object with QueryResult objects as its
    values and a string identifier as its keys. The function is mainly useful
    for dealing with large search output files, as it enables access to any
    given QueryResult object much faster than using parse or read.

    Index works by storing in-memory the start locations of all queries in a
    file. When a user requested access to the query, this function will jump
    to its start position, parse the whole query, and return it as a
    QueryResult object:

    >>> from Bio import SearchIO
    >>> search_idx = SearchIO.index('Blast/wnts.xml', 'blast-xml')
    >>> search_idx
    SearchIO.index('Blast/wnts.xml', 'blast-xml', key_function=None)
    >>> sorted(search_idx)
    ['gi|156630997:105-1160', 'gi|195230749:301-1383', ..., 'gi|53729353:216-1313']
    >>> search_idx['gi|195230749:301-1383']
    QueryResult(id='gi|195230749:301-1383', 5 hits)
    >>> search_idx.close()

    If the file is BGZF compressed, this is detected automatically. Ordinary
    GZIP files are not supported:

    >>> from Bio import SearchIO
    >>> search_idx = SearchIO.index('Blast/wnts.xml.bgz', 'blast-xml')
    >>> search_idx
    SearchIO.index('Blast/wnts.xml.bgz', 'blast-xml', key_function=None)
    >>> search_idx['gi|195230749:301-1383']
    QueryResult(id='gi|195230749:301-1383', 5 hits)
    >>> search_idx.close()

    You can supply a custom callback function to alter the default identifier
    string. This function should accept as its input the QueryResult ID string
    and return a modified version of it.

    >>> from Bio import SearchIO
    >>> key_func = lambda id: id.split('|')[1]
    >>> search_idx = SearchIO.index('Blast/wnts.xml', 'blast-xml', key_func)
    >>> search_idx
    SearchIO.index('Blast/wnts.xml', 'blast-xml', key_function=<function <lambda> at ...>)
    >>> sorted(search_idx)
    ['156630997:105-1160', ..., '371502086:108-1205', '53729353:216-1313']
    >>> search_idx['156630997:105-1160']
    QueryResult(id='gi|156630997:105-1160', 5 hits)
    >>> search_idx.close()

    Note that the callback function does not change the QueryResult's ID value.
    It only changes the key value used to retrieve the associated QueryResult.

    """
    if not isinstance(filename, str):
        raise TypeError("Need a filename (not a handle)")

    # deferred import to avoid a circular dependency with Bio.File
    from Bio.File import _IndexedSeqFileDict

    proxy_class = get_processor(format, _INDEXER_MAP)
    # named index_repr instead of ``repr`` so the builtin is not shadowed
    index_repr = "SearchIO.index(%r, %r, key_function=%r)" % (filename, format,
                                                              key_function)
    return _IndexedSeqFileDict(proxy_class(filename, **kwargs), key_function,
                               index_repr, "QueryResult")
Example #11
0
    def __init__(self, index_filename, filenames, format, key_function,
            max_open=10, overwrite=False, **kwargs):
        """Initializes a _DbIndexedSearch instance.

        index_filename -- The SQLite filename.
        filenames -- List of strings specifying file(s) to be indexed, or when
                     indexing a single file this can be given as a string.
                     (optional if reloading an existing index, but must match)
        format -- Lower case string denoting one of the supported formats.
                  (optional if reloading an existing index, but must match)
        key_function -- Optional callback function which when given a
                        Result identifier string should return a unique
                        key for the dictionary.
        max_open -- Integer of maximum open file objects allowed.
        overwrite -- Boolean, whether to overwrite existing index database
                     (if it exists) or not.
        kwargs -- Format-specific keyword arguments forwarded to the indexer.

        """
        # sqlite3 is required for the database-backed index; bail out early
        # when it is unavailable.
        if not _sqlite:
            # Hack for Jython (or if Python is compiled without sqlite3)
            from Bio import MissingPythonDependencyError
            raise MissingPythonDependencyError("Requires sqlite3, which is "
                                               "included Python 2.5+")
        # NOTE(review): indexer_proxies is unused in the visible portion of
        # this method -- presumably populated further down; confirm.
        indexer_proxies = {}
        indexer_class = get_processor(format, _INDEXER_MAP)
        self._indexer_class = indexer_class
        self._kwargs = kwargs

        # remove index_filename if overwrite is True and the file exists
        if overwrite and os.path.isfile(index_filename):
            os.remove(index_filename)

        # An existing file at this point means we are reloading a previously
        # built index database; validate it against the given arguments.
        if os.path.isfile(index_filename):
            con = _sqlite.connect(index_filename)
            self._con = con
            try:
                # get the # of result offsets stored in the database;
                # -1 is treated as the marker of an unfinished build
                count, = con.execute("SELECT value FROM meta_data WHERE "
                        "key=?;", ('count',)).fetchone()
                self._length = int(count)
                if self._length == -1:
                    con.close()
                    raise ValueError("Unfinished/partial database")

                # count the # of result offsets stored in the database
                # for cross-checking
                count, = con.execute("SELECT COUNT(key) FROM "
                        "offset_data;").fetchone()
                if self._length != int(count):
                    con.close()
                    raise ValueError("Corrupt database? %i entries not %i" %
                            (int(count), self._length))

                # check if the database format is the same as the given format
                self._format, = con.execute("SELECT value FROM meta_data "
                        "WHERE key=?;", ('format',)).fetchone()
                if format and format != self._format:
                    con.close()
                    raise ValueError("Incorrect format specified: '%s', "
                            "expected: '%s'" % (format, self._format))

                # check filenames # and names
                self._filenames = [row[0] for row in \
                        con.execute("SELECT name FROM file_data ORDER BY "
                            "file_number;").fetchall()]
                if filenames and len(filenames) != len(self._filenames):
                    con.close()
                    raise ValueError("Index file says %i files, not %i" %
                            (len(self._filenames), len(filenames)))
                if filenames and filenames != self._filenames:
                    con.close()
                    raise ValueError("Index file has different filenames")

            # Python 2 except syntax; any sqlite operational error here means
            # the file is not one of our index databases.
            except _OperationalError, err:
                con.close()
                raise ValueError("Not a Biopython index database? %s" % err)
Example #12
0
    def __init__(self,
                 index_filename,
                 filenames,
                 format,
                 key_function,
                 max_open=10,
                 overwrite=False,
                 **kwargs):
        """Initializes a _DbIndexedSearch instance.

        index_filename -- The SQLite filename.
        filenames -- List of strings specifying file(s) to be indexed, or when
                     indexing a single file this can be given as a string.
                     (optional if reloading an existing index, but must match)
        format -- Lower case string denoting one of the supported formats.
                  (optional if reloading an existing index, but must match)
        key_function -- Optional callback function which when given a
                        Result identifier string should return a unique
                        key for the dictionary.
        max_open -- Integer of maximum open file objects allowed.
        overwrite -- Boolean, whether to overwrite existing index database
                     (if it exists) or not.
        kwargs -- Format-specific keyword arguments forwarded to the indexer.

        """
        # sqlite3 is required for the database-backed index; bail out early
        # when it is unavailable.
        if not _sqlite:
            # Hack for Jython (or if Python is compiled without sqlite3)
            from Bio import MissingPythonDependencyError
            raise MissingPythonDependencyError("Requires sqlite3, which is "
                                               "included Python 2.5+")
        # NOTE(review): indexer_proxies is unused in the visible portion of
        # this method -- presumably populated further down; confirm.
        indexer_proxies = {}
        indexer_class = get_processor(format, _INDEXER_MAP)
        self._indexer_class = indexer_class
        self._kwargs = kwargs

        # remove index_filename if overwrite is True and the file exists
        if overwrite and os.path.isfile(index_filename):
            os.remove(index_filename)

        # An existing file at this point means we are reloading a previously
        # built index database; validate it against the given arguments.
        if os.path.isfile(index_filename):
            con = _sqlite.connect(index_filename)
            self._con = con
            try:
                # get the # of result offsets stored in the database;
                # -1 is treated as the marker of an unfinished build
                count, = con.execute(
                    "SELECT value FROM meta_data WHERE "
                    "key=?;", ('count', )).fetchone()
                self._length = int(count)
                if self._length == -1:
                    con.close()
                    raise ValueError("Unfinished/partial database")

                # count the # of result offsets stored in the database
                # for cross-checking
                count, = con.execute("SELECT COUNT(key) FROM "
                                     "offset_data;").fetchone()
                if self._length != int(count):
                    con.close()
                    raise ValueError("Corrupt database? %i entries not %i" %
                                     (int(count), self._length))

                # check if the database format is the same as the given format
                self._format, = con.execute(
                    "SELECT value FROM meta_data "
                    "WHERE key=?;", ('format', )).fetchone()
                if format and format != self._format:
                    con.close()
                    raise ValueError("Incorrect format specified: '%s', "
                                     "expected: '%s'" % (format, self._format))

                # check filenames # and names
                self._filenames = [row[0] for row in \
                        con.execute("SELECT name FROM file_data ORDER BY "
                            "file_number;").fetchall()]
                if filenames and len(filenames) != len(self._filenames):
                    con.close()
                    raise ValueError("Index file says %i files, not %i" %
                                     (len(self._filenames), len(filenames)))
                if filenames and filenames != self._filenames:
                    con.close()
                    raise ValueError("Index file has different filenames")

            # Python 2 except syntax; any sqlite operational error here means
            # the file is not one of our index databases.
            except _OperationalError, err:
                con.close()
                raise ValueError("Not a Biopython index database? %s" % err)