def index_db(index_filename, filenames=None, format=None, key_function=None, **kwargs): """Indexes several search output files into an SQLite database. Arguments: index_filename -- The SQLite filename. filenames -- List of strings specifying file(s) to be indexed, or when indexing a single file this can be given as a string. (optional if reloading an existing index, but must match) format -- Lower case string denoting one of the supported formats. (optional if reloading an existing index, but must match) key_function -- Optional callback function which when given a QueryResult identifier string should return a unique key for the dictionary. kwargs -- Format-specific keyword arguments. The `index_db` function is similar to `index` in that it indexes the start position of all queries from search output files. The main difference is instead of storing these indices in-memory, they are written to disk as an SQLite database file. This allows the indices to persist between Python sessions. This enables access to any queries in the file without any indexing overhead, provided it has been indexed at least once. >>> from Bio import SearchIO >>> db_idx = SearchIO.index_db(':memory:', 'Blast/mirna.xml', 'blast-xml') >>> sorted(db_idx.keys()) ['33211', '33212', '33213'] >>> db_idx['33212'] QueryResult(id='33212', 44 hits) `index_db` can also index multiple files and store them in the same database, making it easier to group multiple search files and access them from a single interface. >>> from Bio import SearchIO >>> files = ['Blast/mirna.xml', 'Blast/wnts.xml'] >>> db_idx = SearchIO.index_db(':memory:', files, 'blast-xml') >>> sorted(db_idx.keys()) ['33211', '33212', '33213', 'gi|156630997:105-1160', ..., 'gi|53729353:216-1313'] >>> db_idx['33212'] QueryResult(id='33212', 44 hits) One common example where this is helpful is if you had a large set of query sequences (say ten thousand) which you split into ten query files of one thousand sequences each in order to run as ten separate BLAST jobs on a cluster. You could use `index_db` to index the ten BLAST output files together for seamless access to all the results as one dictionary. Note that ':memory:' rather than an index filename tells SQLite to hold the index database in memory. This is useful for quick tests, but using the Bio.SearchIO.index(...) function instead would use less memory. BGZF compressed files are supported, and detected automatically. Ordinary GZIP compressed files are not supported. """ # cast filenames to list if it's a string # (can we check if it's a string or a generator?) if isinstance(filenames, basestring): filenames = [filenames] from Bio.File import _SQLiteManySeqFilesDict repr = "SearchIO.index_db(%r, filenames=%r, format=%r, key_function=%r, ...)" \ % (index_filename, filenames, format, key_function) def proxy_factory(format, filename=None): """Given a filename returns proxy object, else boolean if format OK.""" if filename: return get_processor(format, _INDEXER_MAP)(filename, **kwargs) else: return format in _INDEXER_MAP return _SQLiteManySeqFilesDict(index_filename, filenames, proxy_factory, format, key_function, repr)
def index_db(index_filename, filenames=None, format=None, alphabet=None, key_function=None): """Index several sequence files and return a dictionary like object. The index is stored in an SQLite database rather than in memory (as in the Bio.SeqIO.index(...) function). - index_filename - Where to store the SQLite index - filenames - list of strings specifying file(s) to be indexed, or when indexing a single file this can be given as a string. (optional if reloading an existing index, but must match) - format - lower case string describing the file format (optional if reloading an existing index, but must match) - alphabet - optional Alphabet object, useful when the sequence type cannot be automatically inferred from the file itself (e.g. format="fasta" or "tab") - key_function - Optional callback function which when given a SeqRecord identifier string should return a unique key for the dictionary. This indexing function will return a dictionary like object, giving the SeqRecord objects as values: >>> from Bio.Alphabet import generic_protein >>> from Bio import SeqIO >>> files = ["GenBank/NC_000932.faa", "GenBank/NC_005816.faa"] >>> def get_gi(name): ... parts = name.split("|") ... i = parts.index("gi") ... assert i != -1 ... return parts[i+1] >>> idx_name = ":memory:" #use an in memory SQLite DB for this test >>> records = SeqIO.index_db(idx_name, files, "fasta", generic_protein, get_gi) >>> len(records) 95 >>> records["7525076"].description 'gi|7525076|ref|NP_051101.1| Ycf2 [Arabidopsis thaliana]' >>> records["45478717"].description 'gi|45478717|ref|NP_995572.1| pesticin [Yersinia pestis biovar Microtus str. 91001]' >>> records.close() In this example the two files contain 85 and 10 records respectively. BGZF compressed files are supported, and detected automatically. Ordinary GZIP compressed files are not supported. See also: Bio.SeqIO.index() and Bio.SeqIO.to_dict(), and the Python module glob which is useful for building lists of files. """ # Try and give helpful error messages: if not isinstance(index_filename, basestring): raise TypeError("Need a string for the index filename") if isinstance(filenames, basestring): # Make the API a little more friendly, and more similar # to Bio.SeqIO.index(...) for indexing just one file. filenames = [filenames] if filenames is not None and not isinstance(filenames, list): raise TypeError( "Need a list of filenames (as strings), or one filename") if format is not None and not isinstance(format, basestring): raise TypeError("Need a string for the file format (lower case)") if format and format != format.lower(): raise ValueError("Format string '%s' should be lower case" % format) if alphabet is not None and not (isinstance(alphabet, Alphabet) or isinstance(alphabet, AlphabetEncoder)): raise ValueError("Invalid alphabet, %r" % alphabet) # Map the file format to a sequence iterator: from ._index import _FormatToRandomAccess # Lazy import from Bio.File import _SQLiteManySeqFilesDict repr = "SeqIO.index_db(%r, filenames=%r, format=%r, alphabet=%r, key_function=%r)" \ % (index_filename, filenames, format, alphabet, key_function) def proxy_factory(format, filename=None): """Given a filename returns proxy object, else boolean if format OK.""" if filename: return _FormatToRandomAccess[format](filename, format, alphabet) else: return format in _FormatToRandomAccess return _SQLiteManySeqFilesDict(index_filename, filenames, proxy_factory, format, key_function, repr)
def index_db(index_filename, filenames=None, format=None, alphabet=None, key_function=None): """Index several sequence files and return a dictionary like object. The index is stored in an SQLite database rather than in memory (as in the Bio.SeqIO.index(...) function). - index_filename - Where to store the SQLite index - filenames - list of strings specifying file(s) to be indexed, or when indexing a single file this can be given as a string. (optional if reloading an existing index, but must match) - format - lower case string describing the file format (optional if reloading an existing index, but must match) - alphabet - optional Alphabet object, useful when the sequence type cannot be automatically inferred from the file itself (e.g. format="fasta" or "tab") - key_function - Optional callback function which when given a SeqRecord identifier string should return a unique key for the dictionary. This indexing function will return a dictionary like object, giving the SeqRecord objects as values: >>> from Bio.Alphabet import generic_protein >>> from Bio import SeqIO >>> files = ["GenBank/NC_000932.faa", "GenBank/NC_005816.faa"] >>> def get_gi(name): ... parts = name.split("|") ... i = parts.index("gi") ... assert i != -1 ... return parts[i+1] >>> idx_name = ":memory:" #use an in memory SQLite DB for this test >>> records = SeqIO.index_db(idx_name, files, "fasta", generic_protein, get_gi) >>> len(records) 95 >>> records["7525076"].description 'gi|7525076|ref|NP_051101.1| Ycf2 [Arabidopsis thaliana]' >>> records["45478717"].description 'gi|45478717|ref|NP_995572.1| pesticin [Yersinia pestis biovar Microtus str. 91001]' >>> records.close() In this example the two files contain 85 and 10 records respectively. BGZF compressed files are supported, and detected automatically. Ordinary GZIP compressed files are not supported. See Also: Bio.SeqIO.index() and Bio.SeqIO.to_dict(), and the Python module glob which is useful for building lists of files. """ # Try and give helpful error messages: if not isinstance(index_filename, basestring): raise TypeError("Need a string for the index filename") if isinstance(filenames, basestring): # Make the API a little more friendly, and more similar # to Bio.SeqIO.index(...) for indexing just one file. filenames = [filenames] if filenames is not None and not isinstance(filenames, list): raise TypeError( "Need a list of filenames (as strings), or one filename") if format is not None and not isinstance(format, basestring): raise TypeError("Need a string for the file format (lower case)") if format and format != format.lower(): raise ValueError("Format string '%s' should be lower case" % format) if alphabet is not None and not (isinstance(alphabet, Alphabet) or isinstance(alphabet, AlphabetEncoder)): raise ValueError("Invalid alphabet, %r" % alphabet) # Map the file format to a sequence iterator: from ._index import _FormatToRandomAccess # Lazy import from Bio.File import _SQLiteManySeqFilesDict repr = "SeqIO.index_db(%r, filenames=%r, format=%r, alphabet=%r, key_function=%r)" \ % (index_filename, filenames, format, alphabet, key_function) def proxy_factory(format, filename=None): """Given a filename returns proxy object, else boolean if format OK.""" if filename: return _FormatToRandomAccess[format](filename, format, alphabet) else: return format in _FormatToRandomAccess return _SQLiteManySeqFilesDict(index_filename, filenames, proxy_factory, format, key_function, repr)
def index_db(index_filename, filenames=None, format=None, key_function=None, **kwargs): """Indexes several search output files into an SQLite database. - index_filename - The SQLite filename. - filenames - List of strings specifying file(s) to be indexed, or when indexing a single file this can be given as a string. (optional if reloading an existing index, but must match) - format - Lower case string denoting one of the supported formats. (optional if reloading an existing index, but must match) - key_function - Optional callback function which when given a QueryResult identifier string should return a unique key for the dictionary. - kwargs - Format-specific keyword arguments. The ``index_db`` function is similar to ``index`` in that it indexes the start position of all queries from search output files. The main difference is instead of storing these indices in-memory, they are written to disk as an SQLite database file. This allows the indices to persist between Python sessions. This enables access to any queries in the file without any indexing overhead, provided it has been indexed at least once. >>> from Bio import SearchIO >>> idx_filename = ":memory:" # Use a real filename, this is in RAM only! >>> db_idx = SearchIO.index_db(idx_filename, 'Blast/mirna.xml', 'blast-xml') >>> sorted(db_idx) ['33211', '33212', '33213'] >>> db_idx['33212'] QueryResult(id='33212', 44 hits) >>> db_idx.close() ``index_db`` can also index multiple files and store them in the same database, making it easier to group multiple search files and access them from a single interface. >>> from Bio import SearchIO >>> idx_filename = ":memory:" # Use a real filename, this is in RAM only! >>> files = ['Blast/mirna.xml', 'Blast/wnts.xml'] >>> db_idx = SearchIO.index_db(idx_filename, files, 'blast-xml') >>> sorted(db_idx) ['33211', '33212', '33213', 'gi|156630997:105-1160', ..., 'gi|53729353:216-1313'] >>> db_idx['33212'] QueryResult(id='33212', 44 hits) >>> db_idx.close() One common example where this is helpful is if you had a large set of query sequences (say ten thousand) which you split into ten query files of one thousand sequences each in order to run as ten separate BLAST jobs on a cluster. You could use ``index_db`` to index the ten BLAST output files together for seamless access to all the results as one dictionary. Note that ':memory:' rather than an index filename tells SQLite to hold the index database in memory. This is useful for quick tests, but using the Bio.SearchIO.index(...) function instead would use less memory. BGZF compressed files are supported, and detected automatically. Ordinary GZIP compressed files are not supported. See also Bio.SearchIO.index(), Bio.SearchIO.to_dict(), and the Python module glob which is useful for building lists of files. """ # cast filenames to list if it's a string # (can we check if it's a string or a generator?) if isinstance(filenames, str): filenames = [filenames] from Bio.File import _SQLiteManySeqFilesDict repr = "SearchIO.index_db(%r, filenames=%r, format=%r, key_function=%r, ...)" % ( index_filename, filenames, format, key_function, ) def proxy_factory(format, filename=None): """Given a filename returns proxy object, else boolean if format OK.""" if filename: return get_processor(format, _INDEXER_MAP)(filename, **kwargs) else: return format in _INDEXER_MAP return _SQLiteManySeqFilesDict(index_filename, filenames, proxy_factory, format, key_function, repr)