Ejemplo n.º 1
0
def index(filename, format=None, key_function=None, **kwargs):
    """Build a lazy, dictionary-like index over a search output file.

    Arguments:

     - filename     - String with the name of the file to index. Anything
                      that is not a string (e.g. a file handle) is rejected
                      with a TypeError.
     - format       - Lower case string naming one of the supported formats.
     - key_function - Optional callback which, given a QueryResult ID string,
                      returns the unique key to use in the dictionary.
     - kwargs       - Format-specific keyword arguments; they are passed on
                      unchanged to the indexer class chosen for the format.

    The returned pseudo-dictionary maps string identifiers to QueryResult
    objects. It is intended for large search output files, where it gives
    much faster access to an individual QueryResult than parse or read.

    Only the start location of every query in the file is kept in memory.
    When a particular query is requested, the index jumps to its start
    position, parses that whole query, and returns it as a QueryResult
    object:

    >>> from SAP.Bio import SearchIO
    >>> search_idx = SearchIO.index('Blast/wnts.xml', 'blast-xml')
    >>> search_idx
    SearchIO.index('Blast/wnts.xml', 'blast-xml', key_function=None)
    >>> sorted(search_idx)
    ['gi|156630997:105-1160', 'gi|195230749:301-1383', ..., 'gi|53729353:216-1313']
    >>> search_idx['gi|195230749:301-1383']
    QueryResult(id='gi|195230749:301-1383', 5 hits)
    >>> search_idx.close()

    BGZF compressed files are detected automatically. Ordinary GZIP files
    are not supported:

    >>> from SAP.Bio import SearchIO
    >>> search_idx = SearchIO.index('Blast/wnts.xml.bgz', 'blast-xml')
    >>> search_idx
    SearchIO.index('Blast/wnts.xml.bgz', 'blast-xml', key_function=None)
    >>> search_idx['gi|195230749:301-1383']
    QueryResult(id='gi|195230749:301-1383', 5 hits)
    >>> search_idx.close()

    A custom callback can be given to replace the default identifier string.
    It receives the QueryResult ID string and should return the modified
    version to be used as the key.

    >>> from SAP.Bio import SearchIO
    >>> key_func = lambda id: id.split('|')[1]
    >>> search_idx = SearchIO.index('Blast/wnts.xml', 'blast-xml', key_func)
    >>> search_idx
    SearchIO.index('Blast/wnts.xml', 'blast-xml', key_function=<function <lambda> at ...>)
    >>> sorted(search_idx)
    ['156630997:105-1160', ..., '371502086:108-1205', '53729353:216-1313']
    >>> search_idx['156630997:105-1160']
    QueryResult(id='gi|156630997:105-1160', 5 hits)
    >>> search_idx.close()

    The callback never alters the ID stored on the QueryResult itself; it
    only changes the key under which that QueryResult is retrieved.

    """
    if not isinstance(filename, basestring):
        raise TypeError("Need a filename (not a handle)")

    from SAP.Bio.File import _IndexedSeqFileDict
    # Pick the random-access indexer class registered for this format.
    indexer_class = get_processor(format, _INDEXER_MAP)
    # Text used as the repr() of the returned dictionary-like object.
    description = "SearchIO.index(%r, %r, key_function=%r)" % (
        filename, format, key_function)
    random_access_proxy = indexer_class(filename, **kwargs)
    return _IndexedSeqFileDict(random_access_proxy, key_function,
                               description, "QueryResult")
Ejemplo n.º 2
0
def index(filename, format, alphabet=None, key_function=None):
    """Indexes a sequence file and returns a dictionary like object.

     - filename - string giving name of file to be indexed
     - format   - lower case string describing the file format
     - alphabet - optional Alphabet object, useful when the sequence type
                  cannot be automatically inferred from the file itself
                  (e.g. format="fasta" or "tab")
     - key_function - Optional callback function which when given a
                  SeqRecord identifier string should return a unique
                  key for the dictionary.

    This indexing function will return a dictionary like object, giving the
    SeqRecord objects as values:

    >>> from SAP.Bio import SeqIO
    >>> records = SeqIO.index("Quality/example.fastq", "fastq")
    >>> len(records)
    3
    >>> sorted(records)
    ['EAS54_6_R1_2_1_413_324', 'EAS54_6_R1_2_1_443_348', 'EAS54_6_R1_2_1_540_792']
    >>> print(records["EAS54_6_R1_2_1_540_792"].format("fasta"))
    >EAS54_6_R1_2_1_540_792
    TTGGCAGGCCAAGGCCGATGGATCA
    <BLANKLINE>
    >>> "EAS54_6_R1_2_1_540_792" in records
    True
    >>> print(records.get("Missing", None))
    None
    >>> records.close()

    If the file is BGZF compressed, this is detected automatically. Ordinary
    GZIP files are not supported:

    >>> from SAP.Bio import SeqIO
    >>> records = SeqIO.index("Quality/example.fastq.bgz", "fastq")
    >>> len(records)
    3
    >>> print(records["EAS54_6_R1_2_1_540_792"].seq)
    TTGGCAGGCCAAGGCCGATGGATCA
    >>> records.close()

    Note that this pseudo dictionary will not support all the methods of a
    true Python dictionary, for example values() is not defined since this
    would require loading all of the records into memory at once.

    When you call the index function, it will scan through the file, noting
    the location of each record. When you access a particular record via the
    dictionary methods, the code will jump to the appropriate part of the
    file and then parse that section into a SeqRecord.

    Note that not all the input formats supported by Bio.SeqIO can be used
    with this index function. It is designed to work only with sequential
    file formats (e.g. "fasta", "gb", "fastq") and is not suitable for any
    interlaced file format (e.g. alignment formats such as "clustal").

    For small files, it may be more efficient to use an in memory Python
    dictionary, e.g.

    >>> from SAP.Bio import SeqIO
    >>> records = SeqIO.to_dict(SeqIO.parse("Quality/example.fastq", "fastq"))
    >>> len(records)
    3
    >>> sorted(records)
    ['EAS54_6_R1_2_1_413_324', 'EAS54_6_R1_2_1_443_348', 'EAS54_6_R1_2_1_540_792']
    >>> print(records["EAS54_6_R1_2_1_540_792"].format("fasta"))
    >EAS54_6_R1_2_1_540_792
    TTGGCAGGCCAAGGCCGATGGATCA
    <BLANKLINE>

    As with the to_dict() function, by default the id string of each record
    is used as the key. You can specify a callback function to transform
    this (the record identifier string) into your preferred key. For example:

    >>> from SAP.Bio import SeqIO
    >>> def make_tuple(identifier):
    ...     parts = identifier.split("_")
    ...     return int(parts[-2]), int(parts[-1])
    >>> records = SeqIO.index("Quality/example.fastq", "fastq",
    ...                       key_function=make_tuple)
    >>> len(records)
    3
    >>> sorted(records)
    [(413, 324), (443, 348), (540, 792)]
    >>> print(records[(540, 792)].format("fasta"))
    >EAS54_6_R1_2_1_540_792
    TTGGCAGGCCAAGGCCGATGGATCA
    <BLANKLINE>
    >>> (540, 792) in records
    True
    >>> "EAS54_6_R1_2_1_540_792" in records
    False
    >>> print(records.get("Missing", None))
    None
    >>> records.close()

    Another common use case would be indexing an NCBI style FASTA file,
    where you might want to extract the GI number from the FASTA identifier
    to use as the dictionary key.

    Notice that unlike the to_dict() function, here the key_function does
    not get given the full SeqRecord to use to generate the key. Doing so
    would impose a severe performance penalty as it would require the file
    to be completely parsed while building the index. Right now this is
    usually avoided.

    A TypeError is raised if filename or format is not a string. A
    ValueError is raised if format is empty, is not lower case, or is not
    a format supported for indexing, or if alphabet is given but is neither
    an Alphabet nor an AlphabetEncoder instance.

    See also: Bio.SeqIO.index_db() and Bio.SeqIO.to_dict()
    """
    # Try and give helpful error messages:
    if not isinstance(filename, basestring):
        raise TypeError("Need a filename (not a handle)")
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)
    if alphabet is not None and not isinstance(alphabet,
                                               (Alphabet, AlphabetEncoder)):
        # NOTE: this relies on the built-in repr(). Do not name any local
        # variable in this function "repr": the assignment would make the
        # name local to the whole function, and this call would then fail
        # with UnboundLocalError instead of raising the ValueError.
        raise ValueError("Invalid alphabet, %s" % repr(alphabet))

    # Map the file format to a random access proxy class:
    from ._index import _FormatToRandomAccess  # Lazy import
    from SAP.Bio.File import _IndexedSeqFileDict
    try:
        proxy_class = _FormatToRandomAccess[format]
    except KeyError:
        raise ValueError("Unsupported format %r" % format)
    # Text used as the repr() of the returned dictionary-like object.
    description = "SeqIO.index(%r, %r, alphabet=%r, key_function=%r)" \
        % (filename, format, alphabet, key_function)
    return _IndexedSeqFileDict(proxy_class(filename, format, alphabet),
                               key_function, description, "SeqRecord")
Ejemplo n.º 3
0
def index(filename, format, alphabet=None, key_function=None):
    """Indexes a sequence file and returns a dictionary like object.

     - filename - string giving name of file to be indexed
     - format   - lower case string describing the file format
     - alphabet - optional Alphabet object, useful when the sequence type
                  cannot be automatically inferred from the file itself
                  (e.g. format="fasta" or "tab")
     - key_function - Optional callback function which when given a
                  SeqRecord identifier string should return a unique
                  key for the dictionary.

    This indexing function will return a dictionary like object, giving the
    SeqRecord objects as values:

    >>> from SAP.Bio import SeqIO
    >>> records = SeqIO.index("Quality/example.fastq", "fastq")
    >>> len(records)
    3
    >>> sorted(records)
    ['EAS54_6_R1_2_1_413_324', 'EAS54_6_R1_2_1_443_348', 'EAS54_6_R1_2_1_540_792']
    >>> print(records["EAS54_6_R1_2_1_540_792"].format("fasta"))
    >EAS54_6_R1_2_1_540_792
    TTGGCAGGCCAAGGCCGATGGATCA
    <BLANKLINE>
    >>> "EAS54_6_R1_2_1_540_792" in records
    True
    >>> print(records.get("Missing", None))
    None
    >>> records.close()

    If the file is BGZF compressed, this is detected automatically. Ordinary
    GZIP files are not supported:

    >>> from SAP.Bio import SeqIO
    >>> records = SeqIO.index("Quality/example.fastq.bgz", "fastq")
    >>> len(records)
    3
    >>> print(records["EAS54_6_R1_2_1_540_792"].seq)
    TTGGCAGGCCAAGGCCGATGGATCA
    >>> records.close()

    Note that this pseudo dictionary will not support all the methods of a
    true Python dictionary, for example values() is not defined since this
    would require loading all of the records into memory at once.

    When you call the index function, it will scan through the file, noting
    the location of each record. When you access a particular record via the
    dictionary methods, the code will jump to the appropriate part of the
    file and then parse that section into a SeqRecord.

    Note that not all the input formats supported by Bio.SeqIO can be used
    with this index function. It is designed to work only with sequential
    file formats (e.g. "fasta", "gb", "fastq") and is not suitable for any
    interlaced file format (e.g. alignment formats such as "clustal").

    For small files, it may be more efficient to use an in memory Python
    dictionary, e.g.

    >>> from SAP.Bio import SeqIO
    >>> records = SeqIO.to_dict(SeqIO.parse("Quality/example.fastq", "fastq"))
    >>> len(records)
    3
    >>> sorted(records)
    ['EAS54_6_R1_2_1_413_324', 'EAS54_6_R1_2_1_443_348', 'EAS54_6_R1_2_1_540_792']
    >>> print(records["EAS54_6_R1_2_1_540_792"].format("fasta"))
    >EAS54_6_R1_2_1_540_792
    TTGGCAGGCCAAGGCCGATGGATCA
    <BLANKLINE>

    As with the to_dict() function, by default the id string of each record
    is used as the key. You can specify a callback function to transform
    this (the record identifier string) into your preferred key. For example:

    >>> from SAP.Bio import SeqIO
    >>> def make_tuple(identifier):
    ...     parts = identifier.split("_")
    ...     return int(parts[-2]), int(parts[-1])
    >>> records = SeqIO.index("Quality/example.fastq", "fastq",
    ...                       key_function=make_tuple)
    >>> len(records)
    3
    >>> sorted(records)
    [(413, 324), (443, 348), (540, 792)]
    >>> print(records[(540, 792)].format("fasta"))
    >EAS54_6_R1_2_1_540_792
    TTGGCAGGCCAAGGCCGATGGATCA
    <BLANKLINE>
    >>> (540, 792) in records
    True
    >>> "EAS54_6_R1_2_1_540_792" in records
    False
    >>> print(records.get("Missing", None))
    None
    >>> records.close()

    Another common use case would be indexing an NCBI style FASTA file,
    where you might want to extract the GI number from the FASTA identifier
    to use as the dictionary key.

    Notice that unlike the to_dict() function, here the key_function does
    not get given the full SeqRecord to use to generate the key. Doing so
    would impose a severe performance penalty as it would require the file
    to be completely parsed while building the index. Right now this is
    usually avoided.

    A TypeError is raised if filename or format is not a string. A
    ValueError is raised if format is empty, is not lower case, or is not
    a format supported for indexing, or if alphabet is given but is neither
    an Alphabet nor an AlphabetEncoder instance.

    See also: Bio.SeqIO.index_db() and Bio.SeqIO.to_dict()
    """
    # Try and give helpful error messages:
    if not isinstance(filename, basestring):
        raise TypeError("Need a filename (not a handle)")
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)
    if alphabet is not None and not isinstance(alphabet,
                                               (Alphabet, AlphabetEncoder)):
        # NOTE: this relies on the built-in repr(). Do not name any local
        # variable in this function "repr": the assignment would make the
        # name local to the whole function, and this call would then fail
        # with UnboundLocalError instead of raising the ValueError.
        raise ValueError("Invalid alphabet, %s" % repr(alphabet))

    # Map the file format to a random access proxy class:
    from ._index import _FormatToRandomAccess  # Lazy import
    from SAP.Bio.File import _IndexedSeqFileDict
    try:
        proxy_class = _FormatToRandomAccess[format]
    except KeyError:
        raise ValueError("Unsupported format %r" % format)
    # Text used as the repr() of the returned dictionary-like object.
    index_repr = "SeqIO.index(%r, %r, alphabet=%r, key_function=%r)" \
        % (filename, format, alphabet, key_function)
    return _IndexedSeqFileDict(proxy_class(filename, format, alphabet),
                               key_function, index_repr, "SeqRecord")