class Autocompleter:
    """Prefix-based autocompletion over a fixed corpus of words.

    Every word of the corpus is stored as a key in a character trie;
    suggestions for a prefix are the trie keys that start with it.
    """

    def __init__(self, words):
        """Build the trie from an iterable of words."""
        corpus = CharTrie()
        for word in words:
            # Only the keys matter; the value is a placeholder.
            corpus[word] = True
        self.trie = corpus

    def suggest(self, prefix):
        """Return every corpus word that starts with ``prefix``.

        The trie signals an unknown prefix with ``KeyError``; that case is
        reported to the caller as an empty list.
        """
        try:
            matches = self.trie.keys(prefix=prefix)
        except KeyError:
            matches = []
        return matches
# Example #2
# 0
def tile_stats(orfs, tiles):
    """Compute summary statistics for a set of tiles over a set of ORFs.

    Args:
        orfs: dict mapping ORF name -> ORF sequence.
        tiles: dict mapping tile name -> tile sequence.

    Returns:
        dict with the keys ``tile_size`` (rounded median tile length),
        ``num_tiles``, ``total_tile_residues``, ``avg_orf_coverage`` (mean of
        the per-ORF coverages), ``num_orfs_smaller_than_tile_size``,
        ``approx_num_tiles_naive_1x_tiling``,
        ``max_tiles_per_len_normed_orf`` (largest per-ORF coverage),
        ``tile_len_hist`` / ``orf_coverage_hist`` (whatever
        ``compute_int_hist`` / ``compute_float_hist`` return) and
        ``top_5_orf_cov`` / ``bot_5_orf_cov`` (lists of ``[name, coverage]``).

    Raises:
        ValueError: if an ORF name is a prefix of another ORF name, since
            tiles could then not be attributed to a single ORF.
        ZeroDivisionError: if ``orfs`` is empty or an ORF sequence is empty.

    NOTE: for prefix trie stats (e.g., coverage per ORF), it is assumed the
    ORF name is a prefix of the name of every tile taken from that ORF.
    """
    import numpy as np

    tile_lens = np.asarray([len(seq) for seq in tiles.values()])
    orf_lens = np.asarray([len(seq) for seq in orfs.values()])
    # Convert to a builtin float before rounding: depending on the NumPy
    # version, round() on a NumPy scalar yields either a NumPy scalar or a
    # plain int, and the latter has no ``.tolist()``.
    tile_size = int(round(float(np.median(tile_lens))))

    orf_prefixes = CharTrie()
    for name in orfs:
        orf_prefixes[name] = True
    # Ensure that no ORF name is a prefix of a different ORF name: looking
    # up a name as a prefix must yield exactly that one name.
    for name in orfs:
        clashing = orf_prefixes.keys(name)
        if len(clashing) != 1:
            raise ValueError(
                "some ORF name is a prefix for a different valid ORF: "
                "{}".format(clashing))

    tile_prefixes = CharTrie()
    for name in tiles:
        tile_prefixes[name] = True

    # Per-ORF coverage = total residues of the ORF's tiles / ORF residues.
    # Tiles belong to an ORF when the ORF name is a prefix of the tile name.
    orf_coverages = {}
    for orf, seq in orfs.items():
        tile_residues = 0.0
        # Guard the lookup: ORFs without any tile keep a coverage of 0.
        if tile_prefixes.has_subtrie(orf) or orf in tile_prefixes:
            for tile in tile_prefixes.keys(orf):
                tile_residues += len(tiles[tile])
        orf_coverages[orf] = tile_residues / len(seq)

    stats = {
        "tile_size": tile_size,
        "num_tiles": len(tiles),
        "total_tile_residues": tile_lens.sum().tolist(),
        # Unweighted mean of the per-ORF coverages (not total tile residues
        # over total ORF residues).
        "avg_orf_coverage": sum(orf_coverages.values()) / len(orf_coverages),
        "num_orfs_smaller_than_tile_size": (
            (orf_lens < tile_size).sum().tolist()),
        "approx_num_tiles_naive_1x_tiling": (
            np.ceil(orf_lens / tile_size).sum().tolist()),
        "max_tiles_per_len_normed_orf": max(orf_coverages.values()),
        "tile_len_hist": compute_int_hist(tile_lens),
        "orf_coverage_hist": compute_float_hist(
            list(orf_coverages.values())),
        "top_5_orf_cov": [
            list(pair)
            for pair in sorted(
                orf_coverages.items(), key=lambda tup: tup[1],
                reverse=True)[:5]
        ],
        "bot_5_orf_cov": [
            list(pair)
            for pair in sorted(
                orf_coverages.items(), key=lambda tup: tup[1])[:5]
        ],
    }

    return stats