class Autocompleter:
    """Prefix-based word completion backed by a character trie.

    Every word of the supplied corpus is stored as a key of a ``CharTrie``;
    the suggestions for a prefix are the keys the trie reports for it.
    """

    def __init__(self, words):
        """Build the lookup trie from the iterable ``words``."""
        index = CharTrie()
        for entry in words:
            # Only the keys matter; the stored value is a placeholder.
            index[entry] = True
        self.trie = index

    def suggest(self, prefix):
        """Return the corpus words that begin with ``prefix``.

        An empty list is returned when the trie lookup raises ``KeyError``.
        """
        try:
            matches = self.trie.keys(prefix=prefix)
        except KeyError:
            return []
        return matches
def tile_stats(orfs, tiles):
    """Compute summary statistics describing how a tile set covers its ORFs.

    Tiles are attributed to an ORF by name: every tile whose name starts with
    the ORF's name is counted towards that ORF, so ORF names are assumed to
    be prefixes of the names of the tiles derived from them.

    Args:
        orfs: dict mapping ORF name -> sequence.
        tiles: dict mapping tile name -> sequence.

    Returns:
        dict with the following keys:
            tile_size: median tile length, rounded to an int.
            num_tiles: number of tiles.
            total_tile_residues: summed length of all tiles.
            avg_orf_coverage: mean over ORFs of the per-ORF coverage
                (summed length of the ORF's tiles / length of the ORF).
            num_orfs_smaller_than_tile_size: count of ORFs shorter than
                ``tile_size``.
            approx_num_tiles_naive_1x_tiling: sum over ORFs of
                ceil(ORF length / ``tile_size``).
            max_tiles_per_len_normed_orf: largest per-ORF coverage.
            tile_len_hist: result of ``compute_int_hist`` on tile lengths.
            orf_coverage_hist: result of ``compute_float_hist`` on the
                per-ORF coverages.
            top_5_orf_cov / bot_5_orf_cov: up to five ``[name, coverage]``
                pairs with the highest / lowest coverage.

    Raises:
        ValueError: if an ORF name is a prefix of a different ORF name.
        ZeroDivisionError: if ``orfs`` is empty or an ORF sequence is empty.
    """
    import numpy as np

    tile_lens = np.asarray([len(t) for t in tiles.values()])
    orf_lens = np.asarray([len(o) for o in orfs.values()])

    # Convert the median to a builtin float before rounding: round() on a
    # NumPy float scalar may already return a builtin int, which has no
    # ``.tolist()`` method, so the conversion must not rely on it.
    tile_size = int(round(float(np.median(tile_lens))))
    total_tile_residues = tile_lens.sum().tolist()

    # Index ORF names so that prefix collisions can be detected.
    orf_prefixes = CharTrie()
    for name in orfs:
        orf_prefixes[name] = True

    # Ensure that no ORF name is a prefix for a different valid ORF; if one
    # were, tiles could not be attributed to a single ORF by name.
    for name in orfs:
        matches = orf_prefixes.keys(name)
        if len(matches) != 1:
            print(matches)
            raise ValueError(
                "some ORF name is a prefix for a different valid ORF")

    tile_prefixes = CharTrie()
    for name in tiles:
        tile_prefixes[name] = True

    # Per-ORF coverage: total residues of the ORF's tiles / ORF residues.
    orf_coverages = {}
    for orf, seq in orfs.items():
        # Guard the lookup: ORFs without any tile contribute zero coverage.
        if tile_prefixes.has_subtrie(orf) or orf in tile_prefixes:
            tile_residues = float(
                sum(len(tiles[tile]) for tile in tile_prefixes.keys(orf)))
        else:
            tile_residues = 0.0
        orf_coverages[orf] = tile_residues / len(seq)

    coverage_values = list(orf_coverages.values())
    by_coverage = lambda tup: tup[1]

    # Key order is kept as before; "avg_orf_coverage" is now assigned once,
    # with the value that previously overwrote the earlier assignment.
    stats = {}
    stats["tile_size"] = tile_size
    stats["num_tiles"] = len(tiles)
    stats["total_tile_residues"] = total_tile_residues
    stats["avg_orf_coverage"] = sum(coverage_values) / len(orf_coverages)
    stats["num_orfs_smaller_than_tile_size"] = (
        orf_lens < tile_size).sum().tolist()
    stats["approx_num_tiles_naive_1x_tiling"] = (
        np.ceil(orf_lens / tile_size).sum().tolist())
    stats["max_tiles_per_len_normed_orf"] = max(coverage_values)
    stats["tile_len_hist"] = compute_int_hist(tile_lens)
    # Tile coverage of each ORF (total tile residues / ORF residues); tiles
    # are assigned to ORFs when they share a name prefix.
    stats["orf_coverage_hist"] = compute_float_hist(coverage_values)
    # Two independent sorts on purpose: a stable descending sort orders ties
    # differently from a reversed ascending sort.
    stats["top_5_orf_cov"] = [
        list(pair)
        for pair in sorted(
            orf_coverages.items(), key=by_coverage, reverse=True)[:5]
    ]
    stats["bot_5_orf_cov"] = [
        list(pair)
        for pair in sorted(orf_coverages.items(), key=by_coverage)[:5]
    ]
    return stats