# Imports this section relies on: Length and the BTree modules ship with
# the BTrees package; widcode is the wid-list codec from zope.index.text.
from BTrees import Length
from BTrees import OOBTree
from BTrees.IOBTree import IOBTree
from zope.index.text import widcode


def reset(self):
    # wid -> {docid -> weight}; t -> D -> w(D, t)
    # Different indexers have different notions of term weight, but we
    # expect each indexer to use ._wordinfo to map wids to its notion
    # of a docid-to-weight map.
    # There are two kinds of OOV words: wid 0 is explicitly OOV,
    # and it's possible that the lexicon will return a non-zero wid
    # for a word we don't currently know about.  For example, if we
    # unindex the last doc containing a particular word, that wid
    # remains in the lexicon, but is no longer in our _wordinfo map;
    # lexicons can also be shared across indices, and some other index
    # may introduce a lexicon word we've never seen.
    # A word is in-vocabulary for this index if and only if
    # wid in _wordinfo.  Note that wid 0 must not be a key.
    # This does not use the BTree family since wids are always "I"
    # flavor trees.
    self._wordinfo = IOBTree()

    # docid -> weight
    # Different indexers have different notions of doc weight, but we
    # expect each indexer to use ._docweight to map docids to its
    # notion of what a doc weight is.
    self._docweight = self.family.IF.BTree()

    # docid -> WidCode'd list of wids
    # Used for un-indexing, and for phrase search.
    self._docwords = self.family.IO.BTree()

    # Use a BTree length for efficient length computation w/o conflicts
    self.word_count = Length.Length()
    self.indexed_count = Length.Length()
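
# A minimal sketch (not part of the index itself) of how the three
# mappings set up in reset() relate.  `idx` stands for any index
# instance; the helper name is hypothetical and only for illustration.
def _describe(idx, wid, docid):  # hypothetical helper
    in_vocab = wid in idx._wordinfo           # the in-vocabulary test above
    weight = idx._wordinfo[wid].get(docid) if in_vocab else None
    doc_weight = idx._docweight.get(docid)
    wids = widcode.decode(idx._docwords[docid])  # per-doc wid list
    return in_vocab, weight, doc_weight, wids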
def init_simulation_db_structure(db_conn):
    """
    Init database structure for storing simulations.

    :param db_conn: DB connection
    :return: None
    """
    db_conn.root.simulations = OOBTree.BTree()
    # To store summary statistics of simulations.
    db_conn.root.n_simulations = Length.Length()
    db_conn.root.n_simulation_batches = Length.Length()
    # To store least common multiple of sizes of simulation batches.
    db_conn.root.simulation_batch_sizes_lcm = Length.Length(1)
def init_attractor_db_structure(db_conn):
    """
    Init database structure for storing aggregated attractors.

    :param db_conn: database connection
    :return: None
    """
    db_conn.root.aggregated_attractors = OOBTree.BTree()
    db_conn.root.aggregated_attractor_keys_by_batch_index = OOBTree.BTree()
    db_conn.root.sorted_aggregated_attractors = OOBTree.BTree()
    # To store summary statistics of attractors.
    db_conn.root.n_aggregated_attractors = Length.Length()
    db_conn.root.total_frequency = Length.Length()
    db_conn.root.n_aggregated_attractor_batches = Length.Length()
    # To store least common multiple of sizes of aggregated attractor
    # batches.
    db_conn.root.aggregated_attractor_batch_sizes_lcm = Length.Length(1)
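
# A minimal sketch (assumed usage, not from the source) of how a new
# batch size would be folded into the running LCM that the two
# *_batch_sizes_lcm Length objects above hold.  Length is callable
# (returning its value) and supports set(); the helper is hypothetical.
import math

def update_batch_sizes_lcm(lcm_length, batch_size):  # hypothetical helper
    current = lcm_length()
    lcm_length.set(current * batch_size // math.gcd(current, batch_size))

# e.g. update_batch_sizes_lcm(db_conn.root.simulation_batch_sizes_lcm, 64)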
def _del_wordinfo(self, wid, docid):
    doc2score = self._wordinfo[wid]
    del doc2score[docid]
    if doc2score:
        self._wordinfo[wid] = doc2score  # not redundant: Persistency!
    else:
        del self._wordinfo[wid]
        try:
            self.word_count.change(-1)
        except AttributeError:
            # upgrade word_count to Length object
            self.word_count = Length.Length(len(self._wordinfo))
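
# Why the write-back above is "not redundant": doc2score may be a plain
# dict, and ZODB cannot see mutations of non-persistent objects.  The
# sketch below (with a hypothetical persistent `wordinfo` BTree) shows
# the pattern; without the final assignment the deletion could be lost.
#
#     d = wordinfo[wid]    # plain dict living inside a BTree bucket
#     del d[docid]         # invisible to ZODB by itself
#     wordinfo[wid] = d    # re-store so the bucket is marked changed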
def unindex_doc(self, docid):
    if docid not in self._docwords:
        return
    for wid in self.family.IF.TreeSet(self.get_words(docid)).keys():
        self._del_wordinfo(wid, docid)
    del self._docwords[docid]
    del self._docweight[docid]
    try:
        self.indexed_count.change(-1)
    except AttributeError:
        # upgrade indexed_count to Length object
        self.indexed_count = Length.Length(len(self._docweight))
def index_doc(self, docid, text):
    if docid in self._docwords:
        return self.reindex_doc(docid, text)
    wids = self._lexicon.sourceToWordIds(text)
    wid2weight, docweight = self._get_frequencies(wids)
    self._mass_add_wordinfo(wid2weight, docid)
    self._docweight[docid] = docweight
    self._docwords[docid] = widcode.encode(wids)
    try:
        self.indexed_count.change(1)
    except AttributeError:
        # upgrade indexed_count to Length object
        self.indexed_count = Length.Length(len(self._docweight))
    return len(wids)
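
# A minimal usage sketch (assumed, not from the source): indexing and
# unindexing keep indexed_count in step.  `idx` and the docid are
# hypothetical; Length objects are read by calling them.
def roundtrip(idx):  # hypothetical demonstration helper
    n = idx.index_doc(42, 'some searchable text')  # returns len(wids)
    before = idx.indexed_count()
    idx.unindex_doc(42)                            # no-op if docid absent
    assert idx.indexed_count() == before - 1
    return n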
def _mass_add_wordinfo(self, wid2weight, docid):
    dicttype = type({})
    get_doc2score = self._wordinfo.get
    new_word_count = 0
    for wid, weight in wid2weight.items():
        doc2score = get_doc2score(wid)
        if doc2score is None:
            doc2score = {}
            new_word_count += 1
        elif (isinstance(doc2score, dicttype)
              and len(doc2score) == self.DICT_CUTOFF):
            doc2score = self.family.IF.BTree(doc2score)
        doc2score[docid] = weight
        self._wordinfo[wid] = doc2score  # not redundant: Persistency!
    try:
        self.word_count.change(new_word_count)
    except AttributeError:
        # upgrade word_count to Length object
        self.word_count = Length.Length(len(self._wordinfo))
def _add_wordinfo(self, wid, f, docid):
    # Store a wordinfo in a dict as long as there are less than
    # DICT_CUTOFF docids in the dict.  Otherwise use an IFBTree.

    # The pickle of a dict is smaller than the pickle of an
    # IFBTree, substantially so for small mappings.  Thus, we use
    # a dictionary until the mapping reaches DICT_CUTOFF elements.

    # The cutoff is chosen based on the implementation
    # characteristics of Python dictionaries.  The dict hashtable
    # always has 2**N slots and is resized whenever it is 2/3s
    # full.  A pickled dict with 10 elts is half the size of an
    # IFBTree with 10 elts, and 10 happens to be 2/3s of 2**4.  So
    # choose 10 as the cutoff for now.

    # The IFBTree has a smaller in-memory representation than a
    # dictionary, so pickle size isn't the only consideration when
    # choosing the threshold.  The pickle of a 500-elt dict is 92%
    # of the size of the same IFBTree, but the dict uses more
    # space when it is live in memory.  An IFBTree stores two C
    # arrays of ints, one for the keys and one for the values.  It
    # holds up to 120 key-value pairs in a single bucket.
    doc2score = self._wordinfo.get(wid)
    if doc2score is None:
        doc2score = {}
        # XXX Holy ConflictError, Batman!
        try:
            self.word_count.change(1)
        except AttributeError:
            # upgrade word_count to Length object
            self.word_count = Length.Length(len(self._wordinfo))
            self.word_count.change(1)
    else:
        # _add_wordinfo() is called for each update.  If the map
        # size exceeds the DICT_CUTOFF, convert to an IFBTree.
        # Obscure: First check the type.  If it's not a dict, it
        # can't need conversion, and then we can avoid an expensive
        # len(IFBTree).
        if (isinstance(doc2score, type({}))
                and len(doc2score) == self.DICT_CUTOFF):
            doc2score = self.family.IF.BTree(doc2score)
    doc2score[docid] = f
    self._wordinfo[wid] = doc2score  # not redundant: Persistency!
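
# A standalone sketch of the dict -> IFBTree graduation described above,
# assuming DICT_CUTOFF == 10 (per the comment) and the 32-bit BTree
# family; the names below are local to the example.
import BTrees

family = BTrees.family32
DICT_CUTOFF = 10

doc2score = {docid: 1.0 for docid in range(DICT_CUTOFF)}
if isinstance(doc2score, dict) and len(doc2score) == DICT_CUTOFF:
    # Past the cutoff the pickle-size advantage of the dict is gone,
    # so the posting map switches to the conflict-friendlier IFBTree.
    doc2score = family.IF.BTree(doc2score)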
def clear(self):
    self.values_to_documents = self.family.OO.BTree()
    self.documents_to_values = self.family.IO.BTree()
    self.documentCount = Length.Length(0)
    self.wordCount = Length.Length(0)