Ejemplo n.º 1
0
    def reset(self):
        """Discard all index state and start over with empty structures."""
        # wid -> {docid -> weight}, i.e. t -> D -> w(D, t).
        # Concrete indexers each define their own notion of term weight,
        # but all of them keep the wid -> (docid -> weight) mapping in
        # ._wordinfo.
        #
        # Out-of-vocabulary words come in two flavors: wid 0 is the
        # explicit OOV marker, and the lexicon may also return a non-zero
        # wid for a word this index no longer (or never) stored -- e.g.
        # the last document containing the word was unindexed, or the
        # lexicon is shared with another index that introduced the word.
        # So a word is in-vocabulary for this index exactly when its wid
        # is a key of _wordinfo, and wid 0 must never be a key.
        #
        # An IOBTree is used directly rather than going through
        # self.family, because wids are always "I" flavor.
        self._wordinfo = IOBTree()

        # docid -> weight.  As with term weights, each concrete indexer
        # defines what a document weight means, but stores it here.
        self._docweight = self.family.IF.BTree()

        # docid -> WidCode-encoded list of wids; needed both for
        # unindexing a document and for phrase searches.
        self._docwords = self.family.IO.BTree()

        # Length objects are conflict-free counters, so concurrent
        # transactions can update them without write conflicts.
        self.word_count = Length.Length()
        self.indexed_count = Length.Length()
Ejemplo n.º 2
0
def init_simulation_db_structure(db_conn):
    """
    Init database structure for storing simulations.

    :param db_conn: DB connection
    :return: None
    """
    root = db_conn.root
    # The simulations themselves, keyed in an OOBTree.
    root.simulations = OOBTree.BTree()
    # Conflict-free counters summarizing the stored simulations.
    root.n_simulations = Length.Length()
    root.n_simulation_batches = Length.Length()
    # Least common multiple of the simulation batch sizes; starts at 1,
    # the identity for lcm.
    root.simulation_batch_sizes_lcm = Length.Length(1)
Ejemplo n.º 3
0
def init_attractor_db_structure(db_conn):
    """
    Init database structure for storing aggregated attractors.

    :param db_conn: database connection
    :return None
    """
    root = db_conn.root
    # The aggregated attractors plus two access paths into them.
    root.aggregated_attractors = OOBTree.BTree()
    root.aggregated_attractor_keys_by_batch_index = OOBTree.BTree()
    root.sorted_aggregated_attractors = OOBTree.BTree()
    # Conflict-free counters summarizing the stored attractors.
    root.n_aggregated_attractors = Length.Length()
    root.total_frequency = Length.Length()
    root.n_aggregated_attractor_batches = Length.Length()
    # Least common multiple of the aggregated-attractor batch sizes;
    # starts at 1, the identity for lcm.
    root.aggregated_attractor_batch_sizes_lcm = Length.Length(1)
Ejemplo n.º 4
0
 def _del_wordinfo(self, wid, docid):
     """Drop docid's entry from the weight map stored for wid.

     When that leaves the map empty, the wid itself is removed from
     _wordinfo and the word counter is decremented.
     """
     mapping = self._wordinfo[wid]
     del mapping[docid]
     if not mapping:
         # That was the last document containing this word: forget it.
         del self._wordinfo[wid]
         try:
             self.word_count.change(-1)
         except AttributeError:
             # Legacy instance whose word_count is a plain int: replace
             # it with a Length.  len() already reflects the deletion
             # above, so no further change() is needed.
             self.word_count = Length.Length(len(self._wordinfo))
     else:
         # Reassign so persistence machinery notices the mutated value.
         self._wordinfo[wid] = mapping
Ejemplo n.º 5
0
 def unindex_doc(self, docid):
     """Remove docid from the index; silently a no-op if not indexed."""
     if docid not in self._docwords:
         return
     # Deduplicate the document's wids via a TreeSet, then detach the
     # docid from each word's weight map.
     for wid in self.family.IF.TreeSet(self.get_words(docid)).keys():
         self._del_wordinfo(wid, docid)
     del self._docwords[docid]
     del self._docweight[docid]
     try:
         self.indexed_count.change(-1)
     except AttributeError:
         # Legacy instance whose indexed_count is a plain int: replace
         # it with a Length seeded from the already-updated doc count.
         self.indexed_count = Length.Length(len(self._docweight))
Ejemplo n.º 6
0
 def index_doc(self, docid, text):
     """Index *text* under *docid* and return the number of wids indexed.

     Delegates to reindex_doc() when docid is already present.
     """
     if docid in self._docwords:
         return self.reindex_doc(docid, text)
     wids = self._lexicon.sourceToWordIds(text)
     weights, total = self._get_frequencies(wids)
     self._mass_add_wordinfo(weights, docid)
     self._docweight[docid] = total
     self._docwords[docid] = widcode.encode(wids)
     try:
         self.indexed_count.change(1)
     except AttributeError:
         # Legacy instance whose indexed_count is a plain int: replace
         # it with a Length seeded from the current document count.
         self.indexed_count = Length.Length(len(self._docweight))
     return len(wids)
Ejemplo n.º 7
0
 def _mass_add_wordinfo(self, wid2weight, docid):
     """Record docid's weight for every wid in *wid2weight* in one pass."""
     lookup = self._wordinfo.get
     added = 0
     for wid, weight in wid2weight.items():
         entry = lookup(wid)
         if entry is None:
             # First document ever seen for this word.
             entry = {}
             added += 1
         elif isinstance(entry, dict) and len(entry) == self.DICT_CUTOFF:
             # The per-word dict reached the cutoff: promote to IFBTree.
             entry = self.family.IF.BTree(entry)
         entry[docid] = weight
         # Reassign so persistence machinery notices the mutated value.
         self._wordinfo[wid] = entry
     try:
         self.word_count.change(added)
     except AttributeError:
         # Legacy instance whose word_count is a plain int: replace it
         # with a Length.  len(_wordinfo) already includes the words
         # added above, so no further change() is needed.
         self.word_count = Length.Length(len(self._wordinfo))
Ejemplo n.º 8
0
    def _add_wordinfo(self, wid, f, docid):
        """Record weight *f* for the (*wid*, *docid*) pair in _wordinfo.

        Per-word maps start life as plain dicts and are promoted to
        IFBTrees once they reach DICT_CUTOFF entries.  Rationale: a dict
        pickles substantially smaller for small mappings (a 10-element
        dict pickle is about half the size of the equivalent IFBTree,
        and 10 is 2/3 of 2**4 -- the point where a dict's hashtable
        resizes), so 10 was chosen as the cutoff.  Pickle size is not
        the whole story, though: a 500-element dict pickle is still 92%
        of the IFBTree's, but the live dict uses more memory, while the
        IFBTree stores keys and values as two C arrays of ints and
        holds up to 120 pairs per bucket.
        """
        entry = self._wordinfo.get(wid)
        if entry is None:
            # NOTE: creating the fresh dict here is a known source of
            # ConflictErrors under concurrent indexing.
            entry = {}
            try:
                self.word_count.change(1)
            except AttributeError:
                # Legacy instance whose word_count is a plain int: seed a
                # Length from the current size, then count this new word
                # separately -- it has not been added to _wordinfo yet.
                self.word_count = Length.Length(len(self._wordinfo))
                self.word_count.change(1)
        else:
            # Called on every update.  Check the type first: anything
            # that is not a dict cannot need conversion, which avoids an
            # expensive len() on an IFBTree.
            if isinstance(entry, dict) and len(entry) == self.DICT_CUTOFF:
                entry = self.family.IF.BTree(entry)
        entry[docid] = f
        # Reassign so persistence machinery notices the mutated value.
        self._wordinfo[wid] = entry
Ejemplo n.º 9
0
 def clear(self):
     """Reset the index to a freshly-created, empty state."""
     # Forward (value -> documents) and reverse (document -> values) maps.
     self.values_to_documents = self.family.OO.BTree()
     self.documents_to_values = self.family.IO.BTree()
     # Conflict-free counters, both starting from zero.
     self.documentCount = Length.Length(0)
     self.wordCount = Length.Length(0)