Exemple #1
0
 def index_lastloc(self):
   """Return the location of the last document of the first index.

   Only the first entry produced by self.iteridxs() is examined, since
   the first index is expected to be the newest one.  Returns None when
   there is no index at all.
   """
   for (_, newest) in self.iteridxs():
     (count, _) = idx_info(newest)
     (loc, _) = idx_docid2info(newest, count-1)
     # Leave on the very first index; later ones are never consulted.
     return loc
   return None
Exemple #2
0
 def index_lastloc(self):
     """Look up the location of the final document in the newest index.

     self.iteridxs() is expected to produce the newest index first, so
     the loop is abandoned after its first pass.  When no index exists the
     result is None.
     """
     for (_, head_idx) in self.iteridxs():
         (doc_count, _) = idx_info(head_idx)
         (newest_loc, _) = idx_docid2info(head_idx, doc_count - 1)
         break
     else:
         # The loop body never ran: there is no index to inspect.
         newest_loc = None
     return newest_loc
Exemple #3
0
 def load_status(self, x):
   """Restore the search position from an encoded status string.

   x is expected to be the base64 encoding of a big-endian '>HiH' record
   holding (index id, doc id, number of documents already found).  A
   value that cannot be decoded is ignored and the object is left
   untouched.

   On success this sets self.start_loc, self.found_docs (a list of None
   placeholders) and self.searched_docs.  Always returns None.
   """
   from base64 import b64decode
   from struct import error as struct_error
   try:
     (idxid0, docid0, found_docs) = unpack('>HiH', b64decode(x))
   except (TypeError, ValueError, struct_error):
     # Malformed status: b64decode rejected the text (TypeError on
     # Python 2, binascii.Error -- a ValueError -- on Python 3) or the
     # payload has the wrong size for the format (struct.error).
     return
   # put dummy locs (max. 65535)
   self.start_loc = (idxid0, docid0)
   self.found_docs = [None] * found_docs
   searched_docs0 = 0
   # Stays None when the loop below visits no index at all.
   ndocs = None
   for (_, idx) in self._indexdb.iteridxs(end=idxid0-1):
     (ndocs, _) = idx_info(idx)
     searched_docs0 += ndocs
   if ndocs is None:
     # No index was visited, so there is no document count to subtract
     # from; report nothing searched instead of failing on an unbound
     # local.
     searched_in_current = 0
   else:
     searched_in_current = ndocs - docid0
   self.searched_docs = (searched_docs0, searched_in_current)
   return
Exemple #4
0
 def load_status(self, x):
     """Restore the search position from an encoded status string.

     x is expected to be the base64 encoding of a big-endian '>HiH' record
     holding (index id, doc id, number of documents already found).  A
     value that cannot be decoded is ignored and the object is left
     untouched.

     On success this sets self.start_loc, self.found_docs (a list of None
     placeholders) and self.searched_docs.  Always returns None.
     """
     from base64 import b64decode
     from struct import error as struct_error
     try:
         (idxid0, docid0, found_docs) = unpack('>HiH', b64decode(x))
     except (TypeError, ValueError, struct_error):
         # Malformed status: b64decode rejected the text (TypeError on
         # Python 2, binascii.Error -- a ValueError -- on Python 3) or the
         # payload has the wrong size for the format (struct.error).
         return
     # put dummy locs (max. 65535)
     self.start_loc = (idxid0, docid0)
     self.found_docs = [None] * found_docs
     searched_docs0 = 0
     # Stays None when the loop below visits no index at all.
     ndocs = None
     for (_, idx) in self._indexdb.iteridxs(end=idxid0 - 1):
         (ndocs, _) = idx_info(idx)
         searched_docs0 += ndocs
     if ndocs is None:
         # No index was visited, so there is no document count to
         # subtract from; report nothing searched instead of failing on
         # an unbound local.
         searched_in_current = 0
     else:
         searched_in_current = ndocs - docid0
     self.searched_docs = (searched_docs0, searched_in_current)
     return
Exemple #5
0
 def open(self):
     """Initialise self.cdb from self.fname on first use.

     When self.cdb is already truthy nothing happens.  Otherwise the
     handle is created with cdb.init() and the pair returned by
     idx_info() is stored in self.ndocs and self.nterms.
     """
     if self.cdb:
         # Already initialised; keep the existing handle and counts.
         return
     self.cdb = cdb.init(self.fname)
     (self.ndocs, self.nterms) = idx_info(self.cdb)
     return
Exemple #6
0
 def open(self):
   """Lazily set up self.cdb and record the counts idx_info() reports.

   The work is done only while self.cdb is falsy; afterwards the call is
   a no-op.  self.ndocs and self.nterms receive the two values returned
   by idx_info() for the freshly created handle.
   """
   needs_init = not self.cdb
   if needs_init:
     self.cdb = cdb.init(self.fname)
     info = idx_info(self.cdb)
     (self.ndocs, self.nterms) = info
   return None
Exemple #7
0
  def get_docids(self):
    """Generate the candidate documents for the current predicates.

    This is a generator, not a list-returning function.  It walks the
    indexes produced by self._indexdb.iteridxs(start_idx, end_idx) and,
    for each index, combines the locations reported by every predicate
    in self.pos_preds + self.neg_preds:

      * when self.disjunctive is true, documents hit by any predicate
        are collected (OR);
      * otherwise documents must be hit by every non-negative predicate
        (AND), and documents hit by a predicate flagged pred.neg are
        removed.  A non-negative predicate without any hit empties the
        result for that index.

    Yields (idx, docid, contexts) tuples, highest docid first within each
    index.  contexts is a list of (sentids, checkpat) pairs, one per
    predicate that contributed to the document, where sentids is an
    array('i') of sentence ids; it is an empty list when there is no
    positive predicate at all.

    Side effects: self.start_loc and self.searched_docs are updated
    before every yield, and self.searched_docs again once an index is
    finished.

    NOTE: relies on Python 2 APIs (xrange, dict.iteritems and a
    list-returning dict.items).
    """
    (start_idx, start_docid0) = self.start_loc
    (end_idx, end_docid0) = self.end_loc
    # We maintain the number of docs that have been searched so far.
    # But this is separated into two parts:
    #  "all the docs included up to the previous index" +
    #  "the number of docs that have been searched within the current index"
    # This way we can compute the number of searched docs deterministically
    # without any cumulative counting within iterators
    # (no worry for double counting!).
    (searched_docs0, _) = self.searched_docs

    #  start_idx <= idxid <= end_idx.
    #  start_docid-1 >= docid >= end_docid.
    # NOTE(review): the filter on locs below actually keeps
    # start_docid >= docid > end_docid, which differs from the bound
    # stated above -- confirm which one is intended.
    for (idxid,idx) in self._indexdb.iteridxs(start_idx, end_idx):
      assert isinstance(idxid, int)
      try:
        (ndocs, _) = idx_info(idx)
      except KeyError:
        # idx_info() could not find its data in this index; skip it.
        continue
      # Only the first index is limited by the saved start position and
      # only the last one by the saved end position.
      if idxid == start_idx:
        start_docid = min(start_docid0, ndocs)
      else:
        start_docid = ndocs
      if idxid == end_idx:
        end_docid = end_docid0
      else:
        end_docid = 0

      # conj tells the positive branch below whether docs already holds
      # a candidate set that must be intersected with.
      if self.pos_preds:
        conj = False
        docs = {}
      else:
        # no positive predicate.
        # Start from every docid below start_docid so that negative
        # predicates have something to remove from.
        conj = True
        docs = dict( (docid,[]) for docid in xrange(start_docid-1,-1,-1) )

      # Get a set of narrowed documents for each predicate.
      for pred in (self.pos_preds + self.neg_preds):
        # locs: docids must be in descending order. (ie. start_docid > end_docid)
        locs = [ (docid,sentid) for (docid,sentid) in pred.narrow_docids(idx)
                 if start_docid >= docid and docid > end_docid ]
        locs = [ (docid,sentid) for (docid,sentid) in locs
                 if pred.check_docid(docid) and pred.check_sentid(sentid) ]
        if not locs:
          if pred.neg:
            # A negative predicate without hits removes nothing.
            continue
          elif not self.disjunctive:
            # A required predicate has no hit: nothing in this index
            # can match, so drop all candidates and stop early.
            docs.clear()
            break

        if self.disjunctive:
          # disjunctive (OR) search.
          # Group the sentence ids of this predicate by document.
          docs1 = {}
          for (docid,sentid) in locs:
            if docid not in docs1:
              sentids = array('i')
              docs1[docid] = sentids
            else:
              sentids = docs1[docid]
            assert isinstance(sentids, array)
            sentids.append(sentid)
          # combine with the previous docs.
          for (docid,sentids) in docs1.iteritems():
            if docid not in docs:
              r = []
              docs[docid] = r
            else:
              r = docs[docid]
            x = (sentids, pred.checkpat)
            r.append(x)

        elif pred.neg:
          # negative conjunctive (-AND) search.
          for (docid,sentid) in locs:
            if docid in docs:
              del docs[docid]

        else:
          # positive conjunctive (+AND) search.
          # Group the sentence ids by document, ignoring documents that
          # an earlier predicate has already ruled out.
          docs1 = {}
          for (docid,sentid) in locs:
            if conj and (docid not in docs): continue
            if docid not in docs1:
              sentids = array('i')
              docs1[docid] = sentids
            else:
              sentids = docs1[docid]
            assert isinstance(sentids, array)
            sentids.append(sentid)
          if conj:
            # intersect with the previous docs.
            tmp = {}
            for (docid,sentids) in docs1.iteritems():
              r = docs[docid]
              x = (sentids, pred.checkpat)
              r.append(x)
              tmp[docid] = r
            docs = tmp
          else:
            # first positive predicate.
            conj = True
            for (docid,sentids) in docs1.iteritems():
              docs[docid] = [(sentids, pred.checkpat)]

      # docs: the candidate documents in the current index file.
      # Sorted in reverse so that the highest docid is yielded first.
      docs2 = docs.items()
      docs2.sort(reverse=True)
      found = set()
      for (docid,contexts) in docs2:
        # Record the position before yielding so that the state read by
        # the consumer reflects the document being handed out.
        self.start_loc = (idxid, docid)
        self.searched_docs = (searched_docs0, ndocs-docid)
        # Skip if the document is already in the list.
        # (docs2 comes from dict items, so docids should already be
        # unique here; this check is a safeguard.)
        if docid in found: continue
        found.add(docid)
        yield (idx,docid,contexts)

      # Finished this index.
      searched_docs0 += ndocs
      self.searched_docs = (searched_docs0, 0)
    return
Exemple #8
0
 def total_docs(self):
   """Return the document count summed over every index.

   Each index produced by self.iteridxs() is passed to idx_info() and
   the first element of the returned pair is added up.  The result is 0
   when there are no indexes.
   """
   infos = (idx_info(idx) for (_, idx) in self.iteridxs())
   return sum(ndocs for (ndocs, _) in infos)
Exemple #9
0
    def get_docids(self):
        """Generate the candidate documents for the current predicates.

        This is a generator, not a list-returning function.  It walks the
        indexes produced by self._indexdb.iteridxs(start_idx, end_idx) and,
        for each index, combines the locations reported by every predicate
        in self.pos_preds + self.neg_preds:

          * when self.disjunctive is true, documents hit by any predicate
            are collected (OR);
          * otherwise documents must be hit by every non-negative
            predicate (AND), and documents hit by a predicate flagged
            pred.neg are removed.  A non-negative predicate without any
            hit empties the result for that index.

        Yields (idx, docid, contexts) tuples, highest docid first within
        each index.  contexts is a list of (sentids, checkpat) pairs, one
        per predicate that contributed to the document, where sentids is
        an array('i') of sentence ids; it is an empty list when there is
        no positive predicate at all.

        Side effects: self.start_loc and self.searched_docs are updated
        before every yield, and self.searched_docs again once an index is
        finished.

        NOTE: relies on Python 2 APIs (xrange, dict.iteritems and a
        list-returning dict.items).
        """
        (start_idx, start_docid0) = self.start_loc
        (end_idx, end_docid0) = self.end_loc
        # We maintain the number of docs that have been searched so far.
        # But this is separated into two parts:
        #  "all the docs included up to the previous index" +
        #  "the number of docs that have been searched within the current index"
        # This way we can compute the number of searched docs deterministically
        # without any cumulative counting within iterators
        # (no worry for double counting!).
        (searched_docs0, _) = self.searched_docs

        #  start_idx <= idxid <= end_idx.
        #  start_docid-1 >= docid >= end_docid.
        # NOTE(review): the filter on locs below actually keeps
        # start_docid >= docid > end_docid, which differs from the bound
        # stated above -- confirm which one is intended.
        for (idxid, idx) in self._indexdb.iteridxs(start_idx, end_idx):
            assert isinstance(idxid, int)
            try:
                (ndocs, _) = idx_info(idx)
            except KeyError:
                # idx_info() could not find its data in this index; skip it.
                continue
            # Only the first index is limited by the saved start position
            # and only the last one by the saved end position.
            if idxid == start_idx:
                start_docid = min(start_docid0, ndocs)
            else:
                start_docid = ndocs
            if idxid == end_idx:
                end_docid = end_docid0
            else:
                end_docid = 0

            # conj tells the positive branch below whether docs already
            # holds a candidate set that must be intersected with.
            if self.pos_preds:
                conj = False
                docs = {}
            else:
                # no positive predicate.
                # Start from every docid below start_docid so that negative
                # predicates have something to remove from.
                conj = True
                docs = dict(
                    (docid, []) for docid in xrange(start_docid - 1, -1, -1))

            # Get a set of narrowed documents for each predicate.
            for pred in (self.pos_preds + self.neg_preds):
                # locs: docids must be in descending order. (ie. start_docid > end_docid)
                locs = [(docid, sentid)
                        for (docid, sentid) in pred.narrow_docids(idx)
                        if start_docid >= docid and docid > end_docid]
                locs = [
                    (docid, sentid) for (docid, sentid) in locs
                    if pred.check_docid(docid) and pred.check_sentid(sentid)
                ]
                if not locs:
                    if pred.neg:
                        # A negative predicate without hits removes nothing.
                        continue
                    elif not self.disjunctive:
                        # A required predicate has no hit: nothing in this
                        # index can match, so drop all candidates and stop
                        # early.
                        docs.clear()
                        break

                if self.disjunctive:
                    # disjunctive (OR) search.
                    # Group the sentence ids of this predicate by document.
                    docs1 = {}
                    for (docid, sentid) in locs:
                        if docid not in docs1:
                            sentids = array('i')
                            docs1[docid] = sentids
                        else:
                            sentids = docs1[docid]
                        assert isinstance(sentids, array)
                        sentids.append(sentid)
                    # combine with the previous docs.
                    for (docid, sentids) in docs1.iteritems():
                        if docid not in docs:
                            r = []
                            docs[docid] = r
                        else:
                            r = docs[docid]
                        x = (sentids, pred.checkpat)
                        r.append(x)

                elif pred.neg:
                    # negative conjunctive (-AND) search.
                    for (docid, sentid) in locs:
                        if docid in docs:
                            del docs[docid]

                else:
                    # positive conjunctive (+AND) search.
                    # Group the sentence ids by document, ignoring documents
                    # that an earlier predicate has already ruled out.
                    docs1 = {}
                    for (docid, sentid) in locs:
                        if conj and (docid not in docs): continue
                        if docid not in docs1:
                            sentids = array('i')
                            docs1[docid] = sentids
                        else:
                            sentids = docs1[docid]
                        assert isinstance(sentids, array)
                        sentids.append(sentid)
                    if conj:
                        # intersect with the previous docs.
                        tmp = {}
                        for (docid, sentids) in docs1.iteritems():
                            r = docs[docid]
                            x = (sentids, pred.checkpat)
                            r.append(x)
                            tmp[docid] = r
                        docs = tmp
                    else:
                        # first positive predicate.
                        conj = True
                        for (docid, sentids) in docs1.iteritems():
                            docs[docid] = [(sentids, pred.checkpat)]

            # docs: the candidate documents in the current index file.
            # Sorted in reverse so that the highest docid is yielded first.
            docs2 = docs.items()
            docs2.sort(reverse=True)
            found = set()
            for (docid, contexts) in docs2:
                # Record the position before yielding so that the state read
                # by the consumer reflects the document being handed out.
                self.start_loc = (idxid, docid)
                self.searched_docs = (searched_docs0, ndocs - docid)
                # Skip if the document is already in the list.
                # (docs2 comes from dict items, so docids should already be
                # unique here; this check is a safeguard.)
                if docid in found: continue
                found.add(docid)
                yield (idx, docid, contexts)

            # Finished this index.
            searched_docs0 += ndocs
            self.searched_docs = (searched_docs0, 0)
        return
Exemple #10
0
 def total_docs(self):
     """Count the documents held by all indexes together.

     For every index produced by self.iteridxs() the first element of the
     pair returned by idx_info() is collected; the sum of those values is
     returned (0 when there are no indexes).
     """
     counts = []
     for (_, idx) in self.iteridxs():
         (ndocs, _) = idx_info(idx)
         counts.append(ndocs)
     return sum(counts)