def index_lastloc(self):
    """Return the location of the last document in the newest index.

    Falls back to None when no index is available.
    """
    for (_, idx) in self.iteridxs():
        (ndocs, _) = idx_info(idx)
        (loc, _) = idx_docid2info(idx, ndocs - 1)
        # The first index returned is the newest one, so it alone decides.
        return loc
    return None
def index_lastloc(self):
    """Location of the most recently indexed document, or None if empty."""
    lastloc = None
    # Only the first (newest) index needs to be consulted.
    head = next(iter(self.iteridxs()), None)
    if head is not None:
        (_, idx) = head
        (ndocs, _) = idx_info(idx)
        (lastloc, _) = idx_docid2info(idx, ndocs - 1)
    return lastloc
def load_status(self, x):
    """Restore a previously serialized search position from a base64 string.

    x packs (idxid0, docid0, found_docs) with the struct format '>HiH'
    (hence the 65535 cap noted below).  On malformed input the method is
    a silent no-op, leaving the status untouched (best-effort restore).
    """
    from base64 import b64decode
    try:
        (idxid0, docid0, found_docs) = unpack('>HiH', b64decode(x))
    except Exception:
        # Narrowed from a bare except: still best-effort on bad input,
        # but no longer swallows SystemExit/KeyboardInterrupt.
        return
    # put dummy locs (max. 65535)
    self.start_loc = (idxid0, docid0)
    self.found_docs = [None] * found_docs
    # Re-count the documents contained in every index preceding idxid0.
    searched_docs0 = 0
    ndocs = 0  # guard: keeps the assignment below defined when no index is iterated.
    for (idxid, idx) in self._indexdb.iteridxs(end=idxid0 - 1):
        (ndocs, _) = idx_info(idx)
        searched_docs0 += ndocs
    # NOTE(review): ndocs here is the size of the *last iterated* index
    # (idxid0-1), not of index idxid0 itself -- confirm this matches the
    # way get_docids() computes searched_docs.
    self.searched_docs = (searched_docs0, ndocs - docid0)
    return
def load_status(self, x):
    """Restore a saved search position.

    x: base64 text packing (idxid0, docid0, found_docs) as '>HiH'
    (so idxid0 and found_docs max out at 65535).  Malformed input is
    ignored and the status is left unchanged (best-effort restore).
    """
    from base64 import b64decode
    try:
        (idxid0, docid0, found_docs) = unpack('>HiH', b64decode(x))
    except Exception:
        # Was a bare except; Exception keeps the best-effort behavior
        # without also catching SystemExit/KeyboardInterrupt.
        return
    # put dummy locs (max. 65535)
    self.start_loc = (idxid0, docid0)
    self.found_docs = [None] * found_docs
    # Sum the document counts of all indices before idxid0.
    searched_docs0 = 0
    ndocs = 0  # guard against an empty iteration leaving ndocs unbound.
    for (idxid, idx) in self._indexdb.iteridxs(end=idxid0 - 1):
        (ndocs, _) = idx_info(idx)
        searched_docs0 += ndocs
    # NOTE(review): ndocs is taken from the last index iterated (idxid0-1),
    # not from index idxid0 -- verify against get_docids()'s bookkeeping.
    self.searched_docs = (searched_docs0, ndocs - docid0)
    return
def open(self):
    """Open the underlying cdb index on first use and cache (ndocs, nterms)."""
    if self.cdb:
        # Already opened - nothing to do.
        return
    self.cdb = cdb.init(self.fname)
    (self.ndocs, self.nterms) = idx_info(self.cdb)
def get_docids(self):
    """Generate (idx, docid, contexts) for documents matching the predicates.

    Walks the index files from start_loc toward end_loc, combining
    self.pos_preds / self.neg_preds either disjunctively (OR) or
    conjunctively (AND) according to self.disjunctive.  Progress is
    recorded in self.start_loc / self.searched_docs before each yield,
    so a search can be suspended and resumed later.
    contexts is a list of (sentid array, checkpat) pairs.
    """
    (start_idx, start_docid0) = self.start_loc
    (end_idx, end_docid0) = self.end_loc
    # We maintain the number of docs that have been searched so far.
    # But this is separated into two parts:
    #   "all the docs included up to the previous index" +
    #   "the number of docs that have been searched within the current index"
    # This way we can compute the number of searched docs deterministically
    # without any cumulative counting within iterators
    # (no worry for double counting!).
    (searched_docs0, _) = self.searched_docs
    # start_idx <= idxid <= end_idx.
    # start_docid-1 >= docid >= end_docid.
    for (idxid, idx) in self._indexdb.iteridxs(start_idx, end_idx):
        assert isinstance(idxid, int)
        try:
            (ndocs, _) = idx_info(idx)
        except KeyError:
            # Skip an index file whose info record is missing.
            continue
        # Clamp the docid window to this particular index file.
        if idxid == start_idx:
            start_docid = min(start_docid0, ndocs)
        else:
            start_docid = ndocs
        if idxid == end_idx:
            end_docid = end_docid0
        else:
            end_docid = 0
        if self.pos_preds:
            conj = False
            docs = {}
        else:
            # no positive predicate: start from every docid in the window
            # and let the negative predicates remove entries.
            conj = True
            docs = dict( (docid,[]) for docid in xrange(start_docid-1,-1,-1) )
        # Get a set of narrowed documents for each predicate.
        for pred in (self.pos_preds + self.neg_preds):
            # locs: docids must be in descending order. (ie. start_docid > end_docid)
            locs = [ (docid,sentid) for (docid,sentid) in pred.narrow_docids(idx)
                     if start_docid >= docid and docid > end_docid ]
            locs = [ (docid,sentid) for (docid,sentid) in locs
                     if pred.check_docid(docid) and pred.check_sentid(sentid) ]
            if not locs:
                if pred.neg:
                    # A negative predicate with no hits removes nothing.
                    continue
                elif not self.disjunctive:
                    # Conjunctive search and a positive predicate matched
                    # nothing: no document can match, abort this index.
                    docs.clear()
                    break
            if self.disjunctive:
                # disjunctive (OR) search.
                docs1 = {}
                for (docid,sentid) in locs:
                    if docid not in docs1:
                        sentids = array('i')
                        docs1[docid] = sentids
                    else:
                        sentids = docs1[docid]
                    assert isinstance(sentids, array)
                    sentids.append(sentid)
                # combine with the previous docs.
                for (docid,sentids) in docs1.iteritems():
                    if docid not in docs:
                        r = []
                        docs[docid] = r
                    else:
                        r = docs[docid]
                    x = (sentids, pred.checkpat)
                    r.append(x)
            elif pred.neg:
                # negative conjunctive (-AND) search.
                for (docid,sentid) in locs:
                    if docid in docs:
                        del docs[docid]
            else:
                # positive conjunctive (+AND) search.
                docs1 = {}
                for (docid,sentid) in locs:
                    if conj and (docid not in docs):
                        # Already excluded by an earlier predicate.
                        continue
                    if docid not in docs1:
                        sentids = array('i')
                        docs1[docid] = sentids
                    else:
                        sentids = docs1[docid]
                    assert isinstance(sentids, array)
                    sentids.append(sentid)
                if conj:
                    # intersect with the previous docs.
                    tmp = {}
                    for (docid,sentids) in docs1.iteritems():
                        r = docs[docid]
                        x = (sentids, pred.checkpat)
                        r.append(x)
                        tmp[docid] = r
                    docs = tmp
                else:
                    # first positive predicate.
                    conj = True
                    for (docid,sentids) in docs1.iteritems():
                        docs[docid] = [(sentids, pred.checkpat)]
        # docs: the candidate documents in the current index file,
        # visited in descending docid order.
        docs2 = docs.items()
        docs2.sort(reverse=True)
        found = set()
        for (docid,contexts) in docs2:
            # Record the resume position before yielding.
            self.start_loc = (idxid, docid)
            self.searched_docs = (searched_docs0, ndocs-docid)
            # Skip if the document is already in the list.
            if docid in found:
                continue
            found.add(docid)
            yield (idx,docid,contexts)
        # Finished this index.
        searched_docs0 += ndocs
        self.searched_docs = (searched_docs0, 0)
    return
def total_docs(self):
    """Return the combined number of documents over every index file."""
    counts = (idx_info(idx)[0] for (_, idx) in self.iteridxs())
    return sum(counts)
def get_docids(self):
    """Yield (idx, docid, contexts) for every document matching the predicates.

    Scans the index files between start_loc and end_loc, applying
    self.pos_preds / self.neg_preds in OR mode (self.disjunctive) or
    AND mode.  self.start_loc and self.searched_docs are updated before
    each yield so that an interrupted search can be resumed.
    contexts is a list of (sentid array, checkpat) pairs.
    """
    (start_idx, start_docid0) = self.start_loc
    (end_idx, end_docid0) = self.end_loc
    # We maintain the number of docs that have been searched so far.
    # But this is separated into two parts:
    #   "all the docs included up to the previous index" +
    #   "the number of docs that have been searched within the current index"
    # This way we can compute the number of searched docs deterministically
    # without any cumulative counting within iterators
    # (no worry for double counting!).
    (searched_docs0, _) = self.searched_docs
    # start_idx <= idxid <= end_idx.
    # start_docid-1 >= docid >= end_docid.
    for (idxid, idx) in self._indexdb.iteridxs(start_idx, end_idx):
        assert isinstance(idxid, int)
        try:
            (ndocs, _) = idx_info(idx)
        except KeyError:
            # Index file without a readable info record: skip it.
            continue
        # Restrict the docid range to the current index file.
        if idxid == start_idx:
            start_docid = min(start_docid0, ndocs)
        else:
            start_docid = ndocs
        if idxid == end_idx:
            end_docid = end_docid0
        else:
            end_docid = 0
        if self.pos_preds:
            conj = False
            docs = {}
        else:
            # no positive predicate: seed with every docid in range so the
            # negative predicates can prune it.
            conj = True
            docs = dict( (docid, []) for docid in xrange(start_docid - 1, -1, -1))
        # Get a set of narrowed documents for each predicate.
        for pred in (self.pos_preds + self.neg_preds):
            # locs: docids must be in descending order. (ie. start_docid > end_docid)
            locs = [(docid, sentid) for (docid, sentid) in pred.narrow_docids(idx)
                    if start_docid >= docid and docid > end_docid]
            locs = [ (docid, sentid) for (docid, sentid) in locs
                     if pred.check_docid(docid) and pred.check_sentid(sentid) ]
            if not locs:
                if pred.neg:
                    # No hits for a negative predicate: nothing to remove.
                    continue
                elif not self.disjunctive:
                    # AND search with a positive predicate matching nothing:
                    # the whole index has no match.
                    docs.clear()
                    break
            if self.disjunctive:
                # disjunctive (OR) search.
                docs1 = {}
                for (docid, sentid) in locs:
                    if docid not in docs1:
                        sentids = array('i')
                        docs1[docid] = sentids
                    else:
                        sentids = docs1[docid]
                    assert isinstance(sentids, array)
                    sentids.append(sentid)
                # combine with the previous docs.
                for (docid, sentids) in docs1.iteritems():
                    if docid not in docs:
                        r = []
                        docs[docid] = r
                    else:
                        r = docs[docid]
                    x = (sentids, pred.checkpat)
                    r.append(x)
            elif pred.neg:
                # negative conjunctive (-AND) search.
                for (docid, sentid) in locs:
                    if docid in docs:
                        del docs[docid]
            else:
                # positive conjunctive (+AND) search.
                docs1 = {}
                for (docid, sentid) in locs:
                    if conj and (docid not in docs):
                        # Excluded by an earlier predicate.
                        continue
                    if docid not in docs1:
                        sentids = array('i')
                        docs1[docid] = sentids
                    else:
                        sentids = docs1[docid]
                    assert isinstance(sentids, array)
                    sentids.append(sentid)
                if conj:
                    # intersect with the previous docs.
                    tmp = {}
                    for (docid, sentids) in docs1.iteritems():
                        r = docs[docid]
                        x = (sentids, pred.checkpat)
                        r.append(x)
                        tmp[docid] = r
                    docs = tmp
                else:
                    # first positive predicate.
                    conj = True
                    for (docid, sentids) in docs1.iteritems():
                        docs[docid] = [(sentids, pred.checkpat)]
        # docs: the candidate documents in the current index file,
        # walked newest (largest docid) first.
        docs2 = docs.items()
        docs2.sort(reverse=True)
        found = set()
        for (docid, contexts) in docs2:
            # Save the resume position before handing out the result.
            self.start_loc = (idxid, docid)
            self.searched_docs = (searched_docs0, ndocs - docid)
            # Skip if the document is already in the list.
            if docid in found:
                continue
            found.add(docid)
            yield (idx, docid, contexts)
        # Finished this index.
        searched_docs0 += ndocs
        self.searched_docs = (searched_docs0, 0)
    return
def total_docs(self):
    """Count all documents stored across the index files."""
    return sum(idx_info(idx)[0] for (_, idx) in self.iteridxs())