def idxmerge(cdbname, idxstomerge, verbose=0):
    """Merge multiple index files into one new cdb file named cdbname.

    Each element of idxstomerge appears to expose assignnewids1/2, copysents,
    convertoldids, old2new, ndocs and nterms (project-defined index objects --
    TODO confirm exact contract).  Later indices in idxstomerge take
    precedence: iteration is over reversed(idxstomerge), and ids are flipped
    with n - docid below so earlier-merged (newer) documents get lower ids.
    NOTE: Python 2 code (iteritems, print >> syntax, integer division).
    """
    # Count all the unique locations and assign new document ids.
    idxorder = {}   # idx -> merge priority (0 = last element of idxstomerge)
    loc2docid = {}  # document location -> provisional new docid
    for (i,idx) in enumerate(reversed(idxstomerge)):
        idx.assignnewids1(loc2docid)
        idxorder[idx] = i
    n = len(loc2docid)
    # Invert the provisional numbering so final ids run in the opposite order.
    loc2docid = dict( (loc,n-docid) for (loc,docid) in loc2docid.iteritems() )
    for idx in idxstomerge:
        idx.assignnewids2(loc2docid)
    # Create a new index file.
    maker = cdb.cdbmake(cdbname, cdbname+'.tmp')
    if verbose:
        print >>sys.stderr, 'Merging: %r (docs=%d, est. terms=%d): %r' % \
            (cdbname, sum( idx.ndocs for idx in idxstomerge ),
             estimate_terms( idx.nterms for idx in idxstomerge ), idxstomerge)
    # Copy sentences to a new index file with unique ids.
    for idx in idxstomerge:
        idx.copysents(maker)
    # Merge document ids and offsets.
    nterms = 0
    docid2info = []  # (new docid, packed info record) pairs, written later
    for (k,vs) in cdbmerge(idxstomerge):
        # Property keys sort after term keys, so stop at the first one;
        # PROP_LOC / PROP_IDXINFO records are regenerated below instead.
        if k[0] == PROP_LOC or k[0] == PROP_IDXINFO: break
        if k[0] == PROP_DOCID:
            # read a docid->loc mapping
            (oldid,) = unpack('>xi', k)  # skip 1 marker byte, read old docid
            for (info,idx) in vs:
                if oldid not in idx.old2new: continue
                newid = idx.old2new[oldid]
                docid2info.append((newid, info))
                # info[4:] is presumably the location string after a 4-byte
                # prefix -- sanity-check it maps back to the same new id.
                assert loc2docid[info[4:]] == newid
        else:
            # merge docid+pos sets
            # Sort by merge priority so the highest-precedence index's
            # array comes first; then concatenate the rest onto it.
            vs = sorted(( (idxorder[idx], idx.convertoldids(v)) for (v,idx) in vs ))
            # Each entry seems to occupy two array slots -- hence the /2
            # (Python 2 integer division).  TODO confirm encode_array format.
            ents = sum( len(a) for (_,a) in vs )/2
            (_,r) = vs.pop(0)
            for (_,a) in vs:
                r.extend(a)
            maker.add(k, encode_array(ents, r))
            nterms += 1
        if verbose and nterms % 1000 == 0:
            sys.stderr.write('.'); sys.stderr.flush()
    # write docid->loc mappings (avoiding dupes)
    docid2info.sort()
    for (docid,info) in docid2info:
        maker.add(pack('>ci', PROP_DOCID, docid), info)
    # write loc->docid mappings (avoiding dupes)
    for (loc,docid) in sorted(loc2docid.iteritems()):
        if loc:
            maker.add(PROP_LOC+loc, pack('>i', docid))
    if verbose:
        print >>sys.stderr, 'done: docs=%d, terms=%d' % (len(docid2info), nterms)
    # Trailer record: total document and term counts for the merged index.
    maker.add(PROP_IDXINFO, pack('>ii', len(docid2info), nterms))
    maker.finish()
    return
def idxmerge(cdbname, idxstomerge, verbose=0):
    """Merge the given index files into a single new cdb file at cdbname.

    The idxstomerge objects supply assignnewids1/2, copysents, convertoldids,
    old2new, ndocs and nterms (project-defined -- TODO confirm contract).
    Indices are walked in reverse and the id numbering is flipped with
    n - docid, so ordering in idxstomerge determines document-id precedence.
    NOTE(review): Python 2 only (iteritems, print >> statement, integer /).
    """
    # Count all the unique locations and assign new document ids.
    idxorder = {}   # maps each index to its merge priority
    loc2docid = {}  # location -> provisional new document id
    for (i, idx) in enumerate(reversed(idxstomerge)):
        idx.assignnewids1(loc2docid)
        idxorder[idx] = i
    n = len(loc2docid)
    # Flip the provisional ids so the final numbering runs the other way.
    loc2docid = dict(
        (loc, n - docid) for (loc, docid) in loc2docid.iteritems())
    for idx in idxstomerge:
        idx.assignnewids2(loc2docid)
    # Create a new index file.
    maker = cdb.cdbmake(cdbname, cdbname + '.tmp')
    if verbose:
        print >>sys.stderr, 'Merging: %r (docs=%d, est. terms=%d): %r' % \
            (cdbname, sum( idx.ndocs for idx in idxstomerge ),
             estimate_terms( idx.nterms for idx in idxstomerge ), idxstomerge)
    # Copy sentences to a new index file with unique ids.
    for idx in idxstomerge:
        idx.copysents(maker)
    # Merge document ids and offsets.
    nterms = 0
    docid2info = []  # (new docid, packed info) pairs, flushed after the loop
    for (k, vs) in cdbmerge(idxstomerge):
        # Property records sort after term records; stop at the first one
        # since PROP_LOC / PROP_IDXINFO are rebuilt from scratch below.
        if k[0] == PROP_LOC or k[0] == PROP_IDXINFO: break
        if k[0] == PROP_DOCID:
            # read a docid->loc mapping
            (oldid, ) = unpack('>xi', k)  # 1 pad byte, then the old docid
            for (info, idx) in vs:
                if oldid not in idx.old2new: continue
                newid = idx.old2new[oldid]
                docid2info.append((newid, info))
                # info[4:] looks like the location after a 4-byte header;
                # verify it round-trips to the same new id.
                assert loc2docid[info[4:]] == newid
        else:
            # merge docid+pos sets
            # Order arrays by merge priority, keep the first as the base
            # and append the remainder to it.
            vs = sorted(
                ((idxorder[idx], idx.convertoldids(v)) for (v, idx) in vs))
            # Each entry apparently spans two array slots, hence the /2
            # (Python 2 integer division) -- TODO confirm encode_array layout.
            ents = sum(len(a) for (_, a) in vs) / 2
            (_, r) = vs.pop(0)
            for (_, a) in vs:
                r.extend(a)
            maker.add(k, encode_array(ents, r))
            nterms += 1
        if verbose and nterms % 1000 == 0:
            sys.stderr.write('.')
            sys.stderr.flush()
    # write docid->loc mappings (avoiding dupes)
    docid2info.sort()
    for (docid, info) in docid2info:
        maker.add(pack('>ci', PROP_DOCID, docid), info)
    # write loc->docid mappings (avoiding dupes)
    for (loc, docid) in sorted(loc2docid.iteritems()):
        if loc:
            maker.add(PROP_LOC + loc, pack('>i', docid))
    if verbose:
        print >> sys.stderr, 'done: docs=%d, terms=%d' % (len(docid2info),
                                                          nterms)
    # Trailer record: merged document and term totals.
    maker.add(PROP_IDXINFO, pack('>ii', len(docid2info), nterms))
    maker.finish()
    return
def add_idx(self, idxid):
    """Open a brand-new cdb index file for idxid.

    The filename comes from self.gen_idx_fname; the cdb writer builds the
    file under a '.tmp' name until finished.  Returns (filename, maker).
    """
    path = self.gen_idx_fname(idxid)
    writer = cdb.cdbmake(path, '%s.tmp' % path)
    return (path, writer)
def add_idx(self, idxid):
    """Create a new cdb index writer for idxid.

    Derives the target filename via self.gen_idx_fname and opens a cdbmake
    writer that stages output in a '.tmp' sibling.  Returns the pair
    (filename, maker).
    """
    target = self.gen_idx_fname(idxid)
    staging = target + '.tmp'
    builder = cdb.cdbmake(target, staging)
    return (target, builder)