def compressTermIdData(self, termId):
    """Compress the DocIdTermInstanceTable held for termId.

    Delegates to data.compressDocIdTermInstanceTable and returns its
    result (a (header, compressedData) pair, as consumed by
    mergePartitions).
    """
    table = self.termIdHash[termId]
    return data.compressDocIdTermInstanceTable(table)
def mergePartitions(self,termIdList,*partitions):
    """Merge the data from partitions into self

    This must be a method on an ExternalPartition, MemoryPartitions have no
    concept of merging.  termIdList must contain the sorted list of all
    termIds in all partitions; the resulting merged partition will contain
    one entry for each termId.

    Side effects: rewrites the on-disk partition file at self.path in place,
    updates self.termIdHash offsets, deletes merged termIds from the source
    partitions (but never from self), and re-initializes the mmap via
    self.__mmap_init__().
    """
    # seek whence constants, inlined so we need not import them from wherever...
    SEEK_END = 2
    SEEK_CUR = 1
    # Internal helper functions to handle some preliminaries.
    # They make the main merge code easier to understand, at the cost of
    # overall function length.
    def _growPartitionFile(howMuch):
        """Extend the disk partition file by howMuch bytes (zero-filled),
        then re-mmap.  NOTE(review): assumes howMuch >= 1 — with howMuch == 0
        the seek(howMuch - 1) would step one byte backwards; confirm callers
        never pass 0."""
        if os.path.exists(self.path):
            # open read/write ("rb+") rather than "wb" to avoid truncating
            # an existing file
            print >> sys.stderr, "Extending ExternalPartition %s" % self.path
            fp = open(self.path,"rb+")
        else:
            print >> sys.stderr, "Creating ExternalPartiton %s" % self.path
            fp = open(self.path,"wb")
        fp.seek(0,SEEK_END)
        previousSize = fp.tell()
        # classic sparse-extend trick: seek past the end and write a single
        # byte so the file grows by exactly howMuch bytes
        fp.seek(howMuch - 1,SEEK_CUR)
        fp.write('\x00')
        newSize = fp.tell()
        print >> sys.stderr, "ExternalPartition grew %d bytes, has size %s" % (newSize-previousSize,newSize)
        fp.close()
        self.__mmap_init__()
    def _relocateDocIdTermInstanceTables():
        """Relocate the existing tables to the end of the file, maintaining
        proper offsets in self.termIdHash.

        Walks termIds in descending order, packing each table flush against
        the previously-moved one (wp walks backwards from EOF), so the front
        of the file is freed up for the merge pass to write into.
        NOTE(review): copying highest-offset tables last would overlap —
        this presumes tables are stored in ascending termId/offset order;
        confirm against the writer."""
        rp = open(self.path,"rb")       # read side: original table locations
        wp = open(self.path,"rb+")      # write side: walks backwards from EOF
        wp.seek(0,SEEK_END)
        for termId in reversed(sorted(self.termIdHash)):
            header = self.termIdHash[termId]
            rp.seek(header.offset)
            # step back just far enough to hold this table
            wp.seek(-header.length,SEEK_CUR)
            newOffset = wp.tell()
            wp.write(rp.read(header.length))
            # rewind to the start of what we just wrote so the next
            # (lower) termId packs immediately before it
            wp.seek(newOffset)
            self.termIdHash[termId].offset = newOffset
        rp.close()
        wp.close()
    # Main Merge Logic:
    # 1) grow the file by the combined estimated size of the incoming
    #    partitions, 2) push existing tables to the back of the file,
    # 3) write merged tables front-to-back, 4) truncate the leftover tail.
    spaceNeeded = sum(map(lambda partition: partition.estimateSizeOnDisk(),partitions))
    _growPartitionFile(spaceNeeded)
    _relocateDocIdTermInstanceTables()
    wp = open(self.path,"rb+")
    for termId in termIdList:
        # collect every partition that holds this termId
        partitionsHoldingTermId = list()
        for partition in partitions:
            if termId in partition:
                partitionsHoldingTermId.append(partition)
        # always add self last
        if termId in self:
            partitionsHoldingTermId.append(self)
        if len(partitionsHoldingTermId) == 0:
            continue
        else:
            newOffset = wp.tell()
            if len(partitionsHoldingTermId) == 1:
                # fast path: only one holder, so its already-compressed data
                # can be copied verbatim — no decompress/re-merge needed
                partition = partitionsHoldingTermId[0]
                #print >> sys.stderr, "Merge single instance of termId %d from %s" % (termId,partition.name)
                header,compressedData = partition.compressTermIdData(termId)
                wp.write(compressedData)
                header.offset = newOffset
                self.termIdHash[termId] = header
                if partition is not self:
                    partition.deleteTermId(termId)
            else:
                # slow path: multiple holders — accumulate every
                # (docId, termInstance) record into one table, then
                # compress the combined result
                table = data.DocIdTermInstanceTable()
                for partition in partitionsHoldingTermId:
                    #print >> sys.stderr, "Merge multi instance termId %s from %s" % (termId,partition.name)
                    for docIdTermInstanceVector in partition.lookupTermId(termId):
                        docId = docIdTermInstanceVector.docId
                        for termInstance in docIdTermInstanceVector.termInstancesGenerator:
                            table.insertTermInstanceRecord(docId,termInstance)
                    # source partitions are drained; self keeps its entry
                    # (its termIdHash slot is overwritten below anyway)
                    if partition is not self:
                        partition.deleteTermId(termId)
                header,compressedData = data.compressDocIdTermInstanceTable(table)
                wp.write(compressedData)
                header.offset = newOffset
                self.termIdHash[termId] = header
    # compressed output is never larger than the estimate, so drop the
    # unused tail of the file
    wp.truncate()
    print >> sys.stderr, "ExternalPartition was truncated to size %d" % wp.tell()
    wp.close()
    self.__mmap_init__()