class SegmentReader(IndexReader):

    # Class methods

    def hasDeletions(cls, si):
        return si.dir.fileExists(si.name + '.del')
    hasDeletions = classmethod(hasDeletions)

    # Instance methods

    def __init__(self, si, closeDir=False):
        self.directory = si.dir
        self.closeDirectory = closeDir
        self.segment = si.name
        self.nrms = {}
        self.deletedDocsDirty = False

        self.fieldInfos = field.FieldInfos(self.directory, self.segment + '.fnm')
        self.fieldsReader = field.FieldsReader(self.directory, self.segment,
                                               self.fieldInfos)
        self.tis = TermInfosReader(self.directory, self.segment, self.fieldInfos)

        if SegmentReader.hasDeletions(si):
            self.deletedDocs = BitVector(self.directory, self.segment + '.del')
        else:
            self.deletedDocs = None

        # Make sure that all index files have been read or are kept open,
        # so that if an index update removes them we'll still have them.
        self.freqStream = self.directory.openFile(self.segment + '.frq')
        self.proxStream = self.directory.openFile(self.segment + '.prx')
        self.openNorms()

    def closeNorms(self):
        for norm in self.nrms.values():
            norm.inStream.close()

    def docFreq(self, t):
        ti = self.tis.getTerm(t)
        if ti is None:
            return 0
        else:
            return ti.docFreq

    def doClose(self):
        if self.deletedDocsDirty:
            # Write deletions to a temporary file first, then rename it into
            # place as the segment's .del file.
            self.deletedDocs.write(self.directory, self.segment + '.tmp')
            self.directory.renameFile(self.segment + '.tmp', self.segment + '.del')
            self.deletedDocsDirty = False
        self.fieldsReader.close()
        self.tis.close()
        if self.freqStream is not None:
            self.freqStream.close()
        if self.proxStream is not None:
            self.proxStream.close()
        self.closeNorms()
        if self.closeDirectory:
            self.directory.close()

    def document(self, n):
        if self.isDeleted(n):
            raise Exception, 'attempt to access a deleted document'
        return self.fieldsReader.doc(n)

    def doDelete(self, docNum):
        if self.deletedDocs is None:
            self.deletedDocs = BitVector(self.maxDoc())
        self.deletedDocsDirty = True
        self.deletedDocs.set(docNum)

    def files(self):
        suffixes = ['.fnm', '.fdx', '.fdt', '.tii', '.tis', '.frq', '.prx']
        files = [self.segment + ext for ext in suffixes]
        if self.directory.fileExists(self.segment + '.del'):
            files.append(self.segment + '.del')
        for i in range(len(self.fieldInfos)):
            fi = self.fieldInfos.fieldInfoInt(i)
            if fi.isIndexed:
                files.append(self.segment + '.f' + str(i))
        return files

    def isDeleted(self, n):
        return (self.deletedDocs is not None and self.deletedDocs.get(n))

    def maxDoc(self):
        return self.fieldsReader.size()

    def normsField(self, field):
        norm = self.nrms.get(field, None)
        if norm is None:
            return None
        if norm.bytes is None:
            # Read and cache the norm bytes for this field on first access.
            bytes = array('B', [0x00] * self.maxDoc())
            self.norms(field, bytes, 0)
            norm.bytes = bytes
        return norm.bytes

    def norms(self, field, bytes, offset):
        normStream = self.normStream(field)
        if normStream is None:
            return
        try:
            normStream.readBytes(bytes, offset, self.maxDoc())
        finally:
            normStream.close()

    def normStream(self, field):
        norm = self.nrms.get(field, None)
        if norm is None:
            return None
        # Clone the stream so the shared norm stream's read position is
        # left untouched.
        result = norm.inStream.clone()
        result.seek(0)
        return result

    def numDocs(self):
        n = self.maxDoc()
        if self.deletedDocs is not None:
            n -= self.deletedDocs.count()
        return n

    def openNorms(self):
        for i in range(len(self.fieldInfos)):
            fi = self.fieldInfos.fieldInfoInt(i)
            if fi.isIndexed:
                self.nrms[fi.name] = Norm(
                    self.directory.openFile(self.segment + '.f' + str(fi.number)))

    def termDocs(self):
        return SegmentTermDocs(self)

    def termPositions(self):
        return SegmentTermPositions(self)

    def terms(self, t=None):
        if t is None:
            return self.tis.terms()
        else:
            return self.tis.terms(t)

    def fieldNames(self):
        # Experimental for auto-queries.
        # Return a list of all the field names.
        fNames = self.fieldInfos.fieldNames()
        if not fNames:
            return []
        # Remove the field with no name.
        fNames.remove('')
        return fNames
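

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module).  It
# assumes a segment-info object exposing the `dir` (an open Directory) and
# `name` attributes that SegmentReader.__init__ reads; _SegmentInfoStub and
# dumpLiveDocs are hypothetical names introduced here for illustration, and
# only methods defined on SegmentReader above are called.

class _SegmentInfoStub:
    """Minimal stand-in for the segment-info object SegmentReader expects."""

    def __init__(self, name, directory):
        self.name = name
        self.dir = directory


def dumpLiveDocs(directory, segmentName):
    """Print every non-deleted document in one segment of the index."""
    reader = SegmentReader(_SegmentInfoStub(segmentName, directory))
    try:
        for n in range(reader.maxDoc()):
            if not reader.isDeleted(n):
                print reader.document(n)
    finally:
        # doClose() also flushes any pending deletions to the segment's
        # .del file before releasing the underlying streams.
        reader.doClose()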