def addToDictionary(self, document: Document):
    """Add the text of every term in ``document`` to ``self.dictionary``.

    Each field returned by ``document.getFields()`` is tokenized with a
    fresh ``SimpleAnalyzer`` and the ``text`` of every produced term is
    passed to ``self.dictionary.add``.

    Args:
        document: The document whose fields are tokenized.
    """
    for k in document.getFields():
        token = document.getField(k).tokenStream(SimpleAnalyzer())
        # Close the stream even when tokenization fails part-way through,
        # so a bad field does not leak the stream.
        try:
            token.reset()
            while token.incrementToken():
                self.dictionary.add(token.getTerm().text)
        finally:
            token.close()
def addDocument(self, document: Document):
    """Write one document through ``self.codecs`` and register its terms.

    For every field of ``document`` the token stream is consumed and the
    values returned by ``getPosition()`` are grouped per term. The grouped
    data is then emitted as nested codec calls:
    ``startDocument`` / ``startField`` / ``startTerm`` / ``addPosition`` /
    ``finishTerm`` / ``finishField`` / ``finishDocument``.
    Afterwards ``self.numOfDocs`` is incremented and the document is passed
    to ``self.dictionary.addToDictionary``.

    Args:
        document: The document to write. Every value returned by its token
            streams' ``getPosition()`` is unpacked as
            ``(position, startOffset, endOffset)``.
    """
    fields = document.getFields()
    self.codecs.startDocument(len(fields))

    for fieldInfo in fields:
        # NOTE(review): the original author was unsure whether tokenStream()
        # picks the right implementation automatically for this analyzer;
        # confirm against the field/analyzer classes.
        tokenStream = document.getField(fieldInfo).tokenStream(self.config.analyzer)

        # Collect everything from the stream first: startField needs the
        # number of distinct terms before any term is written.
        termDict = defaultdict(list)
        try:
            tokenStream.reset()
            while tokenStream.incrementToken():
                termDict[tokenStream.getTerm()].append(tokenStream.getPosition())
        finally:
            # Release the stream even if tokenization raised.
            tokenStream.close()

        self.codecs.startField(fieldInfo, len(termDict))
        for term, positions in termDict.items():
            self.codecs.startTerm(term, len(positions))
            for position, startOffset, endOffset in positions:
                self.codecs.addPosition(position, startOffset, endOffset)
            self.codecs.finishTerm()
        self.codecs.finishField()

    self.codecs.finishDocument()
    self.numOfDocs += 1
    # Hand the whole document to the dictionary object so it can record the
    # document's terms.
    self.dictionary.addToDictionary(document)