예제 #1
0
 def addToDictionary(self, document: Document):
     """Add the text of every term in every field of ``document`` to the dictionary.

     Each field returned by ``document.getFields()`` is tokenized with a newly
     constructed ``SimpleAnalyzer``; the ``text`` of each emitted term is passed
     to ``self.dictionary.add``.

     Args:
         document: Document whose fields are tokenized.
     """
     for fieldName in document.getFields():
         stream = document.getField(fieldName).tokenStream(SimpleAnalyzer())
         # Close the stream even when tokenization or the dictionary insert
         # raises, so a failing field does not leave the stream open.
         try:
             stream.reset()
             while stream.incrementToken():
                 self.dictionary.add(stream.getTerm().text)
         finally:
             stream.close()
예제 #2
0
 def addDocument(self, document: Document):
     """Write the per-term positions of ``document`` through ``self.codecs``.

     For every field, the field is tokenized with ``self.config.analyzer`` and
     the positions reported by the token stream are grouped by term. The groups
     are then emitted with the ``startField`` / ``startTerm`` / ``addPosition``
     / ``finishTerm`` / ``finishField`` calls. Once all fields are written,
     ``self.numOfDocs`` is incremented and the document is handed to
     ``self.dictionary.addToDictionary``.

     Args:
         document: Document whose fields are tokenized and written.
     """
     fields = document.getFields()
     numVectorFields = len(fields.keys())
     self.codecs.startDocument(numVectorFields)
     for fieldInfo in fields:
         # NOTE(review): unverified whether tokenStream() dispatches to the
         # right implementation for this analyzer -- confirm.
         tokenStream = document.getField(fieldInfo).tokenStream(self.config.analyzer)
         termDict = defaultdict(list)
         # Always release the stream, matching how addToDictionary handles the
         # streams it opens, even if tokenization raises part-way through.
         try:
             tokenStream.reset()
             while tokenStream.incrementToken():
                 termDict[tokenStream.getTerm()].append(tokenStream.getPosition())
         finally:
             tokenStream.close()

         self.codecs.startField(fieldInfo, len(termDict))
         for term, positions in termDict.items():
             self.codecs.startTerm(term, len(positions))
             # NOTE(review): each stored entry is unpacked as a 3-tuple, so
             # getPosition() presumably returns (position, startOffset,
             # endOffset) -- confirm; a plain int here would raise TypeError.
             for position, startOffset, endOffset in positions:
                 self.codecs.addPosition(position, startOffset, endOffset)
             self.codecs.finishTerm()
         self.codecs.finishField()
     self.codecs.finishDocument()
     self.numOfDocs += 1
     self.dictionary.addToDictionary(document)