Example #1
0
    def __init__(self, directory: Directory):
        """Read every document from the term-vectors codec into per-field indexes.

        Three dictionaries are filled, one each for the 'title', 'abstract'
        and 'contents' fields; fields with any other name are skipped (only
        ``finishField`` is called for them).  Reading stops when the codec's
        ``getDocument()`` reports 0 fields.  Once all documents are consumed,
        a ``Dictionary`` is created for the same directory and ``load()`` is
        called on it.

        :param directory: handed to ``DummyTermVectorsReader`` and
            ``Dictionary``.
        """
        self.d = directory
        self.codecs = DummyTermVectorsReader(self.d)
        self.numOfDocs = 0
        # Per-field index layout (as described by the original author):
        #   term -> [total frequency, {docID: [frequency, [positions]]}]
        self.indexFormTitle = dict()
        self.indexFormAbstract = dict()
        self.indexFormContents = dict()

        # Field name -> index that receives that field's terms.
        targets = {
            'title': self.indexFormTitle,
            'abstract': self.indexFormAbstract,
            'contents': self.indexFormContents,
        }

        # A field count of 0 is taken to mean "no more documents".
        fieldCount = self.codecs.getDocument()
        while fieldCount != 0:
            for _ in range(fieldCount):
                fieldName, numTerms = self.codecs.getField()
                target = targets.get(fieldName)
                if target is not None:
                    # numOfDocs doubles as the ID of the document being read.
                    self.handleField(numTerms, target, self.numOfDocs)
                self.codecs.finishField()

            self.codecs.finishDocument()
            self.numOfDocs += 1
            fieldCount = self.codecs.getDocument()

        self.dictionary = Dictionary(directory)
        self.dictionary.load()
Example #2
0
class IndexReader(object):
    """In-memory positional index built from a term-vectors codec.

    One index dictionary is kept per field ('title', 'abstract',
    'contents').  Each maps a term key to a record of the form::

        [totalFreq, {docID: [weight, [position, ...]]}]

    where ``totalFreq`` is the sum of the raw term frequencies over all
    documents and ``weight`` is the term frequency in that document's field
    divided by the Euclidean norm of all term frequencies of the field.

    NOTE(review): records are stored under the term value returned by the
    codec (``termInfo[1]``) but looked up with the ``Term`` object passed to
    the query methods; this presumably relies on ``Term`` hashing/comparing
    equal to that value -- confirm against the ``Term`` definition.
    """

    def __init__(self, directory: "Directory"):
        """Consume every document from the codec and build the field indexes.

        :param directory: handed to ``DummyTermVectorsReader`` and
            ``Dictionary``.
        """
        self.d = directory
        self.codecs = DummyTermVectorsReader(self.d)
        self.numOfDocs = 0
        self.indexFormTitle = dict()
        self.indexFormAbstract = dict()
        self.indexFormContents = dict()

        # Only these fields are indexed; any other field is skipped.
        indexedFields = {
            'title': self.indexFormTitle,
            'abstract': self.indexFormAbstract,
            'contents': self.indexFormContents,
        }

        while True:
            # A field count of 0 is taken to mean "no more documents".
            numFields = self.codecs.getDocument()
            if numFields == 0:
                break

            for _ in range(numFields):
                fieldName, numTerms = self.codecs.getField()
                target = indexedFields.get(fieldName)
                if target is not None:
                    # numOfDocs doubles as the ID of the current document.
                    self.handleField(numTerms, target, self.numOfDocs)
                self.codecs.finishField()

            self.codecs.finishDocument()
            self.numOfDocs += 1

        self.dictionary = Dictionary(directory)
        self.dictionary.load()

    def _indexForField(self, field: str) -> dict:
        """Return the index for *field*; unknown names map to 'contents'."""
        # Compare with ==, not ``is``: string identity depends on interning
        # and is not guaranteed for values created at runtime.
        if field == 'title':
            return self.indexFormTitle
        if field == 'abstract':
            return self.indexFormAbstract
        return self.indexFormContents

    def handleField(self, numTerms: int, dictInput: dict, numOfDocs: int):
        """Read *numTerms* terms of the current field from the codec.

        For each term the codec yields ``termInfo`` where ``termInfo[0]`` is
        the frequency and ``termInfo[1]`` the term, followed by one position
        per occurrence.  The term's record in *dictInput* is created or
        updated, and afterwards the per-document frequencies of this field
        are divided by the Euclidean norm of the field's frequencies.

        :param numTerms: number of terms to read; nothing happens if <= 0.
        :param dictInput: field index to update in place.
        :param numOfDocs: ID of the document the field belongs to.
        """
        if numTerms <= 0:
            return

        squaredSum = 0
        postings = []
        for _ in range(numTerms):
            termInfo = self.codecs.getTerm()
            freq, term = termInfo[0], termInfo[1]
            positions = [self.codecs.getPosition() for _ in range(freq)]
            posting = [freq, positions]
            postings.append(posting)
            squaredSum += freq ** 2

            record = dictInput.get(term)
            if record is None:  # first time this term is seen in the field index
                dictInput[term] = [freq, {numOfDocs: posting}]
            else:
                record[0] += freq
                record[1][numOfDocs] = posting
            self.codecs.finishTerm()

        norm = math.sqrt(squaredSum)
        if norm == 0:
            # Every frequency was 0; leave the weights at 0 instead of
            # dividing by zero.
            return
        # Scale the posting lists directly so each one is normalised exactly
        # once (they are the same list objects stored in dictInput).
        for posting in postings:
            posting[0] /= norm

    def docFreq(self, term: "Term"):
        """Return the number of documents containing *term* in its field.

        Returns 0 when the term is not in the index.
        """
        record = self._indexForField(term.field).get(term)
        if record is None:
            return 0
        return len(record[1])

    def getTermVector(self, docID: int, field: str):
        """Return ``{term: weight}`` for document *docID* in *field*."""
        result = {}
        self.getTermVectorField(docID, self._indexForField(field), result)
        return result

    def getTermVectorField(self, docID: int, indexForm: dict, result: dict):
        """Add the weight of every term of *docID* in *indexForm* to *result*.

        *result* is updated in place and also returned.
        """
        for key, record in indexForm.items():
            posting = record[1].get(docID)
            if posting is not None:
                result[key] = posting[0]
        return result

    def termPosition(self, term: "Term"):
        """Return ``{docID: [weight, positions]}`` for *term*, or None.

        None is returned when the term is not in the index of its field.
        """
        record = self._indexForField(term.field).get(term)
        if record is None:
            return None
        return record[1]

    def numDocs(self) -> int:
        """Return the number of documents read while building the index."""
        return self.numOfDocs

    def totalTermFreq(self, term: "Term") -> int:
        """Return the summed raw frequency of *term* over all documents.

        Returns 0 when the term is not in the index.
        """
        record = self._indexForField(term.field).get(term)
        if record is None:
            return 0
        return record[0]