def searchengine(directory):
    """Build an inverted index over the documents found under *directory*.

    Each document's content is tokenized, lower-cased, filtered against the
    English stop-word list, stemmed with the Porter stemmer, and the
    resulting term is registered in the index under the document's id.

    Args:
        directory: Location handed to ``Corpus.buildCorpus``.

    Returns:
        A ``(invertedIndex, corpus)`` tuple: the populated ``InvertedIndex``
        and the corpus object returned by ``Corpus.buildCorpus`` (iterated
        here as a mapping from document id to document).
    """
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()
    invertedIndex = InvertedIndex()

    corpus = Corpus().buildCorpus(directory)

    for docId in corpus:
        content = corpus[docId].getContent()
        for token in word_tokenize(content):
            token = token.lower()

            # Filter on the surface form *before* stemming: the stop-word
            # list holds unstemmed words, and stems such as "thi" (this) or
            # "wa" (was) would otherwise slip past the check.
            if token in stopWords:
                continue

            term = ps.stem(token)

            # Also drop terms whose stem is itself a stop word, as the
            # previous implementation did, so the index never gains terms.
            if term in stopWords:
                continue

            invertedIndex.addTerm(term, docId)

    return invertedIndex, corpus
def buildCorpus(self):
    """Build and return the corpus for this object's directory.

    Delegates to a fresh ``Corpus`` instance, passing along the private
    ``__directory`` attribute, and returns whatever that call produces.
    """
    return Corpus().buildCorpus(self.__directory)