class SearchEngine:
    """Answer boolean text queries against an index and rank the matches.

    The engine owns a ``BooleanParser`` and keeps references to the index
    and the stopwords object it was constructed with.
    """

    def __init__(self, index, stopwords):
        """Create the parser and remember *index* and *stopwords*."""
        self.parser = BooleanParser()
        self.index = index
        self.stopwords = stopwords

    def search(self, query, lang, stem=True):
        """Run *query* and return the ranked documents plus query metadata.

        Args:
            query: Raw query string as entered by the caller.
            lang: Language identifier forwarded to the parser and scorer.
            stem: Value assigned to the parser's ``stem`` flag for the
                main parse.

        Returns:
            A dict with the keys ``documents`` (results ordered by their
            ``'score'`` entry, highest first), ``terms``, ``pureQuery``
            (the untouched input), ``parsedQuery`` and ``wordsTerms``.
        """
        # Parentheses are passed to remove_nonletters explicitly;
        # presumably they are preserved because they group sub-expressions
        # (verify against remove_nonletters).
        cleaned = remove_nonletters(query, ' ', ['(', ')'])
        parsed, terms, words_terms = self._parse_query(cleaned, stem, lang)

        matches = self.index.get_documents(parsed)
        scored = score(terms, matches, self.index, lang)
        ordered = sorted(scored, key=lambda hit: hit['score'], reverse=True)

        return dict(
            documents=ordered,
            terms=terms,
            pureQuery=query,
            parsedQuery=parsed,
            wordsTerms=words_terms,
        )

    def nostemSearch(self, query, lang):
        """Shortcut for :meth:`search` with the ``stem`` argument False."""
        return self.search(query, lang, False)

    def _parse_query(self, query, stem, lang):
        """Parse *query* twice: once with ``stem`` off, once with *stem*.

        Returns:
            A tuple ``(parsed_query, terms, words_terms)`` where
            ``words_terms`` comes from the parse done with the parser's
            ``stem`` flag set to False and the other two from the parse
            done with the flag set to *stem*.
        """
        parser = self.parser

        # First pass: stem flag off.
        parser.stem = False
        words_terms = parser.terms(
            parser.parse(query, lang, self.stopwords))

        # Second pass: the flag the caller asked for (it stays set
        # afterwards).
        parser.stem = stem
        parsed = parser.parse(query, lang, self.stopwords)
        return parsed, parser.terms(parsed), words_terms
class SearchEngine:
    """Boolean search front end: parse a query, fetch documents, rank them."""

    def __init__(self, index, stopwords):
        """Store *index* and *stopwords* and build a fresh ``BooleanParser``."""
        self.parser = BooleanParser()
        self.index = index
        self.stopwords = stopwords

    def search(self, query, lang, stem=True):
        """Evaluate *query* and return a result dictionary.

        Args:
            query: The query text exactly as supplied by the caller.
            lang: Language identifier handed on to parsing and scoring.
            stem: Value used for the parser's ``stem`` flag when building
                the query that is sent to the index.

        Returns:
            dict with ``documents`` (sorted by ``'score'``, descending),
            ``terms``, ``pureQuery``, ``parsedQuery`` and ``wordsTerms``.
        """
        def score_of(document):
            return document['score']

        normalized = remove_nonletters(query, ' ', ['(', ')'])
        parsed_query, terms, words_terms = self._parse_query(
            normalized, stem, lang)

        candidates = self.index.get_documents(parsed_query)

        # list.sort is stable just like sorted(), so ties keep the same
        # relative order as before.
        ranked = list(score(terms, candidates, self.index, lang))
        ranked.sort(key=score_of, reverse=True)

        result = {}
        result['documents'] = ranked
        result['terms'] = terms
        result['pureQuery'] = query
        result['parsedQuery'] = parsed_query
        result['wordsTerms'] = words_terms
        return result

    def nostemSearch(self, query, lang):
        """Call :meth:`search` passing False for ``stem``."""
        return self.search(query, lang, False)

    def _parse_query(self, query, stem, lang):
        """Return ``(parsed_query, terms, words_terms)`` for *query*.

        ``words_terms`` is taken from a parse made with the parser's
        ``stem`` flag forced to False; ``parsed_query`` and ``terms`` come
        from a second parse made with the flag set to *stem*.
        """
        stopwords = self.stopwords

        self.parser.stem = False
        unstemmed = self.parser.parse(query, lang, stopwords)
        words_terms = self.parser.terms(unstemmed)

        self.parser.stem = stem
        parsed_query = self.parser.parse(query, lang, stopwords)
        terms = self.parser.terms(parsed_query)

        return parsed_query, terms, words_terms
def test_get_documents(self):
    # Checks the number of documents self.index.get_documents returns
    # for a handful of parsed boolean queries.
    parse_query = BooleanParser().parse

    def hit_count(query):
        return len(self.index.get_documents(parse_query(query)))

    # (query, expected number of documents), checked in this order.
    expectations = (
        ('rovnice průměr', 1),
        ('průměr NOT úhlopříčky', 1),
        ('rovnice', 8),
        ('rovnice NOT spojitost', 5),
        ('(statistika OR pythagorova)', 3),
    )
    for query, expected in expectations:
        self.assertEqual(hit_count(query), expected)
class BooleanParserTest(unittest.TestCase):
    # Compares the repr() of BooleanParser._pure_parse and
    # BooleanParser.parse results against expected strings.

    # One parser instance shared by every test in the class.
    parser = BooleanParser()

    def test_pure_parse(self):
        # repr() of the _pure_parse result for each query.
        def rendered(query):
            return repr(self.parser._pure_parse(query))

        cases = [
            ('arg1 AND arg2 OR arg3',
             "(('arg1' AND 'arg2') OR 'arg3')"),
            ('(ARG AND ARG (ARG OR NOT ARG))',
             "('ARG' AND 'ARG' AND ('ARG' OR (NOT('ARG'))))"),
            ('', '()'),
            ('OR OR (AND test NOT)',
             "(() OR () OR ('test' AND ''))"),
        ]
        for query, expected in cases:
            self.assertEqual(rendered(query), expected)

    def test_parse(self):
        # repr() of the parse result for each query.
        def rendered(query):
            return repr(self.parser.parse(query))

        cases = [
            ('OR OR (AND test NOT)', "'test'"),
            ('((star AND wars) AND NOT trek) OR ((star AND trek) OR TOS)',
             "((('star' AND 'wars') AND NOT('trek')) OR (('star' AND 'trek') OR 'tos'))"),
            ('', "''"),
        ]
        for query, expected in cases:
            self.assertEqual(rendered(query), expected)
def __init__(self, index, stopwords):
    """Set up the instance with a new parser, an index and stopwords.

    Args:
        index: Object stored as ``self.index``.
        stopwords: Object stored as ``self.stopwords``.
    """
    # A new BooleanParser is created for every instance.
    parser = BooleanParser()
    self.parser = parser

    # Collaborators supplied by the caller are stored unchanged.
    self.index = index
    self.stopwords = stopwords