class Indexer:
    """Builds the inverted index: loads docs, tokenizes, stems, and fills the dictionary."""

    def __init__(self, docs_dir, docs_size):
        self.docLoader = DocLoader(docs_dir, docs_size)
        self.tokenizer = Tokenizer()
        self.stemmer = Stemmer()
        self.dictionary = Dictionary(load=False)
        self._clean()
        self._setup(docs_size)

    def _setup(self, docs_size):
        """Tokenize, normalize, and index every document with ids 1..docs_size.

        Doc ids are assumed to be 1-based and contiguous -- TODO confirm
        against DocLoader.getDoc.
        """
        for doc_id in range(1, docs_size + 1):
            doc = self.docLoader.getDoc(doc_id)
            tokens = self.tokenizer.tokenizeDoc(doc)
            normalized_words = self.stemmer.normalize_list(tokens)
            for token in normalized_words:
                self.dictionary.addToken(token, doc_id)

    @staticmethod
    def _clean():
        """Remove a previously generated ./dist output directory, if present."""
        # BUG FIX: the old guard was os.path.exists(os.path.dirname("./dist")),
        # but dirname("./dist") is "." which always exists -- it never actually
        # checked the target directory. Check "./dist" itself instead.
        if os.path.exists("./dist"):
            try:
                shutil.rmtree("./dist")
            except OSError as e:
                # OSError covers FileNotFoundError/FileExistsError (originally
                # listed) plus PermissionError; include the detail instead of
                # printing a bare "error".
                print(f"error removing ./dist: {e}")
class SearchEngine:
    """Answers interactive queries against a previously built dictionary index."""

    _dictionary: Dictionary
    _tokenizer: Tokenizer
    _stemmer: Stemmer
    _query_result: QueryResult

    def __init__(self):
        # load=True: read the index built earlier by the Indexer.
        self._dictionary = Dictionary(load=True)
        self._tokenizer = Tokenizer()
        self._stemmer = Stemmer()
        self._query_result = QueryResult()

    def _search_for_token(self, token: Token):
        """Look up the token's posting list and, if indexed, add it to the results."""
        pl = self._dictionary.getPostingList(token.getWord())
        if pl is not None:
            self._query_result.addToResults(token, pl)

    def listen(self):
        """Read one query from stdin, run it through the same tokenize/normalize
        pipeline as indexing, and print the k best candidate documents."""
        inp = input("Enter Your Query: ")
        query_tokens = self._tokenizer.tokenizeDoc(inp)
        normalized_query_tokens = self._stemmer.normalize_list(query_tokens)
        for token in normalized_query_tokens:
            self._search_for_token(token)
        self._query_result.buildCandidates()
        self._query_result.printKBestCandidates()