def build(cls):
    """Run a full crawl and write the resulting index file.

    A new WebCrawler is created and stored on ``cls.crawler``; it then scrapes
    the index pages, the country pages and the continent pages, in that
    order, before the index file is created from what was collected.
    Progress is reported through ``command_print``.
    """
    command_print("Build started")

    crawler = WebCrawler()
    cls.crawler = crawler

    # Scraping steps, in the order they must run.
    crawler.scrape_index_pages()
    crawler.scrape_country_pages()
    crawler.scrape_continent_pages()

    # Write out the index from what the crawler holds in memory.
    crawler.create_index_file()

    command_print("Build completed")
def print_or_find(cls, search_query, is_find_command):
    """Look up ``search_query`` in the loaded index and print the matches.

    :param search_query: query text passed on to ``get_documents``.
    :param is_find_command: True when invoked by the ``find`` command, False
        for ``print``; forwarded unchanged to ``get_documents``.

    Prints one line per matching document (its key followed by its count),
    a "no result" message when nothing matches, or a notice when no index
    has been loaded.
    """
    index = cls.inverted_index
    if not index:
        command_print("Inverted Index is not loaded.")
        return

    matches = index.get_documents(search_query, is_find_command)
    if not matches:
        command_print("No result found against query '{}'".format(search_query))

    for url, hits in matches.items():
        command_print(url, hits)
def get_documents(self, search_query, is_find_command):
    """Return the documents that contain every word of the search query.

    :param search_query: query words separated by whitespace. Leading,
        trailing and repeated whitespace is ignored and does not count
        towards the word limit.
    :param is_find_command: when True, up to MAX_FIND_QUERY_WORDS words are
        accepted and the result is ordered by descending count; when False
        only a single word is accepted and no ordering is applied.
    :return: dict mapping each document that contains all query words to the
        sum of its counts over those words. Empty when nothing matches, the
        query has no words, or the query exceeds the word limit (in which
        case a message is printed via ``command_print``).
    """
    # split() without an argument drops empty tokens, so stray spaces are
    # neither counted as words nor looked up in the index.
    query_words = search_query.split()

    max_query_words = MAX_FIND_QUERY_WORDS if is_find_command else 1
    if len(query_words) > max_query_words:
        command_print("Search query can't have more than {} words".format(max_query_words))
        return {}

    # None means "no word processed yet". It must stay distinct from an empty
    # set: once the intersection is empty, a later word must not re-seed it,
    # otherwise a word with no common documents would be silently ignored.
    matching_documents = None
    total_counts = Counter()

    for query_word in query_words:
        word_documents = self.inverted_index_dict.get(query_word, {})
        if matching_documents is None:
            matching_documents = set(word_documents)
        else:
            matching_documents.intersection_update(word_documents)
        total_counts += Counter(word_documents)

    if not matching_documents:
        return {}

    # Counter's in-place addition drops non-positive totals, hence the
    # membership guard.
    result_documents = {
        document: total_counts[document]
        for document in matching_documents
        if document in total_counts
    }

    if is_find_command:
        result_documents = dict(
            sorted(result_documents.items(), key=operator.itemgetter(1), reverse=True)
        )
    return result_documents
def __new__(cls):
    """Create the instance only when the index file is present on disk.

    :return: a new instance when INVERTED_INDEX_FILE_NAME is an existing
        file; otherwise a message is printed via ``command_print`` and None
        is returned, so the constructor call evaluates to None.
    """
    if os.path.isfile(INVERTED_INDEX_FILE_NAME):
        return super(InvertedIndex, cls).__new__(cls)

    command_print(INVERTED_INDEX_FILE_NAME, "File doesn't exist. Please Use build command first")
    return None
def load(cls):
    """Create the inverted index object and report whether loading worked.

    The result of ``InvertedIndex()`` is stored on ``cls.inverted_index``.
    That call evaluates to None when the index file is missing, in which
    case the "not loaded" message is printed instead of the success message.
    """
    cls.inverted_index = InvertedIndex()
    if cls.inverted_index:
        command_print("Inverted index loaded")
    else:
        command_print("Inverted Index is not loaded.")


if __name__ == '__main__':
    # Interactive command loop: read a command, dispatch it, repeat until
    # the user quits or standard input is closed.
    command = ""
    main_obj = Main()
    print("Following commands are valid available commands:")
    print("1) build")
    print("2) load")
    print("3) find")
    print("4) print")
    print("5) quit")
    while True:
        try:
            command = input("> ")
        except EOFError:
            # stdin was closed (Ctrl-D or end of piped input): leave the
            # loop as "quit" would, instead of crashing with a traceback.
            print()
            break

        if command.strip().lower() == "quit":
            break
        elif command.startswith("build"):
            main_obj.build()
        elif command.startswith("load"):
            main_obj.load()
        # find and print share one handler; the boolean tells it which of
        # the two commands the user entered
        elif command.startswith("find "):
            main_obj.print_or_find(command.split("find ", 1)[1], True)
        elif command.startswith("print "):
            main_obj.print_or_find(command.split("print ", 1)[1], False)
        else:
            command_print("Invalid Command:", command)