def do_index(self, cfg_file="index.cfg"):
    """Build the weighting model from an inverted list and persist it via pickle.

    Reads the config file to find the inverted-list input (key 'LEIA') and the
    model output path (key 'ESCREVA'), feeds the inverted list to
    self.weight_function, and pickles the resulting model to disk.
    """
    logging.info("Execution begin")
    cfg = ConfigReader.read_cfg(cfg_file)
    logging.info("Configuration file read")
    # Config keys: 'LEIA' = input inverted list, 'ESCREVA' = output model file.
    inv_list_file, model_file = cfg['LEIA'][0], cfg['ESCREVA'][0]
    inv_list = self.read_inv_list(inv_list_file)
    logging.info("Inverted list read: " + str(len(inv_list)) + " terms")
    self.weight_function.generate_model(inv_list)
    logging.info("Model generated")
    with open(model_file, 'wb') as pick_file:
        pickle.dump(self.weight_function, pick_file)
    logging.info("Model saved")
    logging.info("Execution ended")
def process_queries(self, cfg_file_name='pc.cfg'):
    """Parse every QUERY element from the configured XML files and save the results.

    Each file listed under the config key 'LEIA' is parsed with minidom; every
    <QUERY> node is handed to self._process_xml_query. Throughput is logged,
    then the processed queries and expected documents are written to the paths
    under 'CONSULTAS' and 'ESPERADOS'.
    """
    logging.info("Execution begin")
    configs = ConfigReader.read_cfg(cfg_file_name)
    logging.info("Configuration file read")
    begin = time.perf_counter()
    for xml_file_name in configs['LEIA']:
        dom = minidom.parse(xml_file_name)
        for query_node in dom.getElementsByTagName('QUERY'):
            self._process_xml_query(query_node)
    elapsed = time.perf_counter() - begin
    logging.info("Queries processed: " + str(len(self.expected_docs_by_query)) + " queries read from " + str(len(configs['LEIA'])) + " file(s)")
    logging.info("Query processor performance: " + str(len(self.expected_docs_by_query)/elapsed) + " queries per second.")
    self.write_queries(configs['CONSULTAS'][0], configs['ESPERADOS'][0])
    logging.info("Query processing saved")
    logging.info("Execution ended")
def do_search(self, config_file_name="busca.cfg"):
    """Load a pickled model, run every query from the queries file, save results.

    Config keys: 'MODELO' = pickled model path, 'CONSULTAS' = queries file
    (one 'id;text' pair per line), 'RESULTADOS' = output path for the
    per-query similarity rankings written by self._write.
    """
    logging.info("Execution begin")
    cfg = ConfigReader.read_cfg(config_file_name)
    logging.info("Configuration file read")
    model_file_name = cfg["MODELO"][0]
    queries_file_name = cfg["CONSULTAS"][0]
    results_file_name = cfg["RESULTADOS"][0]
    # FIX: the original did pickle.load(open(...)) and leaked the file handle;
    # a with-block closes it deterministically.
    # NOTE(review): pickle.load is unsafe on untrusted data — the model file is
    # assumed to be one this program wrote itself.
    with open(model_file_name, 'rb') as model_file:
        self.model = pickle.load(model_file)
    logging.info("Model loaded")
    queries = dict()
    with open(queries_file_name) as queries_file:
        for line in queries_file:  # iterate lazily instead of readlines()
            if ';' not in line:
                continue  # skip blank/malformed lines (the original raised IndexError here)
            # FIX: split only on the first ';' so query text containing ';'
            # is no longer silently truncated.
            query_id, query = line.split(';', 1)
            queries[query_id] = query  # query keeps its trailing newline, as before
    n_queries = len(queries)
    logging.info("Queries file loaded: " + str(n_queries) + " loaded")
    query_results = dict()
    begin_time = time.perf_counter()
    for query_id in queries:
        similarities = self.model.retrieve(queries[query_id])
        query_results[query_id] = similarities
    end_time = time.perf_counter()
    elapsed_time = end_time - begin_time
    logging.info("Retrieval done")
    logging.info("Retrieval performance: " + str(n_queries/elapsed_time) + " queries per seconds")
    with open(results_file_name, 'w') as results_file:
        for query_id in query_results:
            self._write(query_id, query_results[query_id], results_file)
    logging.info("Queries results saved")
    logging.info("Execution ended")
def parse_corpus(self, cfg_file):
    """Read the XML corpus files, build an inverted list, and write it out.

    Config keys: 'LEIA' = list of corpus XML files (each parsed by
    self.read_xml into a dict of documents), 'ESCREVE' = output path for
    the inverted list produced by self.generate_inverted_list.
    """
    logging.info("Execution begin")
    configs = ConfigReader.read_cfg(cfg_file)
    logging.info("Configuration file read")
    corpus = dict()
    for in_file in configs["LEIA"]:
        document = self.read_xml(in_file)
        corpus.update(document)  # later files override duplicate document ids
    # FIX: corrected log grammar ("readed" -> "read").
    logging.info("Corpus read: " + str(len(corpus)) + " documents read from " + str(len(configs["LEIA"])) + " files")
    begin = time.perf_counter()
    inv_list = self.generate_inverted_list(corpus)
    end = time.perf_counter()
    elapsed = end - begin
    logging.info("Inverted list generated: " + str(len(inv_list)) + " terms collected")
    logging.info("Inverted list performance: " + str(len(corpus)/elapsed) + " documents per second")
    self.write_inverted_list(inv_list, configs["ESCREVE"][0])
    logging.info("Inverted list saved")
    logging.info("Execution ended")