def generate_inverted_list(self, docs_dict):
    """Build an inverted list mapping each term to the ids of documents containing it.

    Args:
        docs_dict: mapping of doc_id -> raw document text.

    Returns:
        dict mapping term -> list of doc_ids, in document-iteration order.
        A doc_id appears once per occurrence of the term in that document
        (duplicates are preserved, matching the tokenizer's output).
    """
    inverted = {}
    for doc_id, text in docs_dict.items():
        for token in Preprocessor.preprocessor_tokenizer(text):
            # setdefault creates the posting list on first sight of the term.
            inverted.setdefault(token, []).append(doc_id)
    return inverted
def _process_xml_query(self, xml_node):
    """Extract one query and its relevance judgements from an XML query node.

    Stores the preprocessed query text in ``self.query_dict[query_id]`` and
    a list of ``(doc_id, votes)`` pairs in
    ``self.expected_docs_by_query[query_id]``, where ``votes`` counts the
    non-'0' characters of each Item's ``score`` attribute.

    Args:
        xml_node: a DOM element containing QueryNumber, QueryText and
            Records/Item children (xml.dom-style API — TODO confirm schema
            against the caller's parser).
    """
    query_id = xml_node.getElementsByTagName("QueryNumber")[0].firstChild.nodeValue
    raw_query = xml_node.getElementsByTagName("QueryText")[0].firstChild.nodeValue
    self.query_dict[query_id] = " ".join(Preprocessor.preprocessor_tokenizer(raw_query))

    records = xml_node.getElementsByTagName("Records")[0]
    judgements = []
    for item in records.getElementsByTagName("Item"):
        doc_id = item.firstChild.nodeValue
        score_string = item.getAttribute("score")
        # Every character of the score string that is not '0' counts as one vote.
        votes = sum(1 for ch in score_string if ch != '0')
        judgements.append((doc_id, votes))
    self.expected_docs_by_query[query_id] = judgements
def query_vector(self, query):
    """Build an L2-normalized TF-IDF weight vector for *query*.

    Each query term present in ``self.document_frequency`` gets the weight
    ``(0.5 + 0.5*tf/max_tf) * log10(self.n_terms / df)`` (augmented TF with
    IDF), and the vector is then normalized to unit Euclidean length.
    Terms unknown to the index are skipped.

    Args:
        query: raw query string; tokenized via Preprocessor.preprocessor_tokenizer.

    Returns:
        dict mapping term -> normalized weight. Empty when no query term is
        in the index. If every kept weight is 0 (df == n_terms for all
        terms), the unnormalized zero weights are returned unchanged instead
        of dividing by zero.
    """
    terms = Preprocessor.preprocessor_tokenizer(query)
    counter = collections.Counter(terms)
    query_vector = dict()
    magnitude_sq = 0.0
    if counter:
        # Loop-invariant: highest raw term frequency in the query (hoisted —
        # the original recomputed most_common(1) on every iteration).
        max_tf = counter.most_common(1)[0][1]
        # Iterate UNIQUE terms. The original looped over the raw token list,
        # so a term repeated in the query added val*val to the magnitude once
        # per occurrence while storing the weight only once, producing a
        # wrongly-normalized vector (norm != 1).
        for term, tf in counter.items():
            if term not in self.document_frequency:
                continue
            df = self.document_frequency[term]
            # NOTE(review): self.n_terms is the IDF numerator; conventionally
            # this is the number of documents — confirm attribute semantics.
            val = (0.5 + 0.5 * tf / max_tf) * math.log10(self.n_terms / df)
            query_vector[term] = val
            magnitude_sq += val * val
    # Guard: all-zero weights made the original raise ZeroDivisionError here.
    if magnitude_sq > 0.0:
        magnitude = math.sqrt(magnitude_sq)
        for term in query_vector:
            query_vector[term] /= magnitude
    return query_vector