def generate_inverted_list(self, docs_dict):
     """Build an inverted list mapping each term to document ids.

     Each value of ``docs_dict`` is tokenized with
     ``Preprocessor.preprocessor_tokenizer``. The document id is appended
     once per token produced, so an id repeats in a term's list when the
     tokenizer yields that term more than once for the same document.

     :param docs_dict: mapping of document id to the value handed to the
         tokenizer.
     :return: dict mapping each term to a list of document ids, in the
         order the documents were visited.
     """
     postings = {}
     for doc_id, content in docs_dict.items():
         for token in Preprocessor.preprocessor_tokenizer(content):
             # setdefault creates the posting list the first time a token is seen.
             postings.setdefault(token, []).append(doc_id)
     return postings
    def _process_xml_query(self, xml_node):
        """Record one query and its relevance judgements from an XML node.

        Reads the text of the first ``QueryNumber`` and ``QueryText``
        elements under ``xml_node``. The query text is tokenized with
        ``Preprocessor.preprocessor_tokenizer`` and the tokens, joined by
        single spaces, are stored in ``self.query_dict``. Each ``Item`` under
        the first ``Records`` element becomes a ``(doc_id, votes)`` tuple, and
        the list of tuples is stored in ``self.expected_docs_by_query``. Both
        entries are keyed by the query id.

        :param xml_node: DOM node exposing ``getElementsByTagName`` that
            contains ``QueryNumber``, ``QueryText`` and ``Records`` elements.
        """
        def first_text(tag):
            # Text of the first child of the first element with this tag name.
            return xml_node.getElementsByTagName(tag)[0].firstChild.nodeValue

        query_id = first_text("QueryNumber")
        raw_query = first_text("QueryText")
        self.query_dict[query_id] = " ".join(
            Preprocessor.preprocessor_tokenizer(raw_query)
        )

        records_node = xml_node.getElementsByTagName("Records")[0]
        judged_docs = []
        for item in records_node.getElementsByTagName("Item"):
            doc_id = item.firstChild.nodeValue
            score_chars = item.getAttribute("score")
            # Every character of the score attribute other than '0' is one vote.
            votes = sum(1 for ch in score_chars if ch != '0')
            judged_docs.append((doc_id, votes))
        self.expected_docs_by_query[query_id] = judged_docs
Example #3
0
    def query_vector(self, query):
        """Build the normalized weight vector of a query.

        The query is tokenized with ``Preprocessor.preprocessor_tokenizer``.
        Terms absent from ``self.document_frequency`` are ignored. Every
        remaining term gets the weight
        ``(0.5 + 0.5 * tf / max_tf) * log10(self.n_terms / df)``, where ``tf``
        is the term's count in the query, ``max_tf`` the highest count of any
        query term and ``df`` the term's entry in ``self.document_frequency``.
        The weights are then divided by the Euclidean norm of the vector.

        :param query: query text handed to the tokenizer.
        :return: dict mapping each known query term to its normalized weight.
            Empty when no query term is known. When every weight is zero the
            zero weights are returned as they are.
        """
        terms = Preprocessor.preprocessor_tokenizer(query)
        counter = collections.Counter(terms)

        weights = dict()
        if not counter:
            return weights

        # Highest raw frequency in the query; constant, so computed only once.
        max_tf = max(counter.values())

        # Walk the distinct terms: visiting repeated occurrences would add the
        # same squared weight to the norm several times and leave the result
        # shorter than unit length.
        squared_norm = 0.0
        for term, tf in counter.items():
            if term not in self.document_frequency:
                continue

            df = self.document_frequency[term]
            # NOTE(review): n_terms is the numerator of the idf ratio here;
            # confirm it holds the size of the collection.
            weight = (0.5 + 0.5 * tf / max_tf) * math.log10(self.n_terms / df)
            weights[term] = weight
            squared_norm += weight * weight

        # normalizing step
        norm = math.sqrt(squared_norm)
        if norm == 0:
            # All weights are zero (e.g. df == n_terms for every term);
            # dividing would raise ZeroDivisionError.
            return weights

        for term in weights:
            weights[term] /= norm

        return weights