Example #1
0
    def _summarize(self, doc, query_tfidf):
        """
        Return a dynamic summary of *doc* relevant to the given query.

        Each non-empty line ("block") of the document text is scored by
        cosine similarity between its tf-idf vector and ``query_tfidf``;
        the two highest-scoring blocks are joined into the summary, which
        is returned with the query terms highlighted.

        :param doc: document dict; only its ``'text'`` field is read here.
        :param query_tfidf: mapping of query term -> tf-idf weight.
        :return: highlighted summary string built from the top 2 blocks.
        """
        # Split the document into candidate blocks, dropping empty lines.
        blocks = [line for line in doc['text'].split('\n') if len(line) > 0]
        rankings = []

        for block in blocks:
            block_terms = tokenize(block)
            block_term_occurences = term_occurences(block_terms)
            block_term_table = self._find_terms(block_terms)

            block_tfidf = self._generate_tfidf(
                block_term_occurences,
                block_term_table
            )

            # Build aligned vectors over the union of block and query terms
            # so the cosine similarity compares like with like; terms absent
            # from one side contribute a 0.0 weight.
            terms = self._vector_term_unison(block_tfidf, query_tfidf)
            block_vector = [block_tfidf.get(term, 0.0) for term in terms]
            query_vector = [query_tfidf.get(term, 0.0) for term in terms]

            similarity = self._calculate_similarity(block_vector, query_vector)
            rankings.append((similarity, block))

        # Sort on similarity only (itemgetter(0) avoids falling back to
        # comparing block strings when two similarities tie).
        rankings = sorted(rankings, key=itemgetter(0), reverse=True)
        summary = ' '.join([ranking[1] for ranking in rankings[:2]])

        return highlight(summary, query_tfidf.keys())
Example #2
0
    def search(self, query, page=1, num_page=10):
        """
        Perform a search for *query* against the indexed documents.

        :param query: free-text query string.
        :param page: 1-based page of results to return.
        :param num_page: number of results per page.
        :return: tuple ``(total_results, documents)`` where *documents* is
            a list of dicts carrying ``similarity``, ``text`` (the dynamic
            summary), ``title`` (highlighted) and ``doc_id`` for the
            requested page only.
        """
        # Tokenize the query into terms and count their occurrences.
        # Logging arguments are passed lazily (%s-args, not pre-formatted)
        # so no formatting work happens when the level is disabled.
        logging.info('Performing Query: %s', query)
        query_terms = tokenize(query)
        query_term_occurences = term_occurences(query_terms)
        logging.debug('Tokenized Query: %s', query_terms)

        # Build the query tf-idf vector used for cosine-similarity ranking.
        term_table = self._find_terms(query_terms)
        query_tfidf = self._generate_tfidf(query_term_occurences, term_table)
        logging.debug('Query TFIDF: %s', query_tfidf)

        document_rankings = self._perform_search(query_tfidf, term_table)
        documents = []
        # Slice out only the requested page of ranked (rank, doc_id) pairs.
        for rank, doc_id in document_rankings[(page - 1) * num_page:page * num_page]:
            start_time = time.time()
            doc = self.documents[doc_id]
            documents.append(dict(
                similarity=rank,
                text=self._summarize(doc, query_tfidf),
                title=highlight(doc['title'], query_tfidf.keys()),
                doc_id=doc_id
            ))

            logging.debug(
                '%.5fms to summarize document id: %s',
                (time.time() - start_time) * 1000,
                doc_id
            )

        return len(document_rankings), documents