Example 1
    def _get_documents(self, tokens):
        """
        Returns a list of unique documents, sorted by score from
        highest to lowest.
        """
        
        # Transform our list of token pairs (context_id, term_id) into a
        # map of term_id -> context_id. We do this to speed up lookups
        # further down the road.
        token_map = {term_id: context_id for context_id, term_id in tokens}

        # Retrieve our dictionary of term -> [(doc, context), ...] from
        # the database.
        start_time = time.time()
        term_doc_map = TermModel.get_term_doc_map(token_map.keys())
        logging.debug('Took %.4fs to retrieve data structure for %d terms',
                      time.time() - start_time, len(term_doc_map))

        # Once we have our dictionary mapping, we group by document ID,
        # building a dictionary that maps each document ID to a list of
        # terms and the contexts in which those terms occur:
        # map[DOC_ID] = [[context_id, term_id], ...]
        #
        # At this point we also drop document IDs that violate any context
        # specified in our token list. This handles the case where a user
        # has required that a term occur in a certain context.
        start_time = time.time()
        doc_term_map = self._organize(token_map, term_doc_map)
        logging.debug('Took %.4fs to rearrange data into a structure for %d docs',
                      time.time() - start_time, len(doc_term_map))
        
        # Retrieve the document data for each document relevant to our
        # query, building a map of doc_id -> doc_data.
        start_time = time.time()
        docs = self._retrieve_documents(doc_term_map)
        logging.debug('Took %.4fs to retrieve data for %d documents from the database',
                      time.time() - start_time, len(docs))
        return docs
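
The helpers used above (_organize, _retrieve_documents, TermModel.get_term_doc_map) are not shown in this example. Below is a minimal sketch of what the grouping step could look like; the function name, the data shapes, and the None sentinel for an unconstrained context are assumptions inferred from the comments, not the original implementation.

def organize(token_map, term_doc_map):
    # token_map:    term_id -> context_id required by the query; here a
    #               None value stands in for "any context" (assumption).
    # term_doc_map: term_id -> [(doc_id, context_id), ...]
    # returns:      doc_id -> [[context_id, term_id], ...]
    doc_term_map = {}
    for term_id, postings in term_doc_map.items():
        required = token_map.get(term_id)
        for doc_id, context_id in postings:
            # Skip postings that violate a per-term context constraint.
            if required is not None and context_id != required:
                continue
            doc_term_map.setdefault(doc_id, []).append([context_id, term_id])
    return doc_term_map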
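
Likewise, a sketch of the retrieval step under the same assumptions: fetch stands in for the database call that loads document data, and the matched-posting count is a placeholder score (the original's scoring is not shown).

def retrieve_documents(doc_term_map, fetch):
    # fetch(doc_ids) stands in for the database call; the score below is
    # a placeholder, counting how many postings matched per document.
    docs = fetch(list(doc_term_map.keys()))
    for doc in docs:
        doc['score'] = len(doc_term_map[doc['id']])
    return sorted(docs, key=lambda d: d['score'], reverse=True)


if __name__ == '__main__':
    token_map = {7: 1, 9: None}       # term 7 must occur in context 1
    term_doc_map = {
        7: [(100, 1), (101, 2)],      # the (101, 2) posting is filtered out
        9: [(100, 3), (102, 1)],
    }
    doc_term_map = organize(token_map, term_doc_map)
    docs = retrieve_documents(doc_term_map, lambda ids: [{'id': i} for i in ids])
    print(docs)  # doc 100 scores 2, doc 102 scores 1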