Example #1
0
    def _retrieve_documents (self, doc_term_map):

        """
        Loads the documents referenced by doc_term_map (the doc -> terms
        mapping produced by the organize method) and attaches to each one
        the terms recorded for it.

        The keys of doc_term_map are handed to DocumentModel.get; every
        document that comes back gets a 'terms' entry holding the value
        stored in doc_term_map under that document's '_id'. The fetched
        document objects are updated in place.

        Returns a list of these documents, in the order DocumentModel.get
        yields them.
        """

        merged = []

        for record in DocumentModel.get(doc_term_map.keys()):
            # Look up this document's terms by its id and embed them.
            record['terms'] = doc_term_map[record['_id']]
            merged.append(record)

        return merged
Example #2
0
        # We don't need to check for these terms being unique because this
        # is the only way they can be contained in this context
        unique_terms.extend([
            [parse_data['meta']['name'], context_id_map['file']],
            [parse_data['meta']['language'], context_id_map['language']]
        ])
        
        # Construct our document datastructure with its document tree
        # and embed it properly!
        tree_id = TreeModel.insert(tree)
        doc = dict(
            tree_id=tree_id,
            signals=parse_data['tree'].signals,
            meta=parse_data['meta'],
        )
        doc_id = DocumentModel.insert(doc)

        # Record our TermID -> DocID mapping so we can atomically update
        # the database later with these pointers in the final phase.
        for term, context in unique_terms:
            if term not in self.terms:
                self.terms[term] = []

            self.terms[term].append([doc_id, context])

        logging.debug('Indexed file %s' % (parse_data['meta']['name']))

    def _crawl_tree (self, parse_node):

        """
        Recursively crawls the tree building a unique list of terms and the