def _retrieve_documents(self, doc_term_map):
    """
    Build full document records for a document-id -> terms mapping.

    Takes the doc_term_map produced by the organize method (a mapping
    of document id -> terms occurring in that document), fetches the
    corresponding documents in a single batch via DocumentModel.get,
    and attaches each document's term list under the 'terms' key.

    Returns a list of document dicts, each augmented with a 'terms'
    entry.  (The original docstring said "dictionary", but the code
    has always returned a list.)
    """
    docs = DocumentModel.get(doc_term_map.keys())
    united = []
    for doc in docs:
        # The ids used to fetch docs came from doc_term_map itself,
        # so every returned doc's '_id' is presumed present in the
        # mapping — a missing key would raise KeyError here.
        doc['terms'] = doc_term_map[doc['_id']]
        united.append(doc)
    return united
# We don't need to check for these terms being unique because this # is the only way they can be contained in this context unique_terms.extend([ [parse_data['meta']['name'], context_id_map['file']], [parse_data['meta']['language'], context_id_map['language']] ]) # Construct our document datastructure with its document tree # and embed it properly! tree_id = TreeModel.insert(tree) doc = dict( tree_id=tree_id, signals=parse_data['tree'].signals, meta=parse_data['meta'], ) doc_id = DocumentModel.insert(doc) # Record our TermID -> DocID mapping so we can atomically update # the database later with these pointers in the final phase. for term, context in unique_terms: if term not in self.terms: self.terms[term] = [] self.terms[term].append([doc_id, context]) logging.debug('Indexed file %s' % (parse_data['meta']['name'])) def _crawl_tree (self, parse_node): """ Recursively crawls the tree building a unique list of terms and the