def _crawl_tree (self, parse_node):
    """
    Recursively crawls the tree building a unique list of terms and
    the node
    """
    # Build our list of term contexts that point to a term. Once done,
    # uniqify the list by looking at the term values.
    #
    # NOTE: There must be a better way to do this, also, so we can
    # create more contexts easily without making a large amount of
    # modifications.
    terms = [
        [TermModel.get_word_id(token), context_id_map['docstring']]
        for token in set(tokenize(parse_node.docstring))
    ]
    terms.extend([
        [TermModel.get_word_id(token), context_id_map['comment']]
        for token in set(tokenize(parse_node.comments))
    ])
    terms.extend([
        [TermModel.get_word_id(token), context_id_map['library']]
        for token in parse_node.imports
    ])

    if parse_node.type == 'Class':
        # Index the class name itself under the 'class' context. The
        # original referenced `token`, which is undefined at this
        # point; the node's own name appears to be the intended term.
        terms.append([
            TermModel.get_word_id(parse_node.name),
            context_id_map['class']
        ])

    terms = uniqify_terms(terms)

    # NOTE: We copy the list of terms so that we don't modify the list
    # of terms for this tree node as we navigate upwards. Therefore,
    # unique terms from other nodes won't end up in the unique terms
    # for this node.
    node = dict(
        name=parse_node.name,
        type=parse_node.type,
        source=parse_node.source,
        signals=parse_node.signals,
        terms=terms[:],
        children=[],
    )

    for child in parse_node.children:
        child_node, child_terms = self._crawl_tree(child)
        node['children'].append(child_node)
        terms.extend(child_terms)

    return node, uniqify_terms(terms)
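# NOTE: The following is a minimal sketch of what a uniqify_terms
# helper could look like; it is an illustration, not the project's
# actual implementation. It assumes terms are [term_id, context_id]
# pairs and that the order of the result does not matter.
def uniqify_terms (terms):
    """
    Collapses duplicate [term_id, context_id] pairs into a unique
    list of pairs.
    """
    # Lists are unhashable, so dedupe on a tuple view of each pair.
    return [list(pair) for pair in set(tuple(term) for term in terms)]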
def update (self):
    """
    The final phase is to save all relationships between the created
    documents and terms.

    NOTE: We don't have transactional support, so, if this fails, we
    will have a bunch of documents with missing Term->Document
    pointers.
    """
    for term, docs in self.terms.iteritems():
        TermModel.update_doclist(term, docs)
        logging.debug('Updated doc_list for term id %s with %d docs' % (
            term, len(docs)
        ))
    logging.info('Completed updating Term->Document list')
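# NOTE: For context, self.terms is assumed to map a term id to the
# list of document ids that contain it. The accumulation step below
# is a hypothetical sketch (not from the original source) of how that
# map might be populated before update() flushes it.
def _accumulate_terms (self, doc_id, terms):
    """
    Records doc_id against every term id appearing in `terms`,
    building the Term -> [doc, ...] map consumed by update().
    """
    for term_id, context_id in terms:
        self.terms.setdefault(term_id, []).append(doc_id)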
def _get_documents (self, tokens):
    """
    Returns a list of unique documents that are sorted by their score
    from highest to lowest.
    """
    # Transform our list of token pairs (context_id, term_id) into a
    # map of term_id -> context_id. We do this for optimizations
    # further down the road.
    token_map = dict()
    for context_id, term_id in tokens:
        token_map[term_id] = context_id

    # Retrieve our dictionary of Term -> [(doc, context), ...] from
    # the database
    start_time = time.time()
    term_doc_map = TermModel.get_term_doc_map(token_map.keys())
    logging.debug('Took %.4fs to retrieve data structure for %d terms' % (
        time.time() - start_time, len(term_doc_map)
    ))

    # Once we have our dictionary mapping we need to group based on
    # document ids and their terms. We build a dictionary mapping
    # document id to a list of terms and the context in which those
    # terms occur: map[DOC_ID] = [[context_id, term_id], ...]
    #
    # At this point we also remove any document ids whose terms do not
    # occur in the contexts specified in our token list. This handles
    # the case where a user has specified that a term must occur in a
    # certain context.
    start_time = time.time()
    doc_term_map = self._organize(token_map, term_doc_map)
    logging.debug('Took %.4fs to rearrange data into structure for %d docs' % (
        time.time() - start_time, len(doc_term_map)
    ))

    # Retrieve the document data for each document relevant to our
    # query, then build a map structure mapping doc_id -> doc_data
    start_time = time.time()
    docs = self._retrieve_documents(doc_term_map)
    logging.debug('Took %.4fs to retrieve data for %d documents from the database' % (
        time.time() - start_time, len(docs)
    ))

    return docs
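# NOTE: A sketch of the _organize step described above, under the
# assumption that term_doc_map maps term_id -> [(doc_id, context_id),
# ...] and that a context_id of None in token_map means "no context
# constraint". Illustrative only; the real method may differ.
def _organize (self, token_map, term_doc_map):
    """
    Regroups term_id -> [(doc_id, context_id), ...] into
    doc_id -> [[context_id, term_id], ...], dropping occurrences that
    fail a requested-context constraint.
    """
    doc_term_map = dict()
    for term_id, pairs in term_doc_map.iteritems():
        wanted_context = token_map.get(term_id)
        for doc_id, context_id in pairs:
            # Enforce "term must occur in context X" constraints.
            if wanted_context is not None and context_id != wanted_context:
                continue
            doc_term_map.setdefault(doc_id, []).append(
                [context_id, term_id]
            )
    return doc_term_map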
def translate (query):
    """
    Returns a list of pairs (token-type-id, token-id) using the
    lexicon provided through the TermModel class. Takes in a
    non-sanitized query string. Uses the process and match
    functionality of this module.
    """
    # Convert the query string into a list of term types and their
    # sanitized values
    sanitized = process(query)
    logging.debug('Tokenized query "%s" into %s' % (
        query, sanitized
    ))

    # Convert term types and terms into their corresponding integer
    # values from the database and lexicon.
    return map(
        lambda x: (context_id_map[x[0]], TermModel.get_word_id(x[1])),
        sanitized
    )
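# NOTE: A hypothetical usage example. The context names and integer
# ids below are assumptions based on the context_id_map keys used
# above, and the query syntax depends on what process() accepts:
#
#   >>> context_id_map = {'class': 4, 'docstring': 1}
#   >>> translate('class:Parser tokenize')
#   [(4, 1021), (1, 88)]
#
# Each pair is (context id, term id from the lexicon); the ids shown
# are purely illustrative.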