def _crawl_tree(self, parse_node):
    """Recursively crawl the parse tree, building a node dict and a unique
    list of terms.

    Returns a ``(node, terms)`` pair where ``node`` is a dict describing
    ``parse_node`` (with nested ``children``) and ``terms`` is the
    uniqified list of ``[word-id, context-id]`` pairs collected from this
    node and all of its descendants.
    """
    # Build our list of term contexts that point to a term. Once done,
    # uniqify the list by looking at the term values.
    #
    # NOTE: There must be a better way to do this, also, so we can
    # create more contexts easily without making a large amount of
    # modifications.
    terms = [
        [TermModel.get_word_id(token), context_id_map['docstring']]
        for token in set(tokenize(parse_node.docstring))
    ]
    terms.extend(
        [TermModel.get_word_id(token), context_id_map['comment']]
        for token in set(tokenize(parse_node.comments))
    )
    terms.extend(
        [TermModel.get_word_id(token), context_id_map['library']]
        for token in parse_node.imports
    )
    if parse_node.type == 'Class':
        # BUG FIX: the original referenced ``token`` here — a leftover
        # loop variable from the comprehensions above (NameError on
        # Python 3; silently the last import token on Python 2). The
        # class term should be derived from the node's own name.
        terms.append([
            TermModel.get_word_id(parse_node.name),
            context_id_map['class'],
        ])
    terms = uniqify_terms(terms)

    # NOTE: We copy the list of terms so that we dont modify the list
    # of terms for this treenode as we navigate upwards. Therefore,
    # unique terms from other nodes wont end up in the unique terms for
    # this node.
    node = dict(
        name=parse_node.name,
        type=parse_node.type,
        source=parse_node.source,
        signals=parse_node.signals,
        terms=terms[:],
        children=[],
    )
    for child in parse_node.children:
        child_node, child_terms = self._crawl_tree(child)
        node['children'].append(child_node)
        terms.extend(child_terms)
    return node, uniqify_terms(terms)
def translate(query):
    """Returns a list of pairs (token-type-id, token-id) using the lexicon
    provided through the TermModel class.

    Takes in a non-sanitized query string. Uses process and match
    functionality of this module.
    """
    # Convert query string into a list of (term-type, sanitized-value)
    # pairs
    sanitized = process(query)
    # Lazy %-args: formatting only happens if DEBUG logging is enabled
    logging.debug('Tokenized query "%s" into %s', query, sanitized)

    # Converts term types and terms into their corresponding integer
    # values from the database and lexicon. A list comprehension
    # guarantees a list is returned — the original used ``map``, which
    # is a lazy iterator on Python 3 and contradicted the documented
    # "Returns a list" contract.
    return [
        (context_id_map[term_type], TermModel.get_word_id(term))
        for term_type, term in sanitized
    ]