def _crawl_tree(self, parse_node):
    """ Recursively crawls the tree, building a unique list of terms and
    the node.
    """
    # Build our list of term contexts that point to a term. Once done,
    # uniqify the list by looking at the term values.
    #
    # NOTE: There must be a better way to do this, so that we can create
    # more contexts easily without making a large amount of modifications.
    terms = [
        [TermModel.get_word_id(token), context_id_map['docstring']]
        for token in set(tokenize(parse_node.docstring))
    ]
    terms.extend([
        [TermModel.get_word_id(token), context_id_map['comment']]
        for token in set(tokenize(parse_node.comments))
    ])
    terms.extend([
        [TermModel.get_word_id(token), context_id_map['library']]
        for token in parse_node.imports
    ])

    if parse_node.type == 'Class':
        # Index the class name itself under the 'class' context.
        terms.append([
            TermModel.get_word_id(parse_node.name), context_id_map['class']
        ])

    terms = uniqify_terms(terms)

    # NOTE: We copy the list of terms so that we don't modify this tree
    # node's terms as child terms are accumulated below. Therefore, unique
    # terms from child nodes won't end up in the unique terms for this node.
    node = dict(
        name=parse_node.name,
        type=parse_node.type,
        source=parse_node.source,
        signals=parse_node.signals,
        terms=terms[:],
        children=[],
    )

    for child in parse_node.children:
        child_node, child_terms = self._crawl_tree(child)
        node['children'].append(child_node)
        terms.extend(child_terms)

    return node, uniqify_terms(terms)
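
# NOTE: uniqify_terms() is defined elsewhere in the codebase. The sketch
# below is only an illustration of the behaviour _crawl_tree() relies on:
# collapsing duplicate [word_id, context_id] pairs while keeping the order
# in which they were first seen. The real helper may key on the word id
# alone; treat this as an assumption, not the actual implementation.
def _uniqify_terms_sketch(terms):
    seen = set()
    unique = []
    for word_id, context_id in terms:
        if (word_id, context_id) not in seen:
            seen.add((word_id, context_id))
            unique.append([word_id, context_id])
    return unique
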
def process(query):
    """ Returns a list of tokenized strings that have been processed and
    classified (tagged). Returns a list of pairs (token-type, token-value)
    where the token type can be keyword, library, or lang.
    """
    # Map the match classifier over each token of the query, then filter
    # the results so no invalid tokens are included in the resulting list
    # of (token_type, sanitized_token) pairs.
    return [pair for pair in map(match, tokenize(query)) if pair[0] != 'invalid']
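
# NOTE: match() and its vocabularies are defined elsewhere; this hypothetical
# sketch only illustrates the contract process() relies on: every token is
# classified as 'keyword', 'library', or 'lang', or tagged 'invalid' so the
# filter above drops it. The token sets here are placeholder assumptions.
def _match_sketch(token,
                  keywords=frozenset(['sort', 'search', 'parse']),
                  libraries=frozenset(['os', 'sys', 're'])):
    sanitized = token.strip().lower()
    if not sanitized:
        return ('invalid', sanitized)
    if sanitized in keywords:
        return ('keyword', sanitized)
    if sanitized in libraries:
        return ('library', sanitized)
    return ('lang', sanitized)
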
def _summarize(self, doc, query_tfidf):
    """ Returns a dynamic summary based upon the given query tfidf, with
    the query terms highlighted. The summary is built from the two most
    relevant blocks of the document text.
    """
    blocks = [line for line in doc['text'].split('\n') if len(line) > 0]
    rankings = []

    for block in blocks:
        block_terms = tokenize(block)
        block_term_occurences = term_occurences(block_terms)
        block_term_table = self._find_terms(block_terms)
        block_tfidf = self._generate_tfidf(
            block_term_occurences, block_term_table
        )

        # Build aligned tfidf vectors over the union of block and query
        # terms, using 0.0 for any term missing from either side.
        terms = self._vector_term_unison(block_tfidf, query_tfidf)
        block_vector = []
        query_vector = []
        for term in terms:
            block_vector.append(block_tfidf.get(term, 0.0))
            query_vector.append(query_tfidf.get(term, 0.0))

        similarity = self._calculate_similarity(block_vector, query_vector)
        rankings.append((similarity, block))

    rankings = sorted(rankings, key=itemgetter(0), reverse=True)
    summary = ' '.join([ranking[1] for ranking in rankings[:2]])

    return highlight(summary, query_tfidf.keys())
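
# NOTE: _calculate_similarity() lives elsewhere in this class; the
# standalone sketch below shows the cosine similarity that the vector
# construction above implies: dot(block, query) / (|block| * |query|),
# falling back to 0.0 when either vector has no weight. Treat the exact
# behaviour of the real method as an assumption.
import math

def _cosine_similarity_sketch(block_vector, query_vector):
    dot = sum(b * q for b, q in zip(block_vector, query_vector))
    block_norm = math.sqrt(sum(b * b for b in block_vector))
    query_norm = math.sqrt(sum(q * q for q in query_vector))
    if block_norm == 0.0 or query_norm == 0.0:
        return 0.0
    return dot / (block_norm * query_norm)
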
def search(self, query, page=1, num_page=10):
    """ Performs a search for the given query on our database. Returns a
    tuple with the number of results and a list of the actual results.
    """
    # First we need to tokenize our given query into terms.
    logging.info('Performing Query: %s' % (query))
    query_terms = tokenize(query)
    query_term_occurences = term_occurences(query_terms)
    logging.debug('Tokenized Query: %s' % (query_terms))

    # Now construct our tfidf vector for the query, for cosine similarity
    # comparison when required.
    term_table = self._find_terms(query_terms)
    query_tfidf = self._generate_tfidf(query_term_occurences, term_table)
    logging.debug('Query TFIDF: %s' % (query_tfidf))

    document_rankings = self._perform_search(query_tfidf, term_table)

    # Build the requested page of results, summarizing and highlighting
    # each matching document.
    documents = []
    for rank, doc_id in document_rankings[(page - 1) * num_page:page * num_page]:
        start_time = time.time()
        doc = self.documents[doc_id]
        documents.append(dict(
            similarity=rank,
            text=self._summarize(doc, query_tfidf),
            title=highlight(doc['title'], query_tfidf.keys()),
            doc_id=doc_id,
        ))
        logging.debug('%.5fms to summarize document id: %s' % (
            (time.time() - start_time) * 1000, doc_id
        ))

    return len(document_rankings), documents
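
# Example (hypothetical) usage of search(); assumes `engine` is an instance
# of this class with its term and document tables already loaded. The result
# field names match the dicts constructed above.
#
#     total, results = engine.search('binary search tree', page=1, num_page=10)
#     print('%d matching documents' % total)
#     for result in results:
#         print('%.4f  %s' % (result['similarity'], result['title']))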