def test_enclosing_run_length_memoization(self):
    # Score the same queries twice: once with throwaway rankers (no shared
    # memoization state) and once with a single reused ranker. The reused
    # ranker's memoization must not change the scores it produces.
    fresh_first = BasenameRanker().rank_query("bar", "bar.txt")
    fresh_second = BasenameRanker().rank_query("bar", "rebar.txt")

    shared_ranker = BasenameRanker()
    shared_first = shared_ranker.rank_query("bar", "bar.txt")
    shared_second = shared_ranker.rank_query("bar", "rebar.txt")

    self.assertEquals(fresh_first, shared_first)
    self.assertEquals(fresh_second, shared_second)
def test_basename_only_query_rank_results(self):
    # A basename-only query ("bar") should return exactly the files whose
    # basenames match, each paired with its BasenameRanker score in the
    # same order as res.filenames.
    shard_manager = FakeDBShardManager(
        ["foo/bar.txt", "foo/rebar.txt", "blah/baz.txt"])
    query_cache = QueryCache()
    res = MockQuery("bar").execute_nocache(shard_manager, query_cache)

    self.assertEquals(set(["foo/bar.txt", "foo/rebar.txt"]),
                      set(res.filenames))

    expected_ranks = [
        BasenameRanker().rank_query("bar", os.path.basename(res.filenames[0])),
        BasenameRanker().rank_query("bar", os.path.basename(res.filenames[1])),
    ]
    self.assertEquals(expected_ranks, res.ranks)
def execute_nocache(self, shard_manager, query_cache):
    """Runs this query against the shard manager, bypassing the query cache.

    Returns a QueryResult whose hits are (filename, rank) pairs; its
    truncated flag is True when the search gave up before examining every
    possible candidate.
    """
    # What we'll actually return
    truncated = False

    # Split the query text into an optional directory part and a basename
    # part at the last '/'.
    slashIdx = self.text.rfind('/')
    if slashIdx != -1:
        dirpart_query = self.text[:slashIdx]
        basename_query = self.text[slashIdx + 1:]
    else:
        dirpart_query = ''
        basename_query = self.text
    lower_dirpart_query = dirpart_query.lower()

    # Get the files
    files = []
    if len(basename_query):
        # Basename index search; keep only hits whose directory part also
        # matches. search_basenames reports its own truncation.
        basename_hits, truncated = shard_manager.search_basenames(
            basename_query)
        for hit in basename_hits:
            hit_files = shard_manager.files_by_lower_basename[hit]
            for f in hit_files:
                if _is_dirmatch(lower_dirpart_query, f):
                    files.append(f)
    else:
        # Directory-only query: linear scan over every file, bailing out
        # (truncated=True) once self._dir_search_timeout seconds elapse.
        i = 0
        start = time.time()
        timeout = start + self._dir_search_timeout
        for f in shard_manager.files:
            if _is_dirmatch(lower_dirpart_query, f):
                files.append(f)
            i += 1
            # Only consult the clock every 1000 files to keep the scan cheap.
            if i % 1000 == 0:
                if time.time() >= timeout:
                    truncated = True
                    break

    # Rank the results
    trace_begin("rank_results")
    hits = []
    basename_ranker = BasenameRanker()
    for f in files:
        basename = os.path.basename(f)
        rank = basename_ranker.rank_query(basename_query, basename)
        hits.append((f, rank))
    trace_end("rank_results")

    return QueryResult(hits=hits, truncated=truncated)
def execute_nocache(self, shard_manager, query_cache):
    """Executes the query directly (no cache) and returns a QueryResult of
    (filename, rank) hits; truncated is True if the search gave up early."""
    was_truncated = False

    # Break the query into directory and basename halves at the last slash.
    sep = self.text.rfind('/')
    if sep == -1:
        dir_query, base_query = '', self.text
    else:
        dir_query, base_query = self.text[:sep], self.text[sep + 1:]
    dir_query_lower = dir_query.lower()

    matches = []
    if base_query:
        # Use the basename index, then filter each hit by directory part.
        basename_hits, was_truncated = shard_manager.search_basenames(
            base_query)
        for hit in basename_hits:
            for candidate in shard_manager.files_by_lower_basename[hit]:
                if _is_dirmatch(dir_query_lower, candidate):
                    matches.append(candidate)
    else:
        # No basename given: scan every file, giving up once the timeout
        # passes. The clock is only checked every 1000 files.
        deadline = time.time() + self._dir_search_timeout
        for count, candidate in enumerate(shard_manager.files, 1):
            if _is_dirmatch(dir_query_lower, candidate):
                matches.append(candidate)
            if count % 1000 == 0 and time.time() >= deadline:
                was_truncated = True
                break

    # Score every surviving file by its basename.
    trace_begin("rank_results")
    ranker = BasenameRanker()
    scored = [(f, ranker.rank_query(base_query, os.path.basename(f)))
              for f in matches]
    trace_end("rank_results")

    return QueryResult(hits=scored, truncated=was_truncated)
def __init__(self, basenames):
    """Builds the in-memory search index over the given basenames.

    Args:
      basenames: list of basename strings; sorted in place here.
    """
    # HACK: forcing the process-wide default encoding affects every module
    # in the process, not just this one. Kept for compatibility with the
    # rest of the codebase; NOTE(review): prefer explicit unicode handling.
    reload(sys)
    sys.setdefaultencoding('utf8')

    # The basenames come out of a hashtable so they are usually pretty badly
    # shuffled around. Sort them here so that we get somewhat predictable
    # results as a query is incrementally refined.
    basenames.sort()

    # Build the lower basenames list, removing dupes as needed.
    lower_basenames = set()
    for basename in basenames:
        lower_basenames.add(basename.lower())

    # Build two giant strings that contain all the basenames [and lowercase
    # basenames] concatenated together, newline-delimited. This is what we
    # will use to handle fuzzy queries.
    self.basenames_unsplit = (u"\n" + u"\n".join(basenames) + u"\n")
    self.lower_basenames_unsplit = (
        u"\n" + u"\n".join(lower_basenames) + u"\n")
    assert type(self.lower_basenames_unsplit) == unicode

    self._basename_ranker = BasenameRanker()

    # Map every word-start prefix of length >= 2 to (lower_basename, loss)
    # pairs, where loss counts how many word starts the prefix fails to
    # cover -- lower loss means a better match.
    wordstarts = {}
    for basename in basenames:
        start_letters = self._basename_ranker.get_start_letters(basename)
        if len(start_letters) <= 1:
            continue
        lower_basename = basename.lower()
        for i in range(len(start_letters) + 1 - 2):
            # abcd -> ab abc abcd
            ws = ''.join(start_letters[0:2 + i])
            if ws not in wordstarts:
                wordstarts[ws] = []
            loss = len(start_letters) - (2 + i)
            wordstarts[ws].append((lower_basename, loss))

    # Now, order the actual entries so high qualities (lowest loss) are at
    # the front. key= replaces the legacy cmp-comparator sort; the stable
    # sort produces the identical order.
    self.basenames_by_wordstarts = {}
    for ws, items in wordstarts.iteritems():
        items.sort(key=lambda entry: entry[1])
        self.basenames_by_wordstarts[ws] = [i[0] for i in items]
def test_enclosing_run_length_memoization_real_user_data(self):
    # This test content causes the memoizer to cache a particular a,b
    # combination whose score in the cached context has a non-wordstart
    # boost, but then in a subsequent query, asks for the same value without
    # the wordstart. See test_enclosing_run_length_memoization for a reduced
    # test case.
    query = 'cclayertreehos.h'
    candidates = [
        'CCLayerTreeHostImpl.h',
        'CCLayerTreeHostCommon.h',
        'AbstractCACFLayerTreeHost.h',
        'CCLayerTreeHost.h',
        'LegacyCACFLayerTreeHost.h',
        'CACFLayerTreeHostClient.h',
        'WKCACFViewLayerTreeHost.h',
        'FakeCCLayerTreeHostClient.h',
        'CACFLayerTreeHost.h',
    ]
    for candidate in candidates:
        # A brand-new ranker must agree with the long-lived memoizing one.
        expected = BasenameRanker().rank_query(query, candidate)
        actual = self.ranker.rank_query(query, candidate)
        self.assertEquals(
            expected, actual,
            "For %s, expected %f=%f" % (candidate, expected, actual))
def test_enclosing_run_length_memoization_real_user_data(self):
    # This test content causes the memoizer to cache a particular a,b
    # combination whose score in the cached context has a non-wordstart boost,
    # but then in a subsequent query, asks for the same value without the
    # wordstart. See test_enclosing_run_length_memoization for a reduced test
    # case.
    q = 'cclayertreehos.h'
    items = ['CCLayerTreeHostImpl.h',
             'CCLayerTreeHostCommon.h',
             'AbstractCACFLayerTreeHost.h',
             'CCLayerTreeHost.h',
             'LegacyCACFLayerTreeHost.h',
             'CACFLayerTreeHostClient.h',
             'WKCACFViewLayerTreeHost.h',
             'FakeCCLayerTreeHostClient.h',
             'CACFLayerTreeHost.h']
    for x in items:
        # A fresh ranker (no memoization state) must produce the same score
        # as the long-lived, memoizing self.ranker for every candidate.
        stateless_ranker = BasenameRanker()
        rStateless = stateless_ranker.rank_query(q, x)
        rStateful = self.ranker.rank_query(q, x)
        self.assertEquals(rStateless, rStateful,
                          "For %s, expected %f=%f" % (x, rStateless, rStateful))
def __init__(self, basenames):
    """Builds the searchable index over the given basenames.

    Args:
      basenames: list of basename strings; sorted in place here.
    """
    # The basenames come out of a hashtable so they are usually pretty badly
    # shuffled around. Sort them here so that we get somewhat predictable
    # results as a query is incrementally refined.
    basenames.sort()

    # Build the lower basenames list, removing dupes as needed.
    lower_basenames = set()
    for basename in basenames:
        lower_basenames.add(basename.lower())

    # Build two giant strings that contain all the basenames [and lowercase
    # basenames] concatenated together, newline-delimited. This is what we
    # will use to handle fuzzy queries.
    self.basenames_unsplit = (
        "\n" + "\n".join(basenames) + "\n").encode('utf8')
    self.lower_basenames_unsplit = (
        "\n" + "\n".join(lower_basenames) + "\n").encode('utf8')
    assert type(self.lower_basenames_unsplit) == str

    self._basename_ranker = BasenameRanker()

    # Map every word-start prefix of length >= 2 to (lower_basename, loss)
    # pairs, where loss counts how many word starts the prefix fails to
    # cover -- lower loss means a better match.
    wordstarts = {}
    for basename in basenames:
        start_letters = self._basename_ranker.get_start_letters(basename)
        if len(start_letters) <= 1:
            continue
        lower_basename = basename.lower()
        for i in range(len(start_letters) + 1 - 2):
            # abcd -> ab abc abcd
            ws = ''.join(start_letters[0:2 + i])
            if ws not in wordstarts:
                wordstarts[ws] = []
            loss = len(start_letters) - (2 + i)
            wordstarts[ws].append((lower_basename, loss))

    # Now, order the actual entries so high qualities (lowest loss) are at
    # the front. key= replaces the legacy cmp-comparator sort; the stable
    # sort produces the identical order.
    self.basenames_by_wordstarts = {}
    for ws, items in wordstarts.iteritems():
        items.sort(key=lambda entry: entry[1])
        self.basenames_by_wordstarts[ws] = [i[0] for i in items]
def setUp(self):
    # Shared ranker instance used by the tests in this case.
    # self.basenames = json.load(open('test_data/cr_files_basenames.json'))
    self.ranker = BasenameRanker()
class BasenameRankerTest(unittest.TestCase):
    """Tests for BasenameRanker's word-start detection and query ranking."""

    def setUp(self):
        # Shared ranker instance used by most tests below.
        # self.basenames = json.load(open('test_data/cr_files_basenames.json'))
        self.ranker = BasenameRanker()

    def test_is_wordstart(self):
        def check(s, expectations):
            # Compares _is_wordstart at every index against the expected list.
            assert len(s) == len(expectations)
            for i in range(len(s)):
                self.assertEquals(expectations[i],
                                  self.ranker._is_wordstart(s, i),
                                  "disagreement on index %i" % i)

        check("foo", [True, False, False])
        check("fooBar", [True, False, False, True, False, False])
        check("o", [True])
        check("_", [True])
        check("F", [True])
        check("FooBar", [True, False, False, True, False, False])
        check("Foo_Bar", [True, False, False, False, True, False, False])
        check("_Bar", [True, True, False, False])
        check("_bar", [True, True, False, False])
        check("foo_bar", [True, False, False, False, True, False, False])
        check(".h", [True, False])
        check("a.h", [True, False, False])
        check("__b", [True, False, True])
        check("foo__bar", [True, False, False, False, False, True, False, False])
        check("Foo3D", [True, False, False, True, True])
        check("Foo33", [True, False, False, True, False])
        check("x3d", [True, True, False])  # I could be convinced that 'd' is a wordstart.
        check("AAb", [True, True, False])
        check("CCFra", [True, True, True, False, False])

    def test_get_word_starts(self):
        data = {
            # This comment simply helps map indice to values
            #  1234567
            '': [],
            'abc': [0],
            'abd_def': [0, 4],
            'ab_cd_ef': [0, 3, 6],
            'ab_': [0],
            'AA': [0, 1],
            'AAbA': [0, 1, 3],
            'Abc': [0],
            'AbcDef': [0, 3],
            'Abc_Def': [0, 4],
        }
        for word, expected_starts in data.items():
            starts = self.ranker.get_starts(word)
            self.assertEquals(expected_starts, starts,
                              "for %s, expect %s" % (word, expected_starts))

    def assertBasicRankAndWordHitCountIs(self, expected_rank,
                                         expected_word_count, query,
                                         candidate):
        # Helper: checks both components of _get_basic_rank's result tuple.
        res = self.ranker._get_basic_rank(query, candidate)
        self.assertEquals(expected_rank, res[0])
        self.assertEquals(expected_word_count, res[1])

    def test_query_hits_on_word_starts(self):
        self.assertBasicRankAndWordHitCountIs(
            8, 4, 'rwhv', 'render_widget_host_view.cc')  # test +1 for hitting all words
        self.assertBasicRankAndWordHitCountIs(
            6, 3, 'rwh', 'render_widget_host_view.cc')
        self.assertBasicRankAndWordHitCountIs(
            5.5, 2, 'wvi', 'render_widget_host_view_win.cc')  # eew
        self.assertBasicRankAndWordHitCountIs(2, 1, 'w', 'WebViewImpl.cc')
        self.assertBasicRankAndWordHitCountIs(2, 1, 'v', 'WebViewImpl.cc')
        self.assertBasicRankAndWordHitCountIs(4, 2, 'wv', 'WebViewImpl.cc')
        self.assertBasicRankAndWordHitCountIs(5, 2, 'evi', 'WebViewImpl.cc')
        self.assertBasicRankAndWordHitCountIs(4, 2, 'wv', 'eWbViewImpl.cc')
        self.assertBasicRankAndWordHitCountIs(6, 0, 'ebewp', 'WebViewImpl.cc')

    def test_basic_rank_pays_attention_to_case(self):
        # these test that we aren't losing catching case transpitions
        self.assertBasicRankAndWordHitCountIs(4.5, 1, "rw", "rwf")
        self.assertBasicRankAndWordHitCountIs(4, 2, "rw", "rWf")

    def test_basic_rank_works_at_all(self):
        # these are generic tests
        self.assertBasicRankAndWordHitCountIs(
            8, 4, "rwhv", "render_widget_host_view.h")
        self.assertBasicRankAndWordHitCountIs(
            10, 5, "rwhvm", "render_widget_host_view_mac.h")
        self.assertBasicRankAndWordHitCountIs(
            10, 5, "rwhvm", "render_widget_host_view_mac.mm")
        self.assertBasicRankAndWordHitCountIs(
            29, 4, 'ccframerate', 'CCFrameRateController.cpp')

    def test_basic_rank_query_case_doesnt_influence_rank_query(self):
        a = self.ranker._get_basic_rank("Rwhvm", "render_widget_host_view_mac.h")
        b = self.ranker._get_basic_rank("rwhvm", "Render_widget_host_view_mac.h")
        self.assertEquals(a, b)

    def test_basic_rank_isnt_only_greedy(self):
        # this checks that we consider _mac and as a wordstart rather than macmm
        self.assertBasicRankAndWordHitCountIs(
            10, 5, "rwhvm", "render_widget_host_view_macmm")

    def test_basic_rank_on_corner_cases(self):
        self.assertBasicRankAndWordHitCountIs(0, 0, "", "")
        self.assertBasicRankAndWordHitCountIs(0, 0, "", "x")
        self.assertBasicRankAndWordHitCountIs(0, 0, "x", "")
        self.assertBasicRankAndWordHitCountIs(2, 1, "x", "x")
        self.assertBasicRankAndWordHitCountIs(1, 0, "x", "yx")
        self.assertBasicRankAndWordHitCountIs(0, 0, "x", "abcd")

    def test_basic_rank_on_mixed_wordstarts_and_full_words(self):
        self.assertBasicRankAndWordHitCountIs(
            17, 3, "enderwhv", "render_widget_host_view.h")
        self.assertBasicRankAndWordHitCountIs(
            15, 2, "idgethv", "render_widget_host_view.h")
        self.assertBasicRankAndWordHitCountIs(
            8, 4, "rwhv", "render_widget_host_view_mac.h")
        self.assertBasicRankAndWordHitCountIs(
            14, 5, "rwhvmac", "render_widget_host_view_mac.h")
        self.assertBasicRankAndWordHitCountIs(
            10, 5, "rwhvm", "render_widget_host_view_mac.h")

    def test_basic_rank_overconditioned_query(self):
        self.assertBasicRankAndWordHitCountIs(2, 1, 'test_thread_tab.py', 'tw')

    def test_rank_corner_cases(self):
        # empty
        self.assertEquals(0, self.ranker.rank_query('foo', ''))
        self.assertEquals(0, self.ranker.rank_query('', 'foo'))

        # undersized
        self.assertEquals(0, self.ranker.rank_query('foo', 'm'))
        self.assertEquals(0, self.ranker.rank_query('f', 'oom'))

        # overconditioned
        self.assertEquals(6, self.ranker.rank_query('test_thread_tab.py', 'tw'))

    def test_rank_subclasses_lower_ranked_than_base(self):
        # this tests that hitting all words counts higher than hitting some
        # of the words
        base_rank = self.ranker.rank_query("rwhvm", "render_widget_host_view.h")
        subclass_rank = self.ranker.rank_query(
            "rwhvm", "render_widget_host_view_subclass.h")
        self.assertTrue(base_rank > subclass_rank)

    def test_rank_order_for_hierarchy_puts_bases_first(self):
        names = [
            'render_widget_host_view_mac.h',
            'render_widget_host_view_mac.mm',
            'render_widget_host_view_mac_delegate.h',
            'render_widget_host_view_mac_unittest.mm',
            'render_widget_host_view_mac_editcommand_helper.mm',
            # BUG FIX: a missing comma here used to implicitly concatenate
            # the next two literals into one bogus filename.
            'render_widget_host_view_mac_editcommand_helper.h',
            'render_widget_host_view_mac_editcommand_helper_unittest.mm',
        ]
        self._assertRankDecreasesOrStaysTheSame("rwhvm", names)

    def _assertRankDecreasesOrStaysTheSame(self, query, names):
        """Makes sure that the first element in the array has highest rank
        and subsequent items have decreasing or equal rank."""
        ranks = [self.ranker.rank_query(query, n) for n in names]
        # nw/basic_ranks are kept for inspection in a debugger when a
        # ranking regression trips the assertion below.
        nw = [self.ranker.get_num_words(n) for n in names]
        basic_ranks = [self.ranker._get_basic_rank(query, n) for n in names]
        for i in range(1, len(ranks)):
            changeInRank = ranks[i] - ranks[i - 1]
            self.assertTrue(changeInRank <= 0)

    def test_rank_order_prefers_capitals(self):
        # Ensure we still prefer capitals for simple queries The heuristics
        # that deal with order_puts_tests_second tends to break this.
        self.assertBasicRankAndWordHitCountIs(6, 3, 'wvi', 'WebViewImpl.cc')

    def test_rank_order_puts_tests_second(self):
        q = "ccframerate"
        a1 = self.ranker.rank_query(q, 'CCFrameRateController.cpp')
        a2 = self.ranker.rank_query(q, 'CCFrameRateController.h')
        b = self.ranker.rank_query(q, 'CCFrameRateControllerTest.cpp')
        # This is a hard test to pass because ccframera(te) ties to (Te)st
        # if you weight non-word matches equally.
        self.assertTrue(a1 > b)
        self.assertTrue(a2 > b)

        q = "chrome_switches"
        a1 = self.ranker.rank_query(q, 'chrome_switches.cc')
        a2 = self.ranker.rank_query(q, 'chrome_switches.h')
        b = self.ranker.rank_query(q, 'chrome_switches_uitest.cc')
        self.assertTrue(a1 > b)
        self.assertTrue(a2 > b)

    def test_rank_order_for_hierarchy_puts_prefixed_second(self):
        q = "ccframerate"
        a = self.ranker.rank_query(q, 'CCFrameRateController.cpp')
        b1 = self.ranker.rank_query(
            q, 'webcore_platform.CCFrameRateController.o.d')
        b2 = self.ranker.rank_query(
            q, 'webkit_unit_tests.CCFrameRateControllerTest.o.d')
        self.assertTrue(a > b1)
        # FAILS because ccframera(te) ties to (Te)st
        # self.assertTrue(a > b2);

    def test_rank_order_puts_tests_second_2(self):
        q = "ccdelaybassedti"
        a1 = self.ranker.rank_query(q, 'CCDelayBasedTimeSource.cpp')
        a2 = self.ranker.rank_query(q, 'CCDelayBasedTimeSource.h')
        b = self.ranker.rank_query(q, 'CCDelayBasedTimeSourceTest.cpp')
        self.assertTrue(a1 > b)
        self.assertTrue(a2 > b)

        q = "LayerTexture"
        a = self.ranker.rank_query(q, 'LayerTexture.cpp')
        b = self.ranker.rank_query(q, 'LayerTextureSubImage.cpp')
        self.assertTrue(a > b)

    def test_refinement_improves_rank_query(self):
        a = self.ranker.rank_query('render_', 'render_widget.cc')
        b = self.ranker.rank_query('render_widget', 'render_widget.cc')
        self.assertTrue(b > a)

    def test_document_rank(self):
        a = self.ranker.rank_query('document.cpp', 'Document.cpp')
        b = self.ranker.rank_query('document.cpp', 'AccessibleDocument.cpp')
        self.assertTrue(a > b)

    def test_enclosing_run_length_memoization_real_user_data(self):
        # This test content causes the memoizer to cache a particular a,b
        # combination whose score in the cached context has a non-wordstart
        # boost, but then in a subsequent query, asks for the same value
        # without the wordstart. See test_enclosing_run_length_memoization
        # for a reduced test case.
        q = 'cclayertreehos.h'
        items = [
            'CCLayerTreeHostImpl.h',
            'CCLayerTreeHostCommon.h',
            'AbstractCACFLayerTreeHost.h',
            'CCLayerTreeHost.h',
            'LegacyCACFLayerTreeHost.h',
            'CACFLayerTreeHostClient.h',
            'WKCACFViewLayerTreeHost.h',
            'FakeCCLayerTreeHostClient.h',
            'CACFLayerTreeHost.h',
        ]
        for x in items:
            stateless_ranker = BasenameRanker()
            rStateless = stateless_ranker.rank_query(q, x)
            rStateful = self.ranker.rank_query(q, x)
            self.assertEquals(
                rStateless, rStateful,
                "For %s, expected %f=%f" % (x, rStateless, rStateful))

    def test_enclosing_run_length_memoization(self):
        # Fresh rankers and a reused ranker must agree on the same queries.
        q1 = BasenameRanker().rank_query("bar", "bar.txt")
        q2 = BasenameRanker().rank_query("bar", "rebar.txt")
        r = BasenameRanker()
        r1 = r.rank_query("bar", "bar.txt")
        r2 = r.rank_query("bar", "rebar.txt")
        self.assertEquals(q1, r1)
        self.assertEquals(q2, r2)
class DBIndexShard(object):
    """Holds a regex-searchable index over one shard's file basenames."""

    def __init__(self, basenames):
        # The basenames come out of a hashtable so they are usually pretty
        # badly shuffled around. Sort them here so that we get somewhat
        # predictable results as a query is incrementally refined.
        basenames.sort()

        # Build the lower basenames list, removing dupes as needed.
        lower_basenames = set()
        for basename in basenames:
            lower_basenames.add(basename.lower())

        # Build two giant strings that contain all the basenames [and
        # lowercase basenames] concatenated together, newline-delimited.
        # This is what we will use to handle fuzzy queries.
        self.basenames_unsplit = (
            "\n" + "\n".join(basenames) + "\n").encode('utf8')
        self.lower_basenames_unsplit = (
            "\n" + "\n".join(lower_basenames) + "\n").encode('utf8')
        assert type(self.lower_basenames_unsplit) == str

        self._basename_ranker = BasenameRanker()

        # Map every word-start prefix of length >= 2 to (lower_basename,
        # loss) pairs; loss counts how many word starts the prefix leaves
        # uncovered, so lower loss means a better match.
        wordstarts = {}
        for basename in basenames:
            start_letters = self._basename_ranker.get_start_letters(basename)
            if len(start_letters) <= 1:
                continue
            lower_basename = basename.lower()
            for i in range(len(start_letters) + 1 - 2):
                # abcd -> ab abc abcd
                ws = ''.join(start_letters[0:2 + i])
                if ws not in wordstarts:
                    wordstarts[ws] = []
                loss = len(start_letters) - (2 + i)
                wordstarts[ws].append((lower_basename, loss))

        # Now, order the actual entries so high qualities are at front.
        # key= replaces the legacy cmp-comparator sort; the stable sort
        # produces the identical order.
        self.basenames_by_wordstarts = {}
        for ws, items in wordstarts.iteritems():
            items.sort(key=lambda entry: entry[1])
            self.basenames_by_wordstarts[ws] = [i[0] for i in items]

    @traced
    def search_basenames(self, query):
        """Searches index for basenames matching the query.

        Returns (hits, truncated) where:
          hits is a set of lowercase basenames that matched.
          truncated is a bool indicating whether not all possible matches
          were found.
        """
        lower_query = query.lower()

        lower_hits = set()
        max_hits_hint = 25

        # add exact matches first
        trace_begin("exact")
        self.add_all_matching(
            lower_hits, query, self.get_exact_match_filter(lower_query),
            max_hits_hint)
        trace_end("exact")

        # add in word starts
        trace_begin("wordstarts")
        self.add_all_wordstarts_matching(lower_hits, query, max_hits_hint)
        trace_end("wordstarts")

        # add in substring matches
        trace_begin("substrings")
        self.add_all_matching(
            lower_hits, query, self.get_substring_filter(lower_query),
            max_hits_hint)
        trace_end("substrings")

        # add in superfuzzy matches ONLY if we have no high-quality hit
        has_hq = False
        for lower_hit in lower_hits:
            rank = self._basename_ranker.rank_query(query, lower_hit)
            if rank > 2:
                has_hq = True
                break
        if not has_hq:
            trace_begin("superfuzzy")
            self.add_all_matching(
                lower_hits, query, self.get_superfuzzy_filter(lower_query),
                max_hits_hint)
            trace_end("superfuzzy")

        # BUG FIX: the wordstart pass can push the hit count past the hint,
        # so use >= -- the old == comparison missed truncation in that case.
        return lower_hits, len(lower_hits) >= max_hits_hint

    def add_all_wordstarts_matching(self, lower_hits, query, max_hits_hint):
        # Wordstart candidates are pre-sorted best-first, so take them in
        # order until the hint is reached.
        lower_query = query.lower()
        if lower_query in self.basenames_by_wordstarts:
            for basename in self.basenames_by_wordstarts[lower_query]:
                lower_hits.add(basename)
                if len(lower_hits) >= max_hits_hint:
                    return

    def get_exact_match_filter(self, query):
        query = re.escape(query.lower())
        # abc -> abc(\..*)?
        flt = "\n%s(?:\\..*)?\n" % query
        return (flt, False)

    def get_delimited_wordstart_filter(self, query):
        query = [re.escape(query[i]) for i in range(len(query))]
        # abc -> ^a.*_b.*_c
        # abc -> .*_a.*_b.*_c
        tmp = []
        tmp.append("(?:(?:%s)|(?:.*_%s))" % (query[0], query[0]))
        for i in range(1, len(query)):
            tmp.append("_%s" % query[i])
        flt = "\n%s.*\n" % '.*'.join(tmp)
        return (flt, False)

    def get_camelcase_wordstart_filter(self, query):
        query = query.upper()
        query = [re.escape(query[i]) for i in range(len(query))]
        # abc -> A.*B.*C
        #        .*[^A-Z]A.*
        tmp = []
        tmp.append("(?:(?:%s)|(?:.*[^A-Z\n]%s))" % (query[0], query[0]))
        for i in range(1, len(query)):
            tmp.append("[^A-Z\n]%s" % query[i])
        flt = "\n.*%s.*\n" % '.*'.join(tmp)
        return (flt, True)

    def get_substring_filter(self, query):
        query = re.escape(query.lower())
        # abc -> *abc*
        flt = "\n.*%s.*\n" % query
        return (flt, False)

    def get_superfuzzy_filter(self, query):
        tmp = []
        for i in range(len(query)):
            tmp.append(re.escape(query[i]))
        flt = "\n.*%s.*\n" % '.*'.join(tmp)
        return (flt, False)

    def add_all_matching(self, lower_hits, query, flt_tuple, max_hits_hint):
        """Adds every indexed basename accepted by the filter to lower_hits.

        lower_hits is the set to put results in
        query is the query string originally entered by user, used by ranking
        flt_tuple is (filter_regex, case_sensitive_bool)
        max_hits_hint is largest hits should grow before matching terminates.
        """
        flt, case_sensitive = flt_tuple
        regex = re.compile(flt)
        base = 0
        if not case_sensitive:
            index = self.lower_basenames_unsplit
        else:
            index = self.basenames_unsplit
        while True:
            m = regex.search(index, base)
            if not m:
                break
            hit = m.group(0)[1:-1]
            if hit.find('\n') != -1:
                raise Exception(
                    "Somethign is messed up with flt=[%s] query=[%s] hit=[%s]"
                    % (flt, query, hit))
            if case_sensitive:
                hit = hit.lower()
            lower_hits.add(hit)
            # Back up one char so the trailing \n can begin the next match.
            base = m.end() - 1
            if len(lower_hits) >= max_hits_hint:
                # NOTE: a dead local `truncated = True` used to be set here;
                # truncation is detected by the caller from the set size.
                break
class BasenameRankerTest(unittest.TestCase):
    """Tests for BasenameRanker's word-start detection and query ranking."""

    def setUp(self):
        # Shared ranker instance used by most tests below.
        # self.basenames = json.load(open('test_data/cr_files_basenames.json'))
        self.ranker = BasenameRanker()

    def test_is_wordstart(self):
        def check(s, expectations):
            # Compares _is_wordstart at every index against the expected list.
            assert len(s) == len(expectations)
            for i in range(len(s)):
                self.assertEquals(expectations[i],
                                  self.ranker._is_wordstart(s, i),
                                  "disagreement on index %i" % i)

        check("foo", [True, False, False])
        check("fooBar", [True, False, False, True, False, False])
        check("o", [True])
        check("_", [True])
        check("F", [True])
        check("FooBar", [True, False, False, True, False, False])
        check("Foo_Bar", [True, False, False, False, True, False, False])
        check("_Bar", [True, True, False, False])
        check("_bar", [True, True, False, False])
        check("foo_bar", [True, False, False, False, True, False, False])
        check(".h", [True, False])
        check("a.h", [True, False, False])
        check("__b", [True, False, True])
        check("foo__bar", [True, False, False, False, False, True, False, False])
        check("Foo3D", [True, False, False, True, True])
        check("Foo33", [True, False, False, True, False])
        check("x3d", [True, True, False])  # I could be convinced that 'd' is a wordstart.
        check("AAb", [True, True, False])
        check("CCFra", [True, True, True, False, False])

    def test_get_word_starts(self):
        data = {
            # This comment simply helps map indice to values
            #  1234567
            '': [],
            'abc': [0],
            'abd_def': [0, 4],
            'ab_cd_ef': [0, 3, 6],
            'ab_': [0],
            'AA': [0, 1],
            'AAbA': [0, 1, 3],
            'Abc': [0],
            'AbcDef': [0, 3],
            'Abc_Def': [0, 4],
        }
        for word, expected_starts in data.items():
            starts = self.ranker.get_starts(word)
            self.assertEquals(expected_starts, starts,
                              "for %s, expect %s" % (word, expected_starts))

    def assertBasicRankAndWordHitCountIs(self, expected_rank,
                                         expected_word_count, query,
                                         candidate):
        # Helper: checks both components of _get_basic_rank's result tuple.
        res = self.ranker._get_basic_rank(query, candidate)
        self.assertEquals(expected_rank, res[0])
        self.assertEquals(expected_word_count, res[1])

    def test_query_hits_on_word_starts(self):
        self.assertBasicRankAndWordHitCountIs(
            8, 4, 'rwhv', 'render_widget_host_view.cc')  # test +1 for hitting all words
        self.assertBasicRankAndWordHitCountIs(
            6, 3, 'rwh', 'render_widget_host_view.cc')
        self.assertBasicRankAndWordHitCountIs(
            5.5, 2, 'wvi', 'render_widget_host_view_win.cc')  # eew
        self.assertBasicRankAndWordHitCountIs(2, 1, 'w', 'WebViewImpl.cc')
        self.assertBasicRankAndWordHitCountIs(2, 1, 'v', 'WebViewImpl.cc')
        self.assertBasicRankAndWordHitCountIs(4, 2, 'wv', 'WebViewImpl.cc')
        self.assertBasicRankAndWordHitCountIs(5, 2, 'evi', 'WebViewImpl.cc')
        self.assertBasicRankAndWordHitCountIs(4, 2, 'wv', 'eWbViewImpl.cc')
        self.assertBasicRankAndWordHitCountIs(6, 0, 'ebewp', 'WebViewImpl.cc')

    def test_basic_rank_pays_attention_to_case(self):
        # these test that we aren't losing catching case transpitions
        self.assertBasicRankAndWordHitCountIs(4.5, 1, "rw", "rwf")
        self.assertBasicRankAndWordHitCountIs(4, 2, "rw", "rWf")

    def test_basic_rank_works_at_all(self):
        # these are generic tests
        self.assertBasicRankAndWordHitCountIs(
            8, 4, "rwhv", "render_widget_host_view.h")
        self.assertBasicRankAndWordHitCountIs(
            10, 5, "rwhvm", "render_widget_host_view_mac.h")
        self.assertBasicRankAndWordHitCountIs(
            10, 5, "rwhvm", "render_widget_host_view_mac.mm")
        self.assertBasicRankAndWordHitCountIs(
            29, 4, 'ccframerate', 'CCFrameRateController.cpp')

    def test_basic_rank_query_case_doesnt_influence_rank_query(self):
        a = self.ranker._get_basic_rank("Rwhvm", "render_widget_host_view_mac.h")
        b = self.ranker._get_basic_rank("rwhvm", "Render_widget_host_view_mac.h")
        self.assertEquals(a, b)

    def test_basic_rank_isnt_only_greedy(self):
        # this checks that we consider _mac and as a wordstart rather than macmm
        self.assertBasicRankAndWordHitCountIs(
            10, 5, "rwhvm", "render_widget_host_view_macmm")

    def test_basic_rank_on_corner_cases(self):
        self.assertBasicRankAndWordHitCountIs(0, 0, "", "")
        self.assertBasicRankAndWordHitCountIs(0, 0, "", "x")
        self.assertBasicRankAndWordHitCountIs(0, 0, "x", "")
        self.assertBasicRankAndWordHitCountIs(2, 1, "x", "x")
        self.assertBasicRankAndWordHitCountIs(1, 0, "x", "yx")
        self.assertBasicRankAndWordHitCountIs(0, 0, "x", "abcd")

    def test_basic_rank_on_mixed_wordstarts_and_full_words(self):
        self.assertBasicRankAndWordHitCountIs(
            17, 3, "enderwhv", "render_widget_host_view.h")
        self.assertBasicRankAndWordHitCountIs(
            15, 2, "idgethv", "render_widget_host_view.h")
        self.assertBasicRankAndWordHitCountIs(
            8, 4, "rwhv", "render_widget_host_view_mac.h")
        self.assertBasicRankAndWordHitCountIs(
            14, 5, "rwhvmac", "render_widget_host_view_mac.h")
        self.assertBasicRankAndWordHitCountIs(
            10, 5, "rwhvm", "render_widget_host_view_mac.h")

    def test_basic_rank_overconditioned_query(self):
        self.assertBasicRankAndWordHitCountIs(2, 1, 'test_thread_tab.py', 'tw')

    def test_rank_corner_cases(self):
        # empty
        self.assertEquals(0, self.ranker.rank_query('foo', ''))
        self.assertEquals(0, self.ranker.rank_query('', 'foo'))

        # undersized
        self.assertEquals(0, self.ranker.rank_query('foo', 'm'))
        self.assertEquals(0, self.ranker.rank_query('f', 'oom'))

        # overconditioned
        self.assertEquals(6, self.ranker.rank_query('test_thread_tab.py', 'tw'))

    def test_rank_subclasses_lower_ranked_than_base(self):
        # this tests that hitting all words counts higher than hitting some
        # of the words
        base_rank = self.ranker.rank_query("rwhvm", "render_widget_host_view.h")
        subclass_rank = self.ranker.rank_query(
            "rwhvm", "render_widget_host_view_subclass.h")
        self.assertTrue(base_rank > subclass_rank)

    def test_rank_order_for_hierarchy_puts_bases_first(self):
        names = [
            'render_widget_host_view_mac.h',
            'render_widget_host_view_mac.mm',
            'render_widget_host_view_mac_delegate.h',
            'render_widget_host_view_mac_unittest.mm',
            'render_widget_host_view_mac_editcommand_helper.mm',
            # BUG FIX: a missing comma here used to implicitly concatenate
            # the next two literals into one bogus filename.
            'render_widget_host_view_mac_editcommand_helper.h',
            'render_widget_host_view_mac_editcommand_helper_unittest.mm',
        ]
        self._assertRankDecreasesOrStaysTheSame("rwhvm", names)

    def _assertRankDecreasesOrStaysTheSame(self, query, names):
        """Makes sure that the first element in the array has highest rank
        and subsequent items have decreasing or equal rank."""
        ranks = [self.ranker.rank_query(query, n) for n in names]
        # nw/basic_ranks are kept for inspection in a debugger when a
        # ranking regression trips the assertion below.
        nw = [self.ranker.get_num_words(n) for n in names]
        basic_ranks = [self.ranker._get_basic_rank(query, n) for n in names]
        for i in range(1, len(ranks)):
            changeInRank = ranks[i] - ranks[i - 1]
            self.assertTrue(changeInRank <= 0)

    def test_rank_order_prefers_capitals(self):
        # Ensure we still prefer capitals for simple queries The heuristics
        # that deal with order_puts_tests_second tends to break this.
        self.assertBasicRankAndWordHitCountIs(6, 3, 'wvi', 'WebViewImpl.cc')

    def test_rank_order_puts_tests_second(self):
        q = "ccframerate"
        a1 = self.ranker.rank_query(q, 'CCFrameRateController.cpp')
        a2 = self.ranker.rank_query(q, 'CCFrameRateController.h')
        b = self.ranker.rank_query(q, 'CCFrameRateControllerTest.cpp')
        # This is a hard test to pass because ccframera(te) ties to (Te)st
        # if you weight non-word matches equally.
        self.assertTrue(a1 > b)
        self.assertTrue(a2 > b)

        q = "chrome_switches"
        a1 = self.ranker.rank_query(q, 'chrome_switches.cc')
        a2 = self.ranker.rank_query(q, 'chrome_switches.h')
        b = self.ranker.rank_query(q, 'chrome_switches_uitest.cc')
        self.assertTrue(a1 > b)
        self.assertTrue(a2 > b)

    def test_rank_order_for_hierarchy_puts_prefixed_second(self):
        q = "ccframerate"
        a = self.ranker.rank_query(q, 'CCFrameRateController.cpp')
        b1 = self.ranker.rank_query(
            q, 'webcore_platform.CCFrameRateController.o.d')
        b2 = self.ranker.rank_query(
            q, 'webkit_unit_tests.CCFrameRateControllerTest.o.d')
        self.assertTrue(a > b1)
        # FAILS because ccframera(te) ties to (Te)st
        # self.assertTrue(a > b2);

    def test_rank_order_puts_tests_second_2(self):
        q = "ccdelaybassedti"
        a1 = self.ranker.rank_query(q, 'CCDelayBasedTimeSource.cpp')
        a2 = self.ranker.rank_query(q, 'CCDelayBasedTimeSource.h')
        b = self.ranker.rank_query(q, 'CCDelayBasedTimeSourceTest.cpp')
        self.assertTrue(a1 > b)
        self.assertTrue(a2 > b)

        q = "LayerTexture"
        a = self.ranker.rank_query(q, 'LayerTexture.cpp')
        b = self.ranker.rank_query(q, 'LayerTextureSubImage.cpp')
        self.assertTrue(a > b)

    def test_refinement_improves_rank_query(self):
        a = self.ranker.rank_query('render_', 'render_widget.cc')
        b = self.ranker.rank_query('render_widget', 'render_widget.cc')
        self.assertTrue(b > a)

    def test_document_rank(self):
        a = self.ranker.rank_query('document.cpp', 'Document.cpp')
        b = self.ranker.rank_query('document.cpp', 'AccessibleDocument.cpp')
        self.assertTrue(a > b)

    def test_enclosing_run_length_memoization_real_user_data(self):
        # This test content causes the memoizer to cache a particular a,b
        # combination whose score in the cached context has a non-wordstart
        # boost, but then in a subsequent query, asks for the same value
        # without the wordstart. See test_enclosing_run_length_memoization
        # for a reduced test case.
        q = 'cclayertreehos.h'
        items = [
            'CCLayerTreeHostImpl.h',
            'CCLayerTreeHostCommon.h',
            'AbstractCACFLayerTreeHost.h',
            'CCLayerTreeHost.h',
            'LegacyCACFLayerTreeHost.h',
            'CACFLayerTreeHostClient.h',
            'WKCACFViewLayerTreeHost.h',
            'FakeCCLayerTreeHostClient.h',
            'CACFLayerTreeHost.h',
        ]
        for x in items:
            stateless_ranker = BasenameRanker()
            rStateless = stateless_ranker.rank_query(q, x)
            rStateful = self.ranker.rank_query(q, x)
            self.assertEquals(
                rStateless, rStateful,
                "For %s, expected %f=%f" % (x, rStateless, rStateful))

    def test_enclosing_run_length_memoization(self):
        # Fresh rankers and a reused ranker must agree on the same queries.
        q1 = BasenameRanker().rank_query("bar", "bar.txt")
        q2 = BasenameRanker().rank_query("bar", "rebar.txt")
        r = BasenameRanker()
        r1 = r.rank_query("bar", "bar.txt")
        r2 = r.rank_query("bar", "rebar.txt")
        self.assertEquals(q1, r1)
        self.assertEquals(q2, r2)