Ejemplo n.º 1
0
  def test_enclosing_run_length_memoization(self):
    # Reduced regression case: ranking "bar" against "bar.txt" and then
    # against "rebar.txt" on ONE ranker must give the same ranks as ranking
    # each pair on its own, brand-new ranker.

    # Baselines, one fresh ranker per query.
    q1 = BasenameRanker().rank_query("bar", "bar.txt")
    q2 = BasenameRanker().rank_query("bar", "rebar.txt")

    # Same two queries, back to back on a single shared ranker.
    r = BasenameRanker()
    r1 = r.rank_query("bar", "bar.txt")
    r2 = r.rank_query("bar", "rebar.txt")

    # assertEqual rather than the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(q1, r1)
    self.assertEqual(q2, r2)
Ejemplo n.º 2
0
    def test_basename_only_query_rank_results(self):
        # A query without any '/' ("bar") is run against three fake files.
        # Only the two files with "bar" in their basename may come back, and
        # the reported ranks must equal what a fresh BasenameRanker gives
        # for each returned file's basename.
        shard_manager = FakeDBShardManager(
            ["foo/bar.txt", "foo/rebar.txt", "blah/baz.txt"])
        query_cache = QueryCache()

        res = MockQuery("bar").execute_nocache(shard_manager, query_cache)

        # The order of the hits is not pinned down, so compare as sets.
        # (assertEqual replaces the deprecated assertEquals alias, removed
        # from unittest in Python 3.12.)
        self.assertEqual(set(["foo/bar.txt", "foo/rebar.txt"]),
                         set(res.filenames))

        # res.ranks must line up index-for-index with res.filenames.
        expected_ranks = [
            BasenameRanker().rank_query("bar", os.path.basename(name))
            for name in (res.filenames[0], res.filenames[1])
        ]
        self.assertEqual(expected_ranks, res.ranks)
Ejemplo n.º 3
0
    def execute_nocache(self, shard_manager, query_cache):
        """
        Runs this query directly against shard_manager.

        The query text is split at its last '/': the part before it filters
        on the directory portion of each file, the part after it is searched
        for (and ranked) as a basename. query_cache is accepted but not used
        by this method.

        Returns a QueryResult built from (filename, rank) pairs and a flag
        saying whether the search was cut short.
        """
        # rpartition gives ('', '', text) when the text contains no slash,
        # i.e. an empty directory part and the whole text as basename.
        dirpart_query, _, basename_query = self.text.rpartition('/')
        lower_dirpart_query = dirpart_query.lower()

        truncated = False
        files = []
        if basename_query:
            # Ask the index for candidate basenames, then keep the files
            # behind each candidate that pass the directory filter.
            basename_hits, truncated = shard_manager.search_basenames(
                basename_query)
            for hit in basename_hits:
                files.extend(
                    f for f in shard_manager.files_by_lower_basename[hit]
                    if _is_dirmatch(lower_dirpart_query, f))
        else:
            # No basename part: walk every known file. The clock is only
            # looked at once per 1000 files.
            deadline = time.time() + self._dir_search_timeout
            for seen, f in enumerate(shard_manager.files, 1):
                if _is_dirmatch(lower_dirpart_query, f):
                    files.append(f)
                if seen % 1000 == 0 and time.time() >= deadline:
                    truncated = True
                    break

        # Rank every surviving file on its basename, with one ranker shared
        # across the whole result set.
        trace_begin("rank_results")
        basename_ranker = BasenameRanker()
        hits = [(f,
                 basename_ranker.rank_query(basename_query,
                                            os.path.basename(f)))
                for f in files]
        trace_end("rank_results")

        return QueryResult(hits=hits, truncated=truncated)
Ejemplo n.º 4
0
  def execute_nocache(self, shard_manager, query_cache):
    """
    Runs this query directly against shard_manager.

    Everything before the last '/' of the query text filters on directories;
    everything after it is the basename query used for searching and ranking.
    query_cache is accepted but not used by this method.

    Returns a QueryResult holding (filename, rank) hits and a truncated flag.
    """
    # With no '/' present, rpartition leaves the directory part empty and
    # hands back the whole text as the basename query.
    dirpart_query, _, basename_query = self.text.rpartition('/')
    lower_dirpart_query = dirpart_query.lower()

    # Collect the candidate files
    truncated = False
    files = []
    if basename_query:
      basename_hits, truncated = shard_manager.search_basenames(basename_query)
      for hit in basename_hits:
        candidates = shard_manager.files_by_lower_basename[hit]
        files.extend(f for f in candidates
                     if _is_dirmatch(lower_dirpart_query, f))
    else:
      # Directory-only query: scan all files, checking the clock only once
      # every 1000 files.
      deadline = time.time() + self._dir_search_timeout
      for seen, f in enumerate(shard_manager.files, 1):
        if _is_dirmatch(lower_dirpart_query, f):
          files.append(f)
        if seen % 1000 == 0 and time.time() >= deadline:
          truncated = True
          break

    # Rank the candidates on their basenames, sharing a single ranker
    trace_begin("rank_results")
    basename_ranker = BasenameRanker()
    hits = [
      (f, basename_ranker.rank_query(basename_query, os.path.basename(f)))
      for f in files
    ]
    trace_end("rank_results")

    return QueryResult(hits=hits, truncated=truncated)
Ejemplo n.º 5
0
    def __init__(self, basenames):
        """
        Builds the search structures for one shard.

        basenames is a list of basenames; it is sorted in place. Three
        attributes are produced:
          basenames_unsplit        all basenames, newline-delimited
          lower_basenames_unsplit  the distinct lowercased basenames,
                                   newline-delimited
          basenames_by_wordstarts  word-start prefix -> lowercased basenames
        """
        # Python 2 only: switches the interpreter-wide default encoding to
        # utf8 before the unicode joins below.
        reload(sys)
        sys.setdefaultencoding('utf8')

        # The basenames come out of a hashtable, so their order is close to
        # random. Sorting keeps results reasonably predictable while a query
        # is being refined incrementally.
        basenames.sort()

        # Lowercased basenames, with duplicates collapsed.
        lower_basenames = set(name.lower() for name in basenames)

        # Two giant newline-delimited strings (original case and lowercase)
        # that the fuzzy queries are matched against.
        joined = u"\n".join(basenames)
        self.basenames_unsplit = u"\n" + joined + u"\n"
        lower_joined = u"\n".join(lower_basenames)
        self.lower_basenames_unsplit = u"\n" + lower_joined + u"\n"
        assert type(self.lower_basenames_unsplit) is unicode

        # Map every prefix (length >= 2) of a basename's start letters to
        # the basename, e.g. abcd -> ab, abc, abcd. "loss" counts the start
        # letters the prefix leaves out.
        self._basename_ranker = BasenameRanker()
        wordstarts = {}
        for basename in basenames:
            start_letters = self._basename_ranker.get_start_letters(basename)
            num_starts = len(start_letters)
            if num_starts < 2:
                continue
            lower_basename = basename.lower()
            for prefix_len in range(2, num_starts + 1):
                ws = ''.join(start_letters[:prefix_len])
                wordstarts.setdefault(ws, []).append(
                    (lower_basename, num_starts - prefix_len))

        # Within each prefix, put the entries with the smallest loss first.
        self.basenames_by_wordstarts = {}
        for ws, items in wordstarts.iteritems():
            items.sort(key=lambda entry: entry[1])
            self.basenames_by_wordstarts[ws] = [entry[0] for entry in items]
Ejemplo n.º 6
0
 def test_enclosing_run_length_memoization_real_user_data(self):
     # This test content causes the memoizer to cache a particular a,b
     # combination whose score in the cached context has a non-wordstart boost,
     # but then in a subsequent query, asks for the same value without the
     # wordstart. See test_enclosing_run_length_memoization for a reduced test
     # case.
     q = 'cclayertreehos.h'
     items = [
         'CCLayerTreeHostImpl.h', 'CCLayerTreeHostCommon.h',
         'AbstractCACFLayerTreeHost.h', 'CCLayerTreeHost.h',
         'LegacyCACFLayerTreeHost.h', 'CACFLayerTreeHostClient.h',
         'WKCACFViewLayerTreeHost.h', 'FakeCCLayerTreeHostClient.h',
         'CACFLayerTreeHost.h'
     ]
     for x in items:
         # A brand-new ranker per item is the reference; self.ranker is
         # reused across all items and must agree with it every time.
         rStateless = BasenameRanker().rank_query(q, x)
         rStateful = self.ranker.rank_query(q, x)
         # assertEqual replaces the deprecated assertEquals alias, which
         # was removed from unittest in Python 3.12.
         self.assertEqual(
             rStateless, rStateful,
             "For %s, expected %f=%f" % (x, rStateless, rStateful))
Ejemplo n.º 7
0
 def test_enclosing_run_length_memoization_real_user_data(self):
   # This test content causes the memoizer to cache a particular a,b
   # combination whose score in the cached context has a non-wordstart boost,
   # but then in a subsequent query, asks for the same value without the
   # wordstart. See test_enclosing_run_length_memoization for a reduced test
   # case.
   q = 'cclayertreehos.h'
   items = ['CCLayerTreeHostImpl.h',
            'CCLayerTreeHostCommon.h',
            'AbstractCACFLayerTreeHost.h',
            'CCLayerTreeHost.h',
            'LegacyCACFLayerTreeHost.h',
            'CACFLayerTreeHostClient.h',
            'WKCACFViewLayerTreeHost.h',
            'FakeCCLayerTreeHostClient.h',
            'CACFLayerTreeHost.h']
   for x in items:
     # Reference rank from a brand-new ranker vs. the rank from self.ranker,
     # which is reused across all items.
     rStateless = BasenameRanker().rank_query(q, x)
     rStateful = self.ranker.rank_query(q, x)
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(rStateless, rStateful,
                      "For %s, expected %f=%f" % (x, rStateless, rStateful))
Ejemplo n.º 8
0
    def test_enclosing_run_length_memoization(self):
        # Reduced regression case: ranking "bar" against "bar.txt" and then
        # against "rebar.txt" on ONE ranker must give the same ranks as
        # ranking each pair on its own, brand-new ranker.

        # Baselines, one fresh ranker per query.
        q1 = BasenameRanker().rank_query("bar", "bar.txt")
        q2 = BasenameRanker().rank_query("bar", "rebar.txt")

        # Same two queries, back to back on a single shared ranker.
        r = BasenameRanker()
        r1 = r.rank_query("bar", "bar.txt")
        r2 = r.rank_query("bar", "rebar.txt")

        # assertEqual rather than the deprecated assertEquals alias, which
        # was removed from unittest in Python 3.12.
        self.assertEqual(q1, r1)
        self.assertEqual(q2, r2)
Ejemplo n.º 9
0
  def __init__(self, basenames):
    """
    Builds the search structures for one shard.

    basenames is a list of basenames; it is sorted in place. Produces:
       basenames_unsplit: all basenames, newline-delimited, utf8-encoded.
       lower_basenames_unsplit: the distinct lowercased basenames, in the
         same newline-delimited, utf8-encoded form.
       basenames_by_wordstarts: word-start prefix -> lowercased basenames.
    """
    # The basenames come out of a hashtable, so their order is close to
    # random. Sorting keeps results reasonably predictable while a query is
    # being refined incrementally.
    basenames.sort()

    # Lowercased basenames, with duplicates collapsed.
    lower_basenames = set(name.lower() for name in basenames)

    # Two giant newline-delimited strings (original case and lowercase) that
    # the fuzzy queries are matched against.
    joined = "\n".join(basenames)
    self.basenames_unsplit = ("\n" + joined + "\n").encode('utf8')
    lower_joined = "\n".join(lower_basenames)
    self.lower_basenames_unsplit = ("\n" + lower_joined + "\n").encode('utf8')
    assert type(self.lower_basenames_unsplit) is str

    # Map every prefix (length >= 2) of a basename's start letters to the
    # basename, e.g. abcd -> ab, abc, abcd. "loss" counts the start letters
    # the prefix leaves out.
    self._basename_ranker = BasenameRanker()
    wordstarts = {}
    for basename in basenames:
      start_letters = self._basename_ranker.get_start_letters(basename)
      num_starts = len(start_letters)
      if num_starts < 2:
        continue
      lower_basename = basename.lower()
      for prefix_len in range(2, num_starts + 1):
        ws = ''.join(start_letters[:prefix_len])
        wordstarts.setdefault(ws, []).append(
          (lower_basename, num_starts - prefix_len))

    # Within each prefix, put the entries with the smallest loss first.
    self.basenames_by_wordstarts = {}
    for ws, items in wordstarts.iteritems():
      items.sort(key=lambda entry: entry[1])
      self.basenames_by_wordstarts[ws] = [entry[0] for entry in items]
Ejemplo n.º 10
0
 def setUp(self):
     # Runs before each test: gives the test a newly constructed ranker.
     # Disabled fixture load, kept for reference:
     #    self.basenames = json.load(open('test_data/cr_files_basenames.json'))
     self.ranker = BasenameRanker()
Ejemplo n.º 11
0
class BasenameRankerTest(unittest.TestCase):
    """
    Unit tests for BasenameRanker: word-start detection, the basic rank and
    word-hit count, and the relative ordering produced by rank_query.
    """

    def setUp(self):
        # Runs before each test: gives the test a newly constructed ranker.
        # Disabled fixture load, kept for reference:
        #    self.basenames = json.load(open('test_data/cr_files_basenames.json'))
        self.ranker = BasenameRanker()

    def test_is_wordstart(self):
        # Each expectation list holds, for every character index of the
        # string, whether _is_wordstart should report a word start there.
        def check(s, expectations):
            assert len(s) == len(expectations)
            for i, expected in enumerate(expectations):
                self.assertEqual(expected,
                                 self.ranker._is_wordstart(s, i),
                                 "disagreement on index %i" % i)

        check("foo", [True, False, False])
        check("fooBar", [True, False, False, True, False, False])
        check("o", [True])
        check("_", [True])
        check("F", [True])
        check("FooBar", [True, False, False, True, False, False])
        check("Foo_Bar", [True, False, False, False, True, False, False])
        check("_Bar", [True, True, False, False])
        check("_bar", [True, True, False, False])
        check("foo_bar", [True, False, False, False, True, False, False])

        check(".h", [True, False])
        check("a.h", [True, False, False])
        check("__b", [True, False, True])
        check("foo__bar",
              [True, False, False, False, False, True, False, False])

        check("Foo3D", [True, False, False, True, True])
        check("Foo33", [True, False, False, True, False])

        # I could be convinced that 'd' is a wordstart.
        check("x3d", [True, True, False])

        check("AAb", [True, True, False])
        check("CCFra", [True, True, True, False, False])

    def test_get_word_starts(self):
        # Maps a word to the indices get_starts should return for it.
        data = {
            # Ruler: digit k below sits above character index k of the keys
            # 1234567
            '': [],
            'abc': [0],
            'abd_def': [0, 4],
            'ab_cd_ef': [0, 3, 6],
            'ab_': [0],
            'AA': [0, 1],
            'AAbA': [0, 1, 3],
            'Abc': [0],
            'AbcDef': [0, 3],
            'Abc_Def': [0, 4],
        }
        for word, expected_starts in data.items():
            starts = self.ranker.get_starts(word)
            self.assertEqual(expected_starts, starts,
                             "for %s, expect %s" % (word, expected_starts))

    def assertBasicRankAndWordHitCountIs(self, expected_rank,
                                         expected_word_count, query,
                                         candidate):
        """
        Asserts that _get_basic_rank(query, candidate) returns expected_rank
        as its first element and expected_word_count as its second.
        """
        res = self.ranker._get_basic_rank(query, candidate)
        self.assertEqual(expected_rank, res[0])
        self.assertEqual(expected_word_count, res[1])

    def test_query_hits_on_word_starts(self):
        self.assertBasicRankAndWordHitCountIs(
            8, 4, 'rwhv',
            'render_widget_host_view.cc')  # test +1 for hitting all words
        self.assertBasicRankAndWordHitCountIs(6, 3, 'rwh',
                                              'render_widget_host_view.cc')
        self.assertBasicRankAndWordHitCountIs(
            5.5, 2, 'wvi', 'render_widget_host_view_win.cc')  # eew
        self.assertBasicRankAndWordHitCountIs(2, 1, 'w', 'WebViewImpl.cc')
        self.assertBasicRankAndWordHitCountIs(2, 1, 'v', 'WebViewImpl.cc')
        self.assertBasicRankAndWordHitCountIs(4, 2, 'wv', 'WebViewImpl.cc')
        self.assertBasicRankAndWordHitCountIs(5, 2, 'evi', 'WebViewImpl.cc')
        self.assertBasicRankAndWordHitCountIs(4, 2, 'wv', 'eWbViewImpl.cc')
        self.assertBasicRankAndWordHitCountIs(6, 0, 'ebewp', 'WebViewImpl.cc')

    def test_basic_rank_pays_attention_to_case(self):
        # these test that we aren't failing to catch case transitions
        self.assertBasicRankAndWordHitCountIs(4.5, 1, "rw", "rwf")
        self.assertBasicRankAndWordHitCountIs(4, 2, "rw", "rWf")

    def test_basic_rank_works_at_all(self):
        # these are generic tests
        self.assertBasicRankAndWordHitCountIs(8, 4, "rwhv",
                                              "render_widget_host_view.h")
        self.assertBasicRankAndWordHitCountIs(10, 5, "rwhvm",
                                              "render_widget_host_view_mac.h")
        self.assertBasicRankAndWordHitCountIs(
            10, 5, "rwhvm", "render_widget_host_view_mac.mm")

        self.assertBasicRankAndWordHitCountIs(29, 4, 'ccframerate',
                                              'CCFrameRateController.cpp')

    def test_basic_rank_query_case_doesnt_influence_rank_query(self):
        # Flipping the case of the first letter, in the query or in the
        # candidate, must give the same basic rank.
        a = self.ranker._get_basic_rank("Rwhvm",
                                        "render_widget_host_view_mac.h")
        b = self.ranker._get_basic_rank("rwhvm",
                                        "Render_widget_host_view_mac.h")
        self.assertEqual(a, b)

    def test_basic_rank_isnt_only_greedy(self):
        # this checks that we consider _mac as a wordstart rather than macmm
        self.assertBasicRankAndWordHitCountIs(10, 5, "rwhvm",
                                              "render_widget_host_view_macmm")

    def test_basic_rank_on_corner_cases(self):
        self.assertBasicRankAndWordHitCountIs(0, 0, "", "")
        self.assertBasicRankAndWordHitCountIs(0, 0, "", "x")
        self.assertBasicRankAndWordHitCountIs(0, 0, "x", "")
        self.assertBasicRankAndWordHitCountIs(2, 1, "x", "x")
        self.assertBasicRankAndWordHitCountIs(1, 0, "x", "yx")
        self.assertBasicRankAndWordHitCountIs(0, 0, "x", "abcd")

    def test_basic_rank_on_mixed_wordstarts_and_full_words(self):
        self.assertBasicRankAndWordHitCountIs(17, 3, "enderwhv",
                                              "render_widget_host_view.h")
        self.assertBasicRankAndWordHitCountIs(15, 2, "idgethv",
                                              "render_widget_host_view.h")

        self.assertBasicRankAndWordHitCountIs(8, 4, "rwhv",
                                              "render_widget_host_view_mac.h")
        self.assertBasicRankAndWordHitCountIs(14, 5, "rwhvmac",
                                              "render_widget_host_view_mac.h")

        self.assertBasicRankAndWordHitCountIs(10, 5, "rwhvm",
                                              "render_widget_host_view_mac.h")

    def test_basic_rank_overconditioned_query(self):
        # The query is far longer than the candidate.
        self.assertBasicRankAndWordHitCountIs(2, 1, 'test_thread_tab.py', 'tw')

    def test_rank_corner_cases(self):
        # empty
        self.assertEqual(0, self.ranker.rank_query('foo', ''))
        self.assertEqual(0, self.ranker.rank_query('', 'foo'))

        # undersized
        self.assertEqual(0, self.ranker.rank_query('foo', 'm'))
        self.assertEqual(0, self.ranker.rank_query('f', 'oom'))

        # overconditioned
        self.assertEqual(6, self.ranker.rank_query('test_thread_tab.py',
                                                   'tw'))

    def test_rank_subclasses_lower_ranked_than_base(self):
        # this tests that hitting all words counts higher than hitting some of the words
        base_rank = self.ranker.rank_query("rwhvm",
                                           "render_widget_host_view.h")
        subclass_rank = self.ranker.rank_query(
            "rwhvm", "render_widget_host_view_subclass.h")
        self.assertTrue(base_rank > subclass_rank)

    def test_rank_order_for_hierarchy_puts_bases_first(self):
        # NOTE: the comma after the ..._editcommand_helper.h entry used to be
        # missing, which silently glued it to the following name, so neither
        # of those two files was actually being tested.
        names = [
            'render_widget_host_view_mac.h',
            'render_widget_host_view_mac.mm',
            'render_widget_host_view_mac_delegate.h',
            'render_widget_host_view_mac_unittest.mm',
            'render_widget_host_view_mac_editcommand_helper.mm',
            'render_widget_host_view_mac_editcommand_helper.h',
            'render_widget_host_view_mac_editcommand_helper_unittest.mm',
        ]
        self._assertRankDecreasesOrStaysTheSame("rwhvm", names)

    def _assertRankDecreasesOrStaysTheSame(self, query, names):
        """
        Makes sure that the first element in the array has highest rank
        and subsequent items have decreasing or equal rank.
        """
        ranks = [self.ranker.rank_query(query, n) for n in names]
        for i in range(1, len(ranks)):
            # Name the offending pair so a failure is diagnosable.
            self.assertTrue(
                ranks[i] <= ranks[i - 1],
                "%s (rank %s) outranks %s (rank %s)" % (
                    names[i], ranks[i], names[i - 1], ranks[i - 1]))

    def test_rank_order_prefers_capitals(self):
        # Ensure we still prefer capitals for simple queries. The heuristics
        # that deal with order_puts_tests_second tend to break this.
        self.assertBasicRankAndWordHitCountIs(6, 3, 'wvi', 'WebViewImpl.cc')

    def test_rank_order_puts_tests_second(self):
        q = "ccframerate"
        a1 = self.ranker.rank_query(q, 'CCFrameRateController.cpp')
        a2 = self.ranker.rank_query(q, 'CCFrameRateController.h')
        b = self.ranker.rank_query(q, 'CCFrameRateControllerTest.cpp')

        # This is a hard test to pass because ccframera(te) ties to (Te)st
        # if you weight non-word matches equally.
        self.assertTrue(a1 > b)
        self.assertTrue(a2 > b)

        q = "chrome_switches"
        a1 = self.ranker.rank_query(q, 'chrome_switches.cc')
        a2 = self.ranker.rank_query(q, 'chrome_switches.h')
        b = self.ranker.rank_query(q, 'chrome_switches_uitest.cc')
        self.assertTrue(a1 > b)
        self.assertTrue(a2 > b)

    def test_rank_order_for_hierarchy_puts_prefixed_second(self):
        q = "ccframerate"
        a = self.ranker.rank_query(q, 'CCFrameRateController.cpp')
        b1 = self.ranker.rank_query(
            q, 'webcore_platform.CCFrameRateController.o.d')
        # b2 is only needed by the disabled assertion below.
        b2 = self.ranker.rank_query(
            q, 'webkit_unit_tests.CCFrameRateControllerTest.o.d')
        self.assertTrue(a > b1)
        # FAILS because ccframera(te) ties to (Te)st
        # self.assertTrue(a > b2);

    def test_rank_order_puts_tests_second_2(self):
        q = "ccdelaybassedti"
        a1 = self.ranker.rank_query(q, 'CCDelayBasedTimeSource.cpp')
        a2 = self.ranker.rank_query(q, 'CCDelayBasedTimeSource.h')
        b = self.ranker.rank_query(q, 'CCDelayBasedTimeSourceTest.cpp')
        self.assertTrue(a1 > b)
        self.assertTrue(a2 > b)

        q = "LayerTexture"
        a = self.ranker.rank_query(q, 'LayerTexture.cpp')
        b = self.ranker.rank_query(q, 'LayerTextureSubImage.cpp')
        self.assertTrue(a > b)

    def test_refinement_improves_rank_query(self):
        # Typing more of the name must raise the rank.
        a = self.ranker.rank_query('render_', 'render_widget.cc')
        b = self.ranker.rank_query('render_widget', 'render_widget.cc')
        self.assertTrue(b > a)

    def test_document_rank(self):
        # The file that is exactly the query (up to case) must outrank one
        # that merely ends with it.
        a = self.ranker.rank_query('document.cpp', 'Document.cpp')
        b = self.ranker.rank_query('document.cpp', 'AccessibleDocument.cpp')
        self.assertTrue(a > b)

    def test_enclosing_run_length_memoization_real_user_data(self):
        # This test content causes the memoizer to cache a particular a,b
        # combination whose score in the cached context has a non-wordstart boost,
        # but then in a subsequent query, asks for the same value without the
        # wordstart. See test_enclosing_run_length_memoization for a reduced test
        # case.
        q = 'cclayertreehos.h'
        items = [
            'CCLayerTreeHostImpl.h', 'CCLayerTreeHostCommon.h',
            'AbstractCACFLayerTreeHost.h', 'CCLayerTreeHost.h',
            'LegacyCACFLayerTreeHost.h', 'CACFLayerTreeHostClient.h',
            'WKCACFViewLayerTreeHost.h', 'FakeCCLayerTreeHostClient.h',
            'CACFLayerTreeHost.h'
        ]
        for x in items:
            # Reference rank from a brand-new ranker vs. the rank from
            # self.ranker, which is reused across all items.
            rStateless = BasenameRanker().rank_query(q, x)
            rStateful = self.ranker.rank_query(q, x)
            self.assertEqual(
                rStateless, rStateful,
                "For %s, expected %f=%f" % (x, rStateless, rStateful))

    def test_enclosing_run_length_memoization(self):
        # Reduced case: two queries on one shared ranker must rank exactly
        # like the same queries on two brand-new rankers.
        q1 = BasenameRanker().rank_query("bar", "bar.txt")
        q2 = BasenameRanker().rank_query("bar", "rebar.txt")

        r = BasenameRanker()
        r1 = r.rank_query("bar", "bar.txt")
        r2 = r.rank_query("bar", "rebar.txt")

        self.assertEqual(q1, r1)
        self.assertEqual(q2, r2)
Ejemplo n.º 12
0
class DBIndexShard(object):
  """
  Searchable index over one shard of basenames.

  The basenames are kept as two newline-delimited strings (original case and
  lowercase) that regex filters are run against, plus a table from
  word-start prefixes to lowercased basenames.
  """
  def __init__(self, basenames):
    """
    Builds the index structures. basenames is a list; it is sorted in place.
    """
    # The basenames come out of a hashtable so they are usually pretty badly
    # shuffled around. Sort them here so that we get somewhat predictable results
    # as a query is incrementally refined.
    basenames.sort()

    # Build the lower basenames list, removing dupes as needed.
    lower_basenames = set()
    for basename in basenames:
      lower_basename = basename.lower()
      lower_basenames.add(lower_basename)

    # Build two giant strings that contain all the basenames [and lowercase basenames]
    # concatenated together. This is what we will use to handle fuzzy queries.
    # Every basename is delimited by a newline on both sides, which is what
    # the "\n...\n" filters below anchor on.
    self.basenames_unsplit = ("\n" + "\n".join(basenames) + "\n").encode('utf8')
    self.lower_basenames_unsplit = ("\n" + "\n".join(lower_basenames) + "\n").encode('utf8')
    assert type(self.lower_basenames_unsplit) == str

    # Map every prefix (length >= 2) of a basename's start letters to the
    # basename. "loss" is how many start letters the prefix leaves out.
    self._basename_ranker = BasenameRanker()
    wordstarts = {}
    for basename in basenames:
      start_letters = self._basename_ranker.get_start_letters(basename)
      if len(start_letters) <= 1:
        continue
      lower_basename = basename.lower()
      for i in range(len(start_letters) + 1 - 2): # abcd -> ab abc abcd
        ws = ''.join(start_letters[0:2+i])
        if ws not in wordstarts:
          wordstarts[ws] = []
        loss = len(start_letters) - (2 + i)
        wordstarts[ws].append((lower_basename, loss))

    # now, order the actual entries so high qualities are at front
    # (lowest loss first; a key sort is stable, just like the cmp-based
    # sort it replaces, so the resulting order is the same)
    self.basenames_by_wordstarts = {}
    for ws,items in wordstarts.iteritems():
      items.sort(key=lambda item: item[1])
      self.basenames_by_wordstarts[ws] = [i[0] for i in items]

  @traced
  def search_basenames(self, query):
    """
    Searches index for basenames matching the query.

    Returns (hits, truncated) where:
       hits is a set of lowercased basenames that matched.
       truncated is a bool indicating whether not all possible matches were found.
    """
    lower_query = query.lower()

    lower_hits = set()

    max_hits_hint = 25

    # add exact matches first
    trace_begin("exact")
    self.add_all_matching( lower_hits, query, self.get_exact_match_filter(lower_query), max_hits_hint )
    trace_end("exact")

    # add in word starts
    trace_begin("wordstarts")
    self.add_all_wordstarts_matching( lower_hits, query, max_hits_hint )
    trace_end("wordstarts")

    # add in substring matches
    trace_begin("substrings")
    self.add_all_matching( lower_hits, query, self.get_substring_filter(lower_query), max_hits_hint )
    trace_end("substrings")

    # add in superfuzzy matches ONLY if we have no high-quality hit
    has_hq = any(self._basename_ranker.rank_query(query, lower_hit) > 2
                 for lower_hit in lower_hits)
    if not has_hq:
      trace_begin("superfuzzy")
      self.add_all_matching( lower_hits, query, self.get_superfuzzy_filter(lower_query), max_hits_hint )
      trace_end("superfuzzy")

    # Each stage adds a hit before it checks the cap, so a stage that starts
    # with the set already full can push it one past max_hits_hint. Use >=
    # so a capped search is still reported as truncated.
    return lower_hits, len(lower_hits) >= max_hits_hint

  def add_all_wordstarts_matching( self, lower_hits, query, max_hits_hint ):
    """
    Adds to lower_hits the basenames filed under the lowercased query in the
    word-start table, stopping once lower_hits has reached max_hits_hint.
    """
    lower_query = query.lower()
    if lower_query in self.basenames_by_wordstarts:
      for basename in self.basenames_by_wordstarts[lower_query]:
        lower_hits.add(basename)
        if len(lower_hits) >= max_hits_hint:
          return


  def get_exact_match_filter(self, query):
    """
    Returns (regex, case_sensitive) matching a whole basename that is the
    query itself, optionally followed by a dot and anything else.
    """
    query = re.escape(query.lower())
    # abc -> abc(\..*)?
    flt = "\n%s(?:\\..*)?\n" % query
    return (flt, False)

  def get_delimited_wordstart_filter(self, query):
    """
    Returns (regex, case_sensitive) matching basenames in which the query's
    characters each begin an underscore-delimited word, in order.
    query must be non-empty.
    """
    query = [re.escape(ch) for ch in query]
    # abc -> ^a.*_b.*_c
    # abc -> .*_a.*_b.*_c
    tmp = []
    tmp.append("(?:(?:%s)|(?:.*_%s))" % (query[0], query[0]))
    for ch in query[1:]:
      tmp.append("_%s" % ch)
    flt = "\n%s.*\n" % '.*'.join(tmp)
    return (flt, False)

  def get_camelcase_wordstart_filter(self, query):
    """
    Returns (regex, case_sensitive) for the uppercased query characters,
    each one after the first preceded by a character that is neither A-Z
    nor a newline. This is the only filter that is case sensitive, so it is
    run against the original-case basenames. query must be non-empty.
    """
    query = query.upper()
    query = [re.escape(ch) for ch in query]
    # abc -> A.*B.*C
    #        .*[^A-Z]A.*
    tmp = []
    tmp.append("(?:(?:%s)|(?:.*[^A-Z\n]%s))" % (query[0], query[0]))
    for ch in query[1:]:
      tmp.append("[^A-Z\n]%s" % ch)
    flt = "\n.*%s.*\n" % '.*'.join(tmp)
    return (flt, True)

  def get_substring_filter(self, query):
    """
    Returns (regex, case_sensitive) matching any basename that contains the
    lowercased query.
    """
    query = re.escape(query.lower())
    # abc -> *abc*
    flt = "\n.*%s.*\n" % query
    return (flt, False)

  def get_superfuzzy_filter(self, query):
    """
    Returns (regex, case_sensitive) matching any basename that contains the
    query's characters in order, with anything in between.
    """
    # abc -> *a*b*c*
    flt = "\n.*%s.*\n" % '.*'.join(re.escape(ch) for ch in query)
    return (flt, False)

  def add_all_matching(self, lower_hits, query, flt_tuple, max_hits_hint):
    """
    lower_hits is the set to put results in
    query is the query string originally entered by user, used only in the
      error message raised for a malformed hit
    flt_tuple is [filter_regex, case_sensitive_bool]
    max_hits_hint is largest hits should grow before matching terminates.
    """
    flt, case_sensitive = flt_tuple

    regex = re.compile(flt)
    base = 0
    if not case_sensitive:
      index = self.lower_basenames_unsplit
    else:
      index = self.basenames_unsplit
    while True:
      m = regex.search(index, base)
      if not m:
        break
      # Strip the two newline delimiters the filter matched around the name.
      hit = m.group(0)[1:-1]
      if hit.find('\n') != -1:
        raise Exception("Somethign is messed up with flt=[%s] query=[%s] hit=[%s]" % (flt,query,hit))
      if case_sensitive:
        hit = hit.lower()
      lower_hits.add(hit)
      # Resume ON the trailing newline: it doubles as the leading delimiter
      # of the next basename.
      base = m.end() - 1
      if len(lower_hits) >= max_hits_hint:
        break
Ejemplo n.º 13
0
  def setUp(self):
    # Runs before each test: gives the test a newly constructed ranker.
    # Disabled fixture load, kept for reference:
#    self.basenames = json.load(open('test_data/cr_files_basenames.json'))
    self.ranker = BasenameRanker()
Ejemplo n.º 14
0
class BasenameRankerTest(unittest.TestCase):
  def setUp(self):
    # Create the ranker instance that the test methods reach via self.ranker.
    ranker = BasenameRanker()
    self.ranker = ranker

  def test_is_wordstart(self):
    def check(s, expectations):
      assert len(s) == len(expectations)
      for i in range(len(s)):
        self.assertEquals(expectations[i], self.ranker._is_wordstart(s, i), "disagreement on index %i" % i)

    check("foo", [True, False, False])
    check("fooBar", [True, False, False, True, False, False])
    check("o", [True])
    check("_", [True])
    check("F", [True])
    check("FooBar", [True, False, False, True, False, False])
    check("Foo_Bar", [True, False, False, False, True, False, False])
    check("_Bar", [True, True, False, False])
    check("_bar", [True, True, False, False])
    check("foo_bar", [True, False, False, False, True, False, False])

    check(".h", [True, False])
    check("a.h", [True, False, False])
    check("__b", [True, False, True])
    check("foo__bar", [True, False, False, False, False, True, False, False])

    check("Foo3D", [True, False, False, True, True])
    check("Foo33", [True, False, False, True, False])

    check("x3d", [True, True,  False]) # I could be convinced that 'd' is a wordstart.

    check("AAb", [True, True, False])
    check("CCFra", [True, True, True, False, False])

  def test_get_word_starts(self):
    data = {
      # This comment simply helps map indice to values
      # 1234567
      '' : [],
      'abc' : [0],
      'abd_def' : [0, 4],
      'ab_cd_ef' : [0, 3, 6],
      'ab_' : [0],
      'AA': [0, 1],
      'AAbA': [0,1,3],
      'Abc': [0],
      'AbcDef': [0,3],
      'Abc_Def': [0,4],
      }
    for word, expected_starts in data.items():
      starts = self.ranker.get_starts(word)
      self.assertEquals(expected_starts, starts, "for %s, expect %s" % (word, expected_starts))

  def assertBasicRankAndWordHitCountIs(self, expected_rank, expected_word_count, query, candidate):
    res = self.ranker._get_basic_rank(query, candidate)
    self.assertEquals(expected_rank, res[0])
    self.assertEquals(expected_word_count, res[1])

  def test_query_hits_on_word_starts(self):
    self.assertBasicRankAndWordHitCountIs(8, 4, 'rwhv', 'render_widget_host_view.cc') # test +1 for hitting all words
    self.assertBasicRankAndWordHitCountIs(6, 3, 'rwh', 'render_widget_host_view.cc')
    self.assertBasicRankAndWordHitCountIs(5.5, 2, 'wvi', 'render_widget_host_view_win.cc') # eew
    self.assertBasicRankAndWordHitCountIs(2, 1, 'w', 'WebViewImpl.cc')
    self.assertBasicRankAndWordHitCountIs(2, 1, 'v', 'WebViewImpl.cc')
    self.assertBasicRankAndWordHitCountIs(4, 2, 'wv', 'WebViewImpl.cc')
    self.assertBasicRankAndWordHitCountIs(5, 2, 'evi', 'WebViewImpl.cc')
    self.assertBasicRankAndWordHitCountIs(4, 2, 'wv', 'eWbViewImpl.cc')
    self.assertBasicRankAndWordHitCountIs(6, 0, 'ebewp', 'WebViewImpl.cc')


  def test_basic_rank_pays_attention_to_case(self):
    # these test that we aren't losing catching case transpitions
    self.assertBasicRankAndWordHitCountIs(4.5, 1, "rw", "rwf")
    self.assertBasicRankAndWordHitCountIs(4, 2, "rw", "rWf")

  def test_basic_rank_works_at_all(self):
    # these are generic tests
    self.assertBasicRankAndWordHitCountIs(8, 4, "rwhv", "render_widget_host_view.h")
    self.assertBasicRankAndWordHitCountIs(10, 5, "rwhvm", "render_widget_host_view_mac.h")
    self.assertBasicRankAndWordHitCountIs(10, 5, "rwhvm", "render_widget_host_view_mac.mm")

    self.assertBasicRankAndWordHitCountIs(29, 4, 'ccframerate', 'CCFrameRateController.cpp')


  def test_basic_rank_query_case_doesnt_influence_rank_query(self):
    a = self.ranker._get_basic_rank("Rwhvm", "render_widget_host_view_mac.h")
    b = self.ranker._get_basic_rank("rwhvm", "Render_widget_host_view_mac.h")
    self.assertEquals(a, b)

  def test_basic_rank_isnt_only_greedy(self):
    # this checks that we consider _mac and as a wordstart rather than macmm
    self.assertBasicRankAndWordHitCountIs(10, 5, "rwhvm", "render_widget_host_view_macmm")

  def test_basic_rank_on_corner_cases(self):
    self.assertBasicRankAndWordHitCountIs(0, 0, "", "")
    self.assertBasicRankAndWordHitCountIs(0, 0, "", "x")
    self.assertBasicRankAndWordHitCountIs(0, 0, "x", "")
    self.assertBasicRankAndWordHitCountIs(2, 1, "x", "x")
    self.assertBasicRankAndWordHitCountIs(1, 0, "x", "yx")
    self.assertBasicRankAndWordHitCountIs(0, 0, "x", "abcd")

  def test_basic_rank_on_mixed_wordstarts_and_full_words(self):
    self.assertBasicRankAndWordHitCountIs(17, 3, "enderwhv", "render_widget_host_view.h")
    self.assertBasicRankAndWordHitCountIs(15, 2, "idgethv", "render_widget_host_view.h")

    self.assertBasicRankAndWordHitCountIs(8, 4, "rwhv", "render_widget_host_view_mac.h")
    self.assertBasicRankAndWordHitCountIs(14, 5, "rwhvmac", "render_widget_host_view_mac.h")

    self.assertBasicRankAndWordHitCountIs(10, 5, "rwhvm", "render_widget_host_view_mac.h")

  def test_basic_rank_overconditioned_query(self):
    self.assertBasicRankAndWordHitCountIs(2, 1, 'test_thread_tab.py', 'tw')

  def test_rank_corner_cases(self):
    # empty
    self.assertEquals(0, self.ranker.rank_query('foo', ''))
    self.assertEquals(0, self.ranker.rank_query('', 'foo'))

    # undersized
    self.assertEquals(0, self.ranker.rank_query('foo', 'm'))
    self.assertEquals(0, self.ranker.rank_query('f', 'oom'))

    # overconditioned
    self.assertEquals(6, self.ranker.rank_query('test_thread_tab.py', 'tw'))

  def test_rank_subclasses_lower_ranked_than_base(self):
    # this tests that hitting all words counts higher than hitting some of the words
    base_rank = self.ranker.rank_query("rwhvm", "render_widget_host_view.h")
    subclass_rank = self.ranker.rank_query("rwhvm", "render_widget_host_view_subclass.h")
    self.assertTrue(base_rank > subclass_rank)

  def test_rank_order_for_hierarchy_puts_bases_first(self):
    names = ['render_widget_host_view_mac.h',
             'render_widget_host_view_mac.mm',
             'render_widget_host_view_mac_delegate.h',
             'render_widget_host_view_mac_unittest.mm',
             'render_widget_host_view_mac_editcommand_helper.mm',
             'render_widget_host_view_mac_editcommand_helper.h'
             'render_widget_host_view_mac_editcommand_helper_unittest.mm',
             ]
    self._assertRankDecreasesOrStaysTheSame("rwhvm", names)

  def _assertRankDecreasesOrStaysTheSame(self, query, names):
    """
    Makes suer that the first element in the array has highest rank
    and subsequent items have decreasing or equal rank.
    """
    ranks = [self.ranker.rank_query(query, n) for n in names]
    nw = [self.ranker.get_num_words(n) for n in names]
    basic_ranks = [self.ranker._get_basic_rank(query, n) for n in names]
    for i in range(1, len(ranks)):
      changeInRank = ranks[i] - ranks[i-1]
      self.assertTrue(changeInRank <= 0)

  def test_rank_order_prefers_capitals(self):
    # Ensure we still prefer capitals for simple queries The heuristics that
    # deal with order_puts_tests_second tends to break this.
    self.assertBasicRankAndWordHitCountIs(6, 3, 'wvi', 'WebViewImpl.cc')

  def test_rank_order_puts_tests_second(self):
    q = "ccframerate"
    a1 = self.ranker.rank_query(q, 'CCFrameRateController.cpp')
    a2 = self.ranker.rank_query(q, 'CCFrameRateController.h')
    b = self.ranker.rank_query(q, 'CCFrameRateControllerTest.cpp')

    # This is a hard test to pass because ccframera(te) ties to (Te)st
    # if you weight non-word matches equally.
    self.assertTrue(a1 > b);
    self.assertTrue(a2 > b);

    q = "chrome_switches"
    a1 = self.ranker.rank_query(q, 'chrome_switches.cc')
    a2 = self.ranker.rank_query(q, 'chrome_switches.h')
    b = self.ranker.rank_query(q, 'chrome_switches_uitest.cc')
    self.assertTrue(a1 > b);
    self.assertTrue(a2 > b);

  def test_rank_order_for_hierarchy_puts_prefixed_second(self):
    q = "ccframerate"
    a = self.ranker.rank_query(q, 'CCFrameRateController.cpp')
    b1 = self.ranker.rank_query(q, 'webcore_platform.CCFrameRateController.o.d')
    b2 = self.ranker.rank_query(q, 'webkit_unit_tests.CCFrameRateControllerTest.o.d')
    self.assertTrue(a > b1);
    # FAILS because ccframera(te) ties to (Te)st
    # self.assertTrue(a > b2);

  def test_rank_order_puts_tests_second_2(self):
    q = "ccdelaybassedti"
    a1 = self.ranker.rank_query(q, 'CCDelayBasedTimeSource.cpp')
    a2 = self.ranker.rank_query(q, 'CCDelayBasedTimeSource.h')
    b = self.ranker.rank_query(q, 'CCDelayBasedTimeSourceTest.cpp')
    self.assertTrue(a1 > b);
    self.assertTrue(a2 > b);

    q = "LayerTexture"
    a = self.ranker.rank_query(q, 'LayerTexture.cpp')
    b = self.ranker.rank_query(q, 'LayerTextureSubImage.cpp')
    self.assertTrue(a > b)

  def test_refinement_improves_rank_query(self):
    a = self.ranker.rank_query('render_', 'render_widget.cc')
    b = self.ranker.rank_query('render_widget', 'render_widget.cc')
    self.assertTrue(b > a)

  def test_document_rank(self):
    a = self.ranker.rank_query('document.cpp', 'Document.cpp')
    b = self.ranker.rank_query('document.cpp', 'AccessibleDocument.cpp')
    self.assertTrue(a > b)

  def test_enclosing_run_length_memoization_real_user_data(self):
    # This test content causes the memoizer to cache a particular a,b
    # combination whose score in the cached context has a non-wordstart boost,
    # but then in a subsequent query, asks for the same value without the
    # wordstart. See test_enclosing_run_length_memoization for a reduced test
    # case.
    q = 'cclayertreehos.h'
    items = ['CCLayerTreeHostImpl.h',
             'CCLayerTreeHostCommon.h',
             'AbstractCACFLayerTreeHost.h',
             'CCLayerTreeHost.h',
             'LegacyCACFLayerTreeHost.h',
             'CACFLayerTreeHostClient.h',
             'WKCACFViewLayerTreeHost.h',
             'FakeCCLayerTreeHostClient.h',
             'CACFLayerTreeHost.h']
    for x in items:
      # A brand-new ranker per item gives the reference score; self.ranker is
      # reused across all items and must agree with it every time.
      stateless_ranker = BasenameRanker()
      rStateless = stateless_ranker.rank_query(q, x)
      rStateful = self.ranker.rank_query(q, x)
      self.assertEqual(rStateless, rStateful,"For %s, expected %f=%f" % (x, rStateless, rStateful))

  def test_enclosing_run_length_memoization(self):
    # Reference scores, each computed by its own brand-new ranker.
    q1 = BasenameRanker().rank_query("bar", "bar.txt")
    q2 = BasenameRanker().rank_query("bar", "rebar.txt")

    # The same two queries issued back to back on one shared ranker.
    r = BasenameRanker()
    r1 = r.rank_query("bar", "bar.txt")
    r2 = r.rank_query("bar", "rebar.txt")

    # Reusing the ranker must not change either score.
    self.assertEqual(q1, r1)
    self.assertEqual(q2, r2)