Example #1
    def test_enclosing_run_length_memoization(self):
        q1 = BasenameRanker().rank_query("bar", "bar.txt")
        q2 = BasenameRanker().rank_query("bar", "rebar.txt")

        r = BasenameRanker()
        r1 = r.rank_query("bar", "bar.txt")
        r2 = r.rank_query("bar", "rebar.txt")

        self.assertEqual(q1, r1)
        self.assertEqual(q2, r2)
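
A note on what this guards against: BasenameRanker memoizes intermediate run scores across calls, so a single reused ranker must produce exactly the same ranks as fresh per-query rankers. A deliberately broken sketch of the bug class (the class and scoring formula are invented for illustration, not the real ranker):

    class BuggyMemoRanker(object):
        """Hypothetical ranker whose memo key drops positional context."""

        def __init__(self):
            self._memo = {}

        def rank_query(self, query, candidate):
            # Context that affects the score but is NOT part of the memo key.
            at_wordstart = candidate.lower().startswith(query.lower())
            if query not in self._memo:  # BUG: key omits at_wordstart
                boost = 2.0 if at_wordstart else 1.0
                self._memo[query] = len(query) * boost
            return self._memo[query]

A fresh BuggyMemoRanker scores "bar.txt" higher than "rebar.txt", but a reused instance returns the cached wordstart-boosted value for both, which is exactly the drift the two assertions above would catch.
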
Example #2
    def test_basename_only_query_rank_results(self):
        shard_manager = FakeDBShardManager(
            ["foo/bar.txt", "foo/rebar.txt", "blah/baz.txt"])
        query_cache = QueryCache()

        res = MockQuery("bar").execute_nocache(shard_manager, query_cache)
        self.assertEqual(set(["foo/bar.txt", "foo/rebar.txt"]),
                         set(res.filenames))
        self.assertEqual([
            BasenameRanker().rank_query("bar",
                                        os.path.basename(res.filenames[0])),
            BasenameRanker().rank_query("bar",
                                        os.path.basename(res.filenames[1]))
        ], res.ranks)
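
FakeDBShardManager's implementation is not shown in these examples, but Example #3 pins down the surface this test relies on: a files list, a files_by_lower_basename index, and a search_basenames method returning (hits, truncated). A minimal stand-in consistent with that usage (an assumption-based sketch, not the project's actual fake):

    import os

    class SketchShardManager(object):
        def __init__(self, files):
            self.files = list(files)
            # Map each lowercased basename to the full paths that share it.
            self.files_by_lower_basename = {}
            for f in self.files:
                key = os.path.basename(f).lower()
                self.files_by_lower_basename.setdefault(key, []).append(f)

        def search_basenames(self, query):
            # Case-insensitive substring match; the second value reports
            # whether the result set was truncated.
            q = query.lower()
            hits = [b for b in self.files_by_lower_basename if q in b]
            return hits, False

Under this sketch, searching "bar" hits both "bar.txt" and "rebar.txt", matching the filename set the test asserts.
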
Example #3
    def execute_nocache(self, shard_manager, query_cache):
        # Whether we had to cut the search short; returned with the results.
        truncated = False

        slash_idx = self.text.rfind('/')
        if slash_idx != -1:
            dirpart_query = self.text[:slash_idx]
            basename_query = self.text[slash_idx + 1:]
        else:
            dirpart_query = ''
            basename_query = self.text
        lower_dirpart_query = dirpart_query.lower()

        # Get the files
        files = []
        if basename_query:
            basename_hits, truncated = shard_manager.search_basenames(
                basename_query)
            for hit in basename_hits:
                hit_files = shard_manager.files_by_lower_basename[hit]
                for f in hit_files:
                    if _is_dirmatch(lower_dirpart_query, f):
                        files.append(f)
        else:
            start = time.time()
            timeout = start + self._dir_search_timeout
            for i, f in enumerate(shard_manager.files, 1):
                if _is_dirmatch(lower_dirpart_query, f):
                    files.append(f)
                # Only check the clock every 1000 files to keep the loop cheap.
                if i % 1000 == 0 and time.time() >= timeout:
                    truncated = True
                    break

        # Rank the results
        trace_begin("rank_results")
        hits = []
        basename_ranker = BasenameRanker()
        for f in files:
            basename = os.path.basename(f)
            rank = basename_ranker.rank_query(basename_query, basename)
            hits.append((f, rank))
        trace_end("rank_results")

        return QueryResult(hits=hits, truncated=truncated)
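
The filtering above delegates to _is_dirmatch, which is not shown in these examples. From the call sites it receives the lowercased directory-part query (possibly empty) and a full path; a plausible reading (hypothetical sketch, not the project's actual helper):

    import os

    def _is_dirmatch_sketch(lower_dirpart_query, filename):
        # Hypothetical: an empty dirpart query matches everything; otherwise
        # require the query to appear in the lowercased directory portion.
        if not lower_dirpart_query:
            return True
        return lower_dirpart_query in os.path.dirname(filename).lower()

With self.text == "foo/ba", the split above yields dirpart_query "foo" and basename_query "ba", so only paths whose directory contains "foo" survive the filter.
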
Example #4
    def __init__(self, basenames):
        # Python 2 hack: make implicit str<->unicode conversions use utf-8,
        # since the default ascii codec would choke on non-ASCII basenames
        # when they are joined into the unicode strings below.
        reload(sys)
        sys.setdefaultencoding('utf8')

        # The basenames come out of a hashtable, so their order is effectively
        # random. Sort them here so that results stay somewhat predictable as
        # a query is incrementally refined.
        basenames.sort()

        # Build the set of lowercased basenames; the set removes duplicates.
        lower_basenames = set(basename.lower() for basename in basenames)

        # Build two giant newline-delimited strings containing every basename
        # and every lowercased basename concatenated together. This is what we
        # will use to handle fuzzy queries.
        self.basenames_unsplit = (u"\n" + u"\n".join(basenames) + u"\n")
        self.lower_basenames_unsplit = (u"\n" + u"\n".join(lower_basenames) +
                                        u"\n")
        assert isinstance(self.lower_basenames_unsplit, unicode)

        self._basename_ranker = BasenameRanker()
        wordstarts = {}
        for basename in basenames:
            start_letters = self._basename_ranker.get_start_letters(basename)
            if len(start_letters) <= 1:
                continue
            lower_basename = basename.lower()
            # abcd -> ab, abc, abcd: index every prefix of length >= 2.
            for i in range(len(start_letters) - 1):
                ws = ''.join(start_letters[0:2 + i])
                # loss = how many trailing start letters the prefix drops.
                loss = len(start_letters) - (2 + i)
                wordstarts.setdefault(ws, []).append((lower_basename, loss))

        # Order the entries for each wordstart so that the highest-quality
        # (lowest-loss) basenames come first.
        self.basenames_by_wordstarts = {}
        for ws, items in wordstarts.iteritems():
            items.sort(key=lambda x: x[1])
            self.basenames_by_wordstarts[ws] = [i[0] for i in items]
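
The wordstart expansion is easiest to follow on a concrete input. A standalone trace, assuming a basename whose start letters come out as ['f', 'b', 'b'] (get_start_letters itself lives in BasenameRanker and is not shown here):

    start_letters = ['f', 'b', 'b']   # assumed output for 'FooBarBaz.cc'
    lower_basename = 'foobarbaz.cc'
    wordstarts = {}
    for i in range(len(start_letters) - 1):  # fbb -> fb, fbb
        ws = ''.join(start_letters[0:2 + i])
        loss = len(start_letters) - (2 + i)
        wordstarts.setdefault(ws, []).append((lower_basename, loss))
    # wordstarts == {'fb': [('foobarbaz.cc', 1)],
    #                'fbb': [('foobarbaz.cc', 0)]}

Lower loss means the typed word starts cover more of the name, which is why the final sort puts low-loss entries at the front of each bucket.
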
Example #5
    def test_enclosing_run_length_memoization_real_user_data(self):
        # This test content causes the memoizer to cache a particular a,b
        # combination whose score in the cached context has a non-wordstart
        # boost; a subsequent query then asks for the same value without the
        # wordstart. See test_enclosing_run_length_memoization for a reduced
        # test case.
        q = 'cclayertreehos.h'
        items = [
            'CCLayerTreeHostImpl.h', 'CCLayerTreeHostCommon.h',
            'AbstractCACFLayerTreeHost.h', 'CCLayerTreeHost.h',
            'LegacyCACFLayerTreeHost.h', 'CACFLayerTreeHostClient.h',
            'WKCACFViewLayerTreeHost.h', 'FakeCCLayerTreeHostClient.h',
            'CACFLayerTreeHost.h'
        ]
        for x in items:
            stateless_ranker = BasenameRanker()
            r_stateless = stateless_ranker.rank_query(q, x)
            r_stateful = self.ranker.rank_query(q, x)
            self.assertEqual(
                r_stateless, r_stateful,
                "For %s, expected %f=%f" % (x, r_stateless, r_stateful))
Example #6
    def setUp(self):
        #    self.basenames = json.load(open('test_data/cr_files_basenames.json'))
        self.ranker = BasenameRanker()