Example #1
0
def test_within_delete():
    """``dawg.within`` should still find a stored word when the query has
    one character fewer or one character more than that word (using the
    function's default distance limit).
    """
    # The query lacks the "e" of the stored word "def".
    reader = greader(gwrite(enlist("abc def ghi")))
    found = set(dawg.within(reader, "df"))
    assert_equal(found, set(["def"]))

    # The query carries one character more than the only stored word "0".
    reader = greader(gwrite(enlist("0")))
    found = list(dawg.within(reader, "01"))
    assert_equal(found, ["0"])
Example #2
0
def test_within_replace():
    """``dawg.within`` should find stored words that differ from the query
    by a single substituted character (default distance limit).
    """
    # "dez" differs from "def" only in its last character.
    reader = greader(gwrite(enlist("abc def ghi")))
    assert_equal(set(dawg.within(reader, "dez")), set(["def"]))

    # "11" differs from "00" in both positions, so it must not be returned.
    reader = greader(gwrite(enlist("00 01 10 11")))
    matches = set(dawg.within(reader, "00"))
    expected = set(["00", "10", "01"])
    # The actual result doubles as the failure message.
    assert_equal(matches, expected, matches)
Example #3
0
    def terms_within(self, fieldname, text, maxdist, prefix=0, seen=None):
        """Yields the words of the given field that lie within ``maxdist``
        Damerau-Levenshtein edits of ``text``.

        When the field has a word graph the search is delegated to
        ``within()``; otherwise every term sharing the requested prefix is
        checked individually with ``distance()``.

        :param maxdist: the maximum edit distance.
        :param prefix: the number of leading characters of ``text`` that a
            suggestion must share. Misspellings rarely involve the first
            letter, and requiring a prefix greatly reduces the number of
            candidate words that have to be examined.
        :param seen: an optional set object. Words already in the set are
            not yielded; in the fallback path each yielded word is added to
            it.
        """

        if not self.has_word_graph(fieldname):
            # Slow path: no stored graph, so scan the expanded terms.
            if seen is None:
                seen = set()
            stem = text[:prefix]
            for candidate in self.expand_prefix(fieldname, stem):
                if candidate in seen:
                    continue
                # An exact match is accepted without computing a distance.
                close_enough = (
                    candidate == text
                    or distance(candidate, text, limit=maxdist) <= maxdist)
                if not close_enough:
                    continue
                yield candidate
                seen.add(candidate)
            return

        graph = self.word_graph(fieldname)
        for match in within(graph, text, maxdist, prefix=prefix, seen=seen):
            yield match
Example #4
0
    def terms_within(self, fieldname, text, maxdist, prefix=0, seen=None):
        """Generates the words in ``fieldname`` whose Damerau-Levenshtein
        edit distance from ``text`` does not exceed ``maxdist``.

        A stored word graph is searched through ``within()`` when the field
        has one; without a graph, each term expanded from the prefix is
        measured with ``distance()``.

        :param maxdist: the maximum edit distance.
        :param prefix: how many leading characters of ``text`` a suggestion
            has to share. Most misspellings leave the first letter intact,
            and a prefix sharply cuts the time needed to produce the words.
        :param seen: an optional set object. Words that appear in the set
            are skipped; the fallback path adds every word it yields.
        """

        if not self.has_word_graph(fieldname):
            # No graph available: fall back to checking term by term.
            if seen is None:
                seen = set()
            for term in self.expand_prefix(fieldname, text[:prefix]):
                if term in seen:
                    continue
                if distance(term, text, limit=maxdist) <= maxdist:
                    yield term
                    seen.add(term)
            return

        root = self.word_graph(fieldname)
        for term in within(root, text, maxdist, prefix=prefix, seen=seen):
            yield term
Example #5
0
    def terms_within(self, fieldname, text, maxdist, prefix=0):
        """Returns the words of ``fieldname`` within ``maxdist`` edits of
        ``text``.

        Uses ``dawg.within`` on this reader's graph when the field has a
        word graph, and the generic ``IndexReader.terms_within`` otherwise.

        :param maxdist: passed to ``dawg.within`` as ``k``.
        :param prefix: forwarded unchanged to whichever implementation runs.
        """
        if self.has_word_graph(fieldname):
            # Start the search at this field's root inside the shared graph.
            return dawg.within(self._graph, text, k=maxdist, prefix=prefix,
                               address=self._graph.root(fieldname))

        # This reader doesn't have a graph stored, use the slow method
        return IndexReader.terms_within(self, fieldname, text, maxdist,
                                        prefix=prefix)
Example #6
0
def test_within():
    """``dawg.within`` with ``k=1`` should return every stored word that is
    at most one edit away from "01", and nothing further away.

    The reader is closed in a ``finally`` clause so that it is released
    even when the lookup raises, before the temporary storage is torn down.
    """
    with TempStorage() as st:
        gwrite(enlist("0 00 000 001 01 010 011 1 10 100 101 11 110 111"), st)
        gr = greader(st)
        try:
            s = set(dawg.within(gr, "01", k=1))
        finally:
            # Always release the reader before TempStorage exits.
            gr.close()
    assert_equal(
        s, set(["0", "00", "01", "011", "010", "001", "10", "101", "1", "11"]))
Example #7
0
 def _suggestions(self, text, maxdist, prefix, seen):
     """Yields ``(score, word)`` pairs for each word ``dawg.within`` finds
     in ``self.word_graph``.

     The score is the negated ``maxdist``, so every suggestion produced by
     one call carries the same score.
     """
     candidates = dawg.within(self.word_graph, text, maxdist,
                              prefix=prefix, seen=seen)
     for candidate in candidates:
         # Higher scores are better, hence the negation.
         yield (0 - maxdist, candidate)
Example #8
0
    def terms_within(self, fieldname, text, maxdist, prefix=0):
        """Looks up the words of ``fieldname`` within ``maxdist`` edits of
        ``text``.

        With a word graph for the field, the lookup runs ``dawg.within``
        from the field's root address; without one it defers to
        ``IndexReader.terms_within``.

        :param maxdist: passed to ``dawg.within`` as ``k``.
        :param prefix: forwarded unchanged to whichever implementation runs.
        """
        if self.has_word_graph(fieldname):
            return dawg.within(
                self._graph, text, k=maxdist, prefix=prefix,
                address=self._graph.root(fieldname))

        # This reader doesn't have a graph stored, use the slow method
        return IndexReader.terms_within(
            self, fieldname, text, maxdist, prefix=prefix)
Example #9
0
def test_within_unicode():
    """``dawg.within`` should match non-ASCII words: a query that differs
    from one stored word in its middle character returns only that word.
    """
    words = [
        u("\u280b\u2817\u2801\u281d\u2809\u2811"),
        u("\u65e5\u672c"),
        u("\uc774\uc124\ud76c"),
    ]

    # Write all words into a single field of an in-memory graph file.
    storage = RamStorage()
    writer = dawg.GraphWriter(storage.create_file("test"))
    writer.start_field("test")
    for word in words:
        writer.insert(word)
    writer.close()

    reader = dawg.GraphReader(storage.open_file("test"))
    found = list(dawg.within(reader, u("\uc774.\ud76c")))
    assert_equal(found, [u("\uc774\uc124\ud76c")])
Example #10
0
 def _suggestions(self, text, maxdist, prefix, seen):
     """Generates a ``(score, word)`` tuple for every word returned by
     ``dawg.within`` on ``self.word_graph``.

     Each tuple's score is ``0 - maxdist``; it does not vary per word.
     """
     matches = dawg.within(self.word_graph, text, maxdist, prefix=prefix,
                           seen=seen)
     for word in matches:
         # Higher scores are better, which is why the value is negated.
         yield (0 - maxdist, word)
Example #11
0
def test_within_prefix():
    """With ``prefix=2``, ``dawg.within`` should only return words that
    share the query's first two characters.
    """
    reader = greader(gwrite(enlist("aabc aadc babc badc")))
    matches = set(dawg.within(reader, "aaxc", prefix=2))
    # The "ba..." words are excluded by the prefix requirement.
    assert_equal(matches, set(["aabc", "aadc"]))
Example #12
0
def test_within_k2():
    """``dawg.within`` with ``k=2`` should return "abc" and "cba" for the
    query "cb", but not "bac".
    """
    reader = greader(gwrite(enlist("abc bac cba")))
    matches = set(dawg.within(reader, "cb", k=2))
    assert_equal(matches, set(["abc", "cba"]))
Example #13
0
def test_within_transpose():
    """``dawg.within`` should find "def" for the query "dfe", whose last
    two characters are swapped.
    """
    reader = greader(gwrite(enlist("abc def ghi")))
    matches = set(dawg.within(reader, "dfe"))
    assert_equal(matches, set(["def"]))
Example #14
0
def test_within_insert():
    """``dawg.within`` should find the stored two-character words that can
    be formed by adding one character to the query "0".
    """
    reader = greader(gwrite(enlist("00 01 10 11")))
    matches = set(dawg.within(reader, "0"))
    # "11" contains no "0", so it is not expected.
    assert_equal(matches, set(["00", "01", "10"]))
Example #15
0
def test_within_match():
    """``dawg.within`` should return a stored word when the query equals
    it exactly.
    """
    reader = greader(gwrite(enlist("abc def ghi")))
    matches = set(dawg.within(reader, "def"))
    assert_equal(matches, set(["def"]))
Example #16
0
 def _suggestions(self, text, maxdist, prefix, seen):
     """Yields ``(score, word)`` pairs for the words ``dawg.within`` finds
     in ``self.word_graph``.

     The score comes from calling ``self.ranking`` with the word and
     ``maxdist``.
     """
     score_of = self.ranking
     matches = dawg.within(self.word_graph, text, maxdist, prefix=prefix,
                           seen=seen)
     for word in matches:
         yield (score_of(word, maxdist), word)