class TamilDTriesForward(unittest.TestCase):
    """Prefix-lookup tests over a trie loaded with reversed Tamil words."""

    def setUp(self):
        """Build a trie holding the second string of every word pair."""
        # Each pair is (word, the same word with its letters in reverse
        # order); only the reversed form is stored in the trie.
        rhymie = [
            (u"மாங்குயில்", u"ல்யிகுங்மா"),
            (u"பூங்குயில்", u"ல்யிகுங்பூ"),
            (u"அல்லவா", u"வாலல்அ"),
            (u"செல்வாயா", u"யாவால்செ"),
            (u"சொல்வாயா", u"யாவால்சொ"),
        ]
        self.obj = DTrie()
        self.count = len(rhymie)
        for _, reversed_word in rhymie:
            self.obj.add(reversed_word)

    def test_all_words(self):
        """The trie lists exactly as many words as were inserted."""
        # assertTrue(x, y) uses y only as the failure message and passes for
        # any non-zero x; assertEqual is required to compare the two counts.
        self.assertEqual(len(self.obj.getAllWords()), self.count)

    def word_n_prefix_test(self, pfx, wordlist, no):
        """Assert the words found under a prefix.

        pfx      -- prefix handed to ``getAllWordsPrefix``.
        wordlist -- collection every returned word must belong to.
        no       -- exact number of words the lookup must return.
        """
        itr = 0
        for word in self.obj.getAllWordsPrefix(pfx):
            itr += 1
            self.assertIn(word, wordlist)
        self.assertEqual(itr, no)

    def test_fwd_dictionaries(self):
        """Two stored words share the first prefix."""
        kuyils = [u"ல்யிகுங்பூ", u"ல்யிகுங்மா"]
        pfx = u"ல்யிகு"
        self.word_n_prefix_test(pfx, kuyils, 2)

    def test_fwd_dictionaries_two(self):
        """Two stored words share the second prefix."""
        verbie = [u"யாவால்செ", u"யாவால்சொ"]
        pfx = u"யாவா"
        self.word_n_prefix_test(pfx, verbie, 2)
Ejemplo n.º 2
0
class TamilDTriesForward(unittest.TestCase):
    """Prefix-lookup tests over a trie loaded with reversed Tamil words."""

    def setUp(self):
        """Build a trie holding the second string of every word pair."""
        # Each pair is (word, the same word with its letters in reverse
        # order); only the reversed form is stored in the trie.
        rhymie = [
            (u"மாங்குயில்", u"ல்யிகுங்மா"),
            (u"பூங்குயில்", u"ல்யிகுங்பூ"),
            (u"அல்லவா", u"வாலல்அ"),
            (u"செல்வாயா", u"யாவால்செ"),
            (u"சொல்வாயா", u"யாவால்சொ"),
        ]
        self.obj = DTrie()
        self.count = len(rhymie)
        for _, reversed_word in rhymie:
            self.obj.add(reversed_word)

    def test_all_words(self):
        """The trie lists exactly as many words as were inserted."""
        # assertTrue(x, y) uses y only as the failure message and passes for
        # any non-zero x; assertEqual is required to compare the two counts.
        self.assertEqual(len(self.obj.getAllWords()), self.count)

    def word_n_prefix_test(self, pfx, wordlist, no):
        """Assert the words found under a prefix.

        pfx      -- prefix handed to ``getAllWordsPrefix``.
        wordlist -- collection every returned word must belong to.
        no       -- exact number of words the lookup must return.
        """
        itr = 0
        for word in self.obj.getAllWordsPrefix(pfx):
            itr += 1
            self.assertIn(word, wordlist)
        self.assertEqual(itr, no)

    def test_fwd_dictionaries(self):
        """Two stored words share the first prefix."""
        kuyils = [u"ல்யிகுங்பூ", u"ல்யிகுங்மா"]
        pfx = u"ல்யிகு"
        self.word_n_prefix_test(pfx, kuyils, 2)

    def test_fwd_dictionaries_two(self):
        """Two stored words share the second prefix."""
        verbie = [u"யாவால்செ", u"யாவால்சொ"]
        pfx = u"யாவா"
        self.word_n_prefix_test(pfx, verbie, 2)
 def setUp(self):
     """Create the trie fixture from the second string of each word pair."""
     word_pairs = [
         (u"மாங்குயில்", u"ல்யிகுங்மா"),
         (u"பூங்குயில்", u"ல்யிகுங்பூ"),
         (u"அல்லவா", u"வாலல்அ"),
         (u"செல்வாயா", u"யாவால்செ"),
         (u"சொல்வாயா", u"யாவால்சொ"),
     ]
     self.obj = DTrie()
     self.count = len(word_pairs)
     # Only the second element of every pair goes into the trie.
     for _first, second in word_pairs:
         self.obj.add(second)
 def test_pattiyal(self):
     """Inserted words are expected to be found; their reversals are not."""
     trie = DTrie()
     in_words = u"டைட்டானிக் படத்தில் வரும் ஜேக் மற்றும் ரோஸ் போன்று தன் காதலை வெளிப்படுத்தும் இரு தவளைகள்".split()
     for word in in_words:
         trie.add(word)
     # Probe list: the inserted words first, then each of them reversed.
     probes = list(in_words)
     probes += [utf8.reverse_word(word) for word in in_words]
     actual = [trie.isWord(probe) for probe in probes]
     # True for the first half (inserted), False for the reversed half.
     total = len(in_words)
     expected = [True] * total + [False] * total
     self.assertEqual(actual, expected)
 def test_pattiyal(self):
     """Inserted words are expected to be found; their reversals are not."""
     trie = DTrie()
     in_words = u"டைட்டானிக் படத்தில் வரும் ஜேக் மற்றும் ரோஸ் போன்று தன் காதலை வெளிப்படுத்தும் இரு தவளைகள்".split()
     for word in in_words:
         trie.add(word)
     # Probe list: the inserted words first, then each of them reversed.
     probes = list(in_words)
     probes += [utf8.reverse_word(word) for word in in_words]
     actual = [trie.isWord(probe) for probe in probes]
     # True for the first half (inserted), False for the reversed half.
     total = len(in_words)
     expected = [True] * total + [False] * total
     self.assertEqual(actual, expected)
    def test_prefix(self):
        """Contrast exact-word lookup with prefix lookup on a small trie."""
        trie = DTrie()
        actual_words = ['abx', 'abc', 'abcd', 'bbc']
        for word in actual_words:
            trie.add(word)

        # Every inserted word is recognised, and four words are listed.
        for word in actual_words:
            self.assertTrue(trie.isWord(word))
        self.assertEqual(len(trie.getAllWords()), 4)

        # A stored word with extra characters appended is not a word.
        for word in actual_words:
            for suffix in ('NA', 'DA'):
                self.assertFalse(trie.isWord(word + suffix))

        # Strings that were never inserted are not words.
        for pfx in ("ab", "bb", "bd"):
            self.assertFalse(trie.isWord(pfx))

        for pfx in ('a', 'ab', 'b', 'bb'):
            self.assertTrue(trie.hasWordPrefix(pfx))

        # Run every probe before asserting, then require that none matched.
        results = [trie.hasWordPrefix(pfx) for pfx in ('c', 'ac', 'ad', 'cb', 'z')]
        self.assertFalse(any(results))
Ejemplo n.º 7
0
    def test_prefix(self):
        """Contrast exact-word lookup with prefix lookup on a small trie."""
        trie = DTrie()
        actual_words = ["abx", "abc", "abcd", "bbc"]
        for word in actual_words:
            trie.add(word)

        # Every inserted word is recognised, and four words are listed.
        for word in actual_words:
            self.assertTrue(trie.isWord(word))
        self.assertEqual(len(trie.getAllWords()), 4)

        # A stored word with extra characters appended is not a word.
        for word in actual_words:
            for suffix in ("NA", "DA"):
                self.assertFalse(trie.isWord(word + suffix))

        # Strings that were never inserted are not words.
        for pfx in ("ab", "bb", "bd"):
            self.assertFalse(trie.isWord(pfx))

        for pfx in ("a", "ab", "b", "bb"):
            self.assertTrue(trie.hasWordPrefix(pfx))

        # Run every probe before asserting, then require that none matched.
        results = [trie.hasWordPrefix(pfx) for pfx in ("c", "ac", "ad", "cb", "z")]
        self.assertFalse(any(results))
Ejemplo n.º 8
0
def test_ngram(request, ng):
    """Return the letter-bigram counts of ``ng`` as a JSON HTTP response.

    Whitespace is removed from ``ng`` and the rest is lower-cased, then split
    into letters by ``get_letters``. Every adjacent pair of letters where both
    are alphabetic, or both satisfy ``utf8.is_tamil_unicode``, is added to a
    trie. The trie's word/count mapping is serialised with ``json.dumps``
    (non-ASCII characters kept as-is) and returned as ``application/json``.

    ``request`` is accepted but not read by this function.
    """
    obj = DTrie()
    prev_letter = u''
    # Raw string: "\s" inside a plain literal is an invalid escape sequence.
    text = u"".join(re.split(r'\s+', ng)).lower()
    for char in get_letters(text):
        if (prev_letter.isalpha() and char.isalpha()) or (
                utf8.is_tamil_unicode(prev_letter) and utf8.is_tamil_unicode(char)):
            obj.add(u"".join([prev_letter, char]))
        # The current letter becomes the left half of the next bigram.
        prev_letter = char
    actual = obj.getAllWordsAndCount()
    json_string = json.dumps(actual, ensure_ascii=False)
    # Build the response explicitly to set the content type and encoding.
    return HttpResponse(json_string, content_type="application/json; charset=utf-8")
Ejemplo n.º 9
0
 def __unused__(self):
     """Populate ``self.trie_english`` from the English dictionary file.

     Each line of ``DICTIONARY_DATA_FILES["english"]`` is stripped,
     lower-cased and reduced to its ASCII letters before being added to the
     trie returned by ``DTrie.buildEnglishTrie()``.
     """
     self.trie_english = DTrie.buildEnglishTrie()
     with codecs.open(DICTIONARY_DATA_FILES["english"], "r", "utf-8") as fp:
         for line in fp:
             word = line.strip().lower()
             # string.letters exists only on Python 2, and filter() on a
             # string yields an iterator on Python 3; joining the kept
             # characters produces a string on both versions.
             self.trie_english.add(
                 u"".join(ch for ch in word if ch in string.ascii_letters))
 def setUp(self):
     """Create the trie fixture from the second string of each word pair."""
     word_pairs = [
         (u"மாங்குயில்", u"ல்யிகுங்மா"),
         (u"பூங்குயில்", u"ல்யிகுங்பூ"),
         (u"அல்லவா", u"வாலல்அ"),
         (u"செல்வாயா", u"யாவால்செ"),
         (u"சொல்வாயா", u"யாவால்சொ"),
     ]
     self.obj = DTrie()
     self.count = len(word_pairs)
     # Only the second element of every pair goes into the trie.
     for _first, second in word_pairs:
         self.obj.add(second)
Ejemplo n.º 11
0
 def test_stuff_3letter(self):
     """Exercise membership, listing and prefix queries on a tiny trie."""
     trie = DTrie()
     self.assertFalse(trie.isWord('apple'))
     # An AssertionError from adding the empty string is tolerated here.
     try:
         trie.add('')
     except AssertionError:
         pass
     actual_words = ['a', 'ab', 'abc', 'bbc']
     for word in actual_words:
         trie.add(word)
     for word in actual_words:
         self.assertTrue(trie.isWord(word))
     listed = sorted(trie.getAllWords())
     self.assertEqual(listed, sorted(actual_words))
     self.assertEqual(trie.getAllWordsPrefix('ab'), ['ab', 'abc'])
 def test_trie_counts_and_prefix(self):
     """Repeated insertions raise a word's count without listing it twice."""
     trie = DTrie()
     actual_words = ['a', 'ab', 'abc', 'abc', 'bbc']
     for word in actual_words:
         trie.add(word)
     for word in actual_words:
         self.assertTrue(trie.isWord(word))
     # 'abc' was inserted twice, so five insertions give four listed words.
     self.assertEqual(len(trie.getAllWords()), 4)
     self.assertEqual(trie.getAllWordsPrefix('ab'), ['ab', 'abc'])
     self.assertEqual(trie.getWordCount('abc'), 2)

     # Second scenario on a fresh trie: check per-word counts.
     trie = DTrie()
     for word in ['foo', 'bar', 'bar', 'baz']:
         trie.add(word)
     counts = tuple(trie.getWordCount(word) for word in ('bar', 'baz', 'foo'))
     self.assertEqual(counts, (2, 1, 1))
Ejemplo n.º 13
0
 def test_stuff_3letter(self):
     """Exercise membership, listing and prefix queries on a tiny trie."""
     trie = DTrie()
     self.assertFalse(trie.isWord("apple"))
     # An AssertionError from adding the empty string is tolerated here.
     try:
         trie.add("")
     except AssertionError:
         pass
     actual_words = ["a", "ab", "abc", "bbc"]
     for word in actual_words:
         trie.add(word)
     for word in actual_words:
         self.assertTrue(trie.isWord(word))
     listed = sorted(trie.getAllWords())
     self.assertEqual(listed, sorted(actual_words))
     self.assertEqual(trie.getAllWordsPrefix("ab"), ["ab", "abc"])
 def test_stuff_3letter(self):
     """Exercise membership, listing and prefix queries on a tiny trie."""
     trie = DTrie()
     self.assertFalse(trie.isWord('apple'))
     # An AssertionError from adding the empty string is tolerated here.
     try:
         trie.add('')
     except AssertionError:
         pass
     actual_words = ['a', 'ab', 'abc', 'bbc']
     for word in actual_words:
         trie.add(word)
     for word in actual_words:
         self.assertTrue(trie.isWord(word))
     listed = sorted(trie.getAllWords())
     self.assertEqual(listed, sorted(actual_words))
     self.assertEqual(trie.getAllWordsPrefix('ab'), ['ab', 'abc'])
Ejemplo n.º 15
0
 def test_count(self):
     """Counting a missing word must fail; stored words report their counts."""
     trie = DTrie()
     saw_missing_error = False
     try:
         self.assertEqual(trie.getWordCount('foo'), 0)
     except Exception as err:
         # Only an error whose text names the missing word counts as success.
         saw_missing_error = "does not exist in Trie" in str(err)
     self.assertTrue(saw_missing_error)

     trie = DTrie()
     for word in ['foo', 'bar', 'bar', 'bar', 'baz']:
         trie.add(word)
     for word, expected in (('bar', 3), ('foo', 1), ('baz', 1)):
         self.assertEqual(trie.getWordCount(word), expected)
Ejemplo n.º 16
0
 def test_load_dictionary(self):
     """Load the 'tamilvu' word file and check total and per-prefix sizes."""
     trie = DTrie()
     trie.loadWordFile(DICTIONARY_DATA_FILES['tamilvu'])
     self.assertEqual(len(trie.getAllWords()), 63896)
     # The iterable accessor must yield as many words as the list accessor.
     count = sum(1 for _ in trie.getAllWordsIterable())
     self.assertEqual(count, 63896)
     words = trie.getAllWordsPrefix(u'பெரு')
     print(len(words))
     self.assertEqual(len(words), 215)
 def test_load_dictionary(self):
     """Load the 'tamilvu' word file and check total and per-prefix sizes."""
     trie = DTrie()
     trie.loadWordFile(DICTIONARY_DATA_FILES['tamilvu'])
     self.assertEqual(len(trie.getAllWords()), 63896)
     # The iterable accessor must yield as many words as the list accessor.
     count = sum(1 for _ in trie.getAllWordsIterable())
     self.assertEqual(count, 63896)
     words = trie.getAllWordsPrefix(u'பெரு')
     print(len(words))
     self.assertEqual(len(words), 215)
 def test_count(self):
     """Counting a missing word must fail; stored words report their counts."""
     trie = DTrie()
     saw_missing_error = False
     try:
         self.assertEqual(trie.getWordCount('foo'), 0)
     except Exception as err:
         # Only an error whose text names the missing word counts as success.
         saw_missing_error = "does not exist in Trie" in str(err)
     self.assertTrue(saw_missing_error)

     trie = DTrie()
     for word in ['foo', 'bar', 'bar', 'bar', 'baz']:
         trie.add(word)
     for word, expected in (('bar', 3), ('foo', 1), ('baz', 1)):
         self.assertEqual(trie.getWordCount(word), expected)
Ejemplo n.º 19
0
 def test_trie_counts_and_prefix(self):
     """Repeated insertions raise a word's count without listing it twice."""
     trie = DTrie()
     actual_words = ['a', 'ab', 'abc', 'abc', 'bbc']
     for word in actual_words:
         trie.add(word)
     for word in actual_words:
         self.assertTrue(trie.isWord(word))
     # 'abc' was inserted twice, so five insertions give four listed words.
     self.assertEqual(len(trie.getAllWords()), 4)
     self.assertEqual(trie.getAllWordsPrefix('ab'), ['ab', 'abc'])
     self.assertEqual(trie.getWordCount('abc'), 2)

     # Second scenario on a fresh trie: check per-word counts.
     trie = DTrie()
     for word in ['foo', 'bar', 'bar', 'baz']:
         trie.add(word)
     counts = tuple(trie.getWordCount(word) for word in ('bar', 'baz', 'foo'))
     self.assertEqual(counts, (2, 1, 1))
    def test_ngram(self):
        """Count letter bigrams in the Gettysburg text and compare to known totals.

        Every line of ``data/gettysburg.txt`` has its whitespace removed and is
        lower-cased; each adjacent pair of alphabetic characters is added to a
        trie. Because whitespace is removed first, bigrams may span word
        boundaries, but never lines: the previous character is reset per line.
        """
        with codecs.open("data/gettysburg.txt", "r", "utf-8") as f:
            data = f.readlines()
        obj = DTrie()

        # driver for file data
        for line in data:
            prev_letter = u''
            # Raw string: "\s" inside a plain literal is an invalid escape
            # sequence.
            for char in u"".join(re.split(r'\s+', line)).lower():
                if prev_letter.isalpha() and char.isalpha():
                    obj.add(u"".join([prev_letter, char]))
                # update previous
                prev_letter = char

        # Expected bigram -> occurrence count, grouped by first letter.
        expected = {
            u"ab": 1, u"ac": 2, u"ad": 6, u"af": 1, u"ag": 3, u"ai": 2, u"ak": 1, u"al": 9,
            u"an": 15, u"ap": 1, u"ar": 10, u"as": 5, u"at": 36, u"au": 1, u"av": 8, u"ay": 1,
            u"ba": 1, u"be": 5, u"bi": 1, u"bl": 1, u"bo": 1, u"br": 2, u"bu": 3, u"by": 1,
            u"ca": 12, u"ce": 4, u"ch": 2, u"ci": 1, u"co": 7, u"cr": 4, u"ct": 1,
            u"da": 1, u"dd": 7, u"de": 14, u"dh": 2, u"di": 12, u"do": 4, u"dp": 1, u"dr": 1,
            u"ds": 3, u"dt": 3, u"du": 1, u"dv": 1, u"dw": 2,
            u"ea": 16, u"eb": 1, u"ec": 8, u"ed": 25, u"ee": 3, u"ef": 4, u"eg": 3, u"eh": 6,
            u"ei": 4, u"el": 4, u"em": 5, u"en": 9, u"eo": 5, u"ep": 4, u"eq": 1, u"er": 22,
            u"es": 9, u"et": 13, u"eu": 2, u"ev": 4, u"ew": 4, u"ey": 3,
            u"fa": 3, u"fd": 1, u"ff": 1, u"fi": 5, u"fo": 10, u"fr": 3, u"ft": 3, u"fu": 1,
            u"ga": 6, u"gb": 1, u"ge": 6, u"gg": 1, u"gh": 4, u"gl": 1, u"go": 3, u"gp": 1,
            u"gr": 5, u"gw": 1,
            u"ha": 24, u"he": 33, u"hf": 1, u"hi": 7, u"hl": 1, u"ho": 8, u"ht": 5, u"hu": 1,
            u"ib": 1, u"ic": 8, u"id": 1, u"ie": 3, u"ig": 2, u"il": 2, u"in": 16, u"io": 9,
            u"ir": 2, u"is": 9, u"it": 8, u"iv": 7,
            u"ke": 1, u"kr": 1, u"kw": 1,
            u"la": 3, u"ld": 4, u"le": 6, u"lh": 1, u"li": 6, u"ll": 8, u"lm": 2, u"ln": 2,
            u"lo": 3, u"lr": 1, u"lt": 1, u"lv": 1, u"lw": 1, u"ly": 2,
            u"ma": 1, u"mb": 1, u"me": 7, u"mi": 1, u"mt": 1,
            u"na": 10, u"nc": 4, u"nd": 9, u"ne": 4, u"nf": 1, u"ng": 9, u"ni": 2, u"nl": 2,
            u"nm": 1, u"nn": 4, u"no": 11, u"ns": 4, u"nt": 6, u"nv": 1, u"ny": 2,
            u"oa": 1, u"ob": 3, u"oc": 1, u"od": 3, u"of": 6, u"og": 1, u"oh": 1, u"ol": 1,
            u"om": 4, u"on": 20, u"oo": 2, u"op": 5, u"or": 17, u"os": 3, u"ot": 13, u"ou": 7,
            u"ov": 2, u"ow": 3,
            u"pe": 5, u"pl": 4, u"po": 4, u"pr": 2,
            u"qu": 1,
            u"ra": 7, u"rd": 1, u"re": 27, u"rf": 3, u"rg": 4, u"ri": 1, u"rk": 1, u"rl": 3,
            u"rn": 1, u"ro": 6, u"rp": 2, u"rs": 5, u"rt": 10, u"ru": 3, u"rw": 2,
            u"sa": 4, u"sb": 2, u"sc": 2, u"se": 10, u"sf": 2, u"sg": 1, u"sh": 6, u"si": 1,
            u"sk": 1, u"sn": 1, u"so": 4, u"sr": 1, u"ss": 1, u"st": 7, u"su": 1,
            u"ta": 2, u"tb": 1, u"tc": 4, u"td": 1, u"te": 11, u"tf": 3, u"th": 47, u"ti": 17,
            u"tl": 3, u"tn": 2, u"to": 11, u"tp": 1, u"tr": 2, u"tt": 9, u"tw": 5, u"ty": 2,
            u"ua": 1, u"ug": 3, u"ul": 2, u"un": 3, u"ur": 6, u"us": 5, u"ut": 2,
            u"va": 2, u"ve": 17, u"vi": 3, u"vo": 2,
            u"wa": 2, u"wb": 1, u"we": 11, u"wh": 8, u"wi": 1, u"wn": 1, u"wo": 2, u"ww": 1,
            u"ya": 1, u"yd": 1, u"ye": 1, u"yg": 1, u"yh": 1, u"yn": 1, u"yr": 1, u"ys": 1,
            u"yt": 1, u"yw": 1,
        }

        actual = obj.getAllWordsAndCount()
        self.assertEqual(expected, actual)
Ejemplo n.º 21
0
    def test_ngram(self):
        """Count letter bigrams in the Gettysburg text and compare to known totals.

        Every line of ``data/gettysburg.txt`` has its whitespace removed and is
        lower-cased; each adjacent pair of alphabetic characters is added to a
        trie. Because whitespace is removed first, bigrams may span word
        boundaries, but never lines: the previous character is reset per line.
        """
        with codecs.open("data/gettysburg.txt", "r", "utf-8") as f:
            data = f.readlines()
        obj = DTrie()

        # driver for file data
        for line in data:
            prev_letter = u''
            # Raw string: "\s" inside a plain literal is an invalid escape
            # sequence.
            for char in u"".join(re.split(r'\s+', line)).lower():
                if prev_letter.isalpha() and char.isalpha():
                    obj.add(u"".join([prev_letter, char]))
                # update previous
                prev_letter = char

        # Expected bigram -> occurrence count, grouped by first letter.
        expected = {
            u"ab": 1, u"ac": 2, u"ad": 6, u"af": 1, u"ag": 3, u"ai": 2, u"ak": 1, u"al": 9,
            u"an": 15, u"ap": 1, u"ar": 10, u"as": 5, u"at": 36, u"au": 1, u"av": 8, u"ay": 1,
            u"ba": 1, u"be": 5, u"bi": 1, u"bl": 1, u"bo": 1, u"br": 2, u"bu": 3, u"by": 1,
            u"ca": 12, u"ce": 4, u"ch": 2, u"ci": 1, u"co": 7, u"cr": 4, u"ct": 1,
            u"da": 1, u"dd": 7, u"de": 14, u"dh": 2, u"di": 12, u"do": 4, u"dp": 1, u"dr": 1,
            u"ds": 3, u"dt": 3, u"du": 1, u"dv": 1, u"dw": 2,
            u"ea": 16, u"eb": 1, u"ec": 8, u"ed": 25, u"ee": 3, u"ef": 4, u"eg": 3, u"eh": 6,
            u"ei": 4, u"el": 4, u"em": 5, u"en": 9, u"eo": 5, u"ep": 4, u"eq": 1, u"er": 22,
            u"es": 9, u"et": 13, u"eu": 2, u"ev": 4, u"ew": 4, u"ey": 3,
            u"fa": 3, u"fd": 1, u"ff": 1, u"fi": 5, u"fo": 10, u"fr": 3, u"ft": 3, u"fu": 1,
            u"ga": 6, u"gb": 1, u"ge": 6, u"gg": 1, u"gh": 4, u"gl": 1, u"go": 3, u"gp": 1,
            u"gr": 5, u"gw": 1,
            u"ha": 24, u"he": 33, u"hf": 1, u"hi": 7, u"hl": 1, u"ho": 8, u"ht": 5, u"hu": 1,
            u"ib": 1, u"ic": 8, u"id": 1, u"ie": 3, u"ig": 2, u"il": 2, u"in": 16, u"io": 9,
            u"ir": 2, u"is": 9, u"it": 8, u"iv": 7,
            u"ke": 1, u"kr": 1, u"kw": 1,
            u"la": 3, u"ld": 4, u"le": 6, u"lh": 1, u"li": 6, u"ll": 8, u"lm": 2, u"ln": 2,
            u"lo": 3, u"lr": 1, u"lt": 1, u"lv": 1, u"lw": 1, u"ly": 2,
            u"ma": 1, u"mb": 1, u"me": 7, u"mi": 1, u"mt": 1,
            u"na": 10, u"nc": 4, u"nd": 9, u"ne": 4, u"nf": 1, u"ng": 9, u"ni": 2, u"nl": 2,
            u"nm": 1, u"nn": 4, u"no": 11, u"ns": 4, u"nt": 6, u"nv": 1, u"ny": 2,
            u"oa": 1, u"ob": 3, u"oc": 1, u"od": 3, u"of": 6, u"og": 1, u"oh": 1, u"ol": 1,
            u"om": 4, u"on": 20, u"oo": 2, u"op": 5, u"or": 17, u"os": 3, u"ot": 13, u"ou": 7,
            u"ov": 2, u"ow": 3,
            u"pe": 5, u"pl": 4, u"po": 4, u"pr": 2,
            u"qu": 1,
            u"ra": 7, u"rd": 1, u"re": 27, u"rf": 3, u"rg": 4, u"ri": 1, u"rk": 1, u"rl": 3,
            u"rn": 1, u"ro": 6, u"rp": 2, u"rs": 5, u"rt": 10, u"ru": 3, u"rw": 2,
            u"sa": 4, u"sb": 2, u"sc": 2, u"se": 10, u"sf": 2, u"sg": 1, u"sh": 6, u"si": 1,
            u"sk": 1, u"sn": 1, u"so": 4, u"sr": 1, u"ss": 1, u"st": 7, u"su": 1,
            u"ta": 2, u"tb": 1, u"tc": 4, u"td": 1, u"te": 11, u"tf": 3, u"th": 47, u"ti": 17,
            u"tl": 3, u"tn": 2, u"to": 11, u"tp": 1, u"tr": 2, u"tt": 9, u"tw": 5, u"ty": 2,
            u"ua": 1, u"ug": 3, u"ul": 2, u"un": 3, u"ur": 6, u"us": 5, u"ut": 2,
            u"va": 2, u"ve": 17, u"vi": 3, u"vo": 2,
            u"wa": 2, u"wb": 1, u"we": 11, u"wh": 8, u"wi": 1, u"wn": 1, u"wo": 2, u"ww": 1,
            u"ya": 1, u"yd": 1, u"ye": 1, u"yg": 1, u"yh": 1, u"yn": 1, u"yr": 1, u"ys": 1,
            u"yt": 1, u"yw": 1,
        }

        actual = obj.getAllWordsAndCount()
        self.assertEqual(expected, actual)
Ejemplo n.º 22
0
    def test_ngram(self):
        """Count letter bigrams in the Gettysburg text and compare to known totals.

        Every line of ``data/gettysburg.txt`` has its whitespace removed and is
        lower-cased; each adjacent pair of alphabetic characters is added to a
        trie. Because whitespace is removed first, bigrams may span word
        boundaries, but never lines: the previous character is reset per line.
        """
        with codecs.open("data/gettysburg.txt", "r", "utf-8") as f:
            data = f.readlines()
        obj = DTrie()

        # driver for file data
        for line in data:
            prev_letter = u""
            # Raw string: "\s" inside a plain literal is an invalid escape
            # sequence.
            for char in u"".join(re.split(r"\s+", line)).lower():
                if prev_letter.isalpha() and char.isalpha():
                    obj.add(u"".join([prev_letter, char]))
                # update previous
                prev_letter = char

        # Expected bigram -> occurrence count, grouped by first letter.
        expected = {
            u"ab": 1, u"ac": 2, u"ad": 6, u"af": 1, u"ag": 3, u"ai": 2, u"ak": 1, u"al": 9,
            u"an": 15, u"ap": 1, u"ar": 10, u"as": 5, u"at": 36, u"au": 1, u"av": 8, u"ay": 1,
            u"ba": 1, u"be": 5, u"bi": 1, u"bl": 1, u"bo": 1, u"br": 2, u"bu": 3, u"by": 1,
            u"ca": 12, u"ce": 4, u"ch": 2, u"ci": 1, u"co": 7, u"cr": 4, u"ct": 1,
            u"da": 1, u"dd": 7, u"de": 14, u"dh": 2, u"di": 12, u"do": 4, u"dp": 1, u"dr": 1,
            u"ds": 3, u"dt": 3, u"du": 1, u"dv": 1, u"dw": 2,
            u"ea": 16, u"eb": 1, u"ec": 8, u"ed": 25, u"ee": 3, u"ef": 4, u"eg": 3, u"eh": 6,
            u"ei": 4, u"el": 4, u"em": 5, u"en": 9, u"eo": 5, u"ep": 4, u"eq": 1, u"er": 22,
            u"es": 9, u"et": 13, u"eu": 2, u"ev": 4, u"ew": 4, u"ey": 3,
            u"fa": 3, u"fd": 1, u"ff": 1, u"fi": 5, u"fo": 10, u"fr": 3, u"ft": 3, u"fu": 1,
            u"ga": 6, u"gb": 1, u"ge": 6, u"gg": 1, u"gh": 4, u"gl": 1, u"go": 3, u"gp": 1,
            u"gr": 5, u"gw": 1,
            u"ha": 24, u"he": 33, u"hf": 1, u"hi": 7, u"hl": 1, u"ho": 8, u"ht": 5, u"hu": 1,
            u"ib": 1, u"ic": 8, u"id": 1, u"ie": 3, u"ig": 2, u"il": 2, u"in": 16, u"io": 9,
            u"ir": 2, u"is": 9, u"it": 8, u"iv": 7,
            u"ke": 1, u"kr": 1, u"kw": 1,
            u"la": 3, u"ld": 4, u"le": 6, u"lh": 1, u"li": 6, u"ll": 8, u"lm": 2, u"ln": 2,
            u"lo": 3, u"lr": 1, u"lt": 1, u"lv": 1, u"lw": 1, u"ly": 2,
            u"ma": 1, u"mb": 1, u"me": 7, u"mi": 1, u"mt": 1,
            u"na": 10, u"nc": 4, u"nd": 9, u"ne": 4, u"nf": 1, u"ng": 9, u"ni": 2, u"nl": 2,
            u"nm": 1, u"nn": 4, u"no": 11, u"ns": 4, u"nt": 6, u"nv": 1, u"ny": 2,
            u"oa": 1, u"ob": 3, u"oc": 1, u"od": 3, u"of": 6, u"og": 1, u"oh": 1, u"ol": 1,
            u"om": 4, u"on": 20, u"oo": 2, u"op": 5, u"or": 17, u"os": 3, u"ot": 13, u"ou": 7,
            u"ov": 2, u"ow": 3,
            u"pe": 5, u"pl": 4, u"po": 4, u"pr": 2,
            u"qu": 1,
            u"ra": 7, u"rd": 1, u"re": 27, u"rf": 3, u"rg": 4, u"ri": 1, u"rk": 1, u"rl": 3,
            u"rn": 1, u"ro": 6, u"rp": 2, u"rs": 5, u"rt": 10, u"ru": 3, u"rw": 2,
            u"sa": 4, u"sb": 2, u"sc": 2, u"se": 10, u"sf": 2, u"sg": 1, u"sh": 6, u"si": 1,
            u"sk": 1, u"sn": 1, u"so": 4, u"sr": 1, u"ss": 1, u"st": 7, u"su": 1,
            u"ta": 2, u"tb": 1, u"tc": 4, u"td": 1, u"te": 11, u"tf": 3, u"th": 47, u"ti": 17,
            u"tl": 3, u"tn": 2, u"to": 11, u"tp": 1, u"tr": 2, u"tt": 9, u"tw": 5, u"ty": 2,
            u"ua": 1, u"ug": 3, u"ul": 2, u"un": 3, u"ur": 6, u"us": 5, u"ut": 2,
            u"va": 2, u"ve": 17, u"vi": 3, u"vo": 2,
            u"wa": 2, u"wb": 1, u"we": 11, u"wh": 8, u"wi": 1, u"wn": 1, u"wo": 2, u"ww": 1,
            u"ya": 1, u"yd": 1, u"ye": 1, u"yg": 1, u"yh": 1, u"yn": 1, u"yr": 1, u"ys": 1,
            u"yt": 1, u"yw": 1,
        }

        actual = obj.getAllWordsAndCount()
        self.assertEqual(expected, actual)
Ejemplo n.º 23
0
 def test_trie_counts_and_prefix(self):
     """Repeated insertions raise a word's count without listing it twice."""
     trie = DTrie()
     actual_words = ["a", "ab", "abc", "abc", "bbc"]
     for word in actual_words:
         trie.add(word)
     for word in actual_words:
         self.assertTrue(trie.isWord(word))
     # "abc" was inserted twice, so five insertions give four listed words.
     self.assertEqual(len(trie.getAllWords()), 4)
     self.assertEqual(trie.getAllWordsPrefix("ab"), ["ab", "abc"])
     self.assertEqual(trie.getWordCount("abc"), 2)

     # Second scenario on a fresh trie: check per-word counts.
     trie = DTrie()
     for word in ["foo", "bar", "bar", "baz"]:
         trie.add(word)
     counts = tuple(trie.getWordCount(word) for word in ("bar", "baz", "foo"))
     self.assertEqual(counts, (2, 1, 1))
Ejemplo n.º 24
0
 def test_trie_neg(self):
     """An empty trie lists nothing and does not recognise an unknown word."""
     trie = DTrie()
     self.assertEqual(trie.getAllWords(), [])
     self.assertEqual(trie.getAllWordsPrefix('none'), [])
     # Called with a second argument of True, isWord returns an indexable
     # result: item 0 is expected to be falsy and item 1 truthy.
     first_item = trie.isWord('fubar', True)[0]
     self.assertFalse(first_item)
     second_item = trie.isWord('fubar', True)[1]
     self.assertTrue(second_item)
 def test_trie_neg(self):
     """An empty trie lists nothing and does not recognise an unknown word."""
     trie = DTrie()
     self.assertEqual(trie.getAllWords(), [])
     self.assertEqual(trie.getAllWordsPrefix('none'), [])
     # Called with a second argument of True, isWord returns an indexable
     # result: item 0 is expected to be falsy and item 1 truthy.
     first_item = trie.isWord('fubar', True)[0]
     self.assertFalse(first_item)
     second_item = trie.isWord('fubar', True)[1]
     self.assertTrue(second_item)
Ejemplo n.º 26
0
 def test_words_n_count(self):
     """getAllWordsAndCount maps each distinct word to its insertion count."""
     trie = DTrie()
     for word in ["foo", "bar", "bar", "bar", "baz"]:
         trie.add(word)
     expected_counts = {"foo": 1, "bar": 3, "baz": 1}
     self.assertEqual(trie.getAllWordsAndCount(), expected_counts)
 def test_words_n_count(self):
     """getAllWordsAndCount maps each distinct word to its insertion count."""
     trie = DTrie()
     for word in ['foo', 'bar', 'bar', 'bar', 'baz']:
         trie.add(word)
     expected_counts = {'foo': 1, 'bar': 3, 'baz': 1}
     self.assertEqual(trie.getAllWordsAndCount(), expected_counts)
Ejemplo n.º 28
0
 def test_words_n_count(self):
     """getAllWordsAndCount maps each distinct word to its insertion count."""
     trie = DTrie()
     for word in ['foo', 'bar', 'bar', 'bar', 'baz']:
         trie.add(word)
     expected_counts = {'foo': 1, 'bar': 3, 'baz': 1}
     self.assertEqual(trie.getAllWordsAndCount(), expected_counts)