Example #1
0
def test_all_legal_characters():
    """Exercise the check that a word consists of legal characters only."""
    expectations = (
        ('aeiou', True),
        ('whakangākau', True),
        ('wakangākau', True),   # missing h but all legal characters
        ('x', False),
        ('whakagākau', False),  # missed the 'n' leaving 'g' which is illegal
    )
    for candidate, expected in expectations:
        assert mw._isalllegalletters(candidate) == expected
Example #2
0
def test_pangakupu_words():
    """Validate the contents of the ``pgt_word`` table against known totals.

    Reads every row of ``pgt_word`` and checks the number of words, the
    distribution of word lengths, uniqueness, casing, legality of the
    letters used, round-tripping through ``mw.MaoriWord`` and the
    per-letter frequencies against hard-coded expected figures.
    """
    expected_word_count = 11601
    expected_letters_from_words = 83080  # digraphs count as 2 letters
    expected_letters_direct = 79439  # digraphs count as 1 letter

    db_access_info = pg_utils.get_db_access_info()
    connection = psycopg2.connect(database=db_access_info[0],
                                  user=db_access_info[1],
                                  password=db_access_info[2])
    try:
        # psycopg2's connection context manager only ends the transaction,
        # it does not close the connection; the finally clause guarantees
        # the connection is released even when the query fails.
        with connection:
            with connection.cursor() as cursor:
                all_word_forms_query = "SELECT * FROM pgt_word"
                cursor.execute(all_word_forms_query)
                unique_word_forms = cursor.fetchall()  # list of tuples
    finally:
        connection.close()

    # Each row is a tuple; joining its fields yields one string per row.
    # NOTE(review): presumably pgt_word has a single text column - confirm.
    all_words_for_iwa = [''.join(row) for row in unique_word_forms]

    # word counts
    assert len(all_words_for_iwa) == expected_word_count
    length_counts = Counter(len(word) for word in all_words_for_iwa)
    assert dict(length_counts) == {1: 9,
                                   2: 57,
                                   3: 255,
                                   4: 1099,
                                   5: 1169,
                                   6: 2691,
                                   7: 1568,
                                   8: 1949,
                                   9: 830,
                                   10: 971,
                                   11: 451,
                                   12: 279,
                                   13: 164,
                                   14: 54,
                                   15: 35,
                                   16: 10,
                                   17: 6,
                                   18: 3,
                                   19: 1}

    # recheck the count
    assert sum(length_counts.values()) == expected_word_count
    # letter counts
    assert sum(length * count
               for length, count in length_counts.items()) == expected_letters_from_words
    # test for uniqueness
    assert len(set(all_words_for_iwa)) == expected_word_count

    # check every entry is lower case; collecting the offenders means a
    # failure report names the bad words
    not_lower_case = [word for word in all_words_for_iwa if word.lower() != word]
    assert not_lower_case == []

    # check every entry is free of punctuation (legal letters only)
    illegal_words = [word for word in all_words_for_iwa
                     if not mw._isalllegalletters(word)]
    assert illegal_words == []

    # check that the basics for all maori words hold
    for word in all_words_for_iwa:
        assert word == mw.MaoriWord(word).word

    # letter counts
    all_letters_for_iwa = [letter
                           for word in all_words_for_iwa
                           for letter in mw._aslist(word)]
    letter_counts = dict(Counter(all_letters_for_iwa))
    assert letter_counts == {'a': 14894,
                             'ā': 2252,
                             'e': 5125,
                             'ē': 281,
                             'h': 3970,
                             'i': 6765,
                             'ī': 627,
                             'k': 6882,
                             'm': 2406,
                             'n': 2002,
                             'ng': 1834,
                             'o': 5521,
                             'ō': 1216,
                             'p': 3733,
                             'r': 6270,
                             't': 5880,
                             'u': 5736,
                             'ū': 993,
                             'w': 1245,
                             'wh': 1807}

    assert sum(letter_counts.values()) == expected_letters_direct

    # cross check letter counts from words vs direct letter counts:
    # each digraph adds one extra character when counted from the words
    assert expected_letters_from_words == (expected_letters_direct
                                           + letter_counts['ng']
                                           + letter_counts['wh'])