Example #1
    def index(self):
        db_access_info = pg_utils.get_db_access_info()
        with psycopg2.connect(database=db_access_info[0],
                              user=db_access_info[1],
                              password=db_access_info[2]) as connection:

            with connection.cursor() as cursor:

                word_centre_letter_at_random_query = \
                    ' '.join((
                        "SELECT * FROM pgt_board",
                        "ORDER BY RANDOM() LIMIT 2",
                    ))

                cursor.execute(word_centre_letter_at_random_query)
                word_centre_letter = cursor.fetchall()  # list of 2 tuples (LIMIT 2)

        connection.close()  # the with block ends the transaction but does not close the connection
        word_for_board = ''.join(word_centre_letter[0][0])
        centre_letter_for_board = ''.join(word_centre_letter[0][1])
        koru = pk.get_koru(word_for_board, centre_letter_for_board)
        word_for_display = ''.join(word_centre_letter[1][0])
        template_id = 'index'
        template = self.env.get_template(template_id + '.html')
        return template.render({"template_id": template_id,
                                "word": word_for_display,
                                "koru": koru})
Example #2
def distribute_children():
    '''
    For each (word, middle letter) combination (board),
    distribute the child words into groups according to frequency,
    then drop any board that does not have 'enough' words
    in each of the 3 groups.
    '''

    # get the word frequency data
    db_access_info = pg_utils.get_db_access_info()
    with psycopg2.connect(database=db_access_info[0],
                          user=db_access_info[1],
                          password=db_access_info[2]) as connection:

        with connection.cursor() as cursor:
            all_word_frequency_data_query = "SELECT * FROM pgt_word_frequency"
            cursor.execute(all_word_frequency_data_query)
            word_frequency_pairs = cursor.fetchall()  # list of (word, kount) tuples
            words = [x[0] for x in word_frequency_pairs]

    boards = []
    for k, v in boards_and_children.boards_and_children.items():
        child_word_frequencies = []
        for child_word in v:
            try:
                child_word_index = words.index(child_word)
            except ValueError:
                # the headword in the dictionary doesn't appear
                # anywhere in the example text
                child_word_score = 0
            else:
                child_word_score = word_frequency_pairs[child_word_index][1]

            child_word_frequencies.append(child_word_score)

        intervals = [frozenset(range(PAI_MINIMUM_FREQUENCY, VERY_LARGE_NUMBER)),
                     frozenset(range(TINO_PAI_MINIMUM_FREQUENCY, PAI_MINIMUM_FREQUENCY)),
                     frozenset(range(TINO_PAI_RAWA_ATU_MINIMUM_FREQUENCY,
                                     TINO_PAI_MINIMUM_FREQUENCY))]
        counts = [0] * len(intervals)

        for n in sorted(child_word_frequencies):
            for i, inter in enumerate(intervals):
                if n in inter:
                    counts[i] += 1

        # keep the board only if every frequency band has enough child words
        if all(count >= MINIMUM_GROUP_SIZE for count in counts):
            boards.append(k)

    return boards
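
The thresholds above (PAI_MINIMUM_FREQUENCY and friends) are assumed to be module-level constants whose definitions are not shown. Judging by the literal ranges in Example #3, they would look roughly like this (values inferred, not confirmed):

PAI_MINIMUM_FREQUENCY = 24                # inferred from range(24, 100000) in Example #3
TINO_PAI_MINIMUM_FREQUENCY = 4            # inferred from range(4, 24)
TINO_PAI_RAWA_ATU_MINIMUM_FREQUENCY = 0   # inferred from range(4), i.e. range(0, 4)
VERY_LARGE_NUMBER = 100000                # inferred upper bound of the top band
MINIMUM_GROUP_SIZE = 4                    # inferred from the counts[i] < 4 checks in Example #3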
Example #3
def get_word_frequency_distribution():
    '''
    For each (word, middle letter) combination (board),
    split the child-word frequency data into groups.
    '''

    # get the word frequency data
    db_access_info = pg_utils.get_db_access_info()
    with psycopg2.connect(database=db_access_info[0],
                          user=db_access_info[1],
                          password=db_access_info[2]) as connection:

        with connection.cursor() as cursor:
            all_word_frequency_data_query = "SELECT * FROM pgt_word_frequency"
            cursor.execute(all_word_frequency_data_query)
            word_frequency_pairs = cursor.fetchall()  # list of (word, kount) tuples
            words = [x[0] for x in word_frequency_pairs]

    scores = {}
    qualifying_board_count = 0  # running count of boards with enough words in every band
    for k, v in boards_and_children.boards_and_children.items():
        child_word_frequencies = []
        for child_word in v:
            try:
                child_word_index = words.index(child_word)
            except ValueError:
                # the headword in the dictionary doesn't appear
                # anywhere in the example text
                child_word_score = 0
            else:
                child_word_score = word_frequency_pairs[child_word_index][1]

            child_word_frequencies.append(child_word_score)

        scores[k] = sorted(child_word_frequencies)

        intervals = [frozenset(range(24, 100000)),
                     frozenset(range(4, 24)),
                     frozenset(range(4))]
        counts = [0] * len(intervals)

        for n in sorted(child_word_frequencies):
            for i, inter in enumerate(intervals):
                if n in inter:
                    counts[i] += 1

        if all(count >= 4 for count in counts):
            qualifying_board_count += 1
            print(k, qualifying_board_count, counts[0], counts[1], counts[2])
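
The frozenset(range(...)) membership test only works because the frequency scores are integers, and it rebuilds a set of roughly 100,000 elements for every board. A sketch of an equivalent banding using plain comparisons (assuming all scores are non-negative integers below 100000):

counts = [0, 0, 0]
for n in child_word_frequencies:
    if n >= 24:        # pai band (see Example #4 for the band names)
        counts[0] += 1
    elif n >= 4:       # tino pai band
        counts[1] += 1
    else:              # tino pai rawa atu band
        counts[2] += 1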
Example #4
def group_children(children):
    '''
    This function takes a list of children and sorts them into
    three lists of (word, frequency) tuples:
    - pai
    - tino pai
    - tino pai rawa atu
    '''
    # get the word frequency data
    db_access_info = pg_utils.get_db_access_info()
    with psycopg2.connect(database=db_access_info[0],
                          user=db_access_info[1],
                          password=db_access_info[2]) as connection:

        with connection.cursor() as cursor:
            word_frequency_pairs = []
            for child in children:
                frequency_data_query = \
                    ' '.join((
                        "SELECT kount FROM pgt_word_frequency",
                        "WHERE word = (%s)",                 
                    ))

                cursor.execute(frequency_data_query, (child,))
                frequency = cursor.fetchall()  # list of 1 tuple (assumed)
                if frequency == []:
                    # child word not found in word frequency list
                    frequency_to_use = 0
                else:
                    frequency_to_use = int(frequency[0][0])
                word_frequency_pairs.append((child, frequency_to_use))

    intervals = [frozenset(range(PAI_MINIMUM_FREQUENCY, VERY_LARGE_NUMBER)),
                 frozenset(range(TINO_PAI_MINIMUM_FREQUENCY, PAI_MINIMUM_FREQUENCY)),
                 frozenset(range(TINO_PAI_RAWA_ATU_MINIMUM_FREQUENCY,
                                 TINO_PAI_MINIMUM_FREQUENCY))]

    pai = []
    tino_pai = []
    tino_pai_rawa_atu = []

    for word, frequency in word_frequency_pairs:
        if frequency in intervals[0]:
            pai.append((word, frequency))
        elif frequency in intervals[1]:
            tino_pai.append((word, frequency))
        elif frequency in intervals[2]:
            tino_pai_rawa_atu.append((word, frequency))
        else:
            # the frequency fell outside every band, which should not happen
            raise ValueError(f"unexpected frequency {frequency} for word {word!r}")
    return pai, tino_pai, tino_pai_rawa_atu
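
A hedged usage sketch: the child words below are made up for illustration; in the real flow they would come from get_children() (Example #5).

children = ['aroha', 'aro', 'oha']  # hypothetical child words
pai, tino_pai, tino_pai_rawa_atu = group_children(children)
for word, frequency in pai:
    print(word, frequency)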
Example #5
def get_children(input_string, compulsory_letter, minimum_length=3):
    '''
    Returns a list containing all the word forms (children)
    that can be made from the input_string.

    The input string can take one of two forms:
    a) A Māori word
    b) A Koru

    If the latter then any digraphs on the last row need to be reversed.
    '''

    # if minimum_length was passed as a string, try to convert it to an integer
    minimum_length = int(minimum_length)

    children = []

    # if the input string contains any reversed digraphs, reverse them
    # note that this can only happen with a koru on the last line
    # the two checks below are mutually exclusive and will not interfere with each other

    # swap digraphs around if necessary
    if input_string[6] + input_string[5] in pū.digraphs:
        input_string = list(input_string)
        input_string[6], input_string[5] = input_string[5], input_string[6]
        input_string = ''.join(input_string)
    elif input_string[5] + input_string[4] in pū.digraphs:
        input_string = list(input_string)
        input_string[5], input_string[4] = input_string[4], input_string[5]
        input_string = ''.join(input_string)
    else:
        pass  # no action required as there are no reversed digraphs

    input_string_as_list = mw._aslist(input_string)

    # get the word list
    db_access_info = pg_utils.get_db_access_info()
    with psycopg2.connect(database=db_access_info[0],
                          user=db_access_info[1],
                          password=db_access_info[2]) as connection:

        with connection.cursor() as cursor:

            all_word_forms_query = "SELECT * FROM pgt_word"
            cursor.execute(all_word_forms_query)
            unique_word_forms = cursor.fetchall()  # list of tuples

    connection.close()

    # list of strings
    unique_word_forms = [''.join(x) for x in unique_word_forms]
    for word in [x for x in unique_word_forms if len(x) >= minimum_length]:

        word_as_list = mw._aslist(word)

        is_child = False
        if not (Counter(word_as_list) - Counter(input_string_as_list)):
            is_child = True

        if is_child and compulsory_letter in word_as_list:
            children.append(word)

    return children
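
get_children() relies on mw._aslist() to split a word into letters with the digraphs 'ng' and 'wh' counted as single letters (see the letter counts in Example #6). The real helper is not shown; a minimal sketch of the assumed behaviour:

def _aslist(word):
    # hypothetical stand-in for mw._aslist: split a Māori word into letters,
    # treating the digraphs 'ng' and 'wh' as single letters
    letters = []
    i = 0
    while i < len(word):
        if word[i:i + 2] in ('ng', 'wh'):
            letters.append(word[i:i + 2])
            i += 2
        else:
            letters.append(word[i])
            i += 1
    return letters

# e.g. _aslist('whakaaro') -> ['wh', 'a', 'k', 'a', 'a', 'r', 'o']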
Example #6
def test_pangakupu_words():

    db_access_info = pg_utils.get_db_access_info()
    with psycopg2.connect(database=db_access_info[0],
                          user=db_access_info[1],
                          password=db_access_info[2]) as connection:

        with connection.cursor() as cursor:

            all_word_forms_query = "SELECT * FROM pgt_word"
            cursor.execute(all_word_forms_query)
            unique_word_forms = cursor.fetchall()  # list of tuples

    connection.close()
    all_words_for_iwa = [''.join(x) for x in unique_word_forms]  # list of strings

    #word counts
    assert len(all_words_for_iwa) == 11601
    c = Counter(len(x) for x in all_words_for_iwa)
    assert dict(c) == {1: 9,
                       2: 57,
                       3: 255,
                       4: 1099,
                       5: 1169,
                       6: 2691,
                       7: 1568,
                       8: 1949,
                       9: 830,
                       10: 971,
                       11: 451,
                       12: 279,
                       13: 164,
                       14: 54,
                       15: 35,
                       16: 10,
                       17: 6,
                       18: 3,
                       19: 1}

    assert sum(dict(c).values()) == 11601 #recheck the count
    assert sum([k * v for k, v in dict(c).items()]) == 83080 #letter counts
    assert len(set(all_words_for_iwa)) == 11601 #test for uniqueness
    
    #check every entry is lower case
    assert all(x.lower() == x for x in all_words_for_iwa)

    #check every entry is free of punctuation
    assert all(mw._isalllegalletters(x) for x in all_words_for_iwa)

    #check that the basics for all maori words hold
    for x in all_words_for_iwa:
        assert x == mw.MaoriWord(x).word

    #letter counts
    all_letters_for_iwa = []
    for x in all_words_for_iwa:
        all_letters_for_iwa.extend(mw._aslist(x))
    c = dict(Counter(all_letters_for_iwa))
    assert c == {'a': 14894,
                 'ā': 2252,
                 'e': 5125,
                 'ē': 281,
                 'h': 3970,
                 'i': 6765,
                 'ī': 627,
                 'k': 6882,
                 'm': 2406,
                 'n': 2002,
                 'ng': 1834,
                 'o': 5521,
                 'ō': 1216,
                 'p': 3733,
                 'r': 6270,
                 't': 5880,
                 'u': 5736,
                 'ū': 993,
                 'w': 1245,
                 'wh': 1807}

    assert sum(dict(c).values()) == 79439 #digraphs count as 1 letter

    # cross check letter counts from words vs direct letter counts:
    # each digraph occurrence adds 2 to the per-word character total (83080)
    # but only 1 to the per-letter total (79439)
    assert 83080 == 79439 + c['ng'] + c['wh']  # digraphs count as 2 letters
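
This is a pytest-style test function; assuming the module is on pytest's collection path, it can be run on its own with something like:

import pytest
pytest.main(['-q', '-k', 'test_pangakupu_words'])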