Ejemplo n.º 1
    def __init__(self, untagged_sents, pos_tags, words_given_pos,
                 words_given_pos_upper, pos2_given_pos1, start_tag):
        """
        Construct a HMM object

        :param untagged_sents: list of untagged sentences for tagging
        :param pos_tags: list of possible POS tags
        :param words_given_pos: nltk.ConditionalFreqDist for P(Wi|Ck) with all words
            converted to lowercase
        :param words_given_pos_upper: nltk.ConditionalFreqDist for P(Wi|Ck) with
            words left in original capitalization
        :param pos2_given_pos1: nltk.ConditionalFreqDist for P(Ci+1|Ci)
        :param start_tag: start tag used to mark sentence beginning
        """
        self.start_tag = start_tag
        self.untagged_sents = untagged_sents
        # cached once so progress reporting does not recompute it
        self.num_untagged_sents = len(untagged_sents)
        self.all_pos_tags = pos_tags
        self.words_given_pos = words_given_pos
        self.words_given_pos_upper = words_given_pos_upper
        self.pos2_given_pos1 = pos2_given_pos1

        # initialize one guesser object to use for the whole test
        self.guesser = Guesser(pos_tags, words_given_pos)
Ejemplo n.º 2
def run_test_set(test_set):
    # Runs every URL (except the last one) of each log file listed under
    # test_set['test'] through a Guesser and tallies the results.
    # Returns (correct_guesses, missed_guesses, correct_count, missed_count).
    # NOTE(review): the next four lines read like the function's docstring, but
    # their quote delimiters are missing in this listing, so they are not valid
    # Python as written.
    test_set = dict: {'learn', 'test', 'id'}
            'learn' is a list of files to learn from
            'test' is a list of files to test
            'id' is an id for this test-set
    guesser = Guesser()
    # do some guessing
    correct_guesses = 0
    missed_guesses = 0
    correct_count = 0
    missed_count = 0
    for log_file in test_set['test']:
        tester_log_file = TesterLogFile(log_file)
        for idx, url in enumerate(tester_log_file.load_urls[:-1]):
            info_pairs = guesser.get_guesses(url)
            # NOTE(review): the comprehension reuses the name `url`; on Python 2
            # that rebinds the loop variable used two lines below - confirm.
            guessed_urls = [url for [url, weight] in info_pairs]
            # NOTE(review): the call that produces local_correct_count is missing
            # from this listing; only its trailing arguments survive.
            local_correct_count = \
                                                               url, idx)
            local_missed_count = len(guessed_urls) - local_correct_count
            # NOTE(review): both counters are incremented in the same branch; an
            # `else:` before the missed_guesses line appears to have been lost.
            if local_correct_count > 0:
                correct_guesses += 1
                missed_guesses += 1
            correct_count += local_correct_count
            missed_count += local_missed_count
    logging.info("Tested set {}: {} hits, {} misses, "
                "{} hit count, {} miss count"
                .format(test_set['id'], correct_guesses, missed_guesses, 
                        correct_count, missed_count))
    return correct_guesses, missed_guesses, correct_count, missed_count
Ejemplo n.º 3
 def __init__(self, untagged_sents, pos_tags, words_given_pos,
              words_given_pos_upper, pos2_given_pos1, start_tag):
     """
     Construct a HMM object

     :param untagged_sents: list of untagged sentences for tagging
     :param pos_tags: list of possible POS tags
     :param words_given_pos: nltk.ConditionalFreqDist for P(Wi|Ck) with all words
         converted to lowercase
     :param words_given_pos_upper: nltk.ConditionalFreqDist for P(Wi|Ck) with
         words left in original capitalization
     :param pos2_given_pos1: nltk.ConditionalFreqDist for P(Ci+1|Ci)
     :param start_tag: start tag used to mark sentence beginning
     """
     self.start_tag = start_tag
     self.untagged_sents = untagged_sents
     # cached once so progress reporting does not recompute it
     self.num_untagged_sents = len(untagged_sents)
     self.all_pos_tags = pos_tags
     self.words_given_pos = words_given_pos
     self.words_given_pos_upper = words_given_pos_upper
     self.pos2_given_pos1 = pos2_given_pos1
     # initialize one guesser object to use for the whole test
     self.guesser = Guesser(pos_tags, words_given_pos)
Ejemplo n.º 4
# Module-level state shared by the script below.
english = set()  # word set, empty until it is filled
wrong_guesses = []  # starts empty
alphabet = list('abcdefghijklmnopqrstuvwxyz')  # the 26 lowercase ASCII letters, one per element

#I'm not totally sure how this works, as it's a friend's code. It takes a
#text file of English words and turns it into a useful set of words for the AI
def read_file(name='wordsEn.txt'):
    """Load the word list stored in *name* into the module-level ``english`` set.

    The file is read one word per line and the line ending is stripped
    from every word before it is stored.

    NOTE(review): the loop body was missing from the original listing; adding
    each stripped word to ``english`` is reconstructed from the surrounding
    comments and should be confirmed against the real source.

    :param name: path of the word-list file to read
    """
    # the context manager closes the file even if reading fails
    with open(name) as fil:
        for w in fil:
            w = w.strip()  # removes the newline from the word
            if w:  # ignore blank lines
                english.add(w)

#My first ever class. 
from Guesser import Guesser
# NOTE(review): `word` is not defined anywhere in this listing before it is used
# here - confirm where the target word is set.
guesser_ai = Guesser(english, alphabet, len(word))
guesser_ai.same_len_test() #eliminates any words whose length differs from the target's

#Takes a word or phrase and produces a hangman board with the appropriate number
#of underscores, with additional spaces between words.
def generate_board(word):
    """Record the positions of the spaces in *word* in the ``spaces`` list.

    NOTE(review): this listing looks truncated - ``spaces`` is never created
    here, ``board`` is declared global but never assigned, and nothing is
    returned. ``spaces[-1]`` also fails if ``spaces`` is empty at that point.
    """
    global board
    x = 0  #counting where in the word is being tested
    for l in word:  #determines where spaces are in word
        if l == ' ':         #if a letter is a space
            spaces.append(x) #adds the location of that space to list
        x += 1
    length = len(word) #used to determine number of '_'s needed
    # NOTE(review): this appends (length - last space index), which is a
    # distance rather than a position - confirm against the comment below.
    spaces.append(length - spaces[-1])
        #determines the location of the last character in the word/phrase
Ejemplo n.º 5
class SudokuSolver():
    guessStack = []
    def __init__(self,game):
        """Create a solver bound to *game*.

        :param game: the puzzle object the solver methods read via ``self.game``
        """
        # Keep the game on the instance: every solver method reads self.game.
        # It is stored before the Guesser is built so the guesser can already
        # reach it through the solver it is given.
        self.game = game
        self.guesser = Guesser(self)

    def guessingSolve(self):
        """Loop asking the guesser for guesses until no cell is empty.

        Each pass counts the empty cells, stops when there are none, otherwise
        asks ``self.guesser.getGuesses()`` for a (bad, cellIdx, guess) triple and
        calls ``self.guesser.backUp()`` when the guess is flagged bad.

        NOTE(review): nothing in the visible loop applies the guess to a cell,
        runs the deterministic solver, or leaves the loop after
        "No guesses left" - lines appear to be missing from this listing.
        """
        game = self.game
        bad = False
        print('guessing solver')
        while True:
            # try the deterministic solver
            blanks = self.countEmpty()
            if blanks == 0:
                break #done
            [bad, cellIdx,guess] = self.guesser.getGuesses()
            if bad:
                #input('hit enter')
                [ok, cellIdx, guess] = self.guesser.backUp()
                if not ok :
                   print( "No guesses left. Can't solve")
            print( '===>guessing cell',cellIdx,'is',guess)
            pass #while True

    # non-guess solver code
    def solve(self):
        """Non-guessing solver: repeat a series of elimination tests in a loop.

        The visible tests are: cells with only one option, then, for each
        digit, a row / column / box in which only one cell can take that digit.
        After the loop a final status message is printed.

        NOTE(review): this listing is missing many statements (bodies of
        several ``if``/``for`` lines, the tails of some ``print`` calls, the
        code that fills cells and that leaves the loop), so it does not parse
        as shown. The notes below mark the most visible gaps.
        """
        passes = 0
        oldBlanks = self.countEmpty()
        while True:
            blanks = self.countEmpty()
            # NOTE(review): body of this `if` is missing
            if blanks == 0:
            passes +=1
            # NOTE(review): the rest of this print call is missing
            print( 'Solving puzzle. Pass {0}.  Initially {1} empty cells.'
            print('Test 1 - cells with only one option')
            for n in range(self.game.numCells):
                if self.game.cell[n].getv() !=' ':
                    continue # skip already filled cells
                t = self.findOptions(n,1)
                if len(t) == 1:
                    print('Cell',n,'can only be',t[0])
            if not self.game.checkGame():
            blanks = self.countEmpty()
            if blanks == 0:
                'Test 2 - by digits check for only one cell in a row can be it');
            for d in self.game.digList:
                # make sure that the options lists are up to date
                for n in range(self.game.numCells):
                for r in range(self.game.nDigits): # number of digits == cells in row
                    rowDig = []
                    for c in  range(self.game.nDigits):
                        idx = self.rc2idx(r,c)
                        # NOTE(review): body missing; nothing visible adds to rowDig
                        if d in self.game.cell[idx].optList:
                        pass # for c
                    if len(rowDig) == 1:
                        if self.game.cell[rowDig[0]].getv() == ' ':
                            print( 'only',d,'in row',r,'is index',rowDig[0])
                    pass # for r
            if not self.game.checkGame():
            blanks = self.countEmpty()
            if blanks == 0:
                'Test 3 - by digits check for only one cell in a col can be it');
            for d in self.game.digList:
                # make sure that the options lists are up to date
                for n in range(self.game.numCells):
                for c in range(self.game.nDigits): # number of digits == cells in col
                    colDig = []
                    for r in  range(self.game.nDigits):
                        idx = self.rc2idx(r,c)
                        # NOTE(review): body missing; nothing visible adds to colDig
                        if d in self.game.cell[idx].optList:
                        pass # for r
                    if len(colDig) == 1:
                        if self.game.cell[colDig[0]].getv() == ' ':
                            print( 'only',d,'in col',c,'is index',colDig[0])
                    pass # for c
            if not self.game.checkGame():
            blanks = self.countEmpty()
            if blanks == 0:
                'Test 3 - by digits check for only one cell in a box can be it')
            for d in self.game.digList:
                for b in range(self.game.nDigits): # number of digits == cells in box
                    # make sure that the options lists are up to date
                    for n in range(self.game.numCells):
                    # find a row, col in b, get the index
                    # NOTE(review): 3 and 9 are hard-coded here while the rest of
                    # the method uses self.game.nDigits - confirm this is intended.
                    r = (b // 3)* 3
                    c = (b * 3) % 9
                    idx = self.rc2idx(r,c)
                    bList = self.boxList(idx)
                    for idx in bList:
                        c = self.game.cell
                        #print(idx,c[idx].optList, '1' in c[idx].optList)
                        # NOTE(review): body missing; boxDig is never created or
                        # filled in the visible code
                        if d in self.game.cell[idx].optList:
                        pass # for idx
                    # this was over too far so it got executed for every idx :(
                    if len(boxDig) == 1:
                        if self.game.cell[boxDig[0]].getv() == ' ':
                            print( 'only',d,'in box',b,'is index',
                    pass # for b
                pass #for d
            if not self.game.checkGame():
            # check if we're done or stuck
            blanks = self.countEmpty()
            if blanks == 0:
            elif oldBlanks == blanks:
                oldBlanks = blanks
            pass # end of while loop
        # NOTE(review): an `else:` before the "stuck" message appears to be missing
        if blanks == 0:
            print( 'Sudoku solved')
            print('Sudoku solver stuck')

# solver related functions
    def findOptions(self, idx, flag):
        ''' return a list of possible values for cell[idx]. if flag is 1,
 a filled cell only has it's value, otherwise other options show up '''
        if flag == 1:
            v = self.game.cell[idx].getv()
            if v != ' ':
                return [v]
        l = self.game.digits[self.game.nDigits]
        l = list(l[1:len(l)])
        rL = self.rowList(idx)
        rL.remove(idx) # leave active cell's value (if any) on list
        for n in rL:
            v = self.game.cell[n].getv()
            if v in l:

        # for the col of the active cell
        cL = self.colList(idx)
        cL.remove(idx) # leave active cell's value (if any) on list
        for n in cL:
            v = self.game.cell[n].getv()
            if v in l:

        # for the box of the active cell
        bL = self.boxList(idx)
        bL.remove(idx) # leave active cell's value (if any) on list
        for n in bL:
            v = self.game.cell[n].getv()
            if v in l:
        self.game.cell[idx].optList = l
        return l

    def countEmpty(self):
        """ returns the number of cells whose value is the blank ' ' """
        return sum(
            1
            for cellIdx in range(self.game.numCells)
            if self.game.cell[cellIdx].getv() == ' '
        )

    def rc2idx(self,r,c):
        """ converts a (row, column) pair into the flat index of that cell """
        rowWidth = self.game.nDigits
        return r * rowWidth + c

    def rowList(self, idx):
        """ returns a list of indices of cells in the same row as idx
        (idx itself included), in column order """
        r = self.game.cell[idx].row
        # the original computed each index but never stored it, so the
        # result was always empty
        return [self.rc2idx(r, c) for c in range(self.game.nDigits)]

    def colList(self, idx):
        """ returns a list of indices of cells in the same column as idx
        (idx itself included), in row order """
        c = self.game.cell[idx].col
        # the original computed each index but never stored it, so the
        # result was always empty
        return [self.rc2idx(r, c) for r in range(self.game.nDigits)]

    def boxList(self, idx):
        """ returns a list of indices of cells in the same box as idx
        (idx itself included), row by row """
        here = self.game.cell[idx]
        n = self.game.n  # used as the side length of a box
        # find lower limit r, c of box
        top = (here.row // n) * n
        left = (here.col // n) * n
        # the original computed each index but never stored it, so the
        # result was always empty
        return [self.rc2idx(r1, c1)
                for r1 in range(top, top + n)
                for c1 in range(left, left + n)]
Ejemplo n.º 6
 def __init__(self,game):
     """Create a solver bound to *game*.

     :param game: the puzzle object handed to the solver
     """
     # The argument was previously discarded; keep it on the instance, and do
     # so before the Guesser is built so the guesser can reach it through the
     # solver it is given.
     self.game = game
     self.guesser = Guesser(self)
Ejemplo n.º 7
class HMM:
    """A class for building Hidden Markov Models of tagged word data"""

    ######### CLASS VARIABLES #########

    # store a small list of punctuation to help with training P(Ci+1|Ci)
    punct_list = ["''", '``', ',']

    def __init__(self, untagged_sents, pos_tags, words_given_pos,
                 words_given_pos_upper, pos2_given_pos1, start_tag):
        """
        Construct a HMM object

        :param untagged_sents: list of untagged sentences for tagging
        :param pos_tags: list of possible POS tags
        :param words_given_pos: nltk.ConditionalFreqDist for P(Wi|Ck) with all words
            converted to lowercase
        :param words_given_pos_upper: nltk.ConditionalFreqDist for P(Wi|Ck) with
            words left in original capitalization
        :param pos2_given_pos1: nltk.ConditionalFreqDist for P(Ci+1|Ci)
        :param start_tag: start tag used to mark sentence beginning
        """
        self.start_tag = start_tag
        self.untagged_sents = untagged_sents
        self.num_untagged_sents = len(untagged_sents)
        self.all_pos_tags = pos_tags
        self.words_given_pos = words_given_pos
        self.words_given_pos_upper = words_given_pos_upper
        self.pos2_given_pos1 = pos2_given_pos1

        # initialize one guesser object to use for the whole test
        self.guesser = Guesser(pos_tags, words_given_pos)

    ######### `PUBLIC' FUNCTIONS #########

    def tag(self):
        """
        Tag all this object's sentences, return a list of tagged sentences

        Progress and summary statistics are reported through msg() and
        progress_bar() while the sentences are processed.

        :return: list of tagged sentences, one per entry of untagged_sents
        """
        msg("Tagging sentences:\n")
        start_time = time.time()  # mark the start time for this process
        tagged_sents = []  # array to hold tagged sentences

        # initialize variables to track for tagging stats
        total_prob_time = 0  # time spent looking up probabilities
        total_other_time = 0  # time spent doing other things
        total_guess_count = 0  # words we used the guesser to guess POS for
        total_word_count = 0  # num words tagged
        total_unknown_count = 0  # num words with no P(Wi|Ci)

        # tag each sentence and track statistics; `complete` counts how many
        # sentences have been tagged so far, for the progress bar
        for complete, sent in enumerate(self.untagged_sents, 1):
            total_word_count += len(sent)
            (tagged_sent, prob_time, other_time, guess_count, unknown_count) = \
                self.tag_sent(sent)
            total_prob_time += prob_time
            total_other_time += other_time
            total_guess_count += guess_count
            total_unknown_count += unknown_count
            tagged_sents.append(tagged_sent)  # append tagged sentence to array
            # show nice progress bar
            progress_bar(complete, self.num_untagged_sents,
                         time.time() - start_time)

        # print nice things to the user
        msg("Time spent looking up probabilities: %0.2fs\n" % total_prob_time)
        msg("Total unseen words: %d (%0.2f%% of total)\n" % (
            total_unknown_count,
            self._percentage(total_unknown_count, total_word_count)))
        msg("Total words guessed: %d (%0.2f%% of unseen)\n" % (
            total_guess_count,
            self._percentage(total_guess_count, total_unknown_count)))

        return tagged_sents

    def tag_sent(self, words):
        """
        Tag a sentence using the Viterbi algorithm

        NOTE(review): several statements of this method were cut off in the
        listing it was restored from. Each restored spot is marked with a
        NOTE(review) comment below and should be checked against the real source.

        :param words: a list of untagged words
        :return: tuple (tagged_sent, prob_time, other_time, guess_count,
            unknown_count), where tagged_sent is a list of (word, tag) pairs
        """
        # initialize stats tracking variables
        prob_time = 0
        start_time = time.time()
        guess_count = 0  # words we guessed a POS for
        unknown_count = 0  # words whose whole score column was zero

        # reusable looping list: number of words in our sentence
        words_range = range(len(words))

        # reusable looping list: number of possible POS tags
        pos_range = range(len(self.all_pos_tags))

        # initialize i x j matrix to hold scores
        scores = [[None for j in words_range] for i in pos_range]

        # initialize i x j matrix to hold backpointers
        backpointer = [[None for j in words_range] for i in pos_range]

        # initialize array of POS tag indices for words in sentence
        pos_tag_indices = [None for j in words_range]

        # POS indices that scored highest for the previous word; filled in at
        # the end of every pass through the word loop
        last_max_indices = []

        # give P(Wi|Ck) trained with lowercase a shorthand name
        cpwp = lambda word, pos: self.words_given_pos[pos].freq(word)

        # give P(Wi|Ck) trained with normal capitalization a shorthand name
        cpwpu = lambda word, pos: self.words_given_pos_upper[pos].freq(word)

        # give P(Ci+1|Ci) a shorthand name
        cpp2p1 = lambda pos2, pos1: self.pos2_given_pos1[pos1].freq(pos2)

        # tags a lowercase word can never take
        # NOTE(review): the original list continued on a line that is missing
        # from the listing; only its first element could be recovered.
        lowercase_excluded_tags = [self.guesser.tags.proper_noun]

        # loop through words
        for j in words_range:
            word_j = words[j]  # store current word in a local variable

            # determine whether word begins with a capital letter
            is_upper = re.search(r'[A-Z]', word_j[0]) is not None

            # initialize an array to hold the scores for this word not taking into
            # account the word probability, i.e., including only the path and
            # the bare POS probability
            scores_without_word_prob = [0 for i in pos_range]

            # loop through possible POS tags
            for i in pos_range:
                tag_i = self.all_pos_tags[i]  # POS tag for this POS index

                # if this is the first word, perform initial calculation...
                if j == 0:
                    # find P(Wj|Ci) using lowercase since in the first word,
                    # capitalization is not helpful information
                    cpwp_ji = cpwp(word_j.lower(), tag_i)

                    # find P(Ci|'^')
                    cp_istart = cpp2p1(tag_i, self.start_tag)

                    # calculate score using P(Ci|'^') and P(Wj|Ci)
                    scores[i][j] = cp_istart * cpwp_ji

                    # also find bare POS probability, in this case the same as
                    # P(Ci|'^')
                    scores_without_word_prob[i] = cp_istart

                    # initialize backpointer for this word to 0
                    backpointer[i][j] = 0

                # if we're not looking at the first word...
                else:
                    start_prob_time = time.time()  # start our prob lookup timer

                    # initialize an array corresponding to all the POS tags with
                    # -1 in each slot. This will hold the probability that POS i
                    # is what it is given that it may have followed any other POS
                    scores_pp2p1 = [-1 for m in pos_range]

                    # we don't actually need to lookup this conditional probability
                    # for every POS, since we know which POS for words[j-1] have the
                    # highest score so far. Thus we only look at those POS in
                    # last_max_indices, which stores the POS indices of the POS that
                    # scored highest for word[j-1]
                    for k in last_max_indices:
                        scores_pp2p1[k] = cpp2p1(tag_i, self.all_pos_tags[k])

                    # now we want to find the highest P(Ci|Ck) score
                    max_pp2p1_score = max(scores_pp2p1)

                    # also, get the POS index (k from Ck) corresponding to it
                    max_k = scores_pp2p1.index(max_pp2p1_score)

                    # now we find P(Wj|Ci)
                    if is_upper:
                        # if Wj is uppercase, look in the uppercase freq table
                        cpwp_ji = cpwpu(word_j, tag_i)
                    elif tag_i in lowercase_excluded_tags:
                        # if Wj is lowercase, we know first of all that it can't be
                        # a proper noun, so remove these from the running
                        cpwp_ji = 0
                    else:
                        # otherwise, lookup the probability from the lowercase
                        # freq table
                        cpwp_ji = cpwp(word_j, tag_i)

                    # score of the best path so far times the best transition
                    # into the POS under consideration
                    # NOTE(review): the second factor was cut off in the listing;
                    # restored from the comments describing this value.
                    path_score = scores[max_k][j - 1] * max_pp2p1_score

                    # calculate the score for this word and possible POS as (a) the
                    # best score from the path so far, (b) the best possible score
                    # for the POS under consideration, and (c) P(Wj|Ci)
                    scores[i][j] = path_score * cpwp_ji

                    # keep track of the score for this POS without taking into
                    # account P(Wj|Ci), so if word_j is an untrained word, we can
                    # use bare POS frequencies to help
                    scores_without_word_prob[i] = path_score

                    # assert that the path to this word/POS combo came through the
                    # POS which gave us the highest score in our calculation,
                    # so we can recover the best POS for each word at the end
                    backpointer[i][j] = max_k

                    prob_time += time.time() - start_prob_time
            # end: for i in pos_range

            # take care that not all scores for this word are 0
            if self._smoothing_needed(scores, j_value=j):
                # if all the scores are zero, guess that we've never seen this word
                # in training
                unknown_count += 1

                # try to guess a tag for this word based on its form and the bare
                # POS scores (i.e., guess based on form and then based on the
                # previous POS)
                # NOTE(review): the argument list was cut off after word_j; the
                # second argument is restored from the comment above.
                guess_tag = self.guesser.guess(word_j, scores_without_word_prob)

                if guess_tag is None:
                    # if we didn't come up with a guess, make sure our smoother
                    # doesn't weight any POS over any other. -1 is the smoother's
                    # "no preference" value; the previous False compared greater
                    # than -1 and was therefore treated as row 0.
                    guess_index = -1
                else:
                    # otherwise, tell our smoother that we have a guess so that
                    # it weights the guessed POS highest
                    guess_index = self.all_pos_tags.index(guess_tag)
                    guess_count += 1

                # get a smoothed column of scores for scores[j]
                scores = self._smooth_values(scores, j_value=j,
                                             guess_index=guess_index)

            # turn the score column into a 1-dimensional array so we can more easily
            # find the best POS for this word
            scores_for_this_word = [scores[n][j] for n in pos_range]

            # get the POS indices which performed best for this word to pass on to
            # the algorithm for the next word, so it can only compute scores for
            # realistically likely POS
            last_max_indices = indices_of_max(scores_for_this_word)

        # end: for j in words_range

        # recover the POS tag indices for words in the sentence that led to the best
        # final scores
        last_word = len(words) - 1
        for j in reversed(words_range):
            if j == last_word:
                # our last POS is whichever had the highest score in the last
                # column
                col = [scores[i][j] for i in pos_range]
                pos_tag_indices[j] = col.index(max(col))
            else:
                # otherwise the POS is whichever the backpointer pointed to from
                # the next word
                pos_tag_indices[j] = backpointer[pos_tag_indices[j + 1]][j + 1]

        # get the actual tags for the indices recovered
        pos_tags = [self.all_pos_tags[index] for index in pos_tag_indices]

        # associate POS tags with words
        tagged_sent = [(words[j], pos_tags[j]) for j in words_range]

        # calculate time stats
        end_time = time.time()
        other_time = end_time - start_time - prob_time

        # return a bundle of tag data and other stats
        return (tagged_sent, prob_time, other_time, guess_count, unknown_count)

    ######### `PRIVATE' FUNCTIONS #########

    @staticmethod
    def _percentage(part, whole):
        """Return part as a percentage of whole, or 0.0 when whole is 0."""
        if not whole:
            return 0.0
        return float(part) / whole * 100

    def _smoothing_needed(self, matrix, j_value):
        """
        Determine whether smoothing is needed for a column of a matrix

        :param matrix: the list of lists to examine
        :param j_value: the index of the column to examine for smoothing
        :return: True when the largest value in that column equals 0
        """
        return max(row[j_value] for row in matrix) == 0

    def _smooth_values(self, matrix, j_value=0, guess_index=-1):
        """
        Ensure that a column of a matrix is not full of zeroes.

        The column is overwritten in place and the same matrix is returned.

        :param matrix: list of lists of numbers
        :param j_value: matrix column (default: 0)
        :param guess_index: row index to prefer (default: -1, meaning no
            preference; None and booleans are treated the same way)
        :return: the matrix that was passed in
        """
        row_count = len(matrix)
        if row_count == 0:
            return matrix  # nothing to smooth, and avoids dividing by zero

        # a bool is an int in Python (False > -1 is true), so rule booleans out
        # explicitly instead of mistaking them for a row index
        has_guess = (guess_index is not None
                     and not isinstance(guess_index, bool)
                     and guess_index > -1)

        if has_guess:
            # if we want to prefer one row, give it a certain value 0 - 1
            guess_value = 0.75
            # share given to each of the other rows
            # NOTE(review): this divides by the total row count, so the column
            # sums to slightly less than 1; kept as in the original.
            non_guess_value = (1 - guess_value) / row_count
            for i in range(row_count):
                # the original wrote both values into matrix[guess_index],
                # leaving every other row untouched
                if i == guess_index:
                    matrix[i][j_value] = guess_value
                else:
                    matrix[i][j_value] = non_guess_value
        else:
            # otherwise, simply split the probability value of 1 evenly over all
            # rows (1.0 keeps this a true division on every Python version)
            even_share = 1.0 / row_count
            for i in range(row_count):
                matrix[i][j_value] = even_share

        return matrix
Ejemplo n.º 8
    def __init__(self, data):
        """Build the "Attribute chooser" window for *data*.

        Wraps *data* in a Guesser, creates one label and one drop-down per
        attribute it reports, and then sets up a Treeview with scrollbars for
        the objects.

        NOTE(review): three calls below (OptionMenu, get_object_data and
        ttk.Treeview) are cut off after their opening parenthesis in this
        listing, and no statement follows the final "main loop" comment, so
        this method does not parse as shown.
        """
        data_handler = Guesser(data, guess_type="structured")
        self.data_handler = data_handler
        # Set up the GUI
        self.top = Tk()
        self.top.wm_title("Attribute chooser")

        # Get all attribute labels
        attributes = data_handler.get_attributes()
        self.attributes_frame = Frame(self.top, relief=GROOVE)
        self.attributes_frame.grid(row=0, column=0, columnspan=len(attributes))
        Label(self.attributes_frame, text="Attrubutes").grid(row=0, column=0)

        # Maps each attribute name to the StringVar created for its drop-down
        self.option_menu_dic = {}
        for i, attr in enumerate(attributes):
            # Create a label with the attribute name
            Label(self.attributes_frame, text=attr).grid(row=1, column=i)

            # Get the values associated with the attribute
            values = data_handler.get_attribute_values(attr)
            # "All" is put in front of the attribute's own values
            values = tuple(["All"] + values)

            # Create an option menu (dropdown list) with the possible values
            var = StringVar()
            # NOTE(review): remaining arguments of this call are missing
            opt_m = OptionMenu(self.attributes_frame,
            opt_m.grid(row=2, column=i)
            self.option_menu_dic[attr] = var

        # Create the list, which is a Treeview-widget
        self.object_frame = Frame(self.top, relief=RAISED)

        self.image_size = 50, 50
        # NOTE(review): arguments of this call are missing
        tree_columns, tree_data = data_handler.get_object_data(
        style = ttk.Style(self.top)
        # row height is set to the first component of image_size
        style.configure('Calendar.Treeview', rowheight=self.image_size[0])

        # NOTE(review): remaining arguments of this call are missing
        self.tree = ttk.Treeview(self.object_frame,
        vsb = ttk.Scrollbar(orient="vertical", command=self.tree.yview)
        hsb = ttk.Scrollbar(orient="horizontal", command=self.tree.xview)
        self.tree.configure(yscrollcommand=vsb.set, xscrollcommand=hsb.set)
        # NOTE(review): only hsb is placed with grid() in the visible code;
        # no grid() call for vsb or self.tree appears in this listing.
        hsb.grid(column=0, row=1, sticky='ew', in_=self.object_frame)

        # Keep a list of all images so that they stay referenced and the
        # garbage collector does not remove them.
        self.images = []
        self.build_tree(tree_data, tree_columns)

        # Show the GUI by letting the root window enter the main loop
Ejemplo n.º 9
class HMM:
    """A class for building Hidden Markov Models of tagged word data.

    An instance stores the sentences to tag together with the trained
    conditional frequency distributions, and tags the sentences with the
    Viterbi algorithm (see ``tag`` and ``tag_sent``).
    """

    ######### CLASS VARIABLES #########
    # store a small list of punctuation to help with training P(Ci+1|Ci)
    punct_list = ["''", '``', ',']

    def __init__(self, untagged_sents, pos_tags, words_given_pos,
                 words_given_pos_upper, pos2_given_pos1, start_tag):
        """Construct a HMM object.

        :param untagged_sents: list of untagged sentences for tagging
        :param pos_tags: list of possible POS tags
        :param words_given_pos: nltk.ConditionalFreqDist for P(Wi|Ck) with all
            words converted to lowercase
        :param words_given_pos_upper: nltk.ConditionalFreqDist for P(Wi|Ck)
            with words left in original capitalization
        :param pos2_given_pos1: nltk.ConditionalFreqDist for P(Ci+1|Ci)
        :param start_tag: start tag used to mark sentence beginning
        """
        self.start_tag = start_tag
        self.untagged_sents = untagged_sents
        self.num_untagged_sents = len(untagged_sents)
        self.all_pos_tags = pos_tags
        self.words_given_pos = words_given_pos
        self.words_given_pos_upper = words_given_pos_upper
        self.pos2_given_pos1 = pos2_given_pos1
        # initialize one guesser object to use for the whole test
        self.guesser = Guesser(pos_tags, words_given_pos)

    ######### `PUBLIC' FUNCTIONS #########
    def tag(self):
        """Tag all this object's sentences, return a list of tagged sentences.

        Progress and summary statistics are reported through ``msg`` and
        ``progress_bar`` while the sentences are processed.

        :return: list with one tagged sentence (as produced by ``tag_sent``)
            per entry of ``self.untagged_sents``, in the same order
        """
        msg("Tagging sentences:\n")
        start_time = time.time()  # mark the start time for this process
        tagged_sents = []  # array to hold tagged sentences
        complete = 0  # how many sentences we have tagged
        # initialize variables to track for tagging stats
        total_prob_time = 0  # time spent looking up probabilities
        total_other_time = 0  # time spent doing other things
        total_guess_count = 0  # words we used the guesser to guess POS for
        total_word_count = 0  # num words tagged
        total_unknown_count = 0  # num words with no P(Wi|Ci)
        # tag each sentence and track statistics
        for sent in self.untagged_sents:
            total_word_count += len(sent)
            (tagged_sent, prob_time, other_time, guess_count, unknown_count) = \
                self.tag_sent(sent)
            total_prob_time += prob_time
            total_other_time += other_time
            total_guess_count += guess_count
            total_unknown_count += unknown_count
            tagged_sents.append(tagged_sent)  # append tagged sentence to array
            complete += 1  # increment our completed counter for progress bar
            # show nice progress bar
            progress_bar(complete, self.num_untagged_sents,
                         time.time() - start_time)
        # Guard both ratios: an empty corpus, or a corpus without any unseen
        # word, would otherwise raise ZeroDivisionError while reporting.
        if total_word_count:
            unseen_pct = 100.0 * total_unknown_count / total_word_count
        else:
            unseen_pct = 0.0
        if total_unknown_count:
            guessed_pct = 100.0 * total_guess_count / total_unknown_count
        else:
            guessed_pct = 0.0
        # print nice things to the user
        msg("Time spent looking up probabilities: %0.2fs\n" % total_prob_time)
        msg("Total unseen words: %d (%0.2f%% of total)\n"
            % (total_unknown_count, unseen_pct))
        msg("Total words guessed: %d (%0.2f%% of unseen)\n"
            % (total_guess_count, guessed_pct))
        return tagged_sents

    def tag_sent(self, words):
        """Tag a sentence using the Viterbi algorithm.

        :param words: a list of untagged words
        :return: tuple ``(tagged_sent, prob_time, other_time, guess_count,
            unknown_count)`` where ``tagged_sent`` is a list of
            ``(word, tag)`` pairs, ``prob_time`` is the time spent looking up
            probabilities, ``other_time`` is the remaining time spent in this
            call, ``guess_count`` is the number of words whose tag came from
            the guesser and ``unknown_count`` is the number of words whose
            score column was all zero
        """
        # initialize stats tracking variables
        start_time = time.time()
        prob_time = 0
        guess_count = 0
        unknown_count = 0
        # reusable looping list: number of words in our sentence
        words_range = range(len(words))
        # reusable looping list: number of possible POS tags
        pos_range = range(len(self.all_pos_tags))
        # initialize i x j matrix to hold scores
        scores = [[None for j in words_range] for i in pos_range]
        # initialize i x j matrix to hold backpointers
        backpointer = [[None for j in words_range] for i in pos_range]
        # initialize array of POS tag indices for words in sentence
        pos_tag_indices = [None for j in words_range]
        # initialize array of guess states for words in sentence
        guessed_pos = [None for j in words_range]
        # POS indices that scored highest for the previous word; filled in at
        # the end of every iteration and only read from the second word on
        last_max_indices = []

        def cpwp(word, pos):
            # P(Wi|Ck) trained with lowercase words
            return self.words_given_pos[pos].freq(word)

        def cpwpu(word, pos):
            # P(Wi|Ck) trained with the original capitalization
            return self.words_given_pos_upper[pos].freq(word)

        def cpp2p1(pos2, pos1):
            # P(Ci+1|Ci)
            return self.pos2_given_pos1[pos1].freq(pos2)

        # loop through words
        for j in words_range:
            word_j = words[j]  # store current word in a local variable
            # determine whether word begins with a capital letter; matching
            # on the whole word (anchored at its start) also copes with an
            # empty token, which indexing word_j[0] would not
            is_upper = re.match(r'[A-Z]', word_j) is not None
            # initialize an array to hold the scores for this word not taking
            # into account the word probability, i.e., including only the path
            # and the bare POS probability
            scores_without_word_prob = [0 for i in pos_range]
            # loop through possible POS tags
            for i in pos_range:
                tag_i = self.all_pos_tags[i]  # POS tag for this POS index
                # if this is the first word, perform initial calculation...
                if j == 0:
                    # find P(Wj|Ci) using lowercase since in the first word,
                    # capitalization is not helpful information
                    cpwp_ji = cpwp(word_j.lower(), tag_i)
                    # find P(Ci|'^')
                    cp_istart = cpp2p1(tag_i, self.start_tag)
                    # calculate score using P(Ci|'^') and P(Wj|Ci)
                    scores[i][j] = cp_istart * cpwp_ji
                    # also find bare POS probability, in this case the same as
                    # P(Ci|'^')
                    scores_without_word_prob[i] = cp_istart
                    # initialize backpointer for this word to 0
                    backpointer[i][j] = 0
                # if we're not looking at the first word...
                else:
                    start_prob_time = time.time()  # start prob lookup timer
                    # initialize an array corresponding to all the POS tags
                    # with -1 in each slot, so that slots we never fill in can
                    # not win the max() below. It will hold the probability
                    # of POS i given that it followed each candidate POS k
                    scores_pp2p1 = [-1 for m in pos_range]
                    # we don't actually need to lookup this conditional
                    # probability for every POS, since we know which POS for
                    # words[j-1] have the highest score so far. Thus we only
                    # look at those POS in last_max_indices, which stores the
                    # POS indices of the POS that scored highest for word[j-1]
                    for k in last_max_indices:
                        scores_pp2p1[k] = cpp2p1(tag_i, self.all_pos_tags[k])
                    # now we want to find the highest P(Ci|Ck) score
                    max_pp2p1_score = max(scores_pp2p1)
                    # also, get the POS index (k from Ck) corresponding to it
                    max_k = scores_pp2p1.index(max_pp2p1_score)
                    # now we find P(Wj|Ci)
                    if is_upper:
                        # if Wj is uppercase, look in the uppercase freq table
                        cpwp_ji = cpwpu(word_j, tag_i)
                    # NOTE(review): the original tag list continued on a line
                    # that is missing from this copy of the file; only the
                    # proper_noun tag could be recovered. Restore any further
                    # tags (e.g. a plural proper noun tag) from upstream.
                    elif tag_i in [self.guesser.tags.proper_noun]:
                        # if Wj is lowercase, we know first of all that it
                        # can't be a proper noun, so remove these from the
                        # running
                        cpwp_ji = 0
                    else:
                        # otherwise, lookup the probability from the lowercase
                        # freq table
                        cpwp_ji = cpwp(word_j, tag_i)
                    # calculate the score for this word and possible POS as
                    # (a) the best score from the path so far, (b) the best
                    # possible score for the POS under consideration, and
                    # (c) P(Wj|Ci)
                    scores[i][j] = \
                        scores[max_k][j - 1] * max_pp2p1_score * cpwp_ji
                    # keep track of the score for this POS without taking into
                    # account P(Wj|Ci), so if word_j is an untrained word, we
                    # can use bare POS frequencies to help
                    scores_without_word_prob[i] = \
                        scores[max_k][j - 1] * max_pp2p1_score
                    # assert that the path to this word/POS combo came through
                    # the POS which gave us the highest score in our
                    # calculation, so we can recover the best POS for each
                    # word at the end
                    backpointer[i][j] = max_k
                    prob_time += time.time() - start_prob_time
            # end: for i in pos_range
            did_guess = False
            # take care that not all scores for this word are 0
            if self._smoothing_needed(scores, j_value=j):
                # if all the scores are zero, guess that we've never seen this
                # word in training
                unknown_count += 1
                # try to guess a tag for this word based on its form and the
                # bare POS scores (i.e., guess based on form and then based on
                # the previous POS)
                guess_tag = self.guesser.guess(word_j, scores_without_word_prob)
                if guess_tag is None:
                    # if we didn't come up with a guess, make sure our
                    # smoother doesn't weight any POS over any other
                    guess_index = -1
                else:
                    # otherwise, tell our smoother that we have a guess so
                    # that it weights the guessed POS highest
                    try:
                        # determine the index of the guessed POS tag
                        guess_index = self.all_pos_tags.index(guess_tag)
                    except ValueError:
                        # NOTE(review): the original handler body is missing
                        # from this copy of the file. A guessed tag outside
                        # all_pos_tags is treated here as "no guess".
                        guess_index = -1
                    else:
                        did_guess = True
                        guess_count += 1
                # get a smoothed column of scores for scores[j]
                scores = self._smooth_values(scores, j_value=j,
                                             guess_index=guess_index)
            # record whether or not we guessed the POS for this word
            guessed_pos[j] = did_guess
            # turn the score column into a 1-dimensional array so we can more
            # easily find the best POS for this word
            scores_for_this_word = [scores[n][j] for n in pos_range]
            # get the POS indices which performed best for this word to pass
            # on to the algorithm for the next word, so it can only compute
            # scores for realistically likely POS
            last_max_indices = indices_of_max(scores_for_this_word)
        # end: for j in words_range
        # recover the POS tag indices for words in the sentence that led to
        # the best final scores
        for j in reversed(words_range):
            if j == len(words_range) - 1:
                # our last POS is whichever had the highest score in the last
                # column of the score matrix
                col = [scores[i][j] for i in pos_range]
                pos_tag_indices[j] = col.index(max(col))
            else:
                # otherwise the POS is whichever the backpointer pointed to
                # from the next word
                pos_tag_indices[j] = backpointer[pos_tag_indices[j + 1]][j + 1]
        # get the actual tags for the indices recovered
        pos_tags = [self.all_pos_tags[index] for index in pos_tag_indices]
        # associate POS tags with words
        tagged_sent = [(words[j], pos_tags[j]) for j in words_range]
        # calculate time stats
        end_time = time.time()
        other_time = end_time - start_time - prob_time
        # return a bundle of tag data and other stats
        return (tagged_sent, prob_time, other_time, guess_count, unknown_count)

    ######### `PRIVATE' FUNCTIONS #########
    def _smoothing_needed(self, matrix, j_value):
        """Determine whether smoothing is needed for a column of a matrix.

        :param matrix: the list of lists to examine
        :param j_value: the index of the column to examine for smoothing
        :return: True when the largest value in the column is 0
        """
        return max([row[j_value] for row in matrix]) == 0

    def _smooth_values(self, matrix, j_value=0, guess_index=-1):
        """Ensure that a column of a matrix is not full of zeroes.

        The column is overwritten in place: with a preferred row, that row
        gets 0.75 and every other row gets ``0.25 / len(matrix)``; without
        one, every row gets ``1 / len(matrix)``.

        :param matrix: list of lists of numbers
        :param j_value: matrix column (default: 0)
        :param guess_index: row index to prefer (default: -1, no preference)
        :return: the same ``matrix`` object, after modification
        """
        row_count = len(matrix)
        if not row_count:
            # nothing to smooth, and avoids dividing by zero below
            return matrix
        # if we want to prefer one row, give it a certain value 0 - 1
        if guess_index > -1:
            guess_value = 0.75
            # give everything else the remaining probability; dividing by the
            # full row count (as the original did) means the column does not
            # sum to exactly 1, which is harmless for comparing scores
            non_guess_value = (1 - guess_value) / row_count
            for i in range(row_count):
                if i == guess_index:
                    # give our preferred row the weighted value
                    matrix[i][j_value] = guess_value
                else:
                    # give every other row the rest of the distribution
                    # (the original wrote to matrix[guess_index] here, which
                    # clobbered the preferred row and left the others at 0)
                    matrix[i][j_value] = non_guess_value
        # otherwise, simply split the probability value of 1 evenly over all
        # rows
        else:
            even_value = 1.0 / row_count
            for row in matrix:
                row[j_value] = even_value

        return matrix