예제 #1
0
 def clean(self):
     """
     Clean corpus files and write the results to disk.

     For each file in self.corpus_files, reads the raw text, strips
     bracket tokens and normalizes paragraph separators / whitespace with
     regular expressions, and writes the result to '<name>_cleaned' in
     the same corpus directory.
     """
     
     # loop through files
     for corpus_file in self.corpus_files:
         
         msg("Cleaning %s..." % corpus_file)
         
         # read the whole file into a string; 'with' guarantees the
         # handle is closed even if reading raises
         with open(self.corpus_path + corpus_file, 'r') as f:
             data = f.read()
         
         # use an unoptimized set of arcane regular expressions to clean the data
         data = re.sub(r' +(\r)?\n', '\n', data)
         para_sep = r'======================================'
         data = re.sub(r'([^\.])(\n+)', '\\1 ', data)
         data = re.sub(para_sep, '\n'+para_sep+'\n', data)
         data = re.sub(r' +\n', '\n', data)
         data = re.sub(r'\n\n+', '\n', data)
         data = re.sub(para_sep + r'\n' + para_sep, para_sep, data)
         data = re.sub('^\n' + para_sep + '\n', '', data)
         data = re.sub(r' *(\[|\]) *', ' ', data)
         data = re.sub(r'\n +', '\n', data)
         data = re.sub(r'^ +', '', data)
         data = re.sub(para_sep + r'\n', '', data)
         
         # write the cleaned data to a new file; the original code never
         # closed this handle, risking buffered data never being flushed
         new_file = corpus_file + '_cleaned'
         with open(self.corpus_path + new_file, 'w') as f:
             f.write(data)
         
         msg("done!\n")
예제 #2
0
    def clean(self):
        """
        Clean corpus files and write the results to disk.

        For each file in self.corpus_files, reads the raw text, strips
        bracket tokens and normalizes paragraph separators / whitespace
        with regular expressions, and writes the result to
        '<name>_cleaned' in the same corpus directory.
        """

        # loop through files
        for corpus_file in self.corpus_files:

            msg("Cleaning %s..." % corpus_file)

            # read the whole file into a string; 'with' guarantees the
            # handle is closed even if reading raises
            with open(self.corpus_path + corpus_file, 'r') as f:
                data = f.read()

            # use an unoptimized set of arcane regular expressions to clean the data
            data = re.sub(r' +(\r)?\n', '\n', data)
            para_sep = r'======================================'
            data = re.sub(r'([^\.])(\n+)', '\\1 ', data)
            data = re.sub(para_sep, '\n' + para_sep + '\n', data)
            data = re.sub(r' +\n', '\n', data)
            data = re.sub(r'\n\n+', '\n', data)
            data = re.sub(para_sep + r'\n' + para_sep, para_sep, data)
            data = re.sub('^\n' + para_sep + '\n', '', data)
            data = re.sub(r' *(\[|\]) *', ' ', data)
            data = re.sub(r'\n +', '\n', data)
            data = re.sub(r'^ +', '', data)
            data = re.sub(para_sep + r'\n', '', data)

            # write the cleaned data to a new file; the original code never
            # closed this handle, risking buffered data never being flushed
            new_file = corpus_file + '_cleaned'
            with open(self.corpus_path + new_file, 'w') as f:
                f.write(data)

            msg("done!\n")
예제 #3
0
파일: Tagger.py 프로젝트: Homyg/hmm-tagger
 def run_test_cycles(self):
     """
     Run the test cycles for training and testing the tagger.
     Specifically, employ ten-fold cross-validation to train/test on different
     segments of the corpus.
     """
     
     total_time_start = time.time() # keep track of time
     pct_step = int(100 / Tagger.test_cycles) # cycle steps in pct
     test_pct = pct_step # percentage of the corpus to test the tagger on
     train_pct = 100 - test_pct # percentage of the corpus to train the tagger on
     rights = [] # array to hold number of correctly-tagged words for each test
     wrongs = [] # array to hold number of incorrectly-tagged words for each test
     totals = [] # array to hold number of total words for each test
     all_missed = [] # array to hold incorrect tag information for each test
     sep = ''.join(["-" for i in range(50)]) + "\n" # logging separator
     
     # loop from 0-90 (step size 10)
     for start_train_pct in [x*pct_step for x in range(Tagger.test_cycles)]:
         msg("%sSTARTING TEST CYCLE %d\n%s" % (sep, (start_train_pct/pct_step)+1,\
             sep))
         
         # find the percent point to start collecting test sentences
         # may be > 100, so circle round
         start_test_pct = (start_train_pct+train_pct) % 100
         
         # train the tagger on sentences from the corpus matching our range
         training_sents = self.tb.training_sents(train_pct,start_train_pct)
         self.train(training_sents)
         
         # test the tagger on the rest of the sentences
         testing_sents = self.tb.testing_sents(test_pct,start_test_pct)
         (right, wrong, missed) = self.test(testing_sents)
         
         # gather accuracy statistics for this test
         total = right + wrong
         rights.append(right) # store the correct count for this test cycle
         wrongs.append(wrong) # store the incorrect count for this test cycle
         totals.append(total) # store the total words tested for this test cycle
         all_missed += missed # add incorrect tag information from this cycle
         
         msg("Total words: %d\n" % total)
         msg("Correct tags: %d (%0.2f%%)\n" % (right, right / total * 100))
         msg("Incorrect tags: %d (%0.2f%%)\n" % (wrong, wrong / total * 100))
     # end: test cycle
         
     msg("%s%s" % (sep,sep))
     
     # calculate and output statistics for the entire test
     print "Total tests run: %d" % len(totals)
     print "Total time taken: %0.2f seconds" % (time.time() - total_time_start)
     print "Average correct tags: %0.2f%%" % (sum(rights) / sum(totals) * 100)
     print "Average incorrect tags: %0.2f%%" % (sum(wrongs) / sum(totals) * 100)
     print
     
     # give the option of inspecting incorrect tags
     if raw_input("Examine bad tags? ") in ['y','Y']:
         self.inspect(all_missed)
예제 #4
0
    def training_sents(self, train_pct, start_train_pct):
        """
        Return the slice of corpus sentences used for training.

        :param train_pct: what pct of the corpus to retrieve
        :param start_train_pct: where in the corpus to begin retrieval
        """

        msg("Getting training sentences...")
        selected = self._sents_by_pct(train_pct, start_train_pct)
        msg("done: %d%% starting at %d%%\n" % (train_pct, start_train_pct))
        return selected
예제 #5
0
    def training_sents(self, train_pct, start_train_pct):
        """
        Fetch the portion of the corpus to train the tagger on.

        :param train_pct: what pct of the corpus to retrieve
        :param start_train_pct: where in the corpus to begin retrieval
        """

        # delegate the actual slicing to the shared percentage helper,
        # logging progress around the call
        msg("Getting training sentences...")
        training_slice = self._sents_by_pct(train_pct, start_train_pct)
        msg("done: %d%% starting at %d%%\n" % (train_pct, start_train_pct))

        return training_slice
예제 #6
0
    def testing_sents(self, test_pct, start_test_pct):
        """
        Return (untagged, tagged) sentence lists for testing.

        :param test_pct: what pct of the corpus to retrieve
        :param start_test_pct: where in the corpus to begin retrieval
        """

        # fetch the same corpus slice twice — once without tags (input to
        # the tagger) and once with tags (the gold standard for scoring)
        msg("Getting testing sentences...")
        plain = self._sents_by_pct(test_pct, start_test_pct, tagged=False)
        gold = self._sents_by_pct(test_pct, start_test_pct, tagged=True)
        msg("done: %d%% starting at %d%%\n" % (test_pct, start_test_pct))

        return (plain, gold)
예제 #7
0
    def run_test_cycles(self):
        """
        Run the test cycles for training and testing the tagger.
        Specifically, employ ten-fold cross-validation to train/test on different
        segments of the corpus.
        """
        
        total_time_start = time.time() # keep track of time
        rights = [] # array to hold number of correctly-tagged words for each test
        wrongs = [] # array to hold number of incorrectly-tagged words for each test
        totals = [] # array to hold number of total words for each test
        all_missed = [] # array to hold incorrect tag information for each test
        sep = ''.join(["-" for i in range(50)]) + "\n" # logging s

        # returns tagged sentences
        training_sents = self.training.tagged_sents

        self.train(training_sents)

        # returns untagged sentences
        testing_tagged_sents = self.testing.tagged_sents

        testing_untagged_sents = self.testing.sents

        testing_sents = (testing_untagged_sents, testing_tagged_sents)


        (right, wrong, missed) = self.test(testing_sents)

        # gather accuracy statistics for this test
        total = right + wrong
        rights.append(right) # store the correct count for this test cycle
        wrongs.append(wrong) # store the incorrect count for this test cycle
        totals.append(total) # store the total words tested for this test cycle
        all_missed += missed # add incorrect tag information from this cycle

        msg("Total words: %d\n" % total)
        msg("Correct tags: %d (%0.2f%%)\n" % (right, right / total * 100))
        msg("Incorrect tags: %d (%0.2f%%)\n" % (wrong, wrong / total * 100))

        msg("%s%s" % (sep,sep))
        
        # calculate and output statistics for the entire test
        print "Total tests run: %d" % len(totals)
        print "Total time taken: %0.2f seconds" % (time.time() - total_time_start)
        print "Average correct tags: %0.2f%%" % (sum(rights) / sum(totals) * 100)
        print "Average incorrect tags: %0.2f%%" % (sum(wrongs) / sum(totals) * 100)
        print
        
        # give the option of inspecting incorrect tags
        if raw_input("Examine bad tags? ") in ['y','Y']:
            self.inspect(all_missed)
예제 #8
0
    def __init__(self, corpus_path, corpus_files):
        """
        Construct a Treebank object backed by an NLTK corpus reader.

        :param corpus_path: path to corpus files
        :param corpus_files: list of filenames for corpus text
        """

        msg("Importing treebank...")

        # NLTK does the heavy lifting of parsing the corpus files
        reader = TaggedCorpusReader(corpus_path, corpus_files)

        # cache both views of every sentence: with POS tags and without
        self.tagged_sents = reader.tagged_sents()
        self.sents = reader.sents()

        msg("done!\n")
예제 #9
0
    def testing_sents(self, test_pct, start_test_pct):
        """
        Fetch untagged and tagged versions of the test sentences.

        :param test_pct: what pct of the corpus to retrieve
        :param start_test_pct: where in the corpus to begin retrieval
        """

        # the same slice is pulled twice: untagged sentences are fed to
        # the tagger, tagged ones serve as the gold standard for scoring
        msg("Getting testing sentences...")
        untagged = self._sents_by_pct(test_pct, start_test_pct, tagged=False)
        tagged = self._sents_by_pct(test_pct, start_test_pct, tagged=True)
        msg("done: %d%% starting at %d%%\n" % (test_pct, start_test_pct))

        return (untagged, tagged)
예제 #10
0
    def __init__(self, corpus_path, corpus_files):
        """
        Construct a Treebank object.

        Wraps an NLTK TaggedCorpusReader and caches its tagged and
        untagged sentence lists as attributes.

        :param corpus_path: path to corpus files
        :param corpus_files: list of filenames for corpus text
        """

        msg("Importing treebank...")

        # let NLTK parse the corpus files for us
        treebank_reader = TaggedCorpusReader(corpus_path, corpus_files)

        # sentences with POS tags (gold standard)
        self.tagged_sents = treebank_reader.tagged_sents()
        # the same sentences as bare word lists
        self.sents = treebank_reader.sents()

        msg("done!\n")
예제 #11
0
    def pos_tags(self):
        """
        Create a list of all POS tags found in the corpus.

        :return: list of unique POS tags, in order of first appearance
        """

        msg("Getting POS tag list...")
        tags = []
        seen = set()  # O(1) membership test; 'pos not in tags' was an
                      # O(n) list scan, making the whole pass quadratic

        # loop through sentences
        for sent in self.tagged_sents:

            # loop through tagged words
            for (word, pos) in sent:

                # record each tag the first time it appears, preserving
                # first-seen order in the returned list
                if pos not in seen:
                    seen.add(pos)
                    tags.append(pos)

        msg("done\n")

        return tags
예제 #12
0
    def pos_tags(self):
        """
        Create a list of all POS tags found in the corpus.

        :return: list of unique POS tags, in order of first appearance
        """

        msg("Getting POS tag list...")
        tags = []
        seen = set()  # set gives O(1) lookups; testing membership against
                      # the growing list made this loop O(n^2)

        # loop through sentences
        for sent in self.tagged_sents:

            # loop through tagged words
            for (word, pos) in sent:

                # add tag if it's not already recorded, keeping the list
                # in first-seen order
                if pos not in seen:
                    seen.add(pos)
                    tags.append(pos)

        msg("done\n")

        return tags
예제 #13
0
    def train(self, sents):
        """
        Train the tagger on a set of tagged sentences.

        :param sents: list of tagged sentences
        """

        # the tag inventory comes from the treebank we train on
        self.pos_tags = self.tb.pos_tags()

        # prepend start markers so bigram tagging has a left context
        msg("Adjusting POS tags...")
        sents = self._adjust_pos(sents)
        msg("done\n")

        # Build two conditional frequency distributions (NLTK) recording
        # how often each word carries each POS: one keyed on the
        # lowercase-normalized word, one on the word exactly as written.
        msg("Training (Wi|Ck)...")

        self.words_given_pos = ConditionalFreqDist(
            (tagged[1], tagged[0].lower())
            for sentence in sents
            for tagged in sentence)

        self.words_given_pos_upper = ConditionalFreqDist(
            (tagged[1], tagged[0])
            for sentence in sents
            for tagged in sentence)
        msg("done\n")

        # a third CFD records how often one POS follows another
        msg("Training (Ci+1|Ci)...")
        self.pos2_given_pos1 = ConditionalFreqDist(
            (sentence[k - 1][1], sentence[k][1])
            for sentence in sents
            for k in range(1, len(sentence)))

        msg("done\n")
예제 #14
0
 def inspect(self, missed):
     """
     Inspect a testing session, and print data about tag accuracy
     
     :param missed: list of tuples of missed tags like:
         (hmm_tagged_word, gold_tagged_word, hmm_context, gold_context)
     """
     
     # create a CFD so we can examine a matrix of incorrect vs correct tags
     # ms[1][1] = tag of a gold_tagged_word
     # ms[0][1] = tag of an hmm_tagged_word
     cfd = ConditionalFreqDist((ms[1][1], ms[0][1]) for ms in missed)
     
     # initialize a hash to store mistakes by frequency
     # (maps occurrence count -> list of (hmm_tag, gold_tag) pairs)
     mistakes = {}
     
     # print a table showing mistake frequency
     cfd.tabulate()
     msg("\n")
     
     # loop through mistake frequencies by gold standard tag, i.e., if we are
     # examining gold-standard 'IN', count what we incorrectly tagged it as
     conds = cfd.conditions()
     for g_tag in conds:
         for hmm_tag in cfd[g_tag].keys():
             # how many times did we incorrectly say g_tag was hmm_tag?
             count = cfd[g_tag][hmm_tag]
             
             # add these mistakes to the count
             if count not in mistakes.keys():
                 mistakes[count] = []
             mistakes[count].append((hmm_tag, g_tag))
             
     # get a list of all mistake types that occurred over a threshold, worst first
     # NOTE: iteritems() marks this as Python 2 code (items() under Python 3)
     mistake_counts = set([count for (count, mistake_set) in \
         mistakes.iteritems() if count > Tagger.mistake_threshold])
     # highest counts first, so the worst mistakes are listed before rarer ones
     mistake_counts = reversed(sorted(mistake_counts))
     
     # now create a list of mistake types to show the user, i.e., loop 
     # through all types and if they are of a high-frequency type, add to list
     mistakes_to_halt = []
     for count in mistake_counts:
         mistake_set = mistakes[count]
         for mistake_tuple in mistake_set:
             mistakes_to_halt.append(mistake_tuple)
             msg("%d\t%s\twas really\t%s\n" % (count, mistake_tuple[0], \
                 mistake_tuple[1]))
     msg("\n")
     
     # create separators used when outputting missed word contexts
     sep_big = "---------------------------------------------------\n"
     sep_small = "\n-----------------------------------------\n"
     
     # loop through individual mistakes and, if they match the kind of error
     # we want to halt for, show the user the mistake as well as the sentence
     # context for both the gold-standard sentence and the hmm-tagged sentence
     # NOTE: a 'q'/'Q' response does not break out of the loop; the remaining
     # items are still iterated, just skipped one at a time
     response = None
     for missed_set in missed:
         if response not in ['q','Q']:
             (hmm_tagged_word, gold_tagged_word, hmm_tagged_sent, \
                 gold_tagged_sent) = missed_set
             should_halt = False
             # determine whether the current mistake matches a mistake type
             # we want to halt for
             for pair in mistakes_to_halt:
                 if hmm_tagged_word[1] == pair[0] and \
                     gold_tagged_word[1] == pair[1]:
                     should_halt = True
             if should_halt:
                 msg("%sTagged '%s' with %s when it should have been %s.%s" %\
                 (sep_big, hmm_tagged_word[0], hmm_tagged_word[1],\
                     gold_tagged_word[1], sep_small))
                 
                 # show both sentences as word/TAG tokens for comparison
                 msg("Gold: " + (' '.join([(w[0] + "/" + w[1]) for w in \
                     gold_tagged_sent])))
                 msg(sep_small)
                 msg("Mine: " + (' '.join([(w[0] + "/" + w[1]) for w in \
                     hmm_tagged_sent])))
                 
                 # get user input to decide whether to keep going
                 # (raw_input is Python 2; input() under Python 3)
                 response = raw_input("\n\nEnter to continue, Q to quit: ")
예제 #15
0
    def train(self, sents):
        """
        Train the tagger on a set of tagged sentences.

        :param sents: list of tagged sentences
        """

        # the tag inventory comes from the training treebank
        self.pos_tags = self.training.pos_tags()

        # prepend start markers so bigram tagging has a left context
        msg("Adjusting POS tags...")
        sents = self._adjust_pos(sents)
        msg("done\n")

        # Two conditional frequency distributions (NLTK) capture how often
        # a word is observed with a given POS — one normalized to
        # lowercase, one keeping the original capitalization.
        msg("Training (Wi|Ck)...")

        self.words_given_pos = ConditionalFreqDist(
            (pair[1], pair[0].lower())
            for sen in sents
            for pair in sen)

        self.words_given_pos_upper = ConditionalFreqDist(
            (pair[1], pair[0])
            for sen in sents
            for pair in sen)
        msg("done\n")

        # a further CFD captures observed POS-to-POS transitions
        msg("Training (Ci+1|Ci)...")
        self.pos2_given_pos1 = ConditionalFreqDist(
            (sen[j - 1][1], sen[j][1])
            for sen in sents
            for j in range(1, len(sen)))

        msg("done\n")
예제 #16
0
    def run_test_cycles(self):
        """
        Run the test cycles for training and testing the tagger.
        Specifically, employ ten-fold cross-validation to train/test on different
        segments of the corpus.
        """

        total_time_start = time.time()  # keep track of time
        pct_step = int(100 / Tagger.test_cycles)  # cycle steps in pct
        test_pct = pct_step  # percentage of the corpus to test the tagger on
        train_pct = 100 - test_pct  # percentage of the corpus to train the tagger on
        rights = [
        ]  # array to hold number of correctly-tagged words for each test
        wrongs = [
        ]  # array to hold number of incorrectly-tagged words for each test
        totals = []  # array to hold number of total words for each test
        all_missed = [
        ]  # array to hold incorrect tag information for each test
        sep = ''.join(["-" for i in range(50)]) + "\n"  # logging separator

        # loop from 0-90 (step size 10)
        for start_train_pct in [
                x * pct_step for x in range(Tagger.test_cycles)
        ]:
            msg("%sSTARTING TEST CYCLE %d\n%s" % (sep, (start_train_pct/pct_step)+1,\
                sep))

            # find the percent point to start collecting test sentences
            # may be > 100, so circle round
            start_test_pct = (start_train_pct + train_pct) % 100

            # train the tagger on sentences from the corpus matching our range
            training_sents = self.tb.training_sents(train_pct, start_train_pct)
            self.train(training_sents)

            # test the tagger on the rest of the sentences
            testing_sents = self.tb.testing_sents(test_pct, start_test_pct)
            (right, wrong, missed) = self.test(testing_sents)

            # gather accuracy statistics for this test
            total = right + wrong
            rights.append(right)  # store the correct count for this test cycle
            wrongs.append(
                wrong)  # store the incorrect count for this test cycle
            totals.append(
                total)  # store the total words tested for this test cycle
            all_missed += missed  # add incorrect tag information from this cycle

            msg("Total words: %d\n" % total)
            msg("Correct tags: %d (%0.2f%%)\n" % (right, right / total * 100))
            msg("Incorrect tags: %d (%0.2f%%)\n" %
                (wrong, wrong / total * 100))
        # end: test cycle

        msg("%s%s" % (sep, sep))

        # calculate and output statistics for the entire test
        print "Total tests run: %d" % len(totals)
        print "Total time taken: %0.2f seconds" % (time.time() -
                                                   total_time_start)
        print "Average correct tags: %0.2f%%" % (sum(rights) / sum(totals) *
                                                 100)
        print "Average incorrect tags: %0.2f%%" % (sum(wrongs) / sum(totals) *
                                                   100)
        print

        # give the option of inspecting incorrect tags
        if raw_input("Examine bad tags? ") in ['y', 'Y']:
            self.inspect(all_missed)
예제 #17
0
    def inspect(self, missed):
        """
        Inspect a testing session, and print data about tag accuracy
        
        :param missed: list of tuples of missed tags like:
            (hmm_tagged_word, gold_tagged_word, hmm_context, gold_context)
        """

        # create a CFD so we can examine a matrix of incorrect vs correct tags
        # ms[1][1] = tag of a gold_tagged_word
        # ms[0][1] = tag of an hmm_tagged_word
        cfd = ConditionalFreqDist((ms[1][1], ms[0][1]) for ms in missed)

        # initialize a hash to store mistakes by frequency
        # (maps occurrence count -> list of (hmm_tag, gold_tag) pairs)
        mistakes = {}

        # print a table showing mistake frequency
        cfd.tabulate()
        msg("\n")

        # loop through mistake frequencies by gold standard tag, i.e., if we are
        # examining gold-standard 'IN', count what we incorrectly tagged it as
        conds = cfd.conditions()
        for g_tag in conds:
            for hmm_tag in cfd[g_tag].keys():
                # how many times did we incorrectly say g_tag was hmm_tag?
                count = cfd[g_tag][hmm_tag]

                # add these mistakes to the count
                if count not in mistakes.keys():
                    mistakes[count] = []
                mistakes[count].append((hmm_tag, g_tag))

        # get a list of all mistake types that occurred over a threshold, worst first
        # NOTE: iteritems() marks this as Python 2 code (items() under Python 3)
        mistake_counts = set([count for (count, mistake_set) in \
            mistakes.iteritems() if count > Tagger.mistake_threshold])
        # highest counts first, so the worst mistakes are listed before rarer ones
        mistake_counts = reversed(sorted(mistake_counts))

        # now create a list of mistake types to show the user, i.e., loop
        # through all types and if they are of a high-frequency type, add to list
        mistakes_to_halt = []
        for count in mistake_counts:
            mistake_set = mistakes[count]
            for mistake_tuple in mistake_set:
                mistakes_to_halt.append(mistake_tuple)
                msg("%d\t%s\twas really\t%s\n" % (count, mistake_tuple[0], \
                    mistake_tuple[1]))
        msg("\n")

        # create separators used when outputting missed word contexts
        sep_big = "---------------------------------------------------\n"
        sep_small = "\n-----------------------------------------\n"

        # loop through individual mistakes and, if they match the kind of error
        # we want to halt for, show the user the mistake as well as the sentence
        # context for both the gold-standard sentence and the hmm-tagged sentence
        # NOTE: a 'q'/'Q' response does not break out of the loop; the remaining
        # items are still iterated, just skipped one at a time
        response = None
        for missed_set in missed:
            if response not in ['q', 'Q']:
                (hmm_tagged_word, gold_tagged_word, hmm_tagged_sent, \
                    gold_tagged_sent) = missed_set
                should_halt = False
                # determine whether the current mistake matches a mistake type
                # we want to halt for
                for pair in mistakes_to_halt:
                    if hmm_tagged_word[1] == pair[0] and \
                        gold_tagged_word[1] == pair[1]:
                        should_halt = True
                if should_halt:
                    msg("%sTagged '%s' with %s when it should have been %s.%s" %\
                    (sep_big, hmm_tagged_word[0], hmm_tagged_word[1],\
                        gold_tagged_word[1], sep_small))

                    # show both sentences as word/TAG tokens for comparison
                    msg("Gold: " + (' '.join([(w[0] + "/" + w[1]) for w in \
                        gold_tagged_sent])))
                    msg(sep_small)
                    msg("Mine: " + (' '.join([(w[0] + "/" + w[1]) for w in \
                        hmm_tagged_sent])))

                    # get user input to decide whether to keep going
                    # (raw_input is Python 2; input() under Python 3)
                    response = raw_input("\n\nEnter to continue, Q to quit: ")