Python column Examples, Column.column Python Examples

Example #1

0

Show file

File: test_lts_learner.py Project: aanavas/s2project

def TestRuleSystem (lts_learner, lts_rules, verbose=False):
    """Measure prediction accuracy of a rule set over the alignment files in prog_args.

    For every pathname in prog_args that exists on disk, a Measurer is
    initialised from it and MeasurePredictionAccuracy (lts_learner, lts_rules)
    is called; the per-file results are then summed.

    lts_learner, lts_rules -- passed straight through to MeasurePredictionAccuracy.
    verbose                -- when true, print the read timings and the totals.

    Returns ((words_correct, words_tested), (chars_correct, chars_tested)).
    All four numbers are 0 when no alignment file could be measured.

    Note: prog_args and threshold_value are not defined in this function;
    they have to be available from the enclosing module.
    """
    all_results = []

    for alignment_pathname in prog_args:
        if not os.path.exists (alignment_pathname):
            continue

        t1 = time.time()
        lts_measurer = Measurer()
        lts_measurer.InitWithAlignments (alignment_pathname)
        if verbose:
            print ('Reading alignments from %s in %3.2f s' %(alignment_pathname, time.time()-t1))

        all_results.append (lts_measurer.MeasurePredictionAccuracy (lts_learner, lts_rules, verbose=True))

    # Start from zero so that the return statement below is well defined even
    # when nothing was measured (previously this raised UnboundLocalError).
    total_words_correct = 0
    total_words_tested  = 0
    total_chars_correct = 0
    total_chars_tested  = 0

    if all_results:
        word_results = column (all_results)
        char_results = column (all_results, 1)

        total_words_correct  = sum (column (word_results))
        total_words_tested   = sum (column (word_results, 1))
        total_chars_correct  = sum (column (char_results))
        total_chars_tested   = sum (column (char_results, 1))

        # max (1, ...) guards the percentages against an empty test set
        word_percent_correct = 100.0 * total_words_correct / max (1, total_words_tested)
        char_percent_correct = 100.0 * total_chars_correct / max (1, total_chars_tested)

        if verbose:
            print (' Threshold %i' %(threshold_value))
            print (' Total words correct: %6.3f  %8i %8i' %(word_percent_correct, total_words_correct, total_words_tested))
            print (' Total chars correct: %6.3f  %8i %8i' %(char_percent_correct, total_chars_correct, total_chars_tested))
            print ('--------------------\n')

    return (total_words_correct, total_words_tested), (total_chars_correct, total_chars_tested)

Example #2

0

Show file

File: LTS_IO.py Project: aanavas/s2project

    def WriteOutLetterToSoundSolutions (outfile, word_counter, charseq, phoneseq, solutions, include_failures=False):
        """Write one text line per letter-to-sound solution of a word.

        outfile          -- object with a write() method.
        word_counter     -- running word number printed at the start of each line.
        charseq          -- sequence of the word's characters (joined without spaces).
        phoneseq         -- sequence of the word's phones (joined with spaces).
        solutions        -- sequence of (score, soln) pairs, where soln is a sequence
                            of (source, target, score) production steps.
        include_failures -- when true and there are no solutions, write a FAILED line.

        Nothing is written when solutions is empty and include_failures is false.
        """
        word = ''.join (charseq)
        pron = ' '.join (phoneseq)

        if solutions:
            for cnt, (score, soln) in enumerate (solutions):
                # sum() copes with a solution that has no steps; reduce() without
                # an initial value raised TypeError in that case.
                total_score = sum ([step[2] for step in soln])

                outfile.write ('  %5i.  Soln %i %5.0f  %20s : %-40s |'
                    %(word_counter, cnt+1, total_score, word, pron))

                for (source, target, step_score) in soln:
                    # Count the digits that '[%i]' will print.  The former
                    # int (math.log (x, 10.0)) is off by one at exact powers of
                    # ten (e.g. 1000) because of floating point rounding.
                    num_chars = len ('%i' % max (1, step_score))
                    spacer = ' ' * max (0, (5 - num_chars))
                    if len(target) == 0: target = ('_')     # show an empty production as '_'

                    outfile.write (' %2s [%i] %s -> %-6s'
                        %(''.join (source),
                          step_score, spacer,
                          ' '.join (target) ))

                outfile.write('\n')

        elif include_failures:
            outfile.write ('  %5i.  FAILED  %5s %20s : %-40s |' %(word_counter, '', word, pron))
            outfile.write (' %2s -> %-5s\n' %(word, ''))

Example #3

0

Show file

File: test_lts_learner.py Project: aanavas/s2project

def ComputeAccuracyVsRulesCurve (lts_learner, max_threshold = 9999999999):
    """Write one Festival rule file per distinct rule count, then exit the program.

    The distinct values returned by column (lts_learner.GetRulesByCount()) are
    sorted and used as thresholds.  For each one the rules are thresholded,
    a summary row is printed and the rules are written to
    'rules/lexrules_<zero-padded rule count>.scm'.

    NOTE(review): the unconditional sys.exit() after the first loop means
    everything below it (the accuracy-versus-rules table, and therefore the
    max_threshold parameter) is never executed.  This looks like a debugging
    stop that was left in -- confirm before relying on the second half.
    """

    # distinct thresholds, in ascending order
    rule_counts_list = list (set (column (lts_learner.GetRulesByCount())))
    rule_counts_list.sort()

    
    print '%6s %8s %8s' %('Thresh', 'Rules', 'Chars')
    for thr in rule_counts_list:
        lts_rules, num_lts_rules, num_lts_chars = lts_learner.ThresholdRules (thr)
        # the file name carries the number of rules, zero padded to 5 digits
        filename = 'rules/lexrules_%s.scm' %(string.zfill(num_lts_rules,5))
        print '%6i %8i %8i' %(thr, num_lts_rules, num_lts_chars)
        lts_learner.WriteOutFestivalRules (filename, 'cmu_us', lts_rules)
    sys.exit()      # terminates the process; the code below is unreachable
        
        
        #new_lts_rules, num_lts_rules, num_lts_chars = lts_learner.ThresholdRules (3)
        #lts_learner.WriteOutFestivalRules ('afile', new_lts_rules)
     
 
    print '%4s %6s %6s %8s %8s' %('Num', 'Thresh', 'Rules', 'Words', 'Chars')
        
    for i, threshold in enumerate (rule_counts_list):
        if threshold > max_threshold: continue
        lts_rules, num_lts_rules, num_lts_chars = lts_learner.ThresholdRules (threshold)
            
        # each result is a (correct, tested) pair
        word_perf, char_perf = TestRuleSystem (lts_learner, lts_rules)
        # NOTE(review): no guard against a zero 'tested' count here
        word_percent_correct = 100.0 * word_perf[0] / word_perf[1]
        char_percent_correct = 100.0 * char_perf[0] / char_perf[1]
            
        print '%4i %6i %6i %8.3f %8.3f  '  \
            %(i+1, threshold, num_lts_rules, word_percent_correct, char_percent_correct), \
              word_perf, char_perf
    print

Example #4

0

Show file

File: LTS_RuleLearnerSrvr.py Project: aanavas/s2project

def LaunchServer():    
    """Start-up code of the server: clear the launch notice and parse the command line.

    As far as this excerpt shows, the function removes the file named by
    Launch_Notice_Filename if it exists, then parses sys.argv[1:] with getopt
    using the long options listed in flag_list.  On a getopt error it prints
    the message and exits.  (The excerpt appears to stop here; the code that
    uses option_tbl and prog_args is not visible.)
    """
    # Delete the launch notification file if it happens to be hanging around
    # from a previous run (it shouldn't but could if the server was killed.)
        
    if os.path.exists (Launch_Notice_Filename):
        os.remove (Launch_Notice_Filename)


    # Define flag list and get command line option.     
    # Note: either --festdict or --lexlist is required  

    # (long option name, help text); a trailing '=' is getopt's marker for an
    # option that takes an argument
    flag_list = [('help',        'this command'),
                 ('port=',       'IP port number for communication with xml server (default 8000)'),
                 ('workdir=',    'specifies the working directory to use (defaults to current)'),
                 ('phoneset=',   'filename of list of phonemes'),
                 ('prompts=',    'filename of festival-format prompt list'),   
                 ('festdict=',   'filename of festival-format pronunciation dictionary'),
                 ('lexlist=',    'filename of word list with word frequency counts'),
                 ('ignore=',     'filename of festival-format list of words to ignore (eg. those lexlearned previously)'),
                 ('allowables=', 'filename of (partial) list of LTS allowables (optional)'),
                 ('newpronuns=', 'filename of existing list of word pronunciations (optional)'),
                 ('rules=',      'filename of a pickled rule set that has previously been learned')]

    try:
        # column (flag_list) supplies the long option names; no short options are accepted
        opt_list, prog_args = getopt.getopt (sys.argv[1:], '', column(flag_list))
        # maps '--name' -> argument string
        option_tbl = dict (opt_list)
    except getopt.GetoptError, msg:
        print 'Error', msg
        sys.exit('      type --help for program options\n')

Example #5

0

Show file

File: LTS_RuleSystem.py Project: aanavas/s2project

    def PredictMultipleWordPronuns (self, given_charseq, verbose=False):
        """Return the predicted pronunciation of a word followed by its alternatives.

        given_charseq -- the word, passed unchanged to self.PredictOneWordPronun.
        verbose       -- when true, print every enumerated solution.

        self.PredictOneWordPronun must return (pred_phoneseq, prediction), where
        prediction holds one (letter, best_phoneseq, all_phoneseqs) entry per
        letter and all_phoneseqs is a sequence of (phoneseq, score, rule_num).

        Returns a list of pronunciation strings (phones separated by spaces).
        The first element is pred_phoneseq; the rest are the enumerated
        solutions in ascending score order, with the lowest scoring one left
        out because the predicted pronunciation takes its place.
        """

        # ----------------------------------------------------------------------------------------------
        def FindAllSolutions (prediction):
            """Enumerate every combination of per-letter productions as (pronun, score)."""

            all_solutions = []

            def DescendOneLevel (curr_solution, level):
                # Every letter has a production: record a copy of the current path.
                # (No pop here, so an empty prediction no longer raises IndexError.)
                if level >= len (prediction):
                    all_solutions.append (curr_solution[:])
                    return

                letter, best_phoneseq, all_phoneseqs = prediction [level]

                # plain backtracking: extend, recurse, undo
                for one_phoneseq, score, rule_num in all_phoneseqs:
                    curr_solution.append ((one_phoneseq, score))
                    DescendOneLevel (curr_solution, level+1)
                    curr_solution.pop()

            DescendOneLevel ([], 0)

            pronun_list = []
            for one_soln in all_solutions:
                # letters with an empty production contribute no phones
                phoneseq = [' '.join (rhs) for rhs, _ in one_soln if rhs]
                score    = sum ([rhs_score for _, rhs_score in one_soln])
                pronun_list.append ((score, ' '.join (phoneseq)))

            # sort words by score in _ascending_ order because lower counts
            # represent more specialized conditions identified by letter context.

            pronun_list.sort()
            return [(pronun, count) for (count, pronun) in pronun_list]


        pred_phoneseq, prediction = self.PredictOneWordPronun (given_charseq)
        all_solutions = FindAllSolutions (prediction)

        if verbose and len (all_solutions) > 0:
            for soln in all_solutions:
                print ('Word soln: %s' % (soln,))
            print ('')

        # do this because FindAllSolutions doesn't get pronunciations
        # when some of the letters go to '?'

        answer = [' '.join (pred_phoneseq)] + [pronun for pronun, _ in all_solutions[1:]]
        return answer

Example #6

0

Show file

File: LTS_IO.py Project: aanavas/s2project

def WeightedAve (given_list):
    """Return the count-weighted average of the values in given_list.

    given_list -- sequence of items whose element [0] is a count (the weight)
                  and whose element [1] is the value; any further elements
                  are ignored.

    Returns 0.0 for an empty list and also when the counts add up to zero,
    which used to raise ZeroDivisionError.
    """
    total_count = float (sum ([item_info[0] for item_info in given_list]))
    if total_count == 0.0:
        return 0.0

    answer = 0.0
    for item_info in given_list:
        item_weight = float (item_info[0]) / total_count
        answer += item_weight * item_info[1]
    return answer

Example #7

0

Show file

File: LTS_RuleSystem.py Project: aanavas/s2project

        def FindAllSolutions (prediction):
            """Enumerate every combination of per-letter productions.

            prediction -- sequence with one (letter, best_phoneseq, all_phoneseqs)
                          entry per letter; all_phoneseqs is a sequence of
                          (phoneseq, score, rule_num).

            Returns a list of (pronunciation string, total score) pairs sorted
            by ascending score.  An empty prediction yields [('', 0)].
            """
            all_solutions = []

            def DescendOneLevel (curr_solution, level):
                # Every letter has a production: record a copy of the current path.
                # (No pop here, so an empty prediction no longer raises IndexError.)
                if level >= len (prediction):
                    all_solutions.append (curr_solution[:])
                    return

                letter, best_phoneseq, all_phoneseqs = prediction [level]

                # plain backtracking: extend, recurse, undo
                for one_phoneseq, score, rule_num in all_phoneseqs:
                    curr_solution.append ((one_phoneseq, score))
                    DescendOneLevel (curr_solution, level+1)
                    curr_solution.pop()

            DescendOneLevel ([], 0)

            pronun_list = []
            for one_soln in all_solutions:
                # letters with an empty production contribute no phones
                phoneseq = [' '.join (rhs) for rhs, _ in one_soln if rhs]
                score    = sum ([rhs_score for _, rhs_score in one_soln])
                pronun_list.append ((score, ' '.join (phoneseq)))

            # sort words by score in _ascending_ order because lower counts
            # represent more specialized conditions identified by letter context.

            pronun_list.sort()
            answer = [(pronun, count) for (count, pronun) in pronun_list]

            return answer

Example #8

0

Show file

File: test_lts_learner.py Project: aanavas/s2project

        alignment_file_pathname = option_tbl.get('--align','')
 
        #for alignment_pathname in prog_args:
        for alignment_pathname in [alignment_file_pathname]:
            if os.path.exists (alignment_pathname):
                t1 = time.time()
                lts_measurer = Measurer()
                lts_measurer.InitWithAlignments (alignment_pathname)
                print 'Reading alignments from %s in %3.2f s\n' %(alignment_pathname, time.time()-t1)
                #all_results.append (TestRuleLearner (lts_learner, lts_measurer))
                    
        lts_measurer.WriteWagonTrainingData('wagon.data')
            
                     
        if len (all_results) > 1:
            total_words_correct  = sum (column (column (all_results)))
            total_words_tested   = sum (column (column (all_results), 1))
            total_chars_correct  = sum (column (column (all_results,1)))
            total_chars_tested   = sum (column (column (all_results,1), 1))
            word_percent_correct = 100.0 * total_words_correct / max (1, total_words_tested)
            char_percent_correct = 100.0 * total_chars_correct / max (1, total_chars_tested)
                 
            print ' Total words correct: %6.3f  %8i %8i' %(word_percent_correct, total_words_correct, total_words_tested)
            print ' Total chars correct: %6.3f  %8i %8i' %(char_percent_correct, total_chars_correct, total_chars_tested)
            print '--------------------\n'



    # --measure_curve
    elif option_tbl.has_key ('--measure_curve') and option_tbl.has_key ('--rules'):
        ComputeAccuracyVsRulesCurve (lts_learner)

Example #9

0

Show file

File: convert_dictionary_format.py Project: aanavas/s2project

from Column import column


# ==============
# Mainline code.
# ==============

if __name__ == "__main__":
    # (long option name, help text); a trailing "=" is getopt's marker for an
    # option that takes an argument.
    flag_list = [
        ("help", "this command"),
        ("festdict=", "filename of festival-format pronunciation dictionary"),
        ("janusdict=", "filename of janus-format pronunciation dictionary"),
    ]

    try:
        # column(flag_list) supplies the long option names; no short options are accepted.
        opt_list, prog_args = getopt.getopt(sys.argv[1:], "", column(flag_list))
        # Maps "--name" -> argument string.
        option_tbl = dict(opt_list)
    except getopt.GetoptError, msg:
        print "Error", msg
        sys.exit("      type --help for program options\n")

    # Print out the options if requested.
    # Running the script without any argument is treated like --help.

    if option_tbl.has_key("--help") or len(sys.argv) == 1:
        print "python", sys.argv[0]
        for option_flag, description in flag_list:
            print "%12s %s" % (option_flag, description)
        print
        sys.exit()

    # Read in the festival format dictionary and convert it to janus format.

Example #10

0

Show file

File: measure_word_coverage.py Project: aanavas/s2project

    uncovered_words = {}

    for prompt_name, prompt in sorted(recorded_prompt_tbl.items()):
        word_list = prompt.split()
        total_prompt_tokens += len(word_list)

        for raw_word in word_list:
            word = DictionaryIO.TrimExternalPunctuation(raw_word).lower()
            total_prompt_words.add(word)

            if word in words_with_pronuns:
                prompt_tokens_covered += 1
            else:
                uncovered_words[word] = uncovered_words.get(word, 0) + 1

    total_token_count = sum(column(word_freq_list, 2))
    covered_prompt_words = len(total_prompt_words) - len(uncovered_words)
    corpus_word_percent = 100.0 * len(word_pronun_counts) / float(max(1, (len(word_freq_list))))
    corpus_token_percent = 100.0 * sum(word_pronun_counts.values()) / float(max(1, total_token_count))
    prompt_token_percent = 100.0 * prompt_tokens_covered / float(max(1, total_prompt_tokens))
    prompt_word_percent = 100.0 * covered_prompt_words / float(max(1, len(total_prompt_words)))

    print "  corpus word coverage: %6i / %-6i (%3.2f)" % (
        len(word_pronun_counts),
        len(word_freq_list),
        corpus_word_percent,
    )
    print " corpus token coverage: %6i / %-6i (%3.2f)" % (
        sum(word_pronun_counts.values()),
        total_token_count,
        corpus_token_percent,

Example #11

0

Show file

File: DictionaryIO.py Project: aanavas/s2project

        print 'Warning: %s is not a utf-8 file, trying again as iso-8859' %(dict_pathname)
        infile.close()
        infile = codecs.open (dict_pathname, 'r', 'iso8859')
        ReadFile (infile)    
    infile.close()
        
    # convert the word count table to a list
                
    word_freq_list = []
    for word in sorted (word_freq_table):
        word_freq_list.append ([word, '', word_freq_table[word]])

    # then either extract only the words or sort the contents by frequency

    if words_only:
        return column (word_freq_list, 0)
            
    elif sorted_by_count:
        sorted_by_count = sorted ([(v, k) for k, v in word_freq_table.items()], reverse=True)
        sorted_by_count = [[v, '', k] for k, v in sorted_by_count]
        return sorted_by_count
            
    else:        
        return word_freq_list            

pass


# --------------------------------------------------------------------------------------------------

Example #12

0

Show file

File: DictionaryIO.py Project: aanavas/s2project

def ReadFestivalDictionary (dict_pathname,
                            words_as_charseq   = False,
                            words_only         = False, 
                            reverse_direction  = False, 
                            character_encoding = 'auto'):
    """Read a festival-format pronunciation dictionary.

    dict_pathname      -- path of the dictionary file.
    words_as_charseq   -- return each word as a tuple of letters instead of a string.
    words_only         -- return only column 0 of the entries (via column()).
    reverse_direction  -- store entries as [phones, word, 1] instead of [word, phones, 1].
    character_encoding -- codec name, or 'auto' to choose between 'ascii' and
                          'utf-8' by scanning the file for non-printable characters.

    Two line layouts are recognised after the outermost pair of characters
    has been stripped: the word given as a quoted string (one '(' left) and
    the word given as a parenthesised letter sequence (two '(' left).  Lines
    that do not yield a pronunciation, or whose word cannot be decoded, are
    skipped.  Only the first entry seen for a word is kept.

    Returns a list of [word, phones, 1] entries (phones is a tuple), or the
    result of column (entries, 0) when words_only is true.
    """

    # --------------------------------------------------------------------------
    def DetermineEncoding():
        # Any character outside string.printable is taken as a sign of utf-8.
        infile = file (dict_pathname, 'r')
        try:
            for line in infile:
                for ch in line:
                    if ch not in string.printable:
                        return 'utf-8'
            return 'ascii'
        finally:
            # close on both return paths (the 'ascii' path used to leak the handle)
            infile.close()


    if character_encoding == 'auto':
        character_encoding = DetermineEncoding()

    if character_encoding != 'ascii':
        dec = codecs.getdecoder (character_encoding)
        enc = codecs.getencoder (character_encoding)

    word_pronun_list  = []
    word_pronun_table = {}

    # parentheses and double quotes are turned into blanks before splitting
    char_mapping = string.maketrans ('()"','   ')

    infile = file (dict_pathname, 'r')
    try:
        rawlines = infile.readlines()
    finally:
        infile.close()


    for rawline in rawlines:
        # drop the first and last character of the stripped line (the outer brackets)
        line = string.strip (rawline.strip()[1:-1])
        if not line: continue

        num_left_parens = line.count('(')
        pronun_part = ''

        # Case 1. the word is given as a string

        if num_left_parens == 1:
            quote_pos = string.rfind (line, '"') + 2
            word_part = string.translate (line[:quote_pos], char_mapping)
            rest_part = string.translate (line[quote_pos:], char_mapping)
            fields    = string.split (rest_part, maxsplit=1)

            if len(fields) >= 2:
                if character_encoding == 'ascii':
                    word_letters = list (string.strip(word_part))
                else:
                    try:
                        word_unicode = dec  (string.strip(word_part))
                        word_letters = list (word_unicode[0])
                        assert len(word_letters) == enc(word_unicode[0])[1]
                    except (UnicodeError, AssertionError):
                        # the word cannot be represented in this encoding: skip the entry
                        continue

                # fields[0] is the annotation, which is not used
                pronun_part = fields[1]

        # Case 2. the word given as a character sequence
        elif num_left_parens == 2:
            pos1 = line.find ('(')
            pos2 = line.find (')')
            pos3 = line.rfind ('(')
            pos4 = line.rfind (')')

            letter_part  = line[pos1:pos2].translate (char_mapping)
            pronun_part  = line[pos3:pos4].translate (char_mapping)
            word_letters = letter_part.split()

        if not pronun_part: continue


        # check for duplicates before adding words to the pronunciation list

        word_string  = string.join (word_letters,'')
        word_phones  = tuple (pronun_part.split())
        word_letters = tuple (word_letters)

        if words_as_charseq:
            word = word_letters
        else:
            word = word_string

        # This adds a default word count of 1, and allows only unique words

        if word not in word_pronun_table:
            if not reverse_direction:
                word_pronun_list.append ([word, word_phones, 1])
            else:
                word_pronun_list.append ([word_phones, word, 1])
            word_pronun_table [word] = True


    if words_only:
        return column (word_pronun_list, 0)
    else:        
        return word_pronun_list

Example #13

0

Show file

File: PronunciationOracle.py Project: aanavas/s2project

 def GetFullWordsOnlyList (self):
     """Return column() applied to self.word_pronuns_list (one item per stored entry)."""
     word_entries = self.word_pronuns_list
     words_only = column (word_entries)
     return words_only

Example #14

0

Show file

File: LTS_IO.py Project: aanavas/s2project

    def PrintCondensedList (production_list):
        """Write the LTS productions grouped by left-hand side and return the grouping.

        Output goes to outfile; sorted_production_list, phone_name_converter and
        utf are likewise taken from the enclosing scope.

        Returns a dict mapping each lhs to a list of (rhs, usage_count) pairs.

        NOTE(review): the production_list parameter is never read; the body
        iterates sorted_production_list instead -- confirm which was intended.
        """
        
        # create table that maps lhs -> all rhs productions, with counts

        lts_productions = {}
            
        for cnt, (usage_count, production) in enumerate (sorted_production_list):
            lhs = production[0]
            rhs = production[1]    
            lts_productions [lhs] = lts_productions.get(lhs,[]) + [(rhs, usage_count)]
                

        # write out the results alphabetically
            
        outfile.write ('LTS Productions organized by letter:\n')

        lhs_keys = lts_productions.keys()
        lhs_keys.sort (cmp = LatinCharacterSet.OrderLetters)


        # First pass.
        # Collect (total count, perplexity, display width) per lhs so that the
        # column width is known before anything is written.
        production_info = []

        for i, lhs in enumerate (lhs_keys):
            rhs_productions = lts_productions[lhs]
            rhs_prod_counts = column (rhs_productions,1)
            rhs_prod_list   = column (rhs_productions,0)    
            rhs_display_str = ConvertPhoneSeqToString (rhs_prod_list, phone_name_converter)
            num_productions = sum (rhs_prod_counts)
            entropy = TinyStats.ComputeEntropyFromSymbolCounts (rhs_productions)
            perplexity = math.pow (2.0, entropy)   
            production_info.append ((num_productions, perplexity, len(rhs_display_str)) )

        # max() fails on an empty list (no productions at all); fall back to width 1
        try:
            max_rhs_string_length = max (column (production_info,2))
        except:        
            max_rhs_string_length = 1
             
        # substitute the computed width for the 'X' placeholder
        formatting_string = string.replace ('%4i. %6i %6.3f %4s -> %-Xs', 'X', str(max_rhs_string_length+2))


        # Second pass.
        # Write one line per lhs: the formatted summary followed by the
        # individual production counts.
        for i, lhs in enumerate (lhs_keys):
            rhs_productions = lts_productions[lhs]
            rhs_prod_counts = column (rhs_productions,1)
            rhs_prod_list   = column (rhs_productions,0)    
            rhs_display_str = ConvertPhoneSeqToString (rhs_prod_list, phone_name_converter)
                
            num_productions, perplexity = production_info[i][:2]
            # NOTE(review): the index is written here and again by
            # formatting_string below, so it shows up twice on each line.
            outfile.write ('%4i' %(i+1))    
            
            try:
                outfile.write (formatting_string %(i+1, num_productions, perplexity, lhs, rhs_display_str))
            except UnicodeEncodeError:
                # retry with the first element of utf(lhs) in place of lhs
                lhs_utf = utf(lhs)[0]
                outfile.write (formatting_string %(i+1, num_productions, perplexity, lhs_utf, rhs_display_str))
            
            for j, prod_count in enumerate (rhs_prod_counts):
                #if j > 3: continue
                outfile.write ('%6i' %(prod_count))
            outfile.write('\n')
        outfile.write('\n')
            

        # summary statistics over all left-hand sides
        perplexity_list = column (production_info,1)
            
        if perplexity_list:
            min_perplexity = min (perplexity_list)
            max_perplexity = max (perplexity_list)
        else:        
            min_perplexity = 0
            max_perplexity = 0

        outfile.write ('Num letter productions: %5i\n'  %(len(sorted_production_list)))    
        outfile.write ('Min letter perplexity: %6.3f\n' %(min_perplexity))
        outfile.write ('Max letter perplexity: %6.3f\n' %(max_perplexity))
        # WeightedAve receives the (count, perplexity, width) tuples collected above
        outfile.write ('Ave letter perplexity: %6.3f\n' %(WeightedAve(production_info)))
        outfile.write ('\n')
            
        return lts_productions

Example #15

0

Show file

File: WordSelector.py Project: aanavas/s2project

 def GetLetterStats (self):
     """Return column() applied to self.ngram_stats (one item per stored entry)."""
     ngram_entries = self.ngram_stats
     letter_stats = column (ngram_entries)
     return letter_stats

Example #16

0

Show file

File: LTS_IO.py Project: aanavas/s2project

def WriteOutRules (output_filename, lts_rule_system, phone_name_converter = {}, include_sorted_rule_list = False):
    """Write a text report of an LTS rule system.

    output_filename          -- a path string (opened with codecs as utf-8) or an
                                already open file-like object, which is used as is.
    lts_rule_system          -- dict mapping each lhs symbol to its rule chain, a
                                sequence of (rule_context, rhs_symbol_seq,
                                application_count); rule_context is indexed 0..2
                                with the lhs symbol in position 1.
    phone_name_converter     -- dict used to rename rhs symbols for display;
                                symbols that are not in it are written unchanged.
                                It is only read here, never modified.
    include_sorted_rule_list -- also append all rules sorted by descending count.

    NOTE(review): when the file is opened here it is not closed anywhere in
    the code shown -- confirm whether a close is missing.
    """
    
    # --------------------------------------------------------------------------
    # Added this to protect from automatic conversion to unicode through xml-rpc
    # This is just a short term fix before fully supporting UTF-8.              
        
    # (Enc is only referenced from commented-out lines below.)
    def Enc (given_string, encoding = 'latin-1'):
        if type (given_string) == type ('ascii'):
            return given_string
        else:
            return given_string.encode (encoding)



    # case 1. open a file suitable for writing utf-8 strings
    # case 2. most likely this is sys.stdout                
        
    if type (output_filename) == type('string'):
        outfile = codecs.open (output_filename, 'w', 'utf-8')    
    else:
        outfile = output_filename
        

    rule_count = 0
    lhs_symbol_list = lts_rule_system.keys()
    lhs_symbol_list.sort (cmp = LatinCharacterSet.OrderLetters)
    lhs_perplexity_list = []    
    

    # find the right width for the rhs part
    M = max_rhs_symbol_length = 0
        
    sorted_rule_list = []
        

    for lhs in lhs_symbol_list:
        lts_rule_chain = lts_rule_system[lhs]
        for rule_context, rhs_symbol_seq, application_count in lts_rule_chain:
            rhs_string  = string.join (rhs_symbol_seq,'-')
                
           #M = max (M, len(Enc(rhs_string)))
            # NOTE(review): elsewhere the result of utf() is indexed with [0],
            # which suggests it returns a pair; if so, len() here measures that
            # pair rather than the string -- confirm.
            M = max (M, len(utf(rhs_string)))
                 
            sorted_rule_list.append ((application_count, lhs, rhs_symbol_seq, rule_context))
                
    # substitute the computed width for the 'X' placeholder
    format_str = string.replace ('%6i. %2s -> %Xs / %s [%i] %s', 'X', str(M))


    outfile.write ('LTS Rule System:\n')
    outfile.write ('%6s  %6s\n' %('Count','Perplexity'))    

    for lhs in lhs_symbol_list:
        lts_rule_chain = lts_rule_system[lhs]
            
        # presumably column (chain,1,3) selects the (rhs, count) part of each rule -- TODO confirm
        entropy = TinyStats.ComputeEntropyFromSymbolCounts (column (lts_rule_chain,1,3))
        perplexity = math.pow (2.0, entropy)
        application_total = sum (column (lts_rule_chain,2))    
        lhs_perplexity_list.append ((application_total, perplexity))
         
        outfile.write ('%6i %6.3f' %(len(lts_rule_chain), perplexity))

        for rule_context, rhs_symbol_seq, application_count in lts_rule_chain:
            rhs_symbol_seq = map ((lambda x: phone_name_converter.get(x,x)), rhs_symbol_seq)
            rule_count += 1    
            lhs_symbol  = rule_context[1]
            context_str = rule_context[0] + '_' + rule_context[2]
            rhs_string  = string.join (rhs_symbol_seq,'-')
            # the spacer shrinks with the number of digits in the count and the context length
            num_chars   = 1 + int (math.log (max(1.0,application_count),10.0))
            spacer      = ' ' * max (0, (10 - num_chars - len(context_str)))
            if rhs_string == '': rhs_string = '_'
                
            # these three values are only used by the commented-out write below
            lhs_utf = utf(lhs_symbol)[0]
            rhs_utf = utf(rhs_string)[0]
            ctx_utf = utf(context_str)[0]       
                
                
           #outfile.write (format_str %(rule_count, Enc(lhs_symbol), Enc(rhs_string), Enc(context_str), application_count, spacer))
           #outfile.write (format_str %(rule_count, lhs_utf, rhs_utf, ctx_utf, application_count, spacer))
            outfile.write (format_str %(rule_count, lhs_symbol, rhs_string, context_str, application_count, spacer))
                  
        outfile.write('\n')


    # summary statistics over all left-hand sides
    perplexity_list = column (lhs_perplexity_list,1)
        
    if perplexity_list:
        min_perplexity = min (perplexity_list)
        max_perplexity = max (perplexity_list)
    else:        
        min_perplexity = 0
        max_perplexity = 0

    outfile.write ('\n')
    outfile.write ('Number of LTS rules: %i\n' %(rule_count))            
    outfile.write ('Min lts rule perplexity: %6.3f\n' %(min_perplexity))
    outfile.write ('Max lts rule perplexity: %6.3f\n' %(max_perplexity))
    outfile.write ('Ave lts rule perplexity: %6.3f\n' %(WeightedAve(lhs_perplexity_list)))
    outfile.write ('\n')
        

    if include_sorted_rule_list:
        sorted_rule_list.sort (reverse=True)
        outfile.write ('Rules sorted by count:')
            
        # rule_count is reused as the loop variable here; the total was already written above
        for i, (rule_count, lhs, rhs, rule_context) in enumerate (sorted_rule_list):
            rhs_symbol_seq = map ((lambda x: phone_name_converter.get(x,x)), rhs)
            context_str = rule_context[0] + '_' + rule_context[2]
            rhs_string  = string.join (rhs_symbol_seq,'-')
            outfile.write ('\n')
            outfile.write (format_str %(i+1, lhs, rhs_string, context_str, rule_count, ''))
        outfile.write ('\n\n')

Example #17

0

Show file

File: WordSelector.py Project: aanavas/s2project

    def FindCoverageWordList (sorted_rule_list):
        

        # ------------------------------------------------------------------------------------------
        def FindOneWord (given_rule_list, rules_not_covered, test_size_threshold = 1):
            # Narrow a set of candidate word ids by intersecting the word-id sets
            # of the still uncovered rules, in the order of given_rule_list, and
            # let FindBestWordInSet choose a word from the last non-empty set.
            #
            # given_rule_list     -- sequence of (count, rule) pairs.
            # rules_not_covered   -- container of rules that still need a word.
            # test_size_threshold -- stop narrowing once the candidate set is no
            #                        larger than this.
            #
            # Returns (best_id, rule_counts) as reported by FindBestWordInSet.
            # rule_word_id_sets, given_word_pronun_list, sorted_word_id_list and
            # best_word_id_list come from the enclosing scopes.
                
            prev_word_set   = set()
            cand_word_set   = set()
            local_rule_list = given_rule_list[:]
           #local_rule_list.reverse()

            for i, (count, rule) in enumerate (local_rule_list):
                if rule not in rules_not_covered: continue
                    
                if not cand_word_set:
                    cand_word_set = rule_word_id_sets [rule]
                else:
                    cand_word_set = cand_word_set.intersection (rule_word_id_sets[rule])
                
                
                if cand_word_set: 
                    prev_word_set = cand_word_set           # save the last non-empty set
                    if len(cand_word_set) <= test_size_threshold: break
                else:        
                    cand_word_set = prev_word_set           # restore the candidate set to be non-empty
                
                # alternate halt strategy that is
                # not as effective as the above   
                # if not cand_word_set: break     
                # prev_word_set = cand_word_set   

                
            best_id, best_score, best_rules, rule_counts = FindBestWordInSet (prev_word_set, rules_not_covered, verbose=False)
            best_word     = string.join (given_word_pronun_list[best_id][0],'')
            # NOTE(review): this value is not read again in this function, and
            # 'rule' is whatever the loop left behind (unbound if the list was empty).
            cand_word_set = rule_word_id_sets[rule]
                
            # running number of the word being reported
            n = len (sorted_word_id_list) + len (best_word_id_list) + 1

            print 'Word %6i %5i %8.2f %4i %4i  %s' %(n, len(best_word), best_score, len(best_rules), len(rules_not_covered), best_word), best_rules
        
            return best_id, rule_counts
        pass        
        #---
           
            
        
        
        Set_Size_Threshold = 100
          
        # old
        # non_covered_rules = set (column (filter ((lambda x: len(x[1]) > 0), rule_word_id_sets.items())))
            
        non_covered_rules = set()
        for count, rule in sorted_rule_list:
            rule_lhs, rule_rhs_index = rule
            if rule_rhs_index == 1 and rule_word_id_sets[rule]:
                non_covered_rules.add (rule)


        best_word_id_set  = set()
        best_word_id_list = []
            
            
        # pass 1
        
        loop_count = 0
        num_reinsertions = 0    
            
                
        while non_covered_rules:    
            loop_count += 1
            if loop_count >= 510: return best_word_id_list      # jmk!!!
                
            print '%6i. Searching for %i rules among %i ...' %(loop_count, len(non_covered_rules), len(sorted_rule_list))
                
            
            # resort the rules
            sorted_rule_list = [(count, rule) for (rule, count) in rule_count_table.items()]
            sorted_rule_list.sort (reverse=True)

            j = 0
            for count, rule in sorted_rule_list:
                if rule in non_covered_rules:
                    j += 1
                    print '%6i looking %6i %s' %(j, count, rule)    
            print
            

                    
            best_word_id, rules_covered = FindOneWord (sorted_rule_list, non_covered_rules, Set_Size_Threshold)
                
            best_word_id_set.add (best_word_id)
            best_word_id_list.append (best_word_id)    
            UpdateWordIdSets (best_word_id, rules_covered.keys())
                

            for rule, count in rules_covered.items():    
                selected_words_rule_count_tbl [rule] = selected_words_rule_count_tbl.get(rule,0) + count
                  

            word_charseq = given_word_pronun_list [best_word_id][0]
            word_charset = set (word_charseq)    
            word  = string.join (word_charseq,'')


            for lhs in word_charset:
                rule_chain = rule_count_lhs_table.get(lhs,[])
                    
                for i in range (1,len(rule_chain)):
                    this_rule_total, this_rule = rule_chain[i]
                    prev_rule_total, prev_rule = rule_chain[i-1]
                    this_rule_count = selected_words_rule_count_tbl.get (this_rule,0)
                    prev_rule_count = selected_words_rule_count_tbl.get (prev_rule,0)
                    
                    print 'here %2s %4i %6i %8s %12s %-2i %6i' \
                        %(lhs, i, prev_rule_count, prev_rule, this_rule, this_rule_count, this_rule_total),
                         
                    if this_rule_count == 0:
                        if this_rule not in non_covered_rules:
                            non_covered_rules.add (this_rule)
                            print '  adding this', this_rule,
                                
                            if prev_rule_count <= 1:
                                non_covered_rules.add (prev_rule)
                                rule_count_table [prev_rule] = this_rule_total + 1   
                                print '  adding prev', prev_rule,
                        print
                        break        
                        
                    elif prev_rule_count <= this_rule_count and prev_rule not in non_covered_rules:
                        non_covered_rules.add (prev_rule)
                        rule_count_table [prev_rule] = this_rule_total + 1   
                        print '  adding prev', prev_rule    
                        break    
                    else:
                        print
                         
            print

            """
            for word_rule, word_count in rules_covered.items():
                rule_lhs, rule_rank = word_rule
                rule_chain = rule_count_lhs_table [rule_lhs]
                N = len (rule_chain)    
                next_rule = ''

                for i, (occurance_count, rule) in enumerate (rule_chain):
                    if rule == word_rule:
                        if i+1 < N:
                            next_rule = rule_chain[i+1][1]
                            break
                                
                rule_already_wanted = next_rule in non_covered_rules
                    
                if next_rule and not rule_already_wanted:
                    non_covered_rules.add (word_rule)
                    non_covered_rules.add (next_rule)
                        
                this_word_count = selected_words_rule_count_tbl.get (word_rule,0)
                next_word_count = selected_words_rule_count_tbl.get (next_rule,0)

                print 'here %4i %s %6s %s %6i %6i %s' \
                    %(len(non_covered_rules), rule, rule_already_wanted, next_rule, this_word_count, next_word_count, word)
            """        

           
            """
            for rule in rules_covered.keys():    
                selected_rule_count = selected_words_rule_count_tbl [rule]
                #print '%6i %6i %2i %s' %(rule_count_table[rule], selected_rule_count, count, rule)
                
                # this is inefficient!
                for compare_count, compare_rule in sorted_rule_list:
                    if rule[0] == compare_rule[0]:
                        if rule_count_table [rule] <                \
                          rule_count_table [compare_rule]           \
                        and                                         \
                           selected_words_rule_count_tbl [rule] >=  \
                           selected_words_rule_count_tbl.get(compare_rule,0):
                            num_reinsertions += 1
                            non_covered_rules.add (compare_rule)
                            #print '  re-adding rule %6i %6i %s' %(count, rule_count_table [compare_rule], compare_rule)
            """
                
        pass 
                
        print 'Num reinsertions', num_reinsertions
        return best_word_id_list
        
        # Note: pass 2 below is unreachable because of the unconditional return above.
        # It may not work with this revised algorithm, so it is deliberately skipped.
            


        # pass 2

        non_covered_rules   = set (column (filter ((lambda x: len(x[1]) > 0), rule_word_id_sets.items())))
        total_letter_count  = 0 
        total_rules_covered = set()    
        accumulated_score   = 1.0    
        new_word_id_list    = []

        while best_word_id_set:
            word_id, word_score, word_rules, rule_counts = FindBestWordInSet (best_word_id_set, non_covered_rules) 
            best_word_id_set.discard (word_id)
            new_word_id_list.append  (word_id)    
            UpdateWordIdSets (word_id, word_rules)
                
            list_size = len (new_word_id_list)
            entry = given_word_pronun_list [word_id]    
            word  = string.join (entry[0],'')
            total_letter_count  += len(word)    
            total_rules_covered |= word_rules
            current_merit = sum (map ((lambda x: rule_count_table[x]), total_rules_covered))
            current_score = 1.0 - float(current_merit) / maximum_merit    
            accumulated_score += current_score * len(word)
            
            print '%4i %5i %8.2f %4i %4i %8i %8.3f %8.3f %s' \
                %(list_size, total_letter_count, word_score, len(word_rules), len(total_rules_covered), current_merit, current_score, accumulated_score, word)
                    
            if not non_covered_rules: break
        print


        return new_word_id_list