Ejemplo n.º 1
def TestRuleSystem (lts_learner, lts_rules, verbose=False):
    """Measure the prediction accuracy of a rule set over the alignment files.

    Every existing path in the module-level ``prog_args`` list is loaded into
    a fresh ``Measurer`` and scored with ``MeasurePredictionAccuracy``.  The
    per-file word and character counts are then summed.

    Args:
        lts_learner: learner object passed through to the measurer.
        lts_rules: rule set passed through to the measurer.
        verbose: when true, print the load timings and the aggregated totals.

    Returns:
        ``((words_correct, words_tested), (chars_correct, chars_tested))``.
        All four counts are 0 when no alignment file could be read (this
        case used to raise UnboundLocalError).
    """
    all_results = []
    for alignment_pathname in prog_args:
        if not os.path.exists (alignment_pathname):
            continue
        t1 = time.time()
        lts_measurer = Measurer()
        lts_measurer.InitWithAlignments (alignment_pathname)
        if verbose:
            print ('Reading alignments from %s in %3.2f s' %(alignment_pathname, time.time()-t1))
        # NOTE(review): the measurer is always run with verbose=True, whatever
        # this function's own flag says -- confirm that this is intended.
        all_results.append (lts_measurer.MeasurePredictionAccuracy (lts_learner, lts_rules, verbose=True))

    # Start from zero so that the return statement is valid with no results.
    total_words_correct = total_words_tested = 0
    total_chars_correct = total_chars_tested = 0

    if all_results:
        # Each result is a (word_counts, char_counts) pair of (correct, tested).
        word_counts = column (all_results)
        char_counts = column (all_results, 1)
        total_words_correct  = sum (column (word_counts))
        total_words_tested   = sum (column (word_counts, 1))
        total_chars_correct  = sum (column (char_counts))
        total_chars_tested   = sum (column (char_counts, 1))
        if verbose:
            # max (1, ...) keeps the percentages defined when nothing was tested.
            word_percent_correct = 100.0 * total_words_correct / max (1, total_words_tested)
            char_percent_correct = 100.0 * total_chars_correct / max (1, total_chars_tested)
            print (' Threshold %i' %(threshold_value))
            print (' Total words correct: %6.3f  %8i %8i' %(word_percent_correct, total_words_correct, total_words_tested))
            print (' Total chars correct: %6.3f  %8i %8i' %(char_percent_correct, total_chars_correct, total_chars_tested))
            print ('--------------------\n')

    return (total_words_correct, total_words_tested), (total_chars_correct, total_chars_tested)
Ejemplo n.º 2
    def WriteOutLetterToSoundSolutions (outfile, word_counter, charseq, phoneseq, solutions, include_failures=False):
        """Write the letter-to-sound solutions found for one word to outfile.

        Args:
            outfile: object with a write() method.
            word_counter: integer printed at the start of each record.
            charseq: sequence of letter strings, concatenated to form the word.
            phoneseq: sequence of phone strings, space-joined to form the
                pronunciation.
            solutions: sequence of (score, soln) pairs, where soln is a
                sequence of (source, target, score) steps.  The pair-level
                score is not used; the printed total is the sum of the step
                scores.
            include_failures: when true and there are no solutions, write a
                FAILED record instead of nothing.
        """
        word = ''.join (charseq)
        pron = ' '.join (phoneseq)
        if solutions:
            for cnt, (score, soln) in enumerate (solutions):
                # sum() also copes with an empty solution, unlike reduce().
                total_score = sum (step[2] for step in soln)

                outfile.write ('  %5i.  Soln %i %5.0f  %20s : %-40s |'
                    %(word_counter, cnt+1, total_score, word, pron))
                for (source, target, step_score) in soln:
                    # Pad so that scores with different digit counts line up.
                    num_chars = 1 + int (math.log (max(1.0,step_score),10.0))
                    spacer = ' ' * max (0, (5 - num_chars))
                    if len(target) == 0: target = ('_')
                    # NOTE(review): the first format argument was missing
                    # from the source; `source` is assumed -- confirm.
                    outfile.write (' %2s [%i] %s -> %-6s'
                        %(source, step_score, spacer, ' '.join (target)))
                # NOTE(review): no newline is written after a solution,
                # unlike the FAILED record below -- confirm.
        elif include_failures:
            outfile.write ('  %5i.  FAILED  %5s %20s : %-40s |' %(word_counter, '', word, pron))
            outfile.write (' %2s -> %-5s\n' %(word, ''))
Ejemplo n.º 3
def ComputeAccuracyVsRulesCurve (lts_learner, max_threshold = 9999999999):
    """Print rule-set size and accuracy for every distinct rule-count threshold.

    Pass 1 thresholds the learner's rules at each distinct count, prints the
    resulting rule and character counts, and writes each rule set to
    ``rules/lexrules_<NNNNN>.scm``.  Pass 2 re-thresholds and scores each rule
    set with TestRuleSystem, printing word and character accuracy.

    Args:
        lts_learner: learner providing GetRulesByCount, ThresholdRules and
            WriteOutFestivalRules.
        max_threshold: thresholds above this value are skipped in pass 2
            only; pass 1 still writes a rule file for them.
    """
    # Sorted so that the curve is produced in ascending threshold order;
    # iterating a bare set gives an unspecified order.
    rule_counts_list = sorted (set (column (lts_learner.GetRulesByCount())))

    print ('%6s %8s %8s' %('Thresh', 'Rules', 'Chars'))
    for thr in rule_counts_list:
        lts_rules, num_lts_rules, num_lts_chars = lts_learner.ThresholdRules (thr)
        filename = 'rules/lexrules_%05i.scm' %(num_lts_rules)
        print ('%6i %8i %8i' %(thr, num_lts_rules, num_lts_chars))
        lts_learner.WriteOutFestivalRules (filename, 'cmu_us', lts_rules)

    print ('%4s %6s %6s %8s %8s' %('Num', 'Thresh', 'Rules', 'Words', 'Chars'))
    for i, threshold in enumerate (rule_counts_list):
        if threshold > max_threshold: continue
        lts_rules, num_lts_rules, num_lts_chars = lts_learner.ThresholdRules (threshold)
        word_perf, char_perf = TestRuleSystem (lts_learner, lts_rules)
        # max (1, ...) mirrors TestRuleSystem and avoids a ZeroDivisionError
        # when nothing was tested.
        word_percent_correct = 100.0 * word_perf[0] / max (1, word_perf[1])
        char_percent_correct = 100.0 * char_perf[0] / max (1, char_perf[1])
        print ('%4i %6i %6i %8.3f %8.3f   %s %s'
            %(i+1, threshold, num_lts_rules, word_percent_correct, char_percent_correct,
              word_perf, char_perf))
Ejemplo n.º 4
def LaunchServer():
    """Clear any stale launch notice and parse the server's command-line options.

    Only the option-parsing prologue of this function is visible here.  On an
    unrecognised option the error is printed and the process exits with a
    hint to use --help.
    """
    # Delete the launch notification file if it happens to be hanging around
    # from a previous run (it shouldn't but could if the server was killed.)
    if os.path.exists (Launch_Notice_Filename):
        os.remove (Launch_Notice_Filename)

    # Define flag list and get command line option.
    # Note: either --festdict or --lexlist is required

    flag_list = [('help',        'this command'),
                 ('port=',       'IP port number for communication with xml server (default 8000)'),
                 ('workdir=',    'specifies the working directory to use (defaults to current)'),
                 ('phoneset=',   'filename of list of phonemes'),
                 ('prompts=',    'filename of festival-format prompt list'),
                 ('festdict=',   'filename of festival-format pronunciation dictionary'),
                 ('lexlist=',    'filename of word list with word frequency counts'),
                 ('ignore=',     'filename of festival-format list of words to ignore (eg. those lexlearned previously)'),
                 ('allowables=', 'filename of (partial) list of LTS allowables (optional)'),
                 ('newpronuns=', 'filename of existing list of word pronunciations (optional)'),
                 ('rules=',      'filename of a pickled rule set that has previously been learned')]

    # The flag names (first column) are the long options getopt accepts.
    try:
        opt_list, prog_args = getopt.getopt (sys.argv[1:], '', column(flag_list))
        option_tbl = dict (opt_list)
    except getopt.GetoptError as msg:
        print ('Error %s' %(msg,))
        sys.exit('      type --help for program options\n')
Ejemplo n.º 5
    def PredictMultipleWordPronuns (self, given_charseq, verbose=False):
        """Return candidate pronunciations for a word, best prediction first.

        Args:
            given_charseq: letter sequence handed to PredictOneWordPronun.
            verbose: when true, print every enumerated (pronun, score) pair.

        Returns:
            A list of pronunciation strings: the space-joined single best
            prediction, followed by the enumerated alternatives except the
            first one in ascending-score order.
        """

        # ----------------------------------------------------------------------------------------------
        def FindAllSolutions (prediction):
            """Enumerate every combination of per-letter alternatives.

            prediction is a sequence of (letter, best_phoneseq, all_phoneseqs)
            entries, where all_phoneseqs holds (phoneseq, score, rule_num)
            alternatives.  Returns (pronun, total_score) pairs sorted by
            ascending total score.
            """
            all_solutions = []

            def DescendOneLevel (curr_solution, level):
                # A complete solution: record a copy and stop descending.
                # (Previously this fell through and indexed past the end.)
                if level >= len (prediction):
                    all_solutions.append (curr_solution[:])
                    return
                letter, best_phoneseq, all_phoneseqs = prediction [level]
                for one_phoneseq, score, rule_num in all_phoneseqs:
                    curr_solution.append ((one_phoneseq, score))
                    DescendOneLevel (curr_solution, level+1)
                    curr_solution.pop()     # backtrack before the next alternative

            DescendOneLevel ([], 0)

            pronun_list = []
            for one_soln in all_solutions:
                # Letters that map to an empty phone sequence add no phones.
                phoneseq = [' '.join (rhs) for (rhs, rhs_score) in one_soln if rhs]
                total_score = sum (rhs_score for (rhs, rhs_score) in one_soln)
                pronun_list.append ((total_score, ' '.join (phoneseq)))

            # sort words by score in _ascending_ order because lower counts
            # represent more specialized conditions identified by letter context.
            pronun_list.sort (reverse=False)
            return [(pronun, count) for (count, pronun) in pronun_list]

        pred_phoneseq, prediction = self.PredictOneWordPronun (given_charseq)
        all_solutions = FindAllSolutions (prediction)
        if verbose:
            for soln in all_solutions:
                print ('Word soln: %s' %(soln,))

        # do this because FindAllSolutions doesn't get pronunciations
        # when some of the letters go to '?'
        answer = [' '.join (pred_phoneseq)] + [pronun for (pronun, count) in all_solutions[1:]]
        return answer
Ejemplo n.º 6
def WeightedAve (given_list):
    """Return the count-weighted average of the values in given_list.

    Each item is a sequence whose element 0 is a count (the weight) and whose
    element 1 is the value; any further elements are ignored.

    Returns 0.0 for an empty list or when the counts sum to zero, instead of
    raising ZeroDivisionError in the latter case.
    """
    total_count = sum (item_info[0] for item_info in given_list)
    if not total_count:
        return 0.0

    answer = 0.0
    for item_info in given_list:
        item_weight = float (item_info[0]) / float (total_count)
        answer += item_weight * item_info[1]
    return answer
Ejemplo n.º 7
        def FindAllSolutions (prediction):
            """Enumerate every combination of per-letter alternatives.

            Args:
                prediction: sequence of (letter, best_phoneseq, all_phoneseqs)
                    entries, where all_phoneseqs holds
                    (phoneseq, score, rule_num) alternatives.

            Returns:
                A list of (pronun, total_score) pairs sorted by ascending
                total score, where pronun is the space-joined phone string.
                An empty prediction yields [('', 0)].
            """
            all_solutions = []

            def DescendOneLevel (curr_solution, level):
                # A complete solution: record a copy and stop descending.
                # (Previously this fell through and indexed past the end.)
                if level >= len (prediction):
                    all_solutions.append (curr_solution[:])
                    return
                letter, best_phoneseq, all_phoneseqs = prediction [level]
                for one_phoneseq, score, rule_num in all_phoneseqs:
                    curr_solution.append ((one_phoneseq, score))
                    DescendOneLevel (curr_solution, level+1)
                    curr_solution.pop()     # backtrack before the next alternative

            DescendOneLevel ([], 0)

            pronun_list = []
            for one_soln in all_solutions:
                # Letters that map to an empty phone sequence add no phones.
                phoneseq = [' '.join (rhs) for (rhs, rhs_score) in one_soln if rhs]
                total_score = sum (rhs_score for (rhs, rhs_score) in one_soln)
                pronun_list.append ((total_score, ' '.join (phoneseq)))

            # sort words by score in _ascending_ order because lower counts
            # represent more specialized conditions identified by letter context.
            pronun_list.sort (reverse=False)
            answer = [(pronun, count) for (count, pronun) in pronun_list]

            return answer
Ejemplo n.º 8
        # Load the alignment file named by --align (empty string when the
        # option is absent) and report how long the load took.
        alignment_file_pathname = option_tbl.get('--align','')
        #for alignment_pathname in prog_args:
        for alignment_pathname in [alignment_file_pathname]:
            if os.path.exists (alignment_pathname):
                t1 = time.time()
                lts_measurer = Measurer()
                lts_measurer.InitWithAlignments (alignment_pathname)
                print 'Reading alignments from %s in %3.2f s\n' %(alignment_pathname, time.time()-t1)
                #all_results.append (TestRuleLearner (lts_learner, lts_measurer))
        # NOTE(review): the only statement that appends to all_results in
        # this fragment is commented out, and the guard below requires more
        # than one result -- confirm where all_results is populated and
        # whether `> 0` was intended.
        if len (all_results) > 1:
            # Each result is read as a (word_counts, char_counts) pair of
            # (correct, tested) counts; sum them across all results.
            total_words_correct  = sum (column (column (all_results)))
            total_words_tested   = sum (column (column (all_results), 1))
            total_chars_correct  = sum (column (column (all_results,1)))
            total_chars_tested   = sum (column (column (all_results,1), 1))
            # max (1, ...) guards against division by zero.
            word_percent_correct = 100.0 * total_words_correct / max (1, total_words_tested)
            char_percent_correct = 100.0 * total_chars_correct / max (1, total_chars_tested)
            print ' Total words correct: %6.3f  %8i %8i' %(word_percent_correct, total_words_correct, total_words_tested)
            print ' Total chars correct: %6.3f  %8i %8i' %(char_percent_correct, total_chars_correct, total_chars_tested)
            print '--------------------\n'

    # --measure_curve
    # Requires --rules as well; the accuracy curve is computed with the
    # default max_threshold.
    elif option_tbl.has_key ('--measure_curve') and option_tbl.has_key ('--rules'):
        ComputeAccuracyVsRulesCurve (lts_learner)
Ejemplo n.º 9
from Column import column

# ==============
# Mainline code.
# ==============

if __name__ == "__main__":
    # (long option, description) pairs; a trailing "=" marks an option that
    # takes a value.
    flag_list = [
        ("help", "this command"),
        ("festdict=", "filename of festival-format pronunciation dictionary"),
        ("janusdict=", "filename of janus-format pronunciation dictionary"),
    ]

    # The option names (first column) are the long options getopt accepts.
    try:
        opt_list, prog_args = getopt.getopt(sys.argv[1:], "", column(flag_list))
        option_tbl = dict(opt_list)
    except getopt.GetoptError as msg:
        print("Error %s" % (msg,))
        sys.exit("      type --help for program options\n")

    # Print out the options if requested.

    if "--help" in option_tbl or len(sys.argv) == 1:
        print("python %s" % (sys.argv[0],))
        for option_flag, description in flag_list:
            print("%12s %s" % (option_flag, description))

    # Read in the festival format dictionary and convert it to janus format.
Ejemplo n.º 10
    # Count how many prompt tokens have a known pronunciation and tally the
    # prompt words per occurrence.
    uncovered_words = {}

    for prompt_name, prompt in sorted(recorded_prompt_tbl.items()):
        word_list = prompt.split()
        total_prompt_tokens += len(word_list)

        for raw_word in word_list:
            # Normalise the token before the lookup: trim external
            # punctuation and lowercase.
            word = DictionaryIO.TrimExternalPunctuation(raw_word).lower()

            # NOTE(review): as listed, a word that HAS a pronunciation is
            # both counted as covered and recorded in uncovered_words; an
            # `else:` before the second statement appears to be missing
            # from this listing -- confirm against the original file.
            if word in words_with_pronuns:
                prompt_tokens_covered += 1
                uncovered_words[word] = uncovered_words.get(word, 0) + 1

    # Coverage percentages; max(1, ...) guards every division against zero.
    total_token_count = sum(column(word_freq_list, 2))
    covered_prompt_words = len(total_prompt_words) - len(uncovered_words)
    corpus_word_percent = 100.0 * len(word_pronun_counts) / float(max(1, (len(word_freq_list))))
    corpus_token_percent = 100.0 * sum(word_pronun_counts.values()) / float(max(1, total_token_count))
    prompt_token_percent = 100.0 * prompt_tokens_covered / float(max(1, total_prompt_tokens))
    prompt_word_percent = 100.0 * covered_prompt_words / float(max(1, len(total_prompt_words)))

    # NOTE(review): the two print statements below are cut off in this
    # listing (their argument tuples are missing).
    print "  corpus word coverage: %6i / %-6i (%3.2f)" % (
    print " corpus token coverage: %6i / %-6i (%3.2f)" % (
Ejemplo n.º 11
        # Fallback path: re-open the dictionary with the 'iso8859' codec and
        # read it again.
        print 'Warning: %s is not a utf-8 file, trying again as iso-8859' %(dict_pathname)
        infile = codecs.open (dict_pathname, 'r', 'iso8859')
        ReadFile (infile)    
    # convert the word count table to a list
    # Entries are [word, '', count], ordered by word.
    word_freq_list = []
    for word in sorted (word_freq_table):
        word_freq_list.append ([word, '', word_freq_table[word]])

    # then either extract only the words or sort the contents by frequency

    if words_only:
        return column (word_freq_list, 0)
    elif sorted_by_count:
        # Sort (count, word) pairs in descending order, then rebuild the
        # [word, '', count] entries.  Note that the flag name sorted_by_count
        # is reused for the result list.
        sorted_by_count = sorted ([(v, k) for k, v in word_freq_table.items()], reverse=True)
        sorted_by_count = [[v, '', k] for k, v in sorted_by_count]
        return sorted_by_count
        # NOTE(review): as listed, the return below is unreachable and the
        # function returns None when neither flag is set; an `else:` appears
        # to be missing from this listing -- confirm.
        return word_freq_list            


# --------------------------------------------------------------------------------------------------

Ejemplo n.º 12
def ReadFestivalDictionary (dict_pathname,
                            words_as_charseq   = False,
                            words_only         = False, 
                            reverse_direction  = False, 
                            character_encoding = 'auto'):
    # Parse a festival-format pronunciation dictionary file into a list of
    # entries.
    #
    #   dict_pathname      -- path of the dictionary file to read
    #   words_as_charseq   -- selects the letter tuple (rather than the
    #                         joined string) as the word key
    #   words_only         -- return only column 0 of the collected entries
    #   reverse_direction  -- entries are stored as [phones, word, ...]
    #                         instead of [word, phones, ...]
    #   character_encoding -- codec name, or 'auto' to guess (see below)
    #
    # NOTE(review): several control-flow lines (`else:`, `try:`/`except`)
    # appear to be missing from this listing; the affected places are marked
    # below.  The code is reproduced unchanged.
    # --------------------------------------------------------------------------
    def DetermineEncoding():
        # Guess 'utf-8' as soon as any character outside string.printable is
        # seen, otherwise 'ascii'.  The file handle is not explicitly closed.

        infile = file (dict_pathname, 'r')
        for line in infile:
            for ch in line:
                if ch not in string.printable:
                    return 'utf-8'
        return 'ascii'

    if character_encoding == 'auto':
        character_encoding = DetermineEncoding()
    # dec/enc are only defined (and only needed) for non-ascii input.
    if character_encoding != 'ascii':
        dec = codecs.getdecoder (character_encoding)
        enc = codecs.getencoder (character_encoding)
    word_pronun_list  = []
    word_pronun_table = {}
    phone_number_tbl  = {}

    # Maps parentheses and double quotes to spaces.
    char_mapping = string.maketrans ('()"','   ')
    infile = file (dict_pathname, 'r')
   #infile = codecs.open (dict_pathname, 'r', 'utf-8')

    for cnt, rawline in enumerate (infile.readlines()):
        # Drop the first and last character of the stripped line
        # (presumably the outer parentheses of the entry -- confirm).
        line = string.strip (rawline.strip()[1:-1])
        if not line: continue
        num_left_parens = line.count('(')
        pronun_part = ''    
        # Case 1. the word is given as a string
        if num_left_parens == 1:
            quote_pos = string.rfind (line, '"') + 2
            word_part = string.translate (line[:quote_pos], char_mapping)
            rest_part = string.translate (line[quote_pos:], char_mapping)
            fields    = string.split (rest_part, maxsplit=1)
            #print cnt+1, len(fields), quote_pos, character_encoding, rest_part
            if len(fields) >= 2: 
                # NOTE(review): the deeper-indented lines below look like
                # the body of a missing `else:` / `try:` (decode the word,
                # then check the decoded length), followed by a missing
                # `except` handler -- confirm against the original file.
                if character_encoding == 'ascii':
                     word_letters = list (string.strip(word_part))
                        word_unicode = dec  (string.strip(word_part))
                        word_letters = list (word_unicode[0])
                        assert len(word_letters) == enc(word_unicode[0])[1]
                        #print 'SKIPPING', cnt+1, line
                annotation  = fields[0]    
                pronun_part = fields[1]
        # Case 2. the word given as a character sequence
        elif num_left_parens == 2:
            # First parenthesised group holds the letters, last one the phones.
            pos1 = line.find ('(')
            pos2 = line.find (')')
            pos3 = line.rfind ('(')
            pos4 = line.rfind (')')

            letter_part  = line[pos1:pos2].translate (char_mapping)
            pronun_part  = line[pos3:pos4].translate (char_mapping)
            annotation   = line[pos2-1:pos3-1].strip() 
            word_letters = letter_part.split()
        # Lines with no pronunciation (including other paren counts) are skipped.
        if not pronun_part: continue

        # check for duplicates before adding words to the pronunciation list
        word_string  = string.join (word_letters,'')
        word_phones  = tuple (pronun_part.split())
        word_letters = tuple (word_letters)
        # NOTE(review): as listed, word is always overwritten with
        # word_string; an `else:` appears to be missing -- confirm.
        if words_as_charseq:
            word = word_letters
            word = word_string        

        # This adds a default word count of 1, and allows only unique words

        # NOTE(review): in both if-blocks below the second append looks like
        # the body of a missing `else:` (the reverse_direction case); as
        # listed, both orderings are appended -- confirm.
        if not word_pronun_table.has_key(word):
            if not reverse_direction:
                word_pronun_list.append ([word, word_phones, 1])
                word_pronun_list.append ([word_phones, word, 1])
            word_pronun_table [word] = True
        if not reverse_direction:
            word_pronun_list.append ([word_letters, word_phones, annotation])
            word_pronun_list.append ([word_phones, word_letters, annotation])

    # NOTE(review): as listed, the second return is unreachable and None is
    # returned when words_only is false; an `else:` appears to be missing.
    if words_only:
        return column (word_pronun_list, 0)
        return word_pronun_list            
Ejemplo n.º 13
 def GetFullWordsOnlyList (self):
     """Return column() applied, with its default index, to self.word_pronuns_list."""
     pronun_entries = self.word_pronuns_list
     return column (pronun_entries)
Ejemplo n.º 14
    def PrintCondensedList (production_list):
        # Write the LTS productions grouped by left-hand-side letter, with
        # usage counts and per-letter perplexity, then summary statistics.
        # Returns the lhs -> [(rhs, usage_count), ...] table.
        #
        # NOTE(review): the production_list parameter is not used; the body
        # reads sorted_production_list, outfile and phone_name_converter from
        # an enclosing scope.  Several control-flow lines (`if`/`else:`,
        # `try:`) appear to be missing from this listing; the affected places
        # are marked below.  The code is reproduced unchanged.

        # create table that maps lhs -> all rhs productions, with counts

        lts_productions = {}
        for cnt, (usage_count, production) in enumerate (sorted_production_list):
            lhs = production[0]
            rhs = production[1]    
            lts_productions [lhs] = lts_productions.get(lhs,[]) + [(rhs, usage_count)]

        # write out the results alphabetically
        outfile.write ('LTS Productions organized by letter:\n')

        lhs_keys = lts_productions.keys()
        lhs_keys.sort (cmp = LatinCharacterSet.OrderLetters)

        # First pass.
        # Collect (total count, perplexity, display-string length) per letter.
        production_info = []

        for i, lhs in enumerate (lhs_keys):
            rhs_productions = lts_productions[lhs]
            rhs_prod_counts = column (rhs_productions,1)
            rhs_prod_list   = column (rhs_productions,0)    
            rhs_display_str = ConvertPhoneSeqToString (rhs_prod_list, phone_name_converter)
            num_productions = sum (rhs_prod_counts)
            entropy = TinyStats.ComputeEntropyFromSymbolCounts (rhs_productions)
            # perplexity = 2 ** entropy
            perplexity = math.pow (2.0, entropy)   
            production_info.append ((num_productions, perplexity, len(rhs_display_str)) )

            # NOTE(review): as listed, the width is always reset to 1 inside
            # the loop; an `if ...:` / `else:` pair around these two
            # assignments appears to be missing -- confirm.
            max_rhs_string_length = max (column (production_info,2))
            max_rhs_string_length = 1
        # Substitute the column width for the 'X' placeholder.
        formatting_string = string.replace ('%4i. %6i %6.3f %4s -> %-Xs', 'X', str(max_rhs_string_length+2))

        # Second pass.
        for i, lhs in enumerate (lhs_keys):
            rhs_productions = lts_productions[lhs]
            rhs_prod_counts = column (rhs_productions,1)
            rhs_prod_list   = column (rhs_productions,0)    
            rhs_display_str = ConvertPhoneSeqToString (rhs_prod_list, phone_name_converter)
            num_productions, perplexity = production_info[i][:2]
            outfile.write ('%4i' %(i+1))    
            # NOTE(review): a `try:` line appears to be missing before the
            # next write; the handler retries with the utf-encoded lhs.
                outfile.write (formatting_string %(i+1, num_productions, perplexity, lhs, rhs_display_str))
            except UnicodeEncodeError:
                lhs_utf = utf(lhs)[0]
                outfile.write (formatting_string %(i+1, num_productions, perplexity, lhs_utf, rhs_display_str))
            for j, prod_count in enumerate (rhs_prod_counts):
                #if j > 3: continue
                outfile.write ('%6i' %(prod_count))

        perplexity_list = column (production_info,1)
        # NOTE(review): as listed, min/max are always reset to 0 and are
        # undefined for an empty list; an `else:` appears to be missing.
        if perplexity_list:
            min_perplexity = min (perplexity_list)
            max_perplexity = max (perplexity_list)
            min_perplexity = 0
            max_perplexity = 0

        outfile.write ('Num letter productions: %5i\n'  %(len(sorted_production_list)))    
        outfile.write ('Min letter perplexity: %6.3f\n' %(min_perplexity))
        outfile.write ('Max letter perplexity: %6.3f\n' %(max_perplexity))
        # The average is weighted by each letter's total production count.
        outfile.write ('Ave letter perplexity: %6.3f\n' %(WeightedAve(production_info)))
        outfile.write ('\n')
        return lts_productions
Ejemplo n.º 15
 def GetLetterStats (self):
     """Return column() applied, with its default index, to self.ngram_stats."""
     stats_rows = self.ngram_stats
     return column (stats_rows)
Ejemplo n.º 16
def WriteOutRules (output_filename, lts_rule_system, phone_name_converter = {}, include_sorted_rule_list = False):
    """Write a human-readable report of an LTS rule system.

    Args:
        output_filename: either a path, which is opened for writing as UTF-8
            and closed again before returning, or an already open file-like
            object (most likely sys.stdout), which is written to and left open.
        lts_rule_system: mapping from a left-hand-side symbol to its rule
            chain, a sequence of
            (rule_context, rhs_symbol_seq, application_count) entries.
            rule_context is indexed as (left context, lhs symbol, right context).
        phone_name_converter: optional mapping used to rename right-hand-side
            symbols for display; symbols without an entry are written
            unchanged.  It is only read, never modified.
        include_sorted_rule_list: when true, also append every rule ordered
            by descending application count.
    """
    # case 1. most likely this is sys.stdout
    # case 2. open a file suitable for writing utf-8 strings
    if hasattr (output_filename, 'write'):
        outfile = output_filename
        opened_here = False
    else:
        outfile = codecs.open (output_filename, 'w', 'utf-8')
        opened_here = True

    try:
        rule_count = 0
        lhs_symbol_list = lts_rule_system.keys()
        lhs_symbol_list.sort (cmp = LatinCharacterSet.OrderLetters)
        lhs_perplexity_list = []

        # find the right width for the rhs part
        max_rhs_width = 0
        sorted_rule_list = []

        for lhs in lhs_symbol_list:
            for rule_context, rhs_symbol_seq, application_count in lts_rule_system[lhs]:
                rhs_string = '-'.join (rhs_symbol_seq)
                # utf() returns a container whose element 0 is the encoded
                # string; measure that string, not the container itself.
                max_rhs_width = max (max_rhs_width, len (utf (rhs_string)[0]))
                sorted_rule_list.append ((application_count, lhs, rhs_symbol_seq, rule_context))
        # Substitute the column width for the 'X' placeholder.
        format_str = '%6i. %2s -> %Xs / %s [%i] %s'.replace ('X', str (max_rhs_width))

        outfile.write ('LTS Rule System:\n')
        outfile.write ('%6s  %6s\n' %('Count','Perplexity'))

        for lhs in lhs_symbol_list:
            lts_rule_chain = lts_rule_system[lhs]
            entropy = TinyStats.ComputeEntropyFromSymbolCounts (column (lts_rule_chain,1,3))
            perplexity = math.pow (2.0, entropy)
            application_total = sum (column (lts_rule_chain,2))
            lhs_perplexity_list.append ((application_total, perplexity))
            outfile.write ('%6i %6.3f' %(len(lts_rule_chain), perplexity))

            for rule_context, rhs_symbol_seq, application_count in lts_rule_chain:
                rhs_symbol_seq = [phone_name_converter.get (x, x) for x in rhs_symbol_seq]
                rule_count += 1
                lhs_symbol  = rule_context[1]
                context_str = rule_context[0] + '_' + rule_context[2]
                rhs_string  = '-'.join (rhs_symbol_seq)
                # Pad so that the count plus context occupy a fixed width.
                num_chars   = 1 + int (math.log (max(1.0,application_count),10.0))
                spacer      = ' ' * max (0, (10 - num_chars - len(context_str)))
                if rhs_string == '': rhs_string = '_'
                outfile.write (format_str %(rule_count, lhs_symbol, rhs_string, context_str, application_count, spacer))

        # An empty rule system reports 0 for both extremes.
        perplexity_list = column (lhs_perplexity_list,1)
        if perplexity_list:
            min_perplexity = min (perplexity_list)
            max_perplexity = max (perplexity_list)
        else:
            min_perplexity = 0
            max_perplexity = 0

        outfile.write ('\n')
        outfile.write ('Number of LTS rules: %i\n' %(rule_count))
        outfile.write ('Min lts rule perplexity: %6.3f\n' %(min_perplexity))
        outfile.write ('Max lts rule perplexity: %6.3f\n' %(max_perplexity))
        # The average is weighted by each symbol's total application count.
        outfile.write ('Ave lts rule perplexity: %6.3f\n' %(WeightedAve(lhs_perplexity_list)))
        outfile.write ('\n')

        if include_sorted_rule_list:
            sorted_rule_list.sort (reverse=True)
            outfile.write ('Rules sorted by count:')
            for i, (application_count, lhs, rhs, rule_context) in enumerate (sorted_rule_list):
                rhs_symbol_seq = [phone_name_converter.get (x, x) for x in rhs]
                context_str = rule_context[0] + '_' + rule_context[2]
                rhs_string  = '-'.join (rhs_symbol_seq)
                outfile.write ('\n')
                outfile.write (format_str %(i+1, lhs, rhs_string, context_str, application_count, ''))
            outfile.write ('\n\n')
    finally:
        # Only close what this function opened; never the caller's stream.
        if opened_here:
            outfile.close()
Ejemplo n.º 17
    def FindCoverageWordList (sorted_rule_list):
        # Greedily build a list of word ids whose words cover the wanted
        # rules: each pass-1 iteration picks one word with FindOneWord,
        # records the rules it covers and possibly re-adds related rules to
        # the wanted set.  Returns the list of chosen word ids.
        #
        # The body relies on names from an enclosing scope that are not
        # visible here (rule_word_id_sets, rule_count_table,
        # rule_count_lhs_table, selected_words_rule_count_tbl,
        # given_word_pronun_list, sorted_word_id_list, FindBestWordInSet,
        # UpdateWordIdSets, maximum_merit).
        #
        # NOTE(review): some lines (`else:` branches and part of a
        # condition) appear to be missing from this listing; the affected
        # places are marked below.  The code is reproduced unchanged.

        # ------------------------------------------------------------------------------------------
        def FindOneWord (given_rule_list, rules_not_covered, test_size_threshold = 1):
            # Narrow a candidate set of word ids by intersecting the word-id
            # sets of the still-uncovered rules, in list order, then pick the
            # best word from the last non-empty set.
            # Returns (best_id, rule_counts) as reported by FindBestWordInSet.
            prev_word_set   = set()
            cand_word_set   = set()
            local_rule_list = given_rule_list[:]

            for i, (count, rule) in enumerate (local_rule_list):
                if rule not in rules_not_covered: continue
                # NOTE(review): an `else:` appears to be missing between the
                # next two assignments (seed the set vs. intersect it).
                if not cand_word_set:
                    cand_word_set = rule_word_id_sets [rule]
                    cand_word_set = cand_word_set.intersection (rule_word_id_sets[rule])
                # NOTE(review): the restore assignment below looks like the
                # body of a missing `else:` -- confirm.
                if cand_word_set: 
                    prev_word_set = cand_word_set           # save the last non-empty set
                    if len(cand_word_set) <= test_size_threshold: break
                    cand_word_set = prev_word_set           # restore the candidate set to be non-empty
                # alternate halt strategy that is
                # not as effective as the above   
                # if not cand_word_set: break     
                # prev_word_set = cand_word_set   

            best_id, best_score, best_rules, rule_counts = FindBestWordInSet (prev_word_set, rules_not_covered, verbose=False)
            best_word     = string.join (given_word_pronun_list[best_id][0],'')
            cand_word_set = rule_word_id_sets[rule]
            # Running number of the word being reported.
            n = len (sorted_word_id_list) + len (best_word_id_list) + 1

            print 'Word %6i %5i %8.2f %4i %4i  %s' %(n, len(best_word), best_score, len(best_rules), len(rules_not_covered), best_word), best_rules
            return best_id, rule_counts
        Set_Size_Threshold = 100
        # old
        # non_covered_rules = set (column (filter ((lambda x: len(x[1]) > 0), rule_word_id_sets.items())))
        # Initially wanted: rules whose second element is 1 and that occur in
        # at least one word.
        non_covered_rules = set()
        for count, rule in sorted_rule_list:
            rule_lhs, rule_rhs_index = rule
            if rule_rhs_index == 1 and rule_word_id_sets[rule]:
                non_covered_rules.add (rule)

        best_word_id_set  = set()
        best_word_id_list = []
        # pass 1
        loop_count = 0
        num_reinsertions = 0    
        while non_covered_rules:    
            loop_count += 1
            # Hard stop after 509 completed iterations.
            if loop_count >= 510: return best_word_id_list      # jmk!!!
            print '%6i. Searching for %i rules among %i ...' %(loop_count, len(non_covered_rules), len(sorted_rule_list))
            # resort the rules
            # The parameter is rebuilt from rule_count_table, highest count first.
            sorted_rule_list = [(count, rule) for (rule, count) in rule_count_table.items()]
            sorted_rule_list.sort (reverse=True)

            # Diagnostic listing of the rules still being looked for.
            j = 0
            for count, rule in sorted_rule_list:
                if rule in non_covered_rules:
                    j += 1
                    print '%6i looking %6i %s' %(j, count, rule)    

            best_word_id, rules_covered = FindOneWord (sorted_rule_list, non_covered_rules, Set_Size_Threshold)
            best_word_id_set.add (best_word_id)
            best_word_id_list.append (best_word_id)    
            UpdateWordIdSets (best_word_id, rules_covered.keys())

            # Accumulate how often each rule is exercised by the chosen words.
            for rule, count in rules_covered.items():    
                selected_words_rule_count_tbl [rule] = selected_words_rule_count_tbl.get(rule,0) + count

            word_charseq = given_word_pronun_list [best_word_id][0]
            word_charset = set (word_charseq)    
            word  = string.join (word_charseq,'')

            # For every letter of the chosen word, compare adjacent rules in
            # that letter's chain and re-add rules to the wanted set.
            for lhs in word_charset:
                rule_chain = rule_count_lhs_table.get(lhs,[])
                for i in range (1,len(rule_chain)):
                    this_rule_total, this_rule = rule_chain[i]
                    prev_rule_total, prev_rule = rule_chain[i-1]
                    this_rule_count = selected_words_rule_count_tbl.get (this_rule,0)
                    prev_rule_count = selected_words_rule_count_tbl.get (prev_rule,0)
                    print 'here %2s %4i %6i %8s %12s %-2i %6i' \
                        %(lhs, i, prev_rule_count, prev_rule, this_rule, this_rule_count, this_rule_total),
                    if this_rule_count == 0:
                        if this_rule not in non_covered_rules:
                            non_covered_rules.add (this_rule)
                            print '  adding this', this_rule,
                            if prev_rule_count <= 1:
                                non_covered_rules.add (prev_rule)
                                # Raise the previous rule's count just above this rule's total.
                                rule_count_table [prev_rule] = this_rule_total + 1   
                                print '  adding prev', prev_rule,
                    elif prev_rule_count <= this_rule_count and prev_rule not in non_covered_rules:
                        non_covered_rules.add (prev_rule)
                        rule_count_table [prev_rule] = this_rule_total + 1   
                        print '  adding prev', prev_rule    

            # For every rule the word covered, find the rule that follows it
            # in its chain; if that one is not yet wanted, want both.
            for word_rule, word_count in rules_covered.items():
                rule_lhs, rule_rank = word_rule
                rule_chain = rule_count_lhs_table [rule_lhs]
                N = len (rule_chain)    
                next_rule = ''

                for i, (occurance_count, rule) in enumerate (rule_chain):
                    if rule == word_rule:
                        if i+1 < N:
                            next_rule = rule_chain[i+1][1]
                rule_already_wanted = next_rule in non_covered_rules
                if next_rule and not rule_already_wanted:
                    non_covered_rules.add (word_rule)
                    non_covered_rules.add (next_rule)
                this_word_count = selected_words_rule_count_tbl.get (word_rule,0)
                next_word_count = selected_words_rule_count_tbl.get (next_rule,0)

                print 'here %4i %s %6s %s %6i %6i %s' \
                    %(len(non_covered_rules), rule, rule_already_wanted, next_rule, this_word_count, next_word_count, word)

            for rule in rules_covered.keys():    
                selected_rule_count = selected_words_rule_count_tbl [rule]
                #print '%6i %6i %2i %s' %(rule_count_table[rule], selected_rule_count, count, rule)
                # this is inefficient!
                for compare_count, compare_rule in sorted_rule_list:
                    if rule[0] == compare_rule[0]:
                        # NOTE(review): the right-hand operand of `>=` (and
                        # the closing colon) is missing from this listing.
                        if rule_count_table [rule] <                \
                          rule_count_table [compare_rule]           \
                        and                                         \
                           selected_words_rule_count_tbl [rule] >=  \
                            num_reinsertions += 1
                            non_covered_rules.add (compare_rule)
                            #print '  re-adding rule %6i %6i %s' %(count, rule_count_table [compare_rule], compare_rule)
        print 'Num reinsertions', num_reinsertions
        return best_word_id_list
        # Note: pass 2 may not work with this revised algorithm, so skip
        # Everything below is unreachable because of the return above.

        # pass 2

        non_covered_rules   = set (column (filter ((lambda x: len(x[1]) > 0), rule_word_id_sets.items())))
        total_letter_count  = 0 
        total_rules_covered = set()    
        accumulated_score   = 1.0    
        new_word_id_list    = []

        while best_word_id_set:
            word_id, word_score, word_rules, rule_counts = FindBestWordInSet (best_word_id_set, non_covered_rules) 
            best_word_id_set.discard (word_id)
            new_word_id_list.append  (word_id)    
            UpdateWordIdSets (word_id, word_rules)
            list_size = len (new_word_id_list)
            entry = given_word_pronun_list [word_id]    
            word  = string.join (entry[0],'')
            total_letter_count  += len(word)    
            total_rules_covered |= word_rules
            current_merit = sum (map ((lambda x: rule_count_table[x]), total_rules_covered))
            current_score = 1.0 - float(current_merit) / maximum_merit    
            accumulated_score += current_score * len(word)
            print '%4i %5i %8.2f %4i %4i %8i %8.3f %8.3f %s' \
                %(list_size, total_letter_count, word_score, len(word_rules), len(total_rules_covered), current_merit, current_score, accumulated_score, word)
            if not non_covered_rules: break

        return new_word_id_list