def fill_missing_words(grammar: PCFG, missing_words: Set[str]) -> PCFG:
    """Extend *grammar* so that the given out-of-vocabulary words are covered.

    Two kinds of productions are added to the grammar's existing ones:

    * ``UNK -> word`` for every word in ``missing_words``;
    * ``X -> UNK`` once for every nonterminal ``X`` that is the left-hand
      side of at least one production whose right-hand side contains a
      string.

    The combined productions are then passed to ``induce_pcfg`` together
    with the original start symbol.

    Args:
        grammar: Grammar whose productions and start symbol are reused.
        missing_words: Words to attach to the ``UNK`` nonterminal.

    Returns:
        The grammar produced by ``induce_pcfg`` from the combined productions.

    NOTE(review): ``induce_pcfg`` presumably re-estimates every probability
    from production counts, so the probabilities stored in ``grammar`` are
    likely not carried over unchanged -- confirm this is intended.
    """
    # UNK -> word1 | word2 | ... | wordN
    # Sorted so the rule order does not depend on set iteration order, which
    # for strings can differ from one interpreter run to the next.
    unknown = Nonterminal('UNK')
    unk_rules = [Production(unknown, [word]) for word in sorted(missing_words)]

    # Distinct left-hand sides of every rule with a string somewhere on its
    # right-hand side. dict.fromkeys keeps first-seen order and gives O(1)
    # duplicate detection instead of scanning a list for each rule.
    lexical_lhs = dict.fromkeys(
        rule.lhs()
        for rule in grammar.productions()
        if any(isinstance(element, str) for element in rule.rhs())
    )

    # Add UNK as a possibility to each of those left-hand sides, once.
    # NOTE: this still happens when missing_words is empty, in which case UNK
    # has no expansions of its own.
    unk_rules.extend(Production(lhs, [unknown]) for lhs in lexical_lhs)

    return induce_pcfg(grammar.start(), grammar.productions() + unk_rules)