Example #1
0
def fill_missing_words(grammar: PCFG, missing_words: Set[str]) -> PCFG:
    """Return a new grammar in which ``missing_words`` are derivable via ``UNK``.

    Two kinds of productions are added to those of ``grammar``:

    * ``UNK -> word`` for every word in ``missing_words``;
    * ``LHS -> UNK`` once for every left-hand side that has at least one
      production with a string (terminal) somewhere on its right-hand side.

    The combined production list is then passed to ``induce_pcfg`` together
    with the original start symbol, and the result is returned. ``grammar``
    itself is not modified.

    NOTE(review): presumably ``induce_pcfg`` is NLTK's, which estimates
    probabilities from how often each production occurs in the list it is
    given -- if so, the probabilities stored in ``grammar`` are not carried
    over to the result. Confirm this is intended.

    Args:
        grammar: The grammar to extend.
        missing_words: Words that should become derivable through ``UNK``.

    Returns:
        The grammar produced by ``induce_pcfg`` from the extended
        production list.
    """
    productions = grammar.productions()

    # With no words to add, UNK would have no expansions at all; attaching
    # ``LHS -> UNK`` rules would only create an underivable alternative.
    # Still go through induce_pcfg so the result is built the same way as in
    # the non-empty case.
    if not missing_words:
        return induce_pcfg(grammar.start(), productions)

    unknown = Nonterminal('UNK')

    # UNK -> word1 | word2 | ... | wordN
    # Sorted so the order of the generated productions does not depend on
    # set iteration order.
    unk_rules: List[Production] = [
        Production(unknown, [word]) for word in sorted(missing_words)
    ]

    # Add UNK as an alternative for every left-hand side that has a string
    # on some right-hand side. The set records which left-hand sides were
    # already handled so each gets exactly one ``LHS -> UNK`` rule.
    handled_lhs: Set[Nonterminal] = set()
    rule: ProbabilisticProduction
    for rule in productions:
        lhs = rule.lhs()
        if lhs in handled_lhs:
            continue

        if any(isinstance(element, str) for element in rule.rhs()):
            handled_lhs.add(lhs)
            unk_rules.append(Production(lhs, [unknown]))

    return induce_pcfg(grammar.start(), productions + unk_rules)