Code example #1
Votes: 0
File: basic_learner.py — Project: edderick/langframe
    def __init__(self, necessary=None, possible=None):
        """Initialise the learner's per-word symbol tables.

        :param necessary: table of symbols known to be required for each word
            (defaults to a fresh FiniteSymbolTable)
        :param possible: table of symbols still admissible for each word
            (defaults to a fresh UniversalSymbolTable)
        """
        # FIX: use None sentinels instead of mutable default arguments -- the
        # old FiniteSymbolTable()/UniversalSymbolTable() defaults were built
        # once at definition time and shared between every learner instance.
        self.necessary = FiniteSymbolTable() if necessary is None else necessary
        self.possible = UniversalSymbolTable() if possible is None else possible
        self.expressions = UniversalSymbolTable()

        self.log = NPLogger()
Code example #2
Votes: 0
File: basic_learner.py — Project: edderick/langframe
class NPSymbolLearner:
    def __init__(self, necessary=None, possible=None):
        """Initialise the learner's per-word symbol tables.

        :param necessary: table of symbols known to be required for each word
            (defaults to a fresh FiniteSymbolTable)
        :param possible: table of symbols still admissible for each word
            (defaults to a fresh UniversalSymbolTable)
        """
        # FIX: use None sentinels instead of mutable default arguments -- the
        # old FiniteSymbolTable()/UniversalSymbolTable() defaults were built
        # once at definition time and shared between every learner instance.
        self.necessary = FiniteSymbolTable() if necessary is None else necessary
        self.possible = UniversalSymbolTable() if possible is None else possible
        self.expressions = UniversalSymbolTable()

        self.log = NPLogger()

    def __str__(self):
        """Render the learner state: one row per word showing its N and P tables."""
        rows = []
        for word in self.necessary:
            necessary_col = str(self.necessary[word]).ljust(25)
            possible_col = str(self.possible[word]).ljust(25)
            rows.append("%s || %s | %s" % (word.rjust(10), necessary_col, possible_col))
        return "\n".join(rows)

    def __contains__(self, word):
        """Report whether this learner holds a lexical entry for the given word."""
        has_entry = word in self.necessary
        return has_entry

    def process(self, pair):
        """Run one learning iteration over an utterance -> meaning pair,
        applying Siskind's rules 1 through 5 in order."""
        self.log.info("[~~>] %s" % pair)
        for rule in (self._rule1, self._rule2, self._rule3, self._rule4, self._rule5):
            rule(pair)

    def converged(self, word):
        """A word's entry has converged when its necessary and possible symbol sets agree."""
        necessary_set = self.necessary[word]
        possible_set = self.possible[word]
        return necessary_set == possible_set

    def all_consistent(self):
        """True iff every lexical entry passes the consistency check."""
        # all() short-circuits on the first inconsistent word, like the
        # explicit loop it replaces
        return all(self.consistent(word) for word in self.necessary)

    def consistent(self, word):
        """
        Check one lexical entry for consistency.

        Inconsistency implies a lexical entry has been corrupted by noise (no correct meanings in
        UTM pair) or homonymy (multiple disparate meanings for the same word)

        :param word: index into the necessary/possible/expressions tables
        :returns: True when the entry is consistent
        """

        # universal possible set trivially consistent (non-empty and n is subset)
        if self.possible[word].is_universal():
            return True

        # if conceptual expressions for sense empty => corrupted
        # words with semantically null meaning (e.g. "the") have BottomExpression
        # FIX: `len(...) is 0` compared identity and only worked via CPython's
        # small-int caching (SyntaxWarning on 3.8+); compare by value instead
        if (not self.expressions[word].is_universal()) and len(self.expressions[word]) == 0:
            return False

        # consistency: necessary is subset of possible
        for symbol in self.necessary[word]:
            if symbol not in self.possible[word]:
                return False
        return True
            

    def _rule1(self, pair):
        """
        For an utterance/meaning pair, filter out the hypotheses that violate Rule 1 in Siskind 1996.
        """
        self.log.rule_debug("", symbol="1")
        self.log.rule_debug(pair, indent=1, symbol="->")

        # union the possible / necessary sets over every word in the utterance
        any_possible_universal = False
        possible_union = set()
        necessary_union = set()
        for word in pair.words:
            if self.possible[word].is_universal():
                any_possible_universal = True

            # one universal possible set makes the whole union universal,
            # so stop accumulating possible symbols once one is seen
            if not any_possible_universal:
                possible_union = possible_union.union(self.possible[word])
            necessary_union = necessary_union.union(self.necessary[word])

        def keep(hypothesis):
            # drop hypotheses containing a symbol ruled out for every word
            # (trivially kept when the possible union is universal)
            if not any_possible_universal and not hypothesis.symbols.issubset(possible_union):
                debug_msg = "Sym/s Not Poss: %s for %s" % (hypothesis,
                    hypothesis.symbols.difference(possible_union))
                self.log.rule_debug(debug_msg, symbol="-", indent=2)

                return False

            # drop hypotheses missing a symbol some word requires
            if not necessary_union.issubset(hypothesis.symbols):
                debug_msg = "Sym/s in N missing: %s for %s" % (hypothesis,
                   necessary_union.difference(hypothesis.symbols))
                self.log.rule_debug(debug_msg, symbol="-", indent=2)

                return False

            return True

        pair.hypotheses = {hypothesis for hypothesis in pair.hypotheses if keep(hypothesis)}
        self.log.rule_debug(pair, indent=1, symbol="<-")

    def _rule2(self, pair):
        """
        Prune each word's possible-symbol entry down to symbols that still occur
        in at least one remaining hypothesis for this utterance: a symbol absent
        from every surviving meaning cannot be a possible symbol for any word.
        """
        self.log.rule_debug("", symbol="2", indent=1)
        self.log.rule_debug("", symbol="->", indent=2)
        self.log.rule_debug(self, indent=3)

        # collect every symbol mentioned by any surviving hypothesis
        # (set().union(*...) of zero sets is the empty set, so an empty
        # hypothesis set needs no special case)
        surviving_symbols = set().union(*(hyp.symbols for hyp in pair.hypotheses))

        # intersect each word's possible entry with the surviving symbols
        for word in pair.words:
            self.possible[word] = self.possible[word].intersection(surviving_symbols)

        self.log.rule_debug("", symbol="<-", indent=2)
        self.log.rule_debug(self,  indent=2)

    def _rule3(self, pair):
        """
        Add to necessary symbols for a word those symbols which appear in all remaining hypotheses, but
        are missing from the possible table for all other words in the utterance. i.e. we have a symbol
        that is in all hypotheses for an utterance, and it is impossible for all words but one => that must be part
        of the definition for that remaining word.
        """

        # NOTE: could be optimised by breaking loop as symbol set becomes negative/
        # identity

        self.log.rule_debug("", symbol="3", indent=1)

        if len(pair.hypotheses) == 0:  # empty hypothesis set => nothing to learn
            return

        # which symbols are in all remaining hypotheses?
        # FIX: the old code aliased one hypothesis's own `symbols` set
        # (common_symbols = element.symbols) and intersection_update'd it in
        # place, corrupting that hypothesis for rules 4 and 5. Copy into a
        # fresh set instead; iterating also avoids the pop()/add() dance.
        hypotheses_iter = iter(pair.hypotheses)
        common_symbols = set(next(hypotheses_iter).symbols)
        for expression in hypotheses_iter:
            common_symbols.intersection_update(expression.symbols)

        self.log.rule_debug("Common: %s" % common_symbols,
                            symbol="", indent=3)

        # test all word combinations (Cartesian product)
        for word in pair.words:
            # start from the common symbols and strip those another word allows
            symbols = set(common_symbols)
            self.log.rule_debug(word, symbol="", indent=2)

            for other_word in pair.words:
                if word != other_word:
                    symbols.difference_update(self.possible[other_word])
                    self.log.rule_debug("P(%s):\t\t %s" % (other_word, symbols), indent=3)
            self.necessary[word].update(symbols)

        self.log.rule_debug("", symbol="<-", indent=2)
        self.log.rule_debug(self,  indent=2)

    def _rule4(self, pair):
        """
        For symbols that appear only once in all remaining hypotheses, if it's
        in the necessary set for some word, it can be removed from the possible
        set for all other words in the utterance.

        This is because each conceptual symbol can only be "contributed to"
        by one word symbol in an utterance. If we know it's being contributed
        to by some word, we can exclude it being contributed to by any other.
        """

        self.log.rule_debug("", symbol="4", indent=1)

        # find symbols that appear at most once in each hypothesis
        once_symbols = set()
        for hypothesis in pair.hypotheses:
            for symbol in hypothesis.symbols:
                if hypothesis.symbol_count[symbol] <= 1:
                    once_symbols.add(symbol)

        self.log.rule_debug("once: %s" % once_symbols,
                            indent=2)

        # remove the appropriate symbols
        for word in pair.words:
            self.log.rule_debug("%s:\tP:%s" % (word, self.possible[word]),
                                indent=2)
            for other_word in pair.words:
                if other_word != word:
                    self.log.rule_debug("%s:" % other_word, indent=3)
                    # FIX: the removal set does not depend on the inner loop
                    # variable, so compute and apply it once per word pair
                    # instead of repeating the identical difference_update for
                    # every necessary symbol. Log output is unchanged: the old
                    # code also applied the full update before its first log.
                    symbols = once_symbols.intersection(self.necessary[other_word])
                    self.possible[word].difference_update(symbols)
                    for symbol in self.necessary[other_word]:
                        self.log.rule_debug("%s:\tP(%s=%s" % (symbol, word, self.possible[word]),
                                            indent=4)
        self.log.rule_debug("", symbol="<-", indent=2)
        self.log.rule_debug(self,  indent=2)

    def _rule5(self, pair):
        """
        Generate conceptual expressions for each word in the utterance whose
        symbol sets have converged (necessary == possible).
        """
        self.log.rule_debug("", symbol="5", indent=1)
        for word in pair.words:
            # has the word converged?
            if self.converged(word):
                self.log.rule_debug("convergence: %s" % word, indent=1)

                # empty => BottomExpression
                #   both N and P empty => semantically null but non-corrupt entry
                # FIX: `len(...) is 0` compared identity, not value (works only
                # via CPython small-int caching; SyntaxWarning on 3.8+)
                if len(self.necessary[word]) == 0 and len(self.possible[word]) == 0:
                    self.log.rule_debug("empty", indent=2)
                    self.expressions[word] = SymbolSet({BottomExpression()})

                # variables => variables
                elif self.necessary[word] == self.necessary[word].variables():
                    self.log.rule_debug("variable: %s" % self.necessary[word],
                                          indent=2)
                    self.expressions[word] = self.expressions[word].intersection(self.necessary[word])

                # compare constant terms
                else:
                    valid_subexpressions = set()
                    # uppercase symbols are the constant terms of the entry
                    word_constants = set(symbol for symbol in self.necessary[word] if symbol.isupper())

                    self.log.rule_debug("constants: %s" % word_constants,
                                        indent=2)

                    # gather every subexpression compatible with the constants
                    for hypothesis in pair.hypotheses:
                        valid_subexpressions = valid_subexpressions.union(
                            hypothesis.subexpressions_for_constants(word_constants))

                    self.log.rule_debug("compatible exprs:", indent=2)
                    self.log.rule_debug("\n".join("%s" % str(expr)
                                                  for expr in valid_subexpressions),
                                        indent=2)

                    self.expressions[word] = self.expressions[word].intersection(valid_subexpressions)