Code example #1
    def __init__(self, start, productions):
        """
        Create a new probabilistic context-free grammar from the given
        start symbol and set of C{PCFGProduction}s.
        
        @param start: The start symbol
        @type start: L{Nonterminal}
        @param productions: The list of productions that defines the grammar
        @type productions: C{list} of C{PCFGProduction}
        @raise ValueError: if the set of productions with any left-hand-side
            do not have probabilities that sum to a value within
            PCFG.EPSILON of 1.
        """
        assert _chktype(1, start, Nonterminal)
        assert _chktype(2, productions, (PCFGProduction,), [PCFGProduction])
        CFG.__init__(self, start, productions)

        # Make sure that the probabilities sum to one.
        probs = {}
        for production in productions:
            probs[production.lhs()] = (probs.get(production.lhs(), 0) +
                                       production.prob())
        for (lhs, p) in probs.items():
            if not ((1-PCFG.EPSILON) < p < (1+PCFG.EPSILON)):
                raise ValueError("CFGProductions for %r do not sum to 1" % lhs)
Code example #2
    def __init__(self, start, productions):
        """
        Create a new context-free grammar from the given start symbol
        and set of C{CFGProduction}s.

        @param start: The start symbol
        @type start: L{Nonterminal}
        @param productions: The list of productions that defines the grammar
        @type productions: C{list} of L{CFGProduction}
        """
        assert _chktype(1, start, Nonterminal)
        assert _chktype(2, productions, (CFGProduction,), [CFGProduction])
        self._start = start
        self._productions = tuple(productions)
Code example #3
    def __init__(self, grammar, trace=0, **property_names):
        """
        Create a new C{BottomUpPCFGChartParser} that uses C{grammar}
        to parse texts.

        @type grammar: C{PCFG}
        @param grammar: The grammar used to parse texts.
        @type trace: C{int}
        @param trace: The level of tracing that should be used when
            parsing a text.  C{0} will generate no tracing output;
            and higher numbers will produce more verbose tracing
            output.
        """
        assert _chktype(1, grammar, PCFG)
        assert _chktype(2, trace, types.IntType)
        self._grammar = grammar
        self._trace = trace
        AbstractParser.__init__(self, **property_names)
Code example #4
    def __init__(self, beam_size, grammar, trace=0, **property_names):
        """
        Create a new C{BottomUpPCFGChartParser} that uses C{grammar}
        to parse texts.

        @type beam_size: C{int}
        @param beam_size: The maximum length for the parser's edge queue.
        @type grammar: C{PCFG}
        @param grammar: The grammar used to parse texts.
        @type trace: C{int}
        @param trace: The level of tracing that should be used when
            parsing a text.  C{0} will generate no tracing output;
            and higher numbers will produce more verbose tracing
            output.
        """
        assert _chktype(1, beam_size, types.IntType)
        assert _chktype(2, grammar, PCFG)
        assert _chktype(3, trace, types.IntType)
        BottomUpPCFGChartParser.__init__(self, grammar, trace, **property_names)
        self._beam_size = beam_size
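
The beam_size parameter bounds the parser's edge queue: when the queue grows past the limit, the least probable edges are discarded before parsing continues. A rough sketch of that kind of pruning (not the parser's actual queue handling; an Edge.prob() method is assumed):

def prune(queue, beam_size):
    # Keep only the `beam_size` most probable edges.  Sorting puts the
    # best edges at the end, matching a queue that pops from the end.
    if beam_size > 0 and len(queue) > beam_size:
        queue.sort(key=lambda edge: edge.prob())
        del queue[:len(queue) - beam_size]
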
Code example #5
def attested_classes(tokens, **property_names):
    """
    @return: A list of all classes that are attested in the given list
        of tokens.
    @rtype: C{list} of (immutable)
    @param tokens: The list of tokens from which to extract classes.
    @type tokens: C{list} of (C{Token} with type C{ClassedText})
    """
    CLASS = property_names.get('CLASS', 'CLASS')
    assert _chktype(1, tokens, [Token], (Token,))
    return list(sets.Set([token[CLASS] for token in tokens]))
Code example #6
    def __init__(self, lhs, rhs):
        """
        Construct a new C{CFGProduction}.

        @param lhs: The left-hand side of the new C{CFGProduction}.
        @type lhs: L{Nonterminal}
        @param rhs: The right-hand side of the new C{CFGProduction}.
        @type rhs: sequence of (C{Nonterminal} and (terminal))
        """
        assert _chktype(1, lhs, Nonterminal)
        self._lhs = lhs
        self._rhs = tuple(rhs)
Code example #7
    def __div__(self, rhs):
        """
        @return: A new nonterminal whose symbol is C{M{A}/M{B}}, where
            C{M{A}} is the symbol for this nonterminal, and C{M{B}}
            is the symbol for rhs.
        @rtype: L{Nonterminal}
        @param rhs: The nonterminal used to form the right hand side
            of the new nonterminal.
        @type rhs: L{Nonterminal}
        """
        assert _chktype(1, rhs, Nonterminal)
        return Nonterminal('%s/%s' % (self._symbol, rhs._symbol))
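
Because __div__ simply joins the two symbols with a slash, dividing nonterminals is a compact way to build composite "slash" categories. A minimal usage sketch, relying only on the Nonterminal(symbol) constructor already used above:

vp, np = Nonterminal('VP'), Nonterminal('NP')
slash_cat = vp / np    # a new Nonterminal whose symbol is 'VP/NP'
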
Code example #8
    def trace(self, trace=2):
        """
        Set the level of tracing output that should be generated when
        parsing a text.

        @type trace: C{int}
        @param trace: The trace level.  A trace level of C{0} will
            generate no tracing output; and higher trace levels will
            produce more verbose tracing output.
        @rtype: C{None}
        """
        assert _chktype(1, trace, types.IntType)
        self._trace = trace
Code example #9
    def get_parse_list(self, token):
        # Inherit docs from ParserI
        assert _chktype(1, token, Token)
        SUBTOKENS = self.property("SUBTOKENS")
        LEAF = self.property("LEAF")

        subtokens = token[SUBTOKENS]

        # The most likely constituent table.  This table specifies the
        # most likely constituent for a given span and type.
        # Constituents can be either Trees or Tokens.  For
        # Trees, the "type" is the Nonterminal for the tree's
        # root node value.  For Tokens, the "type" is the token's
        # type.  The table is stored as a dictionary, since it is
        # sparse.
        constituents = {}

        # Initialize the constituents dictionary with the words from
        # the text.
        if self._trace:
            print ("Inserting tokens into the most likely" + " constituents table...")
        for index in range(len(subtokens)):
            tok = subtokens[index]
            probtok = tok.copy()
            constituents[index, index + 1, tok[LEAF]] = probtok
            if self._trace > 1:
                self._trace_lexical_insertion(tok, subtokens)

        # Consider each span of length 1, 2, ..., n; and add any trees
        # that might cover that span to the constituents dictionary.
        for length in range(1, len(subtokens) + 1):
            if self._trace:
                if self._trace > 1:
                    print
                print ("Finding the most likely constituents" + " spanning %d text elements..." % length)
            # print constituents
            for start in range(len(subtokens) - length + 1):
                span = (start, start + length)
                self._add_constituents_spanning(span, constituents, subtokens)

        # Find the tree that spans the entire text & has the right
        # category.  (The table keeps at most one constituent per key.)
        tree = constituents.get((0, len(subtokens), self._grammar.start()))
        if tree is None:
            trees = []
        else:
            trees = [tree]

        # Sort the trees by decreasing probability, and return them.
        trees.sort(lambda t1, t2: cmp(t2.prob(), t1.prob()))
        return trees
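
The method fills the constituents table bottom-up: individual tokens first, then every span of length 1, 2, ..., n, always keeping only the most probable constituent for each (start, end, type) key. The span enumeration it relies on looks like this in isolation (a sketch, not the parser code):

def spans(n):
    # Yield every (start, end) span over n tokens, shortest spans first,
    # which is the order in which the Viterbi-style table must be filled.
    for length in range(1, n + 1):
        for start in range(n - length + 1):
            yield (start, start + length)

list(spans(3))   # [(0, 1), (1, 2), (2, 3), (0, 2), (1, 3), (0, 3)]
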
Code example #10
    def train(self, train_toks, **kwargs):
        """
        Train a new C{ConditionalExponentialClassifier}, using the
        given training samples.  This
        C{ConditionalExponentialClassifier} should encode the model
        that maximizes entropy among all the models that are
        empirically consistent with C{train_toks}.
        
        @param kwargs: Keyword arguments.
          - C{iterations}: The maximum number of times IIS should
            iterate.  If IIS converges before this number of
            iterations, it may terminate.  Default=C{20}.
            (type=C{int})
            
          - C{debug}: The debugging level.  Higher values will cause
            more verbose output.  Default=C{0}.  (type=C{int})
            
          - C{classes}: The set of possible classes.  If none is given,
            then the set of all classes attested in the training data
            will be used instead.  (type=C{list} of (immutable)).
            
          - C{accuracy_cutoff}: The accuracy value that indicates
            convergence.  If the accuracy becomes closer to one
            than the specified value, then IIS will terminate.  The
            default value is C{None}, which indicates that no accuracy
            cutoff should be used. (type=C{float})

          - C{delta_accuracy_cutoff}: specifies what change in accuracy
            should be taken to indicate convergence.  If the accuracy changes by
            less than this value in a single iteration, then IIS will
            terminate.  The default value is C{None}, which indicates
            that no accuracy-change cutoff should be
            used. (type=C{float})

          - C{log_likelihood_cutoff}: specifies what log-likelihood
            value should be taken to indicate convergence.  If the
            log-likelihood becomes closer to zero than the specified
            value, then IIS will terminate.  The default value is
            C{None}, which indicates that no log-likelihood cutoff
            should be used. (type=C{float})

          - C{delta_log_likelihood_cutoff}: specifies what change in
            log-likelihood should be taken to indicate convergence.
            If the log-likelihood changes by less than this value in a
            single iteration, then IIS will terminate.  The default
            value is C{None}, which indicates that no
            log-likelihood-change cutoff should be used.  (type=C{float})
        """
        assert _chktype(1, train_toks, [Token], (Token,))
        # Process the keyword arguments.
        iter = 20
        debug = 0
        classes = None
        ll_cutoff = lldelta_cutoff = None
        acc_cutoff = accdelta_cutoff = None
        ll_old = acc_old = None   # previous iteration's log-likelihood & accuracy
        for (key, val) in kwargs.items():
            if key in ('iterations', 'iter'): iter = val
            elif key == 'debug': debug = val
            elif key == 'classes': classes = val
            elif key == 'log_likelihood_cutoff':
                ll_cutoff = abs(val)
            elif key == 'delta_log_likelihood_cutoff':
                lldelta_cutoff = abs(val)
            elif key == 'accuracy_cutoff': 
                acc_cutoff = abs(val)
            elif key == 'delta_accuracy_cutoff':
                accdelta_cutoff = abs(val)
            else: raise TypeError('Unknown keyword arg %s' % key)
        # Find the classes, if necessary; record them on the classifier.
        if classes is None:
            classes = attested_classes(train_toks)
        self._classes = classes

        # Find the length of the first token's feature vector.
        if len(train_toks) == 0:
            raise ValueError('Expected at least one training token')
        vector0 = train_toks[0]['FEATURE_VECTOR']
        self._feature_vector_len = len(vector0)
        self._weight_vector_len = self._feature_vector_len*len(self._classes)

        # Build the offsets dictionary.  This maps from a class to the
        # index in the weight vector where that class's weights begin.
        self._offsets = dict([(cls, i*self._feature_vector_len)
                              for i, cls in enumerate(classes)])

        # Find the frequency with which each feature occurs in the
        # training data.
        ffreq_emperical = self._ffreq_emperical(train_toks)

        # Find the nf map, and related variables nfarray and nftranspose.
        # nf is the sum of the features for a given labeled text.
        # nfmap compresses this sparse set of values to a dense list.
        # nfarray performs the reverse operation.  nftranspose is
        # nfarray reshaped as a column vector.
        nfmap = self._nfmap(train_toks)
        nfs = nfmap.items()
        nfs.sort(lambda x,y:cmp(x[1],y[1]))
        nfarray = numarray.array([nf for (nf, i) in nfs], 'd')
        nftranspose = numarray.reshape(nfarray, (len(nfarray), 1))

        # An array that is 1 whenever ffreq_emperical is zero.  In
        # other words, it is one for any feature that's not attested
        # in the data.  This is used to avoid division by zero.
        unattested = numarray.zeros(self._weight_vector_len, 'd')
        for i in range(len(unattested)):
            if ffreq_emperical[i] == 0: unattested[i] = 1

        # Build the classifier.  Start with weight=1 for each feature,
        # except for the unattested features.  Start those out at
        # zero, since we know that's the correct value.
        weights = numarray.ones(self._weight_vector_len, 'd')
        weights -= unattested
        classifier = ConditionalExponentialClassifier(classes, weights)
                
        if debug > 0: print '  ==> Training (%d iterations)' % iter
        if debug > 2:
            print
            print '      Iteration    Log Likelihood    Accuracy'
            print '      ---------------------------------------'

        # Train for a fixed number of iterations.
        for iternum in range(iter):
            if debug > 2:
                print ('     %9d    %14.5f    %9.3f' %
                       (iternum, classifier_log_likelihood(classifier, train_toks),
                        classifier_accuracy(classifier, train_toks)))

            # Calculate the deltas for this iteration, using Newton's method.
            deltas = self._deltas(train_toks, classifier, unattested,
                                  ffreq_emperical, nfmap, nfarray,
                                  nftranspose)

            # Use the deltas to update our weights.
            weights = classifier.weights()
            weights *= numarray.exp(deltas)
            classifier.set_weights(weights)
                        
            # Check log-likelihood cutoffs.
            if ll_cutoff is not None or lldelta_cutoff is not None:
                ll = classifier_log_likelihood(classifier, train_toks)
                if ll_cutoff is not None and ll > -ll_cutoff: break
                if lldelta_cutoff is not None:
                    # Skip the delta test on the first iteration, before
                    # ll_old has been recorded.
                    if ll_old is not None and (ll - ll_old) < lldelta_cutoff: break
                    ll_old = ll

            # Check accuracy cutoffs.
            if acc_cutoff is not None or accdelta_cutoff is not None:
                acc = classifier_accuracy(classifier, train_toks)
                if acc_cutoff is not None and acc < acc_cutoff: break
                if accdelta_cutoff is not None:
                    # Likewise, skip the delta test until acc_old is recorded.
                    if acc_old is not None and (acc_old - acc) < accdelta_cutoff: break
                    acc_old = acc

        if debug > 2:
            print ('     %9d    %14.5f    %9.3f' %
                   (iternum+1, classifier_log_likelihood(classifier, train_toks),
                    classifier_accuracy(classifier, train_toks)))
            print
                   
        # Return the classifier.
        return classifier
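
One detail worth noting: the weight update is multiplicative (weights *= exp(deltas)), which is why the unattested features can safely be started at zero, as the comment above says; they stay at zero no matter what deltas IIS computes. A tiny numeric illustration (plain Python, no numarray):

import math

weights = [1.0, 1.0, 0.0]    # the last weight stands in for an unattested feature
deltas  = [0.2, -0.1, 0.5]
weights = [w * math.exp(d) for w, d in zip(weights, deltas)]
# -> [1.2214..., 0.9048..., 0.0]; the zero stays zero, since 0 * exp(d) == 0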