Example #1
def train_maxent_classifier_with_iis(train_toks,
                                     trace=3,
                                     labels=None,
                                     iterations=20,
                                     acc_cutoff=None,
                                     accdelta_cutoff=None,
                                     ll_cutoff=None,
                                     lldelta_cutoff=None):
    """
    Train a new C{ConditionalExponentialClassifier}, using the given
    training samples.  This C{ConditionalExponentialClassifier} will
    encode the model that maximizes entropy from all the models that
    are empirically consistent with C{train_toks}.

    See L{train_maxent_classifier()} for parameter descriptions.
    """
    # Fill in default args, & take abs values of ll cutoffs.
    if not labels: labels = attested_labels(train_toks)
    if ll_cutoff: ll_cutoff = abs(ll_cutoff)
    if lldelta_cutoff: lldelta_cutoff = abs(lldelta_cutoff)

    # Construct an encoding from the training data.
    encoding = SparseBinaryVectorEncoding.train(train_toks)

    # Build the offsets dictionary.  This maps from a class to the
    # index in the weight vector where that class's weights begin.
    offsets = dict([(label, i * encoding.length())
                    for i, label in enumerate(labels)])

    # Compute the empirical frequency of each feature: its count in the
    # training data divided by the number of training tokens.
    empirical_ffreq = calculate_empirical_fcount(train_toks, encoding,
                                                 offsets) / len(train_toks)

    # Find the nf map, and related variables nfarray and nftranspose.
    # nf is the sum of the features for a given labeled text.
    # nfmap compresses this sparse set of values to a dense list.
    # nfarray performs the reverse operation.  nftranspose is
    # nfarray reshaped as a column vector.
    nfmap = calculate_nfmap(train_toks, encoding)
    nfarray = numpy.array(sorted(nfmap, key=nfmap.__getitem__), 'd')
    nftranspose = numpy.reshape(nfarray, (len(nfarray), 1))

    # An array that is 1 whenever empirical_ffreq is zero.  In
    # other words, it is one for any feature that's not attested
    # in the data.  This is used to avoid division by zero.
    unattested = numpy.zeros(len(empirical_ffreq))
    for i in range(len(empirical_ffreq)):
        if empirical_ffreq[i] == 0: unattested[i] = 1

    # Build the classifier.  Start with weight=1 for each feature,
    # except for the unattested features.  Start those out at
    # zero, since we know that's the correct value.
    weights = numpy.ones(len(empirical_ffreq), 'd')
    weights -= unattested
    classifier = ConditionalExponentialClassifier(labels, encoding, weights)

    # Old log-likelihood and accuracy; used by the convergence cutoffs below.
    ll_old = None
    acc_old = None

    if trace > 0: print '  ==> Training (%d iterations)' % iterations
    if trace > 2:
        print
        print '      Iteration    Log Likelihood    Accuracy'
        print '      ---------------------------------------'

    # Train for a fixed number of iterations.
    for iternum in range(iterations):
        if trace > 2:
            ll = nltk.classify.util.log_likelihood(classifier, train_toks)
            acc = nltk.classify.util.accuracy(classifier, train_toks)
            print '     %9d    %14.5f    %9.3f' % (iternum + 1, ll, acc)

        # Calculate the deltas for this iteration, using Newton's method.
        deltas = calculate_deltas(train_toks, classifier, unattested,
                                  empirical_ffreq, nfmap, nfarray, nftranspose,
                                  offsets, encoding)

        # Use the deltas to update our weights.
        weights = classifier.weights()
        weights *= 2**deltas  # multiplicative update: scale each weight by 2**delta
        classifier.set_weights(weights)

        # Check log-likelihood cutoffs.
        if ll_cutoff is not None or lldelta_cutoff is not None:
            ll = nltk.classify.util.log_likelihood(classifier, train_toks)
            if ll_cutoff is not None and ll > -ll_cutoff: break
            if lldelta_cutoff is not None:
                if ll_old is not None and (ll - ll_old) < lldelta_cutoff: break
                ll_old = ll

        # Check accuracy cutoffs.
        if acc_cutoff is not None or accdelta_cutoff is not None:
            acc = nltk.classify.util.accuracy(classifier, train_toks)
            if acc_cutoff is not None and acc >= acc_cutoff: break
            if accdelta_cutoff is not None:
                if acc_old is not None and (acc_old - acc) < accdelta_cutoff: break
                acc_old = acc

    if trace > 2:
        ll = nltk.classify.util.log_likelihood(classifier, train_toks)
        acc = nltk.classify.util.accuracy(classifier, train_toks)
        print '         Final    %14.5f    %9.3f' % (ll, acc)

    # Return the classifier.
    return classifier
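The heart of the loop above is a multiplicative update: weights for unattested features are pinned at zero, and every other weight is scaled by 2 raised to its delta (the real deltas come from calculate_deltas via Newton's method). Below is a minimal sketch of that single step on made-up numpy arrays; the frequencies and deltas are illustrative values, not output from NLTK.

import numpy

# Hypothetical empirical feature frequencies; the third feature is unattested.
empirical_ffreq = numpy.array([0.5, 0.25, 0.0, 0.125], 'd')

# 1 for every unattested feature, 0 otherwise.
unattested = (empirical_ffreq == 0).astype('d')

# Initial weights: 1 everywhere, except 0 for unattested features.
weights = numpy.ones(len(empirical_ffreq), 'd') - unattested

# One IIS-style update with made-up deltas: each weight is scaled by
# 2**delta, so unattested weights stay at zero.
deltas = numpy.array([0.3, -0.1, 0.0, 0.2], 'd')
weights *= 2 ** deltas
print(weights)   # approximately [1.231, 0.933, 0.0, 1.149]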
Example #2
def train_maxent_classifier_with_gis(train_toks,
                                     trace=3,
                                     labels=None,
                                     iterations=20,
                                     acc_cutoff=None,
                                     accdelta_cutoff=None,
                                     ll_cutoff=None,
                                     lldelta_cutoff=None):
    """
    Train a new C{ConditionalExponentialClassifier}, using the given
    training samples.  This C{ConditionalExponentialClassifier} will
    encode the model that maximizes entropy from all the models that
    are empirically consistent with C{train_toks}.

    See L{train_maxent_classifier()} for parameter descriptions.
    """
    # Fill in default args, & take abs values of ll cutoffs.
    if not labels: labels = attested_labels(train_toks)
    if ll_cutoff: ll_cutoff = abs(ll_cutoff)
    if lldelta_cutoff: lldelta_cutoff = abs(lldelta_cutoff)

    # Construct an encoding from the training data.
    encoding = GISEncoding.train(train_toks)

    # Cinv is the inverse of C, the constant that every encoded feature
    # vector sums to under the GIS encoding.  It controls the learning
    # rate: a higher Cinv (i.e. a lower C) gives faster learning.
    Cinv = 1.0 / encoding.C()

    # Build the offsets dictionary.  This maps from a class to the
    # index in the weight vector where that class's weights begin.
    offsets = dict([(label, i * encoding.length())
                    for i, label in enumerate(labels)])

    # Count how many times each feature occurs in the training data.
    empirical_fcount = calculate_empirical_fcount(train_toks, encoding,
                                                  offsets)

    # Define an array that is 1 whenever empirical_fcount is zero.  In
    # other words, it is one for any feature that's not attested in
    # the training data.  This is used to avoid division by zero.
    unattested = numpy.zeros(len(empirical_fcount))
    for i in range(len(empirical_fcount)):
        if empirical_fcount[i] == 0: unattested[i] = 1

    # Build the classifier.  Start with weight=1 for each feature,
    # except for the unattested features.  Start those out at
    # zero, since we know that's the correct value.
    weights = numpy.ones(len(empirical_fcount), 'd')
    weights -= unattested
    classifier = ConditionalExponentialClassifier(labels, encoding, weights)

    # Old log-likelihood and accuracy; used to check if the change
    # in log-likelihood or accuracy is sufficient to indicate convergence.
    ll_old = None
    acc_old = None

    if trace > 0: print '  ==> Training (%d iterations)' % iterations
    if trace > 2:
        print
        print '      Iteration    Log Likelihood    Accuracy'
        print '      ---------------------------------------'

    # Train for a fixed number of iterations.
    for iternum in range(iterations):
        if trace > 2:
            ll = nltk.classify.util.log_likelihood(classifier, train_toks)
            acc = nltk.classify.util.accuracy(classifier, train_toks)
            print '     %9d    %14.5f    %9.3f' % (iternum + 1, ll, acc)

        # Use the model to estimate the number of times each
        # feature should occur in the training data.
        estimated_fcount = calculate_estimated_fcount(classifier, train_toks,
                                                      encoding, offsets)

        # Avoid division by zero.
        estimated_fcount += unattested

        # Update the classifier weights
        weights = classifier.weights()
        weights *= (empirical_fcount / estimated_fcount)**Cinv
        classifier.set_weights(weights)

        # Check log-likelihood cutoffs.
        if ll_cutoff is not None or lldelta_cutoff is not None:
            ll = nltk.classify.util.log_likelihood(classifier, train_toks)
            if ll_cutoff is not None and ll >= -abs(ll_cutoff): break
            if lldelta_cutoff is not None:
                if ll_old is not None and (ll - ll_old) <= lldelta_cutoff: break
                ll_old = ll

        # Check accuracy cutoffs.
        if acc_cutoff is not None or accdelta_cutoff is not None:
            acc = nltk.classify.util.accuracy(classifier, train_toks)
            if acc_cutoff is not None and acc >= acc_cutoff: break
            if accdelta_cutoff is not None:
                if acc_old is not None and (acc_old - acc) <= accdelta_cutoff: break
                acc_old = acc

    if trace > 2:
        ll = nltk.classify.util.log_likelihood(classifier, train_toks)
        acc = nltk.classify.util.accuracy(classifier, train_toks)
        print '         Final    %14.5f    %9.3f' % (ll, acc)

    # Return the classifier.
    return classifier
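Each GIS iteration compares how often a feature actually occurs in the training data with how often the current model expects it to occur, and scales the weight by that ratio raised to 1/C. The sketch below shows one such update on made-up counts; the arrays and the value C = 4 are assumptions for illustration (in the trainer above, C comes from encoding.C()).

import numpy

# Hypothetical feature counts: observed in the data vs. expected by the model.
empirical_fcount = numpy.array([30.0, 10.0, 0.0, 5.0], 'd')
estimated_fcount = numpy.array([20.0, 10.0, 0.0, 8.0], 'd')

# Mask of unattested features, added to the estimate to avoid division by zero.
unattested = (empirical_fcount == 0).astype('d')
estimated_fcount += unattested

# Initial weights: 1 everywhere, 0 for unattested features.
weights = numpy.ones(len(empirical_fcount), 'd') - unattested

# One GIS update with an assumed C of 4: under-predicted features (ratio > 1)
# grow, over-predicted features (ratio < 1) shrink, unattested ones stay at 0.
Cinv = 1.0 / 4.0
weights *= (empirical_fcount / estimated_fcount) ** Cinv
print(weights)   # approximately [1.107, 1.0, 0.0, 0.889]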