def train_maxent_classifier_with_iis(train_toks, trace=3, labels=None,
                                     iterations=20, acc_cutoff=None,
                                     accdelta_cutoff=None, ll_cutoff=None,
                                     lldelta_cutoff=None):
    """
    Train a new C{ConditionalExponentialClassifier}, using the given
    training samples.  This C{ConditionalExponentialClassifier} will
    encode the model that maximizes entropy from all the models that
    are empirically consistent with C{train_toks}.

    The weights are fitted iteratively: each iteration asks
    C{calculate_deltas()} for a per-feature update and multiplies the
    current weights by C{2**deltas}.

    See L{train_maxent_classifier()} for parameter descriptions.

    @param train_toks: The training samples.
    @param trace: Verbosity.  Values above 0 print a header line;
        values above 2 also print the log likelihood and accuracy
        before every iteration and once more at the end.
    @param labels: The labels the classifier should choose from.  If
        empty or C{None}, C{attested_labels(train_toks)} is used.
    @param iterations: The maximum number of training iterations.
    @param acc_cutoff: Stop as soon as the training-set accuracy is
        greater than or equal to this value.
    @param accdelta_cutoff: Stop as soon as the accuracy gain between
        two consecutive iterations is at most this value.
    @param ll_cutoff: Stop as soon as the training-set log likelihood
        is greater than or equal to C{-abs(ll_cutoff)}.
    @param lldelta_cutoff: Stop as soon as the log likelihood gain
        between two consecutive iterations is at most
        C{abs(lldelta_cutoff)}.
    @return: The trained classifier.
    """
    # Fill in default args, & take abs values of ll cutoffs.  Labels
    # supplied by the caller are honoured; the attested labels are
    # only a fallback.
    if not labels:
        labels = attested_labels(train_toks)
    if ll_cutoff:
        ll_cutoff = abs(ll_cutoff)
    if lldelta_cutoff:
        lldelta_cutoff = abs(lldelta_cutoff)

    # Construct an encoding from the training data.
    encoding = SparseBinaryVectorEncoding.train(train_toks)

    # Build the offsets dictionary.  This maps from a class to the
    # index in the weight vector where that class's weights begin.
    encoding_length = encoding.length()
    offsets = dict((label, i * encoding_length)
                   for i, label in enumerate(labels))

    # Relative frequency of each feature in the training data.  The
    # explicit float() keeps this a true division even if the counts
    # come back as integers.
    empirical_ffreq = (calculate_empirical_fcount(train_toks, encoding,
                                                  offsets)
                       / float(len(train_toks)))

    # Find the nf map, and the related variables nfarray and
    # nftranspose.  nfarray lists the keys of nfmap ordered by their
    # mapped value; nftranspose is nfarray reshaped into a column
    # vector.  All three are handed to calculate_deltas() below.
    nfmap = calculate_nfmap(train_toks, encoding)
    nfarray = numpy.array(sorted(nfmap, key=nfmap.__getitem__), 'd')
    nftranspose = numpy.reshape(nfarray, (len(nfarray), 1))

    # An array that is 1 whenever empirical_ffreq is zero.  In other
    # words, it is one for any feature that's not attested in the
    # data.  This is used to avoid division by zero.
    unattested = (numpy.asarray(empirical_ffreq) == 0).astype('d')

    # Build the classifier.  Start with weight=1 for each feature,
    # except for the unattested features.  Start those out at
    # zero, since we know that's the correct value.
    weights = numpy.ones(len(empirical_ffreq), 'd')
    weights -= unattested
    classifier = ConditionalExponentialClassifier(labels, encoding, weights)

    # Old log-likelihood and accuracy; used to check if the change
    # in log-likelihood or accuracy is sufficient to indicate
    # convergence.  None means "no previous iteration yet".
    ll_old = None
    acc_old = None

    # Single-argument print(...) produces the same output under both
    # the Python 2 print statement and the Python 3 print function.
    if trace > 0:
        print(' ==> Training (%d iterations)' % iterations)
    if trace > 2:
        print('')
        print(' Iteration Log Likelihood Accuracy')
        print(' ---------------------------------------')

    # Train for a fixed number of iterations (or until a cutoff hits).
    for iternum in range(iterations):
        if trace > 2:
            ll = nltk.classify.util.log_likelihood(classifier, train_toks)
            acc = nltk.classify.util.accuracy(classifier, train_toks)
            print(' %9d %14.5f %9.3f' % (iternum + 1, ll, acc))

        # Calculate the deltas for this iteration, using Newton's method.
        deltas = calculate_deltas(
            train_toks, classifier, unattested, empirical_ffreq,
            nfmap, nfarray, nftranspose, offsets, encoding)

        # Use the deltas to update our weights.
        # NOTE(review): the update is base 2 rather than numpy.exp();
        # presumably this matches the base used by calculate_deltas()
        # and the classifier -- confirm before changing.
        weights = classifier.weights()
        weights *= 2 ** deltas
        classifier.set_weights(weights)

        # Check log-likelihood cutoffs.
        if ll_cutoff is not None or lldelta_cutoff is not None:
            ll = nltk.classify.util.log_likelihood(classifier, train_toks)
            if ll_cutoff is not None and ll >= -ll_cutoff:
                break
            if lldelta_cutoff is not None:
                if ll_old is not None and (ll - ll_old) <= lldelta_cutoff:
                    break
                ll_old = ll

        # Check accuracy cutoffs.
        if acc_cutoff is not None or accdelta_cutoff is not None:
            acc = nltk.classify.util.accuracy(classifier, train_toks)
            if acc_cutoff is not None and acc >= acc_cutoff:
                break
            if accdelta_cutoff is not None:
                # Gain is new minus old, mirroring the ll check above.
                if (acc_old is not None
                        and (acc - acc_old) <= accdelta_cutoff):
                    break
                acc_old = acc

    if trace > 2:
        ll = nltk.classify.util.log_likelihood(classifier, train_toks)
        acc = nltk.classify.util.accuracy(classifier, train_toks)
        print(' Final %14.5f %9.3f' % (ll, acc))

    # Return the classifier.
    return classifier
def train_maxent_classifier_with_iis(train_toks, trace=3, labels=None,
                                     iterations=20, acc_cutoff=None,
                                     accdelta_cutoff=None, ll_cutoff=None,
                                     lldelta_cutoff=None):
    """
    Train a new C{ConditionalExponentialClassifier}, using the given
    training samples.  This C{ConditionalExponentialClassifier} will
    encode the model that maximizes entropy from all the models that
    are empirically consistent with C{train_toks}.

    The weights are fitted iteratively: each iteration asks
    C{calculate_deltas()} for a per-feature update and multiplies the
    current weights by C{2**deltas}.

    See L{train_maxent_classifier()} for parameter descriptions.

    @param train_toks: The training samples.
    @param trace: Verbosity.  Values above 0 print a header line;
        values above 2 also print the log likelihood and accuracy
        before every iteration and once more at the end.
    @param labels: The labels the classifier should choose from.  If
        empty or C{None}, C{attested_labels(train_toks)} is used.
    @param iterations: The maximum number of training iterations.
    @param acc_cutoff: Stop as soon as the training-set accuracy is
        greater than or equal to this value.
    @param accdelta_cutoff: Stop as soon as the accuracy gain between
        two consecutive iterations is at most this value.
    @param ll_cutoff: Stop as soon as the training-set log likelihood
        is greater than or equal to C{-abs(ll_cutoff)}.
    @param lldelta_cutoff: Stop as soon as the log likelihood gain
        between two consecutive iterations is at most
        C{abs(lldelta_cutoff)}.
    @return: The trained classifier.
    """
    # Fill in default args, & take abs values of ll cutoffs.  Labels
    # supplied by the caller are honoured; the attested labels are
    # only a fallback.
    if not labels:
        labels = attested_labels(train_toks)
    if ll_cutoff:
        ll_cutoff = abs(ll_cutoff)
    if lldelta_cutoff:
        lldelta_cutoff = abs(lldelta_cutoff)

    # Construct an encoding from the training data.
    encoding = SparseBinaryVectorEncoding.train(train_toks)

    # Build the offsets dictionary.  This maps from a class to the
    # index in the weight vector where that class's weights begin.
    encoding_length = encoding.length()
    offsets = dict((label, i * encoding_length)
                   for i, label in enumerate(labels))

    # Relative frequency of each feature in the training data.  The
    # explicit float() keeps this a true division even if the counts
    # come back as integers.
    empirical_ffreq = (calculate_empirical_fcount(train_toks, encoding,
                                                  offsets)
                       / float(len(train_toks)))

    # Find the nf map, and the related variables nfarray and
    # nftranspose.  nfarray lists the keys of nfmap ordered by their
    # mapped value; nftranspose is nfarray reshaped into a column
    # vector.  All three are handed to calculate_deltas() below.
    nfmap = calculate_nfmap(train_toks, encoding)
    nfarray = numpy.array(sorted(nfmap, key=nfmap.__getitem__), 'd')
    nftranspose = numpy.reshape(nfarray, (len(nfarray), 1))

    # An array that is 1 whenever empirical_ffreq is zero.  In other
    # words, it is one for any feature that's not attested in the
    # data.  This is used to avoid division by zero.
    unattested = (numpy.asarray(empirical_ffreq) == 0).astype('d')

    # Build the classifier.  Start with weight=1 for each feature,
    # except for the unattested features.  Start those out at
    # zero, since we know that's the correct value.
    weights = numpy.ones(len(empirical_ffreq), 'd')
    weights -= unattested
    classifier = ConditionalExponentialClassifier(labels, encoding, weights)

    # Old log-likelihood and accuracy; used to check if the change
    # in log-likelihood or accuracy is sufficient to indicate
    # convergence.  None means "no previous iteration yet".
    ll_old = None
    acc_old = None

    # Single-argument print(...) produces the same output under both
    # the Python 2 print statement and the Python 3 print function.
    if trace > 0:
        print(' ==> Training (%d iterations)' % iterations)
    if trace > 2:
        print('')
        print(' Iteration Log Likelihood Accuracy')
        print(' ---------------------------------------')

    # Train for a fixed number of iterations (or until a cutoff hits).
    for iternum in range(iterations):
        if trace > 2:
            ll = nltk.classify.util.log_likelihood(classifier, train_toks)
            acc = nltk.classify.util.accuracy(classifier, train_toks)
            print(' %9d %14.5f %9.3f' % (iternum + 1, ll, acc))

        # Calculate the deltas for this iteration, using Newton's method.
        deltas = calculate_deltas(
            train_toks, classifier, unattested, empirical_ffreq,
            nfmap, nfarray, nftranspose, offsets, encoding)

        # Use the deltas to update our weights.
        # NOTE(review): the update is base 2 rather than numpy.exp();
        # presumably this matches the base used by calculate_deltas()
        # and the classifier -- confirm before changing.
        weights = classifier.weights()
        weights *= 2 ** deltas
        classifier.set_weights(weights)

        # Check log-likelihood cutoffs.
        if ll_cutoff is not None or lldelta_cutoff is not None:
            ll = nltk.classify.util.log_likelihood(classifier, train_toks)
            if ll_cutoff is not None and ll >= -ll_cutoff:
                break
            if lldelta_cutoff is not None:
                if ll_old is not None and (ll - ll_old) <= lldelta_cutoff:
                    break
                ll_old = ll

        # Check accuracy cutoffs.
        if acc_cutoff is not None or accdelta_cutoff is not None:
            acc = nltk.classify.util.accuracy(classifier, train_toks)
            if acc_cutoff is not None and acc >= acc_cutoff:
                break
            if accdelta_cutoff is not None:
                # Gain is new minus old, mirroring the ll check above.
                if (acc_old is not None
                        and (acc - acc_old) <= accdelta_cutoff):
                    break
                acc_old = acc

    if trace > 2:
        ll = nltk.classify.util.log_likelihood(classifier, train_toks)
        acc = nltk.classify.util.accuracy(classifier, train_toks)
        print(' Final %14.5f %9.3f' % (ll, acc))

    # Return the classifier.
    return classifier
def train_maxent_classifier_with_gis(train_toks, trace=3, labels=None,
                                     iterations=20, acc_cutoff=None,
                                     accdelta_cutoff=None, ll_cutoff=None,
                                     lldelta_cutoff=None):
    """
    Train a new C{ConditionalExponentialClassifier}, using the given
    training samples.  This C{ConditionalExponentialClassifier} will
    encode the model that maximizes entropy from all the models that
    are empirically consistent with C{train_toks}.

    The weights are fitted iteratively: each iteration multiplies
    every weight by C{(empirical_fcount / estimated_fcount) ** Cinv},
    where C{Cinv} is C{1.0 / encoding.C()}.

    See L{train_maxent_classifier()} for parameter descriptions.

    @param train_toks: The training samples.
    @param trace: Verbosity.  Values above 0 print a header line;
        values above 2 also print the log likelihood and accuracy
        before every iteration and once more at the end.
    @param labels: The labels the classifier should choose from.  If
        empty or C{None}, C{attested_labels(train_toks)} is used.
    @param iterations: The maximum number of training iterations.
    @param acc_cutoff: Stop as soon as the training-set accuracy is
        greater than or equal to this value.
    @param accdelta_cutoff: Stop as soon as the accuracy gain between
        two consecutive iterations is at most this value.
    @param ll_cutoff: Stop as soon as the training-set log likelihood
        is greater than or equal to C{-abs(ll_cutoff)}.
    @param lldelta_cutoff: Stop as soon as the log likelihood gain
        between two consecutive iterations is at most
        C{abs(lldelta_cutoff)}.
    @return: The trained classifier.
    """
    # Fill in default args, & take abs values of ll cutoffs.
    if not labels:
        labels = attested_labels(train_toks)
    if ll_cutoff:
        ll_cutoff = abs(ll_cutoff)
    if lldelta_cutoff:
        lldelta_cutoff = abs(lldelta_cutoff)

    # Construct an encoding from the training data.
    encoding = GISEncoding.train(train_toks)

    # Cinv is the inverse of the sum of each vector.  This controls
    # the learning rate: higher Cinv (or lower C) gives faster
    # learning.
    Cinv = 1.0 / encoding.C()

    # Build the offsets dictionary.  This maps from a class to the
    # index in the weight vector where that class's weights begin.
    encoding_length = encoding.length()
    offsets = dict((label, i * encoding_length)
                   for i, label in enumerate(labels))

    # Count how many times each feature occurs in the training data.
    empirical_fcount = calculate_empirical_fcount(train_toks, encoding,
                                                  offsets)

    # Define an array that is 1 whenever empirical_fcount is zero.  In
    # other words, it is one for any feature that's not attested in
    # the training data.  This is used to avoid division by zero.
    unattested = (numpy.asarray(empirical_fcount) == 0).astype('d')

    # Build the classifier.  Start with weight=1 for each feature,
    # except for the unattested features.  Start those out at
    # zero, since we know that's the correct value.
    weights = numpy.ones(len(empirical_fcount), 'd')
    weights -= unattested
    classifier = ConditionalExponentialClassifier(labels, encoding, weights)

    # Old log-likelihood and accuracy; used to check if the change
    # in log-likelihood or accuracy is sufficient to indicate
    # convergence.  None means "no previous iteration yet".
    ll_old = None
    acc_old = None

    # Single-argument print(...) produces the same output under both
    # the Python 2 print statement and the Python 3 print function.
    if trace > 0:
        print(' ==> Training (%d iterations)' % iterations)
    if trace > 2:
        print('')
        print(' Iteration Log Likelihood Accuracy')
        print(' ---------------------------------------')

    # Train for a fixed number of iterations (or until a cutoff hits).
    for iternum in range(iterations):
        if trace > 2:
            ll = nltk.classify.util.log_likelihood(classifier, train_toks)
            acc = nltk.classify.util.accuracy(classifier, train_toks)
            print(' %9d %14.5f %9.3f' % (iternum + 1, ll, acc))

        # Use the model to estimate the number of times each
        # feature should occur in the training data.
        estimated_fcount = calculate_estimated_fcount(classifier, train_toks,
                                                      encoding, offsets)

        # Avoid division by zero: unattested entries get a denominator
        # of at least 1, and their numerator (empirical count) is 0.
        estimated_fcount += unattested

        # Update the classifier weights.
        weights = classifier.weights()
        weights *= (empirical_fcount / estimated_fcount) ** Cinv
        classifier.set_weights(weights)

        # Check log-likelihood cutoffs.  ll_cutoff was already made
        # non-negative above, so no further abs() is needed here.
        if ll_cutoff is not None or lldelta_cutoff is not None:
            ll = nltk.classify.util.log_likelihood(classifier, train_toks)
            if ll_cutoff is not None and ll >= -ll_cutoff:
                break
            if lldelta_cutoff is not None:
                # Compare against None explicitly so that a previous
                # value of exactly 0.0 still takes part in the check.
                if ll_old is not None and (ll - ll_old) <= lldelta_cutoff:
                    break
                ll_old = ll

        # Check accuracy cutoffs.
        if acc_cutoff is not None or accdelta_cutoff is not None:
            acc = nltk.classify.util.accuracy(classifier, train_toks)
            if acc_cutoff is not None and acc >= acc_cutoff:
                break
            if accdelta_cutoff is not None:
                # Gain is new minus old, mirroring the ll check above.
                if (acc_old is not None
                        and (acc - acc_old) <= accdelta_cutoff):
                    break
                acc_old = acc

    if trace > 2:
        ll = nltk.classify.util.log_likelihood(classifier, train_toks)
        acc = nltk.classify.util.accuracy(classifier, train_toks)
        print(' Final %14.5f %9.3f' % (ll, acc))

    # Return the classifier.
    return classifier
def train_maxent_classifier_with_gis(train_toks, trace=3, labels=None,
                                     iterations=20, acc_cutoff=None,
                                     accdelta_cutoff=None, ll_cutoff=None,
                                     lldelta_cutoff=None):
    """
    Train a new C{ConditionalExponentialClassifier}, using the given
    training samples.  This C{ConditionalExponentialClassifier} will
    encode the model that maximizes entropy from all the models that
    are empirically consistent with C{train_toks}.

    The weights are fitted iteratively: each iteration multiplies
    every weight by C{(empirical_fcount / estimated_fcount) ** Cinv},
    where C{Cinv} is C{1.0 / encoding.C()}.

    See L{train_maxent_classifier()} for parameter descriptions.

    @param train_toks: The training samples.
    @param trace: Verbosity.  Values above 0 print a header line;
        values above 2 also print the log likelihood and accuracy
        before every iteration and once more at the end.
    @param labels: The labels the classifier should choose from.  If
        empty or C{None}, C{attested_labels(train_toks)} is used.
    @param iterations: The maximum number of training iterations.
    @param acc_cutoff: Stop as soon as the training-set accuracy is
        greater than or equal to this value.
    @param accdelta_cutoff: Stop as soon as the accuracy gain between
        two consecutive iterations is at most this value.
    @param ll_cutoff: Stop as soon as the training-set log likelihood
        is greater than or equal to C{-abs(ll_cutoff)}.
    @param lldelta_cutoff: Stop as soon as the log likelihood gain
        between two consecutive iterations is at most
        C{abs(lldelta_cutoff)}.
    @return: The trained classifier.
    """
    # Fill in default args, & take abs values of ll cutoffs.
    if not labels:
        labels = attested_labels(train_toks)
    if ll_cutoff:
        ll_cutoff = abs(ll_cutoff)
    if lldelta_cutoff:
        lldelta_cutoff = abs(lldelta_cutoff)

    # Construct an encoding from the training data.
    encoding = GISEncoding.train(train_toks)

    # Cinv is the inverse of the sum of each vector.  This controls
    # the learning rate: higher Cinv (or lower C) gives faster
    # learning.
    Cinv = 1.0 / encoding.C()

    # Build the offsets dictionary.  This maps from a class to the
    # index in the weight vector where that class's weights begin.
    encoding_length = encoding.length()
    offsets = dict((label, i * encoding_length)
                   for i, label in enumerate(labels))

    # Count how many times each feature occurs in the training data.
    empirical_fcount = calculate_empirical_fcount(train_toks, encoding,
                                                  offsets)

    # Define an array that is 1 whenever empirical_fcount is zero.  In
    # other words, it is one for any feature that's not attested in
    # the training data.  This is used to avoid division by zero.
    unattested = (numpy.asarray(empirical_fcount) == 0).astype('d')

    # Build the classifier.  Start with weight=1 for each feature,
    # except for the unattested features.  Start those out at
    # zero, since we know that's the correct value.
    weights = numpy.ones(len(empirical_fcount), 'd')
    weights -= unattested
    classifier = ConditionalExponentialClassifier(labels, encoding, weights)

    # Old log-likelihood and accuracy; used to check if the change
    # in log-likelihood or accuracy is sufficient to indicate
    # convergence.  None means "no previous iteration yet".
    ll_old = None
    acc_old = None

    # Single-argument print(...) produces the same output under both
    # the Python 2 print statement and the Python 3 print function.
    if trace > 0:
        print(' ==> Training (%d iterations)' % iterations)
    if trace > 2:
        print('')
        print(' Iteration Log Likelihood Accuracy')
        print(' ---------------------------------------')

    # Train for a fixed number of iterations (or until a cutoff hits).
    for iternum in range(iterations):
        if trace > 2:
            ll = nltk.classify.util.log_likelihood(classifier, train_toks)
            acc = nltk.classify.util.accuracy(classifier, train_toks)
            print(' %9d %14.5f %9.3f' % (iternum + 1, ll, acc))

        # Use the model to estimate the number of times each
        # feature should occur in the training data.
        estimated_fcount = calculate_estimated_fcount(classifier, train_toks,
                                                      encoding, offsets)

        # Avoid division by zero: unattested entries get a denominator
        # of at least 1, and their numerator (empirical count) is 0.
        estimated_fcount += unattested

        # Update the classifier weights.
        weights = classifier.weights()
        weights *= (empirical_fcount / estimated_fcount) ** Cinv
        classifier.set_weights(weights)

        # Check log-likelihood cutoffs.  ll_cutoff was already made
        # non-negative above, so no further abs() is needed here.
        if ll_cutoff is not None or lldelta_cutoff is not None:
            ll = nltk.classify.util.log_likelihood(classifier, train_toks)
            if ll_cutoff is not None and ll >= -ll_cutoff:
                break
            if lldelta_cutoff is not None:
                # Compare against None explicitly so that a previous
                # value of exactly 0.0 still takes part in the check.
                if ll_old is not None and (ll - ll_old) <= lldelta_cutoff:
                    break
                ll_old = ll

        # Check accuracy cutoffs.
        if acc_cutoff is not None or accdelta_cutoff is not None:
            acc = nltk.classify.util.accuracy(classifier, train_toks)
            if acc_cutoff is not None and acc >= acc_cutoff:
                break
            if accdelta_cutoff is not None:
                # Gain is new minus old, mirroring the ll check above.
                if (acc_old is not None
                        and (acc - acc_old) <= accdelta_cutoff):
                    break
                acc_old = acc

    if trace > 2:
        ll = nltk.classify.util.log_likelihood(classifier, train_toks)
        acc = nltk.classify.util.accuracy(classifier, train_toks)
        print(' Final %14.5f %9.3f' % (ll, acc))

    # Return the classifier.
    return classifier