def _clean_alignments(alignments): # Take a list of alignments and return a cleaned version. Remove # duplicates, make sure begin and end are set correctly, remove # empty alignments. alignments = listfns.items(alignments) # Get rid of duplicates i = 0 while i < len(alignments): seqA, seqB, score, begin, end = alignments[i] # Make sure end is set reasonably. if end is None: # global alignment end = len(seqA) elif end < 0: end = end + len(seqA) # If there's no alignment here, get rid of it. if begin >= end: del alignments[i] continue alignments[i] = seqA, seqB, score, begin, end i += 1 return alignments
def train(training_set, results, priors=None, typecode=None):
    """train(training_set, results[, priors[, typecode]]) -> NaiveBayes

    Train a naive bayes classifier on a training set.  training_set is a
    list of observations.  results is a list of the class assignments
    for each observation.  Thus, training_set and results must be the
    same length.  priors is an optional dictionary specifying the prior
    probabilities for each type of result.  If not specified, the priors
    will be estimated from the training results.  typecode is passed
    through to asarray when the per-class observation matrices are
    built.

    The returned NaiveBayes object has dimensionality, classes (sorted),
    p_prior and p_conditional set.  p_conditional[i][j] is the result of
    listfns.contents applied to the values seen in dimension j among the
    observations of class i.

    Raises ValueError if the training set is empty, if training_set and
    results differ in length, or if the observations do not all have the
    same number of dimensions.

    """
    if not len(training_set):
        raise ValueError("No data in the training set.")
    if len(training_set) != len(results):
        raise ValueError("training_set and results should be parallel lists.")

    # If no typecode is specified, try to pick a reasonable one.  If
    # training_set is a Numeric array, then use that typecode.
    # Otherwise, choose a reasonable default.
    # XXX NOT IMPLEMENTED

    # Check to make sure each vector in the training set has the same
    # dimensionality.
    dimensions = [len(x) for x in training_set]
    if min(dimensions) != max(dimensions):
        raise ValueError("observations have different dimensionality")

    nb = NaiveBayes()
    nb.dimensionality = dimensions[0]

    # Get a list of all the classes, sorted to keep it tidy.  sorted()
    # always yields a fresh list, whatever kind of collection
    # listfns.items hands back.
    nb.classes = sorted(listfns.items(results))

    # Estimate the prior probabilities for the classes.
    if priors is not None:
        percs = priors
    else:
        percs = listfns.contents(results)
    # NOTE(review): zeros is called without a typecode; confirm that the
    # resulting array can hold fractional priors.
    nb.p_prior = zeros(len(nb.classes))
    for i, klass in enumerate(nb.classes):
        nb.p_prior[i] = percs[klass]

    # Collect all the observations in class.  For each class, make a
    # matrix of training instances versus dimensions.  I might be able
    # to optimize this with Numeric, if the training_set parameter
    # were guaranteed to be a matrix.  However, this may not be the
    # case, because the client may be hacking up a sparse matrix or
    # something.
    c2i = listfns.itemindex(nb.classes)      # class to index of class
    observations = [[] for c in nb.classes]  # separate observations by class
    for klass, obs in zip(results, training_set):
        observations[c2i[klass]].append(obs)

    # Now make the observations Numeric matrices.
    for i in range(len(observations)):
        # XXX typecode must be specified!
        observations[i] = asarray(observations[i], typecode)

    # Calculate P(value|class,dim) for every class.
    # This is a good loop to optimize.
    # XXX Pseudocounts are not added here; this needs to be parameterized.
    nb.p_conditional = []
    for class_observations in observations:
        # One entry per dimension, each estimated from that column of
        # this class's observation matrix.
        nb.p_conditional.append(
            [listfns.contents(class_observations[:, j])
             for j in range(nb.dimensionality)])
    return nb
def train(xs, ys, update_fn=None, typecode=None):
    """train(xs, ys[, update_fn[, typecode]]) -> LogisticRegression

    Train a logistic regression classifier on a training set.  xs is a
    list of observations and ys is a list of the class assignments,
    which should be 0 or 1.  xs and ys should contain the same number
    of elements.  update_fn is an optional callback function that
    takes as parameters the iteration number and log likelihood.
    typecode is the array typecode used for the computation; it
    defaults to Float.

    The returned LogisticRegression object has its beta attribute set
    to a list of floats: the constant term followed by one coefficient
    per dimension of the observations.

    Raises ValueError if xs and ys differ in length, if there are no
    observations or the first observation is empty, or if the classes
    in ys are not exactly 0 and 1.  Raises AssertionError if the
    log likelihood has not converged after 500 iterations.

    """
    if len(xs) != len(ys):
        raise ValueError("xs and ys should be the same length.")
    if not xs or not xs[0]:
        raise ValueError("No observations or observation of 0 dimension.")
    classes = sorted(listfns.items(ys))
    if classes != [0, 1]:
        raise ValueError("Classes should be 0's and 1's")
    if typecode is None:
        typecode = Float

    # Dimensionality of the data is the dimensionality of the
    # observations plus a constant dimension.
    N, ndims = len(xs), len(xs[0]) + 1

    # Make an X array, with a constant first dimension.
    X = ones((N, ndims), typecode)
    X[:, 1:] = xs
    Xt = transpose(X)
    y = asarray(ys, typecode)

    # Initialize the beta parameter to 0.
    beta = zeros(ndims, typecode)

    MAX_ITERATIONS = 500
    CONVERGE_THRESHOLD = 0.01
    stepsize = 1.0
    # Now iterate using Newton-Raphson until the log-likelihoods
    # converge.
    iteration = 0
    old_beta = old_llik = None
    while iteration < MAX_ITERATIONS:
        # Calculate the probabilities.  p = e^(beta X) / (1+e^(beta X))
        ebetaX = exp(dot(beta, Xt))
        p = ebetaX / (1 + ebetaX)

        # Find the log likelihood score and see if I've converged.
        logp = y * log(p) + (1 - y) * log(1 - p)
        llik = sum(logp)
        if update_fn is not None:
            update_fn(iteration, llik)

        # There is nothing to compare against on the first pass, so
        # both checks are skipped until a previous likelihood exists.
        if old_llik is not None:
            # Check to see if the likelihood decreased.  If it did,
            # then restore the old beta parameters and halve the step
            # size.
            # NOTE(review): p above was computed from the rejected
            # beta and still feeds the update below -- confirm this
            # is intended.
            if llik < old_llik:
                stepsize = stepsize / 2.0
                beta = old_beta
            # If I've converged, then stop.
            if fabs(llik - old_llik) <= CONVERGE_THRESHOLD:
                break
        old_llik, old_beta = llik, beta
        iteration += 1

        # NOTE(review): the weights here are p; the textbook
        # Newton-Raphson weights are p*(1-p) -- confirm.
        W = identity(N) * p
        Xtyp = dot(Xt, y - p)         # Calculate the first derivative.
        XtWX = dot(dot(Xt, W), X)     # Calculate the second derivative.
        delta = dot(inverse(XtWX), Xtyp)
        if fabs(stepsize - 1.0) > 0.001:
            delta = delta * stepsize
        beta = beta + delta           # Update beta.
    else:
        # The loop ran out of iterations without hitting the break.
        raise AssertionError("Didn't converge.")

    lr = LogisticRegression()
    # Convert back to a regular list of Python floats.
    lr.beta = [float(b) for b in beta]
    return lr