Example #1
def _clean_alignments(alignments):
    # Take a list of alignments and return a cleaned version.  Remove
    # duplicates, make sure begin and end are set correctly, remove
    # empty alignments.
    alignments = listfns.items(alignments)  # Get rid of duplicates
    i = 0
    while i < len(alignments):
        seqA, seqB, score, begin, end = alignments[i]
        # Make sure end is set reasonably.
        if end is None:  # global alignment
            end = len(seqA)
        elif end < 0:
            end = end + len(seqA)
        # If there's no alignment here, get rid of it.
        if begin >= end:
            del alignments[i]
            continue
        alignments[i] = seqA, seqB, score, begin, end
        i += 1
    return alignments
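For illustration, here is a minimal sketch of how the function above might be exercised. The listfns.items call simply removes duplicates (it comes from the old Bio.listfns helpers), so a small dict-based stand-in is defined for the sketch; it is an assumption, not the real module:

class listfns:
    # Hypothetical stand-in for the old Bio.listfns module: items()
    # removes duplicates, keeping one copy of each element.
    @staticmethod
    def items(seq):
        return list(dict.fromkeys(seq))

alignments = [
    ("ACGT", "AC-T", 3.0, 0, None),   # global alignment, end not set yet
    ("ACGT", "AC-T", 3.0, 0, None),   # exact duplicate, dropped
    ("ACGT", "A--T", 2.0, 2, 2),      # empty span (begin >= end), dropped
]
print(_clean_alignments(alignments))
# [('ACGT', 'AC-T', 3.0, 0, 4)]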
Example #3
def train(training_set, results, priors=None, typecode=None):
    """train(training_set, results[, priors]) -> NaiveBayes

    Train a naive bayes classifier on a training set.  training_set is a
    list of observations.  results is a list of the class assignments
    for each observation.  Thus, training_set and results must be the same
    length.  priors is an optional dictionary specifying the prior
    probabilities for each type of result.  If not specified, the priors
    will be estimated from the training results.

    """
    if not len(training_set):
        raise ValueError, "No data in the training set."
    if len(training_set) != len(results):
        raise ValueError, "training_set and results should be parallel lists."

    # If no typecode is specified, try to pick a reasonable one.  If
    # training_set is a Numeric array, then use that typecode.
    # Otherwise, choose a reasonable default.
    # XXX NOT IMPLEMENTED

    # Check to make sure each vector in the training set has the same
    # dimensionality.
    dimensions = [len(x) for x in training_set]
    if min(dimensions) != max(dimensions):
        raise ValueError, "observations have different dimensionality"

    nb = NaiveBayes()
    nb.dimensionality = dimensions[0]
    
    # Get a list of all the classes.
    nb.classes = listfns.items(results)
    nb.classes.sort()   # keep it tidy
    
    # Estimate the prior probabilities for the classes.
    if priors is not None:
        percs = priors
    else:
        percs = listfns.contents(results)
    nb.p_prior = zeros(len(nb.classes))
    for i in range(len(nb.classes)):
        nb.p_prior[i] = percs[nb.classes[i]]

    # Collect all the observations in each class.  For each class, make a
    # matrix of training instances versus dimensions.  I might be able
    # to optimize this with Numeric, if the training_set parameter
    # were guaranteed to be a matrix.  However, this may not be the
    # case, because the client may be hacking up a sparse matrix or
    # something.
    c2i = listfns.itemindex(nb.classes)      # class to index of class
    observations = [[] for c in nb.classes]  # separate observations by class
    for i in range(len(results)):
        klass, obs = results[i], training_set[i]
        observations[c2i[klass]].append(obs)
    # Now make the observations Numeric matrices.
    for i in range(len(observations)):
        # XXX typecode must be specified!
        observations[i] = asarray(observations[i], typecode)

    # Calculate P(value|class,dim) for every class.
    # This is a good loop to optimize.
    nb.p_conditional = []
    for i in range(len(nb.classes)):
        class_observations = observations[i]   # observations for this class
        nb.p_conditional.append([None] * nb.dimensionality)
        for j in range(nb.dimensionality):
            # Collect all the values in this dimension.
            values = class_observations[:, j]

            # Add pseudocounts here.  This needs to be parameterized.
            #values = list(values) + range(len(nb.classes))  # XXX add 1
            
            # Estimate P(value|class,dim)
            nb.p_conditional[i][j] = listfns.contents(values)
    return nb
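The conditional model built above is just a per-class, per-dimension table of value frequencies. Below is a self-contained sketch of that counting step, with a Counter-based stand-in for listfns.contents (assumed here to map each distinct value to its fraction of the list):

from collections import Counter

def contents(values):
    # Stand-in for listfns.contents: fraction of the list made up by
    # each distinct value.
    total = float(len(values))
    return {value: n / total for value, n in Counter(values).items()}

# Observations for a single class, two dimensions each.
class_observations = [(1, 0), (1, 1), (0, 1)]
for j in range(2):
    values = [obs[j] for obs in class_observations]
    print(j, contents(values))    # estimate of P(value | this class, dim j)
# dimension 0: {1: 2/3, 0: 1/3}; dimension 1: {0: 1/3, 1: 2/3}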
def train(xs, ys, update_fn=None, typecode=None):
    """train(xs, ys[, update_fn]) -> LogisticRegression
    
    Train a logistic regression classifier on a training set.  xs is a
    list of observations and ys is a list of the class assignments,
    which should be 0 or 1.  xs and ys should contain the same number
    of elements.  update_fn is an optional callback function that
    takes as parameters the iteration number and log likelihood.
    
    """
    if len(xs) != len(ys):
        raise ValueError, "xs and ys should be the same length."
    if not xs or not xs[0]:
        raise ValueError, "No observations or observation of 0 dimension."
    classes = listfns.items(ys)
    classes.sort()
    if classes != [0, 1]:
        raise ValueError, "Classes should be 0's and 1's"
    if typecode is None:
        typecode = Float

    # Dimensionality of the data is the dimensionality of the
    # observations plus a constant dimension.
    N, ndims = len(xs), len(xs[0]) + 1

    # Make an X array, with a constant first dimension.
    X = ones((N, ndims), typecode)
    X[:, 1:] = xs
    Xt = transpose(X)
    y = asarray(ys, typecode)

    # Initialize the beta parameter to 0.
    beta = zeros(ndims, typecode)

    MAX_ITERATIONS = 500
    CONVERGE_THRESHOLD = 0.01
    stepsize = 1.0
    # Now iterate using Newton-Raphson until the log-likelihoods
    # converge.
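    # Each step moves beta by (Xt W X)^-1 Xt (y - p); if the log
    # likelihood drops, the previous beta is restored and the step
    # size is halved before the next update.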
    iter = 0
    old_beta = old_llik = None
    while iter < MAX_ITERATIONS:
        # Calculate the probabilities.  p = e^(beta X) / (1+e^(beta X))
        ebetaX = exp(dot(beta, Xt))
        p = ebetaX / (1+ebetaX)
        
        # Find the log likelihood score and see if I've converged.
        logp = y*log(p) + (1-y)*log(1-p)
        llik = sum(logp)
        if update_fn is not None:
            update_fn(iter, llik)
        # Check to see if the likelihood decreased.  If it did, then
        # restore the old beta parameters and halve the step size.
        if old_llik is not None and llik < old_llik:
            stepsize = stepsize / 2.0
            beta = old_beta
        # If I've converged, then stop.
        if old_llik is not None and fabs(llik-old_llik) <= CONVERGE_THRESHOLD:
            break
        old_llik, old_beta = llik, beta
        iter += 1

        W = identity(N) * p
        Xtyp = dot(Xt, y-p)         # Calculate the first derivative.
        XtWX = dot(dot(Xt, W), X)   # Calculate the second derivative.
        #u, s, vt = singular_value_decomposition(XtWX)
        #print "U", u
        #print "S", s
        delta = dot(inverse(XtWX), Xtyp)
        if fabs(stepsize-1.0) > 0.001:
            delta = delta * stepsize
        beta = beta + delta                 # Update beta.
    else:
        raise AssertionError("Didn't converge.")

    lr = LogisticRegression()
    lr.beta = list(map(float, beta))   # Convert back to a regular list.
    return lr
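For comparison, here is a minimal sketch of a single update from the loop above, with modern numpy standing in for the old Numeric and LinearAlgebra calls (an assumption about the environment; the loop builds W as diag(p), and the sketch mirrors that):

import numpy as np

def newton_step(beta, X, y):
    # One Newton-Raphson update, following the loop above.
    p = 1.0 / (1.0 + np.exp(-X.dot(beta)))   # e^(beta X) / (1 + e^(beta X))
    W = np.diag(p)                            # same as identity(N) * p
    first = X.T.dot(y - p)                    # first derivative
    second = X.T.dot(W).dot(X)                # second derivative
    return beta + np.linalg.solve(second, first)

# Tiny example: a constant column plus one feature, as train() builds X.
X = np.array([[1.0, 0.0], [1.0, 1.0], [1.0, 2.0]])
y = np.array([0.0, 1.0, 1.0])
beta = newton_step(np.zeros(2), X, y)
print(beta)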