def _get_coords(filename):
    """Return the coordinates spanned by the bracketed lines of a file.

    The file is scanned for lines starting with ``"["``. The first such
    line and the last subsequent one are each passed to
    ``_alb_line2coords`` and the two results are transposed with ``zip``.

    Args:
        filename: Path of the text file to read.

    Returns:
        ``[(0, 0), (0, 0)]`` when fewer than two bracketed lines are
        present; otherwise a list of tuples which, per the original
        author's note, has the form ``[(start0, end0), (start1, end1)]``.
    """
    start_line = None
    end_line = None

    # ``open`` replaces the Python 2-only ``file`` builtin, and the context
    # manager guarantees the handle is closed even if iteration fails.
    with open(filename) as alb:
        for line in alb:
            if not line.startswith("["):
                continue
            if start_line is None:
                start_line = line  # rstrip not needed
            else:
                # Keep overwriting so the last bracketed line wins.
                end_line = line

    if end_line is None:  # sequence is too short
        return [(0, 0), (0, 0)]

    # Transpose the per-line results so matching fields are grouped together.
    return list(zip(*map(_alb_line2coords, [start_line, end_line])))
def _get_coords(filename):
    """Return the coordinates spanned by the bracketed lines of a file.

    Lines beginning with ``"["`` are collected: the first one found is the
    start line and the last later one is the end line. Both are converted
    with ``_alb_line2coords`` and the pair of results is transposed.

    Args:
        filename: Path of the text file to read.

    Returns:
        ``[(0, 0), (0, 0)]`` if the file holds fewer than two bracketed
        lines; otherwise a list of tuples which, per the original author's
        note, has the form ``[(start0, end0), (start1, end1)]``.
    """
    first_bracketed = None
    last_bracketed = None

    # Use ``open`` (``file`` only exists on Python 2) inside ``with`` so the
    # file handle is always released.
    with open(filename) as handle:
        for line in handle:
            if line.startswith("["):
                if first_bracketed is None:
                    first_bracketed = line  # rstrip not needed
                else:
                    # Later bracketed lines replace earlier candidates.
                    last_bracketed = line

    if last_bracketed is None:  # sequence is too short
        return [(0, 0), (0, 0)]

    converted = map(_alb_line2coords, [first_bracketed, last_bracketed])
    # zip(*...) transposes the two per-line results.
    return list(zip(*converted))
def __eq__(self, other):
    """Return True when both objects hold equal ``data`` sequences.

    Two objects compare equal when their ``data`` attributes have the same
    length and every pair of corresponding elements compares equal. Two
    empty ``data`` sequences are equal.

    Args:
        other: Object exposing a ``data`` sequence.

    Returns:
        ``True`` if the sequences match element-wise, ``False`` otherwise.
    """
    if len(self.data) != len(other.data):
        return False
    # ``all`` short-circuits on the first mismatch and, unlike ``reduce``
    # without an initial value, copes with two empty sequences.
    return all(mine == theirs for mine, theirs in zip(self.data, other.data))
def train(training_set, results, feature_fns, update_fn=None,
          max_iis_iterations=10000, iis_converge=1.0e-5,
          max_newton_iterations=100, newton_converge=1.0e-10):
    """Train a maximum entropy classifier, returns MaxEntropy object.

    Train a maximum entropy classifier on a training set.
    training_set is a list of observations.  results is a list of the
    class assignments for each observation.  feature_fns is a list of
    the features.  These are callback functions that take an
    observation and class and return a 1 or 0.  update_fn is a
    callback function that is called at each training iteration.  It is
    passed a MaxEntropy object that encapsulates the current state of
    the training.

    The maximum number of iterations and the convergence criterion for IIS
    are given by max_iis_iterations and iis_converge, respectively, while
    max_newton_iterations and newton_converge are the maximum number
    of iterations and the convergence criterion for Newton's method.

    Raises:
        ValueError: If training_set is empty, or training_set and results
            have different lengths.
        RuntimeError: If the summed absolute change in the alphas has not
            dropped below iis_converge after max_iis_iterations rounds.
    """
    if not training_set:
        raise ValueError("No data in the training set.")
    if len(training_set) != len(results):
        raise ValueError("training_set and results should be parallel lists.")

    # Rename variables for convenience.
    xs, ys = training_set, results

    # Get a list of all the classes that need to be trained.
    classes = sorted(set(results))

    # Cache values for all features.
    features = [_eval_feature_fn(fn, training_set, classes)
                for fn in feature_fns]
    # Cache values for f#.
    f_sharp = _calc_f_sharp(len(training_set), len(classes), features)

    # Pre-calculate the empirical expectations of the features.
    e_empirical = _calc_empirical_expects(xs, ys, classes, features)

    # Now train the alpha parameters to weigh each feature.
    alphas = [0.0] * len(features)
    # A bounded ``for`` loop actually enforces max_iis_iterations; the
    # previous ``while`` never advanced its counter, so a non-converging
    # run looped forever and the RuntimeError below was unreachable.
    for _ in range(max_iis_iterations):
        nalphas = _train_iis(xs, classes, features, f_sharp,
                             alphas, e_empirical,
                             max_newton_iterations, newton_converge)

        # Total absolute change of the weights in this round.
        diff = sum(numpy.fabs(x - y) for x, y in zip(alphas, nalphas))
        alphas = nalphas

        # Snapshot the current state so the callback can inspect it.
        me = MaxEntropy()
        me.alphas, me.classes, me.feature_fns = alphas, classes, feature_fns
        if update_fn is not None:
            update_fn(me)

        if diff < iis_converge:  # converged
            break
    else:
        # Only reached when the loop ran out of iterations without a break.
        raise RuntimeError("IIS did not converge")
    return me