import math


def weighted_replicate(seq, weights, n):
    """Return n selections from seq, with the count of each element of seq
    proportional to the corresponding weight (filling in fractions randomly).
    >>> weighted_replicate('ABC', [1, 2, 1], 4)
    ['A', 'B', 'B', 'C']
    """
    assert len(seq) == len(weights)
    weights = normalize(weights)
    wholes = [int(w * n) for w in weights]
    fractions = [(w * n) % 1 for w in weights]
    return (flatten([x] * nx for x, nx in zip(seq, wholes)) +
            weighted_sample_with_replacement(n - sum(wholes), seq, fractions))
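# The functions in this file lean on a handful of small helpers; in
# aima-python they live in the utils module. Minimal sketches are given here,
# on the assumption that utils is not imported, so the file runs standalone.

import random


def normalize(numbers):
    """Scale numbers so that they sum to 1."""
    total = sum(numbers)
    return [x / total for x in numbers]


def flatten(seqs):
    """Concatenate an iterable of lists into a single list."""
    return sum(seqs, [])


def clip(x, lowest, highest):
    """Return x clamped to the interval [lowest, highest]."""
    return max(lowest, min(x, highest))


def removeall(item, seq):
    """Return a copy of seq with all occurrences of item removed."""
    return [x for x in seq if x != item]


def weighted_sample_with_replacement(n, seq, weights):
    """Pick n elements of seq at random, with replacement, with the
    probability of each element proportional to its weight."""
    if n <= 0:
        return []
    return random.choices(seq, weights=weights, k=n)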
def ada_boost(dataset, L, K):
    """[Figure 18.34]"""
    examples, target = dataset.examples, dataset.target
    n = len(examples)
    eps = 1 / (2 * n)
    w = [1 / n] * n
    h, z = [], []
    for k in range(K):
        h_k = L(dataset, w)
        h.append(h_k)
        error = sum(weight for example, weight in zip(examples, w)
                    if example[target] != h_k(example))
        # Avoid divide-by-0 from either 0% or 100% error rates.
        error = clip(error, eps, 1 - eps)
        # Shrink the weights of correctly classified examples, so the next
        # round concentrates on the ones h_k got wrong.
        for j, example in enumerate(examples):
            if example[target] == h_k(example):
                w[j] *= error / (1 - error)
        w = normalize(w)
        # Hypothesis weight: log-odds of the (clipped) error rate.
        z.append(math.log((1 - error) / error))
    return weighted_majority(h, z)
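# ada_boost combines the K hypotheses with weighted_majority. A minimal
# sketch, assuming each h_k maps an example to a class label and z holds the
# corresponding hypothesis weights:


def weighted_majority(predictors, weights):
    """Return a predictor that takes a weighted vote among predictors."""
    def predict(example):
        totals = {}
        for predictor, weight in zip(predictors, weights):
            v = predictor(example)
            totals[v] = totals.get(v, 0.0) + weight
        return max(totals, key=totals.get)
    return predict

# Hypothetical usage: L must accept per-example weights. aima-python does this
# by wrapping an unweighted learner so it trains on a dataset resampled with
# weighted_replicate, e.g.
#     h = ada_boost(dataset, WeightedLearner(DecisionTreeLearner), K=5)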
def information_content(values):
    """Number of bits to represent the probability distribution in values."""
    probabilities = normalize(removeall(0, values))
    return sum(-p * math.log2(p) for p in probabilities)
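# Quick sanity checks for information_content (entropy in bits), assuming the
# sketched normalize/removeall helpers above; these values follow exactly from
# the formula:
if __name__ == '__main__':
    assert information_content([1, 1]) == 1.0     # fair coin: 1 bit
    assert information_content([1, 1, 2]) == 1.5  # probabilities 1/4, 1/4, 1/2
    assert information_content([1, 0]) == 0.0     # zero counts are dropped by
                                                  # removeall; certainty costs 0 bits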