def make_copy(self, individual):
    """
    Return a copy of `individual`, deep-copying Gene objects and any
    attached strategy parameters so the copy can evolve independently.
    """
    if isinstance(individual.genes[0], Gene):
        genes_copy = list()
        for gene in individual.genes:
            gene_copy = Gene(gene.alleles[:])
            gene_copy.class_label = gene.class_label
            if gene.mutation_step_sizes is not None:
                gene_copy.mutation_step_sizes = gene.mutation_step_sizes[:]
            genes_copy.append(gene_copy)
        genome = Genome(genes_copy, individual.representation,
                        individual.fitness_func, individual.natural_fitness)
    else:
        genome = Genome(individual.genes[:], individual.representation,
                        individual.fitness_func, individual.natural_fitness)
    genome._fitness = individual._fitness
    if hasattr(individual, 'strategy_params') \
            and individual.strategy_params is not None:
        genome.strategy_params = individual.strategy_params.copy()
    if hasattr(individual, 'average_sigmas') \
            and individual.average_sigmas is not None:
        genome.average_sigmas = individual.average_sigmas[:]
    return genome
def main():
    """
    Run the classification GA on the Wisconsin breast cancer data set
    """
    data_file = 'classifier/data/bcw/breast-cancer-wisconsin.data.txt'
    data = list()

    #---------------------------------------------------------------------
    # Load the data
    #---------------------------------------------------------------------
    with open(data_file, 'r') as f:
        for line in f:
            data_line = list()
            # Split the line but throw away the first number (ID number)
            line = line.split(',')[1:]
            # Store class label
            # data_line['class'] = line[-1]
            # Store data, skipping records with missing values
            if '?' not in line:
                for item in line[:-1]:
                    data_line.append(normalise(float(item)))
                data_line.append(int(line[-1].rstrip()))
                data.append(data_line)

    num_genes = 40
    gene_length = 18  # lower + upper bound for each of the 9 data points
    classifier = BcwClassifier(data, num_genes, gene_length)

    #---------------------------------------------------------------------
    # Generate the initial population
    #---------------------------------------------------------------------
    generations = classifier.generations
    p = Population(representation=classifier.representation,
                   size=classifier.population_size,
                   fitness_func=classifier.fitness_func,
                   selection_func=classifier.selection_func,
                   crossover_func=classifier.crossover_func,
                   mutation_func=classifier.mutation_func,
                   natural_fitness=True,
                   crossover_probability=classifier.crossover_prob,
                   mutation_probability=classifier.mutation_prob,
                   elite_count=classifier.elite_count,
                   tournament_size=classifier.tournament_size)
    p.gen_population()

    #---------------------------------------------------------------------
    # Fiddle the population (ugly hack alert)
    #---------------------------------------------------------------------
    for i, individual in enumerate(p.population):
        new_genes = list()
        average_sigmas = list()
        individual.average_sigmas = list()
        for genes in classifier.batch_gen(individual.genes,
                                          classifier.gene_length):
            g = Gene(genes)  # [normalise(float(gene)) for gene in genes]
            g.class_label = 2 if random.random() < 0.5 else 4
            g.mutation_step_sizes = [0.015 for _ in xrange(classifier.gene_length)]
            new_genes.append(g)
            # Update info for plotter
            average_sigmas.append(sum(g.mutation_step_sizes) /
                                  len(g.mutation_step_sizes))
        individual.genes = new_genes
        individual.average_sigmas.append(sum(average_sigmas) / len(average_sigmas))
        # Add strategy parameters
        individual.strategy_params = {'mutation_step_size': 0.05}

    #---------------------------------------------------------------------
    # Run the GA
    #---------------------------------------------------------------------
    p.run(generations)

    #---------------------------------------------------------------------
    # Validate the population
    #---------------------------------------------------------------------
    print
    avg = 0
    for individual in p:
        avg += classifier.fitness_func(individual.genes, validate=True)
    print 'min individual: %d/%dt, %d/%dv (len=%d, num genes=%d) %s' % \
        (classifier.fitness_func(p.min_individual().genes),
         len(classifier.training_set),
         classifier.fitness_func(p.min_individual().genes, validate=True),
         len(classifier.validation_set),
         len(p.min_individual()),
         len(p.min_individual()) / classifier.gene_length,
         p.min_individual())
    print 'mean validation fitness:', avg / len(p)
    print 'max individual: %d/%dt, %d/%dv (len=%d, num genes=%d) %s' % \
        (classifier.fitness_func(p.max_individual().genes),
         len(classifier.training_set),
         classifier.fitness_func(p.max_individual().genes, validate=True),
         len(classifier.validation_set),
         len(p.max_individual()),
         len(p.max_individual()) / classifier.gene_length,
         p.max_individual())

    data = list()
    for chunk in classifier.batch_gen(p.average_sigmas, p.size):
        data.append(sum(chunk) / len(chunk))
    p.add_to_plot(data, 'average sigma')

    for gene in p.max_individual().genes:
        for pair in classifier.batch_gen(gene.alleles, 2):
            print pair
        print gene.class_label
        print

    p.show_plot()
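# NOTE: `batch_gen` (like `chunker` in the other driver script) is defined
# elsewhere in the classifier module. The sketch below only illustrates the
# behaviour the code above assumes, namely yielding consecutive fixed-size
# slices of a sequence; it is not the project's actual implementation and
# the name `batch_gen_sketch` is hypothetical.
def batch_gen_sketch(sequence, size):
    """Yield consecutive slices of `sequence`, each at most `size` items."""
    for start in xrange(0, len(sequence), size):
        yield sequence[start:start + size]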
def main(): """ Run the classification GA on the data file given on the the command line """ if len(sys.argv) != 2: sys.exit('usage: classifier.py datafile') data_file = sys.argv[1] with open(data_file, 'r') as f: # Read the first (informational) line info_line = f.readline().split() # Set length of variables + class gene_length = int(info_line[3]) + 1 # Derive the length of an individual genome_length = (int(info_line[0]) * gene_length) # Binary data (data1.txt) if genome_length == 192: data = [list(line.rstrip().replace(' ', '')) for line in f] classifier = BinaryClassifier(data, gene_length, genome_length) # Binary data (data2.txt) elif genome_length == 448: data = [list(line.rstrip().replace(' ', '')) for line in f] classifier = VariableLengthBinaryClassifier(data, gene_length, genome_length) # Real-valued data elif genome_length == 14000: data = [map(float, line.rstrip().split()) for line in f] # 2 floats (upper, lower) per "bit", plus class gene_length = (gene_length * 2) - 2 genome_length = (int(info_line[0]) * gene_length) classifier = RealValueClassifier(data, gene_length, genome_length) else: raise IOError('unknown data file format') print '[i] loaded data file:', data_file #--------------------------------------------------------------------------- # Generate the initial population #--------------------------------------------------------------------------- generations = classifier.generations p = Population(representation=classifier.representation, size=classifier.population_size, fitness_func=classifier.fitness_func, selection_func=classifier.selection_func, crossover_func=classifier.crossover_func, mutation_func=classifier.mutation_func, natural_fitness=True, crossover_probability=classifier.crossover_prob, mutation_probability=classifier.mutation_prob, elite_count=classifier.elite_count, tournament_size=classifier.tournament_size) p.gen_population() #--------------------------------------------------------------------------- # Fiddle the population (ugly hack alert) #--------------------------------------------------------------------------- step = classifier.gene_length if isinstance(classifier, VariableLengthBinaryClassifier): for individual in p: # Fix a 0 or 1 in the class position for i in xrange(step - 1, len(individual.genes), step): if individual.genes[i] == '#': individual.genes[i] = '1' if random.random() < 0.5 else '0' classifier.genome_lengths.append(len(individual.genes)) if isinstance(classifier, RealValueClassifier): for i, individual in enumerate(p.population): new_genes = list() average_sigmas = list() individual.average_sigmas = list() for genes in classifier.batch_gen(individual.genes, classifier.gene_length): g = Gene(genes) g.class_label = 1 if random.random() < 0.5 else 0 g.mutation_step_sizes = [0.05 for _ in xrange(classifier.gene_length)] new_genes.append(g) # Update info for plotter average_sigmas.append(sum(g.mutation_step_sizes) / len(g.mutation_step_sizes)) individual.genes = new_genes individual.average_sigmas.append(sum(average_sigmas) / len(average_sigmas)) # Add strategy parameters individual.strategy_params = {'mutation_step_size': 0.05} print '[i] fiddled population' # if hasattr(classifier, 'genome_lengths'): # p.add_to_plot([len(i) for i in p], 'avg genome length') #--------------------------------------------------------------------------- # Run the GA #--------------------------------------------------------------------------- p.run(generations) #--------------------------------------------------------------------------- # Validate the 
population #--------------------------------------------------------------------------- print avg = 0 for individual in p: avg += classifier.fitness_func(individual.genes, validate=True) print 'min individual: %d/%dt, %d/%dv (len=%d, num genes=%d) %s' % \ (classifier.fitness_func(p.min_individual().genes), len(classifier.training_set), classifier.fitness_func(p.min_individual().genes, validate=True), len(classifier.validation_set), len(p.min_individual()), len(p.min_individual()) / classifier.gene_length, p.min_individual()) print 'mean validation fitness:', avg / len(p) print 'max individual: %d/%dt, %d/%dv (len=%d, num genes=%d) %s' % \ (classifier.fitness_func(p.max_individual().genes), len(classifier.training_set), classifier.fitness_func(p.max_individual().genes, validate=True), len(classifier.validation_set), len(p.max_individual()), len(p.max_individual()) / classifier.gene_length, p.max_individual()) # TODO: plot amount of generalisation if isinstance(classifier, VariableLengthBinaryClassifier): data = list() for chunk in classifier.chunker(classifier.genome_lengths, p.size): data.append(sum(chunk) / len(chunk)) p.add_to_plot(data, 'avg genome length') for gene in classifier.chunker(p.max_individual().genes, classifier.gene_length): print gene[:-1], gene[-1] elif isinstance(classifier, RealValueClassifier): data = list() for chunk in chunker(p.average_sigmas, p.size): data.append(sum(chunk) / len(chunk)) p.add_to_plot(data, 'average sigma') for gene in p.max_individual().genes: for pair in chunker(gene.alleles, 2): print pair print gene.class_label print p.show_plot()