def find_schemas(self, fitness, num_schemas):
    """Evolve a population of motifs and collect the resulting schemas.

    Arguments:

    o fitness - A callable object (ie. function) used to evaluate the
    fitness of a motif.

    o num_schemas - The number of unique schemas with good fitness that
    we want to generate.

    Returns:
    A PatternRepository built from a dictionary mapping the string data
    of each evolved genome to the fitness of its organism.
    """
    # seed the genetic algorithm with organisms built from random motifs
    initial_organisms = Organism.function_population(
        self.motif_generator.random_motif,
        self.initial_population,
        fitness)

    # the finisher supplies the callback the evolver uses to decide
    # when to stop
    stop_check = SimpleFinisher(num_schemas, self.min_generations)

    # run the evolution
    ga_runner = GenerationEvolver(initial_organisms, self.selector)
    final_organisms = ga_runner.evolve(stop_check.is_finished)

    # gather the evolved organisms into the dictionary that backs the
    # PatternRepository
    fitness_by_schema = {}
    for organism in final_organisms:
        # go from a MutableSeq genome to a Seq, so the stored schemas
        # are plain strings (and not array("c")s)
        plain_seq = organism.genome.toseq()
        fitness_by_schema[plain_seq.data] = organism.fitness

    return PatternRepository(fitness_by_schema)
def from_motifs(self, motif_repository, motif_percent, num_ambiguous):
    """Build schemas until they cover the requested share of motifs.

    Arguments:

    o motif_repository - A MotifRepository class holding all of the
    motifs we want to convert to Schema.

    o motif_percent - The percentage of motifs in the motif bank which
    should be matched. Schemas keep being created until the counts they
    account for reach this fraction of the total.

    o num_ambiguous - The number of ambiguous characters to include in
    each schema. The positions of these characters are randomly
    selected.

    Returns:
    A PatternRepository built from a dictionary mapping each new schema
    to the summed counts of the motifs it represents.
    """
    # the pool of motifs that schemas may be drawn from; this list is
    # shrunk in place as motifs get covered
    candidate_motifs = motif_repository.get_top_percentage(motif_percent)

    counts_by_schema = {}

    total_count = self._get_num_motifs(motif_repository, candidate_motifs)
    assert total_count > 0, "Expected to have motifs to match"

    covered_count = 0
    # keep going until the covered fraction reaches the requested level
    while (float(covered_count) / float(total_count)) < motif_percent:
        schema, represented_motifs = self._get_unique_schema(
            counts_by_schema.keys(), candidate_motifs, num_ambiguous)

        tally = 0
        for motif in represented_motifs:
            # add in how often this motif was seen
            tally = tally + motif_repository.count(motif)

            # this motif is now accounted for by the schema, so take it
            # out of the pool
            candidate_motifs.remove(motif)

        # record the schema along with everything it accounts for
        counts_by_schema[schema] = tally
        covered_count = covered_count + tally

    return PatternRepository(counts_by_schema)
def find(self, seq_records, motif_size):
    """Collect every motif of the given size from the passed SeqRecords.

    Arguments:

    o seq_records - A list of SeqRecord objects which the motifs will
    be found from.

    o motif_size - The size of the motifs we want to look for.

    Returns:
    A PatternRepository object that contains all of the motifs (and
    their counts) found in the training sequences.
    """
    # the dictionary comes from the _get_motif_dict helper; this method
    # only wraps it in a repository
    return PatternRepository(
        self._get_motif_dict(seq_records, motif_size))
def find(self, seq_records, signature_size, max_gap):
    """Collect all signatures found in a group of sequences.

    Arguments:

    o seq_records - A list of SeqRecord objects whose sequences are
    used to find signatures.

    o signature_size - The size of each half of a signature (ie. if
    this is set at 3, then the signature could be AGC-----GAC)

    o max_gap - The maximum gap size between two parts of a signature.

    Returns:
    A PatternRepository built from the dictionary produced by
    _get_signature_dict.
    """
    # the dictionary comes from the _get_signature_dict helper; this
    # method only wraps it in a repository
    return PatternRepository(
        self._get_signature_dict(seq_records, signature_size, max_gap))
def find_differences(self, first_records, second_records, motif_size):
    """Find motifs in two sets of records and return the differences.

    This is used for finding motifs, but instead of just counting up all
    of the motifs in a set of records, this returns the differences
    between two listings of seq_records.

    Arguments:

    o first_records, second_records - Two listings of SeqRecord objects
    to have their motifs compared.

    o motif_size - The size of the motifs we are looking for.

    Returns:
    A PatternRepository object that has motifs, but instead of their
    raw counts, each motif has its count in the second set of records
    subtracted from its count in the first set (first - second). A
    motif found only in the first set keeps its first-set count; a
    motif found only in the second set gets the negative of its
    second-set count.
    """
    # both helpers are expected to return plain dictionaries of
    # motif -> count -- TODO confirm against _get_motif_dict
    first_motifs = self._get_motif_dict(first_records, motif_size)
    second_motifs = self._get_motif_dict(second_records, motif_size)

    motif_diffs = {}

    # first deal with all of the keys from the first set of motifs
    for cur_key in first_motifs:
        if cur_key in second_motifs:
            motif_diffs[cur_key] = (first_motifs[cur_key] -
                                    second_motifs[cur_key])
        else:
            motif_diffs[cur_key] = first_motifs[cur_key]

    # now put in the motifs that only show up in the second set; the
    # membership test against the result dictionary skips everything
    # handled above without building and pruning a separate list
    for cur_key in second_motifs:
        if cur_key not in motif_diffs:
            motif_diffs[cur_key] = -second_motifs[cur_key]

    return PatternRepository(motif_diffs)