Esempio n. 1
0
    def find_schemas(self, fitness, num_schemas):
        """Evolve a population of motifs and collect the resulting schemas.

        Arguments:

        o fitness - A callable object (e.g. a function) used to evaluate
        the fitness of a motif; it is handed to the starting population.

        o num_schemas - The number of unique schemas with good fitness
        that we want to end up with; it is handed to the finisher.

        Returns:
        A PatternRepository built from a dictionary that maps the data of
        each evolved genome to the fitness of its organism.
        """
        # seed the run with organisms built from the random motif generator
        initial_organisms = Organism.function_population(
            self.motif_generator.random_motif,
            self.initial_population,
            fitness)

        # the finisher supplies the callback handed to the evolver
        stop_checker = SimpleFinisher(num_schemas, self.min_generations)

        # build the evolver and run it with the finisher's callback
        final_organisms = GenerationEvolver(
            initial_organisms, self.selector).evolve(stop_checker.is_finished)

        # key every entry by the data of the Seq version of the genome, so
        # the schemas are plain strings instead of mutable sequences
        fitness_by_schema = {}
        for organism in final_organisms:
            schema = organism.genome.toseq().data
            fitness_by_schema[schema] = organism.fitness

        return PatternRepository(fitness_by_schema)
Esempio n. 2
0
    def from_motifs(self, motif_repository, motif_percent, num_ambiguous):
        """Build schemas that cover the requested share of a set of motifs.

        Arguments:

        o motif_repository - A MotifRepository class holding all of the
        motifs we want to turn into schemas.

        o motif_percent - The share of motifs in the repository that the
        generated schemas should match. It is compared directly against
        the ratio of matched counts to total counts.

        o num_ambiguous - The number of ambiguous characters to put in
        each schema. Their positions are randomly selected.

        Returns:
        A PatternRepository built from a dictionary that maps each schema
        to the summed counts of the motifs it matched.
        """
        # the motifs we are allowed to work with
        candidate_motifs = motif_repository.get_top_percentage(motif_percent)

        # the denominator for the coverage ratio; it must be positive
        total_count = self._get_num_motifs(motif_repository, candidate_motifs)
        assert total_count > 0, "Expected to have motifs to match"

        counts_by_schema = {}
        covered_count = 0

        # keep adding schemas until enough of the motif counts are covered
        while float(covered_count) / float(total_count) < motif_percent:
            schema, covered_motifs = self._get_unique_schema(
                counts_by_schema.keys(), candidate_motifs, num_ambiguous)

            schema_total = 0
            for motif in covered_motifs:
                # tally the repository count of this motif for the schema
                schema_total += motif_repository.count(motif)

                # the schema now stands in for this motif, so take it out
                # of the pool used for later schemas
                candidate_motifs.remove(motif)

            counts_by_schema[schema] = schema_total
            covered_count += schema_total

        return PatternRepository(counts_by_schema)
Esempio n. 3
0
    def find(self, seq_records, motif_size):
        """Collect every motif of one size from a group of SeqRecords.

        Arguments:

        o seq_records - A list of SeqRecord objects to search for motifs.

        o motif_size - The size of the motifs to look for.

        Returns:
        A PatternRepository object wrapping the motif information (motifs
        and their counts) gathered from the given sequences.
        """
        # the private helper does the gathering; we only wrap its result
        return PatternRepository(
            self._get_motif_dict(seq_records, motif_size))
Esempio n. 4
0
    def find(self, seq_records, signature_size, max_gap):
        """Collect every signature from a group of sequences.

        Arguments:

        o seq_records - A list of SeqRecord objects whose sequences are
        searched for signatures.

        o signature_size - The size of each half of a signature (e.g. with
        a value of 3 a signature could be AGC-----GAC).

        o max_gap - The largest gap allowed between the two parts of a
        signature.

        Returns:
        A PatternRepository object wrapping the signature information
        gathered from the given sequences.
        """
        # the private helper does the gathering; we only wrap its result
        return PatternRepository(
            self._get_signature_dict(seq_records, signature_size, max_gap))
Esempio n. 5
0
    def find_differences(self, first_records, second_records, motif_size):
        """Find motifs in two sets of records and return the differences.

        This is used for finding motifs, but instead of just counting up all
        of the motifs in a set of records, this returns the differences
        between two listings of seq_records.

        Arguments:

        o first_records, second_records - Two listings of SeqRecord objects
        to have their motifs compared.

        o motif_size - The size of the motifs we are looking for.

        Returns:
        A PatternRepository object that has motifs, but instead of their
        raw counts, each motif is stored with its count in the first set
        of records minus its count in the second set. A motif found in
        only one set is treated as having a count of zero in the other, so
        motifs seen only in the second set get a negated count.
        """
        first_motifs = self._get_motif_dict(first_records, motif_size)
        second_motifs = self._get_motif_dict(second_records, motif_size)

        motif_diffs = {}

        # first deal with all of the motifs from the first set; the "in"
        # test is used instead of has_key() so this runs on Python 2 and 3
        for cur_key in first_motifs:
            if cur_key in second_motifs:
                motif_diffs[cur_key] = first_motifs[cur_key] - \
                                       second_motifs[cur_key]
            else:
                motif_diffs[cur_key] = first_motifs[cur_key]

        # now add the motifs that only show up in the second set; a dict
        # membership test avoids repeatedly scanning and shrinking a list
        for cur_key in second_motifs:
            if cur_key not in first_motifs:
                motif_diffs[cur_key] = 0 - second_motifs[cur_key]

        return PatternRepository(motif_diffs)