예제 #1
0
 def test_sequence_alphabet(self):
     """Check that an alphabet given to the Sequence Parser is used.

     The first test handle is parsed with a parser constructed with the
     IUPAC unambiguous DNA alphabet, and the parsed record's sequence
     must carry that same alphabet.
     """
     dna_alphabet = IUPAC.unambiguous_dna
     dna_parser = Fasta.SequenceParser(alphabet=dna_alphabet)
     record = dna_parser.parse(self.handles[0])
     assert record.seq.alphabet == dna_alphabet
예제 #2
0
 def test_sequence_iterator(self):
     """Walk the Fasta iterator and check each item is a SeqRecord.

     The iterator is built over ``self.test_handle`` with a default
     Sequence Parser.
     """
     default_parser = Fasta.SequenceParser()
     record_iterator = Fasta.Iterator(self.test_handle, default_parser)
     for record in iter(record_iterator):
         assert isinstance(record, SeqRecord.SeqRecord)
예제 #3
0
    def test_schema_representation(self):
        """Convert sequences into schema representations.
        """
        # get a set of schemas we want to code the sequence in
        schema_bank = self._load_schema_repository()
        top_schemas = schema_bank.get_top(25)
        schema_coder = Schema.SchemaCoder(top_schemas, self.schema)

        # get the sequences one at a time, and encode them
        fasta_handle = open(self.test_file, 'r')

        seq_parser = Fasta.SequenceParser(alphabet=IUPAC.unambiguous_dna)
        iterator = Fasta.Iterator(fasta_handle, seq_parser)

        while 1:
            seq_record = iterator.next()

            if seq_record is None:
                break

            schema_values = schema_coder.representation(seq_record.seq)
            if VERBOSE:
                print "Schema values:", schema_values

        fasta_handle.close()
예제 #4
0
    def setUp(self):
        """Load the test and diff records and create the schema finder.

        Sequence records from 'NeuralNetwork/enolase.fasta' are collected in
        ``self.test_records`` and those from 'NeuralNetwork/repeat.fasta' in
        ``self.diff_records``.  ``self.finder`` is a SchemaFinder looking for
        ``self.num_schemas`` schemas with a genetic algorithm finder whose
        ``min_generations`` is set to 1.
        """
        test_file = os.path.join('NeuralNetwork', 'enolase.fasta')
        diff_file = os.path.join('NeuralNetwork', 'repeat.fasta')

        self.test_records = []
        self.diff_records = []

        # load the records
        # (the loop variable is not called ``file`` so the builtin of that
        # name is not shadowed)
        for file_name, records in ((test_file, self.test_records),
                                   (diff_file, self.diff_records)):

            handle = open(file_name, 'r')
            try:
                seq_parser = Fasta.SequenceParser(
                    alphabet=IUPAC.unambiguous_dna)
                iterator = Fasta.Iterator(handle, seq_parser)
                while 1:
                    seq_record = iterator.next()

                    # a None record marks the end of the input
                    if seq_record is None:
                        break

                    records.append(seq_record)
            finally:
                # close the file even when parsing raises
                handle.close()

        self.num_schemas = 2
        schema_ga = Schema.GeneticAlgorithmFinder()
        schema_ga.min_generations = 1
        self.finder = Schema.SchemaFinder(num_schemas=self.num_schemas,
                                          schema_finder=schema_ga)
예제 #5
0
    def setUp(self):
        """Load the test and diff records and create the motif finder.

        Sequence records from 'NeuralNetwork/enolase.fasta' are collected in
        ``self.test_records`` and those from 'NeuralNetwork/repeat.fasta' in
        ``self.diff_records``.  ``self.motif_finder`` is a fresh
        Motif.MotifFinder.
        """
        test_file = os.path.join('NeuralNetwork', 'enolase.fasta')
        diff_file = os.path.join('NeuralNetwork', 'repeat.fasta')

        self.test_records = []
        self.diff_records = []

        # load the records
        # (the loop variable is not called ``file`` so the builtin of that
        # name is not shadowed)
        for file_name, records in ((test_file, self.test_records),
                                   (diff_file, self.diff_records)):

            handle = open(file_name, 'r')
            try:
                seq_parser = Fasta.SequenceParser(
                    alphabet=IUPAC.unambiguous_dna)
                iterator = Fasta.Iterator(handle, seq_parser)
                while 1:
                    seq_record = iterator.next()

                    # a None record marks the end of the input
                    if seq_record is None:
                        break

                    records.append(seq_record)
            finally:
                # close the file even when parsing raises
                handle.close()

        self.motif_finder = Motif.MotifFinder()
예제 #6
0
 def test_sequence_title_convert(self):
     """Check that a title2ids callback sets id, name and description.

     The callback ignores the title it is given and always returns the
     same three strings; the record parsed from the first test handle
     must expose them as its id, name and description.
     """
     def fixed_title2ids(title):
         # the incoming title is deliberately unused
         return "id", "name", "description"

     converting_parser = Fasta.SequenceParser(title2ids=fixed_title2ids)
     record = converting_parser.parse(self.handles[0])
     assert record.id == "id"
     assert record.name == "name"
     assert record.description == "description"
예제 #7
0
 def test_sequence_parser(self):
     """Parse every test handle with a default Sequence Parser.

     Each parsed record must be a SeqRecord holding a Seq with the
     generic alphabet.  ``self.lengths`` is indexed in step with
     ``self.handles``: item [1] is compared with the sequence length and
     item [0] with the description length.
     """
     default_parser = Fasta.SequenceParser()
     for position, handle in enumerate(self.handles):
         record = default_parser.parse(handle)
         assert isinstance(record, SeqRecord.SeqRecord)
         assert isinstance(record.seq, Seq.Seq)
         assert record.seq.alphabet == Alphabet.generic_alphabet
         assert len(record.seq) == self.lengths[position][1]
         assert len(record.description) == self.lengths[position][0]
예제 #8
0
    def _load_schema_repository(self):
        """Helper function to load a schema repository from a file.

        This also caches a schema bank, to prevent having to do this
        time consuming operation multiple times.

        Returns the schema bank built by ``self.factory.from_motifs`` from
        the motifs found in the records of ``self.test_file``.  The result
        is stored in ``self.schema_bank`` and handed back directly on later
        calls.
        """
        # if we already have a cached repository, return it
        if self.schema_bank is not None:
            return self.schema_bank

        # otherwise, we'll read in a new schema bank

        # read in all of the motif records
        motif_handle = open(self.test_file, 'r')
        try:
            seq_parser = Fasta.SequenceParser(alphabet=IUPAC.unambiguous_dna)
            iterator = Fasta.Iterator(motif_handle, seq_parser)

            seq_records = []
            while 1:
                seq_record = iterator.next()

                # a None record marks the end of the input
                if seq_record is None:
                    break

                seq_records.append(seq_record)
        finally:
            # close the file even when parsing raises
            motif_handle.close()

        # find motifs from the file
        motif_finder = Motif.MotifFinder()
        motif_size = 9

        motif_bank = motif_finder.find(seq_records, motif_size)

        schema_bank = self.factory.from_motifs(motif_bank, .1, 2)

        # cache the repository
        self.schema_bank = schema_bank

        return schema_bank
예제 #9
0
    def setUp(self):
        """Load the test records and create the signature finder.

        Sequence records from 'NeuralNetwork/enolase.fasta' are collected in
        ``self.test_records`` and ``self.sig_finder`` is a fresh
        Signature.SignatureFinder.
        """
        test_file = os.path.join('NeuralNetwork', 'enolase.fasta')

        self.test_records = []

        # load the records
        handle = open(test_file, 'r')
        try:
            seq_parser = Fasta.SequenceParser(alphabet=IUPAC.unambiguous_dna)
            iterator = Fasta.Iterator(handle, seq_parser)
            while 1:
                seq_record = iterator.next()

                # a None record marks the end of the input
                if seq_record is None:
                    break

                self.test_records.append(seq_record)
        finally:
            # close the file even when parsing raises
            handle.close()

        self.sig_finder = Signature.SignatureFinder()
예제 #10
0
# Note that the alphabet is explicitly defined for the sequences.

import os
from Bio import Fasta
from Bio.Alphabet import IUPAC


def get_accession_num(fasta_record):
    """Return the accession from a Fasta record title, without its version.

    The first whitespace-separated word of ``fasta_record.title`` is split
    on '|' and its fourth field is taken as the versioned accession, e.g.
    'AF158246.1' in 'gi|5690369|gb|AF158246.1|AF158246'.  Everything from
    the first '.' onwards is dropped, so 'AF158246.1' and 'AF158246.12' both
    give 'AF158246'; an accession without a version is returned unchanged.

    Raises IndexError when the title is empty or its first word has fewer
    than four '|'-separated fields.
    """
    title_atoms = fasta_record.title.split()
    accession_atoms = title_atoms[0].split('|')
    gb_name = accession_atoms[3]
    # strip the version info before returning; cut at the dot instead of
    # chopping a fixed two characters so multi-digit versions work too
    return gb_name.split('.')[0]


if not os.path.isdir("my_orchid_dict.idx"):
    #Build a new index
    Fasta.index_file("ls_orchid.fasta", "my_orchid_dict.idx",
                     get_accession_num)
else:
    print "Reusing existing index"

dna_parser = Fasta.SequenceParser(IUPAC.ambiguous_dna)

orchid_dict = Fasta.Dictionary("my_orchid_dict.idx", dna_parser)

for id_num in orchid_dict.keys():
    print 'id number:', id_num
    print 'description:', orchid_dict[id_num].description
    print 'sequence:', orchid_dict[id_num].seq
예제 #11
0
    id_info = all_info[0]
    rest = all_info[1:]
    descr = string.join(rest, " ")

    # now extract the ids from the id block
    # gi|5690369|gb|AF158246.1|AF158246
    id_info_items = string.split(id_info, "|")
    id = id_info_items[3]  # the id with version info
    name = id_info_items[4]  # the id without version info

    return id, name, descr

tests = [ 'lupine.nu', 'elderberry.nu', 'phlox.nu', 'centaurea.nu', \
    'wisteria.nu', 'sweetpea.nu', 'lavender.nu' ]
record_parser = Fasta.RecordParser()
sequence_parser = Fasta.SequenceParser(Alphabet.generic_dna, title_to_ids)

for test in tests:
    print "testing %s" % test
    datafile = os.path.join('Nucleic', test)
    src_handle = open(datafile)
    data = record_parser.parse(src_handle)
    print data

for test in tests:
    print "testing %s" % test
    datafile = os.path.join('Nucleic', test)
    src_handle = open(datafile)
    data = sequence_parser.parse(src_handle)
    print data.id
    print data.name