def test_sequence_alphabet(self):
    """Check that the Sequence Parser applies the alphabet it was given.

    A parser built with ``IUPAC.unambiguous_dna`` parses the first stored
    handle; the parsed sequence must report that same alphabet.
    """
    wanted_alphabet = IUPAC.unambiguous_dna
    dna_parser = Fasta.SequenceParser(alphabet=wanted_alphabet)
    record = dna_parser.parse(self.handles[0])

    assert record.seq.alphabet == wanted_alphabet
def test_sequence_iterator(self):
    """Walk a Fasta iterator that is backed by a Sequence Parser.

    Every item produced while iterating over ``self.test_handle`` must be a
    ``SeqRecord.SeqRecord`` instance.
    """
    seq_parser = Fasta.SequenceParser()
    record_stream = Fasta.Iterator(self.test_handle, seq_parser)

    for record in iter(record_stream):
        assert isinstance(record, SeqRecord.SeqRecord)
def test_schema_representation(self): """Convert sequences into schema representations. """ # get a set of schemas we want to code the sequence in schema_bank = self._load_schema_repository() top_schemas = schema_bank.get_top(25) schema_coder = Schema.SchemaCoder(top_schemas, self.schema) # get the sequences one at a time, and encode them fasta_handle = open(self.test_file, 'r') seq_parser = Fasta.SequenceParser(alphabet=IUPAC.unambiguous_dna) iterator = Fasta.Iterator(fasta_handle, seq_parser) while 1: seq_record = iterator.next() if seq_record is None: break schema_values = schema_coder.representation(seq_record.seq) if VERBOSE: print "Schema values:", schema_values fasta_handle.close()
def setUp(self):
    """Load the Fasta fixtures and create the schema finder.

    Records from ``NeuralNetwork/enolase.fasta`` are collected in
    ``self.test_records`` and records from ``NeuralNetwork/repeat.fasta`` in
    ``self.diff_records``.  A ``Schema.SchemaFinder`` configured for
    ``self.num_schemas`` schemas is stored as ``self.finder``.
    """
    test_file = os.path.join('NeuralNetwork', 'enolase.fasta')
    diff_file = os.path.join('NeuralNetwork', 'repeat.fasta')

    self.test_records = []
    self.diff_records = []

    # load the records
    # (the loop variable is not called ``file`` so the builtin stays usable)
    for file_name, records in ((test_file, self.test_records),
                               (diff_file, self.diff_records)):
        handle = open(file_name, 'r')
        try:
            seq_parser = Fasta.SequenceParser(alphabet=IUPAC.unambiguous_dna)
            iterator = Fasta.Iterator(handle, seq_parser)

            while 1:
                seq_record = iterator.next()

                # the loop treats a None record as the end of the file
                if seq_record is None:
                    break

                records.append(seq_record)
        finally:
            # release the file even if parsing raised
            handle.close()

    self.num_schemas = 2
    schema_ga = Schema.GeneticAlgorithmFinder()
    # NOTE(review): presumably lowered to keep the test fast -- confirm
    schema_ga.min_generations = 1
    self.finder = Schema.SchemaFinder(num_schemas=self.num_schemas,
                                      schema_finder=schema_ga)
def setUp(self):
    """Load the Fasta fixtures and create the motif finder.

    Records from ``NeuralNetwork/enolase.fasta`` are collected in
    ``self.test_records`` and records from ``NeuralNetwork/repeat.fasta`` in
    ``self.diff_records``.  A ``Motif.MotifFinder`` is stored as
    ``self.motif_finder``.
    """
    test_file = os.path.join('NeuralNetwork', 'enolase.fasta')
    diff_file = os.path.join('NeuralNetwork', 'repeat.fasta')

    self.test_records = []
    self.diff_records = []

    # load the records
    # (the loop variable is not called ``file`` so the builtin stays usable)
    for file_name, records in ((test_file, self.test_records),
                               (diff_file, self.diff_records)):
        handle = open(file_name, 'r')
        try:
            seq_parser = Fasta.SequenceParser(alphabet=IUPAC.unambiguous_dna)
            iterator = Fasta.Iterator(handle, seq_parser)

            while 1:
                seq_record = iterator.next()

                # the loop treats a None record as the end of the file
                if seq_record is None:
                    break

                records.append(seq_record)
        finally:
            # release the file even if parsing raised
            handle.close()

    self.motif_finder = Motif.MotifFinder()
def test_sequence_title_convert(self):
    """Check that the Sequence Parser uses a custom title converter.

    The converter ignores the title and always returns the same
    (id, name, description) triple; the parsed record must expose exactly
    those three values.
    """
    def constant_title2ids(title):
        # the title is deliberately ignored; only the hook-up is tested
        return "id", "name", "description"

    converting_parser = Fasta.SequenceParser(title2ids=constant_title2ids)
    record = converting_parser.parse(self.handles[0])

    assert record.id == "id"
    assert record.name == "name"
    assert record.description == "description"
def test_sequence_parser(self):
    """Exercise the default behaviour of the Sequence Parser.

    Each handle in ``self.handles`` is parsed with a default parser.  The
    result must be a ``SeqRecord`` wrapping a ``Seq`` with the generic
    alphabet.  ``self.lengths[i]`` supplies the expected description length
    at index 0 and the expected sequence length at index 1.
    """
    default_parser = Fasta.SequenceParser()

    for position, handle in enumerate(self.handles):
        record = default_parser.parse(handle)

        assert isinstance(record, SeqRecord.SeqRecord)
        assert isinstance(record.seq, Seq.Seq)
        assert record.seq.alphabet == Alphabet.generic_alphabet
        assert len(record.seq) == self.lengths[position][1]
        assert len(record.description) == self.lengths[position][0]
def _load_schema_repository(self):
    """Helper function to load a schema repository from a file.

    This also caches a schema bank, to prevent having to do this
    time consuming operation multiple times.

    Returns the schema bank stored in ``self.schema_bank``; it is built from
    the records in ``self.test_file`` only when no bank is cached yet.
    """
    # if we already have a cached repository, return it
    if self.schema_bank is not None:
        return self.schema_bank

    # otherwise, we'll read in a new schema bank

    # read in all of the motif records
    seq_records = []
    motif_handle = open(self.test_file, 'r')
    try:
        seq_parser = Fasta.SequenceParser(alphabet=IUPAC.unambiguous_dna)
        iterator = Fasta.Iterator(motif_handle, seq_parser)

        while 1:
            seq_record = iterator.next()

            # the loop treats a None record as the end of the file
            if seq_record is None:
                break

            seq_records.append(seq_record)
    finally:
        # release the file even if parsing raised
        motif_handle.close()

    # find motifs from the file
    motif_finder = Motif.MotifFinder()
    motif_size = 9
    motif_bank = motif_finder.find(seq_records, motif_size)

    # NOTE(review): the meaning of the .1 and 2 arguments is defined by
    # the factory's from_motifs -- confirm against its documentation
    schema_bank = self.factory.from_motifs(motif_bank, .1, 2)

    # cache the repository
    self.schema_bank = schema_bank

    return schema_bank
def setUp(self):
    """Load the Fasta fixture and create the signature finder.

    Records from ``NeuralNetwork/enolase.fasta`` are collected in
    ``self.test_records`` and a ``Signature.SignatureFinder`` is stored as
    ``self.sig_finder``.
    """
    test_file = os.path.join('NeuralNetwork', 'enolase.fasta')

    self.test_records = []

    # load the records
    handle = open(test_file, 'r')
    try:
        seq_parser = Fasta.SequenceParser(alphabet=IUPAC.unambiguous_dna)
        iterator = Fasta.Iterator(handle, seq_parser)

        while 1:
            seq_record = iterator.next()

            # the loop treats a None record as the end of the file
            if seq_record is None:
                break

            self.test_records.append(seq_record)
    finally:
        # release the file even if parsing raised
        handle.close()

    self.sig_finder = Signature.SignatureFinder()
# Note that the alphabet is explicitly defined for the sequences. import os from Bio import Fasta from Bio.Alphabet import IUPAC def get_accession_num(fasta_record): title_atoms = fasta_record.title.split() accession_atoms = title_atoms[0].split('|') gb_name = accession_atoms[3] # strip the version info before returning return gb_name[:-2] if not os.path.isdir("my_orchid_dict.idx"): #Build a new index Fasta.index_file("ls_orchid.fasta", "my_orchid_dict.idx", get_accession_num) else: print "Reusing existing index" dna_parser = Fasta.SequenceParser(IUPAC.ambiguous_dna) orchid_dict = Fasta.Dictionary("my_orchid_dict.idx", dna_parser) for id_num in orchid_dict.keys(): print 'id number:', id_num print 'description:', orchid_dict[id_num].description print 'sequence:', orchid_dict[id_num].seq
id_info = all_info[0] rest = all_info[1:] descr = string.join(rest, " ") # now extract the ids from the id block # gi|5690369|gb|AF158246.1|AF158246 id_info_items = string.split(id_info, "|") id = id_info_items[3] # the id with version info name = id_info_items[4] # the id without version info return id, name, descr tests = [ 'lupine.nu', 'elderberry.nu', 'phlox.nu', 'centaurea.nu', \ 'wisteria.nu', 'sweetpea.nu', 'lavender.nu' ] record_parser = Fasta.RecordParser() sequence_parser = Fasta.SequenceParser(Alphabet.generic_dna, title_to_ids) for test in tests: print "testing %s" % test datafile = os.path.join('Nucleic', test) src_handle = open(datafile) data = record_parser.parse(src_handle) print data for test in tests: print "testing %s" % test datafile = os.path.join('Nucleic', test) src_handle = open(datafile) data = sequence_parser.parse(src_handle) print data.id print data.name