def test_ungap(self):
    """Check ``ungap("-")`` on ``UnknownSeq`` objects over a gapped DNA alphabet.

    Two cases are covered:

    * a 7-letter unknown sequence with the default character, expected to
      come back as ``"NNNNNNN"``;
    * a 20-letter unknown sequence made entirely of the gap character,
      expected to come back empty.

    Both results are converted with ``str()`` before comparison so the
    assertions compare plain strings rather than relying on how the
    sequence object implements equality against ``str``.
    """
    seq = Seq.UnknownSeq(
        7, alphabet=Alphabet.Gapped(Alphabet.DNAAlphabet(), "-"))
    self.assertEqual("NNNNNNN", str(seq.ungap("-")))

    # Every letter is the gap character, so nothing should remain.
    seq = Seq.UnknownSeq(
        20,
        alphabet=Alphabet.Gapped(Alphabet.DNAAlphabet(), "-"),
        character='-')
    self.assertEqual("", str(seq.ungap("-")))
def test_stops(self):
    """Translate ``self.misc_stops`` under several codon tables.

    The same nucleotide data is supplied as a plain string and as ``Seq``
    objects with different alphabets; each form must translate to the
    same expected protein string for every table listed below.
    """
    stops = self.misc_stops
    inputs = [
        stops,
        Seq.Seq(stops),
        Seq.Seq(stops, Alphabet.generic_nucleotide),
        Seq.Seq(stops, Alphabet.DNAAlphabet()),
        Seq.Seq(stops, IUPAC.unambiguous_dna),
    ]

    # (expected translation, codon table id or name), checked in this order.
    expectations = (
        ("***RR", 1),
        ("***RR", "SGC0"),
        ("**W**", 2),
        ("**WRR", "Yeast Mitochondrial"),
        ("**WSS", 5),
        ("**WSS", 9),
        ("**CRR", "Euplotid Nuclear"),
        ("***RR", 11),
        ("***RR", "Bacterial"),
    )

    for nucleotide_seq in inputs:
        # No explicit table: uses whatever Seq.translate defaults to.
        self.assertEqual("***RR", str(Seq.translate(nucleotide_seq)))
        for expected, table in expectations:
            translated = Seq.translate(nucleotide_seq, table=table)
            self.assertEqual(expected, str(translated))
def make_seq(seq_data, seq_name):
    """Write a sequence and its metadata into a directory named ``seq_name``.

    A DNA ``SeqRecord`` is built from ``seq_data`` and four files are
    written into the directory ``seq_name`` (created when missing):
    ``README.md``, ``.sequence.json``, ``sequence.fasta`` and
    ``sequence.gb``.

    Args:
        seq_data: Mapping with a required ``'Sequence'`` entry. Optional
            entries are ``'Annotations'`` (an iterable of mappings, each
            providing ``'start'``, ``'end'``, ``'strand'``, ``'type'``,
            ``'label'`` and ``'note'``) and ``'ReadMe'`` (text written to
            ``README.md``; defaults to an empty string).
        seq_name: Used as the record id, the record name and the output
            directory path.

    Raises:
        KeyError: If ``'Sequence'`` or any of the annotation keys listed
            above is missing.
    """
    seq = SeqRecord(
        id=seq_name,
        name=seq_name,
        seq=Seq(seq_data['Sequence'], alphabet=Alphabet.DNAAlphabet()))

    # Build the qualifiers from an ordered list of pairs: passing a dict
    # literal to OrderedDict only preserves order on interpreters whose
    # plain dicts are already ordered.
    seq.features = [
        SeqFeature(
            FeatureLocation(a['start'], a['end'], a['strand']),
            type=a['type'],
            qualifiers=OrderedDict([
                ('label', a['label']),
                ('note', a['note']),
            ]))
        for a in seq_data.get('Annotations', [])
    ]

    if not os.path.exists(seq_name):
        os.makedirs(seq_name)

    with open(os.path.join(seq_name, 'README.md'), 'w') as f:
        f.write(seq_data.get('ReadMe', ''))

    # Keep the raw input next to the generated files.
    with open(os.path.join(seq_name, '.sequence.json'), 'w') as f:
        f.write(json.dumps(seq_data))

    SeqIO.write(seq, os.path.join(seq_name, 'sequence.fasta'), "fasta")
    SeqIO.write(seq, os.path.join(seq_name, 'sequence.gb'), "genbank")
def extract(self, start_pos, end_pos, make_file=False):
    """Return the ``[start_pos, end_pos)`` slice of ``self.gb`` as a new record.

    The new record holds the sliced sequence as DNA plus every feature of
    ``self.gb`` whose ``location.start``/``location.end`` span shares at
    least one position with ``[start_pos, end_pos)``. Features are
    appended as-is (the same objects, with their original coordinates).

    Args:
        start_pos: Inclusive start of the region.
        end_pos: Exclusive end of the region.
        make_file: When true, also write the new record in GenBank format
            to a file named ``<record id>_<start_pos>_<end_pos>`` in the
            current working directory.

    Returns:
        The newly built ``SeqRecord``.
    """
    partial_gb = SeqRecord(
        Seq(str(self.gb.seq[start_pos:end_pos]), Alphabet.DNAAlphabet()))

    for afeat in self.gb.features:
        # Two half-open intervals overlap exactly when the larger start is
        # below the smaller end; this avoids materialising a set of every
        # position for each feature.
        overlap_start = max(afeat.location.start, start_pos)
        overlap_end = min(afeat.location.end, end_pos)
        if overlap_start < overlap_end:
            partial_gb.features.append(afeat)

    if make_file:
        file_name = partial_gb.id + "_" + str(start_pos) + "_" + str(end_pos)
        # The context manager flushes and closes the handle; previously it
        # was left open.
        with open(file_name, "w") as record_handle:
            SeqIO.write(partial_gb, record_handle, "genbank")

    return partial_gb
except ValueError: pass if not isinstance(s, Seq.Seq): continue # Only Seq has this method try: print s.translate() assert False, "Translation shouldn't work on a protein!" except ValueError: pass misc_stops = "TAATAGTGAAGAAGG" for nucleotide_seq in [ misc_stops, Seq.Seq(misc_stops), Seq.Seq(misc_stops, Alphabet.generic_nucleotide), Seq.Seq(misc_stops, Alphabet.DNAAlphabet()), Seq.Seq(misc_stops, IUPAC.unambiguous_dna) ]: assert "***RR" == str(Seq.translate(nucleotide_seq)) assert "***RR" == str(Seq.translate(nucleotide_seq, table=1)) assert "***RR" == str(Seq.translate(nucleotide_seq, table="SGC0")) assert "**W**" == str(Seq.translate(nucleotide_seq, table=2)) assert "**WRR" == str( Seq.translate(nucleotide_seq, table='Yeast Mitochondrial')) assert "**WSS" == str(Seq.translate(nucleotide_seq, table=5)) assert "**WSS" == str(Seq.translate(nucleotide_seq, table=9)) assert "**CRR" == str( Seq.translate(nucleotide_seq, table='Euplotid Nuclear')) assert "***RR" == str(Seq.translate(nucleotide_seq, table=11)) assert "***RR" == str(Seq.translate(nucleotide_seq, table='Bacterial')) del misc_stops
def gen_components(self):
    """ Build the chromosomes, transcription units and genes of the knowledge base

    For each chromosome, gene and intergenic lengths are drawn with
    ``self.rand``, a random DNA sequence of the combined length is
    generated, and one transcription unit plus one mRNA gene is created
    per drawn gene on ``self.knowledge_base.cell``.

    NOTE(review): ``random.choice`` is called with ``p=`` and ``size=``,
    so ``random`` is presumably ``numpy.random`` -- confirm against the
    file's imports.
    """
    # read the generator options
    opts = self.options
    n_chromosomes = opts.get('num_chromosomes')
    topology = opts.get('chromosome_topology')
    gc_frac = opts.get('mean_gc_frac')
    avg_num_genes = opts.get('mean_num_genes')
    avg_gene_len = opts.get('mean_gene_len')
    coding_frac = opts.get('mean_coding_frac')

    # build each chromosome together with its genes
    kb_cell = self.knowledge_base.cell
    for chr_idx in range(n_chromosomes):
        chr_num = chr_idx + 1  # 1-based number used in ids and names

        n_genes = self.rand(avg_num_genes / n_chromosomes)[0]
        gene_lengths = self.rand(avg_gene_len, count=n_genes)
        avg_intergene_len = avg_gene_len / coding_frac * (1 - coding_frac)
        intergene_lengths = self.rand(avg_intergene_len, count=n_genes)
        total_len = numpy.sum(gene_lengths) + numpy.sum(intergene_lengths)

        # A and T share the non-GC probability mass; C and G share the GC mass
        at_prob = (1 - gc_frac) / 2
        gc_prob = gc_frac / 2
        bases = random.choice(
            ('A', 'C', 'G', 'T'),
            p=(at_prob, gc_prob, gc_prob, at_prob),
            size=(total_len, ))
        dna = Seq.Seq(''.join(bases), Alphabet.DNAAlphabet())

        chromosome = kb_cell.species_types.get_or_create(
            id='chr_{}'.format(chr_num),
            __type=wc_kb.core.DnaSpeciesType)
        chromosome.name = 'Chromosome {}'.format(chr_num)
        chromosome.circular = topology == 'circular'
        chromosome.double_stranded = True
        chromosome.seq = dna

        # Each gene start is the running sum of the preceding gene lengths
        # and the spacers before each gene; the first spacer is half of the
        # first intergenic length (rounded).
        preceding_gene_lens = numpy.concatenate(([0], gene_lengths[0:-1]))
        preceding_spacers = numpy.concatenate(
            (numpy.round(intergene_lengths[0:1] / 2), intergene_lengths[1:]))
        gene_starts = numpy.int64(
            numpy.cumsum(preceding_gene_lens + preceding_spacers))

        for gene_idx in range(n_genes):
            gene_num = gene_idx + 1
            locus_start = gene_starts[gene_idx]
            locus_end = gene_starts[gene_idx] + gene_lengths[gene_idx] - 1

            tu = kb_cell.loci.get_or_create(
                id='tu_{}_{}'.format(chr_num, gene_num),
                __type=wc_kb.prokaryote.TranscriptionUnitLocus)
            tu.polymer = chromosome
            tu.name = 'Transcription unit {}-{}'.format(chr_num, gene_num)
            tu.start = locus_start
            tu.end = locus_end
            tu.strand = random.choice((wc_kb.core.PolymerStrand.positive,
                                       wc_kb.core.PolymerStrand.negative))

            # the gene spans the same coordinates and strand as its
            # transcription unit
            gene = kb_cell.loci.get_or_create(
                id='gene_{}_{}'.format(chr_num, gene_num),
                __type=wc_kb.prokaryote.GeneLocus)
            gene.polymer = chromosome
            gene.transcription_units.append(tu)
            gene.name = 'Gene {}-{}'.format(chr_num, gene_num)
            gene.start = locus_start
            gene.end = locus_end
            gene.type = wc_kb.core.GeneType.mRna
            gene.strand = tu.strand
def translate(seq):
    """Translate a nucleotide string into a protein string.

    Gap characters (``'-'``) are replaced with ``'N'`` before translation.

    Args:
        seq: Nucleotide sequence as a plain string.

    Returns:
        The translation as a plain string.
    """
    ungapped = seq.replace('-', 'N')
    # str() replaces the deprecated (and later removed) Seq.tostring();
    # both yield the sequence as a plain string.
    return str(Seq(ungapped, Alphabet.DNAAlphabet()).translate())