def get_gene_distance(seq_1, seq_2):
    """
    Returns hamming distance between two DNA sequences

    Both sequences are upper-cased and locally aligned with the Striped
    Smith-Waterman algorithm; the distance is then taken between the
    aligned query and the aligned target.
    """
    aligner = StripedSmithWaterman(seq_1.upper())
    result = aligner(seq_2.upper())
    aligned_query = DNA(result.aligned_query_sequence)
    aligned_target = DNA(result.aligned_target_sequence)
    return aligned_query.distance(aligned_target)
def test_transcribe_preserves_all_metadata(self):
    """Transcribing DNA keeps metadata, positional and interval metadata."""
    intervals = IntervalMetadata(4)
    intervals.add([(0, 2)], metadata={'gene': 'p53'})

    dna = DNA('AGTT',
              metadata={'foo': 'bar'},
              positional_metadata={'foo': range(4)},
              interval_metadata=intervals)
    expected_rna = RNA('AGUU',
                       metadata={'foo': 'bar'},
                       positional_metadata={'foo': range(4)},
                       interval_metadata=intervals)

    self.assertEqual(dna.transcribe(), expected_rna)
def _process_roi(roi, samdata, amplicon_ref, reverse_comp=False):
    """Summarise the reads covering one region of interest (ROI).

    Parameters
    ----------
    roi
        Object providing ``position_range`` (a ``"<start>-<end>"`` string),
        ``aa_sequence`` and ``nt_sequence``.
    samdata
        Alignment file object; its ``fetch(reference, start, end)`` method
        is used to iterate over the reads.
    amplicon_ref
        Name of the reference passed to ``samdata.fetch``.
    reverse_comp : bool
        Reverse-complement each read slice before translating it.

    Returns
    -------
    dict
        Always contains ``'region'``. If the range cannot be parsed, only
        that key is present. If no read yields a translation, ``'flag'`` is
        set to ``"region not found"``. Otherwise the consensus sequences,
        the reference, the number of changes, both distributions and the
        depth are included.
    """
    roi_dict = {'region': roi.position_range}
    # Raw string plus ``+`` quantifiers: an empty number such as in "-" or
    # "5-" must count as "no range" rather than crash in int('').
    range_match = re.search(r'(\d+)-(\d+)', roi.position_range)
    if not range_match:
        return roi_dict
    start = int(range_match.group(1)) - 1  # 1-based start -> 0-based
    end = int(range_match.group(2))

    aa_sequence_counter = Counter()
    nt_sequence_counter = Counter()
    depth = 0
    for read in samdata.fetch(amplicon_ref, start, end):
        rstart = read.reference_start
        # Only reads that begin at or before the region start are used.
        if rstart > start:
            continue
        nt_sequence = DNA(
            read.query_alignment_sequence[start - rstart:end - rstart])
        if reverse_comp:
            nt_sequence = nt_sequence.reverse_complement()
        # scikit-bio doesn't support translating degenerate bases currently,
        # so we will just throw out reads with degenerates for now
        if nt_sequence.has_degenerates():
            continue
        aa_string = str(nt_sequence.translate()).replace('*', 'x')
        if aa_string:
            nt_sequence_counter.update([str(nt_sequence)])
            aa_sequence_counter.update([aa_string])
            depth += 1

    if not aa_sequence_counter:
        roi_dict['flag'] = "region not found"
        return roi_dict

    aa_consensus = aa_sequence_counter.most_common(1)[0][0]
    nt_consensus = nt_sequence_counter.most_common(1)[0][0]

    # Compare at the nucleotide level when the ROI defines a nucleotide
    # reference, otherwise at the amino-acid level.
    if roi.nt_sequence:
        reference, consensus = roi.nt_sequence, nt_consensus
    else:
        reference, consensus = roi.aa_sequence, aa_consensus

    # A reference position missing from a shorter consensus is a change.
    num_changes = sum(
        1 for i, ref_char in enumerate(reference)
        if len(consensus) <= i or ref_char != consensus[i])

    roi_dict['most_common_aa_sequence'] = aa_consensus
    roi_dict['most_common_nt_sequence'] = nt_consensus
    roi_dict['reference'] = reference
    roi_dict['changes'] = str(num_changes)
    roi_dict['aa_sequence_distribution'] = aa_sequence_counter
    roi_dict['nt_sequence_distribution'] = nt_sequence_counter
    roi_dict['depth'] = str(depth)
    return roi_dict
def test_distances(self):
    """distances functions as expected"""
    collection = SequenceCollection([DNA("ACGT", "d1"), DNA("ACGG", "d2")])

    # hamming metric: the sequences differ at one of four positions
    hamming_expected = DistanceMatrix([[0, 0.25], [0.25, 0]], ['d1', 'd2'])
    hamming_actual = collection.distances(hamming)
    self.assertEqual(hamming_actual, hamming_expected)

    # alt distance function provided
    def constant_distance(s1, s2):
        return 42.

    constant_expected = DistanceMatrix([[0, 42.], [42., 0]], ['d1', 'd2'])
    constant_actual = collection.distances(constant_distance)
    self.assertEqual(constant_actual, constant_expected)
def reformat_egid(genbank_fp, output_dir):
    """ Reformat input genome to the formats accepted by EGID.

    Parameters
    ----------
    genbank_fp: string
        file path to genome in GenBank format
    output_dir: string
        output directory path

    Notes
    -----
    Input to EGID are five obsolete NCBI standard files: gbk, fna, faa, ffn
    and ptt. All five are written into `output_dir` with the base name "id".
    """
    (gb, genes) = _merge_genbank_seqs(genbank_fp)
    DNA.write(gb, join(output_dir, 'id.fna'), format='fasta')
    DNA.write(gb, join(output_dir, 'id.gbk'), format='genbank')
    nucl_seq = str(gb)
    # a ptt file contains the following columns:
    fields = ('Location', 'Strand', 'Length', 'PID', 'Gene', 'Synonym',
              'Code', 'COG', 'Product')
    # The context manager closes all three files even if a write fails.
    with open(join(output_dir, 'id.faa'), 'w') as faa_f, \
            open(join(output_dir, 'id.ffn'), 'w') as ffn_f, \
            open(join(output_dir, 'id.ptt'), 'w') as ptt_f:
        ptt_f.write('locus001\n' + str(len(genes)) + ' proteins\n')
        ptt_f.write('\t'.join(fields) + '\n')
        # genes are visited by start coordinate; gid is an incremental
        # integer assigned to the current gene
        ordered = sorted(genes.items(), key=lambda x: x[1][1])
        for gid, (gene, info) in enumerate(ordered, start=1):
            translation, start, end, strand = (info[0], info[1], info[2],
                                               info[3])
            faa_f.write('>' + gene + '\n' + translation + '\n')
            ptt_f.write(str(start) + '..' + str(end) + '\t' + strand + '\t' +
                        str(len(translation)) + '\t' + str(gid) +
                        '\t-\tgene' + str(gid) + '\t-\t-\t-\n')
            gene_seq = nucl_seq[start - 1:end]
            if strand == '+':  # positive strand
                ffn_f.write('>locus001:' + str(start) + '-' + str(end) +
                            '\n' + gene_seq + '\n')
            else:  # negative strand (reverse complement)
                rc_seq = str(DNA(gene_seq).reverse_complement())
                ffn_f.write('>locus001:c' + str(end) + '-' + str(start) +
                            '\n' + rc_seq + '\n')
def test_embl_to_dna(self):
    """The second EMBL record is read back as the expected DNA object."""
    index = 1
    record = self.multi[index]
    # seq_num is 1-based, hence the offset
    obs = _embl_to_dna(self.multi_fp, seq_num=index + 1)
    exp = DNA(record[0],
              metadata=record[1],
              lowercase=True,
              interval_metadata=record[2])
    self.assertEqual(exp, obs)
def reformat_genemark(genbank_fp, output_dir):
    """ Reformat input genome to the formats accepted by GeneMark.

    Parameters
    ----------
    genbank_fp: string
        file path to genome in GenBank format
    output_dir: string
        output directory path

    Notes
    -----
    GeneMark's acceptable input file format is FASTA (genome sequence).
    The merged genome is written as both "id.fna" (FASTA) and "id.gbk"
    (GenBank) in `output_dir`.
    """
    merged = _merge_genbank_seqs(genbank_fp)[0]
    for extension, fmt in (('fna', 'fasta'), ('gbk', 'genbank')):
        DNA.write(merged, join(output_dir, 'id.' + extension), format=fmt)
def test_reverse_transcribe_preserves_all_metadata(self):
    """Reverse transcription keeps metadata and positional metadata."""
    rna = RNA('AGUU',
              metadata={'foo': 'bar'},
              positional_metadata={'foo': range(4)})
    expected_dna = DNA('AGTT',
                       metadata={'foo': 'bar'},
                       positional_metadata={'foo': range(4)})
    self.assertEqual(rna.reverse_transcribe(), expected_dna)
def generateReference(assay_list):
    """Yield one ``DNA`` reference sequence per amplicon of every assay.

    The sequence id (stored under ``metadata['id']``) is the assay name,
    followed by ``_<gene_name>`` for targets of an AND assay and by
    ``_<variant_name>`` for the amplicon, each only when it is set.

    Parameters
    ----------
    assay_list : iterable
        Assays providing ``name``, ``AND`` and ``target``.

    Yields
    ------
    skbio.DNA
        One sequence per amplicon.

    Notes
    -----
    Every id is built from the assay name alone, so suffixes of earlier
    operands or amplicons never leak into later ids, and an amplicon
    without a variant name does not inherit the previous amplicon's id.
    Both branches pass the id through ``metadata``.
    """
    from skbio import DNA

    def _with_suffix(base, suffix):
        # Append "_<suffix>" only when a suffix is actually set.
        return base + "_%s" % suffix if suffix else base

    for assay in assay_list:
        if assay.AND:
            for operand in assay.AND:
                if not isinstance(operand, Target):
                    continue
                target_name = _with_suffix(assay.name, operand.gene_name)
                for amplicon in operand.amplicons:
                    name = _with_suffix(target_name, amplicon.variant_name)
                    yield DNA(amplicon.sequence, metadata={'id': name})
        else:
            for amplicon in assay.target.amplicons:
                name = _with_suffix(assay.name, amplicon.variant_name)
                yield DNA(amplicon.sequence, metadata={'id': name})
def test_translate_ncbi_table_id(self):
    """translate() accepts an NCBI table id and defaults to the standard."""
    sequences = (RNA('AAAUUUAUGCAU'), DNA('AAATTTATGCAT'))
    for seq in sequences:
        # default
        default_protein = seq.translate()
        self.assertEqual(default_protein, Protein('KFMH'))

        # table 9 translates the first codon differently
        table9_protein = seq.translate(9)
        self.assertEqual(table9_protein, Protein('NFMH'))
def _find_approx_reverse(args):
    """ Finds an approximate match for a reverse primer

    `args` is a two-item sequence ``(sequence, primer)``; the primer is
    wrapped in ``DNA`` and locally aligned against `sequence`. The value
    returned is element ``[2][0][0]`` of the ``_local_aln`` result.
    """
    sequence, primer_text = args
    alignment = _local_aln(sequence, DNA(primer_text))
    positions = alignment[2]
    return positions[0][0]
def _find_approx_forward(args):
    """ Finds an approximate match for a forward primer

    `args` is a two-item sequence ``(sequence, primer)``; the primer is
    wrapped in ``DNA`` and locally aligned against `sequence`. The value
    returned is element ``[2][0][1]`` of the ``_local_aln`` result plus
    one.
    """
    sequence, primer_text = args
    alignment = _local_aln(sequence, DNA(primer_text))
    positions = alignment[2]
    return positions[0][1] + 1
def gene_distance(A,B):
    '''compute sequence distance between two genes A and B

    Columns that are gaps in both sequences are dropped, and every
    degenerate character is replaced by a randomly chosen member of its
    degenerate map, before the distance between the two cleaned sequences
    is taken.
    '''
    kept_a, kept_b = [], []  # new sequences removing common gaps
    for char_a, char_b in izip(A.values, B.values):
        if (char_a in A.gap_chars) and (char_b in B.gap_chars):
            continue
        if char_a in A.degenerate_chars:
            char_a = random_choice(list(A.degenerate_map[char_a]))
        kept_a.append(char_a)
        if char_b in B.degenerate_chars:
            char_b = random_choice(list(B.degenerate_map[char_b]))
        kept_b.append(char_b)
    newA = DNA(''.join(kept_a), metadata={})
    newB = DNA(''.join(kept_b), metadata={})
    return newA.distance(newB)
def test_constructor_non_empty_no_keys(self):
    """A non-empty MSA built without keys reports dtype/shape, not keys."""
    cases = (
        ([DNA('ACG')], (1, 3)),                      # 1x3
        ([DNA('A'), DNA('C'), DNA('G')], (3, 1)),    # 3x1
    )
    for seqs, expected_shape in cases:
        msa = TabularMSA(seqs)
        self.assertIs(msa.dtype, DNA)
        self.assertEqual(msa.shape, expected_shape)
        with self.assertRaises(OperationError):
            msa.keys
        self.assertEqual(list(msa), seqs)
def test_constructor_empty_no_keys(self):
    """Empty MSAs (no sequences, or no positions) are built without keys."""
    # sequence empty
    no_seqs = TabularMSA([])
    self.assertIsNone(no_seqs.dtype)
    self.assertEqual(no_seqs.shape, (0, 0))
    with self.assertRaises(OperationError):
        no_seqs.keys
    with self.assertRaises(StopIteration):
        next(iter(no_seqs))

    # position empty
    empty_seqs = [DNA(''), DNA('')]
    no_positions = TabularMSA(empty_seqs)
    self.assertIs(no_positions.dtype, DNA)
    self.assertEqual(no_positions.shape, (2, 0))
    with self.assertRaises(OperationError):
        no_positions.keys
    self.assertEqual(list(no_positions), empty_seqs)
def setUp(self):
    """Create the DNA/RNA sequences and collections shared by the tests."""
    self.d1, self.d2, self.d3 = [
        DNA(text, metadata={'id': seq_id})
        for text, seq_id in (('GATTACA', "d1"),
                             ('TTG', "d2"),
                             ('GTATACA', "d3"))
    ]
    self.r1, self.r2, self.r3 = [
        RNA(text, metadata={'id': seq_id})
        for text, seq_id in (('GAUUACA', "r1"),
                             ('UUG', "r2"),
                             ('U-----UGCC--', "r3"))
    ]

    self.seqs1 = [self.d1, self.d2]
    self.seqs2 = [self.r1, self.r2, self.r3]
    self.seqs3 = self.seqs1 + self.seqs2
    self.seqs4 = [self.d1, self.d3]

    self.s1 = SequenceCollection(self.seqs1)
    self.s2 = SequenceCollection(self.seqs2)
    self.s3 = SequenceCollection(self.seqs3)
    self.s4 = SequenceCollection(self.seqs4)
    self.empty = SequenceCollection([])
def test_genbank_to_dna(self):
    """The second GenBank record is read back as the expected DNA object."""
    index = 1
    record = self.multi[index]
    # seq_num is 1-based, hence the offset
    obs = _genbank_to_dna(self.multi_fp, seq_num=index + 1)
    exp = DNA(record[0],
              metadata=record[1],
              lowercase=True,
              positional_metadata=record[2])
    self.assertEqual(exp, obs)
def setUp(self):
    """Build the sequence, k-mer and read Series used by the tests."""
    labelled = (('A', 'AGTC'),
                ('B', 'ARWS'),
                ('C', 'CTWK'),
                ('D', 'GTCM'),
                ('E', 'ATGN'))
    self.seq_array = pd.Series(
        data=[DNA(text, metadata={'id': label}) for label, text in labelled],
        index=[label for label, _ in labelled]
    )
    self.seq_array.index = self.seq_array.index.astype(str)

    mers = np.array(['AGTCCATGC', 'TACGAGTGA', 'ACTCCATGC', 'AAAAAAAGT'])
    self.in_mer = pd.Series(data=mers)

    reads = np.array(['AGTC', 'WGWN', 'AGTT'])
    self.reads2 = pd.Series(
        data=reads,
        index=['r2.0', 'r2.1', 'r2.2'],
    )
def reformat_egid(genbank_fp, output_dir):
    """ Reformat input genome to the formats accepted by EGID.

    Parameters
    ----------
    genbank_fp: string
        file path to genome in GenBank format
    output_dir: string
        output directory path

    Notes
    -----
    Input to EGID are five obsolete NCBI standard files: gbk, fna, faa, ffn
    and ptt. All five are written into `output_dir` with the base name "id".
    """
    (gb, genes) = _merge_genbank_seqs(genbank_fp)
    DNA.write(gb, join(output_dir, 'id.fna'), format='fasta')
    DNA.write(gb, join(output_dir, 'id.gbk'), format='genbank')
    nucl_seq = str(gb)
    # a ptt file contains the following columns:
    fields = ('Location', 'Strand', 'Length', 'PID', 'Gene', 'Synonym',
              'Code', 'COG', 'Product')
    # The context manager closes all three files even if a write fails.
    with open(join(output_dir, 'id.faa'), 'w') as faa_f, \
            open(join(output_dir, 'id.ffn'), 'w') as ffn_f, \
            open(join(output_dir, 'id.ptt'), 'w') as ptt_f:
        ptt_f.write('locus001\n' + str(len(genes)) + ' proteins\n')
        ptt_f.write('\t'.join(fields) + '\n')
        # genes are visited by start coordinate; gid is an incremental
        # integer assigned to the current gene
        ordered = sorted(genes.items(), key=lambda x: x[1][1])
        for gid, (gene, info) in enumerate(ordered, start=1):
            translation, start, end, strand = (info[0], info[1], info[2],
                                               info[3])
            faa_f.write('>' + gene + '\n' + translation + '\n')
            ptt_f.write(str(start) + '..' + str(end) + '\t' + strand + '\t' +
                        str(len(translation)) + '\t' + str(gid) +
                        '\t-\tgene' + str(gid) + '\t-\t-\t-\n')
            gene_seq = nucl_seq[start - 1:end]
            if strand == '+':  # positive strand
                ffn_f.write('>locus001:' + str(start) + '-' + str(end) +
                            '\n' + gene_seq + '\n')
            else:  # negative strand (reverse complement)
                rc_seq = str(DNA(gene_seq).reverse_complement())
                ffn_f.write('>locus001:c' + str(end) + '-' + str(start) +
                            '\n' + rc_seq + '\n')
def test_stockholm_runon_gs(self):
    """Run-on GS lines parse the same with or without extra whitespace."""
    exp = TabularMSA(
        [DNA('ATCGTTCAGTG',
             metadata={'LN': 'This is a runon GS line.'})],
        index=['seq1'])
    fixtures = ('stockholm_runon_gs_no_whitespace',
                'stockholm_runon_gs_with_whitespace')
    for fixture in fixtures:
        msa = _stockholm_to_tabular_msa(get_data_path(fixture),
                                        constructor=DNA)
        self.assertEqual(msa, exp)
def test_msa_to_stockholm_minimal(self):
    """A one-sequence MSA is written exactly like the reference file."""
    msa = TabularMSA([DNA('TGTGTCGCAGTTGTCGTTTG')], index=['0235244'])

    with io.StringIO() as buffer:
        _tabular_msa_to_stockholm(msa, buffer)
        obs = buffer.getvalue()

    with io.open(get_data_path('stockholm_minimal')) as reference:
        exp = reference.read()

    self.assertEqual(obs, exp)
def test_sort_on_key_with_some_repeats(self):
    """Sorting on a metadata key with repeated values reorders the keys."""
    unsorted_rows = [('TCCG', 10), ('TAGG', 10), ('GGGG', 8), ('ACGT', 0),
                     ('TAGG', 10)]
    sorted_rows = [('ACGT', 0), ('GGGG', 8), ('TCCG', 10), ('TAGG', 10),
                   ('TAGG', 10)]

    msa = TabularMSA(
        [DNA(text, metadata={'id': value}) for text, value in unsorted_rows],
        keys=range(5))
    msa.sort(key='id')

    expected = TabularMSA(
        [DNA(text, metadata={'id': value}) for text, value in sorted_rows],
        keys=[3, 2, 0, 1, 4])
    self.assertEqual(msa, expected)
def test_translate_six_frames_ncbi_table_id(self):
    """translate_six_frames() accepts an NCBI table id."""
    default_frames = [Protein('KL'), Protein('N'), Protein('I'),
                      Protein('QF'), Protein('N'), Protein('I')]
    table9_frames = [Protein('NL'), Protein('N'), Protein('I'),
                     Protein('QF'), Protein('N'), Protein('I')]

    # rc = CAAUUU
    for seq in RNA('AAAUUG'), DNA('AAATTG'):
        # default
        self.assertEqual(list(seq.translate_six_frames()), default_frames)

        # table 9 only changes the first frame
        self.assertEqual(list(seq.translate_six_frames(9)), table9_frames)
def test_omit_gap_sequences(self):
    """omit_gap_sequences drops sequences above the allowed gap fraction."""
    for threshold in (1.0, 0.20):
        self.assertEqual(self.a2.omit_gap_sequences(threshold), self.a2)

    only_r2 = Alignment([self.r2])
    self.assertEqual(self.a2.omit_gap_sequences(0.19), only_r2)

    for threshold in (0.0, 0.2, 1.0):
        self.assertEqual(self.empty.omit_gap_sequences(threshold),
                         self.empty)

    # Test to ensure floating point precision bug isn't present. See the
    # tests for Alignment.position_frequencies for more details.
    all_gaps = Alignment([
        DNA('.' * 33, metadata={'id': 'abc'}),
        DNA('-' * 33, metadata={'id': 'def'})
    ])
    just_below_one = 1 - np.finfo(float).eps
    self.assertEqual(all_gaps.omit_gap_sequences(just_below_one),
                     Alignment([]))
def test_translate_preserves_metadata(self):
    """translate() keeps metadata but drops positional metadata."""
    metadata = {'foo': 'bar', 'baz': 42}
    positional_metadata = {'foo': range(3)}

    rna = RNA('AUG', metadata=metadata,
              positional_metadata=positional_metadata)
    dna = DNA('ATG', metadata=metadata,
              positional_metadata=positional_metadata)

    for seq in (rna, dna):
        protein = seq.translate()
        # metadata retained, positional metadata dropped
        self.assertEqual(
            protein,
            Protein('M', metadata={'foo': 'bar', 'baz': 42}))
def test_reverse_transcribe_preserves_all_metadata(self):
    """Reverse transcription keeps metadata, positional and interval
    metadata."""
    intervals = IntervalMetadata(4)
    intervals.add([(0, 2)], metadata={'gene': 'p53'})

    rna = RNA('AGUU',
              metadata={'foo': 'bar'},
              positional_metadata={'foo': range(4)},
              interval_metadata=intervals)
    expected_dna = DNA('AGTT',
                       metadata={'foo': 'bar'},
                       positional_metadata={'foo': range(4)},
                       interval_metadata=intervals)

    self.assertEqual(rna.reverse_transcribe(), expected_dna)
def remove_gapped_columns(msa, site_threshold=0.95):
    """Return a new ``TabularMSA`` without the heavily gapped positions.

    The alignment is laid out as a DataFrame with one column per sequence
    and one row per position. ``gap_dectector`` is applied to each row, and
    a row is kept only when its result is below
    ``number_of_sequences * site_threshold``.

    Parameters
    ----------
    msa
        Alignment providing ``to_dict()``.
    site_threshold : float
        Fraction of the sequence count used as the cut-off.
    """
    # presumably gap_dectector returns the number of gaps in the row -
    # confirm against its definition
    frame = pd.DataFrame(msa.to_dict())
    row_scores = frame.apply(gap_dectector, axis=1)
    cutoff = len(frame.columns) * site_threshold
    kept_rows = frame[row_scores < cutoff]

    sequences = []
    for seq_id in kept_rows:
        residues = kept_rows[seq_id].str.decode("utf-8").str.cat()
        sequences.append(DNA(residues, metadata={"id": seq_id}))
    return TabularMSA(sequences)
def generateReference(assay_list):
    """Build a ``SequenceCollection`` with one ``DNA`` per amplicon.

    The sequence id (stored under ``metadata['id']``) is the assay name,
    followed by ``_<gene_name>`` for targets of an AND assay and by
    ``_<variant_name>`` for the amplicon, each only when it is set.

    Parameters
    ----------
    assay_list : iterable
        Assays providing ``name``, ``AND`` and ``target``.

    Returns
    -------
    skbio.SequenceCollection
        The reference sequences, in assay/amplicon order.

    Notes
    -----
    Every id is built from the assay name alone, so suffixes of earlier
    operands or amplicons never leak into later ids, and an amplicon
    without a variant name does not inherit the previous amplicon's id.
    Both branches pass the id through ``metadata``.
    """
    from skbio import DNA
    from skbio import SequenceCollection

    def _with_suffix(base, suffix):
        # Append "_<suffix>" only when a suffix is actually set.
        return base + "_%s" % suffix if suffix else base

    reference = []
    for assay in assay_list:
        if assay.AND:
            for operand in assay.AND:
                if not isinstance(operand, Target):
                    continue
                target_name = _with_suffix(assay.name, operand.gene_name)
                for amplicon in operand.amplicons:
                    name = _with_suffix(target_name, amplicon.variant_name)
                    reference.append(
                        DNA(amplicon.sequence, metadata={'id': name}))
        else:
            for amplicon in assay.target.amplicons:
                name = _with_suffix(assay.name, amplicon.variant_name)
                reference.append(
                    DNA(amplicon.sequence, metadata={'id': name}))
    return SequenceCollection(reference)
def dnaLocalAlignSsw(seq1, seq2):
    """Locally align two DNA strings with SSW and summarise the result.

    Both inputs are upper-cased before alignment. The returned dict holds
    the upper-cased first sequence (``seq1``), the two aligned rows
    (``aln1``, ``aln2``), the alignment ``score`` and ``similarity``: the
    relative match frequency of the two rows as a percentage, rounded to
    two decimals.
    """
    upper1 = seq1.upper()
    upper2 = seq2.upper()
    msa, score, _ = local_pairwise_align_ssw(DNA(upper1), DNA(upper2))
    row1, row2 = msa[0], msa[1]
    percent_match = row1.match_frequency(row2, relative=True) * 100
    return {
        'seq1': str(upper1),
        'aln1': str(row1),
        'aln2': str(row2),
        'score': score,
        'similarity': float('{:.2f}'.format(percent_match)),
    }
def test_stockholm_runon_gf(self):
    """Run-on GF lines parse the same with or without extra whitespace."""
    exp = TabularMSA([DNA('ACTGGTTCAATG')],
                     metadata={'CC': 'CBS domains are small intracellular'
                                     ' modules mostly found in 2 or four '
                                     'copies within a protein.'},
                     index=['GG1344'])
    fixtures = ('stockholm_runon_gf_no_whitespace',
                'stockholm_runon_gf_with_whitespace')
    for fixture in fixtures:
        msa = _stockholm_to_tabular_msa(get_data_path(fixture),
                                        constructor=DNA)
        self.assertEqual(msa, exp)
def test_translate_six_frames_genetic_code_object(self):
    """translate_six_frames() accepts a GeneticCode instance."""
    # every codon of this code translates to M
    all_m_code = GeneticCode('M' * 64, '-' * 64)
    expected_frames = [Protein(text)
                       for text in ('MM', 'M', 'M', 'MM', 'M', 'M')]
    for seq in RNA('AAAUUG'), DNA('AAATTG'):
        frames = list(seq.translate_six_frames(all_m_code))
        self.assertEqual(frames, expected_frames)
def test_embl_to_gb(self):
    """An EMBL record survives an EMBL -> GenBank -> EMBL round trip."""
    # EMBL records have more features than genbank, (ex more than one date,
    # embl class, DOI cross references) so I can't convert an embl to gb
    # and then to embl keeping all those data. But I can start from
    # genbank record

    # do embl file -> embl object -> gb file -> gb object ->
    # embl file. Ensure that first and last files are identical
    embl_record = DNA.read(self.single_rna_simple_fp, format="embl")

    # "write" the embl record as genbank, then read it back
    with io.StringIO() as genbank_buffer:
        DNA.write(embl_record, format="genbank", file=genbank_buffer)
        genbank_buffer.seek(0)
        genbank_record = DNA.read(genbank_buffer, format="genbank")

    # "write" the genbank record as embl and keep the text
    with io.StringIO() as embl_buffer:
        DNA.write(genbank_record, format="embl", file=embl_buffer)
        obs = embl_buffer.getvalue()

    # test objects
    with open(self.single_rna_simple_fp) as reference:
        exp = reference.read()

    self.assertEqual(exp, obs)
def test_update_ids_sequence_attributes_propagated(self):
    """update_ids keeps description and quality of the renamed sequences."""
    # 1 seq
    single = Alignment([
        DNA('ACGT', id="seq1", description='desc', quality=range(4))
    ])
    single_exp = Alignment([
        DNA('ACGT', id="abc", description='desc', quality=range(4))
    ])
    single_exp_map = {'abc': 'seq1'}
    obs_sc, obs_id_map = single.update_ids(ids=('abc',))
    self._assert_sequence_collections_equal(obs_sc, single_exp)
    self.assertEqual(obs_id_map, single_exp_map)

    # 2 seqs
    pair = Alignment([
        DNA('ACGT', id="seq1", description='desc1', quality=(0, 1, 2, 3)),
        DNA('TGCA', id="seq2", description='desc2', quality=(3, 2, 1, 0))
    ])
    pair_exp = Alignment([
        DNA('ACGT', id="abc", description='desc1', quality=range(4)),
        DNA('TGCA', id="def", description='desc2',
            quality=range(4)[::-1])
    ])
    pair_exp_map = {'abc': 'seq1', 'def': 'seq2'}
    obs_sc, obs_id_map = pair.update_ids(ids=('abc', 'def'))
    self._assert_sequence_collections_equal(obs_sc, pair_exp)
    self.assertEqual(obs_id_map, pair_exp_map)
def test_global_pairwise_align_nucleotide_penalize_terminal_gaps(self):
    """penalize_terminal_gaps changes both the alignment and its score."""
    # in these tests one sequence is about 3x the length of the other.
    # we toggle penalize_terminal_gaps to confirm that it results in
    # different alignments and alignment scores.
    seq1 = DNA("ACCGTGGACCGTTAGGATTGGACCCAAGGTTG")
    seq2 = DNA("T"*25 + "ACCGTGGACCGTAGGATTGGACCAAGGTTA" + "A"*25)
    scoring = dict(gap_open_penalty=5., gap_extend_penalty=0.5,
                   match_score=5, mismatch_score=-4)

    # terminal gaps are free
    obs_msa, obs_score, obs_start_end = global_pairwise_align_nucleotide(
        seq1, seq2, penalize_terminal_gaps=False, **scoring)
    exp_msa = TabularMSA([
        DNA("-------------------------ACCGTGGACCGTTAGGA"
            "TTGGACCCAAGGTTG-------------------------"),
        DNA("TTTTTTTTTTTTTTTTTTTTTTTTTACCGTGGACCGT-AGGA"
            "TTGGACC-AAGGTTAAAAAAAAAAAAAAAAAAAAAAAAAA")])
    self.assertEqual(obs_msa, exp_msa)
    self.assertEqual(obs_score, 131.0)

    # terminal gaps are penalized
    obs_msa, obs_score, obs_start_end = global_pairwise_align_nucleotide(
        seq1, seq2, penalize_terminal_gaps=True, **scoring)
    exp_msa = TabularMSA([
        DNA("-------------------------ACCGTGGACCGTTAGGA"
            "TTGGACCCAAGGTT-------------------------G"),
        DNA("TTTTTTTTTTTTTTTTTTTTTTTTTACCGTGGACCGT-AGGA"
            "TTGGACC-AAGGTTAAAAAAAAAAAAAAAAAAAAAAAAAA")])
    self.assertEqual(obs_msa, exp_msa)
    self.assertEqual(obs_score, 97.0)
def test_motif_pyrimidine_run(self):
    """find_motifs("pyrimidine-run") yields the expected slices."""
    empty = DNA("")
    self.assertEqual(list(empty.find_motifs("pyrimidine-run")), [])

    gapped = DNA("AARC--TCRA")
    found = list(gapped.find_motifs("pyrimidine-run"))
    self.assertEqual(found, [slice(3, 4), slice(6, 8)])

    # with gaps ignored and a minimum length of three
    spaced = DNA("AA-RC--TCR-A")
    found = list(spaced.find_motifs("pyrimidine-run", min_length=3,
                                    ignore=spaced.gaps()))
    self.assertEqual(found, [slice(4, 9)])
def test_motif_purine_run(self):
    """find_motifs("purine-run") yields the expected slices."""
    empty = DNA("")
    self.assertEqual(list(empty.find_motifs("purine-run")), [])

    gapped = DNA("AARC--TCRG")
    found = list(gapped.find_motifs("purine-run"))
    self.assertEqual(found, [slice(0, 3), slice(8, 10)])

    # with gaps ignored and a minimum length of three
    spaced = DNA("AA-RC--TCR-G")
    found = list(spaced.find_motifs("purine-run", min_length=3,
                                    ignore=spaced.gaps()))
    self.assertEqual(found, [slice(0, 4)])
def test_gb_to_embl(self):
    """A GenBank record survives a GenBank -> EMBL -> GenBank round trip."""
    genbank_record = DNA.read(self.genbank_fp, format="genbank")

    with io.StringIO() as embl_buffer:
        DNA.write(genbank_record, format="embl", file=embl_buffer)

        # EMBL can't deal with genbank version (ie M14399.1 GI:145229)
        # read embl data and write to gb
        embl_buffer.seek(0)
        embl_record = DNA.read(embl_buffer, format="embl")

    with io.StringIO() as genbank_buffer:
        DNA.write(embl_record, format="genbank", file=genbank_buffer)

        # read gb data
        obs = genbank_buffer.getvalue()

    with open(self.genbank_fp) as reference:
        exp = reference.read()

    self.assertEqual(exp, obs)
def test_transcribe_does_not_modify_input(self):
    """transcribe() returns new RNA and leaves the DNA untouched."""
    dna = DNA('ATAT')
    transcribed = dna.transcribe()
    self.assertEqual(transcribed, RNA('AUAU'))
    self.assertEqual(dna, DNA('ATAT'))
def test_transcribe_preserves_all_metadata(self):
    """Transcribing DNA keeps metadata and positional metadata."""
    dna = DNA('AGTT',
              metadata={'foo': 'bar'},
              positional_metadata={'foo': range(4)})
    expected_rna = RNA('AGUU',
                       metadata={'foo': 'bar'},
                       positional_metadata={'foo': range(4)})
    self.assertEqual(dna.transcribe(), expected_rna)
from skbio import DNA, read

# Translate every sequence of test_sequences.fasta in all six reading
# frames and write each stop-free frame to outfile.fasta under the id of
# the sequence it came from.
with open("outfile.fasta", "w") as fasta_out:
    for record in read('test_sequences.fasta', format='fasta'):
        dna = DNA(record)
        for frame in dna.translate_six_frames():
            if frame.has_stops():
                continue
            entry = [">", str(dna.metadata['id']), "\n", str(frame), "\n"]
            fasta_out.write("".join(entry))
from skbio import DNA
from skbio.alignment import global_pairwise_align_nucleotide

# Compare two sequences read from disk against a fixed query sequence.
s1 = DNA.read("data/seq1")
s2 = DNA.read("data/seq2")
query = DNA("TTTTCTTGTTGATTCTGGTCCAGAGTAATCGCTTGAGTGTTG")


def pairwise_similarity(seq, query):
    """Globally align `seq` with `query` and return ``fraction_same`` of
    the first aligned row against the second."""
    alignment = global_pairwise_align_nucleotide(seq, query)
    return alignment[0].fraction_same(alignment[1])


# A single parenthesised argument prints the same text under the Python 2
# print statement and the Python 3 print function.
print("seq1: %s\nseq2: %s" % (s1, s2))
print("seq1-query: %s" % pairwise_similarity(s1, query))
print("seq2-query: %s" % pairwise_similarity(s2, query))
def _merge_genbank_seqs(genbank_fp):
    """ Merge one to multiple sequences in a GenBank file into one.

    Parameters
    ----------
    genbank_fp: string
        file path to genome in GenBank format

    Returns
    -------
    tuple of (
        skbio.DNA,
            merged genome sequence carrying mock LOCUS metadata and one
            "gene" plus one "CDS" interval per protein-coding gene
        dict of { list of [ string, int, int, string ] }
            protein ID : translation, start, end, and strand
    )
    """
    # the number of '//' lines gives the number of nucleotide sequences
    with open(genbank_fp, 'r') as input_f:
        nseq = sum(1 for line in input_f if line.startswith('//'))

    loci = []  # collected per record; not part of the return value
    seq_parts = []
    genes = {}
    offset = 0  # absolute position in concatenated nucleotide sequence
    for seq_num in range(1, nseq + 1):
        record = Sequence.read(genbank_fp, seq_num=seq_num,
                               format='genbank')
        locus_name = record.metadata['LOCUS']['locus_name']
        size = record.metadata['LOCUS']['size']
        loci.append([locus_name, size])
        seq_parts.append(str(record))
        for interval in record.interval_metadata._intervals:
            md = interval.metadata
            if md['type'] != 'CDS' or 'protein_id' not in md:
                continue
            protein_id = md['protein_id'].replace('\"', '')
            # only the first occurrence of a protein ID is recorded
            if protein_id in genes:
                continue
            translation = md['translation'].replace(' ', '')
            translation = translation.replace('\"', '')
            strand = md['strand']
            # start is shifted to 1-based; coordinates are made absolute
            start = interval.bounds[0][0] + offset + 1
            end = interval.bounds[0][1] + offset
            genes[protein_id] = [translation, start, end, strand]
        offset += int(size)

    nucl_seq = ''.join(seq_parts)
    merged = DNA(nucl_seq)
    # generate mock metadata for the merged sequence
    merged.metadata['LOCUS'] = {'locus_name': 'locus001',
                                'size': len(nucl_seq),
                                'unit': 'bp',
                                'shape': 'circular',
                                'division': 'CON',
                                'mol_type': 'DNA',
                                'date': '01-JAN-1900'}
    merged.metadata['id'] = 'locus001'
    merged.interval_metadata._intervals = []
    # genes are visited by start coordinate; gid is an incremental integer
    # assigned to the current gene
    ordered = sorted(genes.items(), key=lambda x: x[1][1])
    for gid, (gene, info) in enumerate(ordered, start=1):
        # generate "gene" and "CDS" records for each protein-coding gene
        location = str(info[1]) + '..' + str(info[2])  # start and end
        if info[3] == '-':  # negative strand
            location = 'complement(' + location + ')'
        locus_tag = 'gene' + str(gid)
        span = [(info[1] - 1, info[2])]
        gene_feature = {'type': 'gene',
                        'locus_tag': locus_tag,
                        '__location': location}
        merged.interval_metadata.add(span, metadata=gene_feature)
        cds_feature = {'type': 'CDS',
                       'locus_tag': locus_tag,
                       '__location': location,
                       'protein_id': gene,
                       'translation': info[0]}
        merged.interval_metadata.add([(info[1] - 1, info[2])],
                                     metadata=cds_feature)
    return (merged, genes)