def test_gaps_at_both_ends(self):
    """Globally align two overlapping fragments and check the result size."""
    full = 'aaaccggttt'
    # "A" lacks the last two bases of full, "B" lacks the first two
    left = DNA.makeSequence(full[:-2], Name="A")
    right = DNA.makeSequence(full[2:], Name="B")
    for aln in self._aligned_both_ways(left, right, local=False):
        self.assertEqual(matchedColumns(aln), 6)
        self.assertEqual(len(aln), 10)
def test_assemble_seq(self):
    """should correctly fill in a sequence with N's"""
    frags = ["AAAAA", "CCCCC", "GGG"]
    joined = ''.join(frags)
    # each case: (fragments, start, end, fragment positions, expected)
    cases = [
        # unmatched sequence on both flanks, at two coordinate offsets
        (frags, 10, 31, [(11, 16), (18, 23), (25, 28)],
         "NAAAAANNCCCCCNNGGGNNN"),
        (frags, 0, 21, [(1, 6), (8, 13), (15, 18)],
         "NAAAAANNCCCCCNNGGGNNN"),
        # should work with:
        # start matches first frag start
        (frags, 0, 20, [(0, 5), (7, 12), (14, 17)],
         "AAAAANNCCCCCNNGGGNNN"),
        # end matches last frag_end
        (frags, 10, 28, [(11, 16), (18, 23), (25, 28)],
         "NAAAAANNCCCCCNNGGG"),
        # both start and end matched
        (frags, 10, 27, [(10, 15), (17, 22), (24, 27)],
         "AAAAANNCCCCCNNGGG"),
        # one frag
        ([joined], 10, 23, [(10, 23)], joined),
    ]
    for pieces, start, end, positions, expected in cases:
        expect = DNA.makeSequence(expected)
        self.assertEqual(_assemble_seq(pieces, start, end, positions), expect)
def test_assemble_seq(self):
    """should correctly fill in a sequence with N's"""
    frags = ["AAAAA", "CCCCC", "GGG"]

    def check(expected, pieces, start, end, positions):
        # compare the assembled result with a DNA sequence built from expected
        got = _assemble_seq(pieces, start, end, positions)
        self.assertEqual(got, DNA.makeSequence(expected))

    padded_both = "NAAAAANNCCCCCNNGGGNNN"
    check(padded_both, frags, 10, 31, [(11, 16), (18, 23), (25, 28)])
    check(padded_both, frags, 0, 21, [(1, 6), (8, 13), (15, 18)])
    # should work with:
    # start matches first frag start
    check("AAAAANNCCCCCNNGGGNNN", frags, 0, 20,
          [(0, 5), (7, 12), (14, 17)])
    # end matches last frag_end
    check("NAAAAANNCCCCCNNGGG", frags, 10, 28,
          [(11, 16), (18, 23), (25, 28)])
    # both start and end matched
    check("AAAAANNCCCCCNNGGG", frags, 10, 27,
          [(10, 15), (17, 22), (24, 27)])
    # one frag
    whole = ''.join(frags)
    check(whole, [whole], 10, 23, [(10, 23)])
def test_local_tiebreak(self):
    """Should pick the first best-equal hit rather than the last one"""
    # so that the Pyrex and Python versions give the same result.
    scores = make_dna_scoring_dict(match=1, transition=-1, transversion=-1)
    query = DNA.makeSequence('cwc', Name='pattern')
    subject = DNA.makeSequence('cactc', Name='target')
    aligned = local_pairwise(query, subject, scores, 5, 2)
    matched = str(aligned.NamedSeqs['target'])
    self.assertEqual(matched.lower(), 'cac')
def setUp(self):
    """Create two gapped sequences plus their maps, cigars and alignment."""
    first, second = "FAKE01", "FAKE02"
    self.cigar_text = '3D2M3D6MDM2D3MD'
    self.aln_seq = DNA.makeSequence('---AA---GCTTAG-A--CCT-')
    self.aln_seq1 = DNA.makeSequence('CCAAAAAA---TAGT-GGC--G')
    # parseOutGaps yields a (map, sequence) pair for each gapped sequence
    self.map, self.seq = self.aln_seq.parseOutGaps()
    self.map1, self.seq1 = self.aln_seq1.parseOutGaps()
    self.slices = [(1, 4), (0, 8), (7, 12), (0, 1), (3, 5)]
    self.aln = LoadSeqs(data={first: self.aln_seq, second: self.aln_seq1})
    self.cigars = {first: self.cigar_text,
                   second: map_to_cigar(self.map1)}
    self.seqs = {first: str(self.seq), second: str(self.seq1)}
def test_local_tiebreak(self):
    """Should pick the first best-equal hit rather than the last one"""
    # so that the Pyrex and Python versions give the same result.
    pattern = DNA.makeSequence('cwc', Name='pattern')
    target = DNA.makeSequence('cactc', Name='target')
    S = make_dna_scoring_dict(match=1, transition=-1, transversion=-1)
    result = local_pairwise(pattern, target, S, 5, 2)
    found = result.NamedSeqs['target']
    self.assertEqual(str(found).lower(), 'cac')
def test_picklability(self):
    """Pickle an alignment containing an annotated sequence"""
    # This depends on alignments, sequences, features, maps and spans
    # Doesn't test round trip result is correct, which should possibly
    # be done for maps/spans, but seqs/alignments are just simple
    # python classes without __getstate__ etc.
    try:
        import cPickle as pickle  # Python 2 accelerated implementation
    except ImportError:
        import pickle  # cPickle does not exist on Python 3
    seq1 = DNA.makeSequence("aagaagaagaccccca")
    seq2 = DNA.makeSequence("aagaagaagaccccct")
    seq2.addFeature('exon', 'fred', [(10, 15)])
    aln = LoadSeqs(data={'a': seq1, 'b': seq2})
    # passing means the dump/load round trip raised no exception
    pickle.loads(pickle.dumps(aln))
def test(r=1, **kw):
    """Time one global pairwise alignment and return its throughput.

    Arguments:
        r - number of times the two 10 base seed sequences are repeated
        kw - extra keyword arguments passed to classic_align_pairwise

    Returns len(seq1) * len(seq2) divided by the elapsed time.time()
    seconds.
    """
    S = make_dna_scoring_dict(10, -1, -8)
    seq2 = DNA.makeSequence("AAAATGCTTA" * r)
    seq1 = DNA.makeSequence("AATTTTGCTG" * r)
    t0 = time.time()
    classic_align_pairwise(seq1, seq2, S, 10, 2, local=False, **kw)
    t = time.time() - t0
    # the original had an unreachable Python 2 "print t" after this return
    return (len(seq1) * len(seq2)) / t
def test(r=1, **kw):
    """Return the throughput of a single global pairwise alignment.

    Arguments:
        r - repeat count applied to the two 10 base seed sequences
        kw - forwarded unchanged to classic_align_pairwise

    The result is len(seq1) * len(seq2) divided by the wall-clock
    seconds (time.time) the alignment call took.
    """
    S = make_dna_scoring_dict(10, -1, -8)
    seq2 = DNA.makeSequence('AAAATGCTTA' * r)
    seq1 = DNA.makeSequence('AATTTTGCTG' * r)
    started = time.time()
    classic_align_pairwise(seq1, seq2, S, 10, 2, local=False, **kw)
    elapsed = time.time() - started
    # a Python 2 print statement used to follow the return; it could
    # never execute and has been dropped
    return (len(seq1) * len(seq2)) / elapsed
def _make_utr_seq(self):
    """Cache the concatenated 5' and 3' untranslated exon sequences.

    Stores the results in self._cached under "Utr5" and "Utr3". When both
    untranslated exon collections are None, both entries are set to
    self.NULL_VALUE. Otherwise each entry is the concatenation of the
    exons' Seq attributes, starting from an empty DNA sequence.
    """
    exons5 = self.UntranslatedExons5
    exons3 = self.UntranslatedExons3
    if exons5 is None and exons3 is None:
        self._cached["Utr5"] = self.NULL_VALUE
        self._cached["Utr3"] = self.NULL_VALUE
        return

    Utr5_seq, Utr3_seq = DNA.makeSequence(""), DNA.makeSequence("")
    # if only one side is None, treat it as having no exons instead of
    # raising TypeError by iterating over None
    for exon in exons5 or ():
        Utr5_seq += exon.Seq
    for exon in exons3 or ():
        Utr3_seq += exon.Seq
    self._cached["Utr5"] = Utr5_seq
    self._cached["Utr3"] = Utr3_seq
def test_picklability(self):
    """Pickle an alignment containing an annotated sequence"""
    # This depends on alignments, sequences, features, maps and spans
    # Doesn't test round trip result is correct, which should possibly
    # be done for maps/spans, but seqs/alignments are just simple
    # python classes without __getstate__ etc.
    import pickle
    plain = DNA.makeSequence("aagaagaagaccccca")
    annotated = DNA.makeSequence("aagaagaagaccccct")
    annotated.addFeature('exon', 'fred', [(10, 15)])
    aln = LoadSeqs(data={'a': plain, 'b': annotated})
    dumped = pickle.dumps(aln)
    # passing means the round trip completed without raising
    pickle.loads(dumped)
def test_codon(self):
    """Align a 9 and a 6 base sequence under a codon model (Viterbi path)."""
    seq_a = DNA.makeSequence('tacgccgta', Name="A")
    seq_b = DNA.makeSequence('tacgta', Name="B")
    model = cogent.evolve.substitution_model.Codon(
        model_gaps=False, equal_motif_probs=True,
        mprob_model='conditional')
    two_tips = cogent.LoadTree(tip_names=['A', 'B'])
    lf = model.makeLikelihoodFunction(two_tips, aligned=False)
    lf.setSequences({'A': seq_a, 'B': seq_b})
    path = lf.getLogLikelihood().edge.getViterbiPath()
    aln = path.getAlignment()
    self.assertEqual(matchedColumns(aln), 6)
    self.assertEqual(len(aln), 9)
def test_codon(self):
    """Align a 9 and a 6 base sequence under a codon model (Viterbi)."""
    seq_a = DNA.makeSequence('tacgccgta', Name="A")
    seq_b = DNA.makeSequence('tacgta', Name="B")
    model = cogent.evolve.substitution_model.Codon(
        model_gaps=False, equal_motif_probs=True,
        mprob_model='conditional')
    two_tips = cogent.LoadTree(tip_names=['A', 'B'])
    lf = model.makeLikelihoodFunction(two_tips, aligned=False)
    lf.setSequences({'A': seq_a, 'B': seq_b})
    edge = lf.getLogLikelihood().edge
    # only the alignment half of the (score, alignment) pair is checked
    _score, aln = edge.getViterbiScoreAndAlignment()
    self.assertEqual(matchedColumns(aln), 6)
    self.assertEqual(len(aln), 9)
def test(r=1, **kw):
    """Benchmark one global pairwise alignment.

    Arguments:
        r - number of times the two 10 base seed sequences are repeated
        kw - extra keyword arguments passed to classic_align_pairwise

    Returns '*' if the alignment raises ArithmeticError, otherwise
    int(len(seq1) * len(seq2) / elapsed / 1000).
    """
    # time.clock was removed in Python 3.8; use process_time when it
    # exists and fall back to clock on interpreters that lack it
    timer = getattr(time, 'process_time', None) or time.clock
    S = make_dna_scoring_dict(10, -1, -8)
    seq2 = DNA.makeSequence('AAAATGCTTA' * r)
    seq1 = DNA.makeSequence('AATTTTGCTG' * r)
    t0 = timer()
    try:
        # return_alignment is False in order to emphasise the quadratic
        # part of the work.
        classic_align_pairwise(seq1, seq2, S, 10, 2, local=False,
                               return_alignment=False, **kw)
    except ArithmeticError:
        return '*'
    else:
        t = timer() - t0
        return int((len(seq1) * len(seq2)) / t / 1000)
def test_inherit_feature(self):
    """should be able to subclass and extend _Feature"""
    class NewFeat(_Feature):
        def __init__(self, *args, **kwargs):
            super(NewFeat, self).__init__(*args, **kwargs)

        def newMethod(self):
            # a feature with at most one span ends the recursion
            if len(self.map.spans) <= 1:
                return True
            # should create new instance of NewFeat
            return self.asOneSpan().newMethod()

    dna = DNA.makeSequence('ACGTACGTACGT')
    two_span = dna.addAnnotation(
        NewFeat, as_map([(1, 3), (5, 7)], len(dna)),
        type='gene', Name='abcd')
    self.assertEqual(type(two_span.asOneSpan()), NewFeat)
    self.assertEqual(type(two_span.getShadow()), NewFeat)
    one_span = dna.addAnnotation(
        NewFeat, as_map([(3, 5)], len(dna)), type='gene', Name='def')
    covering = dna.getRegionCoveringAll([two_span, one_span],
                                        feature_class=NewFeat)
    self.assertEqual(type(covering), NewFeat)
    # now use the new method
    two_span.newMethod()
def CigarParser(seqs, cigars, sliced=False, ref_seqname=None, start=None,
                end=None, moltype=DNA):
    """return an alignment from raw sequences and cigar strings
    if sliced, will return an alignment correspondent to ref sequence start
    to end

    Arguments:
        seqs - raw sequences as {seqname: seq}
        cigars - corresponding cigar text as {seqname: cigar_text}
        cigars and seqs should have the same seqnames
        sliced - if True, only the region selected by ref_seqname, start
            and end is returned
        ref_seqname - key of the reference sequence, used when sliced
        start, end - passed to slice_cigar for the reference sequence
            with by_align=False
        moltype - optional default to DNA
    """
    data = {}
    if not sliced:
        for seqname in seqs.keys():
            data[seqname] = aligned_from_cigar(cigars[seqname],
                                               seqs[seqname],
                                               moltype=moltype)
    else:
        ref_aln_seq = aligned_from_cigar(cigars[ref_seqname],
                                         seqs[ref_seqname], moltype=moltype)
        m, aln_loc = slice_cigar(cigars[ref_seqname], start, end,
                                 by_align=False)
        aln_start, aln_end = aln_loc[0], aln_loc[1]
        data[ref_seqname] = ref_aln_seq[aln_start:aln_end]
        for seqname in seqs.keys():
            if seqname == ref_seqname:
                continue
            m, seq_loc = slice_cigar(cigars[seqname], aln_start, aln_end)
            if seq_loc:
                seq = seqs[seqname]
                if isinstance(seq, str):
                    seq = moltype.makeSequence(seq)
                data[seqname] = seq[seq_loc[0]:seq_loc[1]].gappedByMap(m)
            else:
                # nothing from this sequence falls in the window: fill with
                # gaps, using the requested moltype rather than always DNA
                data[seqname] = moltype.makeSequence(
                    '-' * (aln_end - aln_start))
    aln = LoadSeqs(data=data, aligned=True)
    return aln
def findBestSeq(seqobject):
    """Pick the frame whose translation runs longest before a stop.

    Arguments:
        seqobject - object providing .seq (converted with str()) and .id

    Only the first three entries returned by standard_code.sixframes are
    compared (presumably the forward frames -- confirm against sixframes).
    Each translation is truncated at its first '*'.

    Returns (longestseq, correctdnaseq): the longest truncated translation
    and the DNA sequence sliced from that frame's offset. Ties keep the
    earliest frame; if every truncated translation is empty, the result is
    ('', sequence from offset 0).
    """
    my_seq = DNA.makeSequence(str(seqobject.seq), seqobject.id)
    all_six = standard_code.sixframes(my_seq)
    # keep only what precedes the first stop codon in each frame
    seqlist = [frame.split('*')[0] for frame in all_six]
    longestseq = ''
    # default to frame 0 so the name is always bound, even when no frame
    # yields a non-empty translation (previously an UnboundLocalError)
    correctdnaseq = my_seq[0:]
    for x in range(3):
        if len(longestseq) < len(seqlist[x]):
            longestseq = seqlist[x]
            correctdnaseq = my_seq[x:]
    return longestseq, correctdnaseq
def _get_flanking_seq_data(self):
    """Cache the up- and down-stream flanking sequences as 'FlankingSeq'."""
    # maps to flanking_sequence through variation_feature_id
    # if this fails, we grab from genomic sequence
    variation_id = self._table_rows['variation_feature']['variation_id']
    table = self.flanking_sequence_table
    query = sql.select([table], table.c.variation_id == variation_id)
    record = asserted_one(query.execute())
    self._table_rows['flanking_sequence'] = record
    up_seq = record['up_seq']
    down_seq = record['down_seq']
    # the following two checks are because -- wait for it -- someone has
    # entered the string 'NULL' instead of NULL in the MySQL tables!!!
    if up_seq == 'NULL':
        up_seq = None
    if down_seq == 'NULL':
        down_seq = None

    seqs = dict(up=up_seq, down=down_seq)
    for name, seq in seqs.items():
        if seq is not None:
            seq = DNA.makeSequence(seq)
        else:
            # no stored flank: take it from the genome next to the location,
            # swapping sides when the location is on the minus strand
            is_down = name == 'down'
            if self.Location.Strand == -1:
                resized = (-301, -1) if is_down else (1, 301)
            else:
                resized = (1, 301) if is_down else (-301, -1)
            flank = self.Location.resized(*resized)
            seq = self.genome.getRegion(region=flank).Seq
        seqs[name] = seq

    upstream = seqs['up'][-300:]
    downstream = seqs['down'][:300]
    self._cached[('FlankingSeq')] = (upstream, downstream)
def makeSampleSequence():
    """Return a DNA sequence carrying one exon and two repeat_unit features."""
    raw = 'tgccnwsrygagcgtgttaaacaatggccaactctctaccttcctatgttaaacaagtgagatcgcaggcgcgccaaggc'
    seq = DNA.makeSequence(raw)
    # (feature type, feature name, spans)
    features = [
        ('exon', 'exon', [(20, 35)]),
        ('repeat_unit', 'repeat_unit', [(39, 49)]),
        ('repeat_unit', 'rep2', [(49, 60)]),
    ]
    for kind, label, spans in features:
        seq.addAnnotation(annotation.Feature, kind, label, spans)
    return seq
def test_inherit_feature(self):
    """should be able to subclass and extend _Feature"""
    class NewFeat(_Feature):
        def __init__(self, *args, **kwargs):
            super(NewFeat, self).__init__(*args, **kwargs)

        def newMethod(self):
            multi_span = len(self.map.spans) > 1
            if not multi_span:
                return True
            # should create new instance of NewFeat
            merged = self.asOneSpan()
            return merged.newMethod()

    seq = DNA.makeSequence('ACGTACGTACGT')
    total = len(seq)
    first = seq.addAnnotation(NewFeat, as_map([(1, 3), (5, 7)], total),
                              type='gene', Name='abcd')
    self.assertEqual(type(first.asOneSpan()), NewFeat)
    self.assertEqual(type(first.getShadow()), NewFeat)
    second = seq.addAnnotation(NewFeat, as_map([(3, 5)], len(seq)),
                               type='gene', Name='def')
    region = seq.getRegionCoveringAll([first, second],
                                      feature_class=NewFeat)
    self.assertEqual(type(region), NewFeat)
    # now use the new method
    first.newMethod()
def _generate_unambiguous_sequences(self):
    """Fill self.conserved_sequences from self._CONSERVED_SEQUENCES.

    Each conserved sequence is passed through self._disambiguate, which may
    return one sequence or a list of them; every result is stored keyed by
    its string form as ConservedSequence(seq, pos). The reverse complement
    of every stored sequence is then added as
    ConservedSequence(rc_seq, pos, rc=True).
    """
    for pos, seq in self._CONSERVED_SEQUENCES.items():
        ret = self._disambiguate(DNA.makeSequence(seq))
        expanded = ret if isinstance(ret, list) else [ret]
        for dnaseq in expanded:
            self.conserved_sequences[str(dnaseq)] = \
                ConservedSequence(dnaseq, pos)

    # iterate over a snapshot: entries are added inside the loop, which
    # raises RuntimeError on a live dict view in Python 3
    for seq, con_seq in list(self.conserved_sequences.items()):
        # rc() returns the reverse complement rather than modifying the
        # sequence in place, so its result has to be kept
        rc_seq = DNA.makeSequence(seq).rc()
        self.conserved_sequences[str(rc_seq)] = \
            ConservedSequence(rc_seq, con_seq.pos, rc=True)
def test_stop_indexes(self):
    """should return stop codon indexes for a specified frame"""
    code = GeneticCode(self.SGC)
    dna = DNA.makeSequence('ATGCTAACATAAA')
    # expected stop indices for start offsets 0, 1 and 2
    per_frame = [[9], [4], []]
    for frame in range(len(per_frame)):
        found = code.getStopIndices(dna, start=frame)
        self.assertEqual(found, per_frame[frame])
def makeSampleSequence():
    """Return a 100 base DNA sequence with an exon and two repeat units."""
    sample = DNA.makeSequence('aaaccggttt' * 10)
    # (feature type, feature name, spans)
    for kind, label, spans in (
            ('exon', 'exon', [(20, 35)]),
            ('repeat_unit', 'repeat_unit', [(39, 49)]),
            ('repeat_unit', 'rep2', [(49, 60)])):
        sample.addAnnotation(annotation.Feature, kind, label, spans)
    return sample
def test_stop_indexes(self):
    """should return stop codon indexes for a specified frame"""
    sgc = GeneticCode(self.SGC)
    seq = DNA.makeSequence("ATGCTAACATAAA")
    # start offset -> expected stop indices
    frame = 0
    for expect in ([9], [4], []):
        self.assertEqual(sgc.getStopIndices(seq, start=frame), expect)
        frame += 1
def makeSampleSequence():
    """Build the annotated sample sequence: one exon, two repeat units."""
    seq = DNA.makeSequence(
        'tgccnwsrygagcgtgttaaacaatggccaactctctaccttcctatgttaaacaagtgagatcgcaggcgcgccaaggc')
    feature_cls = annotation.Feature
    seq.addAnnotation(feature_cls, 'exon', 'exon', [(20, 35)])
    seq.addAnnotation(feature_cls, 'repeat_unit', 'repeat_unit', [(39, 49)])
    seq.addAnnotation(feature_cls, 'repeat_unit', 'rep2', [(49, 60)])
    return seq
def adjust_alignment(template, candidate, new_gaps):
    """adjust template/candidate aln to remove gaps added by pairwise alignment

    This step adjusts the alignment to reduce the length back to the
    template alignment length by introducing local misalignments to remove
    gap characters that are present in the pairwise alignment but not in
    the template alignment.

    Arguments:
        template, candidate - aligned sequences; list() is applied to each
        new_gaps - template positions to delete; visited last to first

    Returns (template, candidate) as DNA sequences. new_gaps is left
    unmodified.
    """
    template_l = list(template)
    candidate_l = list(candidate)
    # reversed() visits the positions back to front without reversing the
    # caller's list in place, as new_gaps.reverse() used to do
    for pos in reversed(new_gaps):
        del template_l[pos]
        del candidate_l[nearest_gap(candidate_l, pos)]
    return (DNA.makeSequence(''.join(template_l)),
            DNA.makeSequence(''.join(candidate_l)))
def adjust_alignment(template, candidate, new_gaps):
    """adjust template/candidate aln to remove gaps added by pairwise alignment

    This step adjusts the alignment to reduce the length back to the
    template alignment length by introducing local misalignments to remove
    gap characters that are present in the pairwise alignment but not in
    the template alignment.

    Arguments:
        template, candidate - aligned sequences; list() is applied to each
        new_gaps - template positions to delete, processed from the last
            entry to the first

    Returns a (template, candidate) tuple of DNA sequences. The caller's
    new_gaps list is not modified.
    """
    template_l = list(template)
    candidate_l = list(candidate)
    # iterate back to front on a reversed view; the previous in-place
    # new_gaps.reverse() silently changed the caller's list
    for pos in reversed(new_gaps):
        del template_l[pos]
        gap_index = nearest_gap(candidate_l, pos)
        del candidate_l[gap_index]
    adjusted_template = DNA.makeSequence(''.join(template_l))
    adjusted_candidate = DNA.makeSequence(''.join(candidate_l))
    return (adjusted_template, adjusted_candidate)
def makeSampleSequence(with_gaps=False):
    """Return an annotated sample sequence, optionally containing gaps.

    With with_gaps, positions 5-9 and the last two positions of the raw
    sequence are replaced by '-'. CDS and 5'UTR features are added either
    way.
    """
    ungapped = 'AACCCAAAATTTTTTGGGGGGGGGGCCCC'
    if with_gaps:
        raw_seq = ungapped[:5] + '-----' + ungapped[10:-2] + '--'
    else:
        raw_seq = ungapped
    seq = DNA.makeSequence(raw_seq)
    seq.addAnnotation(Feature, 'CDS', 'CDS', [(15, 25)])
    seq.addAnnotation(Feature, "5'UTR", "5' UTR", [(12, 15)])
    return seq
def makeSampleSequence(with_gaps=False):
    """Build the sample sequence with CDS and 5'UTR features.

    When with_gaps is true, five internal positions and the final two
    positions are replaced by gap characters before the sequence is made.
    """
    raw_seq = 'AACCCAAAATTTTTTGGGGGGGGGGCCCC'
    if with_gaps:
        head, body = raw_seq[:5], raw_seq[10:-2]
        raw_seq = ''.join([head, '-----', body, '--'])
    seq = DNA.makeSequence(raw_seq)
    # (feature type, feature name, span)
    for kind, label, span in (('CDS', 'CDS', (15, 25)),
                              ("5'UTR", "5' UTR", (12, 15))):
        seq.addAnnotation(Feature, kind, label, [span])
    return seq
def introduce_terminal_gaps(template, aligned_template, aligned_candidate):
    """ introduce terminal gaps from template into the aligned candidate seq

    The candidate is padded at each end with as many '-' characters as the
    original template has terminal gaps beyond those already present in the
    pairwise aligned template. The result is a DNA sequence that keeps
    aligned_candidate's Name.
    """
    def leading_gaps(chars):
        """Count the run of '-' at the start of chars."""
        total = 0
        for char in chars:
            if char != '-':
                break
            total += 1
        return total

    # 5' end: gaps in the original template minus those the pairwise
    # aligned template already has (because we don't need to add these)
    five_prime_gaps_to_add = \
        leading_gaps(template) - leading_gaps(aligned_template)
    # 3' end: the same count, taken on the reversed sequences
    three_prime_gaps_to_add = \
        leading_gaps(reversed(template)) - \
        leading_gaps(reversed(aligned_template))

    # return the sequence with the 5' and 3' gaps added
    padded = ('-' * five_prime_gaps_to_add + str(aligned_candidate) +
              '-' * three_prime_gaps_to_add)
    return DNA.makeSequence(padded, Name=aligned_candidate.Name)
def introduce_terminal_gaps(template, aligned_template, aligned_candidate):
    """ introduce terminal gaps from template into the aligned candidate seq

    Returns a DNA sequence named after aligned_candidate, consisting of the
    candidate flanked by the terminal gaps the original template has in
    excess of the pairwise aligned template.
    """
    def gap_run(chars):
        """Length of the uninterrupted '-' prefix of chars."""
        run = 0
        for char in chars:
            if char == '-':
                run += 1
                continue
            break
        return run

    # count the 5' gaps in the original and in the pairwise aligned
    # template; only the difference needs to be added
    original_five = gap_run(template)
    aligned_five = gap_run(aligned_template)
    # the 3' counts are the same measurement on the reversed sequences
    original_three = gap_run(reversed(template))
    aligned_three = gap_run(reversed(aligned_template))

    pieces = ['-' * (original_five - aligned_five),
              str(aligned_candidate),
              '-' * (original_three - aligned_three)]
    # return the sequence with the 5' and 3' gaps added
    return DNA.makeSequence(''.join(pieces), Name=aligned_candidate.Name)
def test_maps_on_maps(self):
    """Check structure() of a sequence with nested annotations and a slice."""
    base = DNA.makeSequence("ATCGATCGAT" * 5, Name="base")
    outer = annotate(base, 10, 20, "fake")
    # the next two features are only needed for their effect on base
    annotate(outer, 3, 5, "fake2")
    annotate(base, 1, 3, "left")
    tail = base[5:]
    nested = ("fake2", "[3:5]/10")
    self.assertEqual(
        structure(base),
        ("seq", 50, [("fake", "[10:20]/50", [nested]),
                     ("left", "[1:3]/50")]))
    self.assertEqual(
        structure(tail),
        ("seq", 45, [("fake", "[5:15]/45", [nested])]))
def test(r=1, **kw):
    """Measure the speed of one global pairwise alignment.

    Arguments:
        r - repeat count applied to the two 10 base seed sequences
        kw - forwarded unchanged to classic_align_pairwise

    Returns '*' when the alignment raises ArithmeticError, otherwise
    int(len(seq1) * len(seq2) / elapsed / 1000).
    """
    # time.clock no longer exists from Python 3.8 on; prefer process_time
    # and only fall back to clock where process_time is unavailable
    timer = getattr(time, 'process_time', None) or time.clock
    S = make_dna_scoring_dict(10, -1, -8)
    seq2 = DNA.makeSequence('AAAATGCTTA' * r)
    seq1 = DNA.makeSequence('AATTTTGCTG' * r)
    started = timer()
    try:
        # return_alignment is False in order to emphasise the quadratic
        # part of the work.
        classic_align_pairwise(seq1, seq2, S, 10, 2, local=False,
                               return_alignment=False, **kw)
    except ArithmeticError:
        return '*'
    else:
        elapsed = timer() - started
        return int((len(seq1) * len(seq2)) / elapsed / 1000)
def _get_sequence(self):
    """Return the sequence for self.Location, caching it under 'Seq'.

    If get_sequence raises NoItemError for the location, the
    assembly-exception coordinate is tried; if that also raises
    NoItemError, a run of "N" of length len(self) is used.
    """
    if 'Seq' in self._cached:
        return self._cached['Seq']

    try:
        seq = get_sequence(self.Location)
    except NoItemError:
        try:
            seq = get_sequence(assembly_exception_coordinate(self.Location))
        except NoItemError:
            seq = DNA.makeSequence("N" * len(self))
    seq.Name = str(self.Location)
    self._cached['Seq'] = seq
    return self._cached['Seq']
def _make_aln(self, orig, model=dna_model, param_vals=None,
              indel_rate=0.1, indel_length=0.5, **kw):
    """Align the sequences in orig with TreeAlign on a fixed tree.

    Arguments:
        orig - {name: raw sequence}; each value is converted to DNA
        model - substitution model handed to TreeAlign
        param_vals - passed through to TreeAlign
        indel_rate, indel_length - passed to TreeAlign via kw
        kw - further keyword arguments for TreeAlign

    A two-tip tree (A, B) is used for two sequences, otherwise a four-tip
    tree (A, B, C, D). Returns the alignment.
    """
    kw['indel_rate'] = indel_rate
    kw['indel_length'] = indel_length
    seqs = dict((key, DNA.makeSequence(value))
                for (key, value) in orig.items())
    # the original also built a tree from tip_names for the two sequence
    # case, but overwrote it on the next line; that dead call is gone
    if len(seqs) == 2:
        treestring = "(A:.1,B:.1)"
    else:
        treestring = "(((A:.1,B:.1):.1,C:.1):.1,D:.1)"
    tree = cogent.LoadTree(treestring=treestring)
    aln, tree = cogent.align.progressive.TreeAlign(
        model, seqs, tree=tree, param_vals=param_vals,
        show_progress=False, **kw)
    return aln
def test_maps_on_maps(self):
    """structure() should report nested features, also after slicing."""
    seq = DNA.makeSequence('ATCGATCGAT' * 5, Name='base')
    parent = annotate(seq, 10, 20, 'fake')
    annotate(parent, 3, 5, 'fake2')  # feature placed on a feature
    annotate(seq, 1, 3, 'left')
    sliced = seq[5:]

    child = ('fake2', '[3:5]/10')
    whole_expected = ('seq', 50, [('fake', '[10:20]/50', [child]),
                                  ('left', '[1:3]/50')])
    sliced_expected = ('seq', 45, [('fake', '[5:15]/45', [child])])
    self.assertEqual(structure(seq), whole_expected)
    self.assertEqual(structure(sliced), sliced_expected)
def makeSampleSequence(mid_gaps=False):
    """Return the annotated sample sequence, optionally with internal gaps.

    With mid_gaps, positions 5-9 are replaced by '-' and the CDS and 5'UTR
    spans are (10, 20) and (5, 8) instead of (15, 25) and (12, 15).
    """
    ungapped = 'AACCCAAAATTTTTTGGGGGGGGGGCCCC'
    if mid_gaps:
        raw_seq = ungapped[:5] + '-----' + ungapped[10:]
        # annotations only make sense when they're on the raw sequence
        cds, utr = (10, 20), (5, 8)
    else:
        raw_seq = ungapped
        cds, utr = (15, 25), (12, 15)
    seq = DNA.makeSequence(raw_seq)
    seq.addAnnotation(Feature, 'CDS', 'CDS', [cds])
    seq.addAnnotation(Feature, "5'UTR", "5' UTR", [utr])
    return seq
def remove_template_terminal_gaps(candidate, template):
    """Remove template terminal gaps and corresponding bases in candidate

    Arguments:
        candidate, template - aligned sequences of equal length; candidate
            must provide degap() and, once converted to DNA, a Name

    Returns (candidate, template) trimmed to the template's first and last
    non-gap positions. The returned candidate is renamed to include a
    start..end range. Two zero-length inputs are returned unchanged.

    Raises ValueError if the two lengths differ.
    """
    if len(template) != len(candidate):
        # call syntax is valid on Python 2 and 3, unlike "raise E, msg"
        raise ValueError(
            "Sequences must be aligned, but their "
            "lengths aren't equal. %d != %d"
            % (len(candidate), len(template)))

    if len(template) == 0:
        return candidate, template

    degapped_candidate_len = len(candidate.degap())

    candidate = DNA.makeSequence(candidate)
    template = DNA.makeSequence(template)

    template_gap_vector = template.gapVector()
    first_non_gap = template_gap_vector.index(False)
    num_three_prime_gaps = template_gap_vector[::-1].index(False)
    last_non_gap = len(template_gap_vector) - num_three_prime_gaps

    # Construct the candidate name, which will include the range of bases
    # from the original sequence
    candidate = candidate[first_non_gap:last_non_gap]
    template = template[first_non_gap:last_non_gap]
    candidate_start_pos = first_non_gap + 1
    candidate_end_pos = degapped_candidate_len - num_three_prime_gaps
    candidate_name = candidate.Name
    if candidate_name.endswith('RC'):
        name_delimiter = ':'
    else:
        name_delimiter = ' '
    candidate_name = '%s%s%d..%d' % \
        (candidate_name, name_delimiter, candidate_start_pos,
         candidate_end_pos)

    return DNA.makeSequence(candidate, Name=candidate_name), template
def remove_template_terminal_gaps(candidate, template):
    """Remove template terminal gaps and corresponding bases in candidate

    Both inputs must have the same length. They are trimmed to the span
    between the template's first and last non-gap positions, and the
    candidate is renamed by appending a start..end range to its Name
    (joined with ':' when the name ends in 'RC', otherwise with a space).

    Returns (candidate, template); zero-length inputs come back unchanged.

    Raises ValueError if the two lengths differ.
    """
    if len(template) != len(candidate):
        # exception call syntax works on both Python 2 and Python 3
        message = "Sequences must be aligned, but their " + \
            "lengths aren't equal. %d != %d" % (len(candidate),
                                                len(template))
        raise ValueError(message)

    if len(template) == 0:
        return candidate, template

    # measured before conversion, while candidate still has degap()
    degapped_candidate_len = len(candidate.degap())

    candidate = DNA.makeSequence(candidate)
    template = DNA.makeSequence(template)

    template_gap_vector = template.gapVector()
    first_non_gap = template_gap_vector.index(False)
    num_three_prime_gaps = template_gap_vector[::-1].index(False)
    last_non_gap = len(template_gap_vector) - num_three_prime_gaps

    # Construct the candidate name, which will include the range of bases
    # from the original sequence
    candidate = candidate[first_non_gap:last_non_gap]
    template = template[first_non_gap:last_non_gap]
    candidate_start_pos = first_non_gap + 1
    candidate_end_pos = degapped_candidate_len - num_three_prime_gaps
    candidate_name = candidate.Name
    name_delimiter = ':' if candidate_name.endswith('RC') else ' '
    candidate_name = '%s%s%d..%d' % (candidate_name, name_delimiter,
                                     candidate_start_pos,
                                     candidate_end_pos)

    return DNA.makeSequence(candidate, Name=candidate_name), template
def _assemble_seq(frags, start, end, frag_positions):
    """returns a DNA sequence in which missing sequence is replaced by 'N'

    Arguments:
        frags - fragment strings
        start, end - coordinates bounding the assembled sequence
        frag_positions - one (frag_start, frag_end) pair per fragment

    The distance between the previous end (initially start) and each
    fragment start, and between the last fragment end and end, is filled
    with 'N'. With no fragments the result is 'N' * (end - start).

    Raises AssertionError if the numbers of fragments and positions differ,
    if a fragment starts before the previous end, or if end lies before
    the last fragment end.
    """
    prev_end = start
    assert len(frag_positions) == len(frags), "Mismatched number of "\
        "fragments and positions"
    assembled = []
    for index, (frag_start, frag_end) in enumerate(frag_positions):
        diff = frag_start - prev_end
        assert diff >= 0, 'fragment position start < previous end: %s, %s' %\
            (frag_start, prev_end)
        assembled += ['N' * diff, frags[index]]
        prev_end = frag_end
    # prev_end equals the last frag_end after the loop and stays at start
    # when there are no fragments, where frag_end would be unbound
    diff = end - prev_end
    assert diff >= 0, 'end[%s] < previous frag_end[%s]' % (end, prev_end)
    assembled += ['N' * diff]
    return DNA.makeSequence(''.join(assembled))
def test_simulateAlignment_root_sequence(self):
    """provide a root sequence for simulating an alignment"""
    def use_root_seq(root_sequence):
        # simulate from the given root and check it is reproduced verbatim
        observed = LoadSeqs(data={'a': 'ggaatt', 'c': 'cctaat'})
        tree = LoadTree(treestring="(a,c);")
        model = substitution_model.Dinucleotide(mprob_model='tuple')
        lf = model.makeParamController(tree)
        lf.setAlignment(observed)
        simulated = lf.simulateAlignment(exclude_internal=False,
                                         root_sequence=root_sequence)
        simulated_root = simulated.NamedSeqs['root']
        self.assertEqual(str(simulated_root), str(root_sequence))

    use_root_seq(DNA.makeSequence('GTAATT'))  # as a sequence instance
    use_root_seq('GTAATC')  # as a string
def _assemble_seq(frags, start, end, frag_positions):
    """returns a DNA sequence in which missing sequence is replaced by 'N'

    Arguments:
        frags - fragment strings
        start, end - coordinates bounding the assembled sequence
        frag_positions - (frag_start, frag_end) pairs, one per fragment

    Every stretch not covered by a fragment is filled with 'N': before the
    first fragment, between consecutive fragments and after the last one.
    An empty fragment list yields 'N' * (end - start).

    Raises AssertionError for mismatched fragment/position counts, for a
    fragment starting before the previous end, or for end preceding the
    last fragment end.
    """
    assert len(frag_positions) == len(frags), "Mismatched number of "\
        "fragments and positions"
    prev_end = start
    assembled = []
    for frag, (frag_start, frag_end) in zip(frags, frag_positions):
        diff = frag_start - prev_end
        assert diff >= 0, 'fragment position start < previous end: %s, %s' %\
            (frag_start, prev_end)
        assembled.append('N' * diff)
        assembled.append(frag)
        prev_end = frag_end
    # use prev_end rather than the loop variable frag_end, which is
    # undefined when no fragments were supplied
    diff = end - prev_end
    assert diff >= 0, 'end[%s] < previous frag_end[%s]' % (end, prev_end)
    assembled.append('N' * diff)
    return DNA.makeSequence(''.join(assembled))
def test_maps_on_maps(self):
    """Compare structure() output for an annotated sequence and its slice."""
    seq = DNA.makeSequence('ATCGATCGAT' * 5, Name='base')
    fake = annotate(seq, 10, 20, 'fake')
    annotate(fake, 3, 5, 'fake2')
    annotate(seq, 1, 3, 'left')
    seq2 = seq[5:]
    # the nested feature is reported identically in both structures
    fake2_entry = ('fake2', '[3:5]/10')
    for subject, expected in (
            (seq, ('seq', 50, [('fake', '[10:20]/50', [fake2_entry]),
                               ('left', '[1:3]/50')])),
            (seq2, ('seq', 45, [('fake', '[5:15]/45', [fake2_entry])]))):
        self.assertEqual(structure(subject), expected)
def test_simulateAlignment_root_sequence(self):
    """provide a root sequence for simulating an alignment"""
    def use_root_seq(root_sequence):
        # the simulated 'root' entry must match the supplied root sequence
        al = LoadSeqs(data={'a': 'ggaatt', 'c': 'cctaat'})
        t = LoadTree(treestring="(a,c);")
        sm = substitution_model.Dinucleotide(mprob_model='tuple')
        lf = sm.makeParamController(t)
        lf.setAlignment(al)
        simalign = lf.simulateAlignment(exclude_internal=False,
                                        root_sequence=root_sequence)
        got = str(simalign.NamedSeqs['root'])
        self.assertEqual(got, str(root_sequence))

    as_instance = DNA.makeSequence('GTAATT')
    as_string = 'GTAATC'
    use_root_seq(as_instance)  # as a sequence instance
    use_root_seq(as_string)  # as a string
def CigarParser(seqs, cigars, sliced=False, ref_seqname=None, start=None,
                end=None, moltype=DNA):
    """return an alignment from raw sequences and cigar strings
    if sliced, will return an alignment correspondent to ref sequence start
    to end

    Arguments:
        seqs - raw sequences as {seqname: seq}
        cigars - corresponding cigar text as {seqname: cigar_text}
        cigars and seqs should have the same seqnames
        sliced - if True, return only the region selected through
            ref_seqname, start and end
        ref_seqname - key of the reference sequence, used when sliced
        start, end - passed to slice_cigar for the reference sequence
            with by_align=False
        moltype - optional default to DNA
    """
    data = {}
    if not sliced:
        for seqname in list(seqs.keys()):
            data[seqname] = aligned_from_cigar(cigars[seqname],
                                               seqs[seqname],
                                               moltype=moltype)
        return LoadSeqs(data=data, aligned=True)

    ref_aln_seq = aligned_from_cigar(cigars[ref_seqname],
                                     seqs[ref_seqname], moltype=moltype)
    m, aln_loc = slice_cigar(cigars[ref_seqname], start, end,
                             by_align=False)
    data[ref_seqname] = ref_aln_seq[aln_loc[0]:aln_loc[1]]
    others = [name for name in list(seqs.keys()) if name != ref_seqname]
    for seqname in others:
        m, seq_loc = slice_cigar(cigars[seqname], aln_loc[0], aln_loc[1])
        if seq_loc:
            seq = seqs[seqname]
            if isinstance(seq, str):
                seq = moltype.makeSequence(seq)
            data[seqname] = seq[seq_loc[0]:seq_loc[1]].gappedByMap(m)
        else:
            # no part of this sequence lies in the window: fill with gaps
            # of the requested moltype instead of hard-coding DNA
            data[seqname] = moltype.makeSequence(
                '-' * (aln_loc[1] - aln_loc[0]))
    return LoadSeqs(data=data, aligned=True)
def test_other_repeat(self):
    """should apply repeat feature data in a manner consistent with strand"""
    coord = dict(CoordName=13, Start=32890200, End=32890500)
    ps_repeat = self.human.getRegion(Strand=1, **coord)
    ms_repeat = self.human.getRegion(Strand=-1, **coord)
    exp = DNA.makeSequence(
        'CTTACTGTGAGGATGGGAACATTTTACAGCTGTGCTG'
        'TCCAAACCGGTGCCACTAGCCACATTAAGCACTCGAAACGTGGCTAGTGCGACTAGAGAAGAGGA'
        'TTTTCATACGATTTAGTTTCAATCACGCTAACCAGTGACGCGTGGCTAGTGG')
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12
    self.assertEqual(ms_repeat.Seq, ps_repeat.Seq.rc())

    ps_annot_seq = ps_repeat.getAnnotatedSeq(feature_types='repeat')
    ms_annot_seq = ms_repeat.getAnnotatedSeq(feature_types='repeat')
    ps_seq = ps_annot_seq.getAnnotationsMatching('repeat')[0]
    ms_seq = ms_annot_seq.getAnnotationsMatching('repeat')[0]
    self.assertEqual(ms_seq.getSlice(), ps_seq.getSlice())
    self.assertEqual(ps_seq.getSlice(), exp)
def test_other_repeat(self):
    """should apply repeat feature data in a manner consistent with strand"""
    coord = dict(CoordName=13, Start=32890200, End=32890500)
    ps_repeat = self.human.getRegion(Strand=1, **coord)
    ms_repeat = self.human.getRegion(Strand=-1, **coord)
    exp = DNA.makeSequence('CTTACTGTGAGGATGGGAACATTTTACAGCTGTGCTG'
        'TCCAAACCGGTGCCACTAGCCACATTAAGCACTCGAAACGTGGCTAGTGCGACTAGAGAAGAGGA'
        'TTTTCATACGATTTAGTTTCAATCACGCTAACCAGTGACGCGTGGCTAGTGG')
    # the plus and minus strand regions must be reverse complements;
    # assertEquals is a deprecated alias (gone in Python 3.12), so the
    # canonical assertEqual is used throughout
    self.assertEqual(ms_repeat.Seq, ps_repeat.Seq.rc())

    ps_annot_seq = ps_repeat.getAnnotatedSeq(feature_types='repeat')
    ms_annot_seq = ms_repeat.getAnnotatedSeq(feature_types='repeat')
    ps_seq = ps_annot_seq.getAnnotationsMatching('repeat')[0]
    ms_seq = ms_annot_seq.getAnnotationsMatching('repeat')[0]
    # the first repeat feature slices identically on either strand
    self.assertEqual(ms_seq.getSlice(), ps_seq.getSlice())
    self.assertEqual(ps_seq.getSlice(), exp)
def test_getByAnnotation(self):
    """getByAnnotation selects by type, by type and label, and on slices."""
    seq = DNA.makeSequence("ATCGATCGAT" * 5, Name="base")
    seq.addAnnotation(Feature, "test_type", "test_label", [(5, 10)])
    seq.addAnnotation(Feature, "test_type", "test_label2", [(15, 18)])

    # by type only: both features are returned
    by_type = list(seq.getByAnnotation("test_type"))
    self.assertEqual(len(by_type), 2)
    self.assertEqual(str(by_type[0]), "TCGAT")
    self.assertEqual(str(by_type[1]), "TCG")

    # by type and label: only the first feature
    by_label = list(seq.getByAnnotation("test_type", "test_label"))
    self.assertEqual(len(by_label), 1)
    self.assertEqual(str(by_label[0]), "TCGAT")

    # test ignoring of a partial annotation
    truncated = seq[:17]
    complete_only = list(
        truncated.getByAnnotation("test_type", ignore_partial=True))
    self.assertEqual(len(complete_only), 1)
    self.assertEqual(str(complete_only[0]), "TCGAT")
def test_other_repeat(self):
    """should apply repeat feature data in a manner consistent with strand"""
    # 13:32316063 -32316363
    coord = {'CoordName': 13, 'Start': 32316063, 'End': 32316363}
    plus_region = self.human.getRegion(Strand=1, **coord)
    minus_region = self.human.getRegion(Strand=-1, **coord)
    # note this MER3 repeat is annotated on the -1 strand
    exp = DNA.makeSequence(
        'AGCTTACTGTGAGGATGGGAACATTTTACAGCTGTGCTGTCCAAA'
        'CCGGTGCCACTAGCCACATTAAGCACTCGAAACGTGGCTAGTGCGACTAGAGAAGAGGAT'
        'TTTCATACGATTTAGTTTCAATCACGCTAACCAGTGACGCGTGGCTAGTGG')
    self.assertEqual(minus_region.Seq, plus_region.Seq.rc())

    plus_annotated = plus_region.getAnnotatedSeq(feature_types='repeat')
    minus_annotated = minus_region.getAnnotatedSeq(feature_types='repeat')
    plus_repeat = plus_annotated.getAnnotationsMatching('repeat')[0]
    minus_repeat = minus_annotated.getAnnotationsMatching('repeat')[0]
    self.assertEqual(minus_repeat.getSlice(), plus_repeat.getSlice())
    self.assertEqual(plus_repeat.getSlice(), exp)
def test_getByAnnotation(self):
    """getByAnnotation should return the annotated slices of the sequence"""
    annotated = DNA.makeSequence('ATCGATCGAT' * 5, Name='base')
    annotated.addAnnotation(Feature, 'test_type', 'test_label', [(5, 10)])
    annotated.addAnnotation(Feature, 'test_type', 'test_label2', [(15, 18)])

    # lookup by type only returns both annotated slices
    found = list(annotated.getByAnnotation('test_type'))
    self.assertEqual(len(found), 2)
    first, second = found[0], found[1]
    self.assertEqual(str(first), 'TCGAT')
    self.assertEqual(str(second), 'TCG')

    # lookup by type and label returns only the labelled slice
    found = list(annotated.getByAnnotation('test_type', 'test_label'))
    self.assertEqual(len(found), 1)
    self.assertEqual(str(found[0]), 'TCGAT')

    # test ignoring of a partial annotation
    clipped = annotated[:17]
    found = list(clipped.getByAnnotation('test_type', ignore_partial=True))
    self.assertEqual(len(found), 1)
    self.assertEqual(str(found[0]), 'TCGAT')
def _get_sequence_from_direct_assembly(coord=None, DEBUG=False):
    """Return the sequence for coord, assembled from sequence-level records.

    coord: coordinate object; its Strand attribute is set to 1 in place.
    DEBUG: when true, print the coordinate and coordinate-system details.

    Raises NoItemError when no assembly is found for coord.
    """
    # TODO clean up use of a coord
    genome = coord.genome
    # no matter what strand user provide, we get the + sequence first
    coord.Strand = 1
    species = genome.Species
    coord_type = CoordSystem(species=species, core_db=genome.CoreDb,
                             seq_level=True)

    if DEBUG:
        print('Created Coordinate:', coord, coord.EnsemblStart,
              coord.EnsemblEnd)
        print(coord.CoordType, coord_type)

    assemblies = get_coord_conversion(coord, coord_type, genome.CoreDb)
    if not assemblies:
        raise NoItemError('no assembly for %s' % coord)

    dna = genome.CoreDb.getTable('dna')
    fragments = []
    positions = []
    for query_loc, target_loc in assemblies:
        assert query_loc.Strand == 1
        span_length = len(target_loc)
        # get MySQL to do the string slicing via substr function
        sliced = substr(dna.c.sequence, target_loc.EnsemblStart,
                        span_length).label('sequence')
        query = sql.select(
            [sliced], dna.c.seq_region_id == target_loc.seq_region_id)
        record = asserted_one(query.execute().fetchall())
        fragment = DNA.makeSequence(record['sequence'])
        if target_loc.Strand == -1:
            # target is on the minus strand, so flip it back to plus
            fragment = fragment.rc()
        fragments.append(str(fragment))
        positions.append((query_loc.Start, query_loc.End))

    return _assemble_seq(fragments, coord.Start, coord.End, positions)
def _make_aln(self, orig, model=dna_model, param_vals=None,
              indel_rate=0.1, indel_length=0.5, **kw):
    """Return a progressive (tree-based) alignment of the sequences in orig.

    orig: dict mapping sequence name to raw sequence string.
    model: substitution model passed to TreeAlign.
    param_vals: passed through to TreeAlign.
    indel_rate, indel_length: passed to TreeAlign as keyword arguments.
    **kw: further keyword arguments forwarded to TreeAlign.
    """
    kw['indel_rate'] = indel_rate
    kw['indel_length'] = indel_length
    seqs = dict((key, DNA.makeSequence(value))
                for (key, value) in orig.items())
    if len(seqs) == 2:
        # A tree built from tip_names used to be created here and then
        # overwritten on the next statement; only the tree actually used
        # is built now.
        tree = cogent.LoadTree(treestring="(A:.1,B:.1)")
    else:
        tree = cogent.LoadTree(
            treestring="(((A:.1,B:.1):.1,C:.1):.1,D:.1)")
    # TreeAlign also returns a tree, which this helper does not need
    aln, _ = cogent.align.progressive.TreeAlign(
        model, seqs, tree=tree, show_progress=False,
        param_vals=param_vals, **kw)
    return aln
def get_primers(primers_data=None, primer_name=None, primer_sequence=None):
    """ Gets primers from filepath or from single specified primer name/seq

    primers_data: If specified, open file object for primers data
    primer_name: single primer name to analyze
    primer_sequence: single primer sequence to analyze

    If only the primers_data is provided, all primers in the file will be
    read and appended to a list as PyCogent DNA.Sequence objects.

    If a single primer name is provided along with the primers_data, but
    no primer_sequence, then the primers list will be populated by the
    single primer and its sequence in the provided primers_data (if not
    found an error will be raised).

    If both a primer_name and primer_sequence are provided, only a single
    primer will be constructed from these, and any primers file data passed
    will be ignored.

    Returns a list of sequence objects built with DNA.makeSequence.

    Raises ValueError when no usable input is given, when a primer name
    does not end with "f" or "r" before its first underscore, when the
    requested primer is absent from the primers data, or when no primers
    could be read from the primers data.
    """
    # User must specify a primers filepath, or a primer_name and
    # primer_sequence, error check for this.
    if not primers_data and not (primer_name and primer_sequence):
        raise ValueError(
            "Missing primer(s) data. User must specify either "
            "a primers filepath, or a primer name and sequence. See the -P, -p,"
            " and -s parameters.")

    primers = []

    # Check for correct naming convention of single primer specified
    if primer_name:
        # Fix primer name if not in correct format (followed by lower case f
        # or r). Leave other components unchanged.
        primer_name = correct_primer_name(primer_name)
        # Check primer name for proper 'r' or 'f' ending
        if not primer_name.split('_')[0].endswith(('f', 'r')):
            raise ValueError(
                'Primer name %s ' % primer_name + 'does not '
                'end with "f" or "r". The initial alphanumeric name of the '
                'primer must be followed by "f" or "r". '
                'Example: 22f_archaeal')

    # If both primer name and seq provided, return single DNA sequence object
    # for that primer
    if primer_name and primer_sequence:
        primers.append(DNA.makeSequence(primer_sequence, Name=primer_name))
        return primers

    # Parse out primers data from formatted primers file, returns list
    # of tuples with (primer name, primer seq)
    raw_primers = parse_formatted_primers_data(primers_data)

    # Test all primer names for proper suffix of 'f' or 'r'
    for p in raw_primers:
        if not p[0].split('_')[0].endswith(('f', 'r')):
            raise ValueError(
                'Primer %s ' % p[0] + 'does not end '
                'with "f" or "r". The initial alphanumeric name of the '
                'primer must be followed by "f" or "r". '
                'Example: 22f_archaeal')

    # If primer_name provided, return single DNA.Sequence object with that
    # particular primer name and sequence from the primers file
    if primer_name:
        # Search raw_primers for primer name that matches one provided
        for p in raw_primers:
            if p[0] == primer_name:
                primers.append(DNA.makeSequence(p[1], Name=primer_name))
                return primers
        # If primer name not found, raise value error
        raise ValueError(
            'Primer %s ' % primer_name + 'not found in input '
            'primers file, please add to primers file or specify sequence '
            'with the -s parameter.')

    # If not using a single primer, build all primers in input primers file
    for p in raw_primers:
        primers.append(DNA.makeSequence(p[1], p[0]))

    # Raise error if nothing built from input file
    if not primers:
        raise ValueError(
            'No primers were read from input primers file, '
            'please check file format.')

    return primers
def test_short(self):
    """global alignment of two short sequences, tried in both orders"""
    first = DNA.makeSequence('tacagta', Name="A")
    second = DNA.makeSequence('tacgtc', Name="B")
    alignments = self._aligned_both_ways(first, second, local=False)
    for aln in alignments:
        self.assertEqual(matchedColumns(aln), 5)
        self.assertEqual(len(aln), 7)
def assign_dna_reads_to_protein_database(query_fasta_fp, database_fasta_fp,
                                         output_fp, temp_dir="/tmp",
                                         params=None):
    """Assign DNA reads to a database fasta of protein sequences.

    Wraps assign_reads_to_database, setting database and query types. All
    parameters are set to default unless params is passed. A temporary
    file must be written containing the translated sequences from the input
    query fasta file because BLAT cannot do this automatically.

    query_fasta_fp: absolute path to the query fasta file containing DNA
                   sequences.
    database_fasta_fp: absolute path to the database fasta file containing
                      protein sequences.
    output_fp: absolute path where the output file will be generated.
    temp_dir: optional. Change the location where the translated sequences
              will be written before being used as the query. Defaults to
              /tmp.
    params: optional. dict containing parameter settings to be used
                  instead of default values. Cannot change database or query
                  file types from protein and dna, respectively. An optional
                  'genetic_code' entry selects the genetic code used for
                  translation (defaults to 1). The dict is not modified.

    This method returns an open file object. The output format
    defaults to blast9 and should be parsable by the PyCogent BLAST parsers.

    Raises ApplicationError if temp_dir is not absolute or if params tries
    to set '-t' or '-q'.
    """
    if params is None:
        params = {}

    # the query is translated before the search, so both sides are protein
    my_params = {'-t': 'prot', '-q': 'prot'}

    # make sure temp_dir specifies an absolute path
    if not isabs(temp_dir):
        raise ApplicationError("temp_dir must be an absolute path.")

    # if the user specified parameters other than default, then use them.
    # However, if they try to change the database or query types, raise an
    # application error.
    if '-t' in params or '-q' in params:
        raise ApplicationError(
            "Cannot change database or query types "
            "when using assign_dna_reads_to_protein_database. Use "
            "assign_reads_to_database instead.")

    # work on a copy so the caller's dict is left untouched when the
    # 'genetic_code' entry is stripped out
    params = dict(params)
    my_genetic_code = GeneticCodes[params.pop('genetic_code', 1)]
    my_params.update(params)

    # get six-frame translation of the input DNA sequences and write them to
    # temporary file.
    frames = [1, 2, 3, -1, -2, -3]
    tmp = get_tmp_filename(tmp_dir=temp_dir, result_constructor=str)
    tmp_out = open(tmp, 'w')
    try:
        # both handles are closed even if parsing or translation fails
        with tmp_out:
            with open(query_fasta_fp) as query_fasta:
                for label, sequence in MinimalFastaParser(query_fasta):
                    seq_id = label.split()[0]
                    s = DNA.makeSequence(sequence)
                    translations = my_genetic_code.sixframes(s)
                    # entries are written in ascending frame order
                    for frame, translation in sorted(
                            zip(frames, translations)):
                        entry = '>{seq_id}_frame_{frame}\n{trans}\n'
                        entry = entry.format(seq_id=seq_id, frame=frame,
                                             trans=translation)
                        tmp_out.write(entry)

        result = assign_reads_to_database(tmp, database_fasta_fp, output_fp,
                                          params=my_params)
    finally:
        # the temporary query file is removed on success and on failure
        remove(tmp)

    return result
def matchedColumns(align):
    """Count the matched columns in an alignment"""

    def all_same(column):
        # the first non-None motif becomes the reference; any later motif
        # that differs from it means the column is not a match
        reference = None
        for motif in column:
            if reference is None:
                reference = motif
                continue
            if motif != reference:
                return False
        return True

    matched = align.filtered(all_same)
    return len(matched)


seq1 = DNA.makeSequence('aaaccggacattacgtgcgta', Name='FAKE01')
seq2 = DNA.makeSequence('ccggtcaggttacgtacgtt', Name='FAKE02')


class AlignmentTestCase(unittest.TestCase):
    def _aligned_both_ways(self, seq1, seq2, **kw):
        """Align the pair in both argument orders; return both alignments."""
        scores = make_dna_scoring_dict(10, -1, -8)
        forward = classic_align_pairwise(seq1, seq2, scores, 10, 2, **kw)
        swapped = classic_align_pairwise(seq2, seq1, scores, 10, 2, **kw)
        return [forward, swapped]

    def test_local(self):
        alignments = self._aligned_both_ways(seq1, seq2, local=True)
        for aln in alignments:
            self.assertEqual(matchedColumns(aln), 15)
            self.assertEqual(len(aln), 19)