def test_assemble_seq(self):
    """should correctly fill in a sequence with N's"""
    frags = ["AAAAA", "CCCCC", "GGG"]
    joined = ''.join(frags)
    # each case: (fragments, start, end, fragment positions, expected)
    cases = [
        # padding required on both sides
        (frags, 10, 31, [(11, 16), (18, 23), (25, 28)],
         "NAAAAANNCCCCCNNGGGNNN"),
        (frags, 0, 21, [(1, 6), (8, 13), (15, 18)],
         "NAAAAANNCCCCCNNGGGNNN"),
        # start matches first frag start
        (frags, 0, 20, [(0, 5), (7, 12), (14, 17)],
         "AAAAANNCCCCCNNGGGNNN"),
        # end matches last frag_end
        (frags, 10, 28, [(11, 16), (18, 23), (25, 28)],
         "NAAAAANNCCCCCNNGGG"),
        # both start and end matched
        (frags, 10, 27, [(10, 15), (17, 22), (24, 27)],
         "AAAAANNCCCCCNNGGG"),
        # one frag
        ([joined], 10, 23, [(10, 23)], joined),
    ]
    for pieces, start, end, positions, raw_expect in cases:
        expect = DNA.makeSequence(raw_expect)
        self.assertEqual(_assemble_seq(pieces, start, end, positions),
                         expect)
def test_gaps_at_both_ends(self):
    """global alignment of two offset substrings: 6 matches over 10 columns"""
    full = 'aaaccggttt'
    # A lacks the last two bases, B lacks the first two
    left = DNA.makeSequence(full[:-2], Name="A")
    right = DNA.makeSequence(full[2:], Name="B")
    for aln in self._aligned_both_ways(left, right, local=False):
        self.assertEqual(matchedColumns(aln), 6)
        self.assertEqual(len(aln), 10)
def test_assemble_seq(self):
    """should correctly fill in a sequence with N's"""
    frags = ["AAAAA", "CCCCC", "GGG"]

    def check(pieces, start, end, positions, raw_expect):
        # compare the assembled sequence against the expected DNA sequence
        expect = DNA.makeSequence(raw_expect)
        self.assertEqual(_assemble_seq(pieces, start, end, positions),
                         expect)

    both_padded = "NAAAAANNCCCCCNNGGGNNN"
    check(frags, 10, 31, [(11, 16), (18, 23), (25, 28)], both_padded)
    check(frags, 0, 21, [(1, 6), (8, 13), (15, 18)], both_padded)
    # should work with:
    # start matches first frag start
    check(frags, 0, 20, [(0, 5), (7, 12), (14, 17)],
          "AAAAANNCCCCCNNGGGNNN")
    # end matches last frag_end
    check(frags, 10, 28, [(11, 16), (18, 23), (25, 28)],
          "NAAAAANNCCCCCNNGGG")
    # both start and end matched
    check(frags, 10, 27, [(10, 15), (17, 22), (24, 27)],
          "AAAAANNCCCCCNNGGG")
    # one frag
    single = ''.join(frags)
    check([single], 10, 23, [(10, 23)], single)
def test_local_tiebreak(self):
    """Should pick the first best-equal hit rather than the last one"""
    # so that the Pyrex and Python versions give the same result.
    scores = make_dna_scoring_dict(match=1, transition=-1,
                                   transversion=-1)
    query = DNA.makeSequence('cwc', Name='pattern')
    # the target offers two equally good hits for the pattern
    target = DNA.makeSequence('cactc', Name='target')
    alignment = local_pairwise(query, target, scores, 5, 2)
    matched = str(alignment.NamedSeqs['target'])
    self.assertEqual(matched.lower(), 'cac')
def setUp(self):
    """Create two gapped sequences plus derived maps, cigars and alignment"""
    gapped = DNA.makeSequence('---AA---GCTTAG-A--CCT-')
    gapped1 = DNA.makeSequence('CCAAAAAA---TAGT-GGC--G')

    self.cigar_text = '3D2M3D6MDM2D3MD'
    self.aln_seq = gapped
    self.aln_seq1 = gapped1
    # parseOutGaps gives back the gap map together with the ungapped sequence
    self.map, self.seq = gapped.parseOutGaps()
    self.map1, self.seq1 = gapped1.parseOutGaps()
    self.slices = [(1, 4), (0, 8), (7, 12), (0, 1), (3, 5)]
    self.aln = LoadSeqs(data={"FAKE01": gapped, "FAKE02": gapped1})
    self.cigars = {
        "FAKE01": self.cigar_text,
        "FAKE02": map_to_cigar(self.map1),
    }
    self.seqs = {
        "FAKE01": str(self.seq),
        "FAKE02": str(self.seq1),
    }
def test_local_tiebreak(self):
    """Should pick the first best-equal hit rather than the last one"""
    # so that the Pyrex and Python versions give the same result.
    scoring = dict(match=1, transition=-1, transversion=-1)
    score_matrix = make_dna_scoring_dict(**scoring)
    pattern_seq = DNA.makeSequence('cwc', Name='pattern')
    target_seq = DNA.makeSequence('cactc', Name='target')
    aligned = local_pairwise(pattern_seq, target_seq, score_matrix, 5, 2)
    # of the two equal-scoring hits in the target, the first is expected
    found = aligned.NamedSeqs['target']
    self.assertEqual(str(found).lower(), 'cac')
def _make_utr_seq(self):
    """Cache the 5' and 3' UTR sequences under "Utr5" and "Utr3".

    When neither end has untranslated exons, both cache entries are set
    to ``self.NULL_VALUE``. Otherwise each entry is the concatenation of
    ``exon.Seq`` over the untranslated exons of that end; an end whose
    exon list is None contributes an empty DNA sequence.
    """
    exons5 = self.UntranslatedExons5
    exons3 = self.UntranslatedExons3
    if exons5 is None and exons3 is None:
        self._cached["Utr5"] = self.NULL_VALUE
        self._cached["Utr3"] = self.NULL_VALUE
        return

    Utr5_seq, Utr3_seq = DNA.makeSequence(""), DNA.makeSequence("")
    # Only one of the two ends may be None at this point; iterating None
    # would raise TypeError, so each end is checked separately.
    if exons5 is not None:
        for exon in exons5:
            Utr5_seq += exon.Seq
    if exons3 is not None:
        for exon in exons3:
            Utr3_seq += exon.Seq

    self._cached["Utr5"] = Utr5_seq
    self._cached["Utr3"] = Utr3_seq
def process_uclust_pw_alignment_results(fasta_pairs_lines, uc_lines):
    """ Process results of uclust search and align

    fasta_pairs_lines: lines of the pairwise-alignment fasta output, read
     two records at a time via get_next_two_fasta_records
    uc_lines: lines of the uc output; only 'H' records are processed

    Yields (query_id, target_id, aligned_query, aligned_target, percent_id).
    For '-' strand hits both aligned sequences are reverse complemented
    and ' RC' is appended to the query id.

    Raises UclustParseError for an unknown strand symbol, or when the ids
    in the fasta records do not match the ids of the current uc hit.
    """
    alignments = get_next_two_fasta_records(fasta_pairs_lines)
    for hit in get_next_record_type(uc_lines, 'H'):
        matching_strand = hit[4]
        if matching_strand == '-':
            strand_id = '-'
            target_rev_match = True
        elif matching_strand == '+':
            strand_id = '+'
            target_rev_match = False
        elif matching_strand == '.':
            # protein sequence, so no strand information
            strand_id = ''
            target_rev_match = False
        else:
            # call-style raise is valid on both Python 2 and Python 3
            raise UclustParseError(
                "Unknown strand type: %s" % matching_strand)
        uc_query_id = hit[8]
        uc_target_id = hit[9]
        percent_id = float(hit[3])

        try:
            fasta_pair = next(alignments)
        except StopIteration:
            # Fasta records ran out before the uc hits did. The original
            # ended the generator here; an explicit return keeps that
            # behaviour on interpreters that apply PEP 479.
            return

        fasta_query_id = fasta_pair[0][0]
        aligned_query = fasta_pair[0][1]
        if fasta_query_id != uc_query_id:
            raise UclustParseError(
                "Order of fasta and uc files do not match."
                " Got query %s but expected %s." %
                (fasta_query_id, uc_query_id))

        fasta_target_id = fasta_pair[1][0]
        aligned_target = fasta_pair[1][1]
        # the fasta target id carries the strand symbol as a suffix
        if fasta_target_id != uc_target_id + strand_id:
            raise UclustParseError(
                "Order of fasta and uc files do not match."
                " Got target %s but expected %s." %
                (fasta_target_id, uc_target_id + strand_id))

        query_id = uc_query_id
        target_id = uc_target_id
        if target_rev_match:
            query_id = uc_query_id + ' RC'
            aligned_query = DNA.rc(aligned_query)
            aligned_target = DNA.rc(aligned_target)

        yield (query_id, target_id, aligned_query, aligned_target,
               percent_id)
def test_picklability(self):
    """Pickle an alignment containing an annotated sequence"""
    # This depends on alignments, sequences, features, maps and spans
    # Doesn't test round trip result is correct, which should possibly
    # be done for maps/spans, but seqs/alignments are just simple
    # python classes without __getstate__ etc.
    try:
        # Python 2 ships the C implementation as a separate module
        import cPickle as pickle
    except ImportError:
        # Python 3 has no cPickle module
        import pickle
    seq1 = DNA.makeSequence("aagaagaagaccccca")
    seq2 = DNA.makeSequence("aagaagaagaccccct")
    seq2.addFeature('exon', 'fred', [(10, 15)])
    aln = LoadSeqs(data={'a': seq1, 'b': seq2})
    # passes as long as dumping and loading raise no exception
    aln2 = pickle.loads(pickle.dumps(aln))
def test(r=1, **kw):
    """Time one global classic pairwise alignment.

    r: number of times each 10-base seed sequence is repeated
    kw: extra keyword arguments forwarded to classic_align_pairwise

    Returns len(seq1) * len(seq2) divided by the elapsed time in seconds
    as measured with time.time().
    """
    S = make_dna_scoring_dict(10, -1, -8)
    seq2 = DNA.makeSequence('AAAATGCTTA' * r)
    seq1 = DNA.makeSequence('AATTTTGCTG' * r)
    t0 = time.time()
    # the alignment itself is discarded, only its run time matters
    classic_align_pairwise(seq1, seq2, S, 10, 2, local=False, **kw)
    t = time.time() - t0
    # the unreachable Python 2 print statement after this return was removed
    return (len(seq1) * len(seq2)) / t
def process_uclust_pw_alignment_results(fasta_pairs_lines, uc_lines):
    """ Process results of uclust search and align

    fasta_pairs_lines: lines of the pairwise-alignment fasta output, read
     two records at a time via get_next_two_fasta_records
    uc_lines: lines of the uc output; only 'H' records are processed

    Yields (query_id, target_id, aligned_query, aligned_target, percent_id).
    For '-' strand hits both aligned sequences are reverse complemented
    and ' RC' is appended to the query id.

    Raises UclustParseError for an unknown strand symbol, or when the ids
    in the fasta records do not match the ids of the current uc hit.
    """
    alignments = get_next_two_fasta_records(fasta_pairs_lines)
    for hit in get_next_record_type(uc_lines, 'H'):
        matching_strand = hit[4]
        if matching_strand == '-':
            strand_id = '-'
            target_rev_match = True
        elif matching_strand == '+':
            strand_id = '+'
            target_rev_match = False
        elif matching_strand == '.':
            # protein sequence, so no strand information
            strand_id = ''
            target_rev_match = False
        else:
            # call-style raise is valid on both Python 2 and Python 3
            raise UclustParseError(
                "Unknown strand type: %s" % matching_strand)
        uc_query_id = hit[8]
        uc_target_id = hit[9]
        percent_id = float(hit[3])

        try:
            fasta_pair = next(alignments)
        except StopIteration:
            # Fasta records ran out before the uc hits did. The original
            # ended the generator here; an explicit return keeps that
            # behaviour on interpreters that apply PEP 479.
            return

        (fasta_query_id, aligned_query) = fasta_pair[0][0], fasta_pair[0][1]
        if fasta_query_id != uc_query_id:
            raise UclustParseError(
                "Order of fasta and uc files do not match."
                " Got query %s but expected %s." %
                (fasta_query_id, uc_query_id))

        (fasta_target_id, aligned_target) = fasta_pair[1][0], fasta_pair[1][1]
        # the fasta target id carries the strand symbol as a suffix
        expected_target_id = uc_target_id + strand_id
        if fasta_target_id != expected_target_id:
            raise UclustParseError(
                "Order of fasta and uc files do not match."
                " Got target %s but expected %s." %
                (fasta_target_id, expected_target_id))

        query_id = uc_query_id
        target_id = uc_target_id
        if target_rev_match:
            query_id = uc_query_id + ' RC'
            aligned_query = DNA.rc(aligned_query)
            aligned_target = DNA.rc(aligned_target)

        yield (query_id, target_id, aligned_query, aligned_target,
               percent_id)
def test(r=1, **kw):
    """Time one global classic pairwise alignment.

    r: number of times each 10-base seed sequence is repeated
    kw: extra keyword arguments forwarded to classic_align_pairwise

    Returns len(seq1) * len(seq2) divided by the elapsed time in seconds
    as measured with time.time().
    """
    S = make_dna_scoring_dict(10, -1, -8)
    seq2 = DNA.makeSequence("AAAATGCTTA" * r)
    seq1 = DNA.makeSequence("AATTTTGCTG" * r)
    t0 = time.time()
    # the alignment itself is discarded, only its run time matters
    classic_align_pairwise(seq1, seq2, S, 10, 2, local=False, **kw)
    t = time.time() - t0
    # the unreachable Python 2 print statement after this return was removed
    return (len(seq1) * len(seq2)) / t
def test_picklability(self):
    """Pickle an alignment containing an annotated sequence"""
    # This depends on alignments, sequences, features, maps and spans
    # Doesn't test round trip result is correct, which should possibly
    # be done for maps/spans, but seqs/alignments are just simple
    # python classes without __getstate__ etc.
    import pickle as pickle
    plain = DNA.makeSequence("aagaagaagaccccca")
    annotated = DNA.makeSequence("aagaagaagaccccct")
    annotated.addFeature('exon', 'fred', [(10, 15)])
    aln = LoadSeqs(data={'a': plain, 'b': annotated})
    # passes as long as dumping and loading raise no exception
    dumped = pickle.dumps(aln)
    aln2 = pickle.loads(dumped)
def test_codon(self):
    """codon model on unaligned 9- and 6-base seqs: 6 matches, length 9"""
    seqs = dict(A=DNA.makeSequence('tacgccgta', Name="A"),
                B=DNA.makeSequence('tacgta', Name="B"))
    codon_model = cogent.evolve.substitution_model.Codon(
        model_gaps=False, equal_motif_probs=True,
        mprob_model='conditional')
    tree = cogent.LoadTree(tip_names=['A', 'B'])
    lf = codon_model.makeLikelihoodFunction(tree, aligned=False)
    lf.setSequences(seqs)
    # the alignment is read off the Viterbi path of the root edge
    path = lf.getLogLikelihood().edge.getViterbiPath()
    aln = path.getAlignment()
    self.assertEqual(matchedColumns(aln), 6)
    self.assertEqual(len(aln), 9)
def test_codon(self):
    """codon model on unaligned 9- and 6-base seqs: 6 matches, length 9"""
    seqs = dict(A=DNA.makeSequence('tacgccgta', Name="A"),
                B=DNA.makeSequence('tacgta', Name="B"))
    codon_model = cogent.evolve.substitution_model.Codon(
        model_gaps=False, equal_motif_probs=True,
        mprob_model='conditional')
    tree = cogent.LoadTree(tip_names=['A', 'B'])
    lf = codon_model.makeLikelihoodFunction(tree, aligned=False)
    lf.setSequences(seqs)
    # only the alignment half of the (score, alignment) pair is checked
    result = lf.getLogLikelihood().edge.getViterbiScoreAndAlignment()
    (score, aln) = result
    self.assertEqual(matchedColumns(aln), 6)
    self.assertEqual(len(aln), 9)
def test(r=1, **kw):
    """Benchmark one global classic pairwise alignment.

    r: number of times each 10-base seed sequence is repeated
    kw: extra keyword arguments forwarded to classic_align_pairwise

    Returns int(len(seq1) * len(seq2) / seconds / 1000), or the string
    '*' when the alignment raises ArithmeticError.
    """
    # time.clock was removed in Python 3.8; use perf_counter where it
    # exists and fall back to clock only on interpreters that lack it.
    timer = getattr(time, 'perf_counter', None) or time.clock
    S = make_dna_scoring_dict(10, -1, -8)
    seq2 = DNA.makeSequence('AAAATGCTTA' * r)
    seq1 = DNA.makeSequence('AATTTTGCTG' * r)
    t0 = timer()
    try:
        # return_alignment is False in order to emphasise the quadratic
        # part of the work.
        classic_align_pairwise(seq1, seq2, S, 10, 2, local=False,
                               return_alignment=False, **kw)
    except ArithmeticError:
        return '*'
    else:
        t = timer() - t0
        return int((len(seq1) * len(seq2)) / t / 1000)
def process_barcode_single_end_data(read1_data,
                                    output_bc_fastq,
                                    output_fastq1,
                                    bc1_len=6,
                                    rev_comp_bc1=False):
    """ Processes, writes single-end barcode data, parsed sequence

    read1_data: list of header, read, quality scores
    output_bc_fastq: open output fastq filepath
    output_fastq1: open output fastq reads filepath
    bc1_len: length of barcode to remove from beginning of data
    rev_comp_bc1: reverse complement barcode before writing.
    """
    # the first bc1_len characters of the read and quality are the barcode
    barcode = read1_data[1][:bc1_len]
    barcode_qual = read1_data[2][:bc1_len]
    if rev_comp_bc1:
        # qualities are reversed so they stay paired with their bases
        barcode = DNA.rc(barcode)
        barcode_qual = barcode_qual[::-1]

    header = read1_data[0]
    output_bc_fastq.write(format_fastq_record(header, barcode, barcode_qual))

    # everything after the barcode is written as the read itself
    remaining_seq = read1_data[1][bc1_len:]
    remaining_qual = read1_data[2][bc1_len:]
    output_fastq1.write(
        format_fastq_record(read1_data[0], remaining_seq, remaining_qual))
def makeSampleSequence():
    """Return a DNA sequence annotated with one exon and two repeat units"""
    raw = ('tgccnwsrygagcgtgttaaacaatggccaactctctaccttcctatgtt'
           'aaacaagtgagatcgcaggcgcgccaaggc')
    seq = DNA.makeSequence(raw)
    # (feature type, feature name, span) for every annotation to attach
    features = (
        ('exon', 'exon', (20, 35)),
        ('repeat_unit', 'repeat_unit', (39, 49)),
        ('repeat_unit', 'rep2', (49, 60)),
    )
    for feat_type, feat_name, span in features:
        seq.addAnnotation(annotation.Feature, feat_type, feat_name, [span])
    return seq
def test_inherit_feature(self):
    """should be able to subclass and extend _Feature"""
    class NewFeat(_Feature):
        def __init__(self, *args, **kwargs):
            super(NewFeat, self).__init__(*args, **kwargs)

        def newMethod(self):
            if len(self.map.spans) <= 1:
                return True
            # asOneSpan should create a new instance of NewFeat
            return self.asOneSpan().newMethod()

    seq = DNA.makeSequence('ACGTACGTACGT')
    two_spans = as_map([(1, 3), (5, 7)], len(seq))
    f = seq.addAnnotation(NewFeat, two_spans, type='gene', Name='abcd')
    # derived features keep the subclass type
    for derived in (f.asOneSpan(), f.getShadow()):
        self.assertEqual(type(derived), NewFeat)

    one_span = as_map([(3, 5)], len(seq))
    f2 = seq.addAnnotation(NewFeat, one_span, type='gene', Name='def')
    covering = seq.getRegionCoveringAll([f, f2], feature_class=NewFeat)
    self.assertEqual(type(covering), NewFeat)
    # now use the new method
    f.newMethod()
def parse_illumina_line(l, barcode_length, rev_comp_barcode,
                        barcode_in_sequence=False):
    """Parses a single line of Illumina data

    l: one ':'-delimited line; the fifth field has the form
     '<y position>#<barcode...>'
    barcode_length: number of barcode characters to keep
    rev_comp_barcode: reverse complement the barcode if True
    barcode_in_sequence: if True the barcode is the first barcode_length
     bases of the sequence and is removed from sequence and quality

    Returns a dict describing the read.
    """
    fields = l.strip().split(':')
    y_field = fields[4]
    y_parts = y_field.split('#')
    y_position = int(y_parts[0])
    sequence = fields[5]
    qual_string = fields[6]

    if not barcode_in_sequence:
        # barcode follows the '#' in the y position field
        barcode = y_parts[1][:barcode_length]
    else:
        barcode = sequence[:barcode_length]
        sequence = sequence[barcode_length:]
        qual_string = qual_string[barcode_length:]

    if rev_comp_barcode:
        barcode = DNA.rc(barcode)

    result = {}
    result['Full description'] = ':'.join(fields[:5])
    result['Machine Name'] = fields[0]
    result['Channel Number'] = int(fields[1])
    result['Tile Number'] = int(fields[2])
    result['X Position'] = int(fields[3])
    result['Y Position'] = y_position
    result['Barcode'] = barcode
    result['Full Y Position Field'] = fields[4]
    result['Sequence'] = sequence
    result['Quality Score'] = qual_string
    return result
def process_barcode_single_end_data(read1_data,
                                    output_bc_fastq,
                                    output_fastq1,
                                    bc1_len=6,
                                    rev_comp_bc1=False):
    """ Processes, writes single-end barcode data, parsed sequence

    read1_data: list of header, read, quality scores
    output_bc_fastq: open output fastq filepath
    output_fastq1: open output fastq reads filepath
    bc1_len: length of barcode to remove from beginning of data
    rev_comp_bc1: reverse complement barcode before writing.
    """
    HEADER, SEQUENCE, QUALITY = 0, 1, 2

    # barcode record: leading bc1_len characters of read and quality
    bc_read = read1_data[SEQUENCE][:bc1_len]
    bc_qual = read1_data[QUALITY][:bc1_len]
    if rev_comp_bc1:
        # qualities are reversed so they stay paired with their bases
        bc_read, bc_qual = DNA.rc(bc_read), bc_qual[::-1]
    output_bc_fastq.write(
        format_fastq_record(read1_data[HEADER], bc_read, bc_qual))

    # read record: whatever follows the barcode
    output_fastq1.write(
        format_fastq_record(read1_data[HEADER],
                            read1_data[SEQUENCE][bc1_len:],
                            read1_data[QUALITY][bc1_len:]))
def parse_illumina_single_end_read_file(read_file, barcode_length,
                                        max_bad_run_length, quality_threshold,
                                        min_per_read_length, rev_comp,
                                        rev_comp_barcode, barcode_in_seq,
                                        barcode_max_N=0, seq_max_N=0):
    """Parses Illumina single-end read file

    Yields (read description, barcode, sequence, quality) for every line
    of read_file that passes the barcode and sequence filters.
    """
    for line in read_file:
        parsed = parse_illumina_line(line, barcode_length,
                                     rev_comp_barcode, barcode_in_seq)
        description = illumina_read_description_from_read_data(parsed)
        barcode = parsed['Barcode']

        # drop reads whose barcode has too many N's
        if barcode.count('N') > barcode_max_N:
            continue

        seq, qual = read_qual_score_filter(
            parsed['Sequence'], parsed['Quality Score'],
            max_bad_run_length, quality_threshold)

        # drop reads that end up too short or with too many N's
        if len(seq) < min_per_read_length:
            continue
        if seq.count('N') > seq_max_N:
            continue

        if rev_comp:
            # qualities are reversed so they stay paired with their bases
            seq, qual = DNA.rc(seq), qual[::-1]

        yield description, barcode, seq, qual
def CigarParser(seqs, cigars, sliced=False, ref_seqname=None, start=None,
                end=None, moltype=DNA):
    """return an alignment from raw sequences and cigar strings
    if sliced, will return an alignment correspondent to ref sequence start to end

    Arguments:
        seqs - raw sequences as {seqname: seq}
        cigars - corresponding cigar text as {seqname: cigar_text}
        cigars and seqs should have the same seqnames
        sliced - if True, restrict the result to the region start:end
          of the sequence named ref_seqname
        ref_seqname, start, end - only used when sliced is True
        moltype - optional default to DNA
    """
    data = {}
    if not sliced:
        for seqname in seqs.keys():
            aligned_seq = aligned_from_cigar(cigars[seqname],
                                             seqs[seqname], moltype=moltype)
            data[seqname] = aligned_seq
    else:
        ref_aln_seq = aligned_from_cigar(cigars[ref_seqname],
                                         seqs[ref_seqname], moltype=moltype)
        # translate the reference start:end into alignment coordinates
        m, aln_loc = slice_cigar(cigars[ref_seqname], start, end,
                                 by_align=False)
        data[ref_seqname] = ref_aln_seq[aln_loc[0]:aln_loc[1]]
        for seqname in [seqname for seqname in seqs.keys()
                        if seqname != ref_seqname]:
            m, seq_loc = slice_cigar(cigars[seqname], aln_loc[0], aln_loc[1])
            if seq_loc:
                seq = seqs[seqname]
                if isinstance(seq, str):
                    seq = moltype.makeSequence(seq)
                data[seqname] = seq[seq_loc[0]:seq_loc[1]].gappedByMap(m)
            else:
                # nothing falls inside the slice: fill with gaps, using the
                # requested moltype rather than a hard-coded DNA
                data[seqname] = moltype.makeSequence(
                    '-' * (aln_loc[1] - aln_loc[0]))
    aln = LoadSeqs(data=data, aligned=True)
    return aln
def parse_illumina_line(l,
                        barcode_length,
                        rev_comp_barcode,
                        barcode_in_sequence=False):
    """Parses a single line of Illumina data

    l: one ':'-delimited line; the fifth field has the form
     '<y position>#<barcode...>'
    barcode_length: number of barcode characters to keep
    rev_comp_barcode: reverse complement the barcode if True
    barcode_in_sequence: if True the barcode is the first barcode_length
     bases of the sequence and is removed from sequence and quality

    Returns a dict describing the read.
    """
    fields = l.strip().split(':')
    y_subfields = fields[4].split('#')
    y_position = int(y_subfields[0])
    sequence, qual_string = fields[5], fields[6]

    if barcode_in_sequence:
        # split the barcode off the front of both sequence and quality
        barcode = sequence[:barcode_length]
        sequence = sequence[barcode_length:]
        qual_string = qual_string[barcode_length:]
    else:
        # barcode follows the '#' in the y position field
        barcode = y_subfields[1][:barcode_length]

    barcode = DNA.rc(barcode) if rev_comp_barcode else barcode

    return {
        'Full description': ':'.join(fields[:5]),
        'Machine Name': fields[0],
        'Channel Number': int(fields[1]),
        'Tile Number': int(fields[2]),
        'X Position': int(fields[3]),
        'Y Position': y_position,
        'Barcode': barcode,
        'Full Y Position Field': fields[4],
        'Sequence': sequence,
        'Quality Score': qual_string,
    }
def findBestSeq(seqobject):
    """Pick the frame whose translation has the longest stop-free prefix.

    seqobject: object exposing ``seq`` (converted with str()) and ``id``

    Only the first three entries returned by standard_code.sixframes are
    considered (presumably the forward frames -- confirm against
    sixframes). Each translation is cut at its first '*'.

    Returns (longestseq, correctdnaseq): the longest prefix found and the
    DNA sequence sliced from the offset of the frame that produced it.
    When every considered prefix is empty, ('', the full sequence) is
    returned.
    """
    my_seq = DNA.makeSequence(str(seqobject.seq), seqobject.id)
    all_six = standard_code.sixframes(my_seq)
    # keep only the part of each translation before the first stop
    seqlist = [frame.split('*')[0] for frame in all_six]

    longestseq = ''
    # Default to frame 0 so the result is defined even when no frame
    # yields a non-empty prefix (previously an UnboundLocalError).
    correctdnaseq = my_seq
    for offset, candidate in enumerate(seqlist[:3]):
        # strict comparison: on ties the earliest frame wins
        if len(longestseq) < len(candidate):
            longestseq = candidate
            correctdnaseq = my_seq[offset:]
    return longestseq, correctdnaseq
def test_inherit_feature(self):
    """should be able to subclass and extend _Feature"""
    class NewFeat(_Feature):
        def __init__(self, *args, **kwargs):
            super(NewFeat, self).__init__(*args, **kwargs)

        def newMethod(self):
            multi_span = len(self.map.spans) > 1
            if not multi_span:
                return True
            # should create new instance of NewFeat
            merged = self.asOneSpan()
            return merged.newMethod()

    seq = DNA.makeSequence('ACGTACGTACGT')
    f = seq.addAnnotation(NewFeat,
                          as_map([(1, 3), (5, 7)], len(seq)),
                          type='gene',
                          Name='abcd')
    self.assertEqual(type(f.asOneSpan()), NewFeat)
    self.assertEqual(type(f.getShadow()), NewFeat)
    f2 = seq.addAnnotation(NewFeat,
                           as_map([(3, 5)], len(seq)),
                           type='gene',
                           Name='def')
    region = seq.getRegionCoveringAll([f, f2], feature_class=NewFeat)
    self.assertEqual(type(region), NewFeat)
    # now use the new method
    f.newMethod()
def _get_flanking_seq_data(self):
    """Cache the up/down flanking sequences under 'FlankingSeq'"""
    # maps to flanking_sequence through variation_feature_id
    # if this fails, we grab from genomic sequence
    variation_id = self._table_rows['variation_feature']['variation_id']
    table = self.flanking_sequence_table
    query = sql.select([table], table.c.variation_id == variation_id)
    record = asserted_one(query.execute())
    self._table_rows['flanking_sequence'] = record

    up_seq = record['up_seq']
    down_seq = record['down_seq']
    # the following two lines are because -- wait for it -- someone has
    # entered the string 'NULL' instead of NULL in the MySQL tables!!!
    up_seq = None if up_seq == 'NULL' else up_seq
    down_seq = None if down_seq == 'NULL' else down_seq

    seqs = dict(up=up_seq, down=down_seq)
    for name, seq in seqs.items():
        if seq is None:
            # no stored flank: take it from the genome on the right side
            upstream, downstream = (-301, -1), (1, 301)
            if self.Location.Strand == -1:
                upstream, downstream = downstream, upstream
            resized = downstream if name == 'down' else upstream
            flank = self.Location.resized(*resized)
            seq = self.genome.getRegion(region=flank).Seq
        else:
            seq = DNA.makeSequence(seq)
        seqs[name] = seq

    # keep the 300 positions adjacent to the variant on each side
    self._cached['FlankingSeq'] = (seqs['up'][-300:], seqs['down'][:300])
def _generate_unambiguous_sequences(self):
    """Fill self.conserved_sequences with disambiguated sequences.

    Every entry of self._CONSERVED_SEQUENCES (position -> sequence) is
    passed through self._disambiguate, which may return one sequence or
    a list of them; each result is stored keyed by its string form.
    Afterwards the reverse complement of every stored sequence is added
    with rc=True.
    """
    for pos, seq in self._CONSERVED_SEQUENCES.items():
        ret = self._disambiguate(DNA.makeSequence(seq))
        expanded = ret if isinstance(ret, list) else [ret]
        for dnaseq_r in expanded:
            self.conserved_sequences[str(dnaseq_r)] = \
                ConservedSequence(dnaseq_r, pos)

    # Iterate over a snapshot: entries are added to the dict in the loop.
    for seq, con_seq in list(self.conserved_sequences.items()):
        # NOTE(review): assumes rc() returns a new sequence rather than
        # reversing in place (as PyCogent sequences do) -- confirm. The
        # previous code discarded the return value and so re-stored the
        # forward sequence flagged as rc=True.
        rc_seq = DNA.makeSequence(seq).rc()
        rc_key = str(rc_seq)
        if rc_key in self.conserved_sequences:
            # e.g. a sequence equal to its own reverse complement; keep
            # the existing entry instead of replacing it
            continue
        self.conserved_sequences[rc_key] = \
            ConservedSequence(rc_seq, con_seq.pos, rc=True)
def rc_fasta_lines(fasta_lines, seq_desc_mapper=append_rc):
    """ Yield (mapped id, reverse complemented sequence) per fasta record

    fasta_lines: lines handed to parse_fasta
    seq_desc_mapper: function applied to every sequence id

    Sequences are upper-cased before being reverse complemented.
    """
    for original_id, sequence in parse_fasta(fasta_lines):
        mapped_id = seq_desc_mapper(original_id)
        yield mapped_id, DNA.rc(sequence.upper())
def test_stop_indexes(self):
    """should return stop codon indexes for a specified frame"""
    code = GeneticCode(self.SGC)
    seq = DNA.makeSequence('ATGCTAACATAAA')
    # expected stop indices for frames 0, 1 and 2
    expected = ([9], [4], [])
    for frame in range(len(expected)):
        self.assertEqual(code.getStopIndices(seq, start=frame),
                         expected[frame])
def makeSampleSequence():
    """Return a 100-base DNA sequence with one exon and two repeat units"""
    seq = DNA.makeSequence('aaaccggttt' * 10)
    # (feature type, feature name, span) for every annotation to attach
    features = (
        ('exon', 'exon', (20, 35)),
        ('repeat_unit', 'repeat_unit', (39, 49)),
        ('repeat_unit', 'rep2', (49, 60)),
    )
    for feat_type, feat_name, span in features:
        seq.addAnnotation(annotation.Feature, feat_type, feat_name, [span])
    return seq
def test_stop_indexes(self):
    """should return stop codon indexes for a specified frame"""
    sgc = GeneticCode(self.SGC)
    seq = DNA.makeSequence("ATGCTAACATAAA")
    # frame offset paired with the stop indices expected for it
    cases = [(0, [9]), (1, [4]), (2, [])]
    for frame, expect in cases:
        got = sgc.getStopIndices(seq, start=frame)
        self.assertEqual(got, expect)
def adjust_alignment(template, candidate, new_gaps):
    """adjust template/candidate aln to remove gaps added by pairwise alignment

    This step adjusts the alignment to reduce the length back to the
    template alignment length by introducing local misalignments to
    remove gap characters that are present in the pairwise alignment
    but not in the template alignment.

    template, candidate: the pairwise-aligned sequences
    new_gaps: positions in template to delete; processed from last to
     first. The list is no longer reversed in place.

    Returns (template, candidate) as DNA sequences.
    """
    template_l = list(template)
    candidate_l = list(candidate)
    # reversed() walks the positions back to front without mutating the
    # caller's list, which new_gaps.reverse() used to do
    for pos in reversed(new_gaps):
        del template_l[pos]
        del candidate_l[nearest_gap(candidate_l, pos)]

    return (DNA.makeSequence(''.join(template_l)),
            DNA.makeSequence(''.join(candidate_l)))
def adjust_alignment(template, candidate, new_gaps):
    """adjust template/candidate aln to remove gaps added by pairwise alignment

    This step adjusts the alignment to reduce the length back to the
    template alignment length by introducing local misalignments to
    remove gap characters that are present in the pairwise alignment
    but not in the template alignment.

    template, candidate: the pairwise-aligned sequences
    new_gaps: positions in template to delete; processed from last to
     first. The list is no longer reversed in place.

    Returns (template, candidate) as DNA sequences.
    """
    template_l = list(template)
    candidate_l = list(candidate)
    # reversed() walks the positions back to front without mutating the
    # caller's list, which new_gaps.reverse() used to do
    for pos in reversed(new_gaps):
        del template_l[pos]
        del candidate_l[nearest_gap(candidate_l, pos)]

    return (DNA.makeSequence(''.join(template_l)),
            DNA.makeSequence(''.join(candidate_l)))
def makeSampleSequence():
    """Return a DNA sequence annotated with one exon and two repeat units"""
    seq = DNA.makeSequence(
        'tgccnwsrygagcgtgttaaacaatggccaactctctaccttcctatgtt'
        'aaacaagtgagatcgcaggcgcgccaaggc')

    def annotate_span(feat_type, feat_name, span):
        # attach one single-span Feature to the sample sequence
        return seq.addAnnotation(annotation.Feature, feat_type, feat_name,
                                 [span])

    annotate_span('exon', 'exon', (20, 35))
    annotate_span('repeat_unit', 'repeat_unit', (39, 49))
    annotate_span('repeat_unit', 'rep2', (49, 60))
    return seq
def makeSampleSequence(with_gaps=False):
    """Return a DNA sequence annotated with a CDS and a 5' UTR feature

    with_gaps: replace positions 5-10 and the last two positions by gaps
    """
    raw_seq = 'AACCCAAAATTTTTTGGGGGGGGGGCCCC'
    if with_gaps:
        raw_seq = ''.join([raw_seq[:5], '-----', raw_seq[10:-2], '--'])
    seq = DNA.makeSequence(raw_seq)
    # (feature type, feature name, span)
    features = (
        ('CDS', 'CDS', (15, 25)),
        ("5'UTR", "5' UTR", (12, 15)),
    )
    for feat_type, feat_name, span in features:
        seq.addAnnotation(Feature, feat_type, feat_name, [span])
    return seq
def makeSampleSequence(with_gaps=False):
    """Return a DNA sequence annotated with a CDS and a 5' UTR feature

    with_gaps: replace positions 5-10 and the last two positions by gaps
    """
    base = 'AACCCAAAATTTTTTGGGGGGGGGGCCCC'
    cds_span, utr_span = (15, 25), (12, 15)
    # same length either way: gaps replace bases rather than being inserted
    gapped = base[:5] + '-----' + base[10:-2] + '--'
    seq = DNA.makeSequence(gapped if with_gaps else base)
    seq.addAnnotation(Feature, 'CDS', 'CDS', [cds_span])
    seq.addAnnotation(Feature, "5'UTR", "5' UTR", [utr_span])
    return seq
def test_maps_on_maps(self):
    """features nested on features should survive slicing of the sequence"""
    seq = DNA.makeSequence("ATCGATCGAT" * 5, Name="base")
    feat1 = annotate(seq, 10, 20, "fake")
    feat2 = annotate(feat1, 3, 5, "fake2")
    feat3 = annotate(seq, 1, 3, "left")

    seq2 = seq[5:]
    # the nested feature is described relative to its 10-long parent
    nested = [("fake2", "[3:5]/10")]
    expect_full = ("seq", 50, [("fake", "[10:20]/50", nested),
                               ("left", "[1:3]/50")])
    expect_sliced = ("seq", 45, [("fake", "[5:15]/45", nested)])
    self.assertEqual(structure(seq), expect_full)
    self.assertEqual(structure(seq2), expect_sliced)
def introduce_terminal_gaps(template, aligned_template, aligned_candidate):
    """ introduce terminal gaps from template into the aligned candidate seq
    """
    def count_leading_gaps(chars):
        # number of '-' characters before the first non-gap character
        total = 0
        for c in chars:
            if c != '-':
                break
            total += 1
        return total

    # 5' gaps of the original template, minus those the pairwise aligned
    # template already has (because we don't need to add these)
    five_prime_gaps_to_add = (count_leading_gaps(template) -
                              count_leading_gaps(aligned_template))

    # same computation for the 3' end, scanning from the back
    three_prime_gaps_to_add = (
        count_leading_gaps(reversed(template)) -
        count_leading_gaps(reversed(aligned_template)))

    # return the sequence with the 5' and 3' gaps added
    padded = ('-' * five_prime_gaps_to_add +
              str(aligned_candidate) +
              '-' * three_prime_gaps_to_add)
    return DNA.makeSequence(padded, Name=aligned_candidate.Name)
def introduce_terminal_gaps(template, aligned_template, aligned_candidate):
    """ introduce terminal gaps from template into the aligned candidate seq
    """
    def terminal_gaps(seq, from_end=False):
        # count consecutive '-' at the start (or at the end) of seq
        chars = reversed(seq) if from_end else seq
        n = 0
        for c in chars:
            if c == '-':
                n += 1
            else:
                break
        return n

    # gaps still missing at each end: gaps in the original aligned
    # template minus gaps already present in the pairwise aligned template
    five_prime_gaps_to_add = \
        terminal_gaps(template) - terminal_gaps(aligned_template)
    three_prime_gaps_to_add = \
        terminal_gaps(template, from_end=True) - \
        terminal_gaps(aligned_template, from_end=True)

    # return the sequence with the 5' and 3' gaps added
    parts = ['-' * five_prime_gaps_to_add,
             str(aligned_candidate),
             '-' * three_prime_gaps_to_add]
    return DNA.makeSequence(''.join(parts), Name=aligned_candidate.Name)
def get_reverse_primers(id_map):
    """ Return a dictionary with barcodes and rev-complement of rev primers

    id_map: dict whose values are dicts holding 'BarcodeSequence' and
     'ReversePrimer'
    """
    rev_primers = {}
    for sample_data in id_map.values():
        # Convert to reverse complement of the primer so its in the
        # proper orientation with the input fasta sequences
        barcode = sample_data['BarcodeSequence']
        rev_primers[barcode] = DNA.rc(sample_data['ReversePrimer'])
    return rev_primers
def test(r=1, **kw):
    """Benchmark one global classic pairwise alignment.

    r: number of times each 10-base seed sequence is repeated
    kw: extra keyword arguments forwarded to classic_align_pairwise

    Returns int(len(seq1) * len(seq2) / seconds / 1000), or the string
    '*' when the alignment raises ArithmeticError.
    """
    # time.clock was removed in Python 3.8; use perf_counter where it
    # exists and fall back to clock only on interpreters that lack it.
    timer = getattr(time, 'perf_counter', None) or time.clock
    S = make_dna_scoring_dict(10, -1, -8)
    seq2 = DNA.makeSequence('AAAATGCTTA' * r)
    seq1 = DNA.makeSequence('AATTTTGCTG' * r)
    t0 = timer()
    try:
        # return_alignment is False in order to emphasise the quadratic
        # part of the work.
        classic_align_pairwise(seq1, seq2, S, 10, 2, local=False,
                               return_alignment=False, **kw)
    except ArithmeticError:
        return '*'
    else:
        t = timer() - t0
        return int((len(seq1) * len(seq2)) / t / 1000)
def _get_sequence(self):
    """Return the sequence for self.Location, caching it under 'Seq'"""
    if 'Seq' in self._cached:
        return self._cached['Seq']

    try:
        seq = get_sequence(self.Location)
    except NoItemError:
        try:
            # retry using the assembly exception coordinates
            seq = get_sequence(
                assembly_exception_coordinate(self.Location))
        except NoItemError:
            # nothing available: stand in with N's of the same length
            seq = DNA.makeSequence("N" * len(self))
    seq.Name = str(self.Location)
    self._cached['Seq'] = seq
    return self._cached['Seq']
def test_maps_on_maps(self):
    """features nested on features should survive slicing of the sequence"""
    seq = DNA.makeSequence('ATCGATCGAT' * 5, Name='base')
    feat1 = annotate(seq, 10, 20, 'fake')
    feat2 = annotate(feat1, 3, 5, 'fake2')
    feat3 = annotate(seq, 1, 3, 'left')
    seq2 = seq[5:]

    # (sequence, expected structure) pairs
    checks = [
        (seq, ('seq', 50, [('fake', '[10:20]/50', [('fake2', '[3:5]/10')]),
                           ('left', '[1:3]/50')])),
        (seq2, ('seq', 45, [('fake', '[5:15]/45',
                             [('fake2', '[3:5]/10')])])),
    ]
    for subject, expect in checks:
        self.assertEqual(structure(subject), expect)
def makeSampleSequence(mid_gaps=False):
    """Return a DNA sequence annotated with a CDS and a 5' UTR feature

    mid_gaps: replace positions 5-10 by gaps and use the alternative
     feature coordinates
    """
    raw_seq = 'AACCCAAAATTTTTTGGGGGGGGGGCCCC'
    if mid_gaps:
        raw_seq = ''.join([raw_seq[:5], '-----', raw_seq[10:]])
        # annotations only make sense when they're on the raw sequence
        cds, utr = (10, 20), (5, 8)
    else:
        cds, utr = (15, 25), (12, 15)
    seq = DNA.makeSequence(raw_seq)
    seq.addAnnotation(Feature, 'CDS', 'CDS', [cds])
    seq.addAnnotation(Feature, "5'UTR", "5' UTR", [utr])
    return seq
def _make_aln(self, orig, model=dna_model, param_vals=None,
              indel_rate=0.1, indel_length=0.5, **kw):
    """Progressively align the sequences in orig and return the alignment.

    orig: {name: raw sequence string}; two entries are aligned on the
     tree (A,B), anything else on the four-taxon tree over A, B, C, D
    model: substitution model handed to TreeAlign
    param_vals: passed through to TreeAlign
    indel_rate, indel_length: passed to TreeAlign through kw
    kw: further keyword arguments for TreeAlign
    """
    kw['indel_rate'] = indel_rate
    kw['indel_length'] = indel_length
    seqs = dict((key, DNA.makeSequence(value))
                for (key, value) in orig.items())
    # the tree built from tip_names used to be created here and was
    # immediately overwritten; only the explicit tree strings are used
    if len(seqs) == 2:
        tree = cogent.LoadTree(treestring="(A:.1,B:.1)")
    else:
        tree = cogent.LoadTree(
            treestring="(((A:.1,B:.1):.1,C:.1):.1,D:.1)")
    aln, tree = cogent.align.progressive.TreeAlign(
        model, seqs, tree=tree, param_vals=param_vals,
        show_progress=False, **kw)
    return aln
def remove_template_terminal_gaps(candidate, template):
    """Remove template terminal gaps and corresponding bases in candidate

    candidate, template: aligned sequences of equal length

    Returns (candidate, template) trimmed to the span between the first
    and last non-gap template positions. The candidate is renamed to
    '<name><delimiter><start>..<end>', where the delimiter is ':' when
    the name ends with 'RC' and ' ' otherwise. Empty input is returned
    unchanged.

    Raises ValueError when the two sequences differ in length.
    """
    if len(template) != len(candidate):
        # call-style raise is valid on both Python 2 and Python 3
        raise ValueError(
            "Sequences must be aligned, but their "
            "lengths aren't equal. %d != %d" %
            (len(candidate), len(template)))

    if len(template) == 0:
        return candidate, template

    degapped_candidate_len = len(candidate.degap())

    candidate = DNA.makeSequence(candidate)
    template = DNA.makeSequence(template)

    template_gap_vector = template.gapVector()
    first_non_gap = template_gap_vector.index(False)
    num_three_prime_gaps = template_gap_vector[::-1].index(False)
    last_non_gap = len(template_gap_vector) - num_three_prime_gaps

    # Construct the candidate name, which will include the range of bases
    # from the original sequence
    candidate = candidate[first_non_gap:last_non_gap]
    template = template[first_non_gap:last_non_gap]
    candidate_start_pos = first_non_gap + 1
    candidate_end_pos = degapped_candidate_len - num_three_prime_gaps
    candidate_name = candidate.Name
    if candidate_name.endswith('RC'):
        name_delimiter = ':'
    else:
        name_delimiter = ' '
    candidate_name = '%s%s%d..%d' % \
        (candidate_name, name_delimiter, candidate_start_pos,
         candidate_end_pos)

    return DNA.makeSequence(candidate, Name=candidate_name), template
def remove_template_terminal_gaps(candidate, template):
    """Remove template terminal gaps and corresponding bases in candidate

    candidate, template: aligned sequences of equal length

    Returns (candidate, template) trimmed to the span between the first
    and last non-gap template positions. The candidate is renamed to
    '<name><delimiter><start>..<end>', where the delimiter is ':' when
    the name ends with 'RC' and ' ' otherwise. Empty input is returned
    unchanged.

    Raises ValueError when the two sequences differ in length.
    """
    if len(template) != len(candidate):
        # call-style raise is valid on both Python 2 and Python 3
        raise ValueError(
            "Sequences must be aligned, but their "
            "lengths aren't equal. %d != %d" %
            (len(candidate), len(template)))

    if len(template) == 0:
        return candidate, template

    degapped_candidate_len = len(candidate.degap())

    candidate = DNA.makeSequence(candidate)
    template = DNA.makeSequence(template)

    template_gap_vector = template.gapVector()
    first_non_gap = template_gap_vector.index(False)
    num_three_prime_gaps = template_gap_vector[::-1].index(False)
    last_non_gap = len(template_gap_vector) - num_three_prime_gaps

    # Construct the candidate name, which will include the range of bases
    # from the original sequence
    candidate = candidate[first_non_gap:last_non_gap]
    template = template[first_non_gap:last_non_gap]
    candidate_start_pos = first_non_gap + 1
    candidate_end_pos = degapped_candidate_len - num_three_prime_gaps
    candidate_name = candidate.Name
    name_delimiter = ':' if candidate_name.endswith('RC') else ' '
    candidate_name = '%s%s%d..%d' % \
        (candidate_name, name_delimiter, candidate_start_pos,
         candidate_end_pos)

    return DNA.makeSequence(candidate, Name=candidate_name), template
def _assemble_seq(frags, start, end, frag_positions): """returns a single string in which missing sequence is replaced by 'N'""" prev_end = start assert len(frag_positions) == len(frags), "Mismatched number of "\ "fragments and positions" assembled = [] for index, (frag_start, frag_end) in enumerate(frag_positions): diff = frag_start - prev_end assert diff >= 0, 'fragment position start < previous end: %s, %s' %\ (frag_start, prev_end) assembled += ['N' * diff, frags[index]] prev_end = frag_end diff = end - frag_end assert diff >= 0, 'end[%s] < previous frag_end[%s]' % (end, frag_end) assembled += ['N' * diff] return DNA.makeSequence(''.join(assembled))
def test_simulateAlignment_root_sequence(self):
    """the simulated alignment's root should be the supplied root sequence"""

    def check_root(root_seq):
        # Fit a dinucleotide model to a two-sequence alignment, simulate
        # with the given root and compare the simulated 'root' entry to it.
        aln = LoadSeqs(data={'a': 'ggaatt', 'c': 'cctaat'})
        tree = LoadTree(treestring="(a,c);")
        model = substitution_model.Dinucleotide(mprob_model='tuple')
        lf = model.makeParamController(tree)
        lf.setAlignment(aln)
        simulated = lf.simulateAlignment(exclude_internal=False,
                                         root_sequence=root_seq)
        self.assertEqual(str(simulated.NamedSeqs['root']), str(root_seq))

    # the root may be supplied as a sequence instance or as a string
    for root_seq in (DNA.makeSequence('GTAATT'), 'GTAATC'):
        check_root(root_seq)
def test_maps_on_maps(self):
    """check the annotation structure of a sequence and of a slice of it"""
    base = DNA.makeSequence('ATCGATCGAT' * 5, Name='base')
    # 'fake2' is placed on the 'fake' feature, not directly on the sequence
    outer = annotate(base, 10, 20, 'fake')
    annotate(outer, 3, 5, 'fake2')
    annotate(base, 1, 3, 'left')
    tail = base[5:]

    expect_base = ('seq', 50, [
        ('fake', '[10:20]/50', [('fake2', '[3:5]/10')]),
        ('left', '[1:3]/50'),
    ])
    self.assertEqual(structure(base), expect_base)

    # after dropping the first 5 bases 'left' is expected to be absent and
    # 'fake' to be shifted by 5
    expect_tail = ('seq', 45, [
        ('fake', '[5:15]/45', [('fake2', '[3:5]/10')]),
    ])
    self.assertEqual(structure(tail), expect_tail)
def CigarParser(seqs, cigars, sliced=False, ref_seqname=None, start=None,
                end=None, moltype=DNA):
    """return an alignment from raw sequences and cigar strings

    if sliced, will return an alignment correspondent to ref sequence start
    to end

    Arguments:
        seqs - raw sequences as {seqname: seq}
        cigars - corresponding cigar text as {seqname: cigar_text}
            cigars and seqs should have the same seqnames
        sliced - if True, only the region given by start and end on the
            reference sequence is returned
        ref_seqname - name of the reference sequence, used when sliced
        start, end - bounds of the slice on the reference sequence, used
            when sliced
        moltype - molecular type used to build sequences, defaults to DNA
    """
    data = {}
    if not sliced:
        for seqname in seqs:
            data[seqname] = aligned_from_cigar(cigars[seqname],
                                               seqs[seqname],
                                               moltype=moltype)
    else:
        ref_aln_seq = aligned_from_cigar(cigars[ref_seqname],
                                         seqs[ref_seqname], moltype=moltype)
        # NOTE(review): by_align=False presumably means start/end are
        # reference sequence (not alignment) coordinates -- confirm against
        # slice_cigar. aln_loc is then used as the span on the alignment.
        _, aln_loc = slice_cigar(cigars[ref_seqname], start, end,
                                 by_align=False)
        aln_start, aln_end = aln_loc[0], aln_loc[1]
        data[ref_seqname] = ref_aln_seq[aln_start:aln_end]
        for seqname in seqs:
            if seqname == ref_seqname:
                continue
            m, seq_loc = slice_cigar(cigars[seqname], aln_start, aln_end)
            if seq_loc:
                seq = seqs[seqname]
                if isinstance(seq, str):
                    seq = moltype.makeSequence(seq)
                data[seqname] = seq[seq_loc[0]:seq_loc[1]].gappedByMap(m)
            else:
                # No location for this sequence within the slice: fill the
                # row with gaps, using the requested moltype so all rows
                # share one molecular type.
                data[seqname] = moltype.makeSequence(
                    '-' * (aln_end - aln_start))
    return LoadSeqs(data=data, aligned=True)
def get_rev_primer_seqs(mapping_fp):
    """ Parses mapping file to get dictionary of SampleID:Rev primer

    mapping_fp: mapping filepath

    Returns a dict keyed by SampleID (first column of each mapping row)
    whose values are lists holding DNA.rc() of each comma-separated entry
    in that sample's 'ReversePrimer' column.

    Raises ValueError if process_id_map reported a duplicate SampleID or an
    invalid DNA sequence, and KeyError if the mapping data has no
    'ReversePrimer' column.
    """
    hds, mapping_data, _run_description, errors, _warnings = \
        process_id_map(mapping_fp, has_barcodes=False,
                       disable_primer_check=True)

    # Duplicate SampleIDs would make the per-sample lookup below ambiguous.
    for curr_err in errors:
        if curr_err.startswith("Duplicate SampleID"):
            raise ValueError('Errors were found with mapping file, '
                             'please run check_id_map.py to identify '
                             'problems.')

    # create dict of dicts with SampleID:{each header:mapping data}
    id_map = {}
    for curr_data in mapping_data:
        id_map[curr_data[0]] = dict(
            (header, curr_data[col]) for col, header in enumerate(hds))

    reverse_primers = {}
    for curr_id, sample_fields in id_map.items():
        # Only the column lookup is guarded, so a KeyError raised while
        # reverse complementing is not reported as a missing column.
        try:
            primer_field = sample_fields['ReversePrimer']
        except KeyError:
            raise KeyError("Reverse primer not found in mapping file, "
                           "please include a 'ReversePrimer' column.")
        reverse_primers[curr_id] = [DNA.rc(curr_rev_primer)
                                    for curr_rev_primer in
                                    primer_field.split(',')]

    # Check for valid reverse primers
    # Will have been detected as warnings from mapping file
    for curr_err in errors:
        if curr_err.startswith("Invalid DNA sequence detected"):
            raise ValueError("Problems found with reverse primers, please "
                             "check mapping file with check_id_map.py")

    return reverse_primers
def test_other_repeat(self):
    """should apply repeat feature data in a manner consistent with strand"""
    coord = dict(CoordName=13, Start=32890200, End=32890500)
    # the same region requested on the plus and on the minus strand
    ps_repeat = self.human.getRegion(Strand=1, **coord)
    ms_repeat = self.human.getRegion(Strand=-1, **coord)
    exp = DNA.makeSequence('CTTACTGTGAGGATGGGAACATTTTACAGCTGTGCTG'\
      'TCCAAACCGGTGCCACTAGCCACATTAAGCACTCGAAACGTGGCTAGTGCGACTAGAGAAGAGGA'\
      'TTTTCATACGATTTAGTTTCAATCACGCTAACCAGTGACGCGTGGCTAGTGG')

    # assertEqual is used throughout: the assertEquals alias is deprecated
    # and no longer exists in Python 3.12+.
    # the minus strand region must be the reverse complement of the plus
    self.assertEqual(ms_repeat.Seq, ps_repeat.Seq.rc())

    ps_annot_seq = ps_repeat.getAnnotatedSeq(feature_types='repeat')
    ms_annot_seq = ms_repeat.getAnnotatedSeq(feature_types='repeat')
    ps_seq = ps_annot_seq.getAnnotationsMatching('repeat')[0]
    ms_seq = ms_annot_seq.getAnnotationsMatching('repeat')[0]
    # the repeat's own sequence must not depend on the strand queried
    self.assertEqual(ms_seq.getSlice(), ps_seq.getSlice())
    self.assertEqual(ps_seq.getSlice(), exp)
def test_getByAnnotation(self):
    """getByAnnotation should return the sequence under matching features"""
    base = DNA.makeSequence('ATCGATCGAT' * 5, Name='base')
    for label, span in (('test_label', (5, 10)), ('test_label2', (15, 18))):
        base.addAnnotation(Feature, 'test_type', label, [span])

    # selecting by type alone returns both features
    found = [str(s) for s in base.getByAnnotation('test_type')]
    self.assertEqual(found, ['TCGAT', 'TCG'])

    # selecting by type and label returns only the named feature
    found = [str(s) for s in base.getByAnnotation('test_type', 'test_label')]
    self.assertEqual(found, ['TCGAT'])

    # test ignoring of a partial annotation
    truncated = base[:17]
    found = [str(s) for s in
             truncated.getByAnnotation('test_type', ignore_partial=True)]
    self.assertEqual(found, ['TCGAT'])
def test_other_repeat(self):
    """repeat features should give the same sequence from either strand"""
    # 13:32316063 -32316363
    region = dict(CoordName=13, Start=32316063, End=32316363)
    plus_region = self.human.getRegion(Strand=1, **region)
    minus_region = self.human.getRegion(Strand=-1, **region)
    # note this MER3 repeat is annotated on the -1 strand
    expected = DNA.makeSequence(
        'AGCTTACTGTGAGGATGGGAACATTTTACAGCTGTGCTGTCCAAA'
        'CCGGTGCCACTAGCCACATTAAGCACTCGAAACGTGGCTAGTGCGACTAGAGAAGAGGAT'
        'TTTCATACGATTTAGTTTCAATCACGCTAACCAGTGACGCGTGGCTAGTGG')

    # the minus strand region is the reverse complement of the plus strand
    self.assertEqual(minus_region.Seq, plus_region.Seq.rc())

    def first_repeat(genomic_region):
        # first 'repeat' annotation on the region's annotated sequence
        annotated = genomic_region.getAnnotatedSeq(feature_types='repeat')
        return annotated.getAnnotationsMatching('repeat')[0]

    plus_feature = first_repeat(plus_region)
    minus_feature = first_repeat(minus_region)
    self.assertEqual(minus_feature.getSlice(), plus_feature.getSlice())
    self.assertEqual(plus_feature.getSlice(), expected)