def check_variant_strand(var_details, intron_locs):
    """Complement the alleles of variants located in reverse-strand introns.

    Parameters
    ----------
    var_details
        list of variant records; element 2 of each record is the position
        and elements 3 and 4 are passed to ``DNA.complement``. The list is
        sorted in place by position and modified records are replaced by
        5-element tuples.
    intron_locs
        iterable of intron records where element 1 is the start, element 2
        the end and element 3 the strand (``1`` is treated as forward,
        anything else as reverse).

    Returns
    -------
    (var_details, var_locs_reversed) where var_locs_reversed holds the
    indices (into the sorted var_details) of the complemented records.
    """
    var_details.sort(key=itemgetter(2))
    count, reverse = 0, 0
    var_locs_reversed = []
    var_locs = [v[2] for v in var_details]
    for intron in intron_locs:
        count += 1
        if intron[3] == 1:
            continue  # intron is on forward strand

        reverse += 1
        intron_start = intron[1]
        intron_end = intron[2]
        # guard against an empty variant list before peeking at the first
        # position; an intron ending before it cannot contain a variant
        if not var_locs or intron_end < var_locs[0]:
            continue

        a = bisect.bisect(var_locs, intron_start)
        b = bisect.bisect(var_locs, intron_end)
        # plain range keeps the recorded indices as builtin ints
        for i in range(a, b):
            record = var_details[i]
            var_details[i] = (
                record[0],
                record[1],
                record[2],
                DNA.complement(record[3]),
                DNA.complement(record[4]),
            )
            var_locs_reversed.append(i)

    print('Number of introns processed', count)
    print('Number of reverse strand introns', reverse)
    print('Number of variants ', len(var_details))
    print('Number of variants on (-) strand', len(var_locs_reversed))
    return var_details, var_locs_reversed
def test_assemble_seq(self):
    """should correctly fill in a sequence with N's"""
    frags = ["AAAAA", "CCCCC", "GGG"]
    joined = "".join(frags)
    # each case: (fragments, start, end, fragment positions, expected text)
    cases = [
        # padding needed before the first and after the last fragment
        (frags, 10, 31, [(11, 16), (18, 23), (25, 28)], "NAAAAANNCCCCCNNGGGNNN"),
        (frags, 0, 21, [(1, 6), (8, 13), (15, 18)], "NAAAAANNCCCCCNNGGGNNN"),
        # should work with:
        # start matches first frag start
        (frags, 0, 20, [(0, 5), (7, 12), (14, 17)], "AAAAANNCCCCCNNGGGNNN"),
        # end matches last frag_end
        (frags, 10, 28, [(11, 16), (18, 23), (25, 28)], "NAAAAANNCCCCCNNGGG"),
        # both start and end matched
        (frags, 10, 27, [(10, 15), (17, 22), (24, 27)], "AAAAANNCCCCCNNGGG"),
        # one frag
        ([joined], 10, 23, [(10, 23)], joined),
    ]
    for fragments, start, end, positions, text in cases:
        expect = DNA.make_seq(text)
        got = _assemble_seq(fragments, start, end, positions)
        self.assertEqual(got, expect)
def test_gap_coords_to_map(self):
    """construct a Map from coordinates of gap alone"""
    m, seq = DNA.make_seq("-AC--GT-TTA--").parse_out_gaps()

    seqlen = 70
    gap_coords = {0: 1, 2: 2, 4: 1, 7: 2}
    got = gap_coords_to_map(gap_coords, seqlen)
    num_gaps = sum(gap_coords.values())
    self.assertEqual(len(got), seqlen + num_gaps)

    seqlen = 20
    gap_coords = {5: 2, 17: 3, 10: 2}
    got = gap_coords_to_map(gap_coords, seqlen)
    num_gaps = sum(gap_coords.values())
    self.assertEqual(len(got), num_gaps + seqlen)
    # roundtrip from Map.get_gap_coordinates()
    recovered = dict(got.get_gap_coordinates())
    self.assertEqual(recovered, gap_coords)

    # and no gaps
    m, seq = DNA.make_seq("ACGTTTA").parse_out_gaps()
    got = gap_coords_to_map({}, len(seq))
    self.assertEqual(len(got), len(m))
    self.assertEqual(got.get_coordinates(), m.get_coordinates())

    # and gaps outside sequence
    with self.assertRaises(ValueError):
        got = gap_coords_to_map({20: 1}, len(seq))
def test_gaps_at_both_ends(self):
    """global alignment of two offset slices of the same string"""
    template = "aaaccggttt"
    # A lacks the last two bases, B lacks the first two
    seq_a = DNA.make_seq(template[:-2], name="A")
    seq_b = DNA.make_seq(template[2:], name="B")
    for aln in self._aligned_both_ways(seq_a, seq_b, local=False):
        self.assertEqual(matchedColumns(aln), 6)
        self.assertEqual(len(aln), 10)
def test_local_tiebreak(self):
    """Should pick the first best-equal hit rather than the last one"""
    # so that the Pyrex and Python versions give the same result.
    scoring = make_dna_scoring_dict(match=1, transition=-1, transversion=-1)
    query = DNA.make_seq("cwc", name="pattern")
    # the target has two candidate hits for the pattern
    subject = DNA.make_seq("cactc", name="target")
    alignment = local_pairwise(query, subject, scoring, 5, 2)
    matched = str(alignment.named_seqs["target"])
    self.assertEqual(matched.lower(), "cac")
def setUp(self):
    """Create gapped sequences with their maps, cigars and an alignment."""
    gapped = DNA.make_seq("---AA---GCTTAG-A--CCT-")
    gapped1 = DNA.make_seq("CCAAAAAA---TAGT-GGC--G")

    self.cigar_text = "3D2M3D6MDM2D3MD"
    self.aln_seq = gapped
    self.aln_seq1 = gapped1
    # split each gapped sequence into its map and ungapped sequence
    self.map, self.seq = gapped.parse_out_gaps()
    self.map1, self.seq1 = gapped1.parse_out_gaps()
    self.slices = [(1, 4), (0, 8), (7, 12), (0, 1), (3, 5)]
    self.aln = make_aligned_seqs(
        dict(FAKE01=gapped, FAKE02=gapped1), array_align=False
    )
    self.cigars = dict(FAKE01=self.cigar_text, FAKE02=map_to_cigar(self.map1))
    self.seqs = dict(FAKE01=str(self.seq), FAKE02=str(self.seq1))
def test_codon(self):
    """pairwise alignment of unaligned sequences under a codon model"""
    seqs = {
        "A": DNA.make_seq("tacgccgta", name="A"),
        "B": DNA.make_seq("tacgta", name="B"),
    }
    codon_model = cogent3.evolve.substitution_model.TimeReversibleCodon(
        model_gaps=False,
        equal_motif_probs=True,
        mprob_model="conditional",
    )
    tree = cogent3.make_tree(tip_names=["A", "B"])
    lf = codon_model.make_likelihood_function(tree, aligned=False)
    lf.set_sequences(seqs)
    viterbi_path = lf.get_log_likelihood().edge.get_viterbi_path()
    aln = viterbi_path.get_alignment()
    self.assertEqual(matchedColumns(aln), 6)
    self.assertEqual(len(aln), 9)
def get_rc_record(alleles, ancestor, allele_freqs, flank_5, flank_3):
    """Return the record as seen from the opposite strand.

    Alleles, the ancestor and the keys of allele_freqs are complemented;
    the flanking sequences are reverse complemented.

    Returns
    -------
    (alleles, ancestor, allele_freqs, flank_3, flank_5) for the opposite
    strand. Note the reverse complemented 3' flank is returned before the
    reverse complemented 5' flank.
    """
    to_complement = DNA.complement
    rc_alleles = {to_complement(base) for base in alleles}
    rc_ancestor = to_complement(ancestor)
    rc_freqs = {
        to_complement(allele): freq for allele, freq in allele_freqs.items()
    }
    rc_flank_5 = str(DNA.make_seq(flank_5).rc())
    rc_flank_3 = str(DNA.make_seq(flank_3).rc())
    return rc_alleles, rc_ancestor, rc_freqs, rc_flank_3, rc_flank_5
def _make_aln(
    self,
    orig,
    model=dna_model,
    param_vals=None,
    indel_rate=0.1,
    indel_length=0.5,
    **kw,
):
    """Return the TreeAlign alignment of the sequences in ``orig``.

    ``orig`` maps names to raw sequences. Two sequences are aligned on a
    two-tip tree (A, B); any other number uses the four-tip tree
    (A, B, C, D). The indel settings are forwarded to TreeAlign along with
    any extra keyword arguments.
    """
    kw.update(indel_rate=indel_rate, indel_length=indel_length)
    seqs = {name: DNA.make_seq(raw) for name, raw in orig.items()}
    newick = (
        "(A:.1,B:.1)"
        if len(seqs) == 2
        else "(((A:.1,B:.1):.1,C:.1):.1,D:.1)"
    )
    tree = cogent3.make_tree(treestring=newick)
    aln, _ = cogent3.align.progressive.TreeAlign(
        model,
        seqs,
        tree=tree,
        param_vals=param_vals,
        show_progress=False,
        **kw,
    )
    return aln
def setUp(self):
    """Create an annotated sequence shared by the tests."""
    # A Sequence with a couple of exons on it.
    seq = DNA.make_seq(
        "AAGAAGAAGACCCCCAAAAAAAAAATTTTTTTTTTAAAAAAAAAAAAA", name="Orig"
    )
    self.s = seq
    self.exon1 = seq.add_annotation(Feature, "exon", "fred", [(10, 15)])
    self.exon2 = seq.add_annotation(Feature, "exon", "trev", [(30, 40)])
    # a feature added to the first exon rather than to the sequence
    self.nested_feature = self.exon1.add_feature("repeat", "bob", [(2, 5)])
def test_inherit_feature(self):
    """should be able to subclass and extend _Feature"""

    class NewFeat(_Feature):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)

        def newMethod(self):
            if len(self.map.spans) <= 1:
                return True
            # should create new instance of NewFeat
            return self.as_one_span().newMethod()

    seq = DNA.make_seq("ACGTACGTACGT")
    two_span = seq.add_annotation(
        NewFeat, as_map([(1, 3), (5, 7)], len(seq)), type="gene", name="abcd"
    )
    # derived features keep the subclass type
    self.assertEqual(type(two_span.as_one_span()), NewFeat)
    self.assertEqual(type(two_span.get_shadow()), NewFeat)

    one_span = seq.add_annotation(
        NewFeat, as_map([(3, 5)], len(seq)), type="gene", name="def"
    )
    covering = seq.get_region_covering_all(
        [two_span, one_span], feature_class=NewFeat
    )
    self.assertEqual(type(covering), NewFeat)
    # now use the new method
    two_span.newMethod()
def test_annotate_matches_to(self):
    """annotate_matches_to attaches annotations correctly to a Sequence"""
    pattern = "CCRC"
    # arguments shared by every call below
    shared = dict(pattern=pattern, annot_type="domain", name="fred")

    seq = DNA.make_seq("TTCCACTTCCGCTT", name="x")
    annot = seq.annotate_matches_to(allow_multiple=True, **shared)
    slices = [a.get_slice() for a in annot]
    self.assertEqual(slices, ["CCAC", "CCGC"])

    annot = seq.annotate_matches_to(allow_multiple=False, **shared)
    self.assertEqual(len(annot), 1)
    fred = annot[0].get_slice()
    self.assertEqual(str(fred), "CCAC")

    # For Sequence objects of a non-IUPAC MolType, annotate_matches_to
    # should return an empty annotation.
    seq = ASCII.make_seq(seq="TTCCACTTCCGCTT")
    annot = seq.annotate_matches_to(allow_multiple=False, **shared)
    self.assertEqual(annot, [])
def test_constructor_equivalence(self):
    """masking annotations gives consistent output, including after rc()"""
    # These different constructions should generate the same output.
    records = [["human", "CGAAACGTTT"], ["mouse", "CTAAACGTCG"]]
    alignments = [
        make_aligned_seqs(data=records, array_align=False) for _ in range(2)
    ]
    serial, itemwise = [
        aln.with_masked_annotations(["cpgsite"]) for aln in alignments
    ]
    self.assertEqual(str(serial), str(itemwise))

    # Annotations should be correctly masked,
    # whether the sequence has been reverse complemented or not.
    # We use the plus/minus strand CDS containing sequences created above.
    plus = DNA.make_seq("AAGGGGAAAACCCCCAAAAAAAAAATTTTTTTTTTAAA", name="plus")
    plus.add_annotation(Feature, "CDS", "gene", [(2, 6), (10, 15), (25, 35)])
    minus = plus.rc()
    expectations = (
        (plus, "AA????AAAA?????AAAAAAAAAA??????????AAA"),
        (minus, "TTT??????????TTTTTTTTTT?????TTTT????TT"),
    )
    for strand, masked in expectations:
        self.assertEqual(str(strand.with_masked_annotations("CDS")), masked)
def test_picklability(self):
    """Pickle an alignment containing an annotated sequence"""
    # This depends on alignments, sequences, features, maps and spans
    # Doesn't test round trip result is correct, which should possibly
    # be done for maps/spans, but seqs/alignments are just simple
    # python classes without __getstate__ etc.
    import pickle

    plain = DNA.make_seq("aagaagaagaccccca")
    annotated = DNA.make_seq("aagaagaagaccccct")
    annotated.add_feature("exon", "fred", [(10, 15)])
    aln = make_aligned_seqs(data=dict(a=plain, b=annotated))
    # TODO the ability to pickle/unpickle depends on the protocol
    # in Py3 for reasons that are not clear. This needs to be looked
    # more closely
    dumped = pickle.dumps(aln, protocol=1)
    # only checks that loading does not raise; the data was created above
    pickle.loads(dumped)
def test_translate_frames(self):
    """returns translated sequences"""
    seq = DNA.make_seq("ATGCTGACATAAA", name="fake1")
    # default genetic code
    self.assertEqual(translate_frames(seq), ["MLT*", "C*HK", "ADI"])
    # with the genetic code named "Euplotid Nuclear"
    translated = translate_frames(seq, gc="Euplotid Nuclear")
    self.assertEqual(translated, ["MLT*", "CCHK", "ADI"])
def test_stop_indexes(self):
    """should return stop codon indexes for a specified frame"""
    code = GeneticCode(self.SGC)
    seq = DNA.make_seq("ATGCTAACATAAA")
    # position in the list is the frame passed as ``start``
    per_frame = [[9], [4], []]
    for start, stops in enumerate(per_frame):
        self.assertEqual(code.get_stop_indices(seq, start=start), stops)
def test_roundtrip_variable(self):
    """should recover the Variable feature type"""
    seq = DNA.make_seq("AAGGGGAAAACCCCCAAAAAAAAAATTTTTTTTTTAAA", name="plus")
    spans_and_values = [[[2, 6], 2.4], [[10, 15], 5.1], [[25, 35], 1.3]]
    original = seq.add_annotation(Variable, "SNP", "freq", spans_and_values)

    restored = deserialise_object(seq.to_json())
    got = list(restored.get_annotations_matching("SNP"))[0]
    # annoyingly, comes back as list of lists
    expect = [[list(span), value] for span, value in original.xxy_list]
    self.assertEqual(got.xxy_list, expect)
def makeSampleSequence(with_gaps=False):
    """Return a DNA sequence annotated with a CDS and a 5' UTR feature.

    When ``with_gaps`` is True, positions 5-9 and the final two positions
    of the raw sequence are replaced with gap characters.
    """
    raw_seq = "AACCCAAAATTTTTTGGGGGGGGGGCCCC"
    if with_gaps:
        raw_seq = "".join([raw_seq[:5], "-----", raw_seq[10:-2], "--"])

    seq = DNA.make_seq(raw_seq)
    # (feature type, feature name, span)
    features = (
        ("CDS", "CDS", (15, 25)),
        ("5'UTR", "5' UTR", (12, 15)),
    )
    for kind, label, span in features:
        seq.add_annotation(Feature, kind, label, [span])
    return seq
def test_convert_input(self):
    """converts data for dotplotting"""
    gapped = "ACGGT--A"
    gap_map, ungapped = DNA.make_seq(gapped).parse_out_gaps()

    # an Aligned instance hands back the very same map and sequence
    mapped_gap, new_seq = _convert_input(Aligned(gap_map, ungapped), None)
    self.assertIs(new_seq.moltype, DNA)
    self.assertIs(mapped_gap, gap_map)
    self.assertIs(new_seq, ungapped)

    # a raw string plus a moltype gives equivalent results
    mapped_gap, new_seq = _convert_input(gapped, DNA)
    self.assertEqual(str(mapped_gap), str(gap_map))
    self.assertEqual(str(new_seq), str(ungapped))
def test_get_align_coords(self):
    """correctly returns the alignment coordinates"""

    def split(text):
        # (map, ungapped sequence) for the given raw string
        return DNA.make_seq(text).parse_out_gaps()

    # 01234  5
    # ACGGT--A
    #   012345
    # --GGTTTA
    map_1, seq_1 = split("ACGGT--A")
    map_2, seq_2 = split("--GGTTTA")
    coords = get_align_coords(map_1, map_2)
    x, y = coords
    self.assertEqual((x, y), ([2, 4, None, 5, 5], [0, 2, None, 5, 5]))

    # we have no gaps, so coords will be None
    map_1, _ = seq_1.parse_out_gaps()
    map_2, _ = seq_2.parse_out_gaps()
    self.assertEqual(get_align_coords(map_1, map_2), None)

    # unless we indicate the seqs came from an Alignment
    map_1, seq_1 = split("ACGGTTTA")
    map_2, seq_2 = split("GGGGTTTA")
    x, y = get_align_coords(map_1, map_2, aligned=True)
    length = len(seq_1)
    self.assertEqual((x, y), ([0, length], [0, length]))

    # raises an exception if the Aligned seqs are different lengths
    map_1, seq_1 = split("ACGGTTTA")
    map_2, seq_2 = split("GGGGTT")
    with self.assertRaises(AssertionError):
        get_align_coords(map_1, map_2, aligned=True)
def test_feature_from_alignment(self): """ seq features obtained from the alignment""" # Sequence features can be accessed via a containing Alignment: aln = make_aligned_seqs(data=[["x", "-AAAAAAAAA"], ["y", "TTTT--TTTT"]], array_align=False) self.assertEqual(str(aln), ">x\n-AAAAAAAAA\n>y\nTTTT--TTTT\n") exon = aln.get_seq("x").add_annotation(Feature, "exon", "fred", [(3, 8)]) aln_exons = aln.get_annotations_from_seq("x", "exon") aln_exons = aln.get_annotations_from_any_seq("exon") # But these will be returned as **alignment** # features with locations in alignment coordinates. self.assertEqual(str(exon), 'exon "fred" at [3:8]/9') self.assertEqual(str(aln_exons[0]), 'exon "fred" at [4:9]/10') self.assertEqual(str(aln_exons[0].get_slice()), ">x\nAAAAA\n>y\n--TTT\n") aln_exons[0].attach() self.assertEqual(len(aln.annotations), 1) # Similarly alignment features can be projected onto the aligned sequences, # where they may end up falling across gaps: exons = aln.get_projected_annotations("y", "exon") self.assertEqual(str(exons), '[exon "fred" at [-2-, 4:7]/8]') self.assertEqual(str(aln.get_seq("y")[exons[0].map.without_gaps()]), "TTT") # We copy the annotations from another sequence, aln = make_aligned_seqs(data=[["x", "-AAAAAAAAA"], ["y", "TTTT--CCCC"]], array_align=False) self.s = DNA.make_seq("AAAAAAAAA", name="x") exon = self.s.add_annotation(Feature, "exon", "fred", [(3, 8)]) exon = aln.get_seq("x").copy_annotations(self.s) aln_exons = list(aln.get_annotations_from_seq("x", "exon")) self.assertEqual(str(aln_exons), '[exon "fred" at [4:9]/10]') # even if the name is different. exon = aln.get_seq("y").copy_annotations(self.s) aln_exons = list(aln.get_annotations_from_seq("y", "exon")) self.assertEqual(str(aln_exons), '[exon "fred" at [3:4, 6:10]/10]') self.assertEqual(str(aln[aln_exons]), ">x\nAAAAA\n>y\nTCCCC\n") # default for get_annotations_from_any_seq is return all features got = aln.get_annotations_from_any_seq() self.assertEqual(len(got), 2)
def test_seq_shorter(self):
    """lost spans on shorter sequences"""
    # If the sequence is shorter, again you get a lost span.
    records = [["x", "-AAAAAAAAA"], ["y", "TTTT--TTTT"]]
    aln = make_aligned_seqs(data=records, array_align=False)
    # the annotated source is longer than the aligned sequences
    source = DNA.make_seq("CCCCCCCCCCCCCCCCCCCCCCCCCCCC", "x")
    source.add_feature("repeat", "A", [(12, 14)])
    aln.get_seq("y").copy_annotations(source)
    copied = list(aln.get_annotations_from_seq("y", "repeat"))
    self.assertEqual(str(copied), '[repeat "A" at [10:10, -6-]/10]')
def test(r=1, **kw):
    """Time a global classic_align_pairwise run on two repeated sequences.

    Parameters
    ----------
    r
        number of times each 10 base template is repeated
    **kw
        forwarded to classic_align_pairwise

    Returns
    -------
    ``"*"`` if the alignment raised an ArithmeticError, otherwise the
    product of the two sequence lengths divided by the elapsed seconds
    and by 1000, truncated to an int.
    """
    S = make_dna_scoring_dict(10, -1, -8)
    seq2 = DNA.make_seq("AAAATGCTTA" * r)
    seq1 = DNA.make_seq("AATTTTGCTG" * r)
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # recommended high resolution replacement for timing
    t0 = time.perf_counter()
    try:
        # return_alignment is False in order to emphasise the quadratic part of
        # the work.
        classic_align_pairwise(
            seq1, seq2, S, 10, 2, local=False, return_alignment=False, **kw
        )
    except ArithmeticError:
        return "*"
    else:
        t = time.perf_counter() - t0
        return int((len(seq1) * len(seq2)) / t / 1000)
def test_lost_spans(self):
    """features no longer included in an alignment represented by lost spans"""
    # If the feature lies outside the sequence being copied to, you get a
    # lost span
    records = [["x", "-AAAA"], ["y", "TTTTT"]]
    aln = make_aligned_seqs(data=records, array_align=False)
    source = DNA.make_seq("CCCCCCCCCCCCCCCCCCCC", "x")
    source.add_feature("exon", "A", [(5, 8)])
    aln.get_seq("x").copy_annotations(source)
    copied = list(aln.get_annotations_from_seq("x", "exon"))
    self.assertEqual(str(copied), '[exon "A" at [5:5, -4-]/5]')
    lost_slice = copied[0].get_slice()
    self.assertEqual(str(lost_slice), ">x\n----\n>y\n----\n")
def test_seq_different_name_with_same_length(self):
    """copying features between sequences"""
    # You can copy to a sequence with a different name,
    # in a different alignment if the feature lies within the length
    records = [["x", "-AAAAAAAAA"], ["y", "TTTT--TTTT"]]
    aln = make_aligned_seqs(data=records, array_align=False)
    source = DNA.make_seq("CCCCCCCCCCCCCCCCCCCC", "x")
    source.add_feature("exon", "A", [(5, 8)])
    # the source is named "x" but the annotations are copied onto "y"
    aln.get_seq("y").copy_annotations(source)
    copied = list(aln.get_annotations_from_seq("y", "exon"))
    self.assertEqual(str(copied), '[exon "A" at [7:10]/10]')
def test_score_seq_obj(self):
    """produce correct score from seq"""
    from cogent3 import DNA

    bases = "ACTG"
    # one row per position, one column per character of ``bases``
    weights = [
        [0.1, 0.3, 0.5, 0.1],
        [0.25, 0.25, 0.25, 0.25],
        [0.05, 0.8, 0.05, 0.1],
        [0.7, 0.1, 0.1, 0.1],
        [0.6, 0.15, 0.05, 0.2],
    ]
    pssm = PSSM(weights, bases)
    indices = [3, 1, 2, 0, 2, 2, 3]
    seq = DNA.make_seq("".join(bases[i] for i in indices))
    got = pssm.score_seq(seq)
    assert_allclose(got, [-4.481, -5.703, -2.966], atol=1e-3)
def CigarParser(seqs, cigars, sliced=False, ref_seqname=None, start=None,
                end=None, moltype=DNA):
    """return an alignment from raw sequences and cigar strings

    if sliced, will return an alignment correspondent to ref sequence start
    to end

    Parameters
    ----------
    seqs
        raw sequences as {seqname: seq}
    cigars
        corresponding cigar text as {seqname: cigar_text}; cigars and seqs
        should have the same seqnames
    sliced
        if True, only the region of the alignment corresponding to
        start:end on the reference sequence is returned
    ref_seqname
        name of the reference sequence, used when sliced is True
    start, end
        slice coordinates passed to slice_cigar for the reference sequence
        (with by_align=False)
    moltype
        optional, defaults to DNA

    Returns
    -------
    the result of make_aligned_seqs on the aligned (and optionally sliced)
    sequences
    """
    data = {}
    if not sliced:
        for seqname in seqs:
            data[seqname] = aligned_from_cigar(
                cigars[seqname], seqs[seqname], moltype=moltype
            )
    else:
        ref_aln_seq = aligned_from_cigar(
            cigars[ref_seqname], seqs[ref_seqname], moltype=moltype
        )
        m, aln_loc = slice_cigar(cigars[ref_seqname], start, end, by_align=False)
        data[ref_seqname] = ref_aln_seq[aln_loc[0]:aln_loc[1]]
        aln_length = aln_loc[1] - aln_loc[0]
        for seqname in seqs:
            if seqname == ref_seqname:
                continue
            m, seq_loc = slice_cigar(cigars[seqname], aln_loc[0], aln_loc[1])
            if seq_loc:
                seq = seqs[seqname]
                if isinstance(seq, str):
                    seq = moltype.make_seq(seq)
                data[seqname] = seq[seq_loc[0]:seq_loc[1]].gapped_by_map(m)
            else:
                # no sequence in the region: fill with gaps, using the
                # requested moltype so all members of the alignment agree
                data[seqname] = moltype.make_seq("-" * aln_length)
    aln = make_aligned_seqs(data)
    return aln
def get_var_data(aln, variant, ref_name, aln_flank, min_length, chroms):
    """Build the output record for one variant, or None if it is rejected.

    Parameters
    ----------
    aln
        alignment source passed to get_syntenic_alignment
    variant
        list of 8 fields: name, chromosome, exon strand, effects, alleles
        (separated by '/'), 5' flank, 3' flank and coordinates (comma
        separated, the first value is used as the start). ``['']`` denotes
        an empty record.
    ref_name, aln_flank, min_length
        forwarded to the alignment helpers
    chroms
        passed to is_correct_chrom together with the variant chromosome

    Returns
    -------
    None if the record is empty, is_correct_chrom rejects the chromosome,
    the alignment fails align_checker, or no end state is found; otherwise
    a 15 element tuple of the variant fields, start/end states and GC
    content, with placeholder values for the fields not computed here.
    """
    if variant == ['']:
        return None

    (
        var_name,
        var_chrom,
        exon_strand,
        var_effects,
        var_alleles,
        flank_5_seq,
        flank_3_seq,
        var_coord,
    ) = variant
    if not is_correct_chrom(chroms, var_chrom):
        return None

    var_alleles = set(var_alleles.split('/'))
    var_start = int(var_coord.split(',')[0])

    # get alignment
    syn_aln = get_syntenic_alignment(aln, var_name, var_start, ref_name,
                                     aln_flank)
    syn_aln = LoadSeqs(data=copy.deepcopy(syn_aln.todict()), moltype=DNA,
                       array_align=False)

    # check alignments and only keep the alignment meet requirements
    checked_aln = align_checker(syn_aln, ref_name, aln_flank, min_length)
    if not checked_aln:
        return None

    start_base = get_start_state(checked_aln, ref_name, aln_flank)
    end_base = get_end_state(start_base, var_alleles)
    # a falsy end state (including the empty string) means no usable result;
    # this replaces a redundant identity comparison against '' which is
    # a SyntaxWarning and was unreachable
    if not end_base:
        return None

    nbr_seq = DNA.make_seq(flank_5_seq + flank_3_seq)
    gc_content = get_gc(nbr_seq)

    # placeholders for fields that are not derived in this function
    allele_freqs = pep_alleles = gene_loc = gene_id = 'None'
    response = '-1'

    return (var_name, var_chrom, exon_strand, var_effects, allele_freqs,
            str(var_alleles), str(start_base), str(end_base),
            str(flank_5_seq), str(flank_3_seq), str(gc_content),
            pep_alleles, gene_loc, gene_id, response)
def _reverse_complement(table): '''returns a table with sequences reverse complemented''' pos_indices = [i for i, c in enumerate( table.header) if c.startswith('pos')] rows = table.tolist() for row in rows: # we use the cogent3 DnaSeq object to do reverse complementing seq = DNA.make_seq(''.join(row[i] for i in pos_indices)) seq = list(seq.rc()) for i, index in enumerate(pos_indices): row[index] = seq[i] if rows: new = make_table(header=table.header, rows=rows) else: new = None return new
def test_roundtrip_json(self):
    """features can roundtrip from json"""
    from cogent3.util.deserialise import deserialise_seq

    seq = DNA.make_seq("AAAAATATTATTGGGT")
    seq.add_annotation(Feature, "exon", "myname", [(0, 5)])
    restored = deserialise_object(seq.to_json())
    exon = restored.get_annotations_matching("exon")[0]
    self.assertEqual(str(exon.get_slice()), "AAAAA")

    # now with a list span
    seq = seq[3:]
    # the sliced sequence must still report the feature before serialising
    exon = seq.get_annotations_matching("exon")[0]
    restored = deserialise_object(seq.to_json())
    exon = restored.get_annotations_matching("exon")[0]
    self.assertEqual(str(exon.get_slice(complete=False)), "AA")