def test_global_pairwise_align_nucleotide_penalize_terminal_gaps(self):
    """Check that ``penalize_terminal_gaps`` changes alignment and score.

    One input is about three times as long as the other; the same pair is
    aligned twice with identical scoring, toggling only
    ``penalize_terminal_gaps``, and the resulting MSA and score are
    compared against fixed expectations.
    """
    short_seq = DNA("ACCGTGGACCGTTAGGATTGGACCCAAGGTTG")
    long_seq = DNA("T"*25 + "ACCGTGGACCGTAGGATTGGACCAAGGTTA" + "A"*25)
    scoring = dict(gap_open_penalty=5., gap_extend_penalty=0.5,
                   match_score=5, mismatch_score=-4)

    # The long sequence is expected to be laid out identically in both runs.
    long_row = ("TTTTTTTTTTTTTTTTTTTTTTTTTACCGTGGACCGT-AGGA"
                "TTGGACC-AAGGTTAAAAAAAAAAAAAAAAAAAAAAAAAA")

    # (penalize_terminal_gaps, expected short-sequence row, expected score)
    cases = [
        (False,
         "-------------------------ACCGTGGACCGTTAGGA"
         "TTGGACCCAAGGTTG-------------------------",
         131.0),
        (True,
         "-------------------------ACCGTGGACCGTTAGGA"
         "TTGGACCCAAGGTT-------------------------G",
         97.0),
    ]

    for penalize, short_row, expected_score in cases:
        msa, score, _ = global_pairwise_align_nucleotide(
            short_seq, long_seq, penalize_terminal_gaps=penalize,
            **scoring)
        self.assertEqual(
            msa, TabularMSA([DNA(short_row), DNA(long_row)]))
        self.assertEqual(score, expected_score)
def test_global_pairwise_align_nucleotide_penalize_terminal_gaps(self):
    """Check that ``penalize_terminal_gaps`` changes alignment and score.

    One input is about three times as long as the other. Warnings raised
    by the aligner are silenced for the duration of each call.
    """
    query = "ACCGTGGACCGTTAGGATTGGACCCAAGGTTG"
    subject = "T"*25 + "ACCGTGGACCGTAGGATTGGACCAAGGTTA" + "A"*25

    # The longer sequence is expected to look the same in both alignments.
    subject_row = ("TTTTTTTTTTTTTTTTTTTTTTTTTACCGTGGACCGT-AGGA"
                   "TTGGACC-AAGGTTAAAAAAAAAAAAAAAAAAAAAAAAAA")

    # (penalize_terminal_gaps, expected query row, expected score)
    cases = [
        (False,
         "-------------------------ACCGTGGACCGTTAGGA"
         "TTGGACCCAAGGTTG-------------------------",
         131.0),
        (True,
         "-------------------------ACCGTGGACCGTTAGGA"
         "TTGGACCCAAGGTT-------------------------G",
         97.0),
    ]

    for penalize, query_row, expected_score in cases:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            result = global_pairwise_align_nucleotide(
                query, subject, gap_open_penalty=5.,
                gap_extend_penalty=0.5, match_score=5, mismatch_score=-4,
                penalize_terminal_gaps=penalize)
        self.assertEqual(str(result[0]), query_row)
        self.assertEqual(str(result[1]), subject_row)
        self.assertEqual(result.score(), expected_score)
def test_global_pairwise_align_nucleotide_penalize_terminal_gaps(self):
    """Check that ``penalize_terminal_gaps`` changes alignment and score.

    One input is about three times as long as the other; both runs share
    the same scoring parameters and differ only in the toggled flag.
    """
    seq1 = "ACCGTGGACCGTTAGGATTGGACCCAAGGTTG"
    seq2 = "T"*25 + "ACCGTGGACCGTAGGATTGGACCAAGGTTA" + "A"*25

    def check(penalize, expected_rows, expected_score):
        # Align with the shared scoring scheme and verify rows, then score.
        aligned = global_pairwise_align_nucleotide(
            seq1, seq2, gap_open_penalty=5., gap_extend_penalty=0.5,
            match_score=5, mismatch_score=-4,
            penalize_terminal_gaps=penalize)
        for row_index, expected_row in enumerate(expected_rows):
            self.assertEqual(str(aligned[row_index]), expected_row)
        self.assertEqual(aligned.score(), expected_score)

    # terminal gaps are free
    check(False,
          ("-------------------------ACCGTGGACCGTTAGGA"
           "TTGGACCCAAGGTTG-------------------------",
           "TTTTTTTTTTTTTTTTTTTTTTTTTACCGTGGACCGT-AGGA"
           "TTGGACC-AAGGTTAAAAAAAAAAAAAAAAAAAAAAAAAA"),
          131.0)

    # terminal gaps are charged
    check(True,
          ("-------------------------ACCGTGGACCGTTAGGA"
           "TTGGACCCAAGGTT-------------------------G",
           "TTTTTTTTTTTTTTTTTTTTTTTTTACCGTGGACCGT-AGGA"
           "TTGGACC-AAGGTTAAAAAAAAAAAAAAAAAAAAAAAAAA"),
          97.0)
def test_global_pairwise_align_nucleotide(self):
    """Exercise global nucleotide alignment on DNA and TabularMSA inputs.

    Covers two gap-open penalties, sequences carrying metadata, aligning
    a TabularMSA against a single sequence, and rejection of non-sequence
    input with ``TypeError``.
    """
    # Scoring shared by every call; only the gap-open penalty varies.
    scoring = dict(gap_extend_penalty=0.5, match_score=5, mismatch_score=-4)
    # Every case is expected to span both inputs end to end.
    full_span = [(0, 16), (0, 14)]

    msa, score, start_end = global_pairwise_align_nucleotide(
        DNA("GACCTTGACCAGGTACC"), DNA("GAACTTTGACGTAAC"),
        gap_open_penalty=5., **scoring)
    self.assertEqual(msa, TabularMSA([DNA("G-ACCTTGACCAGGTACC"),
                                      DNA("GAACTTTGAC---GTAAC")]))
    self.assertEqual(score, 41.0)
    self.assertEqual(start_end, full_span)

    msa, score, start_end = global_pairwise_align_nucleotide(
        DNA("GACCTTGACCAGGTACC"), DNA("GAACTTTGACGTAAC"),
        gap_open_penalty=10., **scoring)
    self.assertEqual(msa, TabularMSA([DNA("-GACCTTGACCAGGTACC"),
                                      DNA("GAACTTTGAC---GTAAC")]))
    self.assertEqual(score, 32.0)
    self.assertEqual(start_end, full_span)

    # DNA sequences with metadata
    msa, score, start_end = global_pairwise_align_nucleotide(
        DNA("GACCTTGACCAGGTACC", metadata={'id': "s1"}),
        DNA("GAACTTTGACGTAAC", metadata={'id': "s2"}),
        gap_open_penalty=10., **scoring)
    expected_msa = TabularMSA([
        DNA("-GACCTTGACCAGGTACC", metadata={'id': "s1"}),
        DNA("GAACTTTGAC---GTAAC", metadata={'id': "s2"}),
    ])
    self.assertEqual(msa, expected_msa)
    self.assertEqual(score, 32.0)
    self.assertEqual(start_end, full_span)

    # Align one DNA sequence and one TabularMSA, score computed manually
    profile = TabularMSA([
        DNA("GACCTTGACCAGGTACC", metadata={'id': "s1"}),
        DNA("GACCATGACCAGGTACC", metadata={'id': "s2"}),
    ])
    msa, score, start_end = global_pairwise_align_nucleotide(
        profile, DNA("GAACTTTGACGTAAC", metadata={'id': "s3"}),
        gap_open_penalty=10., **scoring)
    expected_msa = TabularMSA([
        DNA("-GACCTTGACCAGGTACC", metadata={'id': "s1"}),
        DNA("-GACCATGACCAGGTACC", metadata={'id': "s2"}),
        DNA("GAACTTTGAC---GTAAC", metadata={'id': "s3"}),
    ])
    self.assertEqual(msa, expected_msa)
    self.assertEqual(score, 27.5)
    self.assertEqual(start_end, full_span)

    # TypeError on invalid input
    self.assertRaises(TypeError, global_pairwise_align_nucleotide,
                      42, DNA("ACGT"))
    self.assertRaises(TypeError, global_pairwise_align_nucleotide,
                      DNA("ACGT"), 42)
def test_nucleotide_aligners_use_substitution_matrices(self):
    """Verify that a custom substitution matrix changes the alignment.

    Each aligner (local, then global) is run twice on the same pair of
    sequences with identical gap and match/mismatch parameters: once
    without and once with an alternate identity substitution matrix. The
    two results must differ in aligned sequences and in score.
    """
    alt_sub = make_identity_substitution_matrix(10, -10)
    seq1 = "GACCTTGACCAGGTACC"
    seq2 = "GAACTTTGACGTAAC"
    scoring = dict(gap_open_penalty=10., gap_extend_penalty=5.,
                   match_score=5, mismatch_score=-4)

    # alternate substitution matrix yields different alignment (the
    # aligned sequences and the scores are different) with local alignment
    actual_no_sub = local_pairwise_align_nucleotide(seq1, seq2, **scoring)
    actual_alt_sub = local_pairwise_align_nucleotide(
        seq1, seq2, substitution_matrix=alt_sub, **scoring)
    self.assertNotEqual(str(actual_no_sub[0]), str(actual_alt_sub[0]))
    self.assertNotEqual(str(actual_no_sub[1]), str(actual_alt_sub[1]))
    self.assertNotEqual(actual_no_sub.score(), actual_alt_sub.score())

    # alternate substitution matrix yields different alignment (the
    # aligned sequences and the scores are different) with global alignment
    #
    # The baseline here must come from the *global* aligner; comparing a
    # local baseline with a global result would pass even if the global
    # aligner ignored the substitution matrix entirely.
    actual_no_sub = global_pairwise_align_nucleotide(seq1, seq2, **scoring)
    actual_alt_sub = global_pairwise_align_nucleotide(
        seq1, seq2, substitution_matrix=alt_sub, **scoring)
    # A different global alignment does not have to change every row (one
    # sequence can keep its gap layout), so the rows are compared as a pair.
    self.assertNotEqual(
        (str(actual_no_sub[0]), str(actual_no_sub[1])),
        (str(actual_alt_sub[0]), str(actual_alt_sub[1])))
    self.assertNotEqual(actual_no_sub.score(), actual_alt_sub.score())
def test_nucleotide_aligners_use_substitution_matrices(self):
    """Verify that a custom substitution matrix changes the alignment.

    Each aligner (local, then global) is run twice on the same pair of
    sequences with identical gap and match/mismatch parameters: once
    without and once with an alternate identity substitution matrix. The
    two results must differ in aligned sequences and in score. Warnings
    raised by the aligners are silenced while they run.
    """
    alt_sub = make_identity_substitution_matrix(10, -10)
    seq1 = "GACCTTGACCAGGTACC"
    seq2 = "GAACTTTGACGTAAC"
    scoring = dict(gap_open_penalty=10., gap_extend_penalty=5.,
                   match_score=5, mismatch_score=-4)

    # alternate substitution matrix yields different alignment (the
    # aligned sequences and the scores are different) with local alignment
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        actual_no_sub = local_pairwise_align_nucleotide(
            seq1, seq2, **scoring)
        actual_alt_sub = local_pairwise_align_nucleotide(
            seq1, seq2, substitution_matrix=alt_sub, **scoring)
    self.assertNotEqual(str(actual_no_sub[0]), str(actual_alt_sub[0]))
    self.assertNotEqual(str(actual_no_sub[1]), str(actual_alt_sub[1]))
    self.assertNotEqual(actual_no_sub.score(), actual_alt_sub.score())

    # alternate substitution matrix yields different alignment (the
    # aligned sequences and the scores are different) with global alignment
    #
    # The baseline here must come from the *global* aligner; comparing a
    # local baseline with a global result would pass even if the global
    # aligner ignored the substitution matrix entirely.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        actual_no_sub = global_pairwise_align_nucleotide(
            seq1, seq2, **scoring)
        actual_alt_sub = global_pairwise_align_nucleotide(
            seq1, seq2, substitution_matrix=alt_sub, **scoring)
    # A different global alignment does not have to change every row (one
    # sequence can keep its gap layout), so the rows are compared as a pair.
    self.assertNotEqual(
        (str(actual_no_sub[0]), str(actual_no_sub[1])),
        (str(actual_alt_sub[0]), str(actual_alt_sub[1])))
    self.assertNotEqual(actual_no_sub.score(), actual_alt_sub.score())
def global_alignments(ref, q):
    """Globally align ``q`` against ``ref`` and describe the result as CIGAR.

    Both inputs are wrapped in ``DNA`` and aligned with
    ``global_pairwise_align_nucleotide`` using ``match_score=4`` and
    ``mismatch_score=1``; the two aligned rows are then handed to
    ``alignments_to_cigar`` as plain strings.

    Parameters
    ----------
    ref : str
        Reference sequence, in a form accepted by the ``DNA`` constructor.
    q : str
        Query sequence, in a form accepted by the ``DNA`` constructor.

    Returns
    -------
    Whatever ``alignments_to_cigar`` returns for the aligned pair.
    """
    # NOTE(review): mismatch_score is positive here, so mismatches are
    # rewarded rather than penalised -- confirm this is intentional.
    alignment, _, _ = global_pairwise_align_nucleotide(
        DNA(ref), DNA(q), match_score=4, mismatch_score=1)
    # str() is the public way to obtain the aligned characters; it avoids
    # reaching into the private ``_string`` bytes and decoding them by hand.
    return alignments_to_cigar(str(alignment[0]), str(alignment[1]))
def test_nucleotide_aligners_use_substitution_matrices(self):
    """Verify that a custom substitution matrix changes the alignment.

    For the local aligner the MSA, score and start/end positions must all
    differ; for the global aligner the MSA and score must differ while
    the start/end positions stay the same.
    """
    alt_sub = make_identity_substitution_matrix(10, -10)
    scoring = dict(gap_open_penalty=10., gap_extend_penalty=5.,
                   match_score=5, mismatch_score=-4)

    def run(aligner, **extra):
        # Fresh DNA objects for every call, shared scoring plus any extras.
        kwargs = dict(scoring)
        kwargs.update(extra)
        return aligner(DNA("GACCTTGACCAGGTACC"), DNA("GAACTTTGACGTAAC"),
                       **kwargs)

    # alternate substitution matrix yields different alignment (the
    # aligned sequences and the scores are different) with local alignment
    plain_msa, plain_score, plain_span = run(
        local_pairwise_align_nucleotide)
    alt_msa, alt_score, alt_span = run(
        local_pairwise_align_nucleotide, substitution_matrix=alt_sub)
    self.assertNotEqual(plain_msa, alt_msa)
    self.assertNotEqual(plain_score, alt_score)
    self.assertNotEqual(plain_span, alt_span)

    # alternate substitution matrix yields different alignment (the
    # aligned sequences and the scores are different) with global alignment
    plain_msa, plain_score, plain_span = run(
        global_pairwise_align_nucleotide)
    alt_msa, alt_score, alt_span = run(
        global_pairwise_align_nucleotide, substitution_matrix=alt_sub)
    self.assertNotEqual(plain_msa, alt_msa)
    self.assertNotEqual(plain_score, alt_score)
    self.assertEqual(plain_span, alt_span)
def global_align(seq1_1hot, seq2_1hot):
    """Align two 1-hot encoded sequences.

    Each input is converted to a nucleotide string with
    ``dna_io.hot1_dna``, wrapped in ``DNA`` and globally aligned.

    Parameters
    ----------
    seq1_1hot, seq2_1hot
        1-hot encoded sequences in the form ``dna_io.hot1_dna`` accepts.

    Returns
    -------
    tuple of (str, str)
        The two aligned sequences as strings, in input order.
    """
    # Single source of truth for the scoring scheme; expanded as keyword
    # arguments below instead of being repeated literally in the call.
    align_opts = {
        'gap_open_penalty': 10,
        'gap_extend_penalty': 1,
        'match_score': 5,
        'mismatch_score': -4
    }

    seq1_dna = DNA(dna_io.hot1_dna(seq1_1hot))
    seq2_dna = DNA(dna_io.hot1_dna(seq2_1hot))

    # Element 0 of the aligner's result is the alignment itself.
    seq_align = global_pairwise_align_nucleotide(
        seq1_dna, seq2_dna, **align_opts)[0]

    return str(seq_align[0]), str(seq_align[1])
def dnaAlign(seq1, seq2, gap_open_penalty, gap_extend_penalty, local=False):
    """Align two DNA strings and summarise the result.

    Both inputs are upper-cased, wrapped in ``DNA`` and aligned with the
    local aligner when ``local`` is true, otherwise with the global one.
    The two penalties are forwarded positionally to the chosen aligner.

    Returns
    -------
    dict
        ``'aln1'`` and ``'aln2'`` (aligned rows as strings), ``'score'``
        (the aligner's score) and ``'similarity'`` (relative match
        frequency of the two rows as a percentage, two decimals).
    """
    upper1 = seq1.upper()
    upper2 = seq2.upper()

    if local:
        aligner = local_pairwise_align_nucleotide
    else:
        aligner = global_pairwise_align_nucleotide

    msa, aln_score, _ = aligner(
        DNA(upper1), DNA(upper2), gap_open_penalty, gap_extend_penalty)

    row1 = msa[0]
    row2 = msa[1]
    percent_match = row1.match_frequency(row2, relative=True) * 100

    return {
        'aln1': str(row1),
        'aln2': str(row2),
        'score': aln_score,
        # Round-trip through text to keep exactly two decimal places.
        'similarity': float('{:.2f}'.format(percent_match)),
    }
def test_global_pairwise_align_nucleotide_invalid_dtype(self):
    """A TabularMSA of Protein is rejected with a descriptive TypeError."""
    expected_message = ("TabularMSA with DNA or RNA dtype.*dtype "
                        "'Protein'")
    dna_msa = TabularMSA([DNA('ACGT')])
    protein_msa = TabularMSA([Protein('PAW')])

    with self.assertRaisesRegex(TypeError, expected_message):
        global_pairwise_align_nucleotide(dna_msa, protein_msa)
def test_global_pairwise_align_nucleotide(self):
    """Exercise global nucleotide alignment on str, DNA and Alignment input.

    Covers two gap-open penalties, explicit and default sequence ids,
    aligning an Alignment against a single sequence, and rejection of
    non-sequence input with ``TypeError``.
    """
    seq_a = "GACCTTGACCAGGTACC"
    seq_b = "GAACTTTGACGTAAC"
    # Scoring shared by every call; only the gap-open penalty varies.
    scoring = dict(gap_extend_penalty=0.5, match_score=5, mismatch_score=-4)

    def verify(result, rows, score, ids):
        # Rows first, then score, span and ids.
        for row_index, row in enumerate(rows):
            self.assertEqual(str(result[row_index]), row)
        self.assertEqual(result.score(), score)
        self.assertEqual(result.start_end_positions(), [(0, 16), (0, 14)])
        self.assertEqual(result.ids(), ids)

    verify(global_pairwise_align_nucleotide(
               seq_a, seq_b, gap_open_penalty=5., **scoring),
           ("G-ACCTTGACCAGGTACC", "GAACTTTGAC---GTAAC"), 41.0, list('01'))

    verify(global_pairwise_align_nucleotide(
               seq_a, seq_b, gap_open_penalty=10., **scoring),
           ("-GACCTTGACCAGGTACC", "GAACTTTGAC---GTAAC"), 32.0, list('01'))

    # DNA (rather than str) as input
    verify(global_pairwise_align_nucleotide(
               DNA(seq_a, "s1"), DNA(seq_b, "s2"),
               gap_open_penalty=10., **scoring),
           ("-GACCTTGACCAGGTACC", "GAACTTTGAC---GTAAC"), 32.0,
           ["s1", "s2"])

    # Align one DNA sequence and one Alignment, score computed manually
    profile = Alignment([DNA("GACCTTGACCAGGTACC", "s1"),
                         DNA("GACCATGACCAGGTACC", "s2")])
    verify(global_pairwise_align_nucleotide(
               profile, DNA(seq_b, "s3"),
               gap_open_penalty=10., **scoring),
           ("-GACCTTGACCAGGTACC", "-GACCATGACCAGGTACC",
            "GAACTTTGAC---GTAAC"),
           27.5, ["s1", "s2", "s3"])

    # ids are provided if they're not passed in
    unnamed = global_pairwise_align_nucleotide(
        DNA(seq_a), DNA(seq_b), gap_open_penalty=10., **scoring)
    self.assertEqual(unnamed.ids(), list('01'))

    # TypeError on invalid input
    self.assertRaises(TypeError, global_pairwise_align_nucleotide,
                      42, "HEAGAWGHEE")
    self.assertRaises(TypeError, global_pairwise_align_nucleotide,
                      "HEAGAWGHEE", 42)
def test_global_pairwise_align_nucleotide(self):
    """Exercise global nucleotide alignment on str, DNA and Alignment input.

    Covers two gap-open penalties, ids taken from sequence metadata and
    default ids, aligning an Alignment against a single sequence, and
    rejection of non-sequence input with ``TypeError``.
    """
    full_span = [(0, 16), (0, 14)]

    def align(first, second, gap_open):
        # All cases share the extension penalty and match/mismatch scores.
        return global_pairwise_align_nucleotide(
            first, second, gap_open_penalty=gap_open,
            gap_extend_penalty=0.5, match_score=5, mismatch_score=-4)

    def named(sequence, seq_id):
        return DNA(sequence, metadata={'id': seq_id})

    # (first input, second input, gap open, expected rows, score, ids)
    cases = [
        ("GACCTTGACCAGGTACC", "GAACTTTGACGTAAC", 5.,
         ["G-ACCTTGACCAGGTACC", "GAACTTTGAC---GTAAC"], 41.0, list('01')),
        ("GACCTTGACCAGGTACC", "GAACTTTGACGTAAC", 10.,
         ["-GACCTTGACCAGGTACC", "GAACTTTGAC---GTAAC"], 32.0, list('01')),
        # DNA (rather than str) as input
        (named("GACCTTGACCAGGTACC", "s1"), named("GAACTTTGACGTAAC", "s2"),
         10.,
         ["-GACCTTGACCAGGTACC", "GAACTTTGAC---GTAAC"], 32.0,
         ["s1", "s2"]),
        # Align one DNA sequence and one Alignment, score computed manually
        (Alignment([named("GACCTTGACCAGGTACC", "s1"),
                    named("GACCATGACCAGGTACC", "s2")]),
         named("GAACTTTGACGTAAC", "s3"), 10.,
         ["-GACCTTGACCAGGTACC", "-GACCATGACCAGGTACC",
          "GAACTTTGAC---GTAAC"],
         27.5, ["s1", "s2", "s3"]),
    ]

    for first, second, gap_open, rows, score, ids in cases:
        actual = align(first, second, gap_open)
        for row_index, row in enumerate(rows):
            self.assertEqual(str(actual[row_index]), row)
        self.assertEqual(actual.score(), score)
        self.assertEqual(actual.start_end_positions(), full_span)
        self.assertEqual(actual.ids(), ids)

    # ids are provided if they're not passed in
    actual = align(DNA("GACCTTGACCAGGTACC"), DNA("GAACTTTGACGTAAC"), 10.)
    self.assertEqual(actual.ids(), list('01'))

    # TypeError on invalid input
    self.assertRaises(TypeError, global_pairwise_align_nucleotide,
                      42, "HEAGAWGHEE")
    self.assertRaises(TypeError, global_pairwise_align_nucleotide,
                      "HEAGAWGHEE", 42)
def pairwise_similarity(seq, query):
    """Return the fraction of identical positions after global alignment.

    ``seq`` and ``query`` are aligned with
    ``global_pairwise_align_nucleotide`` using its default scoring, and
    the first aligned row's ``fraction_same`` against the second row is
    returned.
    """
    aligned = global_pairwise_align_nucleotide(seq, query)
    aligned_seq = aligned[0]
    aligned_query = aligned[1]
    return aligned_seq.fraction_same(aligned_query)
def _load_segment_methdata(args, seg_chrom, seg_start, seg_end):
    """Return the methylation rows overlapping a segment as a DataFrame."""
    meth_tbx = pysam.Tabixfile(args.meth)
    tmp_methdata = str(uuid4()) + '.tmp.methdata.tsv'

    try:
        with open(tmp_methdata, 'w') as meth_out:
            # header: only the first line of the table is copied
            with gzip.open(args.meth, 'rt') as meth_in:
                for line in meth_in:
                    assert line.startswith('chromosome')
                    meth_out.write(line)
                    break

            assert seg_chrom in meth_tbx.contigs

            for rec in meth_tbx.fetch(seg_chrom, seg_start, seg_end):
                meth_out.write(str(rec) + '\n')

        # index by read_name
        return pd.read_csv(tmp_methdata, sep='\t', header=0, index_col=4)
    finally:
        # Clean up even when an assertion or a parse error interrupts us,
        # so no stray temp file or open tabix handle is left behind.
        if os.path.exists(tmp_methdata):
            os.remove(tmp_methdata)
        meth_tbx.close()


def _collect_segment_reads(methdata, seg_start, seg_end):
    """Group the CpG calls that fall inside the segment by read."""
    seg_reads = {}

    for index, row in methdata.iterrows():
        r_start = row['start']
        llr = row['log_lik_ratio']
        seq = row['sequence']

        # get per-CG position (nanopolish/calculate_methylation_frequency.py)
        cg_pos = seq.find("CG")
        first_cg_pos = cg_pos
        while cg_pos != -1:
            cg_start = r_start + cg_pos - first_cg_pos
            cg_pos = seq.find("CG", cg_pos + 1)

            if seg_start <= cg_start <= seg_end:
                # position relative to the start of the segment
                cg_seg_start = cg_start - seg_start
                if index not in seg_reads:
                    seg_reads[index] = Read(index, cg_seg_start, llr)
                else:
                    seg_reads[index].add_cpg(cg_seg_start, llr)

    return seg_reads


def _map_elt_to_ref_coords(tab_msa, pos_ref, pos_elt):
    """Map element coordinates to reference coordinates along an alignment.

    Row 0 of ``tab_msa`` is taken as the reference and row 1 as the
    element. Element positions aligned to a reference gap map to ``'na'``.
    """
    elt_to_ref_coords = {}

    for column in tab_msa.iter_positions():
        # Work on plain characters. Per the scikit-bio docs, iterating a
        # Sequence yields one-character Sequence objects and those do not
        # compare equal to a str, so testing them against '-' directly
        # would never detect a gap.
        bases = list(str(column))
        b_ref = bases[0]
        b_elt = bases[1]

        if '-' not in bases:
            elt_to_ref_coords[pos_elt] = pos_ref
            pos_ref += 1
            pos_elt += 1

        if b_elt == '-':
            pos_ref += 1

        if b_ref == '-':
            elt_to_ref_coords[pos_elt] = 'na'
            pos_elt += 1

    return elt_to_ref_coords


def _profile_segment(args, ref, te_ref_seq, seg_chrom, seg_start, seg_end,
                     seg_strand):
    """Compute the profile for one segment using an already-open ``ref``."""
    methdata = _load_segment_methdata(args, seg_chrom, seg_start, seg_end)

    if args.excl_ambig:
        reads = exclude_ambiguous_reads(args.bam, seg_chrom, seg_start,
                                        seg_end)
    else:
        reads = get_reads(args.bam, seg_chrom, seg_start, seg_end)

    # keep only reads that also have methylation calls
    reads = list(set(reads).intersection(set(methdata.index)))
    methdata = methdata.loc[reads]

    seg_reads = _collect_segment_reads(methdata, seg_start, seg_end)

    if not seg_reads:
        # Without any call the table below would have no columns and the
        # column lookups would fail with KeyError; report "no profile"
        # the same way the other insufficient-data cases do.
        logger.warning('no CpG calls on seg: %s:%d-%d'
                       % (seg_chrom, seg_start, seg_end))
        return [], []

    meth_table = dd(dict)
    sample = '.'.join(args.bam.split('.')[:-1])

    # one table row (keyed by a random id) per read and CpG location
    for name, read in seg_reads.items():
        for loc in read.llrs.keys():
            row_id = str(uuid4())
            meth_table[row_id]['loc'] = loc
            meth_table[row_id]['llr'] = read.llrs[loc]
            meth_table[row_id]['read'] = name
            meth_table[row_id]['sample'] = sample
            meth_table[row_id]['call'] = read.meth_calls[loc]

    meth_table = pd.DataFrame.from_dict(meth_table).T
    meth_table['loc'] = pd.to_numeric(meth_table['loc'])
    meth_table['llr'] = pd.to_numeric(meth_table['llr'])

    # replace coordinates by their dense rank, remembering the originals
    meth_table['orig_loc'] = meth_table['loc']
    meth_table['loc'] = ss.rankdata(meth_table['loc'], method='dense')

    cpg_to_coord = {}
    for orig_loc, new_loc in zip(meth_table['orig_loc'], meth_table['loc']):
        cpg_to_coord[new_loc] = orig_loc

    windowed_methfrac, _ = slide_window(
        meth_table, sample, width=int(args.slidingwindowsize),
        slide=int(args.slidingwindowstep))

    if len(windowed_methfrac) <= int(args.smoothwindowsize):
        logger.warning('too few sites after windowing: %s:%d-%d'
                       % (seg_chrom, seg_start, seg_end))
        return [], []

    smoothed_methfrac = smooth(
        np.asarray(list(windowed_methfrac.values())),
        window_len=int(args.smoothwindowsize))

    # back from CpG rank to segment coordinates, flipped for '-' segments
    coord_meth_pos = []
    for cpg in list(windowed_methfrac.keys()):
        if seg_strand == '+':
            coord_meth_pos.append(cpg_to_coord[cpg])
        if seg_strand == '-':
            coord_meth_pos.append((seg_end - seg_start) - cpg_to_coord[cpg])

    # alignment to ref elt
    elt_seq = ref.fetch(seg_chrom, seg_start, seg_end)
    if seg_strand == '-':
        elt_seq = rc(elt_seq)

    te_ref_seq = te_ref_seq.upper()
    elt_seq = elt_seq.upper()

    s_ref = skseq.DNA(te_ref_seq)
    s_elt = skseq.DNA(elt_seq)

    try:
        if args.globalign:
            aln_res = skalign.global_pairwise_align_nucleotide(s_ref, s_elt)
        else:
            aln_res = skalign.local_pairwise_align_ssw(s_ref, s_elt)
    except IndexError:  # scikit-bio throws this if no bases align >:|
        logger.warning('no align on seg: %s:%d-%d'
                       % (seg_chrom, seg_start, seg_end))
        return [], []

    # aln_res is (alignment, score, [(ref start, end), (elt start, end)])
    coord_ref, coord_elt = aln_res[2]
    len_ref = coord_ref[1] - coord_ref[0]
    len_elt = coord_elt[1] - coord_elt[0]

    if len_ref / len(te_ref_seq) < float(args.lenfrac):
        logger.warning(
            'ref align too short on seg: %s:%d-%d (%f)'
            % (seg_chrom, seg_start, seg_end, len_ref / len(te_ref_seq)))
        return [], []

    if len_elt / len(elt_seq) < float(args.lenfrac):
        logger.warning(
            'elt align too short on seg: %s:%d-%d (%f)'
            % (seg_chrom, seg_start, seg_end, len_elt / len(elt_seq)))
        return [], []

    elt_to_ref_coords = _map_elt_to_ref_coords(
        aln_res[0], coord_ref[0], coord_elt[0])

    revised_coord_meth_pos = []
    meth_profile = []

    for pos, meth in zip(coord_meth_pos, smoothed_methfrac):
        # positions outside the aligned part have no entry at all
        revised_pos = elt_to_ref_coords.get(pos, 'na')
        if revised_pos != 'na':
            revised_coord_meth_pos.append(revised_pos)
            meth_profile.append(meth)

    return revised_coord_meth_pos, meth_profile


def get_meth_profile(args, seg_chrom, seg_start, seg_end, seg_name, seg_strand):
    """Build a smoothed methylation profile for one segment.

    Methylation calls overlapping the segment are read from the tabix
    table ``args.meth``, restricted to the reads selected from
    ``args.bam``, windowed and smoothed. The segment sequence fetched
    from ``args.ref`` is aligned to the reference element in
    ``args.teref``, and profile positions are translated to reference
    element coordinates.

    Parameters
    ----------
    args
        Options object; the attributes read are ``teref``, ``ref``,
        ``meth``, ``bam``, ``excl_ambig``, ``slidingwindowsize``,
        ``slidingwindowstep``, ``smoothwindowsize``, ``globalign`` and
        ``lenfrac``.
    seg_chrom : str
        Contig of the segment; must be present in the tabix index.
    seg_start, seg_end : int
        Segment bounds, used for the tabix and FASTA fetches.
    seg_name : str
        Segment label, used for logging only.
    seg_strand : str
        ``'+'`` or ``'-'``; for ``'-'`` the segment sequence is passed
        through ``rc`` and positions are mirrored within the segment.

    Returns
    -------
    tuple of (list, list)
        Reference-element positions and the smoothed methylation values
        at those positions. Both lists are empty when the segment has no
        CpG calls, too few windows, no alignment, or an alignment
        covering less than ``args.lenfrac`` of either sequence.

    Raises
    ------
    AssertionError
        If the methylation table does not start with a ``chromosome``
        header line or ``seg_chrom`` is missing from its index.
    """
    logger.info('profiling %s %s:%d-%d:%s' % (seg_name, seg_chrom, seg_start, seg_end, seg_strand))

    te_ref_seq = single_seq_fa(args.teref)
    ref = pysam.Fastafile(args.ref)

    try:
        return _profile_segment(args, ref, te_ref_seq, seg_chrom, seg_start,
                                seg_end, seg_strand)
    finally:
        # Every exit path, including the early "no profile" returns,
        # releases the FASTA handle.
        ref.close()