def pair(self):
    """Build a ``Pair`` from this object's ``id``, ``r1`` and ``r2``.

    When ``self.r1_quality`` is truthy, the quality strings of both reads are
    filled in from ``self.r1_quality`` and ``self.r2_quality``.

    Returns:
        The newly created ``Pair``.
    """
    result = Pair()
    result.set_from_data(self.id, self.r1, self.r2)
    if not self.r1_quality:
        # No quality data recorded: leave the reads' quality untouched.
        return result
    result.r1.quality = self.r1_quality
    result.r2.quality = self.r2_quality
    return result
def test_single_R1_match_with_adapter_multiple_without(self):
    """Process one pair with 'find_partial' and check it is rejected.

    Expects no target to be assigned and the ``multiple_R1_match`` counter
    to equal 1 afterwards.
    """
    r1_seq = 'CCCGCCGTCCTTGGTGCCCGAGTGAGATCGGAAGA'
    r2_seq = 'CACTCGGGCACCAAGGACGGCGGGAGATCGGAAGA'
    read_pair = Pair()
    read_pair.set_from_data('M02465:8:000000000-A5D', r1_seq, r2_seq)

    self.spats.run.debug = True
    self.spats.run.algorithm = "find_partial"
    self.spats.process_pair(read_pair)

    self.assertEqual(None, read_pair.target)
    self.assertEqual(1, self.spats.counters.multiple_R1_match)
def run_case(case):
    """Process one case, print its diagram, and flag a site mismatch.

    ``case`` is indexed positionally: ``case[0]``, ``case[1]`` and ``case[2]``
    are passed to ``Pair.set_from_data`` and ``case[3]`` is the expected site.

    ``spats``, ``diagram`` and ``Pair`` are not parameters; they are resolved
    from the surrounding scope. On a mismatch ``spats._case_errors`` is set to
    True and a message is printed.
    """
    pair = Pair()
    pair.set_from_data(case[0], case[1], case[2])
    spats.process_pair(pair)
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(diagram(pair, spats.run))
    if case[3] != pair.site:
        spats._case_errors = True
        print("******* mismatch: {} != {}".format(case[3], pair.site))
def compare(self):
    """Run every read pair through 'find_partial' and 'lookup' and compare.

    Two ``Spats`` instances are configured identically apart from the
    algorithm, then fed the same pairs read from ``self.r1`` / ``self.r2``.
    Two results agree when neither has a site, or when both have a site and
    the target name, end, site and mutations are all equal.

    On the first disagreement a JSON description of the pair is printed and
    the method returns. Otherwise progress is printed per batch, followed by
    a summary and both instances' count reports.
    """
    from spats_shape_seq import Spats
    from spats_shape_seq.pair import Pair

    def results_agree(lhs, rhs):
        # Only called when both pairs have a site.
        return (lhs.target.name == rhs.target.name
                and lhs.end == rhs.end
                and lhs.site == rhs.site
                and lhs.mutations == rhs.mutations)

    target = self.config['target']
    json_base = {
        'target': target,
        'config': {'algorithm': 'find_partial', 'debug': True},
        'expect': {},
    }

    spats_fp = Spats(cotrans=self.cotrans)
    spats_lookup = Spats(cotrans=self.cotrans)
    self._update_run_config(spats_fp.run)
    self._update_run_config(spats_lookup.run, json_base['config'])
    spats_fp.run.algorithm = 'find_partial'
    spats_lookup.run.algorithm = 'lookup'
    spats_fp.addTargets(target)
    spats_lookup.addTargets(target)

    processed = 0
    agreed = 0
    with FastFastqParser(self.r1, self.r2) as parser:
        total = parser.appx_number_of_pairs()
        for batch in parser.iterator(5000):
            for item in batch:
                pair_fp = Pair()
                pair_lookup = Pair()
                pair_fp.set_from_data(str(item[0]), item[1], item[2])
                pair_lookup.set_from_data(str(item[0]), item[1], item[2])
                try:
                    spats_fp.process_pair(pair_fp)
                    spats_lookup.process_pair(pair_lookup)
                except:
                    # Report how far we got, then let the error propagate.
                    print('Error after {}/{}'.format(agreed, processed))
                    raise

                if pair_fp.has_site == pair_lookup.has_site:
                    if not pair_fp.has_site:
                        # Neither algorithm found a site: counted, not matched.
                        processed += 1
                        continue
                    if results_agree(pair_fp, pair_lookup):
                        processed += 1
                        agreed += 1
                        continue

                # First disagreement: dump the pair as JSON and stop.
                json_base["id"] = str(item[0])
                json_base["R1"] = str(item[1])
                json_base["R2"] = str(item[2])
                dumped = json.dumps(json_base, sort_keys=True, indent=4,
                                    separators=(',', ': '))
                print('After {}/{} matches; mismatched pair: {} != {}\n{}'.format(
                    agreed, processed, pair_fp, pair_lookup, dumped))
                return
            print('{}/{}-{}...'.format(agreed, processed, total))

    spats_fp.counters.total_pairs = processed
    spats_lookup.counters.total_pairs = processed
    print('All match {}/{}.'.format(agreed, processed))
    print(spats_fp._report_counts())
    print(spats_lookup._report_counts())
def test_single_R1_match_with_adapter_multiple_without(self):
    """Process one pair with 'find_partial' and check it is rejected.

    Expects no target to be assigned and the ``multiple_R1_match`` counter
    to equal 1 afterwards.
    """
    r1_seq = 'CCCGCCGTCCTTGGTGCCCGAGTGAGATCGGAAGA'
    r2_seq = 'CACTCGGGCACCAAGGACGGCGGGAGATCGGAAGA'
    read_pair = Pair()
    read_pair.set_from_data('M02465:8:000000000-A5D', r1_seq, r2_seq)

    self.spats.run.debug = True
    self.spats.run.algorithm = "find_partial"
    self.spats.process_pair(read_pair)

    self.assertEqual(None, read_pair.target)
    self.assertEqual(1, self.spats.counters.multiple_R1_match)
def pair_for_case(self, case):
    """Create a ``Pair`` for a positional test case, including qualities.

    ``case[0]``, ``case[1]`` and ``case[2]`` are passed to
    ``Pair.set_from_data``. When the case has more than six entries,
    ``case[6]`` and ``case[7]`` are used as the R1 and R2 quality strings;
    otherwise each read gets a quality of ``'K'`` repeated to the length of
    its sequence.

    Returns:
        The populated ``Pair``.
    """
    result = Pair()
    result.set_from_data(case[0], case[1], case[2])
    has_qualities = len(case) > 6
    result.r1.quality = case[6] if has_qualities else 'K' * len(case[1])
    result.r2.quality = case[7] if has_qualities else 'K' * len(case[2])
    return result
def cotrans_debug():
    """Debug helper: run one hard-coded cotrans read pair and print the result.

    Builds a ``Spats`` with ``run.cotrans = True`` and
    ``run.minimum_target_match_length = 10``, and loads targets from a
    developer-local path. The pair is first populated from
    ``/tmp/spats_test.json`` and then overwritten with the hard-coded case
    ``c``, so only ``c`` is actually processed.

    Prints the pair, then either ``target: site / end`` or the failure reason.
    """
    from spats_shape_seq import Spats
    s = Spats()
    s.run.cotrans = True
    #s.run.cotrans_linker = 'CTGACTCGGGCACCAAGGAC'
    #s.run.algorithm = "find_partial"
    #s.run._p_v102_compat = True
    s.run.minimum_target_match_length = 10
    bp = "/Users/jbrink/mos/tasks/1RwIBa/tmp/datasets/cotrans/"
    s.addTargets(bp + "cotrans_single.fa")

    from spats_shape_seq.pair import Pair
    pair = Pair()
    import cjb.util
    d = cjb.util.jsonAtPath("/tmp/spats_test.json")
    # NOTE: this data is replaced by the hard-coded case further down.
    pair.set_from_data(str(d['id']), str(d['r1']), str(d['r2']))

    # Alternate cases kept for manual debugging (uncomment one to use):
    #c = ['683779', 'TCCGGTCCTTGGTGCCCGAGTCAGAAAAAAATAGAA', 'TCTATTTTTTTCTGACTCGGGCACCAAGGACCGGAA', 82, 71]
    #c = [ "1116:19486:8968", "TCCGGTCCTTGGTGCCCGAGTCAGTCCTTCCTCCTA", "GAGTCTATTTTTTTAGGAGGAAGGACTGACTCGGGC", 93, 68 ]
    #c = [ "301028", "AAGTGTCCTTGGTGCCCGAGTCAGAGATAGATCGGA", "ATCTCTGACTCGGGCACCAAGGACACTTAGATCGGA", 96, 92 ]
    #c = [ "31631284", "TTCAGTCCTTGGTGCCCGAGTCAGAGATAGATCGGA", "ATCTCTGACTCGGGCACCAATGACCGGAAGATCGGA", 96, 92 ]
    #c = [ "7232", "AGGTGTCCTTGGTGCCCGAGTCAGTAGCTAAGAAAT", "TTATAGGCGATGGAGTTCGCCATAAACGCTGCTTAG", -1, -1 ]
    #c = [ "16845404", "AAATGTCCTTGGTGCCCGAGTCAGACTGGTAGGAGT", "TCTTATAGGCGATGGAGTTCGCCATAAACGCTGCTT", -1, -1 ]
    #c = [ "24102328", "AAGCGTCCTTGGTGCCCGAGTCAGGAGTCATAGATC", "ATGACTCCTGACTCGGGCACCAAGGACGCTTAGATC", 46, 39 ]
    #c = [ "51216106", "GGGTGTCCTTGGTGCCCGAGTCAGATTAGCTAAGCA", "AGCTAATCTGACTCGGGCACCAAGGACGCTGCTTAG", 41, 34 ]
    # NOTE: this assignment is live but is overwritten by the final `c` below.
    c = ["1116:19486:8968", "TCCGGTCCTTGGTGCCCGAGTCAGTCCTTCCTCCTA", "GAGTCTATTTTTTTAGGAGGAAGGACTGACTCGGGC", 93, 68]
    #c = [ "41823514", "GAATGTCCTTGGTGCCCGAGTCAGAACTCCAAGATC", "TGGAGTTCTGACTCGGGCACCAAGGACATTCAGATC", -1, -1 ]
    #c = [ "180", "AAGCTGTCCTTGGTGCCCGAGTCAGGAAAAGTTCTT", "TTTTTTTAGGAGGAAGGATCTATGAGCAAAGGAGAA", 120, 75 ]
    #c = [ "67219", "GAGTGTCCTTGGTGCCCGAGTCAGTCGACAACTCCA", "TTATAGGCGATGGAGTTCGCCATAAACGCTGCTTAG", 134, 0 ]
    #c = [ "58726", "GGATGTCCTTGGTGCCCGAGTCAGCCTTAGATCGGA", "AAGGCTGACTCGGGCACCAAGGACATCCAGATCGGA", None, None ]
    #c = [ "188425", "GGACGTCCTTGGTGCCCGAGTCAGTATAGATCGGAA", "ATACTGACTCGGGCACCAAGGACTTCCAGATCGGAA", 24, 21 ]
    #c = [ "jjb_L21", "GGACGTCCTTGGTGCCCGAGTCAGGGCGAACTAGAT", "AGTTCGCCCTGACTCGGGCACCAAGGACGTCCAGAT", 21, 13 ]
    #c = [ "jjb_L20", "GGACGTCCTTGGTGCCCGAGTCAGGCGAACTCAGAT", "GAGTTCGCCTGACTCGGGCACCAAGGACGTCCAGAT", 20, 12 ]
    #c = [ "jjb_L19", "GGACGTCCTTGGTGCCCGAGTCAGCGAACTCCAGAT", "GGAGTTCGCTGACTCGGGCACCAAGGACGTCCAGAT", None, None ]
    #c = [ "406149", "AGGTGTCCTTGGTGCCCGAGTCAGGACAACTCCAGT", "TTATAGGCGATGGAGTTCGCCATAAACGCTGCTTAG", 132, 0 ]
    #c = [ "89185", "TCCAGTCCTTGGTGCCCGAGTCAGCTAAGCAGCGTT", "AATGACTCCTACCAGTATCACTACTGGTAGGAGTCT", 36, 38 ]
    #c = [ "3185000", "GAACGTCCTTGGTGCCCGAGTCAGGTTTATGGCGAA", "TCGCCATAAACCTGACTCGGGCACCAAGGACGTTCC", -1, -1 ]
    #c = [ "jjb_3185000'", "GAACGTCCTTGGTGCCCGAGTCAGGTTTATGGCGAA", "TCGCCATAAACCTGACTCGGGCACCAAGGACGTTCA", None, None ]
    #c = ['1', 'TCTGAGATCGGAAGAGCACACGTCTGAACTCCAGT', 'CAGAAGATCGGAAGAGCGTCGTGTAGGGAAAGAGT', None, None]
    #c = ['24941', 'TCCAGTCCTTGGTGCCCGAGTCAGAGACTCCTACCA', 'TATAGGCGATGGAGTTCGCCATAAACGCTGCTTAGC', -1, -1]
    c = ['jjbn', 'TTTGGTCCTTGGTGCCCGAGTCAGTAAAAAAATAGA', 'TCTATTTTTTTACTGACTCGGGCACCAAGGACCAAA', 83, 71]
    pair.set_from_data(c[0], c[1], c[2])

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("{}\n{} / {}".format(pair.identifier, pair.r1.original_seq, pair.r2.original_seq))
    s.process_pair(pair)
    if pair.has_site:
        print("{}: {} / {}".format(pair.target.name, pair.site, pair.end))
    else:
        print("FAIL: {}".format(pair.failure))
def cotrans_test():
    """Debug helper: process one hard-coded read pair against ``F_wt.fa``.

    Loads targets from a developer-local path, processes the pair with a
    default ``Spats`` instance and prints ``target name: site``.
    """
    bp = "/Users/jbrink/mos/tasks/1RwIBa/tmp/datasets/cotrans/"
    from spats_shape_seq import Spats
    s = Spats()
    from spats_shape_seq.partial import PartialFindProcessor
    #s.run._processor_class = PartialFindProcessor
    s.addTargets(bp + "F_wt.fa")

    from spats_shape_seq.pair import Pair
    pair = Pair()
    pair.set_from_data('x', 'GAGCGTCCTTGGTGCCCGAGTCAGAAATAGACTCCT', 'TATCACTACTGGTAGGAGTCTATTTCTGACTCGGGC')
    s.process_pair(pair)
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("{}: {}".format(pair.target.name, pair.site))
def tmut_case():
    """Debug helper: run one hard-coded pair through a mutation-counting run.

    Configures a non-cotrans ``Spats`` with the 'find_partial' algorithm,
    mutation counting and one allowed target error, and loads targets from a
    developer-local path. It then processes the active case ``c`` and prints
    the diagram, followed by ``target: site / end mutations`` or the failure.
    """
    from spats_shape_seq import Spats
    from spats_shape_seq.db import PairDB
    from spats_shape_seq.diagram import diagram
    bp = "/Users/jbrink/mos/tasks/1RwIBa/tmp/mutsl/"
    spats = Spats(cotrans=False)
    spats.run.cotrans_linker = 'CTGACTCGGGCACCAAGGAC'
    spats.run.count_mutations = True
    spats.run.algorithm = "find_partial"
    spats.run.allowed_target_errors = 1
    spats.run.adapter_b = "AGATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG"
    spats.run._process_all_pairs = True
    spats.run.writeback_results = True
    spats.run.num_workers = 1
    spats.run.result_set_name = "mut"
    spats.addTargets(bp + "mut_single.fa")

    from spats_shape_seq.pair import Pair
    pair = Pair()

    # Alternate cases kept for manual debugging (uncomment one to use):
    #c = [ 'GAATGTCCTTGGTGCCCGAGTCAGTCCTTGGTGCCCGAGTCAGTCCTTGGTTCCCGAGTCACTCCTTTGTTCCCC', 'AGGACTGACTCGGGCACCAAGGACTTTCTCGTTCACCTATTTCTTTCTCTTCCCCCTTTTTCTTTCTCTTTCTCC' ]
    #c = [ 'GAGCGTCCTTGGTGCCCGAGTCAGATGCCGACCCGGGTGGGGGCCCTGCCAGCTACATCCCGGCACACGCGTCAT', 'TAGGTCAGGTCCGGAAGGAAGCAGCCAAGGCAGATGACGCGTGTGCCGGGATGTAGCTGGCAGGGCCCCCACCCG' ]
    #c = [ 'GAATGTCCTTGGTGCCCGAGTCAGGACACGCGTCATCTGCCTTGGCTGCTTCCTTCCGGACCTGACCTGGTAAAC', 'ATCGGGGGCTCTGTTGGTTCCCCCGCAACGCTACTCTGTTTACCAGGTCAGGTCCGGAAGGAAGCAGCCAAGTCA' ]
    #c = [ 'AGGCGTCCTTGGTGCCCGAGTCAGCCTTGGCTGCTTCCTTCCGGACCTGACCTGGTAAACAGAGTAGCGTTGCGG', 'ATCGGGGGCTCTGTTGGTTCCCCCGCAACGCTACTCTGTTTACCAGGTCAGGTCCGGAAGGAAGCAGCCAAGTCT' ]
    #c = [ 'TTCAGTCCTTGGTGCCCGAGTCAGCCAGCTACATCCCGGCACACGCGTCATCTGCCTTGGCTGCTTCCTTCCGGA', 'AGGTCAGATCCGGAAGGAAGCAGCCAAGGCAGATGACGCGTGTGCCGGGATGTAGCTGGCTGACTCGGGCACCAA' ]
    #c = [ 'AAATGTCCTTGGTGCCCGAGTCAGATCTGCCTTAAGATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGA', 'TAAGGCAGATCTGACTCGGGCACCAAGGACATTTAGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCG' ]
    #c = [ 'CTCAGTCCTTGGTGCCCGAGTCAGTGAGCTAGATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTC', 'AGCTCACTGACTCGGGCACCAAGGACTGAGAGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGG' ]
    #c = [ 'AAGCGTCCTTGGTGCCCGAGTCAGTGGAGGTAGATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCT', 'ACCTCCACTGACTCGGGCACCAAGGACGCTTAGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTG' ]
    #c = [ 'TCCGGTCCTTGGTGCCCGAGTCAGATGTAGATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGT', 'ACATCTGACTCGGGCACCAAGGACCGGAAGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTC' ]
    #c = [ 'TTTAAGTCCTTGGTGCCCGAGTCAGGTCATCTGCCTTGGCTGCTTCCTTCCGGACCTGACCTGGTAAACAGAGTA', 'TACTCTGTTTACCAGGTCAGGTCCGGAAGGAAGCAGCCAAGGCAGATGACCTGACTCGGGCACCAAGGACTTAAA' ]
    #c = [ 'TTCACAACAAGAATTGGGACAACTCCAGTGAAAAGTTCTTCTCCTTTGCTCATCATTAACCTCCTGAATCACTAT', 'GGACAAGCAATGCTTACCTTGATGTTGAACTTTTGAATAGTGATTCAGGAGGTTAATGATGAGCAAAGGAGAAGA' ]
    #c = [ 'AGATCAACAAGAATTAGGACAACTCCAGTGAAAAGTTCTTCTCCTTTGCTCATCATTAACCTCCTGAATCACTAT', 'ACAAGCAATGCTTGCCTTGATGTTGAACTTTTGAATAGTGATTCAGGAGGTTAATGATGAGCAAAGGAGAAGAAC' ]
    #c = [ 'AAATCAACAAGAATTGGGACAACTCCAGTGAAAAGTTCTTCTCCTTTGCTCATCATTAACCTCCTGAATCACTAT', 'AATAGTGATTCAGGAGGTTAATGATGAGCAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGATT' ]
    #c = [ 'TCCGCAACAAGAATTGGGACAACTCCAGTGAAAAGTTCTTCTCCTTTGCTCATCATTAACCTCCTGAATCACTAT', 'ATAGTGATTCAGGAGGTTAATGATGAGCAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGCGGA' ]
    #c = [ 'TCCACAACAAGAATTGGGACAACTCCAGTGAAAAGTTCTTCTCATTTGCTCATCATTAACCTCCTGAATCACTAT', 'GGACAAGCAATGCTTGCCTTGATGTTGAACTTTTGAATAGTGATTCAGGAGGTTAATGATGAGCAAAGGAGAAGA' ]
    #c = [ 'GGGTCAACAAGAATTGGGACAACTCCAGTGAAAAGTTCTTCTCCTTTGCTCATCATTTAGATCGGAAGAGCACAC', 'AAATGATGAGCAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGACCCAGATCGGAAGAGCGTCG' ]
    c = ['GAACCAACAAGAATTGGGACAACTCCAGTGAAAGGTTCTTCTCCTTTGCTCATCATTAACCTCCTGAAGATCGGA', 'TCAGGAGGTTAATGATGAGCAAAGGAGAAGAACCTTTCACTGGAGTTGTCCCAATTCTTGTTGGTTCAGATCGGA']
    #c = [ 'CCTACAACAAGAATTGGGACAACTCCAGTGAGAAGTTCTTCTCCTTTGCTCATCATTAAGATCGGAAGAGCACAC', 'TAATGATGAGCAAAGGAGAAGAACTTCTCACTGGAGTTGTCCCAATTCTTGTTGTAGGAGATCGGAAGAGCGTCG' ]
    #c = [ 'CTTGCAACAAGAATTGGGACAACTCCAGTGAAAAGTTCTTCTCCTTTGCTCATCTTTAACCTCCTGAATCACTAA', 'TAGTGATTCAGGAGGTTAATGATGAGCAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGCAAGA' ]

    pair.set_from_data('x', c[0], c[1])
    spats.process_pair(pair)
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(diagram(pair, spats.run))
    if pair.has_site:
        print("{}: {} / {} {}".format(pair.target.name, pair.site, pair.end, pair.mutations))
    else:
        print("FAIL: {}".format(pair.failure))
def rdiff_func(db_path, rs1_name, rs2_name, diag_spats=None):
    """Print the differences between two result sets stored in a ``PairDB``.

    Differing rows are split into three lists: rows where ``r[4] == -1``
    ("theirs only"), rows where ``r[9] == -1`` ("ours only"), and the rest.
    Within each list, rows are grouped by ``r[7] or r[12]`` (falling back to
    "different values"). At most ten rows per group are printed, followed by
    the group total.

    Args:
        db_path: Path passed to ``PairDB``.
        rs1_name: Name of the first result set.
        rs2_name: Name of the second result set.
        diag_spats: Optional object providing ``process_pair`` and ``run``;
            when truthy, a diagram is printed for the last listed row of each
            group.

    Exits with status 1 when either result set has no results.
    """
    from spats_shape_seq.db import PairDB
    from spats_shape_seq.diagram import diagram
    from spats_shape_seq.pair import Pair

    db = PairDB(db_path)
    n1 = db.num_results(rs1_name)
    n2 = db.num_results(rs2_name)
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("{}: {} results / {}: {} results".format(rs1_name, n1, rs2_name, n2))
    if not n1 or not n2:
        print("** Abort.")
        exit(1)

    print("Diffs:")
    ours_only = []
    theirs_only = []
    differences = []
    for r in db.differing_results(rs1_name, rs2_name):
        if r[4] == -1:
            assert (r[9] != -1)
            theirs_only.append(r)
        elif r[9] == -1:
            ours_only.append(r)
        else:
            differences.append(r)

    all_lists = [ours_only, theirs_only, differences]
    for group in all_lists:
        reasons = {}
        for r in group:
            key = r[7] or r[12] or "different values"
            assert (key)
            reasons.setdefault(key, []).append(r)
        # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
        for reason, rlist in reasons.items():
            for r in rlist[:10]:
                print(" {}:{} s{}m{} ({}) -- {}:{} s{}m{} ({}) ([ '{}', '{}', '{}', {}, {}, [ {} ] ])".format(
                    r[3] or 'x', r[4], r[5], r[6], r[7] or "OK",
                    r[8] or 'x', r[9], r[10], r[11], r[12] or "OK",
                    r[0], r[1], r[2], r[4], r[5],
                    "" if -1 == r[6] else r[6]))
            if len(rlist) > 0:
                print("... {} total.".format(len(rlist)))
            # NOTE(review): nesting reconstructed from whitespace-collapsed
            # source; `r` is the last row listed for this reason. Confirm.
            if diag_spats:
                pair = Pair()
                pair.set_from_data(str(r[0]), str(r[1]), str(r[2]))
                diag_spats.process_pair(pair)
                print(diagram(pair, diag_spats.run))
    print("{} total diffs.".format(sum(map(len, all_lists))))
def cotrans_debug():
    """Debug helper: run one hard-coded cotrans read pair and print the result.

    Builds a ``Spats`` with ``run.cotrans = True`` and
    ``run.minimum_target_match_length = 10``, and loads targets from a
    developer-local path. The pair is first populated from
    ``/tmp/spats_test.json`` and then overwritten with the hard-coded case
    ``c``, so only ``c`` is actually processed.

    Prints the pair, then either ``target: site / end`` or the failure reason.
    """
    from spats_shape_seq import Spats
    s = Spats()
    s.run.cotrans = True
    #s.run.cotrans_linker = 'CTGACTCGGGCACCAAGGAC'
    #s.run.algorithm = "find_partial"
    #s.run._p_v102_compat = True
    s.run.minimum_target_match_length = 10
    bp = "/Users/jbrink/mos/tasks/1RwIBa/tmp/datasets/cotrans/"
    s.addTargets(bp + "cotrans_single.fa")

    from spats_shape_seq.pair import Pair
    pair = Pair()
    import cjb.util
    d = cjb.util.jsonAtPath("/tmp/spats_test.json")
    # NOTE: this data is replaced by the hard-coded case further down.
    pair.set_from_data(str(d['id']), str(d['r1']), str(d['r2']))

    # Alternate cases kept for manual debugging (uncomment one to use):
    #c = ['683779', 'TCCGGTCCTTGGTGCCCGAGTCAGAAAAAAATAGAA', 'TCTATTTTTTTCTGACTCGGGCACCAAGGACCGGAA', 82, 71]
    #c = [ "1116:19486:8968", "TCCGGTCCTTGGTGCCCGAGTCAGTCCTTCCTCCTA", "GAGTCTATTTTTTTAGGAGGAAGGACTGACTCGGGC", 93, 68 ]
    #c = [ "301028", "AAGTGTCCTTGGTGCCCGAGTCAGAGATAGATCGGA", "ATCTCTGACTCGGGCACCAAGGACACTTAGATCGGA", 96, 92 ]
    #c = [ "31631284", "TTCAGTCCTTGGTGCCCGAGTCAGAGATAGATCGGA", "ATCTCTGACTCGGGCACCAATGACCGGAAGATCGGA", 96, 92 ]
    #c = [ "7232", "AGGTGTCCTTGGTGCCCGAGTCAGTAGCTAAGAAAT", "TTATAGGCGATGGAGTTCGCCATAAACGCTGCTTAG", -1, -1 ]
    #c = [ "16845404", "AAATGTCCTTGGTGCCCGAGTCAGACTGGTAGGAGT", "TCTTATAGGCGATGGAGTTCGCCATAAACGCTGCTT", -1, -1 ]
    #c = [ "24102328", "AAGCGTCCTTGGTGCCCGAGTCAGGAGTCATAGATC", "ATGACTCCTGACTCGGGCACCAAGGACGCTTAGATC", 46, 39 ]
    #c = [ "51216106", "GGGTGTCCTTGGTGCCCGAGTCAGATTAGCTAAGCA", "AGCTAATCTGACTCGGGCACCAAGGACGCTGCTTAG", 41, 34 ]
    # NOTE: this assignment is live but is overwritten by the final `c` below.
    c = ["1116:19486:8968", "TCCGGTCCTTGGTGCCCGAGTCAGTCCTTCCTCCTA", "GAGTCTATTTTTTTAGGAGGAAGGACTGACTCGGGC", 93, 68]
    #c = [ "41823514", "GAATGTCCTTGGTGCCCGAGTCAGAACTCCAAGATC", "TGGAGTTCTGACTCGGGCACCAAGGACATTCAGATC", -1, -1 ]
    #c = [ "180", "AAGCTGTCCTTGGTGCCCGAGTCAGGAAAAGTTCTT", "TTTTTTTAGGAGGAAGGATCTATGAGCAAAGGAGAA", 120, 75 ]
    #c = [ "67219", "GAGTGTCCTTGGTGCCCGAGTCAGTCGACAACTCCA", "TTATAGGCGATGGAGTTCGCCATAAACGCTGCTTAG", 134, 0 ]
    #c = [ "58726", "GGATGTCCTTGGTGCCCGAGTCAGCCTTAGATCGGA", "AAGGCTGACTCGGGCACCAAGGACATCCAGATCGGA", None, None ]
    #c = [ "188425", "GGACGTCCTTGGTGCCCGAGTCAGTATAGATCGGAA", "ATACTGACTCGGGCACCAAGGACTTCCAGATCGGAA", 24, 21 ]
    #c = [ "jjb_L21", "GGACGTCCTTGGTGCCCGAGTCAGGGCGAACTAGAT", "AGTTCGCCCTGACTCGGGCACCAAGGACGTCCAGAT", 21, 13 ]
    #c = [ "jjb_L20", "GGACGTCCTTGGTGCCCGAGTCAGGCGAACTCAGAT", "GAGTTCGCCTGACTCGGGCACCAAGGACGTCCAGAT", 20, 12 ]
    #c = [ "jjb_L19", "GGACGTCCTTGGTGCCCGAGTCAGCGAACTCCAGAT", "GGAGTTCGCTGACTCGGGCACCAAGGACGTCCAGAT", None, None ]
    #c = [ "406149", "AGGTGTCCTTGGTGCCCGAGTCAGGACAACTCCAGT", "TTATAGGCGATGGAGTTCGCCATAAACGCTGCTTAG", 132, 0 ]
    #c = [ "89185", "TCCAGTCCTTGGTGCCCGAGTCAGCTAAGCAGCGTT", "AATGACTCCTACCAGTATCACTACTGGTAGGAGTCT", 36, 38 ]
    #c = [ "3185000", "GAACGTCCTTGGTGCCCGAGTCAGGTTTATGGCGAA", "TCGCCATAAACCTGACTCGGGCACCAAGGACGTTCC", -1, -1 ]
    #c = [ "jjb_3185000'", "GAACGTCCTTGGTGCCCGAGTCAGGTTTATGGCGAA", "TCGCCATAAACCTGACTCGGGCACCAAGGACGTTCA", None, None ]
    #c = ['1', 'TCTGAGATCGGAAGAGCACACGTCTGAACTCCAGT', 'CAGAAGATCGGAAGAGCGTCGTGTAGGGAAAGAGT', None, None]
    #c = ['24941', 'TCCAGTCCTTGGTGCCCGAGTCAGAGACTCCTACCA', 'TATAGGCGATGGAGTTCGCCATAAACGCTGCTTAGC', -1, -1]
    c = ['jjbn', 'TTTGGTCCTTGGTGCCCGAGTCAGTAAAAAAATAGA', 'TCTATTTTTTTACTGACTCGGGCACCAAGGACCAAA', 83, 71]
    pair.set_from_data(c[0], c[1], c[2])

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("{}\n{} / {}".format(pair.identifier, pair.r1.original_seq, pair.r2.original_seq))
    s.process_pair(pair)
    if pair.has_site:
        print("{}: {} / {}".format(pair.target.name, pair.site, pair.end))
    else:
        print("FAIL: {}".format(pair.failure))
def rdiff_func(db_path, rs1_name, rs2_name, diag_spats = None):
    """Print the differences between two result sets stored in a ``PairDB``.

    Differing rows are split into three lists: rows where ``r[4] == -1``
    ("theirs only"), rows where ``r[9] == -1`` ("ours only"), and the rest.
    Within each list, rows are grouped by ``r[7] or r[12]`` (falling back to
    "different values"). At most ten rows per group are printed, followed by
    the group total.

    Args:
        db_path: Path passed to ``PairDB``.
        rs1_name: Name of the first result set.
        rs2_name: Name of the second result set.
        diag_spats: Optional object providing ``process_pair`` and ``run``;
            when truthy, a diagram is printed for the last listed row of each
            group.

    Exits with status 1 when either result set has no results.
    """
    from spats_shape_seq.db import PairDB
    from spats_shape_seq.diagram import diagram
    from spats_shape_seq.pair import Pair

    db = PairDB(db_path)
    n1 = db.num_results(rs1_name)
    n2 = db.num_results(rs2_name)
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("{}: {} results / {}: {} results".format(rs1_name, n1, rs2_name, n2))
    if not n1 or not n2:
        print("** Abort.")
        exit(1)

    print("Diffs:")
    ours_only = []
    theirs_only = []
    differences = []
    for r in db.differing_results(rs1_name, rs2_name):
        if r[4] == -1:
            assert(r[9] != -1)
            theirs_only.append(r)
        elif r[9] == -1:
            ours_only.append(r)
        else:
            differences.append(r)

    all_lists = [ ours_only, theirs_only, differences ]
    for group in all_lists:
        reasons = {}
        for r in group:
            key = r[7] or r[12] or "different values"
            assert(key)
            reasons.setdefault(key, []).append(r)
        # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
        for reason, rlist in reasons.items():
            for r in rlist[:10]:
                print(" {}:{} s{}m{} ({}) -- {}:{} s{}m{} ({}) ([ '{}', '{}', '{}', {}, {}, [ {} ] ])".format(
                    r[3] or 'x', r[4], r[5], r[6], r[7] or "OK",
                    r[8] or 'x', r[9], r[10], r[11], r[12] or "OK",
                    r[0], r[1], r[2], r[4], r[5],
                    "" if -1 == r[6] else r[6]))
            if len(rlist) > 0:
                print("... {} total.".format(len(rlist)))
            # NOTE(review): nesting reconstructed from whitespace-collapsed
            # source; `r` is the last row listed for this reason. Confirm.
            if diag_spats:
                pair = Pair()
                pair.set_from_data(str(r[0]), str(r[1]), str(r[2]))
                diag_spats.process_pair(pair)
                print(diagram(pair, diag_spats.run))
    print("{} total diffs.".format(sum(map(len, all_lists))))
def tmut_case():
    """Debug helper: run one hard-coded pair through a mutation-counting run.

    Configures a non-cotrans ``Spats`` with the 'find_partial' algorithm,
    mutation counting and one allowed target error, and loads targets from a
    developer-local path. It then processes the active case ``c`` and prints
    the diagram, followed by ``target: site / end mutations`` or the failure.
    """
    from spats_shape_seq import Spats
    from spats_shape_seq.db import PairDB
    from spats_shape_seq.diagram import diagram
    bp = "/Users/jbrink/mos/tasks/1RwIBa/tmp/mutsl/"
    spats = Spats(cotrans = False)
    spats.run.cotrans_linker = 'CTGACTCGGGCACCAAGGAC'
    spats.run.count_mutations = True
    spats.run.algorithm = "find_partial"
    spats.run.allowed_target_errors = 1
    spats.run.adapter_b = "AGATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG"
    spats.run._process_all_pairs = True
    spats.run.writeback_results = True
    spats.run.num_workers = 1
    spats.run.result_set_name = "mut"
    spats.addTargets(bp + "mut_single.fa")

    from spats_shape_seq.pair import Pair
    pair = Pair()

    # Alternate cases kept for manual debugging (uncomment one to use):
    #c = [ 'GAATGTCCTTGGTGCCCGAGTCAGTCCTTGGTGCCCGAGTCAGTCCTTGGTTCCCGAGTCACTCCTTTGTTCCCC', 'AGGACTGACTCGGGCACCAAGGACTTTCTCGTTCACCTATTTCTTTCTCTTCCCCCTTTTTCTTTCTCTTTCTCC' ]
    #c = [ 'GAGCGTCCTTGGTGCCCGAGTCAGATGCCGACCCGGGTGGGGGCCCTGCCAGCTACATCCCGGCACACGCGTCAT', 'TAGGTCAGGTCCGGAAGGAAGCAGCCAAGGCAGATGACGCGTGTGCCGGGATGTAGCTGGCAGGGCCCCCACCCG' ]
    #c = [ 'GAATGTCCTTGGTGCCCGAGTCAGGACACGCGTCATCTGCCTTGGCTGCTTCCTTCCGGACCTGACCTGGTAAAC', 'ATCGGGGGCTCTGTTGGTTCCCCCGCAACGCTACTCTGTTTACCAGGTCAGGTCCGGAAGGAAGCAGCCAAGTCA' ]
    #c = [ 'AGGCGTCCTTGGTGCCCGAGTCAGCCTTGGCTGCTTCCTTCCGGACCTGACCTGGTAAACAGAGTAGCGTTGCGG', 'ATCGGGGGCTCTGTTGGTTCCCCCGCAACGCTACTCTGTTTACCAGGTCAGGTCCGGAAGGAAGCAGCCAAGTCT' ]
    #c = [ 'TTCAGTCCTTGGTGCCCGAGTCAGCCAGCTACATCCCGGCACACGCGTCATCTGCCTTGGCTGCTTCCTTCCGGA', 'AGGTCAGATCCGGAAGGAAGCAGCCAAGGCAGATGACGCGTGTGCCGGGATGTAGCTGGCTGACTCGGGCACCAA' ]
    #c = [ 'AAATGTCCTTGGTGCCCGAGTCAGATCTGCCTTAAGATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGA', 'TAAGGCAGATCTGACTCGGGCACCAAGGACATTTAGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCG' ]
    #c = [ 'CTCAGTCCTTGGTGCCCGAGTCAGTGAGCTAGATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTC', 'AGCTCACTGACTCGGGCACCAAGGACTGAGAGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGG' ]
    #c = [ 'AAGCGTCCTTGGTGCCCGAGTCAGTGGAGGTAGATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCT', 'ACCTCCACTGACTCGGGCACCAAGGACGCTTAGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTG' ]
    #c = [ 'TCCGGTCCTTGGTGCCCGAGTCAGATGTAGATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGT', 'ACATCTGACTCGGGCACCAAGGACCGGAAGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTC' ]
    #c = [ 'TTTAAGTCCTTGGTGCCCGAGTCAGGTCATCTGCCTTGGCTGCTTCCTTCCGGACCTGACCTGGTAAACAGAGTA', 'TACTCTGTTTACCAGGTCAGGTCCGGAAGGAAGCAGCCAAGGCAGATGACCTGACTCGGGCACCAAGGACTTAAA' ]
    #c = [ 'TTCACAACAAGAATTGGGACAACTCCAGTGAAAAGTTCTTCTCCTTTGCTCATCATTAACCTCCTGAATCACTAT', 'GGACAAGCAATGCTTACCTTGATGTTGAACTTTTGAATAGTGATTCAGGAGGTTAATGATGAGCAAAGGAGAAGA' ]
    #c = [ 'AGATCAACAAGAATTAGGACAACTCCAGTGAAAAGTTCTTCTCCTTTGCTCATCATTAACCTCCTGAATCACTAT', 'ACAAGCAATGCTTGCCTTGATGTTGAACTTTTGAATAGTGATTCAGGAGGTTAATGATGAGCAAAGGAGAAGAAC' ]
    #c = [ 'AAATCAACAAGAATTGGGACAACTCCAGTGAAAAGTTCTTCTCCTTTGCTCATCATTAACCTCCTGAATCACTAT', 'AATAGTGATTCAGGAGGTTAATGATGAGCAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGATT' ]
    #c = [ 'TCCGCAACAAGAATTGGGACAACTCCAGTGAAAAGTTCTTCTCCTTTGCTCATCATTAACCTCCTGAATCACTAT', 'ATAGTGATTCAGGAGGTTAATGATGAGCAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGCGGA' ]
    #c = [ 'TCCACAACAAGAATTGGGACAACTCCAGTGAAAAGTTCTTCTCATTTGCTCATCATTAACCTCCTGAATCACTAT', 'GGACAAGCAATGCTTGCCTTGATGTTGAACTTTTGAATAGTGATTCAGGAGGTTAATGATGAGCAAAGGAGAAGA' ]
    #c = [ 'GGGTCAACAAGAATTGGGACAACTCCAGTGAAAAGTTCTTCTCCTTTGCTCATCATTTAGATCGGAAGAGCACAC', 'AAATGATGAGCAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGACCCAGATCGGAAGAGCGTCG' ]
    c = ['GAACCAACAAGAATTGGGACAACTCCAGTGAAAGGTTCTTCTCCTTTGCTCATCATTAACCTCCTGAAGATCGGA', 'TCAGGAGGTTAATGATGAGCAAAGGAGAAGAACCTTTCACTGGAGTTGTCCCAATTCTTGTTGGTTCAGATCGGA']
    #c = [ 'CCTACAACAAGAATTGGGACAACTCCAGTGAGAAGTTCTTCTCCTTTGCTCATCATTAAGATCGGAAGAGCACAC', 'TAATGATGAGCAAAGGAGAAGAACTTCTCACTGGAGTTGTCCCAATTCTTGTTGTAGGAGATCGGAAGAGCGTCG' ]
    #c = [ 'CTTGCAACAAGAATTGGGACAACTCCAGTGAAAAGTTCTTCTCCTTTGCTCATCTTTAACCTCCTGAATCACTAA', 'TAGTGATTCAGGAGGTTAATGATGAGCAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGCAAGA' ]

    pair.set_from_data('x', c[0], c[1])
    spats.process_pair(pair)
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(diagram(pair, spats.run))
    if pair.has_site:
        print("{}: {} / {} {}".format(pair.target.name, pair.site, pair.end, pair.mutations))
    else:
        print("FAIL: {}".format(pair.failure))
def test_tags():
    """Debug helper: tag a handful of read pairs and print the tags.

    Installs ``TagProcessor`` as the run's processor class, registers the
    "5s" target and four tag targets (the 5s sequence, its listed reverse
    complement, the reverse complement of ``adapter_t``, and ``adapter_b``),
    then processes each case and prints both reads with their tags.

    A single ``Pair`` instance is reused for every case.
    """
    bp = "/Users/jbrink/mos/tasks/1RwIBa/tmp/5sq_dev/"
    from spats_shape_seq import Spats
    s = Spats()
    from spats_shape_seq.tag import TagProcessor
    s.run._processor_class = TagProcessor
    #from spats_shape_seq.target import Targets
    #s.addTargets(bp + "5S.fa")
    s.addTarget("5s", "GGATGCCTGGCGGCCGTAGCGCGGTGGTCCCACCTGACCCCATGCCGAACTCAGAAGTGAAACGCCGTAGCGCCGATGGTAGTGTGGGGTCTCCCCATGCGAGAGTAGGGAACTGCCAGGCATCTGACTCGGGCACCAAGGAC")
    #s.addTarget("rc(5s)", "GTCCTTGGTGCCCGAGTCAGATGCCTGGCAGTTCCCTACTCTCGCATGGGGAGACCCCACACTACCATCGGCGCTACGGCGTTTCACTTCTGAGTTCGGCATGGGGTCAGGTGGGACCACCGCGCTACGGCCGCCAGGCATCC")
    #s.addTarget("adapter_t", s.run.adapter_t)
    #s.addTarget("adapter_b", s.run.adapter_b)
    #s._targets._index_word_length = 8
    #s._targets._minimum_length = 8
    #s.addTarget("adapter_t_rc", reverse_complement(s.run.adapter_t))
    #s.addTarget("adapter_b_rc", reverse_complement(s.run.adapter_b))

    p = s._processor
    p.addTagTarget("5s", "GGATGCCTGGCGGCCGTAGCGCGGTGGTCCCACCTGACCCCATGCCGAACTCAGAAGTGAAACGCCGTAGCGCCGATGGTAGTGTGGGGTCTCCCCATGCGAGAGTAGGGAACTGCCAGGCATCTGACTCGGGCACCAAGGAC")
    p.addTagTarget("5s_rc", "GTCCTTGGTGCCCGAGTCAGATGCCTGGCAGTTCCCTACTCTCGCATGGGGAGACCCCACACTACCATCGGCGCTACGGCGTTTCACTTCTGAGTTCGGCATGGGGTCAGGTGGGACCACCGCGCTACGGCCGCCAGGCATCC")
    from spats_shape_seq.util import reverse_complement
    p.addTagTarget("adapter_t_rc", reverse_complement(s.run.adapter_t))
    p.addTagTarget("adapter_b", s.run.adapter_b)

    from spats_shape_seq.pair import Pair
    cases = [
        ["1101:20069:1063", "TTTAGTCCTTGGTGCCCGAGTCAGATGCCTGGCAG", "TCCCACCTGACCCCATGCCGAACTCAGAAGTGAAA"],
        ["1101:11562:1050", "AAACGTCCTTGGTGCCCGAGTCAGATGCCTGGCAG", "CCACCTGACCCCATGCCGAACTCAGAAGTGAAACG"],
        ["21189", "TTTGGTCCTTGGTGCCCGAGTCAGAGATCGGAAGA", "CTGACTCGGGCACCAAGGACCAAAAGATCGGAAGA"],
        ["1101:12888:8140", "GGATGTCCTTGGTGCCCGAGTCAGATGCCAGATCG", "GGCATCTGACTCGGGCACCAAGGACATACAGATCG"],
        ["18333", "GAGTGTCCTTGGTGCCCGAGTCAGTGGTAGATCGG", "ACCACTGACTCGGGCACCAAGGACACTCAGATCGG"],
    ]
    pair = Pair()
    for case in cases:
        pair.set_from_data(case[0], case[1], case[2])
        s.process_pair(pair)
        # Single-argument print(...) behaves identically on Python 2 and 3.
        print(pair.r1.original_seq)
        print(pair.r1.tags)
        print(pair.r2.original_seq)
        print(pair.r2.tags)
        print("-----------------------------")
def align_pairs():
    """Debug helper: align one hard-coded read pair against a single target.

    Indexes one target, masks the pair from R1, locates both reads in the
    target, then aligns R2 (with a suffix built from the masked R1 prefix and
    the reverse-complemented top adapter) and R1. R1 is right-trimmed first
    when the computed adapter trim is positive. Ends the process with
    ``exit(0)``.
    """
    from spats_shape_seq.pair import Pair
    from spats_shape_seq.target import Targets
    from spats_shape_seq.util import reverse_complement, AlignmentParams
    from spats_shape_seq.mask import Mask, match_mask_optimized, base_similarity_ind

    target_seq = "GGACCCGATGCCGGACGAAAGTCCGCGCATCAACTATGCCTCTACCTGCTTCGGCCGATAAAGCCGACGATAATACTCCCAAAGCCC"  # HairpinC_SS2
    r1_seq = "GGGTGAGCGTGCTTTGGGAGTATTATCGTCGGCTTTATCGGCCGAAGCAGGTAGTGCATAGTTGATGCTCGGACTTTCG"
    r2_seq = "GGACCCGATGCCGGACGAAAGTCCGAGCATCAACTATGCCCTACCTGCTTCGGCCGATAAAGCCAAAAGACGATAAT"

    pair = Pair()
    pair.set_from_data("TEST_PAIR", r1_seq, r2_seq)

    target_index = Targets()
    target_index.minimum_match_length = 10
    target_index.addTarget("TEST_TARGET", target_seq, 0)
    target_index.index()

    mask = match_mask_optimized(pair.r1.original_seq)
    assert (mask)
    pair.set_mask(Mask(mask))

    r1_target = pair.r1.find_in_targets(target_index)
    pair.target = pair.r2.find_in_targets(target_index, force_target=r1_target)
    assert (pair.matched)

    masklen = pair.mask.length()
    adapter_t = "AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT"
    masked_prefix_rc = reverse_complement(pair.r1.original_seq[:masklen])
    r2suffix = masked_prefix_rc + reverse_complement(adapter_t)

    def simfn(nt1, nt2):
        return base_similarity_ind(nt1, nt2, 3, 2, 1.5)

    align_params = AlignmentParams(simfn, 5, 1)
    pair.r2.align_with_target(pair.target, align_params, r2suffix)

    r2_adapter_trim = max(0, pair.r2.match_index + pair.r2.match_len - pair.target.n)
    r1_adapter_trim = pair.r1.seq_len - (pair.target.n - pair.r2.match_index)
    if r1_adapter_trim > 0:
        pair.r1.rtrim += r1_adapter_trim
        pair.r1.match_start -= r1_adapter_trim
    # NOTE(review): placement outside the `if` is reconstructed from
    # whitespace-collapsed source; confirm.
    pair.r1.align_with_target(pair.target, align_params)
    exit(0)
def tag_test():
    """Debug helper: process the pair in ``/tmp/spats_test.json``.

    Configures a cotrans ``Spats`` with the 'find_partial' algorithm and loads
    targets from a developer-local path. Prints the pair, then either
    ``target: site / right`` or the failure reason.
    """
    from spats_shape_seq import Spats
    s = Spats()
    s.run.cotrans = True
    s.run.cotrans_linker = 'CTGACTCGGGCACCAAGGAC'
    s.run.algorithm = "find_partial"
    bp = "/Users/jbrink/mos/tasks/1RwIBa/tmp/datasets/cotrans/"
    s.addTargets(bp + "cotrans_single.fa")

    from spats_shape_seq.pair import Pair
    pair = Pair()
    import cjb.util
    d = cjb.util.jsonAtPath("/tmp/spats_test.json")
    pair.set_from_data(str(d['id']), str(d['r1']), str(d['r2']))

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("{}\n{} / {}".format(pair.identifier, pair.r1.original_seq, pair.r2.original_seq))
    s.process_pair(pair)
    if pair.has_site:
        print("{}: {} / {}".format(pair.target.name, pair.site, pair.right))
    else:
        print("FAIL: {}".format(pair.failure))
def align_pairs():
    """Debug helper: align one hard-coded read pair against a single target.

    Indexes one target, masks the pair from R1, locates both reads in the
    target, then aligns R2 (with a suffix built from the masked R1 prefix and
    the reverse-complemented top adapter) and R1. R1 is right-trimmed first
    when the computed adapter trim is positive. Ends the process with
    ``exit(0)``.
    """
    from spats_shape_seq.pair import Pair
    from spats_shape_seq.target import Targets
    from spats_shape_seq.util import reverse_complement, AlignmentParams
    from spats_shape_seq.mask import Mask, match_mask_optimized, base_similarity_ind

    target_seq = "GGACCCGATGCCGGACGAAAGTCCGCGCATCAACTATGCCTCTACCTGCTTCGGCCGATAAAGCCGACGATAATACTCCCAAAGCCC"  # HairpinC_SS2
    r1_seq = "GGGTGAGCGTGCTTTGGGAGTATTATCGTCGGCTTTATCGGCCGAAGCAGGTAGTGCATAGTTGATGCTCGGACTTTCG"
    r2_seq = "GGACCCGATGCCGGACGAAAGTCCGAGCATCAACTATGCCCTACCTGCTTCGGCCGATAAAGCCAAAAGACGATAAT"

    pair = Pair()
    pair.set_from_data("TEST_PAIR", r1_seq, r2_seq)

    target_index = Targets()
    target_index.minimum_match_length = 10
    target_index.addTarget("TEST_TARGET", target_seq, 0)
    target_index.index()

    mask = match_mask_optimized(pair.r1.original_seq)
    assert(mask)
    pair.set_mask(Mask(mask))

    r1_target = pair.r1.find_in_targets(target_index)
    pair.target = pair.r2.find_in_targets(target_index, force_target = r1_target)
    assert(pair.matched)

    masklen = pair.mask.length()
    adapter_t = "AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT"
    masked_prefix_rc = reverse_complement(pair.r1.original_seq[:masklen])
    r2suffix = masked_prefix_rc + reverse_complement(adapter_t)

    def simfn(nt1, nt2):
        return base_similarity_ind(nt1, nt2, 3, 2, 1.5)

    align_params = AlignmentParams(simfn, 5, 1)
    pair.r2.align_with_target(pair.target, align_params, r2suffix)

    r2_adapter_trim = max(0, pair.r2.match_index + pair.r2.match_len - pair.target.n)
    r1_adapter_trim = pair.r1.seq_len - (pair.target.n - pair.r2.match_index)
    if r1_adapter_trim > 0:
        pair.r1.rtrim += r1_adapter_trim
        pair.r1.match_start -= r1_adapter_trim
    # NOTE(review): placement outside the `if` is reconstructed from
    # whitespace-collapsed source; confirm.
    pair.r1.align_with_target(pair.target, align_params)
    exit(0)
def test_find_partial_weird_case(self):
    """Process one unusual read pair and print its site and failure.

    Makes no assertions; the output is for manual inspection.
    """
    r1_seq = 'CTCAGTCCTTGGTGCCCGAGTCAGGATCGGAAGAG'
    r2_seq = 'TGACTCGGGCACCAAAGACTGAGAGATCGGAAGAG'
    read_pair = Pair()
    read_pair.set_from_data("x", r1_seq, r2_seq)
    self.spats.process_pair(read_pair)
    summary = "{} / {}".format(read_pair.site, read_pair.failure)
    print(summary)
def processed_pair(self, matched_pair):
    """Rebuild a ``Pair`` from ``matched_pair`` and run detailed processing.

    The new pair is populated from ``matched_pair``'s identifier, r1, r2 and
    multiplicity, then passed to ``self.ui.processor.process_pair_detail``.

    Returns:
        The newly created, processed ``Pair``.
    """
    rebuilt = Pair()
    rebuilt.set_from_data(
        matched_pair.identifier,
        matched_pair.r1,
        matched_pair.r2,
        matched_pair.multiplicity,
    )
    processor = self.ui.processor
    processor.process_pair_detail(rebuilt)
    return rebuilt
def pair_for_case(self, case):
    """Create a ``Pair`` from the first three entries of a positional case.

    ``case[0]``, ``case[1]`` and ``case[2]`` are passed to
    ``Pair.set_from_data`` in that order.

    Returns:
        The populated ``Pair``.
    """
    identifier, r1_seq, r2_seq = case[0], case[1], case[2]
    result = Pair()
    result.set_from_data(identifier, r1_seq, r2_seq)
    return result