def addv102():
    """Command-line entry point wrapping ``PairDB.add_v102_comparison``.

    Positional command-line arguments (read from ``sys.argv``):

    * ``sys.argv[2]`` -- path of the pair database to open
    * ``sys.argv[3]`` -- targets path, passed through unchanged
    * ``sys.argv[4]`` -- output path, passed through unchanged

    Raises ``IndexError`` when fewer arguments are supplied.
    """
    # Read all arguments before importing, so a missing argument fails first.
    db_path, targets_path, out_path = sys.argv[2], sys.argv[3], sys.argv[4]

    from spats_shape_seq.db import PairDB

    PairDB(db_path).add_v102_comparison(targets_path, out_path)
def run_dataset(self, case, algorithm):
    """Re-run one stored dataset and assert it matches its validation set.

    ``test/<case>/ds.spats`` is copied to a scratch file, processed with
    ``algorithm`` into the result set ``"test"``, and compared against the
    stored ``"test_validation"`` result set. The scratch file is removed
    afterwards whether or not the comparison succeeded.

    The method returns without asserting anything when the native
    algorithm is requested for a run that is not cotrans, or when the
    processor reports that it does not exist.
    """
    case_dir = "test/{}/".format(case)
    scratch_path = case_dir + "test.spats.tmp"
    try:
        shutil.copyfile(case_dir + "ds.spats", scratch_path)
        pair_db = PairDB(scratch_path)
        spats = Spats()
        pair_db.load_run(spats.run)
        if algorithm == "native" and not spats.run.cotrans:
            return

        spats.run.writeback_results = True
        spats.run.result_set_name = "test"
        spats.run.algorithm = algorithm
        spats.run.quiet = True
        spats.loadTargets(pair_db)
        if not spats._processor.exists():
            # just ignore the native test if it's not available
            self.assertEqual("native", algorithm)
            return

        # small batch_size just to exercise multiprocessing code
        spats.process_pair_db(pair_db, batch_size=1024)

        # Count the mismatches; only the last one is kept for the message.
        last_diff = None
        num_diffs = 0
        for row in pair_db.differing_results("test", "test_validation"):
            last_diff = str(list(map(str, row)))
            num_diffs += 1

        failure_text = "{} differing results: {} / {} \n{}".format(
            num_diffs, case, algorithm, last_diff)
        self.assertEqual(0, num_diffs, failure_text)
    finally:
        if os.path.exists(scratch_path):
            os.remove(scratch_path)
def makedb():
    """Command-line entry point that builds a pair database.

    Positional command-line arguments (read from ``sys.argv``):

    * ``sys.argv[2]`` -- path of the pair database to open
    * ``sys.argv[3]`` -- targets path
    * ``sys.argv[4]`` -- R1 path
    * ``sys.argv[5]`` -- R2 path

    The database's ``show_progress_every`` is set to 200000 before
    ``load_and_index`` is called with the three input paths.
    """
    db_path, targets_path = sys.argv[2], sys.argv[3]
    r1_path, r2_path = sys.argv[4], sys.argv[5]

    from spats_shape_seq.db import PairDB

    pair_db = PairDB(db_path)
    pair_db.show_progress_every = 200000
    pair_db.load_and_index(targets_path, r1_path, r2_path)
def d5s_writeback_run():
    """Process the 5S development pair database with results written back.

    Developer script: all paths are hard-coded below ``work_dir``. The 5S
    targets file is registered both with the pair database and with the
    ``Spats`` instance, and the results are stored under the result set
    name ``"pure_python"``.
    """
    work_dir = "/Users/jbrink/mos/tasks/1RwIBa/tmp/"
    targets_fasta = work_dir + "5s/5S.fa"

    from spats_shape_seq.db import PairDB
    db = PairDB(work_dir + "dev_out/pairs.db")
    db.add_targets_table(targets_fasta)

    from spats_shape_seq import Spats
    spats = Spats()
    spats.addTargets(targets_fasta)
    # NOTE(review): these two options are set on the Spats object itself,
    # whereas other entry points in this file set them on ``s.run`` --
    # confirm that this is what the targeted Spats version expects.
    spats.writeback_results = True
    spats.result_set_name = "pure_python"
    spats.process_pair_db(db)
def rdiff_func(db_path, rs1_name, rs2_name, diag_spats=None):
    """Print a report of the rows on which two result sets disagree.

    :param db_path: path of the pair database to open.
    :param rs1_name: name of the first result set.
    :param rs2_name: name of the second result set.
    :param diag_spats: optional object used to re-process one sample pair
        per reason and print its diagram; skipped when falsy.

    The result counts of both sets are printed first; if either count is
    falsy the function prints ``** Abort.`` and calls ``exit(1)``.
    Differing rows are then split into three buckets and, inside each
    bucket, grouped by reason. At most ten rows are printed per reason,
    followed by that reason's total.
    """
    from spats_shape_seq.db import PairDB
    from spats_shape_seq.diagram import diagram
    from spats_shape_seq.pair import Pair

    pair_db = PairDB(db_path)
    count1 = pair_db.num_results(rs1_name)
    count2 = pair_db.num_results(rs2_name)
    print("{}: {} results / {}: {} results".format(rs1_name, count1, rs2_name, count2))
    if not (count1 and count2):
        print("** Abort.")
        exit(1)

    print("Diffs:")
    # Bucket names follow the original "ours"/"theirs" wording; presumably
    # a -1 in field 4 (resp. 9) marks a row with no result in the first
    # (resp. second) set -- confirm against PairDB.differing_results.
    ours_only, theirs_only, differences = [], [], []
    for row in pair_db.differing_results(rs1_name, rs2_name):
        if row[4] == -1:
            assert row[9] != -1
            theirs_only.append(row)
        elif row[9] == -1:
            ours_only.append(row)
        else:
            differences.append(row)
    buckets = [ours_only, theirs_only, differences]

    row_template = " {}:{} s{}m{} ({}) -- {}:{} s{}m{} ({}) ([ '{}', '{}', '{}', {}, {}, [ {} ] ])"
    for bucket in buckets:
        # Group by the first non-empty reason field (7, then 12).
        by_reason = {}
        for row in bucket:
            reason = row[7] or row[12] or "different values"
            assert reason
            by_reason.setdefault(reason, []).append(row)

        for reason, rows in by_reason.items():
            shown = rows[:10]
            for row in shown:
                print(row_template.format(
                    row[3] or 'x', row[4], row[5], row[6], row[7] or "OK",
                    row[8] or 'x', row[9], row[10], row[11], row[12] or "OK",
                    row[0], row[1], row[2], row[4], row[5],
                    "" if -1 == row[6] else row[6]))
            if rows:
                print("... {} total.".format(len(rows)))
            if diag_spats:
                # Diagram the last row that was printed for this reason.
                sample = shown[-1]
                pair = Pair()
                pair.set_from_data(str(sample[0]), str(sample[1]), str(sample[2]))
                diag_spats.process_pair(pair)
                print(diagram(pair, diag_spats.run))

    print("{} total diffs.".format(sum(len(bucket) for bucket in buckets)))
def rdiff_func(db_path, rs1_name, rs2_name, diag_spats=None):
    """Report the differences between two result sets of a pair database.

    :param db_path: path of the pair database to open.
    :param rs1_name: name of the first result set.
    :param rs2_name: name of the second result set.
    :param diag_spats: when truthy, used to process one sample pair per
        reason so that its diagram can be printed.

    Prints the number of results in each set and stops with ``exit(1)``
    when either number is falsy. Otherwise the differing rows are split
    into "ours only", "theirs only" and "different" groups, grouped again
    by reason, and printed (at most ten rows per reason, plus a total).
    """
    from spats_shape_seq.db import PairDB
    from spats_shape_seq.diagram import diagram
    from spats_shape_seq.pair import Pair

    database = PairDB(db_path)
    total1 = database.num_results(rs1_name)
    total2 = database.num_results(rs2_name)
    print("{}: {} results / {}: {} results".format(rs1_name, total1, rs2_name, total2))
    if not (total1 and total2):
        print("** Abort.")
        exit(1)
    print("Diffs:")

    # NOTE(review): -1 in field 4 / field 9 is treated as "no result on
    # that side"; the meaning of the fields is not visible here -- confirm.
    ours_only = []
    theirs_only = []
    differences = []
    for diff in database.differing_results(rs1_name, rs2_name):
        if diff[4] == -1:
            assert diff[9] != -1
            target = theirs_only
        elif diff[9] == -1:
            target = ours_only
        else:
            target = differences
        target.append(diff)
    all_lists = [ours_only, theirs_only, differences]

    line_format = " {}:{} s{}m{} ({}) -- {}:{} s{}m{} ({}) ([ '{}', '{}', '{}', {}, {}, [ {} ] ])"
    for group in all_lists:
        grouped = {}
        for diff in group:
            why = diff[7] or diff[12] or "different values"
            assert why
            grouped.setdefault(why, []).append(diff)

        for why, diffs in grouped.items():
            printed = diffs[:10]
            for diff in printed:
                fields = (
                    diff[3] or 'x', diff[4], diff[5], diff[6], diff[7] or "OK",
                    diff[8] or 'x', diff[9], diff[10], diff[11], diff[12] or "OK",
                    diff[0], diff[1], diff[2], diff[4], diff[5],
                    "" if -1 == diff[6] else diff[6],
                )
                print(line_format.format(*fields))
            if len(diffs) > 0:
                print("... {} total.".format(len(diffs)))
            if diag_spats:
                # The sample is the last row printed above for this reason.
                last = printed[-1]
                pair = Pair()
                pair.set_from_data(str(last[0]), str(last[1]), str(last[2]))
                diag_spats.process_pair(pair)
                print(diagram(pair, diag_spats.run))

    print("{} total diffs.".format(sum(len(group) for group in all_lists)))
def make_test_dataset():
    """Build the ``ds.spats`` test dataset with a ``test_validation`` set.

    Developer script with hard-coded paths: opens ``ds.spats`` in the data
    directory, adds the targets table, parses the ``med_R1.fq`` /
    ``med_R2.fq`` pair, processes it with the ``find_partial`` algorithm
    on a single worker, and finally stores the run and its counters
    (under the name ``'spats'``) in the database.
    """
    data_dir = "/Users/jbrink/mos/tasks/1RwIBa/tmp/datasets/cotrans/data/"

    from spats_shape_seq import Spats
    from spats_shape_seq.db import PairDB

    dataset = PairDB(data_dir + "ds.spats")
    dataset.add_targets_table(data_dir + "../cotrans_single.fa")
    dataset.parse(data_dir + "med_R1.fq", data_dir + "med_R2.fq")

    spats = Spats(cotrans=True)
    # Applied in this exact order, one attribute at a time.
    run_options = (
        ("num_workers", 1),
        ("writeback_results", True),
        ("_process_all_pairs", True),
        ("algorithm", "find_partial"),
        ("result_set_name", "test_validation"),
    )
    for option, value in run_options:
        setattr(spats.run, option, value)

    spats.process_pair_db(dataset)
    dataset.store_run(spats.run)
    dataset.store_counters('spats', spats.counters)
def make_test_dataset():
    """Create the cotrans validation dataset used by the dataset tests.

    Developer script: every path is derived from the hard-coded ``root``
    directory. After parsing the two FASTQ files into ``ds.spats``, the
    pairs are processed into the result set ``"test_validation"`` and the
    run configuration and counters are stored alongside them.
    """
    root = "/Users/jbrink/mos/tasks/1RwIBa/tmp/datasets/cotrans/data/"
    db_file = root + "ds.spats"
    targets_file = root + "../cotrans_single.fa"
    r1_file = root + "med_R1.fq"
    r2_file = root + "med_R2.fq"

    from spats_shape_seq import Spats
    from spats_shape_seq.db import PairDB

    db = PairDB(db_file)
    db.add_targets_table(targets_file)
    db.parse(r1_file, r2_file)

    processor = Spats(cotrans=True)
    processor.run.num_workers = 1
    processor.run.writeback_results = True
    processor.run._process_all_pairs = True
    processor.run.algorithm = "find_partial"
    processor.run.result_set_name = "test_validation"
    processor.process_pair_db(db)

    # Persist the configuration and counters next to the results.
    db.store_run(processor.run)
    db.store_counters('spats', processor.counters)
def dbrun():
    """Command-line entry point that processes an existing pair database.

    Positional command-line arguments (read from ``sys.argv``):

    * ``sys.argv[2]`` -- path of the pair database to open
    * ``sys.argv[3]`` -- name of the result set to write results under

    Results are written back to the database.
    """
    db_path, run_name = sys.argv[2], sys.argv[3]

    from spats_shape_seq import Spats
    from spats_shape_seq.db import PairDB

    pair_db = PairDB(db_path)
    spats = Spats()
    spats.run.writeback_results = True
    spats.run.result_set_name = run_name
    # Left disabled, as in the original:
    # spats.run.resume_processing = True
    spats.process_pair_db(pair_db)
class TestDatasets(unittest.TestCase):
    """Run each stored dataset through each algorithm and check the results."""

    def test_datasets(self):
        """Run every (case, algorithm) combination.

        Raises ``nose.SkipTest`` as soon as the ``native`` algorithm is
        reached while the ``SKIP_SLOW_TESTS`` environment variable is set
        to a non-empty value.
        """
        combinations = ((case, alg) for case in cases for alg in algorithms)
        for case, alg in combinations:
            if alg == 'native' and os.environ.get('SKIP_SLOW_TESTS'):
                raise nose.SkipTest('skipping slow tests')
            self.run_dataset(case, alg)
        print("Ran {} datasets.".format(len(cases)))

    def run_dataset(self, case, algorithm):
        """Re-run one stored dataset and assert it matches its validation set.

        ``test/<case>/ds.spats`` is copied to a scratch file, processed
        with ``algorithm`` into the result set ``"test"``, and compared
        against the stored ``"test_validation"`` result set. The scratch
        file is always removed afterwards.

        Returns without asserting anything when the native algorithm is
        requested for a run that is not cotrans, or when the processor
        reports that it does not exist.
        """
        case_dir = "test/{}/".format(case)
        scratch_path = case_dir + "test.spats.tmp"
        try:
            shutil.copyfile(case_dir + "ds.spats", scratch_path)
            pair_db = PairDB(scratch_path)
            spats = Spats()
            pair_db.load_run(spats.run)
            if algorithm == "native" and not spats.run.cotrans:
                return

            spats.run.writeback_results = True
            spats.run.result_set_name = "test"
            spats.run.algorithm = algorithm
            spats.run.quiet = True
            spats.loadTargets(pair_db)
            if not spats._processor.exists():
                # just ignore the native test if it's not available
                self.assertEqual("native", algorithm)
                return

            # small batch_size just to exercise multiprocessing code
            spats.process_pair_db(pair_db, batch_size=1024)

            # Count the mismatches; only the last one ends up in the message.
            last_diff = None
            num_diffs = 0
            for row in pair_db.differing_results("test", "test_validation"):
                last_diff = str(list(map(str, row)))
                num_diffs += 1

            failure_text = "{} differing results: {} / {} \n{}".format(
                num_diffs, case, algorithm, last_diff)
            self.assertEqual(0, num_diffs, failure_text)
        finally:
            if os.path.exists(scratch_path):
                os.remove(scratch_path)
def tquery():
    """Print the outcome of one hard-coded ``results_matching`` query.

    Developer script: opens ``db/pairs.db`` under the hard-coded cotrans
    directory and prints whatever ``results_matching`` returns for the
    arguments ``1``, ``["linker_cotrans", "adapter"]`` and ``["match"]``.
    """
    root = "/Users/jbrink/mos/tasks/1RwIBa/tmp/datasets/cotrans/"

    from spats_shape_seq.db import PairDB

    db = PairDB(root + "db/pairs.db")
    first_tags = ["linker_cotrans", "adapter"]
    second_tags = ["match"]
    matches = db.results_matching(1, first_tags, second_tags)
    print(matches)
def tags():
    """Tag a sample of the cotrans dataset and print the tag counts.

    Developer script with hard-coded paths. The pair database is wiped and
    re-populated from a 100000-pair sample of the two FASTQ files, then
    processed with ``TagProcessor`` into the result set ``"tags"``. Tag
    targets are registered for every database target and its reverse
    complement, for the adapters, and (for cotrans runs) for the linker.
    """
    root = "/Users/jbrink/mos/tasks/1RwIBa/tmp/datasets/cotrans/"

    from spats_shape_seq.db import PairDB
    pair_db = PairDB(root + "db/pairs.db")

    reparse = True  # developer switch: rebuild the database contents
    if reparse:
        print("Parsing to db...")
        pair_db.wipe()
        pair_db.add_targets_table(root + "cotrans_single.fa")
        r1_path = root + "data/EJS_6_F_10mM_NaF_Rep1_GCCAAT_R1.fastq"
        r2_path = root + "data/EJS_6_F_10mM_NaF_Rep1_GCCAAT_R2.fastq"
        pair_db.parse(r1_path, r2_path, sample_size=100000)

    from spats_shape_seq import Spats
    from spats_shape_seq.tag import TagProcessor
    from spats_shape_seq.util import reverse_complement

    spats = Spats()
    # Options applied before the targets are loaded, in this order.
    before_load = (
        ("_processor_class", TagProcessor),
        ("writeback_results", True),
        ("result_set_name", "tags"),
        ("num_workers", 1),
        ("cotrans", True),
        ("cotrans_linker", 'CTGACTCGGGCACCAAGGAC'),
    )
    for option, value in before_load:
        setattr(spats.run, option, value)
    spats.loadTargets(pair_db)

    # Options applied only after the targets are loaded.
    after_load = (
        ("allow_indeterminate", True),
        ("allowed_target_errors", 2),
        ("allowed_adapter_errors", 2),
    )
    for option, value in after_load:
        setattr(spats.run, option, value)

    processor = spats._processor
    for target in pair_db.targets():
        name, sequence = target[0], target[1]
        processor.addTagTarget(name, sequence)
        processor.addTagTarget(name + "_rc", reverse_complement(sequence))
    processor.addTagTarget("adapter_t_rc", reverse_complement(spats.run.adapter_t))
    processor.addTagTarget("adapter_b", spats.run.adapter_b)
    if spats.run.cotrans:
        linker = spats.run.cotrans_linker
        processor.addTagTarget("linker_cotrans", linker)
        processor.addTagTarget("linker_cotrans_rc", reverse_complement(linker))

    spats.process_pair_db(pair_db)
    result_set_id = pair_db.result_set_id_for_name(spats.run.result_set_name)
    pair_db.count_tags(result_set_id)
    print(pair_db.tag_counts(result_set_id))
def tags():
    """Run the tag processor over a sample of the cotrans data.

    Developer script: paths are hard-coded under ``base``. The database is
    wiped, re-parsed from a 100000-pair sample, processed with
    ``TagProcessor`` under the result set name ``"tags"``, and the
    resulting tag counts are printed.
    """
    base = "/Users/jbrink/mos/tasks/1RwIBa/tmp/datasets/cotrans/"

    from spats_shape_seq.db import PairDB
    db = PairDB(base + "db/pairs.db")

    if True:  # always rebuilds the database contents
        print("Parsing to db...")
        db.wipe()
        db.add_targets_table(base + "cotrans_single.fa")
        fastq_stem = base + "data/EJS_6_F_10mM_NaF_Rep1_GCCAAT_"
        db.parse(fastq_stem + "R1.fastq", fastq_stem + "R2.fastq", sample_size=100000)

    from spats_shape_seq import Spats
    from spats_shape_seq.tag import TagProcessor
    from spats_shape_seq.util import reverse_complement

    tagger = Spats()
    tagger.run._processor_class = TagProcessor
    tagger.run.writeback_results = True
    tagger.run.result_set_name = "tags"
    tagger.run.num_workers = 1
    tagger.run.cotrans = True
    tagger.run.cotrans_linker = 'CTGACTCGGGCACCAAGGAC'
    tagger.loadTargets(db)
    # These three are set only after the targets have been loaded.
    tagger.run.allow_indeterminate = True
    tagger.run.allowed_target_errors = 2
    tagger.run.allowed_adapter_errors = 2

    proc = tagger._processor
    add_tag = proc.addTagTarget
    for entry in db.targets():
        add_tag(entry[0], entry[1])
        add_tag(entry[0] + "_rc", reverse_complement(entry[1]))
    add_tag("adapter_t_rc", reverse_complement(tagger.run.adapter_t))
    add_tag("adapter_b", tagger.run.adapter_b)
    if tagger.run.cotrans:
        add_tag("linker_cotrans", tagger.run.cotrans_linker)
        add_tag("linker_cotrans_rc", reverse_complement(tagger.run.cotrans_linker))

    tagger.process_pair_db(db)
    rs_id = db.result_set_id_for_name(tagger.run.result_set_name)
    db.count_tags(rs_id)
    print(db.tag_counts(rs_id))
def pair_db(self):
    """Access the underlying :class:`.db.PairDB`.

    While ``self._pair_db`` is falsy, a new ``PairDB`` is opened from
    ``self.db_path`` and stored there; a truthy stored value is returned
    as-is.
    """
    cached = self._pair_db
    if cached:
        return cached
    self._pair_db = PairDB(self.db_path)
    return self._pair_db
def tquery():
    """Run a fixed ``results_matching`` query and print what it returns.

    Developer script: the database path is hard-coded. The query is
    ``results_matching(1, ["linker_cotrans", "adapter"], ["match"])``.
    """
    base = "/Users/jbrink/mos/tasks/1RwIBa/tmp/datasets/cotrans/"
    db_file = base + "db/pairs.db"

    from spats_shape_seq.db import PairDB

    pair_db = PairDB(db_file)
    query_args = (1, ["linker_cotrans", "adapter"], ["match"])
    print(pair_db.results_matching(*query_args))
def tmut():
    """Compare the ``find_partial`` and ``lookup`` algorithms on mutation data.

    Developer script with hard-coded paths. The comparison database is
    wiped and re-parsed from the ``mut2`` FASTQ pair, processed once per
    algorithm into the result sets ``mut_find_partial`` and
    ``mut_lookup``, and the two sets are then diffed with ``rdiff_func``,
    using the first ``Spats`` instance for diagrams.
    """
    from spats_shape_seq import Spats
    from spats_shape_seq.db import PairDB
    from spats_shape_seq.diagram import diagram  # not used in this function

    root = "/Users/jbrink/mos/tasks/1RwIBa/tmp/mutsl/"
    db_file = root + "ds_cmp.spats"
    pair_db = PairDB(db_file)

    reparse = True  # developer switch: rebuild the database contents
    if reparse:
        print("Parsing to db...")
        pair_db.wipe()
        pair_db.add_targets_table(root + "mut_single.fa")
        fq_name = "mut2"
        pair_db.parse(root + fq_name + "_R1.fastq", root + fq_name + "_R2.fastq")

    processed = []
    for alg in ("find_partial", "lookup"):
        spats = Spats(cotrans=False)
        # Applied in this exact order, one attribute at a time.
        run_options = (
            ("cotrans_linker", 'CTGACTCGGGCACCAAGGAC'),
            ("count_mutations", True),
            ("algorithm", alg),
            ("allowed_target_errors", 1),
            ("adapter_b", "AGATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG"),
            ("_process_all_pairs", True),
            ("writeback_results", True),
            ("num_workers", 1),
            ("result_set_name", "mut_" + alg),
        )
        for option, value in run_options:
            setattr(spats.run, option, value)

        spats.process_pair_db(pair_db)
        pair_db.store_run(spats.run)
        pair_db.store_counters(spats.run.result_set_name, spats.counters)
        processed.append(spats)

    rdiff_func(db_file, "mut_find_partial", "mut_lookup", diag_spats=processed[0])
def tmut():
    """Process the mutation dataset with two algorithms and diff the results.

    Developer script: every path is hard-coded under ``base``. Each of
    ``find_partial`` and ``lookup`` writes its results to the result set
    ``"mut_" + <algorithm>``; ``rdiff_func`` then reports the differences
    between the two sets.
    """
    from spats_shape_seq import Spats
    from spats_shape_seq.db import PairDB
    from spats_shape_seq.diagram import diagram  # imported but not referenced below

    base = "/Users/jbrink/mos/tasks/1RwIBa/tmp/mutsl/"
    db = PairDB(base + "ds_cmp.spats")

    if True:  # always rebuilds the database contents
        print("Parsing to db...")
        db.wipe()
        db.add_targets_table(base + "mut_single.fa")
        fq_name = "mut2"
        fastq_stem = base + fq_name
        db.parse(fastq_stem + "_R1.fastq", fastq_stem + "_R2.fastq")

    instances = []
    for algorithm_name in ["find_partial", "lookup"]:
        current = Spats(cotrans=False)
        current.run.cotrans_linker = 'CTGACTCGGGCACCAAGGAC'
        current.run.count_mutations = True
        current.run.algorithm = algorithm_name
        current.run.allowed_target_errors = 1
        current.run.adapter_b = "AGATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG"
        current.run._process_all_pairs = True
        current.run.writeback_results = True
        current.run.num_workers = 1
        current.run.result_set_name = "mut_" + algorithm_name
        current.process_pair_db(db)
        # Keep the configuration and counters next to the results.
        db.store_run(current.run)
        db.store_counters(current.run.result_set_name, current.counters)
        instances.append(current)

    rdiff_func(base + "ds_cmp.spats", "mut_find_partial", "mut_lookup",
               diag_spats=instances[0])
def open_spats(self, path):
    """Open the pair database at ``path`` and make it the current one.

    Stores the opened ``PairDB`` on ``self._db``, calls
    ``self._loadDBAndModel()``, records the file's base name in
    ``self.db_name`` and writes ``{"last": path}`` as JSON to
    ``self.last_path``.
    """
    database = PairDB(path)
    self._db = database
    self._loadDBAndModel()
    self.db_name = os.path.split(path)[1]
    last_opened = {"last": path}
    cjb.util.writeJsonToPath(last_opened, self.last_path)