def test_not_redundant_with_heuristic(self):
    """Verify non-redundant probe pairs when the k-mer heuristic prunes first.

    probe.shares_some_kmers uses a default k-mer size of 20, so the
    probe sequences here must be long enough for the heuristic to apply.
    """
    is_red = nrf.redundant_longest_common_substring(
        2, 25, prune_with_heuristic_and_anchor=True)

    # Probes that share no k-mers at all
    p = probe.Probe.from_str('ATCGATCGATCGAAAAAAAAAAAAAAATTTTT')
    q = probe.Probe.from_str('CCGGCCGGCCGGTTTTTTTTTTTTTTTAAAAA')
    self.assertFalse(is_red(p, q))

    # Probes that share k-mers but are still not redundant
    p = probe.Probe.from_str('ATCGATCGATCGAAAAAAAAAAAAAAATTTTT')
    q = probe.Probe.from_str('CCGGCCGGCCGGAAAAAAAAAAAAAAATTTTT')
    self.assertFalse(is_red(p, q))

    # Probes that share k-mers but are still not redundant, with a
    # larger lcf threshold
    is_red = nrf.redundant_longest_common_substring(
        2, 60, prune_with_heuristic_and_anchor=True)
    p = probe.Probe.from_str(('A' * 40 + 'DEF') * 4)
    q = probe.Probe.from_str(('A' * 40 + 'XYZ') * 4)
    self.assertFalse(is_red(p, q))

    # Probes that share k-mers but are still not redundant
    p = probe.Probe.from_str('ATCG' + 'AATT' * 10 + 'CC' + 'GGCC' * 4 +
                             'AT' + 'ATCG')
    q = probe.Probe.from_str('AATT' * 10 + 'GG' + 'GGCC' * 4 + 'CG' +
                             'CCCC' + 'CCGG')
    self.assertFalse(is_red(p, q))
def test_is_redundant_with_heuristic(self):
    """Verify redundant probe pairs when the k-mer heuristic prunes first.

    probe.shares_some_kmers uses a default k-mer size of 20, so the
    probe sequences here must be long enough for the heuristic to apply.
    """
    is_red = nrf.redundant_longest_common_substring(
        2, 25, prune_with_heuristic_and_anchor=True)

    # Probes that share k-mers and are redundant
    p = probe.Probe.from_str('ATCGATCGATCG' + 'A' * 50)
    q = probe.Probe.from_str('CGATAGCTAGAT' + 'A' * 50)
    self.assertTrue(is_red(p, q))

    # Probes that share k-mers and are redundant, with a larger
    # lcf threshold
    is_red = nrf.redundant_longest_common_substring(
        2, 60, prune_with_heuristic_and_anchor=True)
    p = probe.Probe.from_str('ATCG' + 'AATT' * 10 + 'CC' + 'GGCC' * 4 +
                             'AT' + 'ATCG')
    q = probe.Probe.from_str('AATT' * 10 + 'GG' + 'GGCC' * 4 + 'AT' +
                             'CCCC' + 'CCGG')
    self.assertTrue(is_red(p, q))
def main(args):
    """Design probes for the requested dataset and report the result.

    Reads genomes (from a FASTA file on disk or a named catch dataset),
    optionally limits them, runs the configured filter pipeline through
    the probe designer, and either prints a coverage analysis or just
    the number of final probes.
    """
    # Read the FASTA sequences: args.dataset is either a path to a
    # custom FASTA file or the name of a module under catch.datasets
    target = args.dataset
    try:
        if os.path.isfile(target):
            # Process a custom fasta file with sequences
            seqs = [seq_io.read_genomes_from_fasta(target)]
        else:
            dataset = importlib.import_module('catch.datasets.' + target)
            seqs = [seq_io.read_dataset_genomes(dataset)]
    except ImportError:
        raise ValueError("Unknown file or dataset '%s'" % target)

    # The two genome-limiting options are mutually exclusive
    if (args.limit_target_genomes and
            args.limit_target_genomes_randomly_with_replacement):
        raise Exception(("Cannot --limit-target-genomes and "
                         "--limit-target-genomes-randomly-with-replacement at "
                         "the same time"))
    elif args.limit_target_genomes:
        # Keep only the first N genomes in each grouping
        seqs = [genomes[:args.limit_target_genomes] for genomes in seqs]
    elif args.limit_target_genomes_randomly_with_replacement:
        # Sample N genomes uniformly at random, with replacement
        k = args.limit_target_genomes_randomly_with_replacement
        seqs = [random.choices(genomes, k=k) for genomes in seqs]

    # Setup the filters needed for replication; the first stage is the
    # duplicate filter (df), which condenses all identical candidate
    # probes down to one. This is not necessary for correctness -- the
    # naive redundant filter achieves the same task implicitly -- but it
    # significantly lowers runtime by shrinking the input to later stages
    filters = [duplicate_filter.DuplicateFilter()]

    if args.naive_redundant_filter and args.dominating_set_filter:
        raise Exception(("Cannot use both 'naive_redundant_filter' and "
                         "'dominating_set_filter' at the same time. (You could "
                         "of course do one after the other, but it was probably "
                         "a mistake to specify both.)"))
    elif args.naive_redundant_filter or args.dominating_set_filter:
        # Exactly one of the two is set (the guard above rules out both)
        if args.naive_redundant_filter:
            # Naive redundant filter: a greedy algorithm that condenses
            # 'similar' probes down to one
            mismatches, lcf_thres = args.naive_redundant_filter
            filt_class = naive_redundant_filter.NaiveRedundantFilter
        else:
            # Dominating set filter (dsf): build a graph whose nodes are
            # probes and whose edges connect 'similar' probes, then
            # approximate the smallest dominating set
            mismatches, lcf_thres = args.dominating_set_filter
            filt_class = dominating_set_filter.DominatingSetFilter
        # Both filter classes share the same redundancy predicate
        redundant_fn = naive_redundant_filter.redundant_longest_common_substring(
            mismatches, lcf_thres)
        filters += [filt_class(redundant_fn)]

    if args.add_reverse_complements:
        # Reverse complement (rc): also add the reverse complement of
        # each probe as a candidate
        filters += [reverse_complement_filter.ReverseComplementFilter()]

    # Design the probes
    pb = probe_designer.ProbeDesigner(seqs, filters,
                                      probe_length=args.probe_length,
                                      probe_stride=args.probe_stride)
    pb.design()

    if args.print_analysis:
        # Mismatch threshold for the analysis mirrors the redundancy
        # filter's setting when one was used; otherwise exact matching
        if args.naive_redundant_filter or args.dominating_set_filter:
            mismatch_thres = mismatches
        else:
            mismatch_thres = 0
        analyzer = coverage_analysis.Analyzer(pb.final_probes,
                                              mismatch_thres,
                                              args.probe_length,
                                              seqs,
                                              [args.dataset])
        analyzer.run()
        analyzer.print_analysis()
    else:
        # Just print the number of probes
        print(len(pb.final_probes))
def test_is_redundant_without_heuristic(self):
    """A pair within the mismatch/lcf bounds is redundant when the
    k-mer pruning heuristic is disabled."""
    is_red = nrf.redundant_longest_common_substring(
        2, 5, prune_with_heuristic_and_anchor=False)
    p = probe.Probe.from_str('ATCGATCGATCG')
    q = probe.Probe.from_str('CGATAGCTAGAT')
    self.assertTrue(is_red(p, q))
def test_not_redundant_without_heuristic(self):
    """A pair outside the mismatch/lcf bounds is not redundant when the
    k-mer pruning heuristic is disabled."""
    is_red = nrf.redundant_longest_common_substring(
        1, 5, prune_with_heuristic_and_anchor=False)
    p = probe.Probe.from_str('ATCGATCGATCG')
    q = probe.Probe.from_str('CAGGCCGGCTGA')
    self.assertFalse(is_red(p, q))