def test_not_redundant_with_heuristic(self):
        """Check probe pairs that must not be reported as redundant.

        Every predicate here is built with
        prune_with_heuristic_and_anchor=True.
        """
        # Long sequences are used on purpose: probe.shares_some_kmers
        # will use a default k-mer size of 20
        make_probe = probe.Probe.from_str

        # Pairs checked with arguments (2, 25)
        pairs_for_25 = (
            # The two probes do not share k-mers
            ('ATCGATCGATCGAAAAAAAAAAAAAAATTTTT',
             'CCGGCCGGCCGGTTTTTTTTTTTTTTTAAAAA'),
            # The two probes share k-mers but are still not redundant
            ('ATCGATCGATCGAAAAAAAAAAAAAAATTTTT',
             'CCGGCCGGCCGGAAAAAAAAAAAAAAATTTTT'),
        )
        is_redundant = nrf.redundant_longest_common_substring(
            2, 25, prune_with_heuristic_and_anchor=True)
        for seq_a, seq_b in pairs_for_25:
            first = make_probe(seq_a)
            second = make_probe(seq_b)
            self.assertFalse(is_redundant(first, second))

        # Pairs checked with arguments (2, 60); both share k-mers but
        # are still not redundant
        pairs_for_60 = (
            (('A' * 40 + 'DEF') * 4,
             ('A' * 40 + 'XYZ') * 4),
            ('ATCG' + 'AATT' * 10 + 'CC' + 'GGCC' * 4 + 'AT' + 'ATCG',
             'AATT' * 10 + 'GG' + 'GGCC' * 4 + 'CG' + 'CCCC' + 'CCGG'),
        )
        is_redundant = nrf.redundant_longest_common_substring(
            2, 60, prune_with_heuristic_and_anchor=True)
        for seq_a, seq_b in pairs_for_60:
            first = make_probe(seq_a)
            second = make_probe(seq_b)
            self.assertFalse(is_redundant(first, second))
    def test_is_redundant_with_heuristic(self):
        """Check probe pairs that must be reported as redundant.

        Every predicate here is built with
        prune_with_heuristic_and_anchor=True.
        """
        # Long sequences are used on purpose: probe.shares_some_kmers
        # will use a default k-mer size of 20
        make_probe = probe.Probe.from_str

        # Each case is (second positional argument, sequence a,
        # sequence b); in every case the probes share k-mers and are
        # redundant
        cases = (
            (25,
             'ATCGATCGATCG' + 'A' * 50,
             'CGATAGCTAGAT' + 'A' * 50),
            (60,
             'ATCG' + 'AATT' * 10 + 'CC' + 'GGCC' * 4 + 'AT' + 'ATCG',
             'AATT' * 10 + 'GG' + 'GGCC' * 4 + 'AT' + 'CCCC' + 'CCGG'),
        )
        for lcf_thres, seq_a, seq_b in cases:
            is_redundant = nrf.redundant_longest_common_substring(
                2, lcf_thres, prune_with_heuristic_and_anchor=True)
            first = make_probe(seq_a)
            second = make_probe(seq_b)
            self.assertTrue(is_redundant(first, second))
Esempio n. 3
0
def main(args):
    """Design probes for a dataset and print an analysis or a probe count.

    Args:
        args: parsed command-line options. The attributes read here are
            dataset, limit_target_genomes,
            limit_target_genomes_randomly_with_replacement,
            naive_redundant_filter, dominating_set_filter,
            add_reverse_complements, probe_length, probe_stride and
            print_analysis.

    Raises:
        ValueError: if args.dataset is not an existing file and cannot
            be imported as a module under 'catch.datasets'.
        Exception: if both genome-limiting options, or both redundancy
            filter options, are given at the same time.
    """
    # Read the FASTA sequences
    ds = args.dataset
    if os.path.isfile(ds):
        # Process a custom fasta file with sequences
        seqs = [seq_io.read_genomes_from_fasta(ds)]
    else:
        # Guard only the import: an ImportError raised while actually
        # reading genomes should not be reported as an unknown dataset
        try:
            dataset = importlib.import_module('catch.datasets.' + ds)
        except ImportError as err:
            raise ValueError("Unknown file or dataset '%s'" % ds) from err
        seqs = [seq_io.read_dataset_genomes(dataset)]

    if (args.limit_target_genomes and
            args.limit_target_genomes_randomly_with_replacement):
        raise Exception(("Cannot --limit-target-genomes and "
                         "--limit-target-genomes-randomly-with-replacement at "
                         "the same time"))
    elif args.limit_target_genomes:
        seqs = [genomes[:args.limit_target_genomes] for genomes in seqs]
    elif args.limit_target_genomes_randomly_with_replacement:
        k = args.limit_target_genomes_randomly_with_replacement
        seqs = [random.choices(genomes, k=k) for genomes in seqs]

    # Setup the filters needed for replication
    # The filters we use are, in order:

    #  Duplicate filter (df) -- condense all candidate probes that
    #  are identical down to one; this is not necessary for
    #  correctness, as the naive redundant filter achieves the same
    #  task implicitly, but it does significantly lower runtime by
    #  decreasing the input size to the naive redundant filter
    filters = [duplicate_filter.DuplicateFilter()]

    # Mismatch threshold handed to the coverage analysis; stays 0
    # unless a redundancy filter is configured below
    mismatch_thres = 0

    if args.naive_redundant_filter and args.dominating_set_filter:
        raise Exception(("Cannot use both 'naive_redundant_filter' and "
            "'dominating_set_filter' at the same time. (You could "
            "of course do one after the other, but it was probably "
            "a mistake to specify both.)"))
    elif args.naive_redundant_filter or args.dominating_set_filter:
        # Exactly one of the two options is set at this point
        if args.naive_redundant_filter:
            # Naive redundant filter -- execute a greedy algorithm to
            # condense 'similar' probes down to one
            mismatches, lcf_thres = args.naive_redundant_filter
            filt_class = naive_redundant_filter.NaiveRedundantFilter
        else:
            # Dominating set filter (dsf) -- construct a graph where each
            # node is a probe and edges connect 'similar' probes; then
            # approximate the smallest dominating set
            mismatches, lcf_thres = args.dominating_set_filter
            filt_class = dominating_set_filter.DominatingSetFilter
        mismatch_thres = mismatches
        # Construct a function to determine whether two probes are
        # redundant, and then instantiate the appropriate filter
        redundant_fn = naive_redundant_filter.redundant_longest_common_substring(
                            mismatches, lcf_thres)
        filters.append(filt_class(redundant_fn))

    if args.add_reverse_complements:
        # Reverse complement (rc) -- add the reverse complement of
        # each probe as a candidate
        filters.append(reverse_complement_filter.ReverseComplementFilter())

    # Design the probes
    pb = probe_designer.ProbeDesigner(seqs, filters,
                                      probe_length=args.probe_length,
                                      probe_stride=args.probe_stride)
    pb.design()

    if args.print_analysis:
        analyzer = coverage_analysis.Analyzer(pb.final_probes,
                                              mismatch_thres,
                                              args.probe_length,
                                              seqs,
                                              [args.dataset])
        analyzer.run()
        analyzer.print_analysis()
    else:
        # Just print the number of probes
        print(len(pb.final_probes))
 def test_is_redundant_without_heuristic(self):
     """Check a probe pair reported as redundant with pruning disabled.

     The predicate is built with arguments (2, 5) and
     prune_with_heuristic_and_anchor=False.
     """
     seq_a, seq_b = 'ATCGATCGATCG', 'CGATAGCTAGAT'
     is_redundant = nrf.redundant_longest_common_substring(
         2, 5, prune_with_heuristic_and_anchor=False)
     first = probe.Probe.from_str(seq_a)
     second = probe.Probe.from_str(seq_b)
     self.assertTrue(is_redundant(first, second))
 def test_not_redundant_without_heuristic(self):
     """Check a probe pair reported as not redundant with pruning disabled.

     The predicate is built with arguments (1, 5) and
     prune_with_heuristic_and_anchor=False.
     """
     seq_a, seq_b = 'ATCGATCGATCG', 'CAGGCCGGCTGA'
     is_redundant = nrf.redundant_longest_common_substring(
         1, 5, prune_with_heuristic_and_anchor=False)
     first = probe.Probe.from_str(seq_a)
     second = probe.Probe.from_str(seq_b)
     self.assertFalse(is_redundant(first, second))